├── .gitignore
├── README.md
├── Tests
    ├── Data
    │   ├── t10k-images-idx3-ubyte.gz
    │   ├── t10k-labels-idx1-ubyte.gz
    │   ├── train-images-idx3-ubyte.gz
    │   └── train-labels-idx1-ubyte.gz
    ├── Helpers.py
    ├── TestActivationFunctions.py
    ├── TestBatchNormalization.py
    ├── TestConv.py
    ├── TestDropout.py
    ├── TestFullyConnected.py
    ├── TestInitializers.py
    ├── TestMaxPoolLayer.py
    └── TestSoftMaxCrossEntropyLoss.py
├── img
    ├── ann.png
    ├── conv_back_weights.png
    ├── conv_forward.png
    ├── fcn.png
    ├── lenet.jpg
    ├── numerical_maxpooling.gif
    ├── padding_strides.gif
    ├── pooling.gif
    ├── restacking_filters.gif
    └── same_padding_no_strides.gif
├── license.txt
├── src
    ├── base.py
    ├── layers
    │   ├── activation_functions.py
    │   ├── batch_normalization.py
    │   ├── conv.py
    │   ├── dropout.py
    │   ├── fully_connected.py
    │   ├── initializers.py
    │   ├── pooling.py
    │   └── softmax_crossentropy.py
    └── network.py
└── tutorial_dl.ipynb


/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/*
2 | .ipynb_checkpoints/*
3 | *__pycache__*
4 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Tutorial: How to Build a Deep Learning Framework
 2 | 
 3 | by Katharina Breininger and Tobias Wuerfl
 4 | 
 5 | Pattern Recognition Lab, Friedrich-Alexander University Erlangen-Nuernberg, Erlangen, Germany
 6 | 
 7 | ## How to start the tutorial:
 8 | 
 9 | The tutorial is implemented as a jupyter notebook. 
10 | 
11 | To start the tutorial, open a terminal, change to the the main folder (the folder this README is located in) and then type jupyter notebook to launch the Jupyter Notebook App. It will open in a new browser tab.
12 | 
13 | Click on "tutorial_dl.ipynb" in the files listed to run the notebook.
14 | 
15 | More information can be found at http://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/execute.html


--------------------------------------------------------------------------------
/Tests/Data/t10k-images-idx3-ubyte.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/Tests/Data/t10k-images-idx3-ubyte.gz


--------------------------------------------------------------------------------
/Tests/Data/t10k-labels-idx1-ubyte.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/Tests/Data/t10k-labels-idx1-ubyte.gz


--------------------------------------------------------------------------------
/Tests/Data/train-images-idx3-ubyte.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/Tests/Data/train-images-idx3-ubyte.gz


--------------------------------------------------------------------------------
/Tests/Data/train-labels-idx1-ubyte.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/Tests/Data/train-labels-idx1-ubyte.gz


--------------------------------------------------------------------------------
/Tests/Helpers.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import matplotlib.pyplot as plt
  3 | import os
  4 | import gzip
  5 | import struct
  6 | from pathlib import Path
  7 | from random import shuffle
  8 | from sklearn.preprocessing import OneHotEncoder
  9 | from sklearn.datasets import load_iris, load_digits
 10 | 
 11 | class SoftMax:
 12 |     def __init__(self):
 13 |         self.prediction = None
 14 | 
 15 |     def forward(self, input_tensor, label_tensor):
 16 |         prediction = self.predict(input_tensor)
 17 |         indices = np.where(label_tensor == 1)
 18 |         loss = np.sum( - np.log(prediction[indices]))
 19 |         return loss
 20 | 
 21 |     def backward(self, label_tensor):
 22 |         indices = np.where(label_tensor == 1)
 23 |         error = self.prediction.copy()
 24 |         error[indices] = error[indices] - 1
 25 |         return error
 26 | 
 27 |     def predict(self, input_tensor):
 28 |         input_tensor = input_tensor - np.max(input_tensor)
 29 |         denominator = np.tile(np.sum(np.exp(input_tensor),axis = 1),(input_tensor.shape[1],1)).T
 30 |         prediction = np.exp(input_tensor)/denominator
 31 |         self.prediction = prediction
 32 |         return prediction
 33 | 
 34 | def gradient_check(layers, input_tensor, label_tensor):
 35 |     epsilon = 1e-5
 36 |     difference = np.zeros_like(input_tensor)
 37 |     
 38 |     activation_tensor = input_tensor.copy()
 39 |     for layer in layers[:-1]:
 40 |         activation_tensor = layer.forward(activation_tensor)
 41 |     layers[-1].forward(activation_tensor, label_tensor)
 42 | 
 43 |     error_tensor = layers[-1].backward(label_tensor)
 44 |     for layer in reversed(layers[:-1]):
 45 |         error_tensor = layer.backward(error_tensor)
 46 |     
 47 |     it = np.nditer(input_tensor, flags=['multi_index'])
 48 |     while not it.finished:
 49 |         plus_epsilon = input_tensor.copy()
 50 |         plus_epsilon[it.multi_index] += epsilon
 51 |         minus_epsilon = input_tensor.copy()
 52 |         minus_epsilon[it.multi_index] -= epsilon
 53 | 
 54 |         analytical_derivative = error_tensor[it.multi_index]
 55 | 
 56 |         for layer in layers[:-1]:
 57 |             plus_epsilon = layer.forward(plus_epsilon)
 58 |             minus_epsilon = layer.forward(minus_epsilon)
 59 |         upper_error = layers[-1].forward(plus_epsilon, label_tensor)
 60 |         lower_error = layers[-1].forward(minus_epsilon, label_tensor)
 61 | 
 62 |         numerical_derivative = (upper_error - lower_error) / (2 * epsilon)
 63 |             
 64 |         #print('Analytical: ' + str(analytical_derivative) + ' vs Numerical :' + str(numerical_derivative))
 65 |         normalizing_constant = max(np.abs(analytical_derivative), np.abs(numerical_derivative))
 66 | 
 67 |         if normalizing_constant < 1e-15:
 68 |             difference[it.multi_index] = 0
 69 |         else:
 70 |             difference[it.multi_index] = np.abs(analytical_derivative - numerical_derivative) / normalizing_constant
 71 |         
 72 |         it.iternext()    
 73 |     return difference
 74 | 
 75 | 
 76 | def plot_difference(plot, description, shape, difference, directory):
 77 |     if plot:
 78 |         image = difference[0, :]
 79 |         image = image.reshape(shape)
 80 |         fig = plt.figure(description)
 81 |         plt.imshow(image)
 82 |         plt.colorbar()
 83 |         fig.savefig(os.path.join(directory, description + ".pdf"), transparent=True, bbox_inches='tight', pad_inches=0)
 84 |         plt.close('all')
 85 | 
 86 | 
 87 | def gradient_check_weights(layers, input_tensor, label_tensor, bias):
 88 |     epsilon = 1e-5
 89 |     if bias:
 90 |         weights = layers[0].bias
 91 |     else:
 92 |         weights = layers[0].weights
 93 |     difference = np.zeros_like(weights)
 94 | 
 95 |     it = np.nditer(weights, flags=['multi_index'])
 96 |     while not it.finished:
 97 |         plus_epsilon = weights.copy()
 98 |         plus_epsilon[it.multi_index] += epsilon
 99 |         minus_epsilon = weights.copy()
100 |         minus_epsilon[it.multi_index] -= epsilon
101 | 
102 |         activation_tensor = input_tensor.copy()
103 |         if bias:
104 |             layers[0].bias = weights
105 |         else:
106 |             layers[0].weights = weights
107 |         for layer in layers[:-1]:
108 |             activation_tensor = layer.forward(activation_tensor)
109 |         layers[-1].forward(activation_tensor, label_tensor)
110 | 
111 |         error_tensor = layers[-1].backward(label_tensor)
112 |         for layer in reversed(layers[:-1]):
113 |             error_tensor = layer.backward(error_tensor)
114 |         if bias:
115 |             analytical_derivative = layers[0].get_gradient_bias()
116 |         else:
117 |             analytical_derivative = layers[0].get_gradient_weights()
118 |         analytical_derivative = analytical_derivative[it.multi_index]
119 | 
120 |         if bias:
121 |             layers[0].bias = plus_epsilon
122 |         else:
123 |             layers[0].weights = plus_epsilon
124 |         plus_epsilon_activation = input_tensor.copy()
125 |         for layer in layers[:-1]:
126 |             plus_epsilon_activation = layer.forward(plus_epsilon_activation)
127 | 
128 |         if bias:
129 |             layers[0].bias = minus_epsilon
130 |         else:
131 |             layers[0].weights = minus_epsilon
132 |         minus_epsilon_activation = input_tensor.copy()
133 |         for layer in layers[:-1]:
134 |             minus_epsilon_activation = layer.forward(minus_epsilon_activation)
135 | 
136 |         upper_error = layers[-1].forward(plus_epsilon_activation, label_tensor)
137 |         lower_error = layers[-1].forward(minus_epsilon_activation, label_tensor)
138 | 
139 |         numerical_derivative = (upper_error - lower_error) / (2 * epsilon)
140 |         normalizing_constant = max(np.abs(analytical_derivative), np.abs(numerical_derivative))
141 | 
142 |         if normalizing_constant < 1e-15:
143 |             difference[it.multi_index] = 0
144 |         else:
145 |             difference[it.multi_index] = np.abs(analytical_derivative - numerical_derivative) / normalizing_constant
146 | 
147 | 
148 |         it.iternext()
149 |     return difference
150 | 
151 | 
152 | 
153 | def calculate_accuracy(results, labels):
154 | 
155 |     index_maximum = np.argmax(results, axis=1)
156 |     one_hot_vector = np.zeros_like(results)
157 |     for i in range(one_hot_vector.shape[0]):
158 |         one_hot_vector[i, index_maximum[i]] = 1
159 | 
160 |     correct = 0.
161 |     wrong = 0.
162 |     for column_results, column_labels in zip(one_hot_vector, labels):
163 |         if column_results[column_labels > 0.].all() > 0.:
164 |             correct += 1.
165 |         else:
166 |             wrong += 1.
167 | 
168 |     return correct / (correct + wrong)
169 | 
170 | 
171 | def shuffle_data(input_tensor, label_tensor):
172 |     index_shuffling = [i for i in range(input_tensor.shape[0])]
173 |     shuffle(index_shuffling)
174 |     shuffled_input = [input_tensor[i, :] for i in index_shuffling]
175 |     shuffled_labels = [label_tensor[i, :] for i in index_shuffling]
176 |     return (np.array(shuffled_input)), (np.array(shuffled_labels))
177 | 
178 | 
179 | 
180 | class RandomData:
181 |     def __init__(self, input_size, batch_size, categories):
182 |         self.input_size = input_size
183 |         self.batch_size = batch_size
184 |         self.categories = categories
185 |         self.label_tensor = np.zeros([self.batch_size, self.categories])
186 | 
187 |     def forward(self):
188 |         input_tensor = np.random.random([self.batch_size, self.input_size])
189 | 
190 |         self.label_tensor = np.zeros([self.batch_size, self.categories])
191 |         for i in range(self.batch_size):
192 |             self.label_tensor[i, np.random.randint(0, self.categories)] = 1
193 | 
194 |         return input_tensor, self.label_tensor
195 | 
196 | 
197 | class IrisData:
198 |     def __init__(self):
199 |         self.data = load_iris()
200 |         self.label_tensor = np.zeros([150, 3])
201 |         for i in range(150):
202 |             self.label_tensor[i, self.data.target[i]] = 1
203 | 
204 |         self.input_tensor, self.label_tensor = shuffle_data((np.array(self.data.data)), self.label_tensor)
205 |         self.input_tensor = self.input_tensor
206 |         self.label_tensor = self.label_tensor
207 | 
208 |     def forward(self):
209 |         return self.input_tensor[0:100, :], self.label_tensor[0:100, :]
210 | 
211 |     def get_test_set(self):
212 |         return self.input_tensor[100:150, :], self.label_tensor[100:150, :]
213 | 
214 | 
215 | class DigitData:
216 |     def __init__(self, batch_size):
217 |         self.batch_size = batch_size
218 |         self._data = load_digits(n_class=10)
219 |         self._label_tensor = OneHotEncoder(sparse=False).fit_transform(self._data.target.reshape(-1, 1))
220 |         self._input_tensor = self._data.data
221 |         self._input_tensor /= np.abs(self._input_tensor).max()
222 | 
223 |         self.split = int(self._input_tensor.shape[0]*(2/3))  # train / test split  == number of samples in train set
224 | 
225 |         self._input_tensor, self._label_tensor = shuffle_data(self._input_tensor, self._label_tensor)
226 |         self._input_tensor_train = self._input_tensor[:self.split, :]
227 |         self._label_tensor_train = self._label_tensor[:self.split, :]
228 |         self._input_tensor_test = self._input_tensor[self.split:, :]
229 |         self._label_tensor_test = self._label_tensor[self.split:, :]
230 | 
231 |         self._current_forward_idx_iterator = self._forward_idx_iterator()
232 | 
233 |     def _forward_idx_iterator(self):
234 |         num_iterations = int(np.ceil(self.split / self.batch_size))
235 |         idx = np.arange(self.split)
236 |         while True:
237 |             this_idx = np.random.choice(idx, self.split, replace=False)
238 |             for i in range(num_iterations):
239 |                 yield this_idx[i * self.batch_size:(i + 1) * self.batch_size]
240 | 
241 |     def forward(self):
242 |         idx = next(self._current_forward_idx_iterator)
243 | 
244 |         return self._input_tensor_train[idx, :], self._label_tensor_train[idx, :]
245 | 
246 |     def get_test_set(self):
247 |         return self._input_tensor_test, self._label_tensor_test
248 | 
249 | 
250 | 
251 | 
252 | class MNISTData:
253 |     def __init__(self, batch_size):
254 |         self.batch_size = batch_size
255 |         self.train, self.labels = self._read()
256 |         self.test, self.testLabels = self._read(dataset="testing")
257 | 
258 |         self._current_forward_idx_iterator = self._forward_idx_iterator()
259 | 
260 |     def _forward_idx_iterator(self):
261 |         num_iterations = int(self.train.shape[0] / self.batch_size)
262 |         idx = np.arange(self.train.shape[0])
263 |         while True:
264 |             this_idx = np.random.choice(idx, self.train.shape[0], replace=False)
265 |             for i in range(num_iterations):
266 |                 yield this_idx[i * self.batch_size:(i + 1) * self.batch_size]
267 | 
268 |     def forward(self):
269 |         idx = next(self._current_forward_idx_iterator)
270 |         current = self.train[idx, :].reshape(-1,1,28,28)
271 |         return current, self.labels[idx, :]
272 | 
273 |     def show_random_training_image(self):
274 |         image = self.train[np.random.randint(0, self.train.shape[0]-1), :28 * 28]
275 |         plt.imshow(image.reshape(28, 28), cmap='gray')
276 |         plt.show()
277 | 
278 |     def show_image(self, index, test=True):
279 |         if test:
280 |             image = self.test[index, :28 * 28]
281 |         else:
282 |             image = self.train[index, :28 * 28]
283 | 
284 |         plt.imshow(image.reshape(28, 28), cmap='gray')
285 |         plt.show()
286 | 
287 |     def get_test_set(self):
288 |         return self.test, self.testLabels
289 | 
290 |     def get_random_test_sample(self):
291 |         img_id = np.random.randint(0, self.test.shape[0]-1)
292 |         image = self.test[img_id, :].reshape(-1,1,28,28)
293 |         label = self.testLabels[img_id]
294 |         return image, label
295 | 
296 | 
297 |     @staticmethod
298 |     def _read(dataset="training"):
299 |         """
300 |         Python function for importing the MNIST data set.  It returns an iterator
301 |         of 2-tuples with the first element being the label and the second element
302 |         being a numpy.uint8 2D array of pixel data for the given image.
303 |         """
304 | 
305 |         root_dir = Path(__file__)
306 | 
307 |         if dataset is "training":
308 |             fname_img = root_dir.parent.joinpath('Data', 'train-images-idx3-ubyte.gz')
309 |             fname_lbl = root_dir.parent.joinpath('Data', 'train-labels-idx1-ubyte.gz')
310 |         elif dataset is "testing":
311 |             fname_img = root_dir.parent.joinpath('Data', 't10k-images-idx3-ubyte.gz')
312 |             fname_lbl = root_dir.parent.joinpath('Data', 't10k-labels-idx1-ubyte.gz')
313 |         else:
314 |             raise ValueError("dataset must be 'testing' or 'training'")
315 | 
316 |         # Load everything in some numpy arrays
317 |         with gzip.open(str(fname_lbl), 'rb') as flbl:
318 |             magic, num = struct.unpack(">II", flbl.read(8))
319 | 
320 |             s = flbl.read(num)
321 |             lbl = np.frombuffer(s, dtype=np.int8)
322 |             one_hot = np.zeros((lbl.shape[0],10))
323 |             for idx, l in enumerate(lbl):
324 |                 one_hot[idx, l] = 1
325 | 
326 |         with gzip.open(str(fname_img), 'rb') as fimg:
327 |             magic, num, rows, cols = struct.unpack(">IIII", fimg.read(16))
328 | 
329 |             buffer = fimg.read(num * 28 * 28 * 8)
330 |             img = np.frombuffer(buffer, dtype=np.uint8).reshape(len(lbl), rows * cols)
331 |             img = img.astype(np.float64)
332 |             img /= 255.0
333 | 
334 |         img = img[:num, :]
335 |         one_hot = one_hot[:num, :]
336 |         return img, one_hot
337 | 		
338 | if __name__ == "__main__":
339 | 	pass
340 | 
341 | 


--------------------------------------------------------------------------------
/Tests/TestActivationFunctions.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import numpy as np
 3 | import Helpers
 4 | 
 5 | class TestReLU(unittest.TestCase):
 6 | 
 7 |     def setUp(self):
 8 |         self.input_size = 5
 9 |         self.batch_size = 10
10 |         self.half_batch_size = int(self.batch_size / 2)
11 |         self.input_tensor = np.ones([self.batch_size, self.input_size])
12 |         self.input_tensor[0:self.half_batch_size,:] -= 2
13 | 
14 |         self.label_tensor = np.zeros([self.batch_size, self.input_size])
15 |         for i in range(self.batch_size):
16 |             self.label_tensor[i, np.random.randint(0, self.input_size)] = 1
17 | 
18 |     def test_forward(self):
19 |         expected_tensor = np.zeros([self.batch_size, self.input_size])
20 |         expected_tensor[self.half_batch_size:self.batch_size, :] = 1
21 | 
22 |         layer = self.ReLU()
23 |         output_tensor = layer.forward(self.input_tensor)
24 |         self.assertEqual(np.sum(np.power(output_tensor-expected_tensor, 2)), 0)
25 | 
26 |     def test_backward(self):
27 |         expected_tensor = np.zeros([self.batch_size, self.input_size])
28 |         expected_tensor[self.half_batch_size:self.batch_size, :] = 2
29 | 
30 |         layer = self.ReLU()
31 |         layer.forward(self.input_tensor)
32 |         output_tensor = layer.backward(self.input_tensor*2)
33 |         self.assertEqual(np.sum(np.power(output_tensor - expected_tensor, 2)), 0)
34 | 
35 |     def test_gradient(self):
36 |         input_tensor = np.abs(np.random.random((self.batch_size, self.input_size)))
37 |         input_tensor *= 2.
38 |         input_tensor -= 1.
39 |         layers = list()
40 |         layers.append(self.ReLU())
41 |         layers.append(Helpers.SoftMax())
42 |         difference = Helpers.gradient_check(layers, input_tensor, self.label_tensor)
43 |         self.assertLessEqual(np.sum(difference), 1e-5)
44 | 
45 | class TestSigmoid(unittest.TestCase):
46 |     Sigmoid = None
47 | 
48 |     def setUp(self):
49 |         self.input_size = 5
50 |         self.batch_size = 10
51 |         self.half_batch_size = int(self.batch_size / 2)
52 |         self.input_tensor = np.abs(np.random.random((self.input_size, self.batch_size))).T
53 |         self.input_tensor *= 2.
54 |         self.input_tensor -= 1.
55 | 
56 |         self.label_tensor = np.zeros([self.input_size, self.batch_size]).T
57 |         for i in range(self.batch_size):
58 |             self.label_tensor[i, np.random.randint(0, self.input_size)] = 1
59 | 
60 |     def test_forward(self):
61 |         expected_tensor = 0.5 * (1. + np.tanh(self.input_tensor / 2.))
62 | 
63 |         layer = self.Sigmoid()
64 |         output_tensor = layer.forward(self.input_tensor)
65 |         self.assertAlmostEqual(np.sum(np.power(output_tensor-expected_tensor, 2)), 0)
66 | 
67 |     def test_range(self):
68 |         layer = self.Sigmoid()
69 |         output_tensor = layer.forward(self.input_tensor*2)
70 | 
71 |         out_max = np.max(output_tensor)
72 |         out_min = np.min(output_tensor)
73 | 
74 |         self.assertLessEqual(out_max, 1.)
75 |         self.assertGreaterEqual(out_min, 0.)
76 | 
77 |     def test_gradient(self):
78 |         layers = list()
79 |         layers.append(self.Sigmoid())
80 |         layers.append(Helpers.SoftMax())
81 |         difference = Helpers.gradient_check(layers, self.input_tensor, self.label_tensor)
82 |         self.assertLessEqual(np.sum(difference), 1e-5)
83 | 
84 | if __name__ == "__main__":
85 |     pass


--------------------------------------------------------------------------------
/Tests/TestBatchNormalization.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | import numpy as np
  3 | import Helpers
  4 | 
  5 | class TestBatchNorm(unittest.TestCase):
  6 | 
  7 |     def setUp(self):
  8 |         self.batch_size = 200
  9 |         self.channels = 2
 10 |         self.input_shape = (self.channels, 3, 3)
 11 |         self.input_size = np.prod(self.input_shape)
 12 | 
 13 |         np.random.seed(0)
 14 |         self.input_tensor = np.abs(np.random.random((self.input_size, self.batch_size))).T
 15 | 
 16 |         self.categories = 5
 17 |         self.label_tensor = np.zeros([self.categories, self.batch_size]).T
 18 |         for i in range(self.batch_size):
 19 |             self.label_tensor[i, np.random.randint(0, self.categories)] = 1
 20 | 
 21 |         self.layers = list()
 22 |         self.layers.append(None)
 23 |         self.layers.append(self.FullyConnected(self.input_size, self.categories,0.))
 24 |         self.layers.append(Helpers.SoftMax())
 25 | 
 26 |     @staticmethod
 27 |     def _channel_moments(tensor, channels):
 28 |         in_shape = tensor.shape
 29 |         tensor = tensor.reshape(tensor.shape[0], channels, -1)
 30 |         tensor = np.transpose(tensor, (0, 2, 1))
 31 |         tensor = tensor.reshape(in_shape[1]//channels * in_shape[0], channels)
 32 |         mean = np.mean(tensor, axis=0)
 33 |         var = np.var(tensor, axis=0)
 34 |         return mean, var
 35 | 
 36 |     def test_forward_shape(self):
 37 |         layer = self.BatchNormalization(0.)
 38 |         output = layer.forward(self.input_tensor)
 39 | 
 40 |         self.assertEqual(output.shape[0], self.input_tensor.shape[0])
 41 |         self.assertEqual(output.shape[1], self.input_tensor.shape[1])
 42 | 
 43 |     def test_forward_shape_convolutional(self):
 44 |         layer = self.BatchNormalization(0., self.channels)
 45 |         output = layer.forward(self.input_tensor)
 46 | 
 47 |         self.assertEqual(output.shape[0], self.input_tensor.shape[0])
 48 |         self.assertEqual(output.shape[1], self.input_tensor.shape[1])
 49 | 
 50 |     def test_forward(self):
 51 |         layer = self.BatchNormalization(0.)
 52 |         output = layer.forward(self.input_tensor)
 53 |         mean = np.mean(output, axis=0)
 54 |         var = np.var(output, axis=0)
 55 | 
 56 |         self.assertAlmostEqual(np.sum(np.square(mean - np.zeros(mean.shape[0]))), 0)
 57 |         self.assertAlmostEqual(np.sum(np.square(var - np.ones(var.shape[0]))), 0)
 58 | 
 59 |     def test_forward_convolutional(self):
 60 |         layer = self.BatchNormalization(0., self.channels)
 61 |         output = layer.forward(self.input_tensor)
 62 |         mean, var = TestBatchNorm._channel_moments(output, self.channels)
 63 | 
 64 |         self.assertAlmostEqual(np.sum(np.square(mean)), 0)
 65 |         self.assertAlmostEqual(np.sum(np.square(var - np.ones_like(var))), 0)
 66 | 
 67 |     def test_forward_train_phase(self):
 68 |         layer = self.BatchNormalization(0.)
 69 |         layer.forward(self.input_tensor)
 70 | 
 71 |         output = layer.forward((np.zeros_like(self.input_tensor)))
 72 | 
 73 |         mean = np.mean(output, axis=0)
 74 | 
 75 |         mean_input = np.mean(self.input_tensor, axis=0)
 76 |         var_input = np.var(self.input_tensor, axis=0)
 77 | 
 78 |         self.assertNotEqual(np.sum(np.square(mean + (mean_input/np.sqrt(var_input)))), 0)
 79 | 
 80 |     def test_forward_train_phase_convolutional(self):
 81 |         layer = self.BatchNormalization(0., self.channels)
 82 |         layer.forward(self.input_tensor)
 83 | 
 84 |         output = layer.forward((np.zeros_like(self.input_tensor)))
 85 | 
 86 |         mean, var = TestBatchNorm._channel_moments(output, self.channels)
 87 |         mean_input, var_input = TestBatchNorm._channel_moments(self.input_tensor, self.channels)
 88 | 
 89 |         self.assertNotEqual(np.sum(np.square(mean + (mean_input/np.sqrt(var_input)))), 0)
 90 | 
 91 |     def test_forward_test_phase(self):
 92 |         layer = self.BatchNormalization(0.)
 93 |         layer.forward(self.input_tensor)
 94 |         layer.phase = self.Phase.test
 95 | 
 96 |         output = layer.forward((np.zeros_like(self.input_tensor)))
 97 | 
 98 |         mean = np.mean(output, axis=0)
 99 |         var = np.var(output, axis=0)
100 | 
101 |         mean_input = np.mean(self.input_tensor, axis=0)
102 |         var_input = np.var(self.input_tensor, axis=0)
103 | 
104 |         self.assertAlmostEqual(np.sum(np.square(mean + (mean_input/np.sqrt(var_input)))), 0)
105 |         self.assertAlmostEqual(np.sum(np.square(var)), 0)
106 | 
107 |     def test_forward_test_phase_convolutional(self):
108 |         layer = self.BatchNormalization(0., self.channels)
109 |         layer.forward(self.input_tensor)
110 |         layer.phase = self.Phase.test
111 | 
112 |         output = layer.forward((np.zeros_like(self.input_tensor)))
113 | 
114 |         mean, var = TestBatchNorm._channel_moments(output, self.channels)
115 |         mean_input, var_input = TestBatchNorm._channel_moments(self.input_tensor, self.channels)
116 | 
117 |         self.assertAlmostEqual(np.sum(np.square(mean + (mean_input / np.sqrt(var_input)))), 0, places=4)
118 |         self.assertAlmostEqual(np.sum(np.square(var)), 0, places=4)
119 | 
120 |     def test_gradient(self):
121 |         self.layers[0] = self.BatchNormalization(0.)
122 | 
123 |         difference = Helpers.gradient_check(self.layers, self.input_tensor, self.label_tensor)
124 | 
125 |         self.assertLessEqual(np.sum(difference), 1e-4)
126 | 
127 |     def test_gradient_weights(self):
128 |         self.layers[0] = self.BatchNormalization(0.)
129 |         self.layers[0].forward(self.input_tensor)
130 | 
131 |         difference = Helpers.gradient_check_weights(self.layers, self.input_tensor, self.label_tensor, False)
132 | 
133 |         self.assertLessEqual(np.sum(difference), 1e-6)
134 | 
135 |     def test_gradient_bias(self):
136 |         self.layers[0] = self.BatchNormalization(0.)
137 |         self.layers[0].forward(self.input_tensor)
138 | 
139 |         difference = Helpers.gradient_check_weights(self.layers, self.input_tensor, self.label_tensor, True)
140 | 
141 |         self.assertLessEqual(np.sum(difference), 1e-6)
142 | 
143 |     def test_gradient_convolutional(self):
144 |         self.layers[0] = self.BatchNormalization(0., self.channels)
145 | 
146 |         difference = Helpers.gradient_check(self.layers, self.input_tensor, self.label_tensor)
147 | 
148 |         self.assertLessEqual(np.sum(difference), 1e-3)
149 | 
150 |     def test_gradient_weights_convolutional(self):
151 |         self.layers[0] = self.BatchNormalization(0., self.channels)
152 |         self.layers[0].forward(self.input_tensor)
153 | 
154 |         difference = Helpers.gradient_check_weights(self.layers, self.input_tensor, self.label_tensor, False)
155 | 
156 |         self.assertLessEqual(np.sum(difference), 1e-6)
157 | 
158 |     def test_gradient_bias_convolutional(self):
159 |         self.layers[0] = self.BatchNormalization(0., self.channels)
160 |         self.layers[0].forward(self.input_tensor)
161 | 
162 |         difference = Helpers.gradient_check_weights(self.layers, self.input_tensor, self.label_tensor, True)
163 | 
164 |         self.assertLessEqual(np.sum(difference), 1e-6)
165 | 
166 | if __name__ == "__main__":
167 |     pass


--------------------------------------------------------------------------------
/Tests/TestConv.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | import numpy as np
  3 | import Helpers
  4 | from scipy.ndimage.filters import gaussian_filter
  5 | 
  6 | class TestConv(unittest.TestCase):
  7 | 
  8 |     class TestInitializer:
  9 | 
 10 |         @staticmethod
 11 |         def initialize(weights):
 12 |             weights = np.zeros((1, 3, 3, 3))
 13 |             weights[0, 1, 1, 1] = 1
 14 |             return weights
 15 | 
 16 |     def setUp(self):
 17 |         self.batch_size = 2
 18 |         self.input_shape = (3, 10, 14)
 19 |         self.uneven_input_shape = (3, 11, 15)
 20 |         self.spatial_input_size = np.prod(self.input_shape[1:])
 21 |         self.kernel_shape = (3, 5, 8)
 22 |         self.num_kernels = 4
 23 |         self.hidden_channels = 3
 24 | 
 25 |         self.categories = 5
 26 |         self.label_tensor = np.zeros([self.batch_size, self.categories])
 27 |         for i in range(self.batch_size):
 28 |             self.label_tensor[i, np.random.randint(0, self.categories)] = 1
 29 | 
 30 |     def test_forward_size(self):
 31 |         conv = self.Conv( (1, 1), self.kernel_shape, self.num_kernels, 0.)
 32 |         input_tensor = np.array(range(np.prod(self.input_shape) * self.batch_size), dtype=np.float)
 33 |         input_tensor = input_tensor.reshape(self.batch_size, *self.input_shape)
 34 |         output_tensor = conv.forward(input_tensor)
 35 |         self.assertEqual(output_tensor.shape, (self.batch_size, self.num_kernels, *self.input_shape[1:]))
 36 | 
 37 |     def test_forward_size_stride(self):
 38 |         conv = self.Conv((3, 2), self.kernel_shape, self.num_kernels, 0.)
 39 |         input_tensor = np.array(range(np.prod(self.input_shape) * self.batch_size), dtype=np.float)
 40 |         input_tensor = input_tensor.reshape(self.batch_size, *self.input_shape)
 41 |         output_tensor = conv.forward(input_tensor)
 42 |         self.assertEqual(output_tensor.shape, (self.batch_size, self.num_kernels, 4, 7))
 43 | 
 44 |     def test_forward_size_stride_uneven_image(self):
 45 |         conv = self.Conv((3, 2), self.kernel_shape, self.num_kernels + 1, 0.)
 46 |         input_tensor = np.array(range(np.prod(self.uneven_input_shape) * (self.batch_size + 1)), dtype=np.float)
 47 |         input_tensor = input_tensor.reshape(self.batch_size + 1, *self.uneven_input_shape)
 48 |         output_tensor = conv.forward(input_tensor)
 49 |         self.assertEqual(output_tensor.shape, ( self.batch_size+1, self.num_kernels+1, 4, 8))
 50 | 
 51 |     def test_forward(self):
 52 |         np.random.seed(1337)
 53 |         conv = self.Conv((1, 1), (1, 3, 3), 1, 0.)
 54 |         conv.weights = (1./15.) * np.array([[[1, 2, 1], [2, 3, 2], [1, 2, 1]]])
 55 |         conv.bias = np.array([0])
 56 |         conv.weights = np.expand_dims(conv.weights, 0)
 57 |         input_tensor = np.random.random((1, 1, 10, 14))
 58 |         expected_output = gaussian_filter(input_tensor[0, 0, :, :], 0.85, mode='constant', cval=0.0, truncate=1.0)
 59 |         output_tensor = conv.forward(input_tensor).reshape((10, 14))
 60 |         difference = np.max(np.abs(expected_output - output_tensor))
 61 |         self.assertAlmostEqual(difference, 0., places=1)
 62 | 
 63 |     def test_forward_fully_connected_channels(self):
 64 |         np.random.seed(1337)
 65 |         conv = self.Conv((1, 1), (3, 3, 3), 1, 0.)
 66 |         conv.weights = (1. / 15.) * np.array([[[1, 2, 1], [2, 3, 2], [1, 2, 1]], [[1, 2, 1], [2, 3, 2], [1, 2, 1]], [[1, 2, 1], [2, 3, 2], [1, 2, 1]]])
 67 |         conv.bias = np.array([0])
 68 |         conv.weights = np.expand_dims(conv.weights, 0)
 69 |         tensor = np.random.random((1, 1, 10, 14))
 70 |         input_tensor = np.zeros((1, 3 , 10, 14))
 71 |         input_tensor[:,0] = tensor.copy()
 72 |         input_tensor[:,1] = tensor.copy()
 73 |         input_tensor[:,2] = tensor.copy()
 74 |         expected_output = 3 * gaussian_filter(input_tensor[0, 0, :, :], 0.85, mode='constant', cval=0.0, truncate=1.0)
 75 |         output_tensor = conv.forward(input_tensor).reshape((10, 14))
 76 |         difference = np.max(np.abs(expected_output - output_tensor))
 77 |         self.assertLess(difference, 0.2)
 78 | 
 79 |     def test_backward_size(self):
 80 |         conv = self.Conv((1, 1), self.kernel_shape, self.num_kernels, 0.)
 81 |         input_tensor = np.array(range(np.prod(self.input_shape) * self.batch_size), dtype=np.float)
 82 |         input_tensor = input_tensor.reshape(self.batch_size, *self.input_shape)
 83 |         output_tensor = conv.forward(input_tensor)
 84 |         error_tensor = conv.backward(output_tensor)
 85 |         self.assertEqual(error_tensor.shape, (self.batch_size, *self.input_shape))
 86 | 
 87 |     def test_backward_size_stride(self):
 88 |         conv = self.Conv((3, 2), self.kernel_shape, self.num_kernels, 0.)
 89 |         input_tensor = np.array(range(np.prod(self.input_shape) * self.batch_size), dtype=np.float)
 90 |         input_tensor = input_tensor.reshape(self.batch_size, *self.input_shape)
 91 |         output_tensor = conv.forward(input_tensor)
 92 |         error_tensor = conv.backward(output_tensor)
 93 |         self.assertEqual(error_tensor.shape, (self.batch_size, *self.input_shape))
 94 |         
 95 |     def test_layout_preservation(self):
 96 |         conv = self.Conv((1, 1), (3, 3, 3), 1, 0.)
 97 |         conv.initialize(TestConv.TestInitializer(), self.Constant(0.0))
 98 |         input_tensor = np.array(range(np.prod(self.input_shape) * self.batch_size), dtype=np.float)
 99 |         input_tensor = input_tensor.reshape(self.batch_size, *self.input_shape)
100 |         output_tensor = conv.forward(input_tensor)
101 |         self.assertAlmostEqual(np.sum(np.abs(np.squeeze(output_tensor)-input_tensor[:,1,:,:])), 0.)
102 | 
103 |     def test_gradient(self):
104 |         np.random.seed(1337)
105 |         input_tensor = np.abs(np.random.random((2, 3, 5, 7)))
106 |         layers = list()
107 |         layers.append(self.Conv((1, 1), (3, 3, 3), self.hidden_channels, 0.))
108 |         layers.append(self.Flatten())
109 |         layers.append(self.FullyConnected(35 * self.hidden_channels, self.categories, 0))
110 |         layers.append(Helpers.SoftMax())
111 |         difference = Helpers.gradient_check(layers, input_tensor, self.label_tensor)
112 |         self.assertLessEqual(np.sum(difference), 5e-2)
113 | 
114 |     def test_gradient_weights(self):
115 |         np.random.seed(1337)
116 |         input_tensor = np.abs(np.random.random((2, 3, 5, 7)))
117 |         layers = list()
118 |         layers.append(self.Conv((1, 1), (3, 3, 3), self.hidden_channels, 0.))
119 |         layers.append(self.Flatten())
120 |         layers.append(self.FullyConnected(35*self.hidden_channels, self.categories, 0))
121 |         layers.append(Helpers.SoftMax())
122 |         difference = Helpers.gradient_check_weights(layers, input_tensor, self.label_tensor, False)
123 | 
124 |         self.assertLessEqual(np.sum(difference), 1e-5)
125 | 
126 |     def test_gradient_weights_strided(self):
127 |         np.random.seed(1337)
128 |         input_tensor = np.abs(np.random.random((2, 3, 5, 7)))
129 |         layers = list()
130 |         layers.append(self.Conv((2, 2), (3, 3, 3), self.hidden_channels, 0.))
131 |         layers.append(self.Flatten())
132 |         layers.append(self.FullyConnected(12*self.hidden_channels, self.categories, 0))
133 |         layers.append(Helpers.SoftMax())
134 |         difference = Helpers.gradient_check_weights(layers, input_tensor, self.label_tensor, False)
135 | 
136 |         self.assertLessEqual(np.sum(difference), 1e-5)
137 | 
138 |     def test_gradient_bias(self):
139 |         np.random.seed(1337)
140 |         input_tensor = np.abs(np.random.random((2, 3, 5, 7)))
141 |         layers = list()
142 |         layers.append(self.Conv((1, 1), (3, 3, 3), self.hidden_channels, 0.))
143 |         layers.append(self.Flatten())
144 |         layers.append(self.FullyConnected(35 * self.hidden_channels, self.categories, 0))
145 |         layers.append(Helpers.SoftMax())
146 |         difference = Helpers.gradient_check_weights(layers, input_tensor, self.label_tensor, True)
147 | 
148 |         self.assertLessEqual(np.sum(difference), 1e-5)
149 | 
150 |     def test_gradient_stride(self):
151 |         np.random.seed(1337)
152 |         input_tensor = np.abs(np.random.random((2, 3, 5, 14)))
153 |         layers = list()
154 |         layers.append(self.Conv( (1, 2), (3, 3, 3), 1, 0.))
155 |         layers.append(self.Flatten())
156 |         layers.append(self.FullyConnected(35, self.categories, 0))
157 |         layers.append(Helpers.SoftMax())
158 |         difference = Helpers.gradient_check(layers, input_tensor, self.label_tensor)
159 | 
160 |         self.assertLessEqual(np.sum(difference), 1e-4)
161 | 
162 |     def test_update(self):
163 |         input_tensor = np.abs(np.random.random((self.batch_size, *self.input_shape)))
164 |         conv = self.Conv((3, 2), self.kernel_shape, self.num_kernels, 1.)
165 |         conv.initialize(self.He(), self.Constant(0.1))
166 |         for _ in range(10):
167 |             output_tensor = conv.forward(input_tensor)
168 |             error_tensor = np.zeros_like(output_tensor)
169 |             error_tensor -= output_tensor
170 |             conv.backward(error_tensor)
171 |             new_output_tensor = conv.forward(input_tensor)
172 |             self.assertLess(np.sum(np.power(output_tensor, 2)), np.sum(np.power(new_output_tensor, 2)))	
173 | 
174 | if __name__ == "__main__":
175 |     pass


--------------------------------------------------------------------------------
/Tests/TestDropout.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import numpy as np
 3 | import Helpers
 4 | 
 5 | class TestDropout(unittest.TestCase):
 6 |     def setUp(self):
 7 |         self.batch_size = 10000
 8 |         self.input_size = 10
 9 | 
10 |         self.input_tensor = np.ones((self.batch_size, self.input_size))
11 | 
12 |     def test_forward_trainTime(self):
13 |         drop_layer = self.DropOut(0.5)
14 |         output = drop_layer.forward(self.input_tensor)
15 | 
16 |         self.assertEqual(np.max(output), 2)
17 |         self.assertEqual(np.min(output), 0)
18 |         sum_over_mean = np.sum(np.mean(output, axis=0))
19 |         self.assertAlmostEqual(sum_over_mean, 1. * self.input_size, places=1)
20 | 
21 |     def test_forward_testTime(self):
22 |         drop_layer = self.DropOut(0.5)
23 |         drop_layer.phase = self.Phase.test
24 |         output = drop_layer.forward(self.input_tensor)
25 | 
26 |         self.assertEqual(np.max(output), 1.)
27 |         self.assertEqual(np.min(output), 1.)
28 |         sum_over_mean = np.sum(np.mean(output, axis=0))
29 |         self.assertEqual(sum_over_mean, 1. * self.input_size)
30 | 
31 |     def test_backward(self):
32 |         drop_layer = self.DropOut(0.5)
33 |         drop_layer.forward(self.input_tensor)
34 |         output = drop_layer.backward(self.input_tensor)
35 | 
36 |         self.assertEqual(np.max(output), 1)
37 |         self.assertEqual(np.min(output), 0)
38 |         sum_over_mean = np.sum(np.mean(output, axis=0))
39 |         self.assertAlmostEqual(sum_over_mean, .5 * self.input_size, places=1)
40 | 
41 | if __name__ == "__main__":
42 |     pass


--------------------------------------------------------------------------------
/Tests/TestFullyConnected.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import numpy as np
 3 | import Helpers
 4 | 
 5 | class TestFullyConnected(unittest.TestCase):
 6 | 
 7 |     class TestInitializer:
 8 | 
 9 |         @staticmethod
10 |         def initialize(weights_shape):
11 |             return np.random.rand(*weights_shape)
12 | 
13 |     def setUp(self):
14 |         self.batch_size = 9
15 |         self.input_size = 4
16 |         self.output_size = 3
17 |         self.input_tensor = np.random.rand(self.batch_size, self.input_size)
18 | 
19 |         self.categories = 4
20 |         self.label_tensor = np.zeros([self.batch_size, self.categories])
21 |         for i in range(self.batch_size):
22 |             self.label_tensor[i, np.random.randint(0, self.categories)] = 1
23 | 
24 |     def test_forward_size(self):
25 |         layer = self.FullyConnected(self.input_size, self.output_size, 0)
26 |         layer.initialize(TestFullyConnected.TestInitializer(),TestFullyConnected.TestInitializer)
27 |         output_tensor = layer.forward(self.input_tensor)
28 |         self.assertEqual(output_tensor.shape[1], self.output_size)
29 |         self.assertEqual(output_tensor.shape[0], self.batch_size)
30 | 
31 |     def test_backward_size(self):
32 |         layer = self.FullyConnected(self.input_size, self.output_size, 0)
33 |         layer.initialize(TestFullyConnected.TestInitializer(),TestFullyConnected.TestInitializer)
34 |         output_tensor = layer.forward(self.input_tensor)
35 |         error_tensor = layer.backward(output_tensor)
36 |         self.assertEqual(error_tensor.shape[1], self.input_size)
37 |         self.assertEqual(error_tensor.shape[0], self.batch_size)
38 | 
39 |     def test_update(self):
40 |         layer = self.FullyConnected(self.input_size, self.output_size, 1)
41 |         layer.initialize(TestFullyConnected.TestInitializer(),TestFullyConnected.TestInitializer)
42 |         for _ in range(10):
43 |             output_tensor = layer.forward(self.input_tensor)
44 |             error_tensor = np.zeros([ self.batch_size, self.output_size])
45 |             error_tensor -= output_tensor
46 |             layer.backward(error_tensor)
47 |             new_output_tensor = layer.forward(self.input_tensor)
48 |             self.assertLess(np.sum(np.power(output_tensor, 2)), np.sum(np.power(new_output_tensor, 2)))
49 | 
50 |     def test_gradient(self):
51 |         input_tensor = np.abs(np.random.random((self.batch_size, self.input_size)))
52 |         layers = list()
53 |         layers.append(self.FullyConnected(self.input_size, self.categories, 0))
54 |         layers[0].initialize(TestFullyConnected.TestInitializer(),TestFullyConnected.TestInitializer)
55 |         layers.append(Helpers.SoftMax())
56 |         difference = Helpers.gradient_check(layers, input_tensor, self.label_tensor)
57 |         self.assertLessEqual(np.sum(difference), 1e-5)
58 | 
59 |     def test_gradient_weights(self):
60 |         input_tensor = np.abs(np.random.random((self.batch_size, self.input_size)))
61 |         layers = list()
62 |         layers.append(self.FullyConnected(self.input_size, self.categories, 0))
63 |         layers[0].initialize(TestFullyConnected.TestInitializer(),TestFullyConnected.TestInitializer)
64 |         layers.append(Helpers.SoftMax())
65 |         difference = Helpers.gradient_check_weights(layers, input_tensor, self.label_tensor, False)
66 |         self.assertLessEqual(np.sum(difference), 1e-5)
67 | 
68 |     def test_bias(self):
69 |         input_tensor = np.zeros((1, 100000))
70 |         layer = self.FullyConnected(100000, 1, 0)
71 |         layer.initialize(TestFullyConnected.TestInitializer(),TestFullyConnected.TestInitializer)
72 |         result = layer.forward(input_tensor)
73 |         self.assertGreater(np.sum(result), 0)
74 | 
75 | if __name__ == "__main__":
76 |     pass


--------------------------------------------------------------------------------
/Tests/TestInitializers.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import numpy as np
 3 | import Helpers
 4 | from scipy import stats
 5 | 
 6 | class TestInitializers(unittest.TestCase):
 7 |     
 8 |     class DummyLayer:
 9 |         def __init__(self, input_size, output_size):
10 |             self.weights = np.random.random_sample((output_size, input_size))
11 |         
12 |         def initialize(self, initializer):
13 |             self.weights = initializer.initialize(self.weights.shape)
14 |     
15 |     def setUp(self):
16 |         self.batch_size = 9
17 |         self.input_size = 200
18 |         self.output_size = 50
19 |         
20 |     def _performInitialization(self, initializer):
21 |         np.random.seed(1337)
22 |         layer = TestInitializers.DummyLayer(self.input_size, self.output_size)
23 |         weights_before_init = layer.weights.copy()
24 |         layer.initialize(initializer)
25 |         weights_after_init = layer.weights.copy()
26 |         return weights_before_init, weights_after_init
27 |         
28 |     def test_const_shape(self):
29 |         weights_before_init, weights_after_init = self._performInitialization(self.Const(0.1))
30 |         
31 |         self.assertEqual(weights_before_init.shape, weights_after_init.shape)
32 |         self.assertFalse(np.allclose(weights_before_init, weights_after_init))
33 | 
34 |     def test_const_distribution(self):
35 |         weights_before_init, weights_after_init = self._performInitialization(self.Const(0.1))
36 |         self.assertTrue(np.allclose(weights_after_init, 0.1))
37 | 
38 |     def test_uniform_shape(self):
39 |         weights_before_init, weights_after_init = self._performInitialization(self.Uniform())
40 |         
41 |         self.assertEqual(weights_before_init.shape, weights_after_init.shape)
42 |         self.assertFalse(np.allclose(weights_before_init, weights_after_init))
43 | 
44 |     def test_uniform_distribution(self):
45 |         weights_before_init, weights_after_init = self._performInitialization(self.Uniform())
46 | 
47 |         p_value = stats.kstest(weights_after_init.flat, 'uniform', args=(0, 1)).pvalue
48 |         self.assertGreater(p_value, 0.01)
49 | 
50 |     def test_he_shape(self):
51 |         weights_before_init, weights_after_init = self._performInitialization(self.He())
52 |         
53 |         self.assertEqual(weights_before_init.shape, weights_after_init.shape)
54 |         self.assertFalse(np.allclose(weights_before_init, weights_after_init))
55 | 
56 |     def test_he_distribution(self):
57 |         weights_before_init, weights_after_init = self._performInitialization(self.He())
58 | 
59 |         scale = np.sqrt(2.) / np.sqrt(self.input_size)
60 |         p_value = stats.kstest(weights_after_init.flat, 'norm', args=(0, scale)).pvalue
61 |         self.assertGreater(p_value, 0.01)
62 | 
63 | if __name__ == "__main__":
64 |     pass


--------------------------------------------------------------------------------
/Tests/TestMaxPoolLayer.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import numpy as np
 3 | import Helpers
 4 | 
 5 | class TestMaxPooling(unittest.TestCase):
 6 | 
 7 |     def setUp(self):
 8 |         self.batch_size = 2
 9 |         self.input_shape = (2, 4, 7)
10 | 
11 |         np.random.seed(1337)
12 |         self.input_tensor = np.abs(np.random.random((self.batch_size, *self.input_shape)))
13 | 
14 |         self.categories = 5
15 |         self.label_tensor = np.zeros([self.batch_size, self.categories])
16 |         for i in range(self.batch_size):
17 |             self.label_tensor[i, np.random.randint(0, self.categories)] = 1
18 | 
19 |         self.layers = list()
20 |         self.layers.append(None)
21 |         self.layers.append(self.Flatten())
22 |         self.layers.append(None)
23 |         self.layers.append(Helpers.SoftMax())
24 | 
25 |     def test_shape(self):
26 |         layer = self.MaxPooling(neighborhood=(2, 2), stride=(2, 2))
27 |         result = layer.forward(self.input_tensor)
28 |         expected_shape = np.array([self.batch_size, 2, 2, 3])
29 |         self.assertEqual(np.abs(np.sum(np.array(result.shape) - expected_shape)), 0)
30 | 
31 |     def test_overlapping_shape(self):
32 |         layer = self.MaxPooling(neighborhood=(2, 2), stride=(2, 1))
33 |         result = layer.forward(self.input_tensor)
34 |         expected_shape = np.array([self.batch_size, 2, 2, 6])
35 |         self.assertEqual(np.abs(np.sum(np.array(result.shape) - expected_shape)), 0)
36 | 
37 |     def test_subsampling_shape(self):
38 |         layer = self.MaxPooling(neighborhood=(2, 2), stride=(3, 2))
39 |         result = layer.forward(self.input_tensor)
40 |         expected_shape = np.array([self.batch_size, 2, 1, 3])
41 |         self.assertEqual(np.abs(np.sum(np.array(result.shape) - expected_shape)), 0)
42 | 
43 |     def test_gradient_stride(self):
44 |         self.layers[0] = self.MaxPooling(neighborhood=(2, 2), stride=(2, 2))
45 |         self.layers[2] = self.FullyConnected(12, self.categories, 0.)
46 | 
47 |         difference = Helpers.gradient_check(self.layers, self.input_tensor, self.label_tensor)
48 | 
49 |         self.assertLessEqual(np.sum(difference), 1e-6)
50 | 
51 |     def test_gradient_overlapping_stride(self):
52 |         self.layers[0] = self.MaxPooling(neighborhood=(2, 2), stride=(2, 1))
53 |         self.layers[2] = self.FullyConnected(24, self.categories, 0.)
54 | 
55 |         difference = Helpers.gradient_check(self.layers, self.input_tensor, self.label_tensor)
56 | 
57 |         self.assertLessEqual(np.sum(difference), 1e-6)
58 | 
59 |     def test_gradient_subsampling_stride(self):
60 | 
61 |         self.layers[0] = self.MaxPooling(neighborhood=(2, 2), stride=(3, 2))
62 |         self.layers[2] = self.FullyConnected(6, self.categories, 0.)
63 | 
64 |         difference = Helpers.gradient_check(self.layers, self.input_tensor, self.label_tensor)
65 | 
66 |         self.assertLessEqual(np.sum(difference), 1e-6)
67 | 
68 |     def test_layout_preservation(self):
69 |         pool = self.MaxPooling(neighborhood=(1, 1), stride=(1, 1))
70 |         input_tensor = np.array(range(np.prod(self.input_shape) * self.batch_size), dtype=np.float)
71 |         input_tensor = input_tensor.reshape(self.batch_size, *self.input_shape)
72 |         output_tensor = pool.forward(input_tensor)
73 |         self.assertAlmostEqual(np.sum(np.abs(output_tensor-input_tensor)), 0.)
74 | 
75 |     def test_expected_output_valid_edgecase(self):
76 |         input_shape = (1, 3, 3)
77 |         pool = self.MaxPooling(neighborhood=(2, 2), stride=(2, 2))
78 |         batch_size = 2
79 |         input_tensor = np.array(range(np.prod(input_shape) * batch_size), dtype=np.float)
80 |         input_tensor = input_tensor.reshape(batch_size, *input_shape)
81 | 
82 |         result = pool.forward(input_tensor)
83 |         expected_result = np.array([[4], [13]]).T
84 |         self.assertEqual(np.abs(np.sum(result - expected_result)), 0)
85 | 
86 |     def test_expected_output(self):
87 |         input_shape = (1, 4, 4)
88 |         pool = self.MaxPooling(neighborhood=(2, 2), stride=(2, 2))
89 |         batch_size = 2
90 |         input_tensor = np.array(range(np.prod(input_shape) * batch_size), dtype=np.float)
91 |         input_tensor = input_tensor.reshape(batch_size, *input_shape)
92 | 
93 |         result = pool.forward(input_tensor)
94 |         expected_result = np.array([[[[ 5.,  7.],[13., 15.]]],[[[21., 23.],[29., 31.]]]]).T
95 |         self.assertEqual(np.abs(np.sum(result - expected_result)), 0)
96 | 
97 | if __name__ == "__main__":
98 |     pass


--------------------------------------------------------------------------------
/Tests/TestSoftMaxCrossEntropyLoss.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | import numpy as np
  3 | import Helpers
  4 | 
  5 | class TestSoftMaxCrossEntropyLoss(unittest.TestCase):
  6 |     
  7 |     def setUp(self):
  8 |         self.batch_size = 9
  9 |         self.categories = 4
 10 |         self.label_tensor = np.zeros([self.batch_size, self.categories])
 11 |         for i in range(self.batch_size):
 12 |             self.label_tensor[i, np.random.randint(0, self.categories)] = 1
 13 | 
 14 |     def test_forward_zero_loss(self):
 15 |         input_tensor = self.label_tensor * 100.
 16 |         layer = self.SoftMaxCrossEntropyLoss()
 17 |         loss = layer.forward(input_tensor, self.label_tensor)
 18 | 
 19 |         self.assertLess(loss, 1e-10)
 20 | 
 21 |     def test_backward_zero_loss(self):
 22 |         input_tensor = self.label_tensor * 100.
 23 |         layer = self.SoftMaxCrossEntropyLoss()
 24 |         layer.forward(input_tensor, self.label_tensor)
 25 |         error = layer.backward(self.label_tensor)
 26 | 
 27 |         self.assertAlmostEqual(np.sum(error), 0)
 28 | 
 29 |     def test_regression_high_loss(self):
 30 |         input_tensor = self.label_tensor - 1.
 31 |         input_tensor *= -100.
 32 |         layer = self.SoftMaxCrossEntropyLoss()
 33 |         loss = layer.forward(input_tensor, self.label_tensor)
 34 | 
 35 |         # test a specific value here
 36 |         self.assertAlmostEqual(float(loss), 909.8875105980)
 37 | 
 38 |     def test_regression_backward_high_loss(self):
 39 |         input_tensor = self.label_tensor - 1.
 40 |         input_tensor *= -100.
 41 |         layer = self.SoftMaxCrossEntropyLoss()
 42 |         layer.forward(input_tensor, self.label_tensor)
 43 |         error = layer.backward(self.label_tensor)
 44 | 
 45 |         # test if every wrong class confidence is decreased
 46 |         for element in error[self.label_tensor == 0]:
 47 |             self.assertGreaterEqual(element, 1 / 3)
 48 | 
 49 |         # test if every correct class confidence is increased
 50 |         for element in error[self.label_tensor == 1]:
 51 |             self.assertAlmostEqual(element, -1)
 52 | 
 53 |     def test_regression_forward(self):
 54 |         np.random.seed(1337)
 55 |         input_tensor = np.abs(np.random.random(self.label_tensor.shape))
 56 |         layer = self.SoftMaxCrossEntropyLoss()
 57 |         loss = layer.forward(input_tensor, self.label_tensor)
 58 | 
 59 |         # just see if it's bigger then zero
 60 |         self.assertGreater(float(loss), 0.)
 61 | 
 62 |     def test_regression_backward(self):
 63 |         input_tensor = np.abs(np.random.random(self.label_tensor.shape))
 64 |         layer = self.SoftMaxCrossEntropyLoss()
 65 |         layer.forward(input_tensor, self.label_tensor)
 66 |         error = layer.backward(self.label_tensor)
 67 | 
 68 |         # test if every wrong class confidence is decreased
 69 |         for element in error[self.label_tensor == 0]:
 70 |             self.assertGreaterEqual(element, 0)
 71 | 
 72 |         # test if every correct class confidence is increased
 73 |         for element in error[self.label_tensor == 1]:
 74 |             self.assertLessEqual(element, 0)
 75 | 
 76 |     def test_gradient(self):
 77 |         input_tensor = np.abs(np.random.random(self.label_tensor.shape))
 78 |         layer = self.SoftMaxCrossEntropyLoss()
 79 |         difference = Helpers.gradient_check([layer], input_tensor, self.label_tensor)
 80 |         self.assertLessEqual(np.sum(difference), 1e-5)
 81 | 
 82 |     def test_predict(self):
 83 |         input_tensor = np.arange(self.categories * self.batch_size)
 84 |         input_tensor = input_tensor / 100.
 85 |         input_tensor = input_tensor.reshape((self.batch_size, self.categories))
 86 |         layer = self.SoftMaxCrossEntropyLoss()
 87 |         prediction = layer.predict(input_tensor)
 88 |         expected_values = [[0.24626259, 0.24873757, 0.25123743, 0.25376241],
 89 |                            [0.24626259, 0.24873757, 0.25123743, 0.25376241],
 90 |                            [0.24626259, 0.24873757, 0.25123743, 0.25376241],
 91 |                            [0.24626259, 0.24873757, 0.25123743, 0.25376241],
 92 |                            [0.24626259, 0.24873757, 0.25123743, 0.25376241],
 93 |                            [0.24626259, 0.24873757, 0.25123743, 0.25376241],
 94 |                            [0.24626259, 0.24873757, 0.25123743, 0.25376241],
 95 |                            [0.24626259, 0.24873757, 0.25123743, 0.25376241],
 96 |                            [0.24626259, 0.24873757, 0.25123743, 0.25376241]]
 97 |         np.testing.assert_almost_equal(prediction, expected_values)
 98 | 
 99 | if __name__ == "__main__":
100 |     pass


--------------------------------------------------------------------------------
/img/ann.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/img/ann.png


--------------------------------------------------------------------------------
/img/conv_back_weights.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/img/conv_back_weights.png


--------------------------------------------------------------------------------
/img/conv_forward.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/img/conv_forward.png


--------------------------------------------------------------------------------
/img/fcn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/img/fcn.png


--------------------------------------------------------------------------------
/img/lenet.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/img/lenet.jpg


--------------------------------------------------------------------------------
/img/numerical_maxpooling.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/img/numerical_maxpooling.gif


--------------------------------------------------------------------------------
/img/padding_strides.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/img/padding_strides.gif


--------------------------------------------------------------------------------
/img/pooling.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/img/pooling.gif


--------------------------------------------------------------------------------
/img/restacking_filters.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/img/restacking_filters.gif


--------------------------------------------------------------------------------
/img/same_padding_no_strides.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/img/same_padding_no_strides.gif


--------------------------------------------------------------------------------
/license.txt:
--------------------------------------------------------------------------------
  1 | Attribution-NonCommercial-ShareAlike 4.0 International
  2 | 
  3 | =======================================================================
  4 | 
  5 | Creative Commons Corporation ("Creative Commons") is not a law firm and
  6 | does not provide legal services or legal advice. Distribution of
  7 | Creative Commons public licenses does not create a lawyer-client or
  8 | other relationship. Creative Commons makes its licenses and related
  9 | information available on an "as-is" basis. Creative Commons gives no
 10 | warranties regarding its licenses, any material licensed under their
 11 | terms and conditions, or any related information. Creative Commons
 12 | disclaims all liability for damages resulting from their use to the
 13 | fullest extent possible.
 14 | 
 15 | Using Creative Commons Public Licenses
 16 | 
 17 | Creative Commons public licenses provide a standard set of terms and
 18 | conditions that creators and other rights holders may use to share
 19 | original works of authorship and other material subject to copyright
 20 | and certain other rights specified in the public license below. The
 21 | following considerations are for informational purposes only, are not
 22 | exhaustive, and do not form part of our licenses.
 23 | 
 24 |      Considerations for licensors: Our public licenses are
 25 |      intended for use by those authorized to give the public
 26 |      permission to use material in ways otherwise restricted by
 27 |      copyright and certain other rights. Our licenses are
 28 |      irrevocable. Licensors should read and understand the terms
 29 |      and conditions of the license they choose before applying it.
 30 |      Licensors should also secure all rights necessary before
 31 |      applying our licenses so that the public can reuse the
 32 |      material as expected. Licensors should clearly mark any
 33 |      material not subject to the license. This includes other CC-
 34 |      licensed material, or material used under an exception or
 35 |      limitation to copyright. More considerations for licensors:
 36 | 	wiki.creativecommons.org/Considerations_for_licensors
 37 | 
 38 |      Considerations for the public: By using one of our public
 39 |      licenses, a licensor grants the public permission to use the
 40 |      licensed material under specified terms and conditions. If
 41 |      the licensor's permission is not necessary for any reason--for
 42 |      example, because of any applicable exception or limitation to
 43 |      copyright--then that use is not regulated by the license. Our
 44 |      licenses grant only permissions under copyright and certain
 45 |      other rights that a licensor has authority to grant. Use of
 46 |      the licensed material may still be restricted for other
 47 |      reasons, including because others have copyright or other
 48 |      rights in the material. A licensor may make special requests,
 49 |      such as asking that all changes be marked or described.
 50 |      Although not required by our licenses, you are encouraged to
 51 |      respect those requests where reasonable. More considerations
 52 |      for the public: 
 53 | 	wiki.creativecommons.org/Considerations_for_licensees
 54 | 
 55 | =======================================================================
 56 | 
 57 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
 58 | Public License
 59 | 
 60 | By exercising the Licensed Rights (defined below), You accept and agree
 61 | to be bound by the terms and conditions of this Creative Commons
 62 | Attribution-NonCommercial-ShareAlike 4.0 International Public License
 63 | ("Public License"). To the extent this Public License may be
 64 | interpreted as a contract, You are granted the Licensed Rights in
 65 | consideration of Your acceptance of these terms and conditions, and the
 66 | Licensor grants You such rights in consideration of benefits the
 67 | Licensor receives from making the Licensed Material available under
 68 | these terms and conditions.
 69 | 
 70 | 
 71 | Section 1 -- Definitions.
 72 | 
 73 |   a. Adapted Material means material subject to Copyright and Similar
 74 |      Rights that is derived from or based upon the Licensed Material
 75 |      and in which the Licensed Material is translated, altered,
 76 |      arranged, transformed, or otherwise modified in a manner requiring
 77 |      permission under the Copyright and Similar Rights held by the
 78 |      Licensor. For purposes of this Public License, where the Licensed
 79 |      Material is a musical work, performance, or sound recording,
 80 |      Adapted Material is always produced where the Licensed Material is
 81 |      synched in timed relation with a moving image.
 82 | 
 83 |   b. Adapter's License means the license You apply to Your Copyright
 84 |      and Similar Rights in Your contributions to Adapted Material in
 85 |      accordance with the terms and conditions of this Public License.
 86 | 
 87 |   c. BY-NC-SA Compatible License means a license listed at
 88 |      creativecommons.org/compatiblelicenses, approved by Creative
 89 |      Commons as essentially the equivalent of this Public License.
 90 | 
 91 |   d. Copyright and Similar Rights means copyright and/or similar rights
 92 |      closely related to copyright including, without limitation,
 93 |      performance, broadcast, sound recording, and Sui Generis Database
 94 |      Rights, without regard to how the rights are labeled or
 95 |      categorized. For purposes of this Public License, the rights
 96 |      specified in Section 2(b)(1)-(2) are not Copyright and Similar
 97 |      Rights.
 98 | 
 99 |   e. Effective Technological Measures means those measures that, in the
100 |      absence of proper authority, may not be circumvented under laws
101 |      fulfilling obligations under Article 11 of the WIPO Copyright
102 |      Treaty adopted on December 20, 1996, and/or similar international
103 |      agreements.
104 | 
105 |   f. Exceptions and Limitations means fair use, fair dealing, and/or
106 |      any other exception or limitation to Copyright and Similar Rights
107 |      that applies to Your use of the Licensed Material.
108 | 
109 |   g. License Elements means the license attributes listed in the name
110 |      of a Creative Commons Public License. The License Elements of this
111 |      Public License are Attribution, NonCommercial, and ShareAlike.
112 | 
113 |   h. Licensed Material means the artistic or literary work, database,
114 |      or other material to which the Licensor applied this Public
115 |      License.
116 | 
117 |   i. Licensed Rights means the rights granted to You subject to the
118 |      terms and conditions of this Public License, which are limited to
119 |      all Copyright and Similar Rights that apply to Your use of the
120 |      Licensed Material and that the Licensor has authority to license.
121 | 
122 |   j. Licensor means the individual(s) or entity(ies) granting rights
123 |      under this Public License.
124 | 
125 |   k. NonCommercial means not primarily intended for or directed towards
126 |      commercial advantage or monetary compensation. For purposes of
127 |      this Public License, the exchange of the Licensed Material for
128 |      other material subject to Copyright and Similar Rights by digital
129 |      file-sharing or similar means is NonCommercial provided there is
130 |      no payment of monetary compensation in connection with the
131 |      exchange.
132 | 
133 |   l. Share means to provide material to the public by any means or
134 |      process that requires permission under the Licensed Rights, such
135 |      as reproduction, public display, public performance, distribution,
136 |      dissemination, communication, or importation, and to make material
137 |      available to the public including in ways that members of the
138 |      public may access the material from a place and at a time
139 |      individually chosen by them.
140 | 
141 |   m. Sui Generis Database Rights means rights other than copyright
142 |      resulting from Directive 96/9/EC of the European Parliament and of
143 |      the Council of 11 March 1996 on the legal protection of databases,
144 |      as amended and/or succeeded, as well as other essentially
145 |      equivalent rights anywhere in the world.
146 | 
147 |   n. You means the individual or entity exercising the Licensed Rights
148 |      under this Public License. Your has a corresponding meaning.
149 | 
150 | 
151 | Section 2 -- Scope.
152 | 
153 |   a. License grant.
154 | 
155 |        1. Subject to the terms and conditions of this Public License,
156 |           the Licensor hereby grants You a worldwide, royalty-free,
157 |           non-sublicensable, non-exclusive, irrevocable license to
158 |           exercise the Licensed Rights in the Licensed Material to:
159 | 
160 |             a. reproduce and Share the Licensed Material, in whole or
161 |                in part, for NonCommercial purposes only; and
162 | 
163 |             b. produce, reproduce, and Share Adapted Material for
164 |                NonCommercial purposes only.
165 | 
166 |        2. Exceptions and Limitations. For the avoidance of doubt, where
167 |           Exceptions and Limitations apply to Your use, this Public
168 |           License does not apply, and You do not need to comply with
169 |           its terms and conditions.
170 | 
171 |        3. Term. The term of this Public License is specified in Section
172 |           6(a).
173 | 
174 |        4. Media and formats; technical modifications allowed. The
175 |           Licensor authorizes You to exercise the Licensed Rights in
176 |           all media and formats whether now known or hereafter created,
177 |           and to make technical modifications necessary to do so. The
178 |           Licensor waives and/or agrees not to assert any right or
179 |           authority to forbid You from making technical modifications
180 |           necessary to exercise the Licensed Rights, including
181 |           technical modifications necessary to circumvent Effective
182 |           Technological Measures. For purposes of this Public License,
183 |           simply making modifications authorized by this Section 2(a)
184 |           (4) never produces Adapted Material.
185 | 
186 |        5. Downstream recipients.
187 | 
188 |             a. Offer from the Licensor -- Licensed Material. Every
189 |                recipient of the Licensed Material automatically
190 |                receives an offer from the Licensor to exercise the
191 |                Licensed Rights under the terms and conditions of this
192 |                Public License.
193 | 
194 |             b. Additional offer from the Licensor -- Adapted Material.
195 |                Every recipient of Adapted Material from You
196 |                automatically receives an offer from the Licensor to
197 |                exercise the Licensed Rights in the Adapted Material
198 |                under the conditions of the Adapter's License You apply.
199 | 
200 |             c. No downstream restrictions. You may not offer or impose
201 |                any additional or different terms or conditions on, or
202 |                apply any Effective Technological Measures to, the
203 |                Licensed Material if doing so restricts exercise of the
204 |                Licensed Rights by any recipient of the Licensed
205 |                Material.
206 | 
207 |        6. No endorsement. Nothing in this Public License constitutes or
208 |           may be construed as permission to assert or imply that You
209 |           are, or that Your use of the Licensed Material is, connected
210 |           with, or sponsored, endorsed, or granted official status by,
211 |           the Licensor or others designated to receive attribution as
212 |           provided in Section 3(a)(1)(A)(i).
213 | 
214 |   b. Other rights.
215 | 
216 |        1. Moral rights, such as the right of integrity, are not
217 |           licensed under this Public License, nor are publicity,
218 |           privacy, and/or other similar personality rights; however, to
219 |           the extent possible, the Licensor waives and/or agrees not to
220 |           assert any such rights held by the Licensor to the limited
221 |           extent necessary to allow You to exercise the Licensed
222 |           Rights, but not otherwise.
223 | 
224 |        2. Patent and trademark rights are not licensed under this
225 |           Public License.
226 | 
227 |        3. To the extent possible, the Licensor waives any right to
228 |           collect royalties from You for the exercise of the Licensed
229 |           Rights, whether directly or through a collecting society
230 |           under any voluntary or waivable statutory or compulsory
231 |           licensing scheme. In all other cases the Licensor expressly
232 |           reserves any right to collect such royalties, including when
233 |           the Licensed Material is used other than for NonCommercial
234 |           purposes.
235 | 
236 | 
237 | Section 3 -- License Conditions.
238 | 
239 | Your exercise of the Licensed Rights is expressly made subject to the
240 | following conditions.
241 | 
242 |   a. Attribution.
243 | 
244 |        1. If You Share the Licensed Material (including in modified
245 |           form), You must:
246 | 
247 |             a. retain the following if it is supplied by the Licensor
248 |                with the Licensed Material:
249 | 
250 |                  i. identification of the creator(s) of the Licensed
251 |                     Material and any others designated to receive
252 |                     attribution, in any reasonable manner requested by
253 |                     the Licensor (including by pseudonym if
254 |                     designated);
255 | 
256 |                 ii. a copyright notice;
257 | 
258 |                iii. a notice that refers to this Public License;
259 | 
260 |                 iv. a notice that refers to the disclaimer of
261 |                     warranties;
262 | 
263 |                  v. a URI or hyperlink to the Licensed Material to the
264 |                     extent reasonably practicable;
265 | 
266 |             b. indicate if You modified the Licensed Material and
267 |                retain an indication of any previous modifications; and
268 | 
269 |             c. indicate the Licensed Material is licensed under this
270 |                Public License, and include the text of, or the URI or
271 |                hyperlink to, this Public License.
272 | 
273 |        2. You may satisfy the conditions in Section 3(a)(1) in any
274 |           reasonable manner based on the medium, means, and context in
275 |           which You Share the Licensed Material. For example, it may be
276 |           reasonable to satisfy the conditions by providing a URI or
277 |           hyperlink to a resource that includes the required
278 |           information.
279 |        3. If requested by the Licensor, You must remove any of the
280 |           information required by Section 3(a)(1)(A) to the extent
281 |           reasonably practicable.
282 | 
283 |   b. ShareAlike.
284 | 
285 |      In addition to the conditions in Section 3(a), if You Share
286 |      Adapted Material You produce, the following conditions also apply.
287 | 
288 |        1. The Adapter's License You apply must be a Creative Commons
289 |           license with the same License Elements, this version or
290 |           later, or a BY-NC-SA Compatible License.
291 | 
292 |        2. You must include the text of, or the URI or hyperlink to, the
293 |           Adapter's License You apply. You may satisfy this condition
294 |           in any reasonable manner based on the medium, means, and
295 |           context in which You Share Adapted Material.
296 | 
297 |        3. You may not offer or impose any additional or different terms
298 |           or conditions on, or apply any Effective Technological
299 |           Measures to, Adapted Material that restrict exercise of the
300 |           rights granted under the Adapter's License You apply.
301 | 
302 | 
303 | Section 4 -- Sui Generis Database Rights.
304 | 
305 | Where the Licensed Rights include Sui Generis Database Rights that
306 | apply to Your use of the Licensed Material:
307 | 
308 |   a. for the avoidance of doubt, Section 2(a)(1) grants You the right
309 |      to extract, reuse, reproduce, and Share all or a substantial
310 |      portion of the contents of the database for NonCommercial purposes
311 |      only;
312 | 
313 |   b. if You include all or a substantial portion of the database
314 |      contents in a database in which You have Sui Generis Database
315 |      Rights, then the database in which You have Sui Generis Database
316 |      Rights (but not its individual contents) is Adapted Material,
317 |      including for purposes of Section 3(b); and
318 | 
319 |   c. You must comply with the conditions in Section 3(a) if You Share
320 |      all or a substantial portion of the contents of the database.
321 | 
322 | For the avoidance of doubt, this Section 4 supplements and does not
323 | replace Your obligations under this Public License where the Licensed
324 | Rights include other Copyright and Similar Rights.
325 | 
326 | 
327 | Section 5 -- Disclaimer of Warranties and Limitation of Liability.
328 | 
329 |   a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
330 |      EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
331 |      AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
332 |      ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
333 |      IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
334 |      WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
335 |      PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
336 |      ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
337 |      KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
338 |      ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
339 | 
340 |   b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
341 |      TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
342 |      NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
343 |      INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
344 |      COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
345 |      USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
346 |      ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
347 |      DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
348 |      IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
349 | 
350 |   c. The disclaimer of warranties and limitation of liability provided
351 |      above shall be interpreted in a manner that, to the extent
352 |      possible, most closely approximates an absolute disclaimer and
353 |      waiver of all liability.
354 | 
355 | 
356 | Section 6 -- Term and Termination.
357 | 
358 |   a. This Public License applies for the term of the Copyright and
359 |      Similar Rights licensed here. However, if You fail to comply with
360 |      this Public License, then Your rights under this Public License
361 |      terminate automatically.
362 | 
363 |   b. Where Your right to use the Licensed Material has terminated under
364 |      Section 6(a), it reinstates:
365 | 
366 |        1. automatically as of the date the violation is cured, provided
367 |           it is cured within 30 days of Your discovery of the
368 |           violation; or
369 | 
370 |        2. upon express reinstatement by the Licensor.
371 | 
372 |      For the avoidance of doubt, this Section 6(b) does not affect any
373 |      right the Licensor may have to seek remedies for Your violations
374 |      of this Public License.
375 | 
376 |   c. For the avoidance of doubt, the Licensor may also offer the
377 |      Licensed Material under separate terms or conditions or stop
378 |      distributing the Licensed Material at any time; however, doing so
379 |      will not terminate this Public License.
380 | 
381 |   d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
382 |      License.
383 | 
384 | 
385 | Section 7 -- Other Terms and Conditions.
386 | 
387 |   a. The Licensor shall not be bound by any additional or different
388 |      terms or conditions communicated by You unless expressly agreed.
389 | 
390 |   b. Any arrangements, understandings, or agreements regarding the
391 |      Licensed Material not stated herein are separate from and
392 |      independent of the terms and conditions of this Public License.
393 | 
394 | 
395 | Section 8 -- Interpretation.
396 | 
397 |   a. For the avoidance of doubt, this Public License does not, and
398 |      shall not be interpreted to, reduce, limit, restrict, or impose
399 |      conditions on any use of the Licensed Material that could lawfully
400 |      be made without permission under this Public License.
401 | 
402 |   b. To the extent possible, if any provision of this Public License is
403 |      deemed unenforceable, it shall be automatically reformed to the
404 |      minimum extent necessary to make it enforceable. If the provision
405 |      cannot be reformed, it shall be severed from this Public License
406 |      without affecting the enforceability of the remaining terms and
407 |      conditions.
408 | 
409 |   c. No term or condition of this Public License will be waived and no
410 |      failure to comply consented to unless expressly agreed to by the
411 |      Licensor.
412 | 
413 |   d. Nothing in this Public License constitutes or may be interpreted
414 |      as a limitation upon, or waiver of, any privileges and immunities
415 |      that apply to the Licensor or You, including from the legal
416 |      processes of any jurisdiction or authority.
417 | 
418 | =======================================================================
419 | 
420 | Creative Commons is not a party to its public
421 | licenses. Notwithstanding, Creative Commons may elect to apply one of
422 | its public licenses to material it publishes and in those instances
423 | will be considered the “Licensor.” The text of the Creative Commons
424 | public licenses is dedicated to the public domain under the CC0 Public
425 | Domain Dedication. Except for the limited purpose of indicating that
426 | material is shared under a Creative Commons public license or as
427 | otherwise permitted by the Creative Commons policies published at
428 | creativecommons.org/policies, Creative Commons does not authorize the
429 | use of the trademark "Creative Commons" or any other trademark or logo
430 | of Creative Commons without its prior written consent including,
431 | without limitation, in connection with any unauthorized modifications
432 | to any of its public licenses or any other arrangements,
433 | understandings, or agreements concerning use of licensed material. For
434 | the avoidance of doubt, this paragraph does not form part of the
435 | public licenses.
436 | 
437 | Creative Commons may be contacted at creativecommons.org.
438 | 
439 | 


--------------------------------------------------------------------------------
/src/base.py:
--------------------------------------------------------------------------------
 1 | def enum(*sequential, **named):
 2 |     # Enum definition for backcompatibility
 3 |     enums = dict(zip(sequential, range(len(sequential))), **named)
 4 |     return type('Enum', (), enums)
 5 | 
 6 | # Enum to encode the which phase a layer is in at the moment.
 7 | Phase = enum('train', 'test', 'validation')
 8 | 
 9 | class BaseLayer:
10 |     
11 |     def __init__(self):
12 |         self.phase = Phase.train
13 |         
14 |     def forward(self, x):
15 |         """ Return the result of the forward pass of this layer. Save intermediate results
16 |         necessary to compute the gradients in the backward pass. 
17 |         """
18 |         raise NotImplementedError('Base class - method is not implemented')
19 |     
20 |     def backward(self, error):
21 |         """ Update the parameters/weights of this layer (if applicable), 
22 |         and return the gradient with respect to the input.
23 |         """
24 |         raise NotImplementedError('Base class - method is not implemented')


--------------------------------------------------------------------------------
/src/layers/activation_functions.py:
--------------------------------------------------------------------------------
 1 | class Sigmoid(BaseLayer):
 2 |     
 3 |     def forward(self, x):
 4 |         """ Return the element-wise sigmoid of the input.
 5 |             param: x (np.ndarray): input to the activation function, of arbitrary shape
 6 |             returns (np.ndarray): element-wise sigmoid(x), of the same shape as x
 7 |         """
 8 |         # TODO: Implement forward pass of the Sigmoid
 9 |         pass
10 |         
11 |     def backward(self, error):
12 |         """ Return the gradient with respect to the input.
13 |             param: error (np.ndarray): the gradient passed down from the subsequent layer, of the same 
14 |                    shape as x in the forward pass
15 |             returns (np.ndarray): the gradient with respect to the previous layer, of the same shape as error 
16 |         """
17 |         # TODO: Implement backward pass of the Sigmoid
18 |         pass
19 |     
20 | 
21 | class ReLU(BaseLayer):
22 |     
23 |     def forward(self, x):
24 |         """ Return the result of a ReLU activation of the input.
25 |             param: x (np.ndarray): input to the activation function, of arbitrary shape
26 |             returns (np.ndarray): element-wise ReLU(x), of the same shape as x
27 |         """
28 |         # TODO: Implement forward pass of the ReLU
29 |         pass
30 |     
31 |     def backward(self, error):
32 |         """ Return the gradient with respect to the input.
33 |             param: error (np.ndarray): the gradient passed down from the previous layer, arbitrary shape (same as x)
34 |             returns (np.ndarray): gradient with respect to the input, of the same shape as error 
35 |         """
36 |         # TODO: Implement backward pass of the ReLU
37 |         pass


--------------------------------------------------------------------------------
/src/layers/batch_normalization.py:
--------------------------------------------------------------------------------
 1 | class BatchNorm(BaseLayer):
 2 |     def __init__(self, learning_rate, convolutional=False):
 3 |         """ Batch normalization layer.
 4 |             param: learning_rate (float): the learning rate of this layer
 5 |             param: convolutional(boolean): if true, only a scalar mean and a scalar variance is 
 6 |                    calculated for every channel, otherwise mean and variance have the same dimension 
 7 |                    as the input
 8 |         """
 9 |         # TODO: Implement initialization
10 |         pass
11 | 
12 |     def forward(self, x):
13 |         """ Return the batch normalized input.
14 |             param: x(np.ndarray): input, of arbitrary shape
15 |             returns (np.ndarray): result of batch normalization, of the same shape as x
16 |         """
17 |         # TODO: Implement forward pass of the batch normalization layer
18 |         
19 |         # Hint 1: Make sure to treat training and test phase accordingly.
20 |         # Hint 2: If the network has never seen any training data, but is applied in "test mode", the network 
21 |         #         should not change the distribution of the input. Initialize the respective variable after the
22 |         #         first training input is received.
23 |         pass
24 | 
25 |     def backward(self, error):
26 |         """ Return the gradient with respect to the previous layer.
27 |             param: error(np.ndarray): error passed down from the subsequent layer, of the same shape as the input
28 |                    in the forward pass
29 |             returns (np.ndarray): gradient w.r.t. the input, of the same shape as error
30 |         """
31 |         # TODO: Implement backward pass of the batch normalization layer
32 |         pass
33 | 
34 |     def get_gradient_weights(self):
35 |         """ Returns the gradient with respect to the weights, i.e. \gamma, from the last call of backward() """
36 |         # TODO: Implement
37 |         pass
38 | 
39 |     def get_gradient_bias(self):
40 |         """ Returns the gradient with respect to the bias, i.e. \beta, from the last call of backward() """ 
41 |         # TODO: Implement
42 |         pass
43 | 


--------------------------------------------------------------------------------
/src/layers/conv.py:
--------------------------------------------------------------------------------
 1 | import Initializers
 2 | 
 3 | class FlattenLayer(BaseLayer):
 4 |     def __init__(self):
 5 |         # TODO: define the necessary class variables
 6 |         pass
 7 |     
 8 |     def forward(self, x):
 9 |         """ Return a flattened version of the input.
10 |             param: x (np.ndarray): input, of shape [b, n_channels, p, q] where b is the batch size, 
11 |                    n_channels is the number of channels and p x q is the image size
12 |             returns (np.ndarray): a flattened representation of x of shape [b, v] 
13 |                    where b is the batch size and v is the output size = n_channels * p * q
14 |         """
15 |         # TODO: Implement flattening of the image
16 |         pass
17 |     
18 |     def backward(self, error):
19 |         """ Return the gradient with respect to the input.
20 |             param: error (np.ndarray): the gradient passed down from the subsequent layer, of shape [b, m],
21 |                    where b is the batch size and m is the output size with m = n_channels * p * q from 
22 |                    the forward pass
23 |             returns (np.ndarray): the error with restored dimensions from the forward pass, i.e. with 
24 |                    shape [b, n_channels, p, q] where b is the batch size, n_channels is the number of 
25 |                    channels and p x q is the image size
26 |         """
27 |         # TODO: Restore the image dimensions
28 |         pass
29 | 
30 | 
31 | class ConvolutionalLayer(BaseLayer):
32 |     
33 |     def __init__(self, stride_shape, kernel_shape, n_kernels, learning_rate, weights_initializer=UniformRandom(), bias_initializer=Const(0.1)):
34 |         """ 
35 |             param: stride: tuple in the form of (np, nq) which denote the subsampling factor of the 
36 |                    convolution operation in the spatial dimensions
37 |             param: kernel_shape: integer tuple in the form of (n_channels, m, n) where n_channels is 
38 |                    the number of input channels and m x n is the size of the filter kernels
39 |             param: n_kernels (int): number of kernels and therefore the number of output channels
40 |             param: learning_rate (float): learning rate of this layer
41 |             param: weights_initializer: initializer object for the filter weights
42 |             param: bias_initializer: initializer object for the bias
43 |         """
44 |         # TODO: define the neccesary class variables, initialize the weights and bias
45 |         self.weights = ...
46 |         self.bias = ...
47 |         pass 
48 |     
49 |     def forward(self, x):
50 |         """ Return the result of the forward pass of the convolutional layer.
51 |             param: x(np.ndarray): input, of shape [b, n_channels, p, q],  where b is the batch size, 
52 |                    n_channels is the number of input channels and p x q is the image size
53 |             returns (np.ndarray): result of the forward pass, of shape (b, n_kernels, p', q') 
54 |                    where b is the batch size, n_kernels is the number of kernels in this layer and 
55 |                    p' x q' is the output image size (which depends on the stride)
56 |         """
57 |         # TODO: Implement forward pass of the convolutional layer
58 |         pass
59 |     
60 |     def backward(self, error):
61 |         """ Update the weights of this layer and return the gradient with respect to the input.
62 |             param: error (np.ndarray): of shape (b, n_kernels, p', q') where b is the batch size, n_kernels
63 |                    is the number of kernels and p' x q' is the spacial error size (depends on the stride)
64 |             returns (np.ndarray): the gradient with respect to the input, of shape (b, n_channels, p, q) 
65 |                    where b is the batch size, n_channels is the number of input channels to this layer and 
66 |                    p x q is the image size.
67 |         """ 
68 |         # TODO: Implement backward pass of the convolutional layer
69 |         pass
70 |     
71 |     def get_gradient_weights(self):
72 |         """ Returns the gradient with respect to the weights from the last call of backward() """
73 |         # TODO: Implement
74 |         pass
75 | 
76 |     def get_gradient_bias(self):
77 |         """ Returns the gradient with respect to the bias from the last call of backward() """
78 |         # TODO: Implement
79 |         pass
80 |     
81 |     def initialize(self, weights_initializer, bias_initializer):
82 |         """ Initializes the weights/bias of this layer with the given initializers.
83 |             param: weights_initializer: object providing a method weights_initializer.initialize(weights_shape)
84 |                    which will return initialized weights with the given shape
85 |             param: bias_initializer: object providing a method bias_initializer.initialize(bias_shape) 
86 |                    which will return an initialized bias with the given shape
87 |         """
88 |         # TODO: Implement. To make sure that He initialization works as intended, make sure the second dimension 
89 |         # of weights_shape contains the number of input nodes that can be computed as n_in = n_channels * m * n
90 |         # and reshape the weights to the correct shape afterwards.
91 |         pass


--------------------------------------------------------------------------------
/src/layers/dropout.py:
--------------------------------------------------------------------------------
 1 | class DropOut(BaseLayer):
 2 |     
 3 |     def __init__(self, probability):
 4 |         """ DropOut Layer.
 5 |             param: probability: probability of each individual activation to be set to zero, in range [0, 1]    
 6 |         """
 7 |         # TODO: Implement initialization
 8 |         
 9 |         pass
10 |     
11 |     def forward(self, x):
12 |         """ Forward pass through the layer: Set activations of the input randomly to zero.
13 |             param: x (np.ndarray): input
14 |             returns (np.ndarray): a new array of the same shape as x, after dropping random elements
15 |         """
16 |         # TODO: Implement forward pass of the Dropout layer
17 |         # Hint: Make sure to treat training and test phase accordingly.
18 |         pass
19 |     
20 |     def backward(self, error):
21 |         """ Backward pass through the layer: Return the gradient with respect to the input.
22 |             param: error (np.ndarray): error passed down from the subsequent layer, of the same shape as the 
23 |                    output of the forward pass
24 |             returns (np.ndarray):  gradient with respect to the input, of the same shape as error
25 |         """
26 |         # TODO: Implement backward pass of the Dropout layer
27 |         pass


--------------------------------------------------------------------------------
/src/layers/fully_connected.py:
--------------------------------------------------------------------------------
 1 | class FullyConnectedLayer(BaseLayer):
 2 |     def __init__(self, input_size, output_size, learning_rate):
 3 |         """ A fully connected layer.
 4 |             param: input_size (int): dimension n of the input vector
 5 |             param: output_size (int): dimension m of the output vector
 6 |             param: learning_rate (float): the learning rate of this layer
 7 |         """
 8 |         # TODO: define the neccesary class variables
 9 |         pass
10 | 
11 |     def forward(self, x):
12 |         """ Compute the foward pass through the layer.
13 |             param: x (np.ndarray): input with shape [b, n] where b is the batch size and n is the input size
14 |             returns (np.ndarray): result of the forward pass, of shape [b, m] where b is the batch size and
15 |                    m is the output size
16 |         """
17 |         # TODO: Implement forward pass of the fully connected layer
18 |         # Hint: Think about what you need to store during the forward pass to be able to compute 
19 |         # the gradients in the backward pass 
20 |         pass
21 |     
22 |     def get_gradient_weights(self):
23 |         """ 
24 |         returns (np.ndarray): the gradient with respect to the weights and biases from the last call of backward(...)
25 |         """
26 |         # TODO: Implement 
27 |         pass
28 |     
29 |     def backward(self, error):
30 |         """ Update the weights of this layer and return the gradient with respect to the previous layer.
31 |             param: error (np.ndarray): of shape [b, m] where b is the batch size and m is the output size
32 |             returns (np.ndarray): the gradient w.r.t. the previous layer, of shape [b, n] where b is the 
33 |                    batch size and n is the input size
34 |         """
35 |         # TODO: Implement backward pass of the fully connected layer
36 |         # Hint: Be careful about the order of applying the update to the weights and the calculation of 
37 |         # the error with respect to the previous layer.
38 |         pass
39 |     
40 |     def initialize(self, weights_initializer, bias_initializer):
41 |         """ Initializes the weights/bias of this layer with the given initializers.
42 |             param: weights_initializer: object providing a method weights_initializer.initialize(weights_shape)
43 |                    which will return initialized weights with the given shape
44 |             param: bias_initializer: object providing a method bias_initializer.initialize(bias_shape) 
45 |                    which will return an initialized bias with the given shape
46 |         """
47 |         # TODO: Implement
48 |         pass


--------------------------------------------------------------------------------
/src/layers/initializers.py:
--------------------------------------------------------------------------------
 1 | class Initializer:
 2 |     """ Base class for initializers. """
 3 |     def initialize(self, weight_shape):
 4 |         """ Return weights initialized according to the subclass definition. 
 5 |             Required to work for arbitrary weight shapes.
 6 |             Base class. 
 7 |         """
 8 |         
 9 |         # Raises an exeption in base class.
10 |         raise NotImplementedError('Method is not implemented')
11 | 
12 |         
13 | class Const(Initializer):
14 |     
15 |     def __init__(self, value):
16 |         """ Create a constant initializer.
17 |             params: value (float): constant that is used for initialization of weights
18 |         """
19 |         # TODO: Implement
20 |         pass
21 | 
22 |     def initialize(self, weight_shape):
23 |         """ Return a new array of weights initialized with a constant value provided by self.value.
24 |             param: weight_shape: shape of the new array
25 |             returns (np.ndarray): array of the given shape
26 |         """
27 |         # TODO: Implement
28 |         pass
29 | 
30 | class UniformRandom(Initializer):
31 |     
32 |     def initialize(self, weight_shape):
33 |         """ Return a new array of weights initialized by drawing from a uniform distribution with range [0, 1].
34 |             param: weight_shape: shape of new array
35 |             returns (np.ndarray): array of the given shape
36 |         """
37 |         # TODO: Implement
38 |         pass
39 | 
40 |         
41 | class He(Initializer):
42 |        
43 |     def initialize(self, weight_shape):
44 |         """ Return a new array of weights initialized according to He et al.: Delving Deep into Rectifiers.
45 |             param: weight_shape: shape of the np.array to be returned, the second dimension is assumed to be the 
46 |                    number of input nodes
47 |             returns (np.ndarray): array of the given shape
48 |         """        
49 |         # TODO: Implement
50 |         pass
51 |         


--------------------------------------------------------------------------------
/src/layers/pooling.py:
--------------------------------------------------------------------------------
 1 | class MaxPoolLayer(BaseLayer):
 2 |     
 3 |     def __init__(self, neighborhood=(2, 2), stride=(2, 2)):
 4 |         """ Max pooling layer.
 5 |            param: neighborhood: tuple with shape (sp, sq) which denote the kernel size of the pooling operation in 
 6 |            the spatial dimensions
 7 |            param: stride: tuple with shape (np, nq) which denote the subsampling factor of the pooling operation in
 8 |            the spacial dimensions
 9 |         """
10 |         # TODO: define necessary class variables
11 |         pass
12 |     
13 |     def forward(self, x):
14 |         """ Return the result of maxpooling on the input.
15 |             param: x (np.ndarray) with shape (b, n_channels, p, q) where b is the batch size, 
16 |                    n_channels is the number of input channels and p x q is the image size
17 |             returns (np.ndarray): the result of max pooling, of shape (b, n_channels, p', q')
18 |                    where b is the batch size, n_channels is the number of input channels and 
19 |                    p' x q' is the new image size reduced by the stride. 
20 |         """
21 |         # TODO: Implement forward pass of max pooling
22 |         pass
23 |     
24 |     def backward(self, error):
25 |         """ Return the gradient with respect to the previous layer.
26 |             param: error(np.ndarray): the gradient passed own from the subsequent layer, 
27 |                    of shape [b, n_channels, p', q'] where b is the batch size, n_channels is the 
28 |                    number of channels and p' x q' is the image size reduced by the stride
29 |             returns (np.ndarray): the gradient w.r.t. the previous layer, of shape [b, n_channels, p, q] 
30 |                    where b is the batch size, n_channels is the number of input channels to this layer and 
31 |                    p x q is the image size prior to downsampling.
32 |         """
33 |         # TODO: Implement backward pass of max pooling
34 |         pass


--------------------------------------------------------------------------------
/src/layers/softmax_crossentropy.py:
--------------------------------------------------------------------------------
 1 | class SoftMaxCrossEntropyLoss(BaseLayer):
 2 |     
 3 |     def forward(self, x, labels):
 4 |         """ Return the cross entropy loss of the input and the labels after applying the softmax to the input. 
 5 |             param: x (np.ndarray): input, of shape [b, k] where b is the batch size and k is the input size
 6 |             param: labels (np.ndarray): the corresponding labels of the training set in one-hot encoding for 
 7 |                    the current input, of the same shape as x
 8 |             returns (float): the loss of the current prediction and the label
 9 |         """
10 |         # Todo: Implement forward pass
11 |         pass
12 |     
13 |     def backward(self, labels):
14 |         """ Return the gradient of the SoftMaxCrossEntropy loss with respect to the previous layer.
15 |             param: labels (np.ndarray): (again) the corresponding labels of the training set for the current input, 
16 |                    of shape [b, k] where b is the batch size and k is the input size
17 |             returns (np.ndarray): the error w.r.t. the previous layer, of shape [b, k] where b is the batch 
18 |                    size and n is the input size
19 |         """
20 |         # TODO: Implement backward pass
21 |         pass
22 |     
23 |     def predict(self, x):
24 |         """ Return the softmax of the input.  This can be interpreted as probabilistic prediction of the class.
25 |             param: x (np.ndarray): input with shape [b, k], where b is the batch size and n is the input size
26 |             returns (np.ndarray): the result softmax(x), of the same shape as x
27 |         """
28 |         # TODO: Implement softmax
29 |         pass


--------------------------------------------------------------------------------
/src/network.py:
--------------------------------------------------------------------------------
 1 | import numpy
 2 | from src.base import Phase
 3 | # Nothing to do in this cell: Just make yourself familiar with the NeuralNetwork class.
 4 | 
 5 | 
 6 | class NeuralNetwork:
 7 |     def __init__(self, weights_initializer, bias_initializer):
 8 |         # list which will contain the loss after training
 9 |         self.loss = []
10 |         self.data_layer = None   # the layer providing data
11 |         self.loss_layer = None   # the layer calculating the loss and the prediction
12 |         self.layers = []
13 |         self.weights_initializer = weights_initializer
14 |         self.bias_initializer = bias_initializer
15 |         self.label_tensor = None # the labels of the current iteration
16 | 
17 |     def append_fixed_layer(self, layer):
18 |         """ Add a non-trainable layer to the network. """
19 |         self.layers.append(layer)
20 |     
21 |     def append_trainable_layer(self, layer):
22 |         """ Add a new layer with trainable parameters to the network. Initialize the parameters of 
23 |         the network using the object's initializers for weights and bias.
24 |         """
25 |         layer.initialize(self.weights_initializer, self.bias_initializer)
26 |         self.layers.append(layer)
27 | 
28 |     def forward(self):
29 |         """ Compute the forward pass through the network. """
30 |         # fetch some training data
31 |         input_tensor, self.label_tensor = self.data_layer.forward()
32 |         # defer iterating through the network
33 |         activation_tensor = self.__forward_input(input_tensor)
34 |         # calculate the loss of the network using the final loss layer
35 |         return self.loss_layer.forward(activation_tensor, self.label_tensor)
36 | 
37 |     def __forward_input(self, input_tensor):
38 |         """ Compute the forward pass through the network, stopping before the 
39 |             loss layer.
40 |             param: input_tensor (np.ndarray): input to the network
41 |             returns: activation of the last "regular" layer
42 |         """
43 |         activation_tensor = input_tensor
44 |         # pass the input up the network
45 |         for layer in self.layers:
46 |             activation_tensor = layer.forward(activation_tensor)
47 |         # return the activation of the last layer
48 |         return activation_tensor
49 | 
50 |     def backward(self):
51 |         """ Perform the backward pass during training. """
52 |         error_tensor = self.loss_layer.backward(self.label_tensor)
53 |         # pass back the error recursively
54 |         for layer in reversed(self.layers):
55 |             error_tensor = layer.backward(error_tensor)
56 | 
57 |     def train(self, iterations):
58 |         """ Train the network for a fixed number of steps.
59 |             param: iterations (int): number of iterations for training 
60 |         """
61 |         for layer in self.layers:
62 |             layer.phase = Phase.train  # Make sure phase is set to "train" for all layers
63 |         for i in range(iterations):
64 |             loss = self.forward()  # go up the network
65 |             self.loss.append(loss)  # save the loss
66 |             self.backward()  # and down again
67 |             print('.', end='')
68 | 
69 | 
70 |     def test(self, input_tensor):
71 |         """ Apply the (trained) network to input data to generate a prediction. 
72 |             param: input_tensor (nd.nparray): input (image or vector)
73 |             returns (np.ndarray): prediction by the network
74 |         """
75 |         for layer in self.layers:
76 |             layer.phase = Phase.test  # Make sure phase is set to "test" for all layers
77 |         activation_tensor = self.__forward_input(input_tensor)
78 |         return self.loss_layer.predict(activation_tensor)


--------------------------------------------------------------------------------
/tutorial_dl.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {
   6 |     "deletable": false,
   7 |     "editable": false
   8 |    },
   9 |    "source": [
  10 |     "# Tutorial: How to Build a Deep Learning Framework\n",
  11 |     "\n",
  12 |     "by Katharina Breininger and Tobias Würfl\n",
  13 |     "\n",
  14 |     "Pattern Recognition Lab, Friedrich-Alexander-University Erlangen-Nürnberg, Erlangen, Germany "
  15 |    ]
  16 |   },
  17 |   {
  18 |    "cell_type": "markdown",
  19 |    "metadata": {
  20 |     "deletable": false,
  21 |     "editable": false
  22 |    },
  23 |    "source": [
  24 |     "## Introduction\n",
  25 |     "\n",
  26 |     "Neural networks, especially convolutional neural networks (CNNs), have had an incredible impact on research in medical imaging and medical signal processing in recent years. Frameworks like TensorFlow, Caffe and PyTorch make it easy to implement network architectures  to carry out experiments by simply stacking together operators. This has helped to speed up research immensely - it is easy to try out new ideas and translate insights from other fields. BUT: Never having to understand the technical details of the frameworks and operators deprives researchers of one avenue to potential innovation in the field. Improvements like trainable region-proposals and depth-wise separable convolutions are easier to come up with a thorough understanding the details of the machinery. Many essential advances in DL, such as the ReLU, batch normalization and better initialization strategies, have originated in understanding and improving drawbacks of building blocks in neural networks.\n",
  27 |     "\n",
  28 |     "With this tutorial, we aim to support you in understanding what's going in neural network frameworks in detail, and teach you how the most common operators work during inference in a network and how they are adapted by training. This will enable you to implement a broader range of ideas, relying on innovative new operators embedded into neural networks. \n",
  29 |     "\n",
  30 |     "As prerequisites we expect some conceptual knowledge about neural networks as acquired in offline or online courses, like the [Stanford DL course](http://cs231n.stanford.edu/), or our course ([DL_course_videos](https://www.video.uni-erlangen.de/clip/id/8947)), as well as basic Python/NumPy programming experience.\n",
  31 |     "\n",
  32 |     "- How it works:\n",
  33 |     "    - We provide the necessary math and code skeletons of building blocks\n",
  34 |     "    - You translate this math into code\n",
  35 |     "    - Our unit-tests will give you feedback on the correctness of your implementation\n",
  36 |     "    - At the end, we will put these building blocks together to a working network\n",
  37 |     "- What we don't do:\n",
  38 |     "    - Teach you Python programming\n",
  39 |     "    - Teach you about the fundamentals of machine learning\n",
  40 |     "    - Give a thorough introduction into the subject of deep learning\n",
  41 |     "    - Implement a framework with a focus on performance and efficiency\n",
  42 |     "- Elements in this tutorial\n",
  43 |     "    - Implementing a multilayer perceptron framework\n",
  44 |     "    - Extending this framework with state-of-the-art initialization\n",
  45 |     "    - Adding the basic operators of CNNs\n",
  46 |     "    - Including some operators for regularization to the framework\n",
  47 |     "    \n",
  48 |     "If you have feedback or suggestions for improvement, please contact us at katharina.breininger@fau.de and tobias.wuerfl@fau.de. The most recent version can be found at https://github.com/kbreininger/tutorial-dlframework.\n",
  49 |     "\n",
  50 |     "Have fun!"
  51 |    ]
  52 |   },
  53 |   {
  54 |    "cell_type": "code",
  55 |    "execution_count": null,
  56 |    "metadata": {
  57 |     "collapsed": true
  58 |    },
  59 |    "outputs": [],
  60 |    "source": [
  61 |     "# minor set-up work\n",
  62 |     "import numpy  # we will definitely need this\n",
  63 |     "\n",
  64 |     "# automatic reloading\n",
  65 |     "%load_ext autoreload\n",
  66 |     "%autoreload 2\n",
  67 |     "\n",
  68 |     "%matplotlib inline"
  69 |    ]
  70 |   },
  71 |   {
  72 |    "cell_type": "markdown",
  73 |    "metadata": {
  74 |     "deletable": false,
  75 |     "editable": false
  76 |    },
  77 |    "source": [
  78 |     "## The General Idea of the Framework\n",
  79 |     "<a id='network_description'></a>\n",
  80 |     "\n",
  81 |     "Almost all tasks in this tutorial will revolve around implementing \"layers\". All layers are derived from the base class defined in the next cell. Each layer needs to implement the methods ```forward``` and ```backward```. We will use the term \"layer\" to represent any operator in the network that can be considered as a \"unit\" during forward and backward pass, e.g., a \"fully connected layer\", an \"activation layer\" or a \"loss layer\". \n",
  82 |     "\n",
  83 |     "In ```forward(x)```, the forward pass of the layer is computed by applying the respective operation to the input ```x```. Furthermore, intermediate results necessary to compute the gradients in the backward pass have to be stored. \n",
  84 |     "In ```backward(error)```, the layer receives the error passed down from the subsequent layer, updates its parameters accordingly and returns the error with respect to its input.\n",
  85 |     "\n",
  86 |     "This way, a simple network for classification can be expressed by a list of layer objects. Given an initial input ```x``` and a corresponding ```label```, the forward pass through the network is computed by subsequently calling ```forward``` for each layer in the list. The respective output is passed as input to the next layer. The very last layer, the \"loss\" layer, additionally receives the label to compute the loss. To adapt the weights in each layer, we then go backwards through the list, calling ```backward```, backpropagating the error through the network. The network is trained by alternating the forward and backward pass through the network while iterating through the training data.\n",
  87 |     "\n",
  88 |     "During test-time, only the forward pass through the network is computed to generate a prediction.\n",
  89 |     "\n",
  90 |     "### Basic notation and terminology\n",
  91 |     "\n",
  92 |     "We will work with the following notation and terminology:\n",
  93 |     "\n",
  94 |     "- $\\mathbf{X}$ and $\\mathbf{x}$ represent the input, \n",
  95 |     "- $\\mathbf{W}$ and $\\mathbf{w}$ the trainable weights/parameters and\n",
  96 |     "- $\\mathbf{Y}$ and $\\mathbf{y}$ the output of a layer.\n",
  97 |     "- $L$ represents the loss. Accordingly,\n",
  98 |     "- $E_\\mathbf{Y} = \\frac{\\partial L}{\\partial \\mathbf{Y}}$ is the error passed down from the subsequent layer,\n",
  99 |     "- $E_\\mathbf{W} = \\frac{\\partial L}{\\partial \\mathbf{W}}$ the error with respect to the weights and\n",
 100 |     "- $E_\\mathbf{X} = \\frac{\\partial L}{\\partial \\mathbf{X}}$ is the error with respect to the input.\n",
 101 |     "\n",
 102 |     "Note that $x$ and $y$ always have \"local\" meaning, i.e., with respect to the __current__ layer. The $y$ of the previous layer is the $x$ to the next, and vice versa.\n",
 103 |     "\n",
 104 |     "\n",
 105 |     "Have a look at the class definitions below and make yourself familiar with the concepts before continuing with the next part of the tutorial, the fully connected layer."
 106 |    ]
 107 |   },
 108 |   {
 109 |    "cell_type": "code",
 110 |    "execution_count": null,
 111 |    "metadata": {
 112 |     "collapsed": true,
 113 |     "deletable": false,
 114 |     "editable": false
 115 |    },
 116 |    "outputs": [],
 117 |    "source": [
 118 |     "# %load src/base.py\n",
 119 |     "def enum(*sequential, **named):\n",
 120 |     "    # Enum definition for backcompatibility\n",
 121 |     "    enums = dict(zip(sequential, range(len(sequential))), **named)\n",
 122 |     "    return type('Enum', (), enums)\n",
 123 |     "\n",
 124 |     "# Enum to encode the which phase a layer is in at the moment.\n",
 125 |     "Phase = enum('train', 'test', 'validation')\n",
 126 |     "\n",
 127 |     "class BaseLayer:\n",
 128 |     "    \n",
 129 |     "    def __init__(self):\n",
 130 |     "        self.phase = Phase.train\n",
 131 |     "        \n",
 132 |     "    def forward(self, x):\n",
 133 |     "        \"\"\" Return the result of the forward pass of this layer. Save intermediate results\n",
 134 |     "        necessary to compute the gradients in the backward pass. \n",
 135 |     "        \"\"\"\n",
 136 |     "        raise NotImplementedError('Base class - method is not implemented')\n",
 137 |     "    \n",
 138 |     "    def backward(self, error):\n",
 139 |     "        \"\"\" Update the parameters/weights of this layer (if applicable), \n",
 140 |     "        and return the gradient with respect to the input.\n",
 141 |     "        \"\"\"\n",
 142 |     "        raise NotImplementedError('Base class - method is not implemented')"
 143 |    ]
 144 |   },
 145 |   {
 146 |    "cell_type": "markdown",
 147 |    "metadata": {
 148 |     "deletable": false,
 149 |     "editable": false
 150 |    },
 151 |    "source": [
 152 |     "## Fully Connected Layers\n",
 153 |     "\n",
 154 |     "Fully connected (FC) layers are the essential building blocks in (multi-layer) perceptrons. Inspired by biological neurons, they are able to represent any connection topology between two layers (without same-layer connections).\n",
 155 |     "\n",
 156 |     "<img src=\"img/ann.png\" width=\"600\">\n",
 157 |     "\n",
 158 |     "Let's have a look at the forward pass: Given an input vector $\\mathbf{x} \\in \\mathbb{R}^{n}$ to an FC layer, the output $y$ of a single neuron can be described as a weighted sum of the input values plus a bias:\n",
 159 |     "\\begin{equation}\n",
 160 |     "y = w_{n+1} + \\sum_{j=1}^n w_j x_j ,\n",
 161 |     "\\end{equation}\n",
 162 |     "\n",
 163 |     "where we collect the weights in a vector $\\mathbf{w} \\in \\mathbb{R}^{n + 1}$.\n",
 164 |     "\n",
 165 |     "This is simply a vector-vector multiplication: \n",
 166 |     "\n",
 167 |     "\\begin{equation}\n",
 168 |     "y = \\begin{pmatrix} \n",
 169 |     "  w_{1}&\\dots&w_{n}&w_{n+1} \\end{pmatrix}\n",
 170 |     "\\begin{pmatrix} \n",
 171 |     "  x_{1}    \\\\ \n",
 172 |     "  \\vdots \\\\\n",
 173 |     "  x_{n} \\\\\n",
 174 |     "  1\n",
 175 |     "\\end{pmatrix}\n",
 176 |     "\\end{equation}\n",
 177 |     "\n",
 178 |     "By extending $\\mathbf{x}$ with an additional \"1\", we can include the bias directly in the multiplication. \n",
 179 |     "\n",
 180 |     "\n",
 181 |     "Since we want to have a layer able to generate multiple outputs, we need multiple neurons:\n",
 182 |     "\n",
 183 |     "<img src=\"img/fcn.png\" width=\"150\">\n",
 184 |     "\n",
 185 |     "To achieve this, we extend the weight vector to a matrix to allow for an output vector $\\mathbf{y} \\in \\mathbb{R}^{m}$:\n",
 186 |     "\n",
 187 |     "\\begin{align}\n",
 188 |     "\\begin{pmatrix} \n",
 189 |     "y_1    \\\\ \n",
 190 |     "\\vdots \\\\\n",
 191 |     "y_m\n",
 192 |     "\\end{pmatrix} &=\n",
 193 |     "\\begin{pmatrix} \n",
 194 |     "w_{1,1}    & \\dots & w_{n,1} & w_{n+1,1} \\\\\n",
 195 |     "\\vdots & \\ddots & \\vdots & \\vdots \\\\%\n",
 196 |     "w_{1,m}    & \\dots & w_{n,m} & w_{n+1,m}\n",
 197 |     "\\end{pmatrix}\n",
 198 |     "\\begin{pmatrix} \n",
 199 |     "x_1    \\\\ \n",
 200 |     "\\vdots \\\\\n",
 201 |     "x_n\t \\\\\n",
 202 |     "1\n",
 203 |     "\\end{pmatrix}\\\\\n",
 204 |     "\\mathbf{y} &= \\mathbf{W}\\mathbf{x} \n",
 205 |     "\\end{align}\n",
 206 |     "\n",
 207 |     "For batch processing, we can accordingly stack multiple input vectors in a matrix $\\mathbf{X}$:\n",
 208 |     "\n",
 209 |     "\\begin{equation}\n",
 210 |     "\\mathbf{Y} = \\mathbf{W}\\mathbf{X}\n",
 211 |     "\\end{equation}\n",
 212 |     "\n",
 213 |     "The weight matrix represents the trainable parameters of the FC layer. To be able to update the parameters, we need the gradient of the loss with respect to these weights.\n",
 214 |     "Given the error with respect to the output $\\mathbf{Y}$ of the current layer $\\frac{\\partial L}{\\partial \\mathbf{Y}} = E_\\mathbf{Y}$, we can compute the gradient with respect to the weights $\\frac{\\partial L}{\\partial \\mathbf{W}} = E_\\mathbf{W}$ using backpropagation, i.e., the chain rule. To backpropagate the error to the previous layer (and then update the weights there), we further need to compute the error with respect to the inputs $\\frac{\\partial L}{\\partial \\mathbf{X}} = E_\\mathbf{X}$.\n",
 215 |     "\n",
 216 |     "Using the formula of the fully connected layer $\\mathbf{Y} = \\mathbf{W}\\mathbf{X}$, we can compute the wanted gradients:\n",
 217 |     "\n",
 218 |     "\\begin{align}\n",
 219 |     "\\frac{\\partial L}{\\partial \\mathbf{W}} &= \\frac{\\partial L}{\\partial \\mathbf{Y}} \\frac{\\partial \\mathbf{Y}}{\\partial \\mathbf{W}}\\\\\n",
 220 |     "                              &= E_\\mathbf{Y} \\mathbf{X}^T\\\\\n",
 221 |     "\\end{align}\n",
 222 |     "\n",
 223 |     "\\begin{align}\n",
 224 |     "\\frac{\\partial L}{\\partial \\mathbf{X}} &= \\frac{\\partial L}{\\partial \\mathbf{Y}} \\frac{\\partial \\mathbf{Y}}{\\partial \\mathbf{X}}\\\\\n",
 225 |     "                              &= \\mathbf{W}^T E_\\mathbf{Y}\\\\\n",
 226 |     "\\end{align}\n",
 227 |     "\n",
 228 |     "We will use (mini-batch) stochastic gradient descent in this tutorial, so the update rule for the weights is as follows:\n",
 229 |     "\n",
 230 |     "\\begin{equation}\n",
 231 |     "\\mathbf{W}^{t+1} = \\mathbf{W}^{t} - \\eta E_{\\mathbf{W}^t} \\enspace{,}\n",
 232 |     "\\end{equation}\n",
 233 |     "\n",
 234 |     "where $\\eta$ is the learning rate and ${t}$ denotes the iteration.\n",
 235 |     "\n",
 236 |     "\n",
 237 |     "### Implementation task\n",
 238 |     "\n",
 239 |     "**Now it is your turn**: In the next cell, implement the methods ```init```, ```forward```, ```backward```, and ```get_gradient_weights``` and test the method by running the cell after the next. The method ```get_gradient_weights``` should return the gradient with respect to the weights and biases of the last backward pass.\n",
 240 |     "\n",
 241 |     "**Note that input and output, and accordingly the respective errors, are actually transposed compared to the formulas above**. This is due to performance reasons and consistency with known frameworks. Make sure to consider this in your implementation.\n",
 242 |     "\n",
 243 |     "Furthermore, implement the method ```initialize```. For the moment, take the initializer objects as given, we will return to them later. Just make sure to use them with the correct weight shapes to initialize weights and biases. Implement the update of these parameters as part of the backward pass."
 244 |    ]
 245 |   },
 246 |   {
 247 |    "cell_type": "code",
 248 |    "execution_count": null,
 249 |    "metadata": {
 250 |     "collapsed": true
 251 |    },
 252 |    "outputs": [],
 253 |    "source": [
 254 |     "# %load src/layers/fully_connected.py\n",
 255 |     "class FullyConnectedLayer(BaseLayer):\n",
 256 |     "    def __init__(self, input_size, output_size, learning_rate):\n",
 257 |     "        \"\"\" A fully connected layer.\n",
 258 |     "            param: input_size (int): dimension n of the input vector\n",
 259 |     "            param: output_size (int): dimension m of the output vector\n",
 260 |     "            param: learning_rate (float): the learning rate of this layer\n",
 261 |     "        \"\"\"\n",
 262 |     "        # TODO: define the neccesary class variables\n",
 263 |     "        self.weights = ... #\n",
 264 |     "        pass\n",
 265 |     "\n",
 266 |     "    def forward(self, x):\n",
 267 |     "        \"\"\" Compute the foward pass through the layer.\n",
 268 |     "            param: x (np.ndarray): input with shape [b, n] where b is the batch size and n is the input size\n",
 269 |     "            returns (np.ndarray): result of the forward pass, of shape [b, m] where b is the batch size and\n",
 270 |     "                   m is the output size\n",
 271 |     "        \"\"\"\n",
 272 |     "        # TODO: Implement forward pass of the fully connected layer\n",
 273 |     "        # Hint: Think about what you need to store during the forward pass to be able to compute \n",
 274 |     "        # the gradients in the backward pass \n",
 275 |     "        pass\n",
 276 |     "    \n",
 277 |     "    def get_gradient_weights(self):\n",
 278 |     "        \"\"\" \n",
 279 |     "        returns (np.ndarray): the gradient with respect to the weights and biases from the last call of backward(...)\n",
 280 |     "        \"\"\"\n",
 281 |     "        # TODO: Implement \n",
 282 |     "        pass\n",
 283 |     "    \n",
 284 |     "    def backward(self, error):\n",
 285 |     "        \"\"\" Update the weights of this layer and return the gradient with respect to the previous layer.\n",
 286 |     "            param: error (np.ndarray): of shape [b, m] where b is the batch size and m is the output size\n",
 287 |     "            returns (np.ndarray): the gradient w.r.t. the previous layer, of shape [b, n] where b is the \n",
 288 |     "                   batch size and n is the input size\n",
 289 |     "        \"\"\"\n",
 290 |     "        # TODO: Implement backward pass of the fully connected layer\n",
 291 |     "        # Hint: Be careful about the order of applying the update to the weights and the calculation of \n",
 292 |     "        # the error with respect to the previous layer.\n",
 293 |     "        pass\n",
 294 |     "    \n",
 295 |     "    def initialize(self, weights_initializer, bias_initializer):\n",
 296 |     "        \"\"\" Initializes the weights/bias of this layer with the given initializers.\n",
 297 |     "            param: weights_initializer: object providing a method weights_initializer.initialize(weights_shape)\n",
 298 |     "                   which will return initialized weights with the given shape\n",
 299 |     "            param: bias_initializer: object providing a method bias_initializer.initialize(bias_shape) \n",
 300 |     "                   which will return an initialized bias with the given shape\n",
 301 |     "        \"\"\"\n",
 302 |     "        # TODO: Implement\n",
 303 |     "        pass"
 304 |    ]
 305 |   },
 306 |   {
 307 |    "cell_type": "code",
 308 |    "execution_count": null,
 309 |    "metadata": {
 310 |     "collapsed": true,
 311 |     "deletable": false,
 312 |     "editable": false
 313 |    },
 314 |    "outputs": [],
 315 |    "source": [
 316 |     "# Running the testsuite\n",
 317 |     "%run Tests/TestFullyConnected.py\n",
 318 |     "TestFullyConnected.FullyConnected = FullyConnectedLayer\n",
 319 |     "unittest.main(argv=['first-arg-is-ignored'], exit=False)"
 320 |    ]
 321 |   },
 322 |   {
 323 |    "cell_type": "markdown",
 324 |    "metadata": {
 325 |     "deletable": false,
 326 |     "editable": false
 327 |    },
 328 |    "source": [
 329 |     "## Activation Functions\n",
 330 |     "\n",
 331 |     "Activation functions play an essential role in neural networks: They introduce non-linearity. In this tutorial, we are going to implement two activation functions: The sigmoid and the rectified linear unit (ReLU).\n",
 332 |     "\n",
 333 |     "### Sigmoid activation function\n",
 334 |     "Historically, the Sigmoid function has played a big role in the development of neural networks. Given the motivation of biological neurons and their all-or-nothing response, the sigmoid is an obvious choice close to a true step function: It scales the input between 0 and 1, and its gradient exists everywhere.\n",
 335 |     "For each element of the input, it is defined as:\n",
 336 |     "\\begin{equation}\n",
 337 |     "\\mathrm{sig}(x) = \\frac{1}{1 + e^{-x}} \\enspace{.}\n",
 338 |     "\\end{equation}\n",
 339 |     "\n",
 340 |     "To be able to backpropagate the error through the network, we need the gradient with respect to the input. \n",
 341 |     "\n",
 342 |     "\\begin{align}\n",
 343 |     "\\frac{\\partial \\mathrm{sig}(x)}{\\partial x} &= \\frac{1}{1 + e^{-x}} (1 - \\frac{1}{1 + e^{-x}}) \\\\\n",
 344 |     "                                   &= \\mathrm{sig}(x) (1-\\mathrm{sig}(x)) \\enspace{.}\n",
 345 |     "\\end{align}\n",
 346 |     "\n",
 347 |     "### ReLU activation function\n",
 348 |     "\n",
 349 |     "While the sigmoid function is still frequently used for example in recurrent networks and as the last layer for binary segmentation/classification, it has been overtaken by the rectified linear unit (ReLU) and its variants in many other setting.\n",
 350 |     "The main drawback of the sigmoid function is that its gradient is close to zero everywhere apart from a small region around the origin. This can cause the so-called vanishing gradient problem, meaning that the network will learn very slow or will stop learning completely. The ReLU is much less affected by this problem, as the output is linear for inputs $>0$:\n",
 351 |     "\n",
 352 |     "\\begin{equation}\n",
 353 |     "\\mathrm{relu}(x) = \n",
 354 |     "\\begin{cases}\n",
 355 |     "x \\quad \\text{if}~x > 0,\\\\\n",
 356 |     "0 \\quad \\text{else}.\n",
 357 |     "\\end{cases}\n",
 358 |     "\\end{equation}\n",
 359 |     "\n",
 360 |     "However, due to the kink at position 0, the function is not continuously differentiable. Instead, we need to compute subgradients in the backward pass:\n",
 361 |     "\n",
 362 |     "\\begin{equation}\n",
 363 |     "\\frac{\\partial \\mathrm{relu}(x)}{x} = \n",
 364 |     "\\begin{cases}\n",
 365 |     "1 \\quad \\text{if}~x > 0,\\\\\n",
 366 |     "0 \\quad \\text{else}.\n",
 367 |     "\\end{cases}\n",
 368 |     "\\end{equation}\n",
 369 |     "\n",
 370 |     "For both activation functions, we need to apply the chain rule to compute the result of the backward pass:\n",
 371 |     "\\begin{align}\n",
 372 |     "\\frac{\\partial L}{\\partial x} &= \\frac{\\partial L}{\\partial f(x)} \\frac{\\partial f(x)}{\\partial x} \\enspace{,}\n",
 373 |     "\\end{align}\n",
 374 |     "where $f(x)$ stands for any of the two functions.\n",
 375 |     "\n",
 376 |     "### Implementation task\n",
 377 |     "\n",
 378 |     "In the following, implement the ```Simoid``` and ```ReLU``` activation functions. Test your implementation by running the cell below."
 379 |    ]
 380 |   },
 381 |   {
 382 |    "cell_type": "code",
 383 |    "execution_count": null,
 384 |    "metadata": {
 385 |     "collapsed": true
 386 |    },
 387 |    "outputs": [],
 388 |    "source": [
 389 |     "# %load src/layers/activation_functions.py\n",
 390 |     "class Sigmoid(BaseLayer):\n",
 391 |     "    \n",
 392 |     "    def forward(self, x):\n",
 393 |     "        \"\"\" Return the element-wise sigmoid of the input.\n",
 394 |     "            param: x (np.ndarray): input to the activation function, of arbitrary shape\n",
 395 |     "            returns (np.ndarray): element-wise sigmoid(x), of the same shape as x\n",
 396 |     "        \"\"\"\n",
 397 |     "        # TODO: Implement forward pass of the Sigmoid\n",
 398 |     "        pass\n",
 399 |     "        \n",
 400 |     "    def backward(self, error):\n",
 401 |     "        \"\"\" Return the gradient with respect to the input.\n",
 402 |     "            param: error (np.ndarray): the gradient passed down from the subsequent layer, of the same \n",
 403 |     "                   shape as x in the forward pass\n",
 404 |     "            returns (np.ndarray): the gradient with respect to the previous layer, of the same shape as error \n",
 405 |     "        \"\"\"\n",
 406 |     "        # TODO: Implement backward pass of the Sigmoid\n",
 407 |     "        pass\n",
 408 |     "    \n",
 409 |     "\n",
 410 |     "class ReLU(BaseLayer):\n",
 411 |     "    \n",
 412 |     "    def forward(self, x):\n",
 413 |     "        \"\"\" Return the result of a ReLU activation of the input.\n",
 414 |     "            param: x (np.ndarray): input to the activation function, of arbitrary shape\n",
 415 |     "            returns (np.ndarray): element-wise ReLU(x), of the same shape as x\n",
 416 |     "        \"\"\"\n",
 417 |     "        # TODO: Implement forward pass of the ReLU\n",
 418 |     "        pass\n",
 419 |     "    \n",
 420 |     "    def backward(self, error):\n",
 421 |     "        \"\"\" Return the gradient with respect to the input.\n",
 422 |     "            param: error (np.ndarray): the gradient passed down from the previous layer, arbitrary shape (same as x)\n",
 423 |     "            returns (np.ndarray): gradient with respect to the input, of the same shape as error \n",
 424 |     "        \"\"\"\n",
 425 |     "        # TODO: Implement backward pass of the ReLU\n",
 426 |     "        pass"
 427 |    ]
 428 |   },
 429 |   {
 430 |    "cell_type": "code",
 431 |    "execution_count": null,
 432 |    "metadata": {
 433 |     "collapsed": true,
 434 |     "deletable": false,
 435 |     "editable": false
 436 |    },
 437 |    "outputs": [],
 438 |    "source": [
 439 |     "%run Tests/TestActivationFunctions.py\n",
 440 |     "TestReLU.ReLU = ReLU\n",
 441 |     "TestSigmoid.Sigmoid = Sigmoid\n",
 442 |     "unittest.main(argv=['first-arg-is-ignored'], exit=False)"
 443 |    ]
 444 |   },
 445 |   {
 446 |    "cell_type": "markdown",
 447 |    "metadata": {
 448 |     "deletable": false,
 449 |     "editable": false
 450 |    },
 451 |    "source": [
 452 |     "## Softmax and Loss Layer\n",
 453 |     "\n",
 454 |     "By combining the layers we implemented so far, we can represent a non-linear function of the input. For example, we can compute an output vector with $K$ elements to classify between $K$ classes.\n",
 455 |     "\n",
 456 |     "### Softmax\n",
 457 |     "The output of this computation is not further restricted. In many cases, however, it is beneficial if a prediction for the targeted classification has the properties of a probability distribution, i.e., \n",
 458 |     "\n",
 459 |     "\\begin{align}\n",
 460 |     "\\sum_{k=1}^{K} y_k &= 1 \\enspace{,}\\\\\n",
 461 |     "y_k &\\le 0 \\quad \\forall k~\\text{in}~{1, ..., K} \\enspace{.}\n",
 462 |     "\\end{align}\n",
 463 |     "\n",
 464 |     "This makes it for example easier to compare the prediction with the ground truth of the classification task.\n",
 465 |     "We can achieve these properties by applying the softmax function as a last activation function. It is defined as: \n",
 466 |     "\n",
 467 |     "\\begin{equation}\n",
 468 |     "\\mathrm{softmax}(x_k) = \\frac{\\mathrm{exp}(x_k)}{\\sum_{j=1}^{K}\\mathrm{exp}(x_j)} \\enspace{.}\n",
 469 |     "\\end{equation}\n",
 470 |     "\n",
 471 |     "However, if the activations in $\\mathbf{x}$ are high, $\\mathrm{exp}(x_k)$ can become very large. This can cause numerical instabilities. To avoid this, the activations can be shifted by the maximum value of $\\mathbf{x}$ before applying the softmax:\n",
 472 |     "\n",
 473 |     "\\begin{equation}\n",
 474 |     "\\mathbf{\\widetilde{x}} = \\mathbf{x} - \\mathrm{max}(\\mathbf{x}) \\enspace{.}\n",
 475 |     "\\end{equation}\n",
 476 |     "\n",
 477 |     "After the softmax, the predictions of the network have the properties of a probability distribution.\n",
 478 |     "\n",
 479 |     "### Loss function\n",
 480 |     "To adapt the parameters of the network, we to know how \"well\" the network performs compared to a given ground truth (or label) - we need a loss function. Then, we can \"train\" the network by minimizing this loss by iteratively adapting the weights and biases using our training data.\n",
 481 |     "\n",
 482 |     "A common loss function is cross entropy. To compute it, we need the ground truth $\\mathbf{y^*}$ in \"one-hot\"-vector encoding. The ground truth is represented as a vector with $K$ elements where only the value that corresponds to the true class is $\\neq 0$:\n",
 483 |     "\n",
 484 |     "\\begin{equation}\n",
 485 |     "\\mathbf{y^*} = \n",
 486 |     "\\begin{pmatrix}\n",
 487 |     "  0 \\\\\n",
 488 |     "  \\vdots\\\\\n",
 489 |     "  1\\\\\n",
 490 |     "  \\vdots\\\\\n",
 491 |     "  0\n",
 492 |     "\\end{pmatrix}\n",
 493 |     "\\end{equation}\n",
 494 |     "\n",
 495 |     "Then, the cross entropy loss for a batch of b samples is defined as:\n",
 496 |     "\n",
 497 |     "\\begin{equation}\n",
 498 |     "L(\\mathbf{Y^*},\\mathbf{Y}) = - \\sum_b \\sum_{k=1}^K \\ln( y_{b, k} ) y^*_{b, k}\n",
 499 |     "\\end{equation}\n",
 500 |     "\n",
 501 |     "### Combining both\n",
 502 |     "\n",
 503 |     "The softmax activation and the cross entropy loss are frequently combined, and sometimes called the \"SoftMax loss\". Together, their gradient has a simple and elegant form:\n",
 504 |     "\n",
 505 |     "\\begin{equation}\n",
 506 |     "e_k = \n",
 507 |     "y_k - y^*_k \\enspace{.}\n",
 508 |     "\\end{equation}\n",
 509 |     "\n",
 510 |     "for every element of the batch.\n",
 511 |     "\n",
 512 |     "### Implementation task\n",
 513 |     "\n",
 514 |     "Implement the softmax function and the cross entropy loss combined in the class ```SoftMaxCrossEntropyLoss```. Since the two functions are combined in ```forward```, additionally implement a function ```predict``` that computes only the softmax of the input. This function can be used during test-time, when we are interested in a prediction for unseen data."
 515 |    ]
 516 |   },
 517 |   {
 518 |    "cell_type": "code",
 519 |    "execution_count": null,
 520 |    "metadata": {
 521 |     "collapsed": true
 522 |    },
 523 |    "outputs": [],
 524 |    "source": [
 525 |     "# %load src/layers/softmax_crossentropy.py\n",
 526 |     "class SoftMaxCrossEntropyLoss(BaseLayer):\n",
 527 |     "    \n",
 528 |     "    def forward(self, x, labels):\n",
 529 |     "        \"\"\" Return the cross entropy loss of the input and the labels after applying the softmax to the input. \n",
 530 |     "            param: x (np.ndarray): input, of shape [b, k] where b is the batch size and k is the input size\n",
 531 |     "            param: labels (np.ndarray): the corresponding labels of the training set in one-hot encoding for \n",
 532 |     "                   the current input, of the same shape as x\n",
 533 |     "            returns (float): the loss of the current prediction and the label\n",
 534 |     "        \"\"\"\n",
 535 |     "        # Todo: Implement forward pass\n",
 536 |     "        pass\n",
 537 |     "    \n",
 538 |     "    def backward(self, labels):\n",
 539 |     "        \"\"\" Return the gradient of the SoftMaxCrossEntropy loss with respect to the previous layer.\n",
 540 |     "            param: labels (np.ndarray): (again) the corresponding labels of the training set for the current input, \n",
 541 |     "                   of shape [b, k] where b is the batch size and k is the input size\n",
 542 |     "            returns (np.ndarray): the error w.r.t. the previous layer, of shape [b, k] where b is the batch \n",
 543 |     "                   size and n is the input size\n",
 544 |     "        \"\"\"\n",
 545 |     "        # TODO: Implement backward pass\n",
 546 |     "        pass\n",
 547 |     "    \n",
 548 |     "    def predict(self, x):\n",
 549 |     "        \"\"\" Return the softmax of the input.  This can be interpreted as probabilistic prediction of the class.\n",
 550 |     "            param: x (np.ndarray): input with shape [b, k], where b is the batch size and n is the input size\n",
 551 |     "            returns (np.ndarray): the result softmax(x), of the same shape as x\n",
 552 |     "        \"\"\"\n",
 553 |     "        # TODO: Implement softmax\n",
 554 |     "        pass"
 555 |    ]
 556 |   },
 557 |   {
 558 |    "cell_type": "code",
 559 |    "execution_count": null,
 560 |    "metadata": {
 561 |     "collapsed": true,
 562 |     "deletable": false,
 563 |     "editable": false
 564 |    },
 565 |    "outputs": [],
 566 |    "source": [
 567 |     "%run Tests/TestSoftMaxCrossEntropyLoss.py\n",
 568 |     "TestSoftMaxCrossEntropyLoss.SoftMaxCrossEntropyLoss = SoftMaxCrossEntropyLoss\n",
 569 |     "unittest.main(argv=['first-arg-is-ignored'], exit=False)"
 570 |    ]
 571 |   },
 572 |   {
 573 |    "cell_type": "markdown",
 574 |    "metadata": {
 575 |     "deletable": false,
 576 |     "editable": false
 577 |    },
 578 |    "source": [
 579 |     "## Initialization\n",
 580 |     "\n",
 581 |     "Initialization is very critical for non-convex optimization problems, and neural networks are no exception. The most simple strategy is initialization with a constant value, which is frequently used for bias initialization. Generally, bias initialization with a constant of 0 is common, however, with ReLU as activation function, a small positive value is sensible to reduce the risk of \"dying ReLUs\".  \n",
 582 |     "\n",
 583 |     "For other weights in FC layers and for weights in convolutional layers that we will look at in a bit, we need a different initialization strategy. If all weights are initialized with the same value, each node would receive the same update and training becomes impossible. One option to break this symmetry is uniform random initialization. Each element of $\\mathbf{W}$ is drawn from a uniform distribution with a certain range, commonly [0, 1].\n",
 584 |     "\n",
 585 |     "However, even with random initialization, finding the right range for weights is still tricky. If the weights are too small, activations become subsequently smaller when they are passed through the layers. Conversely, if they are too large, the signal grows which each subsequent layer. Both effects hinder effective training.\n",
 586 |     "\n",
 587 |     "Glorot and Bengio$^1$ investigated this problem in more detail and presented a strategy to find the \"sweet spot\" for weight initialization that keeps the variance of the input and output gradient the same. Given certain assumptions, this can be achieved by drawing the weights from a Gaussian distribution $\\mathcal{N}(0, \\sigma)$ with zero mean and a standard deviation depending on the number of inputs  $n_\\mathrm{in}$ and outputs  $n_\\mathrm{out}$ of the layer. He et al.$^2$ showed that for ReLU activations, an adapted version is required to retain this property:\n",
 588 |     "\n",
 589 |     "\\begin{equation}\n",
 590 |     "\\sigma = \\sqrt{\\frac{2}{n_\\mathrm{in}}} \\enspace{.}\n",
 591 |     "\\end{equation}\n",
 592 |     "\n",
 593 |     "### Implementation task\n",
 594 |     "\n",
 595 |     "As the next task, implement the initializers ```Const```, ```UniformRandom``` and ```He``` that provide the method ```initialize``` for arbitrary weight shapes. For He initialization, the second dimension of ```weight_shape``` is assumed to be the number of input nodes. As before, run the cell below to test your implementation.\n",
 596 |     "\n",
 597 |     "$^1$ Glorot X. and Bengio Y. Understanding the difficulty of training deep feedforward neural networks. In Proc. AISTATS, PMLR 9:249-256, 2010.\n",
 598 |     "\n",
 599 |     "$^2$ He K. et al. Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification. In CoRR, abs/1502.01852, 2015."
 600 |    ]
 601 |   },
 602 |   {
 603 |    "cell_type": "code",
 604 |    "execution_count": null,
 605 |    "metadata": {
 606 |     "collapsed": true
 607 |    },
 608 |    "outputs": [],
 609 |    "source": [
 610 |     "# %load src/layers/initializers.py\n",
 611 |     "class Initializer:\n",
 612 |     "    \"\"\" Base class for initializers. \"\"\"\n",
 613 |     "    def initialize(self, weight_shape):\n",
 614 |     "        \"\"\" Return weights initialized according to the subclass definition. \n",
 615 |     "            Required to work for arbitrary weight shapes.\n",
 616 |     "            Base class. \n",
 617 |     "        \"\"\"\n",
 618 |     "        \n",
 619 |     "        # Raises an exeption in base class.\n",
 620 |     "        raise NotImplementedError('Method is not implemented')\n",
 621 |     "\n",
 622 |     "        \n",
 623 |     "class Const(Initializer):\n",
 624 |     "    \n",
 625 |     "    def __init__(self, value):\n",
 626 |     "        \"\"\" Create a constant initializer.\n",
 627 |     "            params: value (float): constant that is used for initialization of weights\n",
 628 |     "        \"\"\"\n",
 629 |     "        # TODO: Implement\n",
 630 |     "        pass\n",
 631 |     "\n",
 632 |     "    def initialize(self, weight_shape):\n",
 633 |     "        \"\"\" Return a new array of weights initialized with a constant value provided by self.value.\n",
 634 |     "            param: weight_shape: shape of the new array\n",
 635 |     "            returns (np.ndarray): array of the given shape\n",
 636 |     "        \"\"\"\n",
 637 |     "        # TODO: Implement\n",
 638 |     "        pass\n",
 639 |     "\n",
 640 |     "class UniformRandom(Initializer):\n",
 641 |     "    \n",
 642 |     "    def initialize(self, weight_shape):\n",
 643 |     "        \"\"\" Return a new array of weights initialized by drawing from a uniform distribution with range [0, 1].\n",
 644 |     "            param: weight_shape: shape of new array\n",
 645 |     "            returns (np.ndarray): array of the given shape\n",
 646 |     "        \"\"\"\n",
 647 |     "        # TODO: Implement\n",
 648 |     "        pass\n",
 649 |     "\n",
 650 |     "        \n",
 651 |     "class He(Initializer):\n",
 652 |     "       \n",
 653 |     "    def initialize(self, weight_shape):\n",
 654 |     "        \"\"\" Return a new array of weights initialized according to He et al.: Delving Deep into Rectifiers.\n",
 655 |     "            param: weight_shape: shape of the np.array to be returned, the second dimension is assumed to be the \n",
 656 |     "                   number of input nodes\n",
 657 |     "            returns (np.ndarray): array of the given shape\n",
 658 |     "        \"\"\"        \n",
 659 |     "        # TODO: Implement\n",
 660 |     "        pass\n",
 661 |     "        "
 662 |    ]
 663 |   },
 664 |   {
 665 |    "cell_type": "code",
 666 |    "execution_count": null,
 667 |    "metadata": {
 668 |     "collapsed": true,
 669 |     "deletable": false,
 670 |     "editable": false
 671 |    },
 672 |    "outputs": [],
 673 |    "source": [
 674 |     "%run Tests/TestInitializers.py\n",
 675 |     "TestInitializers.Const = Const\n",
 676 |     "TestInitializers.Uniform = UniformRandom\n",
 677 |     "TestInitializers.He = He\n",
 678 |     "unittest.main(argv=['first-arg-is-ignored'], exit=False) "
 679 |    ]
 680 |   },
 681 |   {
 682 |    "cell_type": "markdown",
 683 |    "metadata": {
 684 |     "deletable": false,
 685 |     "editable": false
 686 |    },
 687 |    "source": [
 688 |     "## Convolutional layers\n",
 689 |     "\n",
 690 |     "Convolutional layers are without doubt one of the key elements of the success of neural networks in recent years. The main idea is simple: Convolution with trainable filters. They allow to learn which features are important for a given task in a data driven manner. One of their big advantages is that they inherently consider the spatial layout of the data. The animation below shows an example of a 2-D convolution of a padded input (blue) with a $3 \\times 3$ filter kernel that generates the output in green:\n",
 691 |     "\n",
 692 |     "<figure>\n",
 693 |     "<img src=\"files/img/same_padding_no_strides.gif\" width=\"200\">\n",
 694 |     "<figcaption><center>Source: https://github.com/vdumoulin/conv_arithmetic</center></figcaption>\n",
 695 |     "</figure>\n",
 696 |     "\n",
 697 |     "In this tutorial, we will implement a 2-D convolutional layer that is fully connected in the depth/channel direction. Accordingly, given an input with $C$ channels, each filter has a shape of $M \\times N \\times C$, where $M$ and $N$ describe the spacial dimensions of the filter. The number of channels of the output depends on the number of filters  $S$ in the convolutional layer.\n",
 698 |     "\n",
 699 |     "<img src=\"files/img/conv_forward.png\" width=\"400\">\n",
 700 |     "\n",
 701 |     "In the example above, the input has $C = 3$ channels and the convolutional layer has $S = 2$ filters fully connected in depth direction. Accordingly, the output has two channels.\n",
 702 |     "\n",
 703 |     "### Forward pass in a Conv layer:\n",
 704 |     "We can compute the forward pass in multiple ways:\n",
 705 |     "\n",
 706 |     "#### As a special case of a fully connected layer: Matrix multiplication\n",
 707 |     "Given a fixed input size, a convolutional layer can be considered as a special case of a fully connected layer. Accordingly, we can express the forward pass using a multiplication with a sparse matrix that represents the local connections within a convolutional layer. This allows us to use the same formulas as in the forward pass for the FC layer. While this presents a rather inefficient implementation, it can help to illustrate the connection between the convolutional and the FC layer.\n",
 708 |     "\n",
 709 |     "#### Convolution\n",
 710 |     "The forward pass of a *convolutional* layer can of course also be straight forwardly implemented as a convolution. Different very efficient low-level implementations of convolutions are available, e.g., implementations that use fast Fourier transforms (FFT), generalized matrix multiplication (GEMM) or that are based on Winograd minimal filtering algorithms$^3$. In this tutorial, we will consider a \"naive\" convolution where we slide a filter over the image to facilitate a better understanding, and subordinate efficiency.\n",
 711 |     "\n",
 712 |     "#### Cross-correlation\n",
 713 |     "Cross-correlation is simply a convolution without a flipped filter. For filters that are initialized randomly, we are free to use cross-correlation instead of convolution in the forward pass. We will see that it saves us a bit of kernel flipping in the backward pass.\n",
 714 |     "\n",
 715 |     "In all cases, the bias in a convolutional layer is an element-wise addition of a scalar value for each output channel.\n",
 716 |     "\n",
 717 |     "### Backward pass in a Conv layer:\n",
 718 |     "\n",
 719 |     "In the backward pass, we need to compute the gradient with respect to the weights of the convolutional kernel, the bias and the input, given the backpropagated error tensor $E_Y$.\n",
 720 |     "\n",
 721 |     "#### Matrix multiplication\n",
 722 |     "Like in the forward pass, we can implement the backward pass by reusing the formulas from the fully connected layer if we express the convolution as a matrix multiplication. \n",
 723 |     "\n",
 724 |     "#### Convolution/cross-correlation\n",
 725 |     "We may want to have a detailed look at the animation above, pick up pen and paper and track which pixels of the input/weight and correspondingly which pixels of the error contribute to respective gradient. For the gradient with respect to the input, we can then see that we need flipped kernel weights in the spacial dimensions (width and height). Alternatively, if we used convolution in the forward pass, we can now apply cross-correlation, and vice versa.\n",
 726 |     "\n",
 727 |     "Next, let's have a look at the channels: If we have $S$ kernels in the forward pass, and the input has $C$ channels, we obviously need to re-arrange the weights to $C$ kernels with $S$ channels for the backward pass. \n",
 728 |     "\n",
 729 |     "<img src=\"files/img/restacking_filters.gif\" width=\"400\">\n",
 730 |     "\n",
 731 |     "In the animation above shows that channel $c$ of $E_X$ depends only on the channel $c$ of the kernel weights. You can further see how the channels of the kernels can be recombined to compute the gradient with respect to the input. \n",
 732 |     "\n",
 733 |     "For the gradient with respect to the weights, you can observe that a correlation operation is necessary: First, the input has to be padded with half the kernel width. Then, each channel of the input has to be correlated with the channel $s$ of $E_Y$ to yield the gradient for kernel $s$. We have to compute\n",
 734 |     "\n",
 735 |     "\\begin{equation}\n",
 736 |     "\\frac{\\partial L}{\\partial W_{c, s}} = X_c \\star E_{Y_s} \\end{equation}\n",
 737 |     "\n",
 738 |     "for $c$ in $\\{1, ..., C\\}$ to stack together $W_s$:\n",
 739 |     "\n",
 740 |     "<img src=\"files/img/conv_back_weights.png\" width=\"400\">\n",
 741 |     "\n",
 742 |     "If convolution was used in the forward pass, the result of this correlation represents the flipped gradient, so it has to be flipped back before an update. If correlation was used instead, we save this flipping operation. To really understand this, you may want to grab pen and paper again.\n",
 743 |     "\n",
 744 |     "The gradient with respect to the bias can be computed by simply summing over the respective channel.\n",
 745 |     "\n",
 746 |     "Like in the fully connected layer, the gradient for the full mini-batch are the sum of the gradient of the elements of the batch.\n",
 747 |     "\n",
 748 |     "### Stride\n",
 749 |     "<figure>\n",
 750 |     "<img src=\"files/img/padding_strides.gif\" width=\"200\">\n",
 751 |     "<figcaption><center>Source: https://github.com/vdumoulin/conv_arithmetic</center></figcaption>\n",
 752 |     "</figure>\n",
 753 |     "\n",
 754 |     "A strided convolution can be used to downsample the input. From a mathematical perspective, this can be expressed as a convolution followed by subsampling. Similarly, in the backward pass, $E_Y$ is first upsampled (introducing zeros), and then processed as before.\n",
 755 |     "\n",
 756 |     "### Padding\n",
 757 |     "In this tutorial, we will restrict the padding strategy to \"same\" padding, meaning the input will be padded with zeros such that output after the convolution has the same size as the original input.\n",
 758 |     "\n",
 759 |     "### Reshaping\n",
 760 |     "Convolutional layers inherently expect the input to have a certain spatial layout with possibly arbitrary size, which is different to FC layers that expect a vector of fixed size. There are two common ways to make these operations interoperable: \n",
 761 |     "\n",
 762 |     " - Flatten the input before passing it to an FC layer\n",
 763 |     " - Have the convolutional layers reshape the input to the correct spatial layout\n",
 764 |     " \n",
 765 |     "Here, we will implement the first option. To this end, a FlattenLayer is introduced with the sole purpose of reshaping the input to be compatible with FC layers. As no computation is involved, the backward pass simply consists of reversing the reshaping.\n",
 766 |     "\n",
 767 |     "### Implementation task\n",
 768 |     "\n",
 769 |     "In the following, implement the classes ```FlattenLayer``` and ```ConvolutionalLayer``` as described above. The necessary parameters are further described in the method documentation. \n",
 770 |     "\n",
 771 |     "Note: If you use 3D convolution/correlation (which makes sense from an implementation perspective), keep in mind that you potentially need to compensate for \"unnecessary\" flipping in the channel dimension in your implementation. Check your implementation by running the unit tests in the subsequent cell.\n",
 772 |     "\n",
 773 |     "$^3$ Lavin A., Gray S. Fast Algorithms for Convolutional Neural Networks. In Proc. CVPR, 2016. arXiv:1509.09308."
 774 |    ]
 775 |   },
 776 |   {
 777 |    "cell_type": "code",
 778 |    "execution_count": null,
 779 |    "metadata": {
 780 |     "collapsed": true
 781 |    },
 782 |    "outputs": [],
 783 |    "source": [
 784 |     "# %load src/layers/conv.py\n",
 785 |     "class FlattenLayer(BaseLayer):\n",
 786 |     "    def __init__(self):\n",
 787 |     "        # TODO: define the necessary class variables\n",
 788 |     "        pass\n",
 789 |     "    \n",
 790 |     "    def forward(self, x):\n",
 791 |     "        \"\"\" Return a flattened version of the input.\n",
 792 |     "            param: x (np.ndarray): input, of shape [b, n_channels, p, q] where b is the batch size, \n",
 793 |     "                   n_channels is the number of channels and p x q is the image size\n",
 794 |     "            returns (np.ndarray): a flattened representation of x of shape [b, v] \n",
 795 |     "                   where b is the batch size and v is the output size = n_channels * p * q\n",
 796 |     "        \"\"\"\n",
 797 |     "        # TODO: Implement flattening of the image\n",
 798 |     "        pass\n",
 799 |     "    \n",
 800 |     "    def backward(self, error):\n",
 801 |     "        \"\"\" Return the gradient with respect to the input.\n",
 802 |     "            param: error (np.ndarray): the gradient passed down from the subsequent layer, of shape [b, m],\n",
 803 |     "                   where b is the batch size and m is the output size with m = n_channels * p * q from \n",
 804 |     "                   the forward pass\n",
 805 |     "            returns (np.ndarray): the error with restored dimensions from the forward pass, i.e. with \n",
 806 |     "                   shape [b, n_channels, p, q] where b is the batch size, n_channels is the number of \n",
 807 |     "                   channels and p x q is the image size\n",
 808 |     "        \"\"\"\n",
 809 |     "        # TODO: Restore the image dimensions\n",
 810 |     "        pass\n",
 811 |     "\n",
 812 |     "\n",
 813 |     "class ConvolutionalLayer(BaseLayer):\n",
 814 |     "    \n",
 815 |     "    def __init__(self, stride_shape, kernel_shape, n_kernels, learning_rate, weights_initializer=UniformRandom(), bias_initializer=Const(0.1)):\n",
 816 |     "        \"\"\" \n",
 817 |     "            param: stride: tuple in the form of (np, nq) which denote the subsampling factor of the \n",
 818 |     "                   convolution operation in the spatial dimensions\n",
 819 |     "            param: kernel_shape: integer tuple in the form of (n_channels, m, n) where n_channels is \n",
 820 |     "                   the number of input channels and m x n is the size of the filter kernels\n",
 821 |     "            param: n_kernels (int): number of kernels and therefore the number of output channels\n",
 822 |     "            param: learning_rate (float): learning rate of this layer\n",
 823 |     "            param: weights_initializer: initializer object for the filter weights\n",
 824 |     "            param: bias_initializer: initializer object for the bias\n",
 825 |     "        \"\"\"\n",
 826 |     "        # TODO: define the neccesary class variables, initialize the weights and bias\n",
 827 |     "        self.weights = ...\n",
 828 |     "        self.bias = ...\n",
 829 |     "        pass \n",
 830 |     "    \n",
 831 |     "    def forward(self, x):\n",
 832 |     "        \"\"\" Return the result of the forward pass of the convolutional layer.\n",
 833 |     "            param: x(np.ndarray): input, of shape [b, n_channels, p, q],  where b is the batch size, \n",
 834 |     "                   n_channels is the number of input channels and p x q is the image size\n",
 835 |     "            returns (np.ndarray): result of the forward pass, of shape (b, n_kernels, p', q') \n",
 836 |     "                   where b is the batch size, n_kernels is the number of kernels in this layer and \n",
 837 |     "                   p' x q' is the output image size (which depends on the stride)\n",
 838 |     "        \"\"\"\n",
 839 |     "        # TODO: Implement forward pass of the convolutional layer\n",
 840 |     "        pass\n",
 841 |     "    \n",
 842 |     "    def backward(self, error):\n",
 843 |     "        \"\"\" Update the weights of this layer and return the gradient with respect to the input.\n",
 844 |     "            param: error (np.ndarray): of shape (b, n_kernels, p', q') where b is the batch size, n_kernels\n",
 845 |     "                   is the number of kernels and p' x q' is the spacial error size (depends on the stride)\n",
 846 |     "            returns (np.ndarray): the gradient with respect to the input, of shape (b, n_channels, p, q) \n",
 847 |     "                   where b is the batch size, n_channels is the number of input channels to this layer and \n",
 848 |     "                   p x q is the image size.\n",
 849 |     "        \"\"\" \n",
 850 |     "        # TODO: Implement backward pass of the convolutional layer\n",
 851 |     "        pass\n",
 852 |     "    \n",
 853 |     "    def get_gradient_weights(self):\n",
 854 |     "        \"\"\" Returns the gradient with respect to the weights from the last call of backward() \"\"\"\n",
 855 |     "        # TODO: Implement\n",
 856 |     "        pass\n",
 857 |     "\n",
 858 |     "    def get_gradient_bias(self):\n",
 859 |     "        \"\"\" Returns the gradient with respect to the bias from the last call of backward() \"\"\"\n",
 860 |     "        # TODO: Implement\n",
 861 |     "        pass\n",
 862 |     "    \n",
 863 |     "    def initialize(self, weights_initializer, bias_initializer):\n",
 864 |     "        \"\"\" Initializes the weights/bias of this layer with the given initializers.\n",
 865 |     "            param: weights_initializer: object providing a method weights_initializer.initialize(weights_shape)\n",
 866 |     "                   which will return initialized weights with the given shape\n",
 867 |     "            param: bias_initializer: object providing a method bias_initializer.initialize(bias_shape) \n",
 868 |     "                   which will return an initialized bias with the given shape\n",
 869 |     "        \"\"\"\n",
 870 |     "        # TODO: Implement. To make sure that He initialization works as intended, make sure the second dimension \n",
 871 |     "        # of weights_shape contains the number of input nodes that can be computed as n_in = n_channels * m * n\n",
 872 |     "        # and reshape the weights to the correct shape afterwards.\n",
 873 |     "        pass"
 874 |    ]
 875 |   },
 876 |   {
 877 |    "cell_type": "code",
 878 |    "execution_count": null,
 879 |    "metadata": {
 880 |     "collapsed": true,
 881 |     "deletable": false,
 882 |     "editable": false
 883 |    },
 884 |    "outputs": [],
 885 |    "source": [
 886 |     "%run Tests/TestConv.py\n",
 887 |     "TestConv.Conv = ConvolutionalLayer\n",
 888 |     "TestConv.FullyConnected = FullyConnectedLayer\n",
 889 |     "TestConv.He = He\n",
 890 |     "TestConv.Constant = Const\n",
 891 |     "TestConv.Flatten = FlattenLayer\n",
 892 |     "unittest.main(argv=['first-arg-is-ignored'], exit=False)"
 893 |    ]
 894 |   },
 895 |   {
 896 |    "cell_type": "markdown",
 897 |    "metadata": {
 898 |     "deletable": false,
 899 |     "editable": false
 900 |    },
 901 |    "source": [
 902 |     "## Pooling Layers\n",
 903 |     "\n",
 904 |     "As alternative to striding in a convolutional layer, specific pooling layers can be used to downsample the data and condense spacial information. We will look at max pooling as one example. In the forward pass, the output for each pixel is the maximum value in a neighborhood of the corresponding input pixel, calculated separately for every channel. The downsampling is again achieved by using a stride > 1.\n",
 905 |     "\n",
 906 |     "<figure>\n",
 907 |     "<img src=\"files/img/numerical_maxpooling.gif\" width=\"400\">\n",
 908 |     "<figcaption><center>Source: https://github.com/vdumoulin/conv_arithmetic</center></figcaption>\n",
 909 |     "</figure>\n",
 910 |     "\n",
 911 |     "The above example shows maxpooling with a neighborhood of $3 \\times 3$ and a stride of $[1, 1]$.\n",
 912 |     "\n",
 913 |     "The maximum operation can be thought of as an on/off switch for the backpropagation of the gradient for each pixel. We therefore need to store the location of the maximum value in the forward pass. Since the layer has no trainable parameters, we only need to compute the gradient with respect to the input. In the backward pass, the subgradient is given by the colloquial rule \"the winner takes it all\". The error is routed only towards the maximum locations; for all other input pixels, the gradient is zero. If the stride is smaller than the neighborhood, the routed gradients for the respective pixels are summed up.\n",
 914 |     "\n",
 915 |     "### Implementation task\n",
 916 |     "\n",
 917 |     "In the following, implement the class ```MaxPoolLayer```. Check your implementation as usual by running the unittests in the cell below the implementation."
 918 |    ]
 919 |   },
 920 |   {
 921 |    "cell_type": "code",
 922 |    "execution_count": null,
 923 |    "metadata": {
 924 |     "collapsed": true
 925 |    },
 926 |    "outputs": [],
 927 |    "source": [
 928 |     "# %load src/layers/pooling\n",
 929 |     "class MaxPoolLayer(BaseLayer):\n",
 930 |     "    \n",
 931 |     "    def __init__(self, neighborhood=(2, 2), stride=(2, 2)):\n",
 932 |     "        \"\"\" Max pooling layer.\n",
 933 |     "           param: neighborhood: tuple with shape (sp, sq) which denote the kernel size of the pooling operation in \n",
 934 |     "           the spatial dimensions\n",
 935 |     "           param: stride: tuple with shape (np, nq) which denote the subsampling factor of the pooling operation in\n",
 936 |     "           the spacial dimensions\n",
 937 |     "        \"\"\"\n",
 938 |     "        # TODO: define necessary class variables\n",
 939 |     "        pass\n",
 940 |     "    \n",
 941 |     "    def forward(self, x):\n",
 942 |     "        \"\"\" Return the result of maxpooling on the input.\n",
 943 |     "            param: x (np.ndarray) with shape (b, n_channels, p, q) where b is the batch size, \n",
 944 |     "                   n_channels is the number of input channels and p x q is the image size\n",
 945 |     "            returns (np.ndarray): the result of max pooling, of shape (b, n_channels, p', q')\n",
 946 |     "                   where b is the batch size, n_channels is the number of input channels and \n",
 947 |     "                   p' x q' is the new image size reduced by the stride. \n",
 948 |     "        \"\"\"\n",
 949 |     "        # TODO: Implement forward pass of max pooling\n",
 950 |     "        pass\n",
 951 |     "    \n",
 952 |     "    def backward(self, error):\n",
 953 |     "        \"\"\" Return the gradient with respect to the previous layer.\n",
 954 |     "            param: error(np.ndarray): the gradient passed own from the subsequent layer, \n",
 955 |     "                   of shape [b, n_channels, p', q'] where b is the batch size, n_channels is the \n",
 956 |     "                   number of channels and p' x q' is the image size reduced by the stride\n",
 957 |     "            returns (np.ndarray): the gradient w.r.t. the previous layer, of shape [b, n_channels, p, q] \n",
 958 |     "                   where b is the batch size, n_channels is the number of input channels to this layer and \n",
 959 |     "                   p x q is the image size prior to downsampling.\n",
 960 |     "        \"\"\"\n",
 961 |     "        # TODO: Implement backward pass of max pooling\n",
 962 |     "        pass"
 963 |    ]
 964 |   },
 965 |   {
 966 |    "cell_type": "code",
 967 |    "execution_count": null,
 968 |    "metadata": {
 969 |     "collapsed": true,
 970 |     "deletable": false,
 971 |     "editable": false
 972 |    },
 973 |    "outputs": [],
 974 |    "source": [
 975 |     "%run Tests/TestMaxPoolLayer.py\n",
 976 |     "TestMaxPooling.MaxPooling = MaxPoolLayer\n",
 977 |     "TestMaxPooling.FullyConnected = FullyConnectedLayer\n",
 978 |     "TestMaxPooling.Flatten = FlattenLayer\n",
 979 |     "unittest.main(argv=['first-arg-is-ignored'], exit=False)"
 980 |    ]
 981 |   },
 982 |   {
 983 |    "cell_type": "markdown",
 984 |    "metadata": {
 985 |     "deletable": false,
 986 |     "editable": false
 987 |    },
 988 |    "source": [
 989 |     "## Dropout\n",
 990 |     "\n",
 991 |     "Most successful deep learning models use some regularization techniques intended to decrease the gap between training and test accuracy. The goal is to bias the model towards a model with lower training accuracy but better generalization capability. One prominent technique is dropout. It was for example used in the famous AlexNet network. \n",
 992 |     "The idea of this technique is to break dependencies between features by setting random activations to zero during training. This is typically done with a Bernoulli distribution: In each training iteration, the probability for a certain activation to \"drop out\" is $p$.\n",
 993 |     "The application of dropout shifts the mean of the activations because many elements are set to zero during training. At test time, when no element are dropped out, the mean is different, which can decrease performance. To combat this the \"training mean\" can be restored by multiplying all activations with $1 - p$ at test time.\n",
 994 |     " \n",
 995 |     "### Inverted dropout\n",
 996 |     "The multiplication at test time can be avoiding by rewriting dropout behaviour during training. This means that the dropout layer can actually be skipped completely during test time, allowing for faster inference. To this end, the activations are multiplied by $\\frac{1}{1 - p}$ after applying the stochastic function during training. This way, the mean is not changed by the layer and no operation needs to be performed during test time.\n",
 997 |     "\n",
 998 |     "\n",
 999 |     "### Implementation task\n",
1000 |     "In the following, implement the ```DropOut``` layer with inverted dropout. As usual, check your implementation by running the unittests. Note that dropout operates on each element of the input vector independently."
1001 |    ]
1002 |   },
1003 |   {
1004 |    "cell_type": "code",
1005 |    "execution_count": null,
1006 |    "metadata": {
1007 |     "collapsed": true
1008 |    },
1009 |    "outputs": [],
1010 |    "source": [
1011 |     "# %load src/layers/dropout\n",
1012 |     "class DropOut(BaseLayer):\n",
1013 |     "    \n",
1014 |     "    def __init__(self, probability):\n",
1015 |     "        \"\"\" DropOut Layer.\n",
1016 |     "            param: drop_probability: probability of each individual activation to be set to zero, in range [0, 1]    \n",
1017 |     "        \"\"\"\n",
1018 |     "        # TODO: Implement initialization\n",
1019 |     "        \n",
1020 |     "        pass\n",
1021 |     "    \n",
1022 |     "    def forward(self, x):\n",
1023 |     "        \"\"\" Forward pass through the layer: Set activations of the input randomly to zero.\n",
1024 |     "            param: x (np.ndarray): input\n",
1025 |     "            returns (np.ndarray): a new array of the same shape as x, after dropping random elements\n",
1026 |     "        \"\"\"\n",
1027 |     "        # TODO: Implement forward pass of the Dropout layer\n",
1028 |     "        # Hint: Make sure to treat training and test phase accordingly.\n",
1029 |     "        pass\n",
1030 |     "    \n",
1031 |     "    def backward(self, error):\n",
1032 |     "        \"\"\" Backward pass through the layer: Return the gradient with respect to the input.\n",
1033 |     "            param: error (np.ndarray): error passed down from the subsequent layer, of the same shape as the \n",
1034 |     "                   output of the forward pass\n",
1035 |     "            returns (np.ndarray):  gradient with respect to the input, of the same shape as error\n",
1036 |     "        \"\"\"\n",
1037 |     "        # TODO: Implement backward pass of the Dropout layer\n",
1038 |     "        pass"
1039 |    ]
1040 |   },
1041 |   {
1042 |    "cell_type": "code",
1043 |    "execution_count": null,
1044 |    "metadata": {
1045 |     "collapsed": true,
1046 |     "deletable": false,
1047 |     "editable": false
1048 |    },
1049 |    "outputs": [],
1050 |    "source": [
1051 |     "%run Tests/TestDropout.py\n",
1052 |     "TestDropout.DropOut = DropOut\n",
1053 |     "TestDropout.Phase = Phase\n",
1054 |     "unittest.main(argv=['first-arg-is-ignored'], exit=False)"
1055 |    ]
1056 |   },
1057 |   {
1058 |    "cell_type": "markdown",
1059 |    "metadata": {
1060 |     "deletable": false,
1061 |     "editable": false
1062 |    },
1063 |    "source": [
1064 |     "## LeNet\n",
1065 |     "\n",
1066 |     "As the last part of this tutorial, we use our developed operators to construct a simple neural network inspired by the traditional LeNet architecture:\n",
1067 |     "\n",
1068 |     "<figure>\n",
1069 |     "<img src=\"files/img/lenet.jpg\" width=\"600\">\n",
1070 |     "<figcaption><center>Source: LeCun et al, 1998.$^4$</center></figcaption>\n",
1071 |     "</figure>\n",
1072 |     "\n",
1073 |     "Use two convolutional layers with $5 \\times 5$ kernels and $6$ respectively $10$ channels. Each convolution is followed by a ReLU unit and max pooling of with a neighborhood and stride of 2 in each dimension. The top of the network is formed by three FC layers with ReLU activations producing outputs of dimensionality $120$, $84$ and subsequently the number of categories. Finally, use the SoftMaxCrossEntropyLoss as loss layer.\n",
1074 |     "\n",
1075 |     "First, have a look at the class ```NeuralNetwork```, that provides the basic framework in which you can use the different layers and stack them together to a functioning network. You don't need to adapt this class, but you can use it to implement the LeNet architecture. You may also want to refer back to the [description](#network_description) in the beginning.\n",
1076 |     "\n",
1077 |     "### Implementation task\n",
1078 |     "\n",
1079 |     "Next, implement the LeNet architecture in the ```build``` function and train your network in with the script provided below. \n",
1080 |     "\n",
1081 |     "Experiment for example with the activation function and DropOut, tune the learning rate or look at the effect of initialization. Feel free to add your own evaluations and plots. You can get the full test data of the MNIST data object by calling ```net.data_layer.get_test_set```.\n",
1082 |     "\n",
1083 |     "$^4$ LeCun Y., Bottou L., Bengio Y. and Haffner P. Gradient-based Learning Applied to Document Recognition. In Proc. IEEE, 1989."
1084 |    ]
1085 |   },
1086 |   {
1087 |    "cell_type": "code",
1088 |    "execution_count": null,
1089 |    "metadata": {
1090 |     "collapsed": true,
1091 |     "deletable": false
1092 |    },
1093 |    "outputs": [],
1094 |    "source": [
1095 |     "# %load src/network.py\n",
1096 |     "\n",
1097 |     "# Nothing to do in this cell: Just make yourself familiar with the NeuralNetwork class.\n",
1098 |     "class NeuralNetwork:\n",
1099 |     "    def __init__(self, weights_initializer, bias_initializer):\n",
1100 |     "        # list which will contain the loss after training\n",
1101 |     "        self.loss = []\n",
1102 |     "        self.data_layer = None   # the layer providing data\n",
1103 |     "        self.loss_layer = None   # the layer calculating the loss and the prediction\n",
1104 |     "        self.layers = []\n",
1105 |     "        self.weights_initializer = weights_initializer\n",
1106 |     "        self.bias_initializer = bias_initializer\n",
1107 |     "        self.label_tensor = None # the labels of the current iteration\n",
1108 |     "\n",
1109 |     "    def append_fixed_layer(self, layer):\n",
1110 |     "        \"\"\" Add a non-trainable layer to the network. \"\"\"\n",
1111 |     "        self.layers.append(layer)\n",
1112 |     "    \n",
1113 |     "    def append_trainable_layer(self, layer):\n",
1114 |     "        \"\"\" Add a new layer with trainable parameters to the network. Initialize the parameters of \n",
1115 |     "        the network using the object's initializers for weights and bias.\n",
1116 |     "        \"\"\"\n",
1117 |     "        layer.initialize(self.weights_initializer, self.bias_initializer)\n",
1118 |     "        self.layers.append(layer)\n",
1119 |     "\n",
1120 |     "    def forward(self):\n",
1121 |     "        \"\"\" Compute the forward pass through the network. \"\"\"\n",
1122 |     "        # fetch some training data\n",
1123 |     "        input_tensor, self.label_tensor = self.data_layer.forward()\n",
1124 |     "        # defer iterating through the network\n",
1125 |     "        activation_tensor = self.__forward_input(input_tensor)\n",
1126 |     "        # calculate the loss of the network using the final loss layer\n",
1127 |     "        return self.loss_layer.forward(activation_tensor, self.label_tensor)\n",
1128 |     "\n",
1129 |     "    def __forward_input(self, input_tensor):\n",
1130 |     "        \"\"\" Compute the forward pass through the network, stopping before the \n",
1131 |     "            loss layer.\n",
1132 |     "            param: input_tensor (np.ndarray): input to the network\n",
1133 |     "            returns: activation of the last \"regular\" layer\n",
1134 |     "        \"\"\"\n",
1135 |     "        activation_tensor = input_tensor\n",
1136 |     "        # pass the input up the network\n",
1137 |     "        for layer in self.layers:\n",
1138 |     "            activation_tensor = layer.forward(activation_tensor)\n",
1139 |     "        # return the activation of the last layer\n",
1140 |     "        return activation_tensor\n",
1141 |     "\n",
1142 |     "    def backward(self):\n",
1143 |     "        \"\"\" Perform the backward pass during training. \"\"\"\n",
1144 |     "        error_tensor = self.loss_layer.backward(self.label_tensor)\n",
1145 |     "        # pass back the error recursively\n",
1146 |     "        for layer in reversed(self.layers):\n",
1147 |     "            error_tensor = layer.backward(error_tensor)\n",
1148 |     "\n",
1149 |     "    def train(self, iterations):\n",
1150 |     "        \"\"\" Train the network for a fixed number of steps.\n",
1151 |     "            param: iterations (int): number of iterations for training \n",
1152 |     "        \"\"\"\n",
1153 |     "        for layer in self.layers:\n",
1154 |     "            layer.phase = Phase.train  # Make sure phase is set to \"train\" for all layers\n",
1155 |     "        for i in range(iterations):\n",
1156 |     "            loss = self.forward()  # go up the network\n",
1157 |     "            self.loss.append(loss)  # save the loss\n",
1158 |     "            self.backward()  # and down again\n",
1159 |     "            print('.', end='')\n",
1160 |     "\n",
1161 |     "\n",
1162 |     "    def test(self, input_tensor):\n",
1163 |     "        \"\"\" Apply the (trained) network to input data to generate a prediction. \n",
1164 |     "            param: input_tensor (nd.nparray): input (image or vector)\n",
1165 |     "            returns (np.ndarray): prediction by the network\n",
1166 |     "        \"\"\"\n",
1167 |     "        for layer in self.layers:\n",
1168 |     "            layer.phase = Phase.test  # Make sure phase is set to \"test\" for all layers\n",
1169 |     "        activation_tensor = self.__forward_input(input_tensor)\n",
1170 |     "        return self.loss_layer.predict(activation_tensor)"
1171 |    ]
1172 |   },
1173 |   {
1174 |    "cell_type": "code",
1175 |    "execution_count": null,
1176 |    "metadata": {
1177 |     "collapsed": true
1178 |    },
1179 |    "outputs": [],
1180 |    "source": [
1181 |     "def build():\n",
1182 |     "    # returns: a neural network architecture built according to the provided specification\n",
1183 |     "    \n",
1184 |     "    net = NeuralNetwork(He(), Const(0.1))\n",
1185 |     "    learning_rate = 0.001\n",
1186 |     "    categories = 10  # MNIST, numbers 0-9\n",
1187 |     "    \n",
1188 |     "    # TODO: Implement the architecture by adding layers to net\n",
1189 |     "\n",
1190 |     "    return net"
1191 |    ]
1192 |   },
1193 |   {
1194 |    "cell_type": "code",
1195 |    "execution_count": null,
1196 |    "metadata": {
1197 |     "collapsed": true,
1198 |     "deletable": false
1199 |    },
1200 |    "outputs": [],
1201 |    "source": [
1202 |     "import matplotlib\n",
1203 |     "import numpy as np\n",
1204 |     "import matplotlib.pyplot as plt\n",
1205 |     "\n",
1206 |     "net = build()\n",
1207 |     "\n",
1208 |     "from Tests import Helpers\n",
1209 |     "net.data_layer = Helpers.MNISTData(20)\n",
1210 |     "n_iters = 100\n",
1211 |     "net.train(n_iters)\n",
1212 |     "\n",
1213 |     "plt.plot(range(n_iters), net.loss)\n",
1214 |     "\n"
1215 |    ]
1216 |   },
1217 |   {
1218 |    "cell_type": "code",
1219 |    "execution_count": null,
1220 |    "metadata": {
1221 |     "collapsed": true
1222 |    },
1223 |    "outputs": [],
1224 |    "source": [
1225 |     "# Perform the prediction for a random test sample from the dataset:\n",
1226 |     "x, l = net.data_layer.get_random_test_sample()\n",
1227 |     "plt.imshow(x[:28*28].reshape(28, 28), cmap='gray')\n",
1228 |     "\n",
1229 |     "print(x.shape)\n",
1230 |     "print('Prediction with highest output: {}'.format(np.argmax(net.test(x))))\n",
1231 |     "print('Ground truth: {}'.format(np.argmax(l)))\n"
1232 |    ]
1233 |   },
1234 |   {
1235 |    "cell_type": "markdown",
1236 |    "metadata": {
1237 |     "deletable": false,
1238 |     "editable": false
1239 |    },
1240 |    "source": [
1241 |     "## Summary and Outlook\n",
1242 |     "In this tutorial, we implemented some of the most common building blocks of neural networks, including fully connected layers, activation functions, convolutional layers and regularization operators. Finally, we combined these operators to working network.\n",
1243 |     "\n",
1244 |     "We covered only a small subset of elements that are relevant for neural networks. We encourage you to play with other operators, for example batch normalization$^5$, alternative activation functions, initialization strategies or recurrent units. You may also refactor the framework to experiment with different optimizers, like SGD with momentum, Adam or AdaGrad, or extend the framework to allow for weight decay.\n",
1245 |     "\n",
1246 |     "We hope you enjoyed this tutorial and gained a deeper understanding of neural network operators and frameworks. Have fun on your journey further into deep learning and neural networks!\n",
1247 |     "\n",
1248 |     "$^5$ Ioffe S., Szegedy C. Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift. In Proc. ICML, 2015."
1249 |    ]
1250 |   }
1251 |  ],
1252 |  "metadata": {
1253 |   "celltoolbar": "Edit Metadata",
1254 |   "kernelspec": {
1255 |    "display_name": "Python 3",
1256 |    "language": "python",
1257 |    "name": "python3"
1258 |   },
1259 |   "language_info": {
1260 |    "codemirror_mode": {
1261 |     "name": "ipython",
1262 |     "version": 3
1263 |    },
1264 |    "file_extension": ".py",
1265 |    "mimetype": "text/x-python",
1266 |    "name": "python",
1267 |    "nbconvert_exporter": "python",
1268 |    "pygments_lexer": "ipython3",
1269 |    "version": "3.6.1"
1270 |   }
1271 |  },
1272 |  "nbformat": 4,
1273 |  "nbformat_minor": 2
1274 | }
1275 | 


--------------------------------------------------------------------------------