├── .gitignore ├── LICENSE.md ├── MyNetwork ├── README.md ├── expand_mnist.py ├── mnist.pkl.gz ├── mnist_average_darkness.py ├── mnist_loader.py ├── mnist_svm.py ├── network.py ├── network2.py ├── network3.py └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Michał Dobrzański 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Overview 2 | 3 | ### neuralnetworksanddeeplearning.com integrated scripts for Python 3.5.2 and Theano with CUDA support 4 | 5 | These scripts are updated versions of the ones from the **neuralnetworksanddeeplearning.com** GitHub repository, modified to work with Python 3.5.2. 6 | 7 | The testing file (**test.py**) contains all three networks (network.py, network2.py, network3.py) from the book and is the starting point for running (i.e. *training and evaluating*) them. 8 | 9 | ## Just type at the shell: **python3.5 test.py** 10 | 11 | test.py contains example network configurations with comments that relate each configuration to the corresponding chapter of the book. 12 | 13 | ### License 14 | Distributed under the MIT License. [Link](LICENSE.md). 15 | 16 | 17 | -------------------------------------------------------------------------------- /expand_mnist.py: -------------------------------------------------------------------------------- 1 | """expand_mnist.py 2 | ~~~~~~~~~~~~~~~~~~ 3 | 4 | Take the 50,000 MNIST training images, and create an expanded set of 5 | 250,000 images, by displacing each training image up, down, left and 6 | right, by one pixel. Save the resulting file to 7 | ../data/mnist_expanded.pkl.gz. 8 | 9 | Note that this program is memory intensive, and may not run on small 10 | systems.
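As an illustration of the displacement trick used below (a sketch only, not part of the
script's interface), shifting a single 784-pixel image vector x down by one pixel amounts to:

    image = np.reshape(x, (-1, 28))       # 784-vector -> 28x28 array
    shifted = np.roll(image, 1, axis=0)   # move every row down one pixel
    shifted[0, :] = 0                     # blank out the row that wrapped around

The loop below does this for all four directions and stores each shifted copy, paired
with the original label y, in the expanded training set.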
11 | 12 | """ 13 | 14 | from __future__ import print_function 15 | 16 | #### Libraries 17 | 18 | # Standard library 19 | import pickle 20 | import gzip 21 | import os.path 22 | import random 23 | 24 | # Third-party libraries 25 | import numpy as np 26 | 27 | print("Expanding the MNIST training set") 28 | 29 | if os.path.exists("../data/mnist_expanded.pkl.gz"): 30 | print("The expanded training set already exists. Exiting.") 31 | else: 32 | f = gzip.open("../data/mnist.pkl.gz", 'rb') 33 | training_data, validation_data, test_data = pickle.load(f, encoding="latin1") 34 | f.close() 35 | expanded_training_pairs = [] 36 | j = 0 # counter 37 | for x, y in zip(training_data[0], training_data[1]): 38 | expanded_training_pairs.append((x, y)) 39 | image = np.reshape(x, (-1, 28)) 40 | j += 1 41 | if j % 1000 == 0: print("Expanding image number", j) 42 | # iterate over data telling us the details of how to 43 | # do the displacement 44 | for d, axis, index_position, index in [ 45 | (1, 0, "first", 0), 46 | (-1, 0, "first", 27), 47 | (1, 1, "last", 0), 48 | (-1, 1, "last", 27)]: 49 | new_img = np.roll(image, d, axis) 50 | if index_position == "first": 51 | new_img[index, :] = np.zeros(28) 52 | else: 53 | new_img[:, index] = np.zeros(28) 54 | expanded_training_pairs.append((np.reshape(new_img, 784), y)) 55 | random.shuffle(expanded_training_pairs) 56 | expanded_training_data = [list(d) for d in zip(*expanded_training_pairs)] 57 | print("Saving expanded data. This may take a few minutes.") 58 | f = gzip.open("../data/mnist_expanded.pkl.gz", "wb") 59 | pickle.dump((expanded_training_data, validation_data, test_data), f) 60 | f.close() 61 | -------------------------------------------------------------------------------- /mnist.pkl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MichalDanielDobrzanski/DeepLearningPython/2da41b6dd19dafae92db1facbc4ed19738e80551/mnist.pkl.gz -------------------------------------------------------------------------------- /mnist_average_darkness.py: -------------------------------------------------------------------------------- 1 | """ 2 | mnist_average_darkness 3 | ~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | A naive classifier for recognizing handwritten digits from the MNIST 6 | data set. The program classifies digits based on how dark they are 7 | --- the idea is that digits like "1" tend to be less dark than digits 8 | like "8", simply because the latter has a more complex shape. When 9 | shown an image the classifier returns whichever digit in the training 10 | data had the closest average darkness. 11 | 12 | The program works in two steps: first it trains the classifier, and 13 | then it applies the classifier to the MNIST test data to see how many 14 | digits are correctly classified. 15 | 16 | Needless to say, this isn't a very good way of recognizing handwritten 17 | digits!
Still, it's useful to show what sort of performance we get 18 | from naive ideas.""" 19 | 20 | #### Libraries 21 | # Standard library 22 | from collections import defaultdict 23 | 24 | # My libraries 25 | import mnist_loader 26 | 27 | def main(): 28 | training_data, validation_data, test_data = mnist_loader.load_data() 29 | # training phase: compute the average darknesses for each digit, 30 | # based on the training data 31 | avgs = avg_darknesses(training_data) 32 | # testing phase: see how many of the test images are classified 33 | # correctly 34 | num_correct = sum(int(guess_digit(image, avgs) == digit) 35 | for image, digit in zip(test_data[0], test_data[1])) 36 | print("Baseline classifier using average darkness of image.") 37 | print("{0} of {1} values correct.".format(num_correct, len(test_data[1]))) 38 | 39 | def avg_darknesses(training_data): 40 | """ Return a defaultdict whose keys are the digits 0 through 9. 41 | For each digit we compute a value which is the average darkness of 42 | training images containing that digit. The darkness for any 43 | particular image is just the sum of the darknesses for each pixel.""" 44 | digit_counts = defaultdict(int) 45 | darknesses = defaultdict(float) 46 | for image, digit in zip(training_data[0], training_data[1]): 47 | digit_counts[digit] += 1 48 | darknesses[digit] += sum(image) 49 | avgs = defaultdict(float) 50 | for digit, n in digit_counts.items(): 51 | avgs[digit] = darknesses[digit] / n 52 | return avgs 53 | 54 | def guess_digit(image, avgs): 55 | """Return the digit whose average darkness in the training data is 56 | closest to the darkness of ``image``. Note that ``avgs`` is 57 | assumed to be a defaultdict whose keys are 0...9, and whose values 58 | are the corresponding average darknesses across the training data.""" 59 | darkness = sum(image) 60 | distances = {k: abs(v-darkness) for k, v in avgs.items()} 61 | return min(distances, key=distances.get) 62 | 63 | if __name__ == "__main__": 64 | main() 65 | -------------------------------------------------------------------------------- /mnist_loader.py: -------------------------------------------------------------------------------- 1 | # %load mnist_loader.py 2 | """ 3 | mnist_loader 4 | ~~~~~~~~~~~~ 5 | A library to load the MNIST image data. For details of the data 6 | structures that are returned, see the doc strings for ``load_data`` 7 | and ``load_data_wrapper``. In practice, ``load_data_wrapper`` is the 8 | function usually called by our neural network code. 9 | """ 10 | 11 | #### Libraries 12 | # Standard library 13 | import pickle 14 | import gzip 15 | 16 | # Third-party libraries 17 | import numpy as np 18 | 19 | def load_data(): 20 | """Return the MNIST data as a tuple containing the training data, 21 | the validation data, and the test data. 22 | The ``training_data`` is returned as a tuple with two entries. 23 | The first entry contains the actual training images. This is a 24 | numpy ndarray with 50,000 entries. Each entry is, in turn, a 25 | numpy ndarray with 784 values, representing the 28 * 28 = 784 26 | pixels in a single MNIST image. 27 | The second entry in the ``training_data`` tuple is a numpy ndarray 28 | containing 50,000 entries. Those entries are just the digit 29 | values (0...9) for the corresponding images contained in the first 30 | entry of the tuple. 31 | The ``validation_data`` and ``test_data`` are similar, except 32 | each contains only 10,000 images. 
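Purely as an illustration of the shapes just described (these lines are not
executed anywhere in this module):

    training_data, validation_data, test_data = load_data()
    training_data[0].shape    # (50000, 784) -- the images
    training_data[1].shape    # (50000,)     -- the digit labels
    validation_data[0].shape  # (10000, 784)
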
33 | This is a nice data format, but for use in neural networks it's 34 | helpful to modify the format of the ``training_data`` a little. 35 | That's done in the wrapper function ``load_data_wrapper()``, see 36 | below. 37 | """ 38 | f = gzip.open('mnist.pkl.gz', 'rb') 39 | training_data, validation_data, test_data = pickle.load(f, encoding="latin1") 40 | f.close() 41 | return (training_data, validation_data, test_data) 42 | 43 | def load_data_wrapper(): 44 | """Return a tuple containing ``(training_data, validation_data, 45 | test_data)``. Based on ``load_data``, but the format is more 46 | convenient for use in our implementation of neural networks. 47 | In particular, ``training_data`` is a list containing 50,000 48 | 2-tuples ``(x, y)``. ``x`` is a 784-dimensional numpy.ndarray 49 | containing the input image. ``y`` is a 10-dimensional 50 | numpy.ndarray representing the unit vector corresponding to the 51 | correct digit for ``x``. 52 | ``validation_data`` and ``test_data`` are lists containing 10,000 53 | 2-tuples ``(x, y)``. In each case, ``x`` is a 784-dimensional 54 | numpy.ndarry containing the input image, and ``y`` is the 55 | corresponding classification, i.e., the digit values (integers) 56 | corresponding to ``x``. 57 | Obviously, this means we're using slightly different formats for 58 | the training data and the validation / test data. These formats 59 | turn out to be the most convenient for use in our neural network 60 | code.""" 61 | tr_d, va_d, te_d = load_data() 62 | training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]] 63 | training_results = [vectorized_result(y) for y in tr_d[1]] 64 | training_data = zip(training_inputs, training_results) 65 | validation_inputs = [np.reshape(x, (784, 1)) for x in va_d[0]] 66 | validation_data = zip(validation_inputs, va_d[1]) 67 | test_inputs = [np.reshape(x, (784, 1)) for x in te_d[0]] 68 | test_data = zip(test_inputs, te_d[1]) 69 | return (training_data, validation_data, test_data) 70 | 71 | def vectorized_result(j): 72 | """Return a 10-dimensional unit vector with a 1.0 in the jth 73 | position and zeroes elsewhere. 
This is used to convert a digit 74 | (0...9) into a corresponding desired output from the neural 75 | network.""" 76 | e = np.zeros((10, 1)) 77 | e[j] = 1.0 78 | return e 79 | -------------------------------------------------------------------------------- /mnist_svm.py: -------------------------------------------------------------------------------- 1 | """ 2 | mnist_svm 3 | ~~~~~~~~~ 4 | 5 | A classifier program for recognizing handwritten digits from the MNIST 6 | data set, using an SVM classifier.""" 7 | 8 | #### Libraries 9 | # My libraries 10 | import mnist_loader 11 | 12 | # Third-party libraries 13 | from sklearn import svm 14 | 15 | def svm_baseline(): 16 | training_data, validation_data, test_data = mnist_loader.load_data() 17 | # train 18 | clf = svm.SVC() 19 | clf.fit(training_data[0], training_data[1]) 20 | # test 21 | predictions = [int(a) for a in clf.predict(test_data[0])] 22 | num_correct = sum(int(a == y) for a, y in zip(predictions, test_data[1])) 23 | print("Baseline classifier using an SVM.") 24 | print(str(num_correct) + " of " + str(len(test_data[1])) + " values correct.") 25 | 26 | if __name__ == "__main__": 27 | svm_baseline() 28 | 29 | -------------------------------------------------------------------------------- /network.py: -------------------------------------------------------------------------------- 1 | # %load network.py 2 | 3 | """ 4 | network.py 5 | ~~~~~~~~~~ 6 | IT WORKS 7 | 8 | A module to implement the stochastic gradient descent learning 9 | algorithm for a feedforward neural network. Gradients are calculated 10 | using backpropagation. Note that I have focused on making the code 11 | simple, easily readable, and easily modifiable. It is not optimized, 12 | and omits many desirable features. 13 | """ 14 | 15 | #### Libraries 16 | # Standard library 17 | import random 18 | 19 | # Third-party libraries 20 | import numpy as np 21 | 22 | class Network(object): 23 | 24 | def __init__(self, sizes): 25 | """The list ``sizes`` contains the number of neurons in the 26 | respective layers of the network. For example, if the list 27 | was [2, 3, 1] then it would be a three-layer network, with the 28 | first layer containing 2 neurons, the second layer 3 neurons, 29 | and the third layer 1 neuron. The biases and weights for the 30 | network are initialized randomly, using a Gaussian 31 | distribution with mean 0, and variance 1. Note that the first 32 | layer is assumed to be an input layer, and by convention we 33 | won't set any biases for those neurons, since biases are only 34 | ever used in computing the outputs from later layers.""" 35 | self.num_layers = len(sizes) 36 | self.sizes = sizes 37 | self.biases = [np.random.randn(y, 1) for y in sizes[1:]] 38 | self.weights = [np.random.randn(y, x) 39 | for x, y in zip(sizes[:-1], sizes[1:])] 40 | 41 | def feedforward(self, a): 42 | """Return the output of the network if ``a`` is input.""" 43 | for b, w in zip(self.biases, self.weights): 44 | a = sigmoid(np.dot(w, a)+b) 45 | return a 46 | 47 | def SGD(self, training_data, epochs, mini_batch_size, eta, 48 | test_data=None): 49 | """Train the neural network using mini-batch stochastic 50 | gradient descent. The ``training_data`` is a list of tuples 51 | ``(x, y)`` representing the training inputs and the desired 52 | outputs. The other non-optional parameters are 53 | self-explanatory. If ``test_data`` is provided then the 54 | network will be evaluated against the test data after each 55 | epoch, and partial progress printed out. 
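A typical call, mirroring the example in test.py (with ``training_data`` and
``test_data`` coming from mnist_loader.load_data_wrapper(); the hyper-parameters
are the book's illustrative values, not requirements):

    net = Network([784, 30, 10])
    net.SGD(training_data, 30, 10, 3.0, test_data=test_data)

trains for 30 epochs with mini-batches of size 10 and learning rate 3.0,
printing the accuracy on ``test_data`` after every epoch.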
This is useful for 56 | tracking progress, but slows things down substantially.""" 57 | 58 | training_data = list(training_data) 59 | n = len(training_data) 60 | 61 | if test_data: 62 | test_data = list(test_data) 63 | n_test = len(test_data) 64 | 65 | for j in range(epochs): 66 | random.shuffle(training_data) 67 | mini_batches = [ 68 | training_data[k:k+mini_batch_size] 69 | for k in range(0, n, mini_batch_size)] 70 | for mini_batch in mini_batches: 71 | self.update_mini_batch(mini_batch, eta) 72 | if test_data: 73 | print("Epoch {} : {} / {}".format(j,self.evaluate(test_data),n_test)) 74 | else: 75 | print("Epoch {} complete".format(j)) 76 | 77 | def update_mini_batch(self, mini_batch, eta): 78 | """Update the network's weights and biases by applying 79 | gradient descent using backpropagation to a single mini batch. 80 | The ``mini_batch`` is a list of tuples ``(x, y)``, and ``eta`` 81 | is the learning rate.""" 82 | nabla_b = [np.zeros(b.shape) for b in self.biases] 83 | nabla_w = [np.zeros(w.shape) for w in self.weights] 84 | for x, y in mini_batch: 85 | delta_nabla_b, delta_nabla_w = self.backprop(x, y) 86 | nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)] 87 | nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)] 88 | self.weights = [w-(eta/len(mini_batch))*nw 89 | for w, nw in zip(self.weights, nabla_w)] 90 | self.biases = [b-(eta/len(mini_batch))*nb 91 | for b, nb in zip(self.biases, nabla_b)] 92 | 93 | def backprop(self, x, y): 94 | """Return a tuple ``(nabla_b, nabla_w)`` representing the 95 | gradient for the cost function C_x. ``nabla_b`` and 96 | ``nabla_w`` are layer-by-layer lists of numpy arrays, similar 97 | to ``self.biases`` and ``self.weights``.""" 98 | nabla_b = [np.zeros(b.shape) for b in self.biases] 99 | nabla_w = [np.zeros(w.shape) for w in self.weights] 100 | # feedforward 101 | activation = x 102 | activations = [x] # list to store all the activations, layer by layer 103 | zs = [] # list to store all the z vectors, layer by layer 104 | for b, w in zip(self.biases, self.weights): 105 | z = np.dot(w, activation)+b 106 | zs.append(z) 107 | activation = sigmoid(z) 108 | activations.append(activation) 109 | # backward pass 110 | delta = self.cost_derivative(activations[-1], y) * \ 111 | sigmoid_prime(zs[-1]) 112 | nabla_b[-1] = delta 113 | nabla_w[-1] = np.dot(delta, activations[-2].transpose()) 114 | # Note that the variable l in the loop below is used a little 115 | # differently to the notation in Chapter 2 of the book. Here, 116 | # l = 1 means the last layer of neurons, l = 2 is the 117 | # second-last layer, and so on. It's a renumbering of the 118 | # scheme in the book, used here to take advantage of the fact 119 | # that Python can use negative indices in lists. 120 | for l in range(2, self.num_layers): 121 | z = zs[-l] 122 | sp = sigmoid_prime(z) 123 | delta = np.dot(self.weights[-l+1].transpose(), delta) * sp 124 | nabla_b[-l] = delta 125 | nabla_w[-l] = np.dot(delta, activations[-l-1].transpose()) 126 | return (nabla_b, nabla_w) 127 | 128 | def evaluate(self, test_data): 129 | """Return the number of test inputs for which the neural 130 | network outputs the correct result. 
Note that the neural 131 | network's output is assumed to be the index of whichever 132 | neuron in the final layer has the highest activation.""" 133 | test_results = [(np.argmax(self.feedforward(x)), y) 134 | for (x, y) in test_data] 135 | return sum(int(x == y) for (x, y) in test_results) 136 | 137 | def cost_derivative(self, output_activations, y): 138 | """Return the vector of partial derivatives \partial C_x / 139 | \partial a for the output activations.""" 140 | return (output_activations-y) 141 | 142 | #### Miscellaneous functions 143 | def sigmoid(z): 144 | """The sigmoid function.""" 145 | return 1.0/(1.0+np.exp(-z)) 146 | 147 | def sigmoid_prime(z): 148 | """Derivative of the sigmoid function.""" 149 | return sigmoid(z)*(1-sigmoid(z)) 150 | -------------------------------------------------------------------------------- /network2.py: -------------------------------------------------------------------------------- 1 | """network2.py 2 | ~~~~~~~~~~~~~~ 3 | 4 | An improved version of network.py, implementing the stochastic 5 | gradient descent learning algorithm for a feedforward neural network. 6 | Improvements include the addition of the cross-entropy cost function, 7 | regularization, and better initialization of network weights. Note 8 | that I have focused on making the code simple, easily readable, and 9 | easily modifiable. It is not optimized, and omits many desirable 10 | features. 11 | 12 | """ 13 | 14 | #### Libraries 15 | # Standard library 16 | import json 17 | import random 18 | import sys 19 | 20 | # Third-party libraries 21 | import numpy as np 22 | 23 | 24 | #### Define the quadratic and cross-entropy cost functions 25 | 26 | class QuadraticCost(object): 27 | 28 | @staticmethod 29 | def fn(a, y): 30 | """Return the cost associated with an output ``a`` and desired output 31 | ``y``. 32 | 33 | """ 34 | return 0.5*np.linalg.norm(a-y)**2 35 | 36 | @staticmethod 37 | def delta(z, a, y): 38 | """Return the error delta from the output layer.""" 39 | return (a-y) * sigmoid_prime(z) 40 | 41 | 42 | class CrossEntropyCost(object): 43 | 44 | @staticmethod 45 | def fn(a, y): 46 | """Return the cost associated with an output ``a`` and desired output 47 | ``y``. Note that np.nan_to_num is used to ensure numerical 48 | stability. In particular, if both ``a`` and ``y`` have a 1.0 49 | in the same slot, then the expression (1-y)*np.log(1-a) 50 | returns nan. The np.nan_to_num ensures that that is converted 51 | to the correct value (0.0). 52 | 53 | """ 54 | return np.sum(np.nan_to_num(-y*np.log(a)-(1-y)*np.log(1-a))) 55 | 56 | @staticmethod 57 | def delta(z, a, y): 58 | """Return the error delta from the output layer. Note that the 59 | parameter ``z`` is not used by the method. It is included in 60 | the method's parameters in order to make the interface 61 | consistent with the delta method for other cost classes. 62 | 63 | """ 64 | return (a-y) 65 | 66 | 67 | #### Main Network class 68 | class Network(object): 69 | 70 | def __init__(self, sizes, cost=CrossEntropyCost): 71 | """The list ``sizes`` contains the number of neurons in the respective 72 | layers of the network. For example, if the list was [2, 3, 1] 73 | then it would be a three-layer network, with the first layer 74 | containing 2 neurons, the second layer 3 neurons, and the 75 | third layer 1 neuron. The biases and weights for the network 76 | are initialized randomly, using 77 | ``self.default_weight_initializer`` (see docstring for that 78 | method). 
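For example, as in test.py (the layer sizes are illustrative):

    net = Network([784, 30, 10], cost=CrossEntropyCost)

builds a network with 784 input neurons, one hidden layer of 30 neurons, and
10 output neurons, using the cross-entropy cost.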
79 | 80 | """ 81 | self.num_layers = len(sizes) 82 | self.sizes = sizes 83 | self.default_weight_initializer() 84 | self.cost=cost 85 | 86 | def default_weight_initializer(self): 87 | """Initialize each weight using a Gaussian distribution with mean 0 88 | and standard deviation 1 over the square root of the number of 89 | weights connecting to the same neuron. Initialize the biases 90 | using a Gaussian distribution with mean 0 and standard 91 | deviation 1. 92 | 93 | Note that the first layer is assumed to be an input layer, and 94 | by convention we won't set any biases for those neurons, since 95 | biases are only ever used in computing the outputs from later 96 | layers. 97 | 98 | """ 99 | self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]] 100 | self.weights = [np.random.randn(y, x)/np.sqrt(x) 101 | for x, y in zip(self.sizes[:-1], self.sizes[1:])] 102 | 103 | def large_weight_initializer(self): 104 | """Initialize the weights using a Gaussian distribution with mean 0 105 | and standard deviation 1. Initialize the biases using a 106 | Gaussian distribution with mean 0 and standard deviation 1. 107 | 108 | Note that the first layer is assumed to be an input layer, and 109 | by convention we won't set any biases for those neurons, since 110 | biases are only ever used in computing the outputs from later 111 | layers. 112 | 113 | This weight and bias initializer uses the same approach as in 114 | Chapter 1, and is included for purposes of comparison. It 115 | will usually be better to use the default weight initializer 116 | instead. 117 | 118 | """ 119 | self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]] 120 | self.weights = [np.random.randn(y, x) 121 | for x, y in zip(self.sizes[:-1], self.sizes[1:])] 122 | 123 | def feedforward(self, a): 124 | """Return the output of the network if ``a`` is input.""" 125 | for b, w in zip(self.biases, self.weights): 126 | a = sigmoid(np.dot(w, a)+b) 127 | return a 128 | 129 | def SGD(self, training_data, epochs, mini_batch_size, eta, 130 | lmbda = 0.0, 131 | evaluation_data=None, 132 | monitor_evaluation_cost=False, 133 | monitor_evaluation_accuracy=False, 134 | monitor_training_cost=False, 135 | monitor_training_accuracy=False, 136 | early_stopping_n = 0): 137 | """Train the neural network using mini-batch stochastic gradient 138 | descent. The ``training_data`` is a list of tuples ``(x, y)`` 139 | representing the training inputs and the desired outputs. The 140 | other non-optional parameters are self-explanatory, as is the 141 | regularization parameter ``lmbda``. The method also accepts 142 | ``evaluation_data``, usually either the validation or test 143 | data. We can monitor the cost and accuracy on either the 144 | evaluation data or the training data, by setting the 145 | appropriate flags. The method returns a tuple containing four 146 | lists: the (per-epoch) costs on the evaluation data, the 147 | accuracies on the evaluation data, the costs on the training 148 | data, and the accuracies on the training data. All values are 149 | evaluated at the end of each training epoch. So, for example, 150 | if we train for 30 epochs, then the first element of the tuple 151 | will be a 30-element list containing the cost on the 152 | evaluation data at the end of each epoch. Note that the lists 153 | are empty if the corresponding flag is not set. 
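A representative call, adapted from test.py (all values are illustrative, and
``validation_data`` comes from mnist_loader.load_data_wrapper()):

    net.SGD(training_data, 30, 10, 0.1,
            lmbda=5.0,
            evaluation_data=validation_data,
            monitor_evaluation_accuracy=True)

trains for 30 epochs with mini-batch size 10, learning rate 0.1 and
regularization parameter 5.0, recording the accuracy on the evaluation data
after each epoch.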
154 | 155 | """ 156 | 157 | # early stopping functionality: 158 | best_accuracy=1 159 | 160 | training_data = list(training_data) 161 | n = len(training_data) 162 | 163 | if evaluation_data: 164 | evaluation_data = list(evaluation_data) 165 | n_data = len(evaluation_data) 166 | 167 | # early stopping functionality: 168 | best_accuracy=0 169 | no_accuracy_change=0 170 | 171 | evaluation_cost, evaluation_accuracy = [], [] 172 | training_cost, training_accuracy = [], [] 173 | for j in range(epochs): 174 | random.shuffle(training_data) 175 | mini_batches = [ 176 | training_data[k:k+mini_batch_size] 177 | for k in range(0, n, mini_batch_size)] 178 | for mini_batch in mini_batches: 179 | self.update_mini_batch( 180 | mini_batch, eta, lmbda, len(training_data)) 181 | 182 | print("Epoch %s training complete" % j) 183 | 184 | if monitor_training_cost: 185 | cost = self.total_cost(training_data, lmbda) 186 | training_cost.append(cost) 187 | print("Cost on training data: {}".format(cost)) 188 | if monitor_training_accuracy: 189 | accuracy = self.accuracy(training_data, convert=True) 190 | training_accuracy.append(accuracy) 191 | print("Accuracy on training data: {} / {}".format(accuracy, n)) 192 | if monitor_evaluation_cost: 193 | cost = self.total_cost(evaluation_data, lmbda, convert=True) 194 | evaluation_cost.append(cost) 195 | print("Cost on evaluation data: {}".format(cost)) 196 | if monitor_evaluation_accuracy: 197 | accuracy = self.accuracy(evaluation_data) 198 | evaluation_accuracy.append(accuracy) 199 | print("Accuracy on evaluation data: {} / {}".format(self.accuracy(evaluation_data), n_data)) 200 | 201 | # Early stopping: 202 | if early_stopping_n > 0: 203 | if accuracy > best_accuracy: 204 | best_accuracy = accuracy 205 | no_accuracy_change = 0 206 | #print("Early-stopping: Best so far {}".format(best_accuracy)) 207 | else: 208 | no_accuracy_change += 1 209 | 210 | if (no_accuracy_change == early_stopping_n): 211 | #print("Early-stopping: No accuracy change in last epochs: {}".format(early_stopping_n)) 212 | return evaluation_cost, evaluation_accuracy, training_cost, training_accuracy 213 | 214 | return evaluation_cost, evaluation_accuracy, \ 215 | training_cost, training_accuracy 216 | 217 | def update_mini_batch(self, mini_batch, eta, lmbda, n): 218 | """Update the network's weights and biases by applying gradient 219 | descent using backpropagation to a single mini batch. The 220 | ``mini_batch`` is a list of tuples ``(x, y)``, ``eta`` is the 221 | learning rate, ``lmbda`` is the regularization parameter, and 222 | ``n`` is the total size of the training data set. 223 | 224 | """ 225 | nabla_b = [np.zeros(b.shape) for b in self.biases] 226 | nabla_w = [np.zeros(w.shape) for w in self.weights] 227 | for x, y in mini_batch: 228 | delta_nabla_b, delta_nabla_w = self.backprop(x, y) 229 | nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)] 230 | nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)] 231 | self.weights = [(1-eta*(lmbda/n))*w-(eta/len(mini_batch))*nw 232 | for w, nw in zip(self.weights, nabla_w)] 233 | self.biases = [b-(eta/len(mini_batch))*nb 234 | for b, nb in zip(self.biases, nabla_b)] 235 | 236 | def backprop(self, x, y): 237 | """Return a tuple ``(nabla_b, nabla_w)`` representing the 238 | gradient for the cost function C_x. 
``nabla_b`` and 239 | ``nabla_w`` are layer-by-layer lists of numpy arrays, similar 240 | to ``self.biases`` and ``self.weights``.""" 241 | nabla_b = [np.zeros(b.shape) for b in self.biases] 242 | nabla_w = [np.zeros(w.shape) for w in self.weights] 243 | # feedforward 244 | activation = x 245 | activations = [x] # list to store all the activations, layer by layer 246 | zs = [] # list to store all the z vectors, layer by layer 247 | for b, w in zip(self.biases, self.weights): 248 | z = np.dot(w, activation)+b 249 | zs.append(z) 250 | activation = sigmoid(z) 251 | activations.append(activation) 252 | # backward pass 253 | delta = (self.cost).delta(zs[-1], activations[-1], y) 254 | nabla_b[-1] = delta 255 | nabla_w[-1] = np.dot(delta, activations[-2].transpose()) 256 | # Note that the variable l in the loop below is used a little 257 | # differently to the notation in Chapter 2 of the book. Here, 258 | # l = 1 means the last layer of neurons, l = 2 is the 259 | # second-last layer, and so on. It's a renumbering of the 260 | # scheme in the book, used here to take advantage of the fact 261 | # that Python can use negative indices in lists. 262 | for l in range(2, self.num_layers): 263 | z = zs[-l] 264 | sp = sigmoid_prime(z) 265 | delta = np.dot(self.weights[-l+1].transpose(), delta) * sp 266 | nabla_b[-l] = delta 267 | nabla_w[-l] = np.dot(delta, activations[-l-1].transpose()) 268 | return (nabla_b, nabla_w) 269 | 270 | def accuracy(self, data, convert=False): 271 | """Return the number of inputs in ``data`` for which the neural 272 | network outputs the correct result. The neural network's 273 | output is assumed to be the index of whichever neuron in the 274 | final layer has the highest activation. 275 | 276 | The flag ``convert`` should be set to False if the data set is 277 | validation or test data (the usual case), and to True if the 278 | data set is the training data. The need for this flag arises 279 | due to differences in the way the results ``y`` are 280 | represented in the different data sets. In particular, it 281 | flags whether we need to convert between the different 282 | representations. It may seem strange to use different 283 | representations for the different data sets. Why not use the 284 | same representation for all three data sets? It's done for 285 | efficiency reasons -- the program usually evaluates the cost 286 | on the training data and the accuracy on other data sets. 287 | These are different types of computations, and using different 288 | representations speeds things up. More details on the 289 | representations can be found in 290 | mnist_loader.load_data_wrapper. 291 | 292 | """ 293 | if convert: 294 | results = [(np.argmax(self.feedforward(x)), np.argmax(y)) 295 | for (x, y) in data] 296 | else: 297 | results = [(np.argmax(self.feedforward(x)), y) 298 | for (x, y) in data] 299 | 300 | result_accuracy = sum(int(x == y) for (x, y) in results) 301 | return result_accuracy 302 | 303 | def total_cost(self, data, lmbda, convert=False): 304 | """Return the total cost for the data set ``data``. The flag 305 | ``convert`` should be set to False if the data set is the 306 | training data (the usual case), and to True if the data set is 307 | the validation or test data. See comments on the similar (but 308 | reversed) convention for the ``accuracy`` method, above. 
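For reference, the quantity computed below is

    C = (1/n) * sum_x cost.fn(a_x, y_x) + (lmbda/(2*n)) * sum_w ||w||^2

where n = len(data), a_x is the network output for input x, and the second
sum runs over all weight matrices.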
309 | """ 310 | cost = 0.0 311 | for x, y in data: 312 | a = self.feedforward(x) 313 | if convert: y = vectorized_result(y) 314 | cost += self.cost.fn(a, y)/len(data) 315 | cost += 0.5*(lmbda/len(data))*sum(np.linalg.norm(w)**2 for w in self.weights) # '**' - to the power of. 316 | return cost 317 | 318 | def save(self, filename): 319 | """Save the neural network to the file ``filename``.""" 320 | data = {"sizes": self.sizes, 321 | "weights": [w.tolist() for w in self.weights], 322 | "biases": [b.tolist() for b in self.biases], 323 | "cost": str(self.cost.__name__)} 324 | f = open(filename, "w") 325 | json.dump(data, f) 326 | f.close() 327 | 328 | #### Loading a Network 329 | def load(filename): 330 | """Load a neural network from the file ``filename``. Returns an 331 | instance of Network. 332 | 333 | """ 334 | f = open(filename, "r") 335 | data = json.load(f) 336 | f.close() 337 | cost = getattr(sys.modules[__name__], data["cost"]) 338 | net = Network(data["sizes"], cost=cost) 339 | net.weights = [np.array(w) for w in data["weights"]] 340 | net.biases = [np.array(b) for b in data["biases"]] 341 | return net 342 | 343 | #### Miscellaneous functions 344 | def vectorized_result(j): 345 | """Return a 10-dimensional unit vector with a 1.0 in the j'th position 346 | and zeroes elsewhere. This is used to convert a digit (0...9) 347 | into a corresponding desired output from the neural network. 348 | 349 | """ 350 | e = np.zeros((10, 1)) 351 | e[j] = 1.0 352 | return e 353 | 354 | def sigmoid(z): 355 | """The sigmoid function.""" 356 | return 1.0/(1.0+np.exp(-z)) 357 | 358 | def sigmoid_prime(z): 359 | """Derivative of the sigmoid function.""" 360 | return sigmoid(z)*(1-sigmoid(z)) 361 | -------------------------------------------------------------------------------- /network3.py: -------------------------------------------------------------------------------- 1 | """network3.py 2 | ~~~~~~~~~~~~~~ 3 | 4 | A Theano-based program for training and running simple neural 5 | networks. 6 | 7 | Supports several layer types (fully connected, convolutional, max 8 | pooling, softmax), and activation functions (sigmoid, tanh, and 9 | rectified linear units, with more easily added). 10 | 11 | When run on a CPU, this program is much faster than network.py and 12 | network2.py. However, unlike network.py and network2.py it can also 13 | be run on a GPU, which makes it faster still. 14 | 15 | Because the code is based on Theano, the code is different in many 16 | ways from network.py and network2.py. However, where possible I have 17 | tried to maintain consistency with the earlier programs. In 18 | particular, the API is similar to network2.py. Note that I have 19 | focused on making the code simple, easily readable, and easily 20 | modifiable. It is not optimized, and omits many desirable features. 21 | 22 | This program incorporates ideas from the Theano documentation on 23 | convolutional neural nets (notably, 24 | http://deeplearning.net/tutorial/lenet.html ), from Misha Denil's 25 | implementation of dropout (https://github.com/mdenil/dropout ), and 26 | from Chris Olah (http://colah.github.io ). 
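A minimal usage sketch, mirroring the shallow single-hidden-layer example in
test.py (the layer sizes and hyper-parameters are illustrative only):

    training_data, validation_data, test_data = load_data_shared()
    mini_batch_size = 10
    net = Network([
        FullyConnectedLayer(n_in=784, n_out=100),
        SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size)
    net.SGD(training_data, 60, mini_batch_size, 0.1, validation_data, test_data)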
27 | 28 | """ 29 | 30 | #### Libraries 31 | # Standard library 32 | import pickle 33 | import gzip 34 | 35 | # Third-party libraries 36 | import numpy as np 37 | import theano 38 | import theano.tensor as T 39 | from theano.tensor.nnet import conv 40 | from theano.tensor.nnet import softmax 41 | from theano.tensor import shared_randomstreams 42 | from theano.tensor.signal.pool import pool_2d 43 | 44 | # Activation functions for neurons 45 | def linear(z): return z 46 | def ReLU(z): return T.maximum(0.0, z) 47 | from theano.tensor.nnet import sigmoid 48 | from theano.tensor import tanh 49 | 50 | 51 | #### Constants 52 | GPU = True 53 | if GPU: 54 | print("Trying to run under a GPU. If this is not desired, then modify "+\ 55 | "network3.py\nto set the GPU flag to False.") 56 | try: theano.config.device = 'gpu' 57 | except: pass # it's already set 58 | theano.config.floatX = 'float32' 59 | else: 60 | print("Running with a CPU. If this is not desired, then the modify "+\ 61 | "network3.py to set\nthe GPU flag to True.") 62 | 63 | #### Load the MNIST data 64 | def load_data_shared(filename="mnist.pkl.gz"): 65 | f = gzip.open(filename, 'rb') 66 | training_data, validation_data, test_data = pickle.load(f, encoding="latin1") 67 | f.close() 68 | def shared(data): 69 | """Place the data into shared variables. This allows Theano to copy 70 | the data to the GPU, if one is available. 71 | 72 | """ 73 | shared_x = theano.shared( 74 | np.asarray(data[0], dtype=theano.config.floatX), borrow=True) 75 | shared_y = theano.shared( 76 | np.asarray(data[1], dtype=theano.config.floatX), borrow=True) 77 | return shared_x, T.cast(shared_y, "int32") 78 | return [shared(training_data), shared(validation_data), shared(test_data)] 79 | 80 | #### Main class used to construct and train networks 81 | class Network(object): 82 | 83 | def __init__(self, layers, mini_batch_size): 84 | """Takes a list of `layers`, describing the network architecture, and 85 | a value for the `mini_batch_size` to be used during training 86 | by stochastic gradient descent. 87 | 88 | """ 89 | self.layers = layers 90 | self.mini_batch_size = mini_batch_size 91 | self.params = [param for layer in self.layers for param in layer.params] 92 | self.x = T.matrix("x") 93 | self.y = T.ivector("y") 94 | init_layer = self.layers[0] 95 | init_layer.set_inpt(self.x, self.x, self.mini_batch_size) 96 | for j in range(1, len(self.layers)): # xrange() was renamed to range() in Python 3. 
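# Wire up the remaining layers: each layer's set_inpt receives both the
# previous layer's plain output and its dropout output, so the dropout and
# no-dropout computation graphs are built side by side.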
97 | prev_layer, layer = self.layers[j-1], self.layers[j] 98 | layer.set_inpt( 99 | prev_layer.output, prev_layer.output_dropout, self.mini_batch_size) 100 | self.output = self.layers[-1].output 101 | self.output_dropout = self.layers[-1].output_dropout 102 | 103 | def SGD(self, training_data, epochs, mini_batch_size, eta, 104 | validation_data, test_data, lmbda=0.0): 105 | """Train the network using mini-batch stochastic gradient descent.""" 106 | training_x, training_y = training_data 107 | validation_x, validation_y = validation_data 108 | test_x, test_y = test_data 109 | 110 | # compute number of minibatches for training, validation and testing 111 | num_training_batches = int(size(training_data)/mini_batch_size) 112 | num_validation_batches = int(size(validation_data)/mini_batch_size) 113 | num_test_batches = int(size(test_data)/mini_batch_size) 114 | 115 | # define the (regularized) cost function, symbolic gradients, and updates 116 | l2_norm_squared = sum([(layer.w**2).sum() for layer in self.layers]) 117 | cost = self.layers[-1].cost(self)+\ 118 | 0.5*lmbda*l2_norm_squared/num_training_batches 119 | grads = T.grad(cost, self.params) 120 | updates = [(param, param-eta*grad) 121 | for param, grad in zip(self.params, grads)] 122 | 123 | # define functions to train a mini-batch, and to compute the 124 | # accuracy in validation and test mini-batches. 125 | i = T.lscalar() # mini-batch index 126 | train_mb = theano.function( 127 | [i], cost, updates=updates, 128 | givens={ 129 | self.x: 130 | training_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size], 131 | self.y: 132 | training_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size] 133 | }) 134 | validate_mb_accuracy = theano.function( 135 | [i], self.layers[-1].accuracy(self.y), 136 | givens={ 137 | self.x: 138 | validation_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size], 139 | self.y: 140 | validation_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size] 141 | }) 142 | test_mb_accuracy = theano.function( 143 | [i], self.layers[-1].accuracy(self.y), 144 | givens={ 145 | self.x: 146 | test_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size], 147 | self.y: 148 | test_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size] 149 | }) 150 | self.test_mb_predictions = theano.function( 151 | [i], self.layers[-1].y_out, 152 | givens={ 153 | self.x: 154 | test_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size] 155 | }) 156 | # Do the actual training 157 | best_validation_accuracy = 0.0 158 | for epoch in range(epochs): 159 | for minibatch_index in range(num_training_batches): 160 | iteration = num_training_batches*epoch+minibatch_index 161 | if iteration % 1000 == 0: 162 | print("Training mini-batch number {0}".format(iteration)) 163 | cost_ij = train_mb(minibatch_index) 164 | if (iteration+1) % num_training_batches == 0: 165 | validation_accuracy = np.mean( 166 | [validate_mb_accuracy(j) for j in range(num_validation_batches)]) 167 | print("Epoch {0}: validation accuracy {1:.2%}".format( 168 | epoch, validation_accuracy)) 169 | if validation_accuracy >= best_validation_accuracy: 170 | print("This is the best validation accuracy to date.") 171 | best_validation_accuracy = validation_accuracy 172 | best_iteration = iteration 173 | if test_data: 174 | test_accuracy = np.mean( 175 | [test_mb_accuracy(j) for j in range(num_test_batches)]) 176 | print('The corresponding test accuracy is {0:.2%}'.format( 177 | test_accuracy)) 178 | print("Finished training network.") 179 | print("Best validation accuracy of {0:.2%} obtained at iteration 
{1}".format( 180 | best_validation_accuracy, best_iteration)) 181 | print("Corresponding test accuracy of {0:.2%}".format(test_accuracy)) 182 | 183 | #### Define layer types 184 | 185 | class ConvPoolLayer(object): 186 | """Used to create a combination of a convolutional and a max-pooling 187 | layer. A more sophisticated implementation would separate the 188 | two, but for our purposes we'll always use them together, and it 189 | simplifies the code, so it makes sense to combine them. 190 | 191 | """ 192 | 193 | def __init__(self, filter_shape, image_shape, poolsize=(2, 2), 194 | activation_fn=sigmoid): 195 | """`filter_shape` is a tuple of length 4, whose entries are the number 196 | of filters, the number of input feature maps, the filter height, and the 197 | filter width. 198 | 199 | `image_shape` is a tuple of length 4, whose entries are the 200 | mini-batch size, the number of input feature maps, the image 201 | height, and the image width. 202 | 203 | `poolsize` is a tuple of length 2, whose entries are the y and 204 | x pooling sizes. 205 | 206 | """ 207 | self.filter_shape = filter_shape 208 | self.image_shape = image_shape 209 | self.poolsize = poolsize 210 | self.activation_fn=activation_fn 211 | # initialize weights and biases 212 | n_out = (filter_shape[0]*np.prod(filter_shape[2:])/np.prod(poolsize)) 213 | self.w = theano.shared( 214 | np.asarray( 215 | np.random.normal(loc=0, scale=np.sqrt(1.0/n_out), size=filter_shape), 216 | dtype=theano.config.floatX), 217 | borrow=True) 218 | self.b = theano.shared( 219 | np.asarray( 220 | np.random.normal(loc=0, scale=1.0, size=(filter_shape[0],)), 221 | dtype=theano.config.floatX), 222 | borrow=True) 223 | self.params = [self.w, self.b] 224 | 225 | def set_inpt(self, inpt, inpt_dropout, mini_batch_size): 226 | self.inpt = inpt.reshape(self.image_shape) 227 | conv_out = conv.conv2d( 228 | input=self.inpt, filters=self.w, filter_shape=self.filter_shape, 229 | image_shape=self.image_shape) 230 | pooled_out = pool_2d( 231 | input=conv_out, ws=self.poolsize, ignore_border=True) 232 | self.output = self.activation_fn( 233 | pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) 234 | self.output_dropout = self.output # no dropout in the convolutional layers 235 | 236 | class FullyConnectedLayer(object): 237 | 238 | def __init__(self, n_in, n_out, activation_fn=sigmoid, p_dropout=0.0): 239 | self.n_in = n_in 240 | self.n_out = n_out 241 | self.activation_fn = activation_fn 242 | self.p_dropout = p_dropout 243 | # Initialize weights and biases 244 | self.w = theano.shared( 245 | np.asarray( 246 | np.random.normal( 247 | loc=0.0, scale=np.sqrt(1.0/n_out), size=(n_in, n_out)), 248 | dtype=theano.config.floatX), 249 | name='w', borrow=True) 250 | self.b = theano.shared( 251 | np.asarray(np.random.normal(loc=0.0, scale=1.0, size=(n_out,)), 252 | dtype=theano.config.floatX), 253 | name='b', borrow=True) 254 | self.params = [self.w, self.b] 255 | 256 | def set_inpt(self, inpt, inpt_dropout, mini_batch_size): 257 | self.inpt = inpt.reshape((mini_batch_size, self.n_in)) 258 | self.output = self.activation_fn( 259 | (1-self.p_dropout)*T.dot(self.inpt, self.w) + self.b) 260 | self.y_out = T.argmax(self.output, axis=1) 261 | self.inpt_dropout = dropout_layer( 262 | inpt_dropout.reshape((mini_batch_size, self.n_in)), self.p_dropout) 263 | self.output_dropout = self.activation_fn( 264 | T.dot(self.inpt_dropout, self.w) + self.b) 265 | 266 | def accuracy(self, y): 267 | "Return the accuracy for the mini-batch." 
268 | return T.mean(T.eq(y, self.y_out)) 269 | 270 | class SoftmaxLayer(object): 271 | 272 | def __init__(self, n_in, n_out, p_dropout=0.0): 273 | self.n_in = n_in 274 | self.n_out = n_out 275 | self.p_dropout = p_dropout 276 | # Initialize weights and biases 277 | self.w = theano.shared( 278 | np.zeros((n_in, n_out), dtype=theano.config.floatX), 279 | name='w', borrow=True) 280 | self.b = theano.shared( 281 | np.zeros((n_out,), dtype=theano.config.floatX), 282 | name='b', borrow=True) 283 | self.params = [self.w, self.b] 284 | 285 | def set_inpt(self, inpt, inpt_dropout, mini_batch_size): 286 | self.inpt = inpt.reshape((mini_batch_size, self.n_in)) 287 | self.output = softmax((1-self.p_dropout)*T.dot(self.inpt, self.w) + self.b) 288 | self.y_out = T.argmax(self.output, axis=1) 289 | self.inpt_dropout = dropout_layer( 290 | inpt_dropout.reshape((mini_batch_size, self.n_in)), self.p_dropout) 291 | self.output_dropout = softmax(T.dot(self.inpt_dropout, self.w) + self.b) 292 | 293 | def cost(self, net): 294 | "Return the log-likelihood cost." 295 | return -T.mean(T.log(self.output_dropout)[T.arange(net.y.shape[0]), net.y]) 296 | 297 | def accuracy(self, y): 298 | "Return the accuracy for the mini-batch." 299 | return T.mean(T.eq(y, self.y_out)) 300 | 301 | 302 | #### Miscellanea 303 | def size(data): 304 | "Return the size of the dataset `data`." 305 | return data[0].get_value(borrow=True).shape[0] 306 | 307 | def dropout_layer(layer, p_dropout): 308 | srng = shared_randomstreams.RandomStreams( 309 | np.random.RandomState(0).randint(999999)) 310 | mask = srng.binomial(n=1, p=1-p_dropout, size=layer.shape) 311 | return layer*T.cast(mask, theano.config.floatX) 312 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing code for different neural network configurations. 3 | Adapted for Python 3.5.2 4 | 5 | Usage in shell: 6 | python3.5 test.py 7 | 8 | Network (network.py and network2.py) parameters: 9 | 2nd param is epochs count 10 | 3rd param is batch size 11 | 4th param is learning rate (eta) 12 | 13 | Author: 14 | Michał Dobrzański, 2016 15 | dobrzanski.michal.daniel@gmail.com 16 | """ 17 | 18 | # ---------------------- 19 | # - read the input data: 20 | ''' 21 | import mnist_loader 22 | training_data, validation_data, test_data = mnist_loader.load_data_wrapper() 23 | training_data = list(training_data) 24 | ''' 25 | # --------------------- 26 | # - network.py example: 27 | #import network 28 | 29 | ''' 30 | net = network.Network([784, 30, 10]) 31 | net.SGD(training_data, 30, 10, 3.0, test_data=test_data) 32 | ''' 33 | 34 | # ---------------------- 35 | # - network2.py example: 36 | #import network2 37 | 38 | ''' 39 | net = network2.Network([784, 30, 10], cost=network2.CrossEntropyCost) 40 | #net.large_weight_initializer() 41 | net.SGD(training_data, 30, 10, 0.1, lmbda = 5.0,evaluation_data=validation_data, 42 | monitor_evaluation_accuracy=True) 43 | ''' 44 | 45 | # chapter 3 - Overfitting example - too many epochs of learning applied on small (1k samples) amount od data. 46 | # Overfitting is treating noise as a signal. 
47 | ''' 48 | net = network2.Network([784, 30, 10], cost=network2.CrossEntropyCost) 49 | net.large_weight_initializer() 50 | net.SGD(training_data[:1000], 400, 10, 0.5, evaluation_data=test_data, 51 | monitor_evaluation_accuracy=True, 52 | monitor_training_cost=True) 53 | ''' 54 | 55 | # chapter 3 - Regularization (weight decay) example 1 (only 1000 of training data and 30 hidden neurons) 56 | ''' 57 | net = network2.Network([784, 30, 10], cost=network2.CrossEntropyCost) 58 | net.large_weight_initializer() 59 | net.SGD(training_data[:1000], 400, 10, 0.5, 60 | evaluation_data=test_data, 61 | lmbda = 0.1, # this is a regularization parameter 62 | monitor_evaluation_cost=True, 63 | monitor_evaluation_accuracy=True, 64 | monitor_training_cost=True, 65 | monitor_training_accuracy=True) 66 | ''' 67 | 68 | # chapter 3 - Early stopping implemented 69 | ''' 70 | net = network2.Network([784, 30, 10], cost=network2.CrossEntropyCost) 71 | net.SGD(training_data[:1000], 30, 10, 0.5, 72 | lmbda=5.0, 73 | evaluation_data=validation_data, 74 | monitor_evaluation_accuracy=True, 75 | monitor_training_cost=True, 76 | early_stopping_n=10) 77 | ''' 78 | 79 | # chapter 4 - The vanishing gradient problem - deep networks are hard to train with simple SGD algorithm 80 | # this network learns much slower than a shallow one. 81 | ''' 82 | net = network2.Network([784, 30, 30, 30, 30, 10], cost=network2.CrossEntropyCost) 83 | net.SGD(training_data, 30, 10, 0.1, 84 | lmbda=5.0, 85 | evaluation_data=validation_data, 86 | monitor_evaluation_accuracy=True) 87 | ''' 88 | 89 | 90 | # ---------------------- 91 | # Theano and CUDA 92 | # ---------------------- 93 | 94 | """ 95 | This deep network uses Theano with GPU acceleration support. 96 | I am using Ubuntu 16.04 with CUDA 7.5. 97 | Tutorial: 98 | http://deeplearning.net/software/theano/install_ubuntu.html#install-ubuntu 99 | 100 | The following command will update only Theano: 101 | sudo pip install --upgrade --no-deps theano 102 | 103 | The following command will update Theano and Numpy/Scipy (warning bellow): 104 | sudo pip install --upgrade theano 105 | 106 | """ 107 | 108 | """ 109 | Below, there is a testing function to check whether your computations have been made on CPU or GPU. 110 | If the result is 'Used the cpu' and you want to have it in gpu, do the following: 111 | 1) install theano: 112 | sudo python3.5 -m pip install Theano 113 | 2) download and install the latest cuda: 114 | https://developer.nvidia.com/cuda-downloads 115 | I had some issues with that, so I followed this idea (better option is to download the 1,1GB package as .run file): 116 | http://askubuntu.com/questions/760242/how-can-i-force-16-04-to-add-a-repository-even-if-it-isnt-considered-secure-eno 117 | You may also want to grab the proper NVidia driver, choose it form there: 118 | System Settings > Software & Updates > Additional Drivers. 
119 | 3) should work, run it with: 120 | THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python3.5 test.py 121 | http://deeplearning.net/software/theano/tutorial/using_gpu.html 122 | 4) Optionally, you can add cuDNN support from: 123 | https://developer.nvidia.com/cudnn 124 | 125 | 126 | """ 127 | def testTheano(): 128 | from theano import function, config, shared, sandbox 129 | import theano.tensor as T 130 | import numpy 131 | import time 132 | print("Testing Theano library...") 133 | vlen = 10 * 30 * 768 # 10 x #cores x # threads per core 134 | iters = 1000 135 | 136 | rng = numpy.random.RandomState(22) 137 | x = shared(numpy.asarray(rng.rand(vlen), config.floatX)) 138 | f = function([], T.exp(x)) 139 | print(f.maker.fgraph.toposort()) 140 | t0 = time.time() 141 | for i in range(iters): 142 | r = f() 143 | t1 = time.time() 144 | print("Looping %d times took %f seconds" % (iters, t1 - t0)) 145 | print("Result is %s" % (r,)) 146 | if numpy.any([isinstance(x.op, T.Elemwise) for x in f.maker.fgraph.toposort()]): 147 | print('Used the cpu') 148 | else: 149 | print('Used the gpu') 150 | # Perform check: 151 | #testTheano() 152 | 153 | 154 | # ---------------------- 155 | # - network3.py example: 156 | import network3 157 | from network3 import Network, ConvPoolLayer, FullyConnectedLayer, SoftmaxLayer # softmax plus log-likelihood cost is more common in modern image classification networks. 158 | 159 | # read data: 160 | training_data, validation_data, test_data = network3.load_data_shared() 161 | # mini-batch size: 162 | mini_batch_size = 10 163 | 164 | # chapter 6 - shallow architecture using just a single hidden layer, containing 100 hidden neurons. 165 | ''' 166 | net = Network([ 167 | FullyConnectedLayer(n_in=784, n_out=100), 168 | SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size) 169 | net.SGD(training_data, 60, mini_batch_size, 0.1, validation_data, test_data) 170 | ''' 171 | 172 | # chapter 6 - 5x5 local receptive fields, 20 feature maps, max-pooling layer 2x2 173 | ''' 174 | net = Network([ 175 | ConvPoolLayer(image_shape=(mini_batch_size, 1, 28, 28), 176 | filter_shape=(20, 1, 5, 5), 177 | poolsize=(2, 2)), 178 | FullyConnectedLayer(n_in=20*12*12, n_out=100), 179 | SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size) 180 | net.SGD(training_data, 60, mini_batch_size, 0.1, validation_data, test_data) 181 | ''' 182 | 183 | # chapter 6 - inserting a second convolutional-pooling layer to the previous example => better accuracy 184 | ''' 185 | net = Network([ 186 | ConvPoolLayer(image_shape=(mini_batch_size, 1, 28, 28), 187 | filter_shape=(20, 1, 5, 5), 188 | poolsize=(2, 2)), 189 | ConvPoolLayer(image_shape=(mini_batch_size, 20, 12, 12), 190 | filter_shape=(40, 20, 5, 5), 191 | poolsize=(2, 2)), 192 | FullyConnectedLayer(n_in=40*4*4, n_out=100), 193 | SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size) 194 | net.SGD(training_data, 60, mini_batch_size, 0.1, validation_data, test_data) 195 | ''' 196 | 197 | # chapter 6 - rectified linear units and some l2 regularization (lmbda=0.1) => even better accuracy 198 | from network3 import ReLU 199 | net = Network([ 200 | ConvPoolLayer(image_shape=(mini_batch_size, 1, 28, 28), 201 | filter_shape=(20, 1, 5, 5), 202 | poolsize=(2, 2), 203 | activation_fn=ReLU), 204 | ConvPoolLayer(image_shape=(mini_batch_size, 20, 12, 12), 205 | filter_shape=(40, 20, 5, 5), 206 | poolsize=(2, 2), 207 | activation_fn=ReLU), 208 | FullyConnectedLayer(n_in=40*4*4, n_out=100, activation_fn=ReLU), 209 | SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size) 210 | 
net.SGD(training_data, 60, mini_batch_size, 0.03, validation_data, test_data, lmbda=0.1) 211 | --------------------------------------------------------------------------------