├── .gitignore
├── README.md
├── __init__.py
├── assignment1
│   ├── __init__.py
│   ├── assignment1.py
│   └── data
│       ├── dataset1.mat
│       ├── dataset2.mat
│       ├── dataset3.mat
│       └── dataset4.mat
├── assignment2
│   ├── __init__.py
│   ├── assignment2.py
│   └── data
│       └── data.mat
├── assignment3
│   ├── __init__.py
│   ├── assignment3.py
│   └── data
│       └── data.mat
├── assignment4
│   ├── __init__.py
│   ├── assignment4.py
│   └── data
│       ├── a4_randomness_source.mat
│       └── data_set.mat
└── utility
    ├── __init__.py
    └── utils.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Python
2 | *.pyc
3 | 
4 | # Matlab files
5 | *.m
6 | 
7 | # Virtual Environment
8 | *.env
9 | **/venv
10 | .venv
11 | 
12 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm
13 | ## Directory-based project format
14 | .idea/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Project Description
2 | 
3 | Assignments for Geoffrey Hinton's Neural Networks for Machine Learning course on Coursera, translated from Matlab into Python.
4 | 
5 | * assignments 2-4 are quite different from what is presented in the course, as they were refactored into
6 |   classifier classes (adapted from the sklearn estimator framework).
7 | * more work could certainly be done to remove redundancy between assignments, especially between 3 and 4.
8 | * the course can be found here: https://www.coursera.org/course/neuralnets
9 | 
10 | ## Assignment 1
11 | * Implements a linear Perceptron for a two-class problem.
12 | 
13 | ## Assignment 2
14 | * Implements a basic framework for training neural nets with mini-batch gradient descent for a language model.
15 | * Assignment covers a hyperparameter search (number of training epochs, embedding and hidden layer size,
16 |   training momentum), evaluated through average cross-entropy error.
17 | 
18 | ## Assignment 3
19 | * Trains a simple feedforward neural network with backpropagation.
20 | * Assignment looks into efficient optimization and effective regularization.
21 | * Recognizes USPS handwritten digits.
22 | 
23 | ## Assignment 4
24 | * Trains a feedforward neural network with pretraining using Restricted Boltzmann Machines (RBMs).
25 | * The RBM is used as the visible-to-hidden layer in a network exactly like the one made in programming assignment 3.
26 | * The RBM is trained using the Contrastive Divergence gradient estimator with 1 full Gibbs update, a.k.a. CD-1.
27 | * Recognizes USPS handwritten digits.
28 | 
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hankcs/coursera-neural-net/f6b2e1985ff5e0f163f07be97cde108c7a02f7d8/__init__.py
--------------------------------------------------------------------------------
/assignment1/__init__.py:
--------------------------------------------------------------------------------
1 | from assignment1 import *
2 | 
--------------------------------------------------------------------------------
/assignment1/assignment1.py:
--------------------------------------------------------------------------------
1 | """Implements Assignment 1 for Geoffrey Hinton's Neural Networks Course offered through Coursera.
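
A usage sketch (it mirrors the __main__ block at the bottom of this file; it assumes the repository root is on
the Python path and that the script is run from the assignment1/ directory so that data/dataset3.mat resolves):

    import scipy.io
    from assignment1.assignment1 import learn_perceptron

    data = scipy.io.loadmat('data/dataset3.mat')
    w = learn_perceptron(data['neg_examples_nobias'], data['pos_examples_nobias'],
                         data['w_init'], data['w_gen_feas'])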
2 | 
3 | * Implements linear Perceptron for two class problem
4 | """
5 | import numpy as np
6 | import matplotlib.pyplot as plt
7 | 
8 | __all__ = ['learn_perceptron',
9 |            'update_weights',
10 |            'eval_perceptron',
11 |            'plot_perceptron']
12 | 
13 | 
14 | def learn_perceptron(neg_examples_nobias, pos_examples_nobias, w_init, w_gen_feas,
15 |                      pause=False):
16 |     """Learns the weights of a perceptron for a 2-dimensional dataset and plots
17 |     the perceptron at each iteration where an iteration is defined as one
18 |     full pass through the data. If a generously feasible weight vector
19 |     is provided then the visualization will also show the distance
20 |     of the learned weight vectors to the generously feasible weight vector.
21 | 
22 |     Args:
23 |         neg_examples_nobias (numpy.array) : The num_neg_examples x 2 matrix for the examples with target 0.
24 |             num_neg_examples is the number of examples for the negative class.
25 |         pos_examples_nobias (numpy.array) : The num_pos_examples x 2 matrix for the examples with target 1.
26 |             num_pos_examples is the number of examples for the positive class.
27 |         w_init (numpy.array) : A 3-dimensional initial weight vector. The last element is the bias.
28 |         w_gen_feas (numpy.array) : A generously feasible weight vector.
29 |         pause (bool) : Pause between iterations.
30 |     Returns:
31 |         numpy.array : The learned weight vector.
32 |     """
33 |     num_err_history = []
34 |     w_dist_history = []
35 | 
36 |     # add column vector of ones for bias term
37 |     neg_examples = np.hstack((neg_examples_nobias, np.ones((len(neg_examples_nobias), 1))))
38 |     pos_examples = np.hstack((pos_examples_nobias, np.ones((len(pos_examples_nobias), 1))))
39 | 
40 |     if np.size(w_init):
41 |         w = w_init
42 |     else:
43 |         w = np.random.rand(3, 1)
44 | 
45 |     if not np.size(w_gen_feas):
46 |         w_gen_feas = []
47 | 
48 |     # Find the data points that the perceptron has incorrectly classified
49 |     # and record the number of errors it makes.
50 |     iter_ = 0
51 |     mistakes0, mistakes1 = eval_perceptron(neg_examples, pos_examples, w)
52 |     num_errs = len(mistakes0) + len(mistakes1)
53 |     num_err_history.append(num_errs)
54 |     print "Number of errors in iteration {0}:\t{1}".format(iter_, num_errs)
55 |     print "Weights:", w
56 |     plot_perceptron(neg_examples, pos_examples, mistakes0, mistakes1, num_err_history,
57 |                     w, w_dist_history)
58 | 
59 |     # If a generously feasible weight vector exists, record the distance
60 |     # to it from the initial weight vector
61 |     if len(w_gen_feas) != 0:
62 |         w_dist_history.append(np.linalg.norm(w - w_gen_feas))
63 | 
64 |     while num_errs > 0:
65 |         iter_ = iter_ + 1
66 | 
67 |         # Update weights of perceptron
68 |         w = update_weights(neg_examples, pos_examples, w)
69 | 
70 |         # If a generously feasible weight vector exists, record the distance
71 |         # to it from the current weight vector
72 |         if len(w_gen_feas) != 0:
73 |             w_dist_history.append(np.linalg.norm(w - w_gen_feas))
74 | 
75 |         # Find the data points that the perceptron has incorrectly classified
76 |         # and record the number of errors it makes.
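        # Note: this loop only stops once num_errs reaches 0. The perceptron convergence theorem guarantees
        # that happens when the data are linearly separable (e.g. when a generously feasible weight vector
        # exists); on a non-separable dataset the loop would run forever.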
77 | mistakes0, mistakes1 = eval_perceptron(neg_examples, pos_examples, w) 78 | num_errs = len(mistakes0) + len(mistakes1) 79 | num_err_history.append(num_errs) 80 | print "Number of erros in iteration {0}:\t{1}".format(iter_, num_errs) 81 | print "Weights:", w 82 | 83 | plot_perceptron(neg_examples, pos_examples, mistakes0, mistakes1, num_err_history, 84 | w, w_dist_history) 85 | if pause: 86 | while True: 87 | try: 88 | ans = input("Continue?") 89 | if ans == 1 or ans == 'y': 90 | break 91 | if ans == 0 or ans == 'n': 92 | return w 93 | except (ValueError, NameError): 94 | print("Sorry, I didn't understand that.") 95 | continue 96 | return w 97 | 98 | 99 | def update_weights(neg_examples, pos_examples, w_current): 100 | """Updates the weights of the perceptron for incorrectly classified points 101 | using the perceptron update algorithm. This function makes one sweep over 102 | the dataset. 103 | 104 | Args: 105 | neg_examples (numpy.array) : The num_neg_examples x 3 matrix for the examples with target 0. 106 | num_neg_examples is the number of examples for the negative class. 107 | pos_examples (numpy.array) : The num_pos_examples x 3 matrix for the examples with target 1. 108 | num_pos_examples is the number of examples for the positive class. 109 | w_current (numpy.array) : A 3-dimensional weight vector, the last element is the bias. 110 | Returns: 111 | (numpy.array) : The weight vector after one pass through the dataset using the perceptron 112 | learning rule. 113 | """ 114 | w = w_current 115 | for sample in neg_examples: 116 | assert len(np.shape(sample)) == 1 and np.shape(w)[1] == 1 117 | activation = np.dot(sample, w)[0] 118 | if activation >= 0: 119 | w += np.column_stack(sample).T * (0.0 - activation) 120 | for sample in pos_examples: 121 | assert len(np.shape(sample)) == 1 and np.shape(w)[1] == 1 122 | activation = np.dot(sample, w)[0] 123 | if activation < 0: 124 | w += np.column_stack(sample).T * (1.0 - activation) 125 | return w 126 | 127 | 128 | def eval_perceptron(neg_examples, pos_examples, w): 129 | """Evaluates the perceptron using a given weight vector. Here, evaluation 130 | refers to finding the data points that the perceptron incorrectly classifies. 131 | 132 | Args: 133 | neg_examples (numpy.array) : The num_neg_examples x 3 matrix for the examples with target 0. 134 | num_neg_examples is the number of examples for the negative class. 135 | pos_examples (numpy.array) : The num_pos_examples x 3 matrix for the examples with target 1. 136 | num_pos_examples is the number of examples for the positive class. 137 | w (numpy.array) : A 3-dimensional weight vector, the last element is the bias. 138 | Returns: 139 | (tuple) : 140 | mistakes0 : A vector containing the indices of the negative examples that have been 141 | incorrectly classified as positive. 142 | mistakes1 : A vector containing the indices of the positive examples that have been 143 | incorrectly classified as negative. 144 | """ 145 | mistakes0 = [i for i, sample in enumerate(neg_examples) if np.dot(sample, w)[0] >= 0] 146 | mistakes1 = [i for i, sample in enumerate(pos_examples) if np.dot(sample, w)[0] < 0] 147 | return mistakes0, mistakes1 148 | 149 | 150 | def plot_perceptron(neg_examples, pos_examples, mistakes0, mistakes1, 151 | num_err_history, w, w_dist_history): 152 | """The top-left plot shows the dataset and the classification boundary given by 153 | the weights of the perceptron. The negative examples are shown as circles 154 | while the positive examples are shown as squares. 
If an example is colored 155 | green then it means that the example has been correctly classified by the 156 | provided weights. If it is colored red then it has been incorrectly classified. 157 | The top-right plot shows the number of mistakes the perceptron algorithm has 158 | made in each iteration so far. 159 | 160 | The bottom-left plot shows the distance to some generously feasible weight 161 | vector if one has been provided (note, there can be an infinite number of these). 162 | Points that the classifier has made a mistake on are shown in red, 163 | while points that are correctly classified are shown in green. 164 | 165 | The goal is for all of the points to be green (if it is possible to do so). 166 | 167 | Args: 168 | neg_examples : The num_neg_examples x 3 matrix for the examples with target 0. 169 | num_neg_examples is the number of examples for the negative class. 170 | pos_examples : The num_pos_examples x 3 matrix for the examples with target 1. 171 | num_pos_examples is the number of examples for the positive class. 172 | mistakes0 : A vector containing the indices of the datapoints from class 0 incorrectly 173 | classified by the perceptron. This is a subset of neg_examples. 174 | mistakes1 : A vector containing the indices of the datapoints from class 1 incorrectly 175 | classified by the perceptron. This is a subset of pos_examples. 176 | num_err_history : A vector containing the number of mistakes for each 177 | iteration of learning so far. 178 | w : A 3-dimensional vector corresponding to the current weights of the 179 | perceptron. The last element is the bias. 180 | w_dist_history : A vector containing the L2-distance to a generously 181 | feasible weight vector for each iteration of learning so far. 182 | Empty if one has not been provided. 183 | """ 184 | f = plt.figure(1) 185 | 186 | neg_correct_ind = np.setdiff1d(range(len(neg_examples)), mistakes0) 187 | pos_correct_ind = np.setdiff1d(range(len(pos_examples)), mistakes1) 188 | assert all(m_idx not in set(neg_correct_ind) for m_idx in mistakes0) and \ 189 | all(m_idx not in set(pos_correct_ind) for m_idx in mistakes1) 190 | 191 | plt.subplot(2, 2, 1) 192 | plt.hold(True) 193 | if np.size(neg_examples): 194 | plt.plot(neg_examples[neg_correct_ind][:, 0], neg_examples[neg_correct_ind][:, 1], 'og', markersize=10) 195 | if np.size(pos_examples): 196 | plt.plot(pos_examples[pos_correct_ind][:, 0], pos_examples[pos_correct_ind][:, 1], 'sg', markersize=10) 197 | 198 | if len(mistakes0): 199 | plt.plot(neg_examples[mistakes0][:, 0], neg_examples[mistakes0][:, 1], 'or', markersize=10) 200 | if len(mistakes1): 201 | plt.plot(pos_examples[mistakes1][:, 0], pos_examples[mistakes1][:, 1], 'sr', markersize=10) 202 | 203 | plt.title('Perceptron Classifier') 204 | # In order to plot the decision line, we just need to get two points. 
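    # The boundary is the set of points where w[0]*x + w[1]*y + w[-1] = 0 (w[-1] is the bias), i.e.
    # y = (-w[-1] - w[0]*x) / w[1]; evaluating at x = -5 and x = 5 gives the two endpoints plotted below.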
205 | plt.plot([-5, 5], [(-w[-1] + 5 * w[0]) / w[1], (-w[-1] - 5 * w[0]) / w[1]], 'k') 206 | plt.xlim([-1, 4]) 207 | plt.ylim([-2, 2]) 208 | plt.hold(False) 209 | 210 | plt.subplot(2, 2, 2) 211 | plt.plot(range(len(num_err_history)), num_err_history) 212 | plt.xlim([-1, max(15, len(num_err_history))]) 213 | plt.ylim([0, len(neg_examples) + len(pos_examples) + 1]) 214 | plt.title('Number of errors') 215 | plt.xlabel('Iteration') 216 | plt.ylabel('Number of errors') 217 | 218 | plt.subplot(2, 2, 3) 219 | plt.plot(range(len(w_dist_history)), w_dist_history) 220 | plt.xlim([-1, max(15, len(num_err_history))]) 221 | plt.ylim([0, 15]) 222 | plt.title('Distance') 223 | plt.xlabel('Iteration') 224 | plt.ylabel('Distance') 225 | plt.show() 226 | 227 | 228 | if __name__ == "__main__": 229 | import matplotlib.pylab as pylab 230 | 231 | pylab.rcParams['figure.figsize'] = 12, 8 232 | 233 | import scipy.io 234 | import os 235 | import matplotlib.pyplot as plt 236 | 237 | data_path = os.path.join(os.getcwd(), 'data/') 238 | files = ['dataset%d' % i for i in range(1, 5)] 239 | 240 | dataset_file = os.path.join(data_path, files[2]) 241 | data = scipy.io.loadmat(dataset_file) 242 | 243 | w = learn_perceptron(data['neg_examples_nobias'], 244 | data['pos_examples_nobias'], 245 | data['w_init'], 246 | data['w_gen_feas']) 247 | -------------------------------------------------------------------------------- /assignment1/data/dataset1.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hankcs/coursera-neural-net/f6b2e1985ff5e0f163f07be97cde108c7a02f7d8/assignment1/data/dataset1.mat -------------------------------------------------------------------------------- /assignment1/data/dataset2.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hankcs/coursera-neural-net/f6b2e1985ff5e0f163f07be97cde108c7a02f7d8/assignment1/data/dataset2.mat -------------------------------------------------------------------------------- /assignment1/data/dataset3.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hankcs/coursera-neural-net/f6b2e1985ff5e0f163f07be97cde108c7a02f7d8/assignment1/data/dataset3.mat -------------------------------------------------------------------------------- /assignment1/data/dataset4.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hankcs/coursera-neural-net/f6b2e1985ff5e0f163f07be97cde108c7a02f7d8/assignment1/data/dataset4.mat -------------------------------------------------------------------------------- /assignment2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hankcs/coursera-neural-net/f6b2e1985ff5e0f163f07be97cde108c7a02f7d8/assignment2/__init__.py -------------------------------------------------------------------------------- /assignment2/assignment2.py: -------------------------------------------------------------------------------- 1 | """Implements Assignment 2 for Geoffrey Hinton's Neural Networks Course offered through Coursera. 2 | 3 | * Implements a basic framework for training neural nets with mini-batch gradient descent for a language model. 4 | * Assignment covers hyperparameter search and observations through average cross entropy error. 5 | * i.e. 
number of training epochs, embedding and hidden layer size, training momentum 6 | 7 | Abstracts classifiers developed in the course into, a more pythonic Sklearn framework. And cleans up a lot of the 8 | given code. 9 | """ 10 | import os 11 | import time 12 | 13 | import matplotlib.pylab as pylab 14 | import numpy as np 15 | from sklearn.base import BaseEstimator 16 | 17 | from utility.utils import zip_safe, loadmat 18 | 19 | __all__ = ['EvaluateCrossEntropy', 20 | 'NeuralNet', 21 | 'load_data', 22 | 'display_nearest_words', 23 | 'word_distance', 24 | 'A2Run' 25 | ] 26 | 27 | 28 | def load_data(data, batch_size=100): 29 | """This method loads the training, validation and test set. It also divides the training set into mini-batches. 30 | 31 | Notes: 32 | ----- 33 | * Subtract 1 from each index in `input` and `target` to fix matlab to python indexing 34 | 35 | Args: 36 | data (dict) : From mat file. 37 | batch_size (int) : Mini-batch size. 38 | 39 | Returns: 40 | dict: With keys `train`, `valid`, `test`, `vocab` 41 | train_input (numpy.array) : An array of size d X n X m, where 42 | d: number of input dimensions (in this case, 3). 43 | n: size of each mini-batch (in this case, 100). 44 | m: number of minibatches. 45 | train_target (numpy.array) : An array of size 1 X n X m. 46 | valid_input (numpy.array) : An array of size D X number of points in the validation set. 47 | test (numpy.array) : An array of size D X number of points in the test set. 48 | vocab (numpy.array) : Vocabulary containing index to word mapping. 49 | """ 50 | d = np.size(data['trainData'], 0) - 1 51 | m = int(np.size(data['trainData'], axis=1) / batch_size) 52 | 53 | sequences = {key: dict() for key in ['train', 'valid', 'test']} 54 | 55 | sequences['train']['input'] = np.reshape(data['trainData'][:d, :batch_size * m], (d, batch_size, m)) - 1 56 | sequences['train']['target'] = np.reshape(data['trainData'][d, :batch_size * m], (1, batch_size, m)) - 1 57 | sequences['valid']['input'] = data['validData'][:d, :] - 1 58 | sequences['valid']['target'] = data['validData'][d, :] - 1 59 | sequences['test']['input'] = data['testData'][:d, :] - 1 60 | sequences['test']['target'] = data['testData'][d, :] - 1 61 | sequences['vocab'] = data['vocab'] 62 | 63 | return sequences 64 | 65 | 66 | class NeuralNet(BaseEstimator): 67 | """Implements assignment 2 of Neural Networks for Machine Learning (Coursera) for Learning word representations. 68 | """ 69 | 70 | def __init__(self, 71 | epochs=1, 72 | learning_rate=0.1, 73 | momentum=0.9, 74 | numhid1=50, 75 | numhid2=200, 76 | init_wt=0.01, 77 | validation_ce_after=1000, 78 | vocab_size=None, 79 | num_words=None): 80 | """Initialize NeuralNet instance with training and visualization params. 81 | 82 | Args: 83 | epochs (int) : Number of epochs to run. 84 | learning_rate (float) : Learning rate. 85 | momentum (float) : Momentum default. 86 | numhid1 (int) : Dimensionality of embedding space. 87 | numhid2 (int) : Number of units in hidden layer. 88 | init_wt (float) : Standard deviation of the normal distribution which is sampled to 89 | get the initial weights 90 | validation_ce_after (int) : Show cross-entropy calculation after specified samples during validation 91 | vocab_size (int) : Length of vocabulary in dataset. 92 | num_words (int) : Num words used in each training sample (given from dataset). 
93 | In the assignment case, there's 3 94 | """ 95 | assert vocab_size and num_words 96 | 97 | # Set Hyper params 98 | self.epochs = epochs 99 | self.vocab_size = vocab_size 100 | self.learning_rate = learning_rate 101 | self.momentum = momentum 102 | self.numhid1 = numhid1 103 | self.numhid2 = numhid2 104 | self.init_wt = init_wt 105 | self.show_validation_ce_after = validation_ce_after 106 | 107 | # INITIALIZE WEIGHTS AND BIASES 108 | self.word_embedding_weights = None 109 | self.embed_to_hid_weights = None 110 | self.hid_to_output_weights = None 111 | self.hid_bias = None 112 | self.output_bias = None 113 | 114 | self.word_embedding_weights_delta = None 115 | self.embed_to_hid_weights_delta = None 116 | self.hid_to_output_weights_delta = None 117 | self.hid_bias_delta = None 118 | self.output_bias_delta = None 119 | self.reset_classifier(vocab_size, num_words) 120 | 121 | # Initialize evaluation params 122 | self.tiny = np.exp(-30) 123 | self.batch_iteration = 0 # this is count in Matlab code 124 | self.trainset_ce = 0.0 125 | 126 | def reset_classifier(self, vocab_size, num_words): 127 | """Resets state of the classifier given vocab_size and num_words in dataset. 128 | """ 129 | self.word_embedding_weights = self.init_wt * np.random.rand(vocab_size, self.numhid1) 130 | self.embed_to_hid_weights = self.init_wt * np.random.rand(num_words * self.numhid1, self.numhid2) 131 | self.hid_to_output_weights = self.init_wt * np.random.rand(self.numhid2, vocab_size) 132 | self.hid_bias = np.zeros((self.numhid2, 1)) 133 | self.output_bias = np.zeros((vocab_size, 1)) 134 | 135 | self.word_embedding_weights_delta = np.zeros((vocab_size, self.numhid1)) 136 | self.embed_to_hid_weights_delta = np.zeros((num_words * self.numhid1, self.numhid2)) 137 | self.hid_to_output_weights_delta = np.zeros((self.numhid2, vocab_size)) 138 | self.hid_bias_delta = np.zeros((self.numhid2, 1)) 139 | self.output_bias_delta = np.zeros((vocab_size, 1)) 140 | 141 | def fit(self, X, y): 142 | """Fit model given matrix X and target y. 143 | 144 | Args: 145 | X (numpy.ndarray) : input matrix 146 | y (numpy.ndarray) : target matrix 147 | 148 | Returns: 149 | self (model) contains: 150 | word_embedding_weights 151 | embed_to_hid_weights 152 | hid_to_output_weights 153 | hid_bias 154 | output_bias 155 | """ 156 | numwords, batch_size = np.shape(X) 157 | # FORWARD PROPAGATE. 158 | # Compute the state of each layer in the network given the input batch 159 | # and all weights and biases 160 | embedding_layer_state, hidden_layer_state, output_layer_state = self.fprop(X) 161 | assert all([all(row == False) for row in np.isnan(output_layer_state)]) 162 | # COMPUTE DERIVATIVE. 163 | # Expand the target to a sparse 1-of-K vector. 164 | expanded_y = np.eye(self.vocab_size)[:, y] 165 | # Compute derivative of cross-entropy loss function. 166 | error_deriv = output_layer_state - expanded_y 167 | 168 | # MEASURE LOSS FUNCTION. 169 | ce = -sum(sum(np.multiply(expanded_y, 170 | np.log(output_layer_state + self.tiny)))) / float(batch_size) 171 | self.trainset_ce += (ce - self.trainset_ce) / float(self.batch_iteration) 172 | 173 | # BACK PROPAGATE. 174 | # OUTPUT LAYER. 175 | hid_to_output_weights_gradient = np.dot(hidden_layer_state, error_deriv.T) 176 | output_bias_gradient = np.column_stack(np.sum(error_deriv, axis=1)).T 177 | 178 | back_propagated_deriv_1 = np.multiply(np.multiply(np.dot(self.hid_to_output_weights, error_deriv), 179 | hidden_layer_state), (1 - hidden_layer_state)) 180 | 181 | # HIDDEN LAYER. 
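        # back_propagated_deriv_1 above is the loss derivative w.r.t. the hidden layer's total input: the output
        # error is pushed back through hid_to_output_weights and scaled by the logistic derivative h * (1 - h).
        # The hidden-layer gradients below then follow from the chain rule.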
182 | embed_to_hid_weights_gradient = np.dot(embedding_layer_state, back_propagated_deriv_1.T) 183 | assert (self.numhid1 * numwords, self.numhid2) == embed_to_hid_weights_gradient.shape 184 | hid_bias_gradient = np.column_stack(np.sum(back_propagated_deriv_1, axis=1)).T 185 | assert (self.numhid2, 1) == hid_bias_gradient.shape 186 | back_propagated_deriv_2 = np.dot(self.embed_to_hid_weights, back_propagated_deriv_1) 187 | assert back_propagated_deriv_2.shape == (numwords * self.numhid1, batch_size) 188 | 189 | word_embedding_weights_gradient = np.zeros((self.vocab_size, self.numhid1)) 190 | # EMBEDDING LAYER. 191 | for w in xrange(1, numwords): 192 | word_embedding_weights_gradient += np.dot(np.eye(self.vocab_size)[:, X[w, :]], 193 | back_propagated_deriv_2[ 194 | (w - 1) * self.numhid1: w * self.numhid1, :].T) 195 | self.__update_weights_and_biases(batch_size, word_embedding_weights_gradient, 196 | embed_to_hid_weights_gradient, hid_to_output_weights_gradient, 197 | hid_bias_gradient, output_bias_gradient) 198 | return self 199 | 200 | def __update_weights_and_biases(self, 201 | batch_size, 202 | word_embedding_weights_gradient, 203 | embed_to_hid_weights_gradient, 204 | hid_to_output_weights_gradient, 205 | hid_bias_gradient, 206 | output_bias_gradient): 207 | """Update weights and biases 208 | """ 209 | self.word_embedding_weights_delta = self.momentum * self.word_embedding_weights_delta + \ 210 | word_embedding_weights_gradient / float(batch_size) 211 | self.word_embedding_weights -= self.learning_rate * self.word_embedding_weights_delta 212 | 213 | self.embed_to_hid_weights_delta = self.momentum * self.embed_to_hid_weights_delta + \ 214 | embed_to_hid_weights_gradient / float(batch_size) 215 | self.embed_to_hid_weights -= self.learning_rate * self.embed_to_hid_weights_delta 216 | 217 | self.hid_to_output_weights_delta = self.momentum * self.hid_to_output_weights_delta + \ 218 | hid_to_output_weights_gradient / float(batch_size) 219 | self.hid_to_output_weights -= self.learning_rate * self.hid_to_output_weights_delta 220 | 221 | self.hid_bias_delta = self.momentum * self.hid_bias_delta + hid_bias_gradient / float(batch_size) 222 | self.hid_bias -= self.learning_rate * self.hid_bias_delta 223 | 224 | self.output_bias_delta = self.momentum * self.output_bias_delta + output_bias_gradient / float(batch_size) 225 | self.output_bias -= self.learning_rate * self.output_bias_delta 226 | 227 | def train(self, sequences): 228 | """This function trains a neural network language model and validates as well. (These should be split up) 229 | 230 | Args: 231 | sequences (dict) : input data 232 | 233 | Returns: 234 | struct: contains the learned weights and biases and vocabulary. 235 | """ 236 | self.reset_classifier(vocab_size=len(sequences['vocab']), num_words=len(sequences['train']['input'])) 237 | for epoch in xrange(1, self.epochs + 1): 238 | print 'Epoch %d\n' % epoch 239 | self.trainset_ce = 0.0 240 | # LOOP OVER MINI-BATCHES. 241 | for m, (input_batch, target_batch) in enumerate(zip_safe(sequences['train']['input'].T, 242 | sequences['train']['target'].T)): 243 | self.batch_iteration += 1 244 | target_batch = target_batch.flatten() 245 | self.fit(input_batch.T, target_batch) 246 | 247 | # VALIDATE. 248 | if self.show_validation_ce_after and (m + 1) % self.show_validation_ce_after == 0: 249 | print '\rRunning validation ... 
Validation CE after %d : %.3f' % \ 250 | (m + 1, EvaluateCrossEntropy(self).compute_ce(sequences['valid'], vocab_size=self.vocab_size)) 251 | print '\rAverage Training CE : %.3f' % self.trainset_ce 252 | print 'Final Training CE : %.3f' % self.trainset_ce 253 | 254 | def fprop(self, input_batch): 255 | """This method forward propagates through a neural network. 256 | 257 | Args: 258 | input_batch (numpy.ndarray) : The input data as a matrix of size numwords X batchsize where, 259 | * numwords is the number of words. 260 | * batchsize is the number of data points. 261 | So, if input_batch(i, j) = k then the ith word in data point j is word index k of the vocabulary. 262 | 263 | Returns: 264 | tuple : 265 | embedding_layer_state (numpy.ndarray) : State of units in the embedding layer as a matrix of 266 | size numhid1*numwords X batchsize 267 | hidden_layer_state (numpy.ndarray) : State of units in the hidden layer as a matrix of 268 | size numhid2 X batchsize 269 | output_layer_state (numpy.ndarray) : State of units in the output layer as a matrix of size 270 | vocab_size X batchsize 271 | """ 272 | 273 | numwords, batch_size = np.shape(input_batch) 274 | vocab_size, numhid1 = np.shape(self.word_embedding_weights) 275 | numhid2 = np.size(self.embed_to_hid_weights, axis=1) 276 | 277 | # COMPUTE STATE OF WORD EMBEDDING LAYER. 278 | # Look up the inputs word indices in the word_embedding_weights matrix. 279 | embedding_layer_state = np.reshape(self.word_embedding_weights[input_batch.flatten()].T, 280 | (numhid1 * numwords, -1)) 281 | # COMPUTE STATE OF HIDDEN LAYER. 282 | # Compute inputs to hidden units. 283 | inputs_to_hidden_units = np.dot(self.embed_to_hid_weights.T, embedding_layer_state) + np.tile(self.hid_bias, 284 | (1, batch_size)) 285 | # Apply logistic activation function. 286 | hidden_layer_state = 1.0 / (1.0 + np.exp(-inputs_to_hidden_units)) 287 | assert hidden_layer_state.shape == (numhid2, batch_size) 288 | 289 | # COMPUTE STATE OF OUTPUT LAYER. 290 | # Compute inputs to softmax. 291 | inputs_to_softmax = np.dot(self.hid_to_output_weights.T, hidden_layer_state) + \ 292 | np.tile(self.output_bias, (1, batch_size)) 293 | assert inputs_to_softmax.shape == (vocab_size, batch_size) 294 | 295 | # Subtract maximum. 296 | inputs_to_softmax -= np.tile(np.max(inputs_to_softmax), (vocab_size, 1)) 297 | 298 | # Compute exp. 299 | output_layer_state = np.exp(inputs_to_softmax) 300 | sum_output = np.sum(output_layer_state, axis=0) 301 | # correct for min float -- Matlab didn't have this problem (it must assume this instead of outputting 0.0) 302 | sum_output[np.where(sum_output == 0.0)] = np.finfo(float).min 303 | # Normalize to get probability distribution. 304 | output_layer_state = np.divide(output_layer_state, np.tile(sum_output, (vocab_size, 1))) 305 | 306 | return embedding_layer_state, hidden_layer_state, output_layer_state 307 | 308 | def predict_next_word(self, sentence, vocab, k): 309 | """Predicts the next word. 310 | Example usage: 311 | predict_next_word('john', 'might', 'be', 3) 312 | predict_next_word('life', 'in', 'new', 3) 313 | 314 | Args: 315 | sentence (iterable) : 3 word iterable containing 316 | word1 (str) : The first word as a string. 317 | word2 (str) : The second word as a string. 318 | word3 (str) : The third word as a string. 319 | vocab (numpy.array) : vocabulary in model 320 | k (int) : The k most probable predictions are shown. 
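
        Note:
            In this Python translation `sentence` is an iterable of three words and `vocab` is passed
            explicitly, e.g. predict_next_word(('john', 'might', 'be'), vocab, 3).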
321 | """ 322 | input_ = np.array([np.where(vocab == word)[0] if np.where(vocab == word)[0] else [None] for word in sentence]) 323 | for i, vocab_idx in enumerate(input_): 324 | if not vocab_idx: 325 | print 'Word %s not in vocabulary.\n' % sentence[i] 326 | return 327 | 328 | _, _, output_layer_state = self.fprop(input_) 329 | 330 | prob = np.sort(output_layer_state, axis=0)[::-1] 331 | indices = np.argsort(output_layer_state, axis=0)[::-1] 332 | for i in xrange(0, k): 333 | # noinspection PyStringFormat 334 | print '"%s %s %s %s" -- [Prob: %.5f]' % (sentence + (vocab[indices[i]][-1], prob[i])) 335 | 336 | 337 | class EvaluateCrossEntropy(object): 338 | """Computes cross entropy given classifier model. 339 | """ 340 | 341 | def __init__(self, estimator): 342 | """Initialize EvaluateCrossEntropy instance. 343 | """ 344 | self.estimator = estimator 345 | 346 | def run_evaluation(self, sequences): 347 | # EVALUATE ON VALIDATION SET. 348 | print 'Running validation ... Final Validation CE : %.3f' % \ 349 | self.compute_ce(sequences['valid'], vocab_size=len(sequences['vocab'])) 350 | print 'Running test ... Final Test CE : %.3f' % \ 351 | self.compute_ce(sequences['test'], vocab_size=len(sequences['vocab'])) 352 | 353 | def compute_ce(self, data, vocab_size): 354 | """Compute Cross-Entropy 355 | 356 | Args: 357 | data (dict) : Contains `input` and `target` keys each containing numpy.array 358 | vocab_size (int): Number of words in vocabulary. 359 | 360 | Returns: 361 | float : Cross-Entropy 362 | """ 363 | embedding_layer_state, hidden_layer_state, output_layer_state = self.estimator.fprop(data['input']) 364 | datasetsize = np.size(data['input'], 1) 365 | expanded_target = np.eye(vocab_size)[:, data['target']] 366 | return -sum(sum(np.multiply(expanded_target, np.log(output_layer_state + np.exp(-30))))) / float(datasetsize) 367 | 368 | 369 | def word_distance(word1, word2, model, vocab): 370 | """Shows the L2 distance between word1 and word2 in the word_embedding_weights. 371 | 372 | Example: 373 | ----- 374 | word_distance('school', 'university', model, vocab) 375 | 376 | Args: 377 | word1 (str) : The first word as a string. 378 | word2 (str) : The second word as a string. 379 | model (NeuralNet) : Model returned by estimator 380 | vocab (numpy.array) : vocabulary in model 381 | 382 | Return: 383 | distance 384 | """ 385 | words = (word1, word2) 386 | idxs = np.array([np.where(vocab == word)[0][0] if np.where(vocab == word)[0] else None for word in words]) 387 | for i, vocab_idx in enumerate(idxs): 388 | if not vocab_idx: 389 | print 'Word %s not in vocabulary.\n' % words[i] 390 | return 391 | diff = model.word_embedding_weights[idxs[0], :] - model.word_embedding_weights[idxs[1], :] 392 | return np.sqrt(sum(np.multiply(diff, diff))) 393 | 394 | 395 | def display_nearest_words(word, model, k, vocab): 396 | """Shows the k-nearest words to the query word. 397 | Example: 398 | ----- 399 | display_nearest_words('school', model, 10) 400 | 401 | Args: 402 | word (str) : The query word as a string. 403 | model (NeuralNet) : Model returned by estimator 404 | k (int) : The number of nearest words to display. 405 | vocab (numpy.array) : vocabulary in model 406 | """ 407 | idx = np.where(vocab == word)[0] 408 | if not idx: 409 | print 'Word %s not in vocabulary.\n' % word 410 | return 411 | 412 | # Compute distance to every other word. 
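    # Distances are Euclidean (L2) norms in the learned embedding space: the query word's embedding row is
    # tiled to the vocabulary size and subtracted from every row of word_embedding_weights.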
413 | word_rep = model.word_embedding_weights[idx][-1] 414 | diff = model.word_embedding_weights - np.tile(word_rep, (len(vocab), 1)) 415 | distance = np.sqrt(np.sum(np.multiply(diff, diff), axis=1)) 416 | 417 | # Sort by distance. 418 | order = np.argsort(distance) 419 | order = order[1: k + 1] # The nearest word is the query word itself, skip that. 420 | for i in xrange(k): 421 | print 'Word\t: %s \nDistance: %.2f\n' % (vocab[order[i]], distance[order[i]]) 422 | 423 | 424 | class A2Run(object): 425 | """Runs assignment 2. 426 | """ 427 | 428 | def __init__(self): 429 | """Initialize data set and all test cases for assignment. 430 | """ 431 | data = loadmat(os.path.join(os.getcwd(), 'data/data.mat')) 432 | self.data_sets = data['data'] 433 | self.classifier = None 434 | 435 | def run_evaluation(self, **estimator_params): 436 | """Runs 4-gram Neural Network evaluation. 437 | 438 | Args: 439 | estimator_params (dict) : Contains parameters for NN. See NeuralNet(..) 440 | """ 441 | start_time = time.time() 442 | sequences = load_data(self.data_sets, batch_size=100) 443 | self.classifier = NeuralNet(vocab_size=len(sequences['vocab']), 444 | num_words=len(sequences['train']['input']), 445 | **estimator_params) 446 | self.classifier.train(sequences) 447 | print 'Training took %.2f seconds\n', start_time - time.time() 448 | EvaluateCrossEntropy(self.classifier).run_evaluation(sequences) 449 | 450 | def a2_main(self, epochs=1, learning_rate=.10, momentum=0.9, numhid1=50, numhid2=200, init_wt=0.01, 451 | validation_ce_after=1000): 452 | """Runs training and computes error and loss of training, testing, and validation training sets. 453 | 454 | Args: 455 | wd_coeff (float) : weight decay coefficient 456 | n_hid (int) : number of hidden units 457 | n_iterations (int) : number of training iterations 458 | lr_net (float) : learning rate for neural net classifier 459 | train_momentum (float) : momentum used in training 460 | early_stopping (bool) : saves model at validation error minimum 461 | mini_batch_size (int) : size of training batches 462 | """ 463 | self.run_evaluation(epochs=epochs, 464 | learning_rate=learning_rate, 465 | momentum=momentum, 466 | numhid1=numhid1, 467 | numhid2=numhid2, 468 | init_wt=init_wt, 469 | validation_ce_after=validation_ce_after) 470 | 471 | 472 | # coding: utf-8 473 | 474 | # epochs:10 475 | # - learning_rate: 476 | # 0.001: 477 | # Validation CE: 4.379 478 | # 0.1: 479 | # Validation CE: 2.625 480 | # 10.0: 481 | # Validation CE: 4.584 482 | # - Model A: 5 dimensional embedding, 100 dimensional hidden layer: 483 | # Training CE: 2.980 484 | # - Model B: 50 dimensional embedding, 10 dimensional hidden layer: 485 | # Training CE 3.035 486 | # - Model C: 50 dimensional embedding, 200 dimensional hidden layer: 487 | # Training CE 2.559 488 | # - Model D: 100 dimensional embedding, 5 dimensional hidden layer: 489 | # Training CE 3.272 490 | if __name__ == "__main__": 491 | pylab.rcParams['figure.figsize'] = 12, 8 492 | a2 = A2Run() 493 | classifier_params = dict(epochs=1, 494 | learning_rate=.10, 495 | momentum=0.9, 496 | numhid1=50, 497 | numhid2=200, 498 | init_wt=0.01, 499 | validation_ce_after=1000) 500 | a2.a2_main(**classifier_params) 501 | test_words = ['you', 'were', 'in', 'china'] 502 | a2.classifier.predict_next_word((test_words[0], test_words[1], test_words[2]), a2.data_sets['vocab'], 5) 503 | display_nearest_words(test_words[1], a2.classifier, 10, a2.data_sets['vocab']) 504 | word_distance(test_words[0], test_words[1], a2.classifier, a2.data_sets['vocab']) 505 
| word_distance('percent', 'dr.', a2.classifier, a2.data_sets['vocab']) 506 | -------------------------------------------------------------------------------- /assignment2/data/data.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hankcs/coursera-neural-net/f6b2e1985ff5e0f163f07be97cde108c7a02f7d8/assignment2/data/data.mat -------------------------------------------------------------------------------- /assignment3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hankcs/coursera-neural-net/f6b2e1985ff5e0f163f07be97cde108c7a02f7d8/assignment3/__init__.py -------------------------------------------------------------------------------- /assignment3/assignment3.py: -------------------------------------------------------------------------------- 1 | """Implements Assignment 3 for Geoffrey Hinton's Neural Networks Course offered through Coursera. 2 | 3 | * Trains a simple Feedforward Neural Network with Backpropogation, for recognizing USPS handwritten digits. 4 | * Assignment looks into efficient optimization, and into effective regularization. 5 | * Recognizes USPS handwritten digits. 6 | 7 | Abstracts classifiers developed in the course into, a more pythonic Sklearn framework. And cleans up a lot of the 8 | given code. 9 | """ 10 | import copy 11 | import os 12 | 13 | import numpy as np 14 | import matplotlib.pyplot as plt 15 | from numpy.testing import assert_array_equal 16 | from utility.utils import loadmat, logistic, log_sum_exp_over_rows 17 | 18 | NUM_INPUT_UNITS = 256 19 | NUM_CLASSES = 10 20 | 21 | __all__ = ['A3Run'] 22 | 23 | 24 | class FFNeuralNet: 25 | """Implements Feedforward Neural Network from Assignment 3 trained with Backpropagation. 26 | """ 27 | 28 | def __init__(self, 29 | training_iters, 30 | validation_data, 31 | wd_coeff=None, 32 | lr_net=0.02, 33 | n_hid=300, 34 | n_classes=10, 35 | n_input_units=256, 36 | train_momentum=0.9, 37 | mini_batch_size=100, 38 | early_stopping=False): 39 | """Initialize neural network. 40 | 41 | Args: 42 | training_iters (int) : number of training iterations 43 | validation_data (dict) : contains 'inputs' and 'targets' data matrices 44 | wd_coeff (float) : weight decay coefficient 45 | lr_net (float) : learning rate for neural net classifier 46 | n_hid (int) : number of hidden units 47 | n_classes (int) : number of classes 48 | train_momentum (float) : momentum used in training 49 | mini_batch_size (int) : size of training batches 50 | early_stopping (bool) : saves model at validation error minimum 51 | """ 52 | self.n_classes = n_classes 53 | self.wd_coeff = wd_coeff 54 | self.batch_size = mini_batch_size 55 | self.lr_net = lr_net 56 | self.n_iterations = training_iters 57 | self.train_momentum = train_momentum 58 | self.early_stopping = early_stopping 59 | self.validation_data = validation_data # used for early stopping 60 | 61 | # model result params 62 | self.training_data_losses = [] 63 | self.validation_data_losses = [] 64 | 65 | # Model params 66 | # We don't use random initialization, for this assignment. This way, everybody will get the same results. 
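        # Instead, theta is filled deterministically with 0.1 * cos(0 .. n_params - 1) and unpacked into the
        # inputToHid / hidToClass weight matrices, so every run starts from identical parameters.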
67 | self.n_params = (n_input_units + n_classes) * n_hid 68 | theta = np.transpose(np.column_stack(np.cos(range(self.n_params)))) * 0.1 if self.n_params else np.array([]) 69 | self.model = self.theta_to_model(theta) 70 | self.theta = self.model_to_theta(self.model) 71 | assert_array_equal(theta.flatten(), self.theta) 72 | self.momentum_speed = self.theta * 0.0 73 | 74 | def reset_classifier(self): 75 | """Resets the model parameters. 76 | """ 77 | theta = np.transpose(np.column_stack(np.cos(range(self.n_params)))) * 0.1 if self.n_params else np.array([]) 78 | self.model = self.theta_to_model(theta) 79 | self.theta = self.model_to_theta(self.model) 80 | self.momentum_speed = self.theta * 0.0 81 | 82 | @staticmethod 83 | def model_to_theta(model): 84 | """Takes a model (or gradient in model form), and turns it into one long vector. See also theta_to_model.""" 85 | model_copy = copy.deepcopy(model) 86 | return np.hstack((model_copy['inputToHid'].flatten(), model_copy['hidToClass'].flatten())) 87 | 88 | @staticmethod 89 | def theta_to_model(theta): 90 | """Takes a model (or gradient) in the form of one long vector (maybe produced by model_to_theta), 91 | and restores it to the structure format, i.e. with fields .input_to_hid and .hid_to_class, both matrices. 92 | """ 93 | n_hid = np.size(theta, 0) / (NUM_INPUT_UNITS + NUM_CLASSES) 94 | return {'inputToHid': np.reshape(theta[:NUM_INPUT_UNITS * n_hid], (n_hid, NUM_INPUT_UNITS)), 95 | 'hidToClass': np.reshape(theta[NUM_INPUT_UNITS * n_hid: np.size(theta, 0)], (NUM_CLASSES, n_hid))} 96 | 97 | def fit(self, X, y): 98 | """Fit a model using Classification gradient descent. 99 | """ 100 | self._d_loss_by_d_model(inputs=X, targets=y) 101 | return self 102 | 103 | def train(self, sequences): 104 | """Implements optimize(..) from assignment. This trains using gradient descent with momentum. 105 | 106 | Args: 107 | model_shape (tuple) : is the shape of the array of weights. 108 | gradient_function : a function that takes parameters and and returns the gradient 109 | (or approximate gradient in the case of CD-1) of the function that we're maximizing. 110 | Note the contrast with the loss function that we saw in PA3, which we were minimizing. 111 | The returned gradient is an array of the same shape as the provided parameter. 
112 | 113 | Returns: 114 | (numpy.array) : matrix of weights of the trained model (hid_to_class) 115 | """ 116 | self.reset_classifier() 117 | if self.early_stopping: 118 | best_so_far = dict() 119 | best_so_far['theta'] = None 120 | best_so_far['validationLoss'] = np.inf 121 | best_so_far['afterNIters'] = None 122 | 123 | n_training_cases = np.size(sequences['inputs'], 1) 124 | for i in xrange(self.n_iterations): 125 | training_batch_start = (i * self.batch_size) % n_training_cases 126 | training_batch_x = sequences['inputs'][:, training_batch_start: training_batch_start + self.batch_size] 127 | training_batch_y = sequences['targets'][:, training_batch_start: training_batch_start + self.batch_size] 128 | 129 | self.fit(training_batch_x, training_batch_y) 130 | self.momentum_speed = self.momentum_speed * self.train_momentum - self.gradient 131 | self.theta += self.momentum_speed * self.lr_net 132 | self.model = self.theta_to_model(self.theta) 133 | 134 | self.training_data_losses += [self.loss(sequences)] 135 | self.validation_data_losses += [self.loss(self.validation_data)] 136 | if self.early_stopping and self.validation_data_losses[-1] < best_so_far['validationLoss']: 137 | best_so_far['theta'] = copy.deepcopy(self.theta) # deepcopy avoids memory reference bug 138 | best_so_far['validationLoss'] = self.validation_data_losses[-1] 139 | best_so_far['afterNIters'] = i 140 | 141 | if np.mod(i, round(self.n_iterations / float(self.n_classes))) == 0: 142 | print 'After {0} optimization iterations, training data loss is {1}, and validation data ' \ 143 | 'loss is {2}'.format(i, self.training_data_losses[-1], self.validation_data_losses[-1]) 144 | 145 | # check gradient again, this time with more typical parameters and with a different data size 146 | if i == self.n_iterations: 147 | print 'Now testing the gradient on just a mini-batch instead of the whole training set... ' 148 | training_batch = {'inputs': training_batch_x, 'targets': training_batch_y} 149 | self.test_gradient(training_batch) 150 | 151 | if self.early_stopping: 152 | print 'Early stopping: validation loss was lowest after {0} iterations. ' \ 153 | 'We chose the model that we had then.'.format(best_so_far['afterNIters']) 154 | self.theta = copy.deepcopy(best_so_far['theta']) # deepcopy avoids memory reference bug 155 | 156 | def predict(self, x_sequences): 157 | """Predict a specific class from a given set of sequences. 158 | """ 159 | return np.argmax(self.predict_sequences_proba(x_sequences=x_sequences), axis=0) 160 | 161 | def predict_sequences_proba(self, x_sequences): 162 | """Predict the probability of each class in a given set of sequences. 163 | 164 | Returns: 165 | (numpy.array) : class input (size: by ) 166 | """ 167 | return self.predict_proba(x_sequences['inputs']) 168 | 169 | def predict_proba(self, inputs): 170 | """Predict the probability of each class given data inputs. 171 | 172 | Returns: 173 | (numpy.array) : probability of classes 174 | """ 175 | hid_input = np.dot(self.model['inputToHid'], inputs) 176 | hid_output = logistic(hid_input) # size: