├── .gitignore
├── README.md
├── __init__.py
├── assignment1
│   ├── __init__.py
│   ├── assignment1.py
│   └── data
│       ├── dataset1.mat
│       ├── dataset2.mat
│       ├── dataset3.mat
│       └── dataset4.mat
├── assignment2
│   ├── __init__.py
│   ├── assignment2.py
│   └── data
│       └── data.mat
├── assignment3
│   ├── __init__.py
│   ├── assignment3.py
│   └── data
│       └── data.mat
├── assignment4
│   ├── __init__.py
│   ├── assignment4.py
│   └── data
│       ├── a4_randomness_source.mat
│       └── data_set.mat
└── utility
    ├── __init__.py
    └── utils.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Python
2 | *.pyc
3 | 
4 | # Matlab files
5 | *.m
6 | 
7 | # Virtual Environment
8 | *.env
9 | **/venv
10 | .venv
11 | 
12 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm
13 | ## Directory-based project format
14 | .idea/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Project Description
2 | 
3 | Assignments for Geoffrey Hinton's Neural Networks for Machine Learning course on Coursera, translated from Matlab into Python.
4 | 
5 | * assignments 2-4 are quite different from what is presented in the course, as they were refactored into
6 |   classifier classes (adapted from the sklearn estimator framework).
7 | * more work could certainly be done to remove redundancy between assignments, especially between 3 and 4.
8 | * the course can be found here: https://www.coursera.org/course/neuralnets
9 | 
10 | ## Assignment 1
11 | * Implements a linear Perceptron for a two-class problem.
12 | 
13 | ## Assignment 2
14 | * Implements a basic framework for training neural nets with mini-batch gradient descent for a language model.
15 | * Assignment covers a hyperparameter search (number of training epochs, embedding and hidden layer size,
16 |   training momentum), evaluated through average cross-entropy error.
17 | 
18 | ## Assignment 3
19 | * Trains a simple feedforward neural network with backpropagation.
20 | * Assignment looks into efficient optimization and effective regularization.
21 | * Recognizes USPS handwritten digits.
22 | 
23 | ## Assignment 4
24 | * Trains a feedforward neural network with pretraining using Restricted Boltzmann Machines (RBMs).
25 | * The RBM is used as the visible-to-hidden layer in a network exactly like the one made in programming assignment 3.
26 | * The RBM is trained using the Contrastive Divergence gradient estimator with 1 full Gibbs update, a.k.a. CD-1.
27 | * Recognizes USPS handwritten digits.
28 | 
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hankcs/coursera-neural-net/f6b2e1985ff5e0f163f07be97cde108c7a02f7d8/__init__.py
--------------------------------------------------------------------------------
/assignment1/__init__.py:
--------------------------------------------------------------------------------
1 | from assignment1 import *
2 | 
--------------------------------------------------------------------------------
/assignment1/assignment1.py:
--------------------------------------------------------------------------------
1 | """Implements Assignment 1 for Geoffrey Hinton's Neural Networks Course offered through Coursera.
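
A usage sketch (it mirrors the __main__ block at the bottom of this file; it assumes the repository root is on
the Python path and that the script is run from the assignment1/ directory so that data/dataset3.mat resolves):

    import scipy.io
    from assignment1.assignment1 import learn_perceptron

    data = scipy.io.loadmat('data/dataset3.mat')
    w = learn_perceptron(data['neg_examples_nobias'], data['pos_examples_nobias'],
                         data['w_init'], data['w_gen_feas'])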
2 | 
3 | * Implements linear Perceptron for two class problem
4 | """
5 | import numpy as np
6 | import matplotlib.pyplot as plt
7 | 
8 | __all__ = ['learn_perceptron',
9 |            'update_weights',
10 |            'eval_perceptron',
11 |            'plot_perceptron']
12 | 
13 | 
14 | def learn_perceptron(neg_examples_nobias, pos_examples_nobias, w_init, w_gen_feas,
15 |                      pause=False):
16 |     """Learns the weights of a perceptron for a 2-dimensional dataset and plots
17 |     the perceptron at each iteration where an iteration is defined as one
18 |     full pass through the data. If a generously feasible weight vector
19 |     is provided then the visualization will also show the distance
20 |     of the learned weight vectors to the generously feasible weight vector.
21 | 
22 |     Args:
23 |         neg_examples_nobias (numpy.array) : The num_neg_examples x 2 matrix for the examples with target 0.
24 |             num_neg_examples is the number of examples for the negative class.
25 |         pos_examples_nobias (numpy.array) : The num_pos_examples x 2 matrix for the examples with target 1.
26 |             num_pos_examples is the number of examples for the positive class.
27 |         w_init (numpy.array) : A 3-dimensional initial weight vector. The last element is the bias.
28 |         w_gen_feas (numpy.array) : A generously feasible weight vector.
29 |         pause (bool) : Pause between iterations.
30 |     Returns:
31 |         numpy.array : The learned weight vector.
32 |     """
33 |     num_err_history = []
34 |     w_dist_history = []
35 | 
36 |     # add column vector of ones for bias term
37 |     neg_examples = np.hstack((neg_examples_nobias, np.ones((len(neg_examples_nobias), 1))))
38 |     pos_examples = np.hstack((pos_examples_nobias, np.ones((len(pos_examples_nobias), 1))))
39 | 
40 |     if np.size(w_init):
41 |         w = w_init
42 |     else:
43 |         w = np.random.rand(3, 1)
44 | 
45 |     if not np.size(w_gen_feas):
46 |         w_gen_feas = []
47 | 
48 |     # Find the data points that the perceptron has incorrectly classified
49 |     # and record the number of errors it makes.
50 |     iter_ = 0
51 |     mistakes0, mistakes1 = eval_perceptron(neg_examples, pos_examples, w)
52 |     num_errs = len(mistakes0) + len(mistakes1)
53 |     num_err_history.append(num_errs)
54 |     print "Number of errors in iteration {0}:\t{1}".format(iter_, num_errs)
55 |     print "Weights:", w
56 |     plot_perceptron(neg_examples, pos_examples, mistakes0, mistakes1, num_err_history,
57 |                     w, w_dist_history)
58 | 
59 |     # If a generously feasible weight vector exists, record the distance
60 |     # to it from the initial weight vector
61 |     if len(w_gen_feas) != 0:
62 |         w_dist_history.append(np.linalg.norm(w - w_gen_feas))
63 | 
64 |     while num_errs > 0:
65 |         iter_ = iter_ + 1
66 | 
67 |         # Update weights of perceptron
68 |         w = update_weights(neg_examples, pos_examples, w)
69 | 
70 |         # If a generously feasible weight vector exists, record the distance
71 |         # to it from the current weight vector
72 |         if len(w_gen_feas) != 0:
73 |             w_dist_history.append(np.linalg.norm(w - w_gen_feas))
74 | 
75 |         # Find the data points that the perceptron has incorrectly classified
76 |         # and record the number of errors it makes.
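        # Note: this loop only stops once num_errs reaches 0. The perceptron convergence theorem guarantees
        # that happens when the data are linearly separable (e.g. when a generously feasible weight vector
        # exists); on a non-separable dataset the loop would run forever.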
77 | mistakes0, mistakes1 = eval_perceptron(neg_examples, pos_examples, w) 78 | num_errs = len(mistakes0) + len(mistakes1) 79 | num_err_history.append(num_errs) 80 | print "Number of erros in iteration {0}:\t{1}".format(iter_, num_errs) 81 | print "Weights:", w 82 | 83 | plot_perceptron(neg_examples, pos_examples, mistakes0, mistakes1, num_err_history, 84 | w, w_dist_history) 85 | if pause: 86 | while True: 87 | try: 88 | ans = input("Continue?") 89 | if ans == 1 or ans == 'y': 90 | break 91 | if ans == 0 or ans == 'n': 92 | return w 93 | except (ValueError, NameError): 94 | print("Sorry, I didn't understand that.") 95 | continue 96 | return w 97 | 98 | 99 | def update_weights(neg_examples, pos_examples, w_current): 100 | """Updates the weights of the perceptron for incorrectly classified points 101 | using the perceptron update algorithm. This function makes one sweep over 102 | the dataset. 103 | 104 | Args: 105 | neg_examples (numpy.array) : The num_neg_examples x 3 matrix for the examples with target 0. 106 | num_neg_examples is the number of examples for the negative class. 107 | pos_examples (numpy.array) : The num_pos_examples x 3 matrix for the examples with target 1. 108 | num_pos_examples is the number of examples for the positive class. 109 | w_current (numpy.array) : A 3-dimensional weight vector, the last element is the bias. 110 | Returns: 111 | (numpy.array) : The weight vector after one pass through the dataset using the perceptron 112 | learning rule. 113 | """ 114 | w = w_current 115 | for sample in neg_examples: 116 | assert len(np.shape(sample)) == 1 and np.shape(w)[1] == 1 117 | activation = np.dot(sample, w)[0] 118 | if activation >= 0: 119 | w += np.column_stack(sample).T * (0.0 - activation) 120 | for sample in pos_examples: 121 | assert len(np.shape(sample)) == 1 and np.shape(w)[1] == 1 122 | activation = np.dot(sample, w)[0] 123 | if activation < 0: 124 | w += np.column_stack(sample).T * (1.0 - activation) 125 | return w 126 | 127 | 128 | def eval_perceptron(neg_examples, pos_examples, w): 129 | """Evaluates the perceptron using a given weight vector. Here, evaluation 130 | refers to finding the data points that the perceptron incorrectly classifies. 131 | 132 | Args: 133 | neg_examples (numpy.array) : The num_neg_examples x 3 matrix for the examples with target 0. 134 | num_neg_examples is the number of examples for the negative class. 135 | pos_examples (numpy.array) : The num_pos_examples x 3 matrix for the examples with target 1. 136 | num_pos_examples is the number of examples for the positive class. 137 | w (numpy.array) : A 3-dimensional weight vector, the last element is the bias. 138 | Returns: 139 | (tuple) : 140 | mistakes0 : A vector containing the indices of the negative examples that have been 141 | incorrectly classified as positive. 142 | mistakes1 : A vector containing the indices of the positive examples that have been 143 | incorrectly classified as negative. 144 | """ 145 | mistakes0 = [i for i, sample in enumerate(neg_examples) if np.dot(sample, w)[0] >= 0] 146 | mistakes1 = [i for i, sample in enumerate(pos_examples) if np.dot(sample, w)[0] < 0] 147 | return mistakes0, mistakes1 148 | 149 | 150 | def plot_perceptron(neg_examples, pos_examples, mistakes0, mistakes1, 151 | num_err_history, w, w_dist_history): 152 | """The top-left plot shows the dataset and the classification boundary given by 153 | the weights of the perceptron. The negative examples are shown as circles 154 | while the positive examples are shown as squares. 
If an example is colored 155 | green then it means that the example has been correctly classified by the 156 | provided weights. If it is colored red then it has been incorrectly classified. 157 | The top-right plot shows the number of mistakes the perceptron algorithm has 158 | made in each iteration so far. 159 | 160 | The bottom-left plot shows the distance to some generously feasible weight 161 | vector if one has been provided (note, there can be an infinite number of these). 162 | Points that the classifier has made a mistake on are shown in red, 163 | while points that are correctly classified are shown in green. 164 | 165 | The goal is for all of the points to be green (if it is possible to do so). 166 | 167 | Args: 168 | neg_examples : The num_neg_examples x 3 matrix for the examples with target 0. 169 | num_neg_examples is the number of examples for the negative class. 170 | pos_examples : The num_pos_examples x 3 matrix for the examples with target 1. 171 | num_pos_examples is the number of examples for the positive class. 172 | mistakes0 : A vector containing the indices of the datapoints from class 0 incorrectly 173 | classified by the perceptron. This is a subset of neg_examples. 174 | mistakes1 : A vector containing the indices of the datapoints from class 1 incorrectly 175 | classified by the perceptron. This is a subset of pos_examples. 176 | num_err_history : A vector containing the number of mistakes for each 177 | iteration of learning so far. 178 | w : A 3-dimensional vector corresponding to the current weights of the 179 | perceptron. The last element is the bias. 180 | w_dist_history : A vector containing the L2-distance to a generously 181 | feasible weight vector for each iteration of learning so far. 182 | Empty if one has not been provided. 183 | """ 184 | f = plt.figure(1) 185 | 186 | neg_correct_ind = np.setdiff1d(range(len(neg_examples)), mistakes0) 187 | pos_correct_ind = np.setdiff1d(range(len(pos_examples)), mistakes1) 188 | assert all(m_idx not in set(neg_correct_ind) for m_idx in mistakes0) and \ 189 | all(m_idx not in set(pos_correct_ind) for m_idx in mistakes1) 190 | 191 | plt.subplot(2, 2, 1) 192 | plt.hold(True) 193 | if np.size(neg_examples): 194 | plt.plot(neg_examples[neg_correct_ind][:, 0], neg_examples[neg_correct_ind][:, 1], 'og', markersize=10) 195 | if np.size(pos_examples): 196 | plt.plot(pos_examples[pos_correct_ind][:, 0], pos_examples[pos_correct_ind][:, 1], 'sg', markersize=10) 197 | 198 | if len(mistakes0): 199 | plt.plot(neg_examples[mistakes0][:, 0], neg_examples[mistakes0][:, 1], 'or', markersize=10) 200 | if len(mistakes1): 201 | plt.plot(pos_examples[mistakes1][:, 0], pos_examples[mistakes1][:, 1], 'sr', markersize=10) 202 | 203 | plt.title('Perceptron Classifier') 204 | # In order to plot the decision line, we just need to get two points. 
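    # The boundary is the set of points where w[0]*x + w[1]*y + w[-1] = 0 (w[-1] is the bias), i.e.
    # y = (-w[-1] - w[0]*x) / w[1]; evaluating at x = -5 and x = 5 gives the two endpoints plotted below.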
205 | plt.plot([-5, 5], [(-w[-1] + 5 * w[0]) / w[1], (-w[-1] - 5 * w[0]) / w[1]], 'k') 206 | plt.xlim([-1, 4]) 207 | plt.ylim([-2, 2]) 208 | plt.hold(False) 209 | 210 | plt.subplot(2, 2, 2) 211 | plt.plot(range(len(num_err_history)), num_err_history) 212 | plt.xlim([-1, max(15, len(num_err_history))]) 213 | plt.ylim([0, len(neg_examples) + len(pos_examples) + 1]) 214 | plt.title('Number of errors') 215 | plt.xlabel('Iteration') 216 | plt.ylabel('Number of errors') 217 | 218 | plt.subplot(2, 2, 3) 219 | plt.plot(range(len(w_dist_history)), w_dist_history) 220 | plt.xlim([-1, max(15, len(num_err_history))]) 221 | plt.ylim([0, 15]) 222 | plt.title('Distance') 223 | plt.xlabel('Iteration') 224 | plt.ylabel('Distance') 225 | plt.show() 226 | 227 | 228 | if __name__ == "__main__": 229 | import matplotlib.pylab as pylab 230 | 231 | pylab.rcParams['figure.figsize'] = 12, 8 232 | 233 | import scipy.io 234 | import os 235 | import matplotlib.pyplot as plt 236 | 237 | data_path = os.path.join(os.getcwd(), 'data/') 238 | files = ['dataset%d' % i for i in range(1, 5)] 239 | 240 | dataset_file = os.path.join(data_path, files[2]) 241 | data = scipy.io.loadmat(dataset_file) 242 | 243 | w = learn_perceptron(data['neg_examples_nobias'], 244 | data['pos_examples_nobias'], 245 | data['w_init'], 246 | data['w_gen_feas']) 247 | -------------------------------------------------------------------------------- /assignment1/data/dataset1.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hankcs/coursera-neural-net/f6b2e1985ff5e0f163f07be97cde108c7a02f7d8/assignment1/data/dataset1.mat -------------------------------------------------------------------------------- /assignment1/data/dataset2.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hankcs/coursera-neural-net/f6b2e1985ff5e0f163f07be97cde108c7a02f7d8/assignment1/data/dataset2.mat -------------------------------------------------------------------------------- /assignment1/data/dataset3.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hankcs/coursera-neural-net/f6b2e1985ff5e0f163f07be97cde108c7a02f7d8/assignment1/data/dataset3.mat -------------------------------------------------------------------------------- /assignment1/data/dataset4.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hankcs/coursera-neural-net/f6b2e1985ff5e0f163f07be97cde108c7a02f7d8/assignment1/data/dataset4.mat -------------------------------------------------------------------------------- /assignment2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hankcs/coursera-neural-net/f6b2e1985ff5e0f163f07be97cde108c7a02f7d8/assignment2/__init__.py -------------------------------------------------------------------------------- /assignment2/assignment2.py: -------------------------------------------------------------------------------- 1 | """Implements Assignment 2 for Geoffrey Hinton's Neural Networks Course offered through Coursera. 2 | 3 | * Implements a basic framework for training neural nets with mini-batch gradient descent for a language model. 4 | * Assignment covers hyperparameter search and observations through average cross entropy error. 5 | * i.e. 
number of training epochs, embedding and hidden layer size, training momentum 6 | 7 | Abstracts classifiers developed in the course into, a more pythonic Sklearn framework. And cleans up a lot of the 8 | given code. 9 | """ 10 | import os 11 | import time 12 | 13 | import matplotlib.pylab as pylab 14 | import numpy as np 15 | from sklearn.base import BaseEstimator 16 | 17 | from utility.utils import zip_safe, loadmat 18 | 19 | __all__ = ['EvaluateCrossEntropy', 20 | 'NeuralNet', 21 | 'load_data', 22 | 'display_nearest_words', 23 | 'word_distance', 24 | 'A2Run' 25 | ] 26 | 27 | 28 | def load_data(data, batch_size=100): 29 | """This method loads the training, validation and test set. It also divides the training set into mini-batches. 30 | 31 | Notes: 32 | ----- 33 | * Subtract 1 from each index in `input` and `target` to fix matlab to python indexing 34 | 35 | Args: 36 | data (dict) : From mat file. 37 | batch_size (int) : Mini-batch size. 38 | 39 | Returns: 40 | dict: With keys `train`, `valid`, `test`, `vocab` 41 | train_input (numpy.array) : An array of size d X n X m, where 42 | d: number of input dimensions (in this case, 3). 43 | n: size of each mini-batch (in this case, 100). 44 | m: number of minibatches. 45 | train_target (numpy.array) : An array of size 1 X n X m. 46 | valid_input (numpy.array) : An array of size D X number of points in the validation set. 47 | test (numpy.array) : An array of size D X number of points in the test set. 48 | vocab (numpy.array) : Vocabulary containing index to word mapping. 49 | """ 50 | d = np.size(data['trainData'], 0) - 1 51 | m = int(np.size(data['trainData'], axis=1) / batch_size) 52 | 53 | sequences = {key: dict() for key in ['train', 'valid', 'test']} 54 | 55 | sequences['train']['input'] = np.reshape(data['trainData'][:d, :batch_size * m], (d, batch_size, m)) - 1 56 | sequences['train']['target'] = np.reshape(data['trainData'][d, :batch_size * m], (1, batch_size, m)) - 1 57 | sequences['valid']['input'] = data['validData'][:d, :] - 1 58 | sequences['valid']['target'] = data['validData'][d, :] - 1 59 | sequences['test']['input'] = data['testData'][:d, :] - 1 60 | sequences['test']['target'] = data['testData'][d, :] - 1 61 | sequences['vocab'] = data['vocab'] 62 | 63 | return sequences 64 | 65 | 66 | class NeuralNet(BaseEstimator): 67 | """Implements assignment 2 of Neural Networks for Machine Learning (Coursera) for Learning word representations. 68 | """ 69 | 70 | def __init__(self, 71 | epochs=1, 72 | learning_rate=0.1, 73 | momentum=0.9, 74 | numhid1=50, 75 | numhid2=200, 76 | init_wt=0.01, 77 | validation_ce_after=1000, 78 | vocab_size=None, 79 | num_words=None): 80 | """Initialize NeuralNet instance with training and visualization params. 81 | 82 | Args: 83 | epochs (int) : Number of epochs to run. 84 | learning_rate (float) : Learning rate. 85 | momentum (float) : Momentum default. 86 | numhid1 (int) : Dimensionality of embedding space. 87 | numhid2 (int) : Number of units in hidden layer. 88 | init_wt (float) : Standard deviation of the normal distribution which is sampled to 89 | get the initial weights 90 | validation_ce_after (int) : Show cross-entropy calculation after specified samples during validation 91 | vocab_size (int) : Length of vocabulary in dataset. 92 | num_words (int) : Num words used in each training sample (given from dataset). 
93 | In the assignment case, there's 3 94 | """ 95 | assert vocab_size and num_words 96 | 97 | # Set Hyper params 98 | self.epochs = epochs 99 | self.vocab_size = vocab_size 100 | self.learning_rate = learning_rate 101 | self.momentum = momentum 102 | self.numhid1 = numhid1 103 | self.numhid2 = numhid2 104 | self.init_wt = init_wt 105 | self.show_validation_ce_after = validation_ce_after 106 | 107 | # INITIALIZE WEIGHTS AND BIASES 108 | self.word_embedding_weights = None 109 | self.embed_to_hid_weights = None 110 | self.hid_to_output_weights = None 111 | self.hid_bias = None 112 | self.output_bias = None 113 | 114 | self.word_embedding_weights_delta = None 115 | self.embed_to_hid_weights_delta = None 116 | self.hid_to_output_weights_delta = None 117 | self.hid_bias_delta = None 118 | self.output_bias_delta = None 119 | self.reset_classifier(vocab_size, num_words) 120 | 121 | # Initialize evaluation params 122 | self.tiny = np.exp(-30) 123 | self.batch_iteration = 0 # this is count in Matlab code 124 | self.trainset_ce = 0.0 125 | 126 | def reset_classifier(self, vocab_size, num_words): 127 | """Resets state of the classifier given vocab_size and num_words in dataset. 128 | """ 129 | self.word_embedding_weights = self.init_wt * np.random.rand(vocab_size, self.numhid1) 130 | self.embed_to_hid_weights = self.init_wt * np.random.rand(num_words * self.numhid1, self.numhid2) 131 | self.hid_to_output_weights = self.init_wt * np.random.rand(self.numhid2, vocab_size) 132 | self.hid_bias = np.zeros((self.numhid2, 1)) 133 | self.output_bias = np.zeros((vocab_size, 1)) 134 | 135 | self.word_embedding_weights_delta = np.zeros((vocab_size, self.numhid1)) 136 | self.embed_to_hid_weights_delta = np.zeros((num_words * self.numhid1, self.numhid2)) 137 | self.hid_to_output_weights_delta = np.zeros((self.numhid2, vocab_size)) 138 | self.hid_bias_delta = np.zeros((self.numhid2, 1)) 139 | self.output_bias_delta = np.zeros((vocab_size, 1)) 140 | 141 | def fit(self, X, y): 142 | """Fit model given matrix X and target y. 143 | 144 | Args: 145 | X (numpy.ndarray) : input matrix 146 | y (numpy.ndarray) : target matrix 147 | 148 | Returns: 149 | self (model) contains: 150 | word_embedding_weights 151 | embed_to_hid_weights 152 | hid_to_output_weights 153 | hid_bias 154 | output_bias 155 | """ 156 | numwords, batch_size = np.shape(X) 157 | # FORWARD PROPAGATE. 158 | # Compute the state of each layer in the network given the input batch 159 | # and all weights and biases 160 | embedding_layer_state, hidden_layer_state, output_layer_state = self.fprop(X) 161 | assert all([all(row == False) for row in np.isnan(output_layer_state)]) 162 | # COMPUTE DERIVATIVE. 163 | # Expand the target to a sparse 1-of-K vector. 164 | expanded_y = np.eye(self.vocab_size)[:, y] 165 | # Compute derivative of cross-entropy loss function. 166 | error_deriv = output_layer_state - expanded_y 167 | 168 | # MEASURE LOSS FUNCTION. 169 | ce = -sum(sum(np.multiply(expanded_y, 170 | np.log(output_layer_state + self.tiny)))) / float(batch_size) 171 | self.trainset_ce += (ce - self.trainset_ce) / float(self.batch_iteration) 172 | 173 | # BACK PROPAGATE. 174 | # OUTPUT LAYER. 175 | hid_to_output_weights_gradient = np.dot(hidden_layer_state, error_deriv.T) 176 | output_bias_gradient = np.column_stack(np.sum(error_deriv, axis=1)).T 177 | 178 | back_propagated_deriv_1 = np.multiply(np.multiply(np.dot(self.hid_to_output_weights, error_deriv), 179 | hidden_layer_state), (1 - hidden_layer_state)) 180 | 181 | # HIDDEN LAYER. 
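        # back_propagated_deriv_1 above is the loss derivative w.r.t. the hidden layer's total input: the output
        # error is pushed back through hid_to_output_weights and scaled by the logistic derivative h * (1 - h).
        # The hidden-layer gradients below then follow from the chain rule.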
182 | embed_to_hid_weights_gradient = np.dot(embedding_layer_state, back_propagated_deriv_1.T) 183 | assert (self.numhid1 * numwords, self.numhid2) == embed_to_hid_weights_gradient.shape 184 | hid_bias_gradient = np.column_stack(np.sum(back_propagated_deriv_1, axis=1)).T 185 | assert (self.numhid2, 1) == hid_bias_gradient.shape 186 | back_propagated_deriv_2 = np.dot(self.embed_to_hid_weights, back_propagated_deriv_1) 187 | assert back_propagated_deriv_2.shape == (numwords * self.numhid1, batch_size) 188 | 189 | word_embedding_weights_gradient = np.zeros((self.vocab_size, self.numhid1)) 190 | # EMBEDDING LAYER. 191 | for w in xrange(1, numwords): 192 | word_embedding_weights_gradient += np.dot(np.eye(self.vocab_size)[:, X[w, :]], 193 | back_propagated_deriv_2[ 194 | (w - 1) * self.numhid1: w * self.numhid1, :].T) 195 | self.__update_weights_and_biases(batch_size, word_embedding_weights_gradient, 196 | embed_to_hid_weights_gradient, hid_to_output_weights_gradient, 197 | hid_bias_gradient, output_bias_gradient) 198 | return self 199 | 200 | def __update_weights_and_biases(self, 201 | batch_size, 202 | word_embedding_weights_gradient, 203 | embed_to_hid_weights_gradient, 204 | hid_to_output_weights_gradient, 205 | hid_bias_gradient, 206 | output_bias_gradient): 207 | """Update weights and biases 208 | """ 209 | self.word_embedding_weights_delta = self.momentum * self.word_embedding_weights_delta + \ 210 | word_embedding_weights_gradient / float(batch_size) 211 | self.word_embedding_weights -= self.learning_rate * self.word_embedding_weights_delta 212 | 213 | self.embed_to_hid_weights_delta = self.momentum * self.embed_to_hid_weights_delta + \ 214 | embed_to_hid_weights_gradient / float(batch_size) 215 | self.embed_to_hid_weights -= self.learning_rate * self.embed_to_hid_weights_delta 216 | 217 | self.hid_to_output_weights_delta = self.momentum * self.hid_to_output_weights_delta + \ 218 | hid_to_output_weights_gradient / float(batch_size) 219 | self.hid_to_output_weights -= self.learning_rate * self.hid_to_output_weights_delta 220 | 221 | self.hid_bias_delta = self.momentum * self.hid_bias_delta + hid_bias_gradient / float(batch_size) 222 | self.hid_bias -= self.learning_rate * self.hid_bias_delta 223 | 224 | self.output_bias_delta = self.momentum * self.output_bias_delta + output_bias_gradient / float(batch_size) 225 | self.output_bias -= self.learning_rate * self.output_bias_delta 226 | 227 | def train(self, sequences): 228 | """This function trains a neural network language model and validates as well. (These should be split up) 229 | 230 | Args: 231 | sequences (dict) : input data 232 | 233 | Returns: 234 | struct: contains the learned weights and biases and vocabulary. 235 | """ 236 | self.reset_classifier(vocab_size=len(sequences['vocab']), num_words=len(sequences['train']['input'])) 237 | for epoch in xrange(1, self.epochs + 1): 238 | print 'Epoch %d\n' % epoch 239 | self.trainset_ce = 0.0 240 | # LOOP OVER MINI-BATCHES. 241 | for m, (input_batch, target_batch) in enumerate(zip_safe(sequences['train']['input'].T, 242 | sequences['train']['target'].T)): 243 | self.batch_iteration += 1 244 | target_batch = target_batch.flatten() 245 | self.fit(input_batch.T, target_batch) 246 | 247 | # VALIDATE. 248 | if self.show_validation_ce_after and (m + 1) % self.show_validation_ce_after == 0: 249 | print '\rRunning validation ... 
Validation CE after %d : %.3f' % \ 250 | (m + 1, EvaluateCrossEntropy(self).compute_ce(sequences['valid'], vocab_size=self.vocab_size)) 251 | print '\rAverage Training CE : %.3f' % self.trainset_ce 252 | print 'Final Training CE : %.3f' % self.trainset_ce 253 | 254 | def fprop(self, input_batch): 255 | """This method forward propagates through a neural network. 256 | 257 | Args: 258 | input_batch (numpy.ndarray) : The input data as a matrix of size numwords X batchsize where, 259 | * numwords is the number of words. 260 | * batchsize is the number of data points. 261 | So, if input_batch(i, j) = k then the ith word in data point j is word index k of the vocabulary. 262 | 263 | Returns: 264 | tuple : 265 | embedding_layer_state (numpy.ndarray) : State of units in the embedding layer as a matrix of 266 | size numhid1*numwords X batchsize 267 | hidden_layer_state (numpy.ndarray) : State of units in the hidden layer as a matrix of 268 | size numhid2 X batchsize 269 | output_layer_state (numpy.ndarray) : State of units in the output layer as a matrix of size 270 | vocab_size X batchsize 271 | """ 272 | 273 | numwords, batch_size = np.shape(input_batch) 274 | vocab_size, numhid1 = np.shape(self.word_embedding_weights) 275 | numhid2 = np.size(self.embed_to_hid_weights, axis=1) 276 | 277 | # COMPUTE STATE OF WORD EMBEDDING LAYER. 278 | # Look up the inputs word indices in the word_embedding_weights matrix. 279 | embedding_layer_state = np.reshape(self.word_embedding_weights[input_batch.flatten()].T, 280 | (numhid1 * numwords, -1)) 281 | # COMPUTE STATE OF HIDDEN LAYER. 282 | # Compute inputs to hidden units. 283 | inputs_to_hidden_units = np.dot(self.embed_to_hid_weights.T, embedding_layer_state) + np.tile(self.hid_bias, 284 | (1, batch_size)) 285 | # Apply logistic activation function. 286 | hidden_layer_state = 1.0 / (1.0 + np.exp(-inputs_to_hidden_units)) 287 | assert hidden_layer_state.shape == (numhid2, batch_size) 288 | 289 | # COMPUTE STATE OF OUTPUT LAYER. 290 | # Compute inputs to softmax. 291 | inputs_to_softmax = np.dot(self.hid_to_output_weights.T, hidden_layer_state) + \ 292 | np.tile(self.output_bias, (1, batch_size)) 293 | assert inputs_to_softmax.shape == (vocab_size, batch_size) 294 | 295 | # Subtract maximum. 296 | inputs_to_softmax -= np.tile(np.max(inputs_to_softmax), (vocab_size, 1)) 297 | 298 | # Compute exp. 299 | output_layer_state = np.exp(inputs_to_softmax) 300 | sum_output = np.sum(output_layer_state, axis=0) 301 | # correct for min float -- Matlab didn't have this problem (it must assume this instead of outputting 0.0) 302 | sum_output[np.where(sum_output == 0.0)] = np.finfo(float).min 303 | # Normalize to get probability distribution. 304 | output_layer_state = np.divide(output_layer_state, np.tile(sum_output, (vocab_size, 1))) 305 | 306 | return embedding_layer_state, hidden_layer_state, output_layer_state 307 | 308 | def predict_next_word(self, sentence, vocab, k): 309 | """Predicts the next word. 310 | Example usage: 311 | predict_next_word('john', 'might', 'be', 3) 312 | predict_next_word('life', 'in', 'new', 3) 313 | 314 | Args: 315 | sentence (iterable) : 3 word iterable containing 316 | word1 (str) : The first word as a string. 317 | word2 (str) : The second word as a string. 318 | word3 (str) : The third word as a string. 319 | vocab (numpy.array) : vocabulary in model 320 | k (int) : The k most probable predictions are shown. 
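
        Note:
            In this Python translation `sentence` is an iterable of three words and `vocab` is passed
            explicitly, e.g. predict_next_word(('john', 'might', 'be'), vocab, 3).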
321 | """ 322 | input_ = np.array([np.where(vocab == word)[0] if np.where(vocab == word)[0] else [None] for word in sentence]) 323 | for i, vocab_idx in enumerate(input_): 324 | if not vocab_idx: 325 | print 'Word %s not in vocabulary.\n' % sentence[i] 326 | return 327 | 328 | _, _, output_layer_state = self.fprop(input_) 329 | 330 | prob = np.sort(output_layer_state, axis=0)[::-1] 331 | indices = np.argsort(output_layer_state, axis=0)[::-1] 332 | for i in xrange(0, k): 333 | # noinspection PyStringFormat 334 | print '"%s %s %s %s" -- [Prob: %.5f]' % (sentence + (vocab[indices[i]][-1], prob[i])) 335 | 336 | 337 | class EvaluateCrossEntropy(object): 338 | """Computes cross entropy given classifier model. 339 | """ 340 | 341 | def __init__(self, estimator): 342 | """Initialize EvaluateCrossEntropy instance. 343 | """ 344 | self.estimator = estimator 345 | 346 | def run_evaluation(self, sequences): 347 | # EVALUATE ON VALIDATION SET. 348 | print 'Running validation ... Final Validation CE : %.3f' % \ 349 | self.compute_ce(sequences['valid'], vocab_size=len(sequences['vocab'])) 350 | print 'Running test ... Final Test CE : %.3f' % \ 351 | self.compute_ce(sequences['test'], vocab_size=len(sequences['vocab'])) 352 | 353 | def compute_ce(self, data, vocab_size): 354 | """Compute Cross-Entropy 355 | 356 | Args: 357 | data (dict) : Contains `input` and `target` keys each containing numpy.array 358 | vocab_size (int): Number of words in vocabulary. 359 | 360 | Returns: 361 | float : Cross-Entropy 362 | """ 363 | embedding_layer_state, hidden_layer_state, output_layer_state = self.estimator.fprop(data['input']) 364 | datasetsize = np.size(data['input'], 1) 365 | expanded_target = np.eye(vocab_size)[:, data['target']] 366 | return -sum(sum(np.multiply(expanded_target, np.log(output_layer_state + np.exp(-30))))) / float(datasetsize) 367 | 368 | 369 | def word_distance(word1, word2, model, vocab): 370 | """Shows the L2 distance between word1 and word2 in the word_embedding_weights. 371 | 372 | Example: 373 | ----- 374 | word_distance('school', 'university', model, vocab) 375 | 376 | Args: 377 | word1 (str) : The first word as a string. 378 | word2 (str) : The second word as a string. 379 | model (NeuralNet) : Model returned by estimator 380 | vocab (numpy.array) : vocabulary in model 381 | 382 | Return: 383 | distance 384 | """ 385 | words = (word1, word2) 386 | idxs = np.array([np.where(vocab == word)[0][0] if np.where(vocab == word)[0] else None for word in words]) 387 | for i, vocab_idx in enumerate(idxs): 388 | if not vocab_idx: 389 | print 'Word %s not in vocabulary.\n' % words[i] 390 | return 391 | diff = model.word_embedding_weights[idxs[0], :] - model.word_embedding_weights[idxs[1], :] 392 | return np.sqrt(sum(np.multiply(diff, diff))) 393 | 394 | 395 | def display_nearest_words(word, model, k, vocab): 396 | """Shows the k-nearest words to the query word. 397 | Example: 398 | ----- 399 | display_nearest_words('school', model, 10) 400 | 401 | Args: 402 | word (str) : The query word as a string. 403 | model (NeuralNet) : Model returned by estimator 404 | k (int) : The number of nearest words to display. 405 | vocab (numpy.array) : vocabulary in model 406 | """ 407 | idx = np.where(vocab == word)[0] 408 | if not idx: 409 | print 'Word %s not in vocabulary.\n' % word 410 | return 411 | 412 | # Compute distance to every other word. 
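    # Distances are Euclidean (L2) norms in the learned embedding space: the query word's embedding row is
    # tiled to the vocabulary size and subtracted from every row of word_embedding_weights.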
413 | word_rep = model.word_embedding_weights[idx][-1] 414 | diff = model.word_embedding_weights - np.tile(word_rep, (len(vocab), 1)) 415 | distance = np.sqrt(np.sum(np.multiply(diff, diff), axis=1)) 416 | 417 | # Sort by distance. 418 | order = np.argsort(distance) 419 | order = order[1: k + 1] # The nearest word is the query word itself, skip that. 420 | for i in xrange(k): 421 | print 'Word\t: %s \nDistance: %.2f\n' % (vocab[order[i]], distance[order[i]]) 422 | 423 | 424 | class A2Run(object): 425 | """Runs assignment 2. 426 | """ 427 | 428 | def __init__(self): 429 | """Initialize data set and all test cases for assignment. 430 | """ 431 | data = loadmat(os.path.join(os.getcwd(), 'data/data.mat')) 432 | self.data_sets = data['data'] 433 | self.classifier = None 434 | 435 | def run_evaluation(self, **estimator_params): 436 | """Runs 4-gram Neural Network evaluation. 437 | 438 | Args: 439 | estimator_params (dict) : Contains parameters for NN. See NeuralNet(..) 440 | """ 441 | start_time = time.time() 442 | sequences = load_data(self.data_sets, batch_size=100) 443 | self.classifier = NeuralNet(vocab_size=len(sequences['vocab']), 444 | num_words=len(sequences['train']['input']), 445 | **estimator_params) 446 | self.classifier.train(sequences) 447 | print 'Training took %.2f seconds\n', start_time - time.time() 448 | EvaluateCrossEntropy(self.classifier).run_evaluation(sequences) 449 | 450 | def a2_main(self, epochs=1, learning_rate=.10, momentum=0.9, numhid1=50, numhid2=200, init_wt=0.01, 451 | validation_ce_after=1000): 452 | """Runs training and computes error and loss of training, testing, and validation training sets. 453 | 454 | Args: 455 | wd_coeff (float) : weight decay coefficient 456 | n_hid (int) : number of hidden units 457 | n_iterations (int) : number of training iterations 458 | lr_net (float) : learning rate for neural net classifier 459 | train_momentum (float) : momentum used in training 460 | early_stopping (bool) : saves model at validation error minimum 461 | mini_batch_size (int) : size of training batches 462 | """ 463 | self.run_evaluation(epochs=epochs, 464 | learning_rate=learning_rate, 465 | momentum=momentum, 466 | numhid1=numhid1, 467 | numhid2=numhid2, 468 | init_wt=init_wt, 469 | validation_ce_after=validation_ce_after) 470 | 471 | 472 | # coding: utf-8 473 | 474 | # epochs:10 475 | # - learning_rate: 476 | # 0.001: 477 | # Validation CE: 4.379 478 | # 0.1: 479 | # Validation CE: 2.625 480 | # 10.0: 481 | # Validation CE: 4.584 482 | # - Model A: 5 dimensional embedding, 100 dimensional hidden layer: 483 | # Training CE: 2.980 484 | # - Model B: 50 dimensional embedding, 10 dimensional hidden layer: 485 | # Training CE 3.035 486 | # - Model C: 50 dimensional embedding, 200 dimensional hidden layer: 487 | # Training CE 2.559 488 | # - Model D: 100 dimensional embedding, 5 dimensional hidden layer: 489 | # Training CE 3.272 490 | if __name__ == "__main__": 491 | pylab.rcParams['figure.figsize'] = 12, 8 492 | a2 = A2Run() 493 | classifier_params = dict(epochs=1, 494 | learning_rate=.10, 495 | momentum=0.9, 496 | numhid1=50, 497 | numhid2=200, 498 | init_wt=0.01, 499 | validation_ce_after=1000) 500 | a2.a2_main(**classifier_params) 501 | test_words = ['you', 'were', 'in', 'china'] 502 | a2.classifier.predict_next_word((test_words[0], test_words[1], test_words[2]), a2.data_sets['vocab'], 5) 503 | display_nearest_words(test_words[1], a2.classifier, 10, a2.data_sets['vocab']) 504 | word_distance(test_words[0], test_words[1], a2.classifier, a2.data_sets['vocab']) 505 
| word_distance('percent', 'dr.', a2.classifier, a2.data_sets['vocab']) 506 | -------------------------------------------------------------------------------- /assignment2/data/data.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hankcs/coursera-neural-net/f6b2e1985ff5e0f163f07be97cde108c7a02f7d8/assignment2/data/data.mat -------------------------------------------------------------------------------- /assignment3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hankcs/coursera-neural-net/f6b2e1985ff5e0f163f07be97cde108c7a02f7d8/assignment3/__init__.py -------------------------------------------------------------------------------- /assignment3/assignment3.py: -------------------------------------------------------------------------------- 1 | """Implements Assignment 3 for Geoffrey Hinton's Neural Networks Course offered through Coursera. 2 | 3 | * Trains a simple Feedforward Neural Network with Backpropogation, for recognizing USPS handwritten digits. 4 | * Assignment looks into efficient optimization, and into effective regularization. 5 | * Recognizes USPS handwritten digits. 6 | 7 | Abstracts classifiers developed in the course into, a more pythonic Sklearn framework. And cleans up a lot of the 8 | given code. 9 | """ 10 | import copy 11 | import os 12 | 13 | import numpy as np 14 | import matplotlib.pyplot as plt 15 | from numpy.testing import assert_array_equal 16 | from utility.utils import loadmat, logistic, log_sum_exp_over_rows 17 | 18 | NUM_INPUT_UNITS = 256 19 | NUM_CLASSES = 10 20 | 21 | __all__ = ['A3Run'] 22 | 23 | 24 | class FFNeuralNet: 25 | """Implements Feedforward Neural Network from Assignment 3 trained with Backpropagation. 26 | """ 27 | 28 | def __init__(self, 29 | training_iters, 30 | validation_data, 31 | wd_coeff=None, 32 | lr_net=0.02, 33 | n_hid=300, 34 | n_classes=10, 35 | n_input_units=256, 36 | train_momentum=0.9, 37 | mini_batch_size=100, 38 | early_stopping=False): 39 | """Initialize neural network. 40 | 41 | Args: 42 | training_iters (int) : number of training iterations 43 | validation_data (dict) : contains 'inputs' and 'targets' data matrices 44 | wd_coeff (float) : weight decay coefficient 45 | lr_net (float) : learning rate for neural net classifier 46 | n_hid (int) : number of hidden units 47 | n_classes (int) : number of classes 48 | train_momentum (float) : momentum used in training 49 | mini_batch_size (int) : size of training batches 50 | early_stopping (bool) : saves model at validation error minimum 51 | """ 52 | self.n_classes = n_classes 53 | self.wd_coeff = wd_coeff 54 | self.batch_size = mini_batch_size 55 | self.lr_net = lr_net 56 | self.n_iterations = training_iters 57 | self.train_momentum = train_momentum 58 | self.early_stopping = early_stopping 59 | self.validation_data = validation_data # used for early stopping 60 | 61 | # model result params 62 | self.training_data_losses = [] 63 | self.validation_data_losses = [] 64 | 65 | # Model params 66 | # We don't use random initialization, for this assignment. This way, everybody will get the same results. 
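        # Instead, theta is filled deterministically with 0.1 * cos(0 .. n_params - 1) and unpacked into the
        # inputToHid / hidToClass weight matrices, so every run starts from identical parameters.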
67 | self.n_params = (n_input_units + n_classes) * n_hid 68 | theta = np.transpose(np.column_stack(np.cos(range(self.n_params)))) * 0.1 if self.n_params else np.array([]) 69 | self.model = self.theta_to_model(theta) 70 | self.theta = self.model_to_theta(self.model) 71 | assert_array_equal(theta.flatten(), self.theta) 72 | self.momentum_speed = self.theta * 0.0 73 | 74 | def reset_classifier(self): 75 | """Resets the model parameters. 76 | """ 77 | theta = np.transpose(np.column_stack(np.cos(range(self.n_params)))) * 0.1 if self.n_params else np.array([]) 78 | self.model = self.theta_to_model(theta) 79 | self.theta = self.model_to_theta(self.model) 80 | self.momentum_speed = self.theta * 0.0 81 | 82 | @staticmethod 83 | def model_to_theta(model): 84 | """Takes a model (or gradient in model form), and turns it into one long vector. See also theta_to_model.""" 85 | model_copy = copy.deepcopy(model) 86 | return np.hstack((model_copy['inputToHid'].flatten(), model_copy['hidToClass'].flatten())) 87 | 88 | @staticmethod 89 | def theta_to_model(theta): 90 | """Takes a model (or gradient) in the form of one long vector (maybe produced by model_to_theta), 91 | and restores it to the structure format, i.e. with fields .input_to_hid and .hid_to_class, both matrices. 92 | """ 93 | n_hid = np.size(theta, 0) / (NUM_INPUT_UNITS + NUM_CLASSES) 94 | return {'inputToHid': np.reshape(theta[:NUM_INPUT_UNITS * n_hid], (n_hid, NUM_INPUT_UNITS)), 95 | 'hidToClass': np.reshape(theta[NUM_INPUT_UNITS * n_hid: np.size(theta, 0)], (NUM_CLASSES, n_hid))} 96 | 97 | def fit(self, X, y): 98 | """Fit a model using Classification gradient descent. 99 | """ 100 | self._d_loss_by_d_model(inputs=X, targets=y) 101 | return self 102 | 103 | def train(self, sequences): 104 | """Implements optimize(..) from assignment. This trains using gradient descent with momentum. 105 | 106 | Args: 107 | model_shape (tuple) : is the shape of the array of weights. 108 | gradient_function : a function that takes parameters and and returns the gradient 109 | (or approximate gradient in the case of CD-1) of the function that we're maximizing. 110 | Note the contrast with the loss function that we saw in PA3, which we were minimizing. 111 | The returned gradient is an array of the same shape as the provided parameter. 
112 | 113 | Returns: 114 | (numpy.array) : matrix of weights of the trained model (hid_to_class) 115 | """ 116 | self.reset_classifier() 117 | if self.early_stopping: 118 | best_so_far = dict() 119 | best_so_far['theta'] = None 120 | best_so_far['validationLoss'] = np.inf 121 | best_so_far['afterNIters'] = None 122 | 123 | n_training_cases = np.size(sequences['inputs'], 1) 124 | for i in xrange(self.n_iterations): 125 | training_batch_start = (i * self.batch_size) % n_training_cases 126 | training_batch_x = sequences['inputs'][:, training_batch_start: training_batch_start + self.batch_size] 127 | training_batch_y = sequences['targets'][:, training_batch_start: training_batch_start + self.batch_size] 128 | 129 | self.fit(training_batch_x, training_batch_y) 130 | self.momentum_speed = self.momentum_speed * self.train_momentum - self.gradient 131 | self.theta += self.momentum_speed * self.lr_net 132 | self.model = self.theta_to_model(self.theta) 133 | 134 | self.training_data_losses += [self.loss(sequences)] 135 | self.validation_data_losses += [self.loss(self.validation_data)] 136 | if self.early_stopping and self.validation_data_losses[-1] < best_so_far['validationLoss']: 137 | best_so_far['theta'] = copy.deepcopy(self.theta) # deepcopy avoids memory reference bug 138 | best_so_far['validationLoss'] = self.validation_data_losses[-1] 139 | best_so_far['afterNIters'] = i 140 | 141 | if np.mod(i, round(self.n_iterations / float(self.n_classes))) == 0: 142 | print 'After {0} optimization iterations, training data loss is {1}, and validation data ' \ 143 | 'loss is {2}'.format(i, self.training_data_losses[-1], self.validation_data_losses[-1]) 144 | 145 | # check gradient again, this time with more typical parameters and with a different data size 146 | if i == self.n_iterations: 147 | print 'Now testing the gradient on just a mini-batch instead of the whole training set... ' 148 | training_batch = {'inputs': training_batch_x, 'targets': training_batch_y} 149 | self.test_gradient(training_batch) 150 | 151 | if self.early_stopping: 152 | print 'Early stopping: validation loss was lowest after {0} iterations. ' \ 153 | 'We chose the model that we had then.'.format(best_so_far['afterNIters']) 154 | self.theta = copy.deepcopy(best_so_far['theta']) # deepcopy avoids memory reference bug 155 | 156 | def predict(self, x_sequences): 157 | """Predict a specific class from a given set of sequences. 158 | """ 159 | return np.argmax(self.predict_sequences_proba(x_sequences=x_sequences), axis=0) 160 | 161 | def predict_sequences_proba(self, x_sequences): 162 | """Predict the probability of each class in a given set of sequences. 163 | 164 | Returns: 165 | (numpy.array) : class input (size: by ) 166 | """ 167 | return self.predict_proba(x_sequences['inputs']) 168 | 169 | def predict_proba(self, inputs): 170 | """Predict the probability of each class given data inputs. 171 | 172 | Returns: 173 | (numpy.array) : probability of classes 174 | """ 175 | hid_input = np.dot(self.model['inputToHid'], inputs) 176 | hid_output = logistic(hid_input) # size: