├── HierarchicalSoftmax.py
├── LICENSE.md
├── README.md
├── Softmax.py
├── benchmark_functions.py
├── plots
│   ├── 500--20k_cost.png
│   ├── 500--20k_predicted_class.png
│   └── 500--20k_time.png
└── run.py

/HierarchicalSoftmax.py:
--------------------------------------------------------------------------------
import numpy
import theano
import theano.tensor as T


class HierarchicalSoftmax(object):
    """
    2-level hierarchical softmax layer. Adapted from the hierarchical softmax layer in the lisa-groundhog package:
    https://github.com/lisa-groundhog/GroundHog
    """

    def __init__(self, input_, target, n_in, n_out, create_zero_probabilities_for_output=False):
        """
        :type input_: theano.tensor.TensorType
        :param input_: symbolic variable that describes the input (one minibatch)

        :type target: theano.tensor.TensorType
        :param target: symbolic variable that describes the output class (one minibatch)

        :type n_in: int
        :param n_in: number of input units

        :type n_out: int
        :param n_out: number of output units

        :type create_zero_probabilities_for_output: bool
        :param create_zero_probabilities_for_output: whether or not to create zero probabilities for
                                                     non-target classes during training (eats up time)
        """
        self.zero_probs_for_output = create_zero_probabilities_for_output
        self.n_out = n_out

        # the output layer is a 2-level graph:
        # a predicted class label is defined as a fixed, arbitrary path through this graph.
        # we thus need at least sqrt(n_out) nodes in the first level
        # (the ceil of a scalar x is the smallest integer i such that i >= x)
        self.n_level1_nodes = numpy.ceil(numpy.sqrt(n_out)).astype('int64')
        # and ceil(n_out / n_level1_nodes) nodes in the second level -- note that we may end up
        # with a graph that has a few more possible paths than there are output classes
        self.n_level2_nodes = numpy.ceil(n_out / float(self.n_level1_nodes)).astype('int64')

        # define weight matrix 'W1' and bias 'b1' for the first level of the output graph
        self.W1 = theano.shared(value=numpy.zeros((n_in, self.n_level1_nodes), dtype=theano.config.floatX),
                                name='W1', borrow=True)
        self.b1 = theano.shared(value=numpy.zeros((self.n_level1_nodes,), dtype=theano.config.floatX),
                                name='b1', borrow=True)

        # define weight matrix 'W2' and bias 'b2' for the second level of the output graph
        self.W2 = theano.shared(value=numpy.zeros((n_in, self.n_level2_nodes), dtype=theano.config.floatX),
                                name='W2', borrow=True)
        self.b2 = theano.shared(value=numpy.zeros((self.n_level2_nodes,), dtype=theano.config.floatX),
                                name='b2', borrow=True)

        self.params = [self.W1, self.b1, self.W2, self.b2]

        self.p_y_given_x = self.forward_prop(input_, target)

        if self.zero_probs_for_output:
            self.cost = -T.mean(T.log(self.p_y_given_x)[T.arange(target.shape[0]), target])
        else:
            self.cost = -T.mean(T.log(self.p_y_given_x))
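
    # Worked example of the path layout: with n_out = 10 output classes, we get
    # n_level1_nodes = ceil(sqrt(10)) = 4 and n_level2_nodes = ceil(10 / 4) = 3,
    # i.e. 4 * 3 = 12 possible paths, of which the last 2 are unused. Class y is
    # identified with the path (y // n_level2_nodes, y % n_level2_nodes), so scoring
    # one target costs O(n_level1_nodes + n_level2_nodes), roughly O(2 * sqrt(n_out)),
    # instead of O(n_out) for a flat softmax.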

    def get_predictions(self, input_):
        return T.argmax(self.forward_prop(input_), axis=1)


    def forward_prop(self, input_, y_true=None):
        """
        If 'y_true' is None, compute the probability of each possible path through the output graph
        (= each possible output class). Otherwise, compute only the probability of taking the
        correct path, i.e. the probability of the target class.
        """
        level1_vals = T.nnet.softmax(T.dot(input_, self.W1) + self.b1)
        level2_vals = T.nnet.softmax(T.dot(input_, self.W2) + self.b2)

        batch_size = input_.shape[0]

        # compute all possible predictions [ time complexity is O(n_out) ]
        if y_true is None:

            def _path_probas(idx):
                lev1_vec, lev2_vec = level1_vals[idx], level2_vals[idx]
                # outer product of the two softmax vectors: entry (i, j) is the
                # probability of the path through level-1 node i and level-2 node j
                result, updates = theano.scan(fn=lambda k, array_: k * array_,
                                              sequences=lev1_vec,
                                              non_sequences=lev2_vec)
                return result.flatten()

            output, updates = theano.scan(fn=_path_probas, sequences=T.arange(batch_size))

            # since we may have more possible paths through the graph than output classes,
            # ignore the remaining paths
            output = output[:, :self.n_out]

        # compute only batch_size predictions [ time complexity is O(2 x sqrt(n_out)) = O(sqrt(n_out)) ]
        else:
            # assign to each class label a pair of nodes in level 1 and level 2 of the output graph.
            # the level-1 index must be computed with 'n_level2_nodes', so that the flattened path
            # index (level1_idx * n_level2_nodes + level2_idx) equals the class label itself -- this
            # keeps the mapping consistent with the flattening order in the 'y_true is None' branch
            # and ensures that no two classes share a path
            level1_idx = y_true // self.n_level2_nodes
            level2_idx = y_true % self.n_level2_nodes

            # calculate the probability of taking the correct path through the graph
            level1_val = level1_vals[T.arange(batch_size), level1_idx]
            level2_val = level2_vals[T.arange(batch_size), level2_idx]
            target_probas = level1_val * level2_val

            if self.zero_probs_for_output:
                # output is a matrix of predictions with dimensionality (batch_size, n_out).
                # since we only have a probability for the correct label,
                # we assign a probability of zero to all other labels
                output = T.zeros((batch_size, self.n_out))
                output = T.set_subtensor(output[T.arange(batch_size), y_true], target_probas)
            else:
                # use this branch if you want to save computation time by skipping the creation
                # of a matrix that contains mostly zeros; in this case, output will be a single
                # probability per example (for the target class)
                output = target_probas

        return output
--------------------------------------------------------------------------------
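The snippet below is a minimal usage sketch, not a file from the repository: it wires the layer to
symbolic inputs and compiles both the cheap O(sqrt(n_out)) training cost and the O(n_out) prediction
function. The toy sizes (n_in=3, n_out=100) and variable names are illustrative.

import numpy
import theano
import theano.tensor as T
from HierarchicalSoftmax import HierarchicalSoftmax

x = T.matrix('x')    # (batch_size, n_in) input minibatch
y = T.ivector('y')   # (batch_size,) integer class labels

layer = HierarchicalSoftmax(input_=x, target=y, n_in=3, n_out=100)

# cost of the target paths only -- the cheap training objective
cost_fn = theano.function([x, y], layer.cost)

# full argmax over all paths -- the expensive inference path
predict_fn = theano.function([x], layer.get_predictions(x))

inputs = numpy.random.rand(4, 3).astype(theano.config.floatX)
labels = numpy.asarray([1, 2, 3, 4], dtype='int32')
print cost_fn(inputs, labels)
print predict_fn(inputs)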
/LICENSE.md:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 Robert

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
This is a 2-level hierarchical softmax layer for Theano, based on an implementation from the GroundHog package: https://github.com/lisa-groundhog/GroundHog

benchmark_functions.py contains functionality for training flat and hierarchical softmax models on randomly generated data, then comparing the models in terms of (1) predictions on unseen data, (2) training loss, and (3) runtime. See run.py for some examples.

Dependencies:

- Theano (0.7.0)
- numpy (1.9.2)
- matplotlib (1.4.3)
--------------------------------------------------------------------------------
/Softmax.py:
--------------------------------------------------------------------------------
import numpy
import theano
import theano.tensor as T


class Softmax(object):
    """
    Softmax layer. Code is based on the logistic regression used in the Theano deep learning tutorial:
    http://deeplearning.net/tutorial/code/logistic_sgd.py
    """

    def __init__(self, input_, n_in, n_out):
        """
        :type input_: theano.tensor.TensorType
        :param input_: symbolic variable that describes the input (one minibatch)

        :type n_in: int
        :param n_in: number of input units

        :type n_out: int
        :param n_out: number of output units
        """
        self.input = input_

        self.W = theano.shared(value=numpy.zeros((n_in, n_out), dtype=theano.config.floatX), name='W', borrow=True)
        self.b = theano.shared(value=numpy.zeros((n_out,), dtype=theano.config.floatX), name='b', borrow=True)

        self.p_y_given_x = T.nnet.softmax(T.dot(input_, self.W) + self.b)
        self.params = [self.W, self.b]


    def get_predictions(self, input_):
        return T.argmax(T.nnet.softmax(T.dot(input_, self.W) + self.b), axis=1)


    def negative_log_likelihood(self, y):
        cost = -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
        return cost
--------------------------------------------------------------------------------
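For intuition, here is a numpy-only sketch (an illustration added for this write-up, not repository
code) of why the two-level factorization in HierarchicalSoftmax.py behaves like a probability
distribution over paths -- and why a little probability mass ends up on unused paths when the path
grid is larger than n_out:

import numpy

def softmax(z):
    e = numpy.exp(z - z.max())
    return e / e.sum()

n_out = 10
n_level1, n_level2 = 4, 3   # ceil(sqrt(10)) and ceil(10 / 4)

p1 = softmax(numpy.random.randn(n_level1))   # distribution over level-1 nodes
p2 = softmax(numpy.random.randn(n_level2))   # distribution over level-2 nodes

# probability of each path = outer product; flattening matches the class -> path mapping
paths = numpy.outer(p1, p2).flatten()
print paths.sum()          # 1.0 -- all 12 paths together form a distribution
print paths[:n_out].sum()  # < 1.0 -- some mass is lost to the 2 unused paths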
/benchmark_functions.py:
--------------------------------------------------------------------------------
import os
import time
import numpy
import theano

import theano.tensor as T

from HierarchicalSoftmax import HierarchicalSoftmax
from Softmax import Softmax


from matplotlib import pyplot
# set parameters for plots
pyplot.rcParams.update({'figure.figsize': (25, 20), 'font.size': 25})



########################################################################################################################

# helper functions for plotting

def save_plot_to(plot_dir, plot_name):
    pyplot.savefig(plot_dir + plot_name, additional_artists=get_paras_for_centering_legend_below_plot(),
                   bbox_inches='tight')
    pyplot.close()


def get_paras_for_centering_legend_below_plot():
    # get matplotlib parameters for centering the legend below plots
    lgd = pyplot.legend(loc=9, bbox_to_anchor=(0.5, -0.1), ncol=2)
    art = [lgd]
    return art


########################################################################################################################


def generate_data(n_classes, n_training_examples, input_size):
    """
    Generate dummy training data.

    Arguments:
    - n_classes: how many output classes there should be in the data set
    - n_training_examples: how many training examples there should be
    - input_size: length of each input vector

    Returns:
    - train_set_x: array of input vectors
    - train_set_y: array of integer classes, to be predicted from the vectors in 'train_set_x'
    """
    numpy.random.seed(123)
    train_set_x = [numpy.random.rand(input_size) for i in range(n_training_examples)]

    # balance the training data across classes:
    # if the number of training examples cannot be divided evenly by the number of classes,
    # assign class 0 to the remaining examples
    interval = n_training_examples / n_classes
    remainder = n_training_examples % n_classes
    train_set_y = [i for j in range(interval) for i in range(n_classes)] + [0 for j in range(remainder)]

    assert len(train_set_x) == len(train_set_y)

    train_set_x = theano.shared(numpy.asarray(train_set_x, dtype=theano.config.floatX), borrow=True)
    train_set_y = theano.shared(numpy.asarray(train_set_y, dtype=theano.config.floatX), borrow=True)
    train_set_y = T.cast(train_set_y, 'int32')

    return train_set_x, train_set_y
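A quick sanity check of generate_data (illustrative, not part of the repository; the expected
values follow directly from the seeding and balancing logic above):

import numpy
from benchmark_functions import generate_data

train_set_x, train_set_y = generate_data(n_classes=3, n_training_examples=10, input_size=4)

print train_set_x.get_value().shape   # (10, 4)
labels = train_set_y.eval()
print numpy.bincount(labels)          # [4 3 3] -- three examples per class, plus one extra of class 0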

def generate_data_train_softmax(n_classes, n_training_examples, input_size, n_epochs, learning_rate=0.1, batch_size=10,
                                hierarchical=False):
    """
    Train either a flat or a hierarchical softmax model on randomly generated data and return the predicted class for
    a single random test example, the average training loss at the last epoch,
    and the time it took to train the model.

    Arguments:
    - n_classes: how many output classes there should be in the randomly generated data set
    - n_training_examples: how many training examples there should be
    - input_size: length of each randomly generated input vector
    - n_epochs: number of training epochs
    - learning_rate: learning rate of the softmax model
    - batch_size: batch size for the softmax model
    - hierarchical: whether to train with hierarchical softmax (use flat softmax otherwise)

    Returns:
    - predicted: the predicted class for a single randomly generated test example
    - avg_loss: the average training loss over all batches at the last training epoch
    - total_train_time: the time it took, in minutes, to train the model
    """
    train_set_x, train_set_y = generate_data(n_classes, n_training_examples, input_size)
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

    print 'we have %s train batches' % n_train_batches

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # input data, presented as a matrix of feature vectors
    y = T.ivector('y')   # labels, presented as a 1D vector of [int] labels

    # instantiate the hierarchical softmax model and calculate gradients
    if hierarchical:

        softmax = HierarchicalSoftmax(input_=x, target=y, n_in=input_size, n_out=n_classes)
        cost = softmax.cost

        g_W = T.grad(cost=cost, wrt=softmax.W1)
        g_b = T.grad(cost=cost, wrt=softmax.b1)
        g_U = T.grad(cost=cost, wrt=softmax.W2)
        g_c = T.grad(cost=cost, wrt=softmax.b2)

        # vanilla stochastic gradient descent: param := param - learning_rate * gradient
        updates = [(softmax.W1, softmax.W1 - learning_rate * g_W),
                   (softmax.b1, softmax.b1 - learning_rate * g_b),
                   (softmax.W2, softmax.W2 - learning_rate * g_U),
                   (softmax.b2, softmax.b2 - learning_rate * g_c)]

    # instantiate the flat softmax model and calculate gradients
    else:
        softmax = Softmax(input_=x, n_in=input_size, n_out=n_classes)
        cost = softmax.negative_log_likelihood(y)

        g_W = T.grad(cost=cost, wrt=softmax.W)
        g_b = T.grad(cost=cost, wrt=softmax.b)

        updates = [(softmax.W, softmax.W - learning_rate * g_W),
                   (softmax.b, softmax.b - learning_rate * g_b)]

    # compile a Theano function `train_model` that returns the cost and at
    # the same time updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # train the model
    start_time = time.time()
    avg_loss = None
    for epoch in range(n_epochs):

        costs_over_batches = []
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)
            costs_over_batches.append(minibatch_avg_cost)

        avg_loss = numpy.mean(costs_over_batches)

        print 'Epoch: %s' % epoch
        print 'Loss: %s' % avg_loss
        print 'Time since beginning of training: %s' % ((time.time() - start_time) / 60)
        print

    total_train_time = (time.time() - start_time) / 60
    print 'Training took: %s' % total_train_time
    print '\n\n'

    # generate a single random test example
    numpy.random.seed(444)
    input_ = numpy.asarray([numpy.random.rand(input_size)])

    predictions = softmax.get_predictions(input_)

    # compute the class prediction
    preds_eval = predictions.eval()

    return preds_eval, avg_loss, total_train_time
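As a side note on the hierarchical layer's indexing, a small numpy check (again illustrative, not
repository code) that the class-to-path mapping used during training is consistent with the
flattened path ordering used during prediction:

import numpy

n_out = 10
n_level1 = int(numpy.ceil(numpy.sqrt(n_out)))        # 4
n_level2 = int(numpy.ceil(n_out / float(n_level1)))  # 3

y = numpy.arange(n_out)
level1_idx = y // n_level2
level2_idx = y % n_level2

# the flattened path index must recover the class label itself,
# so training and prediction agree on which path encodes which class
assert (level1_idx * n_level2 + level2_idx == y).all()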

def benchmark_softmax(n_classes_range, plot_name, n_data_points=50000, input_size=3, n_epochs=2):
    """
    Train a flat and a hierarchical softmax model for a range of different numbers of output classes.
    Then create three plots:

    - training time as a function of the number of output classes
    - the predicted class as a function of the number of classes (predicted class by model)
    - training loss at the last epoch as a function of the number of output classes

    The training time should increase linearly with the number of output classes for the flat model,
    whereas it should grow only roughly with the square root of the number of classes for the
    hierarchical model. Predicted classes should be equivalent most of the time. The training loss
    may differ across flat and hierarchical softmax.

    Arguments:
    - n_classes_range: list of numbers of classes for which to train models
    - plot_name: name of the plots -- they will be saved to os.getcwd() + '/plots/'
    """
    def get_benchmark_data(hierarchical=False):
        costs = []
        times = []
        preds = []
        for n_classes in n_classes_range:
            print 'training %s softmax model with %s classes' % ('hierarchical' if hierarchical else 'flat', n_classes)
            print
            pred, cost, train_time = generate_data_train_softmax(n_classes, n_data_points, input_size, n_epochs,
                                                                 hierarchical=hierarchical)
            costs.append(cost)
            times.append(train_time)
            preds.append(pred)
        return costs, times, preds

    def plot_benchmark(xs, hierarchical_ys, flat_ys, y_axis_label):
        pyplot.plot(xs, hierarchical_ys, 'o-', markersize=40, linewidth=9, label='hierarchical softmax')
        pyplot.plot(xs, flat_ys, 'o-', markersize=40, linewidth=9, label='flat softmax')
        pyplot.xlabel('nr of output classes', fontsize=40)
        pyplot.ylabel(y_axis_label, fontsize=40)
        plot_dir = os.getcwd() + '/plots/'
        save_plot_to(plot_dir, '%s_%s' % (plot_name, y_axis_label))

    # train hierarchical softmax models
    h_cost, h_time, h_preds = get_benchmark_data(hierarchical=True)

    # train flat softmax models
    f_cost, f_time, f_preds = get_benchmark_data(hierarchical=False)

    # plot the results
    plot_benchmark(n_classes_range, h_cost, f_cost, 'cost')
    plot_benchmark(n_classes_range, h_time, f_time, 'time')
    plot_benchmark(n_classes_range, h_preds, f_preds, 'predicted_class')
--------------------------------------------------------------------------------
/plots/500--20k_cost.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RobGrimm/HierarchicalSoftmax/1c15f4a411996d770074f5c99236001e19862acd/plots/500--20k_cost.png
--------------------------------------------------------------------------------
/plots/500--20k_predicted_class.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RobGrimm/HierarchicalSoftmax/1c15f4a411996d770074f5c99236001e19862acd/plots/500--20k_predicted_class.png
--------------------------------------------------------------------------------
/plots/500--20k_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RobGrimm/HierarchicalSoftmax/1c15f4a411996d770074f5c99236001e19862acd/plots/500--20k_time.png
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
from benchmark_functions import generate_data_train_softmax
from benchmark_functions import benchmark_softmax

# train hierarchical softmax on a data set with 5,000 output classes
generate_data_train_softmax(n_classes=5000, n_training_examples=50000, input_size=3, hierarchical=True, n_epochs=2,
                            batch_size=50)


# train flat softmax on the same data -- this will take longer
generate_data_train_softmax(n_classes=5000, n_training_examples=50000, input_size=3, hierarchical=False, n_epochs=2,
                            batch_size=50)


# train hierarchical softmax on a data set with 1 million output classes --
# this would take much longer with flat softmax
generate_data_train_softmax(n_classes=1000000, n_training_examples=50000, input_size=3, hierarchical=True, n_epochs=2,
                            batch_size=50)


# for varying numbers of output classes (1,000 to 20,000, in increments of 1,000),
# train both a flat and a hierarchical softmax model. keep track of the total
# training time and the training loss at the last epoch for each model.
# then plot cost and training time as a function of the number of output classes,
# as well as the predicted class for a randomly generated test example.
# save the plots to os.getcwd() + '/plots/'
benchmark_softmax(n_classes_range=range(1000, 21000, 1000), plot_name='500--20k', input_size=100, n_data_points=500,
                  n_epochs=2)
--------------------------------------------------------------------------------