├── HierarchicalSoftmax.py
├── LICENSE.md
├── README.md
├── Softmax.py
├── benchmark_functions.py
├── plots
│   ├── 500--20k_cost.png
│   ├── 500--20k_predicted_class.png
│   └── 500--20k_time.png
└── run.py

/HierarchicalSoftmax.py:
--------------------------------------------------------------------------------
import numpy
import theano
import theano.tensor as T


class HierarchicalSoftmax(object):
    """
    2-level hierarchical softmax layer. Adapted from the hierarchical softmax layer in the lisa-groundhog package:
    https://github.com/lisa-groundhog/GroundHog
    """

    def __init__(self, input_, target, n_in, n_out, create_zero_probabilities_for_output=False):
        """
        :type input_: theano.tensor.TensorType
        :param input_: symbolic variable that describes the input (one minibatch)

        :type target: theano.tensor.TensorType
        :param target: symbolic variable that describes the output class (one minibatch)

        :type n_in: int
        :param n_in: number of input units

        :type n_out: int
        :param n_out: number of output units

        :type create_zero_probabilities_for_output: bool
        :param create_zero_probabilities_for_output: whether or not to create zero probabilities for
                                                     non-target classes during training (eats up time)
        """
        self.zero_probs_for_output = create_zero_probabilities_for_output
        self.n_out = n_out

        # the output layer is a 2-level graph:
        # a predicted class label is defined as a fixed, arbitrary path through this graph.
        # we thus need at least sqrt(n_out) nodes in the first level
        # (the ceil of a scalar x is the smallest integer i such that i >= x)
        self.n_level1_nodes = numpy.ceil(numpy.sqrt(n_out)).astype('int64')
        # and ceil(n_out / n_level1_nodes) nodes in the second level -- note that we may end up
        # with a graph that has a few more possible paths than there are output classes
        self.n_level2_nodes = numpy.ceil(n_out / float(self.n_level1_nodes)).astype('int64')

        # define weight matrix 'W1' and bias 'b1' for the first level of the output graph
        self.W1 = theano.shared(value=numpy.zeros((n_in, self.n_level1_nodes), dtype=theano.config.floatX),
                                name='W1', borrow=True)
        self.b1 = theano.shared(value=numpy.zeros((self.n_level1_nodes,), dtype=theano.config.floatX),
                                name='b1', borrow=True)

        # define weight matrix 'W2' and bias 'b2' for the second level of the output graph
        self.W2 = theano.shared(value=numpy.zeros((n_in, self.n_level2_nodes), dtype=theano.config.floatX),
                                name='W2', borrow=True)
        self.b2 = theano.shared(value=numpy.zeros((self.n_level2_nodes,), dtype=theano.config.floatX),
                                name='b2', borrow=True)

        self.params = [self.W1, self.b1, self.W2, self.b2]

        self.p_y_given_x = self.forward_prop(input_, target)

        if self.zero_probs_for_output:
            self.cost = -T.mean(T.log(self.p_y_given_x)[T.arange(target.shape[0]), target])
        else:
            self.cost = -T.mean(T.log(self.p_y_given_x))
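
    # Worked example of the path layout: with n_out = 10 output classes, we get
    # n_level1_nodes = ceil(sqrt(10)) = 4 and n_level2_nodes = ceil(10 / 4) = 3,
    # i.e. 4 * 3 = 12 possible paths, of which the last 2 are unused. Class y is
    # identified with the path (y // n_level2_nodes, y % n_level2_nodes), so scoring
    # one target costs O(n_level1_nodes + n_level2_nodes), roughly O(2 * sqrt(n_out)),
    # instead of O(n_out) for a flat softmax.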

    def get_predictions(self, input_):
        return T.argmax(self.forward_prop(input_), axis=1)


    def forward_prop(self, input_, y_true=None):
        """
        If 'y_true' is None, compute the probability of each possible path through the output graph
        (= each possible output class). Otherwise, compute only the probability of taking the
        correct path, i.e. the probability of the target class.
        """
        level1_vals = T.nnet.softmax(T.dot(input_, self.W1) + self.b1)
        level2_vals = T.nnet.softmax(T.dot(input_, self.W2) + self.b2)

        batch_size = input_.shape[0]

        # compute all possible predictions [ time complexity is O(n_out) ]
        if y_true is None:

            def _path_probas(idx):
                lev1_vec, lev2_vec = level1_vals[idx], level2_vals[idx]
                # outer product of the two softmax vectors: entry (i, j) is the
                # probability of the path through level-1 node i and level-2 node j
                result, updates = theano.scan(fn=lambda k, array_: k * array_,
                                              sequences=lev1_vec,
                                              non_sequences=lev2_vec)
                return result.flatten()

            output, updates = theano.scan(fn=_path_probas, sequences=T.arange(batch_size))

            # since we may have more possible paths through the graph than output classes,
            # ignore the remaining paths
            output = output[:, :self.n_out]

        # compute only batch_size predictions [ time complexity is O(2 x sqrt(n_out)) = O(sqrt(n_out)) ]
        else:
            # assign to each class label a pair of nodes in level 1 and level 2 of the output graph.
            # the level-1 index must be computed with 'n_level2_nodes', so that the flattened path
            # index (level1_idx * n_level2_nodes + level2_idx) equals the class label itself -- this
            # keeps the mapping consistent with the flattening order in the 'y_true is None' branch
            # and ensures that no two classes share a path
            level1_idx = y_true // self.n_level2_nodes
            level2_idx = y_true % self.n_level2_nodes

            # calculate the probability of taking the correct path through the graph
            level1_val = level1_vals[T.arange(batch_size), level1_idx]
            level2_val = level2_vals[T.arange(batch_size), level2_idx]
            target_probas = level1_val * level2_val

            if self.zero_probs_for_output:
                # output is a matrix of predictions with dimensionality (batch_size, n_out).
                # since we only have a probability for the correct label,
                # we assign a probability of zero to all other labels
                output = T.zeros((batch_size, self.n_out))
                output = T.set_subtensor(output[T.arange(batch_size), y_true], target_probas)
            else:
                # use this branch if you want to save computation time by skipping the creation
                # of a matrix that contains mostly zeros; in this case, output will be a single
                # probability per example (for the target class)
                output = target_probas

        return output
--------------------------------------------------------------------------------
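The snippet below is a minimal usage sketch, not a file from the repository: it wires the layer to
symbolic inputs and compiles both the cheap O(sqrt(n_out)) training cost and the O(n_out) prediction
function. The toy sizes (n_in=3, n_out=100) and variable names are illustrative.

import numpy
import theano
import theano.tensor as T
from HierarchicalSoftmax import HierarchicalSoftmax

x = T.matrix('x')    # (batch_size, n_in) input minibatch
y = T.ivector('y')   # (batch_size,) integer class labels

layer = HierarchicalSoftmax(input_=x, target=y, n_in=3, n_out=100)

# cost of the target paths only -- the cheap training objective
cost_fn = theano.function([x, y], layer.cost)

# full argmax over all paths -- the expensive inference path
predict_fn = theano.function([x], layer.get_predictions(x))

inputs = numpy.random.rand(4, 3).astype(theano.config.floatX)
labels = numpy.asarray([1, 2, 3, 4], dtype='int32')
print cost_fn(inputs, labels)
print predict_fn(inputs)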
/LICENSE.md:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 Robert

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
This is a 2-level hierarchical softmax layer for Theano, based on an implementation from the GroundHog package: https://github.com/lisa-groundhog/GroundHog

benchmark_functions.py contains functionality for training flat and hierarchical softmax models on randomly generated data, then comparing the models in terms of (1) predictions on unseen data, (2) training loss, and (3) runtime. See run.py for some examples.

Dependencies:

- Theano (0.7.0)
- numpy (1.9.2)
- matplotlib (1.4.3)
--------------------------------------------------------------------------------
/Softmax.py:
--------------------------------------------------------------------------------
import numpy
import theano
import theano.tensor as T


class Softmax(object):
    """
    Softmax layer. Code is based on the logistic regression used in the Theano deep learning tutorial:
    http://deeplearning.net/tutorial/code/logistic_sgd.py
    """

    def __init__(self, input_, n_in, n_out):
        """
        :type input_: theano.tensor.TensorType
        :param input_: symbolic variable that describes the input (one minibatch)

        :type n_in: int
        :param n_in: number of input units

        :type n_out: int
        :param n_out: number of output units
        """
        self.input = input_

        self.W = theano.shared(value=numpy.zeros((n_in, n_out), dtype=theano.config.floatX), name='W', borrow=True)
        self.b = theano.shared(value=numpy.zeros((n_out,), dtype=theano.config.floatX), name='b', borrow=True)

        self.p_y_given_x = T.nnet.softmax(T.dot(input_, self.W) + self.b)
        self.params = [self.W, self.b]


    def get_predictions(self, input_):
        return T.argmax(T.nnet.softmax(T.dot(input_, self.W) + self.b), axis=1)


    def negative_log_likelihood(self, y):
        cost = -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
        return cost
--------------------------------------------------------------------------------
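For intuition, here is a numpy-only sketch (an illustration added for this write-up, not repository
code) of why the two-level factorization in HierarchicalSoftmax.py behaves like a probability
distribution over paths -- and why a little probability mass ends up on unused paths when the path
grid is larger than n_out:

import numpy

def softmax(z):
    e = numpy.exp(z - z.max())
    return e / e.sum()

n_out = 10
n_level1, n_level2 = 4, 3   # ceil(sqrt(10)) and ceil(10 / 4)

p1 = softmax(numpy.random.randn(n_level1))   # distribution over level-1 nodes
p2 = softmax(numpy.random.randn(n_level2))   # distribution over level-2 nodes

# probability of each path = outer product; flattening matches the class -> path mapping
paths = numpy.outer(p1, p2).flatten()
print paths.sum()          # 1.0 -- all 12 paths together form a distribution
print paths[:n_out].sum()  # < 1.0 -- some mass is lost to the 2 unused paths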
/benchmark_functions.py:
--------------------------------------------------------------------------------
import os
import time
import numpy
import theano

import theano.tensor as T

from HierarchicalSoftmax import HierarchicalSoftmax
from Softmax import Softmax


from matplotlib import pyplot
# set parameters for plots
pyplot.rcParams.update({'figure.figsize': (25, 20), 'font.size': 25})



########################################################################################################################

# helper functions for plotting

def save_plot_to(plot_dir, plot_name):
    pyplot.savefig(plot_dir + plot_name, additional_artists=get_paras_for_centering_legend_below_plot(),
                   bbox_inches='tight')
    pyplot.close()


def get_paras_for_centering_legend_below_plot():
    # get matplotlib parameters for centering the legend below plots
    lgd = pyplot.legend(loc=9, bbox_to_anchor=(0.5, -0.1), ncol=2)
    art = [lgd]
    return art


########################################################################################################################


def generate_data(n_classes, n_training_examples, input_size):
    """
    Generate dummy training data.

    Arguments:
    - n_classes: how many output classes there should be in the data set
    - n_training_examples: how many training examples there should be
    - input_size: length of each input vector

    Returns:
    - train_set_x: array of input vectors
    - train_set_y: array of integer classes, to be predicted from the vectors in 'train_set_x'
    """
    numpy.random.seed(123)
    train_set_x = [numpy.random.rand(input_size) for i in range(n_training_examples)]

    # balance the training data across classes:
    # if the number of training examples cannot be divided evenly by the number of classes,
    # assign class 0 to the remaining examples
    interval = n_training_examples / n_classes
    remainder = n_training_examples % n_classes
    train_set_y = [i for j in range(interval) for i in range(n_classes)] + [0 for j in range(remainder)]

    assert len(train_set_x) == len(train_set_y)

    train_set_x = theano.shared(numpy.asarray(train_set_x, dtype=theano.config.floatX), borrow=True)
    train_set_y = theano.shared(numpy.asarray(train_set_y, dtype=theano.config.floatX), borrow=True)
    train_set_y = T.cast(train_set_y, 'int32')

    return train_set_x, train_set_y
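A quick sanity check of generate_data (illustrative, not part of the repository; the expected
values follow directly from the seeding and balancing logic above):

import numpy
from benchmark_functions import generate_data

train_set_x, train_set_y = generate_data(n_classes=3, n_training_examples=10, input_size=4)

print train_set_x.get_value().shape   # (10, 4)
labels = train_set_y.eval()
print numpy.bincount(labels)          # [4 3 3] -- three examples per class, plus one extra of class 0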

def generate_data_train_softmax(n_classes, n_training_examples, input_size, n_epochs, learning_rate=0.1, batch_size=10,
                                hierarchical=False):
    """
    Train either a flat or a hierarchical softmax model on randomly generated data and return the predicted class for
    a single random test example, the average training loss at the last epoch,
    and the time it took to train the model.

    Arguments:
    - n_classes: how many output classes there should be in the randomly generated data set
    - n_training_examples: how many training examples there should be
    - input_size: length of each randomly generated input vector
    - n_epochs: number of training epochs
    - learning_rate: learning rate of the softmax model
    - batch_size: batch size for the softmax model
    - hierarchical: whether to train with hierarchical softmax (use flat softmax otherwise)

    Returns:
    - predicted: the predicted class for a single randomly generated test example
    - avg_loss: the average training loss over all batches at the last training epoch
    - total_train_time: the time it took, in minutes, to train the model
    """
    train_set_x, train_set_y = generate_data(n_classes, n_training_examples, input_size)
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

    print 'we have %s train batches' % n_train_batches

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # input data, presented as a matrix of feature vectors
    y = T.ivector('y')   # labels, presented as a 1D vector of [int] labels

    # instantiate the hierarchical softmax model and calculate gradients
    if hierarchical:

        softmax = HierarchicalSoftmax(input_=x, target=y, n_in=input_size, n_out=n_classes)
        cost = softmax.cost

        g_W = T.grad(cost=cost, wrt=softmax.W1)
        g_b = T.grad(cost=cost, wrt=softmax.b1)
        g_U = T.grad(cost=cost, wrt=softmax.W2)
        g_c = T.grad(cost=cost, wrt=softmax.b2)

        # vanilla stochastic gradient descent: param := param - learning_rate * gradient
        updates = [(softmax.W1, softmax.W1 - learning_rate * g_W),
                   (softmax.b1, softmax.b1 - learning_rate * g_b),
                   (softmax.W2, softmax.W2 - learning_rate * g_U),
                   (softmax.b2, softmax.b2 - learning_rate * g_c)]

    # instantiate the flat softmax model and calculate gradients
    else:
        softmax = Softmax(input_=x, n_in=input_size, n_out=n_classes)
        cost = softmax.negative_log_likelihood(y)

        g_W = T.grad(cost=cost, wrt=softmax.W)
        g_b = T.grad(cost=cost, wrt=softmax.b)

        updates = [(softmax.W, softmax.W - learning_rate * g_W),
                   (softmax.b, softmax.b - learning_rate * g_b)]

    # compile a Theano function `train_model` that returns the cost and at
    # the same time updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # train the model
    start_time = time.time()
    avg_loss = None
    for epoch in range(n_epochs):

        costs_over_batches = []
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)
            costs_over_batches.append(minibatch_avg_cost)

        avg_loss = numpy.mean(costs_over_batches)

        print 'Epoch: %s' % epoch
        print 'Loss: %s' % avg_loss
        print 'Time since beginning of training: %s' % ((time.time() - start_time) / 60)
        print

    total_train_time = (time.time() - start_time) / 60
    print 'Training took: %s' % total_train_time
    print '\n\n'

    # generate a single random test example
    numpy.random.seed(444)
    input_ = numpy.asarray([numpy.random.rand(input_size)])

    predictions = softmax.get_predictions(input_)

    # compute the class prediction
    preds_eval = predictions.eval()

    return preds_eval, avg_loss, total_train_time
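As a side note on the hierarchical layer's indexing, a small numpy check (again illustrative, not
repository code) that the class-to-path mapping used during training is consistent with the
flattened path ordering used during prediction:

import numpy

n_out = 10
n_level1 = int(numpy.ceil(numpy.sqrt(n_out)))        # 4
n_level2 = int(numpy.ceil(n_out / float(n_level1)))  # 3

y = numpy.arange(n_out)
level1_idx = y // n_level2
level2_idx = y % n_level2

# the flattened path index must recover the class label itself,
# so training and prediction agree on which path encodes which class
assert (level1_idx * n_level2 + level2_idx == y).all()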

def benchmark_softmax(n_classes_range, plot_name, n_data_points=50000, input_size=3, n_epochs=2):
    """
    Train a flat and a hierarchical softmax model for a range of different numbers of output classes.
    Then create three plots:

    - training time as a function of the number of output classes
    - the predicted class as a function of the number of classes (predicted class by model)
    - training loss at the last epoch as a function of the number of output classes

    The training time should increase linearly with the number of output classes for the flat model,
    whereas it should grow only roughly with the square root of the number of classes for the
    hierarchical model. Predicted classes should be equivalent most of the time. The training loss
    may differ across flat and hierarchical softmax.

    Arguments:
    - n_classes_range: list of numbers of classes for which to train models
    - plot_name: name of the plots -- they will be saved to os.getcwd() + '/plots/'
    """
    def get_benchmark_data(hierarchical=False):
        costs = []
        times = []
        preds = []
        for n_classes in n_classes_range:
            print 'training %s softmax model with %s classes' % ('hierarchical' if hierarchical else 'flat', n_classes)
            print
            pred, cost, train_time = generate_data_train_softmax(n_classes, n_data_points, input_size, n_epochs,
                                                                 hierarchical=hierarchical)
            costs.append(cost)
            times.append(train_time)
            preds.append(pred)
        return costs, times, preds

    def plot_benchmark(xs, hierarchical_ys, flat_ys, y_axis_label):
        pyplot.plot(xs, hierarchical_ys, 'o-', markersize=40, linewidth=9, label='hierarchical softmax')
        pyplot.plot(xs, flat_ys, 'o-', markersize=40, linewidth=9, label='flat softmax')
        pyplot.xlabel('nr of output classes', fontsize=40)
        pyplot.ylabel(y_axis_label, fontsize=40)
        plot_dir = os.getcwd() + '/plots/'
        save_plot_to(plot_dir, '%s_%s' % (plot_name, y_axis_label))

    # train hierarchical softmax models
    h_cost, h_time, h_preds = get_benchmark_data(hierarchical=True)

    # train flat softmax models
    f_cost, f_time, f_preds = get_benchmark_data(hierarchical=False)

    # plot the results
    plot_benchmark(n_classes_range, h_cost, f_cost, 'cost')
    plot_benchmark(n_classes_range, h_time, f_time, 'time')
    plot_benchmark(n_classes_range, h_preds, f_preds, 'predicted_class')
--------------------------------------------------------------------------------
/plots/500--20k_cost.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RobGrimm/HierarchicalSoftmax/1c15f4a411996d770074f5c99236001e19862acd/plots/500--20k_cost.png
--------------------------------------------------------------------------------
/plots/500--20k_predicted_class.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RobGrimm/HierarchicalSoftmax/1c15f4a411996d770074f5c99236001e19862acd/plots/500--20k_predicted_class.png
--------------------------------------------------------------------------------
/plots/500--20k_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RobGrimm/HierarchicalSoftmax/1c15f4a411996d770074f5c99236001e19862acd/plots/500--20k_time.png
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
from benchmark_functions import generate_data_train_softmax
from benchmark_functions import benchmark_softmax

# train hierarchical softmax on a data set with 5,000 output classes
generate_data_train_softmax(n_classes=5000, n_training_examples=50000, input_size=3, hierarchical=True, n_epochs=2,
                            batch_size=50)


# train flat softmax on the same data -- this will take longer
generate_data_train_softmax(n_classes=5000, n_training_examples=50000, input_size=3, hierarchical=False, n_epochs=2,
                            batch_size=50)


# train hierarchical softmax on a data set with 1 million output classes --
# this would take much longer with flat softmax
generate_data_train_softmax(n_classes=1000000, n_training_examples=50000, input_size=3, hierarchical=True, n_epochs=2,
                            batch_size=50)


# for varying numbers of output classes (1,000 to 20,000, in increments of 1,000),
# train both a flat and a hierarchical softmax model. keep track of the total
# training time and the training loss at the last epoch for each model.
# then plot cost and training time as a function of the number of output classes,
# as well as the predicted class for a randomly generated test example.
# save the plots to os.getcwd() + '/plots/'
benchmark_softmax(n_classes_range=range(1000, 21000, 1000), plot_name='500--20k', input_size=100, n_data_points=500,
                  n_epochs=2)
--------------------------------------------------------------------------------