├── README.md
├── binary_layers.py
├── binary_ops.py
├── mnist_cnn.py
└── mnist_mlp.py

/README.md:
--------------------------------------------------------------------------------
**Deprecated!** This repo is based on Keras 1.2 and is no longer maintained.
Check [here](https://github.com/DingKe/nn_playground/tree/master/binarynet) if you use Keras 2.0.

An implementation of BinaryNet for Keras.

The binarized Dense and Conv2D are two Keras layers, so they can be integrated into the Keras framework out of the box.

To run the demos:

    python mnist_mlp.py  # train a binary MLP model on MNIST
    python mnist_cnn.py  # train a binary CNN model on MNIST

The code follows the [Theano version](https://github.com/MatthieuCourbariaux/BinaryNet).
The only missing ingredient is that the learning rate is not scaled w.r.t. each weight's fan-in and fan-out.
(An involved [patch](https://github.com/fchollet/keras/pull/3004) is needed for that.)

Reference paper: [BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1](http://arxiv.org/abs/1602.02830)
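For illustration, a minimal usage sketch with Keras 1.2 (the layer sizes, loss and optimizer below are arbitrary placeholders, not the settings of the demo scripts; see `mnist_mlp.py` and `mnist_cnn.py` for the actual configurations):

```python
# Minimal sketch: plug the binarized layers into a plain Keras 1.2 Sequential model.
from keras.models import Sequential
from keras.layers import Activation, BatchNormalization

from binary_layers import BinaryDense
from binary_ops import binary_tanh

model = Sequential()
model.add(BinaryDense(128, H=1., W_lr_multiplier='Glorot', bias=False, input_dim=784))
model.add(BatchNormalization())
model.add(Activation(binary_tanh))
model.add(BinaryDense(10, H=1., W_lr_multiplier='Glorot', bias=False))
model.compile(loss='squared_hinge', optimizer='adam', metrics=['acc'])
```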
--------------------------------------------------------------------------------
/binary_layers.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import numpy as np

from keras import backend as K

from keras.layers import InputSpec, Layer, Dense, Convolution2D
from keras import constraints
from keras import initializations

from binary_ops import binarize


class Clip(constraints.Constraint):
    def __init__(self, min_value, max_value=None):
        self.min_value = min_value
        self.max_value = max_value
        if not self.max_value:
            self.max_value = -self.min_value
        if self.min_value > self.max_value:
            self.min_value, self.max_value = self.max_value, self.min_value

    def __call__(self, p):
        return K.clip(p, self.min_value, self.max_value)

    def get_config(self):
        return {"name": self.__call__.__name__,
                "min_value": self.min_value,
                "max_value": self.max_value}


class BinaryDense(Dense):
    '''Binarized Dense layer

    References:
    "BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1" [http://arxiv.org/abs/1602.02830]
    '''
    def __init__(self, output_dim, H=1., W_lr_multiplier='Glorot', b_lr_multiplier=None, **kwargs):
        self.H = H
        self.W_lr_multiplier = W_lr_multiplier
        self.b_lr_multiplier = b_lr_multiplier

        super(BinaryDense, self).__init__(output_dim, **kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 2
        input_dim = input_shape[1]
        self.input_spec = [InputSpec(dtype=K.floatx(),
                                     shape=(None, input_dim))]

        if self.H == 'Glorot':
            self.H = np.float32(np.sqrt(1.5 / (input_dim + self.output_dim)))
            #print('Glorot H: {}'.format(self.H))

        if self.W_lr_multiplier == 'Glorot':
            self.W_lr_multiplier = np.float32(1. / np.sqrt(1.5 / (input_dim + self.output_dim)))
            #print('Glorot learning rate multiplier: {}'.format(self.W_lr_multiplier))

        self.W_constraint = Clip(-self.H, self.H)

        self.init = initializations.get('uniform')
        self.W = self.init((input_dim, self.output_dim), scale=self.H,
                           name='{}_W'.format(self.name))
        self.trainable_weights = [self.W]

        if self.bias:
            self.lr_multipliers = [self.W_lr_multiplier, self.b_lr_multiplier]
        else:
            self.lr_multipliers = [self.W_lr_multiplier]

        self.regularizers = []
        if self.W_regularizer:
            self.W_regularizer.set_param(self.W)
            self.regularizers.append(self.W_regularizer)

        if self.activity_regularizer:
            self.activity_regularizer.set_layer(self)
            self.regularizers.append(self.activity_regularizer)

        self.constraints = {}
        if self.W_constraint:
            self.constraints[self.W] = self.W_constraint

        if self.bias:
            self.b = K.zeros((self.output_dim,),
                             name='{}_b'.format(self.name))
            self.trainable_weights += [self.b]

            if self.b_regularizer:
                self.b_regularizer.set_param(self.b)
                self.regularizers.append(self.b_regularizer)

            if self.b_constraint:
                self.constraints[self.b] = self.b_constraint

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights

    def call(self, x, mask=None):
        Wb = binarize(self.W, H=self.H)

        if self.bias:
            output = self.activation(K.dot(x, Wb) + self.b)
        else:
            output = self.activation(K.dot(x, Wb))
        return output

    def get_config(self):
        config = {'H': self.H,
                  'W_lr_multiplier': self.W_lr_multiplier,
                  'b_lr_multiplier': self.b_lr_multiplier}
        base_config = super(BinaryDense, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

class BinaryConvolution2D(Convolution2D):
    '''Binarized Convolution2D layer

    References:
    "BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1" [http://arxiv.org/abs/1602.02830]
    '''
    def __init__(self, nb_filter, nb_row, nb_col, W_lr_multiplier='Glorot',
                 b_lr_multiplier=None, H=1., **kwargs):
        self.H = H
        self.W_lr_multiplier = W_lr_multiplier
        self.b_lr_multiplier = b_lr_multiplier

        super(BinaryConvolution2D, self).__init__(nb_filter, nb_row, nb_col, **kwargs)

    def build(self, input_shape):
        if self.dim_ordering == 'th':
            stack_size = input_shape[1]
            self.W_shape = (self.nb_filter, stack_size, self.nb_row, self.nb_col)
        elif self.dim_ordering == 'tf':
            stack_size = input_shape[3]
            self.W_shape = (self.nb_row, self.nb_col, stack_size, self.nb_filter)
        else:
            raise Exception('Invalid dim_ordering: ' + self.dim_ordering)

        if self.H == 'Glorot':
            nb_input = int(stack_size * self.nb_row * self.nb_col)
            nb_output = int(self.nb_filter * self.nb_row * self.nb_col)
            self.H = np.float32(np.sqrt(1.5 / (nb_input + nb_output)))
            #print('Glorot H: {}'.format(self.H))

        if self.W_lr_multiplier == 'Glorot':
            nb_input = int(stack_size * self.nb_row * self.nb_col)
            nb_output = int(self.nb_filter * self.nb_row * self.nb_col)
            self.W_lr_multiplier = np.float32(1. / np.sqrt(1.5 / (nb_input + nb_output)))
            #print('Glorot learning rate multiplier: {}'.format(self.W_lr_multiplier))

        self.init = initializations.get('uniform')
        self.W = self.init(self.W_shape, scale=self.H, name='{}_W'.format(self.name))
        self.trainable_weights = [self.W]

        if self.bias:
            self.lr_multipliers = [self.W_lr_multiplier, self.b_lr_multiplier]
        else:
            self.lr_multipliers = [self.W_lr_multiplier]

        self.regularizers = []
        if self.W_regularizer:
            self.W_regularizer.set_param(self.W)
            self.regularizers.append(self.W_regularizer)

        if self.activity_regularizer:
            self.activity_regularizer.set_layer(self)
            self.regularizers.append(self.activity_regularizer)

        self.constraints = {}
        if self.W_constraint:
            self.constraints[self.W] = self.W_constraint

        if self.bias:
            self.b = K.zeros((self.nb_filter,), name='{}_b'.format(self.name))
            self.trainable_weights += [self.b]

            if self.b_regularizer:
                self.b_regularizer.set_param(self.b)
                self.regularizers.append(self.b_regularizer)

            if self.b_constraint:
                self.constraints[self.b] = self.b_constraint

        W_constraint = Clip(-self.H, self.H)
        self.constraints[self.W] = W_constraint

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights

    def call(self, x, mask=None):
        Wb = binarize(self.W, H=self.H)
        conv_out = K.conv2d(x, Wb, strides=self.subsample,
                            border_mode=self.border_mode,
                            dim_ordering=self.dim_ordering,
                            filter_shape=self.W_shape)

        if self.bias:
            if self.dim_ordering == 'th':
                conv_out = conv_out + K.reshape(self.b, (1, self.nb_filter, 1, 1))
            elif self.dim_ordering == 'tf':
                conv_out = conv_out + K.reshape(self.b, (1, 1, 1, self.nb_filter))
            else:
                raise Exception('Invalid dim_ordering: ' + self.dim_ordering)

        output = self.activation(conv_out)
        return output

    def get_config(self):
        config = {'H': self.H,
                  'W_lr_multiplier': self.W_lr_multiplier,
                  'b_lr_multiplier': self.b_lr_multiplier}
        base_config = super(BinaryConvolution2D, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
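

if __name__ == '__main__':
    # Illustrative smoke test (not part of the original file): build a tiny model with
    # BinaryDense and check that the effective kernel only takes the values -H and +H.
    # The layer sizes and H value below are arbitrary.
    from keras.models import Sequential

    model = Sequential()
    model.add(BinaryDense(8, H=1., W_lr_multiplier=1., bias=False, input_dim=16, name='bd'))

    Wb = K.eval(binarize(model.layers[0].W, H=model.layers[0].H))
    assert np.all(np.abs(Wb) == 1.0)
    print('binarized kernel values:', np.unique(Wb))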
--------------------------------------------------------------------------------
/binary_ops.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import keras.backend as K


def round_through(x):
    '''Element-wise rounding to the closest integer with full gradient propagation.
    A trick from [Sergey Ioffe](http://stackoverflow.com/a/36480182).
    '''
    rounded = K.round(x)
    return x + K.stop_gradient(rounded - x)


def _hard_sigmoid(x):
    '''Hard sigmoid, different from the more conventional form (see the definition of K.hard_sigmoid).

    # Reference:
    - [BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Courbariaux et al. 2016](http://arxiv.org/abs/1602.02830)
    '''
    x = (0.5 * x) + 0.5
    return K.clip(x, 0, 1)


def binary_sigmoid(x):
    '''Binary hard sigmoid for training binarized neural networks.

    # Reference:
    - [BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Courbariaux et al. 2016](http://arxiv.org/abs/1602.02830)
    '''
    return round_through(_hard_sigmoid(x))


def binary_tanh(x):
    '''Binary hard tanh for training binarized neural networks.

    The neurons' activation binarization function.
    It behaves like the sign function during forward propagation,
    and during back propagation like
        hard_tanh(x) = 2 * _hard_sigmoid(x) - 1,
    i.e. the gradient is zeroed out when |x| > 1.

    # Reference:
    - [BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Courbariaux et al. 2016](http://arxiv.org/abs/1602.02830)
    '''
    return 2 * round_through(_hard_sigmoid(x)) - 1


def binarize(W, H=1):
    '''The weights' binarization function.

    # Reference:
    - [BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Courbariaux et al. 2016](http://arxiv.org/abs/1602.02830)
    '''
    # [-H, H] -> -H or H
    Wb = H * binary_tanh(W / H)
    return Wb
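

if __name__ == '__main__':
    # Illustrative sanity check (not part of the original file): evaluate the binarization
    # ops on a small constant tensor. The test values straddling -1/0/+1 are arbitrary.
    import numpy as np

    x = K.variable(np.array([-2.0, -0.6, -0.1, 0.1, 0.6, 2.0], dtype='float32'))
    print('binary_tanh :', K.eval(binary_tanh(x)))    # expected: -1 or +1 element-wise
    print('binarize H=2:', K.eval(binarize(x, H=2)))  # expected: -2 or +2 element-wise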
--------------------------------------------------------------------------------
/mnist_cnn.py:
--------------------------------------------------------------------------------
'''Trains a simple binarized CNN on the MNIST dataset.
Modified from Keras' examples/mnist_mlp.py.
Gets to 98.98% test accuracy after 20 epochs using the TensorFlow backend.
'''

from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility

from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, BatchNormalization, MaxPooling2D
from keras.layers import Flatten
from keras.optimizers import SGD, Adam, RMSprop
from keras.callbacks import LearningRateScheduler
from keras.utils import np_utils

from binary_ops import binary_tanh as binary_tanh_op
from binary_layers import BinaryDense, BinaryConvolution2D


def binary_tanh(x):
    return binary_tanh_op(x)


H = 1.
W_lr_multiplier = 'Glorot'

# nn
batch_size = 50
nb_epoch = 20
nb_channel = 1
img_rows = 28
img_cols = 28
nb_filters = 32
nb_conv = 3
nb_pool = 2
nb_hid = 128
nb_classes = 10
bias = False

# learning rate schedule
lr_start = 1e-3
lr_end = 1e-4
lr_decay = (lr_end / lr_start)**(1. / nb_epoch)

# BN
epsilon = 1e-6
momentum = 0.9

# dropout
p1 = 0.25
p2 = 0.5

# the data, shuffled and split between train and test sets
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# note: the script assumes channels-first ('th') image_dim_ordering
X_train = X_train.reshape(60000, 1, 28, 28)
X_test = X_test.reshape(10000, 1, 28, 28)
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255
X_test /= 255
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
Y_train = np_utils.to_categorical(y_train, nb_classes) * 2 - 1  # -1 or 1 for hinge loss
Y_test = np_utils.to_categorical(y_test, nb_classes) * 2 - 1


model = Sequential()
# conv1
model.add(BinaryConvolution2D(128, 3, 3, input_shape=(nb_channel, img_rows, img_cols),
                              H=H, W_lr_multiplier=W_lr_multiplier,
                              border_mode='same', bias=bias, name='conv1'))
model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, axis=1, name='bn1'))
model.add(Activation(binary_tanh, name='act1'))
# conv2
model.add(BinaryConvolution2D(128, 3, 3, H=H, W_lr_multiplier=W_lr_multiplier,
                              border_mode='same', bias=bias, name='conv2'))
model.add(MaxPooling2D(pool_size=(2, 2), name='pool2'))
model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, axis=1, name='bn2'))
model.add(Activation(binary_tanh, name='act2'))
# conv3
model.add(BinaryConvolution2D(256, 3, 3, H=H, W_lr_multiplier=W_lr_multiplier,
                              border_mode='same', bias=bias, name='conv3'))
model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, axis=1, name='bn3'))
model.add(Activation(binary_tanh, name='act3'))
# conv4
model.add(BinaryConvolution2D(256, 3, 3, H=H, W_lr_multiplier=W_lr_multiplier,
                              border_mode='same', bias=bias, name='conv4'))
model.add(MaxPooling2D(pool_size=(2, 2), name='pool4'))
model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, axis=1, name='bn4'))
model.add(Activation(binary_tanh, name='act4'))
model.add(Flatten())
# dense1
model.add(BinaryDense(1024, H=H, W_lr_multiplier=W_lr_multiplier, bias=bias, name='dense5'))
model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, name='bn5'))
model.add(Activation(binary_tanh, name='act5'))
# dense2
model.add(BinaryDense(nb_classes, H=H, W_lr_multiplier=W_lr_multiplier, bias=bias, name='dense6'))
model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, name='bn6'))

opt = Adam(lr=lr_start)
model.compile(loss='squared_hinge', optimizer=opt, metrics=['acc'])
model.summary()

lr_scheduler = LearningRateScheduler(lambda e: lr_start * lr_decay ** e)
history = model.fit(X_train, Y_train,
                    batch_size=batch_size, nb_epoch=nb_epoch,
                    verbose=1, validation_data=(X_test, Y_test),
                    callbacks=[lr_scheduler])
score = model.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])
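
# Illustrative extra check (not part of the original script): confirm that the effective
# conv1 kernel used at inference time is binary, by re-applying binarize to the trained
# real-valued weights.
import keras.backend as K
from binary_ops import binarize

Wb = K.eval(binarize(model.layers[0].W, H=model.layers[0].H))
print('conv1 binarized kernel values:', np.unique(Wb))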
--------------------------------------------------------------------------------
/mnist_mlp.py:
--------------------------------------------------------------------------------
'''Trains a simple binarized fully connected NN on the MNIST dataset.
Modified from Keras' examples/mnist_mlp.py.
Gets to 97.9% test accuracy after 20 epochs using the Theano backend.
'''


from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility

import keras.backend as K
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, BatchNormalization
from keras.optimizers import SGD, Adam, RMSprop
from keras.callbacks import LearningRateScheduler
from keras.utils import np_utils

from binary_ops import binary_tanh as binary_tanh_op
from binary_layers import BinaryDense


class DropoutNoScale(Dropout):
    '''Keras' Dropout scales the retained activations by 1/(1-p) during training, which is undesirable here.
    '''
    def call(self, x, mask=None):
        if 0. < self.p < 1.:
            noise_shape = self._get_noise_shape(x)
            x = K.in_train_phase(K.dropout(x, self.p, noise_shape) * (1. - self.p),  # multiplied by (1. - self.p) to undo the scaling
                                 x)
        return x


def binary_tanh(x):
    return binary_tanh_op(x)


batch_size = 100
nb_epoch = 20
nb_classes = 10

H = 'Glorot'
W_lr_multiplier = 'Glorot'

# network
num_unit = 2048
num_hidden = 3
bias = False

# learning rate schedule
lr_start = 1e-3
lr_end = 1e-4
lr_decay = (lr_end / lr_start)**(1. / nb_epoch)

# BN
epsilon = 1e-6
momentum = 0.9

# dropout
drop_in = 0.2
drop_hidden = 0.5

# the data, shuffled and split between train and test sets
(X_train, y_train), (X_test, y_test) = mnist.load_data()

X_train = X_train.reshape(60000, 784)
X_test = X_test.reshape(10000, 784)
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255
X_test /= 255
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
Y_train = np_utils.to_categorical(y_train, nb_classes) * 2 - 1  # -1 or 1 for hinge loss
Y_test = np_utils.to_categorical(y_test, nb_classes) * 2 - 1

model = Sequential()
model.add(DropoutNoScale(drop_in, input_shape=(784,), name='drop0'))
for i in range(num_hidden):
    model.add(BinaryDense(num_unit, H=H, W_lr_multiplier=W_lr_multiplier, bias=bias,
                          name='dense{}'.format(i+1)))
    model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, name='bn{}'.format(i+1)))
    model.add(Activation(binary_tanh, name='act{}'.format(i+1)))
    model.add(DropoutNoScale(drop_hidden, name='drop{}'.format(i+1)))
model.add(BinaryDense(10, H=H, W_lr_multiplier=W_lr_multiplier, bias=bias,
                      name='dense'))
model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, name='bn'))

model.summary()

opt = Adam(lr=lr_start)
model.compile(loss='squared_hinge', optimizer=opt, metrics=['acc'])

lr_scheduler = LearningRateScheduler(lambda e: lr_start * lr_decay ** e)
history = model.fit(X_train, Y_train,
                    batch_size=batch_size, nb_epoch=nb_epoch,
                    verbose=1, validation_data=(X_test, Y_test),
                    callbacks=[lr_scheduler])
score = model.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])
--------------------------------------------------------------------------------