├── README.md
├── binary_layers.py
├── binary_ops.py
├── mnist_cnn.py
└── mnist_mlp.py

/README.md:
--------------------------------------------------------------------------------
**Deprecated!** This repo is based on Keras 1.2 and is no longer maintained.
Check [here](https://github.com/DingKe/nn_playground/tree/master/binarynet) if you use Keras 2.0.

An implementation of BinaryNet for Keras.

The binarized Dense and Conv2D are two Keras layers, so they can be integrated into the Keras framework out of the box.

To run the demos:

    python mnist_mlp.py  # train a binary MLP model on MNIST
    python mnist_cnn.py  # train a binary CNN model on MNIST

The code follows the [Theano version](https://github.com/MatthieuCourbariaux/BinaryNet).
The only missing ingredient is that the learning rate is not scaled w.r.t. each weight's fan-in and fan-out.
(An involved [patch](https://github.com/fchollet/keras/pull/3004) is needed for that.)

Reference paper: [BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1](http://arxiv.org/abs/1602.02830)
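For illustration, a minimal usage sketch with Keras 1.2 (the layer sizes, loss and optimizer below are arbitrary placeholders, not the settings of the demo scripts; see `mnist_mlp.py` and `mnist_cnn.py` for the actual configurations):

```python
# Minimal sketch: plug the binarized layers into a plain Keras 1.2 Sequential model.
from keras.models import Sequential
from keras.layers import Activation, BatchNormalization

from binary_layers import BinaryDense
from binary_ops import binary_tanh

model = Sequential()
model.add(BinaryDense(128, H=1., W_lr_multiplier='Glorot', bias=False, input_dim=784))
model.add(BatchNormalization())
model.add(Activation(binary_tanh))
model.add(BinaryDense(10, H=1., W_lr_multiplier='Glorot', bias=False))
model.compile(loss='squared_hinge', optimizer='adam', metrics=['acc'])
```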
--------------------------------------------------------------------------------
/binary_layers.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import numpy as np

from keras import backend as K

from keras.layers import InputSpec, Layer, Dense, Convolution2D
from keras import constraints
from keras import initializations

from binary_ops import binarize


class Clip(constraints.Constraint):
    def __init__(self, min_value, max_value=None):
        self.min_value = min_value
        self.max_value = max_value
        if not self.max_value:
            self.max_value = -self.min_value
        if self.min_value > self.max_value:
            self.min_value, self.max_value = self.max_value, self.min_value

    def __call__(self, p):
        return K.clip(p, self.min_value, self.max_value)

    def get_config(self):
        return {"name": self.__call__.__name__,
                "min_value": self.min_value,
                "max_value": self.max_value}


class BinaryDense(Dense):
    '''Binarized Dense layer

    References:
    "BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1" [http://arxiv.org/abs/1602.02830]
    '''
    def __init__(self, output_dim, H=1., W_lr_multiplier='Glorot', b_lr_multiplier=None, **kwargs):
        self.H = H
        self.W_lr_multiplier = W_lr_multiplier
        self.b_lr_multiplier = b_lr_multiplier

        super(BinaryDense, self).__init__(output_dim, **kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 2
        input_dim = input_shape[1]
        self.input_spec = [InputSpec(dtype=K.floatx(),
                                     shape=(None, input_dim))]

        if self.H == 'Glorot':
            self.H = np.float32(np.sqrt(1.5 / (input_dim + self.output_dim)))
            #print('Glorot H: {}'.format(self.H))

        if self.W_lr_multiplier == 'Glorot':
            self.W_lr_multiplier = np.float32(1. / np.sqrt(1.5 / (input_dim + self.output_dim)))
            #print('Glorot learning rate multiplier: {}'.format(self.W_lr_multiplier))

        self.W_constraint = Clip(-self.H, self.H)

        self.init = initializations.get('uniform')
        self.W = self.init((input_dim, self.output_dim), scale=self.H,
                           name='{}_W'.format(self.name))
        self.trainable_weights = [self.W]

        if self.bias:
            self.lr_multipliers = [self.W_lr_multiplier, self.b_lr_multiplier]
        else:
            self.lr_multipliers = [self.W_lr_multiplier]

        self.regularizers = []
        if self.W_regularizer:
            self.W_regularizer.set_param(self.W)
            self.regularizers.append(self.W_regularizer)

        if self.activity_regularizer:
            self.activity_regularizer.set_layer(self)
            self.regularizers.append(self.activity_regularizer)

        self.constraints = {}
        if self.W_constraint:
            self.constraints[self.W] = self.W_constraint

        if self.bias:
            self.b = K.zeros((self.output_dim,),
                             name='{}_b'.format(self.name))
            self.trainable_weights += [self.b]

            if self.b_regularizer:
                self.b_regularizer.set_param(self.b)
                self.regularizers.append(self.b_regularizer)

            if self.b_constraint:
                self.constraints[self.b] = self.b_constraint

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights

    def call(self, x, mask=None):
        Wb = binarize(self.W, H=self.H)

        if self.bias:
            output = self.activation(K.dot(x, Wb) + self.b)
        else:
            output = self.activation(K.dot(x, Wb))
        return output

    def get_config(self):
        config = {'H': self.H,
                  'W_lr_multiplier': self.W_lr_multiplier,
                  'b_lr_multiplier': self.b_lr_multiplier}
        base_config = super(BinaryDense, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

class BinaryConvolution2D(Convolution2D):
    '''Binarized Convolution2D layer

    References:
    "BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1" [http://arxiv.org/abs/1602.02830]
    '''
    def __init__(self, nb_filter, nb_row, nb_col, W_lr_multiplier='Glorot',
                 b_lr_multiplier=None, H=1., **kwargs):
        self.H = H
        self.W_lr_multiplier = W_lr_multiplier
        self.b_lr_multiplier = b_lr_multiplier

        super(BinaryConvolution2D, self).__init__(nb_filter, nb_row, nb_col, **kwargs)

    def build(self, input_shape):
        if self.dim_ordering == 'th':
            stack_size = input_shape[1]
            self.W_shape = (self.nb_filter, stack_size, self.nb_row, self.nb_col)
        elif self.dim_ordering == 'tf':
            stack_size = input_shape[3]
            self.W_shape = (self.nb_row, self.nb_col, stack_size, self.nb_filter)
        else:
            raise Exception('Invalid dim_ordering: ' + self.dim_ordering)

        if self.H == 'Glorot':
            nb_input = int(stack_size * self.nb_row * self.nb_col)
            nb_output = int(self.nb_filter * self.nb_row * self.nb_col)
            self.H = np.float32(np.sqrt(1.5 / (nb_input + nb_output)))
            #print('Glorot H: {}'.format(self.H))

        if self.W_lr_multiplier == 'Glorot':
            nb_input = int(stack_size * self.nb_row * self.nb_col)
            nb_output = int(self.nb_filter * self.nb_row * self.nb_col)
            self.W_lr_multiplier = np.float32(1. / np.sqrt(1.5 / (nb_input + nb_output)))
            #print('Glorot learning rate multiplier: {}'.format(self.W_lr_multiplier))

        self.init = initializations.get('uniform')
        self.W = self.init(self.W_shape, scale=self.H, name='{}_W'.format(self.name))
        self.trainable_weights = [self.W]

        if self.bias:
            self.lr_multipliers = [self.W_lr_multiplier, self.b_lr_multiplier]
        else:
            self.lr_multipliers = [self.W_lr_multiplier]

        self.regularizers = []
        if self.W_regularizer:
            self.W_regularizer.set_param(self.W)
            self.regularizers.append(self.W_regularizer)

        if self.activity_regularizer:
            self.activity_regularizer.set_layer(self)
            self.regularizers.append(self.activity_regularizer)

        self.constraints = {}
        if self.W_constraint:
            self.constraints[self.W] = self.W_constraint

        if self.bias:
            self.b = K.zeros((self.nb_filter,), name='{}_b'.format(self.name))
            self.trainable_weights += [self.b]

            if self.b_regularizer:
                self.b_regularizer.set_param(self.b)
                self.regularizers.append(self.b_regularizer)

            if self.b_constraint:
                self.constraints[self.b] = self.b_constraint

        W_constraint = Clip(-self.H, self.H)
        self.constraints[self.W] = W_constraint

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights

    def call(self, x, mask=None):
        Wb = binarize(self.W, H=self.H)
        conv_out = K.conv2d(x, Wb, strides=self.subsample,
                            border_mode=self.border_mode,
                            dim_ordering=self.dim_ordering,
                            filter_shape=self.W_shape)

        if self.bias:
            if self.dim_ordering == 'th':
                conv_out = conv_out + K.reshape(self.b, (1, self.nb_filter, 1, 1))
            elif self.dim_ordering == 'tf':
                conv_out = conv_out + K.reshape(self.b, (1, 1, 1, self.nb_filter))
            else:
                raise Exception('Invalid dim_ordering: ' + self.dim_ordering)

        output = self.activation(conv_out)
        return output

    def get_config(self):
        config = {'H': self.H,
                  'W_lr_multiplier': self.W_lr_multiplier,
                  'b_lr_multiplier': self.b_lr_multiplier}
        base_config = super(BinaryConvolution2D, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
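

if __name__ == '__main__':
    # Illustrative smoke test (not part of the original file): build a tiny model with
    # BinaryDense and check that the effective kernel only takes the values -H and +H.
    # The layer sizes and H value below are arbitrary.
    from keras.models import Sequential

    model = Sequential()
    model.add(BinaryDense(8, H=1., W_lr_multiplier=1., bias=False, input_dim=16, name='bd'))

    Wb = K.eval(binarize(model.layers[0].W, H=model.layers[0].H))
    assert np.all(np.abs(Wb) == 1.0)
    print('binarized kernel values:', np.unique(Wb))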
--------------------------------------------------------------------------------
/binary_ops.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import keras.backend as K


def round_through(x):
    '''Element-wise rounding to the closest integer with full gradient propagation.
    A trick from [Sergey Ioffe](http://stackoverflow.com/a/36480182).
    '''
    rounded = K.round(x)
    return x + K.stop_gradient(rounded - x)


def _hard_sigmoid(x):
    '''Hard sigmoid, different from the more conventional form (see the definition of K.hard_sigmoid).

    # Reference:
    - [BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Courbariaux et al. 2016](http://arxiv.org/abs/1602.02830)
    '''
    x = (0.5 * x) + 0.5
    return K.clip(x, 0, 1)


def binary_sigmoid(x):
    '''Binary hard sigmoid for training binarized neural networks.

    # Reference:
    - [BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Courbariaux et al. 2016](http://arxiv.org/abs/1602.02830)
    '''
    return round_through(_hard_sigmoid(x))


def binary_tanh(x):
    '''Binary hard tanh for training binarized neural networks.

    The neurons' activation binarization function.
    It behaves like the sign function during forward propagation,
    and during back propagation like
        hard_tanh(x) = 2 * _hard_sigmoid(x) - 1,
    i.e. the gradient is zeroed out when |x| > 1.

    # Reference:
    - [BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Courbariaux et al. 2016](http://arxiv.org/abs/1602.02830)
    '''
    return 2 * round_through(_hard_sigmoid(x)) - 1


def binarize(W, H=1):
    '''The weights' binarization function.

    # Reference:
    - [BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Courbariaux et al. 2016](http://arxiv.org/abs/1602.02830)
    '''
    # [-H, H] -> -H or H
    Wb = H * binary_tanh(W / H)
    return Wb
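

if __name__ == '__main__':
    # Illustrative sanity check (not part of the original file): evaluate the binarization
    # ops on a small constant tensor. The test values straddling -1/0/+1 are arbitrary.
    import numpy as np

    x = K.variable(np.array([-2.0, -0.6, -0.1, 0.1, 0.6, 2.0], dtype='float32'))
    print('binary_tanh :', K.eval(binary_tanh(x)))    # expected: -1 or +1 element-wise
    print('binarize H=2:', K.eval(binarize(x, H=2)))  # expected: -2 or +2 element-wise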
--------------------------------------------------------------------------------
/mnist_cnn.py:
--------------------------------------------------------------------------------
'''Trains a simple binarized CNN on the MNIST dataset.
Modified from Keras' examples/mnist_mlp.py.
Gets to 98.98% test accuracy after 20 epochs using the TensorFlow backend.
'''

from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility

from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, BatchNormalization, MaxPooling2D
from keras.layers import Flatten
from keras.optimizers import SGD, Adam, RMSprop
from keras.callbacks import LearningRateScheduler
from keras.utils import np_utils

from binary_ops import binary_tanh as binary_tanh_op
from binary_layers import BinaryDense, BinaryConvolution2D


def binary_tanh(x):
    return binary_tanh_op(x)


H = 1.
W_lr_multiplier = 'Glorot'

# nn
batch_size = 50
nb_epoch = 20
nb_channel = 1
img_rows = 28
img_cols = 28
nb_filters = 32
nb_conv = 3
nb_pool = 2
nb_hid = 128
nb_classes = 10
bias = False

# learning rate schedule
lr_start = 1e-3
lr_end = 1e-4
lr_decay = (lr_end / lr_start)**(1. / nb_epoch)

# BN
epsilon = 1e-6
momentum = 0.9

# dropout
p1 = 0.25
p2 = 0.5

# the data, shuffled and split between train and test sets
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# note: the script assumes channels-first ('th') image_dim_ordering
X_train = X_train.reshape(60000, 1, 28, 28)
X_test = X_test.reshape(10000, 1, 28, 28)
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255
X_test /= 255
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
Y_train = np_utils.to_categorical(y_train, nb_classes) * 2 - 1  # -1 or 1 for hinge loss
Y_test = np_utils.to_categorical(y_test, nb_classes) * 2 - 1


model = Sequential()
# conv1
model.add(BinaryConvolution2D(128, 3, 3, input_shape=(nb_channel, img_rows, img_cols),
                              H=H, W_lr_multiplier=W_lr_multiplier,
                              border_mode='same', bias=bias, name='conv1'))
model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, axis=1, name='bn1'))
model.add(Activation(binary_tanh, name='act1'))
# conv2
model.add(BinaryConvolution2D(128, 3, 3, H=H, W_lr_multiplier=W_lr_multiplier,
                              border_mode='same', bias=bias, name='conv2'))
model.add(MaxPooling2D(pool_size=(2, 2), name='pool2'))
model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, axis=1, name='bn2'))
model.add(Activation(binary_tanh, name='act2'))
# conv3
model.add(BinaryConvolution2D(256, 3, 3, H=H, W_lr_multiplier=W_lr_multiplier,
                              border_mode='same', bias=bias, name='conv3'))
model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, axis=1, name='bn3'))
model.add(Activation(binary_tanh, name='act3'))
# conv4
model.add(BinaryConvolution2D(256, 3, 3, H=H, W_lr_multiplier=W_lr_multiplier,
                              border_mode='same', bias=bias, name='conv4'))
model.add(MaxPooling2D(pool_size=(2, 2), name='pool4'))
model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, axis=1, name='bn4'))
model.add(Activation(binary_tanh, name='act4'))
model.add(Flatten())
# dense1
model.add(BinaryDense(1024, H=H, W_lr_multiplier=W_lr_multiplier, bias=bias, name='dense5'))
model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, name='bn5'))
model.add(Activation(binary_tanh, name='act5'))
# dense2
model.add(BinaryDense(nb_classes, H=H, W_lr_multiplier=W_lr_multiplier, bias=bias, name='dense6'))
model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, name='bn6'))

opt = Adam(lr=lr_start)
model.compile(loss='squared_hinge', optimizer=opt, metrics=['acc'])
model.summary()

lr_scheduler = LearningRateScheduler(lambda e: lr_start * lr_decay ** e)
history = model.fit(X_train, Y_train,
                    batch_size=batch_size, nb_epoch=nb_epoch,
                    verbose=1, validation_data=(X_test, Y_test),
                    callbacks=[lr_scheduler])
score = model.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])
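
# Illustrative extra check (not part of the original script): confirm that the effective
# conv1 kernel used at inference time is binary, by re-applying binarize to the trained
# real-valued weights.
import keras.backend as K
from binary_ops import binarize

Wb = K.eval(binarize(model.layers[0].W, H=model.layers[0].H))
print('conv1 binarized kernel values:', np.unique(Wb))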
--------------------------------------------------------------------------------
/mnist_mlp.py:
--------------------------------------------------------------------------------
'''Trains a simple binarized fully connected NN on the MNIST dataset.
Modified from Keras' examples/mnist_mlp.py.
Gets to 97.9% test accuracy after 20 epochs using the Theano backend.
'''


from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility

import keras.backend as K
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, BatchNormalization
from keras.optimizers import SGD, Adam, RMSprop
from keras.callbacks import LearningRateScheduler
from keras.utils import np_utils

from binary_ops import binary_tanh as binary_tanh_op
from binary_layers import BinaryDense


class DropoutNoScale(Dropout):
    '''Keras' Dropout scales the retained activations by 1/(1-p) during training, which is undesirable here.
    '''
    def call(self, x, mask=None):
        if 0. < self.p < 1.:
            noise_shape = self._get_noise_shape(x)
            x = K.in_train_phase(K.dropout(x, self.p, noise_shape) * (1. - self.p),  # multiplied by (1. - self.p) to undo the scaling
                                 x)
        return x


def binary_tanh(x):
    return binary_tanh_op(x)


batch_size = 100
nb_epoch = 20
nb_classes = 10

H = 'Glorot'
W_lr_multiplier = 'Glorot'

# network
num_unit = 2048
num_hidden = 3
bias = False

# learning rate schedule
lr_start = 1e-3
lr_end = 1e-4
lr_decay = (lr_end / lr_start)**(1. / nb_epoch)

# BN
epsilon = 1e-6
momentum = 0.9

# dropout
drop_in = 0.2
drop_hidden = 0.5

# the data, shuffled and split between train and test sets
(X_train, y_train), (X_test, y_test) = mnist.load_data()

X_train = X_train.reshape(60000, 784)
X_test = X_test.reshape(10000, 784)
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255
X_test /= 255
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
Y_train = np_utils.to_categorical(y_train, nb_classes) * 2 - 1  # -1 or 1 for hinge loss
Y_test = np_utils.to_categorical(y_test, nb_classes) * 2 - 1

model = Sequential()
model.add(DropoutNoScale(drop_in, input_shape=(784,), name='drop0'))
for i in range(num_hidden):
    model.add(BinaryDense(num_unit, H=H, W_lr_multiplier=W_lr_multiplier, bias=bias,
                          name='dense{}'.format(i+1)))
    model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, name='bn{}'.format(i+1)))
    model.add(Activation(binary_tanh, name='act{}'.format(i+1)))
    model.add(DropoutNoScale(drop_hidden, name='drop{}'.format(i+1)))
model.add(BinaryDense(10, H=H, W_lr_multiplier=W_lr_multiplier, bias=bias,
                      name='dense'))
model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, name='bn'))

model.summary()

opt = Adam(lr=lr_start)
model.compile(loss='squared_hinge', optimizer=opt, metrics=['acc'])

lr_scheduler = LearningRateScheduler(lambda e: lr_start * lr_decay ** e)
history = model.fit(X_train, Y_train,
                    batch_size=batch_size, nb_epoch=nb_epoch,
                    verbose=1, validation_data=(X_test, Y_test),
                    callbacks=[lr_scheduler])
score = model.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])
--------------------------------------------------------------------------------