├── .gitignore ├── LICENSE ├── README.md ├── avb ├── README.md └── avb.py ├── binarynet ├── README.md ├── binary_layers.py ├── binary_ops.py ├── mnist_cnn.py └── mnist_mlp.py ├── focal_loss ├── README.md ├── losses.py └── mnist_mlp.py ├── gcnn ├── README.md ├── char_generator.py ├── char_lm_gcnn.py ├── data │ ├── imdb_preprocess_semi.py │ ├── nonbreaking_prefixes │ │ └── nonbreaking_prefix.en │ ├── tinyshakespeare │ │ └── input.txt │ └── tokenizer.perl ├── gcnn.py ├── imdb_generator.py └── imdb_lm_gcnn.py ├── glsgan ├── README.md └── glsgan_mlp.py ├── layernorm ├── README.md ├── babi_lnlstm.py ├── data ├── imdb_cnn_lstm.py ├── imdb_generator.py ├── imdb_lm.py ├── layer_norm_layers.py ├── mnist_cnn.py └── mnist_mlp.py ├── lsgan ├── README.md └── lsgan_mlp.py ├── qrnn ├── README.md ├── imbd_qrnn.py ├── imbd_qrnn_Bidirecional.py └── qrnn.py ├── senet ├── README.md ├── layers.py └── mnist_cnn.py ├── ternarynet ├── README.md ├── imdb_generator.py ├── imdb_lm.py ├── mnist_cnn.py ├── mnist_mlp.py ├── ternary_layers.py └── ternary_ops.py ├── vae ├── README.md ├── img │ ├── i_mse.png │ ├── i_xent.png │ ├── x_mse.png │ ├── x_xent.png │ ├── z_mse.png │ └── z_xent.png └── variational_autoencoder.py ├── weightnorm ├── README.md ├── cifar10_cnn.py ├── gru_text_generation.py ├── imdb_generator.py ├── imdb_lm.py ├── mnist_cnn.py ├── mnist_mlp.py └── weight_norm_layers.py ├── wgan ├── README.md ├── wgan_cnn.py └── wgan_mlp.py └── xnornet ├── README.md ├── binary_layers.py ├── binary_ops.py ├── mnist_cnn.py ├── mnist_mlp.py └── xnor_layers.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Ke Ding 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # binarynet 2 | Binary Networks 3 | 4 | # xnornet 5 | XNOR Networks 6 | 7 | # ternarynet 8 | Ternary Networks 9 | 10 | # qrnn 11 | Quasi-Recurrent Neural Networks 12 | 13 | # vae 14 | Variational Auto-Encoder 15 | 16 | # gcnn 17 | Gated Convolutional Neural Networks 18 | 19 | # weightnorm 20 | Weight Normalization 21 | 22 | # layernorm 23 | Layer Normalization 24 | 25 | # wgan 26 | Wasserstein GAN 27 | 28 | # lsgan 29 | Least Squares GAN 30 | 31 | # glsgan 32 | Generalized Loss Sensitive GAN 33 | 34 | # focal_loss 35 | Focal Loss 36 | 37 | # senet 38 | Squeeze-and-Excitation Networks 39 | 40 | Note: 41 | By default, all Keras scripts run on the TensorFlow backend with image_data_format 'channels_first'. 42 | -------------------------------------------------------------------------------- /avb/README.md: -------------------------------------------------------------------------------- 1 | An attempt to implement Adversarial Variational Bayes (AVB) on MNIST. 2 | 3 | Caveat: the paper (as of v1 22/01/2017) lacks implementation details (especially for how to manipulate epsilon), so I am not sure whether I am doing it the right way.
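For orientation, my reading of the paper's training scheme (a hedged summary, not guaranteed to match the authors' setup): the adversary T(x, z) is trained as a binary classifier that separates pairs (x, z) with z = Enc(x, eps), eps ~ N(0, I), from pairs with z ~ p(z); at its optimum T(x, z) estimates log q(z|x) - log p(z). Encoder and decoder are then updated, with the adversary held fixed, to maximize

    E_x E_{q(z|x)} [ -T(x, z) + log p(x|z) ]

which is what avb.py below tries to realize with the `rec`, `gan_g` and `gan_d` models.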
4 | The training just does not converge. 5 | 6 | --- 7 | ## References 8 | * Mescheder et al. [Adversarial Variational Bayes: Unifying Variational Autoencoders and Generative Adversarial Networks](https://arxiv.org/abs/1701.04722). 9 | -------------------------------------------------------------------------------- /avb/avb.py: -------------------------------------------------------------------------------- 1 | '''This script demonstrates how to build an adversarial variational Bayes (AVB) model with Keras. 2 | ''' 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from scipy.stats import norm 6 | 7 | from keras.layers import Input, Dense, Lambda, Activation, Concatenate, Dot 8 | from keras.models import Model 9 | from keras.regularizers import l2 10 | from keras import backend as K 11 | from keras import objectives 12 | from keras.datasets import mnist 13 | from keras.optimizers import * 14 | 15 | np.random.seed(1111) # for reproducibility 16 | 17 | training = True 18 | 19 | batch_size = 50 20 | n = 784 # dimension of a data point 21 | m = 2 # dimension of the latent variables 22 | l = 5 # dimension of the random noise 23 | hidden_dim = 256 24 | epochs = 50 25 | epsilon_std = 1.0 26 | loss = 'categorical_crossentropy' # 'mse' or 'categorical_crossentropy' 27 | 28 | decay = 1e-4 # weight decay, a.k.a. l2 regularization 29 | use_bias = True 30 | 31 | ## Encoder 32 | x = Input(shape=(n,)) 33 | g = Input(shape=(l,)) 34 | x_g = Concatenate(-1)([x, g]) 35 | h_encoded = Dense(hidden_dim, kernel_regularizer=l2(decay), bias_regularizer=l2(decay), use_bias=use_bias, activation='relu')(x_g) 36 | z = Dense(m, activation='relu', kernel_regularizer=l2(decay), bias_regularizer=l2(decay), use_bias=use_bias)(h_encoded) 37 | 38 | encoder = Model([x, g], z) 39 | 40 | ## Decoder 41 | decoder_h = Dense(hidden_dim, kernel_regularizer=l2(decay), bias_regularizer=l2(decay), use_bias=use_bias, activation='relu') 42 | decoder_mean = Dense(n, kernel_regularizer=l2(decay), bias_regularizer=l2(decay), use_bias=use_bias, activation='sigmoid') 43 | h_decoded = decoder_h(z) 44 | x_hat = decoder_mean(h_decoded) 45 | 46 | rec = Model([x, g], x_hat) 47 | rec.compile(optimizer=Adam(1e-3), loss='binary_crossentropy') 48 | 49 | ## Discriminator 50 | x_h = Dense(hidden_dim, kernel_regularizer=l2(decay), bias_regularizer=l2(decay), use_bias=use_bias, activation='relu') 51 | z_h = Dense(hidden_dim, kernel_regularizer=l2(decay), bias_regularizer=l2(decay), use_bias=use_bias, activation='relu') 52 | 53 | ### gan_g 54 | x_h.trainable = False 55 | z_h.trainable = False 56 | T = Dot(-1)([x_h(x), z_h(z)]) 57 | gan_g = Model([x, g], T) 58 | gan_g.compile(optimizer=Adam(1e-4), loss=lambda y_true, y_predict: -K.mean(y_predict, -1)) 59 | 60 | ### gan_d 61 | x_h.trainable = True 62 | z_h.trainable = True 63 | x = Input(shape=(n,)) 64 | z = Input(shape=(m,)) 65 | fake = Dot(-1)([x_h(x), z_h(z)]) 66 | fake = Activation('sigmoid')(fake) 67 | gan_d = Model([x, z], fake) 68 | gan_d.compile(optimizer=Adam(1e-4), loss='binary_crossentropy') 69 | 70 | if training: 71 | # train the VAE on MNIST digits 72 | (x_train, y_train), (x_test, y_test) = mnist.load_data() 73 | 74 | x_train = x_train.astype('float32') / 255 75 | x_test = x_test.astype('float32') / 255 76 | x_train = x_train.reshape((len(x_train), np.prod(x_train.shape[1:]))) 77 | x_test = x_test.reshape((len(x_test), np.prod(x_test.shape[1:]))) 78 | 79 | ids = range(len(x_train)) 80 | for i in range(epochs): 81 | print('Epoch {}:'.format(i + 1)) 82 | rec_loss = [] 83 | gen_loss = [] 84 | dis_loss = [] 85 | for s in range(0, len(ids),
batch_size): 86 | x = x_train[ids[s:s+batch_size]] 87 | y = y_train[ids[s:s+batch_size]] 88 | bs = len(x) 89 | 90 | eps = np.random.randn(bs, l) 91 | z = np.random.randn(bs, m) 92 | 93 | # reconstruction 94 | loss = rec.train_on_batch([x, eps], x) 95 | rec_loss.append(loss) 96 | 97 | # encoder 98 | loss = gan_g.train_on_batch([x, eps], [1] * bs) 99 | gen_loss.append(loss) 100 | 101 | # discriminator 102 | z_fake = encoder.predict([x, eps]) 103 | x = np.concatenate([x, x], axis=0) 104 | z = np.concatenate([z, z_fake], axis=0) 105 | y = np.asarray([1] * bs + [0] * bs, dtype='float32') 106 | 107 | loss = gan_d.train_on_batch([x, z], y) 108 | dis_loss.append(loss) 109 | 110 | if s % 1000 == 0: 111 | print('rec loss: {}, gen loss: {}, dis loss: {}'.\ 112 | format(np.mean(rec_loss), np.mean(gen_loss), np.mean(dis_loss))) 113 | 114 | print('rec loss: {}, gen loss: {}, dis loss: {}'.\ 115 | format(np.mean(rec_loss), np.mean(gen_loss), np.mean(dis_loss))) 116 | rec.save_weights('weights_{}.h5'.format(i)) 117 | 118 | # display a 2D manifold of the digits 119 | n = 15 # figure with 15x15 digits 120 | digit_size = 28 121 | figure = np.zeros((digit_size * n, digit_size * n)) 122 | # linearly spaced coordinates on the unit square were transformed through the inverse CDF (ppf) of the Gaussian 123 | # to produce values of the latent variables z, since the prior of the latent space is Gaussian 124 | grid_x = norm.ppf(np.linspace(0.05, 0.95, n)) 125 | grid_y = norm.ppf(np.linspace(0.05, 0.95, n)) 126 | 127 | z = Input(shape=(m,)) 128 | h_decoded = decoder_h(z) 129 | x_hat = decoder_mean(h_decoded) 130 | decoder = Model(z, x_hat) 131 | decoder.load_weights('weights_8.h5', by_name=True) 132 | 133 | for i, yi in enumerate(grid_x): 134 | for j, xi in enumerate(grid_y): 135 | z_sample = np.array([[xi, yi]]) 136 | x_decoded = decoder.predict(z_sample) 137 | digit = x_decoded[0].reshape(digit_size, digit_size) 138 | figure[i * digit_size: (i + 1) * digit_size, 139 | j * digit_size: (j + 1) * digit_size] = digit 140 | 141 | fig = plt.figure(figsize=(10, 10)) 142 | plt.imshow(figure, cmap='Greys_r') 143 | plt.show() 144 | fig.savefig('x.png') 145 | -------------------------------------------------------------------------------- /binarynet/README.md: -------------------------------------------------------------------------------- 1 | An implemtation of binaryNet for Keras. 2 | 3 | The binarized Dense and Conv2D are two keras layers, thus can be integrated into keras framework out of box. 4 | 5 | ## To run the demo: 6 | ### train a binary MLP model on MNIST 7 | python mnist_mlp.py 8 | ### train a binary CNN model on MNIST 9 | python mnist_cnn.py 10 | 11 | The code is according to the [theano version](https://github.com/MatthieuCourbariaux/BinaryNet). 12 | The only missing ingredient is that the learning rate is not scaled w.r.t. weight' fan-in & fan-out. 13 | (An involved [patch](https://github.com/fchollet/keras/pull/3004) is needed.) 14 | 15 | ## Reference 16 | * Courbariaux et al. [BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1](http://arxiv.org/abs/1602.02830). 
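## Usage sketch

A minimal, hedged sketch of wiring the layers into a model, mirroring mnist_mlp.py (hyperparameters are illustrative; assumes this directory is on the Python path):

```python
from keras.models import Sequential
from keras.layers import BatchNormalization, Activation

from binary_layers import BinaryDense   # binarized Dense layer from this directory
from binary_ops import binary_tanh      # +1/-1 activation with straight-through gradient

model = Sequential()
# Weights are kept in [-H, H] and binarized on the forward pass.
model.add(BinaryDense(512, H='Glorot', kernel_lr_multiplier='Glorot',
                      use_bias=False, input_shape=(784,)))
model.add(BatchNormalization())
model.add(Activation(binary_tanh))
model.add(BinaryDense(10, H='Glorot', kernel_lr_multiplier='Glorot', use_bias=False))
model.add(BatchNormalization())

# As in mnist_mlp.py, train against {-1, +1} one-hot targets with a hinge-style loss.
model.compile(loss='squared_hinge', optimizer='adam', metrics=['acc'])
```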
17 | -------------------------------------------------------------------------------- /binarynet/binary_layers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | from keras import backend as K 5 | 6 | from keras.layers import InputSpec, Layer, Dense, Conv2D 7 | from keras import constraints 8 | from keras import initializers 9 | 10 | from binary_ops import binarize 11 | 12 | 13 | class Clip(constraints.Constraint): 14 | def __init__(self, min_value, max_value=None): 15 | self.min_value = min_value 16 | self.max_value = max_value 17 | if not self.max_value: 18 | self.max_value = -self.min_value 19 | if self.min_value > self.max_value: 20 | self.min_value, self.max_value = self.max_value, self.min_value 21 | 22 | def __call__(self, p): 23 | return K.clip(p, self.min_value, self.max_value) 24 | 25 | def get_config(self): 26 | return {"min_value": self.min_value, 27 | "max_value": self.max_value} 28 | 29 | 30 | class BinaryDense(Dense): 31 | ''' Binarized Dense layer 32 | References: 33 | "BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1" [http://arxiv.org/abs/1602.02830] 34 | ''' 35 | def __init__(self, units, H=1., kernel_lr_multiplier='Glorot', bias_lr_multiplier=None, **kwargs): 36 | super(BinaryDense, self).__init__(units, **kwargs) 37 | self.H = H 38 | self.kernel_lr_multiplier = kernel_lr_multiplier 39 | self.bias_lr_multiplier = bias_lr_multiplier 40 | 41 | super(BinaryDense, self).__init__(units, **kwargs) 42 | 43 | def build(self, input_shape): 44 | assert len(input_shape) >= 2 45 | input_dim = input_shape[1] 46 | 47 | if self.H == 'Glorot': 48 | self.H = np.float32(np.sqrt(1.5 / (input_dim + self.units))) 49 | #print('Glorot H: {}'.format(self.H)) 50 | if self.kernel_lr_multiplier == 'Glorot': 51 | self.kernel_lr_multiplier = np.float32(1. 
/ np.sqrt(1.5 / (input_dim + self.units))) 52 | #print('Glorot learning rate multiplier: {}'.format(self.kernel_lr_multiplier)) 53 | 54 | self.kernel_constraint = Clip(-self.H, self.H) 55 | self.kernel_initializer = initializers.RandomUniform(-self.H, self.H) 56 | self.kernel = self.add_weight(shape=(input_dim, self.units), 57 | initializer=self.kernel_initializer, 58 | name='kernel', 59 | regularizer=self.kernel_regularizer, 60 | constraint=self.kernel_constraint) 61 | 62 | if self.use_bias: 63 | self.lr_multipliers = [self.kernel_lr_multiplier, self.bias_lr_multiplier] 64 | self.bias = self.add_weight(shape=(self.output_dim,), 65 | initializer=self.bias_initializer, 66 | name='bias', 67 | regularizer=self.bias_regularizer, 68 | constraint=self.bias_constraint) 69 | else: 70 | self.lr_multipliers = [self.kernel_lr_multiplier] 71 | self.bias = None 72 | 73 | self.input_spec = InputSpec(min_ndim=2, axes={-1: input_dim}) 74 | self.built = True 75 | 76 | 77 | def call(self, inputs): 78 | binary_kernel = binarize(self.kernel, H=self.H) 79 | output = K.dot(inputs, binary_kernel) 80 | if self.use_bias: 81 | output = K.bias_add(output, self.bias) 82 | if self.activation is not None: 83 | output = self.activation(output) 84 | return output 85 | 86 | def get_config(self): 87 | config = {'H': self.H, 88 | 'kernel_lr_multiplier': self.kernel_lr_multiplier, 89 | 'bias_lr_multiplier': self.bias_lr_multiplier} 90 | base_config = super(BinaryDense, self).get_config() 91 | return dict(list(base_config.items()) + list(config.items())) 92 | 93 | 94 | class BinaryConv2D(Conv2D): 95 | '''Binarized Convolution2D layer 96 | References: 97 | "BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1" [http://arxiv.org/abs/1602.02830] 98 | ''' 99 | def __init__(self, filters, kernel_lr_multiplier='Glorot', 100 | bias_lr_multiplier=None, H=1., **kwargs): 101 | super(BinaryConv2D, self).__init__(filters, **kwargs) 102 | self.H = H 103 | self.kernel_lr_multiplier = kernel_lr_multiplier 104 | self.bias_lr_multiplier = bias_lr_multiplier 105 | 106 | 107 | def build(self, input_shape): 108 | if self.data_format == 'channels_first': 109 | channel_axis = 1 110 | else: 111 | channel_axis = -1 112 | if input_shape[channel_axis] is None: 113 | raise ValueError('The channel dimension of the inputs ' 114 | 'should be defined. Found `None`.') 115 | 116 | input_dim = input_shape[channel_axis] 117 | kernel_shape = self.kernel_size + (input_dim, self.filters) 118 | 119 | base = self.kernel_size[0] * self.kernel_size[1] 120 | if self.H == 'Glorot': 121 | nb_input = int(input_dim * base) 122 | nb_output = int(self.filters * base) 123 | self.H = np.float32(np.sqrt(1.5 / (nb_input + nb_output))) 124 | #print('Glorot H: {}'.format(self.H)) 125 | 126 | if self.kernel_lr_multiplier == 'Glorot': 127 | nb_input = int(input_dim * base) 128 | nb_output = int(self.filters * base) 129 | self.kernel_lr_multiplier = np.float32(1. 
/ np.sqrt(1.5 / (nb_input + nb_output))) 130 | #print('Glorot learning rate multiplier: {}'.format(self.kernel_lr_multiplier)) 131 | 132 | self.kernel_constraint = Clip(-self.H, self.H) 133 | self.kernel_initializer = initializers.RandomUniform(-self.H, self.H) 134 | self.kernel = self.add_weight(shape=kernel_shape, 135 | initializer=self.kernel_initializer, 136 | name='kernel', 137 | regularizer=self.kernel_regularizer, 138 | constraint=self.kernel_constraint) 139 | 140 | if self.use_bias: 141 | self.lr_multipliers = [self.kernel_lr_multiplier, self.bias_lr_multiplier] 142 | self.bias = self.add_weight(shape=(self.filters,), 143 | initializer=self.bias_initializer, 144 | name='bias', 145 | regularizer=self.bias_regularizer, 146 | constraint=self.bias_constraint) 147 | 148 | else: 149 | self.lr_multipliers = [self.kernel_lr_multiplier] 150 | self.bias = None 151 | 152 | # Set input spec. 153 | self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim}) 154 | self.built = True 155 | 156 | def call(self, inputs): 157 | binary_kernel = binarize(self.kernel, H=self.H) 158 | outputs = K.conv2d( 159 | inputs, 160 | binary_kernel, 161 | strides=self.strides, 162 | padding=self.padding, 163 | data_format=self.data_format, 164 | dilation_rate=self.dilation_rate) 165 | 166 | if self.use_bias: 167 | outputs = K.bias_add( 168 | outputs, 169 | self.bias, 170 | data_format=self.data_format) 171 | 172 | if self.activation is not None: 173 | return self.activation(outputs) 174 | return outputs 175 | 176 | def get_config(self): 177 | config = {'H': self.H, 178 | 'kernel_lr_multiplier': self.kernel_lr_multiplier, 179 | 'bias_lr_multiplier': self.bias_lr_multiplier} 180 | base_config = super(BinaryConv2D, self).get_config() 181 | return dict(list(base_config.items()) + list(config.items())) 182 | 183 | 184 | # Aliases 185 | 186 | BinaryConvolution2D = BinaryConv2D 187 | -------------------------------------------------------------------------------- /binarynet/binary_ops.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | import keras.backend as K 4 | 5 | 6 | def round_through(x): 7 | '''Element-wise rounding to the closest integer with full gradient propagation. 8 | A trick from [Sergey Ioffe](http://stackoverflow.com/a/36480182) 9 | ''' 10 | rounded = K.round(x) 11 | return x + K.stop_gradient(rounded - x) 12 | 13 | 14 | def _hard_sigmoid(x): 15 | '''Hard sigmoid different from the more conventional form (see definition of K.hard_sigmoid). 16 | 17 | # Reference: 18 | - [BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Courbariaux et al. 2016](http://arxiv.org/abs/1602.02830) 19 | 20 | ''' 21 | x = (0.5 * x) + 0.5 22 | return K.clip(x, 0, 1) 23 | 24 | 25 | def binary_sigmoid(x): 26 | '''Binary hard sigmoid activation for training binarized neural networks. 27 | 28 | # Reference: 29 | - [BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Courbariaux et al. 2016](http://arxiv.org/abs/1602.02830) 30 | 31 | ''' 32 | return round_through(_hard_sigmoid(x)) 33 | 34 | 35 | def binary_tanh(x): 36 | '''Binary hard tanh activation for training binarized neural networks.
37 | The neurons' activations binarization function 38 | It behaves like the sign function during forward propagation 39 | And like: 40 | hard_tanh(x) = 2 * _hard_sigmoid(x) - 1 41 | clear gradient when |x| > 1 during back propagation 42 | 43 | # Reference: 44 | - [BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Courbariaux et al. 2016](http://arxiv.org/abs/1602.02830} 45 | 46 | ''' 47 | return 2 * round_through(_hard_sigmoid(x)) - 1 48 | 49 | 50 | def binarize(W, H=1): 51 | '''The weights' binarization function, 52 | 53 | # Reference: 54 | - [BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Courbariaux et al. 2016](http://arxiv.org/abs/1602.02830} 55 | 56 | ''' 57 | # [-H, H] -> -H or H 58 | Wb = H * binary_tanh(W / H) 59 | return Wb 60 | 61 | 62 | def _mean_abs(x, axis=None, keepdims=False): 63 | return K.stop_gradient(K.mean(K.abs(x), axis=axis, keepdims=keepdims)) 64 | 65 | 66 | def xnorize(W, H=1., axis=None, keepdims=False): 67 | Wb = binarize(W, H) 68 | Wa = _mean_abs(W, axis, keepdims) 69 | 70 | return Wa, Wb 71 | -------------------------------------------------------------------------------- /binarynet/mnist_cnn.py: -------------------------------------------------------------------------------- 1 | '''Trains a simple binarize CNN on the MNIST dataset. 2 | Modified from keras' examples/mnist_mlp.py 3 | Gets to 98.98% test accuracy after 20 epochs using tensorflow backend 4 | ''' 5 | 6 | from __future__ import print_function 7 | import numpy as np 8 | np.random.seed(1337) # for reproducibility 9 | 10 | import keras.backend as K 11 | from keras.datasets import mnist 12 | from keras.models import Sequential 13 | from keras.layers import Dense, Dropout, Activation, BatchNormalization, MaxPooling2D 14 | from keras.layers import Flatten 15 | from keras.optimizers import SGD, Adam, RMSprop 16 | from keras.callbacks import LearningRateScheduler 17 | from keras.utils import np_utils 18 | 19 | from binary_ops import binary_tanh as binary_tanh_op 20 | from binary_layers import BinaryDense, BinaryConv2D 21 | 22 | 23 | def binary_tanh(x): 24 | return binary_tanh_op(x) 25 | 26 | 27 | H = 1. 28 | kernel_lr_multiplier = 'Glorot' 29 | 30 | # nn 31 | batch_size = 50 32 | epochs = 20 33 | channels = 1 34 | img_rows = 28 35 | img_cols = 28 36 | filters = 32 37 | kernel_size = (3, 3) 38 | pool_size = (2, 2) 39 | hidden_units = 128 40 | classes = 10 41 | use_bias = False 42 | 43 | # learning rate schedule 44 | lr_start = 1e-3 45 | lr_end = 1e-4 46 | lr_decay = (lr_end / lr_start)**(1. 
/ epochs) 47 | 48 | # BN 49 | epsilon = 1e-6 50 | momentum = 0.9 51 | 52 | # dropout 53 | p1 = 0.25 54 | p2 = 0.5 55 | 56 | # the data, shuffled and split between train and test sets 57 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 58 | 59 | X_train = X_train.reshape(60000, 1, 28, 28) 60 | X_test = X_test.reshape(10000, 1, 28, 28) 61 | X_train = X_train.astype('float32') 62 | X_test = X_test.astype('float32') 63 | X_train /= 255 64 | X_test /= 255 65 | print(X_train.shape[0], 'train samples') 66 | print(X_test.shape[0], 'test samples') 67 | 68 | # convert class vectors to binary class matrices 69 | Y_train = np_utils.to_categorical(y_train, classes) * 2 - 1 # -1 or 1 for hinge loss 70 | Y_test = np_utils.to_categorical(y_test, classes) * 2 - 1 71 | 72 | 73 | model = Sequential() 74 | # conv1 75 | model.add(BinaryConv2D(128, kernel_size=kernel_size, input_shape=(channels, img_rows, img_cols), 76 | data_format='channels_first', 77 | H=H, kernel_lr_multiplier=kernel_lr_multiplier, 78 | padding='same', use_bias=use_bias, name='conv1')) 79 | model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, axis=1, name='bn1')) 80 | model.add(Activation(binary_tanh, name='act1')) 81 | # conv2 82 | model.add(BinaryConv2D(128, kernel_size=kernel_size, H=H, kernel_lr_multiplier=kernel_lr_multiplier, 83 | data_format='channels_first', 84 | padding='same', use_bias=use_bias, name='conv2')) 85 | model.add(MaxPooling2D(pool_size=pool_size, name='pool2', data_format='channels_first')) 86 | model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, axis=1, name='bn2')) 87 | model.add(Activation(binary_tanh, name='act2')) 88 | # conv3 89 | model.add(BinaryConv2D(256, kernel_size=kernel_size, H=H, kernel_lr_multiplier=kernel_lr_multiplier, 90 | data_format='channels_first', 91 | padding='same', use_bias=use_bias, name='conv3')) 92 | model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, axis=1, name='bn3')) 93 | model.add(Activation(binary_tanh, name='act3')) 94 | # conv4 95 | model.add(BinaryConv2D(256, kernel_size=kernel_size, H=H, kernel_lr_multiplier=kernel_lr_multiplier, 96 | data_format='channels_first', 97 | padding='same', use_bias=use_bias, name='conv4')) 98 | model.add(MaxPooling2D(pool_size=pool_size, name='pool4', data_format='channels_first')) 99 | model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, axis=1, name='bn4')) 100 | model.add(Activation(binary_tanh, name='act4')) 101 | model.add(Flatten()) 102 | # dense1 103 | model.add(BinaryDense(1024, H=H, kernel_lr_multiplier=kernel_lr_multiplier, use_bias=use_bias, name='dense5')) 104 | model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, name='bn5')) 105 | model.add(Activation(binary_tanh, name='act5')) 106 | # dense2 107 | model.add(BinaryDense(classes, H=H, kernel_lr_multiplier=kernel_lr_multiplier, use_bias=use_bias, name='dense6')) 108 | model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, name='bn6')) 109 | 110 | opt = Adam(lr=lr_start) 111 | model.compile(loss='squared_hinge', optimizer=opt, metrics=['acc']) 112 | model.summary() 113 | 114 | lr_scheduler = LearningRateScheduler(lambda e: lr_start * lr_decay ** e) 115 | history = model.fit(X_train, Y_train, 116 | batch_size=batch_size, epochs=epochs, 117 | verbose=1, validation_data=(X_test, Y_test), 118 | callbacks=[lr_scheduler]) 119 | score = model.evaluate(X_test, Y_test, verbose=0) 120 | print('Test score:', score[0]) 121 | print('Test accuracy:', score[1]) 122 | 
-------------------------------------------------------------------------------- /binarynet/mnist_mlp.py: -------------------------------------------------------------------------------- 1 | '''Trains a simple binarize fully connected NN on the MNIST dataset. 2 | Modified from keras' examples/mnist_mlp.py 3 | Gets to 97.9% test accuracy after 20 epochs using theano backend 4 | ''' 5 | 6 | 7 | from __future__ import print_function 8 | import numpy as np 9 | np.random.seed(1337) # for reproducibility 10 | 11 | import keras.backend as K 12 | from keras.datasets import mnist 13 | from keras.models import Sequential 14 | from keras.layers import Dense, Dropout, Activation, BatchNormalization 15 | from keras.optimizers import SGD, Adam, RMSprop 16 | from keras.callbacks import LearningRateScheduler 17 | from keras.utils import np_utils 18 | 19 | from binary_ops import binary_tanh as binary_tanh_op 20 | from binary_layers import BinaryDense, Clip 21 | 22 | from keras.models import load_model 23 | 24 | 25 | class DropoutNoScale(Dropout): 26 | '''Keras Dropout does scale the input in training phase, which is undesirable here. 27 | ''' 28 | def call(self, inputs, training=None): 29 | if 0. < self.rate < 1.: 30 | noise_shape = self._get_noise_shape(inputs) 31 | 32 | def dropped_inputs(): 33 | return K.dropout(inputs, self.rate, noise_shape, 34 | seed=self.seed) * (1 - self.rate) 35 | return K.in_train_phase(dropped_inputs, inputs, 36 | training=training) 37 | return inputs 38 | 39 | def binary_tanh(x): 40 | return binary_tanh_op(x) 41 | 42 | 43 | batch_size = 100 44 | epochs = 20 45 | nb_classes = 10 46 | 47 | H = 'Glorot' 48 | kernel_lr_multiplier = 'Glorot' 49 | 50 | # network 51 | num_unit = 2048 52 | num_hidden = 3 53 | use_bias = False 54 | 55 | # learning rate schedule 56 | lr_start = 1e-3 57 | lr_end = 1e-4 58 | lr_decay = (lr_end / lr_start)**(1. 
/ epochs) 59 | 60 | # BN 61 | epsilon = 1e-6 62 | momentum = 0.9 63 | 64 | # dropout 65 | drop_in = 0.2 66 | drop_hidden = 0.5 67 | 68 | # the data, shuffled and split between train and test sets 69 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 70 | 71 | X_train = X_train.reshape(60000, 784) 72 | X_test = X_test.reshape(10000, 784) 73 | X_train = X_train.astype('float32') 74 | X_test = X_test.astype('float32') 75 | X_train /= 255 76 | X_test /= 255 77 | print(X_train.shape[0], 'train samples') 78 | print(X_test.shape[0], 'test samples') 79 | 80 | # convert class vectors to binary class matrices 81 | Y_train = np_utils.to_categorical(y_train, nb_classes) * 2 - 1 # -1 or 1 for hinge loss 82 | Y_test = np_utils.to_categorical(y_test, nb_classes) * 2 - 1 83 | 84 | model = Sequential() 85 | model.add(DropoutNoScale(drop_in, input_shape=(784,), name='drop0')) 86 | for i in range(num_hidden): 87 | model.add(BinaryDense(num_unit, H=H, kernel_lr_multiplier=kernel_lr_multiplier, use_bias=use_bias, 88 | name='dense{}'.format(i+1))) 89 | model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, name='bn{}'.format(i+1))) 90 | model.add(Activation(binary_tanh, name='act{}'.format(i+1))) 91 | model.add(DropoutNoScale(drop_hidden, name='drop{}'.format(i+1))) 92 | model.add(BinaryDense(10, H=H, kernel_lr_multiplier=kernel_lr_multiplier, use_bias=use_bias, 93 | name='dense')) 94 | model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, name='bn')) 95 | 96 | model.summary() 97 | 98 | opt = Adam(lr=lr_start) 99 | model.compile(loss='squared_hinge', optimizer=opt, metrics=['acc']) 100 | 101 | # deserialized custom layers 102 | #model.save('mlp.h5') 103 | #model = load_model('mlp.h5', custom_objects={'DropoutNoScale': DropoutNoScale, 104 | # 'BinaryDense': BinaryDense, 105 | # 'Clip': Clip, 106 | # 'binary_tanh': binary_tanh}) 107 | 108 | lr_scheduler = LearningRateScheduler(lambda e: lr_start * lr_decay ** e) 109 | history = model.fit(X_train, Y_train, 110 | batch_size=batch_size, epochs=epochs, 111 | verbose=1, validation_data=(X_test, Y_test), 112 | callbacks=[lr_scheduler]) 113 | score = model.evaluate(X_test, Y_test, verbose=0) 114 | print('Test score:', score[0]) 115 | print('Test accuracy:', score[1]) 116 | -------------------------------------------------------------------------------- /focal_loss/README.md: -------------------------------------------------------------------------------- 1 | Proof-of-concept implementation of **Focal Loss**, similiar to cross entropy loss, but pus less weight on well 2 | classified samples. 3 | 4 | ## References 5 | * Lin et al. [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002) 6 | -------------------------------------------------------------------------------- /focal_loss/losses.py: -------------------------------------------------------------------------------- 1 | import keras.backend as K 2 | 3 | 4 | def focal_loss(target, output, gamma=2): 5 | output /= K.sum(output, axis=-1, keepdims=True) 6 | eps = K.epsilon() 7 | output = K.clip(output, eps, 1. - eps) 8 | return -K.sum(K.pow(1. - output, gamma) * target * K.log(output), 9 | axis=-1) 10 | -------------------------------------------------------------------------------- /focal_loss/mnist_mlp.py: -------------------------------------------------------------------------------- 1 | '''Trains a simple deep NN on the MNIST dataset using **Focal Loss**. 
2 | ''' 3 | 4 | from __future__ import print_function 5 | 6 | import keras 7 | from keras.datasets import mnist 8 | from keras.models import Sequential 9 | from keras.layers import Dense, Dropout 10 | from keras.optimizers import RMSprop 11 | 12 | from losses import focal_loss 13 | 14 | batch_size = 128 15 | num_classes = 10 16 | epochs = 20 17 | 18 | gamma = 5. 19 | 20 | # the data, shuffled and split between train and test sets 21 | (x_train, y_train), (x_test, y_test) = mnist.load_data() 22 | 23 | x_train = x_train.reshape(60000, 784) 24 | x_test = x_test.reshape(10000, 784) 25 | x_train = x_train.astype('float32') 26 | x_test = x_test.astype('float32') 27 | x_train /= 255 28 | x_test /= 255 29 | print(x_train.shape[0], 'train samples') 30 | print(x_test.shape[0], 'test samples') 31 | 32 | # convert class vectors to binary class matrices 33 | y_train = keras.utils.to_categorical(y_train, num_classes) 34 | y_test = keras.utils.to_categorical(y_test, num_classes) 35 | 36 | model = Sequential() 37 | model.add(Dense(512, activation='relu', input_shape=(784,))) 38 | model.add(Dropout(0.2)) 39 | model.add(Dense(512, activation='relu')) 40 | model.add(Dropout(0.2)) 41 | model.add(Dense(10, activation='softmax')) 42 | 43 | model.summary() 44 | 45 | model.compile(loss=lambda y, y_hat: focal_loss(y, y_hat, gamma), 46 | optimizer=RMSprop(), 47 | metrics=['accuracy']) 48 | 49 | history = model.fit(x_train, y_train, 50 | batch_size=batch_size, 51 | epochs=epochs, 52 | verbose=1, 53 | validation_data=(x_test, y_test)) 54 | score = model.evaluate(x_test, y_test, verbose=0) 55 | print('Test loss:', score[0]) 56 | print('Test accuracy:', score[1]) 57 | -------------------------------------------------------------------------------- /gcnn/README.md: -------------------------------------------------------------------------------- 1 | Toy Keras example using Gated Convolutional Networks for language modeling on the IMDB dataset. 2 | 3 | Note: 4 | Only works with the TensorFlow backend; see [issue 1](https://github.com/DingKe/nn_playground/issues/1). 5 | 6 | ## Prepare data 7 | Change the current directory to data, and follow the instructions in imdb_preprocess_semi.py. 8 | 9 | ## Run 10 | 11 | ### Word LM 12 | python imdb_lm_gcnn.py 13 | 14 | ### Character LM 15 | python char_lm_gcnn.py 16 | 17 | 18 | ## References 19 | * Dauphin et al. [Language Modeling with Gated Convolutional Networks](https://arxiv.org/abs/1612.08083).
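For orientation, the GCNN layer in gcnn.py implements the paper's gated linear unit, h(X) = (X*W + b) multiplied elementwise with sigmoid(X*V + c), on top of a causal (left-padded) convolution. A minimal, hedged usage sketch mirroring imdb_lm_gcnn.py (vocabulary size, dimensions and window size are illustrative):

```python
from keras.models import Model
from keras.layers import Input, Embedding, Dense, TimeDistributed

from gcnn import GCNN   # gated convolutional layer from this directory

batch_size, vocab_size, embed_dim, hidden_dim = 32, 2000, 20, 30

x = Input(batch_shape=(batch_size, None))            # (batch, time) token ids
h = Embedding(vocab_size + 2, embed_dim)(x)
h = GCNN(hidden_dim, window_size=3)(h)               # causal conv + gating
h = GCNN(hidden_dim, window_size=3)(h)
y = TimeDistributed(Dense(vocab_size + 2, activation='softmax'))(h)

lm = Model(inputs=x, outputs=y)
lm.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
```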
20 | -------------------------------------------------------------------------------- /gcnn/char_generator.py: -------------------------------------------------------------------------------- 1 | ''' 2 | modified from https://github.com/sherjilozair/char-rnn-tensorflow 3 | ''' 4 | import codecs 5 | import os 6 | import collections 7 | from six.moves import cPickle 8 | import numpy as np 9 | 10 | class TextLoader(object): 11 | def __init__(self, data_dir, batch_size, seq_length, encoding='utf-8'): 12 | self.data_dir = data_dir 13 | self.batch_size = batch_size 14 | self.seq_length = seq_length 15 | self.encoding = encoding 16 | 17 | input_file = os.path.join(data_dir, "input.txt") 18 | vocab_file = os.path.join(data_dir, "vocab.pkl") 19 | tensor_file = os.path.join(data_dir, "data.npy") 20 | 21 | if not (os.path.exists(vocab_file) and os.path.exists(tensor_file)): 22 | print("reading text file") 23 | self.preprocess(input_file, vocab_file, tensor_file) 24 | else: 25 | print("loading preprocessed files") 26 | self.load_preprocessed(vocab_file, tensor_file) 27 | self.create_batches() 28 | 29 | def preprocess(self, input_file, vocab_file, tensor_file): 30 | with codecs.open(input_file, "r", encoding=self.encoding) as f: 31 | data = f.read() 32 | counter = collections.Counter(data) 33 | count_pairs = sorted(counter.items(), key=lambda x: -x[1]) 34 | self.chars, _ = zip(*count_pairs) 35 | self.vocab_size = len(self.chars) 36 | self.vocab = dict(zip(self.chars, range(len(self.chars)))) 37 | with open(vocab_file, 'wb') as f: 38 | cPickle.dump(self.chars, f) 39 | self.tensor = np.array(list(map(self.vocab.get, data))) 40 | np.save(tensor_file, self.tensor) 41 | 42 | def load_preprocessed(self, vocab_file, tensor_file): 43 | with open(vocab_file, 'rb') as f: 44 | self.chars = cPickle.load(f) 45 | self.vocab_size = len(self.chars) 46 | self.vocab = dict(zip(self.chars, range(len(self.chars)))) 47 | self.tensor = np.load(tensor_file) 48 | self.num_batches = int(self.tensor.size / (self.batch_size * 49 | self.seq_length)) 50 | 51 | def create_batches(self): 52 | self.num_batches = int(self.tensor.size / (self.batch_size * 53 | self.seq_length)) 54 | 55 | # When the data (tensor) is too small, let's give them a better error message 56 | if self.num_batches==0: 57 | assert False, "Not enough data. Make seq_length and batch_size small." 
58 | 59 | self.tensor = self.tensor[:self.num_batches * self.batch_size * self.seq_length] 60 | xdata = self.tensor 61 | ydata = np.copy(self.tensor) 62 | ydata[:-1] = xdata[1:] 63 | ydata[-1] = xdata[0] 64 | self.x_batches = np.split(xdata.reshape(self.batch_size, -1), self.num_batches, 1) 65 | self.y_batches = np.split(ydata.reshape(self.batch_size, -1), self.num_batches, 1) 66 | 67 | def __call__(self): 68 | while True: 69 | for x, y in zip(self.x_batches, self.y_batches): 70 | yield x, np.expand_dims(y, -1) 71 | -------------------------------------------------------------------------------- /gcnn/char_lm_gcnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | '''Simple RNN for Language Model 3 | ''' 4 | from __future__ import print_function 5 | import os 6 | 7 | from keras.models import Model 8 | from keras.layers import Input, Embedding, Dense, TimeDistributed 9 | from keras.optimizers import * 10 | 11 | from gcnn import GCNN 12 | from char_generator import TextLoader 13 | 14 | 15 | def LM(batch_size, window_size=3, vocsize=20000, embed_dim=20, hidden_dim=30, nb_layers=1): 16 | x = Input(batch_shape=(batch_size, None)) 17 | # mebedding 18 | y = Embedding(vocsize+2, embed_dim, mask_zero=False)(x) 19 | for i in range(nb_layers-1): 20 | y = GCNN(hidden_dim, window_size=window_size, 21 | name='gcnn{}'.format(i + 1))(y) 22 | y = GCNN(hidden_dim, window_size=window_size, 23 | name='gcnn{}'.format(nb_layers))(y) 24 | y = TimeDistributed(Dense(vocsize+2, activation='softmax', name='dense{}'.format(nb_layers)))(y) 25 | 26 | model = Model(inputs=x, outputs=y) 27 | 28 | return model 29 | 30 | 31 | def run_demo(): 32 | batch_size = 50 33 | epochs = 100 34 | nb_layers = 3 35 | 36 | max_len = 50 37 | window_size = 5 38 | 39 | # Prepare data 40 | path = './data/tinyshakespeare' 41 | data_loader = TextLoader(path, batch_size, max_len) 42 | vocsize = data_loader.vocab_size 43 | print('vocsize: {}'.format(vocsize)) 44 | 45 | # Build model 46 | model = LM(batch_size, window_size=window_size, vocsize=vocsize, nb_layers=nb_layers) 47 | model.compile(optimizer='adam', 48 | loss='sparse_categorical_crossentropy') 49 | 50 | 51 | train_steps = data_loader.num_batches 52 | 53 | # Start training 54 | model.summary() 55 | model.fit_generator(data_loader(), steps_per_epoch=train_steps, 56 | epochs=epochs, verbose=1) 57 | 58 | 59 | if __name__ == '__main__': 60 | run_demo() 61 | -------------------------------------------------------------------------------- /gcnn/data/imdb_preprocess_semi.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script is what created the dataset pickled. 3 | 4 | 1) You need to download this file and put it in the same directory as this file. 5 | https://github.com/moses-smt/mosesdecoder/raw/master/scripts/tokenizer/tokenizer.perl . Give it execution permission. 6 | 7 | 2) Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/ and extract it in the current directory. 8 | 9 | 3) Then run this script. 
10 | """ 11 | 12 | 13 | import numpy 14 | import cPickle as pkl 15 | 16 | from collections import OrderedDict 17 | 18 | import glob 19 | import os 20 | 21 | from subprocess import Popen, PIPE 22 | 23 | dataset_path = os.path.expandvars('$PWD/aclImdb/') 24 | 25 | # tokenizer.perl is from Moses: https://github.com/moses-smt/mosesdecoder/tree/master/scripts/tokenizer 26 | tokenizer_cmd = ['./tokenizer.perl', '-l', 'en', '-q', '-'] 27 | 28 | 29 | def tokenize(sentences): 30 | 31 | print 'Tokenizing..', 32 | text = "\n".join(sentences) 33 | tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE) 34 | tok_text, _ = tokenizer.communicate(text) 35 | toks = tok_text.split('\n')[:-1] 36 | print 'Done' 37 | 38 | return toks 39 | 40 | 41 | def build_dict(path): 42 | sentences = [] 43 | currdir = os.getcwd() 44 | os.chdir('%s/pos/' % path) 45 | for ff in glob.glob("*.txt"): 46 | with open(ff, 'r') as f: 47 | sentences.append(f.readline().strip()) 48 | os.chdir('%s/neg/' % path) 49 | for ff in glob.glob("*.txt"): 50 | with open(ff, 'r') as f: 51 | sentences.append(f.readline().strip()) 52 | os.chdir(currdir) 53 | 54 | sentences = tokenize(sentences) 55 | 56 | print 'Building dictionary..', 57 | wordcount = dict() 58 | for ss in sentences: 59 | words = ss.strip().lower().split() 60 | for w in words: 61 | if w not in wordcount: 62 | wordcount[w] = 1 63 | else: 64 | wordcount[w] += 1 65 | 66 | counts = wordcount.values() 67 | keys = wordcount.keys() 68 | 69 | sorted_idx = numpy.argsort(counts)[::-1] 70 | 71 | worddict = dict() 72 | 73 | for idx, ss in enumerate(sorted_idx): 74 | worddict[keys[ss]] = idx+2 # leave 0 and 1 (UNK) 75 | 76 | print numpy.sum(counts), ' total words ', len(keys), ' unique words' 77 | 78 | return worddict 79 | 80 | 81 | def grab_data(path, dictionary): 82 | sentences = [] 83 | currdir = os.getcwd() 84 | os.chdir(path) 85 | for ff in glob.glob("*.txt"): 86 | with open(ff, 'r') as f: 87 | sentences.append(f.readline().strip()) 88 | os.chdir(currdir) 89 | sentences = tokenize(sentences) 90 | 91 | seqs = [None] * len(sentences) 92 | for idx, ss in enumerate(sentences): 93 | words = ss.strip().lower().split() 94 | seqs[idx] = [dictionary[w] if w in dictionary else 1 for w in words] 95 | 96 | return seqs 97 | 98 | 99 | def main(): 100 | # Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/ 101 | path = dataset_path 102 | dictionary = build_dict(os.path.join(path, 'train')) 103 | 104 | train_x_pos = grab_data(path+'train/pos', dictionary) 105 | train_x_neg = grab_data(path+'train/neg', dictionary) 106 | train_x_unsup = grab_data(path+'train/unsup', dictionary) 107 | train_x = train_x_pos + train_x_neg 108 | train_y = [1] * len(train_x_pos) + [0] * len(train_x_neg) 109 | 110 | test_x_pos = grab_data(path+'test/pos', dictionary) 111 | test_x_neg = grab_data(path+'test/neg', dictionary) 112 | test_x = test_x_pos + test_x_neg 113 | test_y = [1] * len(test_x_pos) + [0] * len(test_x_neg) 114 | 115 | f = open('imdb-full.pkl', 'wb') 116 | data = {'train_x': train_x, 'train_y': train_y, 117 | 'test_x': test_x, 'test_y': test_y, 118 | 'train_x_unsup': train_x_unsup 119 | } 120 | pkl.dump(data, f, True) 121 | f.close() 122 | 123 | f = open('imdb.dict.pkl', 'wb') 124 | pkl.dump(dictionary, f, True) 125 | f.close() 126 | 127 | if __name__ == '__main__': 128 | main() 129 | -------------------------------------------------------------------------------- /gcnn/data/nonbreaking_prefixes/nonbreaking_prefix.en: -------------------------------------------------------------------------------- 
1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Asst 38 | Bart 39 | Bldg 40 | Brig 41 | Bros 42 | Capt 43 | Cmdr 44 | Col 45 | Comdr 46 | Con 47 | Corp 48 | Cpl 49 | DR 50 | Dr 51 | Drs 52 | Ens 53 | Gen 54 | Gov 55 | Hon 56 | Hr 57 | Hosp 58 | Insp 59 | Lt 60 | MM 61 | MR 62 | MRS 63 | MS 64 | Maj 65 | Messrs 66 | Mlle 67 | Mme 68 | Mr 69 | Mrs 70 | Ms 71 | Msgr 72 | Op 73 | Ord 74 | Pfc 75 | Ph 76 | Prof 77 | Pvt 78 | Rep 79 | Reps 80 | Res 81 | Rev 82 | Rt 83 | Sen 84 | Sens 85 | Sfc 86 | Sgt 87 | Sr 88 | St 89 | Supt 90 | Surg 91 | 92 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 93 | v 94 | vs 95 | i.e 96 | rev 97 | e.g 98 | 99 | #Numbers only. These should only induce breaks when followed by a numeric sequence 100 | # add NUMERIC_ONLY after the word for this function 101 | #This case is mostly for the english "No." which can either be a sentence of its own, or 102 | #if followed by a number, a non-breaking prefix 103 | No #NUMERIC_ONLY# 104 | Nos 105 | Art #NUMERIC_ONLY# 106 | Nr 107 | pp #NUMERIC_ONLY# 108 | 109 | #month abbreviations 110 | Jan 111 | Feb 112 | Mar 113 | Apr 114 | #May is a full word 115 | Jun 116 | Jul 117 | Aug 118 | Sep 119 | Oct 120 | Nov 121 | Dec 122 | -------------------------------------------------------------------------------- /gcnn/gcnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | import numpy as np 4 | 5 | from keras import backend as K 6 | from keras import activations, initializers, regularizers, constraints 7 | from keras.layers import Layer, InputSpec 8 | 9 | from keras.utils.conv_utils import conv_output_length 10 | 11 | class GCNN(Layer): 12 | '''Gated Convolutional Networks 13 | 14 | # Arguments 15 | 16 | # References 17 | - Dauphin et al. 
[Language Modeling with Gated Convolutional Networks](https://arxiv.org/abs/1612.08083) 18 | ''' 19 | def __init__(self, output_dim, window_size=3, stride=1, 20 | kernel_initializer='uniform', bias_initializer='zero', 21 | activation='linear', activity_regularizer=None, 22 | kernel_regularizer=None, bias_regularizer=None, 23 | kernel_constraint=None, bias_constraint=None, 24 | use_bias=True, input_dim=None, input_length=None, **kwargs): 25 | self.output_dim = output_dim 26 | self.window_size = window_size 27 | self.strides = (stride, 1) 28 | 29 | self.use_bias = use_bias 30 | self.kernel_initializer = initializers.get(kernel_initializer) 31 | self.bias_initializer = initializers.get(bias_initializer) 32 | self.activation = activations.get(activation) 33 | self.kernel_regularizer = regularizers.get(kernel_regularizer) 34 | self.bias_regularizer = regularizers.get(bias_regularizer) 35 | self.activity_regularizer = regularizers.get(activity_regularizer) 36 | self.kernel_constraint = constraints.get(kernel_constraint) 37 | self.bias_constraint = constraints.get(bias_constraint) 38 | 39 | self.input_spec = [InputSpec(ndim=3)] 40 | self.input_dim = input_dim 41 | self.input_length = input_length 42 | if self.input_dim: 43 | kwargs['input_shape'] = (self.input_length, self.input_dim) 44 | super(GCNN, self).__init__(**kwargs) 45 | 46 | def build(self, input_shape): 47 | input_dim = input_shape[2] 48 | self.input_dim = input_dim 49 | self.input_spec = [InputSpec(shape=input_shape)] 50 | self.kernel_shape = (self.window_size, 1, input_dim, self.output_dim * 2) 51 | self.kernel = self.add_weight(self.kernel_shape, 52 | initializer=self.kernel_initializer, 53 | name='kernel', 54 | regularizer=self.kernel_regularizer, 55 | constraint=self.kernel_constraint) 56 | 57 | if self.use_bias: 58 | self.bias = self.add_weight((self.output_dim * 2,), 59 | initializer=self.bias_initializer, 60 | name='b', 61 | regularizer=self.bias_regularizer, 62 | constraint=self.bias_constraint) 63 | 64 | self.built = True 65 | 66 | def compute_output_shape(self, input_shape): 67 | length = input_shape[1] 68 | if length: 69 | length = conv_output_length(length + self.window_size - 1, 70 | self.window_size, 'valid', 71 | self.strides[0]) 72 | return (input_shape[0], length, self.output_dim) 73 | 74 | def call(self, x): 75 | # input shape: (nb_samples, time (padded with zeros), input_dim) 76 | # note that the .build() method of subclasses MUST define 77 | # self.input_spec with a complete input shape. 
78 | input_shape = self.input_spec[0].shape 79 | 80 | if self.window_size > 1: 81 | x = K.temporal_padding(x, (self.window_size-1, 0)) 82 | x = K.expand_dims(x, 2) # add a dummy dimension 83 | 84 | # z, g 85 | output = K.conv2d(x, self.kernel, strides=self.strides, 86 | padding='valid', 87 | data_format='channels_last') 88 | output = K.squeeze(output, 2) # remove the dummy dimension 89 | if self.use_bias: 90 | output = K.bias_add(output, self.bias, data_format='channels_last') 91 | z = output[:, :, :self.output_dim] 92 | g = output[:, :, self.output_dim:] 93 | 94 | return self.activation(z) * K.sigmoid(g) 95 | 96 | def get_config(self): 97 | config = {'output_dim': self.output_dim, 98 | 'window_size': self.window_size, 99 | 'init': self.init.get_config(), 100 | 'stride': self.strides[0], 101 | 'activation': activations.serialize(self.activation), 102 | 'kernel_initializer': initializers.serialize(self.kernel_initializer), 103 | 'bias_initializer': initializers.serialize(self.bias_initializer), 104 | 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), 105 | 'bias_regularizer': regularizers.serialize(self.bias_regularizer), 106 | 'activity_regularizer': regularizers.serialize(self.activy_regularizer), 107 | 'kernel_constraint': constraints.serialize(self.kernel_constraint), 108 | 'bias_constraint': constraints.serialize(self.bias_constraint), 109 | 'use_bias': self.use_bias, 110 | 'input_dim': self.input_dim, 111 | 'input_length': self.input_length} 112 | base_config = super(GCNN, self).get_config() 113 | return dict(list(base_config.items()) + list(config.items())) 114 | -------------------------------------------------------------------------------- /gcnn/imdb_generator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | import os 4 | import gzip 5 | import numpy as np 6 | import random 7 | from six.moves import cPickle 8 | 9 | 10 | def set_unk(sent, vocab_size=None): 11 | return [w if not vocab_size or w <= vocab_size + 1 else 1 for w in sent] 12 | 13 | 14 | def pad(xs, ys, max_len=None): 15 | new_xs = [] 16 | for x in xs: 17 | if max_len: 18 | x = x[:max_len] 19 | new_xs.append(x) 20 | xs = new_xs 21 | max_len = max_len if max_len else max([len(x) for x in xs]) 22 | 23 | xa = np.zeros((len(xs), max_len), dtype='int32') 24 | for i in range(len(xs)): 25 | for j, c in enumerate(xs[i]): 26 | xa[i, j] = c 27 | ya = np.asarray(ys, dtype='int32') if ys else None 28 | 29 | return xa, ya 30 | 31 | 32 | def add_eos(xs, max_len=None, eos=None): 33 | new_xs, new_ys = [] 34 | for x in xs: 35 | y = x + [eos] 36 | x = [eos] + x 37 | new_xs.append(x) 38 | new_ys.apped(y) 39 | xs, ys = new_xs, new_ys 40 | max_len = max_len + 1 if max_len else max(max([len(x) for x in xs])) 41 | 42 | xa = np.zeros((len(xs), max_len), dtype='int32') 43 | ya = np.zeros((len(ys), max_len), dtype='int32') 44 | for i in range(len(xa)): 45 | for j, c in enumerate(xs[i]): 46 | xa[i, j] = c 47 | ya[i, j] = ys[i][j] 48 | 49 | return xa, ya 50 | 51 | 52 | class IMDBLM(object): 53 | '''IMDB for training language model 54 | ''' 55 | def __init__(self, path, which_set=None, train_ratio=0.9, 56 | max_len=5, vocab_size=101745, batch_size=10, 57 | shuffle=True, seed=1111): 58 | self.__dict__.update(locals()) 59 | self.__dict__.pop('self') 60 | 61 | self.random = random.Random() 62 | self.random.seed(self.seed) 63 | 64 | self.path = os.path.expandvars(self.path) 65 | 66 | def __call__(self): 67 | if 
self.path.endswith('.gz'): 68 | with gzip.open(self.path, 'r') as fp: 69 | data = cPickle.load(fp) 70 | else: 71 | with open(self.path, 'r') as fp: 72 | data = cPickle.load(fp) 73 | 74 | assert self.which_set in ['train', 'validation', 'test', None], \ 75 | "which_set should be 'train' or 'validation' or 'test', " + \ 76 | "but '{}' is given.".format(self.which_set) 77 | 78 | if self.which_set in ['train', 'validation', None]: 79 | train_x, train_x_unsup = data['train_x'], data['train_x_unsup'] 80 | # concatenate all sentences 81 | text = [] 82 | for x in train_x + train_x_unsup: 83 | text += x 84 | 85 | NUM_TRAIN = int(len(text) * self.train_ratio) 86 | NUM_VALID = len(text) - NUM_TRAIN 87 | if self.which_set == 'train': 88 | text = text[:NUM_TRAIN] 89 | elif self.which_set == 'validation': 90 | text = text[-NUM_VALID:] 91 | else: 92 | # concatenate all sentences 93 | text = [] 94 | for x in data['test_x']: 95 | text += x 96 | 97 | text = set_unk(text, self.vocab_size) 98 | 99 | del data 100 | 101 | nb_words = len(text) 102 | seg_size = nb_words // self.batch_size 103 | cursors = [i * seg_size for i in range(self.batch_size)] 104 | 105 | while True: 106 | x = np.zeros((self.batch_size, self.max_len), dtype='int32') 107 | y = np.zeros((self.batch_size, self.max_len, 1), dtype='int32') 108 | for i in range(self.batch_size): 109 | c = cursors[i] 110 | for j in range(self.max_len): 111 | x[i, j] = text[(c + j) % nb_words] 112 | y[i, j, 0] = text[(c + j + 1) % nb_words] 113 | cursors[i] = (c + self.max_len) % nb_words 114 | yield x, y 115 | -------------------------------------------------------------------------------- /gcnn/imdb_lm_gcnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | '''Simple RNN for Language Model 3 | ''' 4 | from __future__ import print_function 5 | import os 6 | 7 | from keras.models import Model 8 | from keras.layers import Input, Embedding, Dense, TimeDistributed 9 | from keras.optimizers import * 10 | 11 | from gcnn import GCNN 12 | from imdb_generator import IMDBLM 13 | 14 | 15 | def LM(batch_size, window_size=3, vocsize=20000, embed_dim=20, hidden_dim=30, nb_layers=1): 16 | x = Input(batch_shape=(batch_size, None)) 17 | # mebedding 18 | y = Embedding(vocsize+2, embed_dim, mask_zero=False)(x) 19 | for i in range(nb_layers-1): 20 | y = GCNN(hidden_dim, window_size=window_size, 21 | name='gcnn{}'.format(i + 1))(y) 22 | y = GCNN(hidden_dim, window_size=window_size, 23 | name='gcnn{}'.format(nb_layers))(y) 24 | y = TimeDistributed(Dense(vocsize+2, activation='softmax', name='dense{}'.format(nb_layers)))(y) 25 | 26 | model = Model(inputs=x, outputs=y) 27 | 28 | return model 29 | 30 | 31 | def train_model(): 32 | batch_size = 32 33 | epochs = 100 34 | nb_layers = 3 35 | vocsize = 2000 # top 2k 36 | max_len = 30 37 | train_ratio = 0.99 38 | window_size = 3 39 | 40 | # Build model 41 | model = LM(batch_size, window_size=window_size, vocsize=vocsize, nb_layers=nb_layers) 42 | model.compile(optimizer='adam', 43 | loss='sparse_categorical_crossentropy') 44 | 45 | # Prepare data 46 | path = './data/imdb-full.pkl' 47 | # Train 48 | train_gen = IMDBLM(path=path, max_len=max_len, vocab_size=vocsize, shuffle=True, 49 | which_set='train', train_ratio=train_ratio, batch_size=batch_size) 50 | # Validation 51 | val_gen = IMDBLM(path=path, max_len=max_len, vocab_size=vocsize, 52 | which_set='validation', train_ratio=train_ratio, batch_size=batch_size) 53 | 54 | train_steps = 500 55 | val_steps = 200 56 | 57 | # Start 
training 58 | model.summary() 59 | model.fit_generator(train_gen(), steps_per_epoch=train_steps, 60 | validation_data=val_gen(), validation_steps=val_steps, 61 | epochs=epochs, verbose=1) 62 | 63 | 64 | def run_demo(): 65 | train_model() 66 | 67 | 68 | if __name__ == '__main__': 69 | run_demo() 70 | -------------------------------------------------------------------------------- /glsgan/README.md: -------------------------------------------------------------------------------- 1 | Generalized Loss Sensitive GAN -- a unified perspective for WGAN and LSGAN. 2 | 3 | This repo is the toy demo for the [blog on GAN](http://blog.csdn.net/jackytintin/article/details/61908718). Please refer https://github.com/guojunq/glsgan for the 'official' repo for GLSGAN in torch. 4 | 5 | ## Reference 6 | * Arjovsky et al. [Wasserstein GAN](https://www.arxiv.org/abs/1701.07875). 7 | * Qi. [Loss-Sensitive Generative Adversarial Networks on Lipschitz Densities](https://arxiv.org/abs/1701.06264). 8 | -------------------------------------------------------------------------------- /glsgan/glsgan_mlp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Generalized Loss Sensentive Generative Adversarial Network (GLSGAN) 5 | """ 6 | from __future__ import print_function 7 | from PIL import Image 8 | from six.moves import range 9 | 10 | import keras.backend as K 11 | K.set_image_data_format('channels_first') 12 | 13 | from keras.datasets import mnist 14 | from keras.layers import Input, Dense, Reshape, Flatten, Dropout, Activation, BatchNormalization, ELU, LeakyReLU 15 | from keras.models import Sequential, Model 16 | from keras.optimizers import RMSprop, Adam 17 | from keras.utils.generic_utils import Progbar 18 | 19 | import numpy as np 20 | np.random.seed(1337) 21 | 22 | 23 | def clip_weights(model, lower, upper): 24 | for l in model.layers: 25 | weights = l.get_weights() 26 | weights = [np.clip(w, lower, upper) for w in weights] 27 | l.set_weights(weights) 28 | 29 | 30 | def dummy_loss(loss_to_backprop, y_pred): 31 | return K.mean(loss_to_backprop * y_pred) 32 | 33 | 34 | def build_generator(latent_size): 35 | ''' 36 | Any model with input shape (?, latent_size) and output shape (?, 1, 28, 28) fits here. 37 | ''' 38 | model = Sequential() 39 | model.add(Dense(1024, input_dim=latent_size, activation='relu')) 40 | model.add(Dense(28 * 28, activation='tanh')) 41 | model.add(Reshape((1, 28, 28))) 42 | 43 | return model 44 | 45 | 46 | def build_discriminator(act): 47 | ''' 48 | Any model with input shape (?, 1, 28, 28) and output shape (?, 1) fits here. 49 | Use different activator for different type of GAN. 
50 | ''' 51 | model = Sequential() 52 | model.add(Flatten(input_shape=(1, 28, 28))) 53 | model.add(Dense(256)) 54 | model.add(Activation('relu')) 55 | model.add(Dense(128)) 56 | model.add(Activation('relu')) 57 | model.add(Dense(1, activation='linear')) 58 | model.add(act) 59 | 60 | return model 61 | 62 | 63 | if __name__ == '__main__': 64 | gan_type = 'wgan' # 'wgan', 'lsgan', 'elu', 'l1' 65 | epochs = 5000 66 | batch_size = 50 67 | latent_size = 20 68 | lr = 0.00005 69 | c = 0.08 70 | slope = 0 # [-inf, 1], 1 for WGAN, 0 for LSGAN 71 | 72 | act = LeakyReLU(slope) # try act = ELU() 73 | 74 | # build the discriminator 75 | disc = build_discriminator(act) 76 | disc.compile( 77 | optimizer=RMSprop(lr=lr), 78 | loss=dummy_loss 79 | ) 80 | 81 | # build the generator 82 | generator = build_generator(latent_size) 83 | 84 | latent = Input(shape=(latent_size, )) 85 | # get a fake image 86 | fake = generator(latent) 87 | # we only want to be able to train generation for the combined model 88 | disc.trainable = False 89 | fake = disc(fake) 90 | combined = Model(inputs=latent, outputs=fake) 91 | combined.compile( 92 | optimizer=RMSprop(lr=lr), 93 | loss=dummy_loss 94 | ) 95 | 96 | # get our mnist data, and force it to be of shape (..., 1, 28, 28) with 97 | # range [-1, 1] 98 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 99 | X_train = (X_train.astype(np.float32) - 127.5) / 127.5 100 | X_train = np.expand_dims(X_train, axis=1) 101 | 102 | X_test = (X_test.astype(np.float32) - 127.5) / 127.5 103 | X_test = np.expand_dims(X_test, axis=1) 104 | 105 | nb_train, nb_test = X_train.shape[0], X_test.shape[0] 106 | 107 | for epoch in range(epochs): 108 | print('Epoch {} of {}'.format(epoch + 1, epochs)) 109 | 110 | nb_batches = int(X_train.shape[0] / batch_size) 111 | progress_bar = Progbar(target=nb_batches) 112 | 113 | epoch_disc_loss= [] 114 | epoch_gen_loss = [] 115 | 116 | index = 0 117 | while index < nb_batches: 118 | ## discriminator 119 | if epoch < 5 or epoch % 100 == 0: 120 | Diters = 1 121 | else: 122 | Diters = 1 123 | iter = 0 124 | disc_loss= [] 125 | while index < nb_batches and iter < Diters: 126 | progress_bar.update(index) 127 | index += 1 128 | iter += 1 129 | 130 | # generate a new batch of noise 131 | noise = np.random.uniform(-1, 1, (batch_size, latent_size)) 132 | # generate a batch of fake images 133 | generated_images = generator.predict(noise, verbose=0) 134 | 135 | # get a batch of real images 136 | image_batch = X_train[index * batch_size:(index + 1) * batch_size] 137 | label_batch = y_train[index * batch_size:(index + 1) * batch_size] 138 | 139 | X = np.concatenate((image_batch, generated_images)) 140 | y = np.array([-1] * len(image_batch) + [1] * batch_size) 141 | 142 | disc_loss.append(-disc.train_on_batch(X, y)) 143 | 144 | clip_weights(disc, -c, c) 145 | 146 | epoch_disc_loss.append(sum(disc_loss)/len(disc_loss)) 147 | 148 | ## generator 149 | # make new noise. 
we generate 2 * batch size here such that we have 150 | # the generator optimize over an identical number of images as the disc 151 | noise = np.random.uniform(-1, 1, (batch_size, latent_size)) 152 | target = -np.ones(batch_size) 153 | epoch_gen_loss.append(-combined.train_on_batch(noise, target)) 154 | 155 | print('\n[Loss_D: {:.3f}, Loss_G: {:.3f}]'.format(np.mean(epoch_disc_loss), np.mean(epoch_gen_loss))) 156 | 157 | # save weights every epoch 158 | if False: 159 | generator.save_weights( 160 | 'slope_{}_mlp_generator_epoch_{:03d}.hdf5'.format(slope, epoch), True) 161 | disc.save_weights( 162 | 'slope_{}_mlp_disc_epoch_{:03d}.hdf5'.format(slop, epoch), True) 163 | 164 | # generate some digits to display 165 | noise = np.random.uniform(-1, 1, (100, latent_size)) 166 | # get a batch to display 167 | generated_images = generator.predict(noise, verbose=0) 168 | 169 | # arrange them into a grid 170 | img = (np.concatenate([r.reshape(-1, 28) 171 | for r in np.split(generated_images, 10) 172 | ], axis=-1) * 127.5 + 127.5).astype(np.uint8) 173 | 174 | Image.fromarray(img).save( 175 | 'slope_{}_mlp_epoch_{:03d}_generated.png'.format(slope, epoch)) 176 | -------------------------------------------------------------------------------- /layernorm/README.md: -------------------------------------------------------------------------------- 1 | Layer Normalization 2 | 3 | ## Run 4 | 5 | ### MLP on MNIST 6 | python mnist_mlp.py 7 | 8 | ### CNN on MNIST 9 | python mnist_cnn.py 10 | 11 | ### bAbI toy QA tasks 12 | python babi_lnlstm.py 13 | 14 | 15 | ### IMDB sentiment classification 16 | imdb_cnn_lstm.py 17 | 18 | ## References: 19 | Ba et al. [Layer Normalization](https://arxiv.org/abs/1607.06450). 20 | [Git repo](https://github.com/ryankiros/layer-norm). 21 | -------------------------------------------------------------------------------- /layernorm/babi_lnlstm.py: -------------------------------------------------------------------------------- 1 | '''Trains two recurrent neural networks based upon a story and a question. 2 | The resulting merged vector is then queried to answer a range of bAbI tasks. 3 | 4 | The results are comparable to those for an LSTM model provided in Weston et al.: 5 | "Towards AI-Complete Question Answering: A Set of Prerequisite Toy Tasks" 6 | http://arxiv.org/abs/1502.05698 7 | 8 | Task Number | FB LSTM Baseline | LSTM | LSTM w/ LN 9 | --- | --- | --- | --- 10 | QA1 - Single Supporting Fact | 50 | 52.0 | 58.0 11 | 12 | For the resources related to the bAbI project, refer to: 13 | https://research.facebook.com/researchers/1543934539189348 14 | ''' 15 | 16 | from __future__ import print_function 17 | from functools import reduce 18 | import re 19 | import tarfile 20 | 21 | import numpy as np 22 | np.random.seed(1337) # for reproducibility 23 | 24 | from keras.utils.data_utils import get_file 25 | from keras import layers 26 | from keras.models import Model 27 | from keras.preprocessing.sequence import pad_sequences 28 | 29 | from layer_norm_layers import * 30 | 31 | 32 | def tokenize(sent): 33 | '''Return the tokens of a sentence including punctuation. 34 | 35 | >>> tokenize('Bob dropped the apple. 
Where is the apple?') 36 | ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?'] 37 | ''' 38 | return [x.strip() for x in re.split('(\W+)?', sent) if x.strip()] 39 | 40 | 41 | def parse_stories(lines, only_supporting=False): 42 | '''Parse stories provided in the bAbi tasks format 43 | 44 | If only_supporting is true, only the sentences that support the answer are kept. 45 | ''' 46 | data = [] 47 | story = [] 48 | for line in lines: 49 | line = line.decode('utf-8').strip() 50 | nid, line = line.split(' ', 1) 51 | nid = int(nid) 52 | if nid == 1: 53 | story = [] 54 | if '\t' in line: 55 | q, a, supporting = line.split('\t') 56 | q = tokenize(q) 57 | substory = None 58 | if only_supporting: 59 | # Only select the related substory 60 | supporting = map(int, supporting.split()) 61 | substory = [story[i - 1] for i in supporting] 62 | else: 63 | # Provide all the substories 64 | substory = [x for x in story if x] 65 | data.append((substory, q, a)) 66 | story.append('') 67 | else: 68 | sent = tokenize(line) 69 | story.append(sent) 70 | return data 71 | 72 | 73 | def get_stories(f, only_supporting=False, max_length=None): 74 | '''Given a file name, read the file, retrieve the stories, and then convert the sentences into a single story. 75 | 76 | If max_length is supplied, any stories longer than max_length tokens will be discarded. 77 | ''' 78 | data = parse_stories(f.readlines(), only_supporting=only_supporting) 79 | flatten = lambda data: reduce(lambda x, y: x + y, data) 80 | data = [(flatten(story), q, answer) for story, q, answer in data if not max_length or len(flatten(story)) < max_length] 81 | return data 82 | 83 | 84 | def vectorize_stories(data, word_idx, story_maxlen, query_maxlen): 85 | X = [] 86 | Xq = [] 87 | Y = [] 88 | for story, query, answer in data: 89 | x = [word_idx[w] for w in story] 90 | xq = [word_idx[w] for w in query] 91 | y = np.zeros(len(word_idx) + 1) # let's not forget that index 0 is reserved 92 | y[word_idx[answer]] = 1 93 | X.append(x) 94 | Xq.append(xq) 95 | Y.append(y) 96 | return pad_sequences(X, maxlen=story_maxlen), pad_sequences(Xq, maxlen=query_maxlen), np.array(Y) 97 | 98 | RNN = LayerNormLSTM # layers.LSTM 99 | EMBED_HIDDEN_SIZE = 50 100 | SENT_HIDDEN_SIZE = 100 101 | QUERY_HIDDEN_SIZE = 100 102 | BATCH_SIZE = 32 103 | EPOCHS = 40 104 | print('RNN / Embed / Sent / Query = {}, {}, {}, {}'.format(RNN, EMBED_HIDDEN_SIZE, SENT_HIDDEN_SIZE, QUERY_HIDDEN_SIZE)) 105 | 106 | try: 107 | path = get_file('babi-tasks-v1-2.tar.gz', origin='https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz') 108 | except: 109 | print('Error downloading dataset, please download it manually:\n' 110 | '$ wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz\n' 111 | '$ mv tasks_1-20_v1-2.tar.gz ~/.keras/datasets/babi-tasks-v1-2.tar.gz') 112 | raise 113 | tar = tarfile.open(path) 114 | # Default QA1 with 1000 samples 115 | challenge = 'tasks_1-20_v1-2/en/qa1_single-supporting-fact_{}.txt' 116 | # QA1 with 10,000 samples 117 | # challenge = 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt' 118 | # QA2 with 1000 samples 119 | # challenge = 'tasks_1-20_v1-2/en/qa2_two-supporting-facts_{}.txt' 120 | # QA2 with 10,000 samples 121 | # challenge = 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt' 122 | train = get_stories(tar.extractfile(challenge.format('train'))) 123 | test = get_stories(tar.extractfile(challenge.format('test'))) 124 | 125 | vocab = sorted(reduce(lambda x, y: x | y, (set(story + q + [answer]) for story, q, 
answer in train + test))) 126 | # Reserve 0 for masking via pad_sequences 127 | vocab_size = len(vocab) + 1 128 | word_idx = dict((c, i + 1) for i, c in enumerate(vocab)) 129 | story_maxlen = max(map(len, (x for x, _, _ in train + test))) 130 | query_maxlen = max(map(len, (x for _, x, _ in train + test))) 131 | 132 | X, Xq, Y = vectorize_stories(train, word_idx, story_maxlen, query_maxlen) 133 | tX, tXq, tY = vectorize_stories(test, word_idx, story_maxlen, query_maxlen) 134 | 135 | print('vocab = {}'.format(vocab)) 136 | print('X.shape = {}'.format(X.shape)) 137 | print('Xq.shape = {}'.format(Xq.shape)) 138 | print('Y.shape = {}'.format(Y.shape)) 139 | print('story_maxlen, query_maxlen = {}, {}'.format(story_maxlen, query_maxlen)) 140 | 141 | print('Build model...') 142 | 143 | sentence = layers.Input(shape=(story_maxlen,), dtype='int32') 144 | encoded_sentence = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(sentence) 145 | encoded_sentence = layers.Dropout(0.3)(encoded_sentence) 146 | 147 | question = layers.Input(shape=(query_maxlen,), dtype='int32') 148 | encoded_question = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(question) 149 | encoded_question = layers.Dropout(0.3)(encoded_question) 150 | encoded_question = RNN(EMBED_HIDDEN_SIZE)(encoded_question) 151 | encoded_question = layers.RepeatVector(story_maxlen)(encoded_question) 152 | 153 | merged = layers.add([encoded_sentence, encoded_question]) 154 | merged = RNN(EMBED_HIDDEN_SIZE)(merged) 155 | merged = layers.Dropout(0.3)(merged) 156 | preds = layers.Dense(vocab_size, activation='softmax')(merged) 157 | 158 | model = Model([sentence, question], preds) 159 | model.compile(optimizer='rmsprop', 160 | loss='categorical_crossentropy', 161 | metrics=['accuracy']) 162 | 163 | print('Training') 164 | model.fit([X, Xq], Y, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.05) 165 | loss, acc = model.evaluate([tX, tXq], tY, batch_size=BATCH_SIZE) 166 | print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc)) 167 | -------------------------------------------------------------------------------- /layernorm/data: -------------------------------------------------------------------------------- 1 | ../gcnn/data -------------------------------------------------------------------------------- /layernorm/imdb_cnn_lstm.py: -------------------------------------------------------------------------------- 1 | '''Train a recurrent convolutional network on the IMDB sentiment 2 | classification task. 3 | 4 | w/o LN: 5 | Gets to 0.8544 test accuracy after 2 epochs. 6 | w/ LN: 7 | Gets to 0.8564 test accuracy after 2 epochs. 8 | ''' 9 | from __future__ import print_function 10 | import numpy as np 11 | np.random.seed(1337) # for reproducibility 12 | 13 | from keras.preprocessing import sequence 14 | from keras.models import Sequential 15 | from keras.layers import Dense, Dropout, Activation 16 | from keras.layers import Embedding 17 | from keras.layers import LSTM 18 | from keras.layers import Conv1D, MaxPooling1D 19 | from keras.datasets import imdb 20 | 21 | from layer_norm_layers import * 22 | 23 | 24 | # Embedding 25 | max_features = 20000 26 | maxlen = 100 27 | embedding_size = 128 28 | 29 | # Convolution 30 | filter_length = 5 31 | filters = 64 32 | pool_size = 4 33 | 34 | # LSTM 35 | lstm_output_size = 70 36 | 37 | # Training 38 | batch_size = 30 39 | epochs = 2 40 | 41 | ''' 42 | Note: 43 | batch_size is highly sensitive. 44 | Only 2 epochs are needed as the dataset is very small. 
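The only change relative to the stock Keras example is the recurrent layer: `LayerNormLSTM` from layer_norm_layers.py replaces the plain `LSTM` (the commented-out line below switches back).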
45 | ''' 46 | 47 | print('Build model...') 48 | model = Sequential() 49 | model.add(Embedding(max_features, embedding_size, input_length=maxlen)) 50 | model.add(Dropout(0.25)) 51 | model.add(Conv1D(filters=filters, 52 | kernel_size=5, 53 | padding='valid', 54 | activation='relu', 55 | strides=1)) 56 | model.add(MaxPooling1D(pool_size=pool_size)) 57 | model.add(LayerNormLSTM(lstm_output_size)) 58 | #model.add(LSTM(lstm_output_size)) 59 | model.add(Dense(1)) 60 | model.add(Activation('sigmoid')) 61 | 62 | model.compile(loss='binary_crossentropy', 63 | optimizer='adam', 64 | metrics=['accuracy']) 65 | 66 | print('Loading data...') 67 | (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features) 68 | print(len(X_train), 'train sequences') 69 | print(len(X_test), 'test sequences') 70 | 71 | print('Pad sequences (samples x time)') 72 | X_train = sequence.pad_sequences(X_train, maxlen=maxlen) 73 | X_test = sequence.pad_sequences(X_test, maxlen=maxlen) 74 | print('X_train shape:', X_train.shape) 75 | print('X_test shape:', X_test.shape) 76 | 77 | 78 | print('Train...') 79 | model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, 80 | validation_data=(X_test, y_test)) 81 | score, acc = model.evaluate(X_test, y_test, batch_size=batch_size) 82 | print('Test score:', score) 83 | print('Test accuracy:', acc) 84 | -------------------------------------------------------------------------------- /layernorm/imdb_generator.py: -------------------------------------------------------------------------------- 1 | ../gcnn/imdb_generator.py -------------------------------------------------------------------------------- /layernorm/imdb_lm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | '''Simple RNN for Language Model 3 | ''' 4 | from __future__ import print_function 5 | import os 6 | 7 | from keras.models import Model 8 | from keras.layers import Input, Embedding, Dense, TimeDistributed 9 | from keras.optimizers import * 10 | 11 | from layer_norm_layers import * 12 | from imdb_generator import IMDBLM 13 | 14 | 15 | def LM(batch_size, vocsize=20000, embed_dim=20, hidden_dim=30, nb_layers=1): 16 | x = Input(batch_shape=(batch_size, None)) 17 | # mebedding 18 | y = Embedding(vocsize+2, embed_dim, mask_zero=False)(x) 19 | for i in range(nb_layers-1): 20 | y = LayerNormLSTM(hidden_dim, return_sequences=True, name='lnlstm{}'.format(i + 1))(y) 21 | y = LayerNormLSTM(hidden_dim, return_sequences=True, name='lnlstm{}'.format(nb_layers))(y) 22 | y = TimeDistributed(Dense(vocsize+2, activation='softmax', name='dense{}'.format(nb_layers)))(y) 23 | 24 | model = Model(inputs=x, outputs=y) 25 | 26 | return model 27 | 28 | 29 | def train_model(): 30 | batch_size = 32 31 | epochs = 100 32 | 33 | vocsize = 2000 # top 2k 34 | max_len = 30 35 | train_ratio = 0.99 36 | 37 | # Build model 38 | model = LM(batch_size, vocsize=vocsize, nb_layers=3) 39 | model.compile(optimizer='adam', 40 | loss='sparse_categorical_crossentropy') 41 | 42 | # Prepare data 43 | path = './data/imdb-full.pkl' 44 | # Train 45 | train_gen = IMDBLM(path=path, max_len=max_len, vocab_size=vocsize, shuffle=True, 46 | which_set='train', train_ratio=train_ratio, batch_size=batch_size) 47 | # Validation 48 | val_gen = IMDBLM(path=path, max_len=max_len, vocab_size=vocsize, 49 | which_set='validation', train_ratio=train_ratio, batch_size=batch_size) 50 | 51 | train_steps = 2000 52 | val_steps = 200 53 | 54 | # Start training 55 | model.summary() 56 | 
model.fit_generator(train_gen(), steps_per_epoch=train_steps, 57 | validation_data=val_gen(), validation_steps=val_steps, 58 | epochs=epochs, verbose=1) 59 | 60 | 61 | def run_demo(): 62 | train_model() 63 | 64 | 65 | if __name__ == '__main__': 66 | run_demo() 67 | -------------------------------------------------------------------------------- /layernorm/layer_norm_layers.py: -------------------------------------------------------------------------------- 1 | from keras.engine import Layer, InputSpec 2 | from keras.layers import LSTM, GRU 3 | from keras import initializers, regularizers 4 | from keras import backend as K 5 | 6 | import numpy as np 7 | 8 | 9 | def to_list(x): 10 | if type(x) not in [list, tuple]: 11 | return [x] 12 | else: 13 | return list(x) 14 | 15 | 16 | def LN(x, gamma, beta, epsilon=1e-6, axis=-1): 17 | m = K.mean(x, axis=axis, keepdims=True) 18 | std = K.sqrt(K.var(x, axis=axis, keepdims=True) + epsilon) 19 | x_normed = (x - m) / (std + epsilon) 20 | x_normed = gamma * x_normed + beta 21 | 22 | return x_normed 23 | 24 | 25 | class LayerNormalization(Layer): 26 | def __init__(self, axis=-1, 27 | gamma_init='one', beta_init='zero', 28 | gamma_regularizer=None, beta_regularizer=None, 29 | epsilon=1e-6, **kwargs): 30 | super(LayerNormalization, self).__init__(**kwargs) 31 | 32 | self.axis = to_list(axis) 33 | self.gamma_init = initializers.get(gamma_init) 34 | self.beta_init = initializers.get(beta_init) 35 | self.gamma_regularizer = regularizers.get(gamma_regularizer) 36 | self.beta_regularizer = regularizers.get(beta_regularizer) 37 | self.epsilon = epsilon 38 | 39 | self.supports_masking = True 40 | 41 | def build(self, input_shape): 42 | self.input_spec = [InputSpec(shape=input_shape)] 43 | shape = [1 for _ in input_shape] 44 | for i in self.axis: 45 | shape[i] = input_shape[i] 46 | self.gamma = self.add_weight(shape=shape, 47 | initializer=self.gamma_init, 48 | regularizer=self.gamma_regularizer, 49 | name='gamma') 50 | self.beta = self.add_weight(shape=shape, 51 | initializer=self.beta_init, 52 | regularizer=self.beta_regularizer, 53 | name='beta') 54 | self.built = True 55 | 56 | def call(self, inputs, mask=None): 57 | return LN(inputs, gamma=self.gamma, beta=self.beta, 58 | axis=self.axis, epsilon=self.epsilon) 59 | 60 | def get_config(self): 61 | config = {'epsilon': self.epsilon, 62 | 'axis': self.axis, 63 | 'gamma_init': initializers.serialize(self.gamma_init), 64 | 'beta_init': initializers.serialize(self.beta_init), 65 | 'gamma_regularizer': regularizers.serialize(self.gamma_regularizer), 66 | 'beta_regularizer': regularizers.serialize(self.beta_regularizer)} 67 | base_config = super(LayerNormalization, self).get_config() 68 | return dict(list(base_config.items()) + list(config.items())) 69 | 70 | 71 | class LayerNormLSTM(LSTM): 72 | def build(self, input_shape): 73 | if isinstance(input_shape, list): 74 | input_shape = input_shape[0] 75 | batch_size = input_shape[0] if self.stateful else None 76 | self.input_dim = input_shape[2] 77 | self.input_spec = InputSpec(shape=(batch_size, None, self.input_dim)) 78 | self.state_spec = [InputSpec(shape=(batch_size, self.units)), 79 | InputSpec(shape=(batch_size, self.units))] 80 | 81 | 82 | # initial states: 2 all-zero tensors of shape (units) 83 | self.states = [None, None] 84 | if self.stateful: 85 | self.reset_states() 86 | 87 | self.kernel = self.add_weight(shape=(self.input_dim, 4 * self.units), 88 | initializer=self.kernel_initializer, 89 | name='kernel', 90 | regularizer=self.kernel_regularizer, 91 |
constraint=self.kernel_constraint) 92 | self.recurrent_kernel = self.add_weight(shape=(self.units, self.units * 4), 93 | name='recurrent_kernel', 94 | initializer=self.recurrent_initializer, 95 | regularizer=self.recurrent_regularizer, 96 | constraint=self.recurrent_constraint) 97 | 98 | self.gamma_1 = self.add_weight(shape=(4 * self.units,), 99 | initializer='one', 100 | name='gamma_1') 101 | self.beta_1 = self.add_weight(shape=(4 * self.units,), 102 | initializer='zero', 103 | name='beta_1') 104 | self.gamma_2 = self.add_weight(shape=(4 * self.units,), 105 | initializer='one', 106 | name='gamma_2') 107 | self.beta_2 = self.add_weight(shape=(4 * self.units,), 108 | initializer='zero', 109 | name='beta_2') 110 | self.gamma_3 = self.add_weight(shape=(self.units,), 111 | initializer='one', 112 | name='gamma_3') 113 | self.beta_3 = self.add_weight((self.units,), 114 | initializer='zero', 115 | name='beta_3') 116 | 117 | if self.use_bias: 118 | if self.unit_forget_bias: 119 | def bias_initializer(shape, *args, **kwargs): 120 | return K.concatenate([ 121 | self.bias_initializer((self.units,), *args, **kwargs), 122 | initializers.Ones()((self.units,), *args, **kwargs), 123 | self.bias_initializer((self.units * 2,), *args, **kwargs), 124 | ]) 125 | else: 126 | bias_initializer = self.bias_initializer 127 | self.bias = self.add_weight(shape=(self.units * 4,), 128 | name='bias', 129 | initializer=bias_initializer, 130 | regularizer=self.bias_regularizer, 131 | constraint=self.bias_constraint) 132 | else: 133 | self.bias = None 134 | self.built = True 135 | 136 | def preprocess_input(self, inputs, training=None): 137 | return inputs 138 | 139 | def step(self, x, states): 140 | h_tm1 = states[0] 141 | c_tm1 = states[1] 142 | B_U = states[2] 143 | B_W = states[3] 144 | 145 | z = LN(K.dot(x * B_W[0], self.kernel), self.gamma_1, self.beta_1) + \ 146 | LN(K.dot(h_tm1 * B_U[0], self.recurrent_kernel), self.gamma_2, self.beta_2) 147 | if self.use_bias: 148 | z = K.bias_add(z, self.bias) 149 | 150 | z0 = z[:, :self.units] 151 | z1 = z[:, self.units: 2 * self.units] 152 | z2 = z[:, 2 * self.units: 3 * self.units] 153 | z3 = z[:, 3 * self.units:] 154 | 155 | i = self.recurrent_activation(z0) 156 | f = self.recurrent_activation(z1) 157 | c = f * c_tm1 + i * self.activation(z2) 158 | o = self.recurrent_activation(z3) 159 | 160 | h = o * self.activation(LN(c, self.gamma_3, self.beta_3)) 161 | return h, [h, c] 162 | -------------------------------------------------------------------------------- /layernorm/mnist_cnn.py: -------------------------------------------------------------------------------- 1 | '''Trains a simple convnet on the MNIST dataset. 2 | 3 | w/o LN: 4 | Gets to 99.25% test accuracy after 12 epochs 5 | w/ LN: 6 | Gets to 98.99% test accuracy after 12 epochs 7 | 8 | Caveat: the original paper doesn't discuss explicitly how to 9 | apply LN to Convolution layers or if LN is effective of conv. 
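Here LN is simply inserted after each Conv2D and normalizes over `ln_axis`, i.e. the channel and/or spatial axes of the feature maps; the `ln_axis` assignments below list the combinations worth trying.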
10 | ''' 11 | 12 | from __future__ import print_function 13 | import numpy as np 14 | np.random.seed(1337) # for reproducibility 15 | 16 | from keras.datasets import mnist 17 | from keras.models import Sequential 18 | from keras.layers import Dense, Dropout, Activation, Flatten 19 | from keras.layers import Conv2D, MaxPooling2D 20 | from keras.utils import np_utils 21 | from keras import backend as K 22 | 23 | from layer_norm_layers import * 24 | 25 | K.set_image_data_format('channels_first') 26 | 27 | batch_size = 128 28 | nb_classes = 10 29 | epochs = 12 30 | 31 | # input image dimensions 32 | img_rows, img_cols = 28, 28 33 | # number of convolutional filters to use 34 | filters = 32 35 | # size of pooling area for max pooling 36 | pool_size = (2, 2) 37 | # convolution kernel size 38 | kernel_size = (3, 3) 39 | 40 | # the data, shuffled and split between train and test sets 41 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 42 | 43 | if K.image_data_format() == 'channels_first': 44 | X_train = X_train.reshape(X_train.shape[0], 1, img_rows, img_cols) 45 | X_test = X_test.reshape(X_test.shape[0], 1, img_rows, img_cols) 46 | input_shape = (1, img_rows, img_cols) 47 | ln_axis = 1 # try 1, [2, 3] or [1, 2, 3] 48 | else: 49 | X_train = X_train.reshape(X_train.shape[0], img_rows, img_cols, 1) 50 | X_test = X_test.reshape(X_test.shape[0], img_rows, img_cols, 1) 51 | input_shape = (img_rows, img_cols, 1) 52 | ln_axis = 3 # try 3, [1, 2] or [1, 2, 3] 53 | 54 | X_train = X_train.astype('float32') 55 | X_test = X_test.astype('float32') 56 | X_train /= 255 57 | X_test /= 255 58 | print('X_train shape:', X_train.shape) 59 | print(X_train.shape[0], 'train samples') 60 | print(X_test.shape[0], 'test samples') 61 | 62 | # convert class vectors to binary class matrices 63 | Y_train = np_utils.to_categorical(y_train, nb_classes) 64 | Y_test = np_utils.to_categorical(y_test, nb_classes) 65 | 66 | model = Sequential() 67 | 68 | model.add(Conv2D(filters, kernel_size, 69 | padding='valid', 70 | input_shape=input_shape)) 71 | model.add(LayerNormalization(axis=ln_axis)) 72 | model.add(Activation('relu')) 73 | model.add(Conv2D(filters, kernel_size)) 74 | model.add(LayerNormalization(axis=ln_axis)) 75 | model.add(Activation('relu')) 76 | model.add(MaxPooling2D(pool_size=pool_size)) 77 | model.add(Dropout(0.25)) 78 | 79 | model.add(Flatten()) 80 | model.add(Dense(128)) 81 | model.add(LayerNormalization()) 82 | model.add(Activation('relu')) 83 | model.add(Dropout(0.5)) 84 | model.add(Dense(nb_classes)) 85 | model.add(LayerNormalization()) 86 | model.add(Activation('softmax')) 87 | 88 | model.compile(loss='categorical_crossentropy', 89 | optimizer='adadelta', 90 | metrics=['accuracy']) 91 | 92 | model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs, 93 | verbose=1, validation_data=(X_test, Y_test)) 94 | score = model.evaluate(X_test, Y_test, verbose=0) 95 | print('Test score:', score[0]) 96 | print('Test accuracy:', score[1]) 97 | -------------------------------------------------------------------------------- /layernorm/mnist_mlp.py: -------------------------------------------------------------------------------- 1 | '''Trains a simple fully connected NN with Layer Normalization on the MNIST dataset.
2 | Modified from keras' examples/mnist_mlp.py 3 | Gets to 98.28% test accuracy after 20 epochs using tensorflow backend 4 | ''' 5 | 6 | from __future__ import print_function 7 | import numpy as np 8 | np.random.seed(1337) # for reproducibility 9 | 10 | from keras.datasets import mnist 11 | from keras.models import Sequential 12 | from keras.layers.core import Dense, Dropout, Activation 13 | from keras.optimizers import SGD, Adam, RMSprop 14 | from keras.utils import np_utils 15 | 16 | from layer_norm_layers import * 17 | 18 | batch_size = 128 19 | nb_classes = 10 20 | epochs = 20 21 | 22 | # the data, shuffled and split between train and test sets 23 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 24 | 25 | X_train = X_train.reshape(60000, 784) 26 | X_test = X_test.reshape(10000, 784) 27 | X_train = X_train.astype('float32') 28 | X_test = X_test.astype('float32') 29 | X_train /= 255 30 | X_test /= 255 31 | print(X_train.shape[0], 'train samples') 32 | print(X_test.shape[0], 'test samples') 33 | 34 | # convert class vectors to binary class matrices 35 | Y_train = np_utils.to_categorical(y_train, nb_classes) 36 | Y_test = np_utils.to_categorical(y_test, nb_classes) 37 | 38 | model = Sequential() 39 | model.add(Dense(512, input_shape=(784,))) 40 | model.add(LayerNormalization()) 41 | model.add(Activation('relu')) 42 | model.add(Dropout(0.2)) 43 | model.add(Dense(512)) 44 | model.add(LayerNormalization()) 45 | model.add(Activation('relu')) 46 | model.add(Dropout(0.2)) 47 | model.add(Dense(10)) 48 | model.add(LayerNormalization()) 49 | model.add(Activation('softmax')) 50 | 51 | model.summary() 52 | 53 | model.compile(loss='categorical_crossentropy', 54 | optimizer=RMSprop(), 55 | metrics=['accuracy']) 56 | 57 | history = model.fit(X_train, Y_train, 58 | batch_size=batch_size, epochs=epochs, 59 | verbose=1, validation_data=(X_test, Y_test)) 60 | score = model.evaluate(X_test, Y_test, verbose=0) 61 | print('Test score:', score[0]) 62 | print('Test accuracy:', score[1]) 63 | -------------------------------------------------------------------------------- /lsgan/README.md: -------------------------------------------------------------------------------- 1 | Toy keras implementation of Least Squares GAN (LSGAN) on MNIST 2 | 3 | ## Reference 4 | * Mao et al. 
[Least Squares Generative Adversarial Networks](https://www.arxiv.org/abs/1611.04076) 5 | -------------------------------------------------------------------------------- /lsgan/lsgan_mlp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Train a Least Squares Generative Adversarial Network (LSGAN) on the MNIST dataset 5 | """ 6 | from __future__ import print_function 7 | from PIL import Image 8 | from six.moves import range 9 | 10 | import keras.backend as K 11 | K.set_image_data_format('channels_first') 12 | 13 | from keras.datasets import mnist 14 | from keras.layers import Input, Dense, Reshape, Flatten, Dropout, Activation, BatchNormalization 15 | from keras.models import Sequential, Model 16 | from keras.optimizers import RMSprop, Adam 17 | from keras.utils.generic_utils import Progbar 18 | 19 | import numpy as np 20 | np.random.seed(1337) 21 | 22 | 23 | def build_generator(latent_size): 24 | model = Sequential() 25 | model.add(Dense(1024, input_dim=latent_size, activation='relu')) 26 | model.add(Dense(28 * 28, activation='tanh')) 27 | model.add(Reshape((1, 28, 28))) 28 | 29 | return model 30 | 31 | 32 | def build_discriminator(): 33 | f = Sequential() 34 | f.add(Flatten(input_shape=(1, 28, 28))) 35 | f.add(Dense(256)) 36 | f.add(Activation('relu')) 37 | f.add(Dense(128)) 38 | f.add(Activation('relu')) 39 | f.add(Dense(1, activation='linear')) 40 | 41 | image = Input(shape=(1, 28, 28)) 42 | score = f(image) 43 | 44 | model = Model(image, score) 45 | 46 | return model 47 | 48 | 49 | if __name__ == '__main__': 50 | 51 | epochs = 5000 52 | batch_size = 50 53 | latent_size = 20 54 | 55 | lr = 0.001 56 | 57 | # build the discriminator 58 | disc = build_discriminator() 59 | disc.compile( 60 | optimizer=Adam(lr=lr), 61 | loss='mse' 62 | ) 63 | 64 | # build the generator 65 | generator = build_generator(latent_size) 66 | 67 | latent = Input(shape=(latent_size, )) 68 | # get a fake image 69 | fake = generator(latent) 70 | # we only want to be able to train generation for the combined model 71 | disc.trainable = False 72 | fake = disc(fake) 73 | combined = Model(inputs=latent, outputs=fake) 74 | combined.compile( 75 | optimizer=Adam(lr=lr), 76 | loss='mse' 77 | ) 78 | 79 | # get our mnist data, and force it to be of shape (..., 1, 28, 28) with 80 | # range [-1, 1] 81 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 82 | X_train = (X_train.astype(np.float32) - 127.5) / 127.5 83 | X_train = np.expand_dims(X_train, axis=1) 84 | 85 | X_test = (X_test.astype(np.float32) - 127.5) / 127.5 86 | X_test = np.expand_dims(X_test, axis=1) 87 | 88 | nb_train, nb_test = X_train.shape[0], X_test.shape[0] 89 | 90 | for epoch in range(epochs): 91 | print('Epoch {} of {}'.format(epoch + 1, epochs)) 92 | 93 | nb_batches = int(X_train.shape[0] / batch_size) 94 | progress_bar = Progbar(target=nb_batches) 95 | 96 | epoch_disc_loss = [] 97 | epoch_gen_loss = [] 98 | 99 | index = 0 100 | while index < nb_batches: 101 | ## discriminator 102 | progress_bar.update(index) 103 | index += 1 104 | 105 | # generate a new batch of noise 106 | noise = np.random.uniform(-1, 1, (batch_size, latent_size)) 107 | # generate a batch of fake images 108 | generated_images = generator.predict(noise, verbose=0) 109 | 110 | # get a batch of real images 111 | image_batch = X_train[index * batch_size:(index + 1) * batch_size] 112 | label_batch = y_train[index * batch_size:(index + 1) * batch_size] 113 | 114 | X = np.concatenate((image_batch,
generated_images)) 115 | # a == 0, b == 1 116 | y = np.array([1] * len(image_batch) + [0] * batch_size) 117 | 118 | epoch_disc_loss.append(disc.train_on_batch(X, y)) 119 | 120 | 121 | ## generator 122 | # make a new batch of noise and train the generator so that the 123 | # discriminator scores its samples as real (target of ones below) 124 | noise = np.random.uniform(-1, 1, (batch_size, latent_size)) 125 | target = np.ones(batch_size) # c == b == 1, cf. Eq. (9) 126 | epoch_gen_loss.append(combined.train_on_batch(noise, target)) 127 | 128 | print('\n[Loss_D: {:.3f}, Loss_G: {:.3f}]'.format(np.mean(epoch_disc_loss), np.mean(epoch_gen_loss))) 129 | 130 | # save weights every epoch 131 | if False: 132 | generator.save_weights( 133 | 'mlp_generator_epoch_{0:03d}.hdf5'.format(epoch), True) 134 | disc.save_weights( 135 | 'mlp_disc_epoch_{0:03d}.hdf5'.format(epoch), True) 136 | 137 | # generate some digits to display 138 | noise = np.random.uniform(-1, 1, (100, latent_size)) 139 | # get a batch to display 140 | generated_images = generator.predict(noise, verbose=0) 141 | 142 | # arrange them into a grid 143 | img = (np.concatenate([r.reshape(-1, 28) 144 | for r in np.split(generated_images, 10) 145 | ], axis=-1) * 127.5 + 127.5).astype(np.uint8) 146 | 147 | Image.fromarray(img).save( 148 | 'mlp_epoch_{0:03d}_generated.png'.format(epoch)) 149 | -------------------------------------------------------------------------------- /qrnn/README.md: -------------------------------------------------------------------------------- 1 | A trial implementation of QRNN-fo with dropout for Keras 2 | 3 | ## Run the demo: 4 | python imbd_qrnn.py 5 | 6 | ## Reference 7 | * Bradbury et al. [Quasi-recurrent Neural Networks](https://arxiv.org/abs/1611.01576) 8 | -------------------------------------------------------------------------------- /qrnn/imbd_qrnn.py: -------------------------------------------------------------------------------- 1 | '''Trains a QRNN on the IMDB sentiment classification task. 2 | Modified from keras' examples/imbd_lstm.py.
3 | ''' 4 | from __future__ import print_function 5 | import numpy as np 6 | np.random.seed(1337) # for reproducibility 7 | 8 | from keras.preprocessing import sequence 9 | from keras.utils import np_utils 10 | from keras.models import Sequential 11 | from keras.layers import Dense, Dropout, Activation, Embedding, SpatialDropout1D 12 | from keras.layers import LSTM, SimpleRNN, GRU 13 | from keras.regularizers import l2 14 | from keras.constraints import maxnorm 15 | from keras.datasets import imdb 16 | 17 | from qrnn import QRNN 18 | 19 | max_features = 20000 20 | maxlen = 80 # cut texts after this number of words (among top max_features most common words) 21 | batch_size = 32 22 | 23 | print('Build model...') 24 | model = Sequential() 25 | model.add(Embedding(max_features, 128)) 26 | model.add(SpatialDropout1D(0.2)) 27 | model.add(QRNN(128, window_size=3, dropout=0.2, 28 | kernel_regularizer=l2(1e-4), bias_regularizer=l2(1e-4), 29 | kernel_constraint=maxnorm(10), bias_constraint=maxnorm(10))) 30 | model.add(Dense(1)) 31 | model.add(Activation('sigmoid')) 32 | 33 | # try using different optimizers and different optimizer configs 34 | model.compile(loss='binary_crossentropy', 35 | optimizer='adam', 36 | metrics=['accuracy']) 37 | 38 | print('Loading data...') 39 | (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features) 40 | print(len(X_train), 'train sequences') 41 | print(len(X_test), 'test sequences') 42 | 43 | print('Pad sequences (samples x time)') 44 | X_train = sequence.pad_sequences(X_train, maxlen=maxlen) 45 | X_test = sequence.pad_sequences(X_test, maxlen=maxlen) 46 | print('X_train shape:', X_train.shape) 47 | print('X_test shape:', X_test.shape) 48 | 49 | print('Train...') 50 | model.fit(X_train, y_train, batch_size=batch_size, epochs=15, 51 | validation_data=(X_test, y_test)) 52 | score, acc = model.evaluate(X_test, y_test, 53 | batch_size=batch_size) 54 | print('Test score:', score) 55 | print('Test accuracy:', acc) 56 | -------------------------------------------------------------------------------- /qrnn/imbd_qrnn_Bidirecional.py: -------------------------------------------------------------------------------- 1 | '''Trains a QRNN on the IMDB sentiment classification task. 2 | Modified from keras' examples/imbd_lstm.py. 
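Identical to imbd_qrnn.py except that the QRNN layer is wrapped in keras.layers.Bidirectional.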
3 | ''' 4 | from __future__ import print_function 5 | import numpy as np 6 | np.random.seed(1337) # for reproducibility 7 | 8 | from keras.preprocessing import sequence 9 | from keras.utils import np_utils 10 | from keras.models import Sequential 11 | from keras.layers import Dense, Dropout, Activation, Embedding, SpatialDropout1D 12 | from keras.layers import LSTM, SimpleRNN, GRU, Bidirectional 13 | from keras.regularizers import l2 14 | from keras.constraints import maxnorm 15 | from keras.datasets import imdb 16 | 17 | from qrnn import QRNN 18 | 19 | max_features = 20000 20 | maxlen = 80 # cut texts after this number of words (among top max_features most common words) 21 | batch_size = 32 22 | 23 | print('Build model...') 24 | model = Sequential() 25 | model.add(Embedding(max_features, 128)) 26 | model.add(SpatialDropout1D(0.2)) 27 | model.add(Bidirectional(QRNN(128, window_size=3, dropout=0.2, 28 | kernel_regularizer=l2(1e-4), bias_regularizer=l2(1e-4), 29 | kernel_constraint=maxnorm(10), bias_constraint=maxnorm(10)))) 30 | model.add(Dense(1)) 31 | model.add(Activation('sigmoid')) 32 | 33 | # try using different optimizers and different optimizer configs 34 | model.compile(loss='binary_crossentropy', 35 | optimizer='adam', 36 | metrics=['accuracy']) 37 | 38 | print('Loading data...') 39 | (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features) 40 | print(len(X_train), 'train sequences') 41 | print(len(X_test), 'test sequences') 42 | 43 | print('Pad sequences (samples x time)') 44 | X_train = sequence.pad_sequences(X_train, maxlen=maxlen) 45 | X_test = sequence.pad_sequences(X_test, maxlen=maxlen) 46 | print('X_train shape:', X_train.shape) 47 | print('X_test shape:', X_test.shape) 48 | 49 | print('Train...') 50 | model.fit(X_train, y_train, batch_size=batch_size, epochs=15, 51 | validation_data=(X_test, y_test)) 52 | score, acc = model.evaluate(X_test, y_test, 53 | batch_size=batch_size) 54 | print('Test score:', score) 55 | print('Test accuracy:', acc) 56 | -------------------------------------------------------------------------------- /qrnn/qrnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | import numpy as np 4 | 5 | from keras import backend as K 6 | from keras import activations, initializers, regularizers, constraints 7 | from keras.layers import Layer, InputSpec 8 | 9 | from keras.utils.conv_utils import conv_output_length 10 | 11 | import theano 12 | import theano.tensor as T 13 | 14 | 15 | def _dropout(x, level, noise_shape=None, seed=None): 16 | x = K.dropout(x, level, noise_shape, seed) 17 | x *= (1. - level) # compensate for the scaling by the dropout 18 | return x 19 | 20 | 21 | class QRNN(Layer): 22 | '''Quasi RNN 23 | 24 | # Arguments 25 | units: dimension of the internal projections and the final output.
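window_size: width of the causal convolution that computes the gates from the last `window_size` timesteps. The convolution yields a candidate z and gates f, o per timestep; the fo-pooling recurrence is h_t = f_t * h_{t-1} + (1 - f_t) * z_t, and the emitted output is o_t * h_t (see `step` below).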
26 | 27 | # References 28 | - [Quasi-recurrent Neural Networks](http://arxiv.org/abs/1611.01576) 29 | ''' 30 | def __init__(self, units, window_size=2, stride=1, 31 | return_sequences=False, go_backwards=False, 32 | stateful=False, unroll=False, activation='tanh', 33 | kernel_initializer='uniform', bias_initializer='zero', 34 | kernel_regularizer=None, bias_regularizer=None, 35 | activity_regularizer=None, 36 | kernel_constraint=None, bias_constraint=None, 37 | dropout=0, use_bias=True, input_dim=None, input_length=None, 38 | **kwargs): 39 | self.return_sequences = return_sequences 40 | self.go_backwards = go_backwards 41 | self.stateful = stateful 42 | self.unroll = unroll 43 | 44 | self.units = units 45 | self.window_size = window_size 46 | self.strides = (stride, 1) 47 | 48 | self.use_bias = use_bias 49 | self.activation = activations.get(activation) 50 | self.kernel_initializer = initializers.get(kernel_initializer) 51 | self.bias_initializer = initializers.get(bias_initializer) 52 | self.kernel_regularizer = regularizers.get(kernel_regularizer) 53 | self.bias_regularizer = regularizers.get(bias_regularizer) 54 | self.activity_regularizer = regularizers.get(activity_regularizer) 55 | self.kernel_constraint = constraints.get(kernel_constraint) 56 | self.bias_constraint = constraints.get(bias_constraint) 57 | 58 | self.recurrent_dropout = 0 #not used, added to maintain compatibility with keras.Bidirectional 59 | self.dropout = dropout 60 | self.supports_masking = True 61 | self.input_spec = [InputSpec(ndim=3)] 62 | self.input_dim = input_dim 63 | self.input_length = input_length 64 | if self.input_dim: 65 | kwargs['input_shape'] = (self.input_length, self.input_dim) 66 | super(QRNN, self).__init__(**kwargs) 67 | 68 | def build(self, input_shape): 69 | if isinstance(input_shape, list): 70 | input_shape = input_shape[0] 71 | 72 | batch_size = input_shape[0] if self.stateful else None 73 | self.input_dim = input_shape[2] 74 | self.input_spec = InputSpec(shape=(batch_size, None, self.input_dim)) 75 | self.state_spec = InputSpec(shape=(batch_size, self.units)) 76 | 77 | self.states = [None] 78 | if self.stateful: 79 | self.reset_states() 80 | 81 | kernel_shape = (self.window_size, 1, self.input_dim, self.units * 3) 82 | self.kernel = self.add_weight(name='kernel', 83 | shape=kernel_shape, 84 | initializer=self.kernel_initializer, 85 | regularizer=self.kernel_regularizer, 86 | constraint=self.kernel_constraint) 87 | if self.use_bias: 88 | self.bias = self.add_weight(name='bias', 89 | shape=(self.units * 3,), 90 | initializer=self.bias_initializer, 91 | regularizer=self.bias_regularizer, 92 | constraint=self.bias_constraint) 93 | 94 | self.built = True 95 | 96 | def compute_output_shape(self, input_shape): 97 | if isinstance(input_shape, list): 98 | input_shape = input_shape[0] 99 | 100 | length = input_shape[1] 101 | if length: 102 | length = conv_output_length(length + self.window_size - 1, 103 | self.window_size, 'valid', 104 | self.strides[0]) 105 | if self.return_sequences: 106 | return (input_shape[0], length, self.units) 107 | else: 108 | return (input_shape[0], self.units) 109 | 110 | def compute_mask(self, inputs, mask): 111 | if self.return_sequences: 112 | return mask 113 | else: 114 | return None 115 | 116 | def get_initial_states(self, inputs): 117 | # build an all-zero tensor of shape (samples, units) 118 | initial_state = K.zeros_like(inputs) # (samples, timesteps, input_dim) 119 | initial_state = K.sum(initial_state, axis=(1, 2)) # (samples,) 120 | initial_state = 
K.expand_dims(initial_state) # (samples, 1) 121 | initial_state = K.tile(initial_state, [1, self.units]) # (samples, units) 122 | initial_states = [initial_state for _ in range(len(self.states))] 123 | return initial_states 124 | 125 | def reset_states(self, states=None): 126 | if not self.stateful: 127 | raise AttributeError('Layer must be stateful.') 128 | if not self.input_spec: 129 | raise RuntimeError('Layer has never been called ' 130 | 'and thus has no states.') 131 | 132 | batch_size = self.input_spec.shape[0] 133 | if not batch_size: 134 | raise ValueError('If a QRNN is stateful, it needs to know ' 135 | 'its batch size. Specify the batch size ' 136 | 'of your input tensors: \n' 137 | '- If using a Sequential model, ' 138 | 'specify the batch size by passing ' 139 | 'a `batch_input_shape` ' 140 | 'argument to your first layer.\n' 141 | '- If using the functional API, specify ' 142 | 'the time dimension by passing a ' 143 | '`batch_shape` argument to your Input layer.') 144 | 145 | if self.states[0] is None: 146 | self.states = [K.zeros((batch_size, self.units)) 147 | for _ in self.states] 148 | elif states is None: 149 | for state in self.states: 150 | K.set_value(state, np.zeros((batch_size, self.units))) 151 | else: 152 | if not isinstance(states, (list, tuple)): 153 | states = [states] 154 | if len(states) != len(self.states): 155 | raise ValueError('Layer ' + self.name + ' expects ' + 156 | str(len(self.states)) + ' states, ' 157 | 'but it received ' + str(len(states)) + 158 | 'state values. Input received: ' + 159 | str(states)) 160 | for index, (value, state) in enumerate(zip(states, self.states)): 161 | if value.shape != (batch_size, self.units): 162 | raise ValueError('State ' + str(index) + 163 | ' is incompatible with layer ' + 164 | self.name + ': expected shape=' + 165 | str((batch_size, self.units)) + 166 | ', found shape=' + str(value.shape)) 167 | K.set_value(state, value) 168 | 169 | def __call__(self, inputs, initial_state=None, **kwargs): 170 | # If `initial_state` is specified, 171 | # and if it a Keras tensor, 172 | # then add it to the inputs and temporarily 173 | # modify the input spec to include the state. 174 | if initial_state is not None: 175 | if hasattr(initial_state, '_keras_history'): 176 | # Compute the full input spec, including state 177 | input_spec = self.input_spec 178 | state_spec = self.state_spec 179 | if not isinstance(state_spec, list): 180 | state_spec = [state_spec] 181 | self.input_spec = [input_spec] + state_spec 182 | 183 | # Compute the full inputs, including state 184 | if not isinstance(initial_state, (list, tuple)): 185 | initial_state = [initial_state] 186 | inputs = [inputs] + list(initial_state) 187 | 188 | # Perform the call 189 | output = super(QRNN, self).__call__(inputs, **kwargs) 190 | 191 | # Restore original input spec 192 | self.input_spec = input_spec 193 | return output 194 | else: 195 | kwargs['initial_state'] = initial_state 196 | return super(QRNN, self).__call__(inputs, **kwargs) 197 | 198 | def call(self, inputs, mask=None, initial_state=None, training=None): 199 | # input shape: `(samples, time (padded with zeros), input_dim)` 200 | # note that the .build() method of subclasses MUST define 201 | # self.input_spec and self.state_spec with complete input shapes. 
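# `preprocess_input` (called below) turns the sequence into per-timestep (z, f, o) pre-activations via a causal convolution, so K.rnn only has to scan `step` over time with a single hidden state.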
202 | if isinstance(inputs, list): 203 | initial_states = inputs[1:] 204 | inputs = inputs[0] 205 | elif initial_state is not None: 206 | pass 207 | elif self.stateful: 208 | initial_states = self.states 209 | else: 210 | initial_states = self.get_initial_states(inputs) 211 | 212 | if len(initial_states) != len(self.states): 213 | raise ValueError('Layer has ' + str(len(self.states)) + 214 | ' states but was passed ' + 215 | str(len(initial_states)) + 216 | ' initial states.') 217 | input_shape = K.int_shape(inputs) 218 | if self.unroll and input_shape[1] is None: 219 | raise ValueError('Cannot unroll a RNN if the ' 220 | 'time dimension is undefined. \n' 221 | '- If using a Sequential model, ' 222 | 'specify the time dimension by passing ' 223 | 'an `input_shape` or `batch_input_shape` ' 224 | 'argument to your first layer. If your ' 225 | 'first layer is an Embedding, you can ' 226 | 'also use the `input_length` argument.\n' 227 | '- If using the functional API, specify ' 228 | 'the time dimension by passing a `shape` ' 229 | 'or `batch_shape` argument to your Input layer.') 230 | constants = self.get_constants(inputs, training=None) 231 | preprocessed_input = self.preprocess_input(inputs, training=None) 232 | 233 | last_output, outputs, states = K.rnn(self.step, preprocessed_input, 234 | initial_states, 235 | go_backwards=self.go_backwards, 236 | mask=mask, 237 | constants=constants, 238 | unroll=self.unroll, 239 | input_length=input_shape[1]) 240 | if self.stateful: 241 | updates = [] 242 | for i in range(len(states)): 243 | updates.append((self.states[i], states[i])) 244 | self.add_update(updates, inputs) 245 | 246 | # Properly set learning phase 247 | if 0 < self.dropout < 1: 248 | last_output._uses_learning_phase = True 249 | outputs._uses_learning_phase = True 250 | 251 | if self.return_sequences: 252 | return outputs 253 | else: 254 | return last_output 255 | 256 | def preprocess_input(self, inputs, training=None): 257 | if self.window_size > 1: 258 | inputs = K.temporal_padding(inputs, (self.window_size-1, 0)) 259 | inputs = K.expand_dims(inputs, 2) # add a dummy dimension 260 | 261 | output = K.conv2d(inputs, self.kernel, strides=self.strides, 262 | padding='valid', 263 | data_format='channels_last') 264 | output = K.squeeze(output, 2) # remove the dummy dimension 265 | if self.use_bias: 266 | output = K.bias_add(output, self.bias, data_format='channels_last') 267 | 268 | if self.dropout is not None and 0. < self.dropout < 1.: 269 | z = output[:, :, :self.units] 270 | f = output[:, :, self.units:2 * self.units] 271 | o = output[:, :, 2 * self.units:] 272 | f = K.in_train_phase(1 - _dropout(1 - f, self.dropout), f, training=training) 273 | return K.concatenate([z, f, o], -1) 274 | else: 275 | return output 276 | 277 | def step(self, inputs, states): 278 | prev_output = states[0] 279 | 280 | z = inputs[:, :self.units] 281 | f = inputs[:, self.units:2 * self.units] 282 | o = inputs[:, 2 * self.units:] 283 | 284 | z = self.activation(z) 285 | f = f if self.dropout is not None and 0. < self.dropout < 1. 
else K.sigmoid(f) 286 | o = K.sigmoid(o) 287 | 288 | output = f * prev_output + (1 - f) * z 289 | output = o * output 290 | 291 | return output, [output] 292 | 293 | def get_constants(self, inputs, training=None): 294 | return [] 295 | 296 | def get_config(self): 297 | config = {'units': self.units, 298 | 'window_size': self.window_size, 299 | 'stride': self.strides[0], 300 | 'return_sequences': self.return_sequences, 301 | 'go_backwards': self.go_backwards, 302 | 'stateful': self.stateful, 303 | 'unroll': self.unroll, 304 | 'use_bias': self.use_bias, 305 | 'dropout': self.dropout, 306 | 'activation': activations.serialize(self.activation), 307 | 'kernel_initializer': initializers.serialize(self.kernel_initializer), 308 | 'bias_initializer': initializers.serialize(self.bias_initializer), 309 | 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), 310 | 'bias_regularizer': regularizers.serialize(self.bias_regularizer), 311 | 'activity_regularizer': regularizers.serialize(self.activity_regularizer), 312 | 'kernel_constraint': constraints.serialize(self.kernel_constraint), 313 | 'bias_constraint': constraints.serialize(self.bias_constraint), 314 | 'input_dim': self.input_dim, 315 | 'input_length': self.input_length} 316 | base_config = super(QRNN, self).get_config() 317 | return dict(list(base_config.items()) + list(config.items())) 318 | -------------------------------------------------------------------------------- /senet/README.md: -------------------------------------------------------------------------------- 1 | Squeeze-and-Excitation wrapper (a simple gating mechanism) for 2D convolution (4D data). 2 | 3 | ## References: 4 | * Hu et al. [Squeeze-and-Excitation Networks](https://arxiv.org/abs/1709.01507) 5 | * Official Repo (Caffe): https://github.com/hujie-frank/SENet 6 | -------------------------------------------------------------------------------- /senet/layers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from keras import backend as K 3 | from keras import initializers, constraints, regularizers 4 | from keras.layers import InputSpec, Layer 5 | 6 | from keras.utils import conv_utils 7 | 8 | 9 | class SE(Layer): 10 | def __init__(self, 11 | ratio, 12 | data_format=None, 13 | use_bias=True, 14 | kernel_initializer='glorot_uniform', 15 | bias_initializer='zeros', 16 | kernel_regularizer=None, 17 | bias_regularizer=None, 18 | activity_regularizer=None, 19 | kernel_constraint=None, 20 | bias_constraint=None, 21 | **kwargs): 22 | super(SE, self).__init__(**kwargs) 23 | 24 | self.ratio = ratio 25 | self.data_format = conv_utils.normalize_data_format(data_format) 26 | 27 | self.use_bias = use_bias 28 | self.kernel_initializer = initializers.get(kernel_initializer) 29 | self.bias_initializer = initializers.get(bias_initializer) 30 | self.kernel_regularizer = regularizers.get(kernel_regularizer) 31 | self.bias_regularizer = regularizers.get(bias_regularizer) 32 | self.activity_regularizer = regularizers.get(activity_regularizer) 33 | self.kernel_constraint = constraints.get(kernel_constraint) 34 | self.bias_constraint = constraints.get(bias_constraint) 35 | self.supports_masking = True 36 | 37 | def build(self, input_shape): 38 | assert len(input_shape) == 4 39 | self.input_spec = InputSpec(shape=input_shape) 40 | 41 | if self.data_format == 'channels_first': 42 | channel_axis = 1 43 | else: 44 | channel_axis = 3 45 | channels = input_shape[channel_axis] 46 | 47 | self.kernel1 =
self.add_weight(shape=(channels, channels // self.ratio), 48 | initializer=self.kernel_initializer, 49 | name='kernel1', 50 | regularizer=self.kernel_regularizer, 51 | constraint=self.kernel_constraint) 52 | if self.use_bias: 53 | self.bias1 = self.add_weight(shape=(channels // self.ratio,), 54 | initializer=self.bias_initializer, 55 | name='bias1', 56 | regularizer=self.bias_regularizer, 57 | constraint=self.bias_constraint) 58 | else: 59 | self.bias1 = None 60 | 61 | self.kernel2 = self.add_weight(shape=(channels // self.ratio, channels), 62 | initializer=self.kernel_initializer, 63 | name='kernel2', 64 | regularizer=self.kernel_regularizer, 65 | constraint=self.kernel_constraint) 66 | if self.use_bias: 67 | self.bias2 = self.add_weight(shape=(channels,), 68 | initializer=self.bias_initializer, 69 | name='bias2', 70 | regularizer=self.bias_regularizer, 71 | constraint=self.bias_constraint) 72 | else: 73 | self.bias2 = None 74 | 75 | self.built = True 76 | 77 | def compute_output_shape(self, input_shape): 78 | return input_shape 79 | 80 | def call(self, inputs): 81 | if self.data_format == 'channels_first': 82 | sq = K.mean(inputs, [2, 3]) 83 | else: 84 | sq = K.mean(inputs, [1, 2]) 85 | 86 | ex = K.dot(sq, self.kernel1) 87 | if self.use_bias: 88 | ex = K.bias_add(ex, self.bias1) 89 | ex= K.relu(ex) 90 | 91 | ex = K.dot(ex, self.kernel2) 92 | if self.use_bias: 93 | ex = K.bias_add(ex, self.bias2) 94 | ex= K.sigmoid(ex) 95 | 96 | if self.data_format == 'channels_first': 97 | ex = K.expand_dims(ex, -1) 98 | ex = K.expand_dims(ex, -1) 99 | else: 100 | ex = K.expand_dims(ex, 1) 101 | ex = K.expand_dims(ex, 1) 102 | 103 | return inputs * ex 104 | 105 | def get_config(self): 106 | config = { 107 | 'ratio': self.ratio, 108 | 'data_format': self.data_format, 109 | 'use_bias': self.use_bias, 110 | 'kernel_initializer': initializers.serialize(self.kernel_initializer), 111 | 'bias_initializer': initializers.serialize(self.bias_initializer), 112 | 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), 113 | 'bias_regularizer': regularizers.serialize(self.bias_regularizer), 114 | 'kernel_constraint': constraints.serialize(self.kernel_constraint), 115 | 'bias_constraint': constraints.serialize(self.bias_constraint) 116 | } 117 | base_config = super(SE, self).get_config() 118 | return dict(list(base_config.items()) + list(config.items())) 119 | -------------------------------------------------------------------------------- /senet/mnist_cnn.py: -------------------------------------------------------------------------------- 1 | '''Trains a simple convnet on the MNIST dataset. 
2 | 3 | Gets to 98.96% test accuracy after 12 epochs 4 | ''' 5 | 6 | from __future__ import print_function 7 | import keras 8 | from keras.datasets import mnist 9 | from keras.models import Sequential 10 | from keras.layers import Dense, Dropout, Flatten 11 | from keras.layers import Conv2D, MaxPooling2D 12 | from keras import backend as K 13 | 14 | from layers import SE 15 | 16 | 17 | batch_size = 128 18 | num_classes = 10 19 | epochs = 12 20 | 21 | ratio = 4 22 | 23 | # input image dimensions 24 | img_rows, img_cols = 28, 28 25 | 26 | # the data, shuffled and split between train and test sets 27 | (x_train, y_train), (x_test, y_test) = mnist.load_data() 28 | 29 | if K.image_data_format() == 'channels_first': 30 | x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) 31 | x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols) 32 | input_shape = (1, img_rows, img_cols) 33 | else: 34 | x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) 35 | x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) 36 | input_shape = (img_rows, img_cols, 1) 37 | 38 | x_train = x_train.astype('float32') 39 | x_test = x_test.astype('float32') 40 | x_train /= 255 41 | x_test /= 255 42 | print('x_train shape:', x_train.shape) 43 | print(x_train.shape[0], 'train samples') 44 | print(x_test.shape[0], 'test samples') 45 | 46 | # convert class vectors to binary class matrices 47 | y_train = keras.utils.to_categorical(y_train, num_classes) 48 | y_test = keras.utils.to_categorical(y_test, num_classes) 49 | 50 | model = Sequential() 51 | model.add(Conv2D(32, kernel_size=(3, 3), 52 | activation='relu', 53 | input_shape=input_shape)) 54 | model.add(SE(ratio)) 55 | model.add(Conv2D(64, (3, 3), activation='relu')) 56 | model.add(SE(ratio)) 57 | model.add(MaxPooling2D(pool_size=(2, 2))) 58 | model.add(Dropout(0.25)) 59 | model.add(Flatten()) 60 | model.add(Dense(128, activation='relu')) 61 | model.add(Dropout(0.5)) 62 | model.add(Dense(num_classes, activation='softmax')) 63 | 64 | model.compile(loss=keras.losses.categorical_crossentropy, 65 | optimizer=keras.optimizers.Adadelta(), 66 | metrics=['accuracy']) 67 | 68 | model.fit(x_train, y_train, 69 | batch_size=batch_size, 70 | epochs=epochs, 71 | verbose=1, 72 | validation_data=(x_test, y_test)) 73 | score = model.evaluate(x_test, y_test, verbose=0) 74 | print('Test loss:', score[0]) 75 | print('Test accuracy:', score[1]) 76 | -------------------------------------------------------------------------------- /ternarynet/README.md: -------------------------------------------------------------------------------- 1 | Ternary Recurrent Networks for Keras. 2 | 3 | ## run 4 | ### MLP 5 | python mnist_mlp.py 6 | 7 | ### CNN 8 | python mnist_cnn.py 9 | 10 | ### RNN 11 | python imdb_lm.py 12 | 13 | ## References 14 | * Ott et al. [Recurrent Neural Networks with Limited Numerical Precision](http://arxiv.org/abs/1608.06902) 15 | * Li et al.
[Ternary Weight Networks](http://arxiv.org/abs/1605.04711) 16 | -------------------------------------------------------------------------------- /ternarynet/imdb_generator.py: -------------------------------------------------------------------------------- 1 | ../gcnn/imdb_generator.py -------------------------------------------------------------------------------- /ternarynet/imdb_lm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | '''Simple RNN for Language Model 3 | ''' 4 | from __future__ import print_function 5 | import os 6 | 7 | from keras.models import Model 8 | from keras.layers import Input, Embedding, Dense, TimeDistributed 9 | from keras.optimizers import * 10 | 11 | from ternary_layers import * 12 | from imdb_generator import IMDBLM 13 | 14 | 15 | def LM(batch_size, window_size=3, vocsize=20000, embed_dim=20, hidden_dim=30, nb_layers=1): 16 | x = Input(batch_shape=(batch_size, None)) 17 | # embedding 18 | y = Embedding(vocsize+2, embed_dim, mask_zero=False)(x) 19 | for i in range(nb_layers-1): 20 | y = TernaryRNN(hidden_dim, return_sequences=True, name='trnn{}'.format(i + 1))(y) 21 | y = TernaryRNN(hidden_dim, return_sequences=True, name='trnn{}'.format(nb_layers))(y) 22 | y = TimeDistributed(Dense(vocsize+2, activation='softmax', name='dense{}'.format(nb_layers)))(y) 23 | 24 | model = Model(inputs=x, outputs=y) 25 | 26 | return model 27 | 28 | 29 | def train_model(): 30 | batch_size = 32 31 | epochs = 100 32 | 33 | vocsize = 2000 # top 2k 34 | max_len = 30 35 | train_ratio = 0.99 36 | 37 | # Build model 38 | model = LM(batch_size, vocsize=vocsize, nb_layers=1) 39 | model.compile(optimizer='adam', 40 | loss='sparse_categorical_crossentropy') 41 | 42 | # Prepare data 43 | path = './data/imdb-full.pkl' 44 | # Train 45 | train_gen = IMDBLM(path=path, max_len=max_len, vocab_size=vocsize, shuffle=True, 46 | which_set='train', train_ratio=train_ratio, batch_size=batch_size) 47 | # Validation 48 | val_gen = IMDBLM(path=path, max_len=max_len, vocab_size=vocsize, 49 | which_set='validation', train_ratio=train_ratio, batch_size=batch_size) 50 | 51 | train_steps = 2000 52 | val_steps = 200 53 | 54 | # Start training 55 | model.summary() 56 | model.fit_generator(train_gen(), steps_per_epoch=train_steps, 57 | validation_data=val_gen(), validation_steps=val_steps, 58 | epochs=epochs, verbose=1) 59 | 60 | 61 | def run_demo(): 62 | train_model() 63 | 64 | 65 | if __name__ == '__main__': 66 | run_demo() 67 | -------------------------------------------------------------------------------- /ternarynet/mnist_cnn.py: -------------------------------------------------------------------------------- 1 | '''Trains a simple ternarized CNN on the MNIST dataset.
2 | Modified from keras' examples/mnist_mlp.py 3 | Gets to % test accuracy after 20 epochs using tensorflow backend 4 | ''' 5 | 6 | from __future__ import print_function 7 | import numpy as np 8 | np.random.seed(1337) # for reproducibility 9 | 10 | from keras.datasets import mnist 11 | from keras.models import Sequential 12 | from keras.layers import Dense, Dropout, Activation, BatchNormalization, MaxPooling2D 13 | from keras.layers import Flatten 14 | from keras.optimizers import SGD, Adam, RMSprop 15 | from keras.callbacks import LearningRateScheduler 16 | from keras.utils import np_utils 17 | import keras.backend as K 18 | 19 | K.set_image_data_format('channels_first') 20 | 21 | from ternary_ops import ternarize 22 | from ternary_layers import TernaryDense, TernaryConv2D 23 | 24 | 25 | def ternary_tanh(x): 26 | x = K.clip(x, -1, 1) 27 | return ternarize(x) 28 | 29 | H = 1. 30 | kernel_lr_multiplier = 'Glorot' 31 | 32 | # nn 33 | batch_size = 50 34 | epochs = 20 35 | nb_channel = 1 36 | img_rows = 28 37 | img_cols = 28 38 | nb_filters = 32 39 | nb_conv = 3 40 | nb_pool = 2 41 | nb_hid = 128 42 | nb_classes = 10 43 | use_bias = False 44 | 45 | # learning rate schedule 46 | lr_start = 1e-3 47 | lr_end = 1e-4 48 | lr_decay = (lr_end / lr_start)**(1. / epochs) 49 | 50 | # BN 51 | epsilon = 1e-6 52 | momentum = 0.9 53 | 54 | # dropout 55 | p1 = 0.25 56 | p2 = 0.5 57 | 58 | # the data, shuffled and split between train and test sets 59 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 60 | 61 | X_train = X_train.reshape(60000, 1, 28, 28) 62 | X_test = X_test.reshape(10000, 1, 28, 28) 63 | X_train = X_train.astype('float32') 64 | X_test = X_test.astype('float32') 65 | X_train /= 255 66 | X_test /= 255 67 | print(X_train.shape[0], 'train samples') 68 | print(X_test.shape[0], 'test samples') 69 | 70 | # convert class vectors to binary class matrices 71 | Y_train = np_utils.to_categorical(y_train, nb_classes) * 2 - 1 # -1 or 1 for hinge loss 72 | Y_test = np_utils.to_categorical(y_test, nb_classes) * 2 - 1 73 | 74 | 75 | model = Sequential() 76 | # conv1 77 | model.add(TernaryConv2D(128, kernel_size=(3, 3), input_shape=(nb_channel, img_rows, img_cols), 78 | H=H, kernel_lr_multiplier=kernel_lr_multiplier, 79 | padding='same', use_bias=use_bias, name='conv1')) 80 | model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, axis=1, name='bn1')) 81 | model.add(Activation(ternary_tanh, name='act1')) 82 | # conv2 83 | model.add(TernaryConv2D(128, kernel_size=(3, 3), H=H, kernel_lr_multiplier=kernel_lr_multiplier, 84 | padding='same', use_bias=use_bias, name='conv2')) 85 | model.add(MaxPooling2D(pool_size=(2, 2), name='pool2')) 86 | model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, axis=1, name='bn2')) 87 | model.add(Activation(ternary_tanh, name='act2')) 88 | # conv3 89 | model.add(TernaryConv2D(256, kernel_size=(3, 3,), H=H, kernel_lr_multiplier=kernel_lr_multiplier, 90 | padding='same', use_bias=use_bias, name='conv3')) 91 | model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, axis=1, name='bn3')) 92 | model.add(Activation(ternary_tanh, name='act3')) 93 | # conv4 94 | model.add(TernaryConv2D(256, kernel_size=(3, 3,), H=H, kernel_lr_multiplier=kernel_lr_multiplier, 95 | padding='same', use_bias=use_bias, name='conv4')) 96 | model.add(MaxPooling2D(pool_size=(2, 2), name='pool4')) 97 | model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, axis=1, name='bn4')) 98 | model.add(Activation(ternary_tanh, name='act4')) 99 | model.add(Flatten()) 100 | # dense1 
101 | model.add(TernaryDense(1024, H=H, kernel_lr_multiplier=kernel_lr_multiplier, use_bias=use_bias, name='dense5')) 102 | model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, name='bn5')) 103 | model.add(Activation(ternary_tanh, name='act5')) 104 | # dense2 105 | model.add(TernaryDense(nb_classes, H=H, kernel_lr_multiplier=kernel_lr_multiplier, use_bias=use_bias, name='dense6')) 106 | model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, name='bn6')) 107 | 108 | opt = Adam(lr=lr_start) 109 | model.compile(loss='squared_hinge', optimizer=opt, metrics=['acc']) 110 | model.summary() 111 | 112 | lr_scheduler = LearningRateScheduler(lambda e: lr_start * lr_decay ** e) 113 | history = model.fit(X_train, Y_train, 114 | batch_size=batch_size, epochs=epochs, 115 | verbose=1, validation_data=(X_test, Y_test), 116 | callbacks=[lr_scheduler]) 117 | score = model.evaluate(X_test, Y_test, verbose=0) 118 | print('Test score:', score[0]) 119 | print('Test accuracy:', score[1]) 120 | -------------------------------------------------------------------------------- /ternarynet/mnist_mlp.py: -------------------------------------------------------------------------------- 1 | '''Trains a simple ternarized fully connected NN on the MNIST dataset. 2 | Modified from keras' examples/mnist_mlp.py 3 | Gets to 98.10% test accuracy after 20 epochs using theano backend 4 | ''' 5 | 6 | 7 | from __future__ import print_function 8 | import numpy as np 9 | np.random.seed(1337) # for reproducibility 10 | 11 | import keras.backend as K 12 | from keras.datasets import mnist 13 | from keras.models import Sequential 14 | from keras.layers import Dense, Dropout, Activation, BatchNormalization 15 | from keras.optimizers import SGD, Adam, RMSprop 16 | from keras.callbacks import LearningRateScheduler 17 | from keras.utils import np_utils 18 | 19 | from ternary_ops import ternarize 20 | from ternary_layers import TernaryDense 21 | 22 | 23 | class DropoutNoScale(Dropout): 24 | '''Keras Dropout scales the input in the training phase, which is undesirable here. 25 | ''' 26 | def call(self, inputs, mask=None): 27 | if 0. < self.rate < 1.: 28 | noise_shape = self._get_noise_shape(inputs) 29 | inputs = K.in_train_phase( 30 | K.dropout(inputs, self.rate, noise_shape) * (1. - self.rate), 31 | inputs) # multiplied by (1. - self.rate) for compensation 32 | return inputs 33 | 34 | 35 | def ternary_tanh(x): 36 | x = K.clip(x, -1, 1) 37 | return ternarize(x) 38 | 39 | 40 | batch_size = 100 41 | epochs = 20 42 | nb_classes = 10 43 | 44 | H = 'Glorot' 45 | kernel_lr_multiplier = 'Glorot' 46 | 47 | # network 48 | num_unit = 2048 49 | num_hidden = 3 50 | use_bias = False 51 | 52 | # learning rate schedule 53 | lr_start = 1e-3 54 | lr_end = 1e-4 55 | lr_decay = (lr_end / lr_start)**(1.
/ epochs) 56 | 57 | # BN 58 | epsilon = 1e-6 59 | momentum = 0.9 60 | 61 | # dropout 62 | drop_in = 0.2 63 | drop_hidden = 0.5 64 | 65 | # the data, shuffled and split between train and test sets 66 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 67 | 68 | X_train = X_train.reshape(60000, 784) 69 | X_test = X_test.reshape(10000, 784) 70 | X_train = X_train.astype('float32') 71 | X_test = X_test.astype('float32') 72 | X_train /= 255 73 | X_test /= 255 74 | print(X_train.shape[0], 'train samples') 75 | print(X_test.shape[0], 'test samples') 76 | 77 | # convert class vectors to binary class matrices 78 | Y_train = np_utils.to_categorical(y_train, nb_classes) * 2 - 1 # -1 or 1 for hinge loss 79 | Y_test = np_utils.to_categorical(y_test, nb_classes) * 2 - 1 80 | 81 | model = Sequential() 82 | model.add(DropoutNoScale(drop_in, input_shape=(784,), name='drop0')) 83 | for i in range(num_hidden): 84 | model.add(TernaryDense(num_unit, H=H, kernel_lr_multiplier=kernel_lr_multiplier, use_bias=use_bias, 85 | name='dense{}'.format(i+1))) 86 | model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, name='bn{}'.format(i+1))) 87 | model.add(Activation(ternary_tanh, name='act{}'.format(i+1))) 88 | model.add(DropoutNoScale(drop_hidden, name='drop{}'.format(i+1))) 89 | model.add(TernaryDense(10, H=H, kernel_lr_multiplier=kernel_lr_multiplier, use_bias=use_bias, 90 | name='dense')) 91 | model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, name='bn')) 92 | 93 | model.summary() 94 | 95 | opt = Adam(lr=lr_start) 96 | model.compile(loss='squared_hinge', optimizer=opt, metrics=['acc']) 97 | 98 | lr_scheduler = LearningRateScheduler(lambda e: lr_start * lr_decay ** e) 99 | history = model.fit(X_train, Y_train, 100 | batch_size=batch_size, epochs=epochs, 101 | verbose=1, validation_data=(X_test, Y_test), 102 | callbacks=[lr_scheduler]) 103 | score = model.evaluate(X_test, Y_test, verbose=0) 104 | print('Test score:', score[0]) 105 | print('Test accuracy:', score[1]) 106 | -------------------------------------------------------------------------------- /ternarynet/ternary_layers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | from keras import backend as K 5 | 6 | from keras.layers import InputSpec, Dense, Conv2D, SimpleRNN 7 | from keras import constraints 8 | from keras import initializers 9 | 10 | from ternary_ops import ternarize as ternarize, ternarize_dot 11 | 12 | 13 | class Clip(constraints.Constraint): 14 | def __init__(self, min_value, max_value=None): 15 | self.min_value = min_value 16 | self.max_value = max_value 17 | if not self.max_value: 18 | self.max_value = -self.min_value 19 | if self.min_value > self.max_value: 20 | self.min_value, self.max_value = self.max_value, self.min_value 21 | 22 | def __call__(self, p): 23 | return K.clip(p, self.min_value, self.max_value) 24 | 25 | def get_config(self): 26 | return {"min_value": self.min_value, 27 | "max_value": self.max_value} 28 | 29 | 30 | class TernaryDense(Dense): 31 | ''' Ternarized Dense layer 32 | 33 | References: 34 | - [Recurrent Neural Networks with Limited Numerical Precision](http://arxiv.org/abs/1608.06902} 35 | - [Ternary Weight Networks](http://arxiv.org/abs/1605.04711) 36 | ''' 37 | def __init__(self, units, H=1., kernel_lr_multiplier='Glorot', bias_lr_multiplier=None, **kwargs): 38 | super(TernaryDense, self).__init__(units, **kwargs) 39 | self.H = H 40 | self.kernel_lr_multiplier = kernel_lr_multiplier 41 | 
self.bias_lr_multiplier = bias_lr_multiplier 42 | 43 | 44 | def build(self, input_shape): 45 | assert len(input_shape) >= 2 46 | input_dim = input_shape[1] 47 | 48 | if self.H == 'Glorot': 49 | self.H = np.float32(np.sqrt(1.5 / (input_dim + self.units))) 50 | #print('Glorot H: {}'.format(self.H)) 51 | if self.kernel_lr_multiplier == 'Glorot': 52 | self.kernel_lr_multiplier = np.float32(1. / np.sqrt(1.5 / (input_dim + self.units))) 53 | #print('Glorot learning rate multiplier: {}'.format(self.kernel_lr_multiplier)) 54 | 55 | self.kernel_constraint = Clip(-self.H, self.H) 56 | self.kernel_initializer = initializers.RandomUniform(-self.H, self.H) 57 | self.kernel = self.add_weight(shape=(input_dim, self.units), 58 | initializer=self.kernel_initializer, 59 | name='kernel', 60 | regularizer=self.kernel_regularizer, 61 | constraint=self.kernel_constraint) 62 | 63 | if self.use_bias: 64 | self.lr_multipliers = [self.kernel_lr_multiplier, self.bias_lr_multiplier] 65 | self.bias = self.add_weight(shape=(self.units,), 66 | initializer=self.bias_initializer, 67 | name='bias', 68 | regularizer=self.bias_regularizer, 69 | constraint=self.bias_constraint) 70 | else: 71 | self.lr_multipliers = [self.kernel_lr_multiplier] 72 | self.bias = None 73 | 74 | self.input_spec = InputSpec(min_ndim=2, axes={-1: input_dim}) 75 | self.built = True 76 | 77 | def call(self, inputs): 78 | ternary_kernel = ternarize(self.kernel, H=self.H) 79 | output = K.dot(inputs, ternary_kernel) 80 | if self.use_bias: 81 | output = K.bias_add(output, self.bias) 82 | if self.activation is not None: 83 | output = self.activation(output) 84 | return output 85 | 86 | def get_config(self): 87 | config = {'H': self.H, 88 | 'kernel_lr_multiplier': self.kernel_lr_multiplier, 89 | 'bias_lr_multiplier': self.bias_lr_multiplier} 90 | base_config = super(TernaryDense, self).get_config() 91 | return dict(list(base_config.items()) + list(config.items())) 92 | 93 | 94 | class TernaryConv2D(Conv2D): 95 | '''Ternarized Convolution2D layer 96 | References: 97 | - [Recurrent Neural Networks with Limited Numerical Precision](http://arxiv.org/abs/1608.06902) 98 | - [Ternary Weight Networks](http://arxiv.org/abs/1605.04711) 99 | ''' 100 | def __init__(self, filters, kernel_lr_multiplier='Glorot', 101 | bias_lr_multiplier=None, H=1., **kwargs): 102 | super(TernaryConv2D, self).__init__(filters, **kwargs) 103 | self.H = H 104 | self.kernel_lr_multiplier = kernel_lr_multiplier 105 | self.bias_lr_multiplier = bias_lr_multiplier 106 | 107 | def build(self, input_shape): 108 | if self.data_format == 'channels_first': 109 | channel_axis = 1 110 | else: 111 | channel_axis = -1 112 | if input_shape[channel_axis] is None: 113 | raise ValueError('The channel dimension of the inputs ' 114 | 'should be defined. Found `None`.') 115 | 116 | input_dim = input_shape[channel_axis] 117 | kernel_shape = self.kernel_size + (input_dim, self.filters) 118 | 119 | base = self.kernel_size[0] * self.kernel_size[1] 120 | if self.H == 'Glorot': 121 | nb_input = int(input_dim * base) 122 | nb_output = int(self.filters * base) 123 | self.H = np.float32(np.sqrt(1.5 / (nb_input + nb_output))) 124 | #print('Glorot H: {}'.format(self.H)) 125 | 126 | if self.kernel_lr_multiplier == 'Glorot': 127 | nb_input = int(input_dim * base) 128 | nb_output = int(self.filters * base) 129 | self.kernel_lr_multiplier = np.float32(1.
/ np.sqrt(1.5 / (nb_input + nb_output))) 130 | #print('Glorot learning rate multiplier: {}'.format(self.kernel_lr_multiplier)) 131 | 132 | self.kernel_constraint = Clip(-self.H, self.H) 133 | self.kernel_initializer = initializers.RandomUniform(-self.H, self.H) 134 | self.kernel = self.add_weight(shape=kernel_shape, 135 | initializer=self.kernel_initializer, 136 | name='kernel', 137 | regularizer=self.kernel_regularizer, 138 | constraint=self.kernel_constraint) 139 | 140 | if self.use_bias: 141 | self.lr_multipliers = [self.kernel_lr_multiplier, self.bias_lr_multiplier] 142 | self.bias = self.add_weight((self.filters,), 143 | initializer=self.bias_initializer, 144 | name='bias', 145 | regularizer=self.bias_regularizer, 146 | constraint=self.bias_constraint) 147 | 148 | else: 149 | self.lr_multipliers = [self.kernel_lr_multiplier] 150 | self.bias = None 151 | 152 | # Set input spec. 153 | self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim}) 154 | self.built = True 155 | 156 | def call(self, inputs): 157 | ternary_kernel = ternarize(self.kernel, H=self.H) 158 | outputs = K.conv2d( 159 | inputs, 160 | ternary_kernel, 161 | strides=self.strides, 162 | padding=self.padding, 163 | data_format=self.data_format, 164 | dilation_rate=self.dilation_rate) 165 | 166 | if self.use_bias: 167 | outputs = K.bias_add( 168 | outputs, 169 | self.bias, 170 | data_format=self.data_format) 171 | 172 | if self.activation is not None: 173 | return self.activation(outputs) 174 | return outputs 175 | 176 | def get_config(self): 177 | config = {'H': self.H, 178 | 'kernel_lr_multiplier': self.kernel_lr_multiplier, 179 | 'bias_lr_multiplier': self.bias_lr_multiplier} 180 | base_config = super(TernaryConv2D, self).get_config() 181 | return dict(list(base_config.items()) + list(config.items())) 182 | 183 | 184 | class TernaryRNN(SimpleRNN): 185 | ''' Ternarized RNN layer 186 | 187 | References: 188 | - [Recurrent Neural Networks with Limited Numerical Precision](http://arxiv.org/abs/1608.06902) 189 | ''' 190 | def preprocess_input(self, inputs, training=None): 191 | return inputs 192 | 193 | def step(self, inputs, states): 194 | if 0 < self.dropout < 1: 195 | h = ternarize_dot(inputs * states[1], self.kernel) 196 | else: 197 | h = ternarize_dot(inputs, self.kernel) 198 | if self.bias is not None: 199 | h = K.bias_add(h, self.bias) 200 | 201 | prev_output = states[0] 202 | if 0 < self.recurrent_dropout < 1: 203 | prev_output *= states[2] 204 | output = h + ternarize_dot(prev_output, self.recurrent_kernel) 205 | if self.activation is not None: 206 | output = self.activation(output) 207 | 208 | # Properly set learning phase on output tensor.
209 | if 0 < self.dropout + self.recurrent_dropout: 210 | output._uses_learning_phase = True 211 | return output, [output] 212 | 213 | def get_constants(self, inputs, training=None): 214 | constants = [] 215 | if 0 < self.dropout < 1: 216 | input_shape = K.int_shape(inputs) 217 | input_dim = input_shape[-1] 218 | ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1))) 219 | ones = K.tile(ones, (1, int(input_dim))) 220 | 221 | def dropped_inputs(): 222 | return K.dropout(ones, self.dropout) 223 | 224 | dp_mask = K.in_train_phase(dropped_inputs, 225 | ones, 226 | training=training) 227 | constants.append(dp_mask) 228 | else: 229 | constants.append(K.cast_to_floatx(1.)) 230 | 231 | if 0 < self.recurrent_dropout < 1: 232 | ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1))) 233 | ones = K.tile(ones, (1, self.units)) 234 | 235 | def dropped_inputs(): 236 | return K.dropout(ones, self.recurrent_dropout) 237 | rec_dp_mask = K.in_train_phase(dropped_inputs, 238 | ones, 239 | training=training) 240 | constants.append(rec_dp_mask) 241 | else: 242 | constants.append(K.cast_to_floatx(1.)) 243 | return constants 244 | 245 | # Aliases 246 | 247 | TernaryConvolution2D = TernaryConv2D 248 | -------------------------------------------------------------------------------- /ternarynet/ternary_ops.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | import keras.backend as K 4 | 5 | 6 | def switch(condition, t, e): 7 | if K.backend() == 'tensorflow': 8 | import tensorflow as tf 9 | return tf.where(condition, t, e) 10 | elif K.backend() == 'theano': 11 | import theano.tensor as tt 12 | return tt.switch(condition, t, e) 13 | 14 | 15 | def _ternarize(W, H=1): 16 | '''The weights' ternarization function, 17 | 18 | # References: 19 | - [Recurrent Neural Networks with Limited Numerical Precision](http://arxiv.org/abs/1608.06902) 20 | - [Ternary Weight Networks](http://arxiv.org/abs/1605.04711) 21 | ''' 22 | W /= H 23 | 24 | ones = K.ones_like(W) 25 | zeros = K.zeros_like(W) 26 | Wt = switch(W > 0.5, ones, switch(W <= -0.5, -ones, zeros)) 27 | 28 | Wt *= H 29 | 30 | return Wt 31 | 32 | 33 | def ternarize(W, H=1): 34 | '''The weights' ternarization function, 35 | 36 | # References: 37 | - [Recurrent Neural Networks with Limited Numerical Precision](http://arxiv.org/abs/1608.06902) 38 | - [Ternary Weight Networks](http://arxiv.org/abs/1605.04711) 39 | ''' 40 | Wt = _ternarize(W, H) 41 | return W + K.stop_gradient(Wt - W) 42 | 43 | 44 | def ternarize_dot(x, W): 45 | '''For RNN (maybe Dense or Conv too). 46 | Refer to 'Recurrent Neural Networks with Limited Numerical Precision' Section 3.1 47 | ''' 48 | Wt = _ternarize(W) 49 | return K.dot(x, W) + K.stop_gradient(K.dot(x, Wt - W)) 50 | -------------------------------------------------------------------------------- /vae/README.md: -------------------------------------------------------------------------------- 1 | Simple keras implementation of Variational Auto-Encoder (VAE) on MNIST 2 | 3 | > This repo is with the [blog on VAE (in Chinese)](http://blog.csdn.net/jackytintin/article/details/53641885). 
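For quick reference, the loss optimized in `variational_autoencoder.py` is the usual negative ELBO for a diagonal-Gaussian posterior; the following is only a sketch matching the `kl_loss` plus `xent_loss`/`mse_loss` terms in that script, where $\mu$ and $\log\sigma^2$ stand for `z_mean` and `z_log_var` and $m$ is the latent dimension:

$$\mathcal{L}(x) = \underbrace{\mathrm{recon}(x, \hat{x})}_{\text{binary crossentropy or MSE, summed over the 784 pixels}} \; - \; \frac{1}{2}\sum_{j=1}^{m}\left(1 + \log\sigma_j^2 - \mu_j^2 - \sigma_j^2\right), \qquad z = \mu + \sigma \odot \epsilon,\ \epsilon \sim \mathcal{N}(0, I).$$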
4 | 5 | 6 | ## Learned latent space 7 | ### MSE loss 8 | ![](./img/z_mse.png) 9 | ### crossentropy loss 10 | ![](./img/z_xent.png) 11 | 12 | ## Learned manifold 13 | ### MSE loss 14 | ![](./img/x_mse.png) 15 | ### crossentropy loss 16 | ![](./img/x_xent.png) 17 | 18 | 19 | ## Imputation 20 | ### MSE loss 21 | ![](./img/i_mse.png) 22 | ### crossentropy loss 23 | ![](./img/i_xent.png) 24 | 25 | ## References: 26 | * Kingma et al. [Auto-Encoding Variational Bayes](https://arxiv.org/abs/1312.6114). 27 | -------------------------------------------------------------------------------- /vae/img/i_mse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DingKe/nn_playground/8254491a79bd0e02d392f250b7da07bccc7cea2a/vae/img/i_mse.png -------------------------------------------------------------------------------- /vae/img/i_xent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DingKe/nn_playground/8254491a79bd0e02d392f250b7da07bccc7cea2a/vae/img/i_xent.png -------------------------------------------------------------------------------- /vae/img/x_mse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DingKe/nn_playground/8254491a79bd0e02d392f250b7da07bccc7cea2a/vae/img/x_mse.png -------------------------------------------------------------------------------- /vae/img/x_xent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DingKe/nn_playground/8254491a79bd0e02d392f250b7da07bccc7cea2a/vae/img/x_xent.png -------------------------------------------------------------------------------- /vae/img/z_mse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DingKe/nn_playground/8254491a79bd0e02d392f250b7da07bccc7cea2a/vae/img/z_mse.png -------------------------------------------------------------------------------- /vae/img/z_xent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DingKe/nn_playground/8254491a79bd0e02d392f250b7da07bccc7cea2a/vae/img/z_xent.png -------------------------------------------------------------------------------- /vae/variational_autoencoder.py: -------------------------------------------------------------------------------- 1 | '''This script demonstrates how to build a variational autoencoder with Keras. 2 | 3 | Reference: "Auto-Encoding Variational Bayes" https://arxiv.org/abs/1312.6114 4 | ''' 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from scipy.stats import norm 8 | 9 | from keras.layers import Input, Dense, Lambda 10 | from keras.models import Model 11 | from keras.regularizers import l2 12 | from keras import backend as K 13 | from keras import objectives 14 | from keras.datasets import mnist 15 | 16 | np.random.seed(1111) # for reproducibility 17 | 18 | batch_size = 100 19 | n = 784 20 | m = 2 21 | hidden_dim = 256 22 | epochs = 50 23 | epsilon_std = 1.0 24 | use_loss = 'xent' # 'mse' or 'xent' 25 | 26 | decay = 1e-4 # weight decay, a.k.a.
l2 regularization 27 | use_bias = True 28 | 29 | ## Encoder 30 | x = Input(batch_shape=(batch_size, n)) 31 | h_encoded = Dense(hidden_dim, kernel_regularizer=l2(decay), bias_regularizer=l2(decay), use_bias=use_bias, activation='tanh')(x) 32 | z_mean = Dense(m, kernel_regularizer=l2(decay), bias_regularizer=l2(decay), use_bias=use_bias)(h_encoded) 33 | z_log_var = Dense(m, kernel_regularizer=l2(decay), bias_regularizer=l2(decay), use_bias=use_bias)(h_encoded) 34 | 35 | 36 | ## Sampler 37 | def sampling(args): 38 | z_mean, z_log_var = args 39 | epsilon = K.random_normal_variable(shape=(batch_size, m), mean=0., 40 | scale=epsilon_std) 41 | return z_mean + K.exp(z_log_var / 2) * epsilon 42 | 43 | z = Lambda(sampling, output_shape=(m,))([z_mean, z_log_var]) 44 | 45 | # we instantiate these layers separately so as to reuse them later 46 | decoder_h = Dense(hidden_dim, kernel_regularizer=l2(decay), bias_regularizer=l2(decay), use_bias=use_bias, activation='tanh') 47 | decoder_mean = Dense(n, kernel_regularizer=l2(decay), bias_regularizer=l2(decay), use_bias=use_bias, activation='sigmoid') 48 | 49 | ## Decoder 50 | h_decoded = decoder_h(z) 51 | x_hat = decoder_mean(h_decoded) 52 | 53 | 54 | ## loss 55 | def vae_loss(x, x_hat): 56 | kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1) 57 | xent_loss = n * objectives.binary_crossentropy(x, x_hat) 58 | mse_loss = n * objectives.mse(x, x_hat) 59 | if use_loss == 'xent': 60 | return xent_loss + kl_loss 61 | elif use_loss == 'mse': 62 | return mse_loss + kl_loss 63 | else: 64 | raise Exception('Unknown loss!') 65 | 66 | vae = Model(x, x_hat) 67 | vae.compile(optimizer='rmsprop', loss=vae_loss) 68 | 69 | # train the VAE on MNIST digits 70 | (x_train, y_train), (x_test, y_test) = mnist.load_data() 71 | 72 | x_train = x_train.astype('float32') / 255. 73 | x_test = x_test.astype('float32') / 255.
74 | x_train = x_train.reshape((len(x_train), np.prod(x_train.shape[1:]))) 75 | x_test = x_test.reshape((len(x_test), np.prod(x_test.shape[1:]))) 76 | 77 | vae.fit(x_train, x_train, 78 | shuffle=True, 79 | epochs=epochs, 80 | batch_size=batch_size, 81 | validation_data=(x_test, x_test)) 82 | 83 | ##----------Visualization----------## 84 | # build a model to project inputs on the latent space 85 | encoder = Model(x, z_mean) 86 | 87 | # display a 2D plot of the digit classes in the latent space 88 | x_test_encoded = encoder.predict(x_test, batch_size=batch_size) 89 | fig = plt.figure(figsize=(6, 6)) 90 | plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test) 91 | plt.colorbar() 92 | plt.show() 93 | fig.savefig('z_{}.png'.format(use_loss)) 94 | 95 | # build a digit generator that can sample from the learned distribution 96 | decoder_input = Input(shape=(m,)) 97 | _h_decoded = decoder_h(decoder_input) 98 | _x_hat = decoder_mean(_h_decoded) 99 | generator = Model(decoder_input, _x_hat) 100 | 101 | # display a 2D manifold of the digits 102 | n = 15 # figure with 15x15 digits 103 | digit_size = 28 104 | figure = np.zeros((digit_size * n, digit_size * n)) 105 | # linearly spaced coordinates on the unit square were transformed through the inverse CDF (ppf) of the Gaussian 106 | # to produce values of the latent variables z, since the prior of the latent space is Gaussian 107 | grid_x = norm.ppf(np.linspace(0.05, 0.95, n)) 108 | grid_y = norm.ppf(np.linspace(0.05, 0.95, n)) 109 | 110 | for i, yi in enumerate(grid_x): 111 | for j, xi in enumerate(grid_y): 112 | z_sample = np.array([[xi, yi]]) 113 | x_decoded = generator.predict(z_sample) 114 | digit = x_decoded[0].reshape(digit_size, digit_size) 115 | figure[i * digit_size: (i + 1) * digit_size, 116 | j * digit_size: (j + 1) * digit_size] = digit 117 | 118 | fig = plt.figure(figsize=(10, 10)) 119 | plt.imshow(figure, cmap='Greys_r') 120 | plt.show() 121 | fig.savefig('x_{}.png'.format(use_loss)) 122 | 123 | # data imputation 124 | figure = np.zeros((digit_size * 3, digit_size * n)) 125 | x = x_test[:batch_size,:] 126 | x_corrupted = np.copy(x) 127 | x_corrupted[:, 300:400] = 0 128 | x_encoded = vae.predict(x_corrupted, batch_size=batch_size).reshape((-1, digit_size, digit_size)) 129 | x = x.reshape((-1, digit_size, digit_size)) 130 | x_corrupted = x_corrupted.reshape((-1, digit_size, digit_size)) 131 | for i in range(n): 132 | xi = x[i] 133 | xi_c = x_corrupted[i] 134 | xi_e = x_encoded[i] 135 | figure[:digit_size, i * digit_size:(i+1)*digit_size] = xi 136 | figure[digit_size:2 * digit_size, i * digit_size:(i+1)*digit_size] = xi_c 137 | figure[2 * digit_size:, i * digit_size:(i+1)*digit_size] = xi_e 138 | 139 | fig = plt.figure(figsize=(10, 10)) 140 | plt.imshow(figure, cmap='Greys_r') 141 | plt.show() 142 | fig.savefig('i_{}.png'.format(use_loss)) 143 | -------------------------------------------------------------------------------- /weightnorm/README.md: -------------------------------------------------------------------------------- 1 | Weight Normalization 2 | 3 | ## Run 4 | 5 | ### MLP w/ WN on MNIST 6 | python mnist_mlp.py 7 | 8 | ### CNN w/ WN on MNIST 9 | python mnist_cnn.py 10 | 11 | ### CNN w/ WN on CIFAR10 12 | python cifar10_cnn.py 13 | 14 | ### GRU text generator w/ WN 15 | python gru_text_generation.py 16 | 17 | ### GRU Language Model 18 | python imdb_lm.py 19 | > See [this](../gcnn/README.md) for how to prepare data. 20 | 21 | ## References: 22 | Salimans and Kingma.
[Weight Normalization: A Simple Reparameterization to Accelerate Training of Deep Neural Networks](https://papers.nips.cc/paper/6114-weight-normalization-a-simple-reparameterization-to-accelerate-training-of-deep-neural-networks.pdf) 23 | -------------------------------------------------------------------------------- /weightnorm/cifar10_cnn.py: -------------------------------------------------------------------------------- 1 | '''Train a simple deep CNN on the CIFAR10 small images dataset. 2 | 3 | GPU run command with Theano backend (with TensorFlow, the GPU is automatically used): 4 | THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python cifar10_cnn.py 5 | 6 | Conventional CNN: 7 | It gets down to 0.65 test logloss in 25 epochs, and down to 0.55 after 50 epochs. 8 | CNN with WN: 9 | It gets down to 0.61 test logloss in 25 epochs, and down to 0.55 after 50 epochs. 10 | ''' 11 | 12 | from __future__ import print_function 13 | from keras.datasets import cifar10 14 | from keras.preprocessing.image import ImageDataGenerator 15 | from keras.models import Sequential 16 | from keras.layers import Dense, Dropout, Activation, Flatten 17 | from keras.layers import Conv2D, MaxPooling2D 18 | from keras.utils import np_utils 19 | 20 | from weight_norm_layers import * 21 | 22 | K.set_image_data_format('channels_first') 23 | 24 | batch_size = 32 25 | nb_classes = 10 26 | epochs = 50 27 | data_augmentation = True 28 | 29 | # input image dimensions 30 | img_rows, img_cols = 32, 32 31 | # The CIFAR10 images are RGB. 32 | img_channels = 3 33 | 34 | # build model 35 | model = Sequential() 36 | model.add(WeightNormConv2D(32, (3, 3), padding='same', 37 | input_shape=(img_channels, img_rows, img_cols))) 38 | model.add(Activation('relu')) 39 | model.add(WeightNormConv2D(32, (3, 3))) 40 | model.add(Activation('relu')) 41 | model.add(MaxPooling2D(pool_size=(2, 2))) 42 | model.add(Dropout(0.25)) 43 | 44 | model.add(WeightNormConv2D(64, (3, 3), padding='same')) 45 | model.add(Activation('relu')) 46 | model.add(WeightNormConv2D(64, (3, 3))) 47 | model.add(Activation('relu')) 48 | model.add(MaxPooling2D(pool_size=(2, 2))) 49 | model.add(Dropout(0.25)) 50 | 51 | model.add(Flatten()) 52 | model.add(WeightNormDense(512)) 53 | model.add(Activation('relu')) 54 | model.add(Dropout(0.5)) 55 | model.add(WeightNormDense(nb_classes)) 56 | model.add(Activation('softmax')) 57 | 58 | # Let's train the model using RMSprop 59 | model.compile(loss='categorical_crossentropy', 60 | optimizer='rmsprop', 61 | metrics=['accuracy']) 62 | 63 | model.summary() 64 | 65 | # The data, shuffled and split between train and test sets: 66 | (X_train, y_train), (X_test, y_test) = cifar10.load_data() 67 | print('X_train shape:', X_train.shape) 68 | print(X_train.shape[0], 'train samples') 69 | print(X_test.shape[0], 'test samples') 70 | 71 | # Convert class vectors to binary class matrices. 
72 | Y_train = np_utils.to_categorical(y_train, nb_classes) 73 | Y_test = np_utils.to_categorical(y_test, nb_classes) 74 | 75 | X_train = X_train.astype('float32') 76 | X_test = X_test.astype('float32') 77 | X_train /= 255 78 | X_test /= 255 79 | 80 | if not data_augmentation: 81 | print('Not using data augmentation.') 82 | model.fit(X_train, Y_train, 83 | batch_size=batch_size, 84 | epochs=epochs, 85 | validation_data=(X_test, Y_test), 86 | shuffle=True) 87 | else: 88 | print('Using real-time data augmentation.') 89 | # This will do preprocessing and realtime data augmentation: 90 | datagen = ImageDataGenerator( 91 | featurewise_center=False, # set input mean to 0 over the dataset 92 | samplewise_center=False, # set each sample mean to 0 93 | featurewise_std_normalization=False, # divide inputs by std of the dataset 94 | samplewise_std_normalization=False, # divide each input by its std 95 | zca_whitening=False, # apply ZCA whitening 96 | rotation_range=0, # randomly rotate images in the range (degrees, 0 to 180) 97 | width_shift_range=0.1, # randomly shift images horizontally (fraction of total width) 98 | height_shift_range=0.1, # randomly shift images vertically (fraction of total height) 99 | horizontal_flip=True, # randomly flip images 100 | vertical_flip=False) # randomly flip images 101 | 102 | # Compute quantities required for featurewise normalization 103 | # (std, mean, and principal components if ZCA whitening is applied). 104 | datagen.fit(X_train) 105 | 106 | # Fit the model on the batches generated by datagen.flow(). 107 | model.fit_generator(datagen.flow(X_train, Y_train, 108 | batch_size=batch_size), 109 | steps_per_epoch=X_train.shape[0]/batch_size, 110 | epochs=epochs, 111 | validation_data=(X_test, Y_test)) 112 | -------------------------------------------------------------------------------- /weightnorm/gru_text_generation.py: -------------------------------------------------------------------------------- 1 | '''Example script to generate text from Nietzsche's writings. 
2 | Modified from keras' example/lstm_text_generation.py 3 | ''' 4 | 5 | from __future__ import print_function 6 | from keras.models import Sequential 7 | from keras.layers import Dense, Activation, Dropout 8 | from keras.optimizers import RMSprop 9 | from keras.utils.data_utils import get_file 10 | import numpy as np 11 | import random 12 | import sys 13 | 14 | from weight_norm_layers import WeightNormGRU 15 | 16 | path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt") 17 | text = open(path).read().lower() 18 | print('corpus length:', len(text)) 19 | 20 | chars = sorted(list(set(text))) 21 | print('total chars:', len(chars)) 22 | char_indices = dict((c, i) for i, c in enumerate(chars)) 23 | indices_char = dict((i, c) for i, c in enumerate(chars)) 24 | 25 | # cut the text in semi-redundant sequences of maxlen characters 26 | maxlen = 40 27 | step = 3 28 | sentences = [] 29 | next_chars = [] 30 | for i in range(0, len(text) - maxlen, step): 31 | sentences.append(text[i: i + maxlen]) 32 | next_chars.append(text[i + maxlen]) 33 | print('nb sequences:', len(sentences)) 34 | 35 | print('Vectorization...') 36 | X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool) 37 | y = np.zeros((len(sentences), len(chars)), dtype=np.bool) 38 | for i, sentence in enumerate(sentences): 39 | for t, char in enumerate(sentence): 40 | X[i, t, char_indices[char]] = 1 41 | y[i, char_indices[next_chars[i]]] = 1 42 | 43 | 44 | # build the model: a single GRU 45 | print('Build model...') 46 | model = Sequential() 47 | model.add(WeightNormGRU(128, input_shape=(maxlen, len(chars)))) 48 | model.add(Dense(len(chars))) 49 | model.add(Activation('softmax')) 50 | 51 | optimizer = RMSprop(lr=0.01) 52 | model.compile(loss='categorical_crossentropy', optimizer=optimizer) 53 | 54 | 55 | def sample(preds, temperature=1.0): 56 | # helper function to sample an index from a probability array 57 | preds = np.asarray(preds).astype('float64') 58 | preds = np.log(preds) / temperature 59 | exp_preds = np.exp(preds) 60 | preds = exp_preds / np.sum(exp_preds) 61 | probas = np.random.multinomial(1, preds, 1) 62 | return np.argmax(probas) 63 | 64 | # train the model, output generated text after each iteration 65 | for iteration in range(1, 60): 66 | print() 67 | print('-' * 50) 68 | print('Iteration', iteration) 69 | model.fit(X, y, batch_size=128, epochs=1) 70 | 71 | start_index = random.randint(0, len(text) - maxlen - 1) 72 | 73 | for diversity in [0.2, 0.5, 1.0, 1.2]: 74 | print() 75 | print('----- diversity:', diversity) 76 | 77 | generated = '' 78 | sentence = text[start_index: start_index + maxlen] 79 | generated += sentence 80 | print('----- Generating with seed: "' + sentence + '"') 81 | sys.stdout.write(generated) 82 | 83 | for i in range(400): 84 | x = np.zeros((1, maxlen, len(chars))) 85 | for t, char in enumerate(sentence): 86 | x[0, t, char_indices[char]] = 1. 
87 | 88 | preds = model.predict(x, verbose=0)[0] 89 | next_index = sample(preds, diversity) 90 | next_char = indices_char[next_index] 91 | 92 | generated += next_char 93 | sentence = sentence[1:] + next_char 94 | 95 | sys.stdout.write(next_char) 96 | sys.stdout.flush() 97 | print() 98 | -------------------------------------------------------------------------------- /weightnorm/imdb_generator.py: -------------------------------------------------------------------------------- 1 | ../gcnn/imdb_generator.py -------------------------------------------------------------------------------- /weightnorm/imdb_lm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | '''Simple RNN for Language Model 3 | ''' 4 | from __future__ import print_function 5 | import os 6 | 7 | from keras.models import Model 8 | from keras.layers import Input, Embedding, Dense, TimeDistributed 9 | from keras.optimizers import * 10 | 11 | from weight_norm_layers import * 12 | from imdb_generator import IMDBLM 13 | 14 | 15 | def LM(batch_size, vocsize=20000, embed_dim=20, hidden_dim=30, nb_layers=1): 16 | x = Input(batch_shape=(batch_size, None)) 17 | # embedding 18 | y = Embedding(vocsize+2, embed_dim, mask_zero=False)(x) 19 | for i in range(nb_layers-1): 20 | y = WeightNormGRU(hidden_dim, return_sequences=True, name='wngru{}'.format(i + 1))(y) 21 | y = WeightNormGRU(hidden_dim, return_sequences=True, name='wngru{}'.format(nb_layers))(y) 22 | y = TimeDistributed(Dense(vocsize+2, activation='softmax', name='dense{}'.format(nb_layers)))(y) 23 | 24 | model = Model(inputs=x, outputs=y) 25 | 26 | return model 27 | 28 | 29 | def train_model(): 30 | batch_size = 32 31 | nb_epoch = 100 32 | 33 | vocsize = 2000 # top 2k 34 | max_len = 30 35 | train_ratio = 0.99 36 | 37 | # Build model 38 | model = LM(batch_size, vocsize=vocsize, nb_layers=3) 39 | model.compile(optimizer='adam', 40 | loss='sparse_categorical_crossentropy') 41 | 42 | # Prepare data 43 | path = './data/imdb-full.pkl' 44 | # Train 45 | train_gen = IMDBLM(path=path, max_len=max_len, vocab_size=vocsize, shuffle=True, 46 | which_set='train', train_ratio=train_ratio, batch_size=batch_size) 47 | # Validation 48 | val_gen = IMDBLM(path=path, max_len=max_len, vocab_size=vocsize, 49 | which_set='validation', train_ratio=train_ratio, batch_size=batch_size) 50 | 51 | train_samples = 20000 52 | val_samples = 2000 53 | 54 | # Start training 55 | model.summary() 56 | model.fit_generator(train_gen(), samples_per_epoch=train_samples, 57 | validation_data=val_gen(), nb_val_samples=val_samples, 58 | nb_epoch=nb_epoch, verbose=1) 59 | 60 | 61 | def run_demo(): 62 | train_model() 63 | 64 | 65 | if __name__ == '__main__': 66 | run_demo() 67 | -------------------------------------------------------------------------------- /weightnorm/mnist_cnn.py: -------------------------------------------------------------------------------- 1 | '''Trains a simple CNN with weight normalization on the MNIST dataset.
2 | Modified from keras' examples/mnist_mlp.py 3 | w/o WN: 4 | Gets to 99.25% test accuracy after 12 epochs 5 | w/ WN: 6 | Gets to 99.45% test accuracy after 10 epochs using tensorflow backend 7 | ''' 8 | 9 | from __future__ import print_function 10 | import numpy as np 11 | np.random.seed(1337) # for reproducibility 12 | 13 | from keras.datasets import mnist 14 | from keras.models import Sequential 15 | from keras.layers import Dense, Dropout, Activation, BatchNormalization, MaxPooling2D 16 | from keras.layers import Flatten 17 | from keras.optimizers import SGD, Adam, RMSprop 18 | from keras.callbacks import LearningRateScheduler 19 | from keras.utils import np_utils 20 | import keras.backend as K 21 | K.set_image_data_format('channels_first') 22 | 23 | from weight_norm_layers import * 24 | 25 | 26 | # nn 27 | batch_size = 50 28 | epochs = 20 29 | nb_channel = 1 30 | img_rows = 28 31 | img_cols = 28 32 | nb_classes = 10 33 | use_bias = False 34 | 35 | # learning rate schedule 36 | lr_start = 1e-3 37 | lr_end = 1e-4 38 | lr_decay = (lr_end / lr_start)**(1. / epochs) 39 | 40 | # BN 41 | epsilon = 1e-6 42 | momentum = 0.9 43 | 44 | # dropout 45 | p1 = 0.25 46 | p2 = 0.5 47 | 48 | # the data, shuffled and split between train and test sets 49 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 50 | 51 | X_train = X_train.reshape(60000, 1, 28, 28) 52 | X_test = X_test.reshape(10000, 1, 28, 28) 53 | X_train = X_train.astype('float32') 54 | X_test = X_test.astype('float32') 55 | X_train /= 255 56 | X_test /= 255 57 | print(X_train.shape[0], 'train samples') 58 | print(X_test.shape[0], 'test samples') 59 | 60 | # convert class vectors to binary class matrices 61 | Y_train = np_utils.to_categorical(y_train, nb_classes) 62 | Y_test = np_utils.to_categorical(y_test, nb_classes) 63 | 64 | 65 | model = Sequential() 66 | # conv1 67 | model.add(WeightNormConv2D(128, (3, 3), input_shape=(nb_channel, img_rows, img_cols), 68 | padding='same', use_bias=use_bias, name='conv1')) 69 | #model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, axis=1, name='bn1')) 70 | model.add(Activation('relu', name='act1')) 71 | # conv2 72 | model.add(WeightNormConv2D(128, (3, 3), 73 | padding='same', use_bias=use_bias, name='conv2')) 74 | model.add(MaxPooling2D(pool_size=(2, 2), name='pool2')) 75 | #model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, axis=1, name='bn2')) 76 | model.add(Activation('relu', name='act2')) 77 | # conv3 78 | model.add(WeightNormConv2D(256, (3, 3), 79 | padding='same', use_bias=use_bias, name='conv3')) 80 | #model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, axis=1, name='bn3')) 81 | model.add(Activation('relu', name='act3')) 82 | # conv4 83 | model.add(WeightNormConv2D(256, (3, 3), 84 | padding='same', use_bias=use_bias, name='conv4')) 85 | model.add(MaxPooling2D(pool_size=(2, 2), name='pool4')) 86 | #model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, axis=1, name='bn4')) 87 | model.add(Activation('relu', name='act4')) 88 | model.add(Flatten()) 89 | # dense1 90 | model.add(WeightNormDense(1024, use_bias=use_bias, name='dense5')) 91 | #model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, name='bn5')) 92 | model.add(Activation('relu', name='act5')) 93 | # dense2 94 | model.add(WeightNormDense(nb_classes, use_bias=use_bias, name='dense6')) 95 | #model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, name='bn6')) 96 | model.add(Activation('softmax', name='act6')) 97 | 98 | opt = Adam(lr=lr_start) 99 | 
model.compile(loss='squared_hinge', optimizer=opt, metrics=['acc']) 100 | model.summary() 101 | 102 | lr_scheduler = LearningRateScheduler(lambda e: lr_start * lr_decay ** e) 103 | history = model.fit(X_train, Y_train, 104 | batch_size=batch_size, epochs=epochs, 105 | verbose=1, validation_data=(X_test, Y_test), 106 | callbacks=[lr_scheduler]) 107 | score = model.evaluate(X_test, Y_test, verbose=0) 108 | print('Test score:', score[0]) 109 | print('Test accuracy:', score[1]) 110 | -------------------------------------------------------------------------------- /weightnorm/mnist_mlp.py: -------------------------------------------------------------------------------- 1 | '''Trains a simple fully connected NN with Weight Normalization on the MNIST dataset. 2 | Modified from keras' examples/mnist_mlp.py 3 | Gets to 98.4% test accuracy after 20 epochs using tensorflow backend 4 | ''' 5 | 6 | from __future__ import print_function 7 | import numpy as np 8 | np.random.seed(1337) # for reproducibility 9 | 10 | from keras.datasets import mnist 11 | from keras.models import Sequential 12 | from keras.layers.core import Dense, Dropout, Activation 13 | from keras.optimizers import SGD, Adam, RMSprop 14 | from keras.utils import np_utils 15 | 16 | from weight_norm_layers import * 17 | 18 | batch_size = 128 19 | nb_classes = 10 20 | epochs = 20 21 | 22 | # the data, shuffled and split between train and test sets 23 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 24 | 25 | X_train = X_train.reshape(60000, 784) 26 | X_test = X_test.reshape(10000, 784) 27 | X_train = X_train.astype('float32') 28 | X_test = X_test.astype('float32') 29 | X_train /= 255 30 | X_test /= 255 31 | print(X_train.shape[0], 'train samples') 32 | print(X_test.shape[0], 'test samples') 33 | 34 | # convert class vectors to binary class matrices 35 | Y_train = np_utils.to_categorical(y_train, nb_classes) 36 | Y_test = np_utils.to_categorical(y_test, nb_classes) 37 | 38 | model = Sequential() 39 | model.add(WeightNormDense(512, input_shape=(784,))) 40 | model.add(Activation('relu')) 41 | model.add(Dropout(0.2)) 42 | model.add(WeightNormDense(512)) 43 | model.add(Activation('relu')) 44 | model.add(Dropout(0.2)) 45 | model.add(WeightNormDense(10)) 46 | model.add(Activation('softmax')) 47 | 48 | model.summary() 49 | 50 | model.compile(loss='categorical_crossentropy', 51 | optimizer=RMSprop(), 52 | metrics=['accuracy']) 53 | 54 | history = model.fit(X_train, Y_train, 55 | batch_size=batch_size, epochs=epochs, 56 | verbose=1, validation_data=(X_test, Y_test)) 57 | score = model.evaluate(X_test, Y_test, verbose=0) 58 | print('Test score:', score[0]) 59 | print('Test accuracy:', score[1]) 60 | -------------------------------------------------------------------------------- /weightnorm/weight_norm_layers.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | 4 | from keras import backend as K 5 | from keras.engine import InputSpec 6 | from keras.layers import Dense, Conv2D, GRU 7 | 8 | 9 | class WeightNormDense(Dense): 10 | def build(self, input_shape): 11 | assert len(input_shape) >= 2 12 | input_dim = input_shape[-1] 13 | 14 | self.kernel = self.add_weight(shape=(input_dim, self.units), 15 | initializer=self.kernel_initializer, 16 | name='kernel', 17 | regularizer=self.kernel_regularizer, 18 | constraint=self.kernel_constraint) 19 | self.g = self.add_weight(shape=(self.units,), 20 | initializer='one', 21 | name='g') 22 | if
self.use_bias: 23 | self.bias = self.add_weight(shape=(self.units,), 24 | initializer=self.bias_initializer, 25 | name='bias', 26 | regularizer=self.bias_regularizer, 27 | constraint=self.bias_constraint) 28 | else: 29 | self.bias = None 30 | self.input_spec = InputSpec(min_ndim=2, axes={-1: input_dim}) 31 | self.built = True 32 | 33 | def call(self, inputs): 34 | kernel = self.kernel * self.g / K.sqrt(K.sum(K.square(self.kernel), axis=0)) 35 | output = K.dot(inputs, kernel) 36 | if self.use_bias: 37 | output = K.bias_add(output, self.bias) 38 | if self.activation is not None: 39 | output = self.activation(output) 40 | return output 41 | 42 | 43 | class WeightNormConv2D(Conv2D): 44 | def build(self, input_shape): 45 | if self.data_format == 'channels_first': 46 | channel_axis = 1 47 | else: 48 | channel_axis = -1 49 | if input_shape[channel_axis] is None: 50 | raise ValueError('The channel dimension of the inputs ' 51 | 'should be defined. Found `None`.') 52 | input_dim = input_shape[channel_axis] 53 | kernel_shape = self.kernel_size + (input_dim, self.filters) 54 | 55 | self.kernel = self.add_weight(shape=kernel_shape, 56 | initializer=self.kernel_initializer, 57 | name='kernel', 58 | regularizer=self.kernel_regularizer, 59 | constraint=self.kernel_constraint) 60 | self.g = self.add_weight(shape=(1, 1, 1, self.filters), 61 | initializer='one', 62 | name='g') 63 | if self.use_bias: 64 | self.bias = self.add_weight(shape=(self.filters,), 65 | initializer=self.bias_initializer, 66 | name='bias', 67 | regularizer=self.bias_regularizer, 68 | constraint=self.bias_constraint) 69 | else: 70 | self.bias = None 71 | # Set input spec. 72 | self.input_spec = InputSpec(ndim=self.rank + 2, 73 | axes={channel_axis: input_dim}) 74 | 75 | self.built = True 76 | 77 | def call(self, x): 78 | kernel = self.kernel * self.g / K.sqrt(K.sum(K.square(self.kernel), axis=[0, 1, 2], keepdims=True)) 79 | output = K.conv2d(x, kernel, strides=self.strides, 80 | padding=self.padding, 81 | data_format=self.data_format) 82 | if self.use_bias: 83 | output = K.bias_add(output, self.bias, data_format=self.data_format) 84 | if self.activation is not None: 85 | output = self.activation(output) 86 | return output 87 | 88 | 89 | class WeightNormGRU(GRU): 90 | def build(self, input_shape): 91 | if isinstance(input_shape, list): 92 | input_shape = input_shape[0] 93 | 94 | batch_size = input_shape[0] if self.stateful else None 95 | self.input_dim = input_shape[2] 96 | self.input_spec = InputSpec(shape=(batch_size, None, self.input_dim)) 97 | self.state_spec = InputSpec(shape=(batch_size, self.units)) 98 | 99 | self.states = [None] 100 | if self.stateful: 101 | self.reset_states() 102 | 103 | self.kernel = self.add_weight(shape=(self.input_dim, self.units * 3), 104 | name='kernel', 105 | initializer=self.kernel_initializer, 106 | regularizer=self.kernel_regularizer, 107 | constraint=self.kernel_constraint) 108 | self.recurrent_kernel = self.add_weight( 109 | shape=(self.units, self.units * 3), 110 | name='recurrent_kernel', 111 | initializer=self.recurrent_initializer, 112 | regularizer=self.recurrent_regularizer, 113 | constraint=self.recurrent_constraint) 114 | 115 | if self.use_bias: 116 | self.bias = self.add_weight(shape=(self.units * 3,), 117 | name='bias', 118 | initializer=self.bias_initializer, 119 | regularizer=self.bias_regularizer, 120 | constraint=self.bias_constraint) 121 | else: 122 | self.bias = None 123 | 124 | self.kernel_z = self.kernel[:, :self.units] 125 | self.recurrent_kernel_z = self.recurrent_kernel[:, 
:self.units] 126 | self.kernel_r = self.kernel[:, self.units: self.units * 2] 127 | self.recurrent_kernel_r = self.recurrent_kernel[:, 128 | self.units: 129 | self.units * 2] 130 | self.kernel_h = self.kernel[:, self.units * 2:] 131 | self.recurrent_kernel_h = self.recurrent_kernel[:, self.units * 2:] 132 | 133 | if self.use_bias: 134 | self.bias_z = self.bias[:self.units] 135 | self.bias_r = self.bias[self.units: self.units * 2] 136 | self.bias_h = self.bias[self.units * 2:] 137 | else: 138 | self.bias_z = None 139 | self.bias_r = None 140 | self.bias_h = None 141 | 142 | self.g_kernel_z = self.add_weight((self.units,), 143 | initializer='one', 144 | name='g_kernel_z') 145 | self.g_kernel_r = self.add_weight((self.units,), 146 | initializer='one', 147 | name='g_kernel_r') 148 | self.g_kernel_h = self.add_weight((self.units,), 149 | initializer='one', 150 | name='g_kernel_h') 151 | self.g_recurrent_kernel_z = self.add_weight((self.units,), 152 | initializer='one', 153 | name='g_recurrent_kernel_z') 154 | self.g_recurrent_kernel_r = self.add_weight((self.units,), 155 | initializer='one', 156 | name='g_recurrent_kernel_r') 157 | self.g_recurrent_kernel_h = self.add_weight((self.units,), 158 | initializer='one', 159 | name='g_recurrent_kernel_h') 160 | self.built = True 161 | 162 | def preprocess_input(self, x, training=None): 163 | return x 164 | 165 | def step(self, x, states): 166 | h_tm1 = states[0] # previous memory 167 | B_U = states[1] # dropout matrices for recurrent units 168 | B_W = states[2] 169 | # weight normalization: w = g * v / ||v||, with the norm taken over the input axis (as in WeightNormDense above) 170 | kernel_z = self.kernel_z * self.g_kernel_z / K.sqrt(K.sum(K.square(self.kernel_z), axis=0)) 171 | kernel_r = self.kernel_r * self.g_kernel_r / K.sqrt(K.sum(K.square(self.kernel_r), axis=0)) 172 | kernel_h = self.kernel_h * self.g_kernel_h / K.sqrt(K.sum(K.square(self.kernel_h), axis=0)) 173 | recurrent_kernel_z = self.recurrent_kernel_z * self.g_recurrent_kernel_z / K.sqrt(K.sum(K.square(self.recurrent_kernel_z), axis=0)) 174 | recurrent_kernel_r = self.recurrent_kernel_r * self.g_recurrent_kernel_r / K.sqrt(K.sum(K.square(self.recurrent_kernel_r), axis=0)) 175 | recurrent_kernel_h = self.recurrent_kernel_h * self.g_recurrent_kernel_h / K.sqrt(K.sum(K.square(self.recurrent_kernel_h), axis=0)) 176 | 177 | x_z = K.dot(x * B_W[0], kernel_z) 178 | x_r = K.dot(x * B_W[1], kernel_r) 179 | x_h = K.dot(x * B_W[2], kernel_h) 180 | if self.use_bias: 181 | x_z += self.bias_z 182 | x_r += self.bias_r 183 | x_h += self.bias_h 184 | 185 | z = self.recurrent_activation(x_z + K.dot(h_tm1 * B_U[0], recurrent_kernel_z)) 186 | r = self.recurrent_activation(x_r + K.dot(h_tm1 * B_U[1], recurrent_kernel_r)) 187 | hh = self.activation(x_h + K.dot(r * h_tm1 * B_U[2], recurrent_kernel_h)) 188 | h = z * h_tm1 + (1 - z) * hh 189 | 190 | return h, [h] 191 | 192 | 193 | # Aliases 194 | 195 | WeightNormConvolution2D = WeightNormConv2D 196 | -------------------------------------------------------------------------------- /wgan/README.md: -------------------------------------------------------------------------------- 1 | Toy keras implementation of Wasserstein GAN (WGAN) on MNIST 2 | 3 | ## Reference 4 | * Arjovsky et al.
[Wasserstein GAN](https://www.arxiv.org/abs/1701.07875) 5 | -------------------------------------------------------------------------------- /wgan/wgan_cnn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Train a Wasserstein Generative Adversarial Network (WGAN) on the MNIST 5 | """ 6 | from __future__ import print_function 7 | from PIL import Image 8 | from six.moves import range 9 | 10 | import keras.backend as K 11 | K.set_image_data_format('channels_first') 12 | 13 | from keras.datasets import mnist 14 | from keras.layers import Input, Dense, Reshape, Flatten, Dropout, Activation, BatchNormalization 15 | from keras.layers import Conv2D, UpSampling2D, LeakyReLU 16 | from keras.models import Sequential, Model 17 | from keras.optimizers import RMSprop, Adam 18 | from keras.utils.generic_utils import Progbar 19 | 20 | import numpy as np 21 | np.random.seed(1337) 22 | 23 | 24 | def clip_weights(model, lower, upper): 25 | for l in model.layers: 26 | weights = l.get_weights() 27 | weights = [np.clip(w, lower, upper) for w in weights] 28 | l.set_weights(weights) 29 | 30 | 31 | def wasserstein(y_true, y_pred): 32 | return K.mean(y_true * y_pred) 33 | 34 | 35 | def build_generator(latent_size): 36 | cnn = Sequential() 37 | cnn.add(Dense(1024, input_dim=latent_size, activation='relu')) 38 | cnn.add(Dense(128 * 7 * 7, activation='relu')) 39 | cnn.add(Reshape((128, 7, 7))) 40 | 41 | # upsample to (..., 14, 14) 42 | cnn.add(UpSampling2D(size=(2, 2))) 43 | cnn.add(Conv2D(256, 5, padding='same', 44 | activation='relu', kernel_initializer='glorot_normal')) 45 | 46 | # upsample to (..., 28, 28) 47 | cnn.add(UpSampling2D(size=(2, 2))) 48 | cnn.add(Conv2D(128, 5, padding='same', 49 | activation='relu', kernel_initializer='glorot_normal')) 50 | 51 | # take a channel axis reduction 52 | cnn.add(Conv2D(1, 2, padding='same', 53 | activation='tanh', kernel_initializer='glorot_normal')) 54 | 55 | # this is the z space commonly refered to in GAN papers 56 | latent = Input(shape=(latent_size,)) 57 | 58 | fake_image = cnn(latent) 59 | 60 | return Model(inputs=latent, outputs=fake_image) 61 | 62 | 63 | def build_critic(c=0.01): 64 | # build a relatively standard conv net with LeakyReLUs 65 | cnn = Sequential() 66 | 67 | cnn.add(Conv2D(32, 3, padding='same', strides=(2, 2), 68 | input_shape=(1, 28, 28))) 69 | cnn.add(LeakyReLU()) 70 | cnn.add(Dropout(0.3)) 71 | 72 | cnn.add(Conv2D(64, 3, padding='same', strides=(1, 1))) 73 | cnn.add(LeakyReLU()) 74 | cnn.add(Dropout(0.3)) 75 | 76 | cnn.add(Conv2D(128, 3, padding='same', strides=(2, 2))) 77 | cnn.add(LeakyReLU()) 78 | cnn.add(Dropout(0.3)) 79 | 80 | cnn.add(Conv2D(256, 3, padding='same', strides=(1, 1))) 81 | cnn.add(LeakyReLU()) 82 | cnn.add(Dropout(0.3)) 83 | 84 | cnn.add(Flatten()) 85 | 86 | image = Input(shape=(1, 28, 28)) 87 | 88 | features = cnn(image) 89 | fake = Dense(1, activation='linear', name='critic')(features) 90 | 91 | return Model(inputs=image, outputs=fake) 92 | 93 | 94 | if __name__ == '__main__': 95 | 96 | epochs = 5000 97 | batch_size = 50 98 | latent_size = 20 99 | 100 | lr = 0.0001 101 | c = 0.01 102 | 103 | # build the critic 104 | critic = build_critic() 105 | critic.compile( 106 | optimizer=RMSprop(lr=lr), 107 | loss=wasserstein 108 | ) 109 | 110 | # build the generator 111 | generator = build_generator(latent_size) 112 | 113 | latent = Input(shape=(latent_size, )) 114 | # get a fake image 115 | fake = generator(latent) 116 | # we only want 
to be able to train generation for the combined model 117 | critic.trainable = False 118 | fake = critic(fake) 119 | combined = Model(inputs=latent, outputs=fake) 120 | combined.compile( 121 | optimizer=Adam(lr=lr), 122 | loss=wasserstein 123 | ) 124 | 125 | # get our mnist data, and force it to be of shape (..., 1, 28, 28) with 126 | # range [-1, 1] 127 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 128 | X_train = (X_train.astype(np.float32) - 127.5) / 127.5 129 | X_train = np.expand_dims(X_train, axis=1) 130 | 131 | X_test = (X_test.astype(np.float32) - 127.5) / 127.5 132 | X_test = np.expand_dims(X_test, axis=1) 133 | 134 | nb_train, nb_test = X_train.shape[0], X_test.shape[0] 135 | 136 | for epoch in range(epochs): 137 | print('Epoch {} of {}'.format(epoch + 1, epochs)) 138 | 139 | nb_batches = int(X_train.shape[0] / batch_size) 140 | progress_bar = Progbar(target=nb_batches) 141 | 142 | epoch_critic_loss = [] 143 | epoch_gen_loss = [] 144 | 145 | index = 0 146 | while index < nb_batches: 147 | ## critic 148 | if epoch < 5 or epoch % 100 == 0: 149 | Diters = 100 150 | else: 151 | Diters = 5 152 | iter = 0 153 | critic_loss = [] 154 | while index < nb_batches and iter < Diters: 155 | progress_bar.update(index) 156 | index += 1 157 | iter += 1 158 | 159 | # generate a new batch of noise 160 | noise = np.random.uniform(-1, 1, (batch_size, latent_size)) 161 | # generate a batch of fake images 162 | generated_images = generator.predict(noise, verbose=0) 163 | 164 | # get a batch of real images 165 | image_batch = X_train[index * batch_size:(index + 1) * batch_size] 166 | label_batch = y_train[index * batch_size:(index + 1) * batch_size] 167 | 168 | X = np.concatenate((image_batch, generated_images)) 169 | y = np.array([-1] * len(image_batch) + [1] * batch_size) 170 | 171 | critic_loss.append(-critic.train_on_batch(X, y)) 172 | 173 | clip_weights(critic, -c, c) 174 | 175 | epoch_critic_loss.append(sum(critic_loss)/len(critic_loss)) 176 | 177 | ## generator 178 | # make new noise. 
we generate 2 * batch size here such that we have 179 | # the generator optimize over an identical number of images as the 180 | # critic 181 | noise = np.random.uniform(-1, 1, (batch_size, latent_size)) 182 | target = -np.ones(batch_size) 183 | epoch_gen_loss.append(-combined.train_on_batch(noise, target)) 184 | 185 | print('\n[Loss_C: {:.3f}, Loss_G: {:.3f}]'.format(np.mean(epoch_critic_loss), np.mean(epoch_gen_loss))) 186 | 187 | # save weights every epoch 188 | generator.save_weights( 189 | 'cnn_generator_epoch_{0:03d}.hdf5'.format(epoch), True) 190 | critic.save_weights( 191 | 'cnn_critic_epoch_{0:03d}.hdf5'.format(epoch), True) 192 | 193 | # generate some digits to display 194 | noise = np.random.uniform(-1, 1, (100, latent_size)) 195 | # get a batch to display 196 | generated_images = generator.predict(noise, verbose=0) 197 | 198 | # arrange them into a grid 199 | img = (np.concatenate([r.reshape(-1, 28) 200 | for r in np.split(generated_images, 10) 201 | ], axis=-1) * 127.5 + 127.5).astype(np.uint8) 202 | 203 | Image.fromarray(img).save( 204 | 'cnn_epoch_{0:03d}_generated.png'.format(epoch)) 205 | -------------------------------------------------------------------------------- /wgan/wgan_mlp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Train a Wasserstein Generative Adversarial Network (WGAN) on the MNIST 5 | """ 6 | from __future__ import print_function 7 | from PIL import Image 8 | from six.moves import range 9 | 10 | import keras.backend as K 11 | 12 | from keras.datasets import mnist 13 | from keras.layers import Input, Dense, Reshape, Flatten, Dropout, Activation, BatchNormalization 14 | from keras.models import Sequential, Model 15 | from keras.optimizers import RMSprop, Adam 16 | from keras.utils.generic_utils import Progbar 17 | 18 | import numpy as np 19 | np.random.seed(1337) 20 | 21 | 22 | def clip_weights(model, lower, upper): 23 | for l in model.layers: 24 | weights = l.get_weights() 25 | weights = [np.clip(w, lower, upper) for w in weights] 26 | l.set_weights(weights) 27 | 28 | 29 | def wasserstein(y_true, y_pred): 30 | return K.mean(y_true * y_pred) 31 | 32 | 33 | def build_generator(latent_size): 34 | model = Sequential() 35 | model.add(Dense(1024, input_dim=latent_size, activation='relu')) 36 | model.add(Dense(28 * 28, activation='tanh')) 37 | model.add(Reshape((1, 28, 28))) 38 | 39 | return model 40 | 41 | 42 | def build_critic(c=0.01): 43 | f = Sequential() 44 | f.add(Flatten(input_shape=(1, 28, 28))) 45 | f.add(Dense(256)) 46 | f.add(Activation('relu')) 47 | f.add(Dense(128)) 48 | f.add(Activation('relu')) 49 | f.add(Dense(1, activation='linear')) 50 | 51 | image = Input(shape=(1, 28, 28)) 52 | score = f(image) 53 | 54 | model = Model(image, score) 55 | 56 | return model 57 | 58 | 59 | if __name__ == '__main__': 60 | 61 | epochs = 5000 62 | batch_size = 50 63 | latent_size = 20 64 | 65 | lr = 0.0001 66 | c = 0.01 67 | 68 | # build the critic 69 | critic = build_critic() 70 | critic.compile( 71 | optimizer=RMSprop(lr=lr), 72 | loss=wasserstein 73 | ) 74 | 75 | # build the generator 76 | generator = build_generator(latent_size) 77 | 78 | latent = Input(shape=(latent_size, )) 79 | # get a fake image 80 | fake = generator(latent) 81 | # we only want to be able to train generation for the combined model 82 | critic.trainable = False 83 | fake = critic(fake) 84 | combined = Model(inputs=latent, outputs=fake) 85 | combined.compile( 86 | optimizer=Adam(lr=lr), 87 | 
loss=wasserstein 88 | ) 89 | 90 | # get our mnist data, and force it to be of shape (..., 1, 28, 28) with 91 | # range [-1, 1] 92 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 93 | X_train = (X_train.astype(np.float32) - 127.5) / 127.5 94 | X_train = np.expand_dims(X_train, axis=1) 95 | 96 | X_test = (X_test.astype(np.float32) - 127.5) / 127.5 97 | X_test = np.expand_dims(X_test, axis=1) 98 | 99 | nb_train, nb_test = X_train.shape[0], X_test.shape[0] 100 | 101 | for epoch in range(epochs): 102 | print('Epoch {} of {}'.format(epoch + 1, epochs)) 103 | 104 | nb_batches = int(X_train.shape[0] / batch_size) 105 | progress_bar = Progbar(target=nb_batches) 106 | 107 | epoch_critic_loss = [] 108 | epoch_gen_loss = [] 109 | 110 | index = 0 111 | while index < nb_batches: 112 | ## critic 113 | if epoch < 5 or epoch % 100 == 0: 114 | Diters = 100 115 | else: 116 | Diters = 5 117 | iter = 0 118 | critic_loss = [] 119 | while index < nb_batches and iter < Diters: 120 | progress_bar.update(index) 121 | index += 1 122 | iter += 1 123 | 124 | # generate a new batch of noise 125 | noise = np.random.uniform(-1, 1, (batch_size, latent_size)) 126 | # generate a batch of fake images 127 | generated_images = generator.predict(noise, verbose=0) 128 | 129 | # get a batch of real images 130 | image_batch = X_train[index * batch_size:(index + 1) * batch_size] 131 | label_batch = y_train[index * batch_size:(index + 1) * batch_size] 132 | 133 | X = np.concatenate((image_batch, generated_images)) 134 | y = np.array([-1] * len(image_batch) + [1] * batch_size) 135 | 136 | critic_loss.append(-critic.train_on_batch(X, y)) 137 | 138 | clip_weights(critic, -c, c) 139 | 140 | epoch_critic_loss.append(sum(critic_loss)/len(critic_loss)) 141 | 142 | ## generator 143 | # make new noise. we generate 2 * batch size here such that we have 144 | # the generator optimize over an identical number of images as the 145 | # critic 146 | noise = np.random.uniform(-1, 1, (batch_size, latent_size)) 147 | target = -np.ones(batch_size) 148 | epoch_gen_loss.append(-combined.train_on_batch(noise, target)) 149 | 150 | print('\n[Loss_C: {:.3f}, Loss_G: {:.3f}]'.format(np.mean(epoch_critic_loss), np.mean(epoch_gen_loss))) 151 | 152 | # save weights every epoch 153 | if False: 154 | generator.save_weights( 155 | 'mlp_generator_epoch_{0:03d}.hdf5'.format(epoch), True) 156 | critic.save_weights( 157 | 'mlp_critic_epoch_{0:03d}.hdf5'.format(epoch), True) 158 | 159 | # generate some digits to display 160 | noise = np.random.uniform(-1, 1, (100, latent_size)) 161 | # get a batch to display 162 | generated_images = generator.predict(noise, verbose=0) 163 | 164 | # arrange them into a grid 165 | img = (np.concatenate([r.reshape(-1, 28) 166 | for r in np.split(generated_images, 10) 167 | ], axis=-1) * 127.5 + 127.5).astype(np.uint8) 168 | 169 | Image.fromarray(img).save( 170 | 'mlp_epoch_{0:03d}_generated.png'.format(epoch)) 171 | -------------------------------------------------------------------------------- /xnornet/README.md: -------------------------------------------------------------------------------- 1 | This is a keras implementation of XNOR Networks. 2 | 3 | ## Run 4 | ### train a XNOR MLP model on MNIST 5 | python mnist_mlp.py 6 | ### train a XNOR CNN model on MNIST 7 | python mnist_cnn.py 8 | 9 | ## Reference 10 | * Rastegari et al. 
[XNOR-Net: ImageNet Classification Using Binary Convolutional Neural Networks](https://arxiv.org/abs/1603.05279) 11 | -------------------------------------------------------------------------------- /xnornet/binary_layers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | from keras import backend as K 5 | 6 | from keras.layers import InputSpec, Layer, Dense, Conv2D 7 | from keras import constraints 8 | from keras import initializers 9 | 10 | from binary_ops import binarize 11 | 12 | 13 | class Clip(constraints.Constraint): 14 | def __init__(self, min_value, max_value=None): 15 | self.min_value = min_value 16 | self.max_value = max_value 17 | if not self.max_value: 18 | self.max_value = -self.min_value 19 | if self.min_value > self.max_value: 20 | self.min_value, self.max_value = self.max_value, self.min_value 21 | 22 | def __call__(self, p): 23 | return K.clip(p, self.min_value, self.max_value) 24 | 25 | def get_config(self): 26 | return {"min_value": self.min_value, 27 | "max_value": self.max_value} 28 | 29 | 30 | class BinaryDense(Dense): 31 | ''' Binarized Dense layer 32 | References: 33 | "BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1" [http://arxiv.org/abs/1602.02830] 34 | ''' 35 | def __init__(self, units, H=1., kernel_lr_multiplier='Glorot', bias_lr_multiplier=None, **kwargs): 36 | super(BinaryDense, self).__init__(units, **kwargs) 37 | self.H = H 38 | self.kernel_lr_multiplier = kernel_lr_multiplier 39 | self.bias_lr_multiplier = bias_lr_multiplier 40 | 41 | super(BinaryDense, self).__init__(units, **kwargs) 42 | 43 | def build(self, input_shape): 44 | assert len(input_shape) >= 2 45 | input_dim = input_shape[1] 46 | 47 | if self.H == 'Glorot': 48 | self.H = np.float32(np.sqrt(1.5 / (input_dim + self.units))) 49 | #print('Glorot H: {}'.format(self.H)) 50 | if self.kernel_lr_multiplier == 'Glorot': 51 | self.kernel_lr_multiplier = np.float32(1. 
/ np.sqrt(1.5 / (input_dim + self.units))) 52 | #print('Glorot learning rate multiplier: {}'.format(self.lr_multiplier)) 53 | 54 | self.kernel_constraint = Clip(-self.H, self.H) 55 | self.kernel_initializer = initializers.RandomUniform(-self.H, self.H) 56 | self.kernel = self.add_weight(shape=(input_dim, self.units), 57 | initializer=self.kernel_initializer, 58 | name='kernel', 59 | regularizer=self.kernel_regularizer, 60 | constraint=self.kernel_constraint) 61 | 62 | if self.use_bias: 63 | self.lr_multipliers = [self.kernel_lr_multiplier, self.bias_lr_multiplier] 64 | self.bias = self.add_weight(shape=(self.units,), 65 | initializer=self.bias_initializer, 66 | name='bias', 67 | regularizer=self.bias_regularizer, 68 | constraint=self.bias_constraint) 69 | else: 70 | self.lr_multipliers = [self.kernel_lr_multiplier] 71 | self.bias = None 72 | self.built = True 73 | 74 | 75 | def call(self, inputs): 76 | binary_kernel = binarize(self.kernel, H=self.H) 77 | output = K.dot(inputs, binary_kernel) 78 | if self.use_bias: 79 | output = K.bias_add(output, self.bias) 80 | if self.activation is not None: 81 | output = self.activation(output) 82 | return output 83 | 84 | def get_config(self): 85 | config = {'H': self.H, 86 | 'kernel_lr_multiplier': self.kernel_lr_multiplier, 87 | 'bias_lr_multiplier': self.bias_lr_multiplier} 88 | base_config = super(BinaryDense, self).get_config() 89 | return dict(list(base_config.items()) + list(config.items())) 90 | 91 | 92 | class BinaryConv2D(Conv2D): 93 | '''Binarized Convolution2D layer 94 | References: 95 | "BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1" [http://arxiv.org/abs/1602.02830] 96 | ''' 97 | def __init__(self, filters, kernel_lr_multiplier='Glorot', 98 | bias_lr_multiplier=None, H=1., **kwargs): 99 | super(BinaryConv2D, self).__init__(filters, **kwargs) 100 | self.H = H 101 | self.kernel_lr_multiplier = kernel_lr_multiplier 102 | self.bias_lr_multiplier = bias_lr_multiplier 103 | 104 | 105 | def build(self, input_shape): 106 | if self.data_format == 'channels_first': 107 | channel_axis = 1 108 | else: 109 | channel_axis = -1 110 | if input_shape[channel_axis] is None: 111 | raise ValueError('The channel dimension of the inputs ' 112 | 'should be defined. Found `None`.') 113 | 114 | input_dim = input_shape[channel_axis] 115 | kernel_shape = self.kernel_size + (input_dim, self.filters) 116 | 117 | base = self.kernel_size[0] * self.kernel_size[1] 118 | if self.H == 'Glorot': 119 | nb_input = int(input_dim * base) 120 | nb_output = int(self.filters * base) 121 | self.H = np.float32(np.sqrt(1.5 / (nb_input + nb_output))) 122 | #print('Glorot H: {}'.format(self.H)) 123 | 124 | if self.kernel_lr_multiplier == 'Glorot': 125 | nb_input = int(input_dim * base) 126 | nb_output = int(self.filters * base) 127 | self.kernel_lr_multiplier = np.float32(1. 
/ np.sqrt(1.5/ (nb_input + nb_output))) 128 | #print('Glorot learning rate multiplier: {}'.format(self.lr_multiplier)) 129 | 130 | self.kernel_constraint = Clip(-self.H, self.H) 131 | self.kernel_initializer = initializers.RandomUniform(-self.H, self.H) 132 | self.kernel = self.add_weight(shape=kernel_shape, 133 | initializer=self.kernel_initializer, 134 | name='kernel', 135 | regularizer=self.kernel_regularizer, 136 | constraint=self.kernel_constraint) 137 | 138 | if self.use_bias: 139 | self.lr_multipliers = [self.kernel_lr_multiplier, self.bias_lr_multiplier] 140 | self.bias = self.add_weight((self.filters,), 141 | initializer=self.bias_initializer, 142 | name='bias', 143 | regularizer=self.bias_regularizer, 144 | constraint=self.bias_constraint) 145 | 146 | else: 147 | self.lr_multipliers = [self.kernel_lr_multiplier] 148 | self.bias = None 149 | 150 | self.built = True 151 | 152 | def call(self, inputs): 153 | binary_kernel = binarize(self.kernel, H=self.H) 154 | outputs = K.conv2d( 155 | inputs, 156 | binary_kernel, 157 | strides=self.strides, 158 | padding=self.padding, 159 | data_format=self.data_format, 160 | dilation_rate=self.dilation_rate) 161 | 162 | if self.use_bias: 163 | outputs = K.bias_add( 164 | outputs, 165 | self.bias, 166 | data_format=self.data_format) 167 | 168 | if self.activation is not None: 169 | return self.activation(outputs) 170 | return outputs 171 | 172 | def get_config(self): 173 | config = {'H': self.H, 174 | 'kernel_lr_multiplier': self.kernel_lr_multiplier, 175 | 'bias_lr_multiplier': self.bias_lr_multiplier} 176 | base_config = super(BinaryConv2D, self).get_config() 177 | return dict(list(base_config.items()) + list(config.items())) 178 | 179 | 180 | # Aliases 181 | 182 | BinaryConvolution2D = BinaryConv2D 183 | -------------------------------------------------------------------------------- /xnornet/binary_ops.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | import keras.backend as K 4 | 5 | 6 | def round_through(x): 7 | '''Element-wise rounding to the closest integer with full gradient propagation. 8 | A trick from [Sergey Ioffe](http://stackoverflow.com/a/36480182) 9 | ''' 10 | rounded = K.round(x) 11 | return x + K.stop_gradient(rounded - x) 12 | 13 | 14 | def _hard_sigmoid(x): 15 | '''Hard sigmoid different from the more conventional form (see definition of K.hard_sigmoid). 16 | 17 | # Reference: 18 | - [BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Courbariaux et al. 2016](http://arxiv.org/abs/1602.02830) 19 | 20 | ''' 21 | x = (0.5 * x) + 0.5 22 | return K.clip(x, 0, 1) 23 | 24 | 25 | def binary_sigmoid(x): 26 | '''Binary hard sigmoid for training binarized neural network. 27 | 28 | # Reference: 29 | - [BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Courbariaux et al. 2016](http://arxiv.org/abs/1602.02830) 30 | 31 | ''' 32 | return round_through(_hard_sigmoid(x)) 33 | 34 | 35 | def binary_tanh(x): 36 | '''Binary hard tanh for training binarized neural network. 37 | The neurons' activations binarization function 38 | It behaves like the sign function during forward propagation 39 | And like: 40 | hard_tanh(x) = 2 * _hard_sigmoid(x) - 1 41 | clear gradient when |x| > 1 during back propagation 42 | 43 | # Reference: 44 | - [BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Courbariaux et al. 
2016](http://arxiv.org/abs/1602.02830) 45 | 46 | ''' 47 | return 2 * round_through(_hard_sigmoid(x)) - 1 48 | 49 | 50 | def binarize(W, H=1): 51 | '''The weights' binarization function. 52 | 53 | # Reference: 54 | - [BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Courbariaux et al. 2016](http://arxiv.org/abs/1602.02830) 55 | 56 | ''' 57 | # [-H, H] -> -H or H 58 | Wb = H * binary_tanh(W / H) 59 | return Wb 60 | 61 | 62 | def _mean_abs(x, axis=None, keepdims=False): 63 | return K.stop_gradient(K.mean(K.abs(x), axis=axis, keepdims=keepdims)) 64 | 65 | 66 | def xnorize(W, H=1., axis=None, keepdims=False): 67 | Wb = binarize(W, H) 68 | Wa = _mean_abs(W, axis, keepdims) 69 | 70 | return Wa, Wb 71 | -------------------------------------------------------------------------------- /xnornet/mnist_cnn.py: -------------------------------------------------------------------------------- 1 | '''Trains a simple xnor CNN on the MNIST dataset. 2 | Modified from keras' examples/mnist_mlp.py 3 | Gets to 98.18% test accuracy after 20 epochs using tensorflow backend 4 | ''' 5 | 6 | from __future__ import print_function 7 | import numpy as np 8 | np.random.seed(1337) # for reproducibility 9 | 10 | from keras.datasets import mnist 11 | from keras.models import Sequential 12 | from keras.layers import Dense, Dropout, Activation, BatchNormalization, MaxPooling2D 13 | from keras.layers import Flatten 14 | from keras.optimizers import SGD, Adam, RMSprop 15 | from keras.callbacks import LearningRateScheduler 16 | from keras.utils import np_utils 17 | import keras.backend as K 18 | K.set_image_data_format('channels_first') 19 | 20 | 21 | from binary_ops import binary_tanh as binary_tanh_op 22 | from xnor_layers import XnorDense, XnorConv2D 23 | 24 | 25 | H = 1. 26 | kernel_lr_multiplier = 'Glorot' 27 | 28 | # nn 29 | batch_size = 50 30 | epochs = 20 31 | nb_channel = 1 32 | img_rows = 28 33 | img_cols = 28 34 | nb_filters = 32 35 | nb_conv = 3 36 | nb_pool = 2 37 | nb_hid = 128 38 | nb_classes = 10 39 | use_bias = False 40 | 41 | # learning rate schedule 42 | lr_start = 1e-3 43 | lr_end = 1e-4 44 | lr_decay = (lr_end / lr_start)**(1. 
/ epochs) 45 | 46 | # BN 47 | epsilon = 1e-6 48 | momentum = 0.9 49 | 50 | # dropout 51 | p1 = 0.25 52 | p2 = 0.5 53 | 54 | # the data, shuffled and split between train and test sets 55 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 56 | 57 | X_train = X_train.reshape(60000, 1, 28, 28) 58 | X_test = X_test.reshape(10000, 1, 28, 28) 59 | X_train = X_train.astype('float32') 60 | X_test = X_test.astype('float32') 61 | X_train /= 255 62 | X_test /= 255 63 | print(X_train.shape[0], 'train samples') 64 | print(X_test.shape[0], 'test samples') 65 | 66 | # convert class vectors to binary class matrices 67 | Y_train = np_utils.to_categorical(y_train, nb_classes) * 2 - 1 # -1 or 1 for hinge loss 68 | Y_test = np_utils.to_categorical(y_test, nb_classes) * 2 - 1 69 | 70 | 71 | model = Sequential() 72 | # conv1 73 | model.add(XnorConv2D(128, kernel_size=(3, 3), input_shape=(nb_channel, img_rows, img_cols), 74 | H=H, kernel_lr_multiplier=kernel_lr_multiplier, 75 | padding='same', use_bias=use_bias, name='conv1')) 76 | model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, axis=1, name='bn1')) 77 | model.add(Activation('relu', name='act1')) 78 | # conv2 79 | model.add(XnorConv2D(128, kernel_size=(3, 3), H=H, kernel_lr_multiplier=kernel_lr_multiplier, 80 | padding='same', use_bias=use_bias, name='conv2')) 81 | model.add(MaxPooling2D(pool_size=(2, 2), name='pool2')) 82 | model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, axis=1, name='bn2')) 83 | model.add(Activation('relu', name='act2')) 84 | # conv3 85 | model.add(XnorConv2D(256, kernel_size=(3, 3), H=H, kernel_lr_multiplier=kernel_lr_multiplier, 86 | padding='same', use_bias=use_bias, name='conv3')) 87 | model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, axis=1, name='bn3')) 88 | model.add(Activation('relu', name='act3')) 89 | # conv4 90 | model.add(XnorConv2D(256, kernel_size=(3, 3), H=H, kernel_lr_multiplier=kernel_lr_multiplier, 91 | padding='same', use_bias=use_bias, name='conv4')) 92 | model.add(MaxPooling2D(pool_size=(2, 2), name='pool4')) 93 | model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, axis=1, name='bn4')) 94 | model.add(Activation('relu', name='act4')) 95 | model.add(Flatten()) 96 | # dense1 97 | model.add(XnorDense(1024, H=H, kernel_lr_multiplier=kernel_lr_multiplier, use_bias=use_bias, name='dense5')) 98 | model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, name='bn5')) 99 | model.add(Activation('relu', name='act5')) 100 | # dense2 101 | model.add(XnorDense(nb_classes, H=H, kernel_lr_multiplier=kernel_lr_multiplier, use_bias=use_bias, name='dense6')) 102 | model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, name='bn6')) 103 | 104 | opt = Adam(lr=lr_start) 105 | model.compile(loss='squared_hinge', optimizer=opt, metrics=['acc']) 106 | model.summary() 107 | 108 | lr_scheduler = LearningRateScheduler(lambda e: lr_start * lr_decay ** e) 109 | history = model.fit(X_train, Y_train, 110 | batch_size=batch_size, epochs=epochs, 111 | verbose=1, validation_data=(X_test, Y_test), 112 | callbacks=[lr_scheduler]) 113 | score = model.evaluate(X_test, Y_test, verbose=0) 114 | print('Test score:', score[0]) 115 | print('Test accuracy:', score[1]) 116 | -------------------------------------------------------------------------------- /xnornet/mnist_mlp.py: -------------------------------------------------------------------------------- 1 | '''Trains a simple xnor fully connected NN on the MNIST dataset. 
2 | Modified from keras' examples/mnist_mlp.py 3 | Gets to 97.41% test accuracy after 20 epochs using tensorflow backend 4 | ''' 5 | 6 | 7 | from __future__ import print_function 8 | import numpy as np 9 | np.random.seed(1337) # for reproducibility 10 | 11 | import keras.backend as K 12 | from keras.datasets import mnist 13 | from keras.models import Sequential 14 | from keras.layers import Dense, Dropout, Activation, BatchNormalization 15 | from keras.optimizers import SGD, Adam, RMSprop 16 | from keras.callbacks import LearningRateScheduler 17 | from keras.utils import np_utils 18 | 19 | from xnor_layers import XnorDense 20 | 21 | 22 | batch_size = 100 23 | epochs = 20 24 | classes = 10 25 | 26 | H = 'Glorot' 27 | kernel_lr_multiplier = 'Glorot' 28 | 29 | # network 30 | num_unit = 2048 31 | num_hidden = 3 32 | use_bias = False 33 | 34 | # learning rate schedule 35 | lr_start = 1e-3 36 | lr_end = 1e-4 37 | lr_decay = (lr_end / lr_start)**(1. / epochs) 38 | 39 | # BN 40 | epsilon = 1e-6 41 | momentum = 0.9 42 | 43 | # dropout 44 | drop_in = 0 #0.2 45 | drop_hidden = 0# 0.5 46 | 47 | # the data, shuffled and split between train and test sets 48 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 49 | 50 | X_train = X_train.reshape(60000, 784) 51 | X_test = X_test.reshape(10000, 784) 52 | X_train = X_train.astype('float32') 53 | X_test = X_test.astype('float32') 54 | X_train /= 255 55 | X_test /= 255 56 | print(X_train.shape[0], 'train samples') 57 | print(X_test.shape[0], 'test samples') 58 | 59 | # convert class vectors to binary class matrices 60 | Y_train = np_utils.to_categorical(y_train, classes) * 2 - 1 # -1 or 1 for hinge loss 61 | Y_test = np_utils.to_categorical(y_test, classes) * 2 - 1 62 | 63 | model = Sequential() 64 | model.add(Dropout(drop_in, input_shape=(784,), name='drop0')) 65 | for i in range(num_hidden): 66 | model.add(XnorDense(num_unit, H=H, kernel_lr_multiplier=kernel_lr_multiplier, use_bias=use_bias, 67 | name='dense{}'.format(i+1))) 68 | model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, name='bn{}'.format(i+1))) 69 | model.add(Activation('relu', name='act{}'.format(i+1))) 70 | model.add(Dropout(drop_hidden, name='drop{}'.format(i+1))) 71 | model.add(XnorDense(10, H=H, kernel_lr_multiplier=kernel_lr_multiplier, use_bias=use_bias, 72 | name='dense')) 73 | model.add(BatchNormalization(epsilon=epsilon, momentum=momentum, name='bn')) 74 | 75 | model.summary() 76 | 77 | opt = Adam(lr=lr_start) 78 | model.compile(loss='squared_hinge', optimizer=opt, metrics=['acc']) 79 | 80 | lr_scheduler = LearningRateScheduler(lambda e: lr_start * lr_decay ** e) 81 | history = model.fit(X_train, Y_train, 82 | batch_size=batch_size, epochs=epochs, 83 | verbose=1, validation_data=(X_test, Y_test), 84 | callbacks=[lr_scheduler]) 85 | score = model.evaluate(X_test, Y_test, verbose=0) 86 | print('Test score:', score[0]) 87 | print('Test accuracy:', score[1]) 88 | -------------------------------------------------------------------------------- /xnornet/xnor_layers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | from keras import backend as K 4 | from binary_ops import xnorize 5 | 6 | from binary_layers import BinaryDense, BinaryConv2D 7 | 8 | 9 | class XnorDense(BinaryDense): 10 | '''XNOR Dense layer 11 | References: 12 | - [XNOR-Net: ImageNet Classification Using Binary Convolutional Neural Networks](http://arxiv.org/abs/1603.05279) 13 | ''' 14 | def call(self, inputs, mask=None): 
15 | inputs_a, inputs_b = xnorize(inputs, 1., axis=1, keepdims=True) # (nb_sample, 1) 16 | kernel_a, kernel_b = xnorize(self.kernel, self.H, axis=0, keepdims=True) # (1, units) 17 | output = K.dot(inputs_b, kernel_b) * kernel_a * inputs_a 18 | if self.use_bias: 19 | output = K.bias_add(output, self.bias) 20 | if self.activation is not None: 21 | output = self.activation(output) 22 | return output 23 | 24 | 25 | class XnorConv2D(BinaryConv2D): 26 | '''XNOR Conv2D layer 27 | References: 28 | - [XNOR-Net: ImageNet Classification Using Binary Convolutional Neural Networks](http://arxiv.org/abs/1603.05279) 29 | ''' 30 | def call(self, inputs): 31 | _, kernel_b = xnorize(self.kernel, self.H) 32 | _, inputs_b = xnorize(inputs) 33 | outputs = K.conv2d(inputs_b, kernel_b, strides=self.strides, 34 | padding=self.padding, 35 | data_format=self.data_format, 36 | dilation_rate=self.dilation_rate) 37 | 38 | # calculate Wa and xa 39 | 40 | # kernel_a 41 | mask = K.reshape(self.kernel, (-1, self.filters)) # self.nb_row * self.nb_col * channels, filters 42 | kernel_a = K.stop_gradient(K.mean(K.abs(mask), axis=0)) # filters 43 | 44 | # inputs_a 45 | if self.data_format == 'channels_first': 46 | channel_axis = 1 47 | else: 48 | channel_axis = -1 49 | mask = K.mean(K.abs(inputs), axis=channel_axis, keepdims=True) 50 | ones = K.ones(self.kernel_size + (1, 1)) 51 | inputs_a = K.conv2d(mask, ones, strides=self.strides, 52 | padding=self.padding, 53 | data_format=self.data_format, 54 | dilation_rate=self.dilation_rate) # nb_sample, 1, new_nb_row, new_nb_col 55 | if self.data_format == 'channels_first': 56 | outputs = outputs * K.stop_gradient(inputs_a) * K.expand_dims(K.expand_dims(K.expand_dims(kernel_a, 0), -1), -1) 57 | else: 58 | outputs = outputs * K.stop_gradient(inputs_a) * K.expand_dims(K.expand_dims(K.expand_dims(kernel_a, 0), 0), 0) 59 | 60 | if self.use_bias: 61 | outputs = K.bias_add( 62 | outputs, 63 | self.bias, 64 | data_format=self.data_format) 65 | 66 | if self.activation is not None: 67 | return self.activation(outputs) 68 | return outputs 69 | 70 | 71 | # Aliases 72 | 73 | XnorConvolution2D = XnorConv2D 74 | --------------------------------------------------------------------------------
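A note on the XNOR approximation implemented in xnornet/binary_ops.py and xnornet/xnor_layers.py: the forward pass of `XnorDense` is a binary matrix product rescaled by two mean-absolute-value factors, one per output unit and one per sample. The following is a minimal NumPy sketch of that arithmetic (illustrative only, not part of the repository; it assumes H=1 and omits the straight-through gradient trick, bias and activation):

```python
import numpy as np

def xnorize_np(w, axis=None, keepdims=False):
    # forward pass of binary_ops.xnorize with H = 1: a sign (+1/-1) tensor
    # plus a mean-absolute-value scaling factor
    wb = np.where(w >= 0, 1.0, -1.0)
    wa = np.abs(w).mean(axis=axis, keepdims=keepdims)
    return wa, wb

x = np.random.randn(4, 8)    # a batch of 4 inputs
W = np.random.randn(8, 3)    # a full-precision dense kernel

xa, xb = xnorize_np(x, axis=1, keepdims=True)   # per-sample scale, shape (4, 1)
Wa, Wb = xnorize_np(W, axis=0, keepdims=True)   # per-unit scale, shape (1, 3)

approx = xb.dot(Wb) * Wa * xa   # what XnorDense.call computes (bias/activation omitted)
exact = x.dot(W)                # full-precision reference
print(np.mean(np.abs(approx - exact)))
```

The two scale factors are what distinguish XNOR-Net from plain BinaryNet: the binary product `xb.dot(Wb)` is cheap, and `Wa`/`xa` recover most of the dynamic range lost to binarization.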
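The sign conventions in wgan/wgan_cnn.py and wgan/wgan_mlp.py are easy to misread: with `wasserstein(y_true, y_pred) = K.mean(y_true * y_pred)`, real images are labelled -1 and generated images +1 for the critic, and the generator is trained against a target of -1 through the frozen critic. A small NumPy sketch of that bookkeeping (illustrative only, with made-up critic scores):

```python
import numpy as np

def wasserstein(y_true, y_pred):
    # same form as the Keras loss used in the wgan scripts
    return np.mean(y_true * y_pred)

scores_real = np.array([0.3, 0.5])    # hypothetical critic outputs on real images
scores_fake = np.array([-0.2, 0.1])   # hypothetical critic outputs on generated images

# critic step: real -> -1, fake -> +1, so for equal-sized batches the loss equals
# (mean fake score - mean real score) / 2; minimizing it pushes real scores up and
# fake scores down, and the scripts record -loss as Loss_C
y = np.concatenate([-np.ones_like(scores_real), np.ones_like(scores_fake)])
s = np.concatenate([scores_real, scores_fake])
print(wasserstein(y, s), 0.5 * (scores_fake.mean() - scores_real.mean()))  # both -0.225

# generator step: a target of -1 on generated images rewards raising their critic scores
print(wasserstein(-np.ones_like(scores_fake), scores_fake))
```

Weight clipping (`clip_weights`) is what keeps the critic approximately Lipschitz-bounded so that this score gap approximates the Wasserstein distance, which is why the scripts run many more critic iterations than generator updates early in training.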