├── README.md
└── ZoneoutLSTMCell.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# zoneout-tensorflow

An implementation of the zoneout regularizer for LSTM-RNNs in TensorFlow.
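A minimal usage sketch (not part of the original repository), assuming the TF v0.12-era `tf.nn.dynamic_rnn` API; the sizes and zoneout factors below are illustrative placeholders:

```python
import tensorflow as tf
from ZoneoutLSTMCell import ZoneoutLSTMCell

batch_size, max_steps, input_dim, num_units = 32, 20, 100, 128
inputs = tf.placeholder(tf.float32, [batch_size, max_steps, input_dim])

# zoneout_factor_* give the probability of keeping the previous
# cell/hidden state at each time step during training.
cell = ZoneoutLSTMCell(num_units, is_training=True,
                       zoneout_factor_cell=0.1,
                       zoneout_factor_output=0.1)

outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
```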
--------------------------------------------------------------------------------
/ZoneoutLSTMCell.py:
--------------------------------------------------------------------------------
# Copyright (C) 2017 by Akira TAMAMORI
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.

# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Notice:
# This file is tested on TensorFlow v0.12.0 only.

import numpy as np
import tensorflow as tf
from tensorflow.python.ops.rnn_cell import RNNCell


# Thanks to 'initializers_enhanced.py' of Project RNN Enhancement:
# https://github.com/nicolas-ivanov/Seq2Seq_Upgrade_TensorFlow/blob/master/rnn_enhancement/initializers_enhanced.py
def orthogonal_initializer(scale=1.0):
    def _initializer(shape, dtype=tf.float32):
        flat_shape = (shape[0], np.prod(shape[1:]))
        a = np.random.normal(0.0, 1.0, flat_shape)
        u, _, v = np.linalg.svd(a, full_matrices=False)
        q = u if u.shape == flat_shape else v
        q = q.reshape(shape)
        return tf.constant(scale * q[:shape[0], :shape[1]], dtype=dtype)
    return _initializer


class ZoneoutLSTMCell(RNNCell):
    """Zoneout Regularization for LSTM-RNN."""

    def __init__(self, num_units, is_training, input_size=None,
                 use_peepholes=False, cell_clip=None,
                 initializer=orthogonal_initializer(),
                 num_proj=None, proj_clip=None,
                 forget_bias=1.0,
                 state_is_tuple=True,
                 activation=tf.tanh,
                 zoneout_factor_cell=0.0,
                 zoneout_factor_output=0.0,
                 reuse=None):
        """Initialize the parameters for an LSTM cell.

        Args:
            num_units: int, The number of units in the LSTM cell.
            is_training: bool, set True when training.
            use_peepholes: bool, set True to enable diagonal/peephole
                connections.
            cell_clip: (optional) A float value, if provided the cell state
                is clipped by this value prior to the cell output activation.
            initializer: (optional) The initializer to use for the weight
                matrices.
            num_proj: (optional) int, The output dimensionality for
                the projection matrices. If None, no projection is performed.
            forget_bias: Biases of the forget gate are initialized by default
                to 1 in order to reduce the scale of forgetting at the
                beginning of the training.
            activation: Activation function of the inner states.
            zoneout_factor_cell: float in [0, 1], probability of preserving
                the previous cell state at each time step.
            zoneout_factor_output: float in [0, 1], probability of preserving
                the previous hidden state at each time step.
        """
        if not state_is_tuple:
            tf.logging.warn(
                "%s: Using a concatenated state is slower and will soon be "
                "deprecated. Use state_is_tuple=True.", self)
        if input_size is not None:
            tf.logging.warn(
                "%s: The input_size parameter is deprecated.", self)

        if not (zoneout_factor_cell >= 0.0 and zoneout_factor_cell <= 1.0):
            raise ValueError(
                "Parameter zoneout_factor_cell must be in [0, 1]")

        if not (zoneout_factor_output >= 0.0 and zoneout_factor_output <= 1.0):
            raise ValueError(
                "Parameter zoneout_factor_output must be in [0, 1]")

        self.num_units = num_units
        self.is_training = is_training
        self.use_peepholes = use_peepholes
        self.cell_clip = cell_clip
        self.num_proj = num_proj
        self.proj_clip = proj_clip
        self.initializer = initializer
        self.forget_bias = forget_bias
        self.state_is_tuple = state_is_tuple
        self.activation = activation
        self.zoneout_factor_cell = zoneout_factor_cell
        self.zoneout_factor_output = zoneout_factor_output

        if num_proj:
            self._state_size = (
                tf.nn.rnn_cell.LSTMStateTuple(num_units, num_proj)
                if state_is_tuple else num_units + num_proj)
            self._output_size = num_proj
        else:
            self._state_size = (
                tf.nn.rnn_cell.LSTMStateTuple(num_units, num_units)
                if state_is_tuple else 2 * num_units)
            self._output_size = num_units

    @property
    def state_size(self):
        return self._state_size

    @property
    def output_size(self):
        return self._output_size

    def __call__(self, inputs, state, scope=None):

        num_proj = self.num_units if self.num_proj is None else self.num_proj

        if self.state_is_tuple:
            (c_prev, h_prev) = state
        else:
            c_prev = tf.slice(state, [0, 0], [-1, self.num_units])
            h_prev = tf.slice(state, [0, self.num_units], [-1, num_proj])

        # c_prev : Tensor with the size of [batch_size, num_units]
        # h_prev : Tensor with the size of [batch_size, num_proj]
        #          (num_proj equals num_units when no projection is used)

        dtype = inputs.dtype
        input_size = inputs.get_shape().with_rank(2)[1]

        with tf.variable_scope(scope or type(self).__name__):
            if input_size.value is None:
                raise ValueError(
                    "Could not infer input size from inputs.get_shape()[-1]")

            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            lstm_matrix = _linear([inputs, h_prev], 4 * self.num_units, True)
            i, j, f, o = tf.split(1, 4, lstm_matrix)

            # diagonal connections
            if self.use_peepholes:
                w_f_diag = tf.get_variable(
                    "W_F_diag", shape=[self.num_units], dtype=dtype)
                w_i_diag = tf.get_variable(
                    "W_I_diag", shape=[self.num_units], dtype=dtype)
                w_o_diag = tf.get_variable(
                    "W_O_diag", shape=[self.num_units], dtype=dtype)

            with tf.name_scope(None, "zoneout"):
                # make binary mask tensor for cell
                keep_prob_cell = tf.convert_to_tensor(
                    self.zoneout_factor_cell,
                    dtype=c_prev.dtype
                )
                random_tensor_cell = keep_prob_cell
                random_tensor_cell += \
                    tf.random_uniform(tf.shape(c_prev),
                                      seed=None, dtype=c_prev.dtype)
                binary_mask_cell = tf.floor(random_tensor_cell)
                # 0 <-> 1 swap
                binary_mask_cell_complement = tf.ones(tf.shape(c_prev)) \
                    - binary_mask_cell

                # make binary mask tensor for output
                keep_prob_output = tf.convert_to_tensor(
                    self.zoneout_factor_output,
                    dtype=h_prev.dtype
                )
                random_tensor_output = keep_prob_output
                random_tensor_output += \
                    tf.random_uniform(tf.shape(h_prev),
                                      seed=None, dtype=h_prev.dtype)
                binary_mask_output = tf.floor(random_tensor_output)
                # 0 <-> 1 swap
                binary_mask_output_complement = tf.ones(tf.shape(h_prev)) \
                    - binary_mask_output

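            # How the zoneout masks are used below: each mask entry equals
            # floor(zoneout_factor + U[0, 1)), i.e. it is 1 with probability
            # zoneout_factor and 0 otherwise. A unit whose mask is 1 "zones
            # out" and carries its previous state over unchanged, while the
            # remaining units take the freshly computed LSTM update. When
            # is_training is False or the factor is 0, the masks are skipped
            # and the ordinary LSTM update is applied everywhere.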
            # apply zoneout for cell
            if self.use_peepholes:
                c_temp = c_prev * \
                    tf.sigmoid(f + self.forget_bias +
                               w_f_diag * c_prev) + \
                    tf.sigmoid(i + w_i_diag * c_prev) * \
                    self.activation(j)
                if self.is_training and self.zoneout_factor_cell > 0.0:
                    c = binary_mask_cell * c_prev + \
                        binary_mask_cell_complement * c_temp
                else:
                    c = c_temp
            else:
                c_temp = c_prev * tf.sigmoid(f + self.forget_bias) + \
                    tf.sigmoid(i) * self.activation(j)
                if self.is_training and self.zoneout_factor_cell > 0.0:
                    c = binary_mask_cell * c_prev + \
                        binary_mask_cell_complement * c_temp
                else:
                    c = c_temp

            if self.cell_clip is not None:
                c = tf.clip_by_value(c, -self.cell_clip, self.cell_clip)

            # apply zoneout for output
            if self.use_peepholes:
                h_temp = tf.sigmoid(o + w_o_diag * c) * self.activation(c)
                if self.is_training and self.zoneout_factor_output > 0.0:
                    h = binary_mask_output * h_prev + \
                        binary_mask_output_complement * h_temp
                else:
                    h = h_temp
            else:
                h_temp = tf.sigmoid(o) * self.activation(c)
                if self.is_training and self.zoneout_factor_output > 0.0:
                    h = binary_mask_output * h_prev + \
                        binary_mask_output_complement * h_temp
                else:
                    h = h_temp

            # apply projection
            if self.num_proj is not None:
                w_proj = tf.get_variable(
                    "W_P", [self.num_units, num_proj], dtype=dtype)

                h = tf.matmul(h, w_proj)
                if self.proj_clip is not None:
                    h = tf.clip_by_value(h, -self.proj_clip, self.proj_clip)

        new_state = (tf.nn.rnn_cell.LSTMStateTuple(c, h)
                     if self.state_is_tuple else tf.concat(1, [c, h]))

        return h, new_state


def _linear(args, output_size, bias, bias_start=0.0, scope=None):
    """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.

    Args:
        args: a 2D Tensor or a list of 2D, batch x n, Tensors.
        output_size: int, second dimension of W[i].
        bias: boolean, whether to add a bias term or not.
        bias_start: starting value to initialize the bias; 0 by default.
        scope: VariableScope for the created subgraph; defaults to "Linear".

    Returns:
        A 2D Tensor with shape [batch x output_size] equal to
        sum_i(args[i] * W[i]), where W[i]s are newly created matrices.

    Raises:
        ValueError: if some of the arguments have an unspecified or wrong shape.
    """
    if args is None or (isinstance(args, (list, tuple)) and not args):
        raise ValueError("`args` must be specified")
    if not isinstance(args, (list, tuple)):
        args = [args]

    # Calculate the total size of arguments on dimension 1.
    total_arg_size = 0
    shapes = [a.get_shape().as_list() for a in args]
    for shape in shapes:
        if len(shape) != 2:
            raise ValueError(
                "Linear is expecting 2D arguments: %s" % str(shapes))
        if not shape[1]:
            raise ValueError(
                "Linear expects shape[1] of arguments: %s" % str(shapes))
        else:
            total_arg_size += shape[1]

    # Now the computation.
    with tf.variable_scope(scope or "Linear"):
        matrix = tf.get_variable("Matrix", [total_arg_size, output_size])
        if len(args) == 1:
            res = tf.matmul(args[0], matrix)
        else:
            res = tf.matmul(tf.concat(1, args), matrix)
        if not bias:
            return res
        bias_term = tf.get_variable(
            "Bias", [output_size],
            initializer=tf.constant_initializer(bias_start))
        return res + bias_term
--------------------------------------------------------------------------------