├── README.md
└── ZoneoutLSTMCell.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# zoneout-tensorflow

An implementation of the zoneout regularizer for LSTM-RNNs in TensorFlow.
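A minimal usage sketch (not part of the original repository), assuming the TF v0.12-era `tf.nn.dynamic_rnn` API; the sizes and zoneout factors below are illustrative placeholders:

```python
import tensorflow as tf
from ZoneoutLSTMCell import ZoneoutLSTMCell

batch_size, max_steps, input_dim, num_units = 32, 20, 100, 128
inputs = tf.placeholder(tf.float32, [batch_size, max_steps, input_dim])

# zoneout_factor_* give the probability of keeping the previous
# cell/hidden state at each time step during training.
cell = ZoneoutLSTMCell(num_units, is_training=True,
                       zoneout_factor_cell=0.1,
                       zoneout_factor_output=0.1)

outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
```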
--------------------------------------------------------------------------------
/ZoneoutLSTMCell.py:
--------------------------------------------------------------------------------
# Copyright (C) 2017 by Akira TAMAMORI
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.

# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Notice:
# This file is tested on TensorFlow v0.12.0 only.

import numpy as np
import tensorflow as tf
from tensorflow.python.ops.rnn_cell import RNNCell


# Thanks to 'initializers_enhanced.py' of Project RNN Enhancement:
# https://github.com/nicolas-ivanov/Seq2Seq_Upgrade_TensorFlow/blob/master/rnn_enhancement/initializers_enhanced.py
def orthogonal_initializer(scale=1.0):
    def _initializer(shape, dtype=tf.float32):
        flat_shape = (shape[0], np.prod(shape[1:]))
        a = np.random.normal(0.0, 1.0, flat_shape)
        u, _, v = np.linalg.svd(a, full_matrices=False)
        q = u if u.shape == flat_shape else v
        q = q.reshape(shape)
        return tf.constant(scale * q[:shape[0], :shape[1]], dtype=dtype)
    return _initializer


class ZoneoutLSTMCell(RNNCell):
    """Zoneout Regularization for LSTM-RNN."""

    def __init__(self, num_units, is_training, input_size=None,
                 use_peepholes=False, cell_clip=None,
                 initializer=orthogonal_initializer(),
                 num_proj=None, proj_clip=None,
                 forget_bias=1.0,
                 state_is_tuple=True,
                 activation=tf.tanh,
                 zoneout_factor_cell=0.0,
                 zoneout_factor_output=0.0,
                 reuse=None):
        """Initialize the parameters for an LSTM cell.

        Args:
            num_units: int, The number of units in the LSTM cell.
            is_training: bool, set True when training.
            use_peepholes: bool, set True to enable diagonal/peephole
                connections.
            cell_clip: (optional) A float value, if provided the cell state
                is clipped by this value prior to the cell output activation.
            initializer: (optional) The initializer to use for the weight
                matrices.
            num_proj: (optional) int, The output dimensionality for
                the projection matrices. If None, no projection is performed.
            forget_bias: Biases of the forget gate are initialized by default
                to 1 in order to reduce the scale of forgetting at the
                beginning of the training.
            activation: Activation function of the inner states.
            zoneout_factor_cell: float in [0, 1], probability of preserving
                the previous cell state at each time step.
            zoneout_factor_output: float in [0, 1], probability of preserving
                the previous hidden state at each time step.
        """
        if not state_is_tuple:
            tf.logging.warn(
                "%s: Using a concatenated state is slower and will soon be "
                "deprecated. Use state_is_tuple=True.", self)
        if input_size is not None:
            tf.logging.warn(
                "%s: The input_size parameter is deprecated.", self)

        if not (zoneout_factor_cell >= 0.0 and zoneout_factor_cell <= 1.0):
            raise ValueError(
                "Parameter zoneout_factor_cell must be in [0, 1]")

        if not (zoneout_factor_output >= 0.0 and zoneout_factor_output <= 1.0):
            raise ValueError(
                "Parameter zoneout_factor_output must be in [0, 1]")

        self.num_units = num_units
        self.is_training = is_training
        self.use_peepholes = use_peepholes
        self.cell_clip = cell_clip
        self.num_proj = num_proj
        self.proj_clip = proj_clip
        self.initializer = initializer
        self.forget_bias = forget_bias
        self.state_is_tuple = state_is_tuple
        self.activation = activation
        self.zoneout_factor_cell = zoneout_factor_cell
        self.zoneout_factor_output = zoneout_factor_output

        if num_proj:
            self._state_size = (
                tf.nn.rnn_cell.LSTMStateTuple(num_units, num_proj)
                if state_is_tuple else num_units + num_proj)
            self._output_size = num_proj
        else:
            self._state_size = (
                tf.nn.rnn_cell.LSTMStateTuple(num_units, num_units)
                if state_is_tuple else 2 * num_units)
            self._output_size = num_units

    @property
    def state_size(self):
        return self._state_size

    @property
    def output_size(self):
        return self._output_size

    def __call__(self, inputs, state, scope=None):

        num_proj = self.num_units if self.num_proj is None else self.num_proj

        if self.state_is_tuple:
            (c_prev, h_prev) = state
        else:
            c_prev = tf.slice(state, [0, 0], [-1, self.num_units])
            h_prev = tf.slice(state, [0, self.num_units], [-1, num_proj])

        # c_prev : Tensor with the size of [batch_size, num_units]
        # h_prev : Tensor with the size of [batch_size, num_proj]
        #          (num_proj equals num_units when no projection is used)

        dtype = inputs.dtype
        input_size = inputs.get_shape().with_rank(2)[1]

        with tf.variable_scope(scope or type(self).__name__):
            if input_size.value is None:
                raise ValueError(
                    "Could not infer input size from inputs.get_shape()[-1]")

            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            lstm_matrix = _linear([inputs, h_prev], 4 * self.num_units, True)
            i, j, f, o = tf.split(1, 4, lstm_matrix)

            # diagonal connections
            if self.use_peepholes:
                w_f_diag = tf.get_variable(
                    "W_F_diag", shape=[self.num_units], dtype=dtype)
                w_i_diag = tf.get_variable(
                    "W_I_diag", shape=[self.num_units], dtype=dtype)
                w_o_diag = tf.get_variable(
                    "W_O_diag", shape=[self.num_units], dtype=dtype)

            with tf.name_scope(None, "zoneout"):
                # make binary mask tensor for cell
                keep_prob_cell = tf.convert_to_tensor(
                    self.zoneout_factor_cell,
                    dtype=c_prev.dtype
                )
                random_tensor_cell = keep_prob_cell
                random_tensor_cell += \
                    tf.random_uniform(tf.shape(c_prev),
                                      seed=None, dtype=c_prev.dtype)
                binary_mask_cell = tf.floor(random_tensor_cell)
                # 0 <-> 1 swap
                binary_mask_cell_complement = tf.ones(tf.shape(c_prev)) \
                    - binary_mask_cell

                # make binary mask tensor for output
                keep_prob_output = tf.convert_to_tensor(
                    self.zoneout_factor_output,
                    dtype=h_prev.dtype
                )
                random_tensor_output = keep_prob_output
                random_tensor_output += \
                    tf.random_uniform(tf.shape(h_prev),
                                      seed=None, dtype=h_prev.dtype)
                binary_mask_output = tf.floor(random_tensor_output)
                # 0 <-> 1 swap
                binary_mask_output_complement = tf.ones(tf.shape(h_prev)) \
                    - binary_mask_output

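            # How the zoneout masks are used below: each mask entry equals
            # floor(zoneout_factor + U[0, 1)), i.e. it is 1 with probability
            # zoneout_factor and 0 otherwise. A unit whose mask is 1 "zones
            # out" and carries its previous state over unchanged, while the
            # remaining units take the freshly computed LSTM update. When
            # is_training is False or the factor is 0, the masks are skipped
            # and the ordinary LSTM update is applied everywhere.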
            # apply zoneout for cell
            if self.use_peepholes:
                c_temp = c_prev * \
                    tf.sigmoid(f + self.forget_bias +
                               w_f_diag * c_prev) + \
                    tf.sigmoid(i + w_i_diag * c_prev) * \
                    self.activation(j)
                if self.is_training and self.zoneout_factor_cell > 0.0:
                    c = binary_mask_cell * c_prev + \
                        binary_mask_cell_complement * c_temp
                else:
                    c = c_temp
            else:
                c_temp = c_prev * tf.sigmoid(f + self.forget_bias) + \
                    tf.sigmoid(i) * self.activation(j)
                if self.is_training and self.zoneout_factor_cell > 0.0:
                    c = binary_mask_cell * c_prev + \
                        binary_mask_cell_complement * c_temp
                else:
                    c = c_temp

            if self.cell_clip is not None:
                c = tf.clip_by_value(c, -self.cell_clip, self.cell_clip)

            # apply zoneout for output
            if self.use_peepholes:
                h_temp = tf.sigmoid(o + w_o_diag * c) * self.activation(c)
                if self.is_training and self.zoneout_factor_output > 0.0:
                    h = binary_mask_output * h_prev + \
                        binary_mask_output_complement * h_temp
                else:
                    h = h_temp
            else:
                h_temp = tf.sigmoid(o) * self.activation(c)
                if self.is_training and self.zoneout_factor_output > 0.0:
                    h = binary_mask_output * h_prev + \
                        binary_mask_output_complement * h_temp
                else:
                    h = h_temp

            # apply projection
            if self.num_proj is not None:
                w_proj = tf.get_variable(
                    "W_P", [self.num_units, num_proj], dtype=dtype)

                h = tf.matmul(h, w_proj)
                if self.proj_clip is not None:
                    h = tf.clip_by_value(h, -self.proj_clip, self.proj_clip)

        new_state = (tf.nn.rnn_cell.LSTMStateTuple(c, h)
                     if self.state_is_tuple else tf.concat(1, [c, h]))

        return h, new_state


def _linear(args, output_size, bias, bias_start=0.0, scope=None):
    """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.

    Args:
        args: a 2D Tensor or a list of 2D, batch x n, Tensors.
        output_size: int, second dimension of W[i].
        bias: boolean, whether to add a bias term or not.
        bias_start: starting value to initialize the bias; 0 by default.
        scope: VariableScope for the created subgraph; defaults to "Linear".

    Returns:
        A 2D Tensor with shape [batch x output_size] equal to
        sum_i(args[i] * W[i]), where W[i]s are newly created matrices.

    Raises:
        ValueError: if some of the arguments have an unspecified or wrong shape.
    """
    if args is None or (isinstance(args, (list, tuple)) and not args):
        raise ValueError("`args` must be specified")
    if not isinstance(args, (list, tuple)):
        args = [args]

    # Calculate the total size of arguments on dimension 1.
    total_arg_size = 0
    shapes = [a.get_shape().as_list() for a in args]
    for shape in shapes:
        if len(shape) != 2:
            raise ValueError(
                "Linear is expecting 2D arguments: %s" % str(shapes))
        if not shape[1]:
            raise ValueError(
                "Linear expects shape[1] of arguments: %s" % str(shapes))
        else:
            total_arg_size += shape[1]

    # Now the computation.
    with tf.variable_scope(scope or "Linear"):
        matrix = tf.get_variable("Matrix", [total_arg_size, output_size])
        if len(args) == 1:
            res = tf.matmul(args[0], matrix)
        else:
            res = tf.matmul(tf.concat(1, args), matrix)
        if not bias:
            return res
        bias_term = tf.get_variable(
            "Bias", [output_size],
            initializer=tf.constant_initializer(bias_start))
        return res + bias_term
--------------------------------------------------------------------------------