├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── lamb ├── VERSION ├── __init__.py ├── averaged.py ├── cell.py ├── corpus.py ├── dropout.py ├── dyneval.py ├── evaluation.py ├── experiment │ ├── awd │ │ └── train_awd_lstm.sh │ ├── continue.sh │ ├── mixture-of-softmaxes │ │ ├── train_awd_lstm_mos.sh │ │ └── tune_ptb_24m.sh │ ├── mogrifier │ │ ├── README.md │ │ ├── config │ │ │ ├── 0393c7dc3532+_tune_ptb_char_24m_lstm_fm_d2_asgd_ts150 │ │ │ │ └── trial_596 │ │ │ │ │ └── config │ │ │ ├── 4d0a9a5bdb04+_tune_enwik8_48m_lstm_d4_arms │ │ │ │ └── trial_400 │ │ │ │ │ └── config │ │ │ ├── 4d0a9a5bdb04+_tune_enwik8_48m_lstm_fm_d4_arms │ │ │ │ └── trial_234 │ │ │ │ │ └── config │ │ │ ├── 558aa30c0b15+_tune_mwc_fi_24m_lstm_d2_arms │ │ │ │ └── trial_758 │ │ │ │ │ └── config │ │ │ ├── 558aa30c0b15+_tune_mwc_fi_24m_lstm_fm_d2_arms │ │ │ │ └── trial_371 │ │ │ │ │ └── config │ │ │ ├── 786252db3825+_tune_ptb_24m_lstm_d2_arms │ │ │ │ └── trial_833 │ │ │ │ │ └── config │ │ │ ├── 786252db3825+_tune_ptb_24m_lstm_fm_d2_arms │ │ │ │ └── trial_483 │ │ │ │ │ └── config │ │ │ ├── 9e20581d3dad+_tune_mwc_en_24m_lstm_d2_arms │ │ │ │ └── trial_502 │ │ │ │ │ └── config │ │ │ ├── 9e20581d3dad+_tune_mwc_en_24m_lstm_fm_d2_arms │ │ │ │ └── trial_422 │ │ │ │ │ └── config │ │ │ ├── c51c838b33a5+_tune_wikitext-2_35m_lstm_mos2_d2_arms │ │ │ │ └── trial_763 │ │ │ │ │ └── config │ │ │ ├── c51c838b33a5+_tune_wikitext-2_35m_lstm_mos2_fm_d2_arms │ │ │ │ └── trial_747 │ │ │ │ │ └── config │ │ │ ├── e81db31261c0+_tune_enwik8_96m_lstm_d4_arms │ │ │ │ └── trial_295 │ │ │ │ │ └── config │ │ │ └── e81db31261c0+_tune_enwik8_96m_lstm_fm_d4_arms_MtngW │ │ │ │ └── trial_216 │ │ │ │ └── config │ │ ├── train_enwik8.sh │ │ ├── train_mwc.sh │ │ ├── train_ptb.sh │ │ ├── train_ptb_char.sh │ │ ├── train_wikitext-2.sh │ │ ├── tune_copy.sh │ │ ├── tune_dyneval.sh │ │ ├── tune_enwik8.sh │ │ ├── tune_mwc.sh │ │ ├── tune_ptb.sh │ │ ├── tune_ptb_char.sh │ │ ├── tune_ptb_fast.sh │ │ └── tune_wikitext-2.sh │ ├── on-the-state │ │ ├── README.md │ │ ├── enwik8_27m_lstm_d4 │ │ │ └── hps_proto │ │ ├── enwik8_46m_lstm_d4 │ │ │ └── hps_proto │ │ ├── ptb_10m_lstm_d1 │ │ │ └── hps_proto │ │ ├── ptb_24m_lstm_d4 │ │ │ └── hps_proto │ │ ├── train_enwik8.sh │ │ ├── train_ptb.sh │ │ ├── train_wikitext-2.sh │ │ └── wikitext-2_24m_lstm_d2 │ │ │ └── hps_proto │ ├── pushing-the-bounds │ │ ├── README.md │ │ └── test.sh │ ├── rerun.sh │ ├── rerun_old.sh │ ├── test.sh │ ├── train_ptb_10m_lstm_d1.sh │ ├── train_ptb_24m_lstm_d4.sh │ └── tune_ptb_10m.sh ├── lamb_flags.py ├── lib │ ├── config │ │ ├── README.md │ │ ├── common.sh │ │ ├── copy.sh │ │ ├── enwik8.sh │ │ ├── enwik8_char.sh │ │ ├── enwik8_char_rmsprop.sh │ │ ├── mwc.sh │ │ ├── ptb.sh │ │ ├── ptb_char.sh │ │ ├── ptb_word.sh │ │ ├── ptb_word_rmsprop.sh │ │ ├── ptb_word_slow.sh │ │ ├── running.sh │ │ ├── tuning.sh │ │ ├── wikitext-103.sh │ │ ├── wikitext-103_word.sh │ │ ├── wikitext-103_word_rmsprop.sh │ │ ├── wikitext-2.sh │ │ ├── wikitext-2_word.sh │ │ └── wikitext-2_word_rmsprop.sh │ ├── describe_version.sh │ ├── run.sh │ ├── run_helper.sh │ └── setup.sh ├── lm.py ├── main.py ├── monitoring.py ├── nascell.py ├── res_multi_rnn_cell.py ├── skip_multi_rnn_cell.py ├── test │ ├── data │ │ ├── add.txt │ │ ├── corpus.txt │ │ └── save_v1 │ │ │ ├── args │ │ │ └── config │ ├── dummy_test.py │ ├── finish.sh │ ├── start.sh │ ├── test_episodic_char_lstm_d2.sh │ ├── test_load_optimizer_state.sh │ ├── test_save_v1.sh │ ├── test_simple_lstm.sh │ └── test_sparse_rhn.sh ├── tiled_linear.py ├── tiled_lstm.py ├── tiled_rhn.py ├── training.py 
├── utils.py └── vocab.py └── setup.py /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to <https://cla.developers.google.com/> to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 24 | 25 | ## Community Guidelines 26 | 27 | This project follows [Google's Open Source Community 28 | Guidelines](https://opensource.google.com/conduct/). 29 | -------------------------------------------------------------------------------- /lamb/VERSION: -------------------------------------------------------------------------------- 1 | 1.0 2 | -------------------------------------------------------------------------------- /lamb/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | -------------------------------------------------------------------------------- /lamb/averaged.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | # ============================================================================ 15 | 16 | """Averaging of model weights.""" 17 | 18 | # pylint: disable=missing-docstring 19 | # pylint: disable=g-complex-comprehension 20 | 21 | from __future__ import absolute_import 22 | from __future__ import division 23 | from __future__ import print_function 24 | 25 | import tensorflow.compat.v1 as tf 26 | 27 | 28 | class Averaged(object): 29 | 30 | def __init__(self, tensors): 31 | tensors = list(tensors) 32 | with tf.variable_scope('averaged'): 33 | self._num_samples = tf.Variable(0, name='num_samples', trainable=False) 34 | with tf.variable_scope('avg'): 35 | self._averages = [ 36 | tf.get_variable( 37 | tensor.name.replace('/', '-').replace(':', '-'), 38 | tensor.get_shape(), initializer=tf.zeros_initializer(), 39 | trainable=False) 40 | for tensor in tensors] 41 | with tf.variable_scope('save'): 42 | self._saves = [ 43 | tf.get_variable( 44 | tensor.name.replace('/', '-').replace(':', '-'), 45 | tensor.get_shape(), initializer=tf.zeros_initializer(), 46 | trainable=False) 47 | for tensor in tensors] 48 | self._tensors = tensors 49 | self._take_sample = self._make_take_sample() 50 | self._switch = self._make_switch_to_average() 51 | self._restore = self._make_restore() 52 | self._reset = self._make_reset() 53 | 54 | def take_sample(self): 55 | tf.get_default_session().run(self._take_sample) 56 | 57 | def switch_to_average(self): 58 | tf.get_default_session().run(self._switch) 59 | 60 | def restore(self): 61 | tf.get_default_session().run(self._restore) 62 | 63 | def reset(self): 64 | tf.get_default_session().run(self._reset) 65 | 66 | def __enter__(self): 67 | self.switch_to_average() 68 | 69 | def __exit__(self, type_, value, traceback): 70 | self.restore() 71 | 72 | def _make_take_sample(self): 73 | assignments = [] 74 | n = tf.cast(self._num_samples, tf.float32) 75 | mu = 1.0 / (1.0 + n)  # incremental mean: avg_{n+1} = avg_n + (x - avg_n)/(n+1) 76 | for tensor, average in zip(self._tensors, self._averages): 77 | assignments.append(tf.assign_add(average, (tensor-average)*mu)) 78 | add_to_averages = tf.group(assignments) 79 | with tf.control_dependencies([add_to_averages]): 80 | incr_num_samples = tf.assign(self._num_samples, self._num_samples + 1) 81 | return incr_num_samples 82 | 83 | def _make_switch_to_average(self): 84 | assignments = [] 85 | for save, tensor, average in zip( 86 | self._saves, self._tensors, self._averages): 87 | with tf.control_dependencies([save.assign(tensor)]): 88 | assignments.append(tensor.assign(average)) 89 | return tf.group(assignments) 90 | 91 | def _make_restore(self): 92 | assignments = [] 93 | for save, tensor in zip(self._saves, self._tensors): 94 | assignments.append(tf.assign(tensor, save)) 95 | return tf.group(assignments) 96 | 97 | def _make_reset(self): 98 | return tf.assign(self._num_samples, 0) 99 | 100 | 101 | # TODO(melisgl): I think this works with ResourceVariables but not with normal 102 | # Variables. Deferred until TF2.0. 103 | def _swap(x, y): 104 | x_value = x.read_value() 105 | y_value = y.read_value() 106 | with tf.control_dependencies([x_value, y_value]): 107 | swap = tf.group(y.assign(x_value), x.assign(y_value)) 108 | return swap 109 | -------------------------------------------------------------------------------- /lamb/dropout.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | """Variational Dropout.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | from sonnet.python.modules import base as snt_base 23 | import tensorflow.compat.v1 as tf 24 | import tensorflow_probability as tfp 25 | from tensorflow.contrib import util as contrib_util 26 | 27 | 28 | class Dropout(snt_base.AbstractModule): 29 | """Possibly variational dropout.""" 30 | 31 | def __init__(self, keep_prob, share_mask=True, scaler=1.0, name='dropout'): 32 | super(Dropout, self).__init__(name=name) 33 | self._keep_prob = keep_prob 34 | self._keep_mask = None 35 | self._share_mask = share_mask 36 | self._scaler = scaler 37 | 38 | def _ensure_keep_mask(self, x): 39 | if self._keep_mask is None or not self._share_mask: 40 | shape = tf.shape(x) 41 | noise = tf.random_uniform(shape, dtype=x.dtype) 42 | self._keep_mask = (tf.floor(self._keep_prob + noise) 43 | * (self._scaler / self._keep_prob)) 44 | self._keep_mask.set_shape(x.get_shape()) 45 | return self._keep_mask 46 | 47 | def _build(self, x): 48 | if contrib_util.constant_value(self._keep_prob) == 1: 49 | return x 50 | else: 51 | return x * self._ensure_keep_mask(x) 52 | 53 | 54 | class GaussianDropout(snt_base.AbstractModule): 55 | """Possibly variational dropout.""" 56 | 57 | def __init__(self, keep_prob, share_mask=True, scaler=1.0, name='dropout'): 58 | super(GaussianDropout, self).__init__(name=name) 59 | self._keep_prob = keep_prob 60 | self._keep_mask = None 61 | self._share_mask = share_mask 62 | self._scaler = scaler 63 | 64 | def _ensure_keep_mask(self, x): 65 | if self._keep_mask is None or not self._share_mask: 66 | shape = tf.shape(x) 67 | # Calculate the stddev for the normal distribution that 68 | # matches the stddev of the Bernoulli with p=keep_prob. 69 | stddev = tf.sqrt((1 - self._keep_prob) / self._keep_prob) 70 | self._keep_mask = tf.random_normal(shape, mean=1.0, stddev=stddev, 71 | dtype=x.dtype) 72 | self._keep_mask.set_shape(x.get_shape()) 73 | return self._keep_mask 74 | 75 | def _build(self, x): 76 | if contrib_util.constant_value(self._keep_prob) == 1: 77 | return x 78 | else: 79 | return x * self._ensure_keep_mask(x) 80 | 81 | 82 | class DirichletDropout(snt_base.AbstractModule): 83 | """Possibly variational dropout.""" 84 | 85 | def __init__(self, keep_prob, share_mask=True, scaler=1.0, name='dropout'): 86 | super(DirichletDropout, self).__init__(name=name) 87 | self._keep_prob = keep_prob 88 | self._keep_mask = None 89 | self._share_mask = share_mask 90 | self._scaler = scaler 91 | 92 | def _ensure_keep_mask(self, x): 93 | if self._keep_mask is None or not self._share_mask: 94 | shape = tf.shape(x) 95 | k = shape[1] 96 | # To make this class a drop-in replacement for Bernoulli dropout we 97 | # parameterize it with keep_prob.
Set alpha of the Dirichlet so that the 98 | # variance is equal to the variance of the Bernoulli with p=keep_prob 99 | # divided by keep_prob. 100 | # Now the variance of the Dirichlet with k equal alphas is 101 | # (k-1)/(k^2*(k*alpha+1)). Solve that for alpha. 102 | kf = tf.cast(k, tf.float32) 103 | alpha = self._keep_prob * (kf - 1.0) / ((1-self._keep_prob)*kf) - 1.0/kf 104 | dist = tfp.distributions.Dirichlet(tf.ones(shape=k) * alpha) 105 | assert (dist.reparameterization_type == 106 | tfp.distributions.FULLY_REPARAMETERIZED) 107 | # E[Dir(alpha)] is 1/k per element, but the mask should have expectation 1 108 | # (as the Bernoulli mask divided by keep_prob does), hence multiplying by k. 109 | self._keep_mask = kf * dist.sample(shape[0]) 110 | self._keep_mask.set_shape(x.get_shape()) 111 | return self._keep_mask 112 | 113 | def _build(self, x): 114 | if contrib_util.constant_value(self._keep_prob) == 1: 115 | return x 116 | else: 117 | return tf.cond(tf.equal(self._keep_prob, 1.0), 118 | lambda: x, 119 | lambda: x * self._ensure_keep_mask(x)) 120 | 121 | 122 | class DriftingDropout(snt_base.AbstractModule): 123 | """Dropout with gradually changing mask.""" 124 | 125 | def __init__(self, keep_prob, flip_prob=0.0, scaler=1.0, name='dropout'): 126 | super(DriftingDropout, self).__init__(name=name) 127 | self._keep_prob = keep_prob 128 | self._flip_prob = flip_prob 129 | self._scaler = scaler 130 | self._time_step = 0 131 | 132 | def _build(self, x, state): 133 | prev_keep_mask = state 134 | shape = tf.shape(x) 135 | noise = tf.random_uniform(shape, dtype=x.dtype) 136 | other_mask = tf.floor(self._keep_prob + noise) 137 | choice_noise = tf.random_uniform(shape, dtype=x.dtype) 138 | choice = tf.less(choice_noise, self._flip_prob) 139 | # KLUDGE(melisgl): The client has to pass the last keep_mask from 140 | # a batch to the next so the mask may end up next to some 141 | # recurrent cell state. This state is often zero at the beginning 142 | # and may be periodically zeroed (per example) during training. 143 | # While zeroing LSTM state is okay, zeroing the dropout mask is 144 | # not. So instead of forcing every client to deal with this common 145 | # (?) case, if an all zero mask is detected, then regenerate a 146 | # fresh mask. This is of course a major hack and won't help with 147 | # learnt initial states, for example. 148 | sum_ = tf.reduce_sum(prev_keep_mask, 1, keepdims=True) 149 | is_initializing = tf.equal(sum_, 0.0) 150 | 151 | self._keep_mask = tf.where(tf.logical_or(choice, is_initializing), 152 | other_mask, 153 | prev_keep_mask) 154 | self._time_step += 1 155 | return x * self._keep_mask / self._keep_prob * self._scaler 156 | -------------------------------------------------------------------------------- /lamb/dyneval.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | # ============================================================================ 15 | 16 | """Dynamic evaluation.""" 17 | 18 | # pylint: disable=missing-docstring 19 | # pylint: disable=g-complex-comprehension 20 | 21 | from __future__ import absolute_import 22 | from __future__ import division 23 | from __future__ import print_function 24 | 25 | import tensorflow.compat.v1 as tf 26 | 27 | 28 | class Dyneval(object): 29 | 30 | def __init__(self, grads_and_vars, learning_rate, decay_rate, epsilon): 31 | with tf.variable_scope('dyneval'): 32 | # convert_to_tensor densifies IndexedSlices 33 | self._grads = [tf.convert_to_tensor(grad) for grad, _ in grads_and_vars] 34 | self._vars = [var for _, var in grads_and_vars] 35 | self._learning_rate = learning_rate 36 | self._decay_rate = decay_rate 37 | def shadow_vars(): 38 | return [ 39 | tf.get_variable( 40 | var.name.replace('/', '-').replace(':', '-'), 41 | var.get_shape(), initializer=tf.zeros_initializer(), 42 | trainable=False) 43 | for var in self._vars] 44 | with tf.variable_scope('save'): 45 | self._saves = shadow_vars() 46 | with tf.variable_scope('sum_squared_grads'): 47 | self._sum_squared_grads = shadow_vars() 48 | self._save = self._make_save() 49 | self._restore = self._make_restore() 50 | 51 | # These are for computing an RMSProp-like estimate of the variance of 52 | # minibatch gradients. Here, this quantity is estimated on the training 53 | # set once, while gradient descent happens on validation/test. 54 | self._num_squared_grads = tf.get_variable( 55 | 'num_squared_grads', [], initializer=tf.zeros_initializer(), 56 | trainable=False) 57 | self._zero_sum_squared_grads = self._make_zero_sum_squared_grads() 58 | self._add_squared_grads = self._make_add_squared_grads() 59 | self._epsilon = epsilon 60 | 61 | self._update = self._make_update() 62 | 63 | def _make_save(self): 64 | assignments = [] 65 | for save, var in zip(self._saves, self._vars): 66 | assignments.append(save.assign(var)) 67 | return tf.group(assignments) 68 | 69 | def _make_restore(self): 70 | assignments = [] 71 | for save, var in zip(self._saves, self._vars): 72 | assignments.append(var.assign(save)) 73 | return tf.group(assignments) 74 | 75 | def _make_update(self): 76 | mss = [] 77 | gsum = 0.0 78 | count = 0 79 | for sum_squared_grads in self._sum_squared_grads: 80 | ms = tf.sqrt(sum_squared_grads / self._num_squared_grads) 81 | gsum += tf.reduce_sum(ms) 82 | count += tf.reduce_sum(tf.ones_like(ms)) 83 | mss.append(ms) 84 | gsum = gsum / count 85 | 86 | assignments = [] 87 | for grad, var, save, sum_squared_grads, ms in zip( 88 | self._grads, self._vars, self._saves, self._sum_squared_grads, mss): 89 | decay_rate = tf.minimum(1.0, self._decay_rate*(ms/gsum)) 90 | delta = (-self._learning_rate*grad / (ms + self._epsilon) + 91 | decay_rate*(save-var)) 92 | assignments.append(var.assign_add(delta)) 93 | return tf.group(assignments) 94 | 95 | def _make_add_squared_grads(self): 96 | assignments = [] 97 | for sum_squared_grads, grads in zip(self._sum_squared_grads, self._grads): 98 | assignments.append(sum_squared_grads.assign_add(tf.square(grads))) 99 | return tf.group(assignments + [self._num_squared_grads.assign_add(1)]) 100 | 101 | def _make_zero_sum_squared_grads(self): 102 | assignments = [] 103 | for sum_squared_grads in self._sum_squared_grads: 104 | assignments.append(sum_squared_grads.assign( 105 | tf.zeros_like(sum_squared_grads))) 106 | return tf.group(assignments + [self._num_squared_grads.assign(0)]) 107 | 108 | def save(self): 109 |
tf.get_default_session().run(self._save) 110 | 111 | def restore(self): 112 | tf.get_default_session().run(self._restore) 113 | 114 | def update_op(self): 115 | return self._update 116 | 117 | def zero_sum_squared_grads(self): 118 | tf.get_default_session().run(self._zero_sum_squared_grads) 119 | 120 | def add_squared_grads_op(self): 121 | return self._add_squared_grads 122 | 123 | def __enter__(self): 124 | self.save() 125 | 126 | def __exit__(self, type_, value, traceback): 127 | self.restore() 128 | -------------------------------------------------------------------------------- /lamb/experiment/awd/train_awd_lstm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | # This script reproduces the PTB results from "Regularizing and Optimizing LSTM 19 | # Language Models" (Merity, 2017) without fine-tuning or dynamic evaluation. 20 | # 21 | # Based on https://github.com/salesforce/awd-lstm-lm. 22 | # 23 | # Reaches ~4.084 validation cross-entropy (59.38 ppl) without fine-tuning. 24 | 25 | set -e 26 | 27 | source "$(dirname $0)/../../lib/setup.sh" 28 | source_lib "config/common.sh" 29 | source_lib "config/running.sh" 30 | source_lib "config/ptb_word.sh" 31 | 32 | # Model 33 | 34 | share_input_and_output_embeddings=true 35 | input_embedding_size=400 36 | output_embedding_size=400 37 | cap_input_gate=false 38 | input_dropout=0.4 39 | embedding_dropout=0.1 40 | output_dropout=0.4 41 | shared_mask_dropout=true 42 | 43 | # Cell 44 | 45 | model="lstm" 46 | num_layers=3 47 | lstm_skip_connection=false 48 | hidden_size=1150,1150,400 49 | inter_layer_dropout=0.25 50 | state_dropout=0.5 51 | tie_forget_and_input_gates=false 52 | 53 | # Objective 54 | 55 | activation_norm_penalty=2.0 56 | l2_penalty=8.4e-5 # 1.2e-6*70 57 | drop_state_probability=0.01 58 | 59 | # Initialization 60 | 61 | forget_bias=0.0 62 | 63 | # Schedule 64 | 65 | steps_per_turn=100 66 | print_training_stats_every_num_steps=100 67 | turns=3168 # ~500 epochs (with batch_size=20 and max_time_steps=70). 68 | 69 | # Optimizer 70 | 71 | # In the loss, the pytorch code (https://github.com/zihangdai/mos) averages all 72 | # log probabilities in the [batch_size, max_time_steps] matrix, while lamb sums 73 | # the log probabilities over time steps and averages only over the examples in 74 | # the batch. To compensate for that, max_grad_norm, learning_rate and l2_penalty 75 | # had to be adjusted. 
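# Concretely, using the pytorch defaults recorded in the inline comments of
# this script (0.25, 30.0 and 1.2e-6), the rescaling by the 70-step BPTT
# window works out to:
#
#   max_grad_norm = 0.25   * 70 = 17.5
#   l2_penalty    = 1.2e-6 * 70 = 8.4e-5
#   learning_rate = 30.0   / 70 ≈ 0.42857143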
76 | max_time_steps=70 77 | max_grad_norm=17.5 # 0.25*70 78 | optimizer_type="sgd" 79 | batch_size=20 80 | learning_rate=0.42857143 # 30.0/70 81 | 82 | # Evaluation hyperparameters 83 | 84 | trigger_averaging_turns=50 85 | trigger_averaging_at_the_latest=2000 86 | max_training_eval_batches=20 87 | 88 | # Misc 89 | 90 | swap_memory=true 91 | 92 | source_lib "run.sh" "$@" 93 | -------------------------------------------------------------------------------- /lamb/experiment/continue.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | 24 | name="$2" 25 | config_file="$3/config" 26 | load_checkpoint="$3/last" 27 | source_lib "run.sh" "$1" 28 | -------------------------------------------------------------------------------- /lamb/experiment/mixture-of-softmaxes/train_awd_lstm_mos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | # This script reproduces the PTB results from "Breaking the Softmax Bottleneck: 19 | # A High-Rank RNN Language Model" (Zhilin Yang, Zihang Dai, Ruslan 20 | # Salakhutdinov, William W. Cohen) without fine-tuning or dynamic evaluation. 21 | # 22 | # Based on https://github.com/zihangdai/mos.
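# For reference (a sketch of the idea from the paper, not lamb's exact
# implementation): MoS replaces the single output softmax with a mixture of
# K softmaxes over per-component projections h_k of the context h,
#
#   p(w | h) = sum_k pi_k(h) * softmax(W h_k)[w],
#
# with mixture weights pi(h) given by another softmax. This makes the matrix
# of log-probabilities high-rank. K is mos_num_components=15 below.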
23 | 24 | set -e 25 | 26 | source "$(dirname $0)/../../lib/setup.sh" 27 | source_lib "config/common.sh" 28 | source_lib "config/running.sh" 29 | source_lib "config/ptb_word.sh" 30 | 31 | # Model 32 | 33 | share_input_and_output_embeddings=true 34 | input_embedding_size=280 35 | output_embedding_size=280 36 | cap_input_gate=false 37 | input_dropout=0.4 38 | embedding_dropout=0.1 39 | output_dropout=0.4 40 | downprojected_output_dropout=0.29 41 | shared_mask_dropout=true 42 | mos_num_components=15 43 | 44 | # Cell 45 | 46 | model="lstm" 47 | num_layers=3 48 | lstm_skip_connection=false 49 | hidden_size=960,960,620 50 | inter_layer_dropout=0.225 51 | state_dropout=0.5 52 | tie_forget_and_input_gates=false 53 | 54 | # Objective 55 | 56 | l2_penalty=8.4e-5 # 1.2e-6*70 57 | drop_state_probability=0.01 58 | 59 | # Initialization 60 | 61 | forget_bias=0.0 62 | 63 | # Schedule 64 | 65 | steps_per_turn=100 66 | print_training_stats_every_num_steps=100 67 | turns=8000 68 | 69 | # Optimizer 70 | 71 | # In the loss, the pytorch code (https://github.com/zihangdai/mos) averages all 72 | # log probabilities in the [batch_size, max_time_steps] matrix, while lamb sums 73 | # the log probabilities over time steps and averages only over the examples in 74 | # the batch. To compensate for that, max_grad_norm, learning_rate and l2_penalty 75 | # had to be adjusted. 76 | max_time_steps=70 77 | max_grad_norm=17.5 # 0.25*70 78 | optimizer_type="sgd" 79 | batch_size=12 80 | learning_rate=0.285 # ~20.0/70 81 | 82 | # Evaluation hyperparameters 83 | 84 | trigger_averaging_turns=50 85 | trigger_averaging_at_the_latest=2000 86 | max_training_eval_batches=20 87 | 88 | # Misc 89 | 90 | swap_memory=true 91 | 92 | source_lib "run.sh" "$@" 93 | -------------------------------------------------------------------------------- /lamb/experiment/mixture-of-softmaxes/tune_ptb_24m.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is for 19 | # illustration only. 20 | 21 | set -e 22 | 23 | # Include definitions of dataset- and tuning-related variables.
24 | source "$(dirname $0)/../../lib/setup.sh" "$@" 25 | source_lib "config/common.sh" 26 | source_lib "config/tuning.sh" 27 | source_lib "config/ptb_word.sh" 28 | 29 | # Model 30 | 31 | num_params=$(million 24) 32 | share_input_and_output_embeddings=true 33 | cap_input_gate=false 34 | shared_mask_dropout=true 35 | 36 | # Cell 37 | 38 | model="lstm" 39 | num_layers=3 40 | lstm_skip_connection=false 41 | tie_forget_and_input_gates=false 42 | 43 | # Objective 44 | 45 | drop_state_probability=0.01 46 | 47 | # Initialization 48 | 49 | forget_bias=0.0 50 | 51 | # Schedule 52 | 53 | steps_per_turn=100 54 | print_training_stats_every_num_steps=100 55 | turns=600 56 | 57 | # Optimizer 58 | 59 | # In the loss, the pytorch code (https://github.com/zihangdai/mos) averages all 60 | # log probabilities in the [batch_size, max_time_steps] matrix, while lamb sums 61 | # the log probabilities over time steps and averages only over the examples in 62 | # the batch. To compensate for that, max_grad_norm, learning_rate and l2_penalty 63 | # had to be adjusted. 64 | max_time_steps=70 65 | max_grad_norm=10.0 66 | trigger_averaging_turns=25 67 | trigger_averaging_at_the_latest=400 68 | 69 | # Early stopping 70 | 71 | early_stopping_turns=30 72 | early_stopping_worst_xe_target=4.4 73 | 74 | # Evaluation 75 | 76 | max_training_eval_batches=20 77 | eval_softmax_temperature=-0.8 78 | 79 | # Misc 80 | 81 | swap_memory=true 82 | 83 | # Tuning parameters 84 | 85 | num_workers=60 86 | 87 | # SGD 88 | optimizer_type="sgd" 89 | mos_num_components=0 90 | tuneables="batch_size,learning_rate,l2_penalty, 91 | token_dropout,input_dropout,inter_layer_dropout,state_dropout, 92 | output_dropout,downprojected_output_dropout,input_embedding_ratio" 93 | name="$(default_name)_${model}_d${num_layers}_asgd" 94 | source_lib "run.sh" "$@" 95 | 96 | # RMSPROP 97 | optimizer_type="rmsprop" 98 | mos_num_components=0 99 | tuneables="batch_size,learning_rate,l2_penalty, 100 | token_dropout,input_dropout,inter_layer_dropout,state_dropout, 101 | output_dropout,downprojected_output_dropout,input_embedding_ratio" 102 | name="$(default_name)_${model}_d${num_layers}_arms" 103 | source_lib "run.sh" "$@" 104 | 105 | # SGD, MoS 106 | optimizer_type="sgd" 107 | mos_num_components=15 108 | tuneables="batch_size,learning_rate,l2_penalty, 109 | token_dropout,input_dropout,inter_layer_dropout,state_dropout, 110 | output_dropout,downprojected_output_dropout,input_embedding_ratio" 111 | name="$(default_name)_${model}_d${num_layers}_asgd_mos${mos_num_components}" 112 | source_lib "run.sh" "$@" 113 | 114 | # RMSPROP, MoS 115 | optimizer_type="rmsprop" 116 | mos_num_components=15 117 | tuneables="batch_size,learning_rate,l2_penalty, 118 | token_dropout,input_dropout,inter_layer_dropout,state_dropout, 119 | output_dropout,downprojected_output_dropout,input_embedding_ratio" 120 | name="$(default_name)_${model}_d${num_layers}_arms_mos${mos_num_components}" 121 | source_lib "run.sh" "$@" 122 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/README.md: -------------------------------------------------------------------------------- 1 | This directory contains saved configuration files for tuned models from the 2 | [Mogrifier LSTM](https://arxiv.org/abs/1909.01792) paper. Model weights are not 3 | included. 4 | 5 | Don't forget to [set up the data](../../README.md). 
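The `_fm_` configurations enable mogrification via `feature_mask_rounds` and
`feature_mask_rank`, while `_d2`/`_d4` give the LSTM depth. As a rough
illustration (a minimal NumPy sketch of the update from the paper, not the
code in this repository; `q_mats`/`r_mats` stand for the learned, optionally
low-rank, projections):

    import numpy as np

    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    def mogrify(x, h, q_mats, r_mats, rounds):
        # Odd rounds gate the input x with the previous state h; even rounds
        # gate h with the freshly updated x. rounds=0 recovers a plain LSTM.
        for i in range(1, rounds + 1):
            if i % 2 == 1:
                x = 2.0 * sigmoid(q_mats[(i - 1) // 2] @ h) * x
            else:
                h = 2.0 * sigmoid(r_mats[i // 2 - 1] @ x) * h
        return x, h  # then fed to the ordinary LSTM update

With a positive `feature_mask_rank`, each projection is factored into two
low-rank matrices to save parameters.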
6 | 7 | For example, to train a Mogrifier LSTM with 24M parameters on PTB with tuned 8 | hyperparameters (see the paper above): 9 | 10 | ./train_ptb.sh run train-dir-name config/786252db3825+_tune_ptb_24m_lstm_fm_d2_arms/trial_483/config 11 | 12 | There are separate training scripts for other datasets. The `config` directory 13 | holds the best hyperparameters for various model and dataset combinations. The 14 | training will save the model in `./train-dir-name_`. To test the 15 | saved model: 16 | 17 | ../test.sh run test-dir-name ./train-dir-name_/ 18 | 19 | If training runs out of GPU memory, you may want to decrease `max_time_steps` 20 | (the BPTT window size), but don't expect to reproduce the results that way. 21 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/0393c7dc3532+_tune_ptb_char_24m_lstm_fm_d2_asgd_ts150/trial_596/config: -------------------------------------------------------------------------------- 1 | [ ('config_version', 5), 2 | # data 3 | ('file_encoding', 'utf-8'), 4 | ('word_based', True), 5 | ('episodic', False), 6 | # model 7 | ('num_params', 24000000), 8 | ('share_input_and_output_embeddings', False), 9 | ('input_embedding_size', -1), 10 | ('output_embedding_size', -1), 11 | ('input_embedding_ratio', 1.0), 12 | ('output_embedding_ratio', -1.0), 13 | ('mos_num_components', 0), 14 | ('token_dropout', 0.0), 15 | ('embedding_dropout', 0.0), 16 | ('input_dropout', 0.27860252841733274), 17 | ('output_dropout', 0.2347428361918374), 18 | ('downprojected_output_dropout', -1.0), 19 | ('shared_mask_dropout', False), 20 | ('embed_once', True), 21 | # cell 22 | ('model', 'lstm'), 23 | ('num_layers', 2), 24 | ('residual_connections', False), 25 | ('lstm_skip_connection', True), 26 | ('feature_mask_rounds', 4), 27 | ('feature_mask_rank', 24), 28 | ('feature_mask', False), 29 | ('sparsity_ratio', -1.0), 30 | ('overlay_rank', -1), 31 | ('hidden_size', [-1]), 32 | ('hidden_size_multiplier', 1.0), 33 | ('layer_norm', False), 34 | ('activation_fn', 'tf.tanh'), 35 | ('tie_forget_and_input_gates', False), 36 | ('cap_input_gate', True), 37 | ('trainable_initial_state', False), 38 | ('inter_layer_dropout', 0.0737609984911853), 39 | ('state_dropout', 0.17118611234551975), 40 | ('state_dropout_flip_rate', 0.0), 41 | ('update_dropout', 0.0), 42 | ('cell_clip', -1.0), 43 | # objective 44 | ('model_average', 'arithmetic'), 45 | ('num_training_samples', 1), 46 | ('l2_penalty', 0.00025558089199237096), 47 | ('l1_penalty', 0.0), 48 | ('activation_norm_penalty', 0.0), 49 | ('drop_state_probability', 0.01), 50 | # initialization 51 | ('embedding_init_factor', 1.0), 52 | ('scale_input_embeddings', False), 53 | ('cell_init_factor', 1.0), 54 | ('forget_bias', 1.0), 55 | ('output_init_factor', 1.0), 56 | # schedule 57 | ('steps_per_turn', 200), 58 | ('print_training_stats_every_num_steps', 200), 59 | ('turns', 500), 60 | # optimization 61 | ('optimizer_type', 'rmsprop'), 62 | ('rmsprop_beta2', 0.999), 63 | ('rmsprop_epsilon', 1e-08), 64 | ('adam_beta1', 0.9), 65 | ('adam_beta2', 0.999), 66 | ('adam_epsilon', 1e-08), 67 | ('batch_size', 64), 68 | ('accum_batch_size', -1), 69 | ('max_grad_norm', 10.0), 70 | ('max_time_steps', 150), 71 | ('trigger_averaging_turns', 25), 72 | ('trigger_averaging_at_the_latest', 400), 73 | # learning rate 74 | ('learning_rate', 0.003739598828019367), 75 | ('learning_rate_decay', 1.0), 76 | ('learning_rate_decay_burn_in_steps', 0), 77 | ('drop_learning_rate_turns', -1), 78 | 
('drop_learning_rate_multiplier', 1.0), 79 | ('drop_learning_rate_at_the_latest', -1), 80 | # early stopping 81 | ('early_stopping_turns', -1), 82 | ('early_stopping_rampup_turns', 0), 83 | ('early_stopping_worst_xe_target', ''), 84 | ('early_stopping_slowest_rate', 0.0), 85 | # cross-validation 86 | ('crossvalidate', False), 87 | ('crossvalidation_folds', 10), 88 | ('crossvalidation_rounds', 1), 89 | # evaluation 90 | ('max_training_eval_batches', 20), 91 | ('max_eval_eval_batches', -1), 92 | ('max_test_eval_batches', -1), 93 | ('min_non_episodic_eval_examples_per_stripe', 100), 94 | ('eval_on_test', False), 95 | ('eval_method', 'deterministic'), 96 | ('num_eval_samples', 0), 97 | ('eval_softmax_temperature', -0.8), 98 | ('eval_power_mean_power', 1.0), 99 | ('eval_dropout_multiplier', 1.0), 100 | # experiments 101 | # checkpoints 102 | ('save_checkpoints', True), 103 | # misc 104 | ('seed', 1), 105 | ('swap_memory', False), 106 | ('log_device_placement', False), 107 | ('summary_flush_secs', 120), 108 | ] 109 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/4d0a9a5bdb04+_tune_enwik8_48m_lstm_d4_arms/trial_400/config: -------------------------------------------------------------------------------- 1 | [ ('config_version', 5), 2 | # data 3 | ('file_encoding', 'CP437'), 4 | ('word_based', False), 5 | ('episodic', False), 6 | # model 7 | ('num_params', 48000000), 8 | ('share_input_and_output_embeddings', False), 9 | ('input_embedding_size', -1), 10 | ('output_embedding_size', -1), 11 | ('input_embedding_ratio', 1.0), 12 | ('output_embedding_ratio', -1.0), 13 | ('mos_num_components', 0), 14 | ('token_dropout', 0.0), 15 | ('embedding_dropout', 0.0), 16 | ('input_dropout', 0.16279440026790548), 17 | ('output_dropout', 0.13860156332143037), 18 | ('downprojected_output_dropout', -1.0), 19 | ('shared_mask_dropout', False), 20 | ('embed_once', True), 21 | # cell 22 | ('model', 'lstm'), 23 | ('num_layers', 4), 24 | ('residual_connections', False), 25 | ('lstm_skip_connection', True), 26 | ('feature_mask_rounds', 0), 27 | ('feature_mask_rank', 0), 28 | ('feature_mask', False), 29 | ('sparsity_ratio', -1.0), 30 | ('overlay_rank', -1), 31 | ('hidden_size', [-1]), 32 | ('hidden_size_multiplier', 1.0), 33 | ('layer_norm', False), 34 | ('activation_fn', 'tf.tanh'), 35 | ('tie_forget_and_input_gates', False), 36 | ('cap_input_gate', True), 37 | ('trainable_initial_state', False), 38 | ('inter_layer_dropout', 0.11949666753873665), 39 | ('state_dropout', 0.1036809388104279), 40 | ('state_dropout_flip_rate', 0.0), 41 | ('update_dropout', 0.0), 42 | ('cell_clip', -1.0), 43 | # objective 44 | ('model_average', 'arithmetic'), 45 | ('num_training_samples', 1), 46 | ('l2_penalty', 2.5181258956042348e-05), 47 | ('l1_penalty', 0.0), 48 | ('activation_norm_penalty', 0.0), 49 | ('drop_state_probability', 0.01), 50 | # initialization 51 | ('embedding_init_factor', 1.0), 52 | ('scale_input_embeddings', False), 53 | ('cell_init_factor', 1.0), 54 | ('forget_bias', 1.0), 55 | ('output_init_factor', 1.0), 56 | # schedule 57 | ('steps_per_turn', 400), 58 | ('print_training_stats_every_num_steps', 1000), 59 | ('turns', 100), 60 | # optimization 61 | ('optimizer_type', 'rmsprop'), 62 | ('rmsprop_beta2', 0.999), 63 | ('rmsprop_epsilon', 1e-08), 64 | ('adam_beta1', 0.9), 65 | ('adam_beta2', 0.999), 66 | ('adam_epsilon', 1e-08), 67 | ('batch_size', 128), 68 | ('accum_batch_size', -1), 69 | ('max_grad_norm', 10.0), 70 | ('max_time_steps', 500), 71 | 
('trigger_averaging_turns', 10), 72 | ('trigger_averaging_at_the_latest', 80), 73 | # learning rate 74 | ('learning_rate', 0.002516709293528533), 75 | ('learning_rate_decay', 1.0), 76 | ('learning_rate_decay_burn_in_steps', 0), 77 | ('drop_learning_rate_turns', -1), 78 | ('drop_learning_rate_multiplier', 1.0), 79 | ('drop_learning_rate_at_the_latest', -1), 80 | # early stopping 81 | ('early_stopping_turns', -1), 82 | ('early_stopping_rampup_turns', 0), 83 | ('early_stopping_worst_xe_target', ''), 84 | ('early_stopping_slowest_rate', 0.0), 85 | # cross-validation 86 | ('crossvalidate', False), 87 | ('crossvalidation_folds', 10), 88 | ('crossvalidation_rounds', 1), 89 | # evaluation 90 | ('max_training_eval_batches', 20), 91 | ('max_eval_eval_batches', -1), 92 | ('max_test_eval_batches', -1), 93 | ('min_non_episodic_eval_examples_per_stripe', 100), 94 | ('eval_on_test', False), 95 | ('eval_method', 'deterministic'), 96 | ('num_eval_samples', 0), 97 | ('eval_softmax_temperature', -0.8), 98 | ('eval_softmax_temperature_estimation_num_tokens', 50000), 99 | ('eval_power_mean_power', 1.0), 100 | ('eval_dropout_multiplier', 1.0), 101 | ('validation_prediction_file', ''), 102 | ('dyneval', False), 103 | ('dyneval_learning_rate', 0.001), 104 | ('dyneval_decay_rate', 0.02), 105 | ('dyneval_epsilon', 1e-05), 106 | # experiments 107 | # checkpoints 108 | ('save_checkpoints', True), 109 | # misc 110 | ('seed', 1), 111 | ('swap_memory', True), 112 | ('log_device_placement', False), 113 | ('summary_flush_secs', 120), 114 | ] 115 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/4d0a9a5bdb04+_tune_enwik8_48m_lstm_fm_d4_arms/trial_234/config: -------------------------------------------------------------------------------- 1 | [ ('config_version', 5), 2 | # data 3 | ('file_encoding', 'CP437'), 4 | ('word_based', False), 5 | ('episodic', False), 6 | # model 7 | ('num_params', 48000000), 8 | ('share_input_and_output_embeddings', False), 9 | ('input_embedding_size', -1), 10 | ('output_embedding_size', -1), 11 | ('input_embedding_ratio', 1.0), 12 | ('output_embedding_ratio', -1.0), 13 | ('mos_num_components', 0), 14 | ('token_dropout', 0.0), 15 | ('embedding_dropout', 0.0), 16 | ('input_dropout', 0.015496029112930465), 17 | ('output_dropout', 0.138307173174503), 18 | ('downprojected_output_dropout', -1.0), 19 | ('shared_mask_dropout', False), 20 | ('embed_once', True), 21 | # cell 22 | ('model', 'lstm'), 23 | ('num_layers', 4), 24 | ('residual_connections', False), 25 | ('lstm_skip_connection', True), 26 | ('feature_mask_rounds', 6), 27 | ('feature_mask_rank', 79), 28 | ('feature_mask', False), 29 | ('sparsity_ratio', -1.0), 30 | ('overlay_rank', -1), 31 | ('hidden_size', [-1]), 32 | ('hidden_size_multiplier', 1.0), 33 | ('layer_norm', False), 34 | ('activation_fn', 'tf.tanh'), 35 | ('tie_forget_and_input_gates', False), 36 | ('cap_input_gate', True), 37 | ('trainable_initial_state', False), 38 | ('inter_layer_dropout', 0.008633961431527571), 39 | ('state_dropout', 0.0437288219541186), 40 | ('state_dropout_flip_rate', 0.0), 41 | ('update_dropout', 0.0), 42 | ('cell_clip', -1.0), 43 | # objective 44 | ('model_average', 'arithmetic'), 45 | ('num_training_samples', 1), 46 | ('l2_penalty', 0.000993383826740019), 47 | ('l1_penalty', 0.0), 48 | ('activation_norm_penalty', 0.0), 49 | ('drop_state_probability', 0.01), 50 | # initialization 51 | ('embedding_init_factor', 1.0), 52 | ('scale_input_embeddings', False), 53 | ('cell_init_factor', 1.0), 54 | 
('forget_bias', 1.0), 55 | ('output_init_factor', 1.0), 56 | # schedule 57 | ('steps_per_turn', 400), 58 | ('print_training_stats_every_num_steps', 1000), 59 | ('turns', 100), 60 | # optimization 61 | ('optimizer_type', 'rmsprop'), 62 | ('rmsprop_beta2', 0.999), 63 | ('rmsprop_epsilon', 1e-08), 64 | ('adam_beta1', 0.9), 65 | ('adam_beta2', 0.999), 66 | ('adam_epsilon', 1e-08), 67 | ('batch_size', 128), 68 | ('accum_batch_size', -1), 69 | ('max_grad_norm', 10.0), 70 | ('max_time_steps', 500), 71 | ('trigger_averaging_turns', 10), 72 | ('trigger_averaging_at_the_latest', 80), 73 | # learning rate 74 | ('learning_rate', 0.001021423409385794), 75 | ('learning_rate_decay', 1.0), 76 | ('learning_rate_decay_burn_in_steps', 0), 77 | ('drop_learning_rate_turns', -1), 78 | ('drop_learning_rate_multiplier', 1.0), 79 | ('drop_learning_rate_at_the_latest', -1), 80 | # early stopping 81 | ('early_stopping_turns', -1), 82 | ('early_stopping_rampup_turns', 0), 83 | ('early_stopping_worst_xe_target', ''), 84 | ('early_stopping_slowest_rate', 0.0), 85 | # cross-validation 86 | ('crossvalidate', False), 87 | ('crossvalidation_folds', 10), 88 | ('crossvalidation_rounds', 1), 89 | # evaluation 90 | ('max_training_eval_batches', 20), 91 | ('max_eval_eval_batches', -1), 92 | ('max_test_eval_batches', -1), 93 | ('min_non_episodic_eval_examples_per_stripe', 100), 94 | ('eval_on_test', False), 95 | ('eval_method', 'deterministic'), 96 | ('num_eval_samples', 0), 97 | ('eval_softmax_temperature', -0.8), 98 | ('eval_softmax_temperature_estimation_num_tokens', 50000), 99 | ('eval_power_mean_power', 1.0), 100 | ('eval_dropout_multiplier', 1.0), 101 | ('validation_prediction_file', ''), 102 | ('dyneval', False), 103 | ('dyneval_learning_rate', 0.001), 104 | ('dyneval_decay_rate', 0.02), 105 | ('dyneval_epsilon', 1e-05), 106 | # experiments 107 | # checkpoints 108 | ('save_checkpoints', True), 109 | # misc 110 | ('seed', 1), 111 | ('swap_memory', True), 112 | ('log_device_placement', False), 113 | ('summary_flush_secs', 120), 114 | ] 115 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/558aa30c0b15+_tune_mwc_fi_24m_lstm_d2_arms/trial_758/config: -------------------------------------------------------------------------------- 1 | 2 | [ ('config_version', 5), 3 | # data 4 | ('conditioning_separator', ''), 5 | ('file_encoding', 'utf-8'), 6 | ('word_based', False), 7 | ('episodic', False), 8 | # model 9 | ('num_params', 24000000), 10 | ('share_input_and_output_embeddings', False), 11 | ('input_embedding_size', -1), 12 | ('output_embedding_size', -1), 13 | ('input_embedding_ratio', 1.5665444454253725), 14 | ('output_embedding_ratio', -1.0), 15 | ('mos_num_components', 0), 16 | ('token_dropout', 0.0), 17 | ('embedding_dropout', 0.0), 18 | ('input_dropout', 0.0004278254540998817), 19 | ('output_dropout', 0.21672999424789158), 20 | ('downprojected_output_dropout', -1.0), 21 | ('shared_mask_dropout', False), 22 | ('embed_once', True), 23 | ('output_once', True), 24 | # cell 25 | ('model', 'lstm'), 26 | ('num_layers', 2), 27 | ('residual_connections', False), 28 | ('lstm_skip_connection', True), 29 | ('feature_mask_rounds', 0), 30 | ('feature_mask_rank', 0), 31 | ('feature_mask', False), 32 | ('sparsity_ratio', -1.0), 33 | ('overlay_rank', -1), 34 | ('hidden_size', [-1]), 35 | ('hidden_size_multiplier', 1.0), 36 | ('layer_norm', False), 37 | ('activation_fn', 'tf.tanh'), 38 | ('tie_forget_and_input_gates', False), 39 | ('cap_input_gate', True), 40 | 
('trainable_initial_state', False), 41 | ('inter_layer_dropout', 0.03679207573249842), 42 | ('state_dropout', 0.15784488790163897), 43 | ('state_dropout_flip_rate', 0.0), 44 | ('update_dropout', 0.0), 45 | ('cell_clip', -1.0), 46 | # objective 47 | ('model_average', 'arithmetic'), 48 | ('num_training_samples', 1), 49 | ('l2_penalty', 3.35903544036833e-05), 50 | ('l1_penalty', 0.0), 51 | ('activation_norm_penalty', 0.0), 52 | ('drop_state_probability', 0.01), 53 | # initialization 54 | ('embedding_init_factor', 1.0), 55 | ('scale_input_embeddings', False), 56 | ('cell_init_factor', 1.0), 57 | ('forget_bias', 1.0), 58 | ('output_init_factor', 1.0), 59 | # schedule 60 | ('steps_per_turn', 200), 61 | ('print_training_stats_every_num_steps', 200), 62 | ('turns', 500), 63 | # optimization 64 | ('optimizer_type', 'rmsprop'), 65 | ('rmsprop_beta2', 0.999), 66 | ('rmsprop_epsilon', 1e-08), 67 | ('adam_beta1', 0.9), 68 | ('adam_beta2', 0.999), 69 | ('adam_epsilon', 1e-08), 70 | ('batch_size', 64), 71 | ('accum_batch_size', -1), 72 | ('max_grad_norm', 10.0), 73 | ('max_time_steps', 150), 74 | ('trigger_averaging_turns', 25), 75 | ('trigger_averaging_at_the_latest', 400), 76 | # learning rate 77 | ('learning_rate', 0.0038728221226125496), 78 | ('learning_rate_decay', 1.0), 79 | ('learning_rate_decay_burn_in_steps', 0), 80 | ('drop_learning_rate_turns', -1), 81 | ('drop_learning_rate_multiplier', 1.0), 82 | ('drop_learning_rate_at_the_latest', -1), 83 | # early stopping 84 | ('early_stopping_turns', -1), 85 | ('early_stopping_rampup_turns', 0), 86 | ('early_stopping_worst_xe_target', ''), 87 | ('early_stopping_slowest_rate', 0.0), 88 | # cross-validation 89 | ('crossvalidate', False), 90 | ('crossvalidation_folds', 10), 91 | ('crossvalidation_rounds', 1), 92 | # evaluation 93 | ('max_training_eval_batches', 20), 94 | ('max_eval_eval_batches', -1), 95 | ('max_test_eval_batches', -1), 96 | ('min_non_episodic_eval_examples_per_stripe', 100), 97 | ('eval_on_test', False), 98 | ('eval_method', 'deterministic'), 99 | ('num_eval_samples', 0), 100 | ('eval_softmax_temperature', -0.8), 101 | ('eval_softmax_temperature_estimation_num_tokens', 50000), 102 | ('eval_power_mean_power', 1.0), 103 | ('eval_dropout_multiplier', 1.0), 104 | ('validation_prediction_file', ''), 105 | ('dyneval', False), 106 | ('dyneval_learning_rate', 0.001), 107 | ('dyneval_decay_rate', 0.02), 108 | ('dyneval_epsilon', 1e-05), 109 | # experiments 110 | # checkpoints 111 | ('save_checkpoints', True), 112 | # misc 113 | ('seed', 1), 114 | ('swap_memory', True), 115 | ('log_device_placement', False), 116 | ('summary_flush_secs', 120), 117 | ] 118 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/558aa30c0b15+_tune_mwc_fi_24m_lstm_fm_d2_arms/trial_371/config: -------------------------------------------------------------------------------- 1 | [ ('config_version', 5), 2 | # data 3 | ('conditioning_separator', ''), 4 | ('file_encoding', 'utf-8'), 5 | ('word_based', False), 6 | ('episodic', False), 7 | # model 8 | ('num_params', 24000000), 9 | ('share_input_and_output_embeddings', False), 10 | ('input_embedding_size', -1), 11 | ('output_embedding_size', -1), 12 | ('input_embedding_ratio', 1.3016032871831578), 13 | ('output_embedding_ratio', -1.0), 14 | ('mos_num_components', 0), 15 | ('token_dropout', 0.0), 16 | ('embedding_dropout', 0.0), 17 | ('input_dropout', 0.06995541397794428), 18 | ('output_dropout', 0.22149685667402097), 19 | ('downprojected_output_dropout', -1.0), 20 | 
('shared_mask_dropout', False), 21 | ('embed_once', True), 22 | ('output_once', True), 23 | # cell 24 | ('model', 'lstm'), 25 | ('num_layers', 2), 26 | ('residual_connections', False), 27 | ('lstm_skip_connection', True), 28 | ('feature_mask_rounds', 5), 29 | ('feature_mask_rank', 100), 30 | ('feature_mask', False), 31 | ('sparsity_ratio', -1.0), 32 | ('overlay_rank', -1), 33 | ('hidden_size', [-1]), 34 | ('hidden_size_multiplier', 1.0), 35 | ('layer_norm', False), 36 | ('activation_fn', 'tf.tanh'), 37 | ('tie_forget_and_input_gates', False), 38 | ('cap_input_gate', True), 39 | ('trainable_initial_state', False), 40 | ('inter_layer_dropout', 0.11571939622760244), 41 | ('state_dropout', 0.1759160317735942), 42 | ('state_dropout_flip_rate', 0.0), 43 | ('update_dropout', 0.0), 44 | ('cell_clip', -1.0), 45 | # objective 46 | ('model_average', 'arithmetic'), 47 | ('num_training_samples', 1), 48 | ('l2_penalty', 9.607977185924193e-05), 49 | ('l1_penalty', 0.0), 50 | ('activation_norm_penalty', 0.0), 51 | ('drop_state_probability', 0.01), 52 | # initialization 53 | ('embedding_init_factor', 1.0), 54 | ('scale_input_embeddings', False), 55 | ('cell_init_factor', 1.0), 56 | ('forget_bias', 1.0), 57 | ('output_init_factor', 1.0), 58 | # schedule 59 | ('steps_per_turn', 200), 60 | ('print_training_stats_every_num_steps', 200), 61 | ('turns', 500), 62 | # optimization 63 | ('optimizer_type', 'rmsprop'), 64 | ('rmsprop_beta2', 0.999), 65 | ('rmsprop_epsilon', 1e-08), 66 | ('adam_beta1', 0.9), 67 | ('adam_beta2', 0.999), 68 | ('adam_epsilon', 1e-08), 69 | ('batch_size', 64), 70 | ('accum_batch_size', -1), 71 | ('max_grad_norm', 10.0), 72 | ('max_time_steps', 150), 73 | ('trigger_averaging_turns', 25), 74 | ('trigger_averaging_at_the_latest', 400), 75 | # learning rate 76 | ('learning_rate', 0.001999992683987708), 77 | ('learning_rate_decay', 1.0), 78 | ('learning_rate_decay_burn_in_steps', 0), 79 | ('drop_learning_rate_turns', -1), 80 | ('drop_learning_rate_multiplier', 1.0), 81 | ('drop_learning_rate_at_the_latest', -1), 82 | # early stopping 83 | ('early_stopping_turns', -1), 84 | ('early_stopping_rampup_turns', 0), 85 | ('early_stopping_worst_xe_target', ''), 86 | ('early_stopping_slowest_rate', 0.0), 87 | # cross-validation 88 | ('crossvalidate', False), 89 | ('crossvalidation_folds', 10), 90 | ('crossvalidation_rounds', 1), 91 | # evaluation 92 | ('max_training_eval_batches', 20), 93 | ('max_eval_eval_batches', -1), 94 | ('max_test_eval_batches', -1), 95 | ('min_non_episodic_eval_examples_per_stripe', 100), 96 | ('eval_on_test', False), 97 | ('eval_method', 'deterministic'), 98 | ('num_eval_samples', 0), 99 | ('eval_softmax_temperature', -0.8), 100 | ('eval_softmax_temperature_estimation_num_tokens', 50000), 101 | ('eval_power_mean_power', 1.0), 102 | ('eval_dropout_multiplier', 1.0), 103 | ('validation_prediction_file', ''), 104 | ('dyneval', False), 105 | ('dyneval_learning_rate', 0.001), 106 | ('dyneval_decay_rate', 0.02), 107 | ('dyneval_epsilon', 1e-05), 108 | # experiments 109 | # checkpoints 110 | ('save_checkpoints', True), 111 | # misc 112 | ('seed', 1), 113 | ('swap_memory', True), 114 | ('log_device_placement', False), 115 | ('summary_flush_secs', 120), 116 | ] 117 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/786252db3825+_tune_ptb_24m_lstm_d2_arms/trial_833/config: -------------------------------------------------------------------------------- 1 | [ ('config_version', 5), 2 | # data 3 | ('file_encoding', 'utf-8'), 
4 | ('word_based', True), 5 | ('episodic', False), 6 | # model 7 | ('num_params', 24000000), 8 | ('share_input_and_output_embeddings', True), 9 | ('input_embedding_size', -1), 10 | ('output_embedding_size', -1), 11 | ('input_embedding_ratio', 1.0), 12 | ('output_embedding_ratio', -1.0), 13 | ('mos_num_components', 0), 14 | ('token_dropout', 0.0), 15 | ('embedding_dropout', 0.0), 16 | ('input_dropout', 0.6275626425150355), 17 | ('output_dropout', 0.6901712653612706), 18 | ('downprojected_output_dropout', -1.0), 19 | ('shared_mask_dropout', False), 20 | ('embed_once', True), 21 | # cell 22 | ('model', 'lstm'), 23 | ('num_layers', 2), 24 | ('residual_connections', False), 25 | ('lstm_skip_connection', True), 26 | ('feature_mask_rounds', 0), 27 | ('feature_mask_rank', 0), 28 | ('feature_mask', False), 29 | ('sparsity_ratio', -1.0), 30 | ('overlay_rank', -1), 31 | ('hidden_size', [-1]), 32 | ('hidden_size_multiplier', 1.0), 33 | ('layer_norm', False), 34 | ('activation_fn', 'tf.tanh'), 35 | ('tie_forget_and_input_gates', False), 36 | ('cap_input_gate', True), 37 | ('trainable_initial_state', False), 38 | ('inter_layer_dropout', 0.3069926535017156), 39 | ('state_dropout', 0.3692225400980858), 40 | ('state_dropout_flip_rate', 0.0), 41 | ('update_dropout', 0.0), 42 | ('cell_clip', -1.0), 43 | # objective 44 | ('model_average', 'arithmetic'), 45 | ('num_training_samples', 1), 46 | ('l2_penalty', 0.00024908138497223704), 47 | ('l1_penalty', 0.0), 48 | ('activation_norm_penalty', 0.0), 49 | ('drop_state_probability', 0.01), 50 | # initialization 51 | ('embedding_init_factor', 1.0), 52 | ('scale_input_embeddings', False), 53 | ('cell_init_factor', 1.0), 54 | ('forget_bias', 1.0), 55 | ('output_init_factor', 1.0), 56 | # schedule 57 | ('steps_per_turn', 200), 58 | ('print_training_stats_every_num_steps', 200), 59 | ('turns', 1000), 60 | # optimization 61 | ('optimizer_type', 'rmsprop'), 62 | ('rmsprop_beta2', 0.999), 63 | ('rmsprop_epsilon', 1e-08), 64 | ('adam_beta1', 0.9), 65 | ('adam_beta2', 0.999), 66 | ('adam_epsilon', 1e-08), 67 | ('batch_size', 64), 68 | ('accum_batch_size', -1), 69 | ('max_grad_norm', 10.0), 70 | ('max_time_steps', 70), 71 | ('trigger_averaging_turns', 50), 72 | ('trigger_averaging_at_the_latest', 800), 73 | # learning rate 74 | ('learning_rate', 0.0030369099569192135), 75 | ('learning_rate_decay', 1.0), 76 | ('learning_rate_decay_burn_in_steps', 0), 77 | ('drop_learning_rate_turns', -1), 78 | ('drop_learning_rate_multiplier', 1.0), 79 | ('drop_learning_rate_at_the_latest', -1), 80 | # early stopping 81 | ('early_stopping_turns', -1), 82 | ('early_stopping_rampup_turns', 0), 83 | ('early_stopping_worst_xe_target', ''), 84 | ('early_stopping_slowest_rate', 0.0), 85 | # cross-validation 86 | ('crossvalidate', False), 87 | ('crossvalidation_folds', 10), 88 | ('crossvalidation_rounds', 1), 89 | # evaluation 90 | ('max_training_eval_batches', 20), 91 | ('max_eval_eval_batches', -1), 92 | ('max_test_eval_batches', -1), 93 | ('min_non_episodic_eval_examples_per_stripe', 100), 94 | ('eval_on_test', False), 95 | ('eval_method', 'deterministic'), 96 | ('num_eval_samples', 0), 97 | ('eval_softmax_temperature', -0.8), 98 | ('eval_softmax_temperature_estimation_num_tokens', 50000), 99 | ('eval_power_mean_power', 1.0), 100 | ('eval_dropout_multiplier', 1.0), 101 | ('validation_prediction_file', ''), 102 | ('dyneval', False), 103 | ('dyneval_learning_rate', 0.001), 104 | ('dyneval_decay_rate', 0.02), 105 | ('dyneval_epsilon', 1e-05), 106 | # experiments 107 | # checkpoints 108 | 
('save_checkpoints', True), 109 | # misc 110 | ('seed', 1), 111 | ('swap_memory', False), 112 | ('log_device_placement', False), 113 | ('summary_flush_secs', 120), 114 | ] 115 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/786252db3825+_tune_ptb_24m_lstm_fm_d2_arms/trial_483/config: -------------------------------------------------------------------------------- 1 | [ ('config_version', 5), 2 | # data 3 | ('file_encoding', 'utf-8'), 4 | ('word_based', True), 5 | ('episodic', False), 6 | # model 7 | ('num_params', 24000000), 8 | ('share_input_and_output_embeddings', True), 9 | ('input_embedding_size', -1), 10 | ('output_embedding_size', -1), 11 | ('input_embedding_ratio', 1.0), 12 | ('output_embedding_ratio', -1.0), 13 | ('mos_num_components', 0), 14 | ('token_dropout', 0.0), 15 | ('embedding_dropout', 0.0), 16 | ('input_dropout', 0.7290787773167251), 17 | ('output_dropout', 0.7156690388448465), 18 | ('downprojected_output_dropout', -1.0), 19 | ('shared_mask_dropout', False), 20 | ('embed_once', True), 21 | # cell 22 | ('model', 'lstm'), 23 | ('num_layers', 2), 24 | ('residual_connections', False), 25 | ('lstm_skip_connection', True), 26 | ('feature_mask_rounds', 5), 27 | ('feature_mask_rank', 84), 28 | ('feature_mask', False), 29 | ('sparsity_ratio', -1.0), 30 | ('overlay_rank', -1), 31 | ('hidden_size', [-1]), 32 | ('hidden_size_multiplier', 1.0), 33 | ('layer_norm', False), 34 | ('activation_fn', 'tf.tanh'), 35 | ('tie_forget_and_input_gates', False), 36 | ('cap_input_gate', True), 37 | ('trainable_initial_state', False), 38 | ('inter_layer_dropout', 0.2909822365241189), 39 | ('state_dropout', 0.38729439899832296), 40 | ('state_dropout_flip_rate', 0.0), 41 | ('update_dropout', 0.0), 42 | ('cell_clip', -1.0), 43 | # objective 44 | ('model_average', 'arithmetic'), 45 | ('num_training_samples', 1), 46 | ('l2_penalty', 0.00025235335778471014), 47 | ('l1_penalty', 0.0), 48 | ('activation_norm_penalty', 0.0), 49 | ('drop_state_probability', 0.01), 50 | # initialization 51 | ('embedding_init_factor', 1.0), 52 | ('scale_input_embeddings', False), 53 | ('cell_init_factor', 1.0), 54 | ('forget_bias', 1.0), 55 | ('output_init_factor', 1.0), 56 | # schedule 57 | ('steps_per_turn', 200), 58 | ('print_training_stats_every_num_steps', 200), 59 | ('turns', 1000), 60 | # optimization 61 | ('optimizer_type', 'rmsprop'), 62 | ('rmsprop_beta2', 0.999), 63 | ('rmsprop_epsilon', 1e-08), 64 | ('adam_beta1', 0.9), 65 | ('adam_beta2', 0.999), 66 | ('adam_epsilon', 1e-08), 67 | ('batch_size', 64), 68 | ('accum_batch_size', -1), 69 | ('max_grad_norm', 10.0), 70 | ('max_time_steps', 70), 71 | ('trigger_averaging_turns', 50), 72 | ('trigger_averaging_at_the_latest', 800), 73 | # learning rate 74 | ('learning_rate', 0.002299987130225388), 75 | ('learning_rate_decay', 1.0), 76 | ('learning_rate_decay_burn_in_steps', 0), 77 | ('drop_learning_rate_turns', -1), 78 | ('drop_learning_rate_multiplier', 1.0), 79 | ('drop_learning_rate_at_the_latest', -1), 80 | # early stopping 81 | ('early_stopping_turns', -1), 82 | ('early_stopping_rampup_turns', 0), 83 | ('early_stopping_worst_xe_target', ''), 84 | ('early_stopping_slowest_rate', 0.0), 85 | # cross-validation 86 | ('crossvalidate', False), 87 | ('crossvalidation_folds', 10), 88 | ('crossvalidation_rounds', 1), 89 | # evaluation 90 | ('max_training_eval_batches', 20), 91 | ('max_eval_eval_batches', -1), 92 | ('max_test_eval_batches', -1), 93 | ('min_non_episodic_eval_examples_per_stripe', 100), 94 | 
('eval_on_test', False), 95 | ('eval_method', 'deterministic'), 96 | ('num_eval_samples', 0), 97 | ('eval_softmax_temperature', -0.8), 98 | ('eval_softmax_temperature_estimation_num_tokens', 50000), 99 | ('eval_power_mean_power', 1.0), 100 | ('eval_dropout_multiplier', 1.0), 101 | ('validation_prediction_file', ''), 102 | ('dyneval', False), 103 | ('dyneval_learning_rate', 0.001), 104 | ('dyneval_decay_rate', 0.02), 105 | ('dyneval_epsilon', 1e-05), 106 | # experiments 107 | # checkpoints 108 | ('save_checkpoints', True), 109 | # misc 110 | ('seed', 1), 111 | ('swap_memory', False), 112 | ('log_device_placement', False), 113 | ('summary_flush_secs', 120), 114 | ] 115 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/9e20581d3dad+_tune_mwc_en_24m_lstm_d2_arms/trial_502/config: -------------------------------------------------------------------------------- 1 | [ ('config_version', 5), 2 | # data 3 | ('file_encoding', 'utf-8'), 4 | ('word_based', False), 5 | ('episodic', False), 6 | # model 7 | ('num_params', 24000000), 8 | ('share_input_and_output_embeddings', False), 9 | ('input_embedding_size', -1), 10 | ('output_embedding_size', -1), 11 | ('input_embedding_ratio', 0.16763290107221795), 12 | ('output_embedding_ratio', -1.0), 13 | ('mos_num_components', 0), 14 | ('token_dropout', 0.0), 15 | ('embedding_dropout', 0.0), 16 | ('input_dropout', 0.24803406411000273), 17 | ('output_dropout', 0.06200886700243824), 18 | ('downprojected_output_dropout', -1.0), 19 | ('shared_mask_dropout', False), 20 | ('embed_once', True), 21 | # cell 22 | ('model', 'lstm'), 23 | ('num_layers', 2), 24 | ('residual_connections', False), 25 | ('lstm_skip_connection', True), 26 | ('feature_mask_rounds', 0), 27 | ('feature_mask_rank', 0), 28 | ('feature_mask', False), 29 | ('sparsity_ratio', -1.0), 30 | ('overlay_rank', -1), 31 | ('hidden_size', [-1]), 32 | ('hidden_size_multiplier', 1.0), 33 | ('layer_norm', False), 34 | ('activation_fn', 'tf.tanh'), 35 | ('tie_forget_and_input_gates', False), 36 | ('cap_input_gate', True), 37 | ('trainable_initial_state', False), 38 | ('inter_layer_dropout', 0.04740148103981923), 39 | ('state_dropout', 0.046954638037220955), 40 | ('state_dropout_flip_rate', 0.0), 41 | ('update_dropout', 0.0), 42 | ('cell_clip', -1.0), 43 | # objective 44 | ('model_average', 'arithmetic'), 45 | ('num_training_samples', 1), 46 | ('l2_penalty', 7.825277510671981e-06), 47 | ('l1_penalty', 0.0), 48 | ('activation_norm_penalty', 0.0), 49 | ('drop_state_probability', 0.01), 50 | # initialization 51 | ('embedding_init_factor', 1.0), 52 | ('scale_input_embeddings', False), 53 | ('cell_init_factor', 1.0), 54 | ('forget_bias', 1.0), 55 | ('output_init_factor', 1.0), 56 | # schedule 57 | ('steps_per_turn', 200), 58 | ('print_training_stats_every_num_steps', 200), 59 | ('turns', 500), 60 | # optimization 61 | ('optimizer_type', 'rmsprop'), 62 | ('rmsprop_beta2', 0.999), 63 | ('rmsprop_epsilon', 1e-08), 64 | ('adam_beta1', 0.9), 65 | ('adam_beta2', 0.999), 66 | ('adam_epsilon', 1e-08), 67 | ('batch_size', 64), 68 | ('accum_batch_size', -1), 69 | ('max_grad_norm', 10.0), 70 | ('max_time_steps', 150), 71 | ('trigger_averaging_turns', 25), 72 | ('trigger_averaging_at_the_latest', 400), 73 | # learning rate 74 | ('learning_rate', 0.0038051220647221428), 75 | ('learning_rate_decay', 1.0), 76 | ('learning_rate_decay_burn_in_steps', 0), 77 | ('drop_learning_rate_turns', -1), 78 | ('drop_learning_rate_multiplier', 1.0), 79 | 
('drop_learning_rate_at_the_latest', -1), 80 | # early stopping 81 | ('early_stopping_turns', -1), 82 | ('early_stopping_rampup_turns', 0), 83 | ('early_stopping_worst_xe_target', ''), 84 | ('early_stopping_slowest_rate', 0.0), 85 | # cross-validation 86 | ('crossvalidate', False), 87 | ('crossvalidation_folds', 10), 88 | ('crossvalidation_rounds', 1), 89 | # evaluation 90 | ('max_training_eval_batches', 20), 91 | ('max_eval_eval_batches', -1), 92 | ('max_test_eval_batches', -1), 93 | ('min_non_episodic_eval_examples_per_stripe', 100), 94 | ('eval_on_test', False), 95 | ('eval_method', 'deterministic'), 96 | ('num_eval_samples', 0), 97 | ('eval_softmax_temperature', -0.8), 98 | ('eval_softmax_temperature_estimation_num_tokens', 50000), 99 | ('eval_power_mean_power', 1.0), 100 | ('eval_dropout_multiplier', 1.0), 101 | ('validation_prediction_file', ''), 102 | ('dyneval', False), 103 | ('dyneval_learning_rate', 0.001), 104 | ('dyneval_decay_rate', 0.02), 105 | ('dyneval_epsilon', 1e-05), 106 | # experiments 107 | # checkpoints 108 | ('save_checkpoints', True), 109 | # misc 110 | ('seed', 1), 111 | ('swap_memory', True), 112 | ('log_device_placement', False), 113 | ('summary_flush_secs', 120), 114 | ] 115 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/9e20581d3dad+_tune_mwc_en_24m_lstm_fm_d2_arms/trial_422/config: -------------------------------------------------------------------------------- 1 | [ ('config_version', 5), 2 | # data 3 | ('file_encoding', 'utf-8'), 4 | ('word_based', False), 5 | ('episodic', False), 6 | # model 7 | ('num_params', 24000000), 8 | ('share_input_and_output_embeddings', False), 9 | ('input_embedding_size', -1), 10 | ('output_embedding_size', -1), 11 | ('input_embedding_ratio', 0.48783057795681084), 12 | ('output_embedding_ratio', -1.0), 13 | ('mos_num_components', 0), 14 | ('token_dropout', 0.0), 15 | ('embedding_dropout', 0.0), 16 | ('input_dropout', 0.22379479910762798), 17 | ('output_dropout', 0.005212299871888891), 18 | ('downprojected_output_dropout', -1.0), 19 | ('shared_mask_dropout', False), 20 | ('embed_once', True), 21 | # cell 22 | ('model', 'lstm'), 23 | ('num_layers', 2), 24 | ('residual_connections', False), 25 | ('lstm_skip_connection', True), 26 | ('feature_mask_rounds', 6), 27 | ('feature_mask_rank', 78), 28 | ('feature_mask', False), 29 | ('sparsity_ratio', -1.0), 30 | ('overlay_rank', -1), 31 | ('hidden_size', [-1]), 32 | ('hidden_size_multiplier', 1.0), 33 | ('layer_norm', False), 34 | ('activation_fn', 'tf.tanh'), 35 | ('tie_forget_and_input_gates', False), 36 | ('cap_input_gate', True), 37 | ('trainable_initial_state', False), 38 | ('inter_layer_dropout', 0.08779703173530118), 39 | ('state_dropout', 0.09548532162445378), 40 | ('state_dropout_flip_rate', 0.0), 41 | ('update_dropout', 0.0), 42 | ('cell_clip', -1.0), 43 | # objective 44 | ('model_average', 'arithmetic'), 45 | ('num_training_samples', 1), 46 | ('l2_penalty', 9.245434142118616e-05), 47 | ('l1_penalty', 0.0), 48 | ('activation_norm_penalty', 0.0), 49 | ('drop_state_probability', 0.01), 50 | # initialization 51 | ('embedding_init_factor', 1.0), 52 | ('scale_input_embeddings', False), 53 | ('cell_init_factor', 1.0), 54 | ('forget_bias', 1.0), 55 | ('output_init_factor', 1.0), 56 | # schedule 57 | ('steps_per_turn', 200), 58 | ('print_training_stats_every_num_steps', 200), 59 | ('turns', 500), 60 | # optimization 61 | ('optimizer_type', 'rmsprop'), 62 | ('rmsprop_beta2', 0.999), 63 | ('rmsprop_epsilon', 1e-08), 64 | 
('adam_beta1', 0.9), 65 | ('adam_beta2', 0.999), 66 | ('adam_epsilon', 1e-08), 67 | ('batch_size', 64), 68 | ('accum_batch_size', -1), 69 | ('max_grad_norm', 10.0), 70 | ('max_time_steps', 150), 71 | ('trigger_averaging_turns', 25), 72 | ('trigger_averaging_at_the_latest', 400), 73 | # learning rate 74 | ('learning_rate', 0.0014344414472614946), 75 | ('learning_rate_decay', 1.0), 76 | ('learning_rate_decay_burn_in_steps', 0), 77 | ('drop_learning_rate_turns', -1), 78 | ('drop_learning_rate_multiplier', 1.0), 79 | ('drop_learning_rate_at_the_latest', -1), 80 | # early stopping 81 | ('early_stopping_turns', -1), 82 | ('early_stopping_rampup_turns', 0), 83 | ('early_stopping_worst_xe_target', ''), 84 | ('early_stopping_slowest_rate', 0.0), 85 | # cross-validation 86 | ('crossvalidate', False), 87 | ('crossvalidation_folds', 10), 88 | ('crossvalidation_rounds', 1), 89 | # evaluation 90 | ('max_training_eval_batches', 20), 91 | ('max_eval_eval_batches', -1), 92 | ('max_test_eval_batches', -1), 93 | ('min_non_episodic_eval_examples_per_stripe', 100), 94 | ('eval_on_test', False), 95 | ('eval_method', 'deterministic'), 96 | ('num_eval_samples', 0), 97 | ('eval_softmax_temperature', -0.8), 98 | ('eval_softmax_temperature_estimation_num_tokens', 50000), 99 | ('eval_power_mean_power', 1.0), 100 | ('eval_dropout_multiplier', 1.0), 101 | ('validation_prediction_file', ''), 102 | ('dyneval', False), 103 | ('dyneval_learning_rate', 0.001), 104 | ('dyneval_decay_rate', 0.02), 105 | ('dyneval_epsilon', 1e-05), 106 | # experiments 107 | # checkpoints 108 | ('save_checkpoints', True), 109 | # misc 110 | ('seed', 1), 111 | ('swap_memory', True), 112 | ('log_device_placement', False), 113 | ('summary_flush_secs', 120), 114 | ] 115 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/c51c838b33a5+_tune_wikitext-2_35m_lstm_mos2_d2_arms/trial_763/config: -------------------------------------------------------------------------------- 1 | [ ('config_version', 5), 2 | # data 3 | ('file_encoding', 'utf-8'), 4 | ('word_based', True), 5 | ('episodic', False), 6 | # model 7 | ('num_params', 35000000), 8 | ('share_input_and_output_embeddings', True), 9 | ('input_embedding_size', -1), 10 | ('output_embedding_size', -1), 11 | ('input_embedding_ratio', 0.3530770457779424), 12 | ('output_embedding_ratio', -1.0), 13 | ('mos_num_components', 2), 14 | ('token_dropout', 0.0), 15 | ('embedding_dropout', 0.0), 16 | ('input_dropout', 0.6090979517941943), 17 | ('output_dropout', 0.34845530389157287), 18 | ('downprojected_output_dropout', -1.0), 19 | ('shared_mask_dropout', False), 20 | ('embed_once', True), 21 | # cell 22 | ('model', 'lstm'), 23 | ('num_layers', 2), 24 | ('residual_connections', False), 25 | ('lstm_skip_connection', True), 26 | ('feature_mask_rounds', 0), 27 | ('feature_mask_rank', 0), 28 | ('feature_mask', False), 29 | ('sparsity_ratio', -1.0), 30 | ('overlay_rank', -1), 31 | ('hidden_size', [-1]), 32 | ('hidden_size_multiplier', 1.0), 33 | ('layer_norm', False), 34 | ('activation_fn', 'tf.tanh'), 35 | ('tie_forget_and_input_gates', False), 36 | ('cap_input_gate', True), 37 | ('trainable_initial_state', False), 38 | ('inter_layer_dropout', 0.09075401405970591), 39 | ('state_dropout', 0.2714030562283111), 40 | ('state_dropout_flip_rate', 0.0), 41 | ('update_dropout', 0.0), 42 | ('cell_clip', -1.0), 43 | # objective 44 | ('model_average', 'arithmetic'), 45 | ('num_training_samples', 1), 46 | ('l2_penalty', 0.00023063627783021125), 47 | ('l1_penalty', 
0.0), 48 | ('activation_norm_penalty', 0.0), 49 | ('drop_state_probability', 0.01), 50 | # initialization 51 | ('embedding_init_factor', 1.0), 52 | ('scale_input_embeddings', False), 53 | ('cell_init_factor', 1.0), 54 | ('forget_bias', 1.0), 55 | ('output_init_factor', 1.0), 56 | # schedule 57 | ('steps_per_turn', 200), 58 | ('print_training_stats_every_num_steps', 200), 59 | ('turns', 1000), 60 | # optimization 61 | ('optimizer_type', 'rmsprop'), 62 | ('rmsprop_beta2', 0.999), 63 | ('rmsprop_epsilon', 1e-08), 64 | ('adam_beta1', 0.9), 65 | ('adam_beta2', 0.999), 66 | ('adam_epsilon', 1e-08), 67 | ('batch_size', 64), 68 | ('accum_batch_size', -1), 69 | ('max_grad_norm', 10.0), 70 | ('max_time_steps', 70), 71 | ('trigger_averaging_turns', 50), 72 | ('trigger_averaging_at_the_latest', 800), 73 | # learning rate 74 | ('learning_rate', 0.003183909546336849), 75 | ('learning_rate_decay', 1.0), 76 | ('learning_rate_decay_burn_in_steps', 0), 77 | ('drop_learning_rate_turns', -1), 78 | ('drop_learning_rate_multiplier', 1.0), 79 | ('drop_learning_rate_at_the_latest', -1), 80 | # early stopping 81 | ('early_stopping_turns', -1), 82 | ('early_stopping_rampup_turns', 0), 83 | ('early_stopping_worst_xe_target', ''), 84 | ('early_stopping_slowest_rate', 0.0), 85 | # cross-validation 86 | ('crossvalidate', False), 87 | ('crossvalidation_folds', 10), 88 | ('crossvalidation_rounds', 1), 89 | # evaluation 90 | ('max_training_eval_batches', 20), 91 | ('max_eval_eval_batches', -1), 92 | ('max_test_eval_batches', -1), 93 | ('min_non_episodic_eval_examples_per_stripe', 100), 94 | ('eval_on_test', False), 95 | ('eval_method', 'deterministic'), 96 | ('num_eval_samples', 0), 97 | ('eval_softmax_temperature', -0.8), 98 | ('eval_softmax_temperature_estimation_num_tokens', 50000), 99 | ('eval_power_mean_power', 1.0), 100 | ('eval_dropout_multiplier', 1.0), 101 | ('validation_prediction_file', ''), 102 | ('dyneval', False), 103 | ('dyneval_learning_rate', 0.001), 104 | ('dyneval_decay_rate', 0.02), 105 | ('dyneval_epsilon', 1e-05), 106 | # experiments 107 | # checkpoints 108 | ('save_checkpoints', True), 109 | # misc 110 | ('seed', 1), 111 | ('swap_memory', True), 112 | ('log_device_placement', False), 113 | ('summary_flush_secs', 120), 114 | ] 115 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/c51c838b33a5+_tune_wikitext-2_35m_lstm_mos2_fm_d2_arms/trial_747/config: -------------------------------------------------------------------------------- 1 | [ ('config_version', 5), 2 | # data 3 | ('file_encoding', 'utf-8'), 4 | ('word_based', True), 5 | ('episodic', False), 6 | # model 7 | ('num_params', 35000000), 8 | ('share_input_and_output_embeddings', True), 9 | ('input_embedding_size', -1), 10 | ('output_embedding_size', -1), 11 | ('input_embedding_ratio', 0.1993194960596213), 12 | ('output_embedding_ratio', -1.0), 13 | ('mos_num_components', 2), 14 | ('token_dropout', 0.0), 15 | ('embedding_dropout', 0.0), 16 | ('input_dropout', 0.5469087645499495), 17 | ('output_dropout', 0.34766651193735193), 18 | ('downprojected_output_dropout', -1.0), 19 | ('shared_mask_dropout', False), 20 | ('embed_once', True), 21 | # cell 22 | ('model', 'lstm'), 23 | ('num_layers', 2), 24 | ('residual_connections', False), 25 | ('lstm_skip_connection', True), 26 | ('feature_mask_rounds', 6), 27 | ('feature_mask_rank', 48), 28 | ('feature_mask', False), 29 | ('sparsity_ratio', -1.0), 30 | ('overlay_rank', -1), 31 | ('hidden_size', [-1]), 32 | ('hidden_size_multiplier', 1.0), 
33 | ('layer_norm', False), 34 | ('activation_fn', 'tf.tanh'), 35 | ('tie_forget_and_input_gates', False), 36 | ('cap_input_gate', True), 37 | ('trainable_initial_state', False), 38 | ('inter_layer_dropout', 0.1988228263748591), 39 | ('state_dropout', 0.22137985867236876), 40 | ('state_dropout_flip_rate', 0.0), 41 | ('update_dropout', 0.0), 42 | ('cell_clip', -1.0), 43 | # objective 44 | ('model_average', 'arithmetic'), 45 | ('num_training_samples', 1), 46 | ('l2_penalty', 0.00018994987193751323), 47 | ('l1_penalty', 0.0), 48 | ('activation_norm_penalty', 0.0), 49 | ('drop_state_probability', 0.01), 50 | # initialization 51 | ('embedding_init_factor', 1.0), 52 | ('scale_input_embeddings', False), 53 | ('cell_init_factor', 1.0), 54 | ('forget_bias', 1.0), 55 | ('output_init_factor', 1.0), 56 | # schedule 57 | ('steps_per_turn', 200), 58 | ('print_training_stats_every_num_steps', 200), 59 | ('turns', 1000), 60 | # optimization 61 | ('optimizer_type', 'rmsprop'), 62 | ('rmsprop_beta2', 0.999), 63 | ('rmsprop_epsilon', 1e-08), 64 | ('adam_beta1', 0.9), 65 | ('adam_beta2', 0.999), 66 | ('adam_epsilon', 1e-08), 67 | ('batch_size', 64), 68 | ('accum_batch_size', -1), 69 | ('max_grad_norm', 10.0), 70 | ('max_time_steps', 70), 71 | ('trigger_averaging_turns', 50), 72 | ('trigger_averaging_at_the_latest', 800), 73 | # learning rate 74 | ('learning_rate', 0.003287792100749033), 75 | ('learning_rate_decay', 1.0), 76 | ('learning_rate_decay_burn_in_steps', 0), 77 | ('drop_learning_rate_turns', -1), 78 | ('drop_learning_rate_multiplier', 1.0), 79 | ('drop_learning_rate_at_the_latest', -1), 80 | # early stopping 81 | ('early_stopping_turns', -1), 82 | ('early_stopping_rampup_turns', 0), 83 | ('early_stopping_worst_xe_target', ''), 84 | ('early_stopping_slowest_rate', 0.0), 85 | # cross-validation 86 | ('crossvalidate', False), 87 | ('crossvalidation_folds', 10), 88 | ('crossvalidation_rounds', 1), 89 | # evaluation 90 | ('max_training_eval_batches', 20), 91 | ('max_eval_eval_batches', -1), 92 | ('max_test_eval_batches', -1), 93 | ('min_non_episodic_eval_examples_per_stripe', 100), 94 | ('eval_on_test', False), 95 | ('eval_method', 'deterministic'), 96 | ('num_eval_samples', 0), 97 | ('eval_softmax_temperature', -0.8), 98 | ('eval_softmax_temperature_estimation_num_tokens', 50000), 99 | ('eval_power_mean_power', 1.0), 100 | ('eval_dropout_multiplier', 1.0), 101 | ('validation_prediction_file', ''), 102 | ('dyneval', False), 103 | ('dyneval_learning_rate', 0.001), 104 | ('dyneval_decay_rate', 0.02), 105 | ('dyneval_epsilon', 1e-05), 106 | # experiments 107 | # checkpoints 108 | ('save_checkpoints', True), 109 | # misc 110 | ('seed', 1), 111 | ('swap_memory', True), 112 | ('log_device_placement', False), 113 | ('summary_flush_secs', 120), 114 | ] 115 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/e81db31261c0+_tune_enwik8_96m_lstm_d4_arms/trial_295/config: -------------------------------------------------------------------------------- 1 | [ ('config_version', 5), 2 | # data 3 | ('file_encoding', 'CP437'), 4 | ('word_based', False), 5 | ('episodic', False), 6 | # model 7 | ('num_params', 96000000), 8 | ('share_input_and_output_embeddings', False), 9 | ('input_embedding_size', -1), 10 | ('output_embedding_size', -1), 11 | ('input_embedding_ratio', 1.0), 12 | ('output_embedding_ratio', -1.0), 13 | ('mos_num_components', 0), 14 | ('token_dropout', 0.0), 15 | ('embedding_dropout', 0.0), 16 | ('input_dropout', 0.1233672355450206), 17 | 
('output_dropout', 0.24846692818769148), 18 | ('downprojected_output_dropout', -1.0), 19 | ('shared_mask_dropout', False), 20 | ('embed_once', True), 21 | # cell 22 | ('model', 'lstm'), 23 | ('num_layers', 4), 24 | ('residual_connections', False), 25 | ('lstm_skip_connection', True), 26 | ('feature_mask_rounds', 0), 27 | ('feature_mask_rank', 0), 28 | ('feature_mask', False), 29 | ('sparsity_ratio', -1.0), 30 | ('overlay_rank', -1), 31 | ('hidden_size', [-1]), 32 | ('hidden_size_multiplier', 1.0), 33 | ('layer_norm', False), 34 | ('activation_fn', 'tf.tanh'), 35 | ('tie_forget_and_input_gates', False), 36 | ('cap_input_gate', True), 37 | ('trainable_initial_state', False), 38 | ('inter_layer_dropout', 0.12636500626697247), 39 | ('state_dropout', 0.13063181510547955), 40 | ('state_dropout_flip_rate', 0.0), 41 | ('update_dropout', 0.0), 42 | ('cell_clip', -1.0), 43 | # objective 44 | ('model_average', 'arithmetic'), 45 | ('num_training_samples', 1), 46 | ('l2_penalty', 5.853555404849184e-05), 47 | ('l1_penalty', 0.0), 48 | ('activation_norm_penalty', 0.0), 49 | ('drop_state_probability', 0.01), 50 | # initialization 51 | ('embedding_init_factor', 1.0), 52 | ('scale_input_embeddings', False), 53 | ('cell_init_factor', 1.0), 54 | ('forget_bias', 1.0), 55 | ('output_init_factor', 1.0), 56 | # schedule 57 | ('steps_per_turn', 400), 58 | ('print_training_stats_every_num_steps', 1000), 59 | ('turns', 100), 60 | # optimization 61 | ('optimizer_type', 'rmsprop'), 62 | ('rmsprop_beta2', 0.999), 63 | ('rmsprop_epsilon', 1e-08), 64 | ('adam_beta1', 0.9), 65 | ('adam_beta2', 0.999), 66 | ('adam_epsilon', 1e-08), 67 | ('batch_size', 128), 68 | ('accum_batch_size', 64), 69 | ('max_grad_norm', 10.0), 70 | ('max_time_steps', 500), 71 | ('trigger_averaging_turns', 10), 72 | ('trigger_averaging_at_the_latest', 80), 73 | # learning rate 74 | ('learning_rate', 0.001975213597736287), 75 | ('learning_rate_decay', 1.0), 76 | ('learning_rate_decay_burn_in_steps', 0), 77 | ('drop_learning_rate_turns', -1), 78 | ('drop_learning_rate_multiplier', 1.0), 79 | ('drop_learning_rate_at_the_latest', -1), 80 | # early stopping 81 | ('early_stopping_turns', -1), 82 | ('early_stopping_rampup_turns', 0), 83 | ('early_stopping_worst_xe_target', ''), 84 | ('early_stopping_slowest_rate', 0.0), 85 | # cross-validation 86 | ('crossvalidate', False), 87 | ('crossvalidation_folds', 10), 88 | ('crossvalidation_rounds', 1), 89 | # evaluation 90 | ('max_training_eval_batches', 20), 91 | ('max_eval_eval_batches', -1), 92 | ('max_test_eval_batches', -1), 93 | ('min_non_episodic_eval_examples_per_stripe', 100), 94 | ('eval_on_test', False), 95 | ('eval_method', 'deterministic'), 96 | ('num_eval_samples', 0), 97 | ('eval_softmax_temperature', -0.8), 98 | ('eval_softmax_temperature_estimation_num_tokens', 50000), 99 | ('eval_power_mean_power', 1.0), 100 | ('eval_dropout_multiplier', 1.0), 101 | ('validation_prediction_file', ''), 102 | ('dyneval', False), 103 | ('dyneval_learning_rate', 0.001), 104 | ('dyneval_decay_rate', 0.02), 105 | ('dyneval_epsilon', 1e-05), 106 | # experiments 107 | # checkpoints 108 | ('save_checkpoints', True), 109 | # misc 110 | ('seed', 1), 111 | ('swap_memory', True), 112 | ('log_device_placement', False), 113 | ('summary_flush_secs', 120), 114 | ] 115 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/e81db31261c0+_tune_enwik8_96m_lstm_fm_d4_arms_MtngW/trial_216/config: -------------------------------------------------------------------------------- 
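Each of these trial configs is a plain Python literal: a list of (key, value) pairs, with `#` comment lines grouping related options. A minimal loader sketch, assuming a well-formed file (the repository's own flag handling lives in lamb_flags.py and may treat these files differently):

    import ast

    def load_trial_config(path):
        # '#' comments are discarded by Python's tokenizer, so the whole
        # file parses as a single literal; literal_eval never executes code.
        with open(path) as f:
            pairs = ast.literal_eval(f.read())
        return dict(pairs)

    config = load_trial_config('trial_216/config')
    assert config['config_version'] == 5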
1 | [ ('config_version', 5), 2 | # data 3 | ('file_encoding', 'CP437'), 4 | ('word_based', False), 5 | ('episodic', False), 6 | # model 7 | ('num_params', 96000000), 8 | ('share_input_and_output_embeddings', False), 9 | ('input_embedding_size', -1), 10 | ('output_embedding_size', -1), 11 | ('input_embedding_ratio', 1.0), 12 | ('output_embedding_ratio', -1.0), 13 | ('mos_num_components', 0), 14 | ('token_dropout', 0.0), 15 | ('embedding_dropout', 0.0), 16 | ('input_dropout', 0.015908852824256536), 17 | ('output_dropout', 0.2878539844807166), 18 | ('downprojected_output_dropout', -1.0), 19 | ('shared_mask_dropout', False), 20 | ('embed_once', True), 21 | # cell 22 | ('model', 'lstm'), 23 | ('num_layers', 4), 24 | ('residual_connections', False), 25 | ('lstm_skip_connection', True), 26 | ('feature_mask_rounds', 5), 27 | ('feature_mask_rank', 61), 28 | ('feature_mask', False), 29 | ('sparsity_ratio', -1.0), 30 | ('overlay_rank', -1), 31 | ('hidden_size', [-1]), 32 | ('hidden_size_multiplier', 1.0), 33 | ('layer_norm', False), 34 | ('activation_fn', 'tf.tanh'), 35 | ('tie_forget_and_input_gates', False), 36 | ('cap_input_gate', True), 37 | ('trainable_initial_state', False), 38 | ('inter_layer_dropout', 0.13785990907975867), 39 | ('state_dropout', 0.1648727901535727), 40 | ('state_dropout_flip_rate', 0.0), 41 | ('update_dropout', 0.0), 42 | ('cell_clip', -1.0), 43 | # objective 44 | ('model_average', 'arithmetic'), 45 | ('num_training_samples', 1), 46 | ('l2_penalty', 4.409390792135428e-05), 47 | ('l1_penalty', 0.0), 48 | ('activation_norm_penalty', 0.0), 49 | ('drop_state_probability', 0.01), 50 | # initialization 51 | ('embedding_init_factor', 1.0), 52 | ('scale_input_embeddings', False), 53 | ('cell_init_factor', 1.0), 54 | ('forget_bias', 1.0), 55 | ('output_init_factor', 1.0), 56 | # schedule 57 | ('steps_per_turn', 400), 58 | ('print_training_stats_every_num_steps', 1000), 59 | ('turns', 100), 60 | # optimization 61 | ('optimizer_type', 'rmsprop'), 62 | ('rmsprop_beta2', 0.999), 63 | ('rmsprop_epsilon', 1e-08), 64 | ('adam_beta1', 0.9), 65 | ('adam_beta2', 0.999), 66 | ('adam_epsilon', 1e-08), 67 | ('batch_size', 128), 68 | ('accum_batch_size', 64), 69 | ('max_grad_norm', 10.0), 70 | ('max_time_steps', 500), 71 | ('trigger_averaging_turns', 10), 72 | ('trigger_averaging_at_the_latest', 80), 73 | # learning rate 74 | ('learning_rate', 0.0022480107672343715), 75 | ('learning_rate_decay', 1.0), 76 | ('learning_rate_decay_burn_in_steps', 0), 77 | ('drop_learning_rate_turns', -1), 78 | ('drop_learning_rate_multiplier', 1.0), 79 | ('drop_learning_rate_at_the_latest', -1), 80 | # early stopping 81 | ('early_stopping_turns', -1), 82 | ('early_stopping_rampup_turns', 0), 83 | ('early_stopping_worst_xe_target', ''), 84 | ('early_stopping_slowest_rate', 0.0), 85 | # cross-validation 86 | ('crossvalidate', False), 87 | ('crossvalidation_folds', 10), 88 | ('crossvalidation_rounds', 1), 89 | # evaluation 90 | ('max_training_eval_batches', 20), 91 | ('max_eval_eval_batches', -1), 92 | ('max_test_eval_batches', -1), 93 | ('min_non_episodic_eval_examples_per_stripe', 100), 94 | ('eval_on_test', False), 95 | ('eval_method', 'deterministic'), 96 | ('num_eval_samples', 0), 97 | ('eval_softmax_temperature', -0.8), 98 | ('eval_softmax_temperature_estimation_num_tokens', 50000), 99 | ('eval_power_mean_power', 1.0), 100 | ('eval_dropout_multiplier', 1.0), 101 | ('validation_prediction_file', ''), 102 | ('dyneval', False), 103 | ('dyneval_learning_rate', 0.001), 104 | ('dyneval_decay_rate', 0.02), 105 | 
('dyneval_epsilon', 1e-05), 106 | # experiments 107 | # checkpoints 108 | ('save_checkpoints', True), 109 | # misc 110 | ('seed', 1), 111 | ('swap_memory', True), 112 | ('log_device_placement', False), 113 | ('summary_flush_secs', 120), 114 | ] 115 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/train_enwik8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | source_lib "config/enwik8.sh" 24 | 25 | name="$2" 26 | config_file="$3" 27 | 28 | source_lib "run.sh" "$1" 29 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/train_mwc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | source_lib "config/mwc.sh" 24 | 25 | # Data 26 | 27 | lang="${2:-en}" 28 | training_file="${mwc_data_dir}/wiki_${lang}/ptb_format_large/wiki_${lang}.tr.raw.unk" 29 | validation_file="${mwc_data_dir}/wiki_${lang}/ptb_format_large/wiki_${lang}.va.raw.unk" 30 | test_file="${mwc_data_dir}/wiki_${lang}/ptb_format_large/wiki_${lang}.te.raw.unk" 31 | 32 | name="$2" 33 | config_file="$3" 34 | 35 | source_lib "run.sh" "$1" 36 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/train_ptb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | source_lib "config/ptb_word.sh" 24 | 25 | name="$2" 26 | config_file="$3" 27 | 28 | source_lib "run.sh" "$1" 29 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/train_ptb_char.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | source_lib "config/ptb_char.sh" 24 | 25 | name="$2" 26 | config_file="$3" 27 | 28 | source_lib "run.sh" "$1" 29 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/train_wikitext-2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | source_lib "config/wikitext-2_word.sh" 24 | 25 | name="$2" 26 | config_file="$3" 27 | 28 | source_lib "run.sh" "$1" 29 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/tune_copy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is for 19 | # illustration only. 20 | 21 | set -e 22 | 23 | # Include definitions of dataset and tuning related variables. 24 | source "$(dirname $0)/../../lib/setup.sh" "$@" 25 | source_lib "config/common.sh" 26 | source_lib "config/tuning.sh" 27 | source_lib "config/copy.sh" 28 | 29 | # Model 30 | 31 | num_param_millions=10 32 | num_params=$(million ${num_param_millions}) 33 | share_input_and_output_embeddings=false 34 | shared_mask_dropout=false 35 | 36 | # Cell 37 | 38 | model="lstm" 39 | num_layers=1 40 | lstm_skip_connection=true 41 | tie_forget_and_input_gates=false 42 | cap_input_gate=true 43 | 44 | # Objective 45 | 46 | drop_state_probability=0.0 47 | 48 | # Initialization 49 | 50 | forget_bias=1.0 51 | 52 | # Schedule 53 | 54 | steps_per_turn=200 55 | print_training_stats_every_num_steps=200 56 | turns=100 57 | 58 | # Optimizer 59 | 60 | optimizer_type="rmsprop" 61 | batch_size=64 62 | max_grad_norm=10.0 63 | max_time_steps=155 64 | 65 | # Early stopping 66 | 67 | # early_stopping_turns=30 68 | # early_stopping_worst_xe_target=4.4 69 | 70 | # Evaluation 71 | 72 | max_training_eval_batches=20 73 | eval_softmax_temperature=-0.8 74 | 75 | # Tuning parameters 76 | 77 | priority=200 78 | num_workers=60 79 | 80 | # Misc 81 | 82 | swap_memory=true 83 | 84 | # Start experiments with averaged optimization 85 | 86 | drop_learning_rate_turns=-1 87 | drop_learning_rate_multiplier=1.0 88 | drop_learning_rate_at_the_latest=-1 89 | trigger_averaging_turns=10 90 | trigger_averaging_at_the_latest=80 91 | 92 | # feature mask 93 | tuneables="input_embedding_ratio,learning_rate,l2_penalty, 94 | feature_mask_rounds,feature_mask_rank" 95 | name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_arms" 96 | source_lib "run.sh" "$@" 97 | 98 | # vanilla 99 | tuneables="input_embedding_ratio,learning_rate,l2_penalty" 100 | name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_arms" 101 | source_lib "run.sh" "$@" 102 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/tune_dyneval.sh:
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is for 19 | # illustration only. 20 | 21 | set -e 22 | 23 | # Include definitions of dataset and tuning related variables. 24 | source "$(dirname $0)/../../lib/setup.sh" "$1" 25 | source_lib "config/common.sh" 26 | source_lib "config/tuning.sh" 27 | 28 | name="$2" 29 | config_file="$3/config" 30 | load_checkpoint="$3/best" 31 | 32 | save_checkpoints=false 33 | turns=0 34 | 35 | # Evaluation 36 | 37 | dyneval=true 38 | batch_size=1024 39 | max_training_eval_batches=500 40 | max_grad_norm=0.0 41 | eval_softmax_temperature=-0.8 42 | eval_softmax_temperature_estimation_num_tokens=50000 43 | l2_penalty=0.0 44 | 45 | # Tuning parameters 46 | 47 | priority=200 48 | num_workers=60 49 | 50 | tuneables="batch_size,max_time_steps, 51 | dyneval_learning_rate,dyneval_decay_rate,dyneval_epsilon" 52 | name="$(default_name)_${name}" 53 | source_lib "run.sh" "$1" 54 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/tune_enwik8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is for 19 | # illustration only. 20 | 21 | set -e 22 | 23 | # Include definitions of dataset and tuning related variables.
24 | source "$(dirname $0)/../../lib/setup.sh" "$@" 25 | source_lib "config/common.sh" 26 | source_lib "config/tuning.sh" 27 | source_lib "config/enwik8.sh" 28 | 29 | # Model 30 | 31 | num_param_millions=24 32 | num_params=$(million ${num_param_millions}) 33 | share_input_and_output_embeddings=false 34 | shared_mask_dropout=false 35 | 36 | # Cell 37 | 38 | model="lstm" 39 | num_layers=4 40 | lstm_skip_connection=true 41 | tie_forget_and_input_gates=false 42 | cap_input_gate=true 43 | 44 | # Objective 45 | 46 | drop_state_probability=0.01 47 | 48 | # Initialization 49 | 50 | forget_bias=1.0 51 | 52 | # Schedule 53 | 54 | steps_per_turn=1000 55 | print_training_stats_every_num_steps=1000 56 | turns=100 57 | 58 | # Optimizer 59 | 60 | optimizer_type="rmsprop" 61 | batch_size=128 62 | max_grad_norm=10.0 63 | max_time_steps=200 64 | 65 | # Early stopping 66 | 67 | # early_stopping_turns=30 68 | # early_stopping_worst_xe_target=4.4 69 | 70 | # Evaluation 71 | 72 | max_training_eval_batches=20 73 | eval_softmax_temperature=-0.8 74 | 75 | # Tuning parameters 76 | 77 | priority=200 78 | num_workers=60 79 | 80 | # Misc 81 | 82 | swap_memory=true 83 | 84 | # Start experiments with averaged optimization 85 | 86 | drop_learning_rate_turns=-1 87 | drop_learning_rate_multiplier=1.0 88 | drop_learning_rate_at_the_latest=-1 89 | trigger_averaging_turns=10 90 | trigger_averaging_at_the_latest=80 91 | 92 | # feature mask 93 | tuneables="input_embedding_ratio,learning_rate,l2_penalty, 94 | input_dropout,inter_layer_dropout,state_dropout, 95 | output_dropout, 96 | feature_mask_rounds,feature_mask_rank" 97 | name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_arms" 98 | source_lib "run.sh" "$@" 99 | 100 | # vanilla 101 | tuneables="input_embedding_ratio,learning_rate,l2_penalty, 102 | input_dropout,inter_layer_dropout,state_dropout, 103 | output_dropout" 104 | name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_arms" 105 | source_lib "run.sh" "$@" 106 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/tune_mwc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is for 19 | # illustration only. 20 | 21 | set -e 22 | 23 | # Include definitions of dataset and tuning related variables.
24 | source "$(dirname $0)/../../lib/setup.sh" "$@" 25 | source_lib "config/common.sh" 26 | source_lib "config/tuning.sh" 27 | source_lib "config/mwc.sh" 28 | 29 | # Data 30 | 31 | lang="${2:-en}" 32 | training_file="${mwc_data_dir}/wiki_${lang}/ptb_format_large/wiki_${lang}.tr.raw.unk" 33 | validation_file="${mwc_data_dir}/wiki_${lang}/ptb_format_large/wiki_${lang}.va.raw.unk" 34 | test_file="${mwc_data_dir}/wiki_${lang}/ptb_format_large/wiki_${lang}.te.raw.unk" 35 | 36 | # Model 37 | 38 | num_param_millions=24 39 | num_params=$(million ${num_param_millions}) 40 | share_input_and_output_embeddings=false 41 | shared_mask_dropout=false 42 | 43 | # Cell 44 | 45 | model="lstm" 46 | num_layers=2 47 | lstm_skip_connection=true 48 | tie_forget_and_input_gates=false 49 | cap_input_gate=true 50 | 51 | # Objective 52 | 53 | drop_state_probability=0.01 54 | 55 | # Initialization 56 | 57 | forget_bias=1.0 58 | 59 | # Schedule 60 | 61 | steps_per_turn=200 62 | print_training_stats_every_num_steps=200 63 | turns=500 64 | 65 | # Optimizer 66 | 67 | optimizer_type="rmsprop" 68 | batch_size=64 69 | max_grad_norm=10.0 70 | max_time_steps=150 71 | 72 | # Early stopping 73 | 74 | # early_stopping_turns=30 75 | # early_stopping_worst_xe_target=4.4 76 | 77 | # Evaluation 78 | 79 | max_training_eval_batches=20 80 | eval_softmax_temperature=-0.8 81 | 82 | # Tuning parameters 83 | 84 | priority=200 85 | num_workers=60 86 | 87 | # Misc 88 | 89 | swap_memory=true 90 | 91 | # Start experiments with averaged optimization 92 | 93 | drop_learning_rate_turns=-1 94 | drop_learning_rate_multiplier=1.0 95 | drop_learning_rate_at_the_latest=-1 96 | trigger_averaging_turns=25 97 | trigger_averaging_at_the_latest=400 98 | 99 | # feature mask 100 | tuneables="input_embedding_ratio,learning_rate,l2_penalty, 101 | input_dropout,inter_layer_dropout,state_dropout, 102 | output_dropout, 103 | feature_mask_rounds,feature_mask_rank" 104 | name="$(default_name)_${lang}_${num_param_millions}m_${model}_fm_d${num_layers}_arms" 105 | source_lib "run.sh" "$@" 106 | 107 | # vanilla 108 | tuneables="input_embedding_ratio,learning_rate,l2_penalty, 109 | input_dropout,inter_layer_dropout,state_dropout, 110 | output_dropout" 111 | name="$(default_name)_${lang}_${num_param_millions}m_${model}_d${num_layers}_arms" 112 | source_lib "run.sh" "$@" 113 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/tune_ptb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is for 19 | # illustration only. 20 | 21 | set -e 22 | 23 | # Include definitions of dataset and tuning related variables.
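# (Roughly how these pieces fit together: lib/setup.sh defines source_lib,
# which sources the named file from lib/, approximately
#
#   source_lib() { source "${lib_dir}/$1" "${@:2}"; }  # variable name is an assumption
#
# so the config/*.sh files below only set shell variables, and lib/run.sh
# later forwards whichever variables are set to main.py as --flags.)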
24 | source "$(dirname $0)/../../lib/setup.sh" "$@" 25 | source_lib "config/common.sh" 26 | source_lib "config/tuning.sh" 27 | source_lib "config/ptb_word.sh" 28 | 29 | # Model 30 | 31 | num_param_millions=24 32 | num_params=$(million ${num_param_millions}) 33 | share_input_and_output_embeddings=true 34 | shared_mask_dropout=false 35 | 36 | # Cell 37 | 38 | model="lstm" 39 | num_layers=2 40 | lstm_skip_connection=true 41 | tie_forget_and_input_gates=false 42 | cap_input_gate=true 43 | 44 | # Objective 45 | 46 | drop_state_probability=0.01 47 | 48 | # Initialization 49 | 50 | forget_bias=1.0 51 | 52 | # Schedule 53 | 54 | steps_per_turn=200 55 | print_training_stats_every_num_steps=200 56 | turns=1000 57 | 58 | # Optimizer 59 | 60 | optimizer_type="rmsprop" 61 | batch_size=64 62 | max_grad_norm=10.0 63 | max_time_steps=70 64 | 65 | # Early stopping 66 | 67 | # early_stopping_turns=30 68 | # early_stopping_worst_xe_target=4.4 69 | 70 | # Evaluation 71 | 72 | max_training_eval_batches=20 73 | eval_softmax_temperature=-0.8 74 | 75 | # Tuning parameters 76 | 77 | priority=200 78 | num_workers=60 79 | 80 | # Start experiments with dropped learning rate 81 | 82 | # drop_learning_rate_turns=100 83 | # drop_learning_rate_multiplier=0.1 84 | # drop_learning_rate_at_the_latest=1600 85 | # 86 | # # feature mask 87 | # tuneables="input_embedding_ratio,learning_rate,l2_penalty, 88 | # input_dropout,inter_layer_dropout,state_dropout, 89 | # output_dropout, 90 | # feature_mask_rounds,feature_mask_rank" 91 | # name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_rms" 92 | # source_lib "run.sh" "$@" 93 | # 94 | # # vanilla 95 | # tuneables="input_embedding_ratio,learning_rate,l2_penalty, 96 | # input_dropout,inter_layer_dropout,state_dropout, 97 | # output_dropout" 98 | # name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_rms" 99 | # source_lib "run.sh" "$@" 100 | 101 | # Start experiments with averaged optimization 102 | 103 | drop_learning_rate_turns=-1 104 | drop_learning_rate_multiplier=1.0 105 | drop_learning_rate_at_the_latest=-1 106 | trigger_averaging_turns=50 107 | trigger_averaging_at_the_latest=800 108 | 109 | # feature mask 110 | tuneables="learning_rate,l2_penalty, 111 | input_dropout,inter_layer_dropout,state_dropout, 112 | output_dropout, 113 | feature_mask_rounds,feature_mask_rank" 114 | name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_arms" 115 | source_lib "run.sh" "$@" 116 | 117 | # vanilla 118 | tuneables="learning_rate,l2_penalty, 119 | input_dropout,inter_layer_dropout,state_dropout, 120 | output_dropout" 121 | name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_arms" 122 | source_lib "run.sh" "$@" 123 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/tune_ptb_char.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is for 19 | # illustration only. 20 | 21 | set -e 22 | 23 | # Include definitions of dataset and tuning related variables. 24 | source "$(dirname $0)/../../lib/setup.sh" "$@" 25 | source_lib "config/common.sh" 26 | source_lib "config/tuning.sh" 27 | source_lib "config/ptb_char.sh" 28 | 29 | # Model 30 | 31 | num_param_millions=24 32 | num_params=$(million ${num_param_millions}) 33 | share_input_and_output_embeddings=false 34 | shared_mask_dropout=false 35 | 36 | # Cell 37 | 38 | model="lstm" 39 | num_layers=2 40 | lstm_skip_connection=true 41 | tie_forget_and_input_gates=false 42 | cap_input_gate=true 43 | 44 | # Objective 45 | 46 | drop_state_probability=0.01 47 | 48 | # Initialization 49 | 50 | forget_bias=1.0 51 | 52 | # Schedule 53 | 54 | steps_per_turn=200 55 | print_training_stats_every_num_steps=200 56 | turns=500 57 | 58 | # Optimizer 59 | 60 | optimizer_type="rmsprop" 61 | batch_size=64 62 | max_grad_norm=10.0 63 | max_time_steps=150 64 | 65 | # Early stopping 66 | 67 | # early_stopping_turns=30 68 | # early_stopping_worst_xe_target=4.4 69 | 70 | # Evaluation 71 | 72 | max_training_eval_batches=20 73 | eval_softmax_temperature=-0.8 74 | 75 | # Tuning parameters 76 | 77 | priority=200 78 | num_workers=60 79 | 80 | # Misc 81 | 82 | swap_memory=true 83 | 84 | # Start experiments with averaged optimization 85 | 86 | drop_learning_rate_turns=-1 87 | drop_learning_rate_multiplier=1.0 88 | drop_learning_rate_at_the_latest=-1 89 | trigger_averaging_turns=25 90 | trigger_averaging_at_the_latest=400 91 | 92 | # feature mask 93 | tuneables="input_embedding_ratio,learning_rate,l2_penalty, 94 | input_dropout,inter_layer_dropout,state_dropout, 95 | output_dropout, 96 | feature_mask_rounds,feature_mask_rank" 97 | name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_arms" 98 | source_lib "run.sh" "$@" 99 | 100 | # vanilla 101 | tuneables="input_embedding_ratio,learning_rate,l2_penalty, 102 | input_dropout,inter_layer_dropout,state_dropout, 103 | output_dropout" 104 | name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_arms" 105 | source_lib "run.sh" "$@" 106 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/tune_ptb_fast.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is for 19 | # illustration only.
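# If the tuning backend were available, a launch would follow the same
# pattern as the training scripts, e.g.
#
#   ./tune_ptb_fast.sh run
#
# with the tuner searching over the hyperparameters named in ${tuneables}
# below. (The "run" argument convention is inferred from the train_*.sh
# scripts and is an assumption here.)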
20 | 21 | set -e 22 | 23 | # Include definitions of dataset and tuning related variables. 24 | source "$(dirname $0)/../../lib/setup.sh" "$@" 25 | source_lib "config/common.sh" 26 | source_lib "config/tuning.sh" 27 | source_lib "config/ptb_word.sh" 28 | 29 | # Model 30 | 31 | num_param_millions=24 32 | num_params=$(million ${num_param_millions}) 33 | share_input_and_output_embeddings=true 34 | shared_mask_dropout=false 35 | 36 | # Cell 37 | 38 | model="lstm" 39 | num_layers=2 40 | lstm_skip_connection=true 41 | tie_forget_and_input_gates=false 42 | cap_input_gate=true 43 | 44 | # Objective 45 | 46 | drop_state_probability=0.01 47 | 48 | # Initialization 49 | 50 | forget_bias=1.0 51 | 52 | # Schedule 53 | 54 | steps_per_turn=100 55 | print_training_stats_every_num_steps=100 56 | turns=600 57 | 58 | # Optimizer 59 | 60 | optimizer_type="rmsprop" 61 | batch_size=64 62 | max_grad_norm=10.0 63 | max_time_steps=35 64 | 65 | # Early stopping 66 | 67 | # early_stopping_turns=30 68 | # early_stopping_worst_xe_target=4.4 69 | 70 | # Evaluation 71 | 72 | max_training_eval_batches=20 73 | eval_softmax_temperature=-0.8 74 | 75 | # Tuning parameters 76 | 77 | priority=200 78 | num_workers=60 79 | 80 | # Start experiments with dropped learning rate 81 | 82 | # drop_learning_rate_turns=100 83 | # drop_learning_rate_multiplier=0.1 84 | # drop_learning_rate_at_the_latest=1600 85 | # 86 | # # feature mask 87 | # tuneables="input_embedding_ratio,learning_rate,l2_penalty, 88 | # input_dropout,inter_layer_dropout,state_dropout, 89 | # output_dropout, 90 | # feature_mask_rounds,feature_mask_rank" 91 | # name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_rms" 92 | # source_lib "run.sh" "$@" 93 | # 94 | # # vanilla 95 | # tuneables="input_embedding_ratio,learning_rate,l2_penalty, 96 | # input_dropout,inter_layer_dropout,state_dropout, 97 | # output_dropout" 98 | # name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_rms" 99 | # source_lib "run.sh" "$@" 100 | 101 | # Start experiments with averaged optimization 102 | 103 | drop_learning_rate_turns=-1 104 | drop_learning_rate_multiplier=1.0 105 | drop_learning_rate_at_the_latest=-1 106 | trigger_averaging_turns=25 107 | trigger_averaging_at_the_latest=400 108 | 109 | # feature mask 110 | tuneables="learning_rate,l2_penalty, 111 | input_dropout,inter_layer_dropout,state_dropout, 112 | output_dropout, 113 | feature_mask_rounds,feature_mask_rank" 114 | name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_arms" 115 | source_lib "run.sh" "$@" 116 | 117 | # vanilla 118 | tuneables="learning_rate,l2_penalty, 119 | input_dropout,inter_layer_dropout,state_dropout, 120 | output_dropout" 121 | name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_arms" 122 | source_lib "run.sh" "$@" 123 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/tune_wikitext-2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is for 19 | # illustration only. 20 | 21 | set -e 22 | 23 | # Include definitions of dataset and tuning related variables. 24 | source "$(dirname $0)/../../lib/setup.sh" "$@" 25 | source_lib "config/common.sh" 26 | source_lib "config/tuning.sh" 27 | source_lib "config/wikitext-2_word.sh" 28 | 29 | # Model 30 | 31 | num_param_millions=35 32 | num_params=$(million ${num_param_millions}) 33 | share_input_and_output_embeddings=true 34 | shared_mask_dropout=false 35 | 36 | # Cell 37 | 38 | model="lstm" 39 | num_layers=2 40 | lstm_skip_connection=true 41 | tie_forget_and_input_gates=false 42 | cap_input_gate=true 43 | 44 | # Objective 45 | 46 | drop_state_probability=0.01 47 | 48 | # Initialization 49 | 50 | forget_bias=1.0 51 | 52 | # Schedule 53 | 54 | steps_per_turn=200 55 | print_training_stats_every_num_steps=200 56 | turns=1000 57 | 58 | # Optimizer 59 | 60 | optimizer_type="rmsprop" 61 | batch_size=64 62 | max_grad_norm=10.0 63 | max_time_steps=70 64 | 65 | # Early stopping 66 | 67 | # early_stopping_turns=30 68 | # early_stopping_worst_xe_target=4.4 69 | 70 | # Evaluation 71 | 72 | max_training_eval_batches=20 73 | eval_softmax_temperature=-0.8 74 | 75 | # Tuning parameters 76 | 77 | priority=200 78 | num_workers=60 79 | 80 | # Misc 81 | 82 | swap_memory=true 83 | 84 | # Start experiments with dropped learning rate 85 | 86 | # drop_learning_rate_turns=100 87 | # drop_learning_rate_multiplier=0.1 88 | # drop_learning_rate_at_the_latest=1600 89 | # 90 | # # feature mask 91 | # tuneables="input_embedding_ratio,learning_rate,l2_penalty, 92 | # input_dropout,inter_layer_dropout,state_dropout, 93 | # output_dropout, 94 | # feature_mask_rounds,feature_mask_rank" 95 | # name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_rms" 96 | # source_lib "run.sh" "$@" 97 | # 98 | # # vanilla 99 | # tuneables="input_embedding_ratio,learning_rate,l2_penalty, 100 | # input_dropout,inter_layer_dropout,state_dropout, 101 | # output_dropout" 102 | # name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_rms" 103 | # source_lib "run.sh" "$@" 104 | 105 | # Start experiments with averaged optimization 106 | 107 | drop_learning_rate_turns=-1 108 | drop_learning_rate_multiplier=1.0 109 | drop_learning_rate_at_the_latest=-1 110 | trigger_averaging_turns=50 111 | trigger_averaging_at_the_latest=800 112 | 113 | # feature mask 114 | tuneables="input_embedding_ratio,learning_rate,l2_penalty, 115 | input_dropout,inter_layer_dropout,state_dropout, 116 | output_dropout, 117 | feature_mask_rounds,feature_mask_rank" 118 | name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_arms" 119 | source_lib "run.sh" "$@" 120 | 121 | # vanilla 122 | tuneables="input_embedding_ratio,learning_rate,l2_penalty, 123 | input_dropout,inter_layer_dropout,state_dropout, 124 | output_dropout" 125 | name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_arms" 126 | source_lib "run.sh" "$@" 127 | -------------------------------------------------------------------------------- /lamb/experiment/on-the-state/README.md: -------------------------------------------------------------------------------- 1 | This directory contains saved configuration files for tuned models from the [On 2 | the state of the art of evaluation in neural language 3 | models](https://arxiv.org/abs/1707.05589) paper. Model weights are not included. 4 | 5 | Don't forget to [set up the data](../../README.md). 6 | 7 | To train the 1-layer LSTM model with 10M weights on PTB using the tuned 8 | hyperparameters (see the paper above): 9 | 10 | ./train_ptb.sh run ptb_10m_lstm_d1/hps_proto 11 | 12 | There are separate training scripts for enwik8 and wikitext-2. The training will 13 | save the model in `/tmp/lamb/ptb_10m_lstm_d1/`. To test the saved model: 14 | 15 | ../test.sh run some-descriptive-name /tmp/lamb/ptb_10m_lstm_d1/ 16 | -------------------------------------------------------------------------------- /lamb/experiment/on-the-state/enwik8_27m_lstm_d4/hps_proto: -------------------------------------------------------------------------------- 1 | hparam { 2 | key: "activation_fn" 3 | value { 4 | bytes_value: "tf.tanh" 5 | } 6 | } 7 | hparam { 8 | key: "adam_beta1" 9 | value { 10 | float_value: 0.899999976158 11 | } 12 | } 13 | hparam { 14 | key: "adam_beta2" 15 | value { 16 | float_value: 0.999000012875 17 | } 18 | } 19 | hparam { 20 | key: "adam_epsilon" 21 | value { 22 | float_value: 9.99999993923e-09 23 | } 24 | } 25 | hparam { 26 | key: "batch_size" 27 | value { 28 | int64_value: 128 29 | } 30 | } 31 | hparam { 32 | key: "cell_clip" 33 | value { 34 | float_value: -1.0 35 | } 36 | } 37 | hparam { 38 | key: "cell_init_factor" 39 | value { 40 | float_value: 1.0 41 | } 42 | } 43 | hparam { 44 | key: "drop_learning_rate_at_the_latest" 45 | value { 46 | int64_value: 450 47 | } 48 | } 49 | hparam { 50 | key: "drop_learning_rate_multiplier" 51 | value { 52 | float_value: 0.10000000149 53 | } 54 | } 55 | hparam { 56 | key: "drop_learning_rate_rounds" 57 | value { 58 | int64_value: 13 59 | } 60 | } 61 | hparam { 62 | key: "drop_state_probability" 63 | value { 64 | float_value: 0.00999999977648 65 | } 66 | } 67 | hparam { 68 | key: "embed_once" 69 | value { 70 | bool_value: true 71 | } 72 | } 73 | hparam { 74 | key: "embedding_init_factor" 75 | value { 76 | float_value: 1.0 77 | } 78 | } 79 | hparam { 80 | key: "feature_mask" 81 | value { 82 | bool_value: false 83 | } 84 | } 85 | hparam { 86 | key: "forget_bias" 87 | value { 88 | float_value: 1.0 89 | } 90 | } 91 | hparam { 92 | key: "hidden_size" 93 | value { 94 | int64_value: 911 95 | } 96 | } 97 | hparam { 98 | key: "input_dropout" 99 | value { 100 | float_value: 0.196795836091 101 | } 102 | } 103 | hparam { 104 | key: "input_embedding_ratio" 105 | value { 106 | float_value: 1.0 107 | } 108 | } 109 | hparam { 110 | key: "input_embedding_size" 111 | value { 112 | int64_value: 911 113 | } 114 | } 115 | hparam { 116 | key: "intra_layer_dropout" 117 | value { 118 | float_value: 0.0307693872601 119 | } 120 | } 121 | hparam { 122 | key: "layer_norm" 123 | value { 124 | bool_value: false 125 | } 126 | } 127 | hparam { 128 | key: "learning_rate" 129 | value { 130 | float_value: 0.00203050486743 131 | } 132 | } 133 | hparam { 134 | key: "learning_rate_decay" 135 | value { 136 | float_value: 1.0 137 | } 138 | } 139 | hparam { 140 | key: "learning_rate_decay_burn_in_steps" 141 | value { 142 | int64_value: 0 143 | } 144 | } 145 | hparam { 146 | key:
"lstm_skip_connection" 147 | value { 148 | bool_value: true 149 | } 150 | } 151 | hparam { 152 | key: "max_grad_norm" 153 | value { 154 | float_value: 10.0 155 | } 156 | } 157 | hparam { 158 | key: "model" 159 | value { 160 | bytes_value: "lstm" 161 | } 162 | } 163 | hparam { 164 | key: "num_eval_samples" 165 | value { 166 | int64_value: 0 167 | } 168 | } 169 | hparam { 170 | key: "num_layers" 171 | value { 172 | int64_value: 4 173 | } 174 | } 175 | hparam { 176 | key: "num_params" 177 | value { 178 | int64_value: 27000000 179 | } 180 | } 181 | hparam { 182 | key: "optimizer_type" 183 | value { 184 | bytes_value: "rmsprop" 185 | } 186 | } 187 | hparam { 188 | key: "outer_steps" 189 | value { 190 | int64_value: 500 191 | } 192 | } 193 | hparam { 194 | key: "output_dropout" 195 | value { 196 | float_value: 0.0695193335414 197 | } 198 | } 199 | hparam { 200 | key: "output_embedding_ratio" 201 | value { 202 | float_value: 1.0 203 | } 204 | } 205 | hparam { 206 | key: "output_embedding_size" 207 | value { 208 | int64_value: 911 209 | } 210 | } 211 | hparam { 212 | key: "output_init_factor" 213 | value { 214 | float_value: 1.0 215 | } 216 | } 217 | hparam { 218 | key: "overlay_rank" 219 | value { 220 | int64_value: -1 221 | } 222 | } 223 | hparam { 224 | key: "rmsprop_beta2" 225 | value { 226 | float_value: 0.990000009537 227 | } 228 | } 229 | hparam { 230 | key: "rmsprop_epsilon" 231 | value { 232 | float_value: 9.99999974738e-06 233 | } 234 | } 235 | hparam { 236 | key: "share_input_and_output_embeddings" 237 | value { 238 | bool_value: false 239 | } 240 | } 241 | hparam { 242 | key: "sparsity_ratio" 243 | value { 244 | float_value: -1.0 245 | } 246 | } 247 | hparam { 248 | key: "state_dropout" 249 | value { 250 | float_value: 0.0808205232024 251 | } 252 | } 253 | hparam { 254 | key: "tie_forget_and_input_gates" 255 | value { 256 | bool_value: false 257 | } 258 | } 259 | hparam { 260 | key: "token_dropout" 261 | value { 262 | float_value: 0.0 263 | } 264 | } 265 | hparam { 266 | key: "trainable_initial_state" 267 | value { 268 | bool_value: false 269 | } 270 | } 271 | hparam { 272 | key: "update_dropout" 273 | value { 274 | float_value: 0.0 275 | } 276 | } 277 | hparam { 278 | key: "vocab_size" 279 | value { 280 | int64_value: 206 281 | } 282 | } 283 | hparam { 284 | key: "weight_decay" 285 | value { 286 | float_value: 7.50829849494e-06 287 | } 288 | } 289 | hparam { 290 | key: "weight_penalty" 291 | value { 292 | float_value: 0.0 293 | } 294 | } 295 | -------------------------------------------------------------------------------- /lamb/experiment/on-the-state/enwik8_46m_lstm_d4/hps_proto: -------------------------------------------------------------------------------- 1 | hparam { 2 | key: "activation_fn" 3 | value { 4 | bytes_value: "tf.tanh" 5 | } 6 | } 7 | hparam { 8 | key: "adam_beta1" 9 | value { 10 | float_value: 0.899999976158 11 | } 12 | } 13 | hparam { 14 | key: "adam_beta2" 15 | value { 16 | float_value: 0.999000012875 17 | } 18 | } 19 | hparam { 20 | key: "adam_epsilon" 21 | value { 22 | float_value: 9.99999993923e-09 23 | } 24 | } 25 | hparam { 26 | key: "batch_size" 27 | value { 28 | int64_value: 128 29 | } 30 | } 31 | hparam { 32 | key: "cell_clip" 33 | value { 34 | float_value: -1.0 35 | } 36 | } 37 | hparam { 38 | key: "cell_init_factor" 39 | value { 40 | float_value: 1.0 41 | } 42 | } 43 | hparam { 44 | key: "drop_learning_rate_at_the_latest" 45 | value { 46 | int64_value: 450 47 | } 48 | } 49 | hparam { 50 | key: "drop_learning_rate_multiplier" 51 | value { 52 | 
float_value: 0.10000000149 53 | } 54 | } 55 | hparam { 56 | key: "drop_learning_rate_rounds" 57 | value { 58 | int64_value: 13 59 | } 60 | } 61 | hparam { 62 | key: "drop_state_probability" 63 | value { 64 | float_value: 0.00999999977648 65 | } 66 | } 67 | hparam { 68 | key: "embed_once" 69 | value { 70 | bool_value: true 71 | } 72 | } 73 | hparam { 74 | key: "embedding_init_factor" 75 | value { 76 | float_value: 1.0 77 | } 78 | } 79 | hparam { 80 | key: "feature_mask" 81 | value { 82 | bool_value: false 83 | } 84 | } 85 | hparam { 86 | key: "forget_bias" 87 | value { 88 | float_value: 1.0 89 | } 90 | } 91 | hparam { 92 | key: "hidden_size" 93 | value { 94 | int64_value: 1192 95 | } 96 | } 97 | hparam { 98 | key: "input_dropout" 99 | value { 100 | float_value: 0.0335461571813 101 | } 102 | } 103 | hparam { 104 | key: "input_embedding_ratio" 105 | value { 106 | float_value: 1.0 107 | } 108 | } 109 | hparam { 110 | key: "input_embedding_size" 111 | value { 112 | int64_value: 1192 113 | } 114 | } 115 | hparam { 116 | key: "intra_layer_dropout" 117 | value { 118 | float_value: 0.0122289275751 119 | } 120 | } 121 | hparam { 122 | key: "layer_norm" 123 | value { 124 | bool_value: false 125 | } 126 | } 127 | hparam { 128 | key: "learning_rate" 129 | value { 130 | float_value: 0.00218322896399 131 | } 132 | } 133 | hparam { 134 | key: "learning_rate_decay" 135 | value { 136 | float_value: 1.0 137 | } 138 | } 139 | hparam { 140 | key: "learning_rate_decay_burn_in_steps" 141 | value { 142 | int64_value: 0 143 | } 144 | } 145 | hparam { 146 | key: "lstm_skip_connection" 147 | value { 148 | bool_value: true 149 | } 150 | } 151 | hparam { 152 | key: "max_grad_norm" 153 | value { 154 | float_value: 10.0 155 | } 156 | } 157 | hparam { 158 | key: "model" 159 | value { 160 | bytes_value: "lstm" 161 | } 162 | } 163 | hparam { 164 | key: "num_eval_samples" 165 | value { 166 | int64_value: 0 167 | } 168 | } 169 | hparam { 170 | key: "num_layers" 171 | value { 172 | int64_value: 4 173 | } 174 | } 175 | hparam { 176 | key: "num_params" 177 | value { 178 | int64_value: 46000000 179 | } 180 | } 181 | hparam { 182 | key: "optimizer_type" 183 | value { 184 | bytes_value: "rmsprop" 185 | } 186 | } 187 | hparam { 188 | key: "outer_steps" 189 | value { 190 | int64_value: 500 191 | } 192 | } 193 | hparam { 194 | key: "output_dropout" 195 | value { 196 | float_value: 0.279572278261 197 | } 198 | } 199 | hparam { 200 | key: "output_embedding_ratio" 201 | value { 202 | float_value: 1.0 203 | } 204 | } 205 | hparam { 206 | key: "output_embedding_size" 207 | value { 208 | int64_value: 1192 209 | } 210 | } 211 | hparam { 212 | key: "output_init_factor" 213 | value { 214 | float_value: 1.0 215 | } 216 | } 217 | hparam { 218 | key: "overlay_rank" 219 | value { 220 | int64_value: -1 221 | } 222 | } 223 | hparam { 224 | key: "rmsprop_beta2" 225 | value { 226 | float_value: 0.990000009537 227 | } 228 | } 229 | hparam { 230 | key: "rmsprop_epsilon" 231 | value { 232 | float_value: 9.99999974738e-06 233 | } 234 | } 235 | hparam { 236 | key: "share_input_and_output_embeddings" 237 | value { 238 | bool_value: false 239 | } 240 | } 241 | hparam { 242 | key: "sparsity_ratio" 243 | value { 244 | float_value: -1.0 245 | } 246 | } 247 | hparam { 248 | key: "state_dropout" 249 | value { 250 | float_value: 0.0622955262661 251 | } 252 | } 253 | hparam { 254 | key: "tie_forget_and_input_gates" 255 | value { 256 | bool_value: false 257 | } 258 | } 259 | hparam { 260 | key: "token_dropout" 261 | value { 262 | float_value: 0.0 263 | } 264 | } 
265 | hparam { 266 | key: "trainable_initial_state" 267 | value { 268 | bool_value: false 269 | } 270 | } 271 | hparam { 272 | key: "update_dropout" 273 | value { 274 | float_value: 0.0 275 | } 276 | } 277 | hparam { 278 | key: "vocab_size" 279 | value { 280 | int64_value: 206 281 | } 282 | } 283 | hparam { 284 | key: "weight_decay" 285 | value { 286 | float_value: 0.0 287 | } 288 | } 289 | hparam { 290 | key: "weight_penalty" 291 | value { 292 | float_value: 0.0 293 | } 294 | } 295 | -------------------------------------------------------------------------------- /lamb/experiment/on-the-state/ptb_10m_lstm_d1/hps_proto: -------------------------------------------------------------------------------- 1 | hparam { 2 | key: "activation_fn" 3 | value { 4 | bytes_value: "tf.tanh" 5 | } 6 | } 7 | hparam { 8 | key: "adam_beta1" 9 | value { 10 | float_value: 0.899999976158 11 | } 12 | } 13 | hparam { 14 | key: "adam_beta2" 15 | value { 16 | float_value: 0.999000012875 17 | } 18 | } 19 | hparam { 20 | key: "adam_epsilon" 21 | value { 22 | float_value: 9.99999993923e-09 23 | } 24 | } 25 | hparam { 26 | key: "batch_size" 27 | value { 28 | int64_value: 64 29 | } 30 | } 31 | hparam { 32 | key: "cell_clip" 33 | value { 34 | float_value: -1.0 35 | } 36 | } 37 | hparam { 38 | key: "cell_init_factor" 39 | value { 40 | float_value: 1.0 41 | } 42 | } 43 | hparam { 44 | key: "drop_learning_rate_at_the_latest" 45 | value { 46 | int64_value: 900 47 | } 48 | } 49 | hparam { 50 | key: "drop_learning_rate_multiplier" 51 | value { 52 | float_value: 0.10000000149 53 | } 54 | } 55 | hparam { 56 | key: "drop_learning_rate_rounds" 57 | value { 58 | int64_value: 26 59 | } 60 | } 61 | hparam { 62 | key: "drop_state_probability" 63 | value { 64 | float_value: 0.00999999977648 65 | } 66 | } 67 | hparam { 68 | key: "embed_once" 69 | value { 70 | bool_value: true 71 | } 72 | } 73 | hparam { 74 | key: "embedding_init_factor" 75 | value { 76 | float_value: 1.0 77 | } 78 | } 79 | hparam { 80 | key: "feature_mask" 81 | value { 82 | bool_value: false 83 | } 84 | } 85 | hparam { 86 | key: "forget_bias" 87 | value { 88 | float_value: 1.0 89 | } 90 | } 91 | hparam { 92 | key: "hidden_size" 93 | value { 94 | int64_value: 1194 95 | } 96 | } 97 | hparam { 98 | key: "input_dropout" 99 | value { 100 | float_value: 0.579891622066 101 | } 102 | } 103 | hparam { 104 | key: "input_embedding_ratio" 105 | value { 106 | float_value: 0.224374398589 107 | } 108 | } 109 | hparam { 110 | key: "input_embedding_size" 111 | value { 112 | int64_value: 268 113 | } 114 | } 115 | hparam { 116 | key: "intra_layer_dropout" 117 | value { 118 | float_value: 0.873659133911 119 | } 120 | } 121 | hparam { 122 | key: "layer_norm" 123 | value { 124 | bool_value: false 125 | } 126 | } 127 | hparam { 128 | key: "learning_rate" 129 | value { 130 | float_value: 0.00417865626514 131 | } 132 | } 133 | hparam { 134 | key: "learning_rate_decay" 135 | value { 136 | float_value: 1.0 137 | } 138 | } 139 | hparam { 140 | key: "learning_rate_decay_burn_in_steps" 141 | value { 142 | int64_value: 0 143 | } 144 | } 145 | hparam { 146 | key: "lstm_skip_connection" 147 | value { 148 | bool_value: true 149 | } 150 | } 151 | hparam { 152 | key: "max_grad_norm" 153 | value { 154 | float_value: 10.0 155 | } 156 | } 157 | hparam { 158 | key: "model" 159 | value { 160 | bytes_value: "lstm" 161 | } 162 | } 163 | hparam { 164 | key: "num_eval_samples" 165 | value { 166 | int64_value: 0 167 | } 168 | } 169 | hparam { 170 | key: "num_layers" 171 | value { 172 | int64_value: 1 173 | } 174 | } 
175 | hparam { 176 | key: "num_params" 177 | value { 178 | int64_value: 10000000 179 | } 180 | } 181 | hparam { 182 | key: "optimizer_type" 183 | value { 184 | bytes_value: "rmsprop" 185 | } 186 | } 187 | hparam { 188 | key: "outer_steps" 189 | value { 190 | int64_value: 1000 191 | } 192 | } 193 | hparam { 194 | key: "output_dropout" 195 | value { 196 | float_value: 0.327008873224 197 | } 198 | } 199 | hparam { 200 | key: "output_embedding_ratio" 201 | value { 202 | float_value: 0.224374398589 203 | } 204 | } 205 | hparam { 206 | key: "output_embedding_size" 207 | value { 208 | int64_value: 268 209 | } 210 | } 211 | hparam { 212 | key: "output_init_factor" 213 | value { 214 | float_value: 1.0 215 | } 216 | } 217 | hparam { 218 | key: "overlay_rank" 219 | value { 220 | int64_value: -1 221 | } 222 | } 223 | hparam { 224 | key: "rmsprop_beta2" 225 | value { 226 | float_value: 0.999000012875 227 | } 228 | } 229 | hparam { 230 | key: "rmsprop_epsilon" 231 | value { 232 | float_value: 9.99999993923e-09 233 | } 234 | } 235 | hparam { 236 | key: "share_input_and_output_embeddings" 237 | value { 238 | bool_value: true 239 | } 240 | } 241 | hparam { 242 | key: "sparsity_ratio" 243 | value { 244 | float_value: -1.0 245 | } 246 | } 247 | hparam { 248 | key: "state_dropout" 249 | value { 250 | float_value: 0.215256482363 251 | } 252 | } 253 | hparam { 254 | key: "tie_forget_and_input_gates" 255 | value { 256 | bool_value: false 257 | } 258 | } 259 | hparam { 260 | key: "token_dropout" 261 | value { 262 | float_value: 0.0 263 | } 264 | } 265 | hparam { 266 | key: "trainable_initial_state" 267 | value { 268 | bool_value: false 269 | } 270 | } 271 | hparam { 272 | key: "update_dropout" 273 | value { 274 | float_value: 0.0 275 | } 276 | } 277 | hparam { 278 | key: "vocab_size" 279 | value { 280 | int64_value: 10001 281 | } 282 | } 283 | hparam { 284 | key: "weight_decay" 285 | value { 286 | float_value: 0.000124350262922 287 | } 288 | } 289 | hparam { 290 | key: "weight_penalty" 291 | value { 292 | float_value: 0.0 293 | } 294 | } 295 | -------------------------------------------------------------------------------- /lamb/experiment/on-the-state/ptb_24m_lstm_d4/hps_proto: -------------------------------------------------------------------------------- 1 | hparam { 2 | key: "activation_fn" 3 | value { 4 | bytes_value: "tf.tanh" 5 | } 6 | } 7 | hparam { 8 | key: "adam_beta1" 9 | value { 10 | float_value: 0.899999976158 11 | } 12 | } 13 | hparam { 14 | key: "adam_beta2" 15 | value { 16 | float_value: 0.999000012875 17 | } 18 | } 19 | hparam { 20 | key: "adam_epsilon" 21 | value { 22 | float_value: 9.99999993923e-09 23 | } 24 | } 25 | hparam { 26 | key: "batch_size" 27 | value { 28 | int64_value: 64 29 | } 30 | } 31 | hparam { 32 | key: "cell_clip" 33 | value { 34 | float_value: -1.0 35 | } 36 | } 37 | hparam { 38 | key: "cell_init_factor" 39 | value { 40 | float_value: 1.0 41 | } 42 | } 43 | hparam { 44 | key: "drop_learning_rate_at_the_latest" 45 | value { 46 | int64_value: 900 47 | } 48 | } 49 | hparam { 50 | key: "drop_learning_rate_multiplier" 51 | value { 52 | float_value: 0.10000000149 53 | } 54 | } 55 | hparam { 56 | key: "drop_learning_rate_rounds" 57 | value { 58 | int64_value: 26 59 | } 60 | } 61 | hparam { 62 | key: "drop_state_probability" 63 | value { 64 | float_value: 0.00999999977648 65 | } 66 | } 67 | hparam { 68 | key: "embed_once" 69 | value { 70 | bool_value: true 71 | } 72 | } 73 | hparam { 74 | key: "embedding_init_factor" 75 | value { 76 | float_value: 1.0 77 | } 78 | } 79 | hparam { 80 
| key: "feature_mask" 81 | value { 82 | bool_value: false 83 | } 84 | } 85 | hparam { 86 | key: "forget_bias" 87 | value { 88 | float_value: 1.0 89 | } 90 | } 91 | hparam { 92 | key: "hidden_size" 93 | value { 94 | int64_value: 723 95 | } 96 | } 97 | hparam { 98 | key: "input_dropout" 99 | value { 100 | float_value: 0.633642196655 101 | } 102 | } 103 | hparam { 104 | key: "input_embedding_ratio" 105 | value { 106 | float_value: 1.0 107 | } 108 | } 109 | hparam { 110 | key: "input_embedding_size" 111 | value { 112 | int64_value: 723 113 | } 114 | } 115 | hparam { 116 | key: "intra_layer_dropout" 117 | value { 118 | float_value: 0.309127420187 119 | } 120 | } 121 | hparam { 122 | key: "layer_norm" 123 | value { 124 | bool_value: false 125 | } 126 | } 127 | hparam { 128 | key: "learning_rate" 129 | value { 130 | float_value: 0.00396024715155 131 | } 132 | } 133 | hparam { 134 | key: "learning_rate_decay" 135 | value { 136 | float_value: 1.0 137 | } 138 | } 139 | hparam { 140 | key: "learning_rate_decay_burn_in_steps" 141 | value { 142 | int64_value: 0 143 | } 144 | } 145 | hparam { 146 | key: "lstm_skip_connection" 147 | value { 148 | bool_value: true 149 | } 150 | } 151 | hparam { 152 | key: "max_grad_norm" 153 | value { 154 | float_value: 10.0 155 | } 156 | } 157 | hparam { 158 | key: "model" 159 | value { 160 | bytes_value: "lstm" 161 | } 162 | } 163 | hparam { 164 | key: "num_eval_samples" 165 | value { 166 | int64_value: 0 167 | } 168 | } 169 | hparam { 170 | key: "num_layers" 171 | value { 172 | int64_value: 4 173 | } 174 | } 175 | hparam { 176 | key: "num_params" 177 | value { 178 | int64_value: 24000000 179 | } 180 | } 181 | hparam { 182 | key: "optimizer_type" 183 | value { 184 | bytes_value: "rmsprop" 185 | } 186 | } 187 | hparam { 188 | key: "outer_steps" 189 | value { 190 | int64_value: 1000 191 | } 192 | } 193 | hparam { 194 | key: "output_dropout" 195 | value { 196 | float_value: 0.700856506824 197 | } 198 | } 199 | hparam { 200 | key: "output_embedding_ratio" 201 | value { 202 | float_value: 1.0 203 | } 204 | } 205 | hparam { 206 | key: "output_embedding_size" 207 | value { 208 | int64_value: 723 209 | } 210 | } 211 | hparam { 212 | key: "output_init_factor" 213 | value { 214 | float_value: 1.0 215 | } 216 | } 217 | hparam { 218 | key: "overlay_rank" 219 | value { 220 | int64_value: -1 221 | } 222 | } 223 | hparam { 224 | key: "rmsprop_beta2" 225 | value { 226 | float_value: 0.999000012875 227 | } 228 | } 229 | hparam { 230 | key: "rmsprop_epsilon" 231 | value { 232 | float_value: 9.99999993923e-09 233 | } 234 | } 235 | hparam { 236 | key: "share_input_and_output_embeddings" 237 | value { 238 | bool_value: true 239 | } 240 | } 241 | hparam { 242 | key: "sparsity_ratio" 243 | value { 244 | float_value: -1.0 245 | } 246 | } 247 | hparam { 248 | key: "state_dropout" 249 | value { 250 | float_value: 0.64275187254 251 | } 252 | } 253 | hparam { 254 | key: "tie_forget_and_input_gates" 255 | value { 256 | bool_value: false 257 | } 258 | } 259 | hparam { 260 | key: "token_dropout" 261 | value { 262 | float_value: 0.0 263 | } 264 | } 265 | hparam { 266 | key: "trainable_initial_state" 267 | value { 268 | bool_value: false 269 | } 270 | } 271 | hparam { 272 | key: "update_dropout" 273 | value { 274 | float_value: 0.0 275 | } 276 | } 277 | hparam { 278 | key: "vocab_size" 279 | value { 280 | int64_value: 10001 281 | } 282 | } 283 | hparam { 284 | key: "weight_decay" 285 | value { 286 | float_value: 7.44869103073e-05 287 | } 288 | } 289 | hparam { 290 | key: "weight_penalty" 291 | value { 
292 | float_value: 0.0 293 | } 294 | } 295 | -------------------------------------------------------------------------------- /lamb/experiment/on-the-state/train_enwik8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | source_lib "config/enwik8_char.sh" 24 | 25 | hps_proto_file="$2" 26 | name="$(basename "$(dirname "$2")")" 27 | 28 | source_lib "run.sh" "$1" 29 | -------------------------------------------------------------------------------- /lamb/experiment/on-the-state/train_ptb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | source_lib "config/ptb_word_rmsprop.sh" 24 | 25 | hps_proto_file="$2" 26 | name="$(basename "$(dirname "$2")")" 27 | 28 | source_lib "run.sh" "$1" 29 | -------------------------------------------------------------------------------- /lamb/experiment/on-the-state/train_wikitext-2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | source_lib "config/wikitext-2_word.sh" 24 | 25 | hps_proto_file="$2" 26 | name="$(basename "$(dirname "$2")")" 27 | 28 | source_lib "run.sh" "$1" 29 | -------------------------------------------------------------------------------- /lamb/experiment/on-the-state/wikitext-2_24m_lstm_d2/hps_proto: -------------------------------------------------------------------------------- 1 | hparam { 2 | key: "activation_fn" 3 | value { 4 | bytes_value: "tf.tanh" 5 | } 6 | } 7 | hparam { 8 | key: "adam_beta1" 9 | value { 10 | float_value: 0.899999976158 11 | } 12 | } 13 | hparam { 14 | key: "adam_beta2" 15 | value { 16 | float_value: 0.999000012875 17 | } 18 | } 19 | hparam { 20 | key: "adam_epsilon" 21 | value { 22 | float_value: 9.99999993923e-09 23 | } 24 | } 25 | hparam { 26 | key: "batch_size" 27 | value { 28 | int64_value: 64 29 | } 30 | } 31 | hparam { 32 | key: "cap_input_gate" 33 | value { 34 | bool_value: true 35 | } 36 | } 37 | hparam { 38 | key: "cell_clip" 39 | value { 40 | float_value: -1.0 41 | } 42 | } 43 | hparam { 44 | key: "cell_init_factor" 45 | value { 46 | float_value: 1.0 47 | } 48 | } 49 | hparam { 50 | key: "drop_learning_rate_at_the_latest" 51 | value { 52 | int64_value: 900 53 | } 54 | } 55 | hparam { 56 | key: "drop_learning_rate_multiplier" 57 | value { 58 | float_value: 0.10000000149 59 | } 60 | } 61 | hparam { 62 | key: "drop_learning_rate_rounds" 63 | value { 64 | int64_value: 26 65 | } 66 | } 67 | hparam { 68 | key: "drop_state_probability" 69 | value { 70 | float_value: 0.00999999977648 71 | } 72 | } 73 | hparam { 74 | key: "embed_once" 75 | value { 76 | bool_value: true 77 | } 78 | } 79 | hparam { 80 | key: "embedding_init_factor" 81 | value { 82 | float_value: 1.0 83 | } 84 | } 85 | hparam { 86 | key: "feature_mask" 87 | value { 88 | bool_value: false 89 | } 90 | } 91 | hparam { 92 | key: "forget_bias" 93 | value { 94 | float_value: 1.0 95 | } 96 | } 97 | hparam { 98 | key: "hidden_size" 99 | value { 100 | int64_value: 1227 101 | } 102 | } 103 | hparam { 104 | key: "input_dropout" 105 | value { 106 | float_value: 0.484243571758 107 | } 108 | } 109 | hparam { 110 | key: "input_embedding_ratio" 111 | value { 112 | float_value: 0.121501773596 113 | } 114 | } 115 | hparam { 116 | key: "input_embedding_size" 117 | value { 118 | int64_value: 149 119 | } 120 | } 121 | hparam { 122 | key: "intra_layer_dropout" 123 | value { 124 | float_value: 0.0920244976878 125 | } 126 | } 127 | hparam { 128 | key: "layer_norm" 129 | value { 130 | bool_value: false 131 | } 132 | } 133 | hparam { 134 | key: "learning_rate" 135 | value { 136 | float_value: 0.00246041407809 137 | } 138 | } 139 | hparam { 140 | key: "learning_rate_decay" 141 | value { 142 | float_value: 1.0 143 | } 144 | } 145 | hparam { 146 | key: "learning_rate_decay_burn_in_steps" 147 | value { 148 | int64_value: 0 149 | } 150 | } 151 | hparam { 152 | key: "lstm_skip_connection" 153 | value { 154 | bool_value: true 155 | } 156 | } 157 | hparam { 158 | key: "max_grad_norm" 159 | value { 160 | float_value: 10.0 161 | } 162 | } 163 | hparam { 164 | key: "model" 165 | value { 166 | bytes_value: "lstm" 167 | } 168 | } 169 | hparam { 170 | key: "num_eval_samples" 171 | value { 172 | int64_value: 0 173 | } 174 | } 175 | hparam { 176 | key: "num_layers" 177 | value { 178 | int64_value: 2 179 | } 180 | } 
181 | hparam { 182 | key: "num_params" 183 | value { 184 | int64_value: 24000000 185 | } 186 | } 187 | hparam { 188 | key: "optimizer_type" 189 | value { 190 | bytes_value: "rmsprop" 191 | } 192 | } 193 | hparam { 194 | key: "outer_steps" 195 | value { 196 | int64_value: 1000 197 | } 198 | } 199 | hparam { 200 | key: "output_dropout" 201 | value { 202 | float_value: 0.391492575407 203 | } 204 | } 205 | hparam { 206 | key: "output_embedding_ratio" 207 | value { 208 | float_value: 0.121501773596 209 | } 210 | } 211 | hparam { 212 | key: "output_embedding_size" 213 | value { 214 | int64_value: 149 215 | } 216 | } 217 | hparam { 218 | key: "output_init_factor" 219 | value { 220 | float_value: 1.0 221 | } 222 | } 223 | hparam { 224 | key: "overlay_rank" 225 | value { 226 | int64_value: -1 227 | } 228 | } 229 | hparam { 230 | key: "rmsprop_beta2" 231 | value { 232 | float_value: 0.999000012875 233 | } 234 | } 235 | hparam { 236 | key: "rmsprop_epsilon" 237 | value { 238 | float_value: 9.99999993923e-09 239 | } 240 | } 241 | hparam { 242 | key: "share_input_and_output_embeddings" 243 | value { 244 | bool_value: true 245 | } 246 | } 247 | hparam { 248 | key: "sparsity_ratio" 249 | value { 250 | float_value: -1.0 251 | } 252 | } 253 | hparam { 254 | key: "state_dropout" 255 | value { 256 | float_value: 0.453888505697 257 | } 258 | } 259 | hparam { 260 | key: "tie_forget_and_input_gates" 261 | value { 262 | bool_value: false 263 | } 264 | } 265 | hparam { 266 | key: "token_dropout" 267 | value { 268 | float_value: 0.0 269 | } 270 | } 271 | hparam { 272 | key: "trainable_initial_state" 273 | value { 274 | bool_value: false 275 | } 276 | } 277 | hparam { 278 | key: "update_dropout" 279 | value { 280 | float_value: 0.0 281 | } 282 | } 283 | hparam { 284 | key: "vocab_size" 285 | value { 286 | int64_value: 33279 287 | } 288 | } 289 | hparam { 290 | key: "weight_decay" 291 | value { 292 | float_value: 3.77565629606e-05 293 | } 294 | } 295 | hparam { 296 | key: "weight_penalty" 297 | value { 298 | float_value: 0.0 299 | } 300 | } 301 | -------------------------------------------------------------------------------- /lamb/experiment/pushing-the-bounds/README.md: -------------------------------------------------------------------------------- 1 | This directory accompanies the [Pushing the bounds of 2 | dropout](https://arxiv.org/abs/1805.09208) paper. 3 | 4 | The paper is mostly about how to make predictions with a model trained with 5 | dropout. Use any saved model, such as those trained in `../on-the-state/`, and 6 | evaluate it with `./test.sh` (in this dir). One difference from `../test.sh` is 7 | that `./test.sh` tunes the optimal evaluation softmax temperature on the 8 | validation set (between 0.8 and 1.0): 9 | 10 | eval_softmax_temperature=-0.8 11 | 12 | Also, in addition to deterministic (or 'standard') dropout, it does MC dropout 13 | (the arithmetic averaged variant) with various `eval_dropout_multiplier`s. See 14 | the linked paper for details. 15 | 16 | So, assuming there is a saved model in `/tmp/lamb/ptb_10m_lstm_d1/`, test it 17 | with: 18 | 19 | ./test.sh run some-descriptive-name /tmp/lamb/ptb_10m_lstm_d1/ 20 | 21 | Thus the model will be evaluated more than once. In the output, the line with 22 | `final test_det_t0.9 xe:` has the test cross-entropy at the optimal softmax 23 | temperature (in this case 0.9). Similarly, `final test_mca_d0.8_t0.9 xe:` 24 | corresponds to the test cross-entropy with `eval_dropout_multiplier=0.8` and 25 | softmax temperature 0.9.
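
For intuition, here is a minimal sketch, not part of this codebase, of what the
arithmetic-averaged MC dropout evaluation with a softmax temperature computes.
The `predict_logits` callable is a hypothetical stand-in for a forward pass
that keeps dropout active at evaluation time:

    import numpy as np

    def softmax(logits, temperature=1.0):
        # Temperatures below 1.0 sharpen the predicted distribution.
        z = logits / temperature
        z = z - z.max(axis=-1, keepdims=True)
        e = np.exp(z)
        return e / e.sum(axis=-1, keepdims=True)

    def mc_arithmetic(predict_logits, inputs, num_samples=200, temperature=0.9):
        # Average the predicted probabilities (not the logits) over
        # independently sampled dropout masks. The reported cross-entropy is
        # then the negative log of these averaged probabilities at the target
        # tokens.
        probs = [softmax(predict_logits(inputs), temperature)
                 for _ in range(num_samples)]
        return np.mean(probs, axis=0)

Here `num_samples=200` mirrors the `num_eval_samples=200` setting in `test.sh`.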
26 | -------------------------------------------------------------------------------- /lamb/experiment/pushing-the-bounds/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | 24 | saved_args="$1" 25 | 26 | save_checkpoints=false 27 | turns=0 28 | min_non_episodic_eval_examples_per_stripe=500000 29 | eval_on_test=true 30 | 31 | test_one() { 32 | local suffix="$1" 33 | local experiment_dir="$2" 34 | local name="$(default_name)_${suffix}" 35 | local config_file="${experiment_dir}/config" 36 | local load_checkpoint="${experiment_dir}/best" 37 | source_lib "run.sh" "${saved_args}" 38 | } 39 | 40 | name="$2" 41 | experiment_dir="$3" 42 | 43 | eval_softmax_temperature=-0.8 44 | 45 | eval_method="deterministic" 46 | test_one "det" "${experiment_dir}" 47 | 48 | eval_method="arithmetic" 49 | num_eval_samples=200 50 | eval_dropout_multiplier=0.8 51 | test_one "amc$eval_dropout_multiplier" "${experiment_dir}" 52 | -------------------------------------------------------------------------------- /lamb/experiment/rerun.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | 24 | cmd="$1" 25 | 26 | run_one() { 27 | local name="$(default_name)_$1" 28 | local config_file="$2/config" 29 | source_lib "run.sh" "${cmd}" 30 | } 31 | 32 | run_one "$2" "$3" 33 | -------------------------------------------------------------------------------- /lamb/experiment/rerun_old.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | 24 | cmd="$1" 25 | 26 | run_one() { 27 | local name="$(default_name)_$1" 28 | local flags_as_dict="$2/args" 29 | local hps_proto_file="$2/config" 30 | source_lib "run.sh" "${cmd}" 31 | } 32 | 33 | run_one "$2" "$3" 34 | -------------------------------------------------------------------------------- /lamb/experiment/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | 24 | cmd="$1" 25 | load_checkpoint="$3/best" 26 | config_file="${4:-$3}/config" 27 | 28 | save_checkpoints=false 29 | turns=0 30 | min_non_episodic_eval_examples_per_stripe=500000 31 | 32 | test_one() { 33 | local name="$1" 34 | source_lib "run.sh" "${cmd}" 35 | } 36 | 37 | cell="lu" 38 | gpu_type="v100" 39 | 40 | eval_on_test=false 41 | 42 | eval_method="deterministic" 43 | test_one "$2_det" 44 | 45 | # MC dropout evaluation can be a bit better, but it's very slow. 46 | eval_method="arithmetic" 47 | num_eval_samples=200 48 | eval_dropout_multiplier=0.6 49 | test_one "$2_amc$eval_dropout_multiplier" 50 | eval_dropout_multiplier=0.7 51 | test_one "$2_amc$eval_dropout_multiplier" 52 | eval_dropout_multiplier=0.8 53 | test_one "$2_amc$eval_dropout_multiplier" 54 | eval_dropout_multiplier=0.9 55 | test_one "$2_amc$eval_dropout_multiplier" 56 | 57 | eval_on_test=true 58 | max_eval_eval_batches=1 59 | 60 | eval_method="deterministic" 61 | test_one "$2_test_det" 62 | 63 | # MC dropout evaluation can be a bit better, but it's very slow. 
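# In the sweeps below, 'arithmetic' averages the predicted probabilities over
# num_eval_samples dropout samples; eval_dropout_multiplier presumably rescales
# the dropout used at evaluation time (see pushing-the-bounds/README.md and the
# linked paper for details).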
64 | eval_method="arithmetic" 65 | num_eval_samples=200 66 | eval_dropout_multiplier=0.6 67 | test_one "$2_test_amc$eval_dropout_multiplier" 68 | eval_dropout_multiplier=0.7 69 | test_one "$2_test_amc$eval_dropout_multiplier" 70 | eval_dropout_multiplier=0.8 71 | test_one "$2_test_amc$eval_dropout_multiplier" 72 | eval_dropout_multiplier=0.9 73 | test_one "$2_test_amc$eval_dropout_multiplier" 74 | -------------------------------------------------------------------------------- /lamb/experiment/train_ptb_10m_lstm_d1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | source_lib "config/ptb_word_slow.sh" 24 | 25 | # Model hyperparameters 26 | 27 | model="lstm" 28 | num_params=$(million 10) 29 | share_input_and_output_embeddings=true 30 | tie_forget_and_input_gates=false 31 | cap_input_gate=true 32 | forget_bias=1.0 33 | num_layers=1 34 | 35 | # Tuned hyperparameters 36 | 37 | learning_rate=0.0048308 38 | l2_penalty=0.00007676 39 | input_dropout=0.51551 40 | inter_layer_dropout= 41 | state_dropout=0.18417 42 | output_dropout=0.33801 43 | input_embedding_ratio=0.22973 44 | 45 | # Evaluation hyperparameters 46 | 47 | eval_softmax_temperature=-0.8 48 | 49 | source_lib "run.sh" "$@" 50 | -------------------------------------------------------------------------------- /lamb/experiment/train_ptb_24m_lstm_d4.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | source_lib "config/ptb_word_slow.sh" 24 | 25 | # Model hyperparameters 26 | 27 | model="lstm" 28 | num_params=$(million 24) 29 | share_input_and_output_embeddings=true 30 | tie_forget_and_input_gates=false 31 | cap_input_gate=true 32 | forget_bias=1.0 33 | num_layers=4 34 | 35 | # Tuned hyperparameters 36 | 37 | learning_rate=0.0033390 38 | l2_penalty=0.000093711 39 | input_dropout=0.68697 40 | inter_layer_dropout=0.31323 41 | state_dropout=0.48479 42 | output_dropout=0.69626 43 | 44 | # Evaluation hyperparameters 45 | 46 | eval_softmax_temperature=-0.8 47 | 48 | source_lib "run.sh" "$@" 49 | -------------------------------------------------------------------------------- /lamb/experiment/tune_ptb_10m.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is for 19 | # illustration only. 20 | 21 | set -e 22 | 23 | # Include definitions of dataset and tuning related variables. 24 | source "$(dirname $0)/../lib/setup.sh" 25 | source_lib "config/common.sh" 26 | source_lib "config/tuning.sh" 27 | source_lib "config/ptb_word_rmsprop.sh" 28 | 29 | # Model hyperparameters 30 | 31 | num_params=$(million 10) 32 | share_input_and_output_embeddings=true 33 | 34 | # Evaluation hyperparameters 35 | 36 | eval_softmax_temperature=-0.8 37 | 38 | # Tuning parameters 39 | 40 | num_workers=60 41 | 42 | # Start a number of tuning studies, setting model specific parameters. 43 | 44 | model="lstm" 45 | tie_forget_and_input_gates=false 46 | forget_bias=1.0 47 | num_layers=1 48 | 49 | tuneables="learning_rate,l2_penalty, 50 | input_dropout,inter_layer_dropout,state_dropout, 51 | output_dropout,input_embedding_ratio" 52 | name="$(default_name)_${model}_d${num_layers}" 53 | source_lib "run.sh" "$@" 54 | -------------------------------------------------------------------------------- /lamb/lib/config/README.md: -------------------------------------------------------------------------------- 1 | Shell scripts here set up variables for datasets and all kinds of arguments to 2 | the main binary. They are intended to be sourced and can source other scripts. 3 | -------------------------------------------------------------------------------- /lamb/lib/config/common.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | logtostderr=true 17 | 18 | default_name() { 19 | if which git > /dev/null 2>&1; then 20 | echo "$(git rev-parse --short HEAD)_$(basename $0 .sh)" 21 | else 22 | echo "$(basename $0 .sh)" 23 | fi 24 | } 25 | 26 | name="$(default_name)" 27 | 28 | million() { 29 | echo $(($1 * 1000 * 1000)) 30 | } 31 | -------------------------------------------------------------------------------- /lamb/lib/config/copy.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | copy_data_dir=${copy_data_dir:-"${HOME}/data/copy/"} 17 | training_file="${copy_data_dir}copy-training.txt" 18 | validation_file="${copy_data_dir}copy-valid.txt" 19 | test_file="${copy_data_dir}copy-test.txt" 20 | word_based=true 21 | episodic=true 22 | conditioning_separator="|" 23 | -------------------------------------------------------------------------------- /lamb/lib/config/enwik8.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================ 15 | 16 | enwik8_data_dir=${enwik8_data_dir:-"${HOME}/data/enwik8/"} 17 | training_file="${enwik8_data_dir}enwik8-training.txt" 18 | validation_file="${enwik8_data_dir}enwik8-valid.txt" 19 | test_file="${enwik8_data_dir}enwik8-test.txt" 20 | file_encoding="CP437" 21 | word_based=false 22 | -------------------------------------------------------------------------------- /lamb/lib/config/enwik8_char.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | source_lib "config/common.sh" 17 | source_lib "config/enwik8.sh" 18 | # While utf-8 is the actual encoding, for character based modelling 19 | # the literature seems to have settled on bytes as evidenced by 20 | # mentions of a vocabulary size of 205 (it is more than 5000 with 21 | # utf-8). 22 | file_encoding="CP437" 23 | word_based=false 24 | episodic=false 25 | max_time_steps=50 26 | # 400*500=200k optimization steps. With batch size 128 and max_time_steps 27 | # 50, for example, that's about 14 epochs. 28 | steps_per_turn=400 29 | turns=500 30 | print_training_stats_every_num_steps=100 31 | early_stopping_turns=15 32 | early_stopping_rampup_turns=30 33 | early_stopping_worst_xe_target=1.05,0.93,0.92 34 | drop_learning_rate_turns=13 35 | drop_learning_rate_multiplier=0.1 36 | drop_learning_rate_at_the_latest=450 37 | drop_state_probability=0.01 38 | max_eval_eval_batches=500 39 | -------------------------------------------------------------------------------- /lamb/lib/config/enwik8_char_rmsprop.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================ 15 | 16 | source_lib "config/enwik8_char.sh" 17 | optimizer_type=rmsprop 18 | rmsprop_beta2=0.99 19 | rmsprop_epsilon=1e-5 20 | batch_size=128 21 | max_grad_norm=10.0 22 | -------------------------------------------------------------------------------- /lamb/lib/config/mwc.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | mwc_data_dir=${mwc_data_dir:-"${HOME}/data/mwc/"} 17 | file_encoding="utf-8" 18 | word_based=false 19 | -------------------------------------------------------------------------------- /lamb/lib/config/ptb.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | ptb_data_dir=${ptb_data_dir:-"${HOME}/data/ptb/"} 17 | training_file="${ptb_data_dir}ptb.train.txt" 18 | validation_file="${ptb_data_dir}ptb.valid.txt" 19 | test_file="${ptb_data_dir}ptb.test.txt" 20 | -------------------------------------------------------------------------------- /lamb/lib/config/ptb_char.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================ 15 | 16 | ptb_data_dir=${ptb_data_dir:-"${HOME}/data/ptb/"} 17 | training_file="${ptb_data_dir}ptb.char.train.txt" 18 | validation_file="${ptb_data_dir}ptb.char.valid.txt" 19 | test_file="${ptb_data_dir}ptb.char.test.txt" 20 | # There are spaces between characters. 21 | word_based=true 22 | -------------------------------------------------------------------------------- /lamb/lib/config/ptb_word.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | source_lib "config/common.sh" 17 | source_lib "config/ptb.sh" 18 | word_based=true 19 | episodic=false 20 | -------------------------------------------------------------------------------- /lamb/lib/config/ptb_word_rmsprop.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | source_lib "config/ptb_word.sh" 17 | 18 | optimizer_type=rmsprop 19 | batch_size=64 20 | max_grad_norm=10.0 21 | max_time_steps=35 22 | 23 | steps_per_turn=100 24 | turns=1000 25 | print_training_stats_every_num_steps=100 26 | 27 | early_stopping_turns=30 28 | early_stopping_rampup_turns=60 29 | early_stopping_worst_xe_target=4.4,4.2 30 | 31 | drop_learning_rate_turns=26 32 | drop_learning_rate_multiplier=0.1 33 | drop_learning_rate_at_the_latest=900 34 | drop_state_probability=0.01 35 | -------------------------------------------------------------------------------- /lamb/lib/config/ptb_word_slow.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | source_lib "config/common.sh" 17 | source_lib "config/ptb_word_rmsprop.sh" 18 | episodic=false 19 | max_time_steps=35 20 | steps_per_turn=100 21 | turns=2500 22 | print_training_stats_every_num_steps=100 23 | early_stopping_turns=100 24 | early_stopping_rampup_turns=200 25 | early_stopping_worst_xe_target=4.4,4.2 26 | drop_learning_rate_turns=90 27 | drop_learning_rate_multiplier=0.1 28 | drop_learning_rate_at_the_latest=2000 29 | drop_state_probability=0.01 30 | -------------------------------------------------------------------------------- /lamb/lib/config/running.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | # Just a placeholder for now. Don't remove it though as it is necessary for the 17 | # source_lib override mechanism. 18 | -------------------------------------------------------------------------------- /lamb/lib/config/tuning.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | # Just a placeholder for now. Don't remove it though as it is necessary for the 17 | # source_lib override mechanism. 18 | -------------------------------------------------------------------------------- /lamb/lib/config/wikitext-103.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | wikitext_103_data_dir=${wikitext_103_data_dir:-"${HOME}/data/wikitext-103/"} 17 | training_file="${wikitext_103_data_dir}wiki.train.tokens" 18 | validation_file="${wikitext_103_data_dir}wiki.valid.tokens" 19 | test_file="${wikitext_103_data_dir}wiki.test.tokens" 20 | -------------------------------------------------------------------------------- /lamb/lib/config/wikitext-103_word.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | source_lib "config/common.sh" 17 | source_lib "config/wikitext-103.sh" 18 | word_based=true 19 | episodic=false 20 | max_time_steps=35 21 | steps_per_turn=1000 22 | turns=1000 23 | print_training_stats_every_num_steps=1000 24 | early_stopping_turns=30 25 | early_stopping_rampup_turns=60 26 | early_stopping_worst_xe_target=3.5,3.3 27 | drop_learning_rate_turns=26 28 | drop_learning_rate_multiplier=0.1 29 | drop_learning_rate_at_the_latest=900 30 | drop_state_probability=0.01 31 | -------------------------------------------------------------------------------- /lamb/lib/config/wikitext-103_word_rmsprop.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================ 15 | 16 | source_lib "config/wikitext-103_word.sh" 17 | optimizer_type=rmsprop 18 | batch_size=64 19 | max_grad_norm=10.0 20 | -------------------------------------------------------------------------------- /lamb/lib/config/wikitext-2.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | wikitext_2_data_dir=${wikitext_2_data_dir:-"${HOME}/data/wikitext-2/"} 17 | training_file="${wikitext_2_data_dir}wiki.train.tokens" 18 | validation_file="${wikitext_2_data_dir}wiki.valid.tokens" 19 | test_file="${wikitext_2_data_dir}wiki.test.tokens" 20 | -------------------------------------------------------------------------------- /lamb/lib/config/wikitext-2_word.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | source_lib "config/common.sh" 17 | source_lib "config/wikitext-2.sh" 18 | word_based=true 19 | episodic=false 20 | # max_time_steps=35 21 | # steps_per_turn=200 22 | # turns=1000 23 | # print_training_stats_every_num_steps=100 24 | # early_stopping_turns=30 25 | # early_stopping_rampup_turns=60 26 | # early_stopping_worst_xe_target=4.9,4.5 27 | # drop_learning_rate_turns=26 28 | # drop_learning_rate_multiplier=0.1 29 | # drop_learning_rate_at_the_latest=900 30 | # drop_state_probability=0.01 31 | -------------------------------------------------------------------------------- /lamb/lib/config/wikitext-2_word_rmsprop.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 | 
16 | source_lib "config/wikitext-2_word.sh"
17 | optimizer_type=rmsprop
18 | batch_size=64
19 | max_grad_norm=10.0
20 | 
-------------------------------------------------------------------------------- /lamb/lib/describe_version.sh: --------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 | 
16 | # We want to know what code was run for an experiment. This prints the git
17 | # version, status and the non-committed diffs, if any.
18 | 
19 | echo "$(date): Invoking LAMB."
20 | if (which git && git rev-parse --is-inside-work-tree) > /dev/null 2>&1; then
21 |   echo "git version: $(git rev-parse --short HEAD)"
22 |   git --no-pager status
23 |   git --no-pager diff
24 |   git --no-pager diff --cached
25 | fi
26 | 
-------------------------------------------------------------------------------- /lamb/lib/run.sh: --------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 | 
16 | # This script runs LAMB.
17 | #
18 | # Usage
19 | # -----
20 | #
21 | # See experiment/*.sh for examples.
22 | #
23 | # Assign values to shell variables named after the hyperparameters and command
24 | # line flags, then source this script. The single, optional command line
25 | # argument (passed when sourcing this script) is the command, which must be
26 | # "run" in the open source version.
27 | #
28 | # setup.sh is assumed to have been sourced.
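#
# As an illustration only (the experiment name and learning rate below are
# made-up values; the paths and config files are real ones from this repo,
# following the pattern of the test scripts), a sourcing script might look
# like:
#
#   source "$(dirname $0)/../lib/setup.sh"
#   source_lib "config/common.sh"
#   source_lib "config/ptb_word_rmsprop.sh"
#   name=my_ptb_run
#   learning_rate=0.002
#   source_lib "run.sh" "$@"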
29 | # 30 | # How it works 31 | # ------------ 32 | # 33 | # The configuration options (see ../README.md) are gathered from shell variables 34 | # and passed as command line arguments to the binary. 35 | 36 | cmd="${1:-run}" 37 | 38 | source_lib "run_helper.sh" 39 | 40 | _project_dir=${project_dir:-"."} 41 | _experiment_dir="${experiment_dir:-"${_project_dir}/${name}"}" 42 | # If ensure_new_experiment, add a random suffix that makes experiment_dir 43 | # unique. 44 | if [ "${ensure_new_experiment}" != "false" ]; then 45 | _suffix="$(head /dev/urandom | tr -dc A-Za-z0-9 | head -c5)" 46 | _experiment_dir="${experiment_dir:-"${_project_dir}/${name}"}_${_suffix}" 47 | while test -d "${_experiment_dir}"; do 48 | _suffix="$(head /dev/urandom | tr -dc A-Za-z0-9 | head -c5)" 49 | _experiment_dir="${experiment_dir:-"${_project_dir}/${name}"}_${_suffix}" 50 | done 51 | fi 52 | 53 | mkdir -p "${_experiment_dir}" 54 | 55 | { 56 | source_lib "describe_version.sh" 57 | } > >(tee -a "${_experiment_dir}/lamb_version") 58 | 59 | { 60 | if [ "${cmd}" = "run" ]; then 61 | eval $(echo "python" "${base}/main.py" "$(gather_args)") 62 | elif [ "${cmd}" = "run_par" ]; then 63 | eval $(echo "${base}/lamb.par" "$(gather_args)") 64 | else 65 | echo "Unsupported command ${cmd}." 66 | exit 1 67 | fi 68 | } > >(tee -a "${_experiment_dir}/stdout") \ 69 | 2> >(tee -a "${_experiment_dir}/stderr" >&2) 70 | -------------------------------------------------------------------------------- /lamb/lib/run_helper.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | 17 | set -e 18 | 19 | escape_cl_arg() { 20 | printf "%q" "$1" 21 | } 22 | 23 | # This command: 24 | # add_param hps "--" model "X" "escape_cl_arg" 25 | # will add to $hps the line: 26 | # --model=${model}X 27 | # where ${model} is actually evaluated and transformed by 28 | # escape_cl_arg. See the 'indirect references' shell concept. 29 | add_param() { 30 | var1="\${$1}" 31 | prefix=$2 32 | var2="\${$3}" 33 | suffix=$4 34 | val2=$(eval "echo \$$3") 35 | if [ "$val2" ]; then 36 | local escape_fn=$5 37 | if [ "$escape_fn" ]; then 38 | var2="\$($escape_fn \"$var2\")" 39 | fi 40 | eval $1="\"$var1$prefix$3=$var2$suffix\"" 41 | fi 42 | } 43 | 44 | add_cl_arg() { 45 | add_param "$1" "--" "$2" " " "escape_cl_arg" 46 | } 47 | 48 | gather_args() { 49 | ## Populate args (mirroring the structure of README.md). See command line 50 | ## argument definitions in lamb_flags.py. 
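##
## A worked illustration (the values are hypothetical, not part of this
## script): if the sourcing script set
##   model=lstm hidden_size=650 learning_rate=0.002
## then the add_cl_arg calls below would accumulate, in call order,
##   args="--model=lstm --hidden_size=650 --learning_rate=0.002 "
## while unset variables contribute nothing. The finished string is echoed at
## the end, and run.sh passes it to main.py.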
51 | 52 | local args="" 53 | 54 | # data 55 | add_cl_arg args training_file 56 | add_cl_arg args validation_file 57 | add_cl_arg args test_file 58 | add_cl_arg args conditioning_separator 59 | add_cl_arg args file_encoding 60 | add_cl_arg args word_based 61 | add_cl_arg args episodic 62 | 63 | # model 64 | add_cl_arg args num_params 65 | add_cl_arg args share_input_and_output_embeddings 66 | add_cl_arg args input_embedding_size 67 | add_cl_arg args output_embedding_size 68 | add_cl_arg args input_embedding_ratio 69 | add_cl_arg args output_embedding_ratio 70 | add_cl_arg args embedding_dropout 71 | add_cl_arg args token_dropout 72 | add_cl_arg args input_dropout 73 | add_cl_arg args input_dropout_base 74 | add_cl_arg args output_dropout 75 | add_cl_arg args downprojected_output_dropout 76 | add_cl_arg args shared_mask_dropout 77 | add_cl_arg args embed_once 78 | add_cl_arg args output_once 79 | 80 | # cell 81 | add_cl_arg args model 82 | add_cl_arg args num_layers 83 | add_cl_arg args residual_connections 84 | add_cl_arg args lstm_skip_connection 85 | add_cl_arg args feature_mask_rounds 86 | add_cl_arg args feature_mask_rank 87 | add_cl_arg args sparsity_ratio 88 | add_cl_arg args overlay_rank 89 | add_cl_arg args hidden_size 90 | add_cl_arg args hidden_size_multiplier 91 | add_cl_arg args layer_norm 92 | add_cl_arg args activation_fn 93 | add_cl_arg args tie_forget_and_input_gates 94 | add_cl_arg args cap_input_gate 95 | add_cl_arg args mos_num_components 96 | add_cl_arg args trainable_initial_state 97 | add_cl_arg args inter_layer_dropout 98 | add_cl_arg args state_dropout 99 | add_cl_arg args state_dropout_flip_rate 100 | add_cl_arg args update_dropout 101 | add_cl_arg args cell_clip 102 | 103 | # objective 104 | add_cl_arg args model_average 105 | add_cl_arg args num_training_samples 106 | add_cl_arg args l2_penalty 107 | add_cl_arg args l1_penalty 108 | add_cl_arg args activation_norm_penalty 109 | add_cl_arg args drop_state_probability 110 | 111 | # initialization 112 | add_cl_arg args embedding_init_factor 113 | add_cl_arg args scale_input_embeddings 114 | add_cl_arg args cell_init_factor 115 | add_cl_arg args forget_bias 116 | add_cl_arg args output_init_factor 117 | 118 | # schedule 119 | add_cl_arg args steps_per_turn 120 | add_cl_arg args turns 121 | add_cl_arg args print_training_stats_every_num_steps 122 | 123 | # optimization 124 | add_cl_arg args optimizer_type 125 | add_cl_arg args rmsprop_beta2 126 | add_cl_arg args rmsprop_epsilon 127 | add_cl_arg args adam_beta1 128 | add_cl_arg args adam_beta2 129 | add_cl_arg args adam_epsilon 130 | add_cl_arg args max_grad_norm 131 | add_cl_arg args batch_size 132 | add_cl_arg args accum_batch_size 133 | add_cl_arg args max_time_steps 134 | add_cl_arg args trigger_averaging_turns 135 | add_cl_arg args trigger_averaging_at_the_latest 136 | 137 | # learning rate 138 | add_cl_arg args learning_rate 139 | add_cl_arg args learning_rate_decay 140 | add_cl_arg args learning_rate_decay_burn_in_steps 141 | add_cl_arg args drop_learning_rate_turns 142 | add_cl_arg args drop_learning_rate_multiplier 143 | add_cl_arg args drop_learning_rate_at_the_latest 144 | 145 | # early stopping 146 | add_cl_arg args early_stopping_turns 147 | add_cl_arg args early_stopping_rampup_turns 148 | add_cl_arg args early_stopping_worst_xe_target 149 | add_cl_arg args early_stopping_slowest_rate 150 | 151 | # cross-validation 152 | add_cl_arg args crossvalidate 153 | add_cl_arg args crossvalidation_rounds 154 | add_cl_arg args crossvalidate_max_folds 155 | 156 | # 
evaluation 157 | add_cl_arg args max_training_eval_batches 158 | add_cl_arg args max_eval_eval_batches 159 | add_cl_arg args max_test_eval_batches 160 | add_cl_arg args min_non_episodic_eval_examples_per_stripe 161 | add_cl_arg args eval_on_test 162 | add_cl_arg args eval_method 163 | add_cl_arg args num_eval_samples 164 | add_cl_arg args eval_softmax_temperature 165 | add_cl_arg args eval_softmax_temperature_estimation_num_tokens 166 | add_cl_arg args eval_power_mean_power 167 | add_cl_arg args eval_dropout_multiplier 168 | add_cl_arg args validation_prediction_file 169 | add_cl_arg args dyneval 170 | add_cl_arg args dyneval_learning_rate 171 | add_cl_arg args dyneval_decay_rate 172 | add_cl_arg args dyneval_epsilon 173 | 174 | # experiments 175 | local experiment_dir="${_experiment_dir}" 176 | add_cl_arg args experiment_dir 177 | add_cl_arg args save_config 178 | add_cl_arg args config_file 179 | add_cl_arg args hps_proto_file # deprecated 180 | add_cl_arg args flags_as_dict # deprecated 181 | 182 | # checkpoints 183 | add_cl_arg args save_checkpoints 184 | add_cl_arg args load_checkpoint 185 | add_cl_arg args load_optimizer_state 186 | add_cl_arg args load_averaged 187 | add_cl_arg args use_old_linear_names 188 | 189 | # Misc flags 190 | add_cl_arg args seed 191 | add_cl_arg args swap_memory 192 | add_cl_arg args logtostderr 193 | add_cl_arg args log_device_placement 194 | add_cl_arg args summary_flush_secs 195 | 196 | echo "${args}" 197 | } 198 | -------------------------------------------------------------------------------- /lamb/lib/setup.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | if [[ "$0" == "$BASH_SOURCE" ]]; then 17 | echo "This script must be sourced." 18 | exit 1 19 | fi 20 | 21 | base=$(dirname "$BASH_SOURCE")/.. 22 | 23 | cmd=${1:-"run"} 24 | 25 | lib_override_path= 26 | 27 | # `source_lib` is like the shell built-in `source`, but allows files in 28 | # `lib_override_path` to shadow those in lamb/lib/. 29 | source_lib() { 30 | local _name="$1" 31 | shift 32 | if [ -d "${lib_override_path}" -a \ 33 | -f "${lib_override_path}/lib/${_name}" ]; then 34 | source "${lib_override_path}/lib/${_name}" "$@" 35 | else 36 | source "${base}/lib/${_name}" "$@" 37 | fi 38 | } 39 | -------------------------------------------------------------------------------- /lamb/res_multi_rnn_cell.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | """A stacked RNN cell with residual connections.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import tensorflow.compat.v1 as tf 23 | from tensorflow.contrib import framework as contrib_framework 24 | 25 | nest = contrib_framework.nest 26 | 27 | 28 | class ResMultiRNNCell(tf.nn.rnn_cell.RNNCell): 29 | """RNN cell composed sequentially of multiple simple cells.""" 30 | 31 | def __init__(self, cells, state_is_tuple=True): 32 | """Create a RNN cell composed sequentially of a number of RNNCells. 33 | 34 | Args: 35 | cells: list of RNNCells that will be composed in this order. 36 | state_is_tuple: If True, accepted and returned states are n-tuples, where 37 | `n = len(cells)`. If False, the states are all 38 | concatenated along the column axis. This latter behavior will soon be 39 | deprecated. 40 | 41 | Raises: 42 | ValueError: if cells is empty (not allowed), or at least one of the cells 43 | returns a state tuple but the flag `state_is_tuple` is `False`. 44 | """ 45 | if not cells: 46 | raise ValueError("Must specify at least one cell for ResMultiRNNCell.") 47 | if not nest.is_sequence(cells): 48 | raise TypeError( 49 | "cells must be a list or tuple, but saw: %s." % cells) 50 | 51 | self._cells = cells 52 | self._state_is_tuple = state_is_tuple 53 | if not state_is_tuple: 54 | if any(nest.is_sequence(c.state_size) for c in self._cells): 55 | raise ValueError("Some cells return tuples of states, but the flag " 56 | "state_is_tuple is not set. 
State sizes are: %s" 57 | % str([c.state_size for c in self._cells])) 58 | 59 | @property 60 | def state_size(self): 61 | if self._state_is_tuple: 62 | return tuple(cell.state_size for cell in self._cells) 63 | else: 64 | return sum([cell.state_size for cell in self._cells]) 65 | 66 | @property 67 | def output_size(self): 68 | return self._cells[-1].output_size 69 | 70 | def __call__(self, inputs, state, scope=None): 71 | """Run this multi-layer cell on inputs, starting from state.""" 72 | with tf.variable_scope(scope or "res_multi_rnn_cell"): 73 | cur_state_pos = 0 74 | cur_inp = inputs 75 | new_states = [] 76 | for i, cell in enumerate(self._cells): 77 | with tf.variable_scope("cell_%d" % i): 78 | if self._state_is_tuple: 79 | if not nest.is_sequence(state): 80 | raise ValueError( 81 | "Expected state to be a tuple of length %d, but received: %s" 82 | % (len(self.state_size), state)) 83 | cur_state = state[i] 84 | else: 85 | cur_state = tf.slice( 86 | state, [0, cur_state_pos], [-1, cell.state_size]) 87 | cur_state_pos += cell.state_size 88 | cur_inp2, new_state = cell(cur_inp, cur_state) 89 | if i == 0: 90 | cur_inp = cur_inp2 91 | else: 92 | cur_inp = cur_inp + cur_inp2 93 | new_states.append(new_state) 94 | new_states = (tuple(new_states) if self._state_is_tuple else 95 | tf.concat(new_states, 1)) 96 | return cur_inp, new_states 97 | -------------------------------------------------------------------------------- /lamb/skip_multi_rnn_cell.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | """A RNN cell with skip connections.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import tensorflow.compat.v1 as tf 23 | from tensorflow.contrib import framework as contrib_framework 24 | 25 | nest = contrib_framework.nest 26 | 27 | 28 | class SkipMultiRNNCell(tf.nn.rnn_cell.RNNCell): 29 | """RNN cell composed sequentially of multiple simple cells.""" 30 | 31 | def __init__(self, cells, state_is_tuple=True): 32 | """Create a RNN cell composed sequentially of a number of RNNCells. 33 | 34 | Args: 35 | cells: list of RNNCells that will be composed in this order. 36 | state_is_tuple: If True, accepted and returned states are n-tuples, where 37 | `n = len(cells)`. If False, the states are all 38 | concatenated along the column axis. This latter behavior will soon be 39 | deprecated. 40 | 41 | Raises: 42 | ValueError: if cells is empty (not allowed), or at least one of the cells 43 | returns a state tuple but the flag `state_is_tuple` is `False`. 
44 | """ 45 | if not cells: 46 | raise ValueError("Must specify at least one cell for SkipMultiRNNCell.") 47 | if not nest.is_sequence(cells): 48 | raise TypeError( 49 | "cells must be a list or tuple, but saw: %s." % cells) 50 | 51 | self._cells = cells 52 | self._state_is_tuple = state_is_tuple 53 | if not state_is_tuple: 54 | if any(nest.is_sequence(c.state_size) for c in self._cells): 55 | raise ValueError("Some cells return tuples of states, but the flag " 56 | "state_is_tuple is not set. State sizes are: %s" 57 | % str([c.state_size for c in self._cells])) 58 | 59 | @property 60 | def state_size(self): 61 | if self._state_is_tuple: 62 | return tuple(cell.state_size for cell in self._cells) 63 | else: 64 | return sum([cell.state_size for cell in self._cells]) 65 | 66 | @property 67 | def output_size(self): 68 | return self._cells[-1].output_size 69 | 70 | def __call__(self, inputs, state, scope=None): 71 | """Run this multi-layer cell on inputs, starting from state.""" 72 | output = None 73 | with tf.variable_scope(scope or "skip_multi_rnn_cell"): 74 | cur_state_pos = 0 75 | cur_inp = inputs 76 | new_states = [] 77 | for i, cell in enumerate(self._cells): 78 | with tf.variable_scope("cell_%d" % i): 79 | if self._state_is_tuple: 80 | if not nest.is_sequence(state): 81 | raise ValueError( 82 | "Expected state to be a tuple of length %d, but received: %s" 83 | % (len(self.state_size), state)) 84 | cur_state = state[i] 85 | else: 86 | cur_state = tf.slice( 87 | state, [0, cur_state_pos], [-1, cell.state_size]) 88 | cur_state_pos += cell.state_size 89 | cur_inp, new_state = cell(cur_inp, cur_state) 90 | new_states.append(new_state) 91 | if output is None: 92 | output = cur_inp 93 | else: 94 | output += cur_inp 95 | new_states = (tuple(new_states) if self._state_is_tuple else 96 | tf.concat(new_states, 1)) 97 | return output, new_states 98 | -------------------------------------------------------------------------------- /lamb/test/data/save_v1/args: -------------------------------------------------------------------------------- 1 | {'swap_memory': False, 'crossvalidate': False, 'seed': 1, 'early_stopping_rounds': 10, 'max_test_eval_batches': None, 'crossvalidation_rounds': 1, 'early_stopping_worst_xe_target': '9.0', 'hps': 'model=lstm,num_layers=1,num_params=10000000,share_input_and_output_embeddings=true,tie_forget_and_input_gates=false,cap_input_gate=true,forget_bias=1.0,input_embedding_ratio=0.22973,input_dropout=0.51551,state_dropout=0.18417,output_dropout=0.33801,weight_decay=0.00007676,optimizer_type=rmsprop,max_grad_norm=10.0,outer_steps=25,batch_size=,learning_rate=0.0048308,drop_learning_rate_rounds=90,drop_learning_rate_multiplier=0.1,drop_learning_rate_at_the_latest=2000,drop_state_probability=0.01,softmax_test_time_temperature=-0.8,', 'use_old_linear_names': False, 'crossvalidation_folds': 10, 'training_file': '/non-existent-dir/data/ptb/ptb.train.txt', 'file_encoding': 'utf-8', 'max_training_eval_batches': 100, 'word_based': True, 'experiment_dir': '/non-existent-dir/baf254a0c6_train_ptb_10m_lstm_lstm_d1', 'max_eval_eval_batches': None, 'test_file': '/non-existent-dir/data/ptb/ptb.test.txt', 'min_non_episodic_eval_examples_per_stripe': 100, 'print_every': 100, 'hps_proto_file': None, 'save_checkpoints': True, 'load_checkpoint': None, 'log_device_placement': False, 'early_stopping_rampup_rounds': 20, 'episodic': False, 'eval_file': '/non-existent-dir/data/ptb/ptb.valid.txt', 'steps': 100, 'summary_flush_secs': 120, 'max_steps': 35} 2 | 
-------------------------------------------------------------------------------- /lamb/test/data/save_v1/config: -------------------------------------------------------------------------------- 1 | hparam { 2 | key: "activation_fn" 3 | value { 4 | bytes_value: "tf.tanh" 5 | } 6 | } 7 | hparam { 8 | key: "activation_norm_penalty" 9 | value { 10 | float_value: 0.0 11 | } 12 | } 13 | hparam { 14 | key: "adam_beta1" 15 | value { 16 | float_value: 0.899999976158 17 | } 18 | } 19 | hparam { 20 | key: "adam_beta2" 21 | value { 22 | float_value: 0.999000012875 23 | } 24 | } 25 | hparam { 26 | key: "adam_epsilon" 27 | value { 28 | float_value: 9.99999993923e-09 29 | } 30 | } 31 | hparam { 32 | key: "batch_size" 33 | value { 34 | int64_value: 64 35 | } 36 | } 37 | hparam { 38 | key: "cap_input_gate" 39 | value { 40 | bool_value: true 41 | } 42 | } 43 | hparam { 44 | key: "cell_clip" 45 | value { 46 | float_value: -1.0 47 | } 48 | } 49 | hparam { 50 | key: "cell_init_factor" 51 | value { 52 | float_value: 1.0 53 | } 54 | } 55 | hparam { 56 | key: "downprojected_output_dropout" 57 | value { 58 | float_value: -1.0 59 | } 60 | } 61 | hparam { 62 | key: "drop_learning_rate_at_the_latest" 63 | value { 64 | int64_value: 2000 65 | } 66 | } 67 | hparam { 68 | key: "drop_learning_rate_multiplier" 69 | value { 70 | float_value: 0.10000000149 71 | } 72 | } 73 | hparam { 74 | key: "drop_learning_rate_rounds" 75 | value { 76 | int64_value: 90 77 | } 78 | } 79 | hparam { 80 | key: "drop_state_probability" 81 | value { 82 | float_value: 0.00999999977648 83 | } 84 | } 85 | hparam { 86 | key: "embed_once" 87 | value { 88 | bool_value: true 89 | } 90 | } 91 | hparam { 92 | key: "embedding_init_factor" 93 | value { 94 | float_value: 1.0 95 | } 96 | } 97 | hparam { 98 | key: "eval_method" 99 | value { 100 | bytes_value: "deterministic" 101 | } 102 | } 103 | hparam { 104 | key: "feature_mask" 105 | value { 106 | bool_value: false 107 | } 108 | } 109 | hparam { 110 | key: "feature_mask_rank" 111 | value { 112 | int64_value: 0 113 | } 114 | } 115 | hparam { 116 | key: "feature_mask_rounds" 117 | value { 118 | int64_value: 0 119 | } 120 | } 121 | hparam { 122 | key: "forget_bias" 123 | value { 124 | float_value: 1.0 125 | } 126 | } 127 | hparam { 128 | key: "hidden_size" 129 | value { 130 | int64_value: -1 131 | } 132 | } 133 | hparam { 134 | key: "input_dropout" 135 | value { 136 | float_value: 0.51551002264 137 | } 138 | } 139 | hparam { 140 | key: "input_embedding_ratio" 141 | value { 142 | float_value: 0.229729995131 143 | } 144 | } 145 | hparam { 146 | key: "input_embedding_size" 147 | value { 148 | int64_value: -1 149 | } 150 | } 151 | hparam { 152 | key: "intra_layer_dropout" 153 | value { 154 | float_value: 0.0 155 | } 156 | } 157 | hparam { 158 | key: "layer_norm" 159 | value { 160 | bool_value: false 161 | } 162 | } 163 | hparam { 164 | key: "learning_rate" 165 | value { 166 | float_value: 0.00483079999685 167 | } 168 | } 169 | hparam { 170 | key: "learning_rate_decay" 171 | value { 172 | float_value: 1.0 173 | } 174 | } 175 | hparam { 176 | key: "learning_rate_decay_burn_in_steps" 177 | value { 178 | int64_value: 0 179 | } 180 | } 181 | hparam { 182 | key: "lstm_skip_connection" 183 | value { 184 | bool_value: true 185 | } 186 | } 187 | hparam { 188 | key: "max_grad_norm" 189 | value { 190 | float_value: 10.0 191 | } 192 | } 193 | hparam { 194 | key: "mixture_of_softmaxes_num_components" 195 | value { 196 | int64_value: 1 197 | } 198 | } 199 | hparam { 200 | key: "model" 201 | value { 202 | bytes_value: "lstm" 
203 | } 204 | } 205 | hparam { 206 | key: "model_average" 207 | value { 208 | bytes_value: "arithmetic" 209 | } 210 | } 211 | hparam { 212 | key: "num_eval_samples" 213 | value { 214 | int64_value: 0 215 | } 216 | } 217 | hparam { 218 | key: "num_layers" 219 | value { 220 | int64_value: 1 221 | } 222 | } 223 | hparam { 224 | key: "num_params" 225 | value { 226 | int64_value: 50000 227 | } 228 | } 229 | hparam { 230 | key: "num_training_samples" 231 | value { 232 | int64_value: 1 233 | } 234 | } 235 | hparam { 236 | key: "optimizer_type" 237 | value { 238 | bytes_value: "rmsprop" 239 | } 240 | } 241 | hparam { 242 | key: "outer_steps" 243 | value { 244 | int64_value: 2500 245 | } 246 | } 247 | hparam { 248 | key: "output_dropout" 249 | value { 250 | float_value: 0.338010013103 251 | } 252 | } 253 | hparam { 254 | key: "output_embedding_ratio" 255 | value { 256 | float_value: -1.0 257 | } 258 | } 259 | hparam { 260 | key: "output_embedding_size" 261 | value { 262 | int64_value: -1 263 | } 264 | } 265 | hparam { 266 | key: "output_init_factor" 267 | value { 268 | float_value: 1.0 269 | } 270 | } 271 | hparam { 272 | key: "overlay_rank" 273 | value { 274 | int64_value: -1 275 | } 276 | } 277 | hparam { 278 | key: "rmsprop_beta2" 279 | value { 280 | float_value: 0.999000012875 281 | } 282 | } 283 | hparam { 284 | key: "rmsprop_epsilon" 285 | value { 286 | float_value: 9.99999993923e-09 287 | } 288 | } 289 | hparam { 290 | key: "share_input_and_output_embeddings" 291 | value { 292 | bool_value: true 293 | } 294 | } 295 | hparam { 296 | key: "softmax_test_time_temperature" 297 | value { 298 | float_value: -0.800000011921 299 | } 300 | } 301 | hparam { 302 | key: "sparsity_ratio" 303 | value { 304 | float_value: -1.0 305 | } 306 | } 307 | hparam { 308 | key: "state_dropout" 309 | value { 310 | float_value: 0.184169992805 311 | } 312 | } 313 | hparam { 314 | key: "state_dropout_flip_rate" 315 | value { 316 | float_value: 0.0 317 | } 318 | } 319 | hparam { 320 | key: "test_time_dropout_multiplier" 321 | value { 322 | float_value: 1.0 323 | } 324 | } 325 | hparam { 326 | key: "test_time_power_mean_power" 327 | value { 328 | float_value: 1.0 329 | } 330 | } 331 | hparam { 332 | key: "tie_forget_and_input_gates" 333 | value { 334 | bool_value: false 335 | } 336 | } 337 | hparam { 338 | key: "token_dropout" 339 | value { 340 | float_value: 0.0 341 | } 342 | } 343 | hparam { 344 | key: "trainable_initial_state" 345 | value { 346 | bool_value: false 347 | } 348 | } 349 | hparam { 350 | key: "update_dropout" 351 | value { 352 | float_value: 0.0 353 | } 354 | } 355 | hparam { 356 | key: "vocab_size" 357 | value { 358 | int64_value: 10001 359 | } 360 | } 361 | hparam { 362 | key: "weight_decay" 363 | value { 364 | float_value: 7.67599995015e-05 365 | } 366 | } 367 | hparam { 368 | key: "weight_penalty" 369 | value { 370 | float_value: 0.0 371 | } 372 | } 373 | -------------------------------------------------------------------------------- /lamb/test/dummy_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | import tensorflow.compat.v1 as tf 17 | 18 | 19 | class DummyTest(tf.test.TestCase): 20 | 21 | def testCompilation(self): 22 | pass 23 | 24 | 25 | if __name__ == "__main__": 26 | tf.test.main() 27 | -------------------------------------------------------------------------------- /lamb/test/finish.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | 17 | # Intended to be sourced after setting all the configuration options. 18 | 19 | experiment_dir="$TEST_TMPDIR/${name}" 20 | 21 | # Run 22 | source_lib "run.sh" run_par 23 | 24 | # Check that the best reported evaluation XE is below a certain 25 | # threshold. 26 | grep_xes() { 27 | cat "${_experiment_dir}/stderr" | 28 | sed -rn "s/.*'best_xe': ([0-9]*)\.([0-9]{1,2}).*/\1.\2/p" 29 | } 30 | first_xe=$(grep_xes | head -n 1) 31 | last_xe=$(grep_xes | tail -n 1) 32 | expected_improvement="${expected_improvement:-0.5}" 33 | # check_ge doesn't work with floats, let's do it by hand. 34 | if (( $(echo "$first_xe - $expected_improvement < $last_xe" | bc -l) )); then 35 | echo "XE went from $first_xe to $last_xe, and that's not a large enough \ 36 | improvement ($expected_improvement)." 37 | exit 1 38 | fi 39 | 40 | echo "PASS" 41 | -------------------------------------------------------------------------------- /lamb/test/start.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================ 15 | 16 | set -e -o pipefail 17 | 18 | source googletest.sh 19 | 20 | if [ "${base}" = "" ]; then 21 | source "$(dirname $0)/../lib/setup.sh" 22 | fi 23 | source_lib "config/common.sh" 24 | source_lib "config/running.sh" 25 | 26 | training_file="${base}/test/data/corpus.txt" 27 | validation_file="${training_file}" 28 | unset test_file 29 | 30 | batch_size=64 31 | max_training_eval_batches=2 32 | max_eval_eval_batches=2 33 | max_test_eval_batches=2 34 | max_time_steps=3 35 | steps_per_turn=5 36 | turns=2 37 | 38 | # Misc 39 | use_gpu=false 40 | -------------------------------------------------------------------------------- /lamb/test/test_episodic_char_lstm_d2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | source "$(dirname $0)/start.sh" 19 | 20 | training_file="${base}/test/data/add.txt" 21 | validation_file="${training_file}" 22 | expected_improvement="${expected_improvement:-0.2}" 23 | 24 | word_based=false 25 | episodic=true 26 | conditioning_separator="=" 27 | max_time_steps=40 28 | 29 | # Model hyperparameters 30 | 31 | model=lstm 32 | num_layers=2 33 | hidden_size=50 34 | num_eval_samples=2 35 | 36 | # Optimization hyperparameters 37 | 38 | learning_rate=0.01 39 | 40 | # Run 41 | source "$(dirname $0)/finish.sh" 42 | -------------------------------------------------------------------------------- /lamb/test/test_load_optimizer_state.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -x 19 | 20 | source "$(dirname $0)/start.sh" 21 | 22 | # Model hyperparameters 23 | 24 | model=lstm 25 | num_layers=1 26 | hidden_size=17 27 | output_embedding_size=15 28 | 29 | # Optimization hyperparameters 30 | 31 | learning_rate=0.2 32 | early_stopping_turns=-1 33 | 34 | # Run 35 | source "$(dirname $0)/finish.sh" 36 | previous_xe=$last_xe 37 | 38 | # Load checkpoint and check that validation XE is the same. 
39 | load_checkpoint="${_experiment_dir}/best"
40 | optimizer_type="sgd"
41 | # Loading the checkpoint would fail if a different optimizer's state were loaded.
42 | load_optimizer_state=false
43 | turns=0
44 | expected_improvement=0.0
45 | source "$(dirname $0)/finish.sh"
46 | 
47 | if [ "$previous_xe" != "$last_xe" ]; then
48 |   echo "XE was $previous_xe, after reloading checkpoint it became $last_xe."
49 |   exit 1
50 | fi
51 | 
-------------------------------------------------------------------------------- /lamb/test/test_save_v1.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 | 
17 | 
18 | set -x
19 | 
20 | source "$(dirname $0)/start.sh"
21 | 
22 | hps_proto_file="$(dirname $0)/data/save_v1/config"
23 | flags_as_dict="$(dirname $0)/data/save_v1/args"
24 | 
25 | # Run
26 | source "$(dirname $0)/finish.sh"
27 | 
-------------------------------------------------------------------------------- /lamb/test/test_simple_lstm.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 | 
17 | 
18 | set -x
19 | 
20 | source "$(dirname $0)/start.sh"
21 | 
22 | # Model hyperparameters
23 | 
24 | model=lstm
25 | num_layers=2
26 | hidden_size=17,13
27 | output_embedding_size=11
28 | lstm_skip_connection=false
29 | 
30 | # Optimization hyperparameters
31 | 
32 | learning_rate=0.2
33 | early_stopping_turns=-1
34 | 
35 | # Run
36 | source "$(dirname $0)/finish.sh"
37 | previous_xe=$last_xe
38 | 
39 | # Load checkpoint and check that validation XE is the same.
40 | load_checkpoint="${_experiment_dir}/last"
41 | turns=0
42 | expected_improvement=0.0
43 | source "$(dirname $0)/finish.sh"
44 | 
45 | if [ "$previous_xe" != "$last_xe" ]; then
46 |   echo "XE was $previous_xe, after reloading checkpoint it became $last_xe."
47 | exit 1 48 | fi 49 | -------------------------------------------------------------------------------- /lamb/test/test_sparse_rhn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | source "$(dirname $0)/start.sh" 19 | 20 | # Model hyperparameters 21 | 22 | model=rhn 23 | num_layers=2 24 | hidden_size=17 25 | output_embedding_size=15 26 | sparsity_ratio=0.5 27 | 28 | # Optimization hyperparameters 29 | 30 | expected_improvement=0.3 31 | learning_rate=0.2 32 | steps_per_turn=20 33 | 34 | # Run 35 | source "$(dirname $0)/finish.sh" 36 | -------------------------------------------------------------------------------- /lamb/vocab.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | """Vocabulary.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | from six.moves import range 22 | 23 | 24 | class Vocab(object): 25 | """Immutable reversible mappings from strings to integers.""" 26 | 27 | def __init__(self, tokens, unk=u'', eos=u'\u25bc'): 28 | """Create a Vocab object that maps `tokens` to dense indices.""" 29 | self._token_to_index = {} 30 | self._token_to_frequency = {} 31 | self._unk = unk 32 | self._eos = eos 33 | token_to_index = self._token_to_index 34 | token_to_frequency = self._token_to_frequency 35 | # Get the unique tokens from `tokens` that might be a generator. 36 | for token in tokens: 37 | token_to_index[token] = True 38 | token_to_frequency[token] = token_to_frequency.get(token, 0) + 1 39 | token_to_index[unk] = True 40 | token_to_index[eos] = True 41 | # Now that we have a smaller set of tokens, assign ids in sorted 42 | # order for deterministic encoding. 
43 |     self._index_to_token = [None] * len(token_to_index)
44 |     index_to_token = self._index_to_token
45 |     i = 0
46 |     for token in sorted(list(token_to_index)):
47 |       token_to_index[token] = i
48 |       index_to_token[i] = token
49 |       i += 1
50 | 
51 |   def unk_index(self):
52 |     """Returns the index of the unknown token."""
53 |     return self._token_to_index[self._unk]
54 | 
55 |   def eos_index(self):
56 |     """Returns the index of the end-of-sentence token."""
57 |     return self._token_to_index[self._eos]
58 | 
59 |   def token(self, index_):
60 |     """The string at index `index_`; raises IndexError if out of range."""
61 |     return self._index_to_token[index_]
62 | 
63 |   def __iter__(self):
64 |     """Iterates over tokens in order of indices."""
65 |     for i in range(self.size()):
66 |       yield self.token(i)
67 | 
68 |   def index_or_unk(self, token):
69 |     """Find the index assigned to `token`.
70 | 
71 |     Args:
72 |       token: a string.
73 |     Returns:
74 |       The index of `token` or `unk_index()` if it is not in the vocabulary.
75 |     """
76 |     if token in self._token_to_index:
77 |       return self._token_to_index[token]
78 |     else:
79 |       return self.unk_index()
80 | 
81 |   def size(self):
82 |     """Returns the number of different tokens in the vocabulary."""
83 |     return len(self._index_to_token)
84 | 
85 |   def decode(self, ids):
86 |     """Decode a sequence of `ids` with `token()`."""
87 |     assert all([0 <= x and x < len(self._index_to_token) for x in ids])
88 |     return [self.token(x) for x in ids]
89 | 
90 |   def encode(self, tokens, add_eos=True):
91 |     """Encodes a sentence into a list of token indices.
92 | 
93 |     Args:
94 |       tokens: A list of tokens.
95 |       add_eos: Whether to add the end-of-sentence token.
96 |     Returns:
97 |       A list of integer token indices where `unk_index()` stands for
98 |       tokens not found in the vocabulary.
99 |     """
100 |     ids = [self.index_or_unk(token) for token in tokens]
101 | 
102 |     if add_eos:
103 |       ids += [self.eos_index()]
104 | 
105 |     return ids
106 | 
107 |   def index_frequency(self, index_):
108 |     return self._token_to_frequency.get(self.token(index_), 0)
109 | 
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 | 
16 | """Setup for pip package."""
17 | 
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 | 
22 | from setuptools import find_packages
23 | from setuptools import setup
24 | 
25 | REQUIRED_PACKAGES = ['absl-py', 'numpy', 'dm-sonnet', 'six']
26 | EXTRA_PACKAGES = {
27 |     'tensorflow': ['tensorflow>=1.15.0', 'tensorflow-probability>=0.4.0'],
28 |     'tensorflow with gpu': ['tensorflow-gpu>=1.8.0',
29 |                             'tensorflow-probability-gpu>=0.4.0'],
30 | }
31 | 
32 | 
33 | setup(
34 |     name='lamb',
35 |     version='1.0',
36 |     description=('LAnguage Modelling Benchmarks is a tool '
37 |                  'to tune and test TensorFlow LM models.'),
38 |     long_description='',
39 |     url='http://github.com/deepmind/lamb/',
40 |     author='Gabor Melis',
41 |     author_email='melisgl@google.com',
42 |     # Contained modules and scripts.
43 |     packages=find_packages(),
44 |     install_requires=REQUIRED_PACKAGES,
45 |     extras_require=EXTRA_PACKAGES,
46 |     zip_safe=False,
47 |     license='Apache 2.0',
48 |     classifiers=[
49 |         'Development Status :: 5 - Production/Stable',
50 |         'Intended Audience :: Developers',
51 |         'Intended Audience :: Education',
52 |         'Intended Audience :: Science/Research',
53 |         'License :: OSI Approved :: Apache Software License',
54 |         'Operating System :: MacOS :: MacOS X',
55 |         'Operating System :: Microsoft :: Windows',
56 |         'Operating System :: POSIX',
57 |         'Operating System :: Unix',
58 |         'Programming Language :: Python :: 2.7',
59 |         'Programming Language :: Python :: 3.4',
60 |         'Programming Language :: Python :: 3.5',
61 |         'Programming Language :: Python :: 3.6',
62 |         'Topic :: Scientific/Engineering :: Artificial Intelligence',
63 |         'Topic :: Software Development :: Libraries',
64 |     ],
65 |     keywords='lamb tensorflow language modelling machine learning',
66 | )
67 | 
--------------------------------------------------------------------------------
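To round things off, a small usage sketch for the Vocab class in lamb/vocab.py (the toy corpus is made up; the behaviour follows the class exactly as listed above):

from lamb.vocab import Vocab

vocab = Vocab('to be or not to be'.split())
# Four unique tokens plus the default unk ('') and eos (u'\u25bc') entries.
assert vocab.size() == 6
ids = vocab.encode(['to', 'be', 'xyzzy'])  # out-of-vocabulary -> unk_index()
assert ids[-1] == vocab.eos_index()        # encode appends eos by default
print(vocab.decode(ids))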