├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── lamb ├── VERSION ├── __init__.py ├── averaged.py ├── cell.py ├── corpus.py ├── dropout.py ├── dyneval.py ├── evaluation.py ├── experiment │ ├── awd │ │ └── train_awd_lstm.sh │ ├── continue.sh │ ├── mixture-of-softmaxes │ │ ├── train_awd_lstm_mos.sh │ │ └── tune_ptb_24m.sh │ ├── mogrifier │ │ ├── README.md │ │ ├── config │ │ │ ├── 0393c7dc3532+_tune_ptb_char_24m_lstm_fm_d2_asgd_ts150 │ │ │ │ └── trial_596 │ │ │ │ │ └── config │ │ │ ├── 4d0a9a5bdb04+_tune_enwik8_48m_lstm_d4_arms │ │ │ │ └── trial_400 │ │ │ │ │ └── config │ │ │ ├── 4d0a9a5bdb04+_tune_enwik8_48m_lstm_fm_d4_arms │ │ │ │ └── trial_234 │ │ │ │ │ └── config │ │ │ ├── 558aa30c0b15+_tune_mwc_fi_24m_lstm_d2_arms │ │ │ │ └── trial_758 │ │ │ │ │ └── config │ │ │ ├── 558aa30c0b15+_tune_mwc_fi_24m_lstm_fm_d2_arms │ │ │ │ └── trial_371 │ │ │ │ │ └── config │ │ │ ├── 786252db3825+_tune_ptb_24m_lstm_d2_arms │ │ │ │ └── trial_833 │ │ │ │ │ └── config │ │ │ ├── 786252db3825+_tune_ptb_24m_lstm_fm_d2_arms │ │ │ │ └── trial_483 │ │ │ │ │ └── config │ │ │ ├── 9e20581d3dad+_tune_mwc_en_24m_lstm_d2_arms │ │ │ │ └── trial_502 │ │ │ │ │ └── config │ │ │ ├── 9e20581d3dad+_tune_mwc_en_24m_lstm_fm_d2_arms │ │ │ │ └── trial_422 │ │ │ │ │ └── config │ │ │ ├── c51c838b33a5+_tune_wikitext-2_35m_lstm_mos2_d2_arms │ │ │ │ └── trial_763 │ │ │ │ │ └── config │ │ │ ├── c51c838b33a5+_tune_wikitext-2_35m_lstm_mos2_fm_d2_arms │ │ │ │ └── trial_747 │ │ │ │ │ └── config │ │ │ ├── e81db31261c0+_tune_enwik8_96m_lstm_d4_arms │ │ │ │ └── trial_295 │ │ │ │ │ └── config │ │ │ └── e81db31261c0+_tune_enwik8_96m_lstm_fm_d4_arms_MtngW │ │ │ │ └── trial_216 │ │ │ │ └── config │ │ ├── train_enwik8.sh │ │ ├── train_mwc.sh │ │ ├── train_ptb.sh │ │ ├── train_ptb_char.sh │ │ ├── train_wikitext-2.sh │ │ ├── tune_copy.sh │ │ ├── tune_dyneval.sh │ │ ├── tune_enwik8.sh │ │ ├── tune_mwc.sh │ │ ├── tune_ptb.sh │ │ ├── tune_ptb_char.sh │ │ ├── tune_ptb_fast.sh │ │ └── tune_wikitext-2.sh │ ├── on-the-state │ │ ├── README.md │ │ ├── enwik8_27m_lstm_d4 │ │ │ └── hps_proto │ │ ├── enwik8_46m_lstm_d4 │ │ │ └── hps_proto │ │ ├── ptb_10m_lstm_d1 │ │ │ └── hps_proto │ │ ├── ptb_24m_lstm_d4 │ │ │ └── hps_proto │ │ ├── train_enwik8.sh │ │ ├── train_ptb.sh │ │ ├── train_wikitext-2.sh │ │ └── wikitext-2_24m_lstm_d2 │ │ │ └── hps_proto │ ├── pushing-the-bounds │ │ ├── README.md │ │ └── test.sh │ ├── rerun.sh │ ├── rerun_old.sh │ ├── test.sh │ ├── train_ptb_10m_lstm_d1.sh │ ├── train_ptb_24m_lstm_d4.sh │ └── tune_ptb_10m.sh ├── lamb_flags.py ├── lib │ ├── config │ │ ├── README.md │ │ ├── common.sh │ │ ├── copy.sh │ │ ├── enwik8.sh │ │ ├── enwik8_char.sh │ │ ├── enwik8_char_rmsprop.sh │ │ ├── mwc.sh │ │ ├── ptb.sh │ │ ├── ptb_char.sh │ │ ├── ptb_word.sh │ │ ├── ptb_word_rmsprop.sh │ │ ├── ptb_word_slow.sh │ │ ├── running.sh │ │ ├── tuning.sh │ │ ├── wikitext-103.sh │ │ ├── wikitext-103_word.sh │ │ ├── wikitext-103_word_rmsprop.sh │ │ ├── wikitext-2.sh │ │ ├── wikitext-2_word.sh │ │ └── wikitext-2_word_rmsprop.sh │ ├── describe_version.sh │ ├── run.sh │ ├── run_helper.sh │ └── setup.sh ├── lm.py ├── main.py ├── monitoring.py ├── nascell.py ├── res_multi_rnn_cell.py ├── skip_multi_rnn_cell.py ├── test │ ├── data │ │ ├── add.txt │ │ ├── corpus.txt │ │ └── save_v1 │ │ │ ├── args │ │ │ └── config │ ├── dummy_test.py │ ├── finish.sh │ ├── start.sh │ ├── test_episodic_char_lstm_d2.sh │ ├── test_load_optimizer_state.sh │ ├── test_save_v1.sh │ ├── test_simple_lstm.sh │ └── test_sparse_rhn.sh ├── tiled_linear.py ├── tiled_lstm.py ├── tiled_rhn.py ├── training.py 
├── utils.py └── vocab.py └── setup.py /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to <https://cla.developers.google.com/> to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 24 | 25 | ## Community Guidelines 26 | 27 | This project follows [Google's Open Source Community 28 | Guidelines](https://opensource.google.com/conduct/). 29 | -------------------------------------------------------------------------------- /lamb/VERSION: -------------------------------------------------------------------------------- 1 | 1.0 2 | -------------------------------------------------------------------------------- /lamb/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | -------------------------------------------------------------------------------- /lamb/averaged.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | # ============================================================================ 15 | 16 | """Averaging of model weights.""" 17 | 18 | # pylint: disable=missing-docstring 19 | # pylint: disable=g-complex-comprehension 20 | 21 | from __future__ import absolute_import 22 | from __future__ import division 23 | from __future__ import print_function 24 | 25 | import tensorflow.compat.v1 as tf 26 | 27 | 28 | class Averaged(object): 29 | 30 | def __init__(self, tensors): 31 | tensors = list(tensors) 32 | with tf.variable_scope('averaged'): 33 | self._num_samples = tf.Variable(0, name='num_samples', trainable=False) 34 | with tf.variable_scope('avg'): 35 | self._averages = [ 36 | tf.get_variable( 37 | tensor.name.replace('/', '-').replace(':', '-'), 38 | tensor.get_shape(), initializer=tf.zeros_initializer(), 39 | trainable=False) 40 | for tensor in tensors] 41 | with tf.variable_scope('save'): 42 | self._saves = [ 43 | tf.get_variable( 44 | tensor.name.replace('/', '-').replace(':', '-'), 45 | tensor.get_shape(), initializer=tf.zeros_initializer(), 46 | trainable=False) 47 | for tensor in tensors] 48 | self._tensors = tensors 49 | self._take_sample = self._make_take_sample() 50 | self._switch = self._make_switch_to_average() 51 | self._restore = self._make_restore() 52 | self._reset = self._make_reset() 53 | 54 | def take_sample(self): 55 | tf.get_default_session().run(self._take_sample) 56 | 57 | def switch_to_average(self): 58 | tf.get_default_session().run(self._switch) 59 | 60 | def restore(self): 61 | tf.get_default_session().run(self._restore) 62 | 63 | def reset(self): 64 | tf.get_default_session().run(self._reset) 65 | 66 | def __enter__(self): 67 | self.switch_to_average() 68 | 69 | def __exit__(self, type_, value, traceback): 70 | self.restore() 71 | 72 | def _make_take_sample(self): 73 | assignments = [] 74 | n = tf.cast(self._num_samples, tf.float32) 75 | mu = 1.0 / (1.0 + n)  # incremental mean: avg_{n+1} = avg_n + (x - avg_n)/(n+1) 76 | for tensor, average in zip(self._tensors, self._averages): 77 | assignments.append(tf.assign_add(average, (tensor-average)*mu)) 78 | add_to_averages = tf.group(assignments) 79 | with tf.control_dependencies([add_to_averages]): 80 | incr_num_samples = tf.assign(self._num_samples, self._num_samples + 1) 81 | return incr_num_samples 82 | 83 | def _make_switch_to_average(self): 84 | assignments = [] 85 | for save, tensor, average in zip( 86 | self._saves, self._tensors, self._averages): 87 | with tf.control_dependencies([save.assign(tensor)]): 88 | assignments.append(tensor.assign(average)) 89 | return tf.group(assignments) 90 | 91 | def _make_restore(self): 92 | assignments = [] 93 | for save, tensor in zip(self._saves, self._tensors): 94 | assignments.append(tf.assign(tensor, save)) 95 | return tf.group(assignments) 96 | 97 | def _make_reset(self): 98 | return tf.assign(self._num_samples, 0) 99 | 100 | 101 | # TODO(melisgl): I think this works with ResourceVariables but not with normal 102 | # Variables. Deferred until TF2.0. 103 | def _swap(x, y): 104 | x_value = x.read_value() 105 | y_value = y.read_value() 106 | with tf.control_dependencies([x_value, y_value]): 107 | swap = tf.group(y.assign(x_value), x.assign(y_value)) 108 | return swap 109 | -------------------------------------------------------------------------------- /lamb/dropout.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | """Variational Dropout.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | from sonnet.python.modules import base as snt_base 23 | import tensorflow.compat.v1 as tf 24 | import tensorflow_probability as tfp 25 | from tensorflow.contrib import util as contrib_util 26 | 27 | 28 | class Dropout(snt_base.AbstractModule): 29 | """Possibly variational dropout.""" 30 | 31 | def __init__(self, keep_prob, share_mask=True, scaler=1.0, name='dropout'): 32 | super(Dropout, self).__init__(name=name) 33 | self._keep_prob = keep_prob 34 | self._keep_mask = None 35 | self._share_mask = share_mask 36 | self._scaler = scaler 37 | 38 | def _ensure_keep_mask(self, x): 39 | if self._keep_mask is None or not self._share_mask: 40 | shape = tf.shape(x) 41 | noise = tf.random_uniform(shape, dtype=x.dtype) 42 | self._keep_mask = (tf.floor(self._keep_prob + noise) 43 | * (self._scaler / self._keep_prob)) 44 | self._keep_mask.set_shape(x.get_shape()) 45 | return self._keep_mask 46 | 47 | def _build(self, x): 48 | if contrib_util.constant_value(self._keep_prob) == 1: 49 | return x 50 | else: 51 | return x * self._ensure_keep_mask(x) 52 | 53 | 54 | class GaussianDropout(snt_base.AbstractModule): 55 | """Possibly variational dropout.""" 56 | 57 | def __init__(self, keep_prob, share_mask=True, scaler=1.0, name='dropout'): 58 | super(GaussianDropout, self).__init__(name=name) 59 | self._keep_prob = keep_prob 60 | self._keep_mask = None 61 | self._share_mask = share_mask 62 | self._scaler = scaler 63 | 64 | def _ensure_keep_mask(self, x): 65 | if self._keep_mask is None or not self._share_mask: 66 | shape = tf.shape(x) 67 | # Calculate the stddev for the normal distribution that 68 | # matches the stddev of the Bernoulli with p=keep_prob. 69 | stddev = tf.sqrt((1 - self._keep_prob) / self._keep_prob) 70 | self._keep_mask = tf.random_normal(shape, mean=1.0, stddev=stddev, 71 | dtype=x.dtype) 72 | self._keep_mask.set_shape(x.get_shape()) 73 | return self._keep_mask 74 | 75 | def _build(self, x): 76 | if contrib_util.constant_value(self._keep_prob) == 1: 77 | return x 78 | else: 79 | return x * self._ensure_keep_mask(x) 80 | 81 | 82 | class DirichletDropout(snt_base.AbstractModule): 83 | """Possibly variational dropout.""" 84 | 85 | def __init__(self, keep_prob, share_mask=True, scaler=1.0, name='dropout'): 86 | super(DirichletDropout, self).__init__(name=name) 87 | self._keep_prob = keep_prob 88 | self._keep_mask = None 89 | self._share_mask = share_mask 90 | self._scaler = scaler 91 | 92 | def _ensure_keep_mask(self, x): 93 | if self._keep_mask is None or not self._share_mask: 94 | shape = tf.shape(x) 95 | k = shape[1] 96 | # To make this class a drop-in replacement for Bernoulli dropout we 97 | # parameterize it with keep_prob.
Set alpha of the Dirichlet so that the 98 | # variance is equal to the variance of the Bernoulli with p=keep_prob 99 | # divided by keep_prob. 100 | # Now the variance of the Dirichlet with k equal alphas is 101 | # (k-1)/(k^2*(k*alpha+1)). Solve that for alpha. 102 | kf = tf.cast(k, tf.float32) 103 | alpha = self._keep_prob * (kf - 1.0) / ((1-self._keep_prob)*kf) - 1.0/kf 104 | dist = tfp.distributions.Dirichlet(tf.ones(shape=k) * alpha) 105 | assert (dist.reparameterization_type == 106 | tfp.distributions.FULLY_REPARAMETERIZED) 107 | # E[Dir(alpha)] is 1/k per element, but the mask should have expectation 1 108 | # (as the Bernoulli mask divided by keep_prob does), hence multiplying by k. 109 | self._keep_mask = kf * dist.sample(shape[0]) 110 | self._keep_mask.set_shape(x.get_shape()) 111 | return self._keep_mask 112 | 113 | def _build(self, x): 114 | if contrib_util.constant_value(self._keep_prob) == 1: 115 | return x 116 | else: 117 | return tf.cond(tf.equal(self._keep_prob, 1.0), 118 | lambda: x, 119 | lambda: x * self._ensure_keep_mask(x)) 120 | 121 | 122 | class DriftingDropout(snt_base.AbstractModule): 123 | """Dropout with gradually changing mask.""" 124 | 125 | def __init__(self, keep_prob, flip_prob=0.0, scaler=1.0, name='dropout'): 126 | super(DriftingDropout, self).__init__(name=name) 127 | self._keep_prob = keep_prob 128 | self._flip_prob = flip_prob 129 | self._scaler = scaler 130 | self._time_step = 0 131 | 132 | def _build(self, x, state): 133 | prev_keep_mask = state 134 | shape = tf.shape(x) 135 | noise = tf.random_uniform(shape, dtype=x.dtype) 136 | other_mask = tf.floor(self._keep_prob + noise) 137 | choice_noise = tf.random_uniform(shape, dtype=x.dtype) 138 | choice = tf.less(choice_noise, self._flip_prob) 139 | # KLUDGE(melisgl): The client has to pass the last keep_mask from 140 | # a batch to the next so the mask may end up next to some 141 | # recurrent cell state. This state is often zero at the beginning 142 | # and may be periodically zeroed (per example) during training. 143 | # While zeroing LSTM state is okay, zeroing the dropout mask is 144 | # not. So instead of forcing every client to deal with this common 145 | # (?) case, if an all zero mask is detected, then regenerate a 146 | # fresh mask. This is of course a major hack and won't help with 147 | # learnt initial states, for example. 148 | sum_ = tf.reduce_sum(prev_keep_mask, 1, keepdims=True) 149 | is_initializing = tf.equal(sum_, 0.0) 150 | 151 | self._keep_mask = tf.where(tf.logical_or(choice, is_initializing), 152 | other_mask, 153 | prev_keep_mask) 154 | self._time_step += 1 155 | return x * self._keep_mask / self._keep_prob * self._scaler 156 | -------------------------------------------------------------------------------- /lamb/dyneval.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | # ============================================================================ 15 | 16 | """Dynamic evaluation.""" 17 | 18 | # pylint: disable=missing-docstring 19 | # pylint: disable=g-complex-comprehension 20 | 21 | from __future__ import absolute_import 22 | from __future__ import division 23 | from __future__ import print_function 24 | 25 | import tensorflow.compat.v1 as tf 26 | 27 | 28 | class Dyneval(object): 29 | 30 | def __init__(self, grads_and_vars, learning_rate, decay_rate, epsilon): 31 | with tf.variable_scope('dyneval'): 32 | # convert_to_tensor densifies IndexedSlices 33 | self._grads = [tf.convert_to_tensor(grad) for grad, _ in grads_and_vars] 34 | self._vars = [var for _, var in grads_and_vars] 35 | self._learning_rate = learning_rate 36 | self._decay_rate = decay_rate 37 | def shadow_vars(): 38 | return [ 39 | tf.get_variable( 40 | var.name.replace('/', '-').replace(':', '-'), 41 | var.get_shape(), initializer=tf.zeros_initializer(), 42 | trainable=False) 43 | for var in self._vars] 44 | with tf.variable_scope('save'): 45 | self._saves = shadow_vars() 46 | with tf.variable_scope('sum_squared_grads'): 47 | self._sum_squared_grads = shadow_vars() 48 | self._save = self._make_save() 49 | self._restore = self._make_restore() 50 | 51 | # These are for computing an RMSProp-like estimate of the variance of 52 | # minibatch gradients. Here, this quantity is estimated on the training 53 | # set once, while gradient descent happens on validation/test. 54 | self._num_squared_grads = tf.get_variable( 55 | 'num_squared_grads', [], initializer=tf.zeros_initializer(), 56 | trainable=False) 57 | self._zero_sum_squared_grads = self._make_zero_sum_squared_grads() 58 | self._add_squared_grads = self._make_add_squared_grads() 59 | self._epsilon = epsilon 60 | 61 | self._update = self._make_update() 62 | 63 | def _make_save(self): 64 | assignments = [] 65 | for save, var in zip(self._saves, self._vars): 66 | assignments.append(save.assign(var)) 67 | return tf.group(assignments) 68 | 69 | def _make_restore(self): 70 | assignments = [] 71 | for save, var in zip(self._saves, self._vars): 72 | assignments.append(var.assign(save)) 73 | return tf.group(assignments) 74 | 75 | def _make_update(self): 76 | mss = [] 77 | gsum = 0.0 78 | count = 0 79 | for sum_squared_grads in self._sum_squared_grads: 80 | ms = tf.sqrt(sum_squared_grads / self._num_squared_grads) 81 | gsum += tf.reduce_sum(ms) 82 | count += tf.reduce_sum(tf.ones_like(ms)) 83 | mss.append(ms) 84 | gsum = gsum / count 85 | 86 | assignments = [] 87 | for grad, var, save, sum_squared_grads, ms in zip( 88 | self._grads, self._vars, self._saves, self._sum_squared_grads, mss): 89 | decay_rate = tf.minimum(1.0, self._decay_rate*(ms/gsum)) 90 | delta = (-self._learning_rate*grad / (ms + self._epsilon) + 91 | decay_rate*(save-var)) 92 | assignments.append(var.assign_add(delta)) 93 | return tf.group(assignments) 94 | 95 | def _make_add_squared_grads(self): 96 | assignments = [] 97 | for sum_squared_grads, grads in zip(self._sum_squared_grads, self._grads): 98 | assignments.append(sum_squared_grads.assign_add(tf.square(grads))) 99 | return tf.group(assignments + [self._num_squared_grads.assign_add(1)]) 100 | 101 | def _make_zero_sum_squared_grads(self): 102 | assignments = [] 103 | for sum_squared_grads in self._sum_squared_grads: 104 | assignments.append(sum_squared_grads.assign( 105 | tf.zeros_like(sum_squared_grads))) 106 | return tf.group(assignments + [self._num_squared_grads.assign(0)]) 107 | 108 | def save(self): 109 |
tf.get_default_session().run(self._save) 110 | 111 | def restore(self): 112 | tf.get_default_session().run(self._restore) 113 | 114 | def update_op(self): 115 | return self._update 116 | 117 | def zero_sum_squared_grads(self): 118 | tf.get_default_session().run(self._zero_sum_squared_grads) 119 | 120 | def add_squared_grads_op(self): 121 | return self._add_squared_grads 122 | 123 | def __enter__(self): 124 | self.save() 125 | 126 | def __exit__(self, type_, value, traceback): 127 | self.restore() 128 | -------------------------------------------------------------------------------- /lamb/experiment/awd/train_awd_lstm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | # This script reproduces the PTB results from "Regularizing and Optimizing LSTM 19 | # Language Models" (Merity, 2017) without fine-tuning or dynamic evaluation. 20 | # 21 | # Based on https://github.com/salesforce/awd-lstm-lm. 22 | # 23 | # Reaches ~4.084 validation cross-entropy (59.38 ppl) without fine-tuning. 24 | 25 | set -e 26 | 27 | source "$(dirname $0)/../../lib/setup.sh" 28 | source_lib "config/common.sh" 29 | source_lib "config/running.sh" 30 | source_lib "config/ptb_word.sh" 31 | 32 | # Model 33 | 34 | share_input_and_output_embeddings=true 35 | input_embedding_size=400 36 | output_embedding_size=400 37 | cap_input_gate=false 38 | input_dropout=0.4 39 | embedding_dropout=0.1 40 | output_dropout=0.4 41 | shared_mask_dropout=true 42 | 43 | # Cell 44 | 45 | model="lstm" 46 | num_layers=3 47 | lstm_skip_connection=false 48 | hidden_size=1150,1150,400 49 | inter_layer_dropout=0.25 50 | state_dropout=0.5 51 | tie_forget_and_input_gates=false 52 | 53 | # Objective 54 | 55 | activation_norm_penalty=2.0 56 | l2_penalty=8.4e-5 # 1.2e-6*70 57 | drop_state_probability=0.01 58 | 59 | # Initialization 60 | 61 | forget_bias=0.0 62 | 63 | # Schedule 64 | 65 | steps_per_turn=100 66 | print_training_stats_every_num_steps=100 67 | turns=3168 # ~500 epochs (with batch_size=20 and max_time_steps=70). 68 | 69 | # Optimizer 70 | 71 | # In the loss, the pytorch code (https://github.com/zihangdai/mos) averages all 72 | # log probabilities in the [batch_size, max_time_steps] matrix, while lamb sums 73 | # the log probabilities over time steps and averages only over the examples in 74 | # the batch. To compensate for that, max_grad_norm, learning_rate and l2_penalty 75 | # had to be adjusted. 
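# Concretely, using the pytorch defaults recorded in the inline comments of
# this script (0.25, 30.0 and 1.2e-6), the rescaling by the 70-step BPTT
# window works out to:
#
#   max_grad_norm = 0.25   * 70 = 17.5
#   l2_penalty    = 1.2e-6 * 70 = 8.4e-5
#   learning_rate = 30.0   / 70 ≈ 0.42857143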
76 | max_time_steps=70 77 | max_grad_norm=17.5 # 0.25*70 78 | optimizer_type="sgd" 79 | batch_size=20 80 | learning_rate=0.42857143 # 30.0/70 81 | 82 | # Evaluation hyperparameters 83 | 84 | trigger_averaging_turns=50 85 | trigger_averaging_at_the_latest=2000 86 | max_training_eval_batches=20 87 | 88 | # Misc 89 | 90 | swap_memory=true 91 | 92 | source_lib "run.sh" "$@" 93 | -------------------------------------------------------------------------------- /lamb/experiment/continue.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | 24 | name="$2" 25 | config_file="$3/config" 26 | load_checkpoint="$3/last" 27 | source_lib "run.sh" "$1" 28 | -------------------------------------------------------------------------------- /lamb/experiment/mixture-of-softmaxes/train_awd_lstm_mos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | # This script reproduces the PTB results from "Breaking the Softmax Bottleneck: 19 | # A High-Rank RNN Language Model" (Zhilin Yang, Zihang Dai, Ruslan 20 | # Salakhutdinov, William W. Cohen) without fine-tuning or dynamic evaluation. 21 | # 22 | # Based on https://github.com/zihangdai/mos.
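# For reference (a sketch of the idea from the paper, not lamb's exact
# implementation): MoS replaces the single output softmax with a mixture of
# K softmaxes over per-component projections h_k of the context h,
#
#   p(w | h) = sum_k pi_k(h) * softmax(W h_k)[w],
#
# with mixture weights pi(h) given by another softmax. This makes the matrix
# of log-probabilities high-rank. K is mos_num_components=15 below.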
23 | 24 | set -e 25 | 26 | source "$(dirname $0)/../../lib/setup.sh" 27 | source_lib "config/common.sh" 28 | source_lib "config/running.sh" 29 | source_lib "config/ptb_word.sh" 30 | 31 | # Model 32 | 33 | share_input_and_output_embeddings=true 34 | input_embedding_size=280 35 | output_embedding_size=280 36 | cap_input_gate=false 37 | input_dropout=0.4 38 | embedding_dropout=0.1 39 | output_dropout=0.4 40 | downprojected_output_dropout=0.29 41 | shared_mask_dropout=true 42 | mos_num_components=15 43 | 44 | # Cell 45 | 46 | model="lstm" 47 | num_layers=3 48 | lstm_skip_connection=false 49 | hidden_size=960,960,620 50 | inter_layer_dropout=0.225 51 | state_dropout=0.5 52 | tie_forget_and_input_gates=false 53 | 54 | # Objective 55 | 56 | l2_penalty=8.4e-5 # 1.2e-6*70 57 | drop_state_probability=0.01 58 | 59 | # Initialization 60 | 61 | forget_bias=0.0 62 | 63 | # Schedule 64 | 65 | steps_per_turn=100 66 | print_training_stats_every_num_steps=100 67 | turns=8000 68 | 69 | # Optimizer 70 | 71 | # In the loss, the pytorch code (https://github.com/zihangdai/mos) averages all 72 | # log probabilities in the [batch_size, max_time_steps] matrix, while lamb sums 73 | # the log probabilities over time steps and averages only over the examples in 74 | # the batch. To compensate for that, max_grad_norm, learning_rate and l2_penalty 75 | # had to be adjusted. 76 | max_time_steps=70 77 | max_grad_norm=17.5 # 0.25*70 78 | optimizer_type="sgd" 79 | batch_size=12 80 | learning_rate=0.285 # ~20.0/70 81 | 82 | # Evaluation hyperparameters 83 | 84 | trigger_averaging_turns=50 85 | trigger_averaging_at_the_latest=2000 86 | max_training_eval_batches=20 87 | 88 | # Misc 89 | 90 | swap_memory=true 91 | 92 | source_lib "run.sh" "$@" 93 | -------------------------------------------------------------------------------- /lamb/experiment/mixture-of-softmaxes/tune_ptb_24m.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is for 19 | # illustration only. 20 | 21 | set -e 22 | 23 | # Include definitions of dataset- and tuning-related variables.
24 | source "$(dirname $0)/../../lib/setup.sh" "$@" 25 | source_lib "config/common.sh" 26 | source_lib "config/tuning.sh" 27 | source_lib "config/ptb_word.sh" 28 | 29 | # Model 30 | 31 | num_params=$(million 24) 32 | share_input_and_output_embeddings=true 33 | cap_input_gate=false 34 | shared_mask_dropout=true 35 | 36 | # Cell 37 | 38 | model="lstm" 39 | num_layers=3 40 | lstm_skip_connection=false 41 | tie_forget_and_input_gates=false 42 | 43 | # Objective 44 | 45 | drop_state_probability=0.01 46 | 47 | # Initialization 48 | 49 | forget_bias=0.0 50 | 51 | # Schedule 52 | 53 | steps_per_turn=100 54 | print_training_stats_every_num_steps=100 55 | turns=600 56 | 57 | # Optimizer 58 | 59 | # In the loss, the pytorch code (https://github.com/zihangdai/mos) averages all 60 | # log probabilities in the [batch_size, max_time_steps] matrix, while lamb sums 61 | # the log probabilities over time steps and averages only over the examples in 62 | # the batch. To compensate for that, max_grad_norm, learning_rate and l2_penalty 63 | # had to be adjusted. 64 | max_time_steps=70 65 | max_grad_norm=10.0 66 | trigger_averaging_turns=25 67 | trigger_averaging_at_the_latest=400 68 | 69 | # Early stopping 70 | 71 | early_stopping_turns=30 72 | early_stopping_worst_xe_target=4.4 73 | 74 | # Evaluation 75 | 76 | max_training_eval_batches=20 77 | eval_softmax_temperature=-0.8 78 | 79 | # Misc 80 | 81 | swap_memory=true 82 | 83 | # Tuning parameters 84 | 85 | num_workers=60 86 | 87 | # SGD 88 | optimizer_type="sgd" 89 | mos_num_components=0 90 | tuneables="batch_size,learning_rate,l2_penalty, 91 | token_dropout,input_dropout,inter_layer_dropout,state_dropout, 92 | output_dropout,downprojected_output_dropout,input_embedding_ratio" 93 | name="$(default_name)_${model}_d${num_layers}_asgd" 94 | source_lib "run.sh" "$@" 95 | 96 | # RMSPROP 97 | optimizer_type="rmsprop" 98 | mos_num_components=0 99 | tuneables="batch_size,learning_rate,l2_penalty, 100 | token_dropout,input_dropout,inter_layer_dropout,state_dropout, 101 | output_dropout,downprojected_output_dropout,input_embedding_ratio" 102 | name="$(default_name)_${model}_d${num_layers}_arms" 103 | source_lib "run.sh" "$@" 104 | 105 | # SGD, MoS 106 | optimizer_type="sgd" 107 | mos_num_components=15 108 | tuneables="batch_size,learning_rate,l2_penalty, 109 | token_dropout,input_dropout,inter_layer_dropout,state_dropout, 110 | output_dropout,downprojected_output_dropout,input_embedding_ratio" 111 | name="$(default_name)_${model}_d${num_layers}_asgd_mos${mos_num_components}" 112 | source_lib "run.sh" "$@" 113 | 114 | # RMSPROP, MoS 115 | optimizer_type="rmsprop" 116 | mos_num_components=15 117 | tuneables="batch_size,learning_rate,l2_penalty, 118 | token_dropout,input_dropout,inter_layer_dropout,state_dropout, 119 | output_dropout,downprojected_output_dropout,input_embedding_ratio" 120 | name="$(default_name)_${model}_d${num_layers}_arms_mos${mos_num_components}" 121 | source_lib "run.sh" "$@" 122 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/README.md: -------------------------------------------------------------------------------- 1 | This directory contains saved configuration files for tuned models from the 2 | [Mogrifier LSTM](https://arxiv.org/abs/1909.01792) paper. Model weights are not 3 | included. 4 | 5 | Don't forget to [set up the data](../../README.md). 
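The `_fm_` configurations enable mogrification via `feature_mask_rounds` and
`feature_mask_rank`, while `_d2`/`_d4` give the LSTM depth. As a rough
illustration (a minimal NumPy sketch of the update from the paper, not the
code in this repository; `q_mats`/`r_mats` stand for the learned, optionally
low-rank, projections):

    import numpy as np

    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    def mogrify(x, h, q_mats, r_mats, rounds):
        # Odd rounds gate the input x with the previous state h; even rounds
        # gate h with the freshly updated x. rounds=0 recovers a plain LSTM.
        for i in range(1, rounds + 1):
            if i % 2 == 1:
                x = 2.0 * sigmoid(q_mats[(i - 1) // 2] @ h) * x
            else:
                h = 2.0 * sigmoid(r_mats[i // 2 - 1] @ x) * h
        return x, h  # then fed to the ordinary LSTM update

With a positive `feature_mask_rank`, each projection is factored into two
low-rank matrices to save parameters.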
6 | 7 | For example, to train a Mogrifier LSTM with 24M parameters on PTB with tuned 8 | hyperparameters (see the paper above): 9 | 10 | ./train_ptb.sh run train-dir-name config/786252db3825+_tune_ptb_24m_lstm_fm_d2_arms/trial_483/config 11 | 12 | There are separate training scripts for other datasets. The `config` directory 13 | holds the best hyperparameters for various model and dataset combinations. The 14 | training will save the model in `./train-dir-name_`. To test the 15 | saved model: 16 | 17 | ../test.sh run test-dir-name ./train-dir-name_/ 18 | 19 | If training runs out of GPU memory, you may want to decrease `max_time_steps` 20 | (the BPTT window size), but don't expect to reproduce the results that way. 21 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/0393c7dc3532+_tune_ptb_char_24m_lstm_fm_d2_asgd_ts150/trial_596/config: -------------------------------------------------------------------------------- 1 | [ ('config_version', 5), 2 | # data 3 | ('file_encoding', 'utf-8'), 4 | ('word_based', True), 5 | ('episodic', False), 6 | # model 7 | ('num_params', 24000000), 8 | ('share_input_and_output_embeddings', False), 9 | ('input_embedding_size', -1), 10 | ('output_embedding_size', -1), 11 | ('input_embedding_ratio', 1.0), 12 | ('output_embedding_ratio', -1.0), 13 | ('mos_num_components', 0), 14 | ('token_dropout', 0.0), 15 | ('embedding_dropout', 0.0), 16 | ('input_dropout', 0.27860252841733274), 17 | ('output_dropout', 0.2347428361918374), 18 | ('downprojected_output_dropout', -1.0), 19 | ('shared_mask_dropout', False), 20 | ('embed_once', True), 21 | # cell 22 | ('model', 'lstm'), 23 | ('num_layers', 2), 24 | ('residual_connections', False), 25 | ('lstm_skip_connection', True), 26 | ('feature_mask_rounds', 4), 27 | ('feature_mask_rank', 24), 28 | ('feature_mask', False), 29 | ('sparsity_ratio', -1.0), 30 | ('overlay_rank', -1), 31 | ('hidden_size', [-1]), 32 | ('hidden_size_multiplier', 1.0), 33 | ('layer_norm', False), 34 | ('activation_fn', 'tf.tanh'), 35 | ('tie_forget_and_input_gates', False), 36 | ('cap_input_gate', True), 37 | ('trainable_initial_state', False), 38 | ('inter_layer_dropout', 0.0737609984911853), 39 | ('state_dropout', 0.17118611234551975), 40 | ('state_dropout_flip_rate', 0.0), 41 | ('update_dropout', 0.0), 42 | ('cell_clip', -1.0), 43 | # objective 44 | ('model_average', 'arithmetic'), 45 | ('num_training_samples', 1), 46 | ('l2_penalty', 0.00025558089199237096), 47 | ('l1_penalty', 0.0), 48 | ('activation_norm_penalty', 0.0), 49 | ('drop_state_probability', 0.01), 50 | # initialization 51 | ('embedding_init_factor', 1.0), 52 | ('scale_input_embeddings', False), 53 | ('cell_init_factor', 1.0), 54 | ('forget_bias', 1.0), 55 | ('output_init_factor', 1.0), 56 | # schedule 57 | ('steps_per_turn', 200), 58 | ('print_training_stats_every_num_steps', 200), 59 | ('turns', 500), 60 | # optimization 61 | ('optimizer_type', 'rmsprop'), 62 | ('rmsprop_beta2', 0.999), 63 | ('rmsprop_epsilon', 1e-08), 64 | ('adam_beta1', 0.9), 65 | ('adam_beta2', 0.999), 66 | ('adam_epsilon', 1e-08), 67 | ('batch_size', 64), 68 | ('accum_batch_size', -1), 69 | ('max_grad_norm', 10.0), 70 | ('max_time_steps', 150), 71 | ('trigger_averaging_turns', 25), 72 | ('trigger_averaging_at_the_latest', 400), 73 | # learning rate 74 | ('learning_rate', 0.003739598828019367), 75 | ('learning_rate_decay', 1.0), 76 | ('learning_rate_decay_burn_in_steps', 0), 77 | ('drop_learning_rate_turns', -1), 78 | 
('drop_learning_rate_multiplier', 1.0), 79 | ('drop_learning_rate_at_the_latest', -1), 80 | # early stopping 81 | ('early_stopping_turns', -1), 82 | ('early_stopping_rampup_turns', 0), 83 | ('early_stopping_worst_xe_target', ''), 84 | ('early_stopping_slowest_rate', 0.0), 85 | # cross-validation 86 | ('crossvalidate', False), 87 | ('crossvalidation_folds', 10), 88 | ('crossvalidation_rounds', 1), 89 | # evaluation 90 | ('max_training_eval_batches', 20), 91 | ('max_eval_eval_batches', -1), 92 | ('max_test_eval_batches', -1), 93 | ('min_non_episodic_eval_examples_per_stripe', 100), 94 | ('eval_on_test', False), 95 | ('eval_method', 'deterministic'), 96 | ('num_eval_samples', 0), 97 | ('eval_softmax_temperature', -0.8), 98 | ('eval_power_mean_power', 1.0), 99 | ('eval_dropout_multiplier', 1.0), 100 | # experiments 101 | # checkpoints 102 | ('save_checkpoints', True), 103 | # misc 104 | ('seed', 1), 105 | ('swap_memory', False), 106 | ('log_device_placement', False), 107 | ('summary_flush_secs', 120), 108 | ] 109 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/4d0a9a5bdb04+_tune_enwik8_48m_lstm_d4_arms/trial_400/config: -------------------------------------------------------------------------------- 1 | [ ('config_version', 5), 2 | # data 3 | ('file_encoding', 'CP437'), 4 | ('word_based', False), 5 | ('episodic', False), 6 | # model 7 | ('num_params', 48000000), 8 | ('share_input_and_output_embeddings', False), 9 | ('input_embedding_size', -1), 10 | ('output_embedding_size', -1), 11 | ('input_embedding_ratio', 1.0), 12 | ('output_embedding_ratio', -1.0), 13 | ('mos_num_components', 0), 14 | ('token_dropout', 0.0), 15 | ('embedding_dropout', 0.0), 16 | ('input_dropout', 0.16279440026790548), 17 | ('output_dropout', 0.13860156332143037), 18 | ('downprojected_output_dropout', -1.0), 19 | ('shared_mask_dropout', False), 20 | ('embed_once', True), 21 | # cell 22 | ('model', 'lstm'), 23 | ('num_layers', 4), 24 | ('residual_connections', False), 25 | ('lstm_skip_connection', True), 26 | ('feature_mask_rounds', 0), 27 | ('feature_mask_rank', 0), 28 | ('feature_mask', False), 29 | ('sparsity_ratio', -1.0), 30 | ('overlay_rank', -1), 31 | ('hidden_size', [-1]), 32 | ('hidden_size_multiplier', 1.0), 33 | ('layer_norm', False), 34 | ('activation_fn', 'tf.tanh'), 35 | ('tie_forget_and_input_gates', False), 36 | ('cap_input_gate', True), 37 | ('trainable_initial_state', False), 38 | ('inter_layer_dropout', 0.11949666753873665), 39 | ('state_dropout', 0.1036809388104279), 40 | ('state_dropout_flip_rate', 0.0), 41 | ('update_dropout', 0.0), 42 | ('cell_clip', -1.0), 43 | # objective 44 | ('model_average', 'arithmetic'), 45 | ('num_training_samples', 1), 46 | ('l2_penalty', 2.5181258956042348e-05), 47 | ('l1_penalty', 0.0), 48 | ('activation_norm_penalty', 0.0), 49 | ('drop_state_probability', 0.01), 50 | # initialization 51 | ('embedding_init_factor', 1.0), 52 | ('scale_input_embeddings', False), 53 | ('cell_init_factor', 1.0), 54 | ('forget_bias', 1.0), 55 | ('output_init_factor', 1.0), 56 | # schedule 57 | ('steps_per_turn', 400), 58 | ('print_training_stats_every_num_steps', 1000), 59 | ('turns', 100), 60 | # optimization 61 | ('optimizer_type', 'rmsprop'), 62 | ('rmsprop_beta2', 0.999), 63 | ('rmsprop_epsilon', 1e-08), 64 | ('adam_beta1', 0.9), 65 | ('adam_beta2', 0.999), 66 | ('adam_epsilon', 1e-08), 67 | ('batch_size', 128), 68 | ('accum_batch_size', -1), 69 | ('max_grad_norm', 10.0), 70 | ('max_time_steps', 500), 71 | 
('trigger_averaging_turns', 10), 72 | ('trigger_averaging_at_the_latest', 80), 73 | # learning rate 74 | ('learning_rate', 0.002516709293528533), 75 | ('learning_rate_decay', 1.0), 76 | ('learning_rate_decay_burn_in_steps', 0), 77 | ('drop_learning_rate_turns', -1), 78 | ('drop_learning_rate_multiplier', 1.0), 79 | ('drop_learning_rate_at_the_latest', -1), 80 | # early stopping 81 | ('early_stopping_turns', -1), 82 | ('early_stopping_rampup_turns', 0), 83 | ('early_stopping_worst_xe_target', ''), 84 | ('early_stopping_slowest_rate', 0.0), 85 | # cross-validation 86 | ('crossvalidate', False), 87 | ('crossvalidation_folds', 10), 88 | ('crossvalidation_rounds', 1), 89 | # evaluation 90 | ('max_training_eval_batches', 20), 91 | ('max_eval_eval_batches', -1), 92 | ('max_test_eval_batches', -1), 93 | ('min_non_episodic_eval_examples_per_stripe', 100), 94 | ('eval_on_test', False), 95 | ('eval_method', 'deterministic'), 96 | ('num_eval_samples', 0), 97 | ('eval_softmax_temperature', -0.8), 98 | ('eval_softmax_temperature_estimation_num_tokens', 50000), 99 | ('eval_power_mean_power', 1.0), 100 | ('eval_dropout_multiplier', 1.0), 101 | ('validation_prediction_file', ''), 102 | ('dyneval', False), 103 | ('dyneval_learning_rate', 0.001), 104 | ('dyneval_decay_rate', 0.02), 105 | ('dyneval_epsilon', 1e-05), 106 | # experiments 107 | # checkpoints 108 | ('save_checkpoints', True), 109 | # misc 110 | ('seed', 1), 111 | ('swap_memory', True), 112 | ('log_device_placement', False), 113 | ('summary_flush_secs', 120), 114 | ] 115 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/4d0a9a5bdb04+_tune_enwik8_48m_lstm_fm_d4_arms/trial_234/config: -------------------------------------------------------------------------------- 1 | [ ('config_version', 5), 2 | # data 3 | ('file_encoding', 'CP437'), 4 | ('word_based', False), 5 | ('episodic', False), 6 | # model 7 | ('num_params', 48000000), 8 | ('share_input_and_output_embeddings', False), 9 | ('input_embedding_size', -1), 10 | ('output_embedding_size', -1), 11 | ('input_embedding_ratio', 1.0), 12 | ('output_embedding_ratio', -1.0), 13 | ('mos_num_components', 0), 14 | ('token_dropout', 0.0), 15 | ('embedding_dropout', 0.0), 16 | ('input_dropout', 0.015496029112930465), 17 | ('output_dropout', 0.138307173174503), 18 | ('downprojected_output_dropout', -1.0), 19 | ('shared_mask_dropout', False), 20 | ('embed_once', True), 21 | # cell 22 | ('model', 'lstm'), 23 | ('num_layers', 4), 24 | ('residual_connections', False), 25 | ('lstm_skip_connection', True), 26 | ('feature_mask_rounds', 6), 27 | ('feature_mask_rank', 79), 28 | ('feature_mask', False), 29 | ('sparsity_ratio', -1.0), 30 | ('overlay_rank', -1), 31 | ('hidden_size', [-1]), 32 | ('hidden_size_multiplier', 1.0), 33 | ('layer_norm', False), 34 | ('activation_fn', 'tf.tanh'), 35 | ('tie_forget_and_input_gates', False), 36 | ('cap_input_gate', True), 37 | ('trainable_initial_state', False), 38 | ('inter_layer_dropout', 0.008633961431527571), 39 | ('state_dropout', 0.0437288219541186), 40 | ('state_dropout_flip_rate', 0.0), 41 | ('update_dropout', 0.0), 42 | ('cell_clip', -1.0), 43 | # objective 44 | ('model_average', 'arithmetic'), 45 | ('num_training_samples', 1), 46 | ('l2_penalty', 0.000993383826740019), 47 | ('l1_penalty', 0.0), 48 | ('activation_norm_penalty', 0.0), 49 | ('drop_state_probability', 0.01), 50 | # initialization 51 | ('embedding_init_factor', 1.0), 52 | ('scale_input_embeddings', False), 53 | ('cell_init_factor', 1.0), 54 | 
('forget_bias', 1.0), 55 | ('output_init_factor', 1.0), 56 | # schedule 57 | ('steps_per_turn', 400), 58 | ('print_training_stats_every_num_steps', 1000), 59 | ('turns', 100), 60 | # optimization 61 | ('optimizer_type', 'rmsprop'), 62 | ('rmsprop_beta2', 0.999), 63 | ('rmsprop_epsilon', 1e-08), 64 | ('adam_beta1', 0.9), 65 | ('adam_beta2', 0.999), 66 | ('adam_epsilon', 1e-08), 67 | ('batch_size', 128), 68 | ('accum_batch_size', -1), 69 | ('max_grad_norm', 10.0), 70 | ('max_time_steps', 500), 71 | ('trigger_averaging_turns', 10), 72 | ('trigger_averaging_at_the_latest', 80), 73 | # learning rate 74 | ('learning_rate', 0.001021423409385794), 75 | ('learning_rate_decay', 1.0), 76 | ('learning_rate_decay_burn_in_steps', 0), 77 | ('drop_learning_rate_turns', -1), 78 | ('drop_learning_rate_multiplier', 1.0), 79 | ('drop_learning_rate_at_the_latest', -1), 80 | # early stopping 81 | ('early_stopping_turns', -1), 82 | ('early_stopping_rampup_turns', 0), 83 | ('early_stopping_worst_xe_target', ''), 84 | ('early_stopping_slowest_rate', 0.0), 85 | # cross-validation 86 | ('crossvalidate', False), 87 | ('crossvalidation_folds', 10), 88 | ('crossvalidation_rounds', 1), 89 | # evaluation 90 | ('max_training_eval_batches', 20), 91 | ('max_eval_eval_batches', -1), 92 | ('max_test_eval_batches', -1), 93 | ('min_non_episodic_eval_examples_per_stripe', 100), 94 | ('eval_on_test', False), 95 | ('eval_method', 'deterministic'), 96 | ('num_eval_samples', 0), 97 | ('eval_softmax_temperature', -0.8), 98 | ('eval_softmax_temperature_estimation_num_tokens', 50000), 99 | ('eval_power_mean_power', 1.0), 100 | ('eval_dropout_multiplier', 1.0), 101 | ('validation_prediction_file', ''), 102 | ('dyneval', False), 103 | ('dyneval_learning_rate', 0.001), 104 | ('dyneval_decay_rate', 0.02), 105 | ('dyneval_epsilon', 1e-05), 106 | # experiments 107 | # checkpoints 108 | ('save_checkpoints', True), 109 | # misc 110 | ('seed', 1), 111 | ('swap_memory', True), 112 | ('log_device_placement', False), 113 | ('summary_flush_secs', 120), 114 | ] 115 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/558aa30c0b15+_tune_mwc_fi_24m_lstm_d2_arms/trial_758/config: -------------------------------------------------------------------------------- 1 | 2 | [ ('config_version', 5), 3 | # data 4 | ('conditioning_separator', ''), 5 | ('file_encoding', 'utf-8'), 6 | ('word_based', False), 7 | ('episodic', False), 8 | # model 9 | ('num_params', 24000000), 10 | ('share_input_and_output_embeddings', False), 11 | ('input_embedding_size', -1), 12 | ('output_embedding_size', -1), 13 | ('input_embedding_ratio', 1.5665444454253725), 14 | ('output_embedding_ratio', -1.0), 15 | ('mos_num_components', 0), 16 | ('token_dropout', 0.0), 17 | ('embedding_dropout', 0.0), 18 | ('input_dropout', 0.0004278254540998817), 19 | ('output_dropout', 0.21672999424789158), 20 | ('downprojected_output_dropout', -1.0), 21 | ('shared_mask_dropout', False), 22 | ('embed_once', True), 23 | ('output_once', True), 24 | # cell 25 | ('model', 'lstm'), 26 | ('num_layers', 2), 27 | ('residual_connections', False), 28 | ('lstm_skip_connection', True), 29 | ('feature_mask_rounds', 0), 30 | ('feature_mask_rank', 0), 31 | ('feature_mask', False), 32 | ('sparsity_ratio', -1.0), 33 | ('overlay_rank', -1), 34 | ('hidden_size', [-1]), 35 | ('hidden_size_multiplier', 1.0), 36 | ('layer_norm', False), 37 | ('activation_fn', 'tf.tanh'), 38 | ('tie_forget_and_input_gates', False), 39 | ('cap_input_gate', True), 40 | 
('trainable_initial_state', False), 41 | ('inter_layer_dropout', 0.03679207573249842), 42 | ('state_dropout', 0.15784488790163897), 43 | ('state_dropout_flip_rate', 0.0), 44 | ('update_dropout', 0.0), 45 | ('cell_clip', -1.0), 46 | # objective 47 | ('model_average', 'arithmetic'), 48 | ('num_training_samples', 1), 49 | ('l2_penalty', 3.35903544036833e-05), 50 | ('l1_penalty', 0.0), 51 | ('activation_norm_penalty', 0.0), 52 | ('drop_state_probability', 0.01), 53 | # initialization 54 | ('embedding_init_factor', 1.0), 55 | ('scale_input_embeddings', False), 56 | ('cell_init_factor', 1.0), 57 | ('forget_bias', 1.0), 58 | ('output_init_factor', 1.0), 59 | # schedule 60 | ('steps_per_turn', 200), 61 | ('print_training_stats_every_num_steps', 200), 62 | ('turns', 500), 63 | # optimization 64 | ('optimizer_type', 'rmsprop'), 65 | ('rmsprop_beta2', 0.999), 66 | ('rmsprop_epsilon', 1e-08), 67 | ('adam_beta1', 0.9), 68 | ('adam_beta2', 0.999), 69 | ('adam_epsilon', 1e-08), 70 | ('batch_size', 64), 71 | ('accum_batch_size', -1), 72 | ('max_grad_norm', 10.0), 73 | ('max_time_steps', 150), 74 | ('trigger_averaging_turns', 25), 75 | ('trigger_averaging_at_the_latest', 400), 76 | # learning rate 77 | ('learning_rate', 0.0038728221226125496), 78 | ('learning_rate_decay', 1.0), 79 | ('learning_rate_decay_burn_in_steps', 0), 80 | ('drop_learning_rate_turns', -1), 81 | ('drop_learning_rate_multiplier', 1.0), 82 | ('drop_learning_rate_at_the_latest', -1), 83 | # early stopping 84 | ('early_stopping_turns', -1), 85 | ('early_stopping_rampup_turns', 0), 86 | ('early_stopping_worst_xe_target', ''), 87 | ('early_stopping_slowest_rate', 0.0), 88 | # cross-validation 89 | ('crossvalidate', False), 90 | ('crossvalidation_folds', 10), 91 | ('crossvalidation_rounds', 1), 92 | # evaluation 93 | ('max_training_eval_batches', 20), 94 | ('max_eval_eval_batches', -1), 95 | ('max_test_eval_batches', -1), 96 | ('min_non_episodic_eval_examples_per_stripe', 100), 97 | ('eval_on_test', False), 98 | ('eval_method', 'deterministic'), 99 | ('num_eval_samples', 0), 100 | ('eval_softmax_temperature', -0.8), 101 | ('eval_softmax_temperature_estimation_num_tokens', 50000), 102 | ('eval_power_mean_power', 1.0), 103 | ('eval_dropout_multiplier', 1.0), 104 | ('validation_prediction_file', ''), 105 | ('dyneval', False), 106 | ('dyneval_learning_rate', 0.001), 107 | ('dyneval_decay_rate', 0.02), 108 | ('dyneval_epsilon', 1e-05), 109 | # experiments 110 | # checkpoints 111 | ('save_checkpoints', True), 112 | # misc 113 | ('seed', 1), 114 | ('swap_memory', True), 115 | ('log_device_placement', False), 116 | ('summary_flush_secs', 120), 117 | ] 118 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/558aa30c0b15+_tune_mwc_fi_24m_lstm_fm_d2_arms/trial_371/config: -------------------------------------------------------------------------------- 1 | [ ('config_version', 5), 2 | # data 3 | ('conditioning_separator', ''), 4 | ('file_encoding', 'utf-8'), 5 | ('word_based', False), 6 | ('episodic', False), 7 | # model 8 | ('num_params', 24000000), 9 | ('share_input_and_output_embeddings', False), 10 | ('input_embedding_size', -1), 11 | ('output_embedding_size', -1), 12 | ('input_embedding_ratio', 1.3016032871831578), 13 | ('output_embedding_ratio', -1.0), 14 | ('mos_num_components', 0), 15 | ('token_dropout', 0.0), 16 | ('embedding_dropout', 0.0), 17 | ('input_dropout', 0.06995541397794428), 18 | ('output_dropout', 0.22149685667402097), 19 | ('downprojected_output_dropout', -1.0), 20 | 
('shared_mask_dropout', False), 21 | ('embed_once', True), 22 | ('output_once', True), 23 | # cell 24 | ('model', 'lstm'), 25 | ('num_layers', 2), 26 | ('residual_connections', False), 27 | ('lstm_skip_connection', True), 28 | ('feature_mask_rounds', 5), 29 | ('feature_mask_rank', 100), 30 | ('feature_mask', False), 31 | ('sparsity_ratio', -1.0), 32 | ('overlay_rank', -1), 33 | ('hidden_size', [-1]), 34 | ('hidden_size_multiplier', 1.0), 35 | ('layer_norm', False), 36 | ('activation_fn', 'tf.tanh'), 37 | ('tie_forget_and_input_gates', False), 38 | ('cap_input_gate', True), 39 | ('trainable_initial_state', False), 40 | ('inter_layer_dropout', 0.11571939622760244), 41 | ('state_dropout', 0.1759160317735942), 42 | ('state_dropout_flip_rate', 0.0), 43 | ('update_dropout', 0.0), 44 | ('cell_clip', -1.0), 45 | # objective 46 | ('model_average', 'arithmetic'), 47 | ('num_training_samples', 1), 48 | ('l2_penalty', 9.607977185924193e-05), 49 | ('l1_penalty', 0.0), 50 | ('activation_norm_penalty', 0.0), 51 | ('drop_state_probability', 0.01), 52 | # initialization 53 | ('embedding_init_factor', 1.0), 54 | ('scale_input_embeddings', False), 55 | ('cell_init_factor', 1.0), 56 | ('forget_bias', 1.0), 57 | ('output_init_factor', 1.0), 58 | # schedule 59 | ('steps_per_turn', 200), 60 | ('print_training_stats_every_num_steps', 200), 61 | ('turns', 500), 62 | # optimization 63 | ('optimizer_type', 'rmsprop'), 64 | ('rmsprop_beta2', 0.999), 65 | ('rmsprop_epsilon', 1e-08), 66 | ('adam_beta1', 0.9), 67 | ('adam_beta2', 0.999), 68 | ('adam_epsilon', 1e-08), 69 | ('batch_size', 64), 70 | ('accum_batch_size', -1), 71 | ('max_grad_norm', 10.0), 72 | ('max_time_steps', 150), 73 | ('trigger_averaging_turns', 25), 74 | ('trigger_averaging_at_the_latest', 400), 75 | # learning rate 76 | ('learning_rate', 0.001999992683987708), 77 | ('learning_rate_decay', 1.0), 78 | ('learning_rate_decay_burn_in_steps', 0), 79 | ('drop_learning_rate_turns', -1), 80 | ('drop_learning_rate_multiplier', 1.0), 81 | ('drop_learning_rate_at_the_latest', -1), 82 | # early stopping 83 | ('early_stopping_turns', -1), 84 | ('early_stopping_rampup_turns', 0), 85 | ('early_stopping_worst_xe_target', ''), 86 | ('early_stopping_slowest_rate', 0.0), 87 | # cross-validation 88 | ('crossvalidate', False), 89 | ('crossvalidation_folds', 10), 90 | ('crossvalidation_rounds', 1), 91 | # evaluation 92 | ('max_training_eval_batches', 20), 93 | ('max_eval_eval_batches', -1), 94 | ('max_test_eval_batches', -1), 95 | ('min_non_episodic_eval_examples_per_stripe', 100), 96 | ('eval_on_test', False), 97 | ('eval_method', 'deterministic'), 98 | ('num_eval_samples', 0), 99 | ('eval_softmax_temperature', -0.8), 100 | ('eval_softmax_temperature_estimation_num_tokens', 50000), 101 | ('eval_power_mean_power', 1.0), 102 | ('eval_dropout_multiplier', 1.0), 103 | ('validation_prediction_file', ''), 104 | ('dyneval', False), 105 | ('dyneval_learning_rate', 0.001), 106 | ('dyneval_decay_rate', 0.02), 107 | ('dyneval_epsilon', 1e-05), 108 | # experiments 109 | # checkpoints 110 | ('save_checkpoints', True), 111 | # misc 112 | ('seed', 1), 113 | ('swap_memory', True), 114 | ('log_device_placement', False), 115 | ('summary_flush_secs', 120), 116 | ] 117 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/786252db3825+_tune_ptb_24m_lstm_d2_arms/trial_833/config: -------------------------------------------------------------------------------- 1 | [ ('config_version', 5), 2 | # data 3 | ('file_encoding', 'utf-8'), 
4 | ('word_based', True), 5 | ('episodic', False), 6 | # model 7 | ('num_params', 24000000), 8 | ('share_input_and_output_embeddings', True), 9 | ('input_embedding_size', -1), 10 | ('output_embedding_size', -1), 11 | ('input_embedding_ratio', 1.0), 12 | ('output_embedding_ratio', -1.0), 13 | ('mos_num_components', 0), 14 | ('token_dropout', 0.0), 15 | ('embedding_dropout', 0.0), 16 | ('input_dropout', 0.6275626425150355), 17 | ('output_dropout', 0.6901712653612706), 18 | ('downprojected_output_dropout', -1.0), 19 | ('shared_mask_dropout', False), 20 | ('embed_once', True), 21 | # cell 22 | ('model', 'lstm'), 23 | ('num_layers', 2), 24 | ('residual_connections', False), 25 | ('lstm_skip_connection', True), 26 | ('feature_mask_rounds', 0), 27 | ('feature_mask_rank', 0), 28 | ('feature_mask', False), 29 | ('sparsity_ratio', -1.0), 30 | ('overlay_rank', -1), 31 | ('hidden_size', [-1]), 32 | ('hidden_size_multiplier', 1.0), 33 | ('layer_norm', False), 34 | ('activation_fn', 'tf.tanh'), 35 | ('tie_forget_and_input_gates', False), 36 | ('cap_input_gate', True), 37 | ('trainable_initial_state', False), 38 | ('inter_layer_dropout', 0.3069926535017156), 39 | ('state_dropout', 0.3692225400980858), 40 | ('state_dropout_flip_rate', 0.0), 41 | ('update_dropout', 0.0), 42 | ('cell_clip', -1.0), 43 | # objective 44 | ('model_average', 'arithmetic'), 45 | ('num_training_samples', 1), 46 | ('l2_penalty', 0.00024908138497223704), 47 | ('l1_penalty', 0.0), 48 | ('activation_norm_penalty', 0.0), 49 | ('drop_state_probability', 0.01), 50 | # initialization 51 | ('embedding_init_factor', 1.0), 52 | ('scale_input_embeddings', False), 53 | ('cell_init_factor', 1.0), 54 | ('forget_bias', 1.0), 55 | ('output_init_factor', 1.0), 56 | # schedule 57 | ('steps_per_turn', 200), 58 | ('print_training_stats_every_num_steps', 200), 59 | ('turns', 1000), 60 | # optimization 61 | ('optimizer_type', 'rmsprop'), 62 | ('rmsprop_beta2', 0.999), 63 | ('rmsprop_epsilon', 1e-08), 64 | ('adam_beta1', 0.9), 65 | ('adam_beta2', 0.999), 66 | ('adam_epsilon', 1e-08), 67 | ('batch_size', 64), 68 | ('accum_batch_size', -1), 69 | ('max_grad_norm', 10.0), 70 | ('max_time_steps', 70), 71 | ('trigger_averaging_turns', 50), 72 | ('trigger_averaging_at_the_latest', 800), 73 | # learning rate 74 | ('learning_rate', 0.0030369099569192135), 75 | ('learning_rate_decay', 1.0), 76 | ('learning_rate_decay_burn_in_steps', 0), 77 | ('drop_learning_rate_turns', -1), 78 | ('drop_learning_rate_multiplier', 1.0), 79 | ('drop_learning_rate_at_the_latest', -1), 80 | # early stopping 81 | ('early_stopping_turns', -1), 82 | ('early_stopping_rampup_turns', 0), 83 | ('early_stopping_worst_xe_target', ''), 84 | ('early_stopping_slowest_rate', 0.0), 85 | # cross-validation 86 | ('crossvalidate', False), 87 | ('crossvalidation_folds', 10), 88 | ('crossvalidation_rounds', 1), 89 | # evaluation 90 | ('max_training_eval_batches', 20), 91 | ('max_eval_eval_batches', -1), 92 | ('max_test_eval_batches', -1), 93 | ('min_non_episodic_eval_examples_per_stripe', 100), 94 | ('eval_on_test', False), 95 | ('eval_method', 'deterministic'), 96 | ('num_eval_samples', 0), 97 | ('eval_softmax_temperature', -0.8), 98 | ('eval_softmax_temperature_estimation_num_tokens', 50000), 99 | ('eval_power_mean_power', 1.0), 100 | ('eval_dropout_multiplier', 1.0), 101 | ('validation_prediction_file', ''), 102 | ('dyneval', False), 103 | ('dyneval_learning_rate', 0.001), 104 | ('dyneval_decay_rate', 0.02), 105 | ('dyneval_epsilon', 1e-05), 106 | # experiments 107 | # checkpoints 108 | 
('save_checkpoints', True), 109 | # misc 110 | ('seed', 1), 111 | ('swap_memory', False), 112 | ('log_device_placement', False), 113 | ('summary_flush_secs', 120), 114 | ] 115 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/786252db3825+_tune_ptb_24m_lstm_fm_d2_arms/trial_483/config: -------------------------------------------------------------------------------- 1 | [ ('config_version', 5), 2 | # data 3 | ('file_encoding', 'utf-8'), 4 | ('word_based', True), 5 | ('episodic', False), 6 | # model 7 | ('num_params', 24000000), 8 | ('share_input_and_output_embeddings', True), 9 | ('input_embedding_size', -1), 10 | ('output_embedding_size', -1), 11 | ('input_embedding_ratio', 1.0), 12 | ('output_embedding_ratio', -1.0), 13 | ('mos_num_components', 0), 14 | ('token_dropout', 0.0), 15 | ('embedding_dropout', 0.0), 16 | ('input_dropout', 0.7290787773167251), 17 | ('output_dropout', 0.7156690388448465), 18 | ('downprojected_output_dropout', -1.0), 19 | ('shared_mask_dropout', False), 20 | ('embed_once', True), 21 | # cell 22 | ('model', 'lstm'), 23 | ('num_layers', 2), 24 | ('residual_connections', False), 25 | ('lstm_skip_connection', True), 26 | ('feature_mask_rounds', 5), 27 | ('feature_mask_rank', 84), 28 | ('feature_mask', False), 29 | ('sparsity_ratio', -1.0), 30 | ('overlay_rank', -1), 31 | ('hidden_size', [-1]), 32 | ('hidden_size_multiplier', 1.0), 33 | ('layer_norm', False), 34 | ('activation_fn', 'tf.tanh'), 35 | ('tie_forget_and_input_gates', False), 36 | ('cap_input_gate', True), 37 | ('trainable_initial_state', False), 38 | ('inter_layer_dropout', 0.2909822365241189), 39 | ('state_dropout', 0.38729439899832296), 40 | ('state_dropout_flip_rate', 0.0), 41 | ('update_dropout', 0.0), 42 | ('cell_clip', -1.0), 43 | # objective 44 | ('model_average', 'arithmetic'), 45 | ('num_training_samples', 1), 46 | ('l2_penalty', 0.00025235335778471014), 47 | ('l1_penalty', 0.0), 48 | ('activation_norm_penalty', 0.0), 49 | ('drop_state_probability', 0.01), 50 | # initialization 51 | ('embedding_init_factor', 1.0), 52 | ('scale_input_embeddings', False), 53 | ('cell_init_factor', 1.0), 54 | ('forget_bias', 1.0), 55 | ('output_init_factor', 1.0), 56 | # schedule 57 | ('steps_per_turn', 200), 58 | ('print_training_stats_every_num_steps', 200), 59 | ('turns', 1000), 60 | # optimization 61 | ('optimizer_type', 'rmsprop'), 62 | ('rmsprop_beta2', 0.999), 63 | ('rmsprop_epsilon', 1e-08), 64 | ('adam_beta1', 0.9), 65 | ('adam_beta2', 0.999), 66 | ('adam_epsilon', 1e-08), 67 | ('batch_size', 64), 68 | ('accum_batch_size', -1), 69 | ('max_grad_norm', 10.0), 70 | ('max_time_steps', 70), 71 | ('trigger_averaging_turns', 50), 72 | ('trigger_averaging_at_the_latest', 800), 73 | # learning rate 74 | ('learning_rate', 0.002299987130225388), 75 | ('learning_rate_decay', 1.0), 76 | ('learning_rate_decay_burn_in_steps', 0), 77 | ('drop_learning_rate_turns', -1), 78 | ('drop_learning_rate_multiplier', 1.0), 79 | ('drop_learning_rate_at_the_latest', -1), 80 | # early stopping 81 | ('early_stopping_turns', -1), 82 | ('early_stopping_rampup_turns', 0), 83 | ('early_stopping_worst_xe_target', ''), 84 | ('early_stopping_slowest_rate', 0.0), 85 | # cross-validation 86 | ('crossvalidate', False), 87 | ('crossvalidation_folds', 10), 88 | ('crossvalidation_rounds', 1), 89 | # evaluation 90 | ('max_training_eval_batches', 20), 91 | ('max_eval_eval_batches', -1), 92 | ('max_test_eval_batches', -1), 93 | ('min_non_episodic_eval_examples_per_stripe', 100), 94 | 
('eval_on_test', False), 95 | ('eval_method', 'deterministic'), 96 | ('num_eval_samples', 0), 97 | ('eval_softmax_temperature', -0.8), 98 | ('eval_softmax_temperature_estimation_num_tokens', 50000), 99 | ('eval_power_mean_power', 1.0), 100 | ('eval_dropout_multiplier', 1.0), 101 | ('validation_prediction_file', ''), 102 | ('dyneval', False), 103 | ('dyneval_learning_rate', 0.001), 104 | ('dyneval_decay_rate', 0.02), 105 | ('dyneval_epsilon', 1e-05), 106 | # experiments 107 | # checkpoints 108 | ('save_checkpoints', True), 109 | # misc 110 | ('seed', 1), 111 | ('swap_memory', False), 112 | ('log_device_placement', False), 113 | ('summary_flush_secs', 120), 114 | ] 115 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/9e20581d3dad+_tune_mwc_en_24m_lstm_d2_arms/trial_502/config: -------------------------------------------------------------------------------- 1 | [ ('config_version', 5), 2 | # data 3 | ('file_encoding', 'utf-8'), 4 | ('word_based', False), 5 | ('episodic', False), 6 | # model 7 | ('num_params', 24000000), 8 | ('share_input_and_output_embeddings', False), 9 | ('input_embedding_size', -1), 10 | ('output_embedding_size', -1), 11 | ('input_embedding_ratio', 0.16763290107221795), 12 | ('output_embedding_ratio', -1.0), 13 | ('mos_num_components', 0), 14 | ('token_dropout', 0.0), 15 | ('embedding_dropout', 0.0), 16 | ('input_dropout', 0.24803406411000273), 17 | ('output_dropout', 0.06200886700243824), 18 | ('downprojected_output_dropout', -1.0), 19 | ('shared_mask_dropout', False), 20 | ('embed_once', True), 21 | # cell 22 | ('model', 'lstm'), 23 | ('num_layers', 2), 24 | ('residual_connections', False), 25 | ('lstm_skip_connection', True), 26 | ('feature_mask_rounds', 0), 27 | ('feature_mask_rank', 0), 28 | ('feature_mask', False), 29 | ('sparsity_ratio', -1.0), 30 | ('overlay_rank', -1), 31 | ('hidden_size', [-1]), 32 | ('hidden_size_multiplier', 1.0), 33 | ('layer_norm', False), 34 | ('activation_fn', 'tf.tanh'), 35 | ('tie_forget_and_input_gates', False), 36 | ('cap_input_gate', True), 37 | ('trainable_initial_state', False), 38 | ('inter_layer_dropout', 0.04740148103981923), 39 | ('state_dropout', 0.046954638037220955), 40 | ('state_dropout_flip_rate', 0.0), 41 | ('update_dropout', 0.0), 42 | ('cell_clip', -1.0), 43 | # objective 44 | ('model_average', 'arithmetic'), 45 | ('num_training_samples', 1), 46 | ('l2_penalty', 7.825277510671981e-06), 47 | ('l1_penalty', 0.0), 48 | ('activation_norm_penalty', 0.0), 49 | ('drop_state_probability', 0.01), 50 | # initialization 51 | ('embedding_init_factor', 1.0), 52 | ('scale_input_embeddings', False), 53 | ('cell_init_factor', 1.0), 54 | ('forget_bias', 1.0), 55 | ('output_init_factor', 1.0), 56 | # schedule 57 | ('steps_per_turn', 200), 58 | ('print_training_stats_every_num_steps', 200), 59 | ('turns', 500), 60 | # optimization 61 | ('optimizer_type', 'rmsprop'), 62 | ('rmsprop_beta2', 0.999), 63 | ('rmsprop_epsilon', 1e-08), 64 | ('adam_beta1', 0.9), 65 | ('adam_beta2', 0.999), 66 | ('adam_epsilon', 1e-08), 67 | ('batch_size', 64), 68 | ('accum_batch_size', -1), 69 | ('max_grad_norm', 10.0), 70 | ('max_time_steps', 150), 71 | ('trigger_averaging_turns', 25), 72 | ('trigger_averaging_at_the_latest', 400), 73 | # learning rate 74 | ('learning_rate', 0.0038051220647221428), 75 | ('learning_rate_decay', 1.0), 76 | ('learning_rate_decay_burn_in_steps', 0), 77 | ('drop_learning_rate_turns', -1), 78 | ('drop_learning_rate_multiplier', 1.0), 79 | 
('drop_learning_rate_at_the_latest', -1), 80 | # early stopping 81 | ('early_stopping_turns', -1), 82 | ('early_stopping_rampup_turns', 0), 83 | ('early_stopping_worst_xe_target', ''), 84 | ('early_stopping_slowest_rate', 0.0), 85 | # cross-validation 86 | ('crossvalidate', False), 87 | ('crossvalidation_folds', 10), 88 | ('crossvalidation_rounds', 1), 89 | # evaluation 90 | ('max_training_eval_batches', 20), 91 | ('max_eval_eval_batches', -1), 92 | ('max_test_eval_batches', -1), 93 | ('min_non_episodic_eval_examples_per_stripe', 100), 94 | ('eval_on_test', False), 95 | ('eval_method', 'deterministic'), 96 | ('num_eval_samples', 0), 97 | ('eval_softmax_temperature', -0.8), 98 | ('eval_softmax_temperature_estimation_num_tokens', 50000), 99 | ('eval_power_mean_power', 1.0), 100 | ('eval_dropout_multiplier', 1.0), 101 | ('validation_prediction_file', ''), 102 | ('dyneval', False), 103 | ('dyneval_learning_rate', 0.001), 104 | ('dyneval_decay_rate', 0.02), 105 | ('dyneval_epsilon', 1e-05), 106 | # experiments 107 | # checkpoints 108 | ('save_checkpoints', True), 109 | # misc 110 | ('seed', 1), 111 | ('swap_memory', True), 112 | ('log_device_placement', False), 113 | ('summary_flush_secs', 120), 114 | ] 115 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/9e20581d3dad+_tune_mwc_en_24m_lstm_fm_d2_arms/trial_422/config: -------------------------------------------------------------------------------- 1 | [ ('config_version', 5), 2 | # data 3 | ('file_encoding', 'utf-8'), 4 | ('word_based', False), 5 | ('episodic', False), 6 | # model 7 | ('num_params', 24000000), 8 | ('share_input_and_output_embeddings', False), 9 | ('input_embedding_size', -1), 10 | ('output_embedding_size', -1), 11 | ('input_embedding_ratio', 0.48783057795681084), 12 | ('output_embedding_ratio', -1.0), 13 | ('mos_num_components', 0), 14 | ('token_dropout', 0.0), 15 | ('embedding_dropout', 0.0), 16 | ('input_dropout', 0.22379479910762798), 17 | ('output_dropout', 0.005212299871888891), 18 | ('downprojected_output_dropout', -1.0), 19 | ('shared_mask_dropout', False), 20 | ('embed_once', True), 21 | # cell 22 | ('model', 'lstm'), 23 | ('num_layers', 2), 24 | ('residual_connections', False), 25 | ('lstm_skip_connection', True), 26 | ('feature_mask_rounds', 6), 27 | ('feature_mask_rank', 78), 28 | ('feature_mask', False), 29 | ('sparsity_ratio', -1.0), 30 | ('overlay_rank', -1), 31 | ('hidden_size', [-1]), 32 | ('hidden_size_multiplier', 1.0), 33 | ('layer_norm', False), 34 | ('activation_fn', 'tf.tanh'), 35 | ('tie_forget_and_input_gates', False), 36 | ('cap_input_gate', True), 37 | ('trainable_initial_state', False), 38 | ('inter_layer_dropout', 0.08779703173530118), 39 | ('state_dropout', 0.09548532162445378), 40 | ('state_dropout_flip_rate', 0.0), 41 | ('update_dropout', 0.0), 42 | ('cell_clip', -1.0), 43 | # objective 44 | ('model_average', 'arithmetic'), 45 | ('num_training_samples', 1), 46 | ('l2_penalty', 9.245434142118616e-05), 47 | ('l1_penalty', 0.0), 48 | ('activation_norm_penalty', 0.0), 49 | ('drop_state_probability', 0.01), 50 | # initialization 51 | ('embedding_init_factor', 1.0), 52 | ('scale_input_embeddings', False), 53 | ('cell_init_factor', 1.0), 54 | ('forget_bias', 1.0), 55 | ('output_init_factor', 1.0), 56 | # schedule 57 | ('steps_per_turn', 200), 58 | ('print_training_stats_every_num_steps', 200), 59 | ('turns', 500), 60 | # optimization 61 | ('optimizer_type', 'rmsprop'), 62 | ('rmsprop_beta2', 0.999), 63 | ('rmsprop_epsilon', 1e-08), 64 | 
('adam_beta1', 0.9), 65 | ('adam_beta2', 0.999), 66 | ('adam_epsilon', 1e-08), 67 | ('batch_size', 64), 68 | ('accum_batch_size', -1), 69 | ('max_grad_norm', 10.0), 70 | ('max_time_steps', 150), 71 | ('trigger_averaging_turns', 25), 72 | ('trigger_averaging_at_the_latest', 400), 73 | # learning rate 74 | ('learning_rate', 0.0014344414472614946), 75 | ('learning_rate_decay', 1.0), 76 | ('learning_rate_decay_burn_in_steps', 0), 77 | ('drop_learning_rate_turns', -1), 78 | ('drop_learning_rate_multiplier', 1.0), 79 | ('drop_learning_rate_at_the_latest', -1), 80 | # early stopping 81 | ('early_stopping_turns', -1), 82 | ('early_stopping_rampup_turns', 0), 83 | ('early_stopping_worst_xe_target', ''), 84 | ('early_stopping_slowest_rate', 0.0), 85 | # cross-validation 86 | ('crossvalidate', False), 87 | ('crossvalidation_folds', 10), 88 | ('crossvalidation_rounds', 1), 89 | # evaluation 90 | ('max_training_eval_batches', 20), 91 | ('max_eval_eval_batches', -1), 92 | ('max_test_eval_batches', -1), 93 | ('min_non_episodic_eval_examples_per_stripe', 100), 94 | ('eval_on_test', False), 95 | ('eval_method', 'deterministic'), 96 | ('num_eval_samples', 0), 97 | ('eval_softmax_temperature', -0.8), 98 | ('eval_softmax_temperature_estimation_num_tokens', 50000), 99 | ('eval_power_mean_power', 1.0), 100 | ('eval_dropout_multiplier', 1.0), 101 | ('validation_prediction_file', ''), 102 | ('dyneval', False), 103 | ('dyneval_learning_rate', 0.001), 104 | ('dyneval_decay_rate', 0.02), 105 | ('dyneval_epsilon', 1e-05), 106 | # experiments 107 | # checkpoints 108 | ('save_checkpoints', True), 109 | # misc 110 | ('seed', 1), 111 | ('swap_memory', True), 112 | ('log_device_placement', False), 113 | ('summary_flush_secs', 120), 114 | ] 115 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/c51c838b33a5+_tune_wikitext-2_35m_lstm_mos2_d2_arms/trial_763/config: -------------------------------------------------------------------------------- 1 | [ ('config_version', 5), 2 | # data 3 | ('file_encoding', 'utf-8'), 4 | ('word_based', True), 5 | ('episodic', False), 6 | # model 7 | ('num_params', 35000000), 8 | ('share_input_and_output_embeddings', True), 9 | ('input_embedding_size', -1), 10 | ('output_embedding_size', -1), 11 | ('input_embedding_ratio', 0.3530770457779424), 12 | ('output_embedding_ratio', -1.0), 13 | ('mos_num_components', 2), 14 | ('token_dropout', 0.0), 15 | ('embedding_dropout', 0.0), 16 | ('input_dropout', 0.6090979517941943), 17 | ('output_dropout', 0.34845530389157287), 18 | ('downprojected_output_dropout', -1.0), 19 | ('shared_mask_dropout', False), 20 | ('embed_once', True), 21 | # cell 22 | ('model', 'lstm'), 23 | ('num_layers', 2), 24 | ('residual_connections', False), 25 | ('lstm_skip_connection', True), 26 | ('feature_mask_rounds', 0), 27 | ('feature_mask_rank', 0), 28 | ('feature_mask', False), 29 | ('sparsity_ratio', -1.0), 30 | ('overlay_rank', -1), 31 | ('hidden_size', [-1]), 32 | ('hidden_size_multiplier', 1.0), 33 | ('layer_norm', False), 34 | ('activation_fn', 'tf.tanh'), 35 | ('tie_forget_and_input_gates', False), 36 | ('cap_input_gate', True), 37 | ('trainable_initial_state', False), 38 | ('inter_layer_dropout', 0.09075401405970591), 39 | ('state_dropout', 0.2714030562283111), 40 | ('state_dropout_flip_rate', 0.0), 41 | ('update_dropout', 0.0), 42 | ('cell_clip', -1.0), 43 | # objective 44 | ('model_average', 'arithmetic'), 45 | ('num_training_samples', 1), 46 | ('l2_penalty', 0.00023063627783021125), 47 | ('l1_penalty', 
0.0), 48 | ('activation_norm_penalty', 0.0), 49 | ('drop_state_probability', 0.01), 50 | # initialization 51 | ('embedding_init_factor', 1.0), 52 | ('scale_input_embeddings', False), 53 | ('cell_init_factor', 1.0), 54 | ('forget_bias', 1.0), 55 | ('output_init_factor', 1.0), 56 | # schedule 57 | ('steps_per_turn', 200), 58 | ('print_training_stats_every_num_steps', 200), 59 | ('turns', 1000), 60 | # optimization 61 | ('optimizer_type', 'rmsprop'), 62 | ('rmsprop_beta2', 0.999), 63 | ('rmsprop_epsilon', 1e-08), 64 | ('adam_beta1', 0.9), 65 | ('adam_beta2', 0.999), 66 | ('adam_epsilon', 1e-08), 67 | ('batch_size', 64), 68 | ('accum_batch_size', -1), 69 | ('max_grad_norm', 10.0), 70 | ('max_time_steps', 70), 71 | ('trigger_averaging_turns', 50), 72 | ('trigger_averaging_at_the_latest', 800), 73 | # learning rate 74 | ('learning_rate', 0.003183909546336849), 75 | ('learning_rate_decay', 1.0), 76 | ('learning_rate_decay_burn_in_steps', 0), 77 | ('drop_learning_rate_turns', -1), 78 | ('drop_learning_rate_multiplier', 1.0), 79 | ('drop_learning_rate_at_the_latest', -1), 80 | # early stopping 81 | ('early_stopping_turns', -1), 82 | ('early_stopping_rampup_turns', 0), 83 | ('early_stopping_worst_xe_target', ''), 84 | ('early_stopping_slowest_rate', 0.0), 85 | # cross-validation 86 | ('crossvalidate', False), 87 | ('crossvalidation_folds', 10), 88 | ('crossvalidation_rounds', 1), 89 | # evaluation 90 | ('max_training_eval_batches', 20), 91 | ('max_eval_eval_batches', -1), 92 | ('max_test_eval_batches', -1), 93 | ('min_non_episodic_eval_examples_per_stripe', 100), 94 | ('eval_on_test', False), 95 | ('eval_method', 'deterministic'), 96 | ('num_eval_samples', 0), 97 | ('eval_softmax_temperature', -0.8), 98 | ('eval_softmax_temperature_estimation_num_tokens', 50000), 99 | ('eval_power_mean_power', 1.0), 100 | ('eval_dropout_multiplier', 1.0), 101 | ('validation_prediction_file', ''), 102 | ('dyneval', False), 103 | ('dyneval_learning_rate', 0.001), 104 | ('dyneval_decay_rate', 0.02), 105 | ('dyneval_epsilon', 1e-05), 106 | # experiments 107 | # checkpoints 108 | ('save_checkpoints', True), 109 | # misc 110 | ('seed', 1), 111 | ('swap_memory', True), 112 | ('log_device_placement', False), 113 | ('summary_flush_secs', 120), 114 | ] 115 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/c51c838b33a5+_tune_wikitext-2_35m_lstm_mos2_fm_d2_arms/trial_747/config: -------------------------------------------------------------------------------- 1 | [ ('config_version', 5), 2 | # data 3 | ('file_encoding', 'utf-8'), 4 | ('word_based', True), 5 | ('episodic', False), 6 | # model 7 | ('num_params', 35000000), 8 | ('share_input_and_output_embeddings', True), 9 | ('input_embedding_size', -1), 10 | ('output_embedding_size', -1), 11 | ('input_embedding_ratio', 0.1993194960596213), 12 | ('output_embedding_ratio', -1.0), 13 | ('mos_num_components', 2), 14 | ('token_dropout', 0.0), 15 | ('embedding_dropout', 0.0), 16 | ('input_dropout', 0.5469087645499495), 17 | ('output_dropout', 0.34766651193735193), 18 | ('downprojected_output_dropout', -1.0), 19 | ('shared_mask_dropout', False), 20 | ('embed_once', True), 21 | # cell 22 | ('model', 'lstm'), 23 | ('num_layers', 2), 24 | ('residual_connections', False), 25 | ('lstm_skip_connection', True), 26 | ('feature_mask_rounds', 6), 27 | ('feature_mask_rank', 48), 28 | ('feature_mask', False), 29 | ('sparsity_ratio', -1.0), 30 | ('overlay_rank', -1), 31 | ('hidden_size', [-1]), 32 | ('hidden_size_multiplier', 1.0), 
33 | ('layer_norm', False), 34 | ('activation_fn', 'tf.tanh'), 35 | ('tie_forget_and_input_gates', False), 36 | ('cap_input_gate', True), 37 | ('trainable_initial_state', False), 38 | ('inter_layer_dropout', 0.1988228263748591), 39 | ('state_dropout', 0.22137985867236876), 40 | ('state_dropout_flip_rate', 0.0), 41 | ('update_dropout', 0.0), 42 | ('cell_clip', -1.0), 43 | # objective 44 | ('model_average', 'arithmetic'), 45 | ('num_training_samples', 1), 46 | ('l2_penalty', 0.00018994987193751323), 47 | ('l1_penalty', 0.0), 48 | ('activation_norm_penalty', 0.0), 49 | ('drop_state_probability', 0.01), 50 | # initialization 51 | ('embedding_init_factor', 1.0), 52 | ('scale_input_embeddings', False), 53 | ('cell_init_factor', 1.0), 54 | ('forget_bias', 1.0), 55 | ('output_init_factor', 1.0), 56 | # schedule 57 | ('steps_per_turn', 200), 58 | ('print_training_stats_every_num_steps', 200), 59 | ('turns', 1000), 60 | # optimization 61 | ('optimizer_type', 'rmsprop'), 62 | ('rmsprop_beta2', 0.999), 63 | ('rmsprop_epsilon', 1e-08), 64 | ('adam_beta1', 0.9), 65 | ('adam_beta2', 0.999), 66 | ('adam_epsilon', 1e-08), 67 | ('batch_size', 64), 68 | ('accum_batch_size', -1), 69 | ('max_grad_norm', 10.0), 70 | ('max_time_steps', 70), 71 | ('trigger_averaging_turns', 50), 72 | ('trigger_averaging_at_the_latest', 800), 73 | # learning rate 74 | ('learning_rate', 0.003287792100749033), 75 | ('learning_rate_decay', 1.0), 76 | ('learning_rate_decay_burn_in_steps', 0), 77 | ('drop_learning_rate_turns', -1), 78 | ('drop_learning_rate_multiplier', 1.0), 79 | ('drop_learning_rate_at_the_latest', -1), 80 | # early stopping 81 | ('early_stopping_turns', -1), 82 | ('early_stopping_rampup_turns', 0), 83 | ('early_stopping_worst_xe_target', ''), 84 | ('early_stopping_slowest_rate', 0.0), 85 | # cross-validation 86 | ('crossvalidate', False), 87 | ('crossvalidation_folds', 10), 88 | ('crossvalidation_rounds', 1), 89 | # evaluation 90 | ('max_training_eval_batches', 20), 91 | ('max_eval_eval_batches', -1), 92 | ('max_test_eval_batches', -1), 93 | ('min_non_episodic_eval_examples_per_stripe', 100), 94 | ('eval_on_test', False), 95 | ('eval_method', 'deterministic'), 96 | ('num_eval_samples', 0), 97 | ('eval_softmax_temperature', -0.8), 98 | ('eval_softmax_temperature_estimation_num_tokens', 50000), 99 | ('eval_power_mean_power', 1.0), 100 | ('eval_dropout_multiplier', 1.0), 101 | ('validation_prediction_file', ''), 102 | ('dyneval', False), 103 | ('dyneval_learning_rate', 0.001), 104 | ('dyneval_decay_rate', 0.02), 105 | ('dyneval_epsilon', 1e-05), 106 | # experiments 107 | # checkpoints 108 | ('save_checkpoints', True), 109 | # misc 110 | ('seed', 1), 111 | ('swap_memory', True), 112 | ('log_device_placement', False), 113 | ('summary_flush_secs', 120), 114 | ] 115 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/e81db31261c0+_tune_enwik8_96m_lstm_d4_arms/trial_295/config: -------------------------------------------------------------------------------- 1 | [ ('config_version', 5), 2 | # data 3 | ('file_encoding', 'CP437'), 4 | ('word_based', False), 5 | ('episodic', False), 6 | # model 7 | ('num_params', 96000000), 8 | ('share_input_and_output_embeddings', False), 9 | ('input_embedding_size', -1), 10 | ('output_embedding_size', -1), 11 | ('input_embedding_ratio', 1.0), 12 | ('output_embedding_ratio', -1.0), 13 | ('mos_num_components', 0), 14 | ('token_dropout', 0.0), 15 | ('embedding_dropout', 0.0), 16 | ('input_dropout', 0.1233672355450206), 17 | 
('output_dropout', 0.24846692818769148), 18 | ('downprojected_output_dropout', -1.0), 19 | ('shared_mask_dropout', False), 20 | ('embed_once', True), 21 | # cell 22 | ('model', 'lstm'), 23 | ('num_layers', 4), 24 | ('residual_connections', False), 25 | ('lstm_skip_connection', True), 26 | ('feature_mask_rounds', 0), 27 | ('feature_mask_rank', 0), 28 | ('feature_mask', False), 29 | ('sparsity_ratio', -1.0), 30 | ('overlay_rank', -1), 31 | ('hidden_size', [-1]), 32 | ('hidden_size_multiplier', 1.0), 33 | ('layer_norm', False), 34 | ('activation_fn', 'tf.tanh'), 35 | ('tie_forget_and_input_gates', False), 36 | ('cap_input_gate', True), 37 | ('trainable_initial_state', False), 38 | ('inter_layer_dropout', 0.12636500626697247), 39 | ('state_dropout', 0.13063181510547955), 40 | ('state_dropout_flip_rate', 0.0), 41 | ('update_dropout', 0.0), 42 | ('cell_clip', -1.0), 43 | # objective 44 | ('model_average', 'arithmetic'), 45 | ('num_training_samples', 1), 46 | ('l2_penalty', 5.853555404849184e-05), 47 | ('l1_penalty', 0.0), 48 | ('activation_norm_penalty', 0.0), 49 | ('drop_state_probability', 0.01), 50 | # initialization 51 | ('embedding_init_factor', 1.0), 52 | ('scale_input_embeddings', False), 53 | ('cell_init_factor', 1.0), 54 | ('forget_bias', 1.0), 55 | ('output_init_factor', 1.0), 56 | # schedule 57 | ('steps_per_turn', 400), 58 | ('print_training_stats_every_num_steps', 1000), 59 | ('turns', 100), 60 | # optimization 61 | ('optimizer_type', 'rmsprop'), 62 | ('rmsprop_beta2', 0.999), 63 | ('rmsprop_epsilon', 1e-08), 64 | ('adam_beta1', 0.9), 65 | ('adam_beta2', 0.999), 66 | ('adam_epsilon', 1e-08), 67 | ('batch_size', 128), 68 | ('accum_batch_size', 64), 69 | ('max_grad_norm', 10.0), 70 | ('max_time_steps', 500), 71 | ('trigger_averaging_turns', 10), 72 | ('trigger_averaging_at_the_latest', 80), 73 | # learning rate 74 | ('learning_rate', 0.001975213597736287), 75 | ('learning_rate_decay', 1.0), 76 | ('learning_rate_decay_burn_in_steps', 0), 77 | ('drop_learning_rate_turns', -1), 78 | ('drop_learning_rate_multiplier', 1.0), 79 | ('drop_learning_rate_at_the_latest', -1), 80 | # early stopping 81 | ('early_stopping_turns', -1), 82 | ('early_stopping_rampup_turns', 0), 83 | ('early_stopping_worst_xe_target', ''), 84 | ('early_stopping_slowest_rate', 0.0), 85 | # cross-validation 86 | ('crossvalidate', False), 87 | ('crossvalidation_folds', 10), 88 | ('crossvalidation_rounds', 1), 89 | # evaluation 90 | ('max_training_eval_batches', 20), 91 | ('max_eval_eval_batches', -1), 92 | ('max_test_eval_batches', -1), 93 | ('min_non_episodic_eval_examples_per_stripe', 100), 94 | ('eval_on_test', False), 95 | ('eval_method', 'deterministic'), 96 | ('num_eval_samples', 0), 97 | ('eval_softmax_temperature', -0.8), 98 | ('eval_softmax_temperature_estimation_num_tokens', 50000), 99 | ('eval_power_mean_power', 1.0), 100 | ('eval_dropout_multiplier', 1.0), 101 | ('validation_prediction_file', ''), 102 | ('dyneval', False), 103 | ('dyneval_learning_rate', 0.001), 104 | ('dyneval_decay_rate', 0.02), 105 | ('dyneval_epsilon', 1e-05), 106 | # experiments 107 | # checkpoints 108 | ('save_checkpoints', True), 109 | # misc 110 | ('seed', 1), 111 | ('swap_memory', True), 112 | ('log_device_placement', False), 113 | ('summary_flush_secs', 120), 114 | ] 115 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/config/e81db31261c0+_tune_enwik8_96m_lstm_fm_d4_arms_MtngW/trial_216/config: -------------------------------------------------------------------------------- 
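Each of these trial configs is a plain Python literal: a list of (key, value) pairs, with `#` comment lines grouping related options. A minimal loader sketch, assuming a well-formed file (the repository's own flag handling lives in lamb_flags.py and may treat these files differently):

    import ast

    def load_trial_config(path):
        # '#' comments are discarded by Python's tokenizer, so the whole
        # file parses as a single literal; literal_eval never executes code.
        with open(path) as f:
            pairs = ast.literal_eval(f.read())
        return dict(pairs)

    config = load_trial_config('trial_216/config')
    assert config['config_version'] == 5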
1 | [ ('config_version', 5), 2 | # data 3 | ('file_encoding', 'CP437'), 4 | ('word_based', False), 5 | ('episodic', False), 6 | # model 7 | ('num_params', 96000000), 8 | ('share_input_and_output_embeddings', False), 9 | ('input_embedding_size', -1), 10 | ('output_embedding_size', -1), 11 | ('input_embedding_ratio', 1.0), 12 | ('output_embedding_ratio', -1.0), 13 | ('mos_num_components', 0), 14 | ('token_dropout', 0.0), 15 | ('embedding_dropout', 0.0), 16 | ('input_dropout', 0.015908852824256536), 17 | ('output_dropout', 0.2878539844807166), 18 | ('downprojected_output_dropout', -1.0), 19 | ('shared_mask_dropout', False), 20 | ('embed_once', True), 21 | # cell 22 | ('model', 'lstm'), 23 | ('num_layers', 4), 24 | ('residual_connections', False), 25 | ('lstm_skip_connection', True), 26 | ('feature_mask_rounds', 5), 27 | ('feature_mask_rank', 61), 28 | ('feature_mask', False), 29 | ('sparsity_ratio', -1.0), 30 | ('overlay_rank', -1), 31 | ('hidden_size', [-1]), 32 | ('hidden_size_multiplier', 1.0), 33 | ('layer_norm', False), 34 | ('activation_fn', 'tf.tanh'), 35 | ('tie_forget_and_input_gates', False), 36 | ('cap_input_gate', True), 37 | ('trainable_initial_state', False), 38 | ('inter_layer_dropout', 0.13785990907975867), 39 | ('state_dropout', 0.1648727901535727), 40 | ('state_dropout_flip_rate', 0.0), 41 | ('update_dropout', 0.0), 42 | ('cell_clip', -1.0), 43 | # objective 44 | ('model_average', 'arithmetic'), 45 | ('num_training_samples', 1), 46 | ('l2_penalty', 4.409390792135428e-05), 47 | ('l1_penalty', 0.0), 48 | ('activation_norm_penalty', 0.0), 49 | ('drop_state_probability', 0.01), 50 | # initialization 51 | ('embedding_init_factor', 1.0), 52 | ('scale_input_embeddings', False), 53 | ('cell_init_factor', 1.0), 54 | ('forget_bias', 1.0), 55 | ('output_init_factor', 1.0), 56 | # schedule 57 | ('steps_per_turn', 400), 58 | ('print_training_stats_every_num_steps', 1000), 59 | ('turns', 100), 60 | # optimization 61 | ('optimizer_type', 'rmsprop'), 62 | ('rmsprop_beta2', 0.999), 63 | ('rmsprop_epsilon', 1e-08), 64 | ('adam_beta1', 0.9), 65 | ('adam_beta2', 0.999), 66 | ('adam_epsilon', 1e-08), 67 | ('batch_size', 128), 68 | ('accum_batch_size', 64), 69 | ('max_grad_norm', 10.0), 70 | ('max_time_steps', 500), 71 | ('trigger_averaging_turns', 10), 72 | ('trigger_averaging_at_the_latest', 80), 73 | # learning rate 74 | ('learning_rate', 0.0022480107672343715), 75 | ('learning_rate_decay', 1.0), 76 | ('learning_rate_decay_burn_in_steps', 0), 77 | ('drop_learning_rate_turns', -1), 78 | ('drop_learning_rate_multiplier', 1.0), 79 | ('drop_learning_rate_at_the_latest', -1), 80 | # early stopping 81 | ('early_stopping_turns', -1), 82 | ('early_stopping_rampup_turns', 0), 83 | ('early_stopping_worst_xe_target', ''), 84 | ('early_stopping_slowest_rate', 0.0), 85 | # cross-validation 86 | ('crossvalidate', False), 87 | ('crossvalidation_folds', 10), 88 | ('crossvalidation_rounds', 1), 89 | # evaluation 90 | ('max_training_eval_batches', 20), 91 | ('max_eval_eval_batches', -1), 92 | ('max_test_eval_batches', -1), 93 | ('min_non_episodic_eval_examples_per_stripe', 100), 94 | ('eval_on_test', False), 95 | ('eval_method', 'deterministic'), 96 | ('num_eval_samples', 0), 97 | ('eval_softmax_temperature', -0.8), 98 | ('eval_softmax_temperature_estimation_num_tokens', 50000), 99 | ('eval_power_mean_power', 1.0), 100 | ('eval_dropout_multiplier', 1.0), 101 | ('validation_prediction_file', ''), 102 | ('dyneval', False), 103 | ('dyneval_learning_rate', 0.001), 104 | ('dyneval_decay_rate', 0.02), 105 | 
('dyneval_epsilon', 1e-05), 106 | # experiments 107 | # checkpoints 108 | ('save_checkpoints', True), 109 | # misc 110 | ('seed', 1), 111 | ('swap_memory', True), 112 | ('log_device_placement', False), 113 | ('summary_flush_secs', 120), 114 | ] 115 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/train_enwik8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | source_lib "config/enwik8.sh" 24 | 25 | name="$2" 26 | config_file="$3" 27 | 28 | source_lib "run.sh" "$1" 29 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/train_mwc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | source_lib "config/mwc.sh" 24 | 25 | # Data 26 | 27 | lang="${2:-en}" 28 | training_file="${mwc_data_dir}/wiki_${lang}/ptb_format_large/wiki_${lang}.tr.raw.unk" 29 | validation_file="${mwc_data_dir}/wiki_${lang}/ptb_format_large/wiki_${lang}.va.raw.unk" 30 | test_file="${mwc_data_dir}/wiki_${lang}/ptb_format_large/wiki_${lang}.te.raw.unk" 31 | 32 | name="$2" 33 | config_file="$3" 34 | 35 | source_lib "run.sh" "$1" 36 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/train_ptb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | source_lib "config/ptb_word.sh" 24 | 25 | name="$2" 26 | config_file="$3" 27 | 28 | source_lib "run.sh" "$1" 29 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/train_ptb_char.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | source_lib "config/ptb_char.sh" 24 | 25 | name="$2" 26 | config_file="$3" 27 | 28 | source_lib "run.sh" "$1" 29 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/train_wikitext-2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | source_lib "config/wikitext-2_word.sh" 24 | 25 | name="$2" 26 | config_file="$3" 27 | 28 | source_lib "run.sh" "$1" 29 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/tune_copy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is for 19 | # illustration only. 20 | 21 | set -e 22 | 23 | # Include definitions of dataset and tuning related variables. 24 | source "$(dirname $0)/../../lib/setup.sh" "$@" 25 | source_lib "config/common.sh" 26 | source_lib "config/tuning.sh" 27 | source_lib "config/copy.sh" 28 | 29 | # Model 30 | 31 | num_param_millions=10 32 | num_params=$(million ${num_param_millions}) 33 | share_input_and_output_embeddings=false 34 | shared_mask_dropout=false 35 | 36 | # Cell 37 | 38 | model="lstm" 39 | num_layers=1 40 | lstm_skip_connection=true 41 | tie_forget_and_input_gates=false 42 | cap_input_gate=true 43 | 44 | # Objective 45 | 46 | drop_state_probability=0.0 47 | 48 | # Initialization 49 | 50 | forget_bias=1.0 51 | 52 | # Schedule 53 | 54 | steps_per_turn=200 55 | print_training_stats_every_num_steps=200 56 | turns=100 57 | 58 | # Optimizer 59 | 60 | optimizer_type="rmsprop" 61 | batch_size=64 62 | max_grad_norm=10.0 63 | max_time_steps=155 64 | 65 | # Early stopping 66 | 67 | # early_stopping_turns=30 68 | # early_stopping_worst_xe_target=4.4 69 | 70 | # Evaluation 71 | 72 | max_training_eval_batches=20 73 | eval_softmax_temperature=-0.8 74 | 75 | # Tuning parameters 76 | 77 | priority=200 78 | num_workers=60 79 | 80 | # Misc 81 | 82 | swap_memory=true 83 | 84 | # Start experiments with averaged optimization 85 | 86 | drop_learning_rate_turns=-1 87 | drop_learning_rate_multiplier=1.0 88 | drop_learning_rate_at_the_latest=-1 89 | trigger_averaging_turns=10 90 | trigger_averaging_at_the_latest=80 91 | 92 | # feature mask 93 | tuneables="input_embedding_ratio,learning_rate,l2_penalty, 94 | feature_mask_rounds,feature_mask_rank" 95 | name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_arms" 96 | source_lib "run.sh" "$@" 97 | 98 | # vanilla 99 | tuneables="input_embedding_ratio,learning_rate,l2_penalty" 100 | name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_arms" 101 | source_lib "run.sh" "$@" 102 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/tune_dyneval.sh:
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is for 19 | # illustration only. 20 | 21 | set -e 22 | 23 | # Include definitions of dataset and tuning related variables. 24 | source "$(dirname $0)/../../lib/setup.sh" "$1" 25 | source_lib "config/common.sh" 26 | source_lib "config/tuning.sh" 27 | 28 | name="$2" 29 | config_file="$3/config" 30 | load_checkpoint="$3/best" 31 | 32 | save_checkpoints=false 33 | turns=0 34 | 35 | # Evaluation 36 | 37 | dyneval=true 38 | batch_size=1024 39 | max_training_eval_batches=500 40 | max_grad_norm=0.0 41 | eval_softmax_temperature=-0.8 42 | eval_softmax_temperature_estimation_num_tokens=50000 43 | l2_penalty=0.0 44 | 45 | # Tuning parameters 46 | 47 | priority=200 48 | num_workers=60 49 | 50 | tuneables="batch_size,max_time_steps, 51 | dyneval_learning_rate,dyneval_decay_rate,dyneval_epsilon" 52 | name="$(default_name)_${name}" 53 | source_lib "run.sh" "$1" 54 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/tune_enwik8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is for 19 | # illustration only. 20 | 21 | set -e 22 | 23 | # Include definitions of dataset and tuning related variables.
24 | source "$(dirname $0)/../../lib/setup.sh" "$@" 25 | source_lib "config/common.sh" 26 | source_lib "config/tuning.sh" 27 | source_lib "config/enwik8.sh" 28 | 29 | # Model 30 | 31 | num_param_millions=24 32 | num_params=$(million ${num_param_millions}) 33 | share_input_and_output_embeddings=false 34 | shared_mask_dropout=false 35 | 36 | # Cell 37 | 38 | model="lstm" 39 | num_layers=4 40 | lstm_skip_connection=true 41 | tie_forget_and_input_gates=false 42 | cap_input_gate=true 43 | 44 | # Objective 45 | 46 | drop_state_probability=0.01 47 | 48 | # Initialization 49 | 50 | forget_bias=1.0 51 | 52 | # Schedule 53 | 54 | steps_per_turn=1000 55 | print_training_stats_every_num_steps=1000 56 | turns=100 57 | 58 | # Optimizer 59 | 60 | optimizer_type="rmsprop" 61 | batch_size=128 62 | max_grad_norm=10.0 63 | max_time_steps=200 64 | 65 | # Early stopping 66 | 67 | # early_stopping_turns=30 68 | # early_stopping_worst_xe_target=4.4 69 | 70 | # Evaluation 71 | 72 | max_training_eval_batches=20 73 | eval_softmax_temperature=-0.8 74 | 75 | # Tuning parameters 76 | 77 | priority=200 78 | num_workers=60 79 | 80 | # Misc 81 | 82 | swap_memory=true 83 | 84 | # Start experiments with averaged optimization 85 | 86 | drop_learning_rate_turns=-1 87 | drop_learning_rate_multiplier=1.0 88 | drop_learning_rate_at_the_latest=-1 89 | trigger_averaging_turns=10 90 | trigger_averaging_at_the_latest=80 91 | 92 | # feature mask 93 | tuneables="input_embedding_ratio,learning_rate,l2_penalty, 94 | input_dropout,inter_layer_dropout,state_dropout, 95 | output_dropout, 96 | feature_mask_rounds,feature_mask_rank" 97 | name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_arms" 98 | source_lib "run.sh" "$@" 99 | 100 | # vanilla 101 | tuneables="input_embedding_ratio,learning_rate,l2_penalty, 102 | input_dropout,inter_layer_dropout,state_dropout, 103 | output_dropout" 104 | name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_arms" 105 | source_lib "run.sh" "$@" 106 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/tune_mwc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is for 19 | # illustration only. 20 | 21 | set -e 22 | 23 | # Include definitions of dataset and tuning related variables.
24 | source "$(dirname $0)/../../lib/setup.sh" "$@" 25 | source_lib "config/common.sh" 26 | source_lib "config/tuning.sh" 27 | source_lib "config/mwc.sh" 28 | 29 | # Data 30 | 31 | lang="${2:-en}" 32 | training_file="${mwc_data_dir}/wiki_${lang}/ptb_format_large/wiki_${lang}.tr.raw.unk" 33 | validation_file="${mwc_data_dir}/wiki_${lang}/ptb_format_large/wiki_${lang}.va.raw.unk" 34 | test_file="${mwc_data_dir}/wiki_${lang}/ptb_format_large/wiki_${lang}.te.raw.unk" 35 | 36 | # Model 37 | 38 | num_param_millions=24 39 | num_params=$(million ${num_param_millions}) 40 | share_input_and_output_embeddings=false 41 | shared_mask_dropout=false 42 | 43 | # Cell 44 | 45 | model="lstm" 46 | num_layers=2 47 | lstm_skip_connection=true 48 | tie_forget_and_input_gates=false 49 | cap_input_gate=true 50 | 51 | # Objective 52 | 53 | drop_state_probability=0.01 54 | 55 | # Initialization 56 | 57 | forget_bias=1.0 58 | 59 | # Schedule 60 | 61 | steps_per_turn=200 62 | print_training_stats_every_num_steps=200 63 | turns=500 64 | 65 | # Optimizer 66 | 67 | optimizer_type="rmsprop" 68 | batch_size=64 69 | max_grad_norm=10.0 70 | max_time_steps=150 71 | 72 | # Early stopping 73 | 74 | # early_stopping_turns=30 75 | # early_stopping_worst_xe_target=4.4 76 | 77 | # Evaluation 78 | 79 | max_training_eval_batches=20 80 | eval_softmax_temperature=-0.8 81 | 82 | # Tuning parameters 83 | 84 | priority=200 85 | num_workers=60 86 | 87 | # Misc 88 | 89 | swap_memory=true 90 | 91 | # Start experiments with averaged optimization 92 | 93 | drop_learning_rate_turns=-1 94 | drop_learning_rate_multiplier=1.0 95 | drop_learning_rate_at_the_latest=-1 96 | trigger_averaging_turns=25 97 | trigger_averaging_at_the_latest=400 98 | 99 | # feature mask 100 | tuneables="input_embedding_ratio,learning_rate,l2_penalty, 101 | input_dropout,inter_layer_dropout,state_dropout, 102 | output_dropout, 103 | feature_mask_rounds,feature_mask_rank" 104 | name="$(default_name)_${lang}_${num_param_millions}m_${model}_fm_d${num_layers}_arms" 105 | source_lib "run.sh" "$@" 106 | 107 | # vanilla 108 | tuneables="input_embedding_ratio,learning_rate,l2_penalty, 109 | input_dropout,inter_layer_dropout,state_dropout, 110 | output_dropout" 111 | name="$(default_name)_${lang}_${num_param_millions}m_${model}_d${num_layers}_arms" 112 | source_lib "run.sh" "$@" 113 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/tune_ptb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is for 19 | # illustration only. 20 | 21 | set -e 22 | 23 | # Include definitions of dataset and tuning related variables.
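# (Roughly how these pieces fit together: lib/setup.sh defines source_lib,
# which sources the named file from lib/, approximately
#
#   source_lib() { source "${lib_dir}/$1" "${@:2}"; }  # variable name is an assumption
#
# so the config/*.sh files below only set shell variables, and lib/run.sh
# later forwards whichever variables are set to main.py as --flags.)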
24 | source "$(dirname $0)/../../lib/setup.sh" "$@" 25 | source_lib "config/common.sh" 26 | source_lib "config/tuning.sh" 27 | source_lib "config/ptb_word.sh" 28 | 29 | # Model 30 | 31 | num_param_millions=24 32 | num_params=$(million ${num_param_millions}) 33 | share_input_and_output_embeddings=true 34 | shared_mask_dropout=false 35 | 36 | # Cell 37 | 38 | model="lstm" 39 | num_layers=2 40 | lstm_skip_connection=true 41 | tie_forget_and_input_gates=false 42 | cap_input_gate=true 43 | 44 | # Objective 45 | 46 | drop_state_probability=0.01 47 | 48 | # Initialization 49 | 50 | forget_bias=1.0 51 | 52 | # Schedule 53 | 54 | steps_per_turn=200 55 | print_training_stats_every_num_steps=200 56 | turns=1000 57 | 58 | # Optimizer 59 | 60 | optimizer_type="rmsprop" 61 | batch_size=64 62 | max_grad_norm=10.0 63 | max_time_steps=70 64 | 65 | # Early stopping 66 | 67 | # early_stopping_turns=30 68 | # early_stopping_worst_xe_target=4.4 69 | 70 | # Evaluation 71 | 72 | max_training_eval_batches=20 73 | eval_softmax_temperature=-0.8 74 | 75 | # Tuning parameters 76 | 77 | priority=200 78 | num_workers=60 79 | 80 | # Start experiments with dropped learning rate 81 | 82 | # drop_learning_rate_turns=100 83 | # drop_learning_rate_multiplier=0.1 84 | # drop_learning_rate_at_the_latest=1600 85 | # 86 | # # feature mask 87 | # tuneables="input_embedding_ratio,learning_rate,l2_penalty, 88 | # input_dropout,inter_layer_dropout,state_dropout, 89 | # output_dropout, 90 | # feature_mask_rounds,feature_mask_rank" 91 | # name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_rms" 92 | # source_lib "run.sh" "$@" 93 | # 94 | # # vanilla 95 | # tuneables="input_embedding_ratio,learning_rate,l2_penalty, 96 | # input_dropout,inter_layer_dropout,state_dropout, 97 | # output_dropout" 98 | # name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_rms" 99 | # source_lib "run.sh" "$@" 100 | 101 | # Start experiments with averaged optimization 102 | 103 | drop_learning_rate_turns=-1 104 | drop_learning_rate_multiplier=1.0 105 | drop_learning_rate_at_the_latest=-1 106 | trigger_averaging_turns=50 107 | trigger_averaging_at_the_latest=800 108 | 109 | # feature mask 110 | tuneables="learning_rate,l2_penalty, 111 | input_dropout,inter_layer_dropout,state_dropout, 112 | output_dropout, 113 | feature_mask_rounds,feature_mask_rank" 114 | name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_arms" 115 | source_lib "run.sh" "$@" 116 | 117 | # vanilla 118 | tuneables="learning_rate,l2_penalty, 119 | input_dropout,inter_layer_dropout,state_dropout, 120 | output_dropout" 121 | name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_arms" 122 | source_lib "run.sh" "$@" 123 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/tune_ptb_char.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is for 19 | # illustration only. 20 | 21 | set -e 22 | 23 | # Include definitions of dataset and tuning related variables. 24 | source "$(dirname $0)/../../lib/setup.sh" "$@" 25 | source_lib "config/common.sh" 26 | source_lib "config/tuning.sh" 27 | source_lib "config/ptb_char.sh" 28 | 29 | # Model 30 | 31 | num_param_millions=24 32 | num_params=$(million ${num_param_millions}) 33 | share_input_and_output_embeddings=false 34 | shared_mask_dropout=false 35 | 36 | # Cell 37 | 38 | model="lstm" 39 | num_layers=2 40 | lstm_skip_connection=true 41 | tie_forget_and_input_gates=false 42 | cap_input_gate=true 43 | 44 | # Objective 45 | 46 | drop_state_probability=0.01 47 | 48 | # Initialization 49 | 50 | forget_bias=1.0 51 | 52 | # Schedule 53 | 54 | steps_per_turn=200 55 | print_training_stats_every_num_steps=200 56 | turns=500 57 | 58 | # Optimizer 59 | 60 | optimizer_type="rmsprop" 61 | batch_size=64 62 | max_grad_norm=10.0 63 | max_time_steps=150 64 | 65 | # Early stopping 66 | 67 | # early_stopping_turns=30 68 | # early_stopping_worst_xe_target=4.4 69 | 70 | # Evaluation 71 | 72 | max_training_eval_batches=20 73 | eval_softmax_temperature=-0.8 74 | 75 | # Tuning parameters 76 | 77 | priority=200 78 | num_workers=60 79 | 80 | # Misc 81 | 82 | swap_memory=true 83 | 84 | # Start experiments with averaged optimization 85 | 86 | drop_learning_rate_turns=-1 87 | drop_learning_rate_multiplier=1.0 88 | drop_learning_rate_at_the_latest=-1 89 | trigger_averaging_turns=25 90 | trigger_averaging_at_the_latest=400 91 | 92 | # feature mask 93 | tuneables="input_embedding_ratio,learning_rate,l2_penalty, 94 | input_dropout,inter_layer_dropout,state_dropout, 95 | output_dropout, 96 | feature_mask_rounds,feature_mask_rank" 97 | name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_arms" 98 | source_lib "run.sh" "$@" 99 | 100 | # vanilla 101 | tuneables="input_embedding_ratio,learning_rate,l2_penalty, 102 | input_dropout,inter_layer_dropout,state_dropout, 103 | output_dropout" 104 | name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_arms" 105 | source_lib "run.sh" "$@" 106 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/tune_ptb_fast.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is for 19 | # illustration only.
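# If the tuning backend were available, a launch would follow the same
# pattern as the training scripts, e.g.
#
#   ./tune_ptb_fast.sh run
#
# with the tuner searching over the hyperparameters named in ${tuneables}
# below. (The "run" argument convention is inferred from the train_*.sh
# scripts and is an assumption here.)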
20 | 21 | set -e 22 | 23 | # Include definitions of dataset and tuning related variables. 24 | source "$(dirname $0)/../../lib/setup.sh" "$@" 25 | source_lib "config/common.sh" 26 | source_lib "config/tuning.sh" 27 | source_lib "config/ptb_word.sh" 28 | 29 | # Model 30 | 31 | num_param_millions=24 32 | num_params=$(million ${num_param_millions}) 33 | share_input_and_output_embeddings=true 34 | shared_mask_dropout=false 35 | 36 | # Cell 37 | 38 | model="lstm" 39 | num_layers=2 40 | lstm_skip_connection=true 41 | tie_forget_and_input_gates=false 42 | cap_input_gate=true 43 | 44 | # Objective 45 | 46 | drop_state_probability=0.01 47 | 48 | # Initialization 49 | 50 | forget_bias=1.0 51 | 52 | # Schedule 53 | 54 | steps_per_turn=100 55 | print_training_stats_every_num_steps=100 56 | turns=600 57 | 58 | # Optimizer 59 | 60 | optimizer_type="rmsprop" 61 | batch_size=64 62 | max_grad_norm=10.0 63 | max_time_steps=35 64 | 65 | # Early stopping 66 | 67 | # early_stopping_turns=30 68 | # early_stopping_worst_xe_target=4.4 69 | 70 | # Evaluation 71 | 72 | max_training_eval_batches=20 73 | eval_softmax_temperature=-0.8 74 | 75 | # Tuning parameters 76 | 77 | priority=200 78 | num_workers=60 79 | 80 | # Start experiments with dropped learning rate 81 | 82 | # drop_learning_rate_turns=100 83 | # drop_learning_rate_multiplier=0.1 84 | # drop_learning_rate_at_the_latest=1600 85 | # 86 | # # feature mask 87 | # tuneables="input_embedding_ratio,learning_rate,l2_penalty, 88 | # input_dropout,inter_layer_dropout,state_dropout, 89 | # output_dropout, 90 | # feature_mask_rounds,feature_mask_rank" 91 | # name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_rms" 92 | # source_lib "run.sh" "$@" 93 | # 94 | # # vanilla 95 | # tuneables="input_embedding_ratio,learning_rate,l2_penalty, 96 | # input_dropout,inter_layer_dropout,state_dropout, 97 | # output_dropout" 98 | # name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_rms" 99 | # source_lib "run.sh" "$@" 100 | 101 | # Start experiments with averaged optimization 102 | 103 | drop_learning_rate_turns=-1 104 | drop_learning_rate_multiplier=1.0 105 | drop_learning_rate_at_the_latest=-1 106 | trigger_averaging_turns=25 107 | trigger_averaging_at_the_latest=400 108 | 109 | # feature mask 110 | tuneables="learning_rate,l2_penalty, 111 | input_dropout,inter_layer_dropout,state_dropout, 112 | output_dropout, 113 | feature_mask_rounds,feature_mask_rank" 114 | name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_arms" 115 | source_lib "run.sh" "$@" 116 | 117 | # vanilla 118 | tuneables="learning_rate,l2_penalty, 119 | input_dropout,inter_layer_dropout,state_dropout, 120 | output_dropout" 121 | name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_arms" 122 | source_lib "run.sh" "$@" 123 | -------------------------------------------------------------------------------- /lamb/experiment/mogrifier/tune_wikitext-2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is for 19 | # illustration only. 20 | 21 | set -e 22 | 23 | # Include definitions of dataset and tuning related variables. 24 | source "$(dirname $0)/../../lib/setup.sh" "$@" 25 | source_lib "config/common.sh" 26 | source_lib "config/tuning.sh" 27 | source_lib "config/wikitext-2_word.sh" 28 | 29 | # Model 30 | 31 | num_param_millions=35 32 | num_params=$(million ${num_param_millions}) 33 | share_input_and_output_embeddings=true 34 | shared_mask_dropout=false 35 | 36 | # Cell 37 | 38 | model="lstm" 39 | num_layers=2 40 | lstm_skip_connection=true 41 | tie_forget_and_input_gates=false 42 | cap_input_gate=true 43 | 44 | # Objective 45 | 46 | drop_state_probability=0.01 47 | 48 | # Initialization 49 | 50 | forget_bias=1.0 51 | 52 | # Schedule 53 | 54 | steps_per_turn=200 55 | print_training_stats_every_num_steps=200 56 | turns=1000 57 | 58 | # Optimizer 59 | 60 | optimizer_type="rmsprop" 61 | batch_size=64 62 | max_grad_norm=10.0 63 | max_time_steps=70 64 | 65 | # Early stopping 66 | 67 | # early_stopping_turns=30 68 | # early_stopping_worst_xe_target=4.4 69 | 70 | # Evaluation 71 | 72 | max_training_eval_batches=20 73 | eval_softmax_temperature=-0.8 74 | 75 | # Tuning parameters 76 | 77 | priority=200 78 | num_workers=60 79 | 80 | # Misc 81 | 82 | swap_memory=true 83 | 84 | # Start experiments with dropped learning rate 85 | 86 | # drop_learning_rate_turns=100 87 | # drop_learning_rate_multiplier=0.1 88 | # drop_learning_rate_at_the_latest=1600 89 | # 90 | # # feature mask 91 | # tuneables="input_embedding_ratio,learning_rate,l2_penalty, 92 | # input_dropout,inter_layer_dropout,state_dropout, 93 | # output_dropout, 94 | # feature_mask_rounds,feature_mask_rank" 95 | # name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_rms" 96 | # source_lib "run.sh" "$@" 97 | # 98 | # # vanilla 99 | # tuneables="input_embedding_ratio,learning_rate,l2_penalty, 100 | # input_dropout,inter_layer_dropout,state_dropout, 101 | # output_dropout" 102 | # name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_rms" 103 | # source_lib "run.sh" "$@" 104 | 105 | # Start experiments with averaged optimization 106 | 107 | drop_learning_rate_turns=-1 108 | drop_learning_rate_multiplier=1.0 109 | drop_learning_rate_at_the_latest=-1 110 | trigger_averaging_turns=50 111 | trigger_averaging_at_the_latest=800 112 | 113 | # feature mask 114 | tuneables="input_embedding_ratio,learning_rate,l2_penalty, 115 | input_dropout,inter_layer_dropout,state_dropout, 116 | output_dropout, 117 | feature_mask_rounds,feature_mask_rank" 118 | name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_arms" 119 | source_lib "run.sh" "$@" 120 | 121 | # vanilla 122 | tuneables="input_embedding_ratio,learning_rate,l2_penalty, 123 | input_dropout,inter_layer_dropout,state_dropout, 124 | output_dropout" 125 | name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_arms" 126 | source_lib "run.sh" "$@" 127 | -------------------------------------------------------------------------------- /lamb/experiment/on-the-state/README.md: -------------------------------------------------------------------------------- 1 | This directory contains saved configuration files for tuned models from the [On 2 | the state of the art of evaluation in neural language 3 | models](https://arxiv.org/abs/1707.05589) paper. Model weights are not included. 4 | 5 | Don't forget to [set up the data](../../README.md). 6 | 7 | To train the 1-layer LSTM model with 10M weights on PTB using the tuned 8 | hyperparameters (see the paper above): 9 | 10 | ./train_ptb.sh run ptb_10m_lstm_d1/hps_proto 11 | 12 | There are separate training scripts for enwik8 and wikitext-2. The training will 13 | save the model in `/tmp/lamb/ptb_10m_lstm_d1/`. To test the saved model: 14 | 15 | ../test.sh run some-descriptive-name /tmp/lamb/ptb_10m_lstm_d1/ 16 | -------------------------------------------------------------------------------- /lamb/experiment/on-the-state/enwik8_27m_lstm_d4/hps_proto: -------------------------------------------------------------------------------- 1 | hparam { 2 | key: "activation_fn" 3 | value { 4 | bytes_value: "tf.tanh" 5 | } 6 | } 7 | hparam { 8 | key: "adam_beta1" 9 | value { 10 | float_value: 0.899999976158 11 | } 12 | } 13 | hparam { 14 | key: "adam_beta2" 15 | value { 16 | float_value: 0.999000012875 17 | } 18 | } 19 | hparam { 20 | key: "adam_epsilon" 21 | value { 22 | float_value: 9.99999993923e-09 23 | } 24 | } 25 | hparam { 26 | key: "batch_size" 27 | value { 28 | int64_value: 128 29 | } 30 | } 31 | hparam { 32 | key: "cell_clip" 33 | value { 34 | float_value: -1.0 35 | } 36 | } 37 | hparam { 38 | key: "cell_init_factor" 39 | value { 40 | float_value: 1.0 41 | } 42 | } 43 | hparam { 44 | key: "drop_learning_rate_at_the_latest" 45 | value { 46 | int64_value: 450 47 | } 48 | } 49 | hparam { 50 | key: "drop_learning_rate_multiplier" 51 | value { 52 | float_value: 0.10000000149 53 | } 54 | } 55 | hparam { 56 | key: "drop_learning_rate_rounds" 57 | value { 58 | int64_value: 13 59 | } 60 | } 61 | hparam { 62 | key: "drop_state_probability" 63 | value { 64 | float_value: 0.00999999977648 65 | } 66 | } 67 | hparam { 68 | key: "embed_once" 69 | value { 70 | bool_value: true 71 | } 72 | } 73 | hparam { 74 | key: "embedding_init_factor" 75 | value { 76 | float_value: 1.0 77 | } 78 | } 79 | hparam { 80 | key: "feature_mask" 81 | value { 82 | bool_value: false 83 | } 84 | } 85 | hparam { 86 | key: "forget_bias" 87 | value { 88 | float_value: 1.0 89 | } 90 | } 91 | hparam { 92 | key: "hidden_size" 93 | value { 94 | int64_value: 911 95 | } 96 | } 97 | hparam { 98 | key: "input_dropout" 99 | value { 100 | float_value: 0.196795836091 101 | } 102 | } 103 | hparam { 104 | key: "input_embedding_ratio" 105 | value { 106 | float_value: 1.0 107 | } 108 | } 109 | hparam { 110 | key: "input_embedding_size" 111 | value { 112 | int64_value: 911 113 | } 114 | } 115 | hparam { 116 | key: "intra_layer_dropout" 117 | value { 118 | float_value: 0.0307693872601 119 | } 120 | } 121 | hparam { 122 | key: "layer_norm" 123 | value { 124 | bool_value: false 125 | } 126 | } 127 | hparam { 128 | key: "learning_rate" 129 | value { 130 | float_value: 0.00203050486743 131 | } 132 | } 133 | hparam { 134 | key: "learning_rate_decay" 135 | value { 136 | float_value: 1.0 137 | } 138 | } 139 | hparam { 140 | key: "learning_rate_decay_burn_in_steps" 141 | value { 142 | int64_value: 0 143 | } 144 | } 145 | hparam { 146 | key:
"lstm_skip_connection" 147 | value { 148 | bool_value: true 149 | } 150 | } 151 | hparam { 152 | key: "max_grad_norm" 153 | value { 154 | float_value: 10.0 155 | } 156 | } 157 | hparam { 158 | key: "model" 159 | value { 160 | bytes_value: "lstm" 161 | } 162 | } 163 | hparam { 164 | key: "num_eval_samples" 165 | value { 166 | int64_value: 0 167 | } 168 | } 169 | hparam { 170 | key: "num_layers" 171 | value { 172 | int64_value: 4 173 | } 174 | } 175 | hparam { 176 | key: "num_params" 177 | value { 178 | int64_value: 27000000 179 | } 180 | } 181 | hparam { 182 | key: "optimizer_type" 183 | value { 184 | bytes_value: "rmsprop" 185 | } 186 | } 187 | hparam { 188 | key: "outer_steps" 189 | value { 190 | int64_value: 500 191 | } 192 | } 193 | hparam { 194 | key: "output_dropout" 195 | value { 196 | float_value: 0.0695193335414 197 | } 198 | } 199 | hparam { 200 | key: "output_embedding_ratio" 201 | value { 202 | float_value: 1.0 203 | } 204 | } 205 | hparam { 206 | key: "output_embedding_size" 207 | value { 208 | int64_value: 911 209 | } 210 | } 211 | hparam { 212 | key: "output_init_factor" 213 | value { 214 | float_value: 1.0 215 | } 216 | } 217 | hparam { 218 | key: "overlay_rank" 219 | value { 220 | int64_value: -1 221 | } 222 | } 223 | hparam { 224 | key: "rmsprop_beta2" 225 | value { 226 | float_value: 0.990000009537 227 | } 228 | } 229 | hparam { 230 | key: "rmsprop_epsilon" 231 | value { 232 | float_value: 9.99999974738e-06 233 | } 234 | } 235 | hparam { 236 | key: "share_input_and_output_embeddings" 237 | value { 238 | bool_value: false 239 | } 240 | } 241 | hparam { 242 | key: "sparsity_ratio" 243 | value { 244 | float_value: -1.0 245 | } 246 | } 247 | hparam { 248 | key: "state_dropout" 249 | value { 250 | float_value: 0.0808205232024 251 | } 252 | } 253 | hparam { 254 | key: "tie_forget_and_input_gates" 255 | value { 256 | bool_value: false 257 | } 258 | } 259 | hparam { 260 | key: "token_dropout" 261 | value { 262 | float_value: 0.0 263 | } 264 | } 265 | hparam { 266 | key: "trainable_initial_state" 267 | value { 268 | bool_value: false 269 | } 270 | } 271 | hparam { 272 | key: "update_dropout" 273 | value { 274 | float_value: 0.0 275 | } 276 | } 277 | hparam { 278 | key: "vocab_size" 279 | value { 280 | int64_value: 206 281 | } 282 | } 283 | hparam { 284 | key: "weight_decay" 285 | value { 286 | float_value: 7.50829849494e-06 287 | } 288 | } 289 | hparam { 290 | key: "weight_penalty" 291 | value { 292 | float_value: 0.0 293 | } 294 | } 295 | -------------------------------------------------------------------------------- /lamb/experiment/on-the-state/enwik8_46m_lstm_d4/hps_proto: -------------------------------------------------------------------------------- 1 | hparam { 2 | key: "activation_fn" 3 | value { 4 | bytes_value: "tf.tanh" 5 | } 6 | } 7 | hparam { 8 | key: "adam_beta1" 9 | value { 10 | float_value: 0.899999976158 11 | } 12 | } 13 | hparam { 14 | key: "adam_beta2" 15 | value { 16 | float_value: 0.999000012875 17 | } 18 | } 19 | hparam { 20 | key: "adam_epsilon" 21 | value { 22 | float_value: 9.99999993923e-09 23 | } 24 | } 25 | hparam { 26 | key: "batch_size" 27 | value { 28 | int64_value: 128 29 | } 30 | } 31 | hparam { 32 | key: "cell_clip" 33 | value { 34 | float_value: -1.0 35 | } 36 | } 37 | hparam { 38 | key: "cell_init_factor" 39 | value { 40 | float_value: 1.0 41 | } 42 | } 43 | hparam { 44 | key: "drop_learning_rate_at_the_latest" 45 | value { 46 | int64_value: 450 47 | } 48 | } 49 | hparam { 50 | key: "drop_learning_rate_multiplier" 51 | value { 52 | 
float_value: 0.10000000149 53 | } 54 | } 55 | hparam { 56 | key: "drop_learning_rate_rounds" 57 | value { 58 | int64_value: 13 59 | } 60 | } 61 | hparam { 62 | key: "drop_state_probability" 63 | value { 64 | float_value: 0.00999999977648 65 | } 66 | } 67 | hparam { 68 | key: "embed_once" 69 | value { 70 | bool_value: true 71 | } 72 | } 73 | hparam { 74 | key: "embedding_init_factor" 75 | value { 76 | float_value: 1.0 77 | } 78 | } 79 | hparam { 80 | key: "feature_mask" 81 | value { 82 | bool_value: false 83 | } 84 | } 85 | hparam { 86 | key: "forget_bias" 87 | value { 88 | float_value: 1.0 89 | } 90 | } 91 | hparam { 92 | key: "hidden_size" 93 | value { 94 | int64_value: 1192 95 | } 96 | } 97 | hparam { 98 | key: "input_dropout" 99 | value { 100 | float_value: 0.0335461571813 101 | } 102 | } 103 | hparam { 104 | key: "input_embedding_ratio" 105 | value { 106 | float_value: 1.0 107 | } 108 | } 109 | hparam { 110 | key: "input_embedding_size" 111 | value { 112 | int64_value: 1192 113 | } 114 | } 115 | hparam { 116 | key: "intra_layer_dropout" 117 | value { 118 | float_value: 0.0122289275751 119 | } 120 | } 121 | hparam { 122 | key: "layer_norm" 123 | value { 124 | bool_value: false 125 | } 126 | } 127 | hparam { 128 | key: "learning_rate" 129 | value { 130 | float_value: 0.00218322896399 131 | } 132 | } 133 | hparam { 134 | key: "learning_rate_decay" 135 | value { 136 | float_value: 1.0 137 | } 138 | } 139 | hparam { 140 | key: "learning_rate_decay_burn_in_steps" 141 | value { 142 | int64_value: 0 143 | } 144 | } 145 | hparam { 146 | key: "lstm_skip_connection" 147 | value { 148 | bool_value: true 149 | } 150 | } 151 | hparam { 152 | key: "max_grad_norm" 153 | value { 154 | float_value: 10.0 155 | } 156 | } 157 | hparam { 158 | key: "model" 159 | value { 160 | bytes_value: "lstm" 161 | } 162 | } 163 | hparam { 164 | key: "num_eval_samples" 165 | value { 166 | int64_value: 0 167 | } 168 | } 169 | hparam { 170 | key: "num_layers" 171 | value { 172 | int64_value: 4 173 | } 174 | } 175 | hparam { 176 | key: "num_params" 177 | value { 178 | int64_value: 46000000 179 | } 180 | } 181 | hparam { 182 | key: "optimizer_type" 183 | value { 184 | bytes_value: "rmsprop" 185 | } 186 | } 187 | hparam { 188 | key: "outer_steps" 189 | value { 190 | int64_value: 500 191 | } 192 | } 193 | hparam { 194 | key: "output_dropout" 195 | value { 196 | float_value: 0.279572278261 197 | } 198 | } 199 | hparam { 200 | key: "output_embedding_ratio" 201 | value { 202 | float_value: 1.0 203 | } 204 | } 205 | hparam { 206 | key: "output_embedding_size" 207 | value { 208 | int64_value: 1192 209 | } 210 | } 211 | hparam { 212 | key: "output_init_factor" 213 | value { 214 | float_value: 1.0 215 | } 216 | } 217 | hparam { 218 | key: "overlay_rank" 219 | value { 220 | int64_value: -1 221 | } 222 | } 223 | hparam { 224 | key: "rmsprop_beta2" 225 | value { 226 | float_value: 0.990000009537 227 | } 228 | } 229 | hparam { 230 | key: "rmsprop_epsilon" 231 | value { 232 | float_value: 9.99999974738e-06 233 | } 234 | } 235 | hparam { 236 | key: "share_input_and_output_embeddings" 237 | value { 238 | bool_value: false 239 | } 240 | } 241 | hparam { 242 | key: "sparsity_ratio" 243 | value { 244 | float_value: -1.0 245 | } 246 | } 247 | hparam { 248 | key: "state_dropout" 249 | value { 250 | float_value: 0.0622955262661 251 | } 252 | } 253 | hparam { 254 | key: "tie_forget_and_input_gates" 255 | value { 256 | bool_value: false 257 | } 258 | } 259 | hparam { 260 | key: "token_dropout" 261 | value { 262 | float_value: 0.0 263 | } 264 | } 
265 | hparam { 266 | key: "trainable_initial_state" 267 | value { 268 | bool_value: false 269 | } 270 | } 271 | hparam { 272 | key: "update_dropout" 273 | value { 274 | float_value: 0.0 275 | } 276 | } 277 | hparam { 278 | key: "vocab_size" 279 | value { 280 | int64_value: 206 281 | } 282 | } 283 | hparam { 284 | key: "weight_decay" 285 | value { 286 | float_value: 0.0 287 | } 288 | } 289 | hparam { 290 | key: "weight_penalty" 291 | value { 292 | float_value: 0.0 293 | } 294 | } 295 | -------------------------------------------------------------------------------- /lamb/experiment/on-the-state/ptb_10m_lstm_d1/hps_proto: -------------------------------------------------------------------------------- 1 | hparam { 2 | key: "activation_fn" 3 | value { 4 | bytes_value: "tf.tanh" 5 | } 6 | } 7 | hparam { 8 | key: "adam_beta1" 9 | value { 10 | float_value: 0.899999976158 11 | } 12 | } 13 | hparam { 14 | key: "adam_beta2" 15 | value { 16 | float_value: 0.999000012875 17 | } 18 | } 19 | hparam { 20 | key: "adam_epsilon" 21 | value { 22 | float_value: 9.99999993923e-09 23 | } 24 | } 25 | hparam { 26 | key: "batch_size" 27 | value { 28 | int64_value: 64 29 | } 30 | } 31 | hparam { 32 | key: "cell_clip" 33 | value { 34 | float_value: -1.0 35 | } 36 | } 37 | hparam { 38 | key: "cell_init_factor" 39 | value { 40 | float_value: 1.0 41 | } 42 | } 43 | hparam { 44 | key: "drop_learning_rate_at_the_latest" 45 | value { 46 | int64_value: 900 47 | } 48 | } 49 | hparam { 50 | key: "drop_learning_rate_multiplier" 51 | value { 52 | float_value: 0.10000000149 53 | } 54 | } 55 | hparam { 56 | key: "drop_learning_rate_rounds" 57 | value { 58 | int64_value: 26 59 | } 60 | } 61 | hparam { 62 | key: "drop_state_probability" 63 | value { 64 | float_value: 0.00999999977648 65 | } 66 | } 67 | hparam { 68 | key: "embed_once" 69 | value { 70 | bool_value: true 71 | } 72 | } 73 | hparam { 74 | key: "embedding_init_factor" 75 | value { 76 | float_value: 1.0 77 | } 78 | } 79 | hparam { 80 | key: "feature_mask" 81 | value { 82 | bool_value: false 83 | } 84 | } 85 | hparam { 86 | key: "forget_bias" 87 | value { 88 | float_value: 1.0 89 | } 90 | } 91 | hparam { 92 | key: "hidden_size" 93 | value { 94 | int64_value: 1194 95 | } 96 | } 97 | hparam { 98 | key: "input_dropout" 99 | value { 100 | float_value: 0.579891622066 101 | } 102 | } 103 | hparam { 104 | key: "input_embedding_ratio" 105 | value { 106 | float_value: 0.224374398589 107 | } 108 | } 109 | hparam { 110 | key: "input_embedding_size" 111 | value { 112 | int64_value: 268 113 | } 114 | } 115 | hparam { 116 | key: "intra_layer_dropout" 117 | value { 118 | float_value: 0.873659133911 119 | } 120 | } 121 | hparam { 122 | key: "layer_norm" 123 | value { 124 | bool_value: false 125 | } 126 | } 127 | hparam { 128 | key: "learning_rate" 129 | value { 130 | float_value: 0.00417865626514 131 | } 132 | } 133 | hparam { 134 | key: "learning_rate_decay" 135 | value { 136 | float_value: 1.0 137 | } 138 | } 139 | hparam { 140 | key: "learning_rate_decay_burn_in_steps" 141 | value { 142 | int64_value: 0 143 | } 144 | } 145 | hparam { 146 | key: "lstm_skip_connection" 147 | value { 148 | bool_value: true 149 | } 150 | } 151 | hparam { 152 | key: "max_grad_norm" 153 | value { 154 | float_value: 10.0 155 | } 156 | } 157 | hparam { 158 | key: "model" 159 | value { 160 | bytes_value: "lstm" 161 | } 162 | } 163 | hparam { 164 | key: "num_eval_samples" 165 | value { 166 | int64_value: 0 167 | } 168 | } 169 | hparam { 170 | key: "num_layers" 171 | value { 172 | int64_value: 1 173 | } 174 | } 
175 | hparam { 176 | key: "num_params" 177 | value { 178 | int64_value: 10000000 179 | } 180 | } 181 | hparam { 182 | key: "optimizer_type" 183 | value { 184 | bytes_value: "rmsprop" 185 | } 186 | } 187 | hparam { 188 | key: "outer_steps" 189 | value { 190 | int64_value: 1000 191 | } 192 | } 193 | hparam { 194 | key: "output_dropout" 195 | value { 196 | float_value: 0.327008873224 197 | } 198 | } 199 | hparam { 200 | key: "output_embedding_ratio" 201 | value { 202 | float_value: 0.224374398589 203 | } 204 | } 205 | hparam { 206 | key: "output_embedding_size" 207 | value { 208 | int64_value: 268 209 | } 210 | } 211 | hparam { 212 | key: "output_init_factor" 213 | value { 214 | float_value: 1.0 215 | } 216 | } 217 | hparam { 218 | key: "overlay_rank" 219 | value { 220 | int64_value: -1 221 | } 222 | } 223 | hparam { 224 | key: "rmsprop_beta2" 225 | value { 226 | float_value: 0.999000012875 227 | } 228 | } 229 | hparam { 230 | key: "rmsprop_epsilon" 231 | value { 232 | float_value: 9.99999993923e-09 233 | } 234 | } 235 | hparam { 236 | key: "share_input_and_output_embeddings" 237 | value { 238 | bool_value: true 239 | } 240 | } 241 | hparam { 242 | key: "sparsity_ratio" 243 | value { 244 | float_value: -1.0 245 | } 246 | } 247 | hparam { 248 | key: "state_dropout" 249 | value { 250 | float_value: 0.215256482363 251 | } 252 | } 253 | hparam { 254 | key: "tie_forget_and_input_gates" 255 | value { 256 | bool_value: false 257 | } 258 | } 259 | hparam { 260 | key: "token_dropout" 261 | value { 262 | float_value: 0.0 263 | } 264 | } 265 | hparam { 266 | key: "trainable_initial_state" 267 | value { 268 | bool_value: false 269 | } 270 | } 271 | hparam { 272 | key: "update_dropout" 273 | value { 274 | float_value: 0.0 275 | } 276 | } 277 | hparam { 278 | key: "vocab_size" 279 | value { 280 | int64_value: 10001 281 | } 282 | } 283 | hparam { 284 | key: "weight_decay" 285 | value { 286 | float_value: 0.000124350262922 287 | } 288 | } 289 | hparam { 290 | key: "weight_penalty" 291 | value { 292 | float_value: 0.0 293 | } 294 | } 295 | -------------------------------------------------------------------------------- /lamb/experiment/on-the-state/ptb_24m_lstm_d4/hps_proto: -------------------------------------------------------------------------------- 1 | hparam { 2 | key: "activation_fn" 3 | value { 4 | bytes_value: "tf.tanh" 5 | } 6 | } 7 | hparam { 8 | key: "adam_beta1" 9 | value { 10 | float_value: 0.899999976158 11 | } 12 | } 13 | hparam { 14 | key: "adam_beta2" 15 | value { 16 | float_value: 0.999000012875 17 | } 18 | } 19 | hparam { 20 | key: "adam_epsilon" 21 | value { 22 | float_value: 9.99999993923e-09 23 | } 24 | } 25 | hparam { 26 | key: "batch_size" 27 | value { 28 | int64_value: 64 29 | } 30 | } 31 | hparam { 32 | key: "cell_clip" 33 | value { 34 | float_value: -1.0 35 | } 36 | } 37 | hparam { 38 | key: "cell_init_factor" 39 | value { 40 | float_value: 1.0 41 | } 42 | } 43 | hparam { 44 | key: "drop_learning_rate_at_the_latest" 45 | value { 46 | int64_value: 900 47 | } 48 | } 49 | hparam { 50 | key: "drop_learning_rate_multiplier" 51 | value { 52 | float_value: 0.10000000149 53 | } 54 | } 55 | hparam { 56 | key: "drop_learning_rate_rounds" 57 | value { 58 | int64_value: 26 59 | } 60 | } 61 | hparam { 62 | key: "drop_state_probability" 63 | value { 64 | float_value: 0.00999999977648 65 | } 66 | } 67 | hparam { 68 | key: "embed_once" 69 | value { 70 | bool_value: true 71 | } 72 | } 73 | hparam { 74 | key: "embedding_init_factor" 75 | value { 76 | float_value: 1.0 77 | } 78 | } 79 | hparam { 80 
| key: "feature_mask" 81 | value { 82 | bool_value: false 83 | } 84 | } 85 | hparam { 86 | key: "forget_bias" 87 | value { 88 | float_value: 1.0 89 | } 90 | } 91 | hparam { 92 | key: "hidden_size" 93 | value { 94 | int64_value: 723 95 | } 96 | } 97 | hparam { 98 | key: "input_dropout" 99 | value { 100 | float_value: 0.633642196655 101 | } 102 | } 103 | hparam { 104 | key: "input_embedding_ratio" 105 | value { 106 | float_value: 1.0 107 | } 108 | } 109 | hparam { 110 | key: "input_embedding_size" 111 | value { 112 | int64_value: 723 113 | } 114 | } 115 | hparam { 116 | key: "intra_layer_dropout" 117 | value { 118 | float_value: 0.309127420187 119 | } 120 | } 121 | hparam { 122 | key: "layer_norm" 123 | value { 124 | bool_value: false 125 | } 126 | } 127 | hparam { 128 | key: "learning_rate" 129 | value { 130 | float_value: 0.00396024715155 131 | } 132 | } 133 | hparam { 134 | key: "learning_rate_decay" 135 | value { 136 | float_value: 1.0 137 | } 138 | } 139 | hparam { 140 | key: "learning_rate_decay_burn_in_steps" 141 | value { 142 | int64_value: 0 143 | } 144 | } 145 | hparam { 146 | key: "lstm_skip_connection" 147 | value { 148 | bool_value: true 149 | } 150 | } 151 | hparam { 152 | key: "max_grad_norm" 153 | value { 154 | float_value: 10.0 155 | } 156 | } 157 | hparam { 158 | key: "model" 159 | value { 160 | bytes_value: "lstm" 161 | } 162 | } 163 | hparam { 164 | key: "num_eval_samples" 165 | value { 166 | int64_value: 0 167 | } 168 | } 169 | hparam { 170 | key: "num_layers" 171 | value { 172 | int64_value: 4 173 | } 174 | } 175 | hparam { 176 | key: "num_params" 177 | value { 178 | int64_value: 24000000 179 | } 180 | } 181 | hparam { 182 | key: "optimizer_type" 183 | value { 184 | bytes_value: "rmsprop" 185 | } 186 | } 187 | hparam { 188 | key: "outer_steps" 189 | value { 190 | int64_value: 1000 191 | } 192 | } 193 | hparam { 194 | key: "output_dropout" 195 | value { 196 | float_value: 0.700856506824 197 | } 198 | } 199 | hparam { 200 | key: "output_embedding_ratio" 201 | value { 202 | float_value: 1.0 203 | } 204 | } 205 | hparam { 206 | key: "output_embedding_size" 207 | value { 208 | int64_value: 723 209 | } 210 | } 211 | hparam { 212 | key: "output_init_factor" 213 | value { 214 | float_value: 1.0 215 | } 216 | } 217 | hparam { 218 | key: "overlay_rank" 219 | value { 220 | int64_value: -1 221 | } 222 | } 223 | hparam { 224 | key: "rmsprop_beta2" 225 | value { 226 | float_value: 0.999000012875 227 | } 228 | } 229 | hparam { 230 | key: "rmsprop_epsilon" 231 | value { 232 | float_value: 9.99999993923e-09 233 | } 234 | } 235 | hparam { 236 | key: "share_input_and_output_embeddings" 237 | value { 238 | bool_value: true 239 | } 240 | } 241 | hparam { 242 | key: "sparsity_ratio" 243 | value { 244 | float_value: -1.0 245 | } 246 | } 247 | hparam { 248 | key: "state_dropout" 249 | value { 250 | float_value: 0.64275187254 251 | } 252 | } 253 | hparam { 254 | key: "tie_forget_and_input_gates" 255 | value { 256 | bool_value: false 257 | } 258 | } 259 | hparam { 260 | key: "token_dropout" 261 | value { 262 | float_value: 0.0 263 | } 264 | } 265 | hparam { 266 | key: "trainable_initial_state" 267 | value { 268 | bool_value: false 269 | } 270 | } 271 | hparam { 272 | key: "update_dropout" 273 | value { 274 | float_value: 0.0 275 | } 276 | } 277 | hparam { 278 | key: "vocab_size" 279 | value { 280 | int64_value: 10001 281 | } 282 | } 283 | hparam { 284 | key: "weight_decay" 285 | value { 286 | float_value: 7.44869103073e-05 287 | } 288 | } 289 | hparam { 290 | key: "weight_penalty" 291 | value { 
292 | float_value: 0.0 293 | } 294 | } 295 | -------------------------------------------------------------------------------- /lamb/experiment/on-the-state/train_enwik8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | source_lib "config/enwik8_char.sh" 24 | 25 | hps_proto_file="$2" 26 | name="$(basename "$(dirname "$2")")" 27 | 28 | source_lib "run.sh" "$1" 29 | -------------------------------------------------------------------------------- /lamb/experiment/on-the-state/train_ptb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | source_lib "config/ptb_word_rmsprop.sh" 24 | 25 | hps_proto_file="$2" 26 | name="$(basename "$(dirname "$2")")" 27 | 28 | source_lib "run.sh" "$1" 29 | -------------------------------------------------------------------------------- /lamb/experiment/on-the-state/train_wikitext-2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | source_lib "config/wikitext-2_word.sh" 24 | 25 | hps_proto_file="$2" 26 | name="$(basename "$(dirname "$2")")" 27 | 28 | source_lib "run.sh" "$1" 29 | -------------------------------------------------------------------------------- /lamb/experiment/on-the-state/wikitext-2_24m_lstm_d2/hps_proto: -------------------------------------------------------------------------------- 1 | hparam { 2 | key: "activation_fn" 3 | value { 4 | bytes_value: "tf.tanh" 5 | } 6 | } 7 | hparam { 8 | key: "adam_beta1" 9 | value { 10 | float_value: 0.899999976158 11 | } 12 | } 13 | hparam { 14 | key: "adam_beta2" 15 | value { 16 | float_value: 0.999000012875 17 | } 18 | } 19 | hparam { 20 | key: "adam_epsilon" 21 | value { 22 | float_value: 9.99999993923e-09 23 | } 24 | } 25 | hparam { 26 | key: "batch_size" 27 | value { 28 | int64_value: 64 29 | } 30 | } 31 | hparam { 32 | key: "cap_input_gate" 33 | value { 34 | bool_value: true 35 | } 36 | } 37 | hparam { 38 | key: "cell_clip" 39 | value { 40 | float_value: -1.0 41 | } 42 | } 43 | hparam { 44 | key: "cell_init_factor" 45 | value { 46 | float_value: 1.0 47 | } 48 | } 49 | hparam { 50 | key: "drop_learning_rate_at_the_latest" 51 | value { 52 | int64_value: 900 53 | } 54 | } 55 | hparam { 56 | key: "drop_learning_rate_multiplier" 57 | value { 58 | float_value: 0.10000000149 59 | } 60 | } 61 | hparam { 62 | key: "drop_learning_rate_rounds" 63 | value { 64 | int64_value: 26 65 | } 66 | } 67 | hparam { 68 | key: "drop_state_probability" 69 | value { 70 | float_value: 0.00999999977648 71 | } 72 | } 73 | hparam { 74 | key: "embed_once" 75 | value { 76 | bool_value: true 77 | } 78 | } 79 | hparam { 80 | key: "embedding_init_factor" 81 | value { 82 | float_value: 1.0 83 | } 84 | } 85 | hparam { 86 | key: "feature_mask" 87 | value { 88 | bool_value: false 89 | } 90 | } 91 | hparam { 92 | key: "forget_bias" 93 | value { 94 | float_value: 1.0 95 | } 96 | } 97 | hparam { 98 | key: "hidden_size" 99 | value { 100 | int64_value: 1227 101 | } 102 | } 103 | hparam { 104 | key: "input_dropout" 105 | value { 106 | float_value: 0.484243571758 107 | } 108 | } 109 | hparam { 110 | key: "input_embedding_ratio" 111 | value { 112 | float_value: 0.121501773596 113 | } 114 | } 115 | hparam { 116 | key: "input_embedding_size" 117 | value { 118 | int64_value: 149 119 | } 120 | } 121 | hparam { 122 | key: "intra_layer_dropout" 123 | value { 124 | float_value: 0.0920244976878 125 | } 126 | } 127 | hparam { 128 | key: "layer_norm" 129 | value { 130 | bool_value: false 131 | } 132 | } 133 | hparam { 134 | key: "learning_rate" 135 | value { 136 | float_value: 0.00246041407809 137 | } 138 | } 139 | hparam { 140 | key: "learning_rate_decay" 141 | value { 142 | float_value: 1.0 143 | } 144 | } 145 | hparam { 146 | key: "learning_rate_decay_burn_in_steps" 147 | value { 148 | int64_value: 0 149 | } 150 | } 151 | hparam { 152 | key: "lstm_skip_connection" 153 | value { 154 | bool_value: true 155 | } 156 | } 157 | hparam { 158 | key: "max_grad_norm" 159 | value { 160 | float_value: 10.0 161 | } 162 | } 163 | hparam { 164 | key: "model" 165 | value { 166 | bytes_value: "lstm" 167 | } 168 | } 169 | hparam { 170 | key: "num_eval_samples" 171 | value { 172 | int64_value: 0 173 | } 174 | } 175 | hparam { 176 | key: "num_layers" 177 | value { 178 | int64_value: 2 179 | } 180 | } 
181 | hparam { 182 | key: "num_params" 183 | value { 184 | int64_value: 24000000 185 | } 186 | } 187 | hparam { 188 | key: "optimizer_type" 189 | value { 190 | bytes_value: "rmsprop" 191 | } 192 | } 193 | hparam { 194 | key: "outer_steps" 195 | value { 196 | int64_value: 1000 197 | } 198 | } 199 | hparam { 200 | key: "output_dropout" 201 | value { 202 | float_value: 0.391492575407 203 | } 204 | } 205 | hparam { 206 | key: "output_embedding_ratio" 207 | value { 208 | float_value: 0.121501773596 209 | } 210 | } 211 | hparam { 212 | key: "output_embedding_size" 213 | value { 214 | int64_value: 149 215 | } 216 | } 217 | hparam { 218 | key: "output_init_factor" 219 | value { 220 | float_value: 1.0 221 | } 222 | } 223 | hparam { 224 | key: "overlay_rank" 225 | value { 226 | int64_value: -1 227 | } 228 | } 229 | hparam { 230 | key: "rmsprop_beta2" 231 | value { 232 | float_value: 0.999000012875 233 | } 234 | } 235 | hparam { 236 | key: "rmsprop_epsilon" 237 | value { 238 | float_value: 9.99999993923e-09 239 | } 240 | } 241 | hparam { 242 | key: "share_input_and_output_embeddings" 243 | value { 244 | bool_value: true 245 | } 246 | } 247 | hparam { 248 | key: "sparsity_ratio" 249 | value { 250 | float_value: -1.0 251 | } 252 | } 253 | hparam { 254 | key: "state_dropout" 255 | value { 256 | float_value: 0.453888505697 257 | } 258 | } 259 | hparam { 260 | key: "tie_forget_and_input_gates" 261 | value { 262 | bool_value: false 263 | } 264 | } 265 | hparam { 266 | key: "token_dropout" 267 | value { 268 | float_value: 0.0 269 | } 270 | } 271 | hparam { 272 | key: "trainable_initial_state" 273 | value { 274 | bool_value: false 275 | } 276 | } 277 | hparam { 278 | key: "update_dropout" 279 | value { 280 | float_value: 0.0 281 | } 282 | } 283 | hparam { 284 | key: "vocab_size" 285 | value { 286 | int64_value: 33279 287 | } 288 | } 289 | hparam { 290 | key: "weight_decay" 291 | value { 292 | float_value: 3.77565629606e-05 293 | } 294 | } 295 | hparam { 296 | key: "weight_penalty" 297 | value { 298 | float_value: 0.0 299 | } 300 | } 301 | -------------------------------------------------------------------------------- /lamb/experiment/pushing-the-bounds/README.md: -------------------------------------------------------------------------------- 1 | This directory accompanies the [Pushing the bounds of 2 | dropout](https://arxiv.org/abs/1805.09208) paper. 3 | 4 | The paper is mostly about how to make predictions with a model trained with 5 | dropout. Use any saved model, such as those trained in `../on-the-state/`, and 6 | evaluate it with `./test.sh` (in this dir). One difference from `../test.sh` is 7 | that `./test.sh` tunes the optimal evaluation softmax temperature on the 8 | validation set (between 0.8 and 1.0): 9 | 10 | eval_softmax_temperature=-0.8 11 | 12 | Also, in addition to deterministic (or 'standard') dropout, it does MC dropout 13 | (the arithmetic averaged variant) with various `eval_dropout_multiplier`s. See 14 | the linked paper for details. 15 | 16 | So, assuming there is a saved model in `/tmp/lamb/ptb_10m_lstm_d1/`, test it 17 | with: 18 | 19 | ./test.sh run some-descriptive-name /tmp/lamb/ptb_10m_lstm_d1/ 20 | 21 | Thus the model will be evaluated more than once. In the output, the line with 22 | `final test_det_t0.9 xe:` has the test cross-entropy at the optimal softmax 23 | temperature (in this case 0.9). Similarly, `final test_mca_d0.8_t0.9 xe:` 24 | corresponds to the test cross-entropy with `eval_dropout_multiplier=0.8` and 25 | softmax temperature 0.9.
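
For intuition, here is a minimal sketch, not part of this codebase, of what the
arithmetic-averaged MC dropout evaluation with a softmax temperature computes.
The `predict_logits` callable is a hypothetical stand-in for a forward pass
that keeps dropout active at evaluation time:

    import numpy as np

    def softmax(logits, temperature=1.0):
        # Temperatures below 1.0 sharpen the predicted distribution.
        z = logits / temperature
        z = z - z.max(axis=-1, keepdims=True)
        e = np.exp(z)
        return e / e.sum(axis=-1, keepdims=True)

    def mc_arithmetic(predict_logits, inputs, num_samples=200, temperature=0.9):
        # Average the predicted probabilities (not the logits) over
        # independently sampled dropout masks. The reported cross-entropy is
        # then the negative log of these averaged probabilities at the target
        # tokens.
        probs = [softmax(predict_logits(inputs), temperature)
                 for _ in range(num_samples)]
        return np.mean(probs, axis=0)

Here `num_samples=200` mirrors the `num_eval_samples=200` setting in `test.sh`.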
26 | -------------------------------------------------------------------------------- /lamb/experiment/pushing-the-bounds/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | 24 | saved_args="$1" 25 | 26 | save_checkpoints=false 27 | turns=0 28 | min_non_episodic_eval_examples_per_stripe=500000 29 | eval_on_test=true 30 | 31 | test_one() { 32 | local suffix="$1" 33 | local experiment_dir="$2" 34 | local name="$(default_name)_${suffix}" 35 | local config_file="${experiment_dir}/config" 36 | local load_checkpoint="${experiment_dir}/best" 37 | source_lib "run.sh" "${saved_args}" 38 | } 39 | 40 | name="$2" 41 | experiment_dir="$3" 42 | 43 | eval_softmax_temperature=-0.8 44 | 45 | eval_method="deterministic" 46 | test_one "det" "${experiment_dir}" 47 | 48 | eval_method="arithmetic" 49 | num_eval_samples=200 50 | eval_dropout_multiplier=0.8 51 | test_one "amc$eval_dropout_multiplier" "${experiment_dir}" 52 | -------------------------------------------------------------------------------- /lamb/experiment/rerun.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | 24 | cmd="$1" 25 | 26 | run_one() { 27 | local name="$(default_name)_$1" 28 | local config_file="$2/config" 29 | source_lib "run.sh" "${cmd}" 30 | } 31 | 32 | run_one "$2" "$3" 33 | -------------------------------------------------------------------------------- /lamb/experiment/rerun_old.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | 24 | cmd="$1" 25 | 26 | run_one() { 27 | local name="$(default_name)_$1" 28 | local flags_as_dict="$2/args" 29 | local hps_proto_file="$2/config" 30 | source_lib "run.sh" "${cmd}" 31 | } 32 | 33 | run_one "$2" "$3" 34 | -------------------------------------------------------------------------------- /lamb/experiment/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | 24 | cmd="$1" 25 | load_checkpoint="$3/best" 26 | config_file="${4:-$3}/config" 27 | 28 | save_checkpoints=false 29 | turns=0 30 | min_non_episodic_eval_examples_per_stripe=500000 31 | 32 | test_one() { 33 | local name="$1" 34 | source_lib "run.sh" "${cmd}" 35 | } 36 | 37 | cell="lu" 38 | gpu_type="v100" 39 | 40 | eval_on_test=false 41 | 42 | eval_method="deterministic" 43 | test_one "$2_det" 44 | 45 | # MC dropout evaluation can be a bit better, but it's very slow. 46 | eval_method="arithmetic" 47 | num_eval_samples=200 48 | eval_dropout_multiplier=0.6 49 | test_one "$2_amc$eval_dropout_multiplier" 50 | eval_dropout_multiplier=0.7 51 | test_one "$2_amc$eval_dropout_multiplier" 52 | eval_dropout_multiplier=0.8 53 | test_one "$2_amc$eval_dropout_multiplier" 54 | eval_dropout_multiplier=0.9 55 | test_one "$2_amc$eval_dropout_multiplier" 56 | 57 | eval_on_test=true 58 | max_eval_eval_batches=1 59 | 60 | eval_method="deterministic" 61 | test_one "$2_test_det" 62 | 63 | # MC dropout evaluation can be a bit better, but it's very slow. 
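# In the sweeps below, 'arithmetic' averages the predicted probabilities over
# num_eval_samples dropout samples; eval_dropout_multiplier presumably rescales
# the dropout used at evaluation time (see pushing-the-bounds/README.md and the
# linked paper for details).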
64 | eval_method="arithmetic" 65 | num_eval_samples=200 66 | eval_dropout_multiplier=0.6 67 | test_one "$2_test_amc$eval_dropout_multiplier" 68 | eval_dropout_multiplier=0.7 69 | test_one "$2_test_amc$eval_dropout_multiplier" 70 | eval_dropout_multiplier=0.8 71 | test_one "$2_test_amc$eval_dropout_multiplier" 72 | eval_dropout_multiplier=0.9 73 | test_one "$2_test_amc$eval_dropout_multiplier" 74 | -------------------------------------------------------------------------------- /lamb/experiment/train_ptb_10m_lstm_d1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | source_lib "config/ptb_word_slow.sh" 24 | 25 | # Model hyperparameters 26 | 27 | model="lstm" 28 | num_params=$(million 10) 29 | share_input_and_output_embeddings=true 30 | tie_forget_and_input_gates=false 31 | cap_input_gate=true 32 | forget_bias=1.0 33 | num_layers=1 34 | 35 | # Tuned hyperparameters 36 | 37 | learning_rate=0.0048308 38 | l2_penalty=0.00007676 39 | input_dropout=0.51551 40 | inter_layer_dropout= 41 | state_dropout=0.18417 42 | output_dropout=0.33801 43 | input_embedding_ratio=0.22973 44 | 45 | # Evaluation hyperparameters 46 | 47 | eval_softmax_temperature=-0.8 48 | 49 | source_lib "run.sh" "$@" 50 | -------------------------------------------------------------------------------- /lamb/experiment/train_ptb_24m_lstm_d4.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ============================================================================ 16 | 17 | 18 | set -e 19 | 20 | source "$(dirname $0)/../lib/setup.sh" 21 | source_lib "config/common.sh" 22 | source_lib "config/running.sh" 23 | source_lib "config/ptb_word_slow.sh" 24 | 25 | # Model hyperparameters 26 | 27 | model="lstm" 28 | num_params=$(million 24) 29 | share_input_and_output_embeddings=true 30 | tie_forget_and_input_gates=false 31 | cap_input_gate=true 32 | forget_bias=1.0 33 | num_layers=4 34 | 35 | # Tuned hyperparameters 36 | 37 | learning_rate=0.0033390 38 | l2_penalty=0.000093711 39 | input_dropout=0.68697 40 | inter_layer_dropout=0.31323 41 | state_dropout=0.48479 42 | output_dropout=0.69626 43 | 44 | # Evaluation hyperparameters 45 | 46 | eval_softmax_temperature=-0.8 47 | 48 | source_lib "run.sh" "$@" 49 | -------------------------------------------------------------------------------- /lamb/experiment/tune_ptb_10m.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is for 19 | # illustration only. 20 | 21 | set -e 22 | 23 | # Include definitions of dataset and tuning related variables. 24 | source "$(dirname $0)/../lib/setup.sh" 25 | source_lib "config/common.sh" 26 | source_lib "config/tuning.sh" 27 | source_lib "config/ptb_word_rmsprop.sh" 28 | 29 | # Model hyperparameters 30 | 31 | num_params=$(million 10) 32 | share_input_and_output_embeddings=true 33 | 34 | # Evaluation hyperparameters 35 | 36 | eval_softmax_temperature=-0.8 37 | 38 | # Tuning parameters 39 | 40 | num_workers=60 41 | 42 | # Start a number of tuning studies, setting model specific parameters. 43 | 44 | model="lstm" 45 | tie_forget_and_input_gates=false 46 | forget_bias=1.0 47 | num_layers=1 48 | 49 | tuneables="learning_rate,l2_penalty, 50 | input_dropout,inter_layer_dropout,state_dropout, 51 | output_dropout,input_embedding_ratio" 52 | name="$(default_name)_${model}_d${num_layers}" 53 | source_lib "run.sh" "$@" 54 | -------------------------------------------------------------------------------- /lamb/lib/config/README.md: -------------------------------------------------------------------------------- 1 | Shell scripts here set up variables for datasets and all kinds of arguments to 2 | the main binary. They are intended to be sourced and can source other scripts. 3 | -------------------------------------------------------------------------------- /lamb/lib/config/common.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | logtostderr=true 17 | 18 | default_name() { 19 | if which git > /dev/null 2>&1; then 20 | echo "$(git rev-parse --short HEAD)_$(basename $0 .sh)" 21 | else 22 | echo "$(basename $0 .sh)" 23 | fi 24 | } 25 | 26 | name="$(default_name)" 27 | 28 | million() { 29 | echo $(($1 * 1000 * 1000)) 30 | } 31 | -------------------------------------------------------------------------------- /lamb/lib/config/copy.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | copy_data_dir=${copy_data_dir:-"${HOME}/data/copy/"} 17 | training_file="${copy_data_dir}copy-training.txt" 18 | validation_file="${copy_data_dir}copy-valid.txt" 19 | test_file="${copy_data_dir}copy-test.txt" 20 | word_based=true 21 | episodic=true 22 | conditioning_separator="|" 23 | -------------------------------------------------------------------------------- /lamb/lib/config/enwik8.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================ 15 | 16 | enwik8_data_dir=${enwik8_data_dir:-"${HOME}/data/enwik8/"} 17 | training_file="${enwik8_data_dir}enwik8-training.txt" 18 | validation_file="${enwik8_data_dir}enwik8-valid.txt" 19 | test_file="${enwik8_data_dir}enwik8-test.txt" 20 | file_encoding="CP437" 21 | word_based=false 22 | -------------------------------------------------------------------------------- /lamb/lib/config/enwik8_char.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | source_lib "config/common.sh" 17 | source_lib "config/enwik8.sh" 18 | # While utf-8 is the actual encoding, for character based modelling 19 | # the literature seems to have settled on bytes as evidenced by 20 | # mentions of a vocabulary size of 205 (it is more than 5000 with 21 | # utf-8). 22 | file_encoding="CP437" 23 | word_based=false 24 | episodic=false 25 | max_time_steps=50 26 | # 400*500=200k optimization steps. With batch size 128 and max_time_steps 27 | # 50, for example, that's about 14 epochs. 28 | steps_per_turn=400 29 | turns=500 30 | print_training_stats_every_num_steps=100 31 | early_stopping_turns=15 32 | early_stopping_rampup_turns=30 33 | early_stopping_worst_xe_target=1.05,0.93,0.92 34 | drop_learning_rate_turns=13 35 | drop_learning_rate_multiplier=0.1 36 | drop_learning_rate_at_the_latest=450 37 | drop_state_probability=0.01 38 | max_eval_eval_batches=500 39 | -------------------------------------------------------------------------------- /lamb/lib/config/enwik8_char_rmsprop.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================ 15 | 16 | source_lib "config/enwik8_char.sh" 17 | optimizer_type=rmsprop 18 | rmsprop_beta2=0.99 19 | rmsprop_epsilon=1e-5 20 | batch_size=128 21 | max_grad_norm=10.0 22 | -------------------------------------------------------------------------------- /lamb/lib/config/mwc.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | mwc_data_dir=${mwc_data_dir:-"${HOME}/data/mwc/"} 17 | file_encoding="utf-8" 18 | word_based=false 19 | -------------------------------------------------------------------------------- /lamb/lib/config/ptb.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | ptb_data_dir=${ptb_data_dir:-"${HOME}/data/ptb/"} 17 | training_file="${ptb_data_dir}ptb.train.txt" 18 | validation_file="${ptb_data_dir}ptb.valid.txt" 19 | test_file="${ptb_data_dir}ptb.test.txt" 20 | -------------------------------------------------------------------------------- /lamb/lib/config/ptb_char.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================ 15 | 16 | ptb_data_dir=${ptb_data_dir:-"${HOME}/data/ptb/"} 17 | training_file="${ptb_data_dir}ptb.char.train.txt" 18 | validation_file="${ptb_data_dir}ptb.char.valid.txt" 19 | test_file="${ptb_data_dir}ptb.char.test.txt" 20 | # There are spaces between characters. 21 | word_based=true 22 | -------------------------------------------------------------------------------- /lamb/lib/config/ptb_word.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | source_lib "config/common.sh" 17 | source_lib "config/ptb.sh" 18 | word_based=true 19 | episodic=false 20 | -------------------------------------------------------------------------------- /lamb/lib/config/ptb_word_rmsprop.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | source_lib "config/ptb_word.sh" 17 | 18 | optimizer_type=rmsprop 19 | batch_size=64 20 | max_grad_norm=10.0 21 | max_time_steps=35 22 | 23 | steps_per_turn=100 24 | turns=1000 25 | print_training_stats_every_num_steps=100 26 | 27 | early_stopping_turns=30 28 | early_stopping_rampup_turns=60 29 | early_stopping_worst_xe_target=4.4,4.2 30 | 31 | drop_learning_rate_turns=26 32 | drop_learning_rate_multiplier=0.1 33 | drop_learning_rate_at_the_latest=900 34 | drop_state_probability=0.01 35 | -------------------------------------------------------------------------------- /lamb/lib/config/ptb_word_slow.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | source_lib "config/common.sh" 17 | source_lib "config/ptb_word_rmsprop.sh" 18 | episodic=false 19 | max_time_steps=35 20 | steps_per_turn=100 21 | turns=2500 22 | print_training_stats_every_num_steps=100 23 | early_stopping_turns=100 24 | early_stopping_rampup_turns=200 25 | early_stopping_worst_xe_target=4.4,4.2 26 | drop_learning_rate_turns=90 27 | drop_learning_rate_multiplier=0.1 28 | drop_learning_rate_at_the_latest=2000 29 | drop_state_probability=0.01 30 | -------------------------------------------------------------------------------- /lamb/lib/config/running.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | # Just a placeholder for now. Don't remove it though as it is necessary for the 17 | # source_lib override mechanism. 18 | -------------------------------------------------------------------------------- /lamb/lib/config/tuning.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | # Just a placeholder for now. Don't remove it though as it is necessary for the 17 | # source_lib override mechanism. 18 | -------------------------------------------------------------------------------- /lamb/lib/config/wikitext-103.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | wikitext_103_data_dir=${wikitext_103_data_dir:-"${HOME}/data/wikitext-103/"} 17 | training_file="${wikitext_103_data_dir}wiki.train.tokens" 18 | validation_file="${wikitext_103_data_dir}wiki.valid.tokens" 19 | test_file="${wikitext_103_data_dir}wiki.test.tokens" 20 | -------------------------------------------------------------------------------- /lamb/lib/config/wikitext-103_word.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | source_lib "config/common.sh" 17 | source_lib "config/wikitext-103.sh" 18 | word_based=true 19 | episodic=false 20 | max_time_steps=35 21 | steps_per_turn=1000 22 | turns=1000 23 | print_training_stats_every_num_steps=1000 24 | early_stopping_turns=30 25 | early_stopping_rampup_turns=60 26 | early_stopping_worst_xe_target=3.5,3.3 27 | drop_learning_rate_turns=26 28 | drop_learning_rate_multiplier=0.1 29 | drop_learning_rate_at_the_latest=900 30 | drop_state_probability=0.01 31 | -------------------------------------------------------------------------------- /lamb/lib/config/wikitext-103_word_rmsprop.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================ 15 | 16 | source_lib "config/wikitext-103_word.sh" 17 | optimizer_type=rmsprop 18 | batch_size=64 19 | max_grad_norm=10.0 20 | -------------------------------------------------------------------------------- /lamb/lib/config/wikitext-2.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | wikitext_2_data_dir=${wikitext_2_data_dir:-"${HOME}/data/wikitext-2/"} 17 | training_file="${wikitext_2_data_dir}wiki.train.tokens" 18 | validation_file="${wikitext_2_data_dir}wiki.valid.tokens" 19 | test_file="${wikitext_2_data_dir}wiki.test.tokens" 20 | -------------------------------------------------------------------------------- /lamb/lib/config/wikitext-2_word.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | source_lib "config/common.sh" 17 | source_lib "config/wikitext-2.sh" 18 | word_based=true 19 | episodic=false 20 | # max_time_steps=35 21 | # steps_per_turn=200 22 | # turns=1000 23 | # print_training_stats_every_num_steps=100 24 | # early_stopping_turns=30 25 | # early_stopping_rampup_turns=60 26 | # early_stopping_worst_xe_target=4.9,4.5 27 | # drop_learning_rate_turns=26 28 | # drop_learning_rate_multiplier=0.1 29 | # drop_learning_rate_at_the_latest=900 30 | # drop_state_probability=0.01 31 | -------------------------------------------------------------------------------- /lamb/lib/config/wikitext-2_word_rmsprop.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 | 
16 | source_lib "config/wikitext-2_word.sh"
17 | optimizer_type=rmsprop
18 | batch_size=64
19 | max_grad_norm=10.0
20 | 
-------------------------------------------------------------------------------- /lamb/lib/describe_version.sh: --------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 | 
16 | # We want to know what code was run for an experiment. This prints the git
17 | # version, status and the non-committed diffs, if any.
18 | 
19 | echo "$(date): Invoking LAMB."
20 | if (which git && git rev-parse --is-inside-work-tree) > /dev/null 2>&1; then
21 |   echo "git version: $(git rev-parse --short HEAD)"
22 |   git --no-pager status
23 |   git --no-pager diff
24 |   git --no-pager diff --cached
25 | fi
26 | 
-------------------------------------------------------------------------------- /lamb/lib/run.sh: --------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 | 
16 | # This script runs LAMB.
17 | #
18 | # Usage
19 | # -----
20 | #
21 | # See experiment/*.sh for examples.
22 | #
23 | # Assign values to shell variables named after the hyperparameters and command
24 | # line flags, then source this script. The single, optional command line
25 | # argument (passed when sourcing this script) is the command, which must be
26 | # "run" in the open source version.
27 | #
28 | # setup.sh is assumed to have been sourced.
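#
# As an illustration only (the experiment name and learning rate below are
# made-up values; the paths and config files are real ones from this repo,
# following the pattern of the test scripts), a sourcing script might look
# like:
#
#   source "$(dirname $0)/../lib/setup.sh"
#   source_lib "config/common.sh"
#   source_lib "config/ptb_word_rmsprop.sh"
#   name=my_ptb_run
#   learning_rate=0.002
#   source_lib "run.sh" "$@"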
29 | # 30 | # How it works 31 | # ------------ 32 | # 33 | # The configuration options (see ../README.md) are gathered from shell variables 34 | # and passed as command line arguments to the binary. 35 | 36 | cmd="${1:-run}" 37 | 38 | source_lib "run_helper.sh" 39 | 40 | _project_dir=${project_dir:-"."} 41 | _experiment_dir="${experiment_dir:-"${_project_dir}/${name}"}" 42 | # If ensure_new_experiment, add a random suffix that makes experiment_dir 43 | # unique. 44 | if [ "${ensure_new_experiment}" != "false" ]; then 45 | _suffix="$(head /dev/urandom | tr -dc A-Za-z0-9 | head -c5)" 46 | _experiment_dir="${experiment_dir:-"${_project_dir}/${name}"}_${_suffix}" 47 | while test -d "${_experiment_dir}"; do 48 | _suffix="$(head /dev/urandom | tr -dc A-Za-z0-9 | head -c5)" 49 | _experiment_dir="${experiment_dir:-"${_project_dir}/${name}"}_${_suffix}" 50 | done 51 | fi 52 | 53 | mkdir -p "${_experiment_dir}" 54 | 55 | { 56 | source_lib "describe_version.sh" 57 | } > >(tee -a "${_experiment_dir}/lamb_version") 58 | 59 | { 60 | if [ "${cmd}" = "run" ]; then 61 | eval $(echo "python" "${base}/main.py" "$(gather_args)") 62 | elif [ "${cmd}" = "run_par" ]; then 63 | eval $(echo "${base}/lamb.par" "$(gather_args)") 64 | else 65 | echo "Unsupported command ${cmd}." 66 | exit 1 67 | fi 68 | } > >(tee -a "${_experiment_dir}/stdout") \ 69 | 2> >(tee -a "${_experiment_dir}/stderr" >&2) 70 | -------------------------------------------------------------------------------- /lamb/lib/run_helper.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | 17 | set -e 18 | 19 | escape_cl_arg() { 20 | printf "%q" "$1" 21 | } 22 | 23 | # This command: 24 | # add_param hps "--" model "X" "escape_cl_arg" 25 | # will add to $hps the line: 26 | # --model=${model}X 27 | # where ${model} is actually evaluated and transformed by 28 | # escape_cl_arg. See the 'indirect references' shell concept. 29 | add_param() { 30 | var1="\${$1}" 31 | prefix=$2 32 | var2="\${$3}" 33 | suffix=$4 34 | val2=$(eval "echo \$$3") 35 | if [ "$val2" ]; then 36 | local escape_fn=$5 37 | if [ "$escape_fn" ]; then 38 | var2="\$($escape_fn \"$var2\")" 39 | fi 40 | eval $1="\"$var1$prefix$3=$var2$suffix\"" 41 | fi 42 | } 43 | 44 | add_cl_arg() { 45 | add_param "$1" "--" "$2" " " "escape_cl_arg" 46 | } 47 | 48 | gather_args() { 49 | ## Populate args (mirroring the structure of README.md). See command line 50 | ## argument definitions in lamb_flags.py. 
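##
## A worked illustration (the values are hypothetical, not part of this
## script): if the sourcing script set
##   model=lstm hidden_size=650 learning_rate=0.002
## then the add_cl_arg calls below would accumulate, in call order,
##   args="--model=lstm --hidden_size=650 --learning_rate=0.002 "
## while unset variables contribute nothing. The finished string is echoed at
## the end, and run.sh passes it to main.py.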
51 | 52 | local args="" 53 | 54 | # data 55 | add_cl_arg args training_file 56 | add_cl_arg args validation_file 57 | add_cl_arg args test_file 58 | add_cl_arg args conditioning_separator 59 | add_cl_arg args file_encoding 60 | add_cl_arg args word_based 61 | add_cl_arg args episodic 62 | 63 | # model 64 | add_cl_arg args num_params 65 | add_cl_arg args share_input_and_output_embeddings 66 | add_cl_arg args input_embedding_size 67 | add_cl_arg args output_embedding_size 68 | add_cl_arg args input_embedding_ratio 69 | add_cl_arg args output_embedding_ratio 70 | add_cl_arg args embedding_dropout 71 | add_cl_arg args token_dropout 72 | add_cl_arg args input_dropout 73 | add_cl_arg args input_dropout_base 74 | add_cl_arg args output_dropout 75 | add_cl_arg args downprojected_output_dropout 76 | add_cl_arg args shared_mask_dropout 77 | add_cl_arg args embed_once 78 | add_cl_arg args output_once 79 | 80 | # cell 81 | add_cl_arg args model 82 | add_cl_arg args num_layers 83 | add_cl_arg args residual_connections 84 | add_cl_arg args lstm_skip_connection 85 | add_cl_arg args feature_mask_rounds 86 | add_cl_arg args feature_mask_rank 87 | add_cl_arg args sparsity_ratio 88 | add_cl_arg args overlay_rank 89 | add_cl_arg args hidden_size 90 | add_cl_arg args hidden_size_multiplier 91 | add_cl_arg args layer_norm 92 | add_cl_arg args activation_fn 93 | add_cl_arg args tie_forget_and_input_gates 94 | add_cl_arg args cap_input_gate 95 | add_cl_arg args mos_num_components 96 | add_cl_arg args trainable_initial_state 97 | add_cl_arg args inter_layer_dropout 98 | add_cl_arg args state_dropout 99 | add_cl_arg args state_dropout_flip_rate 100 | add_cl_arg args update_dropout 101 | add_cl_arg args cell_clip 102 | 103 | # objective 104 | add_cl_arg args model_average 105 | add_cl_arg args num_training_samples 106 | add_cl_arg args l2_penalty 107 | add_cl_arg args l1_penalty 108 | add_cl_arg args activation_norm_penalty 109 | add_cl_arg args drop_state_probability 110 | 111 | # initialization 112 | add_cl_arg args embedding_init_factor 113 | add_cl_arg args scale_input_embeddings 114 | add_cl_arg args cell_init_factor 115 | add_cl_arg args forget_bias 116 | add_cl_arg args output_init_factor 117 | 118 | # schedule 119 | add_cl_arg args steps_per_turn 120 | add_cl_arg args turns 121 | add_cl_arg args print_training_stats_every_num_steps 122 | 123 | # optimization 124 | add_cl_arg args optimizer_type 125 | add_cl_arg args rmsprop_beta2 126 | add_cl_arg args rmsprop_epsilon 127 | add_cl_arg args adam_beta1 128 | add_cl_arg args adam_beta2 129 | add_cl_arg args adam_epsilon 130 | add_cl_arg args max_grad_norm 131 | add_cl_arg args batch_size 132 | add_cl_arg args accum_batch_size 133 | add_cl_arg args max_time_steps 134 | add_cl_arg args trigger_averaging_turns 135 | add_cl_arg args trigger_averaging_at_the_latest 136 | 137 | # learning rate 138 | add_cl_arg args learning_rate 139 | add_cl_arg args learning_rate_decay 140 | add_cl_arg args learning_rate_decay_burn_in_steps 141 | add_cl_arg args drop_learning_rate_turns 142 | add_cl_arg args drop_learning_rate_multiplier 143 | add_cl_arg args drop_learning_rate_at_the_latest 144 | 145 | # early stopping 146 | add_cl_arg args early_stopping_turns 147 | add_cl_arg args early_stopping_rampup_turns 148 | add_cl_arg args early_stopping_worst_xe_target 149 | add_cl_arg args early_stopping_slowest_rate 150 | 151 | # cross-validation 152 | add_cl_arg args crossvalidate 153 | add_cl_arg args crossvalidation_rounds 154 | add_cl_arg args crossvalidate_max_folds 155 | 156 | # 
evaluation 157 | add_cl_arg args max_training_eval_batches 158 | add_cl_arg args max_eval_eval_batches 159 | add_cl_arg args max_test_eval_batches 160 | add_cl_arg args min_non_episodic_eval_examples_per_stripe 161 | add_cl_arg args eval_on_test 162 | add_cl_arg args eval_method 163 | add_cl_arg args num_eval_samples 164 | add_cl_arg args eval_softmax_temperature 165 | add_cl_arg args eval_softmax_temperature_estimation_num_tokens 166 | add_cl_arg args eval_power_mean_power 167 | add_cl_arg args eval_dropout_multiplier 168 | add_cl_arg args validation_prediction_file 169 | add_cl_arg args dyneval 170 | add_cl_arg args dyneval_learning_rate 171 | add_cl_arg args dyneval_decay_rate 172 | add_cl_arg args dyneval_epsilon 173 | 174 | # experiments 175 | local experiment_dir="${_experiment_dir}" 176 | add_cl_arg args experiment_dir 177 | add_cl_arg args save_config 178 | add_cl_arg args config_file 179 | add_cl_arg args hps_proto_file # deprecated 180 | add_cl_arg args flags_as_dict # deprecated 181 | 182 | # checkpoints 183 | add_cl_arg args save_checkpoints 184 | add_cl_arg args load_checkpoint 185 | add_cl_arg args load_optimizer_state 186 | add_cl_arg args load_averaged 187 | add_cl_arg args use_old_linear_names 188 | 189 | # Misc flags 190 | add_cl_arg args seed 191 | add_cl_arg args swap_memory 192 | add_cl_arg args logtostderr 193 | add_cl_arg args log_device_placement 194 | add_cl_arg args summary_flush_secs 195 | 196 | echo "${args}" 197 | } 198 | -------------------------------------------------------------------------------- /lamb/lib/setup.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | if [[ "$0" == "$BASH_SOURCE" ]]; then 17 | echo "This script must be sourced." 18 | exit 1 19 | fi 20 | 21 | base=$(dirname "$BASH_SOURCE")/.. 22 | 23 | cmd=${1:-"run"} 24 | 25 | lib_override_path= 26 | 27 | # `source_lib` is like the shell built-in `source`, but allows files in 28 | # `lib_override_path` to shadow those in lamb/lib/. 29 | source_lib() { 30 | local _name="$1" 31 | shift 32 | if [ -d "${lib_override_path}" -a \ 33 | -f "${lib_override_path}/lib/${_name}" ]; then 34 | source "${lib_override_path}/lib/${_name}" "$@" 35 | else 36 | source "${base}/lib/${_name}" "$@" 37 | fi 38 | } 39 | -------------------------------------------------------------------------------- /lamb/res_multi_rnn_cell.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | """A stacked RNN cell with residual connections.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import tensorflow.compat.v1 as tf 23 | from tensorflow.contrib import framework as contrib_framework 24 | 25 | nest = contrib_framework.nest 26 | 27 | 28 | class ResMultiRNNCell(tf.nn.rnn_cell.RNNCell): 29 | """RNN cell composed sequentially of multiple simple cells.""" 30 | 31 | def __init__(self, cells, state_is_tuple=True): 32 | """Create a RNN cell composed sequentially of a number of RNNCells. 33 | 34 | Args: 35 | cells: list of RNNCells that will be composed in this order. 36 | state_is_tuple: If True, accepted and returned states are n-tuples, where 37 | `n = len(cells)`. If False, the states are all 38 | concatenated along the column axis. This latter behavior will soon be 39 | deprecated. 40 | 41 | Raises: 42 | ValueError: if cells is empty (not allowed), or at least one of the cells 43 | returns a state tuple but the flag `state_is_tuple` is `False`. 44 | """ 45 | if not cells: 46 | raise ValueError("Must specify at least one cell for ResMultiRNNCell.") 47 | if not nest.is_sequence(cells): 48 | raise TypeError( 49 | "cells must be a list or tuple, but saw: %s." % cells) 50 | 51 | self._cells = cells 52 | self._state_is_tuple = state_is_tuple 53 | if not state_is_tuple: 54 | if any(nest.is_sequence(c.state_size) for c in self._cells): 55 | raise ValueError("Some cells return tuples of states, but the flag " 56 | "state_is_tuple is not set. 
State sizes are: %s" 57 | % str([c.state_size for c in self._cells])) 58 | 59 | @property 60 | def state_size(self): 61 | if self._state_is_tuple: 62 | return tuple(cell.state_size for cell in self._cells) 63 | else: 64 | return sum([cell.state_size for cell in self._cells]) 65 | 66 | @property 67 | def output_size(self): 68 | return self._cells[-1].output_size 69 | 70 | def __call__(self, inputs, state, scope=None): 71 | """Run this multi-layer cell on inputs, starting from state.""" 72 | with tf.variable_scope(scope or "res_multi_rnn_cell"): 73 | cur_state_pos = 0 74 | cur_inp = inputs 75 | new_states = [] 76 | for i, cell in enumerate(self._cells): 77 | with tf.variable_scope("cell_%d" % i): 78 | if self._state_is_tuple: 79 | if not nest.is_sequence(state): 80 | raise ValueError( 81 | "Expected state to be a tuple of length %d, but received: %s" 82 | % (len(self.state_size), state)) 83 | cur_state = state[i] 84 | else: 85 | cur_state = tf.slice( 86 | state, [0, cur_state_pos], [-1, cell.state_size]) 87 | cur_state_pos += cell.state_size 88 | cur_inp2, new_state = cell(cur_inp, cur_state) 89 | if i == 0: 90 | cur_inp = cur_inp2 91 | else: 92 | cur_inp = cur_inp + cur_inp2 93 | new_states.append(new_state) 94 | new_states = (tuple(new_states) if self._state_is_tuple else 95 | tf.concat(new_states, 1)) 96 | return cur_inp, new_states 97 | -------------------------------------------------------------------------------- /lamb/skip_multi_rnn_cell.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | """A RNN cell with skip connections.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import tensorflow.compat.v1 as tf 23 | from tensorflow.contrib import framework as contrib_framework 24 | 25 | nest = contrib_framework.nest 26 | 27 | 28 | class SkipMultiRNNCell(tf.nn.rnn_cell.RNNCell): 29 | """RNN cell composed sequentially of multiple simple cells.""" 30 | 31 | def __init__(self, cells, state_is_tuple=True): 32 | """Create a RNN cell composed sequentially of a number of RNNCells. 33 | 34 | Args: 35 | cells: list of RNNCells that will be composed in this order. 36 | state_is_tuple: If True, accepted and returned states are n-tuples, where 37 | `n = len(cells)`. If False, the states are all 38 | concatenated along the column axis. This latter behavior will soon be 39 | deprecated. 40 | 41 | Raises: 42 | ValueError: if cells is empty (not allowed), or at least one of the cells 43 | returns a state tuple but the flag `state_is_tuple` is `False`. 
44 | """ 45 | if not cells: 46 | raise ValueError("Must specify at least one cell for SkipMultiRNNCell.") 47 | if not nest.is_sequence(cells): 48 | raise TypeError( 49 | "cells must be a list or tuple, but saw: %s." % cells) 50 | 51 | self._cells = cells 52 | self._state_is_tuple = state_is_tuple 53 | if not state_is_tuple: 54 | if any(nest.is_sequence(c.state_size) for c in self._cells): 55 | raise ValueError("Some cells return tuples of states, but the flag " 56 | "state_is_tuple is not set. State sizes are: %s" 57 | % str([c.state_size for c in self._cells])) 58 | 59 | @property 60 | def state_size(self): 61 | if self._state_is_tuple: 62 | return tuple(cell.state_size for cell in self._cells) 63 | else: 64 | return sum([cell.state_size for cell in self._cells]) 65 | 66 | @property 67 | def output_size(self): 68 | return self._cells[-1].output_size 69 | 70 | def __call__(self, inputs, state, scope=None): 71 | """Run this multi-layer cell on inputs, starting from state.""" 72 | output = None 73 | with tf.variable_scope(scope or "skip_multi_rnn_cell"): 74 | cur_state_pos = 0 75 | cur_inp = inputs 76 | new_states = [] 77 | for i, cell in enumerate(self._cells): 78 | with tf.variable_scope("cell_%d" % i): 79 | if self._state_is_tuple: 80 | if not nest.is_sequence(state): 81 | raise ValueError( 82 | "Expected state to be a tuple of length %d, but received: %s" 83 | % (len(self.state_size), state)) 84 | cur_state = state[i] 85 | else: 86 | cur_state = tf.slice( 87 | state, [0, cur_state_pos], [-1, cell.state_size]) 88 | cur_state_pos += cell.state_size 89 | cur_inp, new_state = cell(cur_inp, cur_state) 90 | new_states.append(new_state) 91 | if output is None: 92 | output = cur_inp 93 | else: 94 | output += cur_inp 95 | new_states = (tuple(new_states) if self._state_is_tuple else 96 | tf.concat(new_states, 1)) 97 | return output, new_states 98 | -------------------------------------------------------------------------------- /lamb/test/data/save_v1/args: -------------------------------------------------------------------------------- 1 | {'swap_memory': False, 'crossvalidate': False, 'seed': 1, 'early_stopping_rounds': 10, 'max_test_eval_batches': None, 'crossvalidation_rounds': 1, 'early_stopping_worst_xe_target': '9.0', 'hps': 'model=lstm,num_layers=1,num_params=10000000,share_input_and_output_embeddings=true,tie_forget_and_input_gates=false,cap_input_gate=true,forget_bias=1.0,input_embedding_ratio=0.22973,input_dropout=0.51551,state_dropout=0.18417,output_dropout=0.33801,weight_decay=0.00007676,optimizer_type=rmsprop,max_grad_norm=10.0,outer_steps=25,batch_size=,learning_rate=0.0048308,drop_learning_rate_rounds=90,drop_learning_rate_multiplier=0.1,drop_learning_rate_at_the_latest=2000,drop_state_probability=0.01,softmax_test_time_temperature=-0.8,', 'use_old_linear_names': False, 'crossvalidation_folds': 10, 'training_file': '/non-existent-dir/data/ptb/ptb.train.txt', 'file_encoding': 'utf-8', 'max_training_eval_batches': 100, 'word_based': True, 'experiment_dir': '/non-existent-dir/baf254a0c6_train_ptb_10m_lstm_lstm_d1', 'max_eval_eval_batches': None, 'test_file': '/non-existent-dir/data/ptb/ptb.test.txt', 'min_non_episodic_eval_examples_per_stripe': 100, 'print_every': 100, 'hps_proto_file': None, 'save_checkpoints': True, 'load_checkpoint': None, 'log_device_placement': False, 'early_stopping_rampup_rounds': 20, 'episodic': False, 'eval_file': '/non-existent-dir/data/ptb/ptb.valid.txt', 'steps': 100, 'summary_flush_secs': 120, 'max_steps': 35} 2 | 
-------------------------------------------------------------------------------- /lamb/test/data/save_v1/config: -------------------------------------------------------------------------------- 1 | hparam { 2 | key: "activation_fn" 3 | value { 4 | bytes_value: "tf.tanh" 5 | } 6 | } 7 | hparam { 8 | key: "activation_norm_penalty" 9 | value { 10 | float_value: 0.0 11 | } 12 | } 13 | hparam { 14 | key: "adam_beta1" 15 | value { 16 | float_value: 0.899999976158 17 | } 18 | } 19 | hparam { 20 | key: "adam_beta2" 21 | value { 22 | float_value: 0.999000012875 23 | } 24 | } 25 | hparam { 26 | key: "adam_epsilon" 27 | value { 28 | float_value: 9.99999993923e-09 29 | } 30 | } 31 | hparam { 32 | key: "batch_size" 33 | value { 34 | int64_value: 64 35 | } 36 | } 37 | hparam { 38 | key: "cap_input_gate" 39 | value { 40 | bool_value: true 41 | } 42 | } 43 | hparam { 44 | key: "cell_clip" 45 | value { 46 | float_value: -1.0 47 | } 48 | } 49 | hparam { 50 | key: "cell_init_factor" 51 | value { 52 | float_value: 1.0 53 | } 54 | } 55 | hparam { 56 | key: "downprojected_output_dropout" 57 | value { 58 | float_value: -1.0 59 | } 60 | } 61 | hparam { 62 | key: "drop_learning_rate_at_the_latest" 63 | value { 64 | int64_value: 2000 65 | } 66 | } 67 | hparam { 68 | key: "drop_learning_rate_multiplier" 69 | value { 70 | float_value: 0.10000000149 71 | } 72 | } 73 | hparam { 74 | key: "drop_learning_rate_rounds" 75 | value { 76 | int64_value: 90 77 | } 78 | } 79 | hparam { 80 | key: "drop_state_probability" 81 | value { 82 | float_value: 0.00999999977648 83 | } 84 | } 85 | hparam { 86 | key: "embed_once" 87 | value { 88 | bool_value: true 89 | } 90 | } 91 | hparam { 92 | key: "embedding_init_factor" 93 | value { 94 | float_value: 1.0 95 | } 96 | } 97 | hparam { 98 | key: "eval_method" 99 | value { 100 | bytes_value: "deterministic" 101 | } 102 | } 103 | hparam { 104 | key: "feature_mask" 105 | value { 106 | bool_value: false 107 | } 108 | } 109 | hparam { 110 | key: "feature_mask_rank" 111 | value { 112 | int64_value: 0 113 | } 114 | } 115 | hparam { 116 | key: "feature_mask_rounds" 117 | value { 118 | int64_value: 0 119 | } 120 | } 121 | hparam { 122 | key: "forget_bias" 123 | value { 124 | float_value: 1.0 125 | } 126 | } 127 | hparam { 128 | key: "hidden_size" 129 | value { 130 | int64_value: -1 131 | } 132 | } 133 | hparam { 134 | key: "input_dropout" 135 | value { 136 | float_value: 0.51551002264 137 | } 138 | } 139 | hparam { 140 | key: "input_embedding_ratio" 141 | value { 142 | float_value: 0.229729995131 143 | } 144 | } 145 | hparam { 146 | key: "input_embedding_size" 147 | value { 148 | int64_value: -1 149 | } 150 | } 151 | hparam { 152 | key: "intra_layer_dropout" 153 | value { 154 | float_value: 0.0 155 | } 156 | } 157 | hparam { 158 | key: "layer_norm" 159 | value { 160 | bool_value: false 161 | } 162 | } 163 | hparam { 164 | key: "learning_rate" 165 | value { 166 | float_value: 0.00483079999685 167 | } 168 | } 169 | hparam { 170 | key: "learning_rate_decay" 171 | value { 172 | float_value: 1.0 173 | } 174 | } 175 | hparam { 176 | key: "learning_rate_decay_burn_in_steps" 177 | value { 178 | int64_value: 0 179 | } 180 | } 181 | hparam { 182 | key: "lstm_skip_connection" 183 | value { 184 | bool_value: true 185 | } 186 | } 187 | hparam { 188 | key: "max_grad_norm" 189 | value { 190 | float_value: 10.0 191 | } 192 | } 193 | hparam { 194 | key: "mixture_of_softmaxes_num_components" 195 | value { 196 | int64_value: 1 197 | } 198 | } 199 | hparam { 200 | key: "model" 201 | value { 202 | bytes_value: "lstm" 
203 | } 204 | } 205 | hparam { 206 | key: "model_average" 207 | value { 208 | bytes_value: "arithmetic" 209 | } 210 | } 211 | hparam { 212 | key: "num_eval_samples" 213 | value { 214 | int64_value: 0 215 | } 216 | } 217 | hparam { 218 | key: "num_layers" 219 | value { 220 | int64_value: 1 221 | } 222 | } 223 | hparam { 224 | key: "num_params" 225 | value { 226 | int64_value: 50000 227 | } 228 | } 229 | hparam { 230 | key: "num_training_samples" 231 | value { 232 | int64_value: 1 233 | } 234 | } 235 | hparam { 236 | key: "optimizer_type" 237 | value { 238 | bytes_value: "rmsprop" 239 | } 240 | } 241 | hparam { 242 | key: "outer_steps" 243 | value { 244 | int64_value: 2500 245 | } 246 | } 247 | hparam { 248 | key: "output_dropout" 249 | value { 250 | float_value: 0.338010013103 251 | } 252 | } 253 | hparam { 254 | key: "output_embedding_ratio" 255 | value { 256 | float_value: -1.0 257 | } 258 | } 259 | hparam { 260 | key: "output_embedding_size" 261 | value { 262 | int64_value: -1 263 | } 264 | } 265 | hparam { 266 | key: "output_init_factor" 267 | value { 268 | float_value: 1.0 269 | } 270 | } 271 | hparam { 272 | key: "overlay_rank" 273 | value { 274 | int64_value: -1 275 | } 276 | } 277 | hparam { 278 | key: "rmsprop_beta2" 279 | value { 280 | float_value: 0.999000012875 281 | } 282 | } 283 | hparam { 284 | key: "rmsprop_epsilon" 285 | value { 286 | float_value: 9.99999993923e-09 287 | } 288 | } 289 | hparam { 290 | key: "share_input_and_output_embeddings" 291 | value { 292 | bool_value: true 293 | } 294 | } 295 | hparam { 296 | key: "softmax_test_time_temperature" 297 | value { 298 | float_value: -0.800000011921 299 | } 300 | } 301 | hparam { 302 | key: "sparsity_ratio" 303 | value { 304 | float_value: -1.0 305 | } 306 | } 307 | hparam { 308 | key: "state_dropout" 309 | value { 310 | float_value: 0.184169992805 311 | } 312 | } 313 | hparam { 314 | key: "state_dropout_flip_rate" 315 | value { 316 | float_value: 0.0 317 | } 318 | } 319 | hparam { 320 | key: "test_time_dropout_multiplier" 321 | value { 322 | float_value: 1.0 323 | } 324 | } 325 | hparam { 326 | key: "test_time_power_mean_power" 327 | value { 328 | float_value: 1.0 329 | } 330 | } 331 | hparam { 332 | key: "tie_forget_and_input_gates" 333 | value { 334 | bool_value: false 335 | } 336 | } 337 | hparam { 338 | key: "token_dropout" 339 | value { 340 | float_value: 0.0 341 | } 342 | } 343 | hparam { 344 | key: "trainable_initial_state" 345 | value { 346 | bool_value: false 347 | } 348 | } 349 | hparam { 350 | key: "update_dropout" 351 | value { 352 | float_value: 0.0 353 | } 354 | } 355 | hparam { 356 | key: "vocab_size" 357 | value { 358 | int64_value: 10001 359 | } 360 | } 361 | hparam { 362 | key: "weight_decay" 363 | value { 364 | float_value: 7.67599995015e-05 365 | } 366 | } 367 | hparam { 368 | key: "weight_penalty" 369 | value { 370 | float_value: 0.0 371 | } 372 | } 373 | -------------------------------------------------------------------------------- /lamb/test/dummy_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | import tensorflow.compat.v1 as tf 17 | 18 | 19 | class DummyTest(tf.test.TestCase): 20 | 21 | def testCompilation(self): 22 | pass 23 | 24 | 25 | if __name__ == "__main__": 26 | tf.test.main() 27 | -------------------------------------------------------------------------------- /lamb/test/finish.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | 17 | # Intended to be sourced after setting all the configuration options. 18 | 19 | experiment_dir="$TEST_TMPDIR/${name}" 20 | 21 | # Run 22 | source_lib "run.sh" run_par 23 | 24 | # Check that the best reported evaluation XE is below a certain 25 | # threshold. 26 | grep_xes() { 27 | cat "${_experiment_dir}/stderr" | 28 | sed -rn "s/.*'best_xe': ([0-9]*)\.([0-9]{1,2}).*/\1.\2/p" 29 | } 30 | first_xe=$(grep_xes | head -n 1) 31 | last_xe=$(grep_xes | tail -n 1) 32 | expected_improvement="${expected_improvement:-0.5}" 33 | # check_ge doesn't work with floats, let's do it by hand. 34 | if (( $(echo "$first_xe - $expected_improvement < $last_xe" | bc -l) )); then 35 | echo "XE went from $first_xe to $last_xe, and that's not a large enough \ 36 | improvement ($expected_improvement)." 37 | exit 1 38 | fi 39 | 40 | echo "PASS" 41 | -------------------------------------------------------------------------------- /lamb/test/start.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================ 15 | 16 | set -e -o pipefail 17 | 18 | source googletest.sh 19 | 20 | if [ "${base}" = "" ]; then 21 | source "$(dirname $0)/../lib/setup.sh" 22 | fi 23 | source_lib "config/common.sh" 24 | source_lib "config/running.sh" 25 | 26 | training_file="${base}/test/data/corpus.txt" 27 | validation_file="${training_file}" 28 | unset test_file 29 | 30 | batch_size=64 31 | max_training_eval_batches=2 32 | max_eval_eval_batches=2 33 | max_test_eval_batches=2 34 | max_time_steps=3 35 | steps_per_turn=5 36 | turns=2 37 | 38 | # Misc 39 | use_gpu=false 40 | -------------------------------------------------------------------------------- /lamb/test/test_episodic_char_lstm_d2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | source "$(dirname $0)/start.sh" 19 | 20 | training_file="${base}/test/data/add.txt" 21 | validation_file="${training_file}" 22 | expected_improvement="${expected_improvement:-0.2}" 23 | 24 | word_based=false 25 | episodic=true 26 | conditioning_separator="=" 27 | max_time_steps=40 28 | 29 | # Model hyperparameters 30 | 31 | model=lstm 32 | num_layers=2 33 | hidden_size=50 34 | num_eval_samples=2 35 | 36 | # Optimization hyperparameters 37 | 38 | learning_rate=0.01 39 | 40 | # Run 41 | source "$(dirname $0)/finish.sh" 42 | -------------------------------------------------------------------------------- /lamb/test/test_load_optimizer_state.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | set -x 19 | 20 | source "$(dirname $0)/start.sh" 21 | 22 | # Model hyperparameters 23 | 24 | model=lstm 25 | num_layers=1 26 | hidden_size=17 27 | output_embedding_size=15 28 | 29 | # Optimization hyperparameters 30 | 31 | learning_rate=0.2 32 | early_stopping_turns=-1 33 | 34 | # Run 35 | source "$(dirname $0)/finish.sh" 36 | previous_xe=$last_xe 37 | 38 | # Load checkpoint and check that validation XE is the same. 
39 | load_checkpoint="${_experiment_dir}/best"
40 | optimizer_type="sgd"
41 | # Loading the checkpoint would fail if a different optimizer's state were loaded.
42 | load_optimizer_state=false
43 | turns=0
44 | expected_improvement=0.0
45 | source "$(dirname $0)/finish.sh"
46 | 
47 | if [ "$previous_xe" != "$last_xe" ]; then
48 |   echo "XE was $previous_xe, after reloading checkpoint it became $last_xe."
49 |   exit 1
50 | fi
51 | 
-------------------------------------------------------------------------------- /lamb/test/test_save_v1.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 | 
17 | 
18 | set -x
19 | 
20 | source "$(dirname $0)/start.sh"
21 | 
22 | hps_proto_file="$(dirname $0)/data/save_v1/config"
23 | flags_as_dict="$(dirname $0)/data/save_v1/args"
24 | 
25 | # Run
26 | source "$(dirname $0)/finish.sh"
27 | 
-------------------------------------------------------------------------------- /lamb/test/test_simple_lstm.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 | 
17 | 
18 | set -x
19 | 
20 | source "$(dirname $0)/start.sh"
21 | 
22 | # Model hyperparameters
23 | 
24 | model=lstm
25 | num_layers=2
26 | hidden_size=17,13
27 | output_embedding_size=11
28 | lstm_skip_connection=false
29 | 
30 | # Optimization hyperparameters
31 | 
32 | learning_rate=0.2
33 | early_stopping_turns=-1
34 | 
35 | # Run
36 | source "$(dirname $0)/finish.sh"
37 | previous_xe=$last_xe
38 | 
39 | # Load checkpoint and check that validation XE is the same.
40 | load_checkpoint="${_experiment_dir}/last"
41 | turns=0
42 | expected_improvement=0.0
43 | source "$(dirname $0)/finish.sh"
44 | 
45 | if [ "$previous_xe" != "$last_xe" ]; then
46 |   echo "XE was $previous_xe, after reloading checkpoint it became $last_xe."
47 | exit 1 48 | fi 49 | -------------------------------------------------------------------------------- /lamb/test/test_sparse_rhn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | 18 | source "$(dirname $0)/start.sh" 19 | 20 | # Model hyperparameters 21 | 22 | model=rhn 23 | num_layers=2 24 | hidden_size=17 25 | output_embedding_size=15 26 | sparsity_ratio=0.5 27 | 28 | # Optimization hyperparameters 29 | 30 | expected_improvement=0.3 31 | learning_rate=0.2 32 | steps_per_turn=20 33 | 34 | # Run 35 | source "$(dirname $0)/finish.sh" 36 | -------------------------------------------------------------------------------- /lamb/vocab.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | """Vocabulary.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | from six.moves import range 22 | 23 | 24 | class Vocab(object): 25 | """Immutable reversible mappings from strings to integers.""" 26 | 27 | def __init__(self, tokens, unk=u'', eos=u'\u25bc'): 28 | """Create a Vocab object that maps `tokens` to dense indices.""" 29 | self._token_to_index = {} 30 | self._token_to_frequency = {} 31 | self._unk = unk 32 | self._eos = eos 33 | token_to_index = self._token_to_index 34 | token_to_frequency = self._token_to_frequency 35 | # Get the unique tokens from `tokens` that might be a generator. 36 | for token in tokens: 37 | token_to_index[token] = True 38 | token_to_frequency[token] = token_to_frequency.get(token, 0) + 1 39 | token_to_index[unk] = True 40 | token_to_index[eos] = True 41 | # Now that we have a smaller set of tokens, assign ids in sorted 42 | # order for deterministic encoding. 
43 |     self._index_to_token = [None] * len(token_to_index)
44 |     index_to_token = self._index_to_token
45 |     i = 0
46 |     for token in sorted(list(token_to_index)):
47 |       token_to_index[token] = i
48 |       index_to_token[i] = token
49 |       i += 1
50 | 
51 |   def unk_index(self):
52 |     """Returns the index of the unknown token."""
53 |     return self._token_to_index[self._unk]
54 | 
55 |   def eos_index(self):
56 |     """Returns the index of the end-of-sentence token."""
57 |     return self._token_to_index[self._eos]
58 | 
59 |   def token(self, index_):
60 |     """The string at index `index_`; raises IndexError if out of range."""
61 |     return self._index_to_token[index_]
62 | 
63 |   def __iter__(self):
64 |     """Iterates over tokens in order of indices."""
65 |     for i in range(self.size()):
66 |       yield self.token(i)
67 | 
68 |   def index_or_unk(self, token):
69 |     """Find the index assigned to `token`.
70 | 
71 |     Args:
72 |       token: a string.
73 |     Returns:
74 |       The index of `token` or `unk_index()` if it is not in the vocabulary.
75 |     """
76 |     if token in self._token_to_index:
77 |       return self._token_to_index[token]
78 |     else:
79 |       return self.unk_index()
80 | 
81 |   def size(self):
82 |     """Returns the number of different tokens in the vocabulary."""
83 |     return len(self._index_to_token)
84 | 
85 |   def decode(self, ids):
86 |     """Decode a sequence of `ids` with `token()`."""
87 |     assert all([0 <= x and x < len(self._index_to_token) for x in ids])
88 |     return [self.token(x) for x in ids]
89 | 
90 |   def encode(self, tokens, add_eos=True):
91 |     """Encodes a sentence into a list of token indices.
92 | 
93 |     Args:
94 |       tokens: A list of tokens.
95 |       add_eos: Whether to add the end-of-sentence token.
96 |     Returns:
97 |       A list of integer token indices where `unk_index()` stands for
98 |       tokens not found in the vocabulary.
99 |     """
100 |     ids = [self.index_or_unk(token) for token in tokens]
101 | 
102 |     if add_eos:
103 |       ids += [self.eos_index()]
104 | 
105 |     return ids
106 | 
107 |   def index_frequency(self, index_):
108 |     return self._token_to_frequency.get(self.token(index_), 0)
109 | 
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 | 
16 | """Setup for pip package."""
17 | 
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 | 
22 | from setuptools import find_packages
23 | from setuptools import setup
24 | 
25 | REQUIRED_PACKAGES = ['absl-py', 'numpy', 'dm-sonnet', 'six']
26 | EXTRA_PACKAGES = {
27 |     'tensorflow': ['tensorflow>=1.15.0', 'tensorflow-probability>=0.4.0'],
28 |     'tensorflow with gpu': ['tensorflow-gpu>=1.8.0',
29 |                             'tensorflow-probability-gpu>=0.4.0'],
30 | }
31 | 
32 | 
33 | setup(
34 |     name='lamb',
35 |     version='1.0',
36 |     description=('LAnguage Modelling Benchmarks is a tool '
37 |                  'to tune and test TensorFlow LM models.'),
38 |     long_description='',
39 |     url='http://github.com/deepmind/lamb/',
40 |     author='Gabor Melis',
41 |     author_email='melisgl@google.com',
42 |     # Contained modules and scripts.
43 |     packages=find_packages(),
44 |     install_requires=REQUIRED_PACKAGES,
45 |     extras_require=EXTRA_PACKAGES,
46 |     zip_safe=False,
47 |     license='Apache 2.0',
48 |     classifiers=[
49 |         'Development Status :: 5 - Production/Stable',
50 |         'Intended Audience :: Developers',
51 |         'Intended Audience :: Education',
52 |         'Intended Audience :: Science/Research',
53 |         'License :: OSI Approved :: Apache Software License',
54 |         'Operating System :: MacOS :: MacOS X',
55 |         'Operating System :: Microsoft :: Windows',
56 |         'Operating System :: POSIX',
57 |         'Operating System :: Unix',
58 |         'Programming Language :: Python :: 2.7',
59 |         'Programming Language :: Python :: 3.4',
60 |         'Programming Language :: Python :: 3.5',
61 |         'Programming Language :: Python :: 3.6',
62 |         'Topic :: Scientific/Engineering :: Artificial Intelligence',
63 |         'Topic :: Software Development :: Libraries',
64 |     ],
65 |     keywords='lamb tensorflow language modelling machine learning',
66 | )
67 | 
--------------------------------------------------------------------------------
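To round things off, a small usage sketch for the Vocab class in lamb/vocab.py (the toy corpus is made up; the behaviour follows the class exactly as listed above):

from lamb.vocab import Vocab

vocab = Vocab('to be or not to be'.split())
# Four unique tokens plus the default unk ('') and eos (u'\u25bc') entries.
assert vocab.size() == 6
ids = vocab.encode(['to', 'be', 'xyzzy'])  # out-of-vocabulary -> unk_index()
assert ids[-1] == vocab.eos_index()        # encode appends eos by default
print(vocab.decode(ids))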