├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── lamb
│   ├── VERSION
│   ├── __init__.py
│   ├── averaged.py
│   ├── cell.py
│   ├── corpus.py
│   ├── dropout.py
│   ├── dyneval.py
│   ├── evaluation.py
│   ├── experiment
│   │   ├── awd
│   │   │   └── train_awd_lstm.sh
│   │   ├── continue.sh
│   │   ├── mixture-of-softmaxes
│   │   │   ├── train_awd_lstm_mos.sh
│   │   │   └── tune_ptb_24m.sh
│   │   ├── mogrifier
│   │   │   ├── README.md
│   │   │   ├── config
│   │   │   │   ├── 0393c7dc3532+_tune_ptb_char_24m_lstm_fm_d2_asgd_ts150
│   │   │   │   │   └── trial_596
│   │   │   │   │       └── config
│   │   │   │   ├── 4d0a9a5bdb04+_tune_enwik8_48m_lstm_d4_arms
│   │   │   │   │   └── trial_400
│   │   │   │   │       └── config
│   │   │   │   ├── 4d0a9a5bdb04+_tune_enwik8_48m_lstm_fm_d4_arms
│   │   │   │   │   └── trial_234
│   │   │   │   │       └── config
│   │   │   │   ├── 558aa30c0b15+_tune_mwc_fi_24m_lstm_d2_arms
│   │   │   │   │   └── trial_758
│   │   │   │   │       └── config
│   │   │   │   ├── 558aa30c0b15+_tune_mwc_fi_24m_lstm_fm_d2_arms
│   │   │   │   │   └── trial_371
│   │   │   │   │       └── config
│   │   │   │   ├── 786252db3825+_tune_ptb_24m_lstm_d2_arms
│   │   │   │   │   └── trial_833
│   │   │   │   │       └── config
│   │   │   │   ├── 786252db3825+_tune_ptb_24m_lstm_fm_d2_arms
│   │   │   │   │   └── trial_483
│   │   │   │   │       └── config
│   │   │   │   ├── 9e20581d3dad+_tune_mwc_en_24m_lstm_d2_arms
│   │   │   │   │   └── trial_502
│   │   │   │   │       └── config
│   │   │   │   ├── 9e20581d3dad+_tune_mwc_en_24m_lstm_fm_d2_arms
│   │   │   │   │   └── trial_422
│   │   │   │   │       └── config
│   │   │   │   ├── c51c838b33a5+_tune_wikitext-2_35m_lstm_mos2_d2_arms
│   │   │   │   │   └── trial_763
│   │   │   │   │       └── config
│   │   │   │   ├── c51c838b33a5+_tune_wikitext-2_35m_lstm_mos2_fm_d2_arms
│   │   │   │   │   └── trial_747
│   │   │   │   │       └── config
│   │   │   │   ├── e81db31261c0+_tune_enwik8_96m_lstm_d4_arms
│   │   │   │   │   └── trial_295
│   │   │   │   │       └── config
│   │   │   │   └── e81db31261c0+_tune_enwik8_96m_lstm_fm_d4_arms_MtngW
│   │   │   │       └── trial_216
│   │   │   │           └── config
│   │   │   ├── train_enwik8.sh
│   │   │   ├── train_mwc.sh
│   │   │   ├── train_ptb.sh
│   │   │   ├── train_ptb_char.sh
│   │   │   ├── train_wikitext-2.sh
│   │   │   ├── tune_copy.sh
│   │   │   ├── tune_dyneval.sh
│   │   │   ├── tune_enwik8.sh
│   │   │   ├── tune_mwc.sh
│   │   │   ├── tune_ptb.sh
│   │   │   ├── tune_ptb_char.sh
│   │   │   ├── tune_ptb_fast.sh
│   │   │   └── tune_wikitext-2.sh
│   │   ├── on-the-state
│   │   │   ├── README.md
│   │   │   ├── enwik8_27m_lstm_d4
│   │   │   │   └── hps_proto
│   │   │   ├── enwik8_46m_lstm_d4
│   │   │   │   └── hps_proto
│   │   │   ├── ptb_10m_lstm_d1
│   │   │   │   └── hps_proto
│   │   │   ├── ptb_24m_lstm_d4
│   │   │   │   └── hps_proto
│   │   │   ├── train_enwik8.sh
│   │   │   ├── train_ptb.sh
│   │   │   ├── train_wikitext-2.sh
│   │   │   └── wikitext-2_24m_lstm_d2
│   │   │       └── hps_proto
│   │   ├── pushing-the-bounds
│   │   │   ├── README.md
│   │   │   └── test.sh
│   │   ├── rerun.sh
│   │   ├── rerun_old.sh
│   │   ├── test.sh
│   │   ├── train_ptb_10m_lstm_d1.sh
│   │   ├── train_ptb_24m_lstm_d4.sh
│   │   └── tune_ptb_10m.sh
│   ├── lamb_flags.py
│   ├── lib
│   │   ├── config
│   │   │   ├── README.md
│   │   │   ├── common.sh
│   │   │   ├── copy.sh
│   │   │   ├── enwik8.sh
│   │   │   ├── enwik8_char.sh
│   │   │   ├── enwik8_char_rmsprop.sh
│   │   │   ├── mwc.sh
│   │   │   ├── ptb.sh
│   │   │   ├── ptb_char.sh
│   │   │   ├── ptb_word.sh
│   │   │   ├── ptb_word_rmsprop.sh
│   │   │   ├── ptb_word_slow.sh
│   │   │   ├── running.sh
│   │   │   ├── tuning.sh
│   │   │   ├── wikitext-103.sh
│   │   │   ├── wikitext-103_word.sh
│   │   │   ├── wikitext-103_word_rmsprop.sh
│   │   │   ├── wikitext-2.sh
│   │   │   ├── wikitext-2_word.sh
│   │   │   └── wikitext-2_word_rmsprop.sh
│   │   ├── describe_version.sh
│   │   ├── run.sh
│   │   ├── run_helper.sh
│   │   └── setup.sh
│   ├── lm.py
│   ├── main.py
│   ├── monitoring.py
│   ├── nascell.py
│   ├── res_multi_rnn_cell.py
│   ├── skip_multi_rnn_cell.py
│   ├── test
│   │   ├── data
│   │   │   ├── add.txt
│   │   │   ├── corpus.txt
│   │   │   └── save_v1
│   │   │       ├── args
│   │   │       └── config
│   │   ├── dummy_test.py
│   │   ├── finish.sh
│   │   ├── start.sh
│   │   ├── test_episodic_char_lstm_d2.sh
│   │   ├── test_load_optimizer_state.sh
│   │   ├── test_save_v1.sh
│   │   ├── test_simple_lstm.sh
│   │   └── test_sparse_rhn.sh
│   ├── tiled_linear.py
│   ├── tiled_lstm.py
│   ├── tiled_rhn.py
│   ├── training.py
│   ├── utils.py
│   └── vocab.py
└── setup.py
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | We'd love to accept your patches and contributions to this project. There are
4 | just a few small guidelines you need to follow.
5 |
6 | ## Contributor License Agreement
7 |
8 | Contributions to this project must be accompanied by a Contributor License
9 | Agreement. You (or your employer) retain the copyright to your contribution;
10 | this simply gives us permission to use and redistribute your contributions as
11 | part of the project. Head over to <https://cla.developers.google.com/> to see
12 | your current agreements on file or to sign a new one.
13 |
14 | You generally only need to submit a CLA once, so if you've already submitted one
15 | (even if it was for a different project), you probably don't need to do it
16 | again.
17 |
18 | ## Code reviews
19 |
20 | All submissions, including submissions by project members, require review. We
21 | use GitHub pull requests for this purpose. Consult
22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
23 | information on using pull requests.
24 |
25 | ## Community Guidelines
26 |
27 | This project follows [Google's Open Source Community
28 | Guidelines](https://opensource.google.com/conduct/).
29 |
--------------------------------------------------------------------------------
/lamb/VERSION:
--------------------------------------------------------------------------------
1 | 1.0
2 |
--------------------------------------------------------------------------------
/lamb/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 |
--------------------------------------------------------------------------------
/lamb/averaged.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | """Averaging of model weights."""
17 |
18 | # pylint: disable=missing-docstring
19 | # pylint: disable=g-complex-comprehension
20 |
21 | from __future__ import absolute_import
22 | from __future__ import division
23 | from __future__ import print_function
24 |
25 | import tensorflow.compat.v1 as tf
26 |
27 |
28 | class Averaged(object):
29 |
30 | def __init__(self, tensors):
31 | tensors = list(tensors)
32 | with tf.variable_scope('averaged'):
33 | self._num_samples = tf.Variable(0, name='num_samples', trainable=False)
34 | with tf.variable_scope('avg'):
35 | self._averages = [
36 | tf.get_variable(
37 | tensor.name.replace('/', '-').replace(':', '-'),
38 | tensor.get_shape(), initializer=tf.zeros_initializer(),
39 | trainable=False)
40 | for tensor in tensors]
41 | with tf.variable_scope('save'):
42 | self._saves = [
43 | tf.get_variable(
44 | tensor.name.replace('/', '-').replace(':', '-'),
45 | tensor.get_shape(), initializer=tf.zeros_initializer(),
46 | trainable=False)
47 | for tensor in tensors]
48 | self._tensors = tensors
49 | self._take_sample = self._make_take_sample()
50 |     self._switch = self._make_switch_to_average()
51 | self._restore = self._make_restore()
52 | self._reset = self._make_reset()
53 |
54 | def take_sample(self):
55 | tf.get_default_session().run(self._take_sample)
56 |
57 | def switch_to_average(self):
58 | tf.get_default_session().run(self._switch)
59 |
60 | def restore(self):
61 | tf.get_default_session().run(self._restore)
62 |
63 | def reset(self):
64 | tf.get_default_session().run(self._reset)
65 |
66 | def __enter__(self):
67 | self.switch_to_average()
68 |
69 | def __exit__(self, type_, value, traceback):
70 | self.restore()
71 |
72 | def _make_take_sample(self):
73 | assignments = []
74 | n = tf.cast(self._num_samples, tf.float32)
75 | mu = 1.0 / (1.0 + n)
76 | for tensor, average in zip(self._tensors, self._averages):
77 | assignments.append(tf.assign_add(average, (tensor-average)*mu))
78 | add_to_averages = tf.group(assignments)
79 | with tf.control_dependencies([add_to_averages]):
80 | incr_num_samples = tf.assign(self._num_samples, self._num_samples + 1)
81 | return incr_num_samples
82 |
83 |   def _make_switch_to_average(self):
84 | assignments = []
85 | for save, tensor, average in zip(
86 | self._saves, self._tensors, self._averages):
87 | with tf.control_dependencies([save.assign(tensor)]):
88 | assignments.append(tensor.assign(average))
89 | return tf.group(assignments)
90 |
91 | def _make_restore(self):
92 | assignments = []
93 | for save, tensor in zip(self._saves, self._tensors):
94 | assignments.append(tf.assign(tensor, save))
95 | return tf.group(assignments)
96 |
97 | def _make_reset(self):
98 | return tf.assign(self._num_samples, 0)
99 |
100 |
101 | # TODO(melisgl): I think this works with ResourceVariables but not with normal
102 | # Variables. Deferred until TF2.0.
103 | def _swap(x, y):
104 | x_value = x.read_value()
105 | y_value = y.read_value()
106 | with tf.control_dependencies([x_value, y_value]):
107 | swap = tf.group(y.assign(x_value), x.assign(y_value))
108 | return swap
109 |
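110 | # Usage sketch for Averaged, assuming a default session and
111 | # `train_op`/`evaluate` defined elsewhere. The update in _make_take_sample
112 | # is incremental: avg += (tensor - avg) / (n + 1), so after n samples each
113 | # average holds the arithmetic mean of the sampled weights.
114 | #
115 | #   averaged = Averaged(tf.trainable_variables())
116 | #   for step in range(num_steps):
117 | #     session.run(train_op)
118 | #     if step % 100 == 0:
119 | #       averaged.take_sample()
120 | #   # Evaluate with the averaged weights; the context manager swaps them
121 | #   # in on entry and restores the originals on exit.
122 | #   with averaged:
123 | #     evaluate()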
--------------------------------------------------------------------------------
/lamb/dropout.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | """Variational Dropout."""
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | from sonnet.python.modules import base as snt_base
23 | import tensorflow.compat.v1 as tf
24 | import tensorflow_probability as tfp
25 | from tensorflow.contrib import util as contrib_util
26 |
27 |
28 | class Dropout(snt_base.AbstractModule):
29 | """Possibly variational dropout."""
30 |
31 | def __init__(self, keep_prob, share_mask=True, scaler=1.0, name='dropout'):
32 | super(Dropout, self).__init__(name=name)
33 | self._keep_prob = keep_prob
34 | self._keep_mask = None
35 | self._share_mask = share_mask
36 | self._scaler = scaler
37 |
38 | def _ensure_keep_mask(self, x):
39 | if self._keep_mask is None or not self._share_mask:
40 | shape = tf.shape(x)
41 | noise = tf.random_uniform(shape, dtype=x.dtype)
42 | self._keep_mask = (tf.floor(self._keep_prob + noise)
43 | * (self._scaler / self._keep_prob))
44 | self._keep_mask.set_shape(x.get_shape())
45 | return self._keep_mask
46 |
47 | def _build(self, x):
48 | if contrib_util.constant_value(self._keep_prob) == 1:
49 | return x
50 | else:
51 | return x * self._ensure_keep_mask(x)
52 |
53 |
54 | class GaussianDropout(snt_base.AbstractModule):
55 | """Possibly variational dropout."""
56 |
57 | def __init__(self, keep_prob, share_mask=True, scaler=1.0, name='dropout'):
58 | super(GaussianDropout, self).__init__(name=name)
59 | self._keep_prob = keep_prob
60 | self._keep_mask = None
61 | self._share_mask = share_mask
62 | self._scaler = scaler
63 |
64 | def _ensure_keep_mask(self, x):
65 | if self._keep_mask is None or not self._share_mask:
66 | shape = tf.shape(x)
67 |       # Calculate the stddev for the normal distribution that matches
68 |       # the stddev of a Bernoulli(keep_prob) mask rescaled by 1/keep_prob.
69 | stddev = tf.sqrt((1 - self._keep_prob) / self._keep_prob)
70 | self._keep_mask = tf.random_normal(shape, mean=1.0, stddev=stddev,
71 | dtype=x.dtype)
72 | self._keep_mask.set_shape(x.get_shape())
73 | return self._keep_mask
74 |
75 | def _build(self, x):
76 | if contrib_util.constant_value(self._keep_prob) == 1:
77 | return x
78 | else:
79 | return x * self._ensure_keep_mask(x)
80 |
81 |
82 | class DirichletDropout(snt_base.AbstractModule):
83 | """Possibly variational dropout."""
84 |
85 | def __init__(self, keep_prob, share_mask=True, scaler=1.0, name='dropout'):
86 | super(DirichletDropout, self).__init__(name=name)
87 | self._keep_prob = keep_prob
88 | self._keep_mask = None
89 | self._share_mask = share_mask
90 | self._scaler = scaler
91 |
92 | def _ensure_keep_mask(self, x):
93 | if self._keep_mask is None or not self._share_mask:
94 | shape = tf.shape(x)
95 | k = shape[1]
96 |       # To make this class a drop-in replacement for bernoulli dropout we
97 |       # parameterize it with keep_prob. Set alpha of the dirichlet so that
98 |       # the variance of the final mask matches that of a Bernoulli(keep_prob)
99 |       # mask rescaled by 1/keep_prob, namely (1-keep_prob)/keep_prob.
100 |       # The variance of each element of a dirichlet with k equal alphas is
101 |       # (k-1)/(k^2*(k*alpha+1)). Solve that for alpha.
102 | kf = tf.cast(k, tf.float32)
103 | alpha = self._keep_prob * (kf - 1.0) / ((1-self._keep_prob)*kf) - 1.0/kf
104 | dist = tfp.distributions.Dirichlet(tf.ones(shape=k) * alpha)
105 | assert (dist.reparameterization_type ==
106 | tfp.distributions.FULLY_REPARAMETERIZED)
107 |       # E[dir(alpha)] is 1/k per element, but the mask should have mean 1
108 |       # (it already includes the 1/keep_prob scaling), hence the kf factor.
109 | self._keep_mask = kf * dist.sample(shape[0])
110 | self._keep_mask.set_shape(x.get_shape())
111 | return self._keep_mask
112 |
113 | def _build(self, x):
114 | if contrib_util.constant_value(self._keep_prob) == 1:
115 | return x
116 | else:
117 | return tf.cond(tf.equal(self._keep_prob, 1.0),
118 | lambda: x,
119 | lambda: x * self._ensure_keep_mask(x))
120 |
121 |
122 | class DriftingDropout(snt_base.AbstractModule):
123 | """Dropout with gradually changing mask."""
124 |
125 | def __init__(self, keep_prob, flip_prob=0.0, scaler=1.0, name='dropout'):
126 | super(DriftingDropout, self).__init__(name=name)
127 | self._keep_prob = keep_prob
128 | self._flip_prob = flip_prob
129 | self._scaler = scaler
130 | self._time_step = 0
131 |
132 | def _build(self, x, state):
133 | prev_keep_mask = state
134 | shape = tf.shape(x)
135 | noise = tf.random_uniform(shape, dtype=x.dtype)
136 | other_mask = tf.floor(self._keep_prob + noise)
137 | choice_noise = tf.random_uniform(shape, dtype=x.dtype)
138 | choice = tf.less(choice_noise, self._flip_prob)
139 | # KLUDGE(melisgl): The client has to pass the last keep_mask from
140 | # a batch to the next so the mask may end up next to some
141 | # recurrent cell state. This state is often zero at the beginning
142 | # and may be periodically zeroed (per example) during training.
143 | # While zeroing LSTM state is okay, zeroing the dropout mask is
144 | # not. So instead of forcing every client to deal with this common
145 | # (?) case, if an all zero mask is detected, then regenerate a
146 | # fresh mask. This is of course a major hack and won't help with
147 | # learnt initial states, for example.
148 | sum_ = tf.reduce_sum(prev_keep_mask, 1, keepdims=True)
149 | is_initializing = tf.equal(sum_, 0.0)
150 |
151 | self._keep_mask = tf.where(tf.logical_or(choice, is_initializing),
152 | other_mask,
153 | prev_keep_mask)
154 | self._time_step += 1
155 | return x * self._keep_mask / self._keep_prob * self._scaler
156 |
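157 | # Numerical sanity check for the alpha derivation in DirichletDropout
158 | # (standalone sketch, numpy only): the mask k * dirichlet(alpha) should
159 | # have mean 1 and variance (1 - keep_prob) / keep_prob, matching a
160 | # Bernoulli(keep_prob) mask rescaled by 1/keep_prob.
161 | #
162 | #   import numpy as np
163 | #   p, k = 0.7, 64
164 | #   alpha = p * (k - 1.0) / ((1 - p) * k) - 1.0 / k
165 | #   masks = k * np.random.default_rng(0).dirichlet(np.full(k, alpha), 20000)
166 | #   print(masks.mean(), masks.var(), (1 - p) / p)  # ~1.0, ~0.429, 0.429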
--------------------------------------------------------------------------------
/lamb/dyneval.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | """Dynamic evaluation."""
17 |
18 | # pylint: disable=missing-docstring
19 | # pylint: disable=g-complex-comprehension
20 |
21 | from __future__ import absolute_import
22 | from __future__ import division
23 | from __future__ import print_function
24 |
25 | import tensorflow.compat.v1 as tf
26 |
27 |
28 | class Dyneval(object):
29 |
30 | def __init__(self, grads_and_vars, learning_rate, decay_rate, epsilon):
31 | with tf.variable_scope('dyneval'):
32 | # convert_to_tensor densifies IndexedSlices
33 | self._grads = [tf.convert_to_tensor(grad) for grad, _ in grads_and_vars]
34 | self._vars = [var for _, var in grads_and_vars]
35 | self._learning_rate = learning_rate
36 | self._decay_rate = decay_rate
37 | def shadow_vars():
38 | return [
39 | tf.get_variable(
40 | var.name.replace('/', '-').replace(':', '-'),
41 | var.get_shape(), initializer=tf.zeros_initializer(),
42 | trainable=False)
43 | for var in self._vars]
44 | with tf.variable_scope('save'):
45 | self._saves = shadow_vars()
46 | with tf.variable_scope('sum_squared_grads'):
47 | self._sum_squared_grads = shadow_vars()
48 | self._save = self._make_save()
49 | self._restore = self._make_restore()
50 |
51 |       # These are for computing an RMSProp-like estimate of the variance of
52 | # minibatch gradients. Here, this quantity is estimated on the training
53 | # set once, while gradient descent happens on validation/test.
54 | self._num_squared_grads = tf.get_variable(
55 | 'num_squared_grads', [], initializer=tf.zeros_initializer(),
56 | trainable=False)
57 | self._zero_sum_squared_grads = self._make_zero_sum_squared_grads()
58 | self._add_squared_grads = self._make_add_squared_grads()
59 | self._epsilon = epsilon
60 |
61 | self._update = self._make_update()
62 |
63 | def _make_save(self):
64 | assignments = []
65 | for save, var in zip(self._saves, self._vars):
66 | assignments.append(save.assign(var))
67 | return tf.group(assignments)
68 |
69 | def _make_restore(self):
70 | assignments = []
71 | for save, var in zip(self._saves, self._vars):
72 | assignments.append(var.assign(save))
73 | return tf.group(assignments)
74 |
75 | def _make_update(self):
76 | mss = []
77 | gsum = 0.0
78 | count = 0
79 | for sum_squared_grads in self._sum_squared_grads:
80 | ms = tf.sqrt(sum_squared_grads / self._num_squared_grads)
81 | gsum += tf.reduce_sum(ms)
82 | count += tf.reduce_sum(tf.ones_like(ms))
83 | mss.append(ms)
84 | gsum = gsum / count
85 |
86 | assignments = []
87 | for grad, var, save, sum_squared_grads, ms in zip(
88 | self._grads, self._vars, self._saves, self._sum_squared_grads, mss):
89 | decay_rate = tf.minimum(1.0, self._decay_rate*(ms/gsum))
90 | delta = (-self._learning_rate*grad / (ms + self._epsilon) +
91 | decay_rate*(save-var))
92 | assignments.append(var.assign_add(delta))
93 | return tf.group(assignments)
94 |
95 | def _make_add_squared_grads(self):
96 | assignments = []
97 | for sum_squared_grads, grads in zip(self._sum_squared_grads, self._grads):
98 | assignments.append(sum_squared_grads.assign_add(tf.square(grads)))
99 | return tf.group(assignments + [self._num_squared_grads.assign_add(1)])
100 |
101 | def _make_zero_sum_squared_grads(self):
102 | assignments = []
103 | for sum_squared_grads in self._sum_squared_grads:
104 | assignments.append(sum_squared_grads.assign(
105 | tf.zeros_like(sum_squared_grads)))
106 | return tf.group(assignments + [self._num_squared_grads.assign(0)])
107 |
108 | def save(self):
109 | tf.get_default_session().run(self._save)
110 |
111 | def restore(self):
112 | tf.get_default_session().run(self._restore)
113 |
114 | def update_op(self):
115 | return self._update
116 |
117 | def zero_sum_squared_grads(self):
118 | tf.get_default_session().run(self._zero_sum_squared_grads)
119 |
120 | def add_squared_grads_op(self):
121 | return self._add_squared_grads
122 |
123 | def __enter__(self):
124 | self.save()
125 |
126 | def __exit__(self, type_, value, traceback):
127 | self.restore()
128 |
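129 | # Usage sketch, assuming a default session, `grads_and_vars` from an
130 | # optimizer, a `loss` tensor, and iterables of feed dicts for the
131 | # training and evaluation batches:
132 | #
133 | #   dyneval = Dyneval(grads_and_vars, learning_rate=0.001,
134 | #                     decay_rate=0.02, epsilon=1e-5)
135 | #   # Phase 1: accumulate squared gradients on the training set.
136 | #   dyneval.zero_sum_squared_grads()
137 | #   for feed in training_batches:
138 | #     session.run(dyneval.add_squared_grads_op(), feed_dict=feed)
139 | #   # Phase 2: evaluate while adapting the weights with the RMSProp-like
140 | #   # update; entering the context saves the weights, exiting restores them.
141 | #   with dyneval:
142 | #     for feed in eval_batches:
143 | #       session.run([loss, dyneval.update_op()], feed_dict=feed)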
--------------------------------------------------------------------------------
/lamb/experiment/awd/train_awd_lstm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | # This script reproduces the PTB results from "Regularizing and Optimizing LSTM
19 | # Language Models" (Merity et al., 2017) without fine-tuning or dynamic evaluation.
20 | #
21 | # Based on https://github.com/salesforce/awd-lstm-lm.
22 | #
23 | # Reaches ~4.084 validation cross-entropy (59.38 ppl) without fine-tuning.
24 |
25 | set -e
26 |
27 | source "$(dirname $0)/../../lib/setup.sh"
28 | source_lib "config/common.sh"
29 | source_lib "config/running.sh"
30 | source_lib "config/ptb_word.sh"
31 |
32 | # Model
33 |
34 | share_input_and_output_embeddings=true
35 | input_embedding_size=400
36 | output_embedding_size=400
37 | cap_input_gate=false
38 | input_dropout=0.4
39 | embedding_dropout=0.1
40 | output_dropout=0.4
41 | shared_mask_dropout=true
42 |
43 | # Cell
44 |
45 | model="lstm"
46 | num_layers=3
47 | lstm_skip_connection=false
48 | hidden_size=1150,1150,400
49 | inter_layer_dropout=0.25
50 | state_dropout=0.5
51 | tie_forget_and_input_gates=false
52 |
53 | # Objective
54 |
55 | activation_norm_penalty=2.0
56 | l2_penalty=8.4e-5 # 1.2e-6*70
57 | drop_state_probability=0.01
58 |
59 | # Initialization
60 |
61 | forget_bias=0.0
62 |
63 | # Schedule
64 |
65 | steps_per_turn=100
66 | print_training_stats_every_num_steps=100
67 | turns=3168 # ~500 epochs (with batch_size=20 and max_time_steps=70).
68 |
69 | # Optimizer
70 |
71 | # In the loss, the pytorch code (https://github.com/salesforce/awd-lstm-lm)
72 | # averages all log probabilities in the [batch_size, max_time_steps] matrix,
73 | # while lamb sums the log probabilities over time steps and averages only over
74 | # the examples in the batch. To compensate for that, max_grad_norm,
75 | # learning_rate and l2_penalty had to be adjusted.
76 | max_time_steps=70
77 | max_grad_norm=17.5 # 0.25*70
78 | optimizer_type="sgd"
79 | batch_size=20
80 | learning_rate=0.42857143 # 30.0/70
81 |
82 | # Evaluation hyperparameters
83 |
84 | trigger_averaging_turns=50
85 | trigger_averaging_at_the_latest=2000
86 | max_training_eval_batches=20
87 |
88 | # Misc
89 |
90 | swap_memory=true
91 |
92 | source_lib "run.sh" "$@"
93 |
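94 | # Why the scaling in the optimizer section works out this way: summing
95 | # per-step losses over max_time_steps=70 instead of averaging makes the loss
96 | # and its gradients 70x larger. Keeping the updates equivalent therefore
97 | # requires dividing the learning rate by 70 (30.0/70), while the
98 | # gradient-norm threshold (0.25*70) and the l2 penalty (1.2e-6*70) must be
99 | # multiplied by 70, since both are relative to the now 70x larger loss.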
--------------------------------------------------------------------------------
/lamb/experiment/continue.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | set -e
19 |
20 | source "$(dirname $0)/../lib/setup.sh"
21 | source_lib "config/common.sh"
22 | source_lib "config/running.sh"
23 |
24 | name="$2"
25 | config_file="$3/config"
26 | load_checkpoint="$3/last"
27 | source_lib "run.sh" "$1"
28 |
--------------------------------------------------------------------------------
/lamb/experiment/mixture-of-softmaxes/train_awd_lstm_mos.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | # This script reproduces the PTB results from "Breaking the Softmax Bottleneck:
19 | # A High-Rank RNN Language Model" (Zhilin Yang, Zihang Dai, Ruslan
20 | # Salakhutdinov, William W. Cohen) without fine-tuning or dynamic evaluation.
21 | #
22 | # Based on https://github.com/zihangdai/mos.
23 |
24 | set -e
25 |
26 | source "$(dirname $0)/../../lib/setup.sh"
27 | source_lib "config/common.sh"
28 | source_lib "config/running.sh"
29 | source_lib "config/ptb_word.sh"
30 |
31 | # Model
32 |
33 | share_input_and_output_embeddings=true
34 | input_embedding_size=280
35 | output_embedding_size=280
36 | cap_input_gate=false
37 | input_dropout=0.4
38 | embedding_dropout=0.1
39 | output_dropout=0.4
40 | downprojected_output_dropout=0.29
41 | shared_mask_dropout=true
42 | mos_num_components=15
43 |
44 | # Cell
45 |
46 | model="lstm"
47 | num_layers=3
48 | lstm_skip_connection=false
49 | hidden_size=960,960,620
50 | inter_layer_dropout=0.225
51 | state_dropout=0.5
52 | tie_forget_and_input_gates=false
53 |
54 | # Objective
55 |
56 | l2_penalty=8.4e-5 # 1.2e-6*70
57 | drop_state_probability=0.01
58 |
59 | # Initialization
60 |
61 | forget_bias=0.0
62 |
63 | # Schedule
64 |
65 | steps_per_turn=100
66 | print_training_stats_every_num_steps=100
67 | turns=8000
68 |
69 | # Optimizer
70 |
71 | # In the loss, the pytorch code (https://github.com/zihangdai/mos) averages all
72 | # log probabilities in the [batch_size, max_time_steps] matrix, while lamb sums
73 | # the log probabilities over time steps and averages only over the examples in
74 | # the batch. To compensate for that, max_grad_norm, learning_rate and l2_penalty
75 | # had to be adjusted.
76 | max_time_steps=70
77 | max_grad_norm=17.5 # 0.25*70
78 | optimizer_type="sgd"
79 | batch_size=12
80 | learning_rate=0.285 # ~20.0/70
81 |
82 | # Evaluation hyperparameters
83 |
84 | trigger_averaging_turns=50
85 | trigger_averaging_at_the_latest=2000
86 | max_training_eval_batches=20
87 |
88 | # Misc
89 |
90 | swap_memory=true
91 |
92 | source_lib "run.sh" "$@"
93 |
--------------------------------------------------------------------------------
/lamb/experiment/mixture-of-softmaxes/tune_ptb_24m.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is
19 | # for illustration only.
20 |
21 | set -e
22 |
23 | # Include definitions of dataset and tuning related variables.
24 | source "$(dirname $0)/../../lib/setup.sh" "$@"
25 | source_lib "config/common.sh"
26 | source_lib "config/tuning.sh"
27 | source_lib "config/ptb_word.sh"
28 |
29 | # Model
30 |
31 | num_params=$(million 24)
32 | share_input_and_output_embeddings=true
33 | cap_input_gate=false
34 | shared_mask_dropout=true
35 |
36 | # Cell
37 |
38 | model="lstm"
39 | num_layers=3
40 | lstm_skip_connection=false
41 | tie_forget_and_input_gates=false
42 |
43 | # Objective
44 |
45 | drop_state_probability=0.01
46 |
47 | # Initialization
48 |
49 | forget_bias=0.0
50 |
51 | # Schedule
52 |
53 | steps_per_turn=100
54 | print_training_stats_every_num_steps=100
55 | turns=600
56 |
57 | # Optimizer
58 |
59 | # In the loss, the pytorch code (https://github.com/zihangdai/mos) averages all
60 | # log probabilities in the [batch_size, max_time_steps] matrix, while lamb sums
61 | # the log probabilities over time steps and averages only over the examples in
62 | # the batch. To compensate for that, max_grad_norm, learning_rate and l2_penalty
63 | # had to be adjusted.
64 | max_time_steps=70
65 | max_grad_norm=10.0
66 | trigger_averaging_turns=25
67 | trigger_averaging_at_the_latest=400
68 |
69 | # Early stopping
70 |
71 | early_stopping_turns=30
72 | early_stopping_worst_xe_target=4.4
73 |
74 | # Evaluation
75 |
76 | max_training_eval_batches=20
77 | eval_softmax_temperature=-0.8
78 |
79 | # Misc
80 |
81 | swap_memory=true
82 |
83 | # Tuning parameters
84 |
85 | num_workers=60
86 |
87 | # SGD
88 | optimizer_type="sgd"
89 | mos_num_components=0
90 | tuneables="batch_size,learning_rate,l2_penalty,
91 | token_dropout,input_dropout,inter_layer_dropout,state_dropout,
92 | output_dropout,downprojected_output_dropout,input_embedding_ratio"
93 | name="$(default_name)_${model}_d${num_layers}_asgd"
94 | source_lib "run.sh" "$@"
95 |
96 | # RMSPROP
97 | optimizer_type="rmsprop"
98 | mos_num_components=0
99 | tuneables="batch_size,learning_rate,l2_penalty,
100 | token_dropout,input_dropout,inter_layer_dropout,state_dropout,
101 | output_dropout,downprojected_output_dropout,input_embedding_ratio"
102 | name="$(default_name)_${model}_d${num_layers}_arms"
103 | source_lib "run.sh" "$@"
104 |
105 | # SGD, MoS
106 | optimizer_type="sgd"
107 | mos_num_components=15
108 | tuneables="batch_size,learning_rate,l2_penalty,
109 | token_dropout,input_dropout,inter_layer_dropout,state_dropout,
110 | output_dropout,downprojected_output_dropout,input_embedding_ratio"
111 | name="$(default_name)_${model}_d${num_layers}_asgd_mos${mos_num_components}"
112 | source_lib "run.sh" "$@"
113 |
114 | # RMSPROP, MoS
115 | optimizer_type="rmsprop"
116 | mos_num_components=15
117 | tuneables="batch_size,learning_rate,l2_penalty,
118 | token_dropout,input_dropout,inter_layer_dropout,state_dropout,
119 | output_dropout,downprojected_output_dropout,input_embedding_ratio"
120 | name="$(default_name)_${model}_d${num_layers}_arms_mos${mos_num_components}"
121 | source_lib "run.sh" "$@"
122 |
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/README.md:
--------------------------------------------------------------------------------
1 | This directory contains saved configuration files for tuned models from the
2 | [Mogrifier LSTM](https://arxiv.org/abs/1909.01792) paper. Model weights are not
3 | included.
4 |
5 | Don't forget to [set up the data](../../README.md).
6 |
7 | For example, to train a Mogrifier LSTM with 24M parameters on PTB with tuned
8 | hyperparameters (see the paper above):
9 |
10 | ./train_ptb.sh run train-dir-name config/786252db3825+_tune_ptb_24m_lstm_fm_d2_arms/trial_483/config
11 |
12 | There are separate training scripts for other datasets. The `config` directory
13 | holds the best hyperparameters for various model and dataset combinations. The
14 | training will save the model in `./train-dir-name_`. To test the
15 | saved model:
16 |
17 | ../test.sh run test-dir-name ./train-dir-name_/
18 |
19 | If training runs out of GPU memory, you may want to decrease `max_time_steps`
20 | (the BPTT window size), but don't expect to reproduce the results that way.
21 |
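22 | Each saved `config` file is a Python literal (a list of name/value pairs),
23 | so it can be inspected with nothing but the standard library. A minimal
24 | sketch (the trial path is one of the directories above):
25 |
26 |     import ast
27 |
28 |     path = 'config/786252db3825+_tune_ptb_24m_lstm_fm_d2_arms/trial_483/config'
29 |     with open(path) as f:
30 |         config = dict(ast.literal_eval(f.read()))
31 |     print(config['learning_rate'], config['feature_mask_rounds'])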
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/config/0393c7dc3532+_tune_ptb_char_24m_lstm_fm_d2_asgd_ts150/trial_596/config:
--------------------------------------------------------------------------------
1 | [ ('config_version', 5),
2 | # data
3 | ('file_encoding', 'utf-8'),
4 | ('word_based', True),
5 | ('episodic', False),
6 | # model
7 | ('num_params', 24000000),
8 | ('share_input_and_output_embeddings', False),
9 | ('input_embedding_size', -1),
10 | ('output_embedding_size', -1),
11 | ('input_embedding_ratio', 1.0),
12 | ('output_embedding_ratio', -1.0),
13 | ('mos_num_components', 0),
14 | ('token_dropout', 0.0),
15 | ('embedding_dropout', 0.0),
16 | ('input_dropout', 0.27860252841733274),
17 | ('output_dropout', 0.2347428361918374),
18 | ('downprojected_output_dropout', -1.0),
19 | ('shared_mask_dropout', False),
20 | ('embed_once', True),
21 | # cell
22 | ('model', 'lstm'),
23 | ('num_layers', 2),
24 | ('residual_connections', False),
25 | ('lstm_skip_connection', True),
26 | ('feature_mask_rounds', 4),
27 | ('feature_mask_rank', 24),
28 | ('feature_mask', False),
29 | ('sparsity_ratio', -1.0),
30 | ('overlay_rank', -1),
31 | ('hidden_size', [-1]),
32 | ('hidden_size_multiplier', 1.0),
33 | ('layer_norm', False),
34 | ('activation_fn', 'tf.tanh'),
35 | ('tie_forget_and_input_gates', False),
36 | ('cap_input_gate', True),
37 | ('trainable_initial_state', False),
38 | ('inter_layer_dropout', 0.0737609984911853),
39 | ('state_dropout', 0.17118611234551975),
40 | ('state_dropout_flip_rate', 0.0),
41 | ('update_dropout', 0.0),
42 | ('cell_clip', -1.0),
43 | # objective
44 | ('model_average', 'arithmetic'),
45 | ('num_training_samples', 1),
46 | ('l2_penalty', 0.00025558089199237096),
47 | ('l1_penalty', 0.0),
48 | ('activation_norm_penalty', 0.0),
49 | ('drop_state_probability', 0.01),
50 | # initialization
51 | ('embedding_init_factor', 1.0),
52 | ('scale_input_embeddings', False),
53 | ('cell_init_factor', 1.0),
54 | ('forget_bias', 1.0),
55 | ('output_init_factor', 1.0),
56 | # schedule
57 | ('steps_per_turn', 200),
58 | ('print_training_stats_every_num_steps', 200),
59 | ('turns', 500),
60 | # optimization
61 | ('optimizer_type', 'rmsprop'),
62 | ('rmsprop_beta2', 0.999),
63 | ('rmsprop_epsilon', 1e-08),
64 | ('adam_beta1', 0.9),
65 | ('adam_beta2', 0.999),
66 | ('adam_epsilon', 1e-08),
67 | ('batch_size', 64),
68 | ('accum_batch_size', -1),
69 | ('max_grad_norm', 10.0),
70 | ('max_time_steps', 150),
71 | ('trigger_averaging_turns', 25),
72 | ('trigger_averaging_at_the_latest', 400),
73 | # learning rate
74 | ('learning_rate', 0.003739598828019367),
75 | ('learning_rate_decay', 1.0),
76 | ('learning_rate_decay_burn_in_steps', 0),
77 | ('drop_learning_rate_turns', -1),
78 | ('drop_learning_rate_multiplier', 1.0),
79 | ('drop_learning_rate_at_the_latest', -1),
80 | # early stopping
81 | ('early_stopping_turns', -1),
82 | ('early_stopping_rampup_turns', 0),
83 | ('early_stopping_worst_xe_target', ''),
84 | ('early_stopping_slowest_rate', 0.0),
85 | # cross-validation
86 | ('crossvalidate', False),
87 | ('crossvalidation_folds', 10),
88 | ('crossvalidation_rounds', 1),
89 | # evaluation
90 | ('max_training_eval_batches', 20),
91 | ('max_eval_eval_batches', -1),
92 | ('max_test_eval_batches', -1),
93 | ('min_non_episodic_eval_examples_per_stripe', 100),
94 | ('eval_on_test', False),
95 | ('eval_method', 'deterministic'),
96 | ('num_eval_samples', 0),
97 | ('eval_softmax_temperature', -0.8),
98 | ('eval_power_mean_power', 1.0),
99 | ('eval_dropout_multiplier', 1.0),
100 | # experiments
101 | # checkpoints
102 | ('save_checkpoints', True),
103 | # misc
104 | ('seed', 1),
105 | ('swap_memory', False),
106 | ('log_device_placement', False),
107 | ('summary_flush_secs', 120),
108 | ]
109 |
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/config/4d0a9a5bdb04+_tune_enwik8_48m_lstm_d4_arms/trial_400/config:
--------------------------------------------------------------------------------
1 | [ ('config_version', 5),
2 | # data
3 | ('file_encoding', 'CP437'),
4 | ('word_based', False),
5 | ('episodic', False),
6 | # model
7 | ('num_params', 48000000),
8 | ('share_input_and_output_embeddings', False),
9 | ('input_embedding_size', -1),
10 | ('output_embedding_size', -1),
11 | ('input_embedding_ratio', 1.0),
12 | ('output_embedding_ratio', -1.0),
13 | ('mos_num_components', 0),
14 | ('token_dropout', 0.0),
15 | ('embedding_dropout', 0.0),
16 | ('input_dropout', 0.16279440026790548),
17 | ('output_dropout', 0.13860156332143037),
18 | ('downprojected_output_dropout', -1.0),
19 | ('shared_mask_dropout', False),
20 | ('embed_once', True),
21 | # cell
22 | ('model', 'lstm'),
23 | ('num_layers', 4),
24 | ('residual_connections', False),
25 | ('lstm_skip_connection', True),
26 | ('feature_mask_rounds', 0),
27 | ('feature_mask_rank', 0),
28 | ('feature_mask', False),
29 | ('sparsity_ratio', -1.0),
30 | ('overlay_rank', -1),
31 | ('hidden_size', [-1]),
32 | ('hidden_size_multiplier', 1.0),
33 | ('layer_norm', False),
34 | ('activation_fn', 'tf.tanh'),
35 | ('tie_forget_and_input_gates', False),
36 | ('cap_input_gate', True),
37 | ('trainable_initial_state', False),
38 | ('inter_layer_dropout', 0.11949666753873665),
39 | ('state_dropout', 0.1036809388104279),
40 | ('state_dropout_flip_rate', 0.0),
41 | ('update_dropout', 0.0),
42 | ('cell_clip', -1.0),
43 | # objective
44 | ('model_average', 'arithmetic'),
45 | ('num_training_samples', 1),
46 | ('l2_penalty', 2.5181258956042348e-05),
47 | ('l1_penalty', 0.0),
48 | ('activation_norm_penalty', 0.0),
49 | ('drop_state_probability', 0.01),
50 | # initialization
51 | ('embedding_init_factor', 1.0),
52 | ('scale_input_embeddings', False),
53 | ('cell_init_factor', 1.0),
54 | ('forget_bias', 1.0),
55 | ('output_init_factor', 1.0),
56 | # schedule
57 | ('steps_per_turn', 400),
58 | ('print_training_stats_every_num_steps', 1000),
59 | ('turns', 100),
60 | # optimization
61 | ('optimizer_type', 'rmsprop'),
62 | ('rmsprop_beta2', 0.999),
63 | ('rmsprop_epsilon', 1e-08),
64 | ('adam_beta1', 0.9),
65 | ('adam_beta2', 0.999),
66 | ('adam_epsilon', 1e-08),
67 | ('batch_size', 128),
68 | ('accum_batch_size', -1),
69 | ('max_grad_norm', 10.0),
70 | ('max_time_steps', 500),
71 | ('trigger_averaging_turns', 10),
72 | ('trigger_averaging_at_the_latest', 80),
73 | # learning rate
74 | ('learning_rate', 0.002516709293528533),
75 | ('learning_rate_decay', 1.0),
76 | ('learning_rate_decay_burn_in_steps', 0),
77 | ('drop_learning_rate_turns', -1),
78 | ('drop_learning_rate_multiplier', 1.0),
79 | ('drop_learning_rate_at_the_latest', -1),
80 | # early stopping
81 | ('early_stopping_turns', -1),
82 | ('early_stopping_rampup_turns', 0),
83 | ('early_stopping_worst_xe_target', ''),
84 | ('early_stopping_slowest_rate', 0.0),
85 | # cross-validation
86 | ('crossvalidate', False),
87 | ('crossvalidation_folds', 10),
88 | ('crossvalidation_rounds', 1),
89 | # evaluation
90 | ('max_training_eval_batches', 20),
91 | ('max_eval_eval_batches', -1),
92 | ('max_test_eval_batches', -1),
93 | ('min_non_episodic_eval_examples_per_stripe', 100),
94 | ('eval_on_test', False),
95 | ('eval_method', 'deterministic'),
96 | ('num_eval_samples', 0),
97 | ('eval_softmax_temperature', -0.8),
98 | ('eval_softmax_temperature_estimation_num_tokens', 50000),
99 | ('eval_power_mean_power', 1.0),
100 | ('eval_dropout_multiplier', 1.0),
101 | ('validation_prediction_file', ''),
102 | ('dyneval', False),
103 | ('dyneval_learning_rate', 0.001),
104 | ('dyneval_decay_rate', 0.02),
105 | ('dyneval_epsilon', 1e-05),
106 | # experiments
107 | # checkpoints
108 | ('save_checkpoints', True),
109 | # misc
110 | ('seed', 1),
111 | ('swap_memory', True),
112 | ('log_device_placement', False),
113 | ('summary_flush_secs', 120),
114 | ]
115 |
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/config/4d0a9a5bdb04+_tune_enwik8_48m_lstm_fm_d4_arms/trial_234/config:
--------------------------------------------------------------------------------
1 | [ ('config_version', 5),
2 | # data
3 | ('file_encoding', 'CP437'),
4 | ('word_based', False),
5 | ('episodic', False),
6 | # model
7 | ('num_params', 48000000),
8 | ('share_input_and_output_embeddings', False),
9 | ('input_embedding_size', -1),
10 | ('output_embedding_size', -1),
11 | ('input_embedding_ratio', 1.0),
12 | ('output_embedding_ratio', -1.0),
13 | ('mos_num_components', 0),
14 | ('token_dropout', 0.0),
15 | ('embedding_dropout', 0.0),
16 | ('input_dropout', 0.015496029112930465),
17 | ('output_dropout', 0.138307173174503),
18 | ('downprojected_output_dropout', -1.0),
19 | ('shared_mask_dropout', False),
20 | ('embed_once', True),
21 | # cell
22 | ('model', 'lstm'),
23 | ('num_layers', 4),
24 | ('residual_connections', False),
25 | ('lstm_skip_connection', True),
26 | ('feature_mask_rounds', 6),
27 | ('feature_mask_rank', 79),
28 | ('feature_mask', False),
29 | ('sparsity_ratio', -1.0),
30 | ('overlay_rank', -1),
31 | ('hidden_size', [-1]),
32 | ('hidden_size_multiplier', 1.0),
33 | ('layer_norm', False),
34 | ('activation_fn', 'tf.tanh'),
35 | ('tie_forget_and_input_gates', False),
36 | ('cap_input_gate', True),
37 | ('trainable_initial_state', False),
38 | ('inter_layer_dropout', 0.008633961431527571),
39 | ('state_dropout', 0.0437288219541186),
40 | ('state_dropout_flip_rate', 0.0),
41 | ('update_dropout', 0.0),
42 | ('cell_clip', -1.0),
43 | # objective
44 | ('model_average', 'arithmetic'),
45 | ('num_training_samples', 1),
46 | ('l2_penalty', 0.000993383826740019),
47 | ('l1_penalty', 0.0),
48 | ('activation_norm_penalty', 0.0),
49 | ('drop_state_probability', 0.01),
50 | # initialization
51 | ('embedding_init_factor', 1.0),
52 | ('scale_input_embeddings', False),
53 | ('cell_init_factor', 1.0),
54 | ('forget_bias', 1.0),
55 | ('output_init_factor', 1.0),
56 | # schedule
57 | ('steps_per_turn', 400),
58 | ('print_training_stats_every_num_steps', 1000),
59 | ('turns', 100),
60 | # optimization
61 | ('optimizer_type', 'rmsprop'),
62 | ('rmsprop_beta2', 0.999),
63 | ('rmsprop_epsilon', 1e-08),
64 | ('adam_beta1', 0.9),
65 | ('adam_beta2', 0.999),
66 | ('adam_epsilon', 1e-08),
67 | ('batch_size', 128),
68 | ('accum_batch_size', -1),
69 | ('max_grad_norm', 10.0),
70 | ('max_time_steps', 500),
71 | ('trigger_averaging_turns', 10),
72 | ('trigger_averaging_at_the_latest', 80),
73 | # learning rate
74 | ('learning_rate', 0.001021423409385794),
75 | ('learning_rate_decay', 1.0),
76 | ('learning_rate_decay_burn_in_steps', 0),
77 | ('drop_learning_rate_turns', -1),
78 | ('drop_learning_rate_multiplier', 1.0),
79 | ('drop_learning_rate_at_the_latest', -1),
80 | # early stopping
81 | ('early_stopping_turns', -1),
82 | ('early_stopping_rampup_turns', 0),
83 | ('early_stopping_worst_xe_target', ''),
84 | ('early_stopping_slowest_rate', 0.0),
85 | # cross-validation
86 | ('crossvalidate', False),
87 | ('crossvalidation_folds', 10),
88 | ('crossvalidation_rounds', 1),
89 | # evaluation
90 | ('max_training_eval_batches', 20),
91 | ('max_eval_eval_batches', -1),
92 | ('max_test_eval_batches', -1),
93 | ('min_non_episodic_eval_examples_per_stripe', 100),
94 | ('eval_on_test', False),
95 | ('eval_method', 'deterministic'),
96 | ('num_eval_samples', 0),
97 | ('eval_softmax_temperature', -0.8),
98 | ('eval_softmax_temperature_estimation_num_tokens', 50000),
99 | ('eval_power_mean_power', 1.0),
100 | ('eval_dropout_multiplier', 1.0),
101 | ('validation_prediction_file', ''),
102 | ('dyneval', False),
103 | ('dyneval_learning_rate', 0.001),
104 | ('dyneval_decay_rate', 0.02),
105 | ('dyneval_epsilon', 1e-05),
106 | # experiments
107 | # checkpoints
108 | ('save_checkpoints', True),
109 | # misc
110 | ('seed', 1),
111 | ('swap_memory', True),
112 | ('log_device_placement', False),
113 | ('summary_flush_secs', 120),
114 | ]
115 |
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/config/558aa30c0b15+_tune_mwc_fi_24m_lstm_d2_arms/trial_758/config:
--------------------------------------------------------------------------------
1 |
2 | [ ('config_version', 5),
3 | # data
4 | ('conditioning_separator', ''),
5 | ('file_encoding', 'utf-8'),
6 | ('word_based', False),
7 | ('episodic', False),
8 | # model
9 | ('num_params', 24000000),
10 | ('share_input_and_output_embeddings', False),
11 | ('input_embedding_size', -1),
12 | ('output_embedding_size', -1),
13 | ('input_embedding_ratio', 1.5665444454253725),
14 | ('output_embedding_ratio', -1.0),
15 | ('mos_num_components', 0),
16 | ('token_dropout', 0.0),
17 | ('embedding_dropout', 0.0),
18 | ('input_dropout', 0.0004278254540998817),
19 | ('output_dropout', 0.21672999424789158),
20 | ('downprojected_output_dropout', -1.0),
21 | ('shared_mask_dropout', False),
22 | ('embed_once', True),
23 | ('output_once', True),
24 | # cell
25 | ('model', 'lstm'),
26 | ('num_layers', 2),
27 | ('residual_connections', False),
28 | ('lstm_skip_connection', True),
29 | ('feature_mask_rounds', 0),
30 | ('feature_mask_rank', 0),
31 | ('feature_mask', False),
32 | ('sparsity_ratio', -1.0),
33 | ('overlay_rank', -1),
34 | ('hidden_size', [-1]),
35 | ('hidden_size_multiplier', 1.0),
36 | ('layer_norm', False),
37 | ('activation_fn', 'tf.tanh'),
38 | ('tie_forget_and_input_gates', False),
39 | ('cap_input_gate', True),
40 | ('trainable_initial_state', False),
41 | ('inter_layer_dropout', 0.03679207573249842),
42 | ('state_dropout', 0.15784488790163897),
43 | ('state_dropout_flip_rate', 0.0),
44 | ('update_dropout', 0.0),
45 | ('cell_clip', -1.0),
46 | # objective
47 | ('model_average', 'arithmetic'),
48 | ('num_training_samples', 1),
49 | ('l2_penalty', 3.35903544036833e-05),
50 | ('l1_penalty', 0.0),
51 | ('activation_norm_penalty', 0.0),
52 | ('drop_state_probability', 0.01),
53 | # initialization
54 | ('embedding_init_factor', 1.0),
55 | ('scale_input_embeddings', False),
56 | ('cell_init_factor', 1.0),
57 | ('forget_bias', 1.0),
58 | ('output_init_factor', 1.0),
59 | # schedule
60 | ('steps_per_turn', 200),
61 | ('print_training_stats_every_num_steps', 200),
62 | ('turns', 500),
63 | # optimization
64 | ('optimizer_type', 'rmsprop'),
65 | ('rmsprop_beta2', 0.999),
66 | ('rmsprop_epsilon', 1e-08),
67 | ('adam_beta1', 0.9),
68 | ('adam_beta2', 0.999),
69 | ('adam_epsilon', 1e-08),
70 | ('batch_size', 64),
71 | ('accum_batch_size', -1),
72 | ('max_grad_norm', 10.0),
73 | ('max_time_steps', 150),
74 | ('trigger_averaging_turns', 25),
75 | ('trigger_averaging_at_the_latest', 400),
76 | # learning rate
77 | ('learning_rate', 0.0038728221226125496),
78 | ('learning_rate_decay', 1.0),
79 | ('learning_rate_decay_burn_in_steps', 0),
80 | ('drop_learning_rate_turns', -1),
81 | ('drop_learning_rate_multiplier', 1.0),
82 | ('drop_learning_rate_at_the_latest', -1),
83 | # early stopping
84 | ('early_stopping_turns', -1),
85 | ('early_stopping_rampup_turns', 0),
86 | ('early_stopping_worst_xe_target', ''),
87 | ('early_stopping_slowest_rate', 0.0),
88 | # cross-validation
89 | ('crossvalidate', False),
90 | ('crossvalidation_folds', 10),
91 | ('crossvalidation_rounds', 1),
92 | # evaluation
93 | ('max_training_eval_batches', 20),
94 | ('max_eval_eval_batches', -1),
95 | ('max_test_eval_batches', -1),
96 | ('min_non_episodic_eval_examples_per_stripe', 100),
97 | ('eval_on_test', False),
98 | ('eval_method', 'deterministic'),
99 | ('num_eval_samples', 0),
100 | ('eval_softmax_temperature', -0.8),
101 | ('eval_softmax_temperature_estimation_num_tokens', 50000),
102 | ('eval_power_mean_power', 1.0),
103 | ('eval_dropout_multiplier', 1.0),
104 | ('validation_prediction_file', ''),
105 | ('dyneval', False),
106 | ('dyneval_learning_rate', 0.001),
107 | ('dyneval_decay_rate', 0.02),
108 | ('dyneval_epsilon', 1e-05),
109 | # experiments
110 | # checkpoints
111 | ('save_checkpoints', True),
112 | # misc
113 | ('seed', 1),
114 | ('swap_memory', True),
115 | ('log_device_placement', False),
116 | ('summary_flush_secs', 120),
117 | ]
118 |
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/config/558aa30c0b15+_tune_mwc_fi_24m_lstm_fm_d2_arms/trial_371/config:
--------------------------------------------------------------------------------
1 | [ ('config_version', 5),
2 | # data
3 | ('conditioning_separator', ''),
4 | ('file_encoding', 'utf-8'),
5 | ('word_based', False),
6 | ('episodic', False),
7 | # model
8 | ('num_params', 24000000),
9 | ('share_input_and_output_embeddings', False),
10 | ('input_embedding_size', -1),
11 | ('output_embedding_size', -1),
12 | ('input_embedding_ratio', 1.3016032871831578),
13 | ('output_embedding_ratio', -1.0),
14 | ('mos_num_components', 0),
15 | ('token_dropout', 0.0),
16 | ('embedding_dropout', 0.0),
17 | ('input_dropout', 0.06995541397794428),
18 | ('output_dropout', 0.22149685667402097),
19 | ('downprojected_output_dropout', -1.0),
20 | ('shared_mask_dropout', False),
21 | ('embed_once', True),
22 | ('output_once', True),
23 | # cell
24 | ('model', 'lstm'),
25 | ('num_layers', 2),
26 | ('residual_connections', False),
27 | ('lstm_skip_connection', True),
28 | ('feature_mask_rounds', 5),
29 | ('feature_mask_rank', 100),
30 | ('feature_mask', False),
31 | ('sparsity_ratio', -1.0),
32 | ('overlay_rank', -1),
33 | ('hidden_size', [-1]),
34 | ('hidden_size_multiplier', 1.0),
35 | ('layer_norm', False),
36 | ('activation_fn', 'tf.tanh'),
37 | ('tie_forget_and_input_gates', False),
38 | ('cap_input_gate', True),
39 | ('trainable_initial_state', False),
40 | ('inter_layer_dropout', 0.11571939622760244),
41 | ('state_dropout', 0.1759160317735942),
42 | ('state_dropout_flip_rate', 0.0),
43 | ('update_dropout', 0.0),
44 | ('cell_clip', -1.0),
45 | # objective
46 | ('model_average', 'arithmetic'),
47 | ('num_training_samples', 1),
48 | ('l2_penalty', 9.607977185924193e-05),
49 | ('l1_penalty', 0.0),
50 | ('activation_norm_penalty', 0.0),
51 | ('drop_state_probability', 0.01),
52 | # initialization
53 | ('embedding_init_factor', 1.0),
54 | ('scale_input_embeddings', False),
55 | ('cell_init_factor', 1.0),
56 | ('forget_bias', 1.0),
57 | ('output_init_factor', 1.0),
58 | # schedule
59 | ('steps_per_turn', 200),
60 | ('print_training_stats_every_num_steps', 200),
61 | ('turns', 500),
62 | # optimization
63 | ('optimizer_type', 'rmsprop'),
64 | ('rmsprop_beta2', 0.999),
65 | ('rmsprop_epsilon', 1e-08),
66 | ('adam_beta1', 0.9),
67 | ('adam_beta2', 0.999),
68 | ('adam_epsilon', 1e-08),
69 | ('batch_size', 64),
70 | ('accum_batch_size', -1),
71 | ('max_grad_norm', 10.0),
72 | ('max_time_steps', 150),
73 | ('trigger_averaging_turns', 25),
74 | ('trigger_averaging_at_the_latest', 400),
75 | # learning rate
76 | ('learning_rate', 0.001999992683987708),
77 | ('learning_rate_decay', 1.0),
78 | ('learning_rate_decay_burn_in_steps', 0),
79 | ('drop_learning_rate_turns', -1),
80 | ('drop_learning_rate_multiplier', 1.0),
81 | ('drop_learning_rate_at_the_latest', -1),
82 | # early stopping
83 | ('early_stopping_turns', -1),
84 | ('early_stopping_rampup_turns', 0),
85 | ('early_stopping_worst_xe_target', ''),
86 | ('early_stopping_slowest_rate', 0.0),
87 | # cross-validation
88 | ('crossvalidate', False),
89 | ('crossvalidation_folds', 10),
90 | ('crossvalidation_rounds', 1),
91 | # evaluation
92 | ('max_training_eval_batches', 20),
93 | ('max_eval_eval_batches', -1),
94 | ('max_test_eval_batches', -1),
95 | ('min_non_episodic_eval_examples_per_stripe', 100),
96 | ('eval_on_test', False),
97 | ('eval_method', 'deterministic'),
98 | ('num_eval_samples', 0),
99 | ('eval_softmax_temperature', -0.8),
100 | ('eval_softmax_temperature_estimation_num_tokens', 50000),
101 | ('eval_power_mean_power', 1.0),
102 | ('eval_dropout_multiplier', 1.0),
103 | ('validation_prediction_file', ''),
104 | ('dyneval', False),
105 | ('dyneval_learning_rate', 0.001),
106 | ('dyneval_decay_rate', 0.02),
107 | ('dyneval_epsilon', 1e-05),
108 | # experiments
109 | # checkpoints
110 | ('save_checkpoints', True),
111 | # misc
112 | ('seed', 1),
113 | ('swap_memory', True),
114 | ('log_device_placement', False),
115 | ('summary_flush_secs', 120),
116 | ]
117 |
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/config/786252db3825+_tune_ptb_24m_lstm_d2_arms/trial_833/config:
--------------------------------------------------------------------------------
1 | [ ('config_version', 5),
2 | # data
3 | ('file_encoding', 'utf-8'),
4 | ('word_based', True),
5 | ('episodic', False),
6 | # model
7 | ('num_params', 24000000),
8 | ('share_input_and_output_embeddings', True),
9 | ('input_embedding_size', -1),
10 | ('output_embedding_size', -1),
11 | ('input_embedding_ratio', 1.0),
12 | ('output_embedding_ratio', -1.0),
13 | ('mos_num_components', 0),
14 | ('token_dropout', 0.0),
15 | ('embedding_dropout', 0.0),
16 | ('input_dropout', 0.6275626425150355),
17 | ('output_dropout', 0.6901712653612706),
18 | ('downprojected_output_dropout', -1.0),
19 | ('shared_mask_dropout', False),
20 | ('embed_once', True),
21 | # cell
22 | ('model', 'lstm'),
23 | ('num_layers', 2),
24 | ('residual_connections', False),
25 | ('lstm_skip_connection', True),
26 | ('feature_mask_rounds', 0),
27 | ('feature_mask_rank', 0),
28 | ('feature_mask', False),
29 | ('sparsity_ratio', -1.0),
30 | ('overlay_rank', -1),
31 | ('hidden_size', [-1]),
32 | ('hidden_size_multiplier', 1.0),
33 | ('layer_norm', False),
34 | ('activation_fn', 'tf.tanh'),
35 | ('tie_forget_and_input_gates', False),
36 | ('cap_input_gate', True),
37 | ('trainable_initial_state', False),
38 | ('inter_layer_dropout', 0.3069926535017156),
39 | ('state_dropout', 0.3692225400980858),
40 | ('state_dropout_flip_rate', 0.0),
41 | ('update_dropout', 0.0),
42 | ('cell_clip', -1.0),
43 | # objective
44 | ('model_average', 'arithmetic'),
45 | ('num_training_samples', 1),
46 | ('l2_penalty', 0.00024908138497223704),
47 | ('l1_penalty', 0.0),
48 | ('activation_norm_penalty', 0.0),
49 | ('drop_state_probability', 0.01),
50 | # initialization
51 | ('embedding_init_factor', 1.0),
52 | ('scale_input_embeddings', False),
53 | ('cell_init_factor', 1.0),
54 | ('forget_bias', 1.0),
55 | ('output_init_factor', 1.0),
56 | # schedule
57 | ('steps_per_turn', 200),
58 | ('print_training_stats_every_num_steps', 200),
59 | ('turns', 1000),
60 | # optimization
61 | ('optimizer_type', 'rmsprop'),
62 | ('rmsprop_beta2', 0.999),
63 | ('rmsprop_epsilon', 1e-08),
64 | ('adam_beta1', 0.9),
65 | ('adam_beta2', 0.999),
66 | ('adam_epsilon', 1e-08),
67 | ('batch_size', 64),
68 | ('accum_batch_size', -1),
69 | ('max_grad_norm', 10.0),
70 | ('max_time_steps', 70),
71 | ('trigger_averaging_turns', 50),
72 | ('trigger_averaging_at_the_latest', 800),
73 | # learning rate
74 | ('learning_rate', 0.0030369099569192135),
75 | ('learning_rate_decay', 1.0),
76 | ('learning_rate_decay_burn_in_steps', 0),
77 | ('drop_learning_rate_turns', -1),
78 | ('drop_learning_rate_multiplier', 1.0),
79 | ('drop_learning_rate_at_the_latest', -1),
80 | # early stopping
81 | ('early_stopping_turns', -1),
82 | ('early_stopping_rampup_turns', 0),
83 | ('early_stopping_worst_xe_target', ''),
84 | ('early_stopping_slowest_rate', 0.0),
85 | # cross-validation
86 | ('crossvalidate', False),
87 | ('crossvalidation_folds', 10),
88 | ('crossvalidation_rounds', 1),
89 | # evaluation
90 | ('max_training_eval_batches', 20),
91 | ('max_eval_eval_batches', -1),
92 | ('max_test_eval_batches', -1),
93 | ('min_non_episodic_eval_examples_per_stripe', 100),
94 | ('eval_on_test', False),
95 | ('eval_method', 'deterministic'),
96 | ('num_eval_samples', 0),
97 | ('eval_softmax_temperature', -0.8),
98 | ('eval_softmax_temperature_estimation_num_tokens', 50000),
99 | ('eval_power_mean_power', 1.0),
100 | ('eval_dropout_multiplier', 1.0),
101 | ('validation_prediction_file', ''),
102 | ('dyneval', False),
103 | ('dyneval_learning_rate', 0.001),
104 | ('dyneval_decay_rate', 0.02),
105 | ('dyneval_epsilon', 1e-05),
106 | # experiments
107 | # checkpoints
108 | ('save_checkpoints', True),
109 | # misc
110 | ('seed', 1),
111 | ('swap_memory', False),
112 | ('log_device_placement', False),
113 | ('summary_flush_secs', 120),
114 | ]
115 |
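The trial configs in this directory are plain Python literals: a list of (flag, value) pairs grouped by # comments. A minimal standalone sketch for reading one flag back out of such a file (an illustration only, not a repo script, run from /lamb/experiment/mogrifier; it assumes nothing beyond the file parsing as a Python literal):

    # Print the tuned learning rate from trial_833's config.
    python3 -c '
    import ast, sys
    flags = dict(ast.literal_eval(open(sys.argv[1]).read()))
    print(flags["learning_rate"])
    ' config/786252db3825+_tune_ptb_24m_lstm_d2_arms/trial_833/config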
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/config/786252db3825+_tune_ptb_24m_lstm_fm_d2_arms/trial_483/config:
--------------------------------------------------------------------------------
1 | [ ('config_version', 5),
2 | # data
3 | ('file_encoding', 'utf-8'),
4 | ('word_based', True),
5 | ('episodic', False),
6 | # model
7 | ('num_params', 24000000),
8 | ('share_input_and_output_embeddings', True),
9 | ('input_embedding_size', -1),
10 | ('output_embedding_size', -1),
11 | ('input_embedding_ratio', 1.0),
12 | ('output_embedding_ratio', -1.0),
13 | ('mos_num_components', 0),
14 | ('token_dropout', 0.0),
15 | ('embedding_dropout', 0.0),
16 | ('input_dropout', 0.7290787773167251),
17 | ('output_dropout', 0.7156690388448465),
18 | ('downprojected_output_dropout', -1.0),
19 | ('shared_mask_dropout', False),
20 | ('embed_once', True),
21 | # cell
22 | ('model', 'lstm'),
23 | ('num_layers', 2),
24 | ('residual_connections', False),
25 | ('lstm_skip_connection', True),
26 | ('feature_mask_rounds', 5),
27 | ('feature_mask_rank', 84),
28 | ('feature_mask', False),
29 | ('sparsity_ratio', -1.0),
30 | ('overlay_rank', -1),
31 | ('hidden_size', [-1]),
32 | ('hidden_size_multiplier', 1.0),
33 | ('layer_norm', False),
34 | ('activation_fn', 'tf.tanh'),
35 | ('tie_forget_and_input_gates', False),
36 | ('cap_input_gate', True),
37 | ('trainable_initial_state', False),
38 | ('inter_layer_dropout', 0.2909822365241189),
39 | ('state_dropout', 0.38729439899832296),
40 | ('state_dropout_flip_rate', 0.0),
41 | ('update_dropout', 0.0),
42 | ('cell_clip', -1.0),
43 | # objective
44 | ('model_average', 'arithmetic'),
45 | ('num_training_samples', 1),
46 | ('l2_penalty', 0.00025235335778471014),
47 | ('l1_penalty', 0.0),
48 | ('activation_norm_penalty', 0.0),
49 | ('drop_state_probability', 0.01),
50 | # initialization
51 | ('embedding_init_factor', 1.0),
52 | ('scale_input_embeddings', False),
53 | ('cell_init_factor', 1.0),
54 | ('forget_bias', 1.0),
55 | ('output_init_factor', 1.0),
56 | # schedule
57 | ('steps_per_turn', 200),
58 | ('print_training_stats_every_num_steps', 200),
59 | ('turns', 1000),
60 | # optimization
61 | ('optimizer_type', 'rmsprop'),
62 | ('rmsprop_beta2', 0.999),
63 | ('rmsprop_epsilon', 1e-08),
64 | ('adam_beta1', 0.9),
65 | ('adam_beta2', 0.999),
66 | ('adam_epsilon', 1e-08),
67 | ('batch_size', 64),
68 | ('accum_batch_size', -1),
69 | ('max_grad_norm', 10.0),
70 | ('max_time_steps', 70),
71 | ('trigger_averaging_turns', 50),
72 | ('trigger_averaging_at_the_latest', 800),
73 | # learning rate
74 | ('learning_rate', 0.002299987130225388),
75 | ('learning_rate_decay', 1.0),
76 | ('learning_rate_decay_burn_in_steps', 0),
77 | ('drop_learning_rate_turns', -1),
78 | ('drop_learning_rate_multiplier', 1.0),
79 | ('drop_learning_rate_at_the_latest', -1),
80 | # early stopping
81 | ('early_stopping_turns', -1),
82 | ('early_stopping_rampup_turns', 0),
83 | ('early_stopping_worst_xe_target', ''),
84 | ('early_stopping_slowest_rate', 0.0),
85 | # cross-validation
86 | ('crossvalidate', False),
87 | ('crossvalidation_folds', 10),
88 | ('crossvalidation_rounds', 1),
89 | # evaluation
90 | ('max_training_eval_batches', 20),
91 | ('max_eval_eval_batches', -1),
92 | ('max_test_eval_batches', -1),
93 | ('min_non_episodic_eval_examples_per_stripe', 100),
94 | ('eval_on_test', False),
95 | ('eval_method', 'deterministic'),
96 | ('num_eval_samples', 0),
97 | ('eval_softmax_temperature', -0.8),
98 | ('eval_softmax_temperature_estimation_num_tokens', 50000),
99 | ('eval_power_mean_power', 1.0),
100 | ('eval_dropout_multiplier', 1.0),
101 | ('validation_prediction_file', ''),
102 | ('dyneval', False),
103 | ('dyneval_learning_rate', 0.001),
104 | ('dyneval_decay_rate', 0.02),
105 | ('dyneval_epsilon', 1e-05),
106 | # experiments
107 | # checkpoints
108 | ('save_checkpoints', True),
109 | # misc
110 | ('seed', 1),
111 | ('swap_memory', False),
112 | ('log_device_placement', False),
113 | ('summary_flush_secs', 120),
114 | ]
115 |
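This _fm_ trial differs from its vanilla counterpart above only in the feature-mask flags (feature_mask_rounds and feature_mask_rank, here 5 and 84 versus 0 and 0) and in the retuned dropouts, learning_rate, and l2_penalty. A throwaway sketch to see exactly which flags a pair of trials disagrees on (not a repo script; run from /lamb/experiment/mogrifier):

    diff \
      config/786252db3825+_tune_ptb_24m_lstm_d2_arms/trial_833/config \
      config/786252db3825+_tune_ptb_24m_lstm_fm_d2_arms/trial_483/config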
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/config/9e20581d3dad+_tune_mwc_en_24m_lstm_d2_arms/trial_502/config:
--------------------------------------------------------------------------------
1 | [ ('config_version', 5),
2 | # data
3 | ('file_encoding', 'utf-8'),
4 | ('word_based', False),
5 | ('episodic', False),
6 | # model
7 | ('num_params', 24000000),
8 | ('share_input_and_output_embeddings', False),
9 | ('input_embedding_size', -1),
10 | ('output_embedding_size', -1),
11 | ('input_embedding_ratio', 0.16763290107221795),
12 | ('output_embedding_ratio', -1.0),
13 | ('mos_num_components', 0),
14 | ('token_dropout', 0.0),
15 | ('embedding_dropout', 0.0),
16 | ('input_dropout', 0.24803406411000273),
17 | ('output_dropout', 0.06200886700243824),
18 | ('downprojected_output_dropout', -1.0),
19 | ('shared_mask_dropout', False),
20 | ('embed_once', True),
21 | # cell
22 | ('model', 'lstm'),
23 | ('num_layers', 2),
24 | ('residual_connections', False),
25 | ('lstm_skip_connection', True),
26 | ('feature_mask_rounds', 0),
27 | ('feature_mask_rank', 0),
28 | ('feature_mask', False),
29 | ('sparsity_ratio', -1.0),
30 | ('overlay_rank', -1),
31 | ('hidden_size', [-1]),
32 | ('hidden_size_multiplier', 1.0),
33 | ('layer_norm', False),
34 | ('activation_fn', 'tf.tanh'),
35 | ('tie_forget_and_input_gates', False),
36 | ('cap_input_gate', True),
37 | ('trainable_initial_state', False),
38 | ('inter_layer_dropout', 0.04740148103981923),
39 | ('state_dropout', 0.046954638037220955),
40 | ('state_dropout_flip_rate', 0.0),
41 | ('update_dropout', 0.0),
42 | ('cell_clip', -1.0),
43 | # objective
44 | ('model_average', 'arithmetic'),
45 | ('num_training_samples', 1),
46 | ('l2_penalty', 7.825277510671981e-06),
47 | ('l1_penalty', 0.0),
48 | ('activation_norm_penalty', 0.0),
49 | ('drop_state_probability', 0.01),
50 | # initialization
51 | ('embedding_init_factor', 1.0),
52 | ('scale_input_embeddings', False),
53 | ('cell_init_factor', 1.0),
54 | ('forget_bias', 1.0),
55 | ('output_init_factor', 1.0),
56 | # schedule
57 | ('steps_per_turn', 200),
58 | ('print_training_stats_every_num_steps', 200),
59 | ('turns', 500),
60 | # optimization
61 | ('optimizer_type', 'rmsprop'),
62 | ('rmsprop_beta2', 0.999),
63 | ('rmsprop_epsilon', 1e-08),
64 | ('adam_beta1', 0.9),
65 | ('adam_beta2', 0.999),
66 | ('adam_epsilon', 1e-08),
67 | ('batch_size', 64),
68 | ('accum_batch_size', -1),
69 | ('max_grad_norm', 10.0),
70 | ('max_time_steps', 150),
71 | ('trigger_averaging_turns', 25),
72 | ('trigger_averaging_at_the_latest', 400),
73 | # learning rate
74 | ('learning_rate', 0.0038051220647221428),
75 | ('learning_rate_decay', 1.0),
76 | ('learning_rate_decay_burn_in_steps', 0),
77 | ('drop_learning_rate_turns', -1),
78 | ('drop_learning_rate_multiplier', 1.0),
79 | ('drop_learning_rate_at_the_latest', -1),
80 | # early stopping
81 | ('early_stopping_turns', -1),
82 | ('early_stopping_rampup_turns', 0),
83 | ('early_stopping_worst_xe_target', ''),
84 | ('early_stopping_slowest_rate', 0.0),
85 | # cross-validation
86 | ('crossvalidate', False),
87 | ('crossvalidation_folds', 10),
88 | ('crossvalidation_rounds', 1),
89 | # evaluation
90 | ('max_training_eval_batches', 20),
91 | ('max_eval_eval_batches', -1),
92 | ('max_test_eval_batches', -1),
93 | ('min_non_episodic_eval_examples_per_stripe', 100),
94 | ('eval_on_test', False),
95 | ('eval_method', 'deterministic'),
96 | ('num_eval_samples', 0),
97 | ('eval_softmax_temperature', -0.8),
98 | ('eval_softmax_temperature_estimation_num_tokens', 50000),
99 | ('eval_power_mean_power', 1.0),
100 | ('eval_dropout_multiplier', 1.0),
101 | ('validation_prediction_file', ''),
102 | ('dyneval', False),
103 | ('dyneval_learning_rate', 0.001),
104 | ('dyneval_decay_rate', 0.02),
105 | ('dyneval_epsilon', 1e-05),
106 | # experiments
107 | # checkpoints
108 | ('save_checkpoints', True),
109 | # misc
110 | ('seed', 1),
111 | ('swap_memory', True),
112 | ('log_device_placement', False),
113 | ('summary_flush_secs', 120),
114 | ]
115 |
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/config/9e20581d3dad+_tune_mwc_en_24m_lstm_fm_d2_arms/trial_422/config:
--------------------------------------------------------------------------------
1 | [ ('config_version', 5),
2 | # data
3 | ('file_encoding', 'utf-8'),
4 | ('word_based', False),
5 | ('episodic', False),
6 | # model
7 | ('num_params', 24000000),
8 | ('share_input_and_output_embeddings', False),
9 | ('input_embedding_size', -1),
10 | ('output_embedding_size', -1),
11 | ('input_embedding_ratio', 0.48783057795681084),
12 | ('output_embedding_ratio', -1.0),
13 | ('mos_num_components', 0),
14 | ('token_dropout', 0.0),
15 | ('embedding_dropout', 0.0),
16 | ('input_dropout', 0.22379479910762798),
17 | ('output_dropout', 0.005212299871888891),
18 | ('downprojected_output_dropout', -1.0),
19 | ('shared_mask_dropout', False),
20 | ('embed_once', True),
21 | # cell
22 | ('model', 'lstm'),
23 | ('num_layers', 2),
24 | ('residual_connections', False),
25 | ('lstm_skip_connection', True),
26 | ('feature_mask_rounds', 6),
27 | ('feature_mask_rank', 78),
28 | ('feature_mask', False),
29 | ('sparsity_ratio', -1.0),
30 | ('overlay_rank', -1),
31 | ('hidden_size', [-1]),
32 | ('hidden_size_multiplier', 1.0),
33 | ('layer_norm', False),
34 | ('activation_fn', 'tf.tanh'),
35 | ('tie_forget_and_input_gates', False),
36 | ('cap_input_gate', True),
37 | ('trainable_initial_state', False),
38 | ('inter_layer_dropout', 0.08779703173530118),
39 | ('state_dropout', 0.09548532162445378),
40 | ('state_dropout_flip_rate', 0.0),
41 | ('update_dropout', 0.0),
42 | ('cell_clip', -1.0),
43 | # objective
44 | ('model_average', 'arithmetic'),
45 | ('num_training_samples', 1),
46 | ('l2_penalty', 9.245434142118616e-05),
47 | ('l1_penalty', 0.0),
48 | ('activation_norm_penalty', 0.0),
49 | ('drop_state_probability', 0.01),
50 | # initialization
51 | ('embedding_init_factor', 1.0),
52 | ('scale_input_embeddings', False),
53 | ('cell_init_factor', 1.0),
54 | ('forget_bias', 1.0),
55 | ('output_init_factor', 1.0),
56 | # schedule
57 | ('steps_per_turn', 200),
58 | ('print_training_stats_every_num_steps', 200),
59 | ('turns', 500),
60 | # optimization
61 | ('optimizer_type', 'rmsprop'),
62 | ('rmsprop_beta2', 0.999),
63 | ('rmsprop_epsilon', 1e-08),
64 | ('adam_beta1', 0.9),
65 | ('adam_beta2', 0.999),
66 | ('adam_epsilon', 1e-08),
67 | ('batch_size', 64),
68 | ('accum_batch_size', -1),
69 | ('max_grad_norm', 10.0),
70 | ('max_time_steps', 150),
71 | ('trigger_averaging_turns', 25),
72 | ('trigger_averaging_at_the_latest', 400),
73 | # learning rate
74 | ('learning_rate', 0.0014344414472614946),
75 | ('learning_rate_decay', 1.0),
76 | ('learning_rate_decay_burn_in_steps', 0),
77 | ('drop_learning_rate_turns', -1),
78 | ('drop_learning_rate_multiplier', 1.0),
79 | ('drop_learning_rate_at_the_latest', -1),
80 | # early stopping
81 | ('early_stopping_turns', -1),
82 | ('early_stopping_rampup_turns', 0),
83 | ('early_stopping_worst_xe_target', ''),
84 | ('early_stopping_slowest_rate', 0.0),
85 | # cross-validation
86 | ('crossvalidate', False),
87 | ('crossvalidation_folds', 10),
88 | ('crossvalidation_rounds', 1),
89 | # evaluation
90 | ('max_training_eval_batches', 20),
91 | ('max_eval_eval_batches', -1),
92 | ('max_test_eval_batches', -1),
93 | ('min_non_episodic_eval_examples_per_stripe', 100),
94 | ('eval_on_test', False),
95 | ('eval_method', 'deterministic'),
96 | ('num_eval_samples', 0),
97 | ('eval_softmax_temperature', -0.8),
98 | ('eval_softmax_temperature_estimation_num_tokens', 50000),
99 | ('eval_power_mean_power', 1.0),
100 | ('eval_dropout_multiplier', 1.0),
101 | ('validation_prediction_file', ''),
102 | ('dyneval', False),
103 | ('dyneval_learning_rate', 0.001),
104 | ('dyneval_decay_rate', 0.02),
105 | ('dyneval_epsilon', 1e-05),
106 | # experiments
107 | # checkpoints
108 | ('save_checkpoints', True),
109 | # misc
110 | ('seed', 1),
111 | ('swap_memory', True),
112 | ('log_device_placement', False),
113 | ('summary_flush_secs', 120),
114 | ]
115 |
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/config/c51c838b33a5+_tune_wikitext-2_35m_lstm_mos2_d2_arms/trial_763/config:
--------------------------------------------------------------------------------
1 | [ ('config_version', 5),
2 | # data
3 | ('file_encoding', 'utf-8'),
4 | ('word_based', True),
5 | ('episodic', False),
6 | # model
7 | ('num_params', 35000000),
8 | ('share_input_and_output_embeddings', True),
9 | ('input_embedding_size', -1),
10 | ('output_embedding_size', -1),
11 | ('input_embedding_ratio', 0.3530770457779424),
12 | ('output_embedding_ratio', -1.0),
13 | ('mos_num_components', 2),
14 | ('token_dropout', 0.0),
15 | ('embedding_dropout', 0.0),
16 | ('input_dropout', 0.6090979517941943),
17 | ('output_dropout', 0.34845530389157287),
18 | ('downprojected_output_dropout', -1.0),
19 | ('shared_mask_dropout', False),
20 | ('embed_once', True),
21 | # cell
22 | ('model', 'lstm'),
23 | ('num_layers', 2),
24 | ('residual_connections', False),
25 | ('lstm_skip_connection', True),
26 | ('feature_mask_rounds', 0),
27 | ('feature_mask_rank', 0),
28 | ('feature_mask', False),
29 | ('sparsity_ratio', -1.0),
30 | ('overlay_rank', -1),
31 | ('hidden_size', [-1]),
32 | ('hidden_size_multiplier', 1.0),
33 | ('layer_norm', False),
34 | ('activation_fn', 'tf.tanh'),
35 | ('tie_forget_and_input_gates', False),
36 | ('cap_input_gate', True),
37 | ('trainable_initial_state', False),
38 | ('inter_layer_dropout', 0.09075401405970591),
39 | ('state_dropout', 0.2714030562283111),
40 | ('state_dropout_flip_rate', 0.0),
41 | ('update_dropout', 0.0),
42 | ('cell_clip', -1.0),
43 | # objective
44 | ('model_average', 'arithmetic'),
45 | ('num_training_samples', 1),
46 | ('l2_penalty', 0.00023063627783021125),
47 | ('l1_penalty', 0.0),
48 | ('activation_norm_penalty', 0.0),
49 | ('drop_state_probability', 0.01),
50 | # initialization
51 | ('embedding_init_factor', 1.0),
52 | ('scale_input_embeddings', False),
53 | ('cell_init_factor', 1.0),
54 | ('forget_bias', 1.0),
55 | ('output_init_factor', 1.0),
56 | # schedule
57 | ('steps_per_turn', 200),
58 | ('print_training_stats_every_num_steps', 200),
59 | ('turns', 1000),
60 | # optimization
61 | ('optimizer_type', 'rmsprop'),
62 | ('rmsprop_beta2', 0.999),
63 | ('rmsprop_epsilon', 1e-08),
64 | ('adam_beta1', 0.9),
65 | ('adam_beta2', 0.999),
66 | ('adam_epsilon', 1e-08),
67 | ('batch_size', 64),
68 | ('accum_batch_size', -1),
69 | ('max_grad_norm', 10.0),
70 | ('max_time_steps', 70),
71 | ('trigger_averaging_turns', 50),
72 | ('trigger_averaging_at_the_latest', 800),
73 | # learning rate
74 | ('learning_rate', 0.003183909546336849),
75 | ('learning_rate_decay', 1.0),
76 | ('learning_rate_decay_burn_in_steps', 0),
77 | ('drop_learning_rate_turns', -1),
78 | ('drop_learning_rate_multiplier', 1.0),
79 | ('drop_learning_rate_at_the_latest', -1),
80 | # early stopping
81 | ('early_stopping_turns', -1),
82 | ('early_stopping_rampup_turns', 0),
83 | ('early_stopping_worst_xe_target', ''),
84 | ('early_stopping_slowest_rate', 0.0),
85 | # cross-validation
86 | ('crossvalidate', False),
87 | ('crossvalidation_folds', 10),
88 | ('crossvalidation_rounds', 1),
89 | # evaluation
90 | ('max_training_eval_batches', 20),
91 | ('max_eval_eval_batches', -1),
92 | ('max_test_eval_batches', -1),
93 | ('min_non_episodic_eval_examples_per_stripe', 100),
94 | ('eval_on_test', False),
95 | ('eval_method', 'deterministic'),
96 | ('num_eval_samples', 0),
97 | ('eval_softmax_temperature', -0.8),
98 | ('eval_softmax_temperature_estimation_num_tokens', 50000),
99 | ('eval_power_mean_power', 1.0),
100 | ('eval_dropout_multiplier', 1.0),
101 | ('validation_prediction_file', ''),
102 | ('dyneval', False),
103 | ('dyneval_learning_rate', 0.001),
104 | ('dyneval_decay_rate', 0.02),
105 | ('dyneval_epsilon', 1e-05),
106 | # experiments
107 | # checkpoints
108 | ('save_checkpoints', True),
109 | # misc
110 | ('seed', 1),
111 | ('swap_memory', True),
112 | ('log_device_placement', False),
113 | ('summary_flush_secs', 120),
114 | ]
115 |
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/config/c51c838b33a5+_tune_wikitext-2_35m_lstm_mos2_fm_d2_arms/trial_747/config:
--------------------------------------------------------------------------------
1 | [ ('config_version', 5),
2 | # data
3 | ('file_encoding', 'utf-8'),
4 | ('word_based', True),
5 | ('episodic', False),
6 | # model
7 | ('num_params', 35000000),
8 | ('share_input_and_output_embeddings', True),
9 | ('input_embedding_size', -1),
10 | ('output_embedding_size', -1),
11 | ('input_embedding_ratio', 0.1993194960596213),
12 | ('output_embedding_ratio', -1.0),
13 | ('mos_num_components', 2),
14 | ('token_dropout', 0.0),
15 | ('embedding_dropout', 0.0),
16 | ('input_dropout', 0.5469087645499495),
17 | ('output_dropout', 0.34766651193735193),
18 | ('downprojected_output_dropout', -1.0),
19 | ('shared_mask_dropout', False),
20 | ('embed_once', True),
21 | # cell
22 | ('model', 'lstm'),
23 | ('num_layers', 2),
24 | ('residual_connections', False),
25 | ('lstm_skip_connection', True),
26 | ('feature_mask_rounds', 6),
27 | ('feature_mask_rank', 48),
28 | ('feature_mask', False),
29 | ('sparsity_ratio', -1.0),
30 | ('overlay_rank', -1),
31 | ('hidden_size', [-1]),
32 | ('hidden_size_multiplier', 1.0),
33 | ('layer_norm', False),
34 | ('activation_fn', 'tf.tanh'),
35 | ('tie_forget_and_input_gates', False),
36 | ('cap_input_gate', True),
37 | ('trainable_initial_state', False),
38 | ('inter_layer_dropout', 0.1988228263748591),
39 | ('state_dropout', 0.22137985867236876),
40 | ('state_dropout_flip_rate', 0.0),
41 | ('update_dropout', 0.0),
42 | ('cell_clip', -1.0),
43 | # objective
44 | ('model_average', 'arithmetic'),
45 | ('num_training_samples', 1),
46 | ('l2_penalty', 0.00018994987193751323),
47 | ('l1_penalty', 0.0),
48 | ('activation_norm_penalty', 0.0),
49 | ('drop_state_probability', 0.01),
50 | # initialization
51 | ('embedding_init_factor', 1.0),
52 | ('scale_input_embeddings', False),
53 | ('cell_init_factor', 1.0),
54 | ('forget_bias', 1.0),
55 | ('output_init_factor', 1.0),
56 | # schedule
57 | ('steps_per_turn', 200),
58 | ('print_training_stats_every_num_steps', 200),
59 | ('turns', 1000),
60 | # optimization
61 | ('optimizer_type', 'rmsprop'),
62 | ('rmsprop_beta2', 0.999),
63 | ('rmsprop_epsilon', 1e-08),
64 | ('adam_beta1', 0.9),
65 | ('adam_beta2', 0.999),
66 | ('adam_epsilon', 1e-08),
67 | ('batch_size', 64),
68 | ('accum_batch_size', -1),
69 | ('max_grad_norm', 10.0),
70 | ('max_time_steps', 70),
71 | ('trigger_averaging_turns', 50),
72 | ('trigger_averaging_at_the_latest', 800),
73 | # learning rate
74 | ('learning_rate', 0.003287792100749033),
75 | ('learning_rate_decay', 1.0),
76 | ('learning_rate_decay_burn_in_steps', 0),
77 | ('drop_learning_rate_turns', -1),
78 | ('drop_learning_rate_multiplier', 1.0),
79 | ('drop_learning_rate_at_the_latest', -1),
80 | # early stopping
81 | ('early_stopping_turns', -1),
82 | ('early_stopping_rampup_turns', 0),
83 | ('early_stopping_worst_xe_target', ''),
84 | ('early_stopping_slowest_rate', 0.0),
85 | # cross-validation
86 | ('crossvalidate', False),
87 | ('crossvalidation_folds', 10),
88 | ('crossvalidation_rounds', 1),
89 | # evaluation
90 | ('max_training_eval_batches', 20),
91 | ('max_eval_eval_batches', -1),
92 | ('max_test_eval_batches', -1),
93 | ('min_non_episodic_eval_examples_per_stripe', 100),
94 | ('eval_on_test', False),
95 | ('eval_method', 'deterministic'),
96 | ('num_eval_samples', 0),
97 | ('eval_softmax_temperature', -0.8),
98 | ('eval_softmax_temperature_estimation_num_tokens', 50000),
99 | ('eval_power_mean_power', 1.0),
100 | ('eval_dropout_multiplier', 1.0),
101 | ('validation_prediction_file', ''),
102 | ('dyneval', False),
103 | ('dyneval_learning_rate', 0.001),
104 | ('dyneval_decay_rate', 0.02),
105 | ('dyneval_epsilon', 1e-05),
106 | # experiments
107 | # checkpoints
108 | ('save_checkpoints', True),
109 | # misc
110 | ('seed', 1),
111 | ('swap_memory', True),
112 | ('log_device_placement', False),
113 | ('summary_flush_secs', 120),
114 | ]
115 |
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/config/e81db31261c0+_tune_enwik8_96m_lstm_d4_arms/trial_295/config:
--------------------------------------------------------------------------------
1 | [ ('config_version', 5),
2 | # data
3 | ('file_encoding', 'CP437'),
4 | ('word_based', False),
5 | ('episodic', False),
6 | # model
7 | ('num_params', 96000000),
8 | ('share_input_and_output_embeddings', False),
9 | ('input_embedding_size', -1),
10 | ('output_embedding_size', -1),
11 | ('input_embedding_ratio', 1.0),
12 | ('output_embedding_ratio', -1.0),
13 | ('mos_num_components', 0),
14 | ('token_dropout', 0.0),
15 | ('embedding_dropout', 0.0),
16 | ('input_dropout', 0.1233672355450206),
17 | ('output_dropout', 0.24846692818769148),
18 | ('downprojected_output_dropout', -1.0),
19 | ('shared_mask_dropout', False),
20 | ('embed_once', True),
21 | # cell
22 | ('model', 'lstm'),
23 | ('num_layers', 4),
24 | ('residual_connections', False),
25 | ('lstm_skip_connection', True),
26 | ('feature_mask_rounds', 0),
27 | ('feature_mask_rank', 0),
28 | ('feature_mask', False),
29 | ('sparsity_ratio', -1.0),
30 | ('overlay_rank', -1),
31 | ('hidden_size', [-1]),
32 | ('hidden_size_multiplier', 1.0),
33 | ('layer_norm', False),
34 | ('activation_fn', 'tf.tanh'),
35 | ('tie_forget_and_input_gates', False),
36 | ('cap_input_gate', True),
37 | ('trainable_initial_state', False),
38 | ('inter_layer_dropout', 0.12636500626697247),
39 | ('state_dropout', 0.13063181510547955),
40 | ('state_dropout_flip_rate', 0.0),
41 | ('update_dropout', 0.0),
42 | ('cell_clip', -1.0),
43 | # objective
44 | ('model_average', 'arithmetic'),
45 | ('num_training_samples', 1),
46 | ('l2_penalty', 5.853555404849184e-05),
47 | ('l1_penalty', 0.0),
48 | ('activation_norm_penalty', 0.0),
49 | ('drop_state_probability', 0.01),
50 | # initialization
51 | ('embedding_init_factor', 1.0),
52 | ('scale_input_embeddings', False),
53 | ('cell_init_factor', 1.0),
54 | ('forget_bias', 1.0),
55 | ('output_init_factor', 1.0),
56 | # schedule
57 | ('steps_per_turn', 400),
58 | ('print_training_stats_every_num_steps', 1000),
59 | ('turns', 100),
60 | # optimization
61 | ('optimizer_type', 'rmsprop'),
62 | ('rmsprop_beta2', 0.999),
63 | ('rmsprop_epsilon', 1e-08),
64 | ('adam_beta1', 0.9),
65 | ('adam_beta2', 0.999),
66 | ('adam_epsilon', 1e-08),
67 | ('batch_size', 128),
68 | ('accum_batch_size', 64),
69 | ('max_grad_norm', 10.0),
70 | ('max_time_steps', 500),
71 | ('trigger_averaging_turns', 10),
72 | ('trigger_averaging_at_the_latest', 80),
73 | # learning rate
74 | ('learning_rate', 0.001975213597736287),
75 | ('learning_rate_decay', 1.0),
76 | ('learning_rate_decay_burn_in_steps', 0),
77 | ('drop_learning_rate_turns', -1),
78 | ('drop_learning_rate_multiplier', 1.0),
79 | ('drop_learning_rate_at_the_latest', -1),
80 | # early stopping
81 | ('early_stopping_turns', -1),
82 | ('early_stopping_rampup_turns', 0),
83 | ('early_stopping_worst_xe_target', ''),
84 | ('early_stopping_slowest_rate', 0.0),
85 | # cross-validation
86 | ('crossvalidate', False),
87 | ('crossvalidation_folds', 10),
88 | ('crossvalidation_rounds', 1),
89 | # evaluation
90 | ('max_training_eval_batches', 20),
91 | ('max_eval_eval_batches', -1),
92 | ('max_test_eval_batches', -1),
93 | ('min_non_episodic_eval_examples_per_stripe', 100),
94 | ('eval_on_test', False),
95 | ('eval_method', 'deterministic'),
96 | ('num_eval_samples', 0),
97 | ('eval_softmax_temperature', -0.8),
98 | ('eval_softmax_temperature_estimation_num_tokens', 50000),
99 | ('eval_power_mean_power', 1.0),
100 | ('eval_dropout_multiplier', 1.0),
101 | ('validation_prediction_file', ''),
102 | ('dyneval', False),
103 | ('dyneval_learning_rate', 0.001),
104 | ('dyneval_decay_rate', 0.02),
105 | ('dyneval_epsilon', 1e-05),
106 | # experiments
107 | # checkpoints
108 | ('save_checkpoints', True),
109 | # misc
110 | ('seed', 1),
111 | ('swap_memory', True),
112 | ('log_device_placement', False),
113 | ('summary_flush_secs', 120),
114 | ]
115 |
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/config/e81db31261c0+_tune_enwik8_96m_lstm_fm_d4_arms_MtngW/trial_216/config:
--------------------------------------------------------------------------------
1 | [ ('config_version', 5),
2 | # data
3 | ('file_encoding', 'CP437'),
4 | ('word_based', False),
5 | ('episodic', False),
6 | # model
7 | ('num_params', 96000000),
8 | ('share_input_and_output_embeddings', False),
9 | ('input_embedding_size', -1),
10 | ('output_embedding_size', -1),
11 | ('input_embedding_ratio', 1.0),
12 | ('output_embedding_ratio', -1.0),
13 | ('mos_num_components', 0),
14 | ('token_dropout', 0.0),
15 | ('embedding_dropout', 0.0),
16 | ('input_dropout', 0.015908852824256536),
17 | ('output_dropout', 0.2878539844807166),
18 | ('downprojected_output_dropout', -1.0),
19 | ('shared_mask_dropout', False),
20 | ('embed_once', True),
21 | # cell
22 | ('model', 'lstm'),
23 | ('num_layers', 4),
24 | ('residual_connections', False),
25 | ('lstm_skip_connection', True),
26 | ('feature_mask_rounds', 5),
27 | ('feature_mask_rank', 61),
28 | ('feature_mask', False),
29 | ('sparsity_ratio', -1.0),
30 | ('overlay_rank', -1),
31 | ('hidden_size', [-1]),
32 | ('hidden_size_multiplier', 1.0),
33 | ('layer_norm', False),
34 | ('activation_fn', 'tf.tanh'),
35 | ('tie_forget_and_input_gates', False),
36 | ('cap_input_gate', True),
37 | ('trainable_initial_state', False),
38 | ('inter_layer_dropout', 0.13785990907975867),
39 | ('state_dropout', 0.1648727901535727),
40 | ('state_dropout_flip_rate', 0.0),
41 | ('update_dropout', 0.0),
42 | ('cell_clip', -1.0),
43 | # objective
44 | ('model_average', 'arithmetic'),
45 | ('num_training_samples', 1),
46 | ('l2_penalty', 4.409390792135428e-05),
47 | ('l1_penalty', 0.0),
48 | ('activation_norm_penalty', 0.0),
49 | ('drop_state_probability', 0.01),
50 | # initialization
51 | ('embedding_init_factor', 1.0),
52 | ('scale_input_embeddings', False),
53 | ('cell_init_factor', 1.0),
54 | ('forget_bias', 1.0),
55 | ('output_init_factor', 1.0),
56 | # schedule
57 | ('steps_per_turn', 400),
58 | ('print_training_stats_every_num_steps', 1000),
59 | ('turns', 100),
60 | # optimization
61 | ('optimizer_type', 'rmsprop'),
62 | ('rmsprop_beta2', 0.999),
63 | ('rmsprop_epsilon', 1e-08),
64 | ('adam_beta1', 0.9),
65 | ('adam_beta2', 0.999),
66 | ('adam_epsilon', 1e-08),
67 | ('batch_size', 128),
68 | ('accum_batch_size', 64),
69 | ('max_grad_norm', 10.0),
70 | ('max_time_steps', 500),
71 | ('trigger_averaging_turns', 10),
72 | ('trigger_averaging_at_the_latest', 80),
73 | # learning rate
74 | ('learning_rate', 0.0022480107672343715),
75 | ('learning_rate_decay', 1.0),
76 | ('learning_rate_decay_burn_in_steps', 0),
77 | ('drop_learning_rate_turns', -1),
78 | ('drop_learning_rate_multiplier', 1.0),
79 | ('drop_learning_rate_at_the_latest', -1),
80 | # early stopping
81 | ('early_stopping_turns', -1),
82 | ('early_stopping_rampup_turns', 0),
83 | ('early_stopping_worst_xe_target', ''),
84 | ('early_stopping_slowest_rate', 0.0),
85 | # cross-validation
86 | ('crossvalidate', False),
87 | ('crossvalidation_folds', 10),
88 | ('crossvalidation_rounds', 1),
89 | # evaluation
90 | ('max_training_eval_batches', 20),
91 | ('max_eval_eval_batches', -1),
92 | ('max_test_eval_batches', -1),
93 | ('min_non_episodic_eval_examples_per_stripe', 100),
94 | ('eval_on_test', False),
95 | ('eval_method', 'deterministic'),
96 | ('num_eval_samples', 0),
97 | ('eval_softmax_temperature', -0.8),
98 | ('eval_softmax_temperature_estimation_num_tokens', 50000),
99 | ('eval_power_mean_power', 1.0),
100 | ('eval_dropout_multiplier', 1.0),
101 | ('validation_prediction_file', ''),
102 | ('dyneval', False),
103 | ('dyneval_learning_rate', 0.001),
104 | ('dyneval_decay_rate', 0.02),
105 | ('dyneval_epsilon', 1e-05),
106 | # experiments
107 | # checkpoints
108 | ('save_checkpoints', True),
109 | # misc
110 | ('seed', 1),
111 | ('swap_memory', True),
112 | ('log_device_placement', False),
113 | ('summary_flush_secs', 120),
114 | ]
115 |
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/train_enwik8.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | set -e
19 |
20 | source "$(dirname $0)/../../lib/setup.sh"
21 | source_lib "config/common.sh"
22 | source_lib "config/running.sh"
23 | source_lib "config/enwik8.sh"
24 |
25 | name="$2"
26 | config_file="$3"
27 |
28 | source_lib "run.sh" "$1"
29 |
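Argument roles, as the script body implies: "$1" is forwarded to lib/run.sh, "$2" becomes the run name, and "$3" the config file. A hedged invocation sketch (the run name is arbitrary, and the accepted values of the first argument are defined in lib/run.sh, so "run" here is an assumption):

    ./train_enwik8.sh run enwik8_96m_fm_d4 \
        config/e81db31261c0+_tune_enwik8_96m_lstm_fm_d4_arms_MtngW/trial_216/config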
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/train_mwc.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | set -e
19 |
20 | source "$(dirname $0)/../../lib/setup.sh"
21 | source_lib "config/common.sh"
22 | source_lib "config/running.sh"
23 | source_lib "config/mwc.sh"
24 |
25 | # Data
26 |
27 | lang="${2:-en}"
28 | training_file="${mwc_data_dir}/wiki_${lang}/ptb_format_large/wiki_${lang}.tr.raw.unk"
29 | validation_file="${mwc_data_dir}/wiki_${lang}/ptb_format_large/wiki_${lang}.va.raw.unk"
30 | test_file="${mwc_data_dir}/wiki_${lang}/ptb_format_large/wiki_${lang}.te.raw.unk"
31 |
32 | name="$2"  # Note: "$2" doubles as the language (set above) and the run name.
33 | config_file="$3"
34 |
35 | source_lib "run.sh" "$1"
36 |
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/train_ptb.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | set -e
19 |
20 | source "$(dirname $0)/../../lib/setup.sh"
21 | source_lib "config/common.sh"
22 | source_lib "config/running.sh"
23 | source_lib "config/ptb_word.sh"
24 |
25 | name="$2"
26 | config_file="$3"
27 |
28 | source_lib "run.sh" "$1"
29 |
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/train_ptb_char.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | set -e
19 |
20 | source "$(dirname $0)/../../lib/setup.sh"
21 | source_lib "config/common.sh"
22 | source_lib "config/running.sh"
23 | source_lib "config/ptb_char.sh"
24 |
25 | name="$2"
26 | config_file="$3"
27 |
28 | source_lib "run.sh" "$1"
29 |
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/train_wikitext-2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | set -e
19 |
20 | source "$(dirname $0)/../../lib/setup.sh"
21 | source_lib "config/common.sh"
22 | source_lib "config/running.sh"
23 | source_lib "config/wikitext-2_word.sh"
24 |
25 | name="$2"
26 | config_file="$3"
27 |
28 | source_lib "run.sh" "$1"
29 |
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/tune_copy.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is
19 | # for illustration only.
20 |
21 | set -e
22 |
23 | # Include definitions of dataset- and tuning-related variables.
24 | source "$(dirname $0)/../../lib/setup.sh" "$@"
25 | source_lib "config/common.sh"
26 | source_lib "config/tuning.sh"
27 | source_lib "config/copy.sh"
28 |
29 | # Model
30 |
31 | num_param_millions=10
32 | num_params=$(million ${num_param_millions})
33 | share_input_and_output_embeddings=false
34 | shared_mask_dropout=false
35 |
36 | # Cell
37 |
38 | model="lstm"
39 | num_layers=1
40 | lstm_skip_connection=true
41 | tie_forget_and_input_gates=false
42 | cap_input_gate=true
43 |
44 | # Objective
45 |
46 | drop_state_probability=0.0
47 |
48 | # Initialization
49 |
50 | forget_bias=1.0
51 |
52 | # Schedule
53 |
54 | steps_per_turn=200
55 | print_training_stats_every_num_steps=200
56 | turns=100
57 |
58 | # Optimizer
59 |
60 | optimizer_type="rmsprop"
61 | batch_size=64
62 | max_grad_norm=10.0
63 | max_time_steps=155
64 |
65 | # Early stopping
66 |
67 | # early_stopping_turns=30
68 | # early_stopping_worst_xe_target=4.4
69 |
70 | # Evaluation
71 |
72 | max_training_eval_batches=20
73 | eval_softmax_temperature=-0.8
74 |
75 | # Tuning parameters
76 |
77 | priority=200
78 | num_workers=60
79 |
80 | # Misc
81 |
82 | swap_memory=true
83 |
84 | # Start experiments with averaged optimization
85 |
86 | drop_learning_rate_turns=-1
87 | drop_learning_rate_multiplier=1.0
88 | drop_learning_rate_at_the_latest=-1
89 | trigger_averaging_turns=10
90 | trigger_averaging_at_the_latest=80
91 |
92 | # feature mask
93 | tuneables="input_embedding_ratio,learning_rate,l2_penalty,
94 | feature_mask_rounds,feature_mask_rank"
95 | name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_arms"
96 | source_lib "run.sh" "$@"
97 |
98 | # vanilla
99 | tuneables="input_embedding_ratio,learning_rate,l2_penalty"
100 | name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_arms"
101 | source_lib "run.sh" "$@"
102 |
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/tune_dyneval.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is
19 | # for illustration only.
20 |
21 | set -e
22 |
23 | # Include definitions of dataset- and tuning-related variables.
24 | source "$(dirname $0)/../../lib/setup.sh" "$1"
25 | source_lib "config/common.sh"
26 | source_lib "config/tuning.sh"
27 |
28 | name="$2"
29 | config_file="$3/config"
30 | load_checkpoint="$3/best"
31 |
32 | save_checkpoints=false
33 | turns=0
34 |
35 | # Evaluation
36 |
37 | dyneval=true
38 | batch_size=1024
39 | max_training_eval_batches=500
40 | max_grad_norm=0.0
41 | eval_softmax_temperature=-0.8
42 | eval_softmax_temperature_estimation_num_tokens=50000
43 | l2_penalty=0.0
44 |
45 | # Tuning parameters
46 |
47 | priority=200
48 | num_workers=60
49 |
50 | tuneables="batch_size,max_time_steps,
51 | dyneval_learning_rate,dyneval_decay_rate,dyneval_epsilon"
52 | name="$(default_name)_${name}"
53 | source_lib "run.sh" "$1"
54 |
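Unlike the other tune scripts, this one starts from a finished training run: "$3" must be that run's directory, holding the saved config (loaded as "$3/config") and a checkpoint under the prefix "$3/best"; training itself is disabled (turns=0), and only batch_size, max_time_steps, and the dyneval_* flags are tuned. A hedged sketch with a hypothetical run directory (and recall that tuning is not supported in the open-source version):

    ./tune_dyneval.sh run ptb_fm_d2_dyneval "$HOME/runs/ptb_fm_d2"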
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/tune_enwik8.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is
19 | # for illustration only.
20 |
21 | set -e
22 |
23 | # Include definitions of dataset- and tuning-related variables.
24 | source "$(dirname $0)/../../lib/setup.sh" "$@"
25 | source_lib "config/common.sh"
26 | source_lib "config/tuning.sh"
27 | source_lib "config/enwik8.sh"
28 |
29 | # Model
30 |
31 | num_param_millions=24
32 | num_params=$(million ${num_param_millions})
33 | share_input_and_output_embeddings=false
34 | shared_mask_dropout=false
35 |
36 | # Cell
37 |
38 | model="lstm"
39 | num_layers=4
40 | lstm_skip_connection=true
41 | tie_forget_and_input_gates=false
42 | cap_input_gate=true
43 |
44 | # Objective
45 |
46 | drop_state_probability=0.01
47 |
48 | # Initialization
49 |
50 | forget_bias=1.0
51 |
52 | # Schedule
53 |
54 | steps_per_turn=1000
55 | print_training_stats_every_num_steps=1000
56 | turns=100
57 |
58 | # Optimizer
59 |
60 | optimizer_type="rmsprop"
61 | batch_size=128
62 | max_grad_norm=10.0
63 | max_time_steps=200
64 |
65 | # Early stopping
66 |
67 | # early_stopping_turns=30
68 | # early_stopping_worst_xe_target=4.4
69 |
70 | # Evaluation
71 |
72 | max_training_eval_batches=20
73 | eval_softmax_temperature=-0.8
74 |
75 | # Tuning parameters
76 |
77 | priority=200
78 | num_workers=60
79 |
80 | # Misc
81 |
82 | swap_memory=true
83 |
84 | # Start experiments with averaged optimization
85 |
86 | drop_learning_rate_turns=-1
87 | drop_learning_rate_multiplier=1.0
88 | drop_learning_rate_at_the_latest=-1
89 | trigger_averaging_turns=10
90 | trigger_averaging_at_the_latest=80
91 |
92 | # feature mask
93 | tuneables="input_embedding_ratio,learning_rate,l2_penalty,
94 | input_dropout,inter_layer_dropout,state_dropout,
95 | output_dropout,
96 | feature_mask_rounds,feature_mask_rank"
97 | name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_arms"
98 | source_lib "run.sh" "$@"
99 |
100 | # vanilla
101 | tuneables="input_embedding_ratio,learning_rate,l2_penalty,
102 | input_dropout,inter_layer_dropout,state_dropout,
103 | output_dropout"
104 | name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_arms"
105 | source_lib "run.sh" "$@"
106 |
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/tune_mwc.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is
19 | # for illustration only.
20 |
21 | set -e
22 |
23 | # Include definitions of dataset- and tuning-related variables.
24 | source "$(dirname $0)/../../lib/setup.sh" "$@"
25 | source_lib "config/common.sh"
26 | source_lib "config/tuning.sh"
27 | source_lib "config/mwc.sh"
28 |
29 | # Data
30 |
31 | lang="${2:-en}"
32 | training_file="${mwc_data_dir}/wiki_${lang}/ptb_format_large/wiki_${lang}.tr.raw.unk"
33 | validation_file="${mwc_data_dir}/wiki_${lang}/ptb_format_large/wiki_${lang}.va.raw.unk"
34 | test_file="${mwc_data_dir}/wiki_${lang}/ptb_format_large/wiki_${lang}.te.raw.unk"
35 |
36 | # Model
37 |
38 | num_param_millions=24
39 | num_params=$(million ${num_param_millions})
40 | share_input_and_output_embeddings=false
41 | shared_mask_dropout=false
42 |
43 | # Cell
44 |
45 | model="lstm"
46 | num_layers=2
47 | lstm_skip_connection=true
48 | tie_forget_and_input_gates=false
49 | cap_input_gate=true
50 |
51 | # Objective
52 |
53 | drop_state_probability=0.01
54 |
55 | # Initialization
56 |
57 | forget_bias=1.0
58 |
59 | # Schedule
60 |
61 | steps_per_turn=200
62 | print_training_stats_every_num_steps=200
63 | turns=500
64 |
65 | # Optimizer
66 |
67 | optimizer_type="rmsprop"
68 | batch_size=64
69 | max_grad_norm=10.0
70 | max_time_steps=150
71 |
72 | # Early stopping
73 |
74 | # early_stopping_turns=30
75 | # early_stopping_worst_xe_target=4.4
76 |
77 | # Evaluation
78 |
79 | max_training_eval_batches=20
80 | eval_softmax_temperature=-0.8
81 |
82 | # Tuning parameters
83 |
84 | priority=200
85 | num_workers=60
86 |
87 | # Misc
88 |
89 | swap_memory=true
90 |
91 | # Start experiments with averaged optimization
92 |
93 | drop_learning_rate_turns=-1
94 | drop_learning_rate_multiplier=1.0
95 | drop_learning_rate_at_the_latest=-1
96 | trigger_averaging_turns=25
97 | trigger_averaging_at_the_latest=400
98 |
99 | # feature mask
100 | tuneables="input_embedding_ratio,learning_rate,l2_penalty,
101 | input_dropout,inter_layer_dropout,state_dropout,
102 | output_dropout,
103 | feature_mask_rounds,feature_mask_rank"
104 | name="$(default_name)_${lang}_${num_param_millions}m_${model}_fm_d${num_layers}_arms"
105 | source_lib "run.sh" "$@"
106 |
107 | # vanilla
108 | tuneables="input_embedding_ratio,learning_rate,l2_penalty,
109 | input_dropout,inter_layer_dropout,state_dropout,
110 | output_dropout"
111 | name="$(default_name)_${lang}_${num_param_millions}m_${model}_d${num_layers}_arms"
112 | source_lib "run.sh" "$@"
113 |
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/tune_ptb.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is
19 | # for illustration only.
20 |
21 | set -e
22 |
23 | # Include definitions of dataset- and tuning-related variables.
24 | source "$(dirname $0)/../../lib/setup.sh" "$@"
25 | source_lib "config/common.sh"
26 | source_lib "config/tuning.sh"
27 | source_lib "config/ptb_word.sh"
28 |
29 | # Model
30 |
31 | num_param_millions=24
32 | num_params=$(million ${num_param_millions})
33 | share_input_and_output_embeddings=true
34 | shared_mask_dropout=false
35 |
36 | # Cell
37 |
38 | model="lstm"
39 | num_layers=2
40 | lstm_skip_connection=true
41 | tie_forget_and_input_gates=false
42 | cap_input_gate=true
43 |
44 | # Objective
45 |
46 | drop_state_probability=0.01
47 |
48 | # Initialization
49 |
50 | forget_bias=1.0
51 |
52 | # Schedule
53 |
54 | steps_per_turn=200
55 | print_training_stats_every_num_steps=200
56 | turns=1000
57 |
58 | # Optimizer
59 |
60 | optimizer_type="rmsprop"
61 | batch_size=64
62 | max_grad_norm=10.0
63 | max_time_steps=70
64 |
65 | # Early stopping
66 |
67 | # early_stopping_turns=30
68 | # early_stopping_worst_xe_target=4.4
69 |
70 | # Evaluation
71 |
72 | max_training_eval_batches=20
73 | eval_softmax_temperature=-0.8
74 |
75 | # Tuning parameters
76 |
77 | priority=200
78 | num_workers=60
79 |
80 | # Start experiments with dropped learning rate
81 |
82 | # drop_learning_rate_turns=100
83 | # drop_learning_rate_multiplier=0.1
84 | # drop_learning_rate_at_the_latest=1600
85 | #
86 | # # feature mask
87 | # tuneables="input_embedding_ratio,learning_rate,l2_penalty,
88 | # input_dropout,inter_layer_dropout,state_dropout,
89 | # output_dropout,
90 | # feature_mask_rounds,feature_mask_rank"
91 | # name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_rms"
92 | # source_lib "run.sh" "$@"
93 | #
94 | # # vanilla
95 | # tuneables="input_embedding_ratio,learning_rate,l2_penalty,
96 | # input_dropout,inter_layer_dropout,state_dropout,
97 | # output_dropout"
98 | # name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_rms"
99 | # source_lib "run.sh" "$@"
100 |
101 | # Start experiments with averaged optimization
102 |
103 | drop_learning_rate_turns=-1
104 | drop_learning_rate_multiplier=1.0
105 | drop_learning_rate_at_the_latest=-1
106 | trigger_averaging_turns=50
107 | trigger_averaging_at_the_latest=800
108 |
109 | # feature mask
110 | tuneables="learning_rate,l2_penalty,
111 | input_dropout,inter_layer_dropout,state_dropout,
112 | output_dropout,
113 | feature_mask_rounds,feature_mask_rank"
114 | name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_arms"
115 | source_lib "run.sh" "$@"
116 |
117 | # vanilla
118 | tuneables="learning_rate,l2_penalty,
119 | input_dropout,inter_layer_dropout,state_dropout,
120 | output_dropout"
121 | name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_arms"
122 | source_lib "run.sh" "$@"
123 |
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/tune_ptb_char.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is
19 | # for illustration only.
20 |
21 | set -e
22 |
23 | # Include definitions of dataset- and tuning-related variables.
24 | source "$(dirname $0)/../../lib/setup.sh" "$@"
25 | source_lib "config/common.sh"
26 | source_lib "config/tuning.sh"
27 | source_lib "config/ptb_char.sh"
28 |
29 | # Model
30 |
31 | num_param_millions=24
32 | num_params=$(million ${num_param_millions})
33 | share_input_and_output_embeddings=false
34 | shared_mask_dropout=false
35 |
36 | # Cell
37 |
38 | model="lstm"
39 | num_layers=2
40 | lstm_skip_connection=true
41 | tie_forget_and_input_gates=false
42 | cap_input_gate=true
43 |
44 | # Objective
45 |
46 | drop_state_probability=0.01
47 |
48 | # Initialization
49 |
50 | forget_bias=1.0
51 |
52 | # Schedule
53 |
54 | steps_per_turn=200
55 | print_training_stats_every_num_steps=200
56 | turns=500
57 |
58 | # Optimizer
59 |
60 | optimizer_type="rmsprop"
61 | batch_size=64
62 | max_grad_norm=10.0
63 | max_time_steps=150
64 |
65 | # Early stopping
66 |
67 | # early_stopping_turns=30
68 | # early_stopping_worst_xe_target=4.4
69 |
70 | # Evaluation
71 |
72 | max_training_eval_batches=20
73 | eval_softmax_temperature=-0.8
74 |
75 | # Tuning parameters
76 |
77 | priority=200
78 | num_workers=60
79 |
80 | # Misc
81 |
82 | swap_memory=true
83 |
84 | # Start experiments with averaged optimization
85 |
86 | drop_learning_rate_turns=-1
87 | drop_learning_rate_multiplier=1.0
88 | drop_learning_rate_at_the_latest=-1
89 | trigger_averaging_turns=25
90 | trigger_averaging_at_the_latest=400
91 |
92 | # feature mask
93 | tuneables="input_embedding_ratio,learning_rate,l2_penalty,
94 | input_dropout,inter_layer_dropout,state_dropout,
95 | output_dropout,
96 | feature_mask_rounds,feature_mask_rank"
97 | name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_arms"
98 | source_lib "run.sh" "$@"
99 |
100 | # vanilla
101 | tuneables="input_embedding_ratio,learning_rate,l2_penalty,
102 | input_dropout,inter_layer_dropout,state_dropout,
103 | output_dropout"
104 | name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_arms"
105 | source_lib "run.sh" "$@"
106 |
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/tune_ptb_fast.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is
19 | # for illustration only.
20 |
21 | set -e
22 |
23 | # Include definitions of dataset- and tuning-related variables.
24 | source "$(dirname $0)/../../lib/setup.sh" "$@"
25 | source_lib "config/common.sh"
26 | source_lib "config/tuning.sh"
27 | source_lib "config/ptb_word.sh"
28 |
29 | # Model
30 |
31 | num_param_millions=24
32 | num_params=$(million ${num_param_millions})
33 | share_input_and_output_embeddings=true
34 | shared_mask_dropout=false
35 |
36 | # Cell
37 |
38 | model="lstm"
39 | num_layers=2
40 | lstm_skip_connection=true
41 | tie_forget_and_input_gates=false
42 | cap_input_gate=true
43 |
44 | # Objective
45 |
46 | drop_state_probability=0.01
47 |
48 | # Initialization
49 |
50 | forget_bias=1.0
51 |
52 | # Schedule
53 |
54 | steps_per_turn=100
55 | print_training_stats_every_num_steps=100
56 | turns=600
57 |
58 | # Optimizer
59 |
60 | optimizer_type="rmsprop"
61 | batch_size=64
62 | max_grad_norm=10.0
63 | max_time_steps=35
64 |
65 | # Early stopping
66 |
67 | # early_stopping_turns=30
68 | # early_stopping_worst_xe_target=4.4
69 |
70 | # Evaluation
71 |
72 | max_training_eval_batches=20
73 | eval_softmax_temperature=-0.8
74 |
75 | # Tuning parameters
76 |
77 | priority=200
78 | num_workers=60
79 |
80 | # Start experiments with dropped learning rate
81 |
82 | # drop_learning_rate_turns=100
83 | # drop_learning_rate_multiplier=0.1
84 | # drop_learning_rate_at_the_latest=1600
85 | #
86 | # # feature mask
87 | # tuneables="input_embedding_ratio,learning_rate,l2_penalty,
88 | # input_dropout,inter_layer_dropout,state_dropout,
89 | # output_dropout,
90 | # feature_mask_rounds,feature_mask_rank"
91 | # name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_rms"
92 | # source_lib "run.sh" "$@"
93 | #
94 | # # vanilla
95 | # tuneables="input_embedding_ratio,learning_rate,l2_penalty,
96 | # input_dropout,inter_layer_dropout,state_dropout,
97 | # output_dropout"
98 | # name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_rms"
99 | # source_lib "run.sh" "$@"
100 |
101 | # Start experiments with averaged optimization
102 |
103 | drop_learning_rate_turns=-1
104 | drop_learning_rate_multiplier=1.0
105 | drop_learning_rate_at_the_latest=-1
106 | trigger_averaging_turns=25
107 | trigger_averaging_at_the_latest=400
108 |
109 | # feature mask
110 | tuneables="learning_rate,l2_penalty,
111 | input_dropout,inter_layer_dropout,state_dropout,
112 | output_dropout,
113 | feature_mask_rounds,feature_mask_rank"
114 | name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_arms"
115 | source_lib "run.sh" "$@"
116 |
117 | # vanilla
118 | tuneables="learning_rate,l2_penalty,
119 | input_dropout,inter_layer_dropout,state_dropout,
120 | output_dropout"
121 | name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_arms"
122 | source_lib "run.sh" "$@"
123 |
--------------------------------------------------------------------------------
/lamb/experiment/mogrifier/tune_wikitext-2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is
19 | # for illustration only.
20 |
21 | set -e
22 |
23 | # Include definitions of dataset- and tuning-related variables.
24 | source "$(dirname $0)/../../lib/setup.sh" "$@"
25 | source_lib "config/common.sh"
26 | source_lib "config/tuning.sh"
27 | source_lib "config/wikitext-2_word.sh"
28 |
29 | # Model
30 |
31 | num_param_millions=35
32 | num_params=$(million ${num_param_millions})
33 | share_input_and_output_embeddings=true
34 | shared_mask_dropout=false
35 |
36 | # Cell
37 |
38 | model="lstm"
39 | num_layers=2
40 | lstm_skip_connection=true
41 | tie_forget_and_input_gates=false
42 | cap_input_gate=true
43 |
44 | # Objective
45 |
46 | drop_state_probability=0.01
47 |
48 | # Initialization
49 |
50 | forget_bias=1.0
51 |
52 | # Schedule
53 |
54 | steps_per_turn=200
55 | print_training_stats_every_num_steps=200
56 | turns=1000
57 |
58 | # Optimizer
59 |
60 | optimizer_type="rmsprop"
61 | batch_size=64
62 | max_grad_norm=10.0
63 | max_time_steps=70
64 |
65 | # Early stopping
66 |
67 | # early_stopping_turns=30
68 | # early_stopping_worst_xe_target=4.4
69 |
70 | # Evaluation
71 |
72 | max_training_eval_batches=20
73 | eval_softmax_temperature=-0.8
74 |
75 | # Tuning parameters
76 |
77 | priority=200
78 | num_workers=60
79 |
80 | # Misc
81 |
82 | swap_memory=true
83 |
84 | # Start experiments with dropped learning rate
85 |
86 | # drop_learning_rate_turns=100
87 | # drop_learning_rate_multiplier=0.1
88 | # drop_learning_rate_at_the_latest=1600
89 | #
90 | # # feature mask
91 | # tuneables="input_embedding_ratio,learning_rate,l2_penalty,
92 | # input_dropout,inter_layer_dropout,state_dropout,
93 | # output_dropout,
94 | # feature_mask_rounds,feature_mask_rank"
95 | # name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_rms"
96 | # source_lib "run.sh" "$@"
97 | #
98 | # # vanilla
99 | # tuneables="input_embedding_ratio,learning_rate,l2_penalty,
100 | # input_dropout,inter_layer_dropout,state_dropout,
101 | # output_dropout"
102 | # name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_rms"
103 | # source_lib "run.sh" "$@"
104 |
105 | # Start experiments with averaged optimization
106 |
107 | drop_learning_rate_turns=-1
108 | drop_learning_rate_multiplier=1.0
109 | drop_learning_rate_at_the_latest=-1
110 | trigger_averaging_turns=50
111 | trigger_averaging_at_the_latest=800
112 |
113 | # feature mask
114 | tuneables="input_embedding_ratio,learning_rate,l2_penalty,
115 | input_dropout,inter_layer_dropout,state_dropout,
116 | output_dropout,
117 | feature_mask_rounds,feature_mask_rank"
118 | name="$(default_name)_${num_param_millions}m_${model}_fm_d${num_layers}_arms"
119 | source_lib "run.sh" "$@"
120 |
121 | # vanilla
122 | tuneables="input_embedding_ratio,learning_rate,l2_penalty,
123 | input_dropout,inter_layer_dropout,state_dropout,
124 | output_dropout"
125 | name="$(default_name)_${num_param_millions}m_${model}_d${num_layers}_arms"
126 | source_lib "run.sh" "$@"
127 |
--------------------------------------------------------------------------------
/lamb/experiment/on-the-state/README.md:
--------------------------------------------------------------------------------
1 | This directory contains saved configuration files for tuned models from the [On
2 | the state of the art of evaluation in neural language
3 | models](https://arxiv.org/abs/1707.05589) paper. Model weights are not included.
4 |
5 | Don't forget to [set up the data](../../README.md).
6 |
7 | To train the 1-layer LSTM model with 10M weights on PTB with tuned
8 | hyperparameters (see the paper above):
9 |
10 | ./train_ptb.sh run ptb_10m_lstm_d1/hps_proto
11 |
12 | There are separate training scripts for enwik8 and wikitext-2. The training will
13 | save the model in `/tmp/lamb/ptb_10m_lstm_d1/`. To test the saved model:
14 |
15 | ../test.sh run some-descriptive-name /tmp/lamb/ptb_10m_lstm_d1/
16 |
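17 | The dataset locations default to directories under `~/data` and can be
18 | overridden via environment variables (see the `${ptb_data_dir:-...}` style
19 | defaults in `../../lib/config/`). For example, assuming the word-level PTB
20 | config ultimately sources `config/ptb.sh` as the other dataset configs do, a
21 | copy of PTB in a non-default location (the path below is only a placeholder)
22 | can be used like this:
23 |
24 |     ptb_data_dir=/path/to/ptb/ ./train_ptb.sh run ptb_10m_lstm_d1/hps_proto
25 |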
--------------------------------------------------------------------------------
/lamb/experiment/on-the-state/enwik8_27m_lstm_d4/hps_proto:
--------------------------------------------------------------------------------
1 | hparam {
2 | key: "activation_fn"
3 | value {
4 | bytes_value: "tf.tanh"
5 | }
6 | }
7 | hparam {
8 | key: "adam_beta1"
9 | value {
10 | float_value: 0.899999976158
11 | }
12 | }
13 | hparam {
14 | key: "adam_beta2"
15 | value {
16 | float_value: 0.999000012875
17 | }
18 | }
19 | hparam {
20 | key: "adam_epsilon"
21 | value {
22 | float_value: 9.99999993923e-09
23 | }
24 | }
25 | hparam {
26 | key: "batch_size"
27 | value {
28 | int64_value: 128
29 | }
30 | }
31 | hparam {
32 | key: "cell_clip"
33 | value {
34 | float_value: -1.0
35 | }
36 | }
37 | hparam {
38 | key: "cell_init_factor"
39 | value {
40 | float_value: 1.0
41 | }
42 | }
43 | hparam {
44 | key: "drop_learning_rate_at_the_latest"
45 | value {
46 | int64_value: 450
47 | }
48 | }
49 | hparam {
50 | key: "drop_learning_rate_multiplier"
51 | value {
52 | float_value: 0.10000000149
53 | }
54 | }
55 | hparam {
56 | key: "drop_learning_rate_rounds"
57 | value {
58 | int64_value: 13
59 | }
60 | }
61 | hparam {
62 | key: "drop_state_probability"
63 | value {
64 | float_value: 0.00999999977648
65 | }
66 | }
67 | hparam {
68 | key: "embed_once"
69 | value {
70 | bool_value: true
71 | }
72 | }
73 | hparam {
74 | key: "embedding_init_factor"
75 | value {
76 | float_value: 1.0
77 | }
78 | }
79 | hparam {
80 | key: "feature_mask"
81 | value {
82 | bool_value: false
83 | }
84 | }
85 | hparam {
86 | key: "forget_bias"
87 | value {
88 | float_value: 1.0
89 | }
90 | }
91 | hparam {
92 | key: "hidden_size"
93 | value {
94 | int64_value: 911
95 | }
96 | }
97 | hparam {
98 | key: "input_dropout"
99 | value {
100 | float_value: 0.196795836091
101 | }
102 | }
103 | hparam {
104 | key: "input_embedding_ratio"
105 | value {
106 | float_value: 1.0
107 | }
108 | }
109 | hparam {
110 | key: "input_embedding_size"
111 | value {
112 | int64_value: 911
113 | }
114 | }
115 | hparam {
116 | key: "intra_layer_dropout"
117 | value {
118 | float_value: 0.0307693872601
119 | }
120 | }
121 | hparam {
122 | key: "layer_norm"
123 | value {
124 | bool_value: false
125 | }
126 | }
127 | hparam {
128 | key: "learning_rate"
129 | value {
130 | float_value: 0.00203050486743
131 | }
132 | }
133 | hparam {
134 | key: "learning_rate_decay"
135 | value {
136 | float_value: 1.0
137 | }
138 | }
139 | hparam {
140 | key: "learning_rate_decay_burn_in_steps"
141 | value {
142 | int64_value: 0
143 | }
144 | }
145 | hparam {
146 | key: "lstm_skip_connection"
147 | value {
148 | bool_value: true
149 | }
150 | }
151 | hparam {
152 | key: "max_grad_norm"
153 | value {
154 | float_value: 10.0
155 | }
156 | }
157 | hparam {
158 | key: "model"
159 | value {
160 | bytes_value: "lstm"
161 | }
162 | }
163 | hparam {
164 | key: "num_eval_samples"
165 | value {
166 | int64_value: 0
167 | }
168 | }
169 | hparam {
170 | key: "num_layers"
171 | value {
172 | int64_value: 4
173 | }
174 | }
175 | hparam {
176 | key: "num_params"
177 | value {
178 | int64_value: 27000000
179 | }
180 | }
181 | hparam {
182 | key: "optimizer_type"
183 | value {
184 | bytes_value: "rmsprop"
185 | }
186 | }
187 | hparam {
188 | key: "outer_steps"
189 | value {
190 | int64_value: 500
191 | }
192 | }
193 | hparam {
194 | key: "output_dropout"
195 | value {
196 | float_value: 0.0695193335414
197 | }
198 | }
199 | hparam {
200 | key: "output_embedding_ratio"
201 | value {
202 | float_value: 1.0
203 | }
204 | }
205 | hparam {
206 | key: "output_embedding_size"
207 | value {
208 | int64_value: 911
209 | }
210 | }
211 | hparam {
212 | key: "output_init_factor"
213 | value {
214 | float_value: 1.0
215 | }
216 | }
217 | hparam {
218 | key: "overlay_rank"
219 | value {
220 | int64_value: -1
221 | }
222 | }
223 | hparam {
224 | key: "rmsprop_beta2"
225 | value {
226 | float_value: 0.990000009537
227 | }
228 | }
229 | hparam {
230 | key: "rmsprop_epsilon"
231 | value {
232 | float_value: 9.99999974738e-06
233 | }
234 | }
235 | hparam {
236 | key: "share_input_and_output_embeddings"
237 | value {
238 | bool_value: false
239 | }
240 | }
241 | hparam {
242 | key: "sparsity_ratio"
243 | value {
244 | float_value: -1.0
245 | }
246 | }
247 | hparam {
248 | key: "state_dropout"
249 | value {
250 | float_value: 0.0808205232024
251 | }
252 | }
253 | hparam {
254 | key: "tie_forget_and_input_gates"
255 | value {
256 | bool_value: false
257 | }
258 | }
259 | hparam {
260 | key: "token_dropout"
261 | value {
262 | float_value: 0.0
263 | }
264 | }
265 | hparam {
266 | key: "trainable_initial_state"
267 | value {
268 | bool_value: false
269 | }
270 | }
271 | hparam {
272 | key: "update_dropout"
273 | value {
274 | float_value: 0.0
275 | }
276 | }
277 | hparam {
278 | key: "vocab_size"
279 | value {
280 | int64_value: 206
281 | }
282 | }
283 | hparam {
284 | key: "weight_decay"
285 | value {
286 | float_value: 7.50829849494e-06
287 | }
288 | }
289 | hparam {
290 | key: "weight_penalty"
291 | value {
292 | float_value: 0.0
293 | }
294 | }
295 |
--------------------------------------------------------------------------------
/lamb/experiment/on-the-state/enwik8_46m_lstm_d4/hps_proto:
--------------------------------------------------------------------------------
1 | hparam {
2 | key: "activation_fn"
3 | value {
4 | bytes_value: "tf.tanh"
5 | }
6 | }
7 | hparam {
8 | key: "adam_beta1"
9 | value {
10 | float_value: 0.899999976158
11 | }
12 | }
13 | hparam {
14 | key: "adam_beta2"
15 | value {
16 | float_value: 0.999000012875
17 | }
18 | }
19 | hparam {
20 | key: "adam_epsilon"
21 | value {
22 | float_value: 9.99999993923e-09
23 | }
24 | }
25 | hparam {
26 | key: "batch_size"
27 | value {
28 | int64_value: 128
29 | }
30 | }
31 | hparam {
32 | key: "cell_clip"
33 | value {
34 | float_value: -1.0
35 | }
36 | }
37 | hparam {
38 | key: "cell_init_factor"
39 | value {
40 | float_value: 1.0
41 | }
42 | }
43 | hparam {
44 | key: "drop_learning_rate_at_the_latest"
45 | value {
46 | int64_value: 450
47 | }
48 | }
49 | hparam {
50 | key: "drop_learning_rate_multiplier"
51 | value {
52 | float_value: 0.10000000149
53 | }
54 | }
55 | hparam {
56 | key: "drop_learning_rate_rounds"
57 | value {
58 | int64_value: 13
59 | }
60 | }
61 | hparam {
62 | key: "drop_state_probability"
63 | value {
64 | float_value: 0.00999999977648
65 | }
66 | }
67 | hparam {
68 | key: "embed_once"
69 | value {
70 | bool_value: true
71 | }
72 | }
73 | hparam {
74 | key: "embedding_init_factor"
75 | value {
76 | float_value: 1.0
77 | }
78 | }
79 | hparam {
80 | key: "feature_mask"
81 | value {
82 | bool_value: false
83 | }
84 | }
85 | hparam {
86 | key: "forget_bias"
87 | value {
88 | float_value: 1.0
89 | }
90 | }
91 | hparam {
92 | key: "hidden_size"
93 | value {
94 | int64_value: 1192
95 | }
96 | }
97 | hparam {
98 | key: "input_dropout"
99 | value {
100 | float_value: 0.0335461571813
101 | }
102 | }
103 | hparam {
104 | key: "input_embedding_ratio"
105 | value {
106 | float_value: 1.0
107 | }
108 | }
109 | hparam {
110 | key: "input_embedding_size"
111 | value {
112 | int64_value: 1192
113 | }
114 | }
115 | hparam {
116 | key: "intra_layer_dropout"
117 | value {
118 | float_value: 0.0122289275751
119 | }
120 | }
121 | hparam {
122 | key: "layer_norm"
123 | value {
124 | bool_value: false
125 | }
126 | }
127 | hparam {
128 | key: "learning_rate"
129 | value {
130 | float_value: 0.00218322896399
131 | }
132 | }
133 | hparam {
134 | key: "learning_rate_decay"
135 | value {
136 | float_value: 1.0
137 | }
138 | }
139 | hparam {
140 | key: "learning_rate_decay_burn_in_steps"
141 | value {
142 | int64_value: 0
143 | }
144 | }
145 | hparam {
146 | key: "lstm_skip_connection"
147 | value {
148 | bool_value: true
149 | }
150 | }
151 | hparam {
152 | key: "max_grad_norm"
153 | value {
154 | float_value: 10.0
155 | }
156 | }
157 | hparam {
158 | key: "model"
159 | value {
160 | bytes_value: "lstm"
161 | }
162 | }
163 | hparam {
164 | key: "num_eval_samples"
165 | value {
166 | int64_value: 0
167 | }
168 | }
169 | hparam {
170 | key: "num_layers"
171 | value {
172 | int64_value: 4
173 | }
174 | }
175 | hparam {
176 | key: "num_params"
177 | value {
178 | int64_value: 46000000
179 | }
180 | }
181 | hparam {
182 | key: "optimizer_type"
183 | value {
184 | bytes_value: "rmsprop"
185 | }
186 | }
187 | hparam {
188 | key: "outer_steps"
189 | value {
190 | int64_value: 500
191 | }
192 | }
193 | hparam {
194 | key: "output_dropout"
195 | value {
196 | float_value: 0.279572278261
197 | }
198 | }
199 | hparam {
200 | key: "output_embedding_ratio"
201 | value {
202 | float_value: 1.0
203 | }
204 | }
205 | hparam {
206 | key: "output_embedding_size"
207 | value {
208 | int64_value: 1192
209 | }
210 | }
211 | hparam {
212 | key: "output_init_factor"
213 | value {
214 | float_value: 1.0
215 | }
216 | }
217 | hparam {
218 | key: "overlay_rank"
219 | value {
220 | int64_value: -1
221 | }
222 | }
223 | hparam {
224 | key: "rmsprop_beta2"
225 | value {
226 | float_value: 0.990000009537
227 | }
228 | }
229 | hparam {
230 | key: "rmsprop_epsilon"
231 | value {
232 | float_value: 9.99999974738e-06
233 | }
234 | }
235 | hparam {
236 | key: "share_input_and_output_embeddings"
237 | value {
238 | bool_value: false
239 | }
240 | }
241 | hparam {
242 | key: "sparsity_ratio"
243 | value {
244 | float_value: -1.0
245 | }
246 | }
247 | hparam {
248 | key: "state_dropout"
249 | value {
250 | float_value: 0.0622955262661
251 | }
252 | }
253 | hparam {
254 | key: "tie_forget_and_input_gates"
255 | value {
256 | bool_value: false
257 | }
258 | }
259 | hparam {
260 | key: "token_dropout"
261 | value {
262 | float_value: 0.0
263 | }
264 | }
265 | hparam {
266 | key: "trainable_initial_state"
267 | value {
268 | bool_value: false
269 | }
270 | }
271 | hparam {
272 | key: "update_dropout"
273 | value {
274 | float_value: 0.0
275 | }
276 | }
277 | hparam {
278 | key: "vocab_size"
279 | value {
280 | int64_value: 206
281 | }
282 | }
283 | hparam {
284 | key: "weight_decay"
285 | value {
286 | float_value: 0.0
287 | }
288 | }
289 | hparam {
290 | key: "weight_penalty"
291 | value {
292 | float_value: 0.0
293 | }
294 | }
295 |
--------------------------------------------------------------------------------
/lamb/experiment/on-the-state/ptb_10m_lstm_d1/hps_proto:
--------------------------------------------------------------------------------
1 | hparam {
2 | key: "activation_fn"
3 | value {
4 | bytes_value: "tf.tanh"
5 | }
6 | }
7 | hparam {
8 | key: "adam_beta1"
9 | value {
10 | float_value: 0.899999976158
11 | }
12 | }
13 | hparam {
14 | key: "adam_beta2"
15 | value {
16 | float_value: 0.999000012875
17 | }
18 | }
19 | hparam {
20 | key: "adam_epsilon"
21 | value {
22 | float_value: 9.99999993923e-09
23 | }
24 | }
25 | hparam {
26 | key: "batch_size"
27 | value {
28 | int64_value: 64
29 | }
30 | }
31 | hparam {
32 | key: "cell_clip"
33 | value {
34 | float_value: -1.0
35 | }
36 | }
37 | hparam {
38 | key: "cell_init_factor"
39 | value {
40 | float_value: 1.0
41 | }
42 | }
43 | hparam {
44 | key: "drop_learning_rate_at_the_latest"
45 | value {
46 | int64_value: 900
47 | }
48 | }
49 | hparam {
50 | key: "drop_learning_rate_multiplier"
51 | value {
52 | float_value: 0.10000000149
53 | }
54 | }
55 | hparam {
56 | key: "drop_learning_rate_rounds"
57 | value {
58 | int64_value: 26
59 | }
60 | }
61 | hparam {
62 | key: "drop_state_probability"
63 | value {
64 | float_value: 0.00999999977648
65 | }
66 | }
67 | hparam {
68 | key: "embed_once"
69 | value {
70 | bool_value: true
71 | }
72 | }
73 | hparam {
74 | key: "embedding_init_factor"
75 | value {
76 | float_value: 1.0
77 | }
78 | }
79 | hparam {
80 | key: "feature_mask"
81 | value {
82 | bool_value: false
83 | }
84 | }
85 | hparam {
86 | key: "forget_bias"
87 | value {
88 | float_value: 1.0
89 | }
90 | }
91 | hparam {
92 | key: "hidden_size"
93 | value {
94 | int64_value: 1194
95 | }
96 | }
97 | hparam {
98 | key: "input_dropout"
99 | value {
100 | float_value: 0.579891622066
101 | }
102 | }
103 | hparam {
104 | key: "input_embedding_ratio"
105 | value {
106 | float_value: 0.224374398589
107 | }
108 | }
109 | hparam {
110 | key: "input_embedding_size"
111 | value {
112 | int64_value: 268
113 | }
114 | }
115 | hparam {
116 | key: "intra_layer_dropout"
117 | value {
118 | float_value: 0.873659133911
119 | }
120 | }
121 | hparam {
122 | key: "layer_norm"
123 | value {
124 | bool_value: false
125 | }
126 | }
127 | hparam {
128 | key: "learning_rate"
129 | value {
130 | float_value: 0.00417865626514
131 | }
132 | }
133 | hparam {
134 | key: "learning_rate_decay"
135 | value {
136 | float_value: 1.0
137 | }
138 | }
139 | hparam {
140 | key: "learning_rate_decay_burn_in_steps"
141 | value {
142 | int64_value: 0
143 | }
144 | }
145 | hparam {
146 | key: "lstm_skip_connection"
147 | value {
148 | bool_value: true
149 | }
150 | }
151 | hparam {
152 | key: "max_grad_norm"
153 | value {
154 | float_value: 10.0
155 | }
156 | }
157 | hparam {
158 | key: "model"
159 | value {
160 | bytes_value: "lstm"
161 | }
162 | }
163 | hparam {
164 | key: "num_eval_samples"
165 | value {
166 | int64_value: 0
167 | }
168 | }
169 | hparam {
170 | key: "num_layers"
171 | value {
172 | int64_value: 1
173 | }
174 | }
175 | hparam {
176 | key: "num_params"
177 | value {
178 | int64_value: 10000000
179 | }
180 | }
181 | hparam {
182 | key: "optimizer_type"
183 | value {
184 | bytes_value: "rmsprop"
185 | }
186 | }
187 | hparam {
188 | key: "outer_steps"
189 | value {
190 | int64_value: 1000
191 | }
192 | }
193 | hparam {
194 | key: "output_dropout"
195 | value {
196 | float_value: 0.327008873224
197 | }
198 | }
199 | hparam {
200 | key: "output_embedding_ratio"
201 | value {
202 | float_value: 0.224374398589
203 | }
204 | }
205 | hparam {
206 | key: "output_embedding_size"
207 | value {
208 | int64_value: 268
209 | }
210 | }
211 | hparam {
212 | key: "output_init_factor"
213 | value {
214 | float_value: 1.0
215 | }
216 | }
217 | hparam {
218 | key: "overlay_rank"
219 | value {
220 | int64_value: -1
221 | }
222 | }
223 | hparam {
224 | key: "rmsprop_beta2"
225 | value {
226 | float_value: 0.999000012875
227 | }
228 | }
229 | hparam {
230 | key: "rmsprop_epsilon"
231 | value {
232 | float_value: 9.99999993923e-09
233 | }
234 | }
235 | hparam {
236 | key: "share_input_and_output_embeddings"
237 | value {
238 | bool_value: true
239 | }
240 | }
241 | hparam {
242 | key: "sparsity_ratio"
243 | value {
244 | float_value: -1.0
245 | }
246 | }
247 | hparam {
248 | key: "state_dropout"
249 | value {
250 | float_value: 0.215256482363
251 | }
252 | }
253 | hparam {
254 | key: "tie_forget_and_input_gates"
255 | value {
256 | bool_value: false
257 | }
258 | }
259 | hparam {
260 | key: "token_dropout"
261 | value {
262 | float_value: 0.0
263 | }
264 | }
265 | hparam {
266 | key: "trainable_initial_state"
267 | value {
268 | bool_value: false
269 | }
270 | }
271 | hparam {
272 | key: "update_dropout"
273 | value {
274 | float_value: 0.0
275 | }
276 | }
277 | hparam {
278 | key: "vocab_size"
279 | value {
280 | int64_value: 10001
281 | }
282 | }
283 | hparam {
284 | key: "weight_decay"
285 | value {
286 | float_value: 0.000124350262922
287 | }
288 | }
289 | hparam {
290 | key: "weight_penalty"
291 | value {
292 | float_value: 0.0
293 | }
294 | }
295 |
--------------------------------------------------------------------------------
/lamb/experiment/on-the-state/ptb_24m_lstm_d4/hps_proto:
--------------------------------------------------------------------------------
1 | hparam {
2 | key: "activation_fn"
3 | value {
4 | bytes_value: "tf.tanh"
5 | }
6 | }
7 | hparam {
8 | key: "adam_beta1"
9 | value {
10 | float_value: 0.899999976158
11 | }
12 | }
13 | hparam {
14 | key: "adam_beta2"
15 | value {
16 | float_value: 0.999000012875
17 | }
18 | }
19 | hparam {
20 | key: "adam_epsilon"
21 | value {
22 | float_value: 9.99999993923e-09
23 | }
24 | }
25 | hparam {
26 | key: "batch_size"
27 | value {
28 | int64_value: 64
29 | }
30 | }
31 | hparam {
32 | key: "cell_clip"
33 | value {
34 | float_value: -1.0
35 | }
36 | }
37 | hparam {
38 | key: "cell_init_factor"
39 | value {
40 | float_value: 1.0
41 | }
42 | }
43 | hparam {
44 | key: "drop_learning_rate_at_the_latest"
45 | value {
46 | int64_value: 900
47 | }
48 | }
49 | hparam {
50 | key: "drop_learning_rate_multiplier"
51 | value {
52 | float_value: 0.10000000149
53 | }
54 | }
55 | hparam {
56 | key: "drop_learning_rate_rounds"
57 | value {
58 | int64_value: 26
59 | }
60 | }
61 | hparam {
62 | key: "drop_state_probability"
63 | value {
64 | float_value: 0.00999999977648
65 | }
66 | }
67 | hparam {
68 | key: "embed_once"
69 | value {
70 | bool_value: true
71 | }
72 | }
73 | hparam {
74 | key: "embedding_init_factor"
75 | value {
76 | float_value: 1.0
77 | }
78 | }
79 | hparam {
80 | key: "feature_mask"
81 | value {
82 | bool_value: false
83 | }
84 | }
85 | hparam {
86 | key: "forget_bias"
87 | value {
88 | float_value: 1.0
89 | }
90 | }
91 | hparam {
92 | key: "hidden_size"
93 | value {
94 | int64_value: 723
95 | }
96 | }
97 | hparam {
98 | key: "input_dropout"
99 | value {
100 | float_value: 0.633642196655
101 | }
102 | }
103 | hparam {
104 | key: "input_embedding_ratio"
105 | value {
106 | float_value: 1.0
107 | }
108 | }
109 | hparam {
110 | key: "input_embedding_size"
111 | value {
112 | int64_value: 723
113 | }
114 | }
115 | hparam {
116 | key: "intra_layer_dropout"
117 | value {
118 | float_value: 0.309127420187
119 | }
120 | }
121 | hparam {
122 | key: "layer_norm"
123 | value {
124 | bool_value: false
125 | }
126 | }
127 | hparam {
128 | key: "learning_rate"
129 | value {
130 | float_value: 0.00396024715155
131 | }
132 | }
133 | hparam {
134 | key: "learning_rate_decay"
135 | value {
136 | float_value: 1.0
137 | }
138 | }
139 | hparam {
140 | key: "learning_rate_decay_burn_in_steps"
141 | value {
142 | int64_value: 0
143 | }
144 | }
145 | hparam {
146 | key: "lstm_skip_connection"
147 | value {
148 | bool_value: true
149 | }
150 | }
151 | hparam {
152 | key: "max_grad_norm"
153 | value {
154 | float_value: 10.0
155 | }
156 | }
157 | hparam {
158 | key: "model"
159 | value {
160 | bytes_value: "lstm"
161 | }
162 | }
163 | hparam {
164 | key: "num_eval_samples"
165 | value {
166 | int64_value: 0
167 | }
168 | }
169 | hparam {
170 | key: "num_layers"
171 | value {
172 | int64_value: 4
173 | }
174 | }
175 | hparam {
176 | key: "num_params"
177 | value {
178 | int64_value: 24000000
179 | }
180 | }
181 | hparam {
182 | key: "optimizer_type"
183 | value {
184 | bytes_value: "rmsprop"
185 | }
186 | }
187 | hparam {
188 | key: "outer_steps"
189 | value {
190 | int64_value: 1000
191 | }
192 | }
193 | hparam {
194 | key: "output_dropout"
195 | value {
196 | float_value: 0.700856506824
197 | }
198 | }
199 | hparam {
200 | key: "output_embedding_ratio"
201 | value {
202 | float_value: 1.0
203 | }
204 | }
205 | hparam {
206 | key: "output_embedding_size"
207 | value {
208 | int64_value: 723
209 | }
210 | }
211 | hparam {
212 | key: "output_init_factor"
213 | value {
214 | float_value: 1.0
215 | }
216 | }
217 | hparam {
218 | key: "overlay_rank"
219 | value {
220 | int64_value: -1
221 | }
222 | }
223 | hparam {
224 | key: "rmsprop_beta2"
225 | value {
226 | float_value: 0.999000012875
227 | }
228 | }
229 | hparam {
230 | key: "rmsprop_epsilon"
231 | value {
232 | float_value: 9.99999993923e-09
233 | }
234 | }
235 | hparam {
236 | key: "share_input_and_output_embeddings"
237 | value {
238 | bool_value: true
239 | }
240 | }
241 | hparam {
242 | key: "sparsity_ratio"
243 | value {
244 | float_value: -1.0
245 | }
246 | }
247 | hparam {
248 | key: "state_dropout"
249 | value {
250 | float_value: 0.64275187254
251 | }
252 | }
253 | hparam {
254 | key: "tie_forget_and_input_gates"
255 | value {
256 | bool_value: false
257 | }
258 | }
259 | hparam {
260 | key: "token_dropout"
261 | value {
262 | float_value: 0.0
263 | }
264 | }
265 | hparam {
266 | key: "trainable_initial_state"
267 | value {
268 | bool_value: false
269 | }
270 | }
271 | hparam {
272 | key: "update_dropout"
273 | value {
274 | float_value: 0.0
275 | }
276 | }
277 | hparam {
278 | key: "vocab_size"
279 | value {
280 | int64_value: 10001
281 | }
282 | }
283 | hparam {
284 | key: "weight_decay"
285 | value {
286 | float_value: 7.44869103073e-05
287 | }
288 | }
289 | hparam {
290 | key: "weight_penalty"
291 | value {
292 | float_value: 0.0
293 | }
294 | }
295 |
--------------------------------------------------------------------------------
/lamb/experiment/on-the-state/train_enwik8.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | set -e
19 |
20 | source "$(dirname $0)/../../lib/setup.sh"
21 | source_lib "config/common.sh"
22 | source_lib "config/running.sh"
23 | source_lib "config/enwik8_char.sh"
24 |
25 | hps_proto_file="$2"
26 | name="$(basename "$(dirname "$2")")"
27 |
28 | source_lib "run.sh" "$1"
29 |
--------------------------------------------------------------------------------
/lamb/experiment/on-the-state/train_ptb.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | set -e
19 |
20 | source "$(dirname $0)/../../lib/setup.sh"
21 | source_lib "config/common.sh"
22 | source_lib "config/running.sh"
23 | source_lib "config/ptb_word_rmsprop.sh"
24 |
25 | hps_proto_file="$2"
26 | name="$(basename "$(dirname "$2")")"
27 |
28 | source_lib "run.sh" "$1"
29 |
--------------------------------------------------------------------------------
/lamb/experiment/on-the-state/train_wikitext-2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | set -e
19 |
20 | source "$(dirname $0)/../../lib/setup.sh"
21 | source_lib "config/common.sh"
22 | source_lib "config/running.sh"
23 | source_lib "config/wikitext-2_word.sh"
24 |
25 | hps_proto_file="$2"
26 | name="$(basename "$(dirname "$2")")"
27 |
28 | source_lib "run.sh" "$1"
29 |
--------------------------------------------------------------------------------
/lamb/experiment/on-the-state/wikitext-2_24m_lstm_d2/hps_proto:
--------------------------------------------------------------------------------
1 | hparam {
2 | key: "activation_fn"
3 | value {
4 | bytes_value: "tf.tanh"
5 | }
6 | }
7 | hparam {
8 | key: "adam_beta1"
9 | value {
10 | float_value: 0.899999976158
11 | }
12 | }
13 | hparam {
14 | key: "adam_beta2"
15 | value {
16 | float_value: 0.999000012875
17 | }
18 | }
19 | hparam {
20 | key: "adam_epsilon"
21 | value {
22 | float_value: 9.99999993923e-09
23 | }
24 | }
25 | hparam {
26 | key: "batch_size"
27 | value {
28 | int64_value: 64
29 | }
30 | }
31 | hparam {
32 | key: "cap_input_gate"
33 | value {
34 | bool_value: true
35 | }
36 | }
37 | hparam {
38 | key: "cell_clip"
39 | value {
40 | float_value: -1.0
41 | }
42 | }
43 | hparam {
44 | key: "cell_init_factor"
45 | value {
46 | float_value: 1.0
47 | }
48 | }
49 | hparam {
50 | key: "drop_learning_rate_at_the_latest"
51 | value {
52 | int64_value: 900
53 | }
54 | }
55 | hparam {
56 | key: "drop_learning_rate_multiplier"
57 | value {
58 | float_value: 0.10000000149
59 | }
60 | }
61 | hparam {
62 | key: "drop_learning_rate_rounds"
63 | value {
64 | int64_value: 26
65 | }
66 | }
67 | hparam {
68 | key: "drop_state_probability"
69 | value {
70 | float_value: 0.00999999977648
71 | }
72 | }
73 | hparam {
74 | key: "embed_once"
75 | value {
76 | bool_value: true
77 | }
78 | }
79 | hparam {
80 | key: "embedding_init_factor"
81 | value {
82 | float_value: 1.0
83 | }
84 | }
85 | hparam {
86 | key: "feature_mask"
87 | value {
88 | bool_value: false
89 | }
90 | }
91 | hparam {
92 | key: "forget_bias"
93 | value {
94 | float_value: 1.0
95 | }
96 | }
97 | hparam {
98 | key: "hidden_size"
99 | value {
100 | int64_value: 1227
101 | }
102 | }
103 | hparam {
104 | key: "input_dropout"
105 | value {
106 | float_value: 0.484243571758
107 | }
108 | }
109 | hparam {
110 | key: "input_embedding_ratio"
111 | value {
112 | float_value: 0.121501773596
113 | }
114 | }
115 | hparam {
116 | key: "input_embedding_size"
117 | value {
118 | int64_value: 149
119 | }
120 | }
121 | hparam {
122 | key: "intra_layer_dropout"
123 | value {
124 | float_value: 0.0920244976878
125 | }
126 | }
127 | hparam {
128 | key: "layer_norm"
129 | value {
130 | bool_value: false
131 | }
132 | }
133 | hparam {
134 | key: "learning_rate"
135 | value {
136 | float_value: 0.00246041407809
137 | }
138 | }
139 | hparam {
140 | key: "learning_rate_decay"
141 | value {
142 | float_value: 1.0
143 | }
144 | }
145 | hparam {
146 | key: "learning_rate_decay_burn_in_steps"
147 | value {
148 | int64_value: 0
149 | }
150 | }
151 | hparam {
152 | key: "lstm_skip_connection"
153 | value {
154 | bool_value: true
155 | }
156 | }
157 | hparam {
158 | key: "max_grad_norm"
159 | value {
160 | float_value: 10.0
161 | }
162 | }
163 | hparam {
164 | key: "model"
165 | value {
166 | bytes_value: "lstm"
167 | }
168 | }
169 | hparam {
170 | key: "num_eval_samples"
171 | value {
172 | int64_value: 0
173 | }
174 | }
175 | hparam {
176 | key: "num_layers"
177 | value {
178 | int64_value: 2
179 | }
180 | }
181 | hparam {
182 | key: "num_params"
183 | value {
184 | int64_value: 24000000
185 | }
186 | }
187 | hparam {
188 | key: "optimizer_type"
189 | value {
190 | bytes_value: "rmsprop"
191 | }
192 | }
193 | hparam {
194 | key: "outer_steps"
195 | value {
196 | int64_value: 1000
197 | }
198 | }
199 | hparam {
200 | key: "output_dropout"
201 | value {
202 | float_value: 0.391492575407
203 | }
204 | }
205 | hparam {
206 | key: "output_embedding_ratio"
207 | value {
208 | float_value: 0.121501773596
209 | }
210 | }
211 | hparam {
212 | key: "output_embedding_size"
213 | value {
214 | int64_value: 149
215 | }
216 | }
217 | hparam {
218 | key: "output_init_factor"
219 | value {
220 | float_value: 1.0
221 | }
222 | }
223 | hparam {
224 | key: "overlay_rank"
225 | value {
226 | int64_value: -1
227 | }
228 | }
229 | hparam {
230 | key: "rmsprop_beta2"
231 | value {
232 | float_value: 0.999000012875
233 | }
234 | }
235 | hparam {
236 | key: "rmsprop_epsilon"
237 | value {
238 | float_value: 9.99999993923e-09
239 | }
240 | }
241 | hparam {
242 | key: "share_input_and_output_embeddings"
243 | value {
244 | bool_value: true
245 | }
246 | }
247 | hparam {
248 | key: "sparsity_ratio"
249 | value {
250 | float_value: -1.0
251 | }
252 | }
253 | hparam {
254 | key: "state_dropout"
255 | value {
256 | float_value: 0.453888505697
257 | }
258 | }
259 | hparam {
260 | key: "tie_forget_and_input_gates"
261 | value {
262 | bool_value: false
263 | }
264 | }
265 | hparam {
266 | key: "token_dropout"
267 | value {
268 | float_value: 0.0
269 | }
270 | }
271 | hparam {
272 | key: "trainable_initial_state"
273 | value {
274 | bool_value: false
275 | }
276 | }
277 | hparam {
278 | key: "update_dropout"
279 | value {
280 | float_value: 0.0
281 | }
282 | }
283 | hparam {
284 | key: "vocab_size"
285 | value {
286 | int64_value: 33279
287 | }
288 | }
289 | hparam {
290 | key: "weight_decay"
291 | value {
292 | float_value: 3.77565629606e-05
293 | }
294 | }
295 | hparam {
296 | key: "weight_penalty"
297 | value {
298 | float_value: 0.0
299 | }
300 | }
301 |
--------------------------------------------------------------------------------
/lamb/experiment/pushing-the-bounds/README.md:
--------------------------------------------------------------------------------
1 | This directory accompanies the [Pushing the bounds of
2 | dropout](https://arxiv.org/abs/1805.09208) paper.
3 |
4 | The paper is mostly about how to make predictions with a model trained with
5 | dropout. Use any saved model, such as those trained in `../on-the-state/`, and
6 | evaluate it with `./test.sh` (in this directory). One difference from
7 | `../test.sh` is that `./test.sh` tunes the optimal evaluation softmax
8 | temperature on the validation set (between 0.8 and 1.0):
9 |
10 | eval_softmax_temperature=-0.8
11 |
12 | Also, in addition to deterministic (or 'standard') dropout, it does MC dropout
13 | (the arithmetically averaged variant) with various `eval_dropout_multiplier`s.
14 | See the linked paper for details.
15 |
16 | So, assuming there is a saved model in `/tmp/lamb/ptb_10m_lstm_d1/`, test it
17 | with:
18 |
19 | ./test.sh run some-descriptive-name /tmp/lamb/ptb_10m_lstm_d1/
20 |
21 | The model will thus be evaluated several times. In the output, the line with
22 | `final test_det_t0.9 xe:` has the test cross-entropy at the optimal softmax
23 | temperature (in this case 0.9). Similarly, `final test_mca_d0.8_t0.9 xe:`
24 | corresponds to the test cross-entropy with `eval_dropout_multiplier=0.8` and
25 | softmax temperature 0.9.
26 |
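27 | To collect all of these summary lines from a run, something like the following
28 | works, assuming the output was captured in a log file (`eval.log` below is
29 | only a placeholder name):
30 |
31 |     grep 'final test' eval.log
32 |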
--------------------------------------------------------------------------------
/lamb/experiment/pushing-the-bounds/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | set -e
19 |
20 | source "$(dirname $0)/../../lib/setup.sh"
21 | source_lib "config/common.sh"
22 | source_lib "config/running.sh"
23 |
24 | saved_args="$1"
25 |
26 | save_checkpoints=false
27 | turns=0
28 | min_non_episodic_eval_examples_per_stripe=500000
29 | eval_on_test=true
30 |
31 | test_one() {
32 | local suffix="$1"
33 | local experiment_dir="$2"
34 | local name="$(default_name)_${suffix}"
35 | local config_file="${experiment_dir}/config"
36 | local load_checkpoint="${experiment_dir}/best"
37 | source_lib "run.sh" "${saved_args}"
38 | }
39 |
40 | name="$2"
41 | experiment_dir="$3"
42 |
43 | eval_softmax_temperature=-0.8
44 |
45 | eval_method="deterministic"
46 | test_one "det" "${experiment_dir}"
47 |
48 | eval_method="arithmetic"
49 | num_eval_samples=200
50 | eval_dropout_multiplier=0.8
51 | test_one "amc$eval_dropout_multiplier" "${experiment_dir}"
52 |
--------------------------------------------------------------------------------
/lamb/experiment/rerun.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | set -e
19 |
20 | source "$(dirname $0)/../lib/setup.sh"
21 | source_lib "config/common.sh"
22 | source_lib "config/running.sh"
23 |
24 | cmd="$1"
25 |
26 | run_one() {
27 | local name="$(default_name)_$1"
28 | local config_file="$2/config"
29 | source_lib "run.sh" "${cmd}"
30 | }
31 |
32 | run_one "$2" "$3"
33 |
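34 | # Usage sketch (the name and path below are placeholders); the directory must
35 | # contain the saved `config` file of a previous run:
36 | #
37 | #   ./rerun.sh run some-descriptive-name /tmp/lamb/ptb_10m_lstm_d1/
38 |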
--------------------------------------------------------------------------------
/lamb/experiment/rerun_old.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | set -e
19 |
20 | source "$(dirname $0)/../lib/setup.sh"
21 | source_lib "config/common.sh"
22 | source_lib "config/running.sh"
23 |
24 | cmd="$1"
25 |
26 | run_one() {
27 | local name="$(default_name)_$1"
28 | local flags_as_dict="$2/args"
29 | local hps_proto_file="$2/config"
30 | source_lib "run.sh" "${cmd}"
31 | }
32 |
33 | run_one "$2" "$3"
34 |
--------------------------------------------------------------------------------
/lamb/experiment/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | set -e
19 |
20 | source "$(dirname $0)/../lib/setup.sh"
21 | source_lib "config/common.sh"
22 | source_lib "config/running.sh"
23 |
24 | cmd="$1"
25 | load_checkpoint="$3/best"
26 | config_file="${4:-$3}/config"
27 |
28 | save_checkpoints=false
29 | turns=0
30 | min_non_episodic_eval_examples_per_stripe=500000
31 |
32 | test_one() {
33 | local name="$1"
34 | source_lib "run.sh" "${cmd}"
35 | }
36 |
37 | cell="lu"
38 | gpu_type="v100"
39 |
40 | eval_on_test=false
41 |
42 | eval_method="deterministic"
43 | test_one "$2_det"
44 |
45 | # MC dropout evaluation can be a bit better, but it's very slow.
46 | eval_method="arithmetic"
47 | num_eval_samples=200
48 | eval_dropout_multiplier=0.6
49 | test_one "$2_amc$eval_dropout_multiplier"
50 | eval_dropout_multiplier=0.7
51 | test_one "$2_amc$eval_dropout_multiplier"
52 | eval_dropout_multiplier=0.8
53 | test_one "$2_amc$eval_dropout_multiplier"
54 | eval_dropout_multiplier=0.9
55 | test_one "$2_amc$eval_dropout_multiplier"
56 |
57 | eval_on_test=true
58 | max_eval_eval_batches=1
59 |
60 | eval_method="deterministic"
61 | test_one "$2_test_det"
62 |
63 | # MC dropout evaluation can be a bit better, but it's very slow.
64 | eval_method="arithmetic"
65 | num_eval_samples=200
66 | eval_dropout_multiplier=0.6
67 | test_one "$2_test_amc$eval_dropout_multiplier"
68 | eval_dropout_multiplier=0.7
69 | test_one "$2_test_amc$eval_dropout_multiplier"
70 | eval_dropout_multiplier=0.8
71 | test_one "$2_test_amc$eval_dropout_multiplier"
72 | eval_dropout_multiplier=0.9
73 | test_one "$2_test_amc$eval_dropout_multiplier"
74 |
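75 | # Usage sketch (the name and path below are placeholders): evaluates the
76 | # `best` checkpoint of a saved experiment; an optional fourth argument names
77 | # a different directory to take the `config` file from:
78 | #
79 | #   ./test.sh run some-descriptive-name /tmp/lamb/ptb_10m_lstm_d1/
80 |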
--------------------------------------------------------------------------------
/lamb/experiment/train_ptb_10m_lstm_d1.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | set -e
19 |
20 | source "$(dirname $0)/../lib/setup.sh"
21 | source_lib "config/common.sh"
22 | source_lib "config/running.sh"
23 | source_lib "config/ptb_word_slow.sh"
24 |
25 | # Model hyperparameters
26 |
27 | model="lstm"
28 | num_params=$(million 10)
29 | share_input_and_output_embeddings=true
30 | tie_forget_and_input_gates=false
31 | cap_input_gate=true
32 | forget_bias=1.0
33 | num_layers=1
34 |
35 | # Tuned hyperparameters
36 |
37 | learning_rate=0.0048308
38 | l2_penalty=0.00007676
39 | input_dropout=0.51551
40 | inter_layer_dropout=
41 | state_dropout=0.18417
42 | output_dropout=0.33801
43 | input_embedding_ratio=0.22973
44 |
45 | # Evaluation hyperparameters
46 |
47 | eval_softmax_temperature=-0.8
48 |
49 | source_lib "run.sh" "$@"
50 |
--------------------------------------------------------------------------------
/lamb/experiment/train_ptb_24m_lstm_d4.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | set -e
19 |
20 | source "$(dirname $0)/../lib/setup.sh"
21 | source_lib "config/common.sh"
22 | source_lib "config/running.sh"
23 | source_lib "config/ptb_word_slow.sh"
24 |
25 | # Model hyperparameters
26 |
27 | model="lstm"
28 | num_params=$(million 24)
29 | share_input_and_output_embeddings=true
30 | tie_forget_and_input_gates=false
31 | cap_input_gate=true
32 | forget_bias=1.0
33 | num_layers=4
34 |
35 | # Tuned hyperparameters
36 |
37 | learning_rate=0.0033390
38 | l2_penalty=0.000093711
39 | input_dropout=0.68697
40 | inter_layer_dropout=0.31323
41 | state_dropout=0.48479
42 | output_dropout=0.69626
43 |
44 | # Evaluation hyperparameters
45 |
46 | eval_softmax_temperature=-0.8
47 |
48 | source_lib "run.sh" "$@"
49 |
--------------------------------------------------------------------------------
/lamb/experiment/tune_ptb_10m.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | # TUNING IS NOT CURRENTLY SUPPORTED IN THE OPEN-SOURCE VERSION. This is
19 | # for illustration only.
20 |
21 | set -e
22 |
23 | # Include definitions of dataset- and tuning-related variables.
24 | source "$(dirname $0)/../lib/setup.sh"
25 | source_lib "config/common.sh"
26 | source_lib "config/tuning.sh"
27 | source_lib "config/ptb_word_rmsprop.sh"
28 |
29 | # Model hyperparameters
30 |
31 | num_params=$(million 10)
32 | share_input_and_output_embeddings=true
33 |
34 | # Evaluation hyperparameters
35 |
36 | eval_softmax_temperature=-0.8
37 |
38 | # Tuning parameters
39 |
40 | num_workers=60
41 |
42 | # Start a number of tuning studies, setting model specific parameters.
43 |
44 | model="lstm"
45 | tie_forget_and_input_gates=false
46 | forget_bias=1.0
47 | num_layers=1
48 |
49 | tuneables="learning_rate,l2_penalty,
50 | input_dropout,inter_layer_dropout,state_dropout,
51 | output_dropout,input_embedding_ratio"
52 | name="$(default_name)_${model}_d${num_layers}"
53 | source_lib "run.sh" "$@"
54 |
--------------------------------------------------------------------------------
/lamb/lib/config/README.md:
--------------------------------------------------------------------------------
1 | Shell scripts here set up variables for datasets and all kinds of arguments to
2 | the main binary. They are intended to be sourced and can source other scripts.
3 |
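4 | A minimal sketch of the sourcing pattern the experiment scripts follow (see,
5 | e.g., `../../experiment/train_ptb_24m_lstm_d4.sh`; `source_lib` is provided by
6 | `../setup.sh`, and the comments below only summarize what the config names
7 | suggest):
8 |
9 |     source "$(dirname $0)/../lib/setup.sh"
10 |     source_lib "config/common.sh"        # logging default, naming helpers
11 |     source_lib "config/running.sh"       # settings for training runs
12 |     source_lib "config/ptb_word_slow.sh" # dataset-specific settings
13 |     source_lib "run.sh" "$@"             # hand over to the runner
14 |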
--------------------------------------------------------------------------------
/lamb/lib/config/common.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | logtostderr=true
17 |
18 | default_name() {
19 | if which git > /dev/null 2>&1; then
20 | echo "$(git rev-parse --short HEAD)_$(basename $0 .sh)"
21 | else
22 | echo "$(basename $0 .sh)"
23 | fi
24 | }
25 |
26 | name="$(default_name)"
27 |
28 | million() {
29 | echo $(($1 * 1000 * 1000))
30 | }
31 |
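32 | # Example uses from the experiment scripts (the values and the git hash are
33 | # only illustrative):
34 | #
35 | #   num_params=$(million 24)        # 24000000
36 | #   name="$(default_name)_${model}" # e.g. "a1b2c3d_train_ptb_lstm"
37 |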
--------------------------------------------------------------------------------
/lamb/lib/config/copy.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | copy_data_dir=${copy_data_dir:-"${HOME}/data/copy/"}
17 | training_file="${copy_data_dir}copy-training.txt"
18 | validation_file="${copy_data_dir}copy-valid.txt"
19 | test_file="${copy_data_dir}copy-test.txt"
20 | word_based=true
21 | episodic=true
22 | conditioning_separator="|"
23 |
--------------------------------------------------------------------------------
/lamb/lib/config/enwik8.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | enwik8_data_dir=${enwik8_data_dir:-"${HOME}/data/enwik8/"}
17 | training_file="${enwik8_data_dir}enwik8-training.txt"
18 | validation_file="${enwik8_data_dir}enwik8-valid.txt"
19 | test_file="${enwik8_data_dir}enwik8-test.txt"
20 | file_encoding="CP437"
21 | word_based=false
22 |
--------------------------------------------------------------------------------
/lamb/lib/config/enwik8_char.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | source_lib "config/common.sh"
17 | source_lib "config/enwik8.sh"
18 | # While utf-8 is the actual encoding, for character-based modelling
19 | # the literature seems to have settled on bytes, as evidenced by
20 | # mentions of a vocabulary size of 205 (it is more than 5000 with
21 | # utf-8).
22 | file_encoding="CP437"
23 | word_based=false
24 | episodic=false
25 | max_time_steps=50
26 | # 400*500=200k optimization steps. With batch size 128 and max_time_steps
27 | # 50, for example, that's about 14 epochs.
28 | steps_per_turn=400
29 | turns=500
30 | print_training_stats_every_num_steps=100
31 | early_stopping_turns=15
32 | early_stopping_rampup_turns=30
33 | early_stopping_worst_xe_target=1.05,0.93,0.92
34 | drop_learning_rate_turns=13
35 | drop_learning_rate_multiplier=0.1
36 | drop_learning_rate_at_the_latest=450
37 | drop_state_probability=0.01
38 | max_eval_eval_batches=500
39 |
--------------------------------------------------------------------------------
/lamb/lib/config/enwik8_char_rmsprop.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | source_lib "config/enwik8_char.sh"
17 | optimizer_type=rmsprop
18 | rmsprop_beta2=0.99
19 | rmsprop_epsilon=1e-5
20 | batch_size=128
21 | max_grad_norm=10.0
22 |
--------------------------------------------------------------------------------
/lamb/lib/config/mwc.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | mwc_data_dir=${mwc_data_dir:-"${HOME}/data/mwc/"}
17 | file_encoding="utf-8"
18 | word_based=false
19 |
--------------------------------------------------------------------------------
/lamb/lib/config/ptb.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | ptb_data_dir=${ptb_data_dir:-"${HOME}/data/ptb/"}
17 | training_file="${ptb_data_dir}ptb.train.txt"
18 | validation_file="${ptb_data_dir}ptb.valid.txt"
19 | test_file="${ptb_data_dir}ptb.test.txt"
20 |
--------------------------------------------------------------------------------
/lamb/lib/config/ptb_char.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | ptb_data_dir=${ptb_data_dir:-"${HOME}/data/ptb/"}
17 | training_file="${ptb_data_dir}ptb.char.train.txt"
18 | validation_file="${ptb_data_dir}ptb.char.valid.txt"
19 | test_file="${ptb_data_dir}ptb.char.test.txt"
20 | # The characters are separated by spaces, so word-based tokenization yields characters.
21 | word_based=true
22 |
--------------------------------------------------------------------------------
/lamb/lib/config/ptb_word.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | source_lib "config/common.sh"
17 | source_lib "config/ptb.sh"
18 | word_based=true
19 | episodic=false
20 |
--------------------------------------------------------------------------------
/lamb/lib/config/ptb_word_rmsprop.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | source_lib "config/ptb_word.sh"
17 |
18 | optimizer_type=rmsprop
19 | batch_size=64
20 | max_grad_norm=10.0
21 | max_time_steps=35
22 |
23 | steps_per_turn=100
24 | turns=1000
25 | print_training_stats_every_num_steps=100
26 |
27 | early_stopping_turns=30
28 | early_stopping_rampup_turns=60
29 | early_stopping_worst_xe_target=4.4,4.2
30 |
31 | drop_learning_rate_turns=26
32 | drop_learning_rate_multiplier=0.1
33 | drop_learning_rate_at_the_latest=900
34 | drop_state_probability=0.01
35 |
--------------------------------------------------------------------------------
/lamb/lib/config/ptb_word_slow.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | source_lib "config/common.sh"
17 | source_lib "config/ptb_word_rmsprop.sh"
18 | episodic=false
19 | max_time_steps=35
20 | steps_per_turn=100
21 | turns=2500
22 | print_training_stats_every_num_steps=100
23 | early_stopping_turns=100
24 | early_stopping_rampup_turns=200
25 | early_stopping_worst_xe_target=4.4,4.2
26 | drop_learning_rate_turns=90
27 | drop_learning_rate_multiplier=0.1
28 | drop_learning_rate_at_the_latest=2000
29 | drop_state_probability=0.01
30 |
--------------------------------------------------------------------------------
/lamb/lib/config/running.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | # Just a placeholder for now. Don't remove it, though: it is needed by the
17 | # source_lib override mechanism.
18 |
--------------------------------------------------------------------------------
/lamb/lib/config/tuning.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | # Just a placeholder for now. Don't remove it, though: it is needed by the
17 | # source_lib override mechanism.
18 |
--------------------------------------------------------------------------------
/lamb/lib/config/wikitext-103.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | wikitext_103_data_dir=${wikitext_103_data_dir:-"${HOME}/data/wikitext-103/"}
17 | training_file="${wikitext_103_data_dir}wiki.train.tokens"
18 | validation_file="${wikitext_103_data_dir}wiki.valid.tokens"
19 | test_file="${wikitext_103_data_dir}wiki.test.tokens"
20 |
--------------------------------------------------------------------------------
/lamb/lib/config/wikitext-103_word.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | source_lib "config/common.sh"
17 | source_lib "config/wikitext-103.sh"
18 | word_based=true
19 | episodic=false
20 | max_time_steps=35
21 | steps_per_turn=1000
22 | turns=1000
23 | print_training_stats_every_num_steps=1000
24 | early_stopping_turns=30
25 | early_stopping_rampup_turns=60
26 | early_stopping_worst_xe_target=3.5,3.3
27 | drop_learning_rate_turns=26
28 | drop_learning_rate_multiplier=0.1
29 | drop_learning_rate_at_the_latest=900
30 | drop_state_probability=0.01
31 |
--------------------------------------------------------------------------------
/lamb/lib/config/wikitext-103_word_rmsprop.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | source_lib "config/wikitext-103_word.sh"
17 | optimizer_type=rmsprop
18 | batch_size=64
19 | max_grad_norm=10.0
20 |
--------------------------------------------------------------------------------
/lamb/lib/config/wikitext-2.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | wikitext_2_data_dir=${wikitext_2_data_dir:-"${HOME}/data/wikitext-2/"}
17 | training_file="${wikitext_2_data_dir}wiki.train.tokens"
18 | validation_file="${wikitext_2_data_dir}wiki.valid.tokens"
19 | test_file="${wikitext_2_data_dir}wiki.test.tokens"
20 |
--------------------------------------------------------------------------------
/lamb/lib/config/wikitext-2_word.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | source_lib "config/common.sh"
17 | source_lib "config/wikitext-2.sh"
18 | word_based=true
19 | episodic=false
20 | # max_time_steps=35
21 | # steps_per_turn=200
22 | # turns=1000
23 | # print_training_stats_every_num_steps=100
24 | # early_stopping_turns=30
25 | # early_stopping_rampup_turns=60
26 | # early_stopping_worst_xe_target=4.9,4.5
27 | # drop_learning_rate_turns=26
28 | # drop_learning_rate_multiplier=0.1
29 | # drop_learning_rate_at_the_latest=900
30 | # drop_state_probability=0.01
31 |
--------------------------------------------------------------------------------
/lamb/lib/config/wikitext-2_word_rmsprop.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | source_lib "config/wikitext-2_word.sh"
17 | optimizer_type=rmsprop
18 | batch_size=64
19 | max_grad_norm=10.0
20 |
--------------------------------------------------------------------------------
/lamb/lib/describe_version.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | # We want to know what code was run for an experiment. This prints the git
17 | # version, the status, and any uncommitted diffs.
18 |
19 | echo "$(date): Invoking LAMB."
20 | if (which git && git rev-parse --is-inside-work-tree) > /dev/null 2>&1; then
21 | echo "git version: $(git rev-parse --short HEAD)"
22 | git --no-pager status
23 | git --no-pager diff
24 | git --no-pager diff --cached
25 | fi
26 |
--------------------------------------------------------------------------------
/lamb/lib/run.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | # This script runs LAMB.
17 | #
18 | # Usage
19 | # -----
20 | #
21 | # See experiment/*.sh for examples.
22 | #
23 | # Assign values to shell variables with the same names as hyperparameters and
24 | # command line flags, then source this script. The single, optional argument
25 | # (passed when sourcing this script) is the command, which must be "run" in
26 | # the open source version.
27 | #
28 | # setup.sh is assumed to have been sourced.
29 | #
30 | # How it works
31 | # ------------
32 | #
33 | # The configuration options (see ../README.md) are gathered from shell variables
34 | # and passed as command line arguments to the binary.
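   | #
   | # A minimal sketch of a caller (hypothetical values; real examples live in
   | # experiment/*.sh):
   | #   learning_rate=0.002
   | #   batch_size=64
   | #   name=my_experiment
   | #   source_lib "run.sh"
   | # which ends up invoking the binary roughly as:
   | #   python .../main.py --learning_rate=0.002 --batch_size=64 ...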
35 |
36 | cmd="${1:-run}"
37 |
38 | source_lib "run_helper.sh"
39 |
40 | _project_dir=${project_dir:-"."}
41 | _experiment_dir="${experiment_dir:-"${_project_dir}/${name}"}"
42 | # If ensure_new_experiment is anything but "false", add a random suffix
43 | # that makes experiment_dir unique.
44 | if [ "${ensure_new_experiment}" != "false" ]; then
45 | _suffix="$(head /dev/urandom | tr -dc A-Za-z0-9 | head -c5)"
46 | _experiment_dir="${experiment_dir:-"${_project_dir}/${name}"}_${_suffix}"
47 | while test -d "${_experiment_dir}"; do
48 | _suffix="$(head /dev/urandom | tr -dc A-Za-z0-9 | head -c5)"
49 | _experiment_dir="${experiment_dir:-"${_project_dir}/${name}"}_${_suffix}"
50 | done
51 | fi
52 |
53 | mkdir -p "${_experiment_dir}"
54 |
55 | {
56 | source_lib "describe_version.sh"
57 | } > >(tee -a "${_experiment_dir}/lamb_version")
58 |
59 | {
60 | if [ "${cmd}" = "run" ]; then
61 | eval $(echo "python" "${base}/main.py" "$(gather_args)")
62 | elif [ "${cmd}" = "run_par" ]; then
63 | eval $(echo "${base}/lamb.par" "$(gather_args)")
64 | else
65 | echo "Unsupported command ${cmd}."
66 | exit 1
67 | fi
68 | } > >(tee -a "${_experiment_dir}/stdout") \
69 | 2> >(tee -a "${_experiment_dir}/stderr" >&2)
70 |
--------------------------------------------------------------------------------
/lamb/lib/run_helper.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 |
17 | set -e
18 |
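   | # printf %q shell-escapes its argument so that the value survives the
   | # eval in run.sh unchanged.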
19 | escape_cl_arg() {
20 | printf "%q" "$1"
21 | }
22 |
23 | # This command:
24 | # add_param hps "--" model "X" "escape_cl_arg"
25 | # will add to $hps the line:
26 | # --model=${model}X
27 | # where ${model} is actually evaluated and transformed by
28 | # escape_cl_arg. See the 'indirect references' shell concept.
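   | #
   | # Concretely (hypothetical value), after:
   | #   model=lstm
   | #   add_cl_arg args model
   | # $args ends with "--model=lstm " (note the escaping and the trailing
   | # space separator).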
29 | add_param() {
30 | var1="\${$1}"
31 | prefix=$2
32 | var2="\${$3}"
33 | suffix=$4
34 | val2=$(eval "echo \$$3")
35 | if [ "$val2" ]; then
36 | local escape_fn=$5
37 | if [ "$escape_fn" ]; then
38 | var2="\$($escape_fn \"$var2\")"
39 | fi
40 | eval $1="\"$var1$prefix$3=$var2$suffix\""
41 | fi
42 | }
43 |
44 | add_cl_arg() {
45 | add_param "$1" "--" "$2" " " "escape_cl_arg"
46 | }
47 |
48 | gather_args() {
49 | ## Populate args (mirroring the structure of README.md). See command line
50 | ## argument definitions in lamb_flags.py.
51 |
52 | local args=""
53 |
54 | # data
55 | add_cl_arg args training_file
56 | add_cl_arg args validation_file
57 | add_cl_arg args test_file
58 | add_cl_arg args conditioning_separator
59 | add_cl_arg args file_encoding
60 | add_cl_arg args word_based
61 | add_cl_arg args episodic
62 |
63 | # model
64 | add_cl_arg args num_params
65 | add_cl_arg args share_input_and_output_embeddings
66 | add_cl_arg args input_embedding_size
67 | add_cl_arg args output_embedding_size
68 | add_cl_arg args input_embedding_ratio
69 | add_cl_arg args output_embedding_ratio
70 | add_cl_arg args embedding_dropout
71 | add_cl_arg args token_dropout
72 | add_cl_arg args input_dropout
73 | add_cl_arg args input_dropout_base
74 | add_cl_arg args output_dropout
75 | add_cl_arg args downprojected_output_dropout
76 | add_cl_arg args shared_mask_dropout
77 | add_cl_arg args embed_once
78 | add_cl_arg args output_once
79 |
80 | # cell
81 | add_cl_arg args model
82 | add_cl_arg args num_layers
83 | add_cl_arg args residual_connections
84 | add_cl_arg args lstm_skip_connection
85 | add_cl_arg args feature_mask_rounds
86 | add_cl_arg args feature_mask_rank
87 | add_cl_arg args sparsity_ratio
88 | add_cl_arg args overlay_rank
89 | add_cl_arg args hidden_size
90 | add_cl_arg args hidden_size_multiplier
91 | add_cl_arg args layer_norm
92 | add_cl_arg args activation_fn
93 | add_cl_arg args tie_forget_and_input_gates
94 | add_cl_arg args cap_input_gate
95 | add_cl_arg args mos_num_components
96 | add_cl_arg args trainable_initial_state
97 | add_cl_arg args inter_layer_dropout
98 | add_cl_arg args state_dropout
99 | add_cl_arg args state_dropout_flip_rate
100 | add_cl_arg args update_dropout
101 | add_cl_arg args cell_clip
102 |
103 | # objective
104 | add_cl_arg args model_average
105 | add_cl_arg args num_training_samples
106 | add_cl_arg args l2_penalty
107 | add_cl_arg args l1_penalty
108 | add_cl_arg args activation_norm_penalty
109 | add_cl_arg args drop_state_probability
110 |
111 | # initialization
112 | add_cl_arg args embedding_init_factor
113 | add_cl_arg args scale_input_embeddings
114 | add_cl_arg args cell_init_factor
115 | add_cl_arg args forget_bias
116 | add_cl_arg args output_init_factor
117 |
118 | # schedule
119 | add_cl_arg args steps_per_turn
120 | add_cl_arg args turns
121 | add_cl_arg args print_training_stats_every_num_steps
122 |
123 | # optimization
124 | add_cl_arg args optimizer_type
125 | add_cl_arg args rmsprop_beta2
126 | add_cl_arg args rmsprop_epsilon
127 | add_cl_arg args adam_beta1
128 | add_cl_arg args adam_beta2
129 | add_cl_arg args adam_epsilon
130 | add_cl_arg args max_grad_norm
131 | add_cl_arg args batch_size
132 | add_cl_arg args accum_batch_size
133 | add_cl_arg args max_time_steps
134 | add_cl_arg args trigger_averaging_turns
135 | add_cl_arg args trigger_averaging_at_the_latest
136 |
137 | # learning rate
138 | add_cl_arg args learning_rate
139 | add_cl_arg args learning_rate_decay
140 | add_cl_arg args learning_rate_decay_burn_in_steps
141 | add_cl_arg args drop_learning_rate_turns
142 | add_cl_arg args drop_learning_rate_multiplier
143 | add_cl_arg args drop_learning_rate_at_the_latest
144 |
145 | # early stopping
146 | add_cl_arg args early_stopping_turns
147 | add_cl_arg args early_stopping_rampup_turns
148 | add_cl_arg args early_stopping_worst_xe_target
149 | add_cl_arg args early_stopping_slowest_rate
150 |
151 | # cross-validation
152 | add_cl_arg args crossvalidate
153 | add_cl_arg args crossvalidation_rounds
154 | add_cl_arg args crossvalidate_max_folds
155 |
156 | # evaluation
157 | add_cl_arg args max_training_eval_batches
158 | add_cl_arg args max_eval_eval_batches
159 | add_cl_arg args max_test_eval_batches
160 | add_cl_arg args min_non_episodic_eval_examples_per_stripe
161 | add_cl_arg args eval_on_test
162 | add_cl_arg args eval_method
163 | add_cl_arg args num_eval_samples
164 | add_cl_arg args eval_softmax_temperature
165 | add_cl_arg args eval_softmax_temperature_estimation_num_tokens
166 | add_cl_arg args eval_power_mean_power
167 | add_cl_arg args eval_dropout_multiplier
168 | add_cl_arg args validation_prediction_file
169 | add_cl_arg args dyneval
170 | add_cl_arg args dyneval_learning_rate
171 | add_cl_arg args dyneval_decay_rate
172 | add_cl_arg args dyneval_epsilon
173 |
174 | # experiments
175 | local experiment_dir="${_experiment_dir}"
176 | add_cl_arg args experiment_dir
177 | add_cl_arg args save_config
178 | add_cl_arg args config_file
179 | add_cl_arg args hps_proto_file # deprecated
180 | add_cl_arg args flags_as_dict # deprecated
181 |
182 | # checkpoints
183 | add_cl_arg args save_checkpoints
184 | add_cl_arg args load_checkpoint
185 | add_cl_arg args load_optimizer_state
186 | add_cl_arg args load_averaged
187 | add_cl_arg args use_old_linear_names
188 |
189 | # Misc flags
190 | add_cl_arg args seed
191 | add_cl_arg args swap_memory
192 | add_cl_arg args logtostderr
193 | add_cl_arg args log_device_placement
194 | add_cl_arg args summary_flush_secs
195 |
196 | echo "${args}"
197 | }
198 |
--------------------------------------------------------------------------------
/lamb/lib/setup.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | if [[ "$0" == "$BASH_SOURCE" ]]; then
17 | echo "This script must be sourced."
18 | exit 1
19 | fi
20 |
21 | base=$(dirname "$BASH_SOURCE")/..
22 |
23 | cmd=${1:-"run"}
24 |
25 | lib_override_path=
26 |
27 | # `source_lib` is like the shell built-in `source`, but allows files in
28 | # `lib_override_path` to shadow those in lamb/lib/.
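   | #
   | # For example (hypothetical path), with lib_override_path=/my/overrides,
   | #   source_lib "config/ptb.sh"
   | # sources /my/overrides/lib/config/ptb.sh if that file exists, and falls
   | # back to ${base}/lib/config/ptb.sh otherwise.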
29 | source_lib() {
30 | local _name="$1"
31 | shift
32 | if [ -d "${lib_override_path}" -a \
33 | -f "${lib_override_path}/lib/${_name}" ]; then
34 | source "${lib_override_path}/lib/${_name}" "$@"
35 | else
36 | source "${base}/lib/${_name}" "$@"
37 | fi
38 | }
39 |
--------------------------------------------------------------------------------
/lamb/res_multi_rnn_cell.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | """A stacked RNN cell with residual connections."""
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import tensorflow.compat.v1 as tf
23 | from tensorflow.contrib import framework as contrib_framework
24 |
25 | nest = contrib_framework.nest
26 |
27 |
28 | class ResMultiRNNCell(tf.nn.rnn_cell.RNNCell):
29 | """RNN cell composed sequentially of multiple simple cells."""
30 |
31 | def __init__(self, cells, state_is_tuple=True):
32 | """Create a RNN cell composed sequentially of a number of RNNCells.
33 |
34 | Args:
35 | cells: list of RNNCells that will be composed in this order.
36 | state_is_tuple: If True, accepted and returned states are n-tuples, where
37 | `n = len(cells)`. If False, the states are all
38 | concatenated along the column axis. This latter behavior will soon be
39 | deprecated.
40 |
41 | Raises:
42 | ValueError: if cells is empty (not allowed), or at least one of the cells
43 | returns a state tuple but the flag `state_is_tuple` is `False`.
44 | """
45 | if not cells:
46 | raise ValueError("Must specify at least one cell for ResMultiRNNCell.")
47 | if not nest.is_sequence(cells):
48 | raise TypeError(
49 | "cells must be a list or tuple, but saw: %s." % cells)
50 |
51 | self._cells = cells
52 | self._state_is_tuple = state_is_tuple
53 | if not state_is_tuple:
54 | if any(nest.is_sequence(c.state_size) for c in self._cells):
55 | raise ValueError("Some cells return tuples of states, but the flag "
56 | "state_is_tuple is not set. State sizes are: %s"
57 | % str([c.state_size for c in self._cells]))
58 |
59 | @property
60 | def state_size(self):
61 | if self._state_is_tuple:
62 | return tuple(cell.state_size for cell in self._cells)
63 | else:
64 | return sum([cell.state_size for cell in self._cells])
65 |
66 | @property
67 | def output_size(self):
68 | return self._cells[-1].output_size
69 |
70 | def __call__(self, inputs, state, scope=None):
71 | """Run this multi-layer cell on inputs, starting from state."""
72 | with tf.variable_scope(scope or "res_multi_rnn_cell"):
73 | cur_state_pos = 0
74 | cur_inp = inputs
75 | new_states = []
76 | for i, cell in enumerate(self._cells):
77 | with tf.variable_scope("cell_%d" % i):
78 | if self._state_is_tuple:
79 | if not nest.is_sequence(state):
80 | raise ValueError(
81 | "Expected state to be a tuple of length %d, but received: %s"
82 | % (len(self.state_size), state))
83 | cur_state = state[i]
84 | else:
85 | cur_state = tf.slice(
86 | state, [0, cur_state_pos], [-1, cell.state_size])
87 | cur_state_pos += cell.state_size
88 | cur_inp2, new_state = cell(cur_inp, cur_state)
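   | # Residual connection: add this layer's output to its input. The
   | # first layer is passed through as-is, since its input (the embedding)
   | # may have a different size than the cell output.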
89 | if i == 0:
90 | cur_inp = cur_inp2
91 | else:
92 | cur_inp = cur_inp + cur_inp2
93 | new_states.append(new_state)
94 | new_states = (tuple(new_states) if self._state_is_tuple else
95 | tf.concat(new_states, 1))
96 | return cur_inp, new_states
97 |
--------------------------------------------------------------------------------
/lamb/skip_multi_rnn_cell.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | """A RNN cell with skip connections."""
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import tensorflow.compat.v1 as tf
23 | from tensorflow.contrib import framework as contrib_framework
24 |
25 | nest = contrib_framework.nest
26 |
27 |
28 | class SkipMultiRNNCell(tf.nn.rnn_cell.RNNCell):
29 | """RNN cell composed sequentially of multiple simple cells."""
30 |
31 | def __init__(self, cells, state_is_tuple=True):
32 | """Create a RNN cell composed sequentially of a number of RNNCells.
33 |
34 | Args:
35 | cells: list of RNNCells that will be composed in this order.
36 | state_is_tuple: If True, accepted and returned states are n-tuples, where
37 | `n = len(cells)`. If False, the states are all
38 | concatenated along the column axis. This latter behavior will soon be
39 | deprecated.
40 |
41 | Raises:
42 | ValueError: if cells is empty (not allowed), or at least one of the cells
43 | returns a state tuple but the flag `state_is_tuple` is `False`.
44 | """
45 | if not cells:
46 | raise ValueError("Must specify at least one cell for SkipMultiRNNCell.")
47 | if not nest.is_sequence(cells):
48 | raise TypeError(
49 | "cells must be a list or tuple, but saw: %s." % cells)
50 |
51 | self._cells = cells
52 | self._state_is_tuple = state_is_tuple
53 | if not state_is_tuple:
54 | if any(nest.is_sequence(c.state_size) for c in self._cells):
55 | raise ValueError("Some cells return tuples of states, but the flag "
56 | "state_is_tuple is not set. State sizes are: %s"
57 | % str([c.state_size for c in self._cells]))
58 |
59 | @property
60 | def state_size(self):
61 | if self._state_is_tuple:
62 | return tuple(cell.state_size for cell in self._cells)
63 | else:
64 | return sum([cell.state_size for cell in self._cells])
65 |
66 | @property
67 | def output_size(self):
68 | return self._cells[-1].output_size
69 |
70 | def __call__(self, inputs, state, scope=None):
71 | """Run this multi-layer cell on inputs, starting from state."""
72 | output = None
73 | with tf.variable_scope(scope or "skip_multi_rnn_cell"):
74 | cur_state_pos = 0
75 | cur_inp = inputs
76 | new_states = []
77 | for i, cell in enumerate(self._cells):
78 | with tf.variable_scope("cell_%d" % i):
79 | if self._state_is_tuple:
80 | if not nest.is_sequence(state):
81 | raise ValueError(
82 | "Expected state to be a tuple of length %d, but received: %s"
83 | % (len(self.state_size), state))
84 | cur_state = state[i]
85 | else:
86 | cur_state = tf.slice(
87 | state, [0, cur_state_pos], [-1, cell.state_size])
88 | cur_state_pos += cell.state_size
89 | cur_inp, new_state = cell(cur_inp, cur_state)
90 | new_states.append(new_state)
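   | # Skip connections: the cell's final output is the sum of every
   | # layer's output, not just the top layer's.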
91 | if output is None:
92 | output = cur_inp
93 | else:
94 | output += cur_inp
95 | new_states = (tuple(new_states) if self._state_is_tuple else
96 | tf.concat(new_states, 1))
97 | return output, new_states
98 |
--------------------------------------------------------------------------------
/lamb/test/data/save_v1/args:
--------------------------------------------------------------------------------
1 | {'swap_memory': False, 'crossvalidate': False, 'seed': 1, 'early_stopping_rounds': 10, 'max_test_eval_batches': None, 'crossvalidation_rounds': 1, 'early_stopping_worst_xe_target': '9.0', 'hps': 'model=lstm,num_layers=1,num_params=10000000,share_input_and_output_embeddings=true,tie_forget_and_input_gates=false,cap_input_gate=true,forget_bias=1.0,input_embedding_ratio=0.22973,input_dropout=0.51551,state_dropout=0.18417,output_dropout=0.33801,weight_decay=0.00007676,optimizer_type=rmsprop,max_grad_norm=10.0,outer_steps=25,batch_size=,learning_rate=0.0048308,drop_learning_rate_rounds=90,drop_learning_rate_multiplier=0.1,drop_learning_rate_at_the_latest=2000,drop_state_probability=0.01,softmax_test_time_temperature=-0.8,', 'use_old_linear_names': False, 'crossvalidation_folds': 10, 'training_file': '/non-existent-dir/data/ptb/ptb.train.txt', 'file_encoding': 'utf-8', 'max_training_eval_batches': 100, 'word_based': True, 'experiment_dir': '/non-existent-dir/baf254a0c6_train_ptb_10m_lstm_lstm_d1', 'max_eval_eval_batches': None, 'test_file': '/non-existent-dir/data/ptb/ptb.test.txt', 'min_non_episodic_eval_examples_per_stripe': 100, 'print_every': 100, 'hps_proto_file': None, 'save_checkpoints': True, 'load_checkpoint': None, 'log_device_placement': False, 'early_stopping_rampup_rounds': 20, 'episodic': False, 'eval_file': '/non-existent-dir/data/ptb/ptb.valid.txt', 'steps': 100, 'summary_flush_secs': 120, 'max_steps': 35}
2 |
--------------------------------------------------------------------------------
/lamb/test/data/save_v1/config:
--------------------------------------------------------------------------------
1 | hparam {
2 | key: "activation_fn"
3 | value {
4 | bytes_value: "tf.tanh"
5 | }
6 | }
7 | hparam {
8 | key: "activation_norm_penalty"
9 | value {
10 | float_value: 0.0
11 | }
12 | }
13 | hparam {
14 | key: "adam_beta1"
15 | value {
16 | float_value: 0.899999976158
17 | }
18 | }
19 | hparam {
20 | key: "adam_beta2"
21 | value {
22 | float_value: 0.999000012875
23 | }
24 | }
25 | hparam {
26 | key: "adam_epsilon"
27 | value {
28 | float_value: 9.99999993923e-09
29 | }
30 | }
31 | hparam {
32 | key: "batch_size"
33 | value {
34 | int64_value: 64
35 | }
36 | }
37 | hparam {
38 | key: "cap_input_gate"
39 | value {
40 | bool_value: true
41 | }
42 | }
43 | hparam {
44 | key: "cell_clip"
45 | value {
46 | float_value: -1.0
47 | }
48 | }
49 | hparam {
50 | key: "cell_init_factor"
51 | value {
52 | float_value: 1.0
53 | }
54 | }
55 | hparam {
56 | key: "downprojected_output_dropout"
57 | value {
58 | float_value: -1.0
59 | }
60 | }
61 | hparam {
62 | key: "drop_learning_rate_at_the_latest"
63 | value {
64 | int64_value: 2000
65 | }
66 | }
67 | hparam {
68 | key: "drop_learning_rate_multiplier"
69 | value {
70 | float_value: 0.10000000149
71 | }
72 | }
73 | hparam {
74 | key: "drop_learning_rate_rounds"
75 | value {
76 | int64_value: 90
77 | }
78 | }
79 | hparam {
80 | key: "drop_state_probability"
81 | value {
82 | float_value: 0.00999999977648
83 | }
84 | }
85 | hparam {
86 | key: "embed_once"
87 | value {
88 | bool_value: true
89 | }
90 | }
91 | hparam {
92 | key: "embedding_init_factor"
93 | value {
94 | float_value: 1.0
95 | }
96 | }
97 | hparam {
98 | key: "eval_method"
99 | value {
100 | bytes_value: "deterministic"
101 | }
102 | }
103 | hparam {
104 | key: "feature_mask"
105 | value {
106 | bool_value: false
107 | }
108 | }
109 | hparam {
110 | key: "feature_mask_rank"
111 | value {
112 | int64_value: 0
113 | }
114 | }
115 | hparam {
116 | key: "feature_mask_rounds"
117 | value {
118 | int64_value: 0
119 | }
120 | }
121 | hparam {
122 | key: "forget_bias"
123 | value {
124 | float_value: 1.0
125 | }
126 | }
127 | hparam {
128 | key: "hidden_size"
129 | value {
130 | int64_value: -1
131 | }
132 | }
133 | hparam {
134 | key: "input_dropout"
135 | value {
136 | float_value: 0.51551002264
137 | }
138 | }
139 | hparam {
140 | key: "input_embedding_ratio"
141 | value {
142 | float_value: 0.229729995131
143 | }
144 | }
145 | hparam {
146 | key: "input_embedding_size"
147 | value {
148 | int64_value: -1
149 | }
150 | }
151 | hparam {
152 | key: "intra_layer_dropout"
153 | value {
154 | float_value: 0.0
155 | }
156 | }
157 | hparam {
158 | key: "layer_norm"
159 | value {
160 | bool_value: false
161 | }
162 | }
163 | hparam {
164 | key: "learning_rate"
165 | value {
166 | float_value: 0.00483079999685
167 | }
168 | }
169 | hparam {
170 | key: "learning_rate_decay"
171 | value {
172 | float_value: 1.0
173 | }
174 | }
175 | hparam {
176 | key: "learning_rate_decay_burn_in_steps"
177 | value {
178 | int64_value: 0
179 | }
180 | }
181 | hparam {
182 | key: "lstm_skip_connection"
183 | value {
184 | bool_value: true
185 | }
186 | }
187 | hparam {
188 | key: "max_grad_norm"
189 | value {
190 | float_value: 10.0
191 | }
192 | }
193 | hparam {
194 | key: "mixture_of_softmaxes_num_components"
195 | value {
196 | int64_value: 1
197 | }
198 | }
199 | hparam {
200 | key: "model"
201 | value {
202 | bytes_value: "lstm"
203 | }
204 | }
205 | hparam {
206 | key: "model_average"
207 | value {
208 | bytes_value: "arithmetic"
209 | }
210 | }
211 | hparam {
212 | key: "num_eval_samples"
213 | value {
214 | int64_value: 0
215 | }
216 | }
217 | hparam {
218 | key: "num_layers"
219 | value {
220 | int64_value: 1
221 | }
222 | }
223 | hparam {
224 | key: "num_params"
225 | value {
226 | int64_value: 50000
227 | }
228 | }
229 | hparam {
230 | key: "num_training_samples"
231 | value {
232 | int64_value: 1
233 | }
234 | }
235 | hparam {
236 | key: "optimizer_type"
237 | value {
238 | bytes_value: "rmsprop"
239 | }
240 | }
241 | hparam {
242 | key: "outer_steps"
243 | value {
244 | int64_value: 2500
245 | }
246 | }
247 | hparam {
248 | key: "output_dropout"
249 | value {
250 | float_value: 0.338010013103
251 | }
252 | }
253 | hparam {
254 | key: "output_embedding_ratio"
255 | value {
256 | float_value: -1.0
257 | }
258 | }
259 | hparam {
260 | key: "output_embedding_size"
261 | value {
262 | int64_value: -1
263 | }
264 | }
265 | hparam {
266 | key: "output_init_factor"
267 | value {
268 | float_value: 1.0
269 | }
270 | }
271 | hparam {
272 | key: "overlay_rank"
273 | value {
274 | int64_value: -1
275 | }
276 | }
277 | hparam {
278 | key: "rmsprop_beta2"
279 | value {
280 | float_value: 0.999000012875
281 | }
282 | }
283 | hparam {
284 | key: "rmsprop_epsilon"
285 | value {
286 | float_value: 9.99999993923e-09
287 | }
288 | }
289 | hparam {
290 | key: "share_input_and_output_embeddings"
291 | value {
292 | bool_value: true
293 | }
294 | }
295 | hparam {
296 | key: "softmax_test_time_temperature"
297 | value {
298 | float_value: -0.800000011921
299 | }
300 | }
301 | hparam {
302 | key: "sparsity_ratio"
303 | value {
304 | float_value: -1.0
305 | }
306 | }
307 | hparam {
308 | key: "state_dropout"
309 | value {
310 | float_value: 0.184169992805
311 | }
312 | }
313 | hparam {
314 | key: "state_dropout_flip_rate"
315 | value {
316 | float_value: 0.0
317 | }
318 | }
319 | hparam {
320 | key: "test_time_dropout_multiplier"
321 | value {
322 | float_value: 1.0
323 | }
324 | }
325 | hparam {
326 | key: "test_time_power_mean_power"
327 | value {
328 | float_value: 1.0
329 | }
330 | }
331 | hparam {
332 | key: "tie_forget_and_input_gates"
333 | value {
334 | bool_value: false
335 | }
336 | }
337 | hparam {
338 | key: "token_dropout"
339 | value {
340 | float_value: 0.0
341 | }
342 | }
343 | hparam {
344 | key: "trainable_initial_state"
345 | value {
346 | bool_value: false
347 | }
348 | }
349 | hparam {
350 | key: "update_dropout"
351 | value {
352 | float_value: 0.0
353 | }
354 | }
355 | hparam {
356 | key: "vocab_size"
357 | value {
358 | int64_value: 10001
359 | }
360 | }
361 | hparam {
362 | key: "weight_decay"
363 | value {
364 | float_value: 7.67599995015e-05
365 | }
366 | }
367 | hparam {
368 | key: "weight_penalty"
369 | value {
370 | float_value: 0.0
371 | }
372 | }
373 |
--------------------------------------------------------------------------------
/lamb/test/dummy_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | import tensorflow.compat.v1 as tf
17 |
18 |
19 | class DummyTest(tf.test.TestCase):
20 |
21 | def testCompilation(self):
22 | pass
23 |
24 |
25 | if __name__ == "__main__":
26 | tf.test.main()
27 |
--------------------------------------------------------------------------------
/lamb/test/finish.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 |
17 | # Intended to be sourced after setting all the configuration options.
18 |
19 | experiment_dir="$TEST_TMPDIR/${name}"
20 |
21 | # Run
22 | source_lib "run.sh" run_par
23 |
24 | # Check that the best reported evaluation XE is below a certain
25 | # threshold.
26 | grep_xes() {
27 | cat "${_experiment_dir}/stderr" |
28 | sed -rn "s/.*'best_xe': ([0-9]*)\.([0-9]{1,2}).*/\1.\2/p"
29 | }
30 | first_xe=$(grep_xes | head -n 1)
31 | last_xe=$(grep_xes | tail -n 1)
32 | expected_improvement="${expected_improvement:-0.5}"
33 | # check_ge doesn't work with floats, so let's do it by hand.
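   | # bc -l prints 1 when the inequality holds, so the branch below is taken
   | # when the drop from first_xe to last_xe is smaller than expected.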
34 | if (( $(echo "$first_xe - $expected_improvement < $last_xe" | bc -l) )); then
35 | echo "XE went from $first_xe to $last_xe, and that's not a large enough \
36 | improvement ($expected_improvement)."
37 | exit 1
38 | fi
39 |
40 | echo "PASS"
41 |
--------------------------------------------------------------------------------
/lamb/test/start.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | set -e -o pipefail
17 |
18 | source googletest.sh
19 |
20 | if [ "${base}" = "" ]; then
21 | source "$(dirname $0)/../lib/setup.sh"
22 | fi
23 | source_lib "config/common.sh"
24 | source_lib "config/running.sh"
25 |
26 | training_file="${base}/test/data/corpus.txt"
27 | validation_file="${training_file}"
28 | unset test_file
29 |
30 | batch_size=64
31 | max_training_eval_batches=2
32 | max_eval_eval_batches=2
33 | max_test_eval_batches=2
34 | max_time_steps=3
35 | steps_per_turn=5
36 | turns=2
37 |
38 | # Misc
39 | use_gpu=false
40 |
--------------------------------------------------------------------------------
/lamb/test/test_episodic_char_lstm_d2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | source "$(dirname $0)/start.sh"
19 |
20 | training_file="${base}/test/data/add.txt"
21 | validation_file="${training_file}"
22 | expected_improvement="${expected_improvement:-0.2}"
23 |
24 | word_based=false
25 | episodic=true
26 | conditioning_separator="="
27 | max_time_steps=40
28 |
29 | # Model hyperparameters
30 |
31 | model=lstm
32 | num_layers=2
33 | hidden_size=50
34 | num_eval_samples=2
35 |
36 | # Optimization hyperparameters
37 |
38 | learning_rate=0.01
39 |
40 | # Run
41 | source "$(dirname $0)/finish.sh"
42 |
--------------------------------------------------------------------------------
/lamb/test/test_load_optimizer_state.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | set -x
19 |
20 | source "$(dirname $0)/start.sh"
21 |
22 | # Model hyperparameters
23 |
24 | model=lstm
25 | num_layers=1
26 | hidden_size=17
27 | output_embedding_size=15
28 |
29 | # Optimization hyperparameters
30 |
31 | learning_rate=0.2
32 | early_stopping_turns=-1
33 |
34 | # Run
35 | source "$(dirname $0)/finish.sh"
36 | previous_xe=$last_xe
37 |
38 | # Load checkpoint and check that validation XE is the same.
39 | load_checkpoint="${_experiment_dir}/best"
40 | optimizer_type="sgd"
41 | # Loading the checkpoint would fail if a different optimizer's state were loaded.
42 | load_optimizer_state=false
43 | turns=0
44 | expected_improvement=0.0
45 | source "$(dirname $0)/finish.sh"
46 |
47 | if [ "$previous_xe" != "$last_xe" ]; then
48 | echo "XE was $previous_xe, after reloading checkpoint it became $last_xe."
49 | exit 1
50 | fi
51 |
--------------------------------------------------------------------------------
/lamb/test/test_save_v1.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | set -x
19 |
20 | source "$(dirname $0)/start.sh"
21 |
22 | hps_proto_file="$(dirname $0)/data/save_v1/config"
23 | flags_as_dict="$(dirname $0)/data/save_v1/args"
24 |
25 | # Run
26 | source "$(dirname $0)/finish.sh"
27 |
--------------------------------------------------------------------------------
/lamb/test/test_simple_lstm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | set -x
19 |
20 | source "$(dirname $0)/start.sh"
21 |
22 | # Model hyperparameters
23 |
24 | model=lstm
25 | num_layers=2
26 | hidden_size=17,13
27 | output_embedding_size=11
28 | lstm_skip_connection=false
29 |
30 | # Optimization hyperparameters
31 |
32 | learning_rate=0.2
33 | early_stopping_turns=-1
34 |
35 | # Run
36 | source "$(dirname $0)/finish.sh"
37 | previous_xe=$last_xe
38 |
39 | # Load checkpoint and check that validation XE is the same.
40 | load_checkpoint="${_experiment_dir}/last"
41 | turns=0
42 | expected_improvement=0.0
43 | source "$(dirname $0)/finish.sh"
44 |
45 | if [ "$previous_xe" != "$last_xe" ]; then
46 | echo "XE was $previous_xe, after reloading checkpoint it became $last_xe."
47 | exit 1
48 | fi
49 |
--------------------------------------------------------------------------------
/lamb/test/test_sparse_rhn.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 |
17 |
18 | source "$(dirname $0)/start.sh"
19 |
20 | # Model hyperparameters
21 |
22 | model=rhn
23 | num_layers=2
24 | hidden_size=17
25 | output_embedding_size=15
26 | sparsity_ratio=0.5
27 |
28 | # Optimization hyperparameters
29 |
30 | expected_improvement=0.3
31 | learning_rate=0.2
32 | steps_per_turn=20
33 |
34 | # Run
35 | source "$(dirname $0)/finish.sh"
36 |
--------------------------------------------------------------------------------
/lamb/vocab.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | """Vocabulary."""
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 | from six.moves import range
22 |
23 |
24 | class Vocab(object):
25 | """Immutable reversible mappings from strings to integers."""
26 |
27 |   def __init__(self, tokens, unk=u'<unk>', eos=u'\u25bc'):
28 | """Create a Vocab object that maps `tokens` to dense indices."""
29 | self._token_to_index = {}
30 | self._token_to_frequency = {}
31 | self._unk = unk
32 | self._eos = eos
33 | token_to_index = self._token_to_index
34 | token_to_frequency = self._token_to_frequency
35 |     # Collect the unique tokens from `tokens`, which may be a generator.
36 | for token in tokens:
37 | token_to_index[token] = True
38 | token_to_frequency[token] = token_to_frequency.get(token, 0) + 1
39 | token_to_index[unk] = True
40 | token_to_index[eos] = True
41 |     # Now that we have the deduplicated set of tokens, assign indices in
42 |     # sorted order so that the encoding is deterministic.
43 | self._index_to_token = [None] * len(token_to_index)
44 | index_to_token = self._index_to_token
45 | i = 0
46 |     for token in sorted(token_to_index):
47 | token_to_index[token] = i
48 | index_to_token[i] = token
49 | i += 1
50 |
51 | def unk_index(self):
52 | """Returns the index of the unknown token."""
53 | return self._token_to_index[self._unk]
54 |
55 | def eos_index(self):
56 | """Returns the index of the end-of-sentence token."""
57 | return self._token_to_index[self._eos]
58 |
59 | def token(self, index_):
60 | """The string whose `index()` is `index_` or an IndexError."""
61 | return self._index_to_token[index_]
62 |
63 | def __iter__(self):
64 | """Iterates over tokens in order of indices."""
65 | for i in range(self.size()):
66 | yield self.token(i)
67 |
68 | def index_or_unk(self, token):
69 | """Find the index assigned to `token`.
70 |
71 | Args:
72 | token: a string.
73 | Returns:
74 | The index of `token` or `unk_index()` if it is not in the vocabulary.
75 | """
76 | if token in self._token_to_index:
77 | return self._token_to_index[token]
78 | else:
79 | return self.unk_index()
80 |
81 | def size(self):
82 | """Returns the number of different tokens in the vocabulary."""
83 | return len(self._index_to_token)
84 |
85 | def decode(self, ids):
86 | """Decode a sequence of `ids` with `token()`."""
87 |     assert all(0 <= x < len(self._index_to_token) for x in ids)
88 | return [self.token(x) for x in ids]
89 |
90 | def encode(self, tokens, add_eos=True):
91 | """Encodes a sentence into a list of token indices.
92 |
93 | Args:
94 | tokens: A list of tokens.
95 | add_eos: Whether to add the end of sentence token.
96 | Returns:
97 | A list of integer token indices where `unk_index()` stands for
98 | tokens not found in the vocabulary.
99 | """
100 | ids = [self.index_or_unk(token) for token in tokens]
101 |
102 | if add_eos:
103 | ids += [self.eos_index()]
104 |
105 | return ids
106 |
107 |   def index_frequency(self, index_):
108 |     """Returns the number of times `token(index_)` occurred in `tokens`."""
109 |     return self._token_to_frequency.get(self.token(index_), 0)
110 |
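
Usage sketch (not part of the repository): a minimal illustration of the
Vocab API above, assuming the `lamb` package is on the Python path so that
`from lamb.vocab import Vocab` resolves; the token strings are invented.

    from lamb.vocab import Vocab

    # Build a vocabulary from a token stream; the unknown token ('<unk>')
    # and the end-of-sentence marker (u'\u25bc') are added automatically.
    vocab = Vocab(['the', 'cat', 'sat', 'the'])
    assert vocab.size() == 5  # 'the', 'cat', 'sat', plus unk and eos

    # Out-of-vocabulary tokens map to unk_index(); encode() appends the
    # end-of-sentence index by default.
    ids = vocab.encode(['the', 'dog'])
    assert ids[1] == vocab.unk_index()
    assert ids[-1] == vocab.eos_index()

    # decode() inverts encode() for in-vocabulary tokens.
    assert vocab.decode(vocab.encode(['the', 'cat'], add_eos=False)) == ['the', 'cat']

    # index_frequency() reports the counts gathered at construction time.
    the_index = vocab.encode(['the'], add_eos=False)[0]
    assert vocab.index_frequency(the_index) == 2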
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | """Setup for pip package."""
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | from setuptools import find_packages
23 | from setuptools import setup
24 |
25 | REQUIRED_PACKAGES = ['absl-py', 'numpy', 'dm-sonnet', 'six']
26 | EXTRA_PACKAGES = {
27 | 'tensorflow': ['tensorflow>=1.15.0', 'tensorflow-probability>=0.4.0'],
28 | 'tensorflow with gpu': ['tensorflow-gpu>=1.8.0',
29 | 'tensorflow-probability-gpu>=0.4.0'],
30 | }
31 |
32 |
33 | setup(
34 | name='lamb',
35 | version='1.0',
36 |     description=('LAnguage Modelling Benchmarks is a framework '
37 |                  'to tune and test TensorFlow LM models.'),
38 | long_description='',
39 | url='http://github.com/deepmind/lamb/',
40 | author='Gabor Melis',
41 | author_email='melisgl@google.com',
42 | # Contained modules and scripts.
43 | packages=find_packages(),
44 | install_requires=REQUIRED_PACKAGES,
45 | extras_require=EXTRA_PACKAGES,
46 | zip_safe=False,
47 | license='Apache 2.0',
48 | classifiers=[
49 | 'Development Status :: 5 - Production/Stable',
50 | 'Intended Audience :: Developers',
51 | 'Intended Audience :: Education',
52 | 'Intended Audience :: Science/Research',
53 | 'License :: OSI Approved :: Apache Software License',
54 | 'Operating System :: MacOS :: MacOS X',
55 | 'Operating System :: Microsoft :: Windows',
56 | 'Operating System :: POSIX',
57 | 'Operating System :: Unix',
58 | 'Programming Language :: Python :: 2.7',
59 | 'Programming Language :: Python :: 3.4',
60 | 'Programming Language :: Python :: 3.5',
61 | 'Programming Language :: Python :: 3.6',
62 | 'Topic :: Scientific/Engineering :: Artificial Intelligence',
63 | 'Topic :: Software Development :: Libraries',
64 | ],
65 | keywords='lamb tensorflow language modelling machine learning',
66 | )
67 |
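
Install sketch (not part of the repository): given the metadata above, a
source checkout would typically be installed with `pip install .` from the
repository root, or with `pip install '.[tensorflow]'` to also pull in the
optional TensorFlow dependencies declared in EXTRA_PACKAGES. These commands
assume a standard setuptools workflow; they are not documented by the
package itself.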
--------------------------------------------------------------------------------