├── README.md
├── Radam.py
├── lookahead.py
└── ranger.py

/README.md:
--------------------------------------------------------------------------------
# Ranger-tensorflow
RAdam + Lookahead optimizer implemented in TensorFlow.

# Usage
```python
import tensorflow as tf
from ranger import Ranger

num_sample = 1000
global_step = tf.train.get_or_create_global_step()

lr = tf.train.exponential_decay(1e-8, global_step, num_sample * 10, 0.9, staircase=True)

optimize = Ranger(learning_rate=lr, beta1=0.90, epsilon=1e-8)
```
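
A more complete end-to-end sketch is shown below, assuming a toy linear-regression model. The model, the placeholder names (`x`, `y`), the loss, and the learning-rate schedule are illustrative only and are not part of this repository; `Ranger.minimize` is the standard method inherited from `tf.train.Optimizer`.

```python
import tensorflow as tf
from ranger import Ranger

# Toy model: everything below is illustrative, not part of this repo.
x = tf.placeholder(tf.float32, shape=[None, 1])
y = tf.placeholder(tf.float32, shape=[None, 1])
w = tf.get_variable("w", shape=[1, 1])
b = tf.get_variable("b", shape=[1])
pred = tf.matmul(x, w) + b
loss = tf.reduce_mean(tf.square(pred - y))

global_step = tf.train.get_or_create_global_step()
lr = tf.train.exponential_decay(1e-3, global_step, 1000, 0.9, staircase=True)

optimizer = Ranger(learning_rate=lr, alpha=0.5, k=6.0,
                   beta1=0.9, beta2=0.999, epsilon=1e-8)
train_op = optimizer.minimize(loss, global_step=global_step)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Feed values would come from your own data pipeline, e.g.:
    # sess.run(train_op, feed_dict={x: batch_x, y: batch_y})
```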

--------------------------------------------------------------------------------
/Radam.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow.python.eager import context
from tensorflow.python.framework import ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.training import optimizer


class RAdamOptimizer(optimizer.Optimizer):
    """
    RAdam optimizer: "On the Variance of the Adaptive Learning Rate and Beyond"
    https://arxiv.org/abs/1908.03265
    """

    def __init__(self,
                 learning_rate=0.001,
                 beta1=0.9,
                 beta2=0.999,
                 epsilon=1e-8,
                 weight_decay=0.,
                 use_locking=False,
                 name="RAdam"):
        super(RAdamOptimizer, self).__init__(use_locking, name)
        self._lr = learning_rate
        self._beta1 = beta1
        self._beta2 = beta2
        self._epsilon = epsilon
        self._weight_decay = weight_decay

        self._lr_t = None
        self._step_t = None
        self._beta1_t = None
        self._beta2_t = None
        self._epsilon_t = None
        self._weight_decay_t = None

    def _get_beta_accumulators(self):
        with ops.init_scope():
            if context.executing_eagerly():
                graph = None
            else:
                graph = ops.get_default_graph()
            return (self._get_non_slot_variable("step", graph=graph),
                    self._get_non_slot_variable("beta1_power", graph=graph),
                    self._get_non_slot_variable("beta2_power", graph=graph))

    def _create_slots(self, var_list):
        # Non-slot accumulators shared by all variables: the step counter and
        # the running powers of beta1 / beta2 used for bias correction.
        first_var = min(var_list, key=lambda x: x.name)
        self._create_non_slot_variable(initial_value=1.0, name="step", colocate_with=first_var)
        self._create_non_slot_variable(initial_value=self._beta1, name="beta1_power", colocate_with=first_var)
        self._create_non_slot_variable(initial_value=self._beta2, name="beta2_power", colocate_with=first_var)

        for v in var_list:
            self._zeros_slot(v, "m", self._name)
            self._zeros_slot(v, "v", self._name)

    def _prepare(self):
        lr = self._call_if_callable(self._lr)
        beta1 = self._call_if_callable(self._beta1)
        beta2 = self._call_if_callable(self._beta2)
        epsilon = self._call_if_callable(self._epsilon)
        weight_decay = self._call_if_callable(self._weight_decay)

        self._lr_t = ops.convert_to_tensor(lr, name="learning_rate")
        self._beta1_t = ops.convert_to_tensor(beta1, name="beta1")
        self._beta2_t = ops.convert_to_tensor(beta2, name="beta2")
        self._epsilon_t = ops.convert_to_tensor(epsilon, name="epsilon")
        self._weight_decay_t = ops.convert_to_tensor(weight_decay, name="weight_decay")

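    # The dense update below follows the rectified Adam scheme: sma_inf is the
    # maximum length of the approximated simple moving average (SMA),
    #
    #     sma_inf = 2 / (1 - beta2) - 1
    #     sma_t   = sma_inf - 2 * t * beta2^t / (1 - beta2^t)
    #
    # and r_t rescales the adaptive step. While sma_t is small (here: below 5),
    # the variance of the adaptive learning rate is not considered tractable
    # and the update falls back to plain bias-corrected momentum.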
    def _apply_dense(self, grad, var):
        return self._resource_apply_dense(grad, var)

    def _resource_apply_dense(self, grad, var):
        step, beta1_power, beta2_power = self._get_beta_accumulators()
        beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
        beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)

        beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
        beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

        sma_inf = (2.0 / (1.0 - beta2_t)) - 1.0
        sma_t = sma_inf - (2.0 * step * beta2_power / (1.0 - beta2_power))

        # Exponential moving averages of the gradient and the squared gradient,
        # with the usual Adam bias correction.
        m = self.get_slot(var, "m")
        m_t = state_ops.assign(m, beta1_t * m + (1.0 - beta1_t) * grad, use_locking=self._use_locking)
        mhat_t = m_t / (1.0 - beta1_power)

        v = self.get_slot(var, "v")
        v_t = state_ops.assign(v, beta2_t * v + (1.0 - beta2_t) * math_ops.square(grad), use_locking=self._use_locking)
        # epsilon is added outside the square root so the denominator is never zero.
        vhat_t = math_ops.sqrt(v_t / (1.0 - beta2_power)) + epsilon_t

        # Variance rectification term.
        r_t = math_ops.sqrt(((sma_t - 4.0) * (sma_t - 2.0) * sma_inf) /
                            ((sma_inf - 4.0) * (sma_inf - 2.0) * sma_t))

        # Rectified adaptive step once the variance is tractable, otherwise an
        # un-adapted (momentum-only) step.
        var_t = tf.cond(sma_t >= 5.0, lambda: r_t * mhat_t / vhat_t, lambda: mhat_t)

        if self._weight_decay > 0.0:
            var_t += math_ops.cast(self._weight_decay_t, var.dtype.base_dtype) * var

        var_update = state_ops.assign_sub(var, lr_t * var_t, use_locking=self._use_locking)

        updates = [var_update, m_t, v_t]
        return control_flow_ops.group(*updates)

    def _apply_sparse_shared(self, grad, var, indices, scatter_add):
        step, beta1_power, beta2_power = self._get_beta_accumulators()
        beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
        beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)

        beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
        beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

        sma_inf = (2.0 / (1.0 - beta2_t)) - 1.0
        sma_t = sma_inf - 2.0 * step * beta2_power / (1.0 - beta2_power)

        # m and v are decayed in place and then updated only at `indices`.
        m = self.get_slot(var, "m")
        m_scaled_g_values = grad * (1 - beta1_t)
        m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
        with ops.control_dependencies([m_t]):
            m_t = scatter_add(m, indices, m_scaled_g_values)
        mhat_t = m_t / (1.0 - beta1_power)

        v = self.get_slot(var, "v")
        v_scaled_g_values = (grad * grad) * (1 - beta2_t)
        v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
        with ops.control_dependencies([v_t]):
            v_t = scatter_add(v, indices, v_scaled_g_values)
        vhat_t = math_ops.sqrt(v_t / (1.0 - beta2_power)) + epsilon_t

        r_t = math_ops.sqrt(((sma_t - 4.0) * (sma_t - 2.0) * sma_inf) /
                            ((sma_inf - 4.0) * (sma_inf - 2.0) * sma_t))

        var_t = tf.cond(sma_t >= 5.0, lambda: r_t * mhat_t / vhat_t, lambda: mhat_t)

        if self._weight_decay > 0.0:
            var_t += math_ops.cast(self._weight_decay_t, var.dtype.base_dtype) * var

        var_update = state_ops.assign_sub(var, lr_t * var_t, use_locking=self._use_locking)

        updates = [var_update, m_t, v_t]
        return control_flow_ops.group(*updates)

    def _apply_sparse(self, grad, var):
        return self._apply_sparse_shared(
            grad.values,
            var,
            grad.indices,
            lambda x, i, v: state_ops.scatter_add(x, i, v, use_locking=self._use_locking))

    def _resource_scatter_add(self, x, i, v):
        with ops.control_dependencies([resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
            return x.value()

    def _resource_apply_sparse(self, grad, var, indices):
        return self._apply_sparse_shared(grad, var, indices, self._resource_scatter_add)

    def _finish(self, update_ops, name_scope):
        with ops.control_dependencies(update_ops):
            step, beta1_power, beta2_power = self._get_beta_accumulators()
            with ops.colocate_with(beta1_power):
                update_step = step.assign(step + 1.0, use_locking=self._use_locking)
                update_beta1 = beta1_power.assign(beta1_power * self._beta1_t, use_locking=self._use_locking)
                update_beta2 = beta2_power.assign(beta2_power * self._beta2_t, use_locking=self._use_locking)
        return control_flow_ops.group(*update_ops + [update_step, update_beta1, update_beta2], name=name_scope)

--------------------------------------------------------------------------------
/lookahead.py:
--------------------------------------------------------------------------------
"""
Lookahead optimizer implementation based on https://arxiv.org/abs/1907.08610
"""
from tensorflow.python.eager import context
import tensorflow as tf


class LookaheadOptimizer(tf.train.Optimizer):
    """
    Lookahead optimizer compatible with other TensorFlow optimizers.

    This optimizer wraps a user-supplied `fast` optimizer, lets it run for k
    steps, and then updates the weights by linearly interpolating from the
    slow weights saved k steps ago toward the point the fast optimizer has
    reached.
    """

    def __init__(self,
                 fast_optimizer,
                 k=5,
                 alpha=0.5,
                 use_locking=False,
                 name='Lookahead'):
        super().__init__(use_locking, name)
        self._fast_opt = fast_optimizer
        self._k_constant = k
        self._alpha = alpha

        # Tensors
        self._k_t = None
        self._alpha_t = None

    def _get_step(self):
        with tf.init_scope():
            if context.executing_eagerly():
                graph = None
            else:
                graph = tf.get_default_graph()
            return self._get_non_slot_variable('step', graph=graph)

    def _prepare(self):
        # pylint: disable=protected-access
        self._k_t = tf.convert_to_tensor(self._k_constant,
                                         name='k_t',
                                         dtype=tf.int32)
        self._alpha_t = tf.convert_to_tensor(self._alpha, name='alpha_t')
        self._fast_opt._prepare()

    def _create_slots(self, var_list):
        """Create slots for each trainable variable in the graph.

        Slots make sure that a slot variable is allocated on the device
        closest to the one holding the corresponding variable.
        """
        # pylint: disable=protected-access
        # Make a copy of each variable to store the `slow` weights.
        for var in var_list:
            self._get_or_make_slot(var, var, 'slow', self._name)

        first_var = min(var_list, key=lambda x: x.name)
        self._create_non_slot_variable(
            initial_value=0, name='step', colocate_with=first_var)
        self._fast_opt._create_slots(var_list)

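    # Per-variable update implemented in `_apply_dense`: on most steps the
    # gradient update is delegated to the wrapped fast optimizer; every k-th
    # step (step % k == 0) the slow weights are pulled toward the fast weights,
    #
    #     slow <- slow + alpha * (fast - slow)
    #
    # and the variable itself is reset to the new slow weights.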
    def _apply_dense(self, grad, var):
        # pylint: disable=protected-access
        slow_weight = self.get_slot(var, 'slow')
        step = self._get_step()
        alpha = tf.cast(self._alpha_t, var.dtype.base_dtype)

        # yapf: disable
        update_var = tf.cond(
            tf.equal(tf.floormod(step, self._k_t), 0),
            lambda: tf.group(
                tf.assign(var,
                          tf.assign_add(
                              slow_weight, (var - slow_weight) * alpha))),
            lambda: self._fast_opt._apply_dense(grad, var))
        # yapf: enable

        return tf.group(update_var)

    def _apply_sparse(self, grad, var):
        raise NotImplementedError('Sparse gradients are not supported by the '
                                  'Lookahead optimizer.')

    def _finish(self, update_ops, name_scope):
        with tf.control_dependencies(update_ops):
            step = self._get_step()
            update_step = tf.assign_add(step, 1, use_locking=self._use_locking)

        return tf.group(*update_ops + [update_step], name=name_scope)
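
# A minimal usage sketch (illustrative only: `loss` and `global_step` are
# assumed to be defined by the surrounding training script):
#
#     fast_opt = tf.train.AdamOptimizer(learning_rate=1e-3)
#     optimizer = LookaheadOptimizer(fast_opt, k=5, alpha=0.5)
#     train_op = optimizer.minimize(loss, global_step=global_step)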

--------------------------------------------------------------------------------
/ranger.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow.python.eager import context
from tensorflow.python.framework import ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.training import optimizer


class Ranger(optimizer.Optimizer):
    """RAdam combined with Lookahead: the RAdam update is applied every step,
    and every k steps the variables are additionally pulled toward their
    `slow` copies by a factor of alpha."""

    def __init__(self,
                 learning_rate=0.001,
                 alpha=0.5,
                 k=6.0,
                 N_sma_threshhold=5,
                 beta1=0.9,
                 beta2=0.999,
                 epsilon=1e-8,
                 weight_decay=0.,
                 use_locking=False,
                 name="Ranger"):
        super(Ranger, self).__init__(use_locking, name)
        self._lr = learning_rate
        self._beta1 = beta1
        self._beta2 = beta2
        self._epsilon = epsilon
        self._weight_decay = weight_decay
        self._alpha = alpha
        self._k = k
        self._sma_threshold = N_sma_threshhold

        self._lr_t = None
        self._step_t = None
        self._beta1_t = None
        self._beta2_t = None
        self._epsilon_t = None
        self._weight_decay_t = None

        self._alpha_t = None
        self._k_t = None

    def _get_beta_accumulators(self):
        with ops.init_scope():
            if context.executing_eagerly():
                graph = None
            else:
                graph = ops.get_default_graph()
            return (self._get_non_slot_variable("step", graph=graph),
                    self._get_non_slot_variable("beta1_power", graph=graph),
                    self._get_non_slot_variable("beta2_power", graph=graph))

    def _create_slots(self, var_list):
        first_var = min(var_list, key=lambda x: x.name)
        self._create_non_slot_variable(initial_value=0.0, name="step", colocate_with=first_var)
        self._create_non_slot_variable(initial_value=self._beta1, name="beta1_power", colocate_with=first_var)
        self._create_non_slot_variable(initial_value=self._beta2, name="beta2_power", colocate_with=first_var)

        for v in var_list:
            self._zeros_slot(v, "m", self._name)
            self._zeros_slot(v, "v", self._name)
            # `slow` holds the Lookahead copy of the variable.
            self._zeros_slot(v, "slow", self._name)

    def _prepare(self):
        lr = self._call_if_callable(self._lr)
        beta1 = self._call_if_callable(self._beta1)
        beta2 = self._call_if_callable(self._beta2)
        epsilon = self._call_if_callable(self._epsilon)
        weight_decay = self._call_if_callable(self._weight_decay)

        k = self._call_if_callable(self._k)
        alpha = self._call_if_callable(self._alpha)

        self._lr_t = ops.convert_to_tensor(lr, name="learning_rate")
        self._beta1_t = ops.convert_to_tensor(beta1, name="beta1")
        self._beta2_t = ops.convert_to_tensor(beta2, name="beta2")
        self._epsilon_t = ops.convert_to_tensor(epsilon, name="epsilon")
        self._weight_decay_t = ops.convert_to_tensor(weight_decay, name="weight_decay")

        self._k_t = ops.convert_to_tensor(k, name="k")
        self._alpha_t = ops.convert_to_tensor(alpha, name="alpha")

    def _apply_dense(self, grad, var):
        return self._resource_apply_dense(grad, var)

    def _resource_apply_dense(self, grad, var):
        slow_weight = self.get_slot(var, 'slow')
        alpha = math_ops.cast(self._alpha_t, var.dtype.base_dtype)

        step, beta1_power, beta2_power = self._get_beta_accumulators()
        beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
        beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)

        beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
        beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

        # RAdam: approximated SMA length and variance rectification.
        sma_inf = 2.0 / (1.0 - beta2_t) - 1.0
        sma_t = sma_inf - 2.0 * step * beta2_power / (1.0 - beta2_power)

        m = self.get_slot(var, "m")
        m_t = state_ops.assign(m, beta1_t * m + (1.0 - beta1_t) * grad, use_locking=self._use_locking)
        mhat_t = m_t / (1.0 - beta1_power)

        v = self.get_slot(var, "v")
        v_t = state_ops.assign(v, beta2_t * v + (1.0 - beta2_t) * math_ops.square(grad), use_locking=self._use_locking)
        # epsilon is added outside the square root so the denominator is never zero.
        vhat_t = math_ops.sqrt(v_t / (1.0 - beta2_power)) + epsilon_t

        r_t = math_ops.sqrt(((sma_t - 4.0) * (sma_t - 2.0) * sma_inf) /
                            ((sma_inf - 4.0) * (sma_inf - 2.0) * sma_t))

        var_t = tf.cond(sma_t >= self._sma_threshold, lambda: r_t * mhat_t / vhat_t, lambda: mhat_t)

        if self._weight_decay > 0.0:
            var_t += math_ops.cast(self._weight_decay_t, var.dtype.base_dtype) * var

        var_update = state_ops.assign_sub(var, lr_t * var_t, use_locking=self._use_locking)

        # Lookahead: every k steps, move the slow weights toward the fast
        # weights and reset the variable to the interpolated value.
        var_update = tf.cond(
            math_ops.equal(math_ops.floor_mod(step, self._k_t), 0),
            lambda: state_ops.assign(var_update, state_ops.assign_add(slow_weight, (var_update - slow_weight) * alpha)),
            lambda: var_update)

        updates = [var_update, m_t, v_t]
        return control_flow_ops.group(*updates)

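    # The sparse path mirrors `_resource_apply_dense`; the difference is that
    # m and v are first decayed in place and then updated with `scatter_add`
    # at the gradient indices.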
    def _apply_sparse_shared(self, grad, var, indices, scatter_add):
        slow_weight = self.get_slot(var, 'slow')
        alpha = math_ops.cast(self._alpha_t, var.dtype.base_dtype)

        step, beta1_power, beta2_power = self._get_beta_accumulators()
        beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
        beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)

        beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
        beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

        sma_inf = 2.0 / (1.0 - beta2_t) - 1.0
        sma_t = sma_inf - 2.0 * step * beta2_power / (1.0 - beta2_power)

        m = self.get_slot(var, "m")
        m_scaled_g_values = grad * (1 - beta1_t)
        m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
        with ops.control_dependencies([m_t]):
            m_t = scatter_add(m, indices, m_scaled_g_values)
        mhat_t = m_t / (1.0 - beta1_power)

        v = self.get_slot(var, "v")
        v_scaled_g_values = (grad * grad) * (1 - beta2_t)
        v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
        with ops.control_dependencies([v_t]):
            v_t = scatter_add(v, indices, v_scaled_g_values)
        vhat_t = math_ops.sqrt(v_t / (1.0 - beta2_power)) + epsilon_t

        r_t = math_ops.sqrt(((sma_t - 4.0) * (sma_t - 2.0) * sma_inf) /
                            ((sma_inf - 4.0) * (sma_inf - 2.0) * sma_t))

        var_t = tf.cond(sma_t >= self._sma_threshold, lambda: r_t * mhat_t / vhat_t, lambda: mhat_t)

        if self._weight_decay > 0.0:
            var_t += math_ops.cast(self._weight_decay_t, var.dtype.base_dtype) * var

        var_update = state_ops.assign_sub(var, lr_t * var_t, use_locking=self._use_locking)

        # Lookahead interpolation, as in the dense path.
        var_update = tf.cond(
            math_ops.equal(math_ops.floor_mod(step, self._k_t), 0),
            lambda: state_ops.assign(var_update, state_ops.assign_add(slow_weight, (var_update - slow_weight) * alpha)),
            lambda: var_update)

        updates = [var_update, m_t, v_t]
        return control_flow_ops.group(*updates)

    def _apply_sparse(self, grad, var):
        return self._apply_sparse_shared(
            grad.values,
            var,
            grad.indices,
            lambda x, i, v: state_ops.scatter_add(x, i, v, use_locking=self._use_locking))

    def _resource_scatter_add(self, x, i, v):
        with ops.control_dependencies([resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
            return x.value()

    def _resource_apply_sparse(self, grad, var, indices):
        return self._apply_sparse_shared(grad, var, indices, self._resource_scatter_add)

    def _finish(self, update_ops, name_scope):
        with ops.control_dependencies(update_ops):
            step, beta1_power, beta2_power = self._get_beta_accumulators()
            with ops.colocate_with(beta1_power):
                update_step = step.assign(step + 1.0, use_locking=self._use_locking)
                update_beta1 = beta1_power.assign(beta1_power * self._beta1_t, use_locking=self._use_locking)
                update_beta2 = beta2_power.assign(beta2_power * self._beta2_t, use_locking=self._use_locking)
        return control_flow_ops.group(*update_ops + [update_step, update_beta1, update_beta2], name=name_scope)

--------------------------------------------------------------------------------