├── .DS_Store
├── .gitignore
├── AMSGrad.py
├── LICENSE
├── README.md
└── assests
    ├── .DS_Store
    ├── lr_001_acc.png
    ├── lr_001_loss.png
    ├── lr_01_acc.png
    └── lr_01_loss.png

/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/taki0112/AMSGrad-Tensorflow/e0742a09256fdcca50aa98b9b8670f638925923b/.DS_Store
--------------------------------------------------------------------------------

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------

/AMSGrad.py:
--------------------------------------------------------------------------------
"""AMSGrad for TensorFlow."""

from tensorflow.python.eager import context
from tensorflow.python.framework import ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.training import optimizer


class AMSGrad(optimizer.Optimizer):
    def __init__(self, learning_rate=0.01, beta1=0.9, beta2=0.99, epsilon=1e-8, use_locking=False, name="AMSGrad"):
        super(AMSGrad, self).__init__(use_locking, name)
        self._lr = learning_rate
        self._beta1 = beta1
        self._beta2 = beta2
        self._epsilon = epsilon

        self._lr_t = None
        self._beta1_t = None
        self._beta2_t = None
        self._epsilon_t = None

        self._beta1_power = None
        self._beta2_power = None

    def _create_slots(self, var_list):
        first_var = min(var_list, key=lambda x: x.name)

        create_new = self._beta1_power is None
        if not create_new and context.in_graph_mode():
            create_new = (self._beta1_power.graph is not first_var.graph)

        if create_new:
            with ops.colocate_with(first_var):
                self._beta1_power = variable_scope.variable(self._beta1, name="beta1_power", trainable=False)
                self._beta2_power = variable_scope.variable(self._beta2, name="beta2_power", trainable=False)
        # Create slots for the first and second moments.
        for v in var_list:
            self._zeros_slot(v, "m", self._name)
            self._zeros_slot(v, "v", self._name)
            self._zeros_slot(v, "vhat", self._name)

    def _prepare(self):
        self._lr_t = ops.convert_to_tensor(self._lr)
        self._beta1_t = ops.convert_to_tensor(self._beta1)
        self._beta2_t = ops.convert_to_tensor(self._beta2)
        self._epsilon_t = ops.convert_to_tensor(self._epsilon)

    def _apply_dense(self, grad, var):
        beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
        beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
        beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
        beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

        lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))

        # m_t = beta1 * m + (1 - beta1) * g_t
        m = self.get_slot(var, "m")
        m_scaled_g_values = grad * (1 - beta1_t)
        m_t = state_ops.assign(m, beta1_t * m + m_scaled_g_values, use_locking=self._use_locking)

        # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
        v = self.get_slot(var, "v")
        v_scaled_g_values = (grad * grad) * (1 - beta2_t)
        v_t = state_ops.assign(v, beta2_t * v + v_scaled_g_values, use_locking=self._use_locking)

        # amsgrad
        vhat = self.get_slot(var, "vhat")
        vhat_t = state_ops.assign(vhat, math_ops.maximum(v_t, vhat))
        v_sqrt = math_ops.sqrt(vhat_t)

        var_update = state_ops.assign_sub(var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
        return control_flow_ops.group(*[var_update, m_t, v_t, vhat_t])

    def _resource_apply_dense(self, grad, var):
        var = var.handle
        beta1_power = math_ops.cast(self._beta1_power, grad.dtype.base_dtype)
        beta2_power = math_ops.cast(self._beta2_power, grad.dtype.base_dtype)
        lr_t = math_ops.cast(self._lr_t, grad.dtype.base_dtype)
        beta1_t = math_ops.cast(self._beta1_t, grad.dtype.base_dtype)
        beta2_t = math_ops.cast(self._beta2_t, grad.dtype.base_dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, grad.dtype.base_dtype)

        lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))

        # m_t = beta1 * m + (1 - beta1) * g_t
        m = self.get_slot(var, "m").handle
        m_scaled_g_values = grad * (1 - beta1_t)
        m_t = state_ops.assign(m, beta1_t * m + m_scaled_g_values, use_locking=self._use_locking)

        # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
        v = self.get_slot(var, "v").handle
        v_scaled_g_values = (grad * grad) * (1 - beta2_t)
        v_t = state_ops.assign(v, beta2_t * v + v_scaled_g_values, use_locking=self._use_locking)

        # amsgrad
        vhat = self.get_slot(var, "vhat").handle
        vhat_t = state_ops.assign(vhat, math_ops.maximum(v_t, vhat))
        v_sqrt = math_ops.sqrt(vhat_t)

        var_update = state_ops.assign_sub(var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
        return control_flow_ops.group(*[var_update, m_t, v_t, vhat_t])

    def _apply_sparse_shared(self, grad, var, indices, scatter_add):
        beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
        beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
        beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
        beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

        lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))

        # m_t = beta1 * m + (1 - beta1) * g_t
        m = self.get_slot(var, "m")
        m_scaled_g_values = grad * (1 - beta1_t)
        m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
        with ops.control_dependencies([m_t]):
            m_t = scatter_add(m, indices, m_scaled_g_values)

        # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
        v = self.get_slot(var, "v")
        v_scaled_g_values = (grad * grad) * (1 - beta2_t)
        v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
        with ops.control_dependencies([v_t]):
            v_t = scatter_add(v, indices, v_scaled_g_values)

        # amsgrad
        vhat = self.get_slot(var, "vhat")
        vhat_t = state_ops.assign(vhat, math_ops.maximum(v_t, vhat))
        v_sqrt = math_ops.sqrt(vhat_t)
        var_update = state_ops.assign_sub(var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
        return control_flow_ops.group(*[var_update, m_t, v_t, vhat_t])

    def _apply_sparse(self, grad, var):
        return self._apply_sparse_shared(
            grad.values, var, grad.indices,
            lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
                x, i, v, use_locking=self._use_locking))

    def _resource_scatter_add(self, x, i, v):
        with ops.control_dependencies(
                [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
            return x.value()

    def _resource_apply_sparse(self, grad, var, indices):
        return self._apply_sparse_shared(
            grad, var, indices, self._resource_scatter_add)

    def _finish(self, update_ops, name_scope):
        # Update the power accumulators.
        with ops.control_dependencies(update_ops):
            with ops.colocate_with(self._beta1_power):
                update_beta1 = self._beta1_power.assign(
                    self._beta1_power * self._beta1_t,
                    use_locking=self._use_locking)
                update_beta2 = self._beta2_power.assign(
                    self._beta2_power * self._beta2_t,
                    use_locking=self._use_locking)
        return control_flow_ops.group(*update_ops + [update_beta1, update_beta2],
                                      name=name_scope)
--------------------------------------------------------------------------------
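
For reference, the per-parameter update that `AMSGrad.py` implements can be sketched in plain NumPy as follows. This is only an illustrative reimplementation of the dense update path, not part of the repository; the names `m`, `v`, and `vhat` mirror the optimizer's slot names, and the `t`-th-power bias correction follows the same convention as the `beta1_power`/`beta2_power` accumulators above.

```python
import numpy as np

def amsgrad_step(param, grad, m, v, vhat, t,
                 lr=0.01, beta1=0.9, beta2=0.99, epsilon=1e-8):
    """One dense AMSGrad update, mirroring _apply_dense above (illustrative only)."""
    # First and second moment estimates, exactly as in Adam.
    m = beta1 * m + (1 - beta1) * grad
    v = beta2 * v + (1 - beta2) * (grad * grad)
    # AMSGrad modification: keep the element-wise maximum of all past v_t.
    vhat = np.maximum(vhat, v)
    # Bias-corrected step size: lr * sqrt(1 - beta2^t) / (1 - beta1^t).
    step = lr * np.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)
    # Parameter update uses vhat (not v) in the denominator.
    param = param - step * m / (np.sqrt(vhat) + epsilon)
    return param, m, v, vhat
```

The only difference from Adam is the `np.maximum` line: the denominator is monotonically non-decreasing, which is what the `vhat` slot tracks in the TensorFlow code above.
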
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Junho Kim (1993.01.12)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# AMSGrad-Tensorflow
A simple TensorFlow implementation of [On the Convergence of Adam and Beyond](https://openreview.net/pdf?id=ryQu7f-RZ)

## Hyperparameters
* The default hyperparameters are set to the best-performing values from this [experiment](https://fdlm.github.io/post/amsgrad/):
* `learning_rate` = 0.01
* `beta1` = 0.9
* `beta2` = 0.99
* Depending on the network you are using, the default `beta2 = 0.99` may give the best performance.

## Usage
```python
from AMSGrad import AMSGrad

train_op = AMSGrad(learning_rate=0.01, beta1=0.9, beta2=0.99, epsilon=1e-8).minimize(loss)
```

## Network Architecture
```python
x = fully_connected(inputs=images, units=100)
x = relu(x)
logits = fully_connected(inputs=x, units=10)
```

## MNIST Results (30K iterations)
### lr=0.1, beta1=0.9, beta2=various
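
For completeness, a minimal end-to-end sketch of the setup described in the Usage and Network Architecture sections above might look like the following. It assumes TensorFlow 1.x and the old `tensorflow.examples.tutorials.mnist` input pipeline, uses `tf.layers.dense` in place of the `fully_connected` helper shown above, and the batch size is illustrative; the actual training script behind the reported results is not included in this repository.

```python
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

from AMSGrad import AMSGrad

mnist = input_data.read_data_sets("./mnist", one_hot=True)

images = tf.placeholder(tf.float32, [None, 784])
labels = tf.placeholder(tf.float32, [None, 10])

# 784 -> 100 -> 10, matching the architecture sketched above.
x = tf.layers.dense(images, units=100)
x = tf.nn.relu(x)
logits = tf.layers.dense(x, units=10)

loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits))
train_op = AMSGrad(learning_rate=0.01, beta1=0.9, beta2=0.99, epsilon=1e-8).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(30000):
        batch_x, batch_y = mnist.train.next_batch(128)
        sess.run(train_op, feed_dict={images: batch_x, labels: batch_y})
```
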