├── .gitignore ├── DeepMemory ├── DeepMemory-Playground.ipynb ├── README.md └── deepmemory.py ├── LICENSE ├── README.md ├── Ranger └── ranger.py ├── adahessian ├── README.md └── adahessian.py ├── adamod ├── README.md ├── adamod.py └── diffmod.py ├── diffgrad ├── README.md ├── diff_rgrad.py ├── diffgrad-playground.ipynb ├── diffgrad.py └── mxresnet.py ├── diffmod ├── diffmod-playground.ipynb └── diffmod.py ├── images ├── 1120-optimizer-testing.jpg ├── projected_gradient.png ├── ranger-init.jpg └── ranger-with-gc-options.jpg ├── madgrad └── madgrad_wd.py └── sls ├── README.md ├── basic_train.py ├── callback.py ├── sls.py └── sls_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | -------------------------------------------------------------------------------- /DeepMemory/DeepMemory-Playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2\n", 11 | "\n", 12 | "%matplotlib inline" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "from fastai.script import *\n", 22 | "from fastai.vision import *\n", 23 | "from fastai.callbacks import *\n", 24 | "from fastai.distributed import *\n", 25 | "from fastai.callbacks.tracker import *\n", 26 | "\n", 27 | "torch.backends.cudnn.benchmark = True\n", 28 | "\n", 29 | "import time" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "'1.0.57'" 41 | ] 42 | }, 43 | "execution_count": 3, 44 | "metadata": {}, 45 | "output_type": "execute_result" 46 | } 47 | ], 48 | "source": [ 49 | "import fastai;fastai.__version__ #safety check" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 4, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "'1.2.0'" 61 | ] 62 | }, 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "import torch; torch.__version__ #safety check" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 5, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "from deepmemory import DeepMemory" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 6, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "Mish activation loaded...\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "from mxresnet import *" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 7, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "PosixPath('/home/ubuntu/.fastai/data/imagenette-160')" 107 | ] 108 | }, 109 | "execution_count": 7, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "path = untar_data(URLs.IMAGENETTE_160); path #optional - IMAGENETTE" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 8, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "def flattenAnneal(learn:Learner, lr:float, n_epochs:int, start_pct:float):\n", 125 | " n = len(learn.data.train_dl)\n", 126 | " anneal_start = int(n*n_epochs*start_pct)\n", 127 | " anneal_end = int(n*n_epochs) - anneal_start\n", 128 | " phases = [TrainingPhase(anneal_start).schedule_hp('lr', lr),\n", 129 | " TrainingPhase(anneal_end).schedule_hp('lr', lr, anneal=annealing_cos)]\n", 130 | " sched = GeneralScheduler(learn, phases)\n", 131 | " learn.callbacks.append(sched)\n", 132 | " learn.fit(n_epochs)\n" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 9, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "tfms = ([\n", 142 | "\n", 143 | " flip_lr(p=0.5)#,\n", 144 | " #brightness(change=(0.4,0.6)),\n", 145 | " #contrast(scale=(0.7,1.3)),\n", 146 | " #cutout(n_holes=(2,40),length=(5,30),p=.25)\n", 147 | "\n", 148 | " ], [])" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 10, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "bs=64\n", 158 | "size=128" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 11, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "data = (ImageList.from_folder(path)\n", 168 | " .split_by_folder(valid='val')\n", 169 | " .label_from_folder()\n", 170 | " .transform(tfms=tfms,size=size) \n", 171 | " .databunch(bs=bs, num_workers=8) #windows 10 users - num_workers may need to be set to 1 or 0 (if you get pickle fork error)\n", 172 | " .presize(size, scale=(0.5, 1))\n", 173 | " .normalize(imagenet_stats))" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 12, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "data": { 183 | "text/plain": [ 184 | "201" 185 | ] 186 | }, 187 | "execution_count": 12, 188 | "metadata": {}, 189 | "output_type": "execute_result" 190 | } 191 | ], 192 | "source": [ 193 | "memory_size = (len(data.x)//bs);memory_size #should be equal to or close to # of batches per epoch in order to build an average step size for the dataset" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 13, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "optar = partial(DeepMemory,betas=(.95,.999),len_memory = memory_size)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 14, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "model = mxresnet50(sa=1)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 15, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "learn = Learner(data, model, metrics=[accuracy], wd=1e-3,\n", 221 | " opt_func=optar,\n", 222 | " bn_wd=False, true_wd=True,\n", 223 | " loss_func = LabelSmoothingCrossEntropy())" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 16, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "learn.callback_fns += [\n", 233 | " partial(ShowGraph),\n", 234 | " #partial(SaveModelCallback, name='model-novotest-1')\n", 235 | " ]" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 17, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "name": "stdout", 245 | "output_type": "stream", 246 | "text": [ 247 | "DeepMemory: length of memory is 201 - this should be close or equal to batches per epoch\n" 248 | ] 249 | }, 250 | { 251 | "data": { 252 | "text/html": [ 253 | "\n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | "
epochtrain_lossvalid_lossaccuracytime
02.5711962.5479250.43200000:41
12.2281642.1192140.60400000:38
22.0802571.9475100.66600000:37
31.9792832.0592220.63600000:37
41.8872421.7520530.74200000:37
51.8383711.7793050.73200000:37
61.8018791.7866060.72800000:37
71.7281101.7040010.75200000:37
81.6999861.7143470.75800000:37
91.6819881.6300250.78400000:37
101.6258261.7106770.76400000:37
111.5834071.6071640.81400000:37
121.5654651.5776480.80000000:37
131.5653151.5569620.80600000:37
141.5160481.6664690.78000000:38
151.4757361.5417280.82600000:37
161.4399511.4825740.83400000:37
171.4008931.4343330.85800000:38
181.3749151.4118270.86800000:38
191.3305611.4212910.86800000:37
" 406 | ], 407 | "text/plain": [ 408 | "" 409 | ] 410 | }, 411 | "metadata": {}, 412 | "output_type": "display_data" 413 | }, 414 | { 415 | "data": { 416 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXUAAAD4CAYAAAATpHZ6AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nO3deXxc5X3v8c9vFi2jxVosy5I3ycbYxgu2EY4bCCGQBTCB9Iam7itpU5Jb2pD2QnpvE0ialKTkXrqn6c1yIaVJCoWAUxoIhIQmGJOyysY23vdFkmVt1r7OzHP/OGNZNtZiW2dGGn/fr9e8dObMmTM/HUtfP3rOc55jzjlERCQ9BFJdgIiIjB+FuohIGlGoi4ikEYW6iEgaUaiLiKSRkB87nVJY5ObPm+vHrkVE0tLGjRubnHMlF7ofX0J9Wvksqqur/di1iEhaMrPD47EfX7pfNPJdRCQ1/OlTV6qLiKSETydKleoiIqngS5+6Il1EzsXAwAA1NTX09vamuhTfZWVlMXPmTMLhsC/79yXURUTORU1NDXl5eVRUVGBmqS7HN845mpubqampobKy0pfPUJ+6iKRcb28vxcXFaR3oAGZGcXGxr3+RaPSLiEwI6R7oJ/n9feqKUhGRNKKWuohc9FpbW/n2t799zu+76aabaG1t9aGi8+dTn7piXUQmj+FCPRaLjfi+5557joKCAr/KOi8a0igiF7177rmH/fv3s3z5csLhMLm5uZSVlbF582Z27NjBRz7yEY4ePUpvby933XUXd9xxBwAVFRVUV1fT2dnJjTfeyNVXX80rr7zCjBkz+MlPfkJ2dnbSvxcNaRSRCeWrz2xnR137uO7zsvJ8/uLDi4d9/YEHHmDbtm1s3ryZ9evXs2bNGrZt2zY47PDhhx+mqKiInp4errzySj760Y9SXFx82j727t3LY489xkMPPcTHPvYxfvzjH/OJT3xiXL+PsVBLXUTkDKtWrTptHPk3v/lNnnrqKQCOHj3K3r173xHqlZWVLF++HIArrriCQ4cOJa3eofxpqSvVReQ8jdSiTpacnJzB5fXr1/Of//mfvPrqq0QiEa699tqzjjPPzMwcXA4Gg/T09CSl1jNpSKOIXPTy8vLo6Og462ttbW0UFhYSiUTYtWsXr732WpKrOzfqfhGRi15xcTFXXXUVS5YsITs7m9LS0sHXbrjhBr773e+ybNkyFixYwOrVq1NY6ejM+TD8cPaCpe7I7rfHfb8ikp527tzJokWLUl1G0pzt+zWzjc65qgvdtz8XH2mcuohISqhPXUQkjSjURUTSiOZ+ERFJI5pPXUQkjailLiKSRsYU6mZWYGbrzGyXme00s9/wuzARkYksNzcXgLq6Om677bazbnPttddSXV2dzLLGfPHRPwLPO+duM7MMIDLSxk5tdRG5SJSXl7Nu3bpUlzFo1FA3s3zgGuD3AZxz/UD/iG9SpovIJPOFL3yBOXPmcOeddwJw3333YWZs2LCBEydOMDAwwP3338+tt9562vsOHTrEzTffzLZt2+jp6eH2229nx44dLFq0KCXzv4ylpT4XaAT+xcwuBzYCdznnuoZuZGZ3AHcAFJTPHe86ReRi8bN7oH6cr0ifvhRufGDETdauXcvdd989GOpPPPEEzz//PJ/73OfIz8+nqamJ1atXc8sttwx7n9HvfOc7RCIRtm7dytatW1m5cuX4fh9jMJY+9RCwEviOc24F0AXcc+ZGzrkHnXNVzrmqrOyscS5TRMRfK1asoKGhgbq6OrZs2UJhYSFlZWV88YtfZNmyZbz//e+ntraW48ePD7uPDRs2DM6hvmzZMpYtW5as8geNpaVeA9Q4515PPF/HWUJdRGRcjNKi9tNtt93GunXrqK+vZ+3atTz66KM0NjayceNGwuEwFRUVZ512d6jhWvHJMmpL3TlXDxw1swWJVdcDO0Z+zzhUJiKSZGvXruXxxx9n3bp13HbbbbS1tTFt2jTC4TAvvvgihw8fHvH911xzDY8++igA27ZtY+vWrcko+zRjHf3yJ8CjiZEvB4Db/StJRCQ1Fi9eTEdHBzNmzKCsrIyPf/zjfPjDH6aqqorly5ezcOHCEd//mc98httvv51ly5axfPlyVq1alaTKT/Fl6t3p8xa7+v3bx32/IpKeNPXuRJ96V2MaRURSQnO/iIikEc39IiITwsVycx2/v0/Npy4iKZeVlUVzc3PaB7tzjubmZrKy/LuWx58bT6f3v4uIjLOZM2dSU1NDY2NjqkvxXVZWFjNnzvRt/76EujpgRORchMNhKisrU11GWvDpxtN+7FVEREajE6UiImlEJ0pFRNKIul9ERNKITy11pbqISCqoT11EJI2o+0VEJI2opS4ikkY0oZeISBrR1LsiImlE49RFRNKITpSKiKQRnSgVEUkj6n4REUkjPnW/qK0uIpIKaqmLiKQR9amLiKSRMd35yMwOAR1ADIg656pG2l69LyIiqXEut7N7n3OuybdKRETkgulEqYhIGhlrqDvgF2a20czuONsGZnaHmVWbWbUiXUQkNcba/XKVc67OzKYBL5jZLufchqEbOOceBB4EyCybr1wXEUmBMbXUnXN1ia8NwFPAqtHeE48r10VEkm3UUDezHDPLO7kMfBDYNtr7ogp1EZGkG0v3SynwlJmd3P7fnHPPj/amuE6Wiogk3aih7pw7AFx+rjtWS11EJPl8myYgplAXEUk6hbqISBpRqIuIpBGFuohIGvEv1DX6RUQk6fwL9ZhCXUQk2dRSFxFJIz72qcf92rWIiAzDx1D3a88iIjIc30I9qpa6iEjS+RbqynQRkeRTS11EJI3411LX6BcRkaTzr6WuceoiIkmnceoiImlEc7+IiKQRhbqISBpRqIuIpBGFuohIGvFxnLpCXUQk2XwL9f6oLj4SEUk230K9Lxrza9ciIjKMMYe6mQXN7C0z++lYtu8dUEtdRCTZzqWlfhewc6wbq6UuIpJ8Ywp1M5sJrAG+N9Yd96mlLiKSdGNtqX8D+DwwbFKb2R1mVm1m1QC9aqmLiCTdqKFuZjcDDc65jSNt55x70DlX5ZyrCpippS4ikgJjaalfBdxiZoeAx4HrzOyRkd5gppa6iEgqjBrqzrl7nXMznXMVwFrgV865T4y4U7XURURSwpdx6mbQp4uPRESSLnQuGzvn1gPrR9sugNE7oO4XEZFkU0tdRCSN+BLqATN6+tVSFxFJNl9CPRgwOvqifuxaRERG4E9LPQCdfQN+7FpEREbgT0vdjM5etdRFRJLNp5a60dkXxTndKENEJJl8a6kPxJxGwIiIJJlvLXWATp0sFRFJKp9a6t7XDvWri4gklb8tdYW6iEhS+danDtChYY0iIknla0td3S8iIsnlS6iHEqHe3Nnvx+5FRGQYPoW6t9vGjj4/di8iIsPwbZbGwkiYxs5eP3YvIiLD8CXUAablZdHQrpa6iEgy+RbqJXmZNHYq1EVEksnfUFefuohIUvnY/ZJJQ0efJvUSEUkiX1vq/dE47RqrLiKSNL6GOkBjh0bAiIgki++h3qB+dRGRpPG1Tx10AZKISDKNGupmlmVmb5jZFjPbbmZfHcuOS/KyAIW6iEgyhcawTR9wnXOu08zCwK/N7GfOuddGelN+VoiMUEChLiKSRKOGuvPGJHYmnoYTj1HHKZoZpfmZHD3RfWEViojImI2pT93Mgma2GWgAXnDOvX6Wbe4ws2ozq25sbARgcdkUdh7rGNeCRURkeGMKdedczDm3HJgJrDKzJWfZ5kHnXJVzrqqkpASAS6fncbi5i96B2LgWLSIiZ3dOo1+cc63AeuCGsWy/oDSPuIM9x9VaFxFJhrGMfikxs4LEcjbwfmDXWHa+ck4BANWHTlxAiSIiMlZjaamXAS+a2VbgTbw+9Z+OZedlU7KZWZjNm4daLqRGEREZo1FD3Tm31Tm3wjm3zDm3xDn3tXP5gFUVRbx56MTgxF6vHWhm7r3P8sNXD2myLxGRcebbFaUnVVUU0dTZx+HmbjYdOcHaB18j7uArP9nOzf/0a7r7NeGXiMh48T3UV1UWAvDGoRYe+JnXFf9f91zHb1fNYntdO3c+ukktdhGRceJ7qM8ryaUwEubNgy1sr23j999dwYyCbP7Pf1vKR1fOZP3uRp7cWON3GSIiFwXfQ93MWD23mCc31tDVH2NmYbb3wQHjb25bxrsqi/jLZ3ZwqKnL71JERNKe76EO8IHLSgeXF5Xln/rwgPG3v3U5fbE41/7tev7gh9XJKEdEJG0lJdSvWzhtcPnyWQWnvTarKML3b78SgBd2HOd7Lx9IRkkiImkpKaFeEMnAzFvOzXznHGLvnjeVvV+/kao5hdz/7E6+/B/biMd18lRE5FwlJdQBXr/3ejb82fuGfT0cDPDg71VRYq1kvPkdPvgPLxGNxZNVnohIWhjLfOrjYlp+1qjbFOVk8NL7a4m8/AhXtO7hK09m8pWPriIrHExChSIik1/SWupjFbnuz3Af+EtuCFbzuzv+gA985Qc0tOvm1SIiYzHhQh0z7Kr/QeB3f8zcjDaezvgyn3vgGzxRfTTVlYmITHgTL9RPmncdmXe+RHZROT8MP8Dupx7g/63fl+qqREQmtIkb6gBFc8n6o19hC9fw5fAjTP3l3Sy45ynaugdSXZmIyIQ0sUMdIDOPwG//K33vuYePBl/miYyv8aGvPcY///pgqisTEZlwJn6oAwQCZF5/L6z9NxaF63km88957tmnqLz3WbbXtaW6OhGRCWNyhPpJC9eQ8YcvUlBQxOOZX2dt4Jes+eav+fP/eJvmzr5UVyciknLmx7S3VVVVrrrax3lcek7Auk/D/l/ySPR6vhr9JAOJIfd/vmYRt19VSTBg/n2+iMg4M7ONzrmqC93P5Gqpn5RdCB9/Eq66i0+EfsmG0n9g7WWZANz/7E7WfPNl6ts0tl1ELj6Ts6U+1Nvr4CefhUgx0Y89wpN1U/naMzvoGYgxf1ou/7h2BZeV54++HxGRFBqvlvrkD3WAus3wo09AVyPc8k8cLF/DXY+/xdYa7yTqtQtKmJaXya76DmYVRphRmM2ffuBSTT8gIhOGQv1MnY3w5Cfh8H/Bu/+E3vd+hV/taeZLT73NiWHGtV8+cwqY8cHLSrnz2nmYqR9eRFJDoX42sQF4/l548yGYvgyW/hZcdguvnchj17F2ZhdHWDg9nzcOtnD3jzaf9tb8rBB/+N55fGhxKTMLI2rFi0hSJS3UzWwW8ENgOhAHHnTO/eNI70lZqJ+05XF49VtQv9V7Pn0ZXHYLLLoVSi4d3CwWd7yw4zh//8Ju9hzvPG0XS2dM4fffXcHKOYVUTs0ZvDm2WvMi4odkhnoZUOac22RmecBG4CPOuR3DvSfloX5Sy0HY+QzsfBpq3vTWlSyERbd4IV+6hJN373DO8dzb9WypaWXnsXZe3tt01l1+6qpKttS0AvC7q+fwwcWlRDJCEI9Dy36o3Xjq0XIQVnwcrvk8ZOlkrYgML2XdL2b2E+D/OudeGG6bCRPqQ7XVwq6fwo6n4cgr4OJQWHmqBT9j5WDAA3T2Rak+1MJnHtlEz0DsHbsr4QTLA/u5PLCfFcEDrAgeIBL3bp7tMnKx8hWQNQV2PQs5U+H6v4DlH4fA5BxFKiL+Skmom1kFsAFY4pxrH267CRnqQ3U2egG/8xk4+BLEo5A/AxZ92GvFz14NgSF96r3tcGwz1G6kff/rhI69RaS3HoAoQXbGZ7ElPo8tbh6b45ew35UTJ8ANi6fzqbknWPDW15nStAnKlsONfw2z35Wib1xEJqqkh7qZ5QIvAV93zv37WV6/A7gDYPbs2VccPnz4QmtLjp4TsPt5r4tm3y8h1gc502DhTd6J19qN0LgbSBynorkw44pTj+lLIZzNia5+DjR18s+/Pshzb9ef8SGOWwKvcG/4McqshY75v0ns+vuYUjqHw83dzCjMJhxUC17kYpbUUDezMPBT4OfOub8fbfsJ31IfTl8H7P2F10Wz9wXIiAwJ8JVQvhIiRWPaVX80TmtPP1/+j21EMkLEneMXmw/wmdDT/GHwWWIE+Hb0Fh6KrSErO4c/ft8lzC/N5b2XluhkrMhFKJknSg34AdDinLt7LDudtKE+VDwGFjitn308HG7u4tHnX2ZN/be5vOMljtk0Hsr+FA+3LAWMopwMWrr6ASiMhLlhSRl/ct0lOKCjd4Ce/hgNHX0ca+3hhiVl3P/sDmJxx5LEaJ1X9jdzpKWbW5eXMzU3c1xrFxH/JDPUrwZeBt7GG9II8EXn3HPDvSctQj0ZDm6An90DDdvpLFvNX3E7/3ow7x2bhYPGQOzcrye4/aoKbl5WxhVzxvbXhYikji4+ShexKGz6AfzqfuhtpX/57xG/9ktkTZkGwEt7Gvn7F/aw5Wgrq+cWsagsnynZYY40d5OdEeQ3V8xgwfQ8/vdzuzjS0kVeZhgAh+Pn248DUDk1h/deWsJNS8u4sqLwrN07sbjjx5tqqD7UwvrdjVx9yVSyMoJcWVHIjUvKCAcDxJ3DgJD6/0XGnUI93XS3wPoH4M3vQWYuXPtFuPLTEAyf3/5iA+w+Usfru4/w+u5a9tS3ESJGboZRnB2gu6+flTPzmZ4X4s0DjTS2dxMknnjECBInRJxAYl09hRyIl3OMIlbPLeaLNy1i6Ywp6v8XGScK9XTVsBOevwcOrPculLr6c2BB6O+Avk7o7/S+9rWfWj7bupg/Nw3pcpkccGXsd+Xsj5ez35VTMHsJH7n+PayaXz64XTQWV4te5Bwo1NOZc7D7Z/DzL8KJs9yLNSPXe2Tmea36k8sZud7zzDzIGPJaKMsbdx8I4QJBatsGmJKTReeAo2fAmF2SSyiU4f3nEQgObjv43Aza66BpDwPHd9NXv4uu2p2UxBoImPfzE3fGUVfCIco5SDm7Y2W05VTyensxc2bNZn5pPv/9PZXML82jrWeAUMAwg4Go41e7j/PS7kYG4o68zBAtXf2sWTqdDy4sJjsY94aWxqMQzNCVuZK2FOoXg2gfNO6CcORUYIdzJs5Vqf3dbN+2CZr2smf7JvI7DzLXaimP1pLJqb8UWl0Oh10pDggTI5R4hIkSstg71xEjZPGzfmQTBfQXzmf6vGUEShbA1EuhZAHklQ2OVIrFHd39UfKyvK6rfQ2dPL/tGCtmF7J8VgE5maHRv7e+Tmje5z2a9kLzXu/riUPeZ5Utg7LLvcf0ZZBdcKFHUy5yCnWZuOJxaK+Bpj301++ms24n8eaD7Gvspjdu9LsgWZlZdEeNqVNyyMjIpCg/h+L8HNr6IC+SRV1HlA37W2noihMlSHZWFtHeTubaMS4J1HJJoI58ugc/sicQoT48i809peyNlbHPlbPPzeAIpUTd2WfcXDgtm/z+43y+KsgVOU1Y8z76ju8mo/UA1lE3uJ3DsIJZUDwfiiq9v1qObYH22lM7K6w4FfJll3tXD+dM9esIp87gHEeboGE7FMyB2b/hdRVOlMbGJKVQl4uGc27whKxzjqe31PHK3ibqjx2Gxj3MitdwidVyidUyL3CMMmsZfG+/C1IXLOOAm0FX/jxOdPUzte8Ic+0YlVZPpp2aa7/NRTjgyr1zBvGyweXDrpQ+Mrjvw5exsCyfyqk5lOZn8dKmHRS276SgbQfUb6W4fRc5XUdOFZ4/44ygv/y0vygmPOegrQbqNnkhXrcJ6rZAn3fzGSwILjEvUlaBN73G7NVeyJevgJCukzgXCnURoHcgRn1bL8GAEckIUpyb6c3V07SXgeM7CbXsw5r2QNNub9ZMM1xhBfGiS3DFl3Aiew6vtBbyd5sckYJS+uOO/micSEaQGQXZrFlWzpsHW/hR9dHTPjcjFKA/+s4uony6WBY6wmI7RFXmERa5g8yIHcUS00z0ZBQRKltKOG9q4pxIHi4zH8vM955neV/j4TxOxDJpJ0IoK5+SogL6oo5IZtC/KSU6G88I8Le8u4kBBMJQuvjUldUzVsLUBdB2FI685k2Sd+Q1aNrjbR/M9K7EPhnys1api2oUCnWRcxXt91rJ5zFMtLW7n689s4PMcIA5xTnUnujhUHMX18wvYSAeZ1FZPrvrO9jf0EkkI8jB5m7ePNhC3DmC0W4W2hGWBA6yxA5xaeAohYEeikJ9ZEQ7yaR/9NJdgE6y6XARBkI55E0pJB7KJpKTS25uPhbO9s69DPkaC2URyIhg4QiEI7hwFrubYzy5pZmW/iDvnd7HB6bUEmnaitW95QU0AOadpzgZ3uUrvUAPZ41+oLqa4OjrcDgR8sc2eye5MW8fJ0N+9mqYMvOc/x3SmUJdZJLoi8ZoaO+jqz9KY0cf33pxH82d/Rxs6iIad4SJsqAQikJ9NDY1kUc3VWUhZkViBAc6CMe6qK0/Ti49TMvoJzjQSS49ZFk/2fSTG+wnNzBAxPoJRHvI4Oy3bxzOUVdKa+ES6nIW0Vq4lN1WSV13iIJImPfML2FOcYTczBD7GzuZW5JLRXGEnoGYdx+B0fR3e5PiHXnVexx9wxt2CzBlNsy60uuXzy/3Qj6/HPJnenMsTZZuqnGiUBeZ5GJxx676dhaU5o15TL9zjhPdA7x2oJkXdzXQ2ef9R9HU2cehZu/EcYA42dbPnDwozoxxrOkE2fQzryDANRU5LJgawvq7qenP4fOvBmnlnVNTjNWc4gj/84ML+NDiUjJDQdp7B4jGHIWRMLG4Ixiw0y9Qi0W9E6yHX8UdeRXqNmHtdYnW/BChrETAz0g8ymHKjCHPZ6Rd8CvUReQ0LV39HGvrYe7UXLIzxn6P3aMt3ext6GBeSS4dvVFmFmaTmxni1/uaCAUC7D7ewdObaynMyWDh9Hx2HmunoaOPncdO3VJhuPmJSvMzWTA9n6DB27XtVBRHCAWNzr4o+xo6uWJOIbcsm85Nc4Pk9jZQf3QfBdEmWuoOEG+rJbu3nvz+BsLdx7Hhgj+vHHKneY+cEsgtPX05pwRCGed9XJNFoS4iKTcQi/PUplq21LTy6OtHmJqbQdWcIoIBo6Gjl70NXldLPO7ICgdp7uonFvcyJyMYoD926mTzmc+HChBnKm2UWTNl1sLs0AlWFfdQMNDIzFAr+bEThHuaCEc7z/p+sgreGfa5Jd69E3JLvbuUZUQS14TkJM5N5EBwDF1M40ShLiKTxpnDUuMOggHDOcfGwyd46OUDFOVkkhE0pmSHmT4lmxWzC+gZiLG9rp3ttW2UTcnmQFMn9W29dPZF2V73zpuvZdJPibVRYm1MpZUFuT3Myeoiq6+ZrL4mpgc7KAu2kxc7QWasa/TCgxlDgj5yKvgHl3NOrXNxrxvp5BXQZy4PPh/wpvYefM17bp99bVxCPXn/DYnIRWtov7qZEbRTy1UVRVRVDD899MrZhWdd39UXJWDGhr2NhAJGXlaY2tZu7v33t8ksqGBXay87gxm0tQ3ggK7+KMU5mZzo9P5ayKKPqdZGCW3kWTerZ2Zx08IpFIaiRAJ9hGO90N8FA93eCd+BLhjo8db1tkJ7Hb3dHQRjPdhAD4FggEAw7E2xEUh8DYaGPA96I69OPg9nD3keAl4bn2OtlrqIXEz6ojH2NXTS2NHHnuMdzCqMsONYO999af9p5wWm5mYQCgToGYiREQowpyhCaX4WB5q6qG/rIRZ3tPee6ucPB42rL5nK9YtKWbO0jMKcc+vHV/eLiMg4qm/rpfpwC4eaunhlfzPg3ZayMCeDbbVtHGvrpXJqDgBLZ0wh5hzxuGN2UYRrLi3h6c11p12klpsZoi8aozgnk4JImJzMEOGgMX9aHqGgcay1l+6BGPG4o7Gjj1/86XsV6iIiyRKPOwKBkYdQdvZF2d/QyfrdjWypaeVISzdFORm8cbCFd1UWcai5i66+GJ19USqn5jAlO0wwYAQDxpN/9G71qYuIJMtogQ5e6/zyWQVcPmvkKRGGnjg+yf7ogsobpGnVRESSzM87hinURUTSiEJdRCSNKNRFRNKIQl1EJI2MGupm9rCZNZjZtmQUJCIi528sLfXvAzf4XIeIiIyDUUPdObcBaBltOxERSb1x61M3szvMrNrMqhsbG8drtyIicg7GLdSdcw8656qcc1UlJSXjtVsRETkHGv0iIpJGFOoiImlkLEMaHwNeBRaYWY2Zfdr/skRE5HyMOkujc+53klGIiIhcOHW/iIikEYW6iEgaUaiLiKQRhbqISBpRqIuIpBGFuohIGlGoi4ikEYW6iEgaUaiLiKQRhbqISBpRqIuIpBGFuohIGlGoi4ikEYW6iEgaUaiLiKQRhbqISBpRqIuIpBGFuohIGlGoi4ikEYW6iEgaUaiLiKQRhbqISBpRqIuIpJExhbqZ3WBmu81sn5nd43dRIiJyfkYNdTMLAt8CbgQuA37HzC7zuzARETl3Y2mprwL2OecOOOf6gceBW/0tS0REzkdoDNvMAI4OeV4DvOvMjczsDuCOxNM+M9t24eX5airQlOoixkB1jp/JUCNMjjonQ40wueqcMx47Gkuo21nWuXescO5B4EEAM6t2zlVdYG2+mgw1guocT5OhRpgcdU6GGmHS1VkxHvsaS/dLDTBryPOZQN14fLiIiIyvsYT6m8B8M6s0swxgLfC0v2WJiMj5GLX7xTkXNbM/Bn4OBIGHnXPbR3nbg+NRnM8mQ42gOsfTZKgRJkedk6FGuAjrNOfe0T0uIiKTlK4oFRFJIwp1EZE0Mq6hPtGmEzCzQ2b2tpltNrPqxLoiM3vBzPYmvhYm1puZfTNR+1YzW+lTTQ+bWcPQcfznU5OZfTKx/V4z+2SS6rzPzGoTx3Ozmd005LV7E3XuNrMPDVnv28+Emc0ysxfNbKeZbTezuxLrJ9TxHKHOiXY8s8zsDTPbkqjzq4n1lWb2euLY/CgxYAIzy0w835d4vWK0+n2s8ftmdnDIsVyeWJ+y36HEZwTN7C0z+2niuf/H0jk3Lg+8k6j7gblABrAFuGy89n+eNR0Cpp6x7q+BexLL9wB/lVi+CfgZ3rj81cDrPtV0DbAS2Ha+NQFFwIHE18LEcmES6rwP+F9n2fayxL93JlCZ+DkI+v0zAZQBKxPLecCeRC0T6niOUOdEO54G5CaWw8DrieP0BLA2sf67wGcSy3cC300sr1XbmUIAAAOdSURBVAV+NFL9Ptf4feC2s2yfst+hxOf8KfBvwE8Tz30/luPZUp8s0wncCvwgsfwD4CND1v/QeV4DCsysbLw/3Dm3AWi5wJo+BLzgnGtxzp0AXgBuSEKdw7kVeNw51+ecOwjsw/t58PVnwjl3zDm3KbHcAezEuwJ6Qh3PEeocTqqOp3POdSaehhMPB1wHrEusP/N4njzO64DrzcxGqN/PGoeTst8hM5sJrAG+l3huJOFYjmeon206gZF+cJPBAb8ws43mTWMAUOqcOwbeLxswLbE+lfWfa02prPWPE3/GPnyyW2OEepJWZ+LP1RV4LbcJezzPqBMm2PFMdBdsBhrwgm4/0Oqci57lMwfrSbzeBhT7XeeZNTrnTh7LryeO5T+YWeaZNZ5RSzL+zb8BfB6IJ54Xk4RjOZ6hPqbpBJLsKufcSrwZJj9rZteMsO1ErH+4mlJV63eAecBy4Bjwd4n1Ka3TzHKBHwN3O+faR9p0mHpSVeeEO57OuZhzbjneleOrgEUjfGZK6jyzRjNbAtwLLASuxOtS+UIqazSzm4EG59zGoatH+Mxxq3M8Q33CTSfgnKtLfG0AnsL7IT1+slsl8bUhsXkq6z/XmlJSq3PueOIXKg48xKk/A1NWp5mF8YLyUefcvydWT7jjebY6J+LxPMk51wqsx+uHLjCzkxcqDv3MwXoSr0/B67JLSp1Darwh0cXlnHN9wL+Q+mN5FXCLmR3C6ya7Dq/l7v+xHMcTAiG8kw2VnDqJs3i89n8e9eQAeUOWX8HrM/sbTj+J9teJ5TWcfkLlDR9rq+D0E5DnVBNeS+Qg3gmewsRyURLqLBuy/Dm8vj6AxZx+MucA3kk9X38mEsflh8A3zlg/oY7nCHVOtONZAhQklrOBl4GbgSc5/eTenYnlz3L6yb0nRqrf5xrLhhzrbwAPTITfocRnXcupE6W+H8vxLv4mvDP7+4Ev+XGAzqGWuYmDsQXYfrIevH6qXwJ7E1+LhvwwfCtR+9tAlU91PYb3p/YA3v/Cnz6fmoBP4Z002QfcnqQ6/zVRx1a8+X+GhtKXEnXuBm5Mxs8EcDXen6Jbgc2Jx00T7XiOUOdEO57LgLcS9WwDvjLkd+mNxLF5EshMrM9KPN+XeH3uaPX7WOOvEsdyG/AIp0bIpOx3aMjnXMupUPf9WGqaABGRNKIrSkVE0ohCXUQkjSjURUTSiEJdRCSNKNRFRNKIQl1EJI0o1EVE0sj/B4N8l4hazWFYAAAAAElFTkSuQmCC\n", 417 | "text/plain": [ 418 | "
" 419 | ] 420 | }, 421 | "metadata": {}, 422 | "output_type": "display_data" 423 | } 424 | ], 425 | "source": [ 426 | "flattenAnneal(learn,4e-3, 20, .72) #imagenette" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 23, 432 | "metadata": {}, 433 | "outputs": [ 434 | { 435 | "data": { 436 | "text/html": [ 437 | "\n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | "
epochtrain_lossvalid_lossaccuracytime
02.7835042.8460700.29200000:38
12.5149732.6563900.37400000:36
22.3319842.5867600.40000000:36
32.2111902.3795110.47800000:36
42.0867472.1229400.58600000:35
51.9771752.1921980.58000000:36
61.8904281.9888420.63600000:36
71.8042302.1073340.61000000:36
81.7430781.8514160.69800000:36
91.7034221.7780180.72000000:36
101.6540181.8857940.68200000:36
111.6213821.8123100.72400000:36
121.5933951.7781660.74000000:36
131.5587901.7759040.72800000:36
141.5186921.7417600.74400000:36
151.4865591.6096520.76600000:36
161.4139431.6840220.76600000:37
171.3542261.6126170.79400000:37
181.2916161.5381400.81600000:36
191.2504321.5424790.80800000:36
" 590 | ], 591 | "text/plain": [ 592 | "" 593 | ] 594 | }, 595 | "metadata": {}, 596 | "output_type": "display_data" 597 | }, 598 | { 599 | "data": { 600 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWoAAAD6CAYAAACIyQ0UAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nO3dd3hc133m8e8BMOgdRCNBAuwUK0iCVCGjYtkyKckqEaPQ3bJjJnYcS8rus5I3u4n1xM46+yRu6yLTjuIiWbZMSZYtq1iyRTOqFCg2sFeAKETvdTBz9o87IAASZQDMAJfE+3meeTCcueU3l8CLg3PPPddYaxEREfeKmOoCRERkZApqERGXU1CLiLicglpExOUU1CIiLqegFhFxuVGD2hiz2Bizb8CjxRjzwGQUJyIiYMYyjtoYEwlUAFdba0uHWy4lLd0unD8vBOWJiEwPe/bsqbPWZg71XtQYt3UzcGqkkAbImjmb4uLiMW5aRGT6MsYMm6tj7aPeCjw5zE62GWOKjTHFLS0tY9ysiIgMJ+igNsZEA3cAvxrqfWvtdmttkbW2KCkpKVT1iYhMe2NpUW8G3rPWVo+2oGYPEREJnbH0UX+YYbo9LqGkFpEx8Hq9lJeX09XVNdWlhF1sbCx5eXl4PJ6g1wkqqI0x8cAHgL8OZnnltIiMRXl5OUlJSRQUFGCMmepywsZaS319PeXl5cydOzfo9YLq+rDWdlhrM6y1zUEtH/TuRUSgq6uLjIyMKzqkAYwxZGRkjPkvh/BcmaikFpExutJDus94PmdYgtoqqUVEQkZzfYjItNfU1MT3vve9Ma9366230tTUFIaKBgtPi1oNahG5jAwX1D6fb8T1XnjhBVJTU8NV1gVjvYRcROSK8/DDD3Pq1CkKCwvxeDwkJiaSm5vLvn37OHz4MHfddRfnzp2jq6uL+++/n23btgFQUFBAcXExbW1tbN68mY0bN/Lmm28ya9YsnnvuOeLi4kJSX1iCWg1qERmvR357iMOVoZ2GYunMZP7pQ8uGff9rX/saJSUl7Nu3j507d3LbbbdRUlJyYQjdY489Rnp6Op2dnaxbt4577rmHjIyMQds4ceIETz75JD/84Q+59957efrpp/nYxz4WkvrDE9Tq+xCRy9j69esHjXP+9re/zbPPPgvAuXPnOHHixCVBPXfuXAoLCwFYu3YtZ8+eDVk96voQEVcZqeU7WRISEi4837lzJ6+++ipvvfUW8fHx3HjjjUOOg46JibnwPDIyks7OzpDVE6bheSIil4+kpCRaW1uHfK+5uZm0tDTi4+M5evQob7/99iRXF64WtZJaRC4jGRkZbNiwgeXLlxMXF0d2dvaF9zZt2sSjjz7KypUrWbx4Mddcc82k1zemO7wEa9bC5bbiREnItysiV6YjR45w1VVXTXUZk2aoz2uM2WOtLRpqeV3wIiLicmG64EV9HyIioaIWtYiIy2nUh4iIy6lFLSLicpqUSUTE5dSiFhEZh8TERAAqKyvZsmXLkMvceOONFBcXT3hfunGAiMgEzJw5kx07doR1H7oyUUQEeOihh8jPz+fzn/88AF/+8pcxxrBr1y4aGxvxer185Stf4c477xy03tmzZ7n99tspKSmhs7OT++67j8OHD3PVVVeFbL4PTXMqIu7y4sNw/mBot5mzAjZ/bcRFtm7dygMPPHAhqJ966ileeuklHnzwQZKTk6mrq+Oaa67hjjvuGPa+h9///veJj4/nwIEDHDhwgDVr1oSkfM2eJyICrF69mpqaGiorK6mtrSUtLY3c3FwefPBBdu3aRUREBBUVFVRXV5OTkzPkNnbt2sUXv/hFAFauXMnKlStDUltQQW2MSQV+BCzHaTB/2lr71nDLa9SHiIzbKC3fcNqyZQs7duzg/PnzbN26lSeeeILa2lr27NmDx+OhoKBgyClOBwrH3dSDPZn4LeAla+0SYBVwJOSViIhMsa1bt/KLX/yCHTt2sGXLFpqbm8nKysLj8fDaa69RWlo64vrXX389TzzxBAAlJSUcOHAgJHWN2qI2xiQD1wOfArDW9gA9I62jUR8icjlatmwZra2tzJo1i9zcXD760Y/yoQ99iKKiIgoLC1myZMmI63/uc5/jvvvuY+XKlRQWFrJ+/fqQ1DXqNKfGmEJgO3AYpzW9B7jfWtt+0XLbgG0ASbnz1rZUngpJgSJy5dM0pxOf5jQKWAN831q7GmgHHr54IWvtdmttkbW2yBMdPfbKRURkSMEEdTlQbq19J/DvHTjBLSIik2DUoLbWngfOGWMWB166GacbZIR1QlCZiEwr02Ue+/F8zmDHUf8d8IQxJho4Ddw3SiljLkREpq/Y2Fjq6+vJyMgIy/A2t7DWUl9fT2xs7JjWCyqorbX7gCE7uYdcfkwliMh0l5eXR3l5ObW1tVNdStjFxsaSl5c3pnXCcwm5klpExsDj8TB37typLsO1NM2piIjL6VZcIiIuF54WtZJaRCRkdOMAERGXUx+1iIjLqY9aRMTl1EctIuJyalGLiLic+qhFRFwuPC1qXZooIhIyalGLiLic+qhFRFxOoz5ERFxOLWoREZfTyUQREZcLW4va71dYi4iEQthGfXj9/nBtWkRkWglfUPvUohYRCYWwBXVPr1rUIiKhEMYWtYJaRCQU1KIWEXG5oO5Cbow5C7QCPqDXWls02jpqUYuIhEZQQR1wk7W2LtiFexTUIiIhEb4+6l6N+hARCYVgg9oCvzfG7DHGbBtqAWPMNmNMsTGmGNSiFhEJlWC7PjZYayuNMVnAK8aYo9baXQMXsNZuB7YDxOQutOqjFhEJjaBa1NbaysDXGuBZYP1o6yioRURCY9SgNsYkGGOS+p4DtwAlo62noBYRCY1guj6ygWeNMX3L/9xa+9JoK/XoZKKISEiMGtTW2tPAqrFuWC1qEZHQ0CXkIiIup0vIRURcTi1qERGXC1+LWvNRi4iEhFrUIiIuF8a5PhTUIiKhoBa1iIjLhSWojYFutahFREIiLEEdYQwdPb5wbFpEZNoJW1C39/SGY9MiItNOmIIaOtWiFhEJiTC2qBXUIiKhEJagjowwdHSr60NEJBTUohYRcbmw9VF36GSiiEhIhCeoIzQ8T0QkVMI3jlp91CIiIRG+rg+vD79fM+iJiExU2FrU1kJXr7o/REQmKmx91ADt3QpqEZGJClvXB+jqRBGRUAhb1weg+T5EREIg6KA2xkQaY/YaY54fdaOBoG7TyA8RkQkbS4v6fuBIMAtGBfo+mju84yhJREQGCiqojTF5wG3Aj4JZPjIQ1I0dPeMuTEREHMG2qL8J/A9g2Nu2GGO2GWOKjTHFTY0NADR3qkUtIjJRowa1MeZ2oMZau2ek5ay12621RdbaoqzMGURGGJrU9SEiMmHBtKg3AHcYY84CvwDeZ4x5fLSVUuI86voQEQmBUYPaWvsla22etbYA2Ar80Vr7sdHWS4330KSuDxGRCQvLOGqA1DiPRn2IiIRA1FgWttbuBHYGs2xafDRVzV3jKElERAYKW4s6KzmW6hYFtYjIRIUtqPPS4qhv79GdXkREJihsQT0nPR6Acw2d4dqFiMi0EPagLmvoCNcuRESmBQW1iIjLhW94XryHpJgozimoRUQmJGxBbYxhdnq8WtQiIhMUtqAGp/ujtL49nLsQEbnihTeoM+I519ipu5GLiExAWIN6fmYCPb1+dX+IiExAWIN62cwUAEoqm8O5GxGRK1pYg3pRdhKeSENJRUs4dyMickULa1BHR0WwKDuJQ2pRi4iMW1iDGmDFrBQOVjRjrU4oioiMR9iDemVeKk0dXp1QFBEZp7AH9bqCNAD+dLw23LsSEbkihT2oF2YnsTArkef3V4V7VyIiV6SwBzXA7StnsvtsA5VNmvJURGSsJiWo71o9E4Dn9lVOxu5ERK4okxLU+RkJrCtI48ndZfh0ObmIyJhMSlADfGbjXMoaOnjhoPqqRUTGYtSgNsbEGmN2G2P2G2MOGWMeGc+Oblmaw/zMBL6385TGVIuIjEEwLepu4H3W2lVAIbDJGHPNmHcUYfj8jQs4UtXCCwfPj3V1EZFpa9Sgto62wD89gce4msR3rZ7F4uwk/u33x+jp9Y9nEyIi005QfdTGmEhjzD6gBnjFWvvOEMtsM8YUG2OKa2uHvrglMsLw8OYlnKlrZ9vPitUFIiIShKCC2lrrs9YWAnnAemPM8iGW2W6tLbLWFmVmZg67rZuWZLFpWQ47j9XyVz8p1thqEZFRjGnUh7W2CdgJbJrITr/zkdU88P6F/OFoDZ/+8bt0eX0T2ZyIyBUtmFEfmcaY1MDzOOD9wNGJ7DQqMoIH3r+IH36iiKPnW9m6/W3drVxEZBjBtKhzgdeMMQeAd3H6qJ8Pxc4/sDSbr9y1nBPVrXzoO69TUtEMlfvgtX+B2uOh2IWIyGXPhOOEXlFRkS0uLg56+ePVrXzqsd30+Pw8v66EnLceASzkrIAVfwHL74GUvJDXKSLiFsaYPdbaoqHem7QrE0eyKDuJn/3V1fj8lmt3LuJ7a3+L75Z/gchoeOUf4RvL4LFN8O6PoL1uqssVEZlUrmhR9znX0MH/efEILxw8z4YFGfzHJ9cR23IWSp6Bkh1QexRMJMy/CZZvgSW3QWxyyOsXEZlsI7WoXRXUfX7y5ln+6TeHWJydxFfvXk5RQTpYC9WHnMA++DQ0l0FULCz6oBPaC28BT2wIP4WIyOS57IIa4PkDlTy04wDtPT4+dV0Bn94wlzkZ8c6b1sK53U5oH3oW2mshJhmu+pDTnz33BoiMCsEnERGZHJdlUAOU1rfz1d8d4ZUj1fSVmZkUw6euK+Dj1+aTHOsBXy+c+ROUPA1HfgvdLZCQCfNugrQCSMuH1DmQmg/JsxTgIuJKl21Q9zlT5wT2yZpWunv9VDV3AWAMfOGmBfz1DfNJjIkCbxecfAUO7oCKPdBSAXbAnCIm0hk9kjonEOAFga+BME/MhghXnF8VkWnmsg/qi+0ta+Sp4nJeOFhFc6eXzKQY/u0vVnHDoosuXfd5obkcmkqhsdT52lTW/7ytevDykTEDQjwf5l4PizdDVEzYPouICFyBQT3Qa8dq+NLTBznf0kXh7FR+fN86UuOjg1vZ2wlN5wJBfnZAoJdBwxnoboa4dFh5L6z+mDOuW0QkDK7ooAZo6fLyz789zK/2lAPw4fVzuHFxJh9cljP+jfp9cPo12Ps4HP0d+HogZyWs/jis2ALx6SGqXkRkGgR1n13Ha/nSMwepCMzIt3xWMtcvzOSvb5hPSpxn/BvuaHD6vfc9DlX7nQtxltwGhR9zxnRHRIboE4jIdDVtgrrP3rJG/tevS2jq8F4I7b+5YT4PbVqMMWZiGz9/EPY+AQd+CZ0NzkiSVVuh8KOQMT8E1YvIdDTtgnqgPxyp5sndZbx6pAaA+OhIblqSxRfft5DFOUnj33BvNxx/yekaOfmqM7okf4MT2EvvhJjEEH0CEZkOpnVQA1hr+fnuMp569xz7y5svvJ6REM3dq2fxdzcvnFjXSEsV7H8S9j0B9SchOhGW3eX0Z8++2hlHKCIygmkf1AN1eX3843MlNLR7efVI//C81XNS+ejV+dyxaiZREYaIiHGEq7Vw7h2nlX3oWehpg5lrYOODsOR2jdEWkWEpqEfw3L4K/vXFo/ispbql+8Lr8zIT+MDSbN63OIvs5Fhmp8cTOZbw7mmHA0/Bm9+GhtOQsRA2PgAr7oWoIIcPisi0oaAOgrWWPx6t4Zm9FbR0eqlv6+FwVcuF9/Mz4rl79SyunpvBirwU50rIYPh9cPjX8Po3nBORyXlw3RdgzScgOiFMn0ZELjcK6nH6w5FqHnr6IHVt3YNej4wwxHsimZ+VyCN3LMMCv9lXSX17Nyeq29i8PIe/XD+brKQBs/lZCyf/AK9/HUrfcC6kufpvYP1nNSZbRBTUodDS5WXnsVpioiL47f5Knj9QNeLyi7OT+Pd7V7F8Vsqlb5a947Swj7/onHhc+ym49m8heWZ4ihcR11NQh0lVcyef/Wkx182fwV/92VxS4jxER0bw8qFq/v6pfXT0+FicncQjdy5jVV4qcdEXXRhTfQhe/6Yz819EpDMee8MDEx+PbS201TiXwjeVQnyGM2+JLswRcS0F9RQ4WdPGx//jnQsz/QGsn5vOQ5uW0Nbdy8KsRGamxjlvNJ6FN/8fvPcz51L1pXfCn/095K4aeuPWQmdjYH6Ssksnm2oqg96uweskzYRVfwmrPgKZi8LymUVk/BTUU6ipo4dn3qtg14ladh6rHfTezUuy+Oz187hmXgYAXY1VRL37KFF7HnPm1Z5/s9PKbq/rD+S+SaN6WgfvKDZ18Mx/qfnO85TZUHfcGed94hWwPshbB6s+DMv/HOLSJutQiMgIFNQusae0kV3Haylr6CDCGP54tJrGDu8ly2VEdXGv/T3bol8izTY5L3oS+kN44M0Q+sI5doi+8Iu1VsPBp2Dfz6HmsDOt65LboPAjzo0WQnlTBb8f6k9AeTGUvwuNZ2DpXc6+NG2syCUmFNTGmNnAT4EcwA9st9Z+a6R1FNTB6fL6+N+/Lrkw698Hl2VTkJFAS1cvT+4uI4Ye5ptKvAm5dHlSuHt1HnevyWPujASstVjL+C/MqdrvBPbBp5xulMSc/q6RrCVj32ZHQ38oVxRD+R5nmliAmBRIzHSu2kzMgWs/D2vv042JRQaYaFDnArnW2veMMUnAHuAua+3h4dZRUIfG4coWvv+nU+w8WkN7Ty/+wH9VUmwUrV29ACzISiQ1zkNXr4/3LcnmCzctIDpqDFdA9nbD8ZedrpHjLztdIzPXOC3f5fcMPXTQ54XqkkAwB8K54ZTznomArGWQVxR4rHMu9jHGuWXa69+A0zudvwDWfdYZopiYeek+RKaZkHZ9GGOeA75jrX1luGUU1KFnreXlQ+d5+r0KDpY3097TS2ePjxV5KRytaqXT67uwbEqchz9fM4uPXp3PgqwxTA7VVgMHf+W0tKtLnOlcF2+GlVudk5zl7zq3OKvc23+yMjHbCeO+UM4tHH1Cqoo9zmiXI7917iS/5uNw7RecLhyRaSpkQW2MKQB2AcuttS0XvbcN2AYwZ86ctaWlpeOtV8aoob0HT6ThP984y9dfOU6E4ULre2ZKLDOSYiit7+C7H1nDdfMzgusuqTrQ3zXSUe+8FhnjjEQZGMwpeeOfdKruBLzxTdj/S2f2wRVbnOGJ2UvHtz2Ry1hIgtoYkwj8CfiqtfaZkZZVi3rqHaps5gd/Os1v9lcOej0mKoJ5mYl09/pYlZfKZ/9sHktnjtBX3NsDZ/8L4lIhe0V45ilproC3vgt7fgzedli02RmeOHt96Pcl4lITDmpjjAd4HnjZWvv10ZZXULuL328pLm1kb1kjLx06z96ypkHvXzMvnXMNnVQ0dbJ6Tip3Fc5i/dx0zjV0cK6xk+vmZ/CtV0+QlRzDnYUzWTYzhVhPGC6e6WiA3T+Edx51bsqQvwE2/j0suFlTxcoVb6InEw3wE6DBWvtAMDtUULub1+cnKsLQ0tnLd3eeZPuu0wBER0UQacyg/u7hXDsvg03Lc7hnbV7wE1QFq6cd3vupcxFQS4XTkt/4gDO8L5RDCEfj90H9KajaB5X7nK/VJRCTDGkFziN9buB54KvmbZFxmmhQbwT+CziIMzwP4H9aa18Ybh0F9eWlvbuXmtbuC8P+3jnTwNdePMqGBRnMSY/Hb2FOejzpCdG8fbqeJ3eXca6hc1Cgz0qNY+OCGaTEe/D7LWkJ0dy2IpeCGROYIbC3xzm5+cY3nYt2EjJhxqLBwdgXlPEZE2t1+33OPir3OUMXq/Y5/fTeduf9qFjIXu7cid7b4dylvvEstNcM3k5syqW1pc11nifP0mX8Mixd8CIh19rl5d9ePsbhqhasheLSxmGXXZufRmZiDPdtKGBeZiIvlVTR5fUzJyOeGxZlEuuJxOvz44kcZlih3w/HfgdHX3AunGk4A23nBy8TnRQIx4LB4ZhW4FydGTngDj6+Xqg71t9KrtrvTEHr7XDe98Q7gZxb6Jw8nVkIMxYP3ZrvbnMCu/GsU1vj2UCIn3GuIPX39i8b4QlcqDTHaXnHpgzzSBvwPFkXCE0TCmqZFDWtXXh9lpzkWI5UtfDY62d4Zm/FmLYxb0YC6wrSuW9jAUtyRjjJ2dMRuKT+bH/rti8oG0vBN2BqWhPpjE5Jn+sEa3VJ//BCTwLkrnRCeWYgmGcsCk3L1++D5vL+2vrqbCqDrubAo2lwmA8lKu7SMI+OB4zzV4SJ6H8+4lcGrzPUdgc+4lKdX4C6M9GkUFDLlNt9poFv/eE4i7KTuH5hJmfq2qls6uSlQ+cpb+xkfmYCpfUd9Pov/X68bUUunV4fi7KT+MS1+f2TWQ3g91uMgcffKSM5JoIs00i2t4p5UbWDgzwqdkAoFzozFU5ld4S1Tkv+QnBf/Gga+vWeDsA661t///NBXxnm9cA6vV3OnDIjMk6r/kKApw5+nrMC8q9z/krQCd8JUVDLZcNaS3ljJ4+/U8oP/nR6yGWSYqJYNzedtPhofrPfabF7fZb46Eg6egafCF2Vl8LmFbn87kAVGxfOYMWsFG5anEVMVAR+a7Fwocul1+cnarjulyuV3+eEdd8vgM5hfjEM9eioh95OZzvJeU5g518HBRshY4GCe4wU1HLZqmzqpK6tm7T4aJo7vfx8dxm7jtdS3th5YZmUOA/NnV5uW5lLYnQU96zNo7Kpk1/tOccbJ+tH3cdNizMpLm2krbuXDfNnsGp2CoWz01ibn0ZavAdrA70FCp7B/H5ncq/SN527FpW+2X9yNSEzENwbnK9ZyybeheLzBqbzHdCN1HDG+YskdU7/SJy0fOccRVzaZfXLQkEtV5wur483TtZxw6LMEVvBzZ1eSuvbyU2Jw1rLq0dqON/Sxdun69l9poEIA/kZCeQkx5KZFMN7ZY0XfgkMvMITICsphoSYKM7UtXPbilxW5KVQkJHAiyVVLJ+Zgs9aDlY0MyMhmrSEaIrPNmIMnK5tJyclloc3L6EoP+3KDXxrneGMpa87oX32DWhxJhwjNgXmXNcf3rkrB5/g7dPdOviE7MCvzeXOXDR9omKdYPbEOQHecdEv5b4TzGn5A0K8oH/WSU8sbqKgFhmD1i4vhypbeONkHfvLm/H2+slMimHvuUYyE2Pw+izHqlvp6fWPvrEhrJqdyqc3FHD7ypljurO9tRZjDNZaurz+S+8Y5EZNZYNb3PUnndc9Cc6VpznLnel3+04Etw+es524tAEjeC76mpgzuJXe3eqcSG4823+iue/kclPpEDfTyB0c4AO3nzBj0lvjCmqRELPWcrqunYb2HhKio5iRGI3PWrKSYmnq6MEYgyfSEOeJJMIYmju9/HpfBU8Vl9PW7eVcQyd5aXEsyUlifmYiuSmxvHmqnvjoSNITYmjscOZv+Yui2VQ2dfLCwSpeO1pLZlIM3b1+6tq6mTcjgfvfv5A7C2eNuXa/ZUy/JEKmtRrKAq3t0jeh9qgTmH1DKS8O5LjU0OzX73e6ZfqCuy/E+wK9pZLA2VdHdGKgjoJLf0Ek54XlwisFtYiL+P2Wlw6d55n3Kihv7OB0bTs9Pj8zEmMuueP9QNFREcRERpA/I56Siv7RGnlpceQkx7IgK5HEmChq27qpaemmsaOHxJgoVs9J5WRNG+3dPnafbbiwXlJMFDcuyeKOVTO5el46Pb1ODZOq7wTAVPN2OaE9VJdLU6kze2SfiKhAn3gguNPnOd0pUTHOaJphH3bE9826zyioRdyqvbuX8sZOFmYl4reWrl4/fms5cK6ZY9WtrCtIY3FOEjFRg7s6yuo7+NHrp6lv66GsoYPj1a10B9Eds2ZOKtFREbx9uuGS95bkJHH0fCtX5SZzw6JMIgx09zpTDvj8lprWbkoqmjld1052cgybl+eSnRzL7StzmZ0eH7Jj4ip+n9PivjjAG89Aw9n+G2RMkHmkRUEtcqXr7vVRWt/BzNQ4EmOi8Pr8+PyWyAhDr886c7lc1N1xvLqVVw5XE2EMe0obOF7dRqfXR23r8C372elxdHv9REdFUNvaTXevn8gIQ1SE4QNLs1mcnURLl5e5MxJJi/fwxqk6cpJjOdfQSX17D3vLGimYkUBeWhw+v+VvbpjP8llB3ErOjfpuNN1U6gR638VEIz6GXsYk5yqoRSR4fbd6a+nyEuuJJDLCcOx8KwUzEgZNwmWt5URNG4+/XcrTe8pp7xl5Qq+UOM+F9Zs7vbR1O1dlFuWnsSArkXvW5tHrs6zMSyEy0IqPj468cBL1ih0xg/qoRWSSWGupbumm1++np9dPQ3sPb56qZ11BOrPT48hLix+0bF1bDz/6r9M8s7dixFZ8ekI0bV29ZCbFEB0VQUqch/SEaLw+PxsWzODj1+STEOpZHCeZglpEXM3vd4Y8vn6iDotzdWq318+Z+nZaOr1kJccSHRlBW7cXv4XmDqc1HhVpKK3vYElOEjcsymRORjzrCtJJjnUugsrPiA/P3OlhMFJQX96/gkTkihARYbgqN5mrcsd2Z3prLb87WMW///44P9g19JQDs1LjuG5+hnNSNNKQmRhDWUMH75U1kpUUS25KLGvz07h79Syykt11EUwftahF5IrQ0uWlvKGT/eVN9Pot1c1deH1+9pc3caiihbSEaNq7e2nt7qWn1090ZARzMuLp7PFR0dRJhIGFWUkszkni1hW53LQk85KRNuGkFrWIXPGSYz0snekZ+R6gOKNjTlS3sSAr8UK3yOnaNp7dW8HBimZePnT+wr1GZ6fHsWJWCokxUZyqbWdxThIAje09dHl9JMRE4beWhOgo0hOjSY+PpqggnaW5ySG9clRBLSLTSkxU5CXDAedlJvLfblkMQE+vnz8erWF/eRMlFc2UVLTQ0eOjrq2bo1Utg0a2zE6Po9dn8frsoIuVkmOjKCpIZ2aq061yVW4yuclxpMR76Ozxcaq2jeoWp8WfnRxLXVsPI1FQi4gMEB0VwablOWxanjPo9b45z+vaevD6/JfMi97l9VHe2Ml7ZY28criakzVt7D7TwONvl024JvVRi4iEic9vOV7dyt6yJqpbuqhs6iQ5zjz/RoEAAAbGSURBVMPa/DRyU2Jp6eqly+sjLT6aq+dlqI9aRGSyRY5zNMvFptntLERELj8KahERlxs1qI0xjxljaowxJZNRkIiIDBZMi/rHwKYw1yEiIsMYNaittbuASyeuFRGRSRGyPmpjzDZjTLExpri2tnb0FUREJCghC2pr7XZrbZG1tigzMzNUmxURmfY06kNExOUU1CIiLhfM8LwngbeAxcaYcmPMZ8JfloiI9Bn1EnJr7YcnoxARERmauj5ERFxOQS0i4nIKahERl1NQi4i4nIJaRMTlFNQiIi6noBYRcTkFtYiIyymoRURcTkEtIuJyCmoREZdTUIuIuJyCWkTE5RTUIiIup6AWEXE5BbWIiMspqEVEXE5BLSLicgpqERGXU1CLiLicglpExOUU1CIiLhdUUBtjNhljjhljThpjHg53USIi0m/UoDbGRALfBTYDS4EPG2OWhrswERFxBNOiXg+ctNaettb2AL8A7gxvWSIi0icqiGVmAecG/LscuPrihYwx24BtgX92G2NKJl7epJgB1E11EUFSreGhWsPjcqoVpr7e/OHeCCaozRCv2UtesHY7sB3AGFNsrS0KurwppFrDQ7WGh2oNHzfXG0zXRzkwe8C/84DK8JQjIiIXCyao3wUWGmPmGmOiga3Ab8JbloiI9Bm168Na22uM+QLwMhAJPGatPTTKattDUdwkUa3hoVrDQ7WGj2vrNdZe0t0sIiIuoisTRURcTkEtIuJyIQ1qN15qbow5a4w5aIzZZ4wpDryWbox5xRhzIvA1LfC6McZ8O1D/AWPMmjDX9pgxpmbgmPPx1GaM+WRg+RPGmE9Ocr1fNsZUBI7vPmPMrQPe+1Kg3mPGmA8OeD2s3yfGmNnGmNeMMUeMMYeMMfcHXnfdsR2hVtcd18A+Yo0xu40x+wP1PhJ4fa4x5p3AcfplYOABxpiYwL9PBt4vGO1zTEKtPzbGnBlwbAsDr0/5z9iwrLUheeCcaDwFzAOigf3A0lBtfwJ1nQVmXPTa/wUeDjx/GPjXwPNbgRdxxo5fA7wT5tquB9YAJeOtDUgHTge+pgWep01ivV8G/vsQyy4NfA/EAHMD3xuRk/F9AuQCawLPk4DjgXpcd2xHqNV1xzWwfwMkBp57gHcCx+wpYGvg9UeBzwWefx54NPB8K/DLkT7HJNX6Y2DLEMtP+c/YcI9Qtqgvp0vN7wR+Enj+E+CuAa//1DreBlKNMbnhKsJauwtomGBtHwResdY2WGsbgVeATZNY73DuBH5hre221p4BTuJ8j4T9+8RaW2WtfS/wvBU4gnOFreuO7Qi1DmfKjmugRmutbQv80xN4WOB9wI7A6xcf275jvgO42RhjRvgck1HrcKb8Z2w4oQzqoS41H+kbbrJY4PfGmD3GucwdINtaWwXODwqQFXjdDZ9hrLW5oeYvBP5UfKyvO2GEuia13sCf2qtxWlOuPrYX1QouPa7GmEhjzD6gBie0TgFN1treIfZ9oa7A+81AxmTVe3Gt1tq+Y/vVwLH9hjEm5uJaL6ppyn/GQhnUQV1qPgU2WGvX4Mz+97fGmOtHWNatnwGGr22qa/4+MB8oBKqAfw+8PuX1GmMSgaeBB6y1LSMtOkxNU1mra4+rtdZnrS3EuUp5PXDVCPue0novrtUYsxz4ErAEWIfTnfGQG2odSSiD2pWXmltrKwNfa4Bncb6xqvu6NAJfawKLu+EzjLW2Ka3ZWlsd+GHwAz+k/8/XKa3XGOPBCb4nrLXPBF525bEdqla3HteBrLVNwE6c/txUY0zfBXQD932hrsD7KTjdZ5Na74BaNwW6m6y1thv4T1x4bC8WyqB23aXmxpgEY0xS33PgFqAkUFffmdtPAs8Fnv8G+ETg7O81QHPfn8qTaKy1vQzcYoxJC/x5fEvgtUlxUR/+3TjHt6/erYGz/nOBhcBuJuH7JNAH+h/AEWvt1we85bpjO1ytbjyugboyjTGpgedxwPtx+tVfA7YEFrv42PYd8y3AH61zhm64zxHuWo8O+GVtcPrSBx5b1/2MAaEb9WH7z5oex+mz+odQbnuc9czDObO8HzjUVxNOH9kfgBOBr+m2/yzxdwP1HwSKwlzfkzh/1npxfmt/Zjy1AZ/GORlzErhvkuv9WaCeAzjf6LkDlv+HQL3HgM2T9X0CbMT50/QAsC/wuNWNx3aEWl13XAP7WAnsDdRVAvzjgJ+13YHj9CsgJvB6bODfJwPvzxvtc0xCrX8MHNsS4HH6R4ZM+c/YcA9dQi4i4nK6MlFExOUU1CIiLqegFhFxOQW1iIjLKahFRFxOQS0i4nIKahERl/v/1GeI0wnbi/8AAAAASUVORK5CYII=\n", 601 | "text/plain": [ 602 | "
" 603 | ] 604 | }, 605 | "metadata": {}, 606 | "output_type": "display_data" 607 | } 608 | ], 609 | "source": [ 610 | "flattenAnneal(learn,4e-3, 20, .72)" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": {}, 617 | "outputs": [], 618 | "source": [] 619 | } 620 | ], 621 | "metadata": { 622 | "kernelspec": { 623 | "display_name": "Python 3", 624 | "language": "python", 625 | "name": "python3" 626 | }, 627 | "language_info": { 628 | "codemirror_mode": { 629 | "name": "ipython", 630 | "version": 3 631 | }, 632 | "file_extension": ".py", 633 | "mimetype": "text/x-python", 634 | "name": "python", 635 | "nbconvert_exporter": "python", 636 | "pygments_lexer": "ipython3", 637 | "version": "3.7.3" 638 | } 639 | }, 640 | "nbformat": 4, 641 | "nbformat_minor": 2 642 | } 643 | -------------------------------------------------------------------------------- /DeepMemory/README.md: -------------------------------------------------------------------------------- 1 | DeepMemory is a new optimizer I came up with after blending DiffGrad + AdaMod. The core concept is to provide the optimizer with 2 | long term memory of the previous step sizes. 3 | 4 | Results in initial testing put it on par with Ranger and both Ranger and DeepMemory topped the recent testing I did with about 8 different optimizers. 5 | 6 | 7 | DeepMemory is designed to offset the weakness of many adaptive optimizers by creating a 'long term' memory of the gradients over the course of an epoch. 8 | This long term memory is averaged against the current adaptive step size generated from the current mini-batch in order to help guide the step size more optimally. 9 | 10 | DeepMemory also keeps a short term gradient buffer that was developed in diffgrad, and locks down the step size when minimal gradient change is detected. 11 | 12 | 1/1/2020 - @lessw2020 developed the long term memory concept as a blended average (vs max throttle in AdaMod), and created and tested deep Memory 13 | credits: 14 | DiffGrad: Uses the local gradient friction clamp developed by DiffGrad, but with version 1 coded by lessw from the paper: 15 | https://github.com/shivram1987/diffGrad (S.R.Dubey et al) 16 | 17 | AdaMod - DeepMemory builds on the concepts for longer term monitoring in AdaMod (b3 concept but changed from min throttling to blended average and changed input to len_memory and size): 18 | 19 | AdaMod source and paper link - https://github.com/lancopku/AdaMod/blob/master/adamod/adamod.py 20 | 21 | modifications @lessw2020 22 | 1/1/20 = instead of b3, change to 'len_memory' and compute b3 (.99 is really 100 memory as 1-(1/100)= .99) 23 | -------------------------------------------------------------------------------- /DeepMemory/deepmemory.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.optim import Optimizer 4 | 5 | # DeepMemory is designed to offset the weakness of many adaptive optimizers by creating a 'long term' memory of the gradients over the course of an epoch. 6 | # This long term memory is averaged against the current adaptive step size generated from the current mini-batch in order to help guide the step size more optimally. 7 | 8 | # DeepMemory also keeps a short term gradient buffer that was developed in diffgrad, and locks down the step size when minimal gradient change is detected. 9 | 10 | # 1/1/2020 - @lessw2020 developed the long term memory concept as a blended average (vs max throttle in AdaMod), and created and tested deep Memory 11 | # credits: 12 | # DiffGrad: Uses the local gradient friction clamp developed by DiffGrad, but with version 1 coded by lessw from the paper: 13 | # https://github.com/shivram1987/diffGrad (S.R.Dubey et al) 14 | 15 | # AdaMod - DeepMemory builds on the concepts for longer term monitoring in AdaMod (b3 concept but changed from min throttling to blended average and changed input to len_memory and size): 16 | 17 | # AdaMod source and paper link - https://github.com/lancopku/AdaMod/blob/master/adamod/adamod.py 18 | 19 | # modifications @lessw2020 20 | # 1/1/20 = instead of b3, change to 'len_memory' and compute b3 (.99 is really 100 memory as 1-(1/100)= .99) 21 | 22 | 23 | class DeepMemory(Optimizer): 24 | """Implements DeepMemory algorithm (built upon DiffGrad and AdaMod concepts) with Decoupled Weight Decay (arxiv.org/abs/1711.05101) 25 | 26 | Arguments: 27 | params (iterable): iterable of parameters to optimize or dicts defining 28 | parameter groups 29 | lr (float, optional): learning rate (default: 1e-3) 30 | betas (Tuple[float, float], optional): coefficients used for computing 31 | running averages of gradient and its square (default: (0.9, 0.999)) 32 | len_memory = b3 (smoothing coefficient from AdaMod) in easier to use format, mem average with b3 is averaged with immmediate gradient. 33 | specify the memory len, b3 is computed. 34 | version = 0 means .5 clamping rate, 1 = 0-1 clamping rate (from DiffGrad) 35 | eps (float, optional): term added to the denominator to improve 36 | numerical stability (default: 1e-8) 37 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 38 | """ 39 | 40 | def __init__(self, params, lr=4e-3, betas=(0.9, 0.999), len_memory=200, version=1, 41 | eps=1e-6, weight_decay=0, debug_print=False): 42 | if not 0.0 <= lr: 43 | raise ValueError("Invalid learning rate: {}".format(lr)) 44 | if not 0.0 <= eps: 45 | raise ValueError("Invalid epsilon value: {}".format(eps)) 46 | if not 0.0 <= betas[0] < 1.0: 47 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 48 | if not 0.0 <= betas[1] < 1.0: 49 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 50 | 51 | #compute b3 52 | base = 1/len_memory 53 | beta3 = 1-(base) 54 | print(f"DeepMemory: length of memory is {len_memory} - this should be close or equal to batches per epoch") 55 | 56 | #debugging 57 | self.debug_print=debug_print 58 | 59 | 60 | if not 0.0 <= beta3 < 1.0: 61 | raise ValueError("Invalid len_memory parameter: {}".format(beta3)) 62 | 63 | defaults = dict(lr=lr, betas=betas, beta3=beta3, eps=eps, 64 | weight_decay=weight_decay) 65 | super().__init__(params, defaults) 66 | 67 | self.version = version 68 | 69 | def __setstate__(self, state): 70 | super().__setstate__(state) 71 | 72 | def step(self, closure=None): 73 | """Performs a single optimization step. 74 | Arguments: 75 | closure (callable, optional): A closure that reevaluates the model 76 | and returns the loss. 77 | """ 78 | loss = None 79 | if closure is not None: 80 | loss = closure() 81 | 82 | for group in self.param_groups: 83 | for p in group['params']: 84 | if p.grad is None: 85 | continue 86 | grad = p.grad.data 87 | if grad.is_sparse: 88 | raise RuntimeError( 89 | 'DiffMod does not support sparse gradients') 90 | 91 | state = self.state[p] 92 | 93 | # State initialization 94 | if len(state) == 0: 95 | state['step'] = 0 96 | # Exponential moving average of gradient values 97 | state['exp_avg'] = torch.zeros_like(p.data) 98 | # Exponential moving average of squared gradient values 99 | state['exp_avg_sq'] = torch.zeros_like(p.data) 100 | # Exponential moving average of actual learning rates 101 | state['exp_avg_lr'] = torch.zeros_like(p.data) 102 | # Previous gradient 103 | state['previous_grad'] = torch.zeros_like(p.data) 104 | 105 | 106 | exp_avg, exp_avg_sq, exp_avg_lr = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_lr'] 107 | previous_grad = state['previous_grad'] 108 | beta1, beta2 = group['betas'] 109 | 110 | state['step'] += 1 111 | 112 | # Decay the first and second moment running average coefficient 113 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 114 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 115 | 116 | denom = exp_avg_sq.sqrt().add_(group['eps']) 117 | 118 | bias_correction1 = 1 - beta1 ** state['step'] 119 | bias_correction2 = 1 - beta2 ** state['step'] 120 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 121 | 122 | # compute diffgrad coefficient (dfc) 123 | if self.version==0: 124 | diff = abs(previous_grad - grad) 125 | 126 | elif self.version ==1: 127 | diff = previous_grad-grad 128 | 129 | 130 | if self.version==0 or self.version==1: 131 | dfc = 1. / (1. + torch.exp(-diff)) 132 | 133 | 134 | state['previous_grad'] = grad 135 | 136 | if group['weight_decay'] != 0: 137 | p.data.add_(-group['weight_decay'] * group['lr'], p.data) 138 | 139 | # create long term memory of actual learning rates (from AdaMod) 140 | step_size = torch.full_like(denom, step_size) 141 | step_size.div_(denom) 142 | exp_avg_lr.mul_(group['beta3']).add_(1 - group['beta3'], step_size) 143 | 144 | if self.debug_print: 145 | print(f"batch step size {step_size} and exp_avg_step {exp_avg_lr}") 146 | 147 | #Blend the mini-batch step size with long term memory 148 | step_size = step_size.add(exp_avg_lr) 149 | step_size = step_size.div(2.) 150 | 151 | 152 | # update momentum with dfc 153 | exp_avg1 = exp_avg * dfc 154 | 155 | step_size.mul_(exp_avg1) 156 | 157 | p.data.add_(-step_size) 158 | 159 | return loss -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Best-Deep-Learning-Optimizers
2 | Collection of the latest, greatest, deep learning optimizers (for Pytorch) - CNN, Transformer, NLP suitable 3 |

4 | Current top performers = Have not run benchmarks lately and a lot has changed. Quick recommendations = transformer or CNN = madgrad / adahessian. For CNN only, Ranger. 5 |

6 | ## Updates - 7 | April 2021: Meet Madgrad!
Have added Madgrad with an improvement to weight decay. Madgrad is a new optimizer released by FB AI in February. In testing with transformers for image classification, madgrad blew away the various Adam variants.
8 | However, as spotted by @nestordemeure, the weight decay impl was like adam instead of adamW. 9 | In testing, AdamW style weight decay was the winner and thus the implementation here is with my modification to use AdamW style wd. 10 | 11 | Recommendations: test with
a)no weight decay, recommended by Madgrad authors and
b)weight decay at same level you would use for AdamW with this madgrad_wd version. 12 |
13 | Important: madgrad is very different than Adam variants...thus recommend you start with madgrad default lr and do quick range of lr tests. Do not just use what worked for you on your dataset with Adam(sh) lr. 14 | 15 | Modified madgrad is here: https://github.com/lessw2020/Best-Deep-Learning-Optimizers/tree/master/madgrad 16 | 17 | And original madgrad is here: https://github.com/facebookresearch/madgrad 18 | 19 | Pending work = there is a new paper discussing Stable Weight Decay as being the ultimate weight decay. Planning to implement and test with madgrad soon. 20 | 21 | August 2020 - AdaHessian, the first 'it really works and works really well' second order optimizer added: 22 | I tested AdaHessian last month on work datasets and it performed extremely well. It's like training with a guided missile compared to most other optimizers. 23 | The big caveat is you will need about 2x the normal GPU memory to run it vs running with a 'first order' optimizer. 24 | I am trying to get a Titan GPU with 24GB GPU memory just for this purpose atm. 25 | 26 | 27 | new version of Ranger with highest accuracy to date for all optimizers tested: 28 | April 11 - New version of Ranger released (20.4.11), highest score for accuracy to date. 29 |
Ranger has been upgraded to use Gradient Centralization. See: https://arxiv.org/abs/2004.01461 and github: https://github.com/Yonghongwei/Gradient-Centralization 30 | 31 | It will now use GC by default, and run it for both conv layers and fc layers. You can turn it on or off with "use_gc" at init to test out the difference on your datasets. 32 | ![](images/projected_gradient.png) 33 | (image from gc github). 34 |
The summary of gradient centralization: "GC can be viewed as a projected gradient descent method with a constrained loss function. The Lipschitzness of the constrained loss function and its gradient is better so that the training process becomes more efficient and stable." 35 |
36 | 37 | Note - for optimal accuracy, make sure you use run with a flat lr for some time and then cosine descent the lr (72% - 28% descent), or if you don't have an lr framework... very comparable results by running at one rate for 75%, then stop and decrease lr, and run remaining 28%. 38 | 39 | ## Usage - GC on by default but you can control all aspects at init: 40 | ![](images/ranger-with-gc-options.jpg) 41 |
42 | ## Ranger will print settings at first init so you can confirm optimization is set the way you want it: 43 | ![](images/ranger-init.jpg) 44 | 45 |
Future work: MARTHE, HyperAdam and other optimizers will be tested and posted if they look good. 46 | 47 |
48 | 12/27 - added DiffGrad, and unofficial version 1 support (coded from the paper). 49 |
50 | 12/28 - added Diff_RGrad = diffGrad + Rectified Adam to start off....seems to work quite well. 51 | 52 | Medium article (summary and FastAI example usage): 53 | https://medium.com/@lessw/meet-diffgrad-new-deep-learning-optimizer-that-solves-adams-overshoot-issue-ec63e28e01b2 54 | 55 | Official diffGrad paper: https://arxiv.org/abs/1909.11015v2 56 | 57 | 12/31 - AdaMod and DiffMod added. Initial SLS files added (but more work needed). 58 | 59 | 60 | In Progress:

61 | A - Parabolic Approximation Line Search: https://arxiv.org/abs/1903.11991v2 62 | 63 | B - Stochastic Line Search (SLS): pending (needs param group support) 64 | 65 | c - AvaGrad 66 | 67 | 68 | General papers of relevance: 69 | 70 | Does Adam stick close to the optimal point? https://arxiv.org/abs/1911.00289v1 71 | 72 | 73 | Probabalistic line searches for stochastic optimization (2017, matlab only but good theory work): https://arxiv.org/abs/1703.10034v2 74 | -------------------------------------------------------------------------------- /Ranger/ranger.py: -------------------------------------------------------------------------------- 1 | # Ranger deep learning optimizer - RAdam + Lookahead + Gradient Centralization, combined into one optimizer. 2 | 3 | # https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer 4 | # and/or 5 | # https://github.com/lessw2020/Best-Deep-Learning-Optimizers 6 | 7 | # Ranger has now been used to capture 12 records on the FastAI leaderboard. 8 | 9 | # This version = 20.4.11 10 | 11 | # Credits: 12 | # Gradient Centralization --> https://arxiv.org/abs/2004.01461v2 (a new optimization technique for DNNs), github: https://github.com/Yonghongwei/Gradient-Centralization 13 | # RAdam --> https://github.com/LiyuanLucasLiu/RAdam 14 | # Lookahead --> rewritten by lessw2020, but big thanks to Github @LonePatient and @RWightman for ideas from their code. 15 | # Lookahead paper --> MZhang,G Hinton https://arxiv.org/abs/1907.08610 16 | 17 | # summary of changes: 18 | # 4/11/20 - add gradient centralization option. Set new testing benchmark for accuracy with it, toggle with use_gc flag at init. 19 | # full code integration with all updates at param level instead of group, moves slow weights into state dict (from generic weights), 20 | # supports group learning rates (thanks @SHolderbach), fixes sporadic load from saved model issues. 21 | # changes 8/31/19 - fix references to *self*.N_sma_threshold; 22 | # changed eps to 1e-5 as better default than 1e-8. 23 | 24 | import math 25 | import torch 26 | from torch.optim.optimizer import Optimizer, required 27 | 28 | 29 | 30 | class Ranger(Optimizer): 31 | 32 | def __init__(self, params, lr=1e-3, # lr 33 | alpha=0.5, k=6, N_sma_threshhold=5, # Ranger options 34 | betas=(.95,0.999), eps=1e-5, weight_decay=0, # Adam options 35 | use_gc=True, gc_conv_only=False # Gradient centralization on or off, applied to conv layers only or conv + fc layers 36 | ): 37 | 38 | #parameter checks 39 | if not 0.0 <= alpha <= 1.0: 40 | raise ValueError(f'Invalid slow update rate: {alpha}') 41 | if not 1 <= k: 42 | raise ValueError(f'Invalid lookahead steps: {k}') 43 | if not lr > 0: 44 | raise ValueError(f'Invalid Learning Rate: {lr}') 45 | if not eps > 0: 46 | raise ValueError(f'Invalid eps: {eps}') 47 | 48 | #parameter comments: 49 | # beta1 (momentum) of .95 seems to work better than .90... 50 | #N_sma_threshold of 5 seems better in testing than 4. 51 | #In both cases, worth testing on your dataset (.90 vs .95, 4 vs 5) to make sure which works best for you. 52 | 53 | #prep defaults and init torch.optim base 54 | defaults = dict(lr=lr, alpha=alpha, k=k, step_counter=0, betas=betas, N_sma_threshhold=N_sma_threshhold, eps=eps, weight_decay=weight_decay) 55 | super().__init__(params,defaults) 56 | 57 | #adjustable threshold 58 | self.N_sma_threshhold = N_sma_threshhold 59 | 60 | 61 | #look ahead params 62 | 63 | self.alpha = alpha 64 | self.k = k 65 | 66 | #radam buffer for state 67 | self.radam_buffer = [[None,None,None] for ind in range(10)] 68 | 69 | #gc on or off 70 | self.use_gc=use_gc 71 | 72 | #level of gradient centralization 73 | self.gc_gradient_threshold = 3 if gc_conv_only else 1 74 | 75 | 76 | print(f"Ranger optimizer loaded. \nGradient Centralization usage = {self.use_gc}") 77 | if (self.use_gc and self.gc_gradient_threshold==1): 78 | print(f"GC applied to both conv and fc layers") 79 | elif (self.use_gc and self.gc_gradient_threshold==3): 80 | print(f"GC applied to conv layers only") 81 | 82 | 83 | 84 | 85 | 86 | def __setstate__(self, state): 87 | print("set state called") 88 | super(Ranger, self).__setstate__(state) 89 | 90 | 91 | def step(self, closure=None): 92 | loss = None 93 | #note - below is commented out b/c I have other work that passes back the loss as a float, and thus not a callable closure. 94 | #Uncomment if you need to use the actual closure... 95 | 96 | #if closure is not None: 97 | #loss = closure() 98 | 99 | #Evaluate averages and grad, update param tensors 100 | for group in self.param_groups: 101 | 102 | for p in group['params']: 103 | if p.grad is None: 104 | continue 105 | grad = p.grad.data.float() 106 | 107 | if grad.is_sparse: 108 | raise RuntimeError('Ranger optimizer does not support sparse gradients') 109 | 110 | p_data_fp32 = p.data.float() 111 | 112 | state = self.state[p] #get state dict for this param 113 | 114 | if len(state) == 0: #if first time to run...init dictionary with our desired entries 115 | #if self.first_run_check==0: 116 | #self.first_run_check=1 117 | #print("Initializing slow buffer...should not see this at load from saved model!") 118 | state['step'] = 0 119 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 120 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 121 | 122 | #look ahead weight storage now in state dict 123 | state['slow_buffer'] = torch.empty_like(p.data) 124 | state['slow_buffer'].copy_(p.data) 125 | 126 | else: 127 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 128 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 129 | 130 | #begin computations 131 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 132 | beta1, beta2 = group['betas'] 133 | 134 | 135 | #GC operation for Conv layers and FC layers 136 | if grad.dim() > self.gc_gradient_threshold: 137 | grad.add_(-grad.mean(dim = tuple(range(1,grad.dim())), keepdim = True)) 138 | 139 | 140 | 141 | state['step'] += 1 142 | 143 | #compute variance mov avg 144 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 145 | #compute mean moving avg 146 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 147 | 148 | 149 | 150 | 151 | 152 | buffered = self.radam_buffer[int(state['step'] % 10)] 153 | 154 | if state['step'] == buffered[0]: 155 | N_sma, step_size = buffered[1], buffered[2] 156 | else: 157 | buffered[0] = state['step'] 158 | beta2_t = beta2 ** state['step'] 159 | N_sma_max = 2 / (1 - beta2) - 1 160 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 161 | buffered[1] = N_sma 162 | if N_sma > self.N_sma_threshhold: 163 | step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) 164 | else: 165 | step_size = 1.0 / (1 - beta1 ** state['step']) 166 | buffered[2] = step_size 167 | 168 | 169 | if group['weight_decay'] != 0: 170 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 171 | 172 | # apply lr 173 | if N_sma > self.N_sma_threshhold: 174 | denom = exp_avg_sq.sqrt().add_(group['eps']) 175 | p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) 176 | else: 177 | p_data_fp32.add_(-step_size * group['lr'], exp_avg) 178 | 179 | p.data.copy_(p_data_fp32) 180 | 181 | #integrated look ahead... 182 | #we do it at the param level instead of group level 183 | if state['step'] % group['k'] == 0: 184 | slow_p = state['slow_buffer'] #get access to slow param tensor 185 | slow_p.add_(self.alpha, p.data - slow_p) #(fast weights - slow weights) * alpha 186 | p.data.copy_(slow_p) #copy interpolated weights to RAdam param tensor 187 | 188 | return loss 189 | -------------------------------------------------------------------------------- /adahessian/README.md: -------------------------------------------------------------------------------- 1 | adahessian is the first 'second order' optimizer that actually performs (and does so extremely well) on real data. 2 | The big drawback is you'll need to have about 2x the GPU memory that you would otherwise need to run. 3 | 4 | The official github for adahessian is here: 5 | https://github.com/amirgholami/adahessian 6 | 7 | In the implementation here, I've consolidated it into a single file import instead of the util + optim file like in the official repo to make it easier to use. 8 | 9 | Note that you have to update your training loop as below: 10 | # usage example: 11 | from adahessian import Adahessian, get_params_grad 12 | import torch.optim.lr_scheduler as lr_scheduler 13 | # 14 | optimizer = Adahessian(model.parameters(),lr=.15) 15 | scheduler = lr_scheduler.MultiStepLR( 16 | optimizer, 17 | [30,45], # 18 | gamma=.1, 19 | last_epoch=-1) 20 | 21 | # 22 | # config for training loop: 23 | # 24 | loss.backward(create_graph=True) 25 | _, gradsH = get_params_grad(model) 26 | optimizer.step(gradsH) 27 | 28 | 29 | -------------------------------------------------------------------------------- /adahessian/adahessian.py: -------------------------------------------------------------------------------- 1 | #* 2 | # @file Different utility functions 3 | # Copyright (c) Zhewei Yao, Amir Gholami, Sheng Shen 4 | # All rights reserved. 5 | # This file is part of AdaHessian library. 6 | # source: https://github.com/amirgholami/adahessian 7 | # 8 | # AdaHessian is free software: you can redistribute it and/or modify 9 | # it under the terms of the GNU General Public License as published by 10 | # the Free Software Foundation, either version 3 of the License, or 11 | # (at your option) any later version. 12 | # 13 | # AdaHessian is distributed in the hope that it will be useful, 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | # GNU General Public License for more details. 17 | # 18 | # You should have received a copy of the GNU General Public License 19 | # along with adahessian. If not, see . 20 | #* 21 | 22 | import math 23 | import torch 24 | from torch.optim.optimizer import Optimizer 25 | from copy import deepcopy 26 | import numpy as np 27 | 28 | # imported from utils to avoid needing two imports... @lessw2020 29 | def get_params_grad(model): 30 | """ 31 | get model parameters and corresponding gradients 32 | """ 33 | params = [] 34 | grads = [] 35 | for param in model.parameters(): 36 | if not param.requires_grad: 37 | continue 38 | params.append(param) 39 | grads.append(0. if param.grad is None else param.grad + 0.) 40 | return params, grads 41 | 42 | 43 | class Adahessian(Optimizer): 44 | """Implements Adahessian algorithm. 45 | It has been proposed in `ADAHESSIAN: An Adaptive Second OrderOptimizer for Machine Learning`. 46 | Arguments: 47 | params (iterable): iterable of parameters to optimize or dicts defining 48 | parameter groups 49 | lr (float, optional): learning rate (default: 0.15) 50 | betas (Tuple[float, float], optional): coefficients used for computing 51 | running averages of gradient and its square (default: (0.9, 0.999)) 52 | eps (float, optional): term added to the denominator to improve 53 | numerical stability (default: 1e-4) 54 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 55 | hessian_power (float, optional): Hessian power (default: 1) 56 | """ 57 | 58 | def __init__(self, params, lr=0.15, betas=(0.9, 0.999), eps=1e-4, 59 | weight_decay=0, hessian_power=1): 60 | if not 0.0 <= lr: 61 | raise ValueError("Invalid learning rate: {}".format(lr)) 62 | if not 0.0 <= eps: 63 | raise ValueError("Invalid epsilon value: {}".format(eps)) 64 | if not 0.0 <= betas[0] < 1.0: 65 | raise ValueError( 66 | "Invalid beta parameter at index 0: {}".format( 67 | betas[0])) 68 | if not 0.0 <= betas[1] < 1.0: 69 | raise ValueError( 70 | "Invalid beta parameter at index 1: {}".format( 71 | betas[1])) 72 | if not 0.0 <= hessian_power <= 1.0: 73 | raise ValueError("Invalid Hessian power value: {}".format(hessian_power)) 74 | defaults = dict(lr=lr, betas=betas, eps=eps, 75 | weight_decay=weight_decay, hessian_power=hessian_power) 76 | 77 | super(Adahessian, self).__init__(params, defaults) 78 | 79 | def get_trace(self, gradsH): 80 | """ 81 | compute the Hessian vector product with a random vector v, at the current gradient point, 82 | i.e., compute the gradient of . 83 | :param gradsH: a list of torch variables 84 | :return: a list of torch tensors 85 | """ 86 | 87 | params = self.param_groups[0]['params'] 88 | 89 | v = [torch.randint_like(p, high=2, device='cuda') for p in params] 90 | for v_i in v: 91 | v_i[v_i == 0] = -1 92 | hvs = torch.autograd.grad( 93 | gradsH, 94 | params, 95 | grad_outputs=v, 96 | only_inputs=True, 97 | retain_graph=True) 98 | 99 | hutchinson_trace = [] 100 | for hv, vi in zip(hvs, v): 101 | param_size = hv.size() 102 | if len(param_size) <= 2: # for 0/1/2D tensor 103 | tmp_output = torch.abs(hv * vi) 104 | hutchinson_trace.append(tmp_output) # Hessian diagonal block size is 1 here. 105 | elif len(param_size) == 4: # Conv kernel 106 | tmp_output = torch.abs(torch.sum(torch.abs( 107 | hv * vi), dim=[2, 3], keepdim=True)) / vi[0, 1].numel() # Hessian diagonal block size is 9 here: torch.sum() reduces the dim 2/3. 108 | hutchinson_trace.append(tmp_output) 109 | 110 | return hutchinson_trace 111 | 112 | def step(self, gradsH, closure=None): 113 | """Performs a single optimization step. 114 | Arguments: 115 | gradsH: The gradient used to compute Hessian vector product. 116 | closure (callable, optional): A closure that reevaluates the model 117 | and returns the loss. 118 | """ 119 | loss = None 120 | if closure is not None: 121 | loss = closure() 122 | 123 | # get the Hessian diagonal 124 | hut_trace = self.get_trace(gradsH) 125 | 126 | for group in self.param_groups: 127 | for i, p in enumerate(group['params']): 128 | if p.grad is None: 129 | continue 130 | 131 | grad = deepcopy(gradsH[i].data) 132 | state = self.state[p] 133 | 134 | # State initialization 135 | if len(state) == 0: 136 | state['step'] = 0 137 | # Exponential moving average of gradient values 138 | state['exp_avg'] = torch.zeros_like(p.data) 139 | # Exponential moving average of Hessian diagonal square values 140 | state['exp_hessian_diag_sq'] = torch.zeros_like(p.data) 141 | 142 | exp_avg, exp_hessian_diag_sq = state['exp_avg'], state['exp_hessian_diag_sq'] 143 | 144 | beta1, beta2 = group['betas'] 145 | 146 | state['step'] += 1 147 | 148 | # Decay the first and second moment running average coefficient 149 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 150 | exp_hessian_diag_sq.mul_(beta2).addcmul_( 151 | 1 - beta2, hut_trace[i], hut_trace[i]) 152 | 153 | bias_correction1 = 1 - beta1 ** state['step'] 154 | bias_correction2 = 1 - beta2 ** state['step'] 155 | 156 | # make the square root, and the Hessian power 157 | k = group['hessian_power'] 158 | denom = ( 159 | (exp_hessian_diag_sq.sqrt() ** k) / 160 | math.sqrt(bias_correction2) ** k).add_( 161 | group['eps']) 162 | 163 | # make update 164 | p.data = p.data - \ 165 | group['lr'] * (exp_avg / bias_correction1 / denom + group['weight_decay'] * p.data) 166 | 167 | return loss 168 | -------------------------------------------------------------------------------- /adamod/README.md: -------------------------------------------------------------------------------- 1 | AdaMod is a new optimizer that takes Adam but adds an exponential moving average of the adaptive learning rates. 2 | This ensures no large spikes during training and helps achieve faster and better convergence. 3 | 4 | Original source code and paper: https://github.com/lancopku/AdaMod 5 | 6 | DiffMod is a combination of DiffGrad + AdaMod = diffgrad. 7 | 8 | Currently DiffMod, using version 0 of DiffGrad, appears to be the best performer of all. But more testing is needed.
9 | 10 | Usage:
11 | from diffmod import DiffMod
12 | optar = partial(DiffMod,version=0)
13 | learn = Learner(data, model, metrics=[accuracy], wd=1e-3,
14 | opt_func=optar,
15 | bn_wd=False, true_wd=True,
16 | loss_func = LabelSmoothingCrossEntropy())
17 | -------------------------------------------------------------------------------- /adamod/adamod.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.optim import Optimizer 4 | 5 | #source - https://github.com/lancopku/AdaMod/blob/master/adamod/adamod.py 6 | #modification - lessw2020 - use len_memory as integer lookback, convert to beta3 for easier usage 7 | 8 | class AdaMod(Optimizer): 9 | """Implements AdaMod algorithm with Decoupled Weight Decay (arxiv.org/abs/1711.05101) 10 | It has been proposed in `Adaptive and Momental Bounds for Adaptive Learning Rate Methods`_. 11 | Arguments: 12 | params (iterable): iterable of parameters to optimize or dicts defining 13 | parameter groups 14 | lr (float, optional): learning rate (default: 1e-3) 15 | betas (Tuple[float, float], optional): coefficients used for computing 16 | running averages of gradient and its square (default: (0.9, 0.999)) 17 | beta3 (float, optional): smoothing coefficient for adaptive learning rates (default: 0.9999) 18 | eps (float, optional): term added to the denominator to improve 19 | numerical stability (default: 1e-8) 20 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 21 | """ 22 | 23 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), 24 | len_memory=1000, #will convert to beta3 25 | eps=1e-8, weight_decay=0): 26 | if not 0.0 <= lr: 27 | raise ValueError("Invalid learning rate: {}".format(lr)) 28 | if not 0.0 <= eps: 29 | raise ValueError("Invalid epsilon value: {}".format(eps)) 30 | if not 0.0 <= betas[0] < 1.0: 31 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 32 | if not 0.0 <= betas[1] < 1.0: 33 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 34 | 35 | beta3 = 1 - (1/len_memory) 36 | print(f"AdaMod optimizer: len_memory of {len_memory} set at Beta3 of {beta3}") 37 | if not 0.0 <= beta3 < 1.0: 38 | raise ValueError("Invalid beta3 parameter: {}".format(beta3)) 39 | defaults = dict(lr=lr, betas=betas, beta3=beta3, eps=eps, 40 | weight_decay=weight_decay) 41 | super().__init__(params, defaults) 42 | 43 | def __setstate__(self, state): 44 | super().__setstate__(state) 45 | 46 | def step(self, closure=None): 47 | """Performs a single optimization step. 48 | Arguments: 49 | closure (callable, optional): A closure that reevaluates the model 50 | and returns the loss. 51 | """ 52 | loss = None 53 | if closure is not None: 54 | loss = closure() 55 | 56 | for group in self.param_groups: 57 | for p in group['params']: 58 | if p.grad is None: 59 | continue 60 | grad = p.grad.data 61 | if grad.is_sparse: 62 | raise RuntimeError( 63 | 'AdaMod does not support sparse gradients') 64 | 65 | state = self.state[p] 66 | 67 | # State initialization 68 | if len(state) == 0: 69 | state['step'] = 0 70 | # Exponential moving average of gradient values 71 | state['exp_avg'] = torch.zeros_like(p.data) 72 | # Exponential moving average of squared gradient values 73 | state['exp_avg_sq'] = torch.zeros_like(p.data) 74 | # Exponential moving average of actual learning rates 75 | state['exp_avg_lr'] = torch.zeros_like(p.data) 76 | 77 | exp_avg, exp_avg_sq, exp_avg_lr = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_lr'] 78 | beta1, beta2 = group['betas'] 79 | 80 | state['step'] += 1 81 | 82 | # Decay the first and second moment running average coefficient 83 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 84 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 85 | 86 | denom = exp_avg_sq.sqrt().add_(group['eps']) 87 | 88 | bias_correction1 = 1 - beta1 ** state['step'] 89 | bias_correction2 = 1 - beta2 ** state['step'] 90 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 91 | 92 | if group['weight_decay'] != 0: 93 | p.data.add_(-group['weight_decay'] * group['lr'], p.data) 94 | 95 | # Applies momental bounds on actual learning rates 96 | step_size = torch.full_like(denom, step_size) 97 | step_size.div_(denom) 98 | exp_avg_lr.mul_(group['beta3']).add_(1 - group['beta3'], step_size) 99 | step_size = torch.min(step_size, exp_avg_lr) 100 | step_size.mul_(exp_avg) 101 | 102 | p.data.add_(-step_size) 103 | 104 | return loss 105 | -------------------------------------------------------------------------------- /adamod/diffmod.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.optim import Optimizer 4 | 5 | # source and paper link - https://github.com/lancopku/AdaMod/blob/master/adamod/adamod.py 6 | 7 | # modifications @lessw2020 - blend diffGrad + AdaMod = diffmod. 8 | # 1/1/20 = instead of b3, change to 'len_memory' and compute b3 (.99 is really 100 memory as 1-(1/100)= .99) 9 | 10 | 11 | class DiffMod(Optimizer): 12 | """Implements AdaMod algorithm with Decoupled Weight Decay (arxiv.org/abs/1711.05101) 13 | It has been proposed in `Adaptive and Momental Bounds for Adaptive Learning Rate Methods`_. 14 | Arguments: 15 | params (iterable): iterable of parameters to optimize or dicts defining 16 | parameter groups 17 | lr (float, optional): learning rate (default: 1e-3) 18 | betas (Tuple[float, float], optional): coefficients used for computing 19 | running averages of gradient and its square (default: (0.9, 0.999)) 20 | beta3 (float, optional): smoothing coefficient for adaptive learning rates (default: 0.9999) 21 | len_memory = b3 in easier to use format. specify the memory len, b3 is computed. 22 | eps (float, optional): term added to the denominator to improve 23 | numerical stability (default: 1e-8) 24 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 25 | """ 26 | 27 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), len_memory=1000, version=0, 28 | eps=1e-8, weight_decay=0): 29 | if not 0.0 <= lr: 30 | raise ValueError("Invalid learning rate: {}".format(lr)) 31 | if not 0.0 <= eps: 32 | raise ValueError("Invalid epsilon value: {}".format(eps)) 33 | if not 0.0 <= betas[0] < 1.0: 34 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 35 | if not 0.0 <= betas[1] < 1.0: 36 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 37 | 38 | #compute b3 39 | beta3 = 1-(1/len_memory) 40 | print(f"length of memory is ",len_memory," and b3 is thus ",beta3) 41 | 42 | if not 0.0 <= beta3 < 1.0: 43 | raise ValueError("Invalid beta3 parameter: {}".format(beta3)) 44 | 45 | defaults = dict(lr=lr, betas=betas, beta3=beta3, eps=eps, 46 | weight_decay=weight_decay) 47 | super().__init__(params, defaults) 48 | 49 | self.version = version 50 | 51 | def __setstate__(self, state): 52 | super().__setstate__(state) 53 | 54 | def step(self, closure=None): 55 | """Performs a single optimization step. 56 | Arguments: 57 | closure (callable, optional): A closure that reevaluates the model 58 | and returns the loss. 59 | """ 60 | loss = None 61 | if closure is not None: 62 | loss = closure() 63 | 64 | for group in self.param_groups: 65 | for p in group['params']: 66 | if p.grad is None: 67 | continue 68 | grad = p.grad.data 69 | if grad.is_sparse: 70 | raise RuntimeError( 71 | 'DiffMod does not support sparse gradients') 72 | 73 | state = self.state[p] 74 | 75 | # State initialization 76 | if len(state) == 0: 77 | state['step'] = 0 78 | # Exponential moving average of gradient values 79 | state['exp_avg'] = torch.zeros_like(p.data) 80 | # Exponential moving average of squared gradient values 81 | state['exp_avg_sq'] = torch.zeros_like(p.data) 82 | # Exponential moving average of actual learning rates 83 | state['exp_avg_lr'] = torch.zeros_like(p.data) 84 | # Previous gradient 85 | state['previous_grad'] = torch.zeros_like(p.data) 86 | 87 | 88 | exp_avg, exp_avg_sq, exp_avg_lr = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_lr'] 89 | previous_grad = state['previous_grad'] 90 | beta1, beta2 = group['betas'] 91 | 92 | state['step'] += 1 93 | 94 | # Decay the first and second moment running average coefficient 95 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 96 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 97 | 98 | denom = exp_avg_sq.sqrt().add_(group['eps']) 99 | 100 | bias_correction1 = 1 - beta1 ** state['step'] 101 | bias_correction2 = 1 - beta2 ** state['step'] 102 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 103 | 104 | # compute diffgrad coefficient (dfc) 105 | 106 | 107 | if self.version==0: 108 | diff = abs(previous_grad - grad) 109 | elif self.version ==1: 110 | diff = previous_grad-grad 111 | elif self.version ==2: 112 | diff = .5*abs(previous_grad - grad) 113 | 114 | if self.version==0 or self.version==1: 115 | dfc = 1. / (1. + torch.exp(-diff)) 116 | elif self.version==2: 117 | dfc = 9. / (1. + torch.exp(-diff))-4 #DFC2 = 9/(1+e-(.5/g/)-4 #range .5,5 118 | 119 | state['previous_grad'] = grad 120 | 121 | if group['weight_decay'] != 0: 122 | p.data.add_(-group['weight_decay'] * group['lr'], p.data) 123 | 124 | # Applies momental bounds on actual learning rates 125 | step_size = torch.full_like(denom, step_size) 126 | step_size.div_(denom) 127 | exp_avg_lr.mul_(group['beta3']).add_(1 - group['beta3'], step_size) 128 | step_size = torch.min(step_size, exp_avg_lr) 129 | 130 | # update momentum with dfc 131 | exp_avg1 = exp_avg * dfc 132 | 133 | step_size.mul_(exp_avg1) 134 | 135 | p.data.add_(-step_size) 136 | 137 | return loss -------------------------------------------------------------------------------- /diffgrad/README.md: -------------------------------------------------------------------------------- 1 | DiffGrad adjusts the step size for each parameter by comparing the current gradient vs the previous. It is designed to solve the 'Adam' 2 | overshoot problem, where the momentum of Adam can carry it right over the global mininimum. 3 | 4 | https://github.com/shivram1987/diffGrad for original source 5 | 6 | and paper: https://arxiv.org/abs/1909.11015v2 7 | 8 | (TF version - if you are forced to use TF, here's a TF version of diffgrad: 9 | https://github.com/evanatyourservice/diffGrad-tf ) 10 | 11 | 12 | This version adds in a version parameter: version 0 is the main one used in the paper. version 1 removes the abs value from the calculations and 13 | allows faster clamping. 14 | Use: version=1 in your optimizer params. version=0 is default. 15 | 16 | 12/27 - added DiffRGrad - this is diffGrad with Rectified Adam to start. Thus no warmup needed and diffGrad kicks in after Rectified Adam says variance is ready to go. 17 | 18 | Medium article and example usage: https://medium.com/@lessw/meet-diffgrad-new-deep-learning-optimizer-that-solves-adams-overshoot-issue-ec63e28e01b2 19 | -------------------------------------------------------------------------------- /diffgrad/diff_rgrad.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.optim.optimizer import Optimizer, required 4 | 5 | # Original source: DiffGrad: https://github.com/shivram1987/diffGrad/blob/master/diffGrad.py 6 | # RAam: https://github.com/LiyuanLucasLiu/RAdam/blob/master/radam.py 7 | # modifications: @lessw2020 - blend RAdam with DiffGrad and add version options 8 | # __version__: 12.27.19 9 | 10 | 11 | class diffRGrad(Optimizer): 12 | 13 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, 14 | version=1, 15 | weight_decay=0, degenerated_to_sgd=True): 16 | if not 0.0 <= lr: 17 | raise ValueError("Invalid learning rate: {}".format(lr)) 18 | if not 0.0 <= eps: 19 | raise ValueError("Invalid epsilon value: {}".format(eps)) 20 | if not 0.0 <= betas[0] < 1.0: 21 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 22 | if not 0.0 <= betas[1] < 1.0: 23 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 24 | 25 | self.degenerated_to_sgd = degenerated_to_sgd 26 | 27 | self.version = version 28 | 29 | if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): 30 | for param in params: 31 | if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]): 32 | param['buffer'] = [[None, None, None] for _ in range(10)] 33 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)]) 34 | super(diffRGrad, self).__init__(params, defaults) 35 | 36 | def __setstate__(self, state): 37 | super(diffRGrad, self).__setstate__(state) 38 | 39 | def step(self, closure=None): 40 | 41 | loss = None 42 | if closure is not None: 43 | loss = closure() 44 | 45 | for group in self.param_groups: 46 | 47 | for p in group['params']: 48 | if p.grad is None: 49 | continue 50 | grad = p.grad.data.float() 51 | if grad.is_sparse: 52 | raise RuntimeError('diffGRad does not support sparse gradients') 53 | 54 | p_data_fp32 = p.data.float() 55 | 56 | state = self.state[p] 57 | 58 | if len(state) == 0: 59 | state['step'] = 0 60 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 61 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 62 | # Previous gradient 63 | state['previous_grad'] = torch.zeros_like(p_data_fp32) 64 | 65 | else: 66 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 67 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 68 | state['previous_grad'] = state['previous_grad'].type_as(p_data_fp32) 69 | 70 | 71 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 72 | previous_grad = state['previous_grad'] 73 | beta1, beta2 = group['betas'] 74 | 75 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 76 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 77 | 78 | state['step'] += 1 79 | 80 | # compute diffgrad coefficient (dfc) 81 | 82 | #print("grad = ",grad.size()) 83 | #print("prev_grad = ",previous_grad.size()) 84 | 85 | if self.version==0: 86 | diff = abs(previous_grad - grad) 87 | elif self.version ==1: 88 | diff = previous_grad-grad 89 | elif self.version ==2: 90 | diff = .5*abs(previous_grad - grad) 91 | 92 | if self.version==0 or self.version==1: 93 | dfc = 1. / (1. + torch.exp(-diff)) 94 | elif self.version==2: 95 | dfc = 9. / (1. + torch.exp(-diff))-4 #DFC2 = 9/(1+e-(.5/g/)-4 #range .5,5 96 | 97 | state['previous_grad'] = grad 98 | 99 | 100 | buffered = group['buffer'][int(state['step'] % 10)] 101 | if state['step'] == buffered[0]: 102 | N_sma, step_size = buffered[1], buffered[2] 103 | else: 104 | buffered[0] = state['step'] 105 | beta2_t = beta2 ** state['step'] 106 | N_sma_max = 2 / (1 - beta2) - 1 107 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 108 | buffered[1] = N_sma 109 | 110 | # more conservative since it's an approximated value 111 | if N_sma >= 5: 112 | step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) 113 | elif self.degenerated_to_sgd: 114 | step_size = 1.0 / (1 - beta1 ** state['step']) 115 | else: 116 | step_size = -1 117 | buffered[2] = step_size 118 | 119 | 120 | 121 | 122 | # more conservative since it's an approximated value 123 | if N_sma >= 5: 124 | if group['weight_decay'] != 0: 125 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 126 | 127 | denom = exp_avg_sq.sqrt().add_(group['eps']) 128 | 129 | # update momentum with dfc 130 | #print("dfc ",dfc.size()) 131 | #print("exp_avg ",exp_avg.size()) 132 | exp_avg1 = exp_avg * dfc.float() 133 | 134 | 135 | p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg1, denom) 136 | p.data.copy_(p_data_fp32) 137 | 138 | elif step_size > 0: 139 | 140 | #print("exp_avg in elif",exp_avg.size()) 141 | if group['weight_decay'] != 0: 142 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 143 | 144 | p_data_fp32.add_(-step_size * group['lr'], exp_avg) 145 | p.data.copy_(p_data_fp32) 146 | 147 | return loss 148 | -------------------------------------------------------------------------------- /diffgrad/diffgrad.py: -------------------------------------------------------------------------------- 1 | 2 | import math 3 | import torch 4 | from torch.optim.optimizer import Optimizer 5 | import numpy as np 6 | import torch.nn as nn 7 | #import torch.optim as Optimizer 8 | 9 | # Original source: https://github.com/shivram1987/diffGrad/blob/master/diffGrad.py 10 | 11 | # modifications: @lessw2020 12 | 13 | 14 | class DiffGrad(Optimizer): 15 | r"""Implements diffGrad algorithm. It is modified from the pytorch implementation of Adam. 16 | It has been proposed in `diffGrad: An Optimization Method for Convolutional Neural Networks`_. 17 | Arguments: 18 | params (iterable): iterable of parameters to optimize or dicts defining 19 | parameter groups 20 | lr (float, optional): learning rate (default: 1e-3) 21 | betas (Tuple[float, float], optional): coefficients used for computing 22 | running averages of gradient and its square (default: (0.9, 0.999)) 23 | eps (float, optional): term added to the denominator to improve 24 | numerical stability (default: 1e-8) 25 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 26 | amsgrad (boolean, optional): whether to use the AMSGrad variant of this 27 | algorithm from the paper `On the Convergence of Adam and Beyond`_ 28 | (default: False) 29 | .. _diffGrad: An Optimization Method for Convolutional Neural Networks: 30 | https://arxiv.org/abs/1909.11015 31 | .. _Adam\: A Method for Stochastic Optimization: 32 | https://arxiv.org/abs/1412.6980 33 | .. _On the Convergence of Adam and Beyond: 34 | https://openreview.net/forum?id=ryQu7f-RZ 35 | """ 36 | 37 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, version=0, weight_decay=0): 38 | if not 0.0 <= lr: 39 | raise ValueError("Invalid learning rate: {}".format(lr)) 40 | if not 0.0 <= eps: 41 | raise ValueError("Invalid epsilon value: {}".format(eps)) 42 | if not 0.0 <= betas[0] < 1.0: 43 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 44 | if not 0.0 <= betas[1] < 1.0: 45 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 46 | 47 | 48 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 49 | 50 | super().__init__(params, defaults) 51 | 52 | #save version 53 | self.version = version 54 | 55 | def __setstate__(self, state): 56 | super().__setstate__(state) 57 | 58 | def step(self, closure=None): 59 | """Performs a single optimization step. 60 | Arguments: 61 | closure (callable, optional): A closure that reevaluates the model 62 | and returns the loss. 63 | """ 64 | loss = None 65 | if closure is not None: 66 | loss = closure() 67 | 68 | for group in self.param_groups: 69 | for p in group['params']: 70 | if p.grad is None: 71 | continue 72 | grad = p.grad.data 73 | if grad.is_sparse: 74 | raise RuntimeError('diffGrad does not support sparse gradients, please consider SparseAdam instead') 75 | 76 | state = self.state[p] 77 | 78 | # State initialization 79 | if len(state) == 0: 80 | state['step'] = 0 81 | # Exponential moving average of gradient values 82 | state['exp_avg'] = torch.zeros_like(p.data) 83 | # Exponential moving average of squared gradient values 84 | state['exp_avg_sq'] = torch.zeros_like(p.data) 85 | # Previous gradient 86 | state['previous_grad'] = torch.zeros_like(p.data) 87 | 88 | exp_avg, exp_avg_sq, previous_grad = state['exp_avg'], state['exp_avg_sq'], state['previous_grad'] 89 | beta1, beta2 = group['betas'] 90 | 91 | state['step'] += 1 92 | 93 | if group['weight_decay'] != 0: 94 | grad.add_(group['weight_decay'], p.data) 95 | 96 | # Decay the first and second moment running average coefficient 97 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 98 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 99 | denom = exp_avg_sq.sqrt().add_(group['eps']) 100 | 101 | bias_correction1 = 1 - beta1 ** state['step'] 102 | bias_correction2 = 1 - beta2 ** state['step'] 103 | 104 | # compute diffgrad coefficient (dfc) 105 | 106 | 107 | if self.version==0: 108 | diff = abs(previous_grad - grad) 109 | elif self.version ==1: 110 | diff = previous_grad-grad 111 | elif self.version ==2: 112 | diff = .5*abs(previous_grad - grad) 113 | 114 | if self.version==0 or self.version==1: 115 | dfc = 1. / (1. + torch.exp(-diff)) 116 | elif self.version==2: 117 | dfc = 9. / (1. + torch.exp(-diff))-4 #DFC2 = 9/(1+e-(.5/g/)-4 #range .5,5 118 | 119 | state['previous_grad'] = grad 120 | 121 | # update momentum with dfc 122 | exp_avg1 = exp_avg * dfc 123 | 124 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 125 | 126 | p.data.addcdiv_(-step_size, exp_avg1, denom) 127 | 128 | return loss -------------------------------------------------------------------------------- /diffgrad/mxresnet.py: -------------------------------------------------------------------------------- 1 | #FastAI's XResnet modified to use Mish activation function, MXResNet 2 | #https://github.com/fastai/fastai/blob/master/fastai/vision/models/xresnet.py 3 | #modified by lessw2020 - github: https://github.com/lessw2020/mish 4 | 5 | 6 | from fastai.torch_core import * 7 | import torch.nn as nn 8 | import torch,math,sys 9 | import torch.utils.model_zoo as model_zoo 10 | from functools import partial 11 | #from ...torch_core import Module 12 | from fastai.torch_core import Module 13 | 14 | import torch.nn.functional as F #(uncomment if needed,but you likely already have it) 15 | 16 | 17 | class Mish(nn.Module): 18 | def __init__(self): 19 | super().__init__() 20 | print("Mish activation loaded...") 21 | 22 | def forward(self, x): 23 | #save 1 second per epoch with no x= x*() and then return x...just inline it. 24 | return x *( torch.tanh(F.softplus(x))) 25 | 26 | 27 | 28 | 29 | 30 | #Unmodified from https://github.com/fastai/fastai/blob/5c51f9eabf76853a89a9bc5741804d2ed4407e49/fastai/layers.py 31 | def conv1d(ni:int, no:int, ks:int=1, stride:int=1, padding:int=0, bias:bool=False): 32 | "Create and initialize a `nn.Conv1d` layer with spectral normalization." 33 | conv = nn.Conv1d(ni, no, ks, stride=stride, padding=padding, bias=bias) 34 | nn.init.kaiming_normal_(conv.weight) 35 | if bias: conv.bias.data.zero_() 36 | return spectral_norm(conv) 37 | 38 | 39 | 40 | # Adapted from SelfAttention layer at https://github.com/fastai/fastai/blob/5c51f9eabf76853a89a9bc5741804d2ed4407e49/fastai/layers.py 41 | # Inspired by https://arxiv.org/pdf/1805.08318.pdf 42 | class SimpleSelfAttention(nn.Module): 43 | 44 | def __init__(self, n_in:int, ks=1, sym=False):#, n_out:int): 45 | super().__init__() 46 | 47 | self.conv = conv1d(n_in, n_in, ks, padding=ks//2, bias=False) 48 | 49 | self.gamma = nn.Parameter(tensor([0.])) 50 | 51 | self.sym = sym 52 | self.n_in = n_in 53 | 54 | def forward(self,x): 55 | 56 | 57 | if self.sym: 58 | # symmetry hack by https://github.com/mgrankin 59 | c = self.conv.weight.view(self.n_in,self.n_in) 60 | c = (c + c.t())/2 61 | self.conv.weight = c.view(self.n_in,self.n_in,1) 62 | 63 | size = x.size() 64 | x = x.view(*size[:2],-1) # (C,N) 65 | 66 | # changed the order of mutiplication to avoid O(N^2) complexity 67 | # (x*xT)*(W*x) instead of (x*(xT*(W*x))) 68 | 69 | convx = self.conv(x) # (C,C) * (C,N) = (C,N) => O(NC^2) 70 | xxT = torch.bmm(x,x.permute(0,2,1).contiguous()) # (C,N) * (N,C) = (C,C) => O(NC^2) 71 | 72 | o = torch.bmm(xxT, convx) # (C,C) * (C,N) = (C,N) => O(NC^2) 73 | 74 | o = self.gamma * o + x 75 | 76 | 77 | return o.view(*size).contiguous() 78 | 79 | 80 | 81 | 82 | 83 | __all__ = ['MXResNet', 'mxresnet18', 'mxresnet34', 'mxresnet50', 'mxresnet101', 'mxresnet152'] 84 | 85 | # or: ELU+init (a=0.54; gain=1.55) 86 | act_fn = Mish() #nn.ReLU(inplace=True) 87 | 88 | class Flatten(Module): 89 | def forward(self, x): return x.view(x.size(0), -1) 90 | 91 | def init_cnn(m): 92 | if getattr(m, 'bias', None) is not None: nn.init.constant_(m.bias, 0) 93 | if isinstance(m, (nn.Conv2d,nn.Linear)): nn.init.kaiming_normal_(m.weight) 94 | for l in m.children(): init_cnn(l) 95 | 96 | def conv(ni, nf, ks=3, stride=1, bias=False): 97 | return nn.Conv2d(ni, nf, kernel_size=ks, stride=stride, padding=ks//2, bias=bias) 98 | 99 | def noop(x): return x 100 | 101 | def conv_layer(ni, nf, ks=3, stride=1, zero_bn=False, act=True): 102 | bn = nn.BatchNorm2d(nf) 103 | nn.init.constant_(bn.weight, 0. if zero_bn else 1.) 104 | layers = [conv(ni, nf, ks, stride=stride), bn] 105 | if act: layers.append(act_fn) 106 | return nn.Sequential(*layers) 107 | 108 | class ResBlock(Module): 109 | def __init__(self, expansion, ni, nh, stride=1,sa=False, sym=False): 110 | nf,ni = nh*expansion,ni*expansion 111 | layers = [conv_layer(ni, nh, 3, stride=stride), 112 | conv_layer(nh, nf, 3, zero_bn=True, act=False) 113 | ] if expansion == 1 else [ 114 | conv_layer(ni, nh, 1), 115 | conv_layer(nh, nh, 3, stride=stride), 116 | conv_layer(nh, nf, 1, zero_bn=True, act=False) 117 | ] 118 | self.sa = SimpleSelfAttention(nf,ks=1,sym=sym) if sa else noop 119 | self.convs = nn.Sequential(*layers) 120 | # TODO: check whether act=True works better 121 | self.idconv = noop if ni==nf else conv_layer(ni, nf, 1, act=False) 122 | self.pool = noop if stride==1 else nn.AvgPool2d(2, ceil_mode=True) 123 | 124 | def forward(self, x): return act_fn(self.sa(self.convs(x)) + self.idconv(self.pool(x))) 125 | 126 | def filt_sz(recep): return min(64, 2**math.floor(math.log2(recep*0.75))) 127 | 128 | class MXResNet(nn.Sequential): 129 | def __init__(self, expansion, layers, c_in=3, c_out=1000, sa = False, sym= False): 130 | stem = [] 131 | sizes = [c_in,32,64,64] #modified per Grankin 132 | for i in range(3): 133 | stem.append(conv_layer(sizes[i], sizes[i+1], stride=2 if i==0 else 1)) 134 | #nf = filt_sz(c_in*9) 135 | #stem.append(conv_layer(c_in, nf, stride=2 if i==1 else 1)) 136 | #c_in = nf 137 | 138 | block_szs = [64//expansion,64,128,256,512] 139 | blocks = [self._make_layer(expansion, block_szs[i], block_szs[i+1], l, 1 if i==0 else 2, sa = sa if i in[len(layers)-4] else False, sym=sym) 140 | for i,l in enumerate(layers)] 141 | super().__init__( 142 | *stem, 143 | nn.MaxPool2d(kernel_size=3, stride=2, padding=1), 144 | *blocks, 145 | nn.AdaptiveAvgPool2d(1), Flatten(), 146 | nn.Linear(block_szs[-1]*expansion, c_out), 147 | ) 148 | init_cnn(self) 149 | 150 | def _make_layer(self, expansion, ni, nf, blocks, stride, sa=False, sym=False): 151 | return nn.Sequential( 152 | *[ResBlock(expansion, ni if i==0 else nf, nf, stride if i==0 else 1, sa if i in [blocks -1] else False,sym) 153 | for i in range(blocks)]) 154 | 155 | def mxresnet(expansion, n_layers, name, pretrained=False, **kwargs): 156 | model = MXResNet(expansion, n_layers, **kwargs) 157 | if pretrained: 158 | #model.load_state_dict(model_zoo.load_url(model_urls[name])) 159 | print("No pretrained yet for MXResNet") 160 | return model 161 | 162 | me = sys.modules[__name__] 163 | for n,e,l in [ 164 | [ 18 , 1, [2,2,2 ,2] ], 165 | [ 34 , 1, [3,4,6 ,3] ], 166 | [ 50 , 4, [3,4,6 ,3] ], 167 | [ 101, 4, [3,4,23,3] ], 168 | [ 152, 4, [3,8,36,3] ], 169 | ]: 170 | name = f'mxresnet{n}' 171 | setattr(me, name, partial(mxresnet, expansion=e, n_layers=l, name=name)) -------------------------------------------------------------------------------- /diffmod/diffmod.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.optim import Optimizer 4 | 5 | # source and paper link - https://github.com/lancopku/AdaMod/blob/master/adamod/adamod.py 6 | 7 | # modifications @lessw2020 - blend diffGrad + AdaMod = diffmod. 8 | # 1/1/20 = instead of b3, change to 'len_memory' and compute b3 (.99 is really 100 memory as 1-(1/100)= .99) 9 | 10 | 11 | class DiffMod(Optimizer): 12 | """Implements AdaMod algorithm with Decoupled Weight Decay (arxiv.org/abs/1711.05101) 13 | It has been proposed in `Adaptive and Momental Bounds for Adaptive Learning Rate Methods`_. 14 | Arguments: 15 | params (iterable): iterable of parameters to optimize or dicts defining 16 | parameter groups 17 | lr (float, optional): learning rate (default: 1e-3) 18 | betas (Tuple[float, float], optional): coefficients used for computing 19 | running averages of gradient and its square (default: (0.9, 0.999)) 20 | beta3 (float, optional): smoothing coefficient for adaptive learning rates (default: 0.9999) 21 | len_memory = b3 in easier to use format. specify the memory len, b3 is computed. 22 | eps (float, optional): term added to the denominator to improve 23 | numerical stability (default: 1e-8) 24 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 25 | """ 26 | 27 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), len_memory=1000, version=0, 28 | eps=1e-8, weight_decay=0, average_step=False, debug_print=False): 29 | if not 0.0 <= lr: 30 | raise ValueError("Invalid learning rate: {}".format(lr)) 31 | if not 0.0 <= eps: 32 | raise ValueError("Invalid epsilon value: {}".format(eps)) 33 | if not 0.0 <= betas[0] < 1.0: 34 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 35 | if not 0.0 <= betas[1] < 1.0: 36 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 37 | 38 | #compute b3 39 | base = 1/len_memory 40 | beta3 = 1-(base) 41 | print(f"DiffMod: length of memory is ",len_memory," and b3 is thus ",beta3, "and base = ",base) 42 | 43 | #debugging 44 | self.debug_print=debug_print 45 | self.average_step = average_step 46 | if self.average_step==True: 47 | print(f"DiffMod: step size and exp avg step will be averaged together.") 48 | 49 | if not 0.0 <= beta3 < 1.0: 50 | raise ValueError("Invalid beta3 parameter: {}".format(beta3)) 51 | 52 | defaults = dict(lr=lr, betas=betas, beta3=beta3, eps=eps, 53 | weight_decay=weight_decay) 54 | super().__init__(params, defaults) 55 | 56 | self.version = version 57 | 58 | def __setstate__(self, state): 59 | super().__setstate__(state) 60 | 61 | def step(self, closure=None): 62 | """Performs a single optimization step. 63 | Arguments: 64 | closure (callable, optional): A closure that reevaluates the model 65 | and returns the loss. 66 | """ 67 | loss = None 68 | if closure is not None: 69 | loss = closure() 70 | 71 | for group in self.param_groups: 72 | for p in group['params']: 73 | if p.grad is None: 74 | continue 75 | grad = p.grad.data 76 | if grad.is_sparse: 77 | raise RuntimeError( 78 | 'DiffMod does not support sparse gradients') 79 | 80 | state = self.state[p] 81 | 82 | # State initialization 83 | if len(state) == 0: 84 | state['step'] = 0 85 | # Exponential moving average of gradient values 86 | state['exp_avg'] = torch.zeros_like(p.data) 87 | # Exponential moving average of squared gradient values 88 | state['exp_avg_sq'] = torch.zeros_like(p.data) 89 | # Exponential moving average of actual learning rates 90 | state['exp_avg_lr'] = torch.zeros_like(p.data) 91 | # Previous gradient 92 | state['previous_grad'] = torch.zeros_like(p.data) 93 | 94 | 95 | exp_avg, exp_avg_sq, exp_avg_lr = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_lr'] 96 | previous_grad = state['previous_grad'] 97 | beta1, beta2 = group['betas'] 98 | 99 | state['step'] += 1 100 | 101 | # Decay the first and second moment running average coefficient 102 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 103 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 104 | 105 | denom = exp_avg_sq.sqrt().add_(group['eps']) 106 | 107 | bias_correction1 = 1 - beta1 ** state['step'] 108 | bias_correction2 = 1 - beta2 ** state['step'] 109 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 110 | 111 | # compute diffgrad coefficient (dfc) 112 | 113 | 114 | if self.version==0: 115 | diff = abs(previous_grad - grad) 116 | elif self.version ==1: 117 | diff = previous_grad-grad 118 | elif self.version ==2: 119 | diff = .5*abs(previous_grad - grad) 120 | 121 | if self.version==0 or self.version==1: 122 | dfc = 1. / (1. + torch.exp(-diff)) 123 | elif self.version==2: 124 | dfc = 9. / (1. + torch.exp(-diff))-4 #DFC2 = 9/(1+e-(.5/g/)-4 #range .5,5 125 | 126 | state['previous_grad'] = grad 127 | 128 | if group['weight_decay'] != 0: 129 | p.data.add_(-group['weight_decay'] * group['lr'], p.data) 130 | 131 | # Applies momental bounds on actual learning rates 132 | step_size = torch.full_like(denom, step_size) 133 | step_size.div_(denom) 134 | exp_avg_lr.mul_(group['beta3']).add_(1 - group['beta3'], step_size) 135 | if self.debug_print: 136 | print(f"batch step size {step_size} and exp_avg_step {exp_avg_lr}") 137 | 138 | if self.average_step: 139 | step_size = step_size.add(exp_avg_lr) 140 | step_size = step_size.div(2.) 141 | 142 | else: 143 | step_size = torch.min(step_size, exp_avg_lr) 144 | 145 | # update momentum with dfc 146 | exp_avg1 = exp_avg * dfc 147 | 148 | step_size.mul_(exp_avg1) 149 | 150 | p.data.add_(-step_size) 151 | 152 | return loss -------------------------------------------------------------------------------- /images/1120-optimizer-testing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lessw2020/Best-Deep-Learning-Optimizers/070a5090a985a8e418ebbc384f90f80187ecf6ac/images/1120-optimizer-testing.jpg -------------------------------------------------------------------------------- /images/projected_gradient.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lessw2020/Best-Deep-Learning-Optimizers/070a5090a985a8e418ebbc384f90f80187ecf6ac/images/projected_gradient.png -------------------------------------------------------------------------------- /images/ranger-init.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lessw2020/Best-Deep-Learning-Optimizers/070a5090a985a8e418ebbc384f90f80187ecf6ac/images/ranger-init.jpg -------------------------------------------------------------------------------- /images/ranger-with-gc-options.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lessw2020/Best-Deep-Learning-Optimizers/070a5090a985a8e418ebbc384f90f80187ecf6ac/images/ranger-with-gc-options.jpg -------------------------------------------------------------------------------- /madgrad/madgrad_wd.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | # modifications - 4/4/2021 @lessw2020 (decay issue spotted by @nestordemeure ) 7 | # weight decay has been implemented AdamW style instead of the original madgrad Adam style. 8 | # in initial image classification testing, this outperformed 0 weight decay or original style weight decay. 9 | 10 | # closure is checked if callable or not since some code passes loss directly, rather than in closure param 11 | 12 | import math 13 | from typing import Collection, TYPE_CHECKING, Any, Callable, Optional 14 | 15 | import torch 16 | import torch.optim 17 | import collections 18 | 19 | if TYPE_CHECKING: 20 | from torch.optim.optimizer import _params_t 21 | else: 22 | _params_t = Any 23 | 24 | 25 | class madgrad_wd(torch.optim.Optimizer): 26 | """ 27 | MADGRAD_: A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic 28 | Optimization. 29 | 30 | .. _MADGRAD: https://arxiv.org/abs/2101.11075 31 | 32 | MADGRAD is a general purpose optimizer that can be used in place of SGD or 33 | Adam may converge faster and generalize better. Currently GPU-only. 34 | Typically, the same learning rate schedule that is used for SGD or Adam may 35 | be used. The overall learning rate is not comparable to either method and 36 | should be determined by a hyper-parameter sweep. 37 | 38 | MADGRAD requires less weight decay than other methods, often as little as 39 | zero. Momentum values used for SGD or Adam's beta1 should work here also. 40 | 41 | On sparse problems both weight_decay and momentum should be set to 0. 42 | 43 | Arguments: 44 | params (iterable): 45 | Iterable of parameters to optimize or dicts defining parameter groups. 46 | lr (float): 47 | Learning rate (default: 1e-2). 48 | momentum (float): 49 | Momentum value in the range [0,1) (default: 0.9). 50 | weight_decay (float): 51 | Weight decay, i.e. a L2 penalty (default: 0). 52 | eps (float): 53 | Term added to the denominator outside of the root operation to improve numerical stability. (default: 1e-6). 54 | """ 55 | 56 | def __init__( 57 | self, 58 | params: _params_t, 59 | lr: float = 1e-2, 60 | momentum: float = 0.9, 61 | weight_decay: float = 0, 62 | eps: float = 1e-6, 63 | ): 64 | if momentum < 0 or momentum >= 1: 65 | raise ValueError(f"Momentum {momentum} must be in the range [0,1]") 66 | if lr <= 0: 67 | raise ValueError(f"Learning rate {lr} must be positive") 68 | if weight_decay < 0: 69 | raise ValueError(f"Weight decay {weight_decay} must be non-negative") 70 | if eps < 0: 71 | raise ValueError(f"Eps must be non-negative") 72 | 73 | defaults = dict(lr=lr, eps=eps, momentum=momentum, weight_decay=weight_decay) 74 | super().__init__(params, defaults) 75 | 76 | @property 77 | def supports_memory_efficient_fp16(self) -> bool: 78 | return False 79 | 80 | @property 81 | def supports_flat_params(self) -> bool: 82 | return True 83 | 84 | def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]: 85 | """Performs a single optimization step. 86 | 87 | Arguments: 88 | closure (callable, optional): A closure that reevaluates the model 89 | and returns the loss. 90 | """ 91 | loss = None 92 | if closure is not None and isinstance(closure, collections.Callable): 93 | loss = closure() 94 | 95 | # step counter must be stored in state to ensure correct behavior under 96 | # optimizer sharding 97 | if "k" not in self.state: 98 | self.state["k"] = torch.tensor([0], dtype=torch.long) 99 | k = self.state["k"].item() 100 | 101 | for group in self.param_groups: 102 | eps = group["eps"] 103 | lr = group["lr"] + eps 104 | decay = group["weight_decay"] 105 | momentum = group["momentum"] 106 | 107 | ck = 1 - momentum 108 | lamb = lr * math.pow(k + 1, 0.5) 109 | 110 | for p in group["params"]: 111 | if p.grad is None: 112 | continue 113 | grad = p.grad.data 114 | state = self.state[p] 115 | 116 | if "grad_sum_sq" not in state: 117 | state["grad_sum_sq"] = torch.zeros_like(p.data).detach() 118 | state["s"] = torch.zeros_like(p.data).detach() 119 | if momentum != 0: 120 | state["x0"] = torch.clone(p.data).detach() 121 | 122 | if momentum != 0.0 and grad.is_sparse: 123 | raise RuntimeError( 124 | "momentum != 0 is not compatible with sparse gradients" 125 | ) 126 | 127 | grad_sum_sq = state["grad_sum_sq"] 128 | s = state["s"] 129 | 130 | # Apply weight decay - L2 / AdamW style 131 | if decay: 132 | p.data.mul_(1 - lr * decay) 133 | 134 | """ original impl: 135 | if decay != 0: 136 | if grad.is_sparse: 137 | raise RuntimeError("weight_decay option is not compatible with sparse gradients") 138 | 139 | grad.add_(p.data, alpha=decay) 140 | """ 141 | 142 | if grad.is_sparse: 143 | grad = grad.coalesce() 144 | grad_val = grad._values() 145 | 146 | p_masked = p.sparse_mask(grad) 147 | grad_sum_sq_masked = grad_sum_sq.sparse_mask(grad) 148 | s_masked = s.sparse_mask(grad) 149 | 150 | # Compute x_0 from other known quantities 151 | rms_masked_vals = grad_sum_sq_masked._values().pow(1 / 3).add_(eps) 152 | x0_masked_vals = p_masked._values().addcdiv( 153 | s_masked._values(), rms_masked_vals, value=1 154 | ) 155 | 156 | # Dense + sparse op 157 | grad_sq = grad * grad 158 | grad_sum_sq.add_(grad_sq, alpha=lamb) 159 | grad_sum_sq_masked.add_(grad_sq, alpha=lamb) 160 | 161 | rms_masked_vals = grad_sum_sq_masked._values().pow_(1 / 3).add_(eps) 162 | 163 | s.add_(grad, alpha=lamb) 164 | s_masked._values().add_(grad_val, alpha=lamb) 165 | 166 | # update masked copy of p 167 | p_kp1_masked_vals = x0_masked_vals.addcdiv( 168 | s_masked._values(), rms_masked_vals, value=-1 169 | ) 170 | # Copy updated masked p to dense p using an add operation 171 | p_masked._values().add_(p_kp1_masked_vals, alpha=-1) 172 | p.data.add_(p_masked, alpha=-1) 173 | else: 174 | if momentum == 0: 175 | # Compute x_0 from other known quantities 176 | rms = grad_sum_sq.pow(1 / 3).add_(eps) 177 | x0 = p.data.addcdiv(s, rms, value=1) 178 | else: 179 | x0 = state["x0"] 180 | 181 | # Accumulate second moments 182 | grad_sum_sq.addcmul_(grad, grad, value=lamb) 183 | rms = grad_sum_sq.pow(1 / 3).add_(eps) 184 | 185 | # Update s 186 | s.data.add_(grad, alpha=lamb) 187 | 188 | # Step 189 | if momentum == 0: 190 | p.data.copy_(x0.addcdiv(s, rms, value=-1)) 191 | else: 192 | z = x0.addcdiv(s, rms, value=-1) 193 | 194 | # p is a moving average of z 195 | p.data.mul_(1 - ck).add_(z, alpha=ck) 196 | 197 | self.state["k"] += 1 198 | return loss 199 | -------------------------------------------------------------------------------- /sls/README.md: -------------------------------------------------------------------------------- 1 | SLS = Stochastic Line Search. 2 | 3 | Official source and paper link: https://github.com/IssamLaradji/sls 4 | 5 | The files here represent an integration of SLS into Fast AI 1.057+. 6 | 7 | Testing and additional work in progress... 8 | -------------------------------------------------------------------------------- /sls/basic_train.py: -------------------------------------------------------------------------------- 1 | "Provides basic training and validation with `Learner`" 2 | from .torch_core import * 3 | from .basic_data import * 4 | from .callback import * 5 | from .data_block import * 6 | from .utils.ipython import gpu_mem_restore 7 | import inspect 8 | from fastprogress.fastprogress import format_time, IN_NOTEBOOK 9 | from time import time 10 | from .sixel import plot_sixel 11 | 12 | __all__ = ['Learner', 'LearnerCallback', 'Recorder', 'RecordOnCPU', 'fit', 'loss_batch', 'train_epoch', 'validate', 13 | 'get_preds', 'load_learner'] 14 | 15 | defaults.lr = slice(3e-3) 16 | defaults.wd = 1e-2 17 | defaults.extra_callbacks = None 18 | defaults.extra_callback_fns = None 19 | 20 | def loss_batch(model:nn.Module, xb:Tensor, yb:Tensor, loss_func:OptLossFunc=None, opt:OptOptimizer=None, 21 | cb_handler:Optional[CallbackHandler]=None)->Tuple[Union[Tensor,int,float,str]]: 22 | "Calculate loss and metrics for a batch, call out to callbacks as necessary." 23 | cb_handler = ifnone(cb_handler, CallbackHandler()) 24 | if not is_listy(xb): xb = [xb] 25 | if not is_listy(yb): yb = [yb] 26 | out = model(*xb) 27 | out = cb_handler.on_loss_begin(out) 28 | 29 | if not loss_func: return to_detach(out), to_detach(yb[0]) 30 | loss = loss_func(out, *yb) 31 | 32 | def closure(): 33 | out = model(*xb) 34 | loss = loss_func(out,*yb) 35 | return loss 36 | 37 | if opt is not None: 38 | opt.step(closure) 39 | loss,skip_bwd = cb_handler.on_backward_begin(loss) 40 | #if not skip_bwd: loss.backward() 41 | #if not cb_handler.on_backward_end(): 42 | if not cb_handler.on_step_end(): opt.zero_grad() 43 | 44 | loss = loss_func(model(*xb),*yb) #call one more time for updating metrics from SLS 45 | 46 | return loss.detach().cpu() 47 | 48 | def get_preds(model:nn.Module, dl:DataLoader, pbar:Optional[PBar]=None, cb_handler:Optional[CallbackHandler]=None, 49 | activ:nn.Module=None, loss_func:OptLossFunc=None, n_batch:Optional[int]=None) -> List[Tensor]: 50 | "Tuple of predictions and targets, and optional losses (if `loss_func`) using `dl`, max batches `n_batch`." 51 | res = [to_float(torch.cat(o).cpu()) for o in 52 | zip(*validate(model, dl, cb_handler=cb_handler, pbar=pbar, average=False, n_batch=n_batch))] 53 | if loss_func is not None: 54 | with NoneReduceOnCPU(loss_func) as lf: res.append(lf(res[0], res[1])) 55 | if activ is not None: res[0] = activ(res[0]) 56 | return res 57 | 58 | def validate(model:nn.Module, dl:DataLoader, loss_func:OptLossFunc=None, cb_handler:Optional[CallbackHandler]=None, 59 | pbar:Optional[PBar]=None, average=True, n_batch:Optional[int]=None)->Iterator[Tuple[Union[Tensor,int],...]]: 60 | "Calculate `loss_func` of `model` on `dl` in evaluation mode." 61 | model.eval() 62 | with torch.no_grad(): 63 | val_losses,nums = [],[] 64 | if cb_handler: cb_handler.set_dl(dl) 65 | for xb,yb in progress_bar(dl, parent=pbar, leave=(pbar is not None)): 66 | if cb_handler: xb, yb = cb_handler.on_batch_begin(xb, yb, train=False) 67 | val_loss = loss_batch(model, xb, yb, loss_func, cb_handler=cb_handler) 68 | val_losses.append(val_loss) 69 | if not is_listy(yb): yb = [yb] 70 | nums.append(first_el(yb).shape[0]) 71 | if cb_handler and cb_handler.on_batch_end(val_losses[-1]): break 72 | if n_batch and (len(nums)>=n_batch): break 73 | nums = np.array(nums, dtype=np.float32) 74 | if average: return (to_np(torch.stack(val_losses)) * nums).sum() / nums.sum() 75 | else: return val_losses 76 | 77 | def train_epoch(model:nn.Module, dl:DataLoader, opt:optim.Optimizer, loss_func:LossFunction)->None: 78 | "Simple training of `model` for 1 epoch of `dl` using optim `opt` and loss function `loss_func`." 79 | model.train() 80 | for xb,yb in dl: 81 | loss = loss_func(model(xb), yb) 82 | loss.backward() 83 | opt.step() 84 | opt.zero_grad() 85 | 86 | @dataclass 87 | class BasicLearner(): 88 | model:nn.Module 89 | loss_func:LossFunction 90 | opt:optim.Optimizer 91 | data:DataBunch 92 | 93 | def fit(epochs:int, learn:BasicLearner, callbacks:Optional[CallbackList]=None, metrics:OptMetrics=None)->None: 94 | "Fit the `model` on `data` and learn using `loss_func` and `opt`." 95 | assert len(learn.data.train_dl) != 0, f"""Your training dataloader is empty, can't train a model. 96 | Use a smaller batch size (batch size={learn.data.train_dl.batch_size} for {len(learn.data.train_dl.dataset)} elements).""" 97 | cb_handler = CallbackHandler(callbacks, metrics) 98 | pbar = master_bar(range(epochs)) 99 | cb_handler.on_train_begin(epochs, pbar=pbar, metrics=metrics) 100 | 101 | exception=False 102 | try: 103 | for epoch in pbar: 104 | learn.model.train() 105 | cb_handler.set_dl(learn.data.train_dl) 106 | cb_handler.on_epoch_begin() 107 | for xb,yb in progress_bar(learn.data.train_dl, parent=pbar): 108 | xb, yb = cb_handler.on_batch_begin(xb, yb) 109 | loss = loss_batch(learn.model, xb, yb, learn.loss_func, learn.opt, cb_handler) 110 | if cb_handler.on_batch_end(loss): break 111 | 112 | if not cb_handler.skip_validate and not learn.data.empty_val: 113 | val_loss = validate(learn.model, learn.data.valid_dl, loss_func=learn.loss_func, 114 | cb_handler=cb_handler, pbar=pbar) 115 | else: val_loss=None 116 | if cb_handler.on_epoch_end(val_loss): break 117 | except Exception as e: 118 | exception = e 119 | raise 120 | finally: cb_handler.on_train_end(exception) 121 | 122 | loss_func_name2activ = {'cross_entropy_loss': F.softmax, 'nll_loss': torch.exp, 'poisson_nll_loss': torch.exp, 123 | 'kl_div_loss': torch.exp, 'bce_with_logits_loss': torch.sigmoid, 'cross_entropy': F.softmax, 124 | 'kl_div': torch.exp, 'binary_cross_entropy_with_logits': torch.sigmoid, 125 | } 126 | 127 | def _loss_func_name2activ(name:str, axis:int=-1): 128 | res = loss_func_name2activ[name] 129 | if res == F.softmax: res = partial(F.softmax, dim=axis) 130 | return res 131 | 132 | def _loss_func2activ(loss_func): 133 | if getattr(loss_func,'keywords',None): 134 | if not loss_func.keywords.get('log_input', True): return 135 | axis = getattr(loss_func, 'axis', -1) 136 | # flattened loss 137 | loss_func = getattr(loss_func, 'func', loss_func) 138 | # could have a partial inside flattened loss! Duplicate on purpose. 139 | loss_func = getattr(loss_func, 'func', loss_func) 140 | cls_name = camel2snake(loss_func.__class__.__name__) 141 | if cls_name == 'mix_up_loss': 142 | loss_func = loss_func.crit 143 | cls_name = camel2snake(loss_func.__class__.__name__) 144 | if cls_name in loss_func_name2activ: 145 | if cls_name == 'poisson_nll_loss' and (not getattr(loss_func, 'log_input', True)): return 146 | return _loss_func_name2activ(cls_name, axis) 147 | if getattr(loss_func,'__name__','') in loss_func_name2activ: 148 | return _loss_func_name2activ(loss_func.__name__, axis) 149 | return noop 150 | 151 | @dataclass 152 | class Learner(): 153 | "Trainer for `model` using `data` to minimize `loss_func` with optimizer `opt_func`." 154 | data:DataBunch 155 | model:nn.Module 156 | opt_func:Callable=AdamW 157 | loss_func:Callable=None 158 | metrics:Collection[Callable]=None 159 | true_wd:bool=True 160 | bn_wd:bool=True 161 | wd:Floats=defaults.wd 162 | train_bn:bool=True 163 | path:str = None 164 | model_dir:PathOrStr = 'models' 165 | callback_fns:Collection[Callable]=None 166 | callbacks:Collection[Callback]=field(default_factory=list) 167 | layer_groups:Collection[nn.Module]=None 168 | add_time:bool=True 169 | silent:bool=None 170 | def __post_init__(self)->None: 171 | "Setup path,metrics, callbacks and ensure model directory exists." 172 | self.path = Path(ifnone(self.path, self.data.path)) 173 | self.model = self.model.to(self.data.device) 174 | self.loss_func = self.loss_func or self.data.loss_func 175 | self.metrics=listify(self.metrics) 176 | if not self.layer_groups: self.layer_groups = [nn.Sequential(*flatten_model(self.model))] 177 | self.callbacks = listify(self.callbacks) 178 | if self.silent is None: self.silent = defaults.silent 179 | self.callback_fns = [partial(Recorder, add_time=self.add_time, silent=self.silent)] + listify(self.callback_fns) 180 | if defaults.extra_callbacks is not None: self.callbacks += defaults.extra_callbacks 181 | 182 | def init(self, init): apply_init(self.model, init) 183 | 184 | def _test_writeable_path(self): 185 | path = self.path/self.model_dir 186 | try: 187 | path.mkdir(parents=True, exist_ok=True) 188 | tmp_file = get_tmp_file(path) 189 | except OSError as e: 190 | raise Exception(f"{e}\nCan't write to '{path}', set `learn.model_dir` attribute in Learner to a full libpath path that is writable") from None 191 | os.remove(tmp_file) 192 | 193 | def lr_range(self, lr:Union[float,slice])->np.ndarray: 194 | "Build differential learning rates from `lr`." 195 | if not isinstance(lr,slice): return lr 196 | if lr.start: res = even_mults(lr.start, lr.stop, len(self.layer_groups)) 197 | else: res = [lr.stop/10]*(len(self.layer_groups)-1) + [lr.stop] 198 | return np.array(res) 199 | 200 | def fit(self, epochs:int, lr:Union[Floats,slice]=defaults.lr, 201 | wd:Floats=None, callbacks:Collection[Callback]=None)->None: 202 | "Fit the model on this learner with `lr` learning rate, `wd` weight decay for `epochs` with `callbacks`." 203 | lr = self.lr_range(lr) 204 | if wd is None: wd = self.wd 205 | if not getattr(self, 'opt', False): self.create_opt(lr, wd) 206 | else: self.opt.lr,self.opt.wd = lr,wd 207 | callbacks = [cb(self) for cb in self.callback_fns + listify(defaults.extra_callback_fns)] + listify(callbacks) 208 | fit(epochs, self, metrics=self.metrics, callbacks=self.callbacks+callbacks) 209 | 210 | def create_opt(self, lr:Floats, wd:Floats=0.)->None: 211 | "Create optimizer with `lr` learning rate and `wd` weight decay." 212 | self.opt = OptimWrapper.create(self.opt_func, lr, self.layer_groups, wd=wd, true_wd=self.true_wd, bn_wd=self.bn_wd) 213 | 214 | def split(self, split_on:SplitFuncOrIdxList)->None: 215 | "Split the model at `split_on`." 216 | if isinstance(split_on,Callable): split_on = split_on(self.model) 217 | self.layer_groups = split_model(self.model, split_on) 218 | return self 219 | 220 | def freeze_to(self, n:int)->None: 221 | "Freeze layers up to layer group `n`." 222 | if hasattr(self.model, 'reset'): self.model.reset() 223 | for g in self.layer_groups[:n]: 224 | for l in g: 225 | if not self.train_bn or not isinstance(l, bn_types): requires_grad(l, False) 226 | for g in self.layer_groups[n:]: requires_grad(g, True) 227 | self.create_opt(defaults.lr) 228 | 229 | def freeze(self)->None: 230 | "Freeze up to last layer group." 231 | assert(len(self.layer_groups)>1) 232 | self.freeze_to(-1) 233 | 234 | def unfreeze(self): 235 | "Unfreeze entire model." 236 | self.freeze_to(0) 237 | 238 | def export(self, file:PathLikeOrBinaryStream='export.pkl', destroy=False): 239 | "Export the state of the `Learner` in `self.path/file`. `file` can be file-like (file or buffer)" 240 | if rank_distrib(): return # don't save if slave proc 241 | args = ['opt_func', 'loss_func', 'metrics', 'true_wd', 'bn_wd', 'wd', 'train_bn', 'model_dir', 'callback_fns'] 242 | state = {a:getattr(self,a) for a in args} 243 | state['cb_state'] = {cb.__class__:cb.get_state() for cb in self.callbacks} 244 | #layer_groups -> need to find a way 245 | #TO SEE: do we save model structure and weights separately? 246 | with ModelOnCPU(self.model) as m: 247 | state['model'] = m 248 | xtra = dict(normalize=self.data.norm.keywords) if getattr(self.data, 'norm', False) else {} 249 | state['data'] = self.data.valid_ds.get_state(**xtra) 250 | state['cls'] = self.__class__ 251 | try_save(state, self.path, file) 252 | if destroy: self.destroy() 253 | 254 | def save(self, file:PathLikeOrBinaryStream=None, return_path:bool=False, with_opt:bool=True): 255 | "Save model and optimizer state (if `with_opt`) with `file` to `self.model_dir`. `file` can be file-like (file or buffer)" 256 | if is_pathlike(file): self._test_writeable_path() 257 | if rank_distrib(): return # don't save if slave proc 258 | target = self.path/self.model_dir/f'{file}.pth' if is_pathlike(file) else file 259 | if not hasattr(self, 'opt'): with_opt=False 260 | if not with_opt: state = get_model(self.model).state_dict() 261 | else: state = {'model': get_model(self.model).state_dict(), 'opt':self.opt.state_dict()} 262 | torch.save(state, target) 263 | if return_path: return target 264 | 265 | def dl(self, ds_type:DatasetType=DatasetType.Valid): 266 | "Return DataLoader for DatasetType `ds_type`." 267 | return self.data.dl(ds_type) 268 | 269 | def load(self, file:PathLikeOrBinaryStream=None, device:torch.device=None, strict:bool=True, 270 | with_opt:bool=None, purge:bool=False, remove_module:bool=False)->'Learner': 271 | "Load model and optimizer state (if `with_opt`) `file` from `self.model_dir` using `device`. `file` can be file-like (file or buffer)" 272 | if purge: self.purge(clear_opt=ifnone(with_opt, False)) 273 | if device is None: device = self.data.device 274 | elif isinstance(device, int): device = torch.device('cuda', device) 275 | source = self.path/self.model_dir/f'{file}.pth' if is_pathlike(file) else file 276 | distrib_barrier() 277 | state = torch.load(source, map_location=device) 278 | if set(state.keys()) == {'model', 'opt'}: 279 | model_state = state['model'] 280 | if remove_module: model_state = remove_module_load(model_state) 281 | get_model(self.model).load_state_dict(model_state, strict=strict) 282 | if ifnone(with_opt,True): 283 | if not hasattr(self, 'opt'): self.create_opt(defaults.lr, self.wd) 284 | try: self.opt.load_state_dict(state['opt']) 285 | except: pass 286 | else: 287 | if with_opt: warn("Saved filed doesn't contain an optimizer state.") 288 | if remove_module: state = remove_module_load(state) 289 | get_model(self.model).load_state_dict(state, strict=strict) 290 | del state 291 | gc.collect() 292 | return self 293 | 294 | def destroy(self): 295 | "Free the Learner internals, leaving just an empty shell that consumes no memory" 296 | 297 | class ZombieLearner(Learner): 298 | msg = "this object has been destroyed" 299 | def __getattr__(self, item): print(ZombieLearner.msg); return None 300 | def destroyed(*args, **kwargs): print(ZombieLearner.msg) 301 | 302 | attrs = [k for k in self.__dict__.keys() if not k.startswith("__")] 303 | for a in attrs: delattr(self, a) 304 | # the instance methods can still be called, but will just give a message 305 | methods = [k for k in dir(self) if not k.startswith("__") and inspect.isroutine(getattr(self, k))] 306 | for m in methods: setattr(self, m, ZombieLearner.destroyed) 307 | self.__class__ = ZombieLearner 308 | gc.collect() 309 | print("this Learner object self-destroyed - it still exists, but no longer usable") 310 | 311 | def purge(self, clear_opt:bool=True): 312 | "Purge the `Learner` of all cached attributes to release some GPU memory." 313 | self._test_writeable_path() 314 | attrs_all = [k for k in self.__dict__.keys() if not k.startswith("__")] 315 | attrs_pkl = ['bn_wd', 'callback_fns', 'layer_groups', 'loss_func', 'metrics', 'model', 316 | 'model_dir', 'opt_func', 'path', 'train_bn', 'true_wd', 'wd'] 317 | # +callbacks: get pickled too, but not directly 318 | attrs_keep = ['data', 'recorder'] 319 | attrs_del = list(set(attrs_all) - set(attrs_keep)) 320 | state = {a:getattr(self, a) for a in attrs_pkl} 321 | state['cb_state'] = {cb.__class__:cb.get_state() for cb in self.callbacks} 322 | if hasattr(self, 'opt'): state['opt'] = self.opt.get_state() 323 | 324 | tmp_file = get_tmp_file(self.path/self.model_dir) 325 | torch.save(state, open(tmp_file, 'wb')) 326 | for a in attrs_del: delattr(self, a) 327 | gc.collect() 328 | state = torch.load(tmp_file) 329 | os.remove(tmp_file) 330 | 331 | for a in attrs_pkl: setattr(self, a, state[a]) 332 | cb_state = state.pop('cb_state') 333 | self.callbacks = [load_callback(c,s, self) for c,s in cb_state.items()] 334 | if not clear_opt and 'opt' in state: 335 | try: self.opt = OptimWrapper.load_with_state_and_layer_group(state['opt'], self.layer_groups) 336 | except: warn("Wasn't able to properly load the optimizer state again.") 337 | del state 338 | gc.collect() 339 | return self 340 | 341 | def get_preds(self, ds_type:DatasetType=DatasetType.Valid, activ:nn.Module=None, 342 | with_loss:bool=False, n_batch:Optional[int]=None, pbar:Optional[PBar]=None) -> List[Tensor]: 343 | "Return predictions and targets on `ds_type` dataset." 344 | lf = self.loss_func if with_loss else None 345 | activ = ifnone(activ, _loss_func2activ(self.loss_func)) 346 | if not getattr(self, 'opt', False): self.create_opt(defaults.lr, self.wd) 347 | callbacks = [cb(self) for cb in self.callback_fns + listify(defaults.extra_callback_fns)] + listify(self.callbacks) 348 | return get_preds(self.model, self.dl(ds_type), cb_handler=CallbackHandler(callbacks), 349 | activ=activ, loss_func=lf, n_batch=n_batch, pbar=pbar) 350 | 351 | def pred_batch(self, ds_type:DatasetType=DatasetType.Valid, batch:Tuple=None, reconstruct:bool=False, 352 | with_dropout:bool=False, activ:nn.Module=None) -> List[Tensor]: 353 | "Return output of the model on one batch from `ds_type` dataset." 354 | if batch is not None: xb,yb = batch 355 | else: xb,yb = self.data.one_batch(ds_type, detach=False, denorm=False) 356 | cb_handler = CallbackHandler(self.callbacks) 357 | xb,yb = cb_handler.on_batch_begin(xb,yb, train=False) 358 | activ = ifnone(activ, _loss_func2activ(self.loss_func)) 359 | with torch.no_grad(): 360 | if not with_dropout: preds = loss_batch(self.model.eval(), xb, yb, cb_handler=cb_handler) 361 | else: preds = loss_batch(self.model.eval().apply(self.apply_dropout), xb, yb, cb_handler=cb_handler) 362 | res = activ(preds[0]) 363 | if not reconstruct: return res 364 | res = res.detach().cpu() 365 | ds = self.dl(ds_type).dataset 366 | norm = getattr(self.data, 'norm', False) 367 | if norm and norm.keywords.get('do_y',False): 368 | res = self.data.denorm(res, do_x=True) 369 | return [ds.reconstruct(o) for o in res] 370 | 371 | def backward(self, item): 372 | "Pass `item` through the model and computes the gradient. Useful if `backward_hooks` are attached." 373 | xb,yb = self.data.one_item(item) 374 | loss = loss_batch(self.model.eval(), xb, yb, self.loss_func, opt=FakeOptimizer(), 375 | cb_handler=CallbackHandler(self.callbacks)) 376 | return loss 377 | 378 | def predict(self, item:ItemBase, return_x:bool=False, batch_first:bool=True, with_dropout:bool=False, **kwargs): 379 | "Return predicted class, label and probabilities for `item`." 380 | batch = self.data.one_item(item) 381 | res = self.pred_batch(batch=batch, with_dropout=with_dropout) 382 | raw_pred,x = grab_idx(res,0,batch_first=batch_first),batch[0] 383 | norm = getattr(self.data,'norm',False) 384 | if norm: 385 | x = self.data.denorm(x) 386 | if norm.keywords.get('do_y',False): raw_pred = self.data.denorm(raw_pred) 387 | ds = self.data.single_ds 388 | pred = ds.y.analyze_pred(raw_pred, **kwargs) 389 | x = ds.x.reconstruct(grab_idx(x, 0)) 390 | y = ds.y.reconstruct(pred, x) if has_arg(ds.y.reconstruct, 'x') else ds.y.reconstruct(pred) 391 | return (x, y, pred, raw_pred) if return_x else (y, pred, raw_pred) 392 | 393 | def validate(self, dl=None, callbacks=None, metrics=None): 394 | "Validate on `dl` with potential `callbacks` and `metrics`." 395 | dl = ifnone(dl, self.data.valid_dl) 396 | metrics = ifnone(metrics, self.metrics) 397 | cb_handler = CallbackHandler(self.callbacks + ifnone(callbacks, []), metrics) 398 | cb_handler.on_train_begin(1, None, metrics); cb_handler.on_epoch_begin() 399 | val_metrics = validate(self.model, dl, self.loss_func, cb_handler) 400 | cb_handler.on_epoch_end(val_metrics) 401 | return cb_handler.state_dict['last_metrics'] 402 | 403 | def show_results(self, ds_type=DatasetType.Valid, rows:int=5, **kwargs): 404 | "Show `rows` result of predictions on `ds_type` dataset." 405 | #TODO: get read of has_arg x and split_kwargs_by_func if possible 406 | #TODO: simplify this and refactor with pred_batch(...reconstruct=True) 407 | n_items = rows ** 2 if self.data.train_ds.x._square_show_res else rows 408 | if self.dl(ds_type).batch_size < n_items: n_items = self.dl(ds_type).batch_size 409 | ds = self.dl(ds_type).dataset 410 | self.callbacks.append(RecordOnCPU()) 411 | preds = self.pred_batch(ds_type) 412 | *self.callbacks,rec_cpu = self.callbacks 413 | x,y = rec_cpu.input,rec_cpu.target 414 | norm = getattr(self.data,'norm',False) 415 | if norm: 416 | x = self.data.denorm(x) 417 | if norm.keywords.get('do_y',False): 418 | y = self.data.denorm(y, do_x=True) 419 | preds = self.data.denorm(preds, do_x=True) 420 | analyze_kwargs,kwargs = split_kwargs_by_func(kwargs, ds.y.analyze_pred) 421 | preds = [ds.y.analyze_pred(grab_idx(preds, i), **analyze_kwargs) for i in range(n_items)] 422 | xs = [ds.x.reconstruct(grab_idx(x, i)) for i in range(n_items)] 423 | if has_arg(ds.y.reconstruct, 'x'): 424 | ys = [ds.y.reconstruct(grab_idx(y, i), x=x) for i,x in enumerate(xs)] 425 | zs = [ds.y.reconstruct(z, x=x) for z,x in zip(preds,xs)] 426 | else : 427 | ys = [ds.y.reconstruct(grab_idx(y, i)) for i in range(n_items)] 428 | zs = [ds.y.reconstruct(z) for z in preds] 429 | ds.x.show_xyzs(xs, ys, zs, **kwargs) 430 | 431 | def apply_dropout(self, m): 432 | "If a module contains 'dropout' in it's name, it will be switched to .train() mode." 433 | if 'dropout' in m.__class__.__name__.lower(): m.train() 434 | 435 | def predict_with_mc_dropout(self, item:ItemBase, with_dropout:bool=True, n_times=10, **kwargs): 436 | "Make predictions with dropout turned on for n_times (default 10)." 437 | return [self.predict(item, with_dropout=with_dropout) for _ in range(n_times)] 438 | 439 | class RecordOnCPU(Callback): 440 | "Store the `input` and `target` going through the model on the CPU." 441 | def on_batch_begin(self, last_input,last_target,**kwargs): 442 | self.input,self.target = to_cpu(last_input),to_cpu(last_target) 443 | 444 | class LearnerCallback(Callback): 445 | "Base class for creating callbacks for a `Learner`." 446 | def __init__(self, learn): 447 | self._learn = weakref.ref(learn) 448 | self.exclude,self.not_min = ['_learn'],[] 449 | setattr(self.learn, self.cb_name, self) 450 | 451 | def __getattr__(self,k): return getattr(self.learn, k) 452 | def __setstate__(self,data:Any): self.__dict__.update(data) 453 | 454 | @property 455 | def learn(self) -> Learner: return self._learn() 456 | @learn.setter 457 | def learn(self, learn: Learner) -> None: self._learn = weakref.ref(learn) 458 | 459 | @property 460 | def cb_name(self): return camel2snake(self.__class__.__name__) 461 | 462 | class Recorder(LearnerCallback): 463 | "A `LearnerCallback` that records epoch, loss, opt and metric data during training." 464 | _order=-10 465 | def __init__(self, learn:Learner, add_time:bool=True, silent:bool=False): 466 | super().__init__(learn) 467 | if not getattr(self.learn, 'opt', False): self.learn.create_opt(defaults.lr, self.learn.wd) 468 | self.opt = self.learn.opt 469 | self.train_dl = self.learn.data.train_dl 470 | self.no_val,self.silent,self.add_time = False,silent,add_time 471 | 472 | def on_train_begin(self, pbar:PBar, metrics_names:Collection[str], **kwargs:Any)->None: 473 | "Initialize recording status at beginning of training." 474 | self.pbar = pbar 475 | self.names = ['epoch', 'train_loss'] if self.no_val else ['epoch', 'train_loss', 'valid_loss'] 476 | self.metrics_names = metrics_names 477 | if hasattr(self, '_added_met_names'): self.metrics_names += self._added_met_names 478 | self.names += self.metrics_names 479 | if self.add_time: self.names.append('time') 480 | if not self.silent: self.pbar.write(self.names, table=True) 481 | self.losses,self.val_losses,self.lrs,self.moms,self.metrics,self.nb_batches = [],[],[],[],[],[] 482 | 483 | def on_epoch_begin(self, **kwargs:Any)->None: 484 | if self.add_time: self.start_epoch = time() 485 | 486 | def on_batch_begin(self, train, **kwargs:Any)->None: 487 | "Record learning rate and momentum at beginning of batch." 488 | if train: 489 | self.lrs.append(self.opt.lr) 490 | #if self.opt.mom is not None: 491 | #self.moms.append(self.opt.mom) 492 | 493 | def on_backward_begin(self, smooth_loss:Tensor, **kwargs:Any)->None: 494 | "Record the loss before any other callback has a chance to modify it." 495 | self.losses.append(smooth_loss) 496 | if self.pbar is not None and hasattr(self.pbar,'child'): 497 | self.pbar.child.comment = f'{smooth_loss:.4f}' 498 | 499 | def on_epoch_end(self, epoch:int, num_batch:int, smooth_loss:Tensor, 500 | last_metrics:MetricsList, **kwargs:Any)->bool: 501 | "Save epoch info: num_batch, smooth_loss, metrics." 502 | self.nb_batches.append(num_batch) 503 | if last_metrics is not None: self.val_losses.append(last_metrics[0]) 504 | else: last_metrics = [] if self.no_val else [None] 505 | if len(last_metrics) > 1: self.metrics.append(last_metrics[1:]) 506 | self.format_stats([epoch, smooth_loss] + last_metrics) 507 | 508 | def format_stats(self, stats:TensorOrNumList)->None: 509 | "Format stats before printing." 510 | str_stats = [] 511 | for name,stat in zip(self.names,stats): 512 | str_stats.append('#na#' if stat is None else str(stat) if isinstance(stat, int) else f'{stat:.6f}') 513 | if self.add_time: str_stats.append(format_time(time() - self.start_epoch)) 514 | if not self.silent: self.pbar.write(str_stats, table=True) 515 | 516 | def add_metric_names(self, names): 517 | "Add `names` to the inner metric names." 518 | if hasattr(self, '_added_met_names'): self._added_met_names += names 519 | else: self._added_met_names = names 520 | 521 | def plot_lr(self, show_moms=False, skip_start:int=0, skip_end:int=0, return_fig:bool=None)->Optional[plt.Figure]: 522 | "Plot learning rate, `show_moms` to include momentum." 523 | lrs = self._split_list(self.lrs, skip_start, skip_end) 524 | iterations = self._split_list(range_of(self.lrs), skip_start, skip_end) 525 | if show_moms: 526 | moms = self._split_list(self.moms, skip_start, skip_end) 527 | fig, axs = plt.subplots(1,2, figsize=(12,4)) 528 | axs[0].plot(iterations, lrs) 529 | axs[0].set_xlabel('Iterations') 530 | axs[0].set_ylabel('Learning Rate') 531 | axs[1].plot(iterations, moms) 532 | axs[1].set_xlabel('Iterations') 533 | axs[1].set_ylabel('Momentum') 534 | else: 535 | fig, ax = plt.subplots() 536 | ax.plot(iterations, lrs) 537 | ax.set_xlabel('Iterations') 538 | ax.set_ylabel('Learning Rate') 539 | if ifnone(return_fig, defaults.return_fig): return fig 540 | if not IN_NOTEBOOK: plot_sixel(fig) 541 | 542 | @staticmethod 543 | def smoothen_by_spline(xs, ys, **kwargs): 544 | xs = np.arange(len(ys)) 545 | spl = scipy.interpolate.UnivariateSpline(xs, ys, **kwargs) 546 | ys = spl(xs) 547 | return ys 548 | 549 | def plot(self, skip_start:int=10, skip_end:int=5, suggestion:bool=False, return_fig:bool=None, 550 | **kwargs)->Optional[plt.Figure]: 551 | "Plot learning rate and losses, trimmed between `skip_start` and `skip_end`. Optionally plot and return min gradient" 552 | lrs = self._split_list(self.lrs, skip_start, skip_end) 553 | losses = self._split_list(self.losses, skip_start, skip_end) 554 | losses = [x.item() for x in losses] 555 | if 'k' in kwargs: losses = self.smoothen_by_spline(lrs, losses, **kwargs) 556 | fig, ax = plt.subplots(1,1) 557 | ax.plot(lrs, losses) 558 | ax.set_ylabel("Loss") 559 | ax.set_xlabel("Learning Rate") 560 | ax.set_xscale('log') 561 | ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%.0e')) 562 | if suggestion: 563 | try: mg = (np.gradient(np.array(losses))).argmin() 564 | except: 565 | print("Failed to compute the gradients, there might not be enough points.") 566 | return 567 | print(f"Min numerical gradient: {lrs[mg]:.2E}") 568 | ax.plot(lrs[mg],losses[mg],markersize=10,marker='o',color='red') 569 | self.min_grad_lr = lrs[mg] 570 | ml = np.argmin(losses) 571 | print(f"Min loss divided by 10: {lrs[ml]/10:.2E}") 572 | if ifnone(return_fig, defaults.return_fig): return fig 573 | if not IN_NOTEBOOK: plot_sixel(fig) 574 | 575 | def plot_losses(self, skip_start:int=0, skip_end:int=0, return_fig:bool=None)->Optional[plt.Figure]: 576 | "Plot training and validation losses." 577 | fig, ax = plt.subplots(1,1) 578 | losses = self._split_list(self.losses, skip_start, skip_end) 579 | iterations = self._split_list(range_of(self.losses), skip_start, skip_end) 580 | ax.plot(iterations, losses, label='Train') 581 | val_iter = self._split_list_val(np.cumsum(self.nb_batches), skip_start, skip_end) 582 | val_losses = self._split_list_val(self.val_losses, skip_start, skip_end) 583 | ax.plot(val_iter, val_losses, label='Validation') 584 | ax.set_ylabel('Loss') 585 | ax.set_xlabel('Batches processed') 586 | ax.legend() 587 | if ifnone(return_fig, defaults.return_fig): return fig 588 | if not IN_NOTEBOOK: plot_sixel(fig) 589 | 590 | def plot_metrics(self, skip_start:int=0, skip_end:int=0, return_fig:bool=None)->Optional[plt.Figure]: 591 | "Plot metrics collected during training." 592 | assert len(self.metrics) != 0, "There are no metrics to plot." 593 | fig, axes = plt.subplots(len(self.metrics[0]),1,figsize=(6, 4*len(self.metrics[0]))) 594 | val_iter = self._split_list_val(np.cumsum(self.nb_batches), skip_start, skip_end) 595 | axes = axes.flatten() if len(self.metrics[0]) != 1 else [axes] 596 | for i, ax in enumerate(axes): 597 | values = [met[i] for met in self.metrics] 598 | values = self._split_list_val(values, skip_start, skip_end) 599 | ax.plot(val_iter, values) 600 | ax.set_ylabel(str(self.metrics_names[i])) 601 | ax.set_xlabel('Batches processed') 602 | if ifnone(return_fig, defaults.return_fig): return fig 603 | if not IN_NOTEBOOK: plot_sixel(fig) 604 | 605 | def _split_list(self, vals:Collection[float], skip_start:int, skip_end:int): 606 | return vals[skip_start:-skip_end] if skip_end > 0 else vals[skip_start:] 607 | 608 | def _split_list_val(self, vals:Collection[float], skip_start:int, skip_end:int): 609 | val_iter = np.cumsum(self.nb_batches) 610 | start_val = (val_iter - skip_start >= 0).nonzero()[0].min() 611 | end_val = (val_iter[-1] - val_iter - skip_end >= 0).nonzero()[0].max()+1 612 | return vals[start_val:end_val] if skip_end > 0 else vals[start_val:] 613 | 614 | class FakeOptimizer(): 615 | def step(self): pass 616 | def zero_grad(self): pass 617 | 618 | def load_callback(class_func, state, learn:Learner): 619 | init_kwargs, others = split_kwargs_by_func(state, class_func.__init__) 620 | res = class_func(learn, **init_kwargs) if issubclass(class_func, LearnerCallback) else class_func(**init_kwargs) 621 | for k,v in others.items(): setattr(res, k, v) 622 | return res 623 | 624 | def load_learner(path:PathOrStr, file:PathLikeOrBinaryStream='export.pkl', test:ItemList=None, tfm_y=None, **db_kwargs): 625 | "Load a `Learner` object saved with `export_state` in `path/file` with empty data, optionally add `test` and load on `cpu`. `file` can be file-like (file or buffer)" 626 | source = Path(path)/file if is_pathlike(file) else file 627 | state = torch.load(source, map_location='cpu') if defaults.device == torch.device('cpu') else torch.load(source) 628 | model = state.pop('model') 629 | src = LabelLists.load_state(path, state.pop('data')) 630 | if test is not None: src.add_test(test, tfm_y=tfm_y) 631 | data = src.databunch(**db_kwargs) 632 | cb_state = state.pop('cb_state') 633 | clas_func = state.pop('cls') 634 | res = clas_func(data, model, **state) 635 | res.callback_fns = state['callback_fns'] #to avoid duplicates 636 | res.callbacks = [load_callback(c,s, res) for c,s in cb_state.items()] 637 | return res 638 | -------------------------------------------------------------------------------- /sls/callback.py: -------------------------------------------------------------------------------- 1 | "Callbacks provides extensibility to the `basic_train` loop. See `train` for examples of custom callbacks." 2 | from .basic_data import * 3 | from .torch_core import * 4 | import torch.distributed as dist 5 | 6 | __all__ = ['AverageMetric', 'Callback', 'CallbackHandler', 'OptimWrapper', 'SmoothenValue', 'Scheduler', 'annealing_cos', 'CallbackList', 7 | 'annealing_exp', 'annealing_linear', 'annealing_no', 'annealing_poly'] 8 | 9 | class OptimWrapper(): 10 | "Basic wrapper around `opt` to simplify hyper-parameters changes." 11 | def __init__(self, opt:optim.Optimizer, wd:Floats=0., true_wd:bool=False, bn_wd:bool=True): 12 | assert not isinstance(opt, OptimWrapper) 13 | self.opt,self.true_wd,self.bn_wd = opt,true_wd,bn_wd 14 | self.opt_keys = list(self.opt.param_groups[0].keys()) 15 | self.opt_keys.remove('params') 16 | self.read_defaults() 17 | self.wd = wd 18 | 19 | @classmethod 20 | def create(cls, opt_func:Union[type,Callable], lr:Union[float,Tuple,List], layer_groups:ModuleList, wd:Floats=0., 21 | true_wd:bool=False, bn_wd:bool=True)->optim.Optimizer: 22 | "Create an `optim.Optimizer` from `opt_func` with `lr`. Set lr on `layer_groups`." 23 | split_params = split_no_wd_params(layer_groups) 24 | opt = opt_func([{'params': p, 'lr':0} for p in split_params]) 25 | opt = cls(opt, wd=wd, true_wd=true_wd, bn_wd=bn_wd) 26 | opt.lr,opt.opt_func = listify(lr, layer_groups),opt_func 27 | return opt 28 | 29 | def new(self, layer_groups:Collection[nn.Module], split_no_wd:bool=True): 30 | "Create a new `OptimWrapper` from `self` with another `layer_groups` but the same hyper-parameters." 31 | opt_func = getattr(self, 'opt_func', self.opt.__class__) 32 | res = self.create(opt_func, self.lr, layer_groups, wd=self.wd, true_wd=self.true_wd, bn_wd=self.bn_wd) 33 | res.mom,res.beta = self.mom,self.beta 34 | return res 35 | 36 | def new_with_params(self, param_groups:Collection[Collection[nn.Parameter]]): 37 | "Create a new `OptimWrapper` from `self` with another `layer_groups` but the same hyper-parameters." 38 | opt_func = getattr(self, 'opt_func', self.opt.__class__) 39 | opt = opt_func([{'params': p, 'lr':0} for p in param_groups]) 40 | opt = self.__class__(opt, wd=self.wd, true_wd=self.true_wd, bn_wd=self.bn_wd) 41 | opt.lr,opt.opt_func,opt.mom,opt.beta = self.lr,opt_func,self.mom,self.beta 42 | return opt 43 | 44 | def __repr__(self)->str: 45 | return f'OptimWrapper over {repr(self.opt)}.\nTrue weight decay: {self.true_wd}' 46 | 47 | #Pytorch optimizer methods 48 | def step(self,closure=None)->None: 49 | "Set weight decay and step optimizer." 50 | # weight decay outside of optimizer step (AdamW) 51 | if self.true_wd: 52 | for lr,wd,pg1,pg2 in zip(self._lr,self._wd,self.opt.param_groups[::2],self.opt.param_groups[1::2]): 53 | for p in pg1['params']: p.data.mul_(1 - wd*lr) 54 | if self.bn_wd: 55 | for p in pg2['params']: p.data.mul_(1 - wd*lr) 56 | self.set_val('weight_decay', listify(0, self._wd)) 57 | self.opt.step(closure) 58 | 59 | def zero_grad(self)->None: 60 | "Clear optimizer gradients." 61 | self.opt.zero_grad() 62 | 63 | #Passthrough to the inner opt. 64 | def __getattr__(self, k:str)->Any: return getattr(self.opt, k, None) 65 | def __setstate__(self,data:Any): self.__dict__.update(data) 66 | 67 | def clear(self): 68 | "Reset the state of the inner optimizer." 69 | sd = self.state_dict() 70 | sd['state'] = {} 71 | self.load_state_dict(sd) 72 | 73 | @property 74 | def n_params(self): return sum([len(pg['params']) for pg in self.opt.param_groups]) 75 | 76 | #Hyperparameters as properties 77 | @property 78 | def lr(self)->float: return self._lr[-1] 79 | @lr.setter 80 | def lr(self, val:float)->None: 81 | self._lr = self.set_val('lr', listify(val, self._lr)) 82 | 83 | @property 84 | def mom(self)->float:return self._mom[-1] 85 | @mom.setter 86 | def mom(self, val:float)->None: 87 | if 'momentum' in self.opt_keys: self.set_val('momentum', listify(val, self._mom)) 88 | elif 'betas' in self.opt_keys: self.set_val('betas', (listify(val, self._mom), self._beta)) 89 | self._mom = listify(val, self._mom) 90 | 91 | @property 92 | def beta(self)->float: return None if self._beta is None else self._beta[-1] 93 | @beta.setter 94 | def beta(self, val:float)->None: 95 | "Set beta (or alpha as makes sense for given optimizer)." 96 | if val is None: return 97 | if 'betas' in self.opt_keys: self.set_val('betas', (self._mom, listify(val, self._beta))) 98 | elif 'alpha' in self.opt_keys: self.set_val('alpha', listify(val, self._beta)) 99 | self._beta = listify(val, self._beta) 100 | 101 | @property 102 | def wd(self)->float: return self._wd[-1] 103 | @wd.setter 104 | def wd(self, val:float)->None: 105 | "Set weight decay." 106 | if not self.true_wd: self.set_val('weight_decay', listify(val, self._wd), bn_groups=self.bn_wd) 107 | self._wd = listify(val, self._wd) 108 | 109 | #Helper functions 110 | def read_defaults(self)->None: 111 | "Read the values inside the optimizer for the hyper-parameters." 112 | self._beta = None 113 | if 'lr' in self.opt_keys: self._lr = self.read_val('lr') 114 | if 'momentum' in self.opt_keys: self._mom = self.read_val('momentum') 115 | if 'alpha' in self.opt_keys: self._beta = self.read_val('alpha') 116 | if 'betas' in self.opt_keys: self._mom,self._beta = self.read_val('betas') 117 | if 'weight_decay' in self.opt_keys: self._wd = self.read_val('weight_decay') 118 | reserved_names = ['params', 'lr', 'momentum', 'alpha', 'betas', 'weight_decay'] 119 | stat_names = [n for n in self.opt_keys if n not in reserved_names] 120 | self._stats = {n:self.read_val(n) for n in stat_names} 121 | 122 | def get_stat(self, name:str)->float: 123 | if name in ['lr', 'mom', 'beta', 'wd']: return getattr(self, name) 124 | else: return self._stats[name][-1] 125 | def set_stat(self, name:str, value:Union[float, Collection[float]])->None: 126 | if name in ['lr', 'mom', 'beta', 'wd']: setattr(self, name, value) 127 | else: 128 | val = listify(value, self._stats[name]) 129 | self.set_val(name, val) 130 | self._stats[name] = val 131 | 132 | def set_val(self, key:str, val:Any, bn_groups:bool=True)->Any: 133 | "Set `val` inside the optimizer dictionary at `key`." 134 | if is_tuple(val): val = [(v1,v2) for v1,v2 in zip(*val)] 135 | for v,pg1,pg2 in zip(val,self.opt.param_groups[::2],self.opt.param_groups[1::2]): 136 | pg1[key] = v 137 | if bn_groups: pg2[key] = v 138 | return val 139 | 140 | def read_val(self, key:str) -> Union[List[float],Tuple[List[float],List[float]]]: 141 | "Read a hyperparameter `key` in the optimizer dictionary." 142 | val = [pg[key] for pg in self.opt.param_groups[::2]] 143 | if is_tuple(val[0]): val = [o[0] for o in val], [o[1] for o in val] 144 | return val 145 | 146 | def get_state(self): 147 | "Return the inner state minus the layer groups." 148 | return {'opt_state':self.opt.state_dict(), 'lr':self._lr, 'wd':self._wd, 'beta':self._beta, 'mom':self._mom, 149 | 'opt_func':self.opt_func, 'true_wd':self.true_wd, 'bn_wd':self.bn_wd} 150 | 151 | @classmethod 152 | def load_with_state_and_layer_group(cls, state:dict, layer_groups:Collection[nn.Module]): 153 | res = cls.create(state['opt_func'], state['lr'], layer_groups, wd=state['wd'], true_wd=state['true_wd'], 154 | bn_wd=state['bn_wd']) 155 | res._mom,res._beta = state['mom'],state['beta'] 156 | res.load_state_dict(state['opt_state']) 157 | return res 158 | 159 | class Callback(): 160 | "Base class for callbacks that want to record values, dynamically change learner params, etc." 161 | _order=0 162 | def on_train_begin(self, **kwargs:Any)->None: 163 | "To initialize constants in the callback." 164 | pass 165 | def on_epoch_begin(self, **kwargs:Any)->None: 166 | "At the beginning of each epoch." 167 | pass 168 | def on_batch_begin(self, **kwargs:Any)->None: 169 | "Set HP before the output and loss are computed." 170 | pass 171 | def on_loss_begin(self, **kwargs:Any)->None: 172 | "Called after forward pass but before loss has been computed." 173 | pass 174 | def on_backward_begin(self, **kwargs:Any)->None: 175 | "Called after the forward pass and the loss has been computed, but before backprop." 176 | pass 177 | def on_backward_end(self, **kwargs:Any)->None: 178 | "Called after backprop but before optimizer step. Useful for true weight decay in AdamW." 179 | pass 180 | def on_step_end(self, **kwargs:Any)->None: 181 | "Called after the step of the optimizer but before the gradients are zeroed." 182 | pass 183 | def on_batch_end(self, **kwargs:Any)->None: 184 | "Called at the end of the batch." 185 | pass 186 | def on_epoch_end(self, **kwargs:Any)->None: 187 | "Called at the end of an epoch." 188 | pass 189 | def on_train_end(self, **kwargs:Any)->None: 190 | "Useful for cleaning up things and saving files/models." 191 | pass 192 | def jump_to_epoch(self, epoch)->None: 193 | "To resume training at `epoch` directly." 194 | pass 195 | 196 | def get_state(self, minimal:bool=True): 197 | "Return the inner state of the `Callback`, `minimal` or not." 198 | to_remove = ['exclude', 'not_min'] + getattr(self, 'exclude', []).copy() 199 | if minimal: to_remove += getattr(self, 'not_min', []).copy() 200 | return {k:v for k,v in self.__dict__.items() if k not in to_remove} 201 | 202 | def __repr__(self): 203 | attrs = func_args(self.__init__) 204 | to_remove = getattr(self, 'exclude', []) 205 | list_repr = [self.__class__.__name__] + [f'{k}: {getattr(self, k)}' for k in attrs if k != 'self' and k not in to_remove] 206 | return '\n'.join(list_repr) 207 | 208 | class SmoothenValue(): 209 | "Create a smooth moving average for a value (loss, etc) using `beta`." 210 | def __init__(self, beta:float): 211 | self.beta,self.n,self.mov_avg = beta,0,0 212 | 213 | def add_value(self, val:float)->None: 214 | "Add `val` to calculate updated smoothed value." 215 | self.n += 1 216 | self.mov_avg = self.beta * self.mov_avg + (1 - self.beta) * val 217 | self.smooth = self.mov_avg / (1 - self.beta ** self.n) 218 | 219 | CallbackList = Collection[Callback] 220 | 221 | def _get_init_state(): return {'epoch':0, 'iteration':0, 'num_batch':0, 'skip_validate': False} 222 | 223 | @dataclass 224 | class CallbackHandler(): 225 | "Manage all of the registered `callbacks` and `metrics`, smoothing loss by momentum `beta`." 226 | callbacks:CallbackList=None 227 | metrics:CallbackList=None 228 | beta:float=0.98 229 | 230 | def __post_init__(self)->None: 231 | "Initialize smoother and learning stats." 232 | self.callbacks = ifnone(self.callbacks, []) 233 | self.metrics = ifnone(self.metrics, []) 234 | self.metrics = [(met if isinstance(met, Callback) else AverageMetric(met)) for met in self.metrics] 235 | self.callbacks = sorted(self.callbacks, key=lambda o: getattr(o, '_order', 0)) 236 | self.smoothener = SmoothenValue(self.beta) 237 | self.state_dict:Dict[str,Union[int,float,Tensor]]=_get_init_state() 238 | 239 | def _call_and_update(self, cb, cb_name, **kwargs)->None: 240 | "Call `cb_name` on `cb` and update the inner state." 241 | new = ifnone(getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs), dict()) 242 | for k,v in new.items(): 243 | if k not in self.state_dict: 244 | raise Exception(f"{k} isn't a valid key in the state of the callbacks.") 245 | else: self.state_dict[k] = v 246 | 247 | def __call__(self, cb_name, call_mets=True, **kwargs)->None: 248 | "Call through to all of the `CallbakHandler` functions." 249 | if call_mets: 250 | for met in self.metrics: self._call_and_update(met, cb_name, **kwargs) 251 | for cb in self.callbacks: self._call_and_update(cb, cb_name, **kwargs) 252 | 253 | def set_dl(self, dl:DataLoader): 254 | "Set the current `dl` used." 255 | if hasattr(self, 'cb_dl'): self.callbacks.remove(self.cb_dl) 256 | if isinstance(dl.dataset, Callback): 257 | self.callbacks.append(dl.dataset) 258 | self.cb_dl = dl.dataset 259 | 260 | def on_train_begin(self, epochs:int, pbar:PBar, metrics:MetricFuncList)->None: 261 | "About to start learning." 262 | self.state_dict = _get_init_state() 263 | self.state_dict.update(dict(n_epochs=epochs, pbar=pbar, metrics=metrics)) 264 | names = [(met.name if hasattr(met, 'name') else camel2snake(met.__class__.__name__)) for met in self.metrics] 265 | self('train_begin', metrics_names=names) 266 | if self.state_dict['epoch'] != 0: 267 | self.state_dict['pbar'].first_bar.total -= self.state_dict['epoch'] 268 | for cb in self.callbacks: cb.jump_to_epoch(self.state_dict['epoch']) 269 | 270 | def on_epoch_begin(self)->None: 271 | "Handle new epoch." 272 | self.state_dict['num_batch'],self.state_dict['stop_training'] = 0,False 273 | self('epoch_begin') 274 | 275 | def on_batch_begin(self, xb:Tensor, yb:Tensor, train:bool=True)->Tuple[Any,Any]: 276 | "Handle new batch `xb`,`yb` in `train` or validation." 277 | self.state_dict.update(dict(last_input=xb, last_target=yb, train=train, 278 | stop_epoch=False, skip_step=False, skip_zero=False, skip_bwd=False)) 279 | self('batch_begin', call_mets = not self.state_dict['train']) 280 | return self.state_dict['last_input'], self.state_dict['last_target'] 281 | 282 | def on_loss_begin(self, out:Tensor)->Any: 283 | "Handle start of loss calculation with model output `out`." 284 | self.state_dict['last_output'] = out 285 | self('loss_begin', call_mets=False) 286 | return self.state_dict['last_output'] 287 | 288 | def on_backward_begin(self, loss:Tensor)->Tuple[Any,Any]: 289 | "Handle gradient calculation on `loss`." 290 | self.smoothener.add_value(loss.float().detach().cpu()) 291 | self.state_dict['last_loss'], self.state_dict['smooth_loss'] = loss, self.smoothener.smooth 292 | self('backward_begin', call_mets=False) 293 | return self.state_dict['last_loss'], self.state_dict['skip_bwd'] 294 | 295 | def on_backward_end(self)->Any: 296 | "Handle end of gradient calculation." 297 | self('backward_end', call_mets=False) 298 | return self.state_dict['skip_step'] 299 | 300 | def on_step_end(self)->Any: 301 | "Handle end of optimization step." 302 | self('step_end', call_mets=False) 303 | return self.state_dict['skip_zero'] 304 | 305 | def on_batch_end(self, loss:Tensor)->Any: 306 | "Handle end of processing one batch with `loss`." 307 | self.state_dict['last_loss'] = loss 308 | self('batch_end', call_mets = not self.state_dict['train']) 309 | if self.state_dict['train']: 310 | self.state_dict['iteration'] += 1 311 | self.state_dict['num_batch'] += 1 312 | return self.state_dict['stop_epoch'] 313 | 314 | def on_epoch_end(self, val_loss:Tensor)->bool: 315 | "Epoch is done, process `val_loss`." 316 | self.state_dict['last_metrics'] = [val_loss] if val_loss is not None else [None] 317 | self('epoch_end', call_mets = val_loss is not None) 318 | self.state_dict['epoch'] += 1 319 | return self.state_dict['stop_training'] 320 | 321 | def on_train_end(self, exception:Union[bool,Exception])->None: 322 | "Handle end of training, `exception` is an `Exception` or False if no exceptions during training." 323 | self('train_end', exception=exception) 324 | 325 | @property 326 | def skip_validate(self): return self.state_dict['skip_validate'] 327 | 328 | class AverageMetric(Callback): 329 | "Wrap a `func` in a callback for metrics computation." 330 | def __init__(self, func): 331 | # If func has a __name__ use this one else it should be a partial 332 | name = func.__name__ if hasattr(func, '__name__') else func.func.__name__ 333 | self.func, self.name = func, name 334 | self.world = num_distrib() 335 | 336 | def on_epoch_begin(self, **kwargs): 337 | "Set the inner value to 0." 338 | self.val, self.count = 0.,0 339 | 340 | def on_batch_end(self, last_output, last_target, **kwargs): 341 | "Update metric computation with `last_output` and `last_target`." 342 | if not is_listy(last_target): last_target=[last_target] 343 | self.count += first_el(last_target).size(0) 344 | val = self.func(last_output, *last_target) 345 | if self.world: 346 | val = val.clone() 347 | dist.all_reduce(val, op=dist.ReduceOp.SUM) 348 | val /= self.world 349 | self.val += first_el(last_target).size(0) * val.detach().cpu() 350 | 351 | def on_epoch_end(self, last_metrics, **kwargs): 352 | "Set the final result in `last_metrics`." 353 | return add_metrics(last_metrics, self.val/self.count) 354 | 355 | def annealing_no(start:Number, end:Number, pct:float)->Number: 356 | "No annealing, always return `start`." 357 | return start 358 | def annealing_linear(start:Number, end:Number, pct:float)->Number: 359 | "Linearly anneal from `start` to `end` as pct goes from 0.0 to 1.0." 360 | return start + pct * (end-start) 361 | def annealing_exp(start:Number, end:Number, pct:float)->Number: 362 | "Exponentially anneal from `start` to `end` as pct goes from 0.0 to 1.0." 363 | return start * (end/start) ** pct 364 | def annealing_cos(start:Number, end:Number, pct:float)->Number: 365 | "Cosine anneal from `start` to `end` as pct goes from 0.0 to 1.0." 366 | cos_out = np.cos(np.pi * pct) + 1 367 | return end + (start-end)/2 * cos_out 368 | 369 | def do_annealing_poly(start:Number, end:Number, pct:float, degree:Number)->Number: 370 | "Helper function for `anneal_poly`." 371 | return end + (start-end) * (1-pct)**degree 372 | def annealing_poly(degree:Number)->Number: 373 | "Anneal polynomically from `start` to `end` as pct goes from 0.0 to 1.0." 374 | return functools.partial(do_annealing_poly, degree=degree) 375 | 376 | class Scheduler(): 377 | "Used to \"step\" from start,end (`vals`) over `n_iter` iterations on a schedule defined by `func`" 378 | def __init__(self, vals:StartOptEnd, n_iter:int, func:Optional[AnnealFunc]=None): 379 | self.start,self.end = (vals[0],vals[1]) if is_tuple(vals) else (vals,0) 380 | self.n_iter = max(1,n_iter) 381 | if func is None: self.func = annealing_linear if is_tuple(vals) else annealing_no 382 | else: self.func = func 383 | self.n = 0 384 | 385 | def restart(self): self.n = 0 386 | 387 | def step(self)->Number: 388 | "Return next value along annealed schedule." 389 | self.n += 1 390 | return self.func(self.start, self.end, self.n/self.n_iter) 391 | 392 | @property 393 | def is_done(self)->bool: 394 | "Return `True` if schedule completed." 395 | return self.n >= self.n_iter 396 | 397 | -------------------------------------------------------------------------------- /sls/sls.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import copy 3 | import time 4 | 5 | import sls_utils as ut 6 | 7 | class Sls(torch.optim.Optimizer): 8 | """Implements stochastic line search 9 | `paper `_. 10 | Arguments: 11 | params (iterable): iterable of parameters to optimize or dicts defining 12 | parameter groups 13 | n_batches_per_epoch (int, recommended):: the number batches in an epoch 14 | init_step_size (float, optional): initial step size (default: 1) 15 | c (float, optional): armijo condition constant (default: 0.1) 16 | beta_b (float, optional): multiplicative factor for decreasing the step-size (default: 0.9) 17 | gamma (float, optional): factor used by Armijo for scaling the step-size at each line-search step (default: 2.0) 18 | beta_f (float, optional): factor used by Goldstein for scaling the step-size at each line-search step (default: 2.0) 19 | reset_option (float, optional): sets the rest option strategy (default: 1) 20 | eta_max (float, optional): an upper bound used by Goldstein on the step size (default: 10) 21 | bound_step_size (bool, optional): a flag used by Goldstein for whether to bound the step-size (default: True) 22 | line_search_fn (float, optional): the condition used by the line-search to find the 23 | step-size (default: Armijo) 24 | """ 25 | 26 | def __init__(self, 27 | params, 28 | n_batches_per_epoch=500, 29 | init_step_size=1, 30 | c=0.1, 31 | beta_b=0.9, 32 | gamma=2.0, 33 | beta_f=2.0, 34 | reset_option=1, 35 | eta_max=10, 36 | bound_step_size=True, 37 | line_search_fn="armijo"): 38 | defaults = dict(n_batches_per_epoch=n_batches_per_epoch, 39 | init_step_size=init_step_size, 40 | c=c, 41 | beta_b=beta_b, 42 | gamma=gamma, 43 | beta_f=beta_f, 44 | reset_option=reset_option, 45 | eta_max=eta_max, 46 | bound_step_size=bound_step_size, 47 | line_search_fn=line_search_fn) 48 | super().__init__(params, defaults) 49 | 50 | self.state['step'] = 0 51 | self.state['step_size'] = init_step_size 52 | 53 | self.state['n_forwards'] = 0 54 | self.state['n_backwards'] = 0 55 | 56 | def step(self, closure): 57 | # deterministic closure 58 | seed = time.time() 59 | def closure_deterministic(): 60 | #with ut.random_seed_torch(int(seed)): 61 | return closure() 62 | 63 | batch_step_size = self.state['step_size'] 64 | 65 | # get loss and compute gradients 66 | loss = closure() #_deterministic() 67 | loss.backward() 68 | 69 | # increment # forward-backward calls 70 | self.state['n_forwards'] += 1 71 | self.state['n_backwards'] += 1 72 | 73 | # loop over parameter groups 74 | for group in self.param_groups: 75 | params = group["params"] 76 | 77 | # save the current parameters: 78 | params_current = copy.deepcopy(params) 79 | grad_current = ut.get_grad_list(params) 80 | 81 | grad_norm = ut.compute_grad_norm(grad_current) 82 | 83 | step_size = ut.reset_step(step_size=batch_step_size, 84 | n_batches_per_epoch=group['n_batches_per_epoch'], 85 | gamma=group['gamma'], 86 | reset_option=group['reset_option'], 87 | init_step_size=group['init_step_size']) 88 | 89 | # only do the check if the gradient norm is big enough 90 | with torch.no_grad(): 91 | if grad_norm >= 1e-8: 92 | # check if condition is satisfied 93 | found = 0 94 | step_size_old = step_size 95 | 96 | for e in range(100): 97 | # try a prospective step 98 | ut.try_sgd_update(params, step_size, params_current, grad_current) 99 | 100 | # compute the loss at the next step; no need to compute gradients. 101 | loss_next = closure() #closure_deterministic() 102 | self.state['n_forwards'] += 1 103 | 104 | # ================================================= 105 | # Line search 106 | if group['line_search_fn'] == "armijo": 107 | armijo_results = ut.check_armijo_conditions(step_size=step_size, 108 | step_size_old=step_size_old, 109 | loss=loss, 110 | grad_norm=grad_norm, 111 | loss_next=loss_next, 112 | c=group['c'], 113 | beta_b=group['beta_b']) 114 | found, step_size, step_size_old = armijo_results 115 | if found == 1: 116 | break 117 | 118 | elif group['line_search_fn'] == "goldstein": 119 | goldstein_results = ut.check_goldstein_conditions(step_size=step_size, 120 | loss=loss, 121 | grad_norm=grad_norm, 122 | loss_next=loss_next, 123 | c=group['c'], 124 | beta_b=group['beta_b'], 125 | beta_f=group['beta_f'], 126 | bound_step_size=group['bound_step_size'], 127 | eta_max=group['eta_max']) 128 | 129 | found = goldstein_results["found"] 130 | step_size = goldstein_results["step_size"] 131 | 132 | if found == 3: 133 | break 134 | 135 | # if line search exceeds max_epochs 136 | if found == 0: 137 | print("line search attempts exceeded...using defaults") 138 | ut.try_sgd_update(params, 1e-6, params_current, grad_current) 139 | 140 | # save the new step-size 141 | self.state['step_size'] = step_size 142 | self.state['step'] += 1 143 | 144 | return loss -------------------------------------------------------------------------------- /sls/sls_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.cuda 3 | 4 | import numpy as np 5 | #import contextlib 6 | 7 | 8 | def check_armijo_conditions(step_size, step_size_old, loss, grad_norm, 9 | loss_next, c, beta_b): 10 | found = 0 11 | 12 | # computing the new break condition 13 | break_condition = loss_next - \ 14 | (loss - (step_size) * c * grad_norm**2) 15 | 16 | if (break_condition <= 0): 17 | found = 1 18 | 19 | else: 20 | # decrease the step-size by a multiplicative factor 21 | step_size = step_size * beta_b 22 | 23 | return found, step_size, step_size_old 24 | 25 | def check_goldstein_conditions(step_size, loss, grad_norm, 26 | loss_next, 27 | c, beta_b, beta_f, bound_step_size, eta_max): 28 | found = 0 29 | if(loss_next <= (loss - (step_size) * c * grad_norm ** 2)): 30 | found = 1 31 | 32 | if(loss_next >= (loss - (step_size) * (1 - c) * grad_norm ** 2)): 33 | if found == 1: 34 | found = 3 # both conditions are satisfied 35 | else: 36 | found = 2 # only the curvature condition is satisfied 37 | 38 | if (found == 0): 39 | raise ValueError('Error') 40 | 41 | elif (found == 1): 42 | # step-size might be too small 43 | step_size = step_size * beta_f 44 | if bound_step_size: 45 | step_size = min(step_size, eta_max) 46 | 47 | elif (found == 2): 48 | # step-size might be too large 49 | step_size = max(step_size * beta_b, 1e-8) 50 | 51 | return {"found":found, "step_size":step_size} 52 | 53 | 54 | def reset_step(step_size, n_batches_per_epoch=None, gamma=None, reset_option=1, 55 | init_step_size=None): 56 | if reset_option == 0: 57 | pass 58 | 59 | elif reset_option == 1: 60 | step_size = step_size * gamma**(1. / n_batches_per_epoch) 61 | 62 | elif reset_option == 2: 63 | step_size = init_step_size 64 | 65 | return step_size 66 | 67 | def try_sgd_update(params, step_size, params_current, grad_current): 68 | zipped = zip(params, params_current, grad_current) 69 | 70 | for p_next, p_current, g_current in zipped: 71 | p_next.data = p_current - step_size * g_current 72 | 73 | def compute_grad_norm(grad_list): 74 | grad_norm = 0. 75 | for g in grad_list: 76 | if g is None: 77 | continue 78 | grad_norm += torch.sum(torch.mul(g, g)) 79 | grad_norm = torch.sqrt(grad_norm) 80 | return grad_norm 81 | 82 | 83 | def get_grad_list(params): 84 | return [p.grad for p in params] 85 | 86 | #@contextlib.contextmanager 87 | def random_seed(seed): 88 | state = np.random.get_state() 89 | np.random.seed(seed) 90 | try: 91 | yield 92 | finally: 93 | np.random.set_state(state) 94 | 95 | #@contextlib.contextmanager 96 | def random_seed_torch(seed, device=0): 97 | cpu_rng_state = torch.get_rng_state() 98 | gpu_rng_state = torch.cuda.get_rng_state(0) 99 | 100 | np.random.seed(seed) 101 | torch.manual_seed(seed) 102 | torch.cuda.manual_seed_all(seed) 103 | 104 | try: 105 | yield 106 | finally: 107 | torch.set_rng_state(cpu_rng_state) 108 | torch.cuda.set_rng_state(gpu_rng_state, device) --------------------------------------------------------------------------------