├── .gitignore ├── DeepMemory ├── DeepMemory-Playground.ipynb ├── README.md └── deepmemory.py ├── LICENSE ├── README.md ├── Ranger └── ranger.py ├── adahessian ├── README.md └── adahessian.py ├── adamod ├── README.md ├── adamod.py └── diffmod.py ├── diffgrad ├── README.md ├── diff_rgrad.py ├── diffgrad-playground.ipynb ├── diffgrad.py └── mxresnet.py ├── diffmod ├── diffmod-playground.ipynb └── diffmod.py ├── images ├── 1120-optimizer-testing.jpg ├── projected_gradient.png ├── ranger-init.jpg └── ranger-with-gc-options.jpg ├── madgrad └── madgrad_wd.py └── sls ├── README.md ├── basic_train.py ├── callback.py ├── sls.py └── sls_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | -------------------------------------------------------------------------------- /DeepMemory/DeepMemory-Playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2\n", 11 | "\n", 12 | "%matplotlib inline" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "from fastai.script import *\n", 22 | "from fastai.vision import *\n", 23 | "from fastai.callbacks import *\n", 24 | "from fastai.distributed import *\n", 25 | "from fastai.callbacks.tracker import *\n", 26 | "\n", 27 | "torch.backends.cudnn.benchmark = True\n", 28 | "\n", 29 | "import time" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "'1.0.57'" 41 | ] 42 | }, 43 | "execution_count": 3, 44 | "metadata": {}, 45 | "output_type": "execute_result" 46 | } 47 | ], 48 | "source": [ 49 | "import fastai;fastai.__version__ #safety check" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 4, 
55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "'1.2.0'" 61 | ] 62 | }, 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "import torch; torch.__version__ #safety check" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 5, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "from deepmemory import DeepMemory" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 6, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "Mish activation loaded...\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "from mxresnet import *" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 7, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "PosixPath('/home/ubuntu/.fastai/data/imagenette-160')" 107 | ] 108 | }, 109 | "execution_count": 7, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "path = untar_data(URLs.IMAGENETTE_160); path #optional - IMAGENETTE" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 8, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "def flattenAnneal(learn:Learner, lr:float, n_epochs:int, start_pct:float):\n", 125 | " n = len(learn.data.train_dl)\n", 126 | " anneal_start = int(n*n_epochs*start_pct)\n", 127 | " anneal_end = int(n*n_epochs) - anneal_start\n", 128 | " phases = [TrainingPhase(anneal_start).schedule_hp('lr', lr),\n", 129 | " TrainingPhase(anneal_end).schedule_hp('lr', lr, anneal=annealing_cos)]\n", 130 | " sched = GeneralScheduler(learn, phases)\n", 131 | " learn.callbacks.append(sched)\n", 132 | " learn.fit(n_epochs)\n" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 9, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | 
"tfms = ([\n", 142 | "\n", 143 | " flip_lr(p=0.5)#,\n", 144 | " #brightness(change=(0.4,0.6)),\n", 145 | " #contrast(scale=(0.7,1.3)),\n", 146 | " #cutout(n_holes=(2,40),length=(5,30),p=.25)\n", 147 | "\n", 148 | " ], [])" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 10, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "bs=64\n", 158 | "size=128" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 11, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "data = (ImageList.from_folder(path)\n", 168 | " .split_by_folder(valid='val')\n", 169 | " .label_from_folder()\n", 170 | " .transform(tfms=tfms,size=size) \n", 171 | " .databunch(bs=bs, num_workers=8) #windows 10 users - num_workers may need to be set to 1 or 0 (if you get pickle fork error)\n", 172 | " .presize(size, scale=(0.5, 1))\n", 173 | " .normalize(imagenet_stats))" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 12, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "data": { 183 | "text/plain": [ 184 | "201" 185 | ] 186 | }, 187 | "execution_count": 12, 188 | "metadata": {}, 189 | "output_type": "execute_result" 190 | } 191 | ], 192 | "source": [ 193 | "memory_size = (len(data.x)//bs);memory_size #should be equal to or close to # of batches per epoch in order to build an average step size for the dataset" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 13, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "optar = partial(DeepMemory,betas=(.95,.999),len_memory = memory_size)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 14, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "model = mxresnet50(sa=1)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 15, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "learn = Learner(data, model, 
metrics=[accuracy], wd=1e-3,\n", 221 | " opt_func=optar,\n", 222 | " bn_wd=False, true_wd=True,\n", 223 | " loss_func = LabelSmoothingCrossEntropy())" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 16, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "learn.callback_fns += [\n", 233 | " partial(ShowGraph),\n", 234 | " #partial(SaveModelCallback, name='model-novotest-1')\n", 235 | " ]" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 17, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "name": "stdout", 245 | "output_type": "stream", 246 | "text": [ 247 | "DeepMemory: length of memory is 201 - this should be close or equal to batches per epoch\n" 248 | ] 249 | }, 250 | { 251 | "data": { 252 | "text/html": [ 253 | "\n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | 
" \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | "
epochtrain_lossvalid_lossaccuracytime
02.5711962.5479250.43200000:41
12.2281642.1192140.60400000:38
22.0802571.9475100.66600000:37
31.9792832.0592220.63600000:37
41.8872421.7520530.74200000:37
51.8383711.7793050.73200000:37
61.8018791.7866060.72800000:37
71.7281101.7040010.75200000:37
81.6999861.7143470.75800000:37
91.6819881.6300250.78400000:37
101.6258261.7106770.76400000:37
111.5834071.6071640.81400000:37
121.5654651.5776480.80000000:37
131.5653151.5569620.80600000:37
141.5160481.6664690.78000000:38
151.4757361.5417280.82600000:37
161.4399511.4825740.83400000:37
171.4008931.4343330.85800000:38
181.3749151.4118270.86800000:38
191.3305611.4212910.86800000:37
" 406 | ], 407 | "text/plain": [ 408 | "" 409 | ] 410 | }, 411 | "metadata": {}, 412 | "output_type": "display_data" 413 | }, 414 | { 415 | "data": { 416 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXUAAAD4CAYAAAATpHZ6AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nO3deXxc5X3v8c9vFi2jxVosy5I3ycbYxgu2EY4bCCGQBTCB9Iam7itpU5Jb2pD2QnpvE0ialKTkXrqn6c1yIaVJCoWAUxoIhIQmGJOyysY23vdFkmVt1r7OzHP/OGNZNtZiW2dGGn/fr9e8dObMmTM/HUtfP3rOc55jzjlERCQ9BFJdgIiIjB+FuohIGlGoi4ikEYW6iEgaUaiLiKSRkB87nVJY5ObPm+vHrkVE0tLGjRubnHMlF7ofX0J9Wvksqqur/di1iEhaMrPD47EfX7pfNPJdRCQ1/OlTV6qLiKSETydKleoiIqngS5+6Il1EzsXAwAA1NTX09vamuhTfZWVlMXPmTMLhsC/79yXURUTORU1NDXl5eVRUVGBmqS7HN845mpubqampobKy0pfPUJ+6iKRcb28vxcXFaR3oAGZGcXGxr3+RaPSLiEwI6R7oJ/n9feqKUhGRNKKWuohc9FpbW/n2t799zu+76aabaG1t9aGi8+dTn7piXUQmj+FCPRaLjfi+5557joKCAr/KOi8a0igiF7177rmH/fv3s3z5csLhMLm5uZSVlbF582Z27NjBRz7yEY4ePUpvby933XUXd9xxBwAVFRVUV1fT2dnJjTfeyNVXX80rr7zCjBkz+MlPfkJ2dnbSvxcNaRSRCeWrz2xnR137uO7zsvJ8/uLDi4d9/YEHHmDbtm1s3ryZ9evXs2bNGrZt2zY47PDhhx+mqKiInp4errzySj760Y9SXFx82j727t3LY489xkMPPcTHPvYxfvzjH/OJT3xiXL+PsVBLXUTkDKtWrTptHPk3v/lNnnrqKQCOHj3K3r173xHqlZWVLF++HIArrriCQ4cOJa3eofxpqSvVReQ8jdSiTpacnJzB5fXr1/Of//mfvPrqq0QiEa699tqzjjPPzMwcXA4Gg/T09CSl1jNpSKOIXPTy8vLo6Og462ttbW0UFhYSiUTYtWsXr732WpKrOzfqfhGRi15xcTFXXXUVS5YsITs7m9LS0sHXbrjhBr773e+ybNkyFixYwOrVq1NY6ejM+TD8cPaCpe7I7rfHfb8ikp527tzJokWLUl1G0pzt+zWzjc65qgvdtz8XH2mcuohISqhPXUQkjSjURUTSiOZ+ERFJI5pPXUQkjailLiKSRsYU6mZWYGbrzGyXme00s9/wuzARkYksNzcXgLq6Om677bazbnPttddSXV2dzLLGfPHRPwLPO+duM7MMIDLSxk5tdRG5SJSXl7Nu3bpUlzFo1FA3s3zgGuD3AZxz/UD/iG9SpovIJPOFL3yBOXPmcOeddwJw3333YWZs2LCBEydOMDAwwP3338+tt9562vsOHTrEzTffzLZt2+jp6eH2229nx44dLFq0KCXzv4ylpT4XaAT+xcwuBzYCdznnuoZuZGZ3AHcAFJTPHe86ReRi8bN7oH6cr0ifvhRufGDETdauXcvdd989GOpPPPEEzz//PJ/73OfIz8+nqamJ1atXc8sttwx7n9HvfOc7RCIRtm7dytatW1m5cuX4fh9jMJY+9RCwEviOc24F0AXcc+ZGzrkHnXNVzrmqrOyscS5TRMRfK1asoKGhgbq6OrZs2UJhYSFlZWV88YtfZNmyZbz//e+ntraW48ePD7
uPDRs2DM6hvmzZMpYtW5as8geNpaVeA9Q4515PPF/HWUJdRGRcjNKi9tNtt93GunXrqK+vZ+3atTz66KM0NjayceNGwuEwFRUVZ512d6jhWvHJMmpL3TlXDxw1swWJVdcDO0Z+zzhUJiKSZGvXruXxxx9n3bp13HbbbbS1tTFt2jTC4TAvvvgihw8fHvH911xzDY8++igA27ZtY+vWrcko+zRjHf3yJ8CjiZEvB4Db/StJRCQ1Fi9eTEdHBzNmzKCsrIyPf/zjfPjDH6aqqorly5ezcOHCEd//mc98httvv51ly5axfPlyVq1alaTKT/Fl6t3p8xa7+v3bx32/IpKeNPXuRJ96V2MaRURSQnO/iIikEc39IiITwsVycx2/v0/Npy4iKZeVlUVzc3PaB7tzjubmZrKy/LuWx58bT6f3v4uIjLOZM2dSU1NDY2NjqkvxXVZWFjNnzvRt/76EujpgRORchMNhKisrU11GWvDpxtN+7FVEREajE6UiImlEJ0pFRNKIul9ERNKITy11pbqISCqoT11EJI2o+0VEJI2opS4ikkY0oZeISBrR1LsiImlE49RFRNKITpSKiKQRnSgVEUkj6n4REUkjPnW/qK0uIpIKaqmLiKQR9amLiKSRMd35yMwOAR1ADIg656pG2l69LyIiqXEut7N7n3OuybdKRETkgulEqYhIGhlrqDvgF2a20czuONsGZnaHmVWbWbUiXUQkNcba/XKVc67OzKYBL5jZLufchqEbOOceBB4EyCybr1wXEUmBMbXUnXN1ia8NwFPAqtHeE48r10VEkm3UUDezHDPLO7kMfBDYNtr7ogp1EZGkG0v3SynwlJmd3P7fnHPPj/amuE6Wiogk3aih7pw7AFx+rjtWS11EJPl8myYgplAXEUk6hbqISBpRqIuIpBGFuohIGvEv1DX6RUQk6fwL9ZhCXUQk2dRSFxFJIz72qcf92rWIiAzDx1D3a88iIjIc30I9qpa6iEjS+RbqynQRkeRTS11EJI3411LX6BcRkaTzr6WuceoiIkmnceoiImlEc7+IiKQRhbqISBpRqIuIpBGFuohIGvFxnLpCXUQk2XwL9f6oLj4SEUk230K9Lxrza9ciIjKMMYe6mQXN7C0z++lYtu8dUEtdRCTZzqWlfhewc6wbq6UuIpJ8Ywp1M5sJrAG+N9Yd96mlLiKSdGNtqX8D+DwwbFKb2R1mVm1m1QC9aqmLiCTdqKFuZjcDDc65jSNt55x70DlX5ZyrCpippS4ikgJjaalfBdxiZoeAx4HrzOyRkd5gppa6iEgqjBrqzrl7nXMznXMVwFrgV865T4y4U7XURURSwpdx6mbQp4uPRESSLnQuGzvn1gPrR9sugNE7oO4XEZFkU0tdRCSN+BLqATN6+tVSFxFJNl9CPRgwOvqifuxaRERG4E9LPQCdfQN+7FpEREbgT0vdjM5etdRFRJLNp5a60dkXxTndKENEJJl8a6kPxJxGwIiIJJlvLXWATp0sFRFJKp9a6t7XDvWri4gklb8tdYW6iEhS+danDtChYY0iIknla0td3S8iIsnlS6iHEqHe3Nnvx+5FRGQYPoW6t9vGjj4/di8iIsPwbZbGwkiYxs5eP3YvIiLD8CXUAablZdHQrpa6iEgy+RbqJXmZNHYq1EVEksnfUFefuohIUvnY/ZJJQ0efJvUSEUkiX1vq/dE47RqrLiKSNL6GOkBjh0bAiIgki++h3qB+dRGRpPG1Tx10AZKISDKNGupmlmVmb5jZFjPbbmZfHcuOS/KyAIW6iEgyhcawTR9wnXOu08zCwK/N7GfOuddGelN+VoiMUEChLiKSRKOGuvPGJHYmnoYTj1HHKZoZpfmZHD3RfWEViojImI2pT93Mgma2GWgAXnDOvX6Wbe4ws2ozq25sbARgcdkUdh7rGNeCRURkeGMKdedczDm3HJgJrDKzJWfZ5kHnXJVzrqqkpASAS6fncbi5i96B2LgWLSIiZ3dOo1+cc6
3AeuCGsWy/oDSPuIM9x9VaFxFJhrGMfikxs4LEcjbwfmDXWHa+ck4BANWHTlxAiSIiMlZjaamXAS+a2VbgTbw+9Z+OZedlU7KZWZjNm4daLqRGEREZo1FD3Tm31Tm3wjm3zDm3xDn3tXP5gFUVRbx56MTgxF6vHWhm7r3P8sNXD2myLxGRcebbFaUnVVUU0dTZx+HmbjYdOcHaB18j7uArP9nOzf/0a7r7NeGXiMh48T3UV1UWAvDGoRYe+JnXFf9f91zHb1fNYntdO3c+ukktdhGRceJ7qM8ryaUwEubNgy1sr23j999dwYyCbP7Pf1vKR1fOZP3uRp7cWON3GSIiFwXfQ93MWD23mCc31tDVH2NmYbb3wQHjb25bxrsqi/jLZ3ZwqKnL71JERNKe76EO8IHLSgeXF5Xln/rwgPG3v3U5fbE41/7tev7gh9XJKEdEJG0lJdSvWzhtcPnyWQWnvTarKML3b78SgBd2HOd7Lx9IRkkiImkpKaFeEMnAzFvOzXznHGLvnjeVvV+/kao5hdz/7E6+/B/biMd18lRE5FwlJdQBXr/3ejb82fuGfT0cDPDg71VRYq1kvPkdPvgPLxGNxZNVnohIWhjLfOrjYlp+1qjbFOVk8NL7a4m8/AhXtO7hK09m8pWPriIrHExChSIik1/SWupjFbnuz3Af+EtuCFbzuzv+gA985Qc0tOvm1SIiYzHhQh0z7Kr/QeB3f8zcjDaezvgyn3vgGzxRfTTVlYmITHgTL9RPmncdmXe+RHZROT8MP8Dupx7g/63fl+qqREQmtIkb6gBFc8n6o19hC9fw5fAjTP3l3Sy45ynaugdSXZmIyIQ0sUMdIDOPwG//K33vuYePBl/miYyv8aGvPcY///pgqisTEZlwJn6oAwQCZF5/L6z9NxaF63km88957tmnqLz3WbbXtaW6OhGRCWNyhPpJC9eQ8YcvUlBQxOOZX2dt4Jes+eav+fP/eJvmzr5UVyciknLmx7S3VVVVrrrax3lcek7Auk/D/l/ySPR6vhr9JAOJIfd/vmYRt19VSTBg/n2+iMg4M7ONzrmqC93P5Gqpn5RdCB9/Eq66i0+EfsmG0n9g7WWZANz/7E7WfPNl6ts0tl1ELj6Ts6U+1Nvr4CefhUgx0Y89wpN1U/naMzvoGYgxf1ou/7h2BZeV54++HxGRFBqvlvrkD3WAus3wo09AVyPc8k8cLF/DXY+/xdYa7yTqtQtKmJaXya76DmYVRphRmM2ffuBSTT8gIhOGQv1MnY3w5Cfh8H/Bu/+E3vd+hV/taeZLT73NiWHGtV8+cwqY8cHLSrnz2nmYqR9eRFJDoX42sQF4/l548yGYvgyW/hZcdguvnchj17F2ZhdHWDg9nzcOtnD3jzaf9tb8rBB/+N55fGhxKTMLI2rFi0hSJS3UzWwW8ENgOhAHHnTO/eNI70lZqJ+05XF49VtQv9V7Pn0ZXHYLLLoVSi4d3CwWd7yw4zh//8Ju9hzvPG0XS2dM4fffXcHKOYVUTs0ZvDm2WvMi4odkhnoZUOac22RmecBG4CPOuR3DvSfloX5Sy0HY+QzsfBpq3vTWlSyERbd4IV+6hJN373DO8dzb9WypaWXnsXZe3tt01l1+6qpKttS0AvC7q+fwwcWlRDJCEI9Dy36o3Xjq0XIQVnwcrvk8ZOlkrYgML2XdL2b2E+D/OudeGG6bCRPqQ7XVwq6fwo6n4cgr4OJQWHmqBT9j5WDAA3T2Rak+1MJnHtlEz0DsHbsr4QTLA/u5PLCfFcEDrAgeIBL3bp7tMnKx8hWQNQV2PQs5U+H6v4DlH4fA5BxFKiL+Skmom1kFsAFY4pxrH267CRnqQ3U2egG/8xk4+BLEo5A/AxZ92GvFz14NgSF96r3tcGwz1G6kff/rhI69RaS3HoAoQXbGZ7ElPo8tbh6b45ew35UTJ8ANi6fzqbknWPDW15nStAnKlsONfw2z35Wib1xEJqqkh7qZ5QIvAV93zv37WV
6/A7gDYPbs2VccPnz4QmtLjp4TsPt5r4tm3y8h1gc502DhTd6J19qN0LgbSBynorkw44pTj+lLIZzNia5+DjR18s+/Pshzb9ef8SGOWwKvcG/4McqshY75v0ns+vuYUjqHw83dzCjMJhxUC17kYpbUUDezMPBT4OfOub8fbfsJ31IfTl8H7P2F10Wz9wXIiAwJ8JVQvhIiRWPaVX80TmtPP1/+j21EMkLEneMXmw/wmdDT/GHwWWIE+Hb0Fh6KrSErO4c/ft8lzC/N5b2XluhkrMhFKJknSg34AdDinLt7LDudtKE+VDwGFjitn308HG7u4tHnX2ZN/be5vOMljtk0Hsr+FA+3LAWMopwMWrr6ASiMhLlhSRl/ct0lOKCjd4Ce/hgNHX0ca+3hhiVl3P/sDmJxx5LEaJ1X9jdzpKWbW5eXMzU3c1xrFxH/JDPUrwZeBt7GG9II8EXn3HPDvSctQj0ZDm6An90DDdvpLFvNX3E7/3ow7x2bhYPGQOzcrye4/aoKbl5WxhVzxvbXhYikji4+ShexKGz6AfzqfuhtpX/57xG/9ktkTZkGwEt7Gvn7F/aw5Wgrq+cWsagsnynZYY40d5OdEeQ3V8xgwfQ8/vdzuzjS0kVeZhgAh+Pn248DUDk1h/deWsJNS8u4sqLwrN07sbjjx5tqqD7UwvrdjVx9yVSyMoJcWVHIjUvKCAcDxJ3DgJD6/0XGnUI93XS3wPoH4M3vQWYuXPtFuPLTEAyf3/5iA+w+Usfru4/w+u5a9tS3ESJGboZRnB2gu6+flTPzmZ4X4s0DjTS2dxMknnjECBInRJxAYl09hRyIl3OMIlbPLeaLNy1i6Ywp6v8XGScK9XTVsBOevwcOrPculLr6c2BB6O+Avk7o7/S+9rWfWj7bupg/Nw3pcpkccGXsd+Xsj5ez35VTMHsJH7n+PayaXz64XTQWV4te5Bwo1NOZc7D7Z/DzL8KJs9yLNSPXe2Tmea36k8sZud7zzDzIGPJaKMsbdx8I4QJBatsGmJKTReeAo2fAmF2SSyiU4f3nEQgObjv43Aza66BpDwPHd9NXv4uu2p2UxBoImPfzE3fGUVfCIco5SDm7Y2W05VTyensxc2bNZn5pPv/9PZXML82jrWeAUMAwg4Go41e7j/PS7kYG4o68zBAtXf2sWTqdDy4sJjsY94aWxqMQzNCVuZK2FOoXg2gfNO6CcORUYIdzJs5Vqf3dbN+2CZr2smf7JvI7DzLXaimP1pLJqb8UWl0Oh10pDggTI5R4hIkSstg71xEjZPGzfmQTBfQXzmf6vGUEShbA1EuhZAHklQ2OVIrFHd39UfKyvK6rfQ2dPL/tGCtmF7J8VgE5maHRv7e+Tmje5z2a9kLzXu/riUPeZ5Utg7LLvcf0ZZBdcKFHUy5yCnWZuOJxaK+Bpj301++ms24n8eaD7Gvspjdu9LsgWZlZdEeNqVNyyMjIpCg/h+L8HNr6IC+SRV1HlA37W2noihMlSHZWFtHeTubaMS4J1HJJoI58ugc/sicQoT48i809peyNlbHPlbPPzeAIpUTd2WfcXDgtm/z+43y+KsgVOU1Y8z76ju8mo/UA1lE3uJ3DsIJZUDwfiiq9v1qObYH22lM7K6w4FfJll3tXD+dM9esIp87gHEeboGE7FMyB2b/hdRVOlMbGJKVQl4uGc27whKxzjqe31PHK3ibqjx2Gxj3MitdwidVyidUyL3CMMmsZfG+/C1IXLOOAm0FX/jxOdPUzte8Ic+0YlVZPpp2aa7/NRTjgyr1zBvGyweXDrpQ+Mrjvw5exsCyfyqk5lOZn8dKmHRS276SgbQfUb6W4fRc5XUdOFZ4/44ygv/y0vygmPOegrQbqNnkhXrcJ6rZAn3fzGSwILjEvUlaBN73G7NVeyJevgJCukzgXCnURoHcgRn1bL8GAEckIUpyb6c3V07SXgeM7CbXsw5r2QNNub9ZMM1xhBfGiS3DFl3Aiew6vtBbyd5sckYJS+u
OO/micSEaQGQXZrFlWzpsHW/hR9dHTPjcjFKA/+s4uony6WBY6wmI7RFXmERa5g8yIHcUS00z0ZBQRKltKOG9q4pxIHi4zH8vM955neV/j4TxOxDJpJ0IoK5+SogL6oo5IZtC/KSU6G88I8Le8u4kBBMJQuvjUldUzVsLUBdB2FI685k2Sd+Q1aNrjbR/M9K7EPhnys1api2oUCnWRcxXt91rJ5zFMtLW7n689s4PMcIA5xTnUnujhUHMX18wvYSAeZ1FZPrvrO9jf0EkkI8jB5m7ePNhC3DmC0W4W2hGWBA6yxA5xaeAohYEeikJ9ZEQ7yaR/9NJdgE6y6XARBkI55E0pJB7KJpKTS25uPhbO9s69DPkaC2URyIhg4QiEI7hwFrubYzy5pZmW/iDvnd7HB6bUEmnaitW95QU0AOadpzgZ3uUrvUAPZ41+oLqa4OjrcDgR8sc2eye5MW8fJ0N+9mqYMvOc/x3SmUJdZJLoi8ZoaO+jqz9KY0cf33pxH82d/Rxs6iIad4SJsqAQikJ9NDY1kUc3VWUhZkViBAc6CMe6qK0/Ti49TMvoJzjQSS49ZFk/2fSTG+wnNzBAxPoJRHvI4Oy3bxzOUVdKa+ES6nIW0Vq4lN1WSV13iIJImPfML2FOcYTczBD7GzuZW5JLRXGEnoGYdx+B0fR3e5PiHXnVexx9wxt2CzBlNsy60uuXzy/3Qj6/HPJnenMsTZZuqnGiUBeZ5GJxx676dhaU5o15TL9zjhPdA7x2oJkXdzXQ2ef9R9HU2cehZu/EcYA42dbPnDwozoxxrOkE2fQzryDANRU5LJgawvq7qenP4fOvBmnlnVNTjNWc4gj/84ML+NDiUjJDQdp7B4jGHIWRMLG4Ixiw0y9Qi0W9E6yHX8UdeRXqNmHtdYnW/BChrETAz0g8ymHKjCHPZ6Rd8CvUReQ0LV39HGvrYe7UXLIzxn6P3aMt3ext6GBeSS4dvVFmFmaTmxni1/uaCAUC7D7ewdObaynMyWDh9Hx2HmunoaOPncdO3VJhuPmJSvMzWTA9n6DB27XtVBRHCAWNzr4o+xo6uWJOIbcsm85Nc4Pk9jZQf3QfBdEmWuoOEG+rJbu3nvz+BsLdx7Hhgj+vHHKneY+cEsgtPX05pwRCGed9XJNFoS4iKTcQi/PUplq21LTy6OtHmJqbQdWcIoIBo6Gjl70NXldLPO7ICgdp7uonFvcyJyMYoD926mTzmc+HChBnKm2UWTNl1sLs0AlWFfdQMNDIzFAr+bEThHuaCEc7z/p+sgreGfa5Jd69E3JLvbuUZUQS14TkJM5N5EBwDF1M40ShLiKTxpnDUuMOggHDOcfGwyd46OUDFOVkkhE0pmSHmT4lmxWzC+gZiLG9rp3ttW2UTcnmQFMn9W29dPZF2V73zpuvZdJPibVRYm1MpZUFuT3Myeoiq6+ZrL4mpgc7KAu2kxc7QWasa/TCgxlDgj5yKvgHl3NOrXNxrxvp5BXQZy4PPh/wpvYefM17bp99bVxCPXn/DYnIRWtov7qZEbRTy1UVRVRVDD899MrZhWdd39UXJWDGhr2NhAJGXlaY2tZu7v33t8ksqGBXay87gxm0tQ3ggK7+KMU5mZzo9P5ayKKPqdZGCW3kWTerZ2Zx08IpFIaiRAJ9hGO90N8FA93eCd+BLhjo8db1tkJ7Hb3dHQRjPdhAD4FggEAw7E2xEUh8DYaGPA96I69OPg9nD3keAl4bn2OtlrqIXEz6ojH2NXTS2NHHnuMdzCqMsONYO999af9p5wWm5mYQCgToGYiREQowpyhCaX4WB5q6qG/rIRZ3tPee6ucPB42rL5nK9YtKWbO0jMKcc+vHV/eLiMg4qm/rpfpwC4eaunhlfzPg3ZayMCeDbbVtHGvrpXJqDgBLZ0wh5hzxuGN2UYRrLi3h6c11p12klpsZoi8aozgnk4JImJzMEOGgMX9aHqGgcay1l+6BGPG4o7Gjj1/86XsV6iIiyRKPOwKBkY
dQdvZF2d/QyfrdjWypaeVISzdFORm8cbCFd1UWcai5i66+GJ19USqn5jAlO0wwYAQDxpN/9G71qYuIJMtogQ5e6/zyWQVcPmvkKRGGnjg+yf7ogsobpGnVRESSzM87hinURUTSiEJdRCSNKNRFRNKIQl1EJI2MGupm9rCZNZjZtmQUJCIi528sLfXvAzf4XIeIiIyDUUPdObcBaBltOxERSb1x61M3szvMrNrMqhsbG8drtyIicg7GLdSdcw8656qcc1UlJSXjtVsRETkHGv0iIpJGFOoiImlkLEMaHwNeBRaYWY2Zfdr/skRE5HyMOkujc+53klGIiIhcOHW/iIikEYW6iEgaUaiLiKQRhbqISBpRqIuIpBGFuohIGlGoi4ikEYW6iEgaUaiLiKQRhbqISBpRqIuIpBGFuohIGlGoi4ikEYW6iEgaUaiLiKQRhbqISBpRqIuIpBGFuohIGlGoi4ikEYW6iEgaUaiLiKQRhbqISBpRqIuIpJExhbqZ3WBmu81sn5nd43dRIiJyfkYNdTMLAt8CbgQuA37HzC7zuzARETl3Y2mprwL2OecOOOf6gceBW/0tS0REzkdoDNvMAI4OeV4DvOvMjczsDuCOxNM+M9t24eX5airQlOoixkB1jp/JUCNMjjonQ40wueqcMx47Gkuo21nWuXescO5B4EEAM6t2zlVdYG2+mgw1guocT5OhRpgcdU6GGmHS1VkxHvsaS/dLDTBryPOZQN14fLiIiIyvsYT6m8B8M6s0swxgLfC0v2WJiMj5GLX7xTkXNbM/Bn4OBIGHnXPbR3nbg+NRnM8mQ42gOsfTZKgRJkedk6FGuAjrNOfe0T0uIiKTlK4oFRFJIwp1EZE0Mq6hPtGmEzCzQ2b2tpltNrPqxLoiM3vBzPYmvhYm1puZfTNR+1YzW+lTTQ+bWcPQcfznU5OZfTKx/V4z+2SS6rzPzGoTx3Ozmd005LV7E3XuNrMPDVnv28+Emc0ysxfNbKeZbTezuxLrJ9TxHKHOiXY8s8zsDTPbkqjzq4n1lWb2euLY/CgxYAIzy0w835d4vWK0+n2s8ftmdnDIsVyeWJ+y36HEZwTN7C0z+2niuf/H0jk3Lg+8k6j7gblABrAFuGy89n+eNR0Cpp6x7q+BexLL9wB/lVi+CfgZ3rj81cDrPtV0DbAS2Ha+NQFFwIHE18LEcmES6rwP+F9n2fayxL93JlCZ+DkI+v0zAZQBKxPLecCeRC0T6niOUOdEO54G5CaWw8DrieP0BLA2sf67wGcSy3cC300sr1XbmUIAAAOdSURBVAV+NFL9Ptf4feC2s2yfst+hxOf8KfBvwE8Tz30/luPZUp8s0wncCvwgsfwD4CND1v/QeV4DCsysbLw/3Dm3AWi5wJo+BLzgnGtxzp0AXgBuSEKdw7kVeNw51+ecOwjsw/t58PVnwjl3zDm3KbHcAezEuwJ6Qh3PEeocTqqOp3POdSaehhMPB1wHrEusP/N4njzO64DrzcxGqN/PGoeTst8hM5sJrAG+l3huJOFYjmeon206gZF+cJPBAb8ws43mTWMAUOqcOwbeLxswLbE+lfWfa02prPWPE3/GPnyyW2OEepJWZ+LP1RV4LbcJezzPqBMm2PFMdBdsBhrwgm4/0Oqci57lMwfrSbzeBhT7XeeZNTrnTh7LryeO5T+YWeaZNZ5RSzL+zb8BfB6IJ54Xk4RjOZ6hPqbpBJLsKufcSrwZJj9rZteMsO1ErH+4mlJV63eAecBy4Bjwd4n1Ka3TzHKBHwN3O+faR9p0mHpSVeeEO57OuZhzbjneleOrgEUjfGZK6jyzRjNbAtwLLASuxOtS+UIqazSzm4EG59zGoatH+Mxxq3M8Q33CTSfgnKtLfG0AnsL7IT1+slsl8bUhsXkq6z/XmlJSq3PueOIXKg48xKk/A1NWp5mF8YLyUefcvydWT7jjebY6J+LxPMk51wqsx+uHLjCzkxcqDv3MwXoSr0/B67JLSp1Dar
wh0cXlnHN9wL+Q+mN5FXCLmR3C6ya7Dq/l7v+xHMcTAiG8kw2VnDqJs3i89n8e9eQAeUOWX8HrM/sbTj+J9teJ5TWcfkLlDR9rq+D0E5DnVBNeS+Qg3gmewsRyURLqLBuy/Dm8vj6AxZx+MucA3kk9X38mEsflh8A3zlg/oY7nCHVOtONZAhQklrOBl4GbgSc5/eTenYnlz3L6yb0nRqrf5xrLhhzrbwAPTITfocRnXcupE6W+H8vxLv4mvDP7+4Ev+XGAzqGWuYmDsQXYfrIevH6qXwJ7E1+LhvwwfCtR+9tAlU91PYb3p/YA3v/Cnz6fmoBP4Z002QfcnqQ6/zVRx1a8+X+GhtKXEnXuBm5Mxs8EcDXen6Jbgc2Jx00T7XiOUOdEO57LgLcS9WwDvjLkd+mNxLF5EshMrM9KPN+XeH3uaPX7WOOvEsdyG/AIp0bIpOx3aMjnXMupUPf9WGqaABGRNKIrSkVE0ohCXUQkjSjURUTSiEJdRCSNKNRFRNKIQl1EJI0o1EVE0sj/B4N8l4hazWFYAAAAAElFTkSuQmCC\n", 417 | "text/plain": [ 418 | "
" 419 | ] 420 | }, 421 | "metadata": {}, 422 | "output_type": "display_data" 423 | } 424 | ], 425 | "source": [ 426 | "flattenAnneal(learn,4e-3, 20, .72) #imagenette" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 23, 432 | "metadata": {}, 433 | "outputs": [ 434 | { 435 | "data": { 436 | "text/html": [ 437 | "\n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 
565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | "
epochtrain_lossvalid_lossaccuracytime
02.7835042.8460700.29200000:38
12.5149732.6563900.37400000:36
22.3319842.5867600.40000000:36
32.2111902.3795110.47800000:36
42.0867472.1229400.58600000:35
51.9771752.1921980.58000000:36
61.8904281.9888420.63600000:36
71.8042302.1073340.61000000:36
81.7430781.8514160.69800000:36
91.7034221.7780180.72000000:36
101.6540181.8857940.68200000:36
111.6213821.8123100.72400000:36
121.5933951.7781660.74000000:36
131.5587901.7759040.72800000:36
141.5186921.7417600.74400000:36
151.4865591.6096520.76600000:36
161.4139431.6840220.76600000:37
171.3542261.6126170.79400000:37
181.2916161.5381400.81600000:36
191.2504321.5424790.80800000:36
" 590 | ], 591 | "text/plain": [ 592 | "" 593 | ] 594 | }, 595 | "metadata": {}, 596 | "output_type": "display_data" 597 | }, 598 | { 599 | "data": { 600 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWoAAAD6CAYAAACIyQ0UAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nO3dd3hc133m8e8BMOgdRCNBAuwUK0iCVCGjYtkyKckqEaPQ3bJjJnYcS8rus5I3u4n1xM46+yRu6yLTjuIiWbZMSZYtq1iyRTOqFCg2sFeAKETvdTBz9o87IAASZQDMAJfE+3meeTCcueU3l8CLg3PPPddYaxEREfeKmOoCRERkZApqERGXU1CLiLicglpExOUU1CIiLqegFhFxuVGD2hiz2Bizb8CjxRjzwGQUJyIiYMYyjtoYEwlUAFdba0uHWy4lLd0unD8vBOWJiEwPe/bsqbPWZg71XtQYt3UzcGqkkAbImjmb4uLiMW5aRGT6MsYMm6tj7aPeCjw5zE62GWOKjTHFLS0tY9ysiIgMJ+igNsZEA3cAvxrqfWvtdmttkbW2KCkpKVT1iYhMe2NpUW8G3rPWVo+2oGYPEREJnbH0UX+YYbo9LqGkFpEx8Hq9lJeX09XVNdWlhF1sbCx5eXl4PJ6g1wkqqI0x8cAHgL8OZnnltIiMRXl5OUlJSRQUFGCMmepywsZaS319PeXl5cydOzfo9YLq+rDWdlhrM6y1zUEtH/TuRUSgq6uLjIyMKzqkAYwxZGRkjPkvh/BcmaikFpExutJDus94PmdYgtoqqUVEQkZzfYjItNfU1MT3vve9Ma9366230tTUFIaKBgtPi1oNahG5jAwX1D6fb8T1XnjhBVJTU8NV1gVjvYRcROSK8/DDD3Pq1CkKCwvxeDwkJiaSm5vLvn37OHz4MHfddRfnzp2jq6uL+++/n23btgFQUFBAcXExbW1tbN68mY0bN/Lmm28ya9YsnnvuOeLi4kJSX1iCWg1qERmvR357iMOVoZ2GYunMZP7pQ8uGff9rX/saJSUl7Nu3j507d3LbbbdRUlJyYQjdY489Rnp6Op2dnaxbt4577rmHjIyMQds4ceIETz75JD/84Q+59957efrpp/nYxz4WkvrDE9Tq+xCRy9j69esHjXP+9re/zbPPPgvAuXPnOHHixCVBPXfuXAoLCwFYu3YtZ8+eDVk96voQEVcZqeU7WRISEi4837lzJ6+++ipvvfUW8fHx3HjjjUOOg46JibnwPDIyks7OzpDVE6bheSIil4+kpCRaW1uHfK+5uZm0tDTi4+M5evQob7/99iRXF64WtZJaRC4jGRkZbNiwgeXLlxMXF0d2dvaF9zZt2sSjjz7KypUrWbx4Mddcc82k1zemO7wEa9bC5bbiREnItysiV6YjR45w1VVXTXUZk2aoz2uM2WOtLRpqeV3wIiLicmG64EV9HyIioaIWtYiIy2nUh4iIy6lFLSLicpqUSUTE5dSiFhEZh8TERAAqKyvZsmXLkMvceOONFBcXT3hfunGAiMgEzJw5kx07doR1H7oyUUQEeOihh8jPz+fzn/88AF/+8pcxxrBr1y4aGxvxer185Stf4c477xy03tmzZ7n99tspKSmhs7OT++67j8OHD3PVVVeFbL4PTXMqIu7y4sNw/mBot5mzAjZ/bcRFtm7dygMPPHAhqJ966ileeuklHnzwQZKTk6mrq+Oaa67hjjvuGPa+h9///veJj4/nwIEDHDhwgDVr1oSkfM2eJyICrF69mpqaGiorK6mtrSUtLY3c3FwefPBBdu3aRUREBBUVFV
RXV5OTkzPkNnbt2sUXv/hFAFauXMnKlStDUltQQW2MSQV+BCzHaTB/2lr71nDLa9SHiIzbKC3fcNqyZQs7duzg/PnzbN26lSeeeILa2lr27NmDx+OhoKBgyClOBwrH3dSDPZn4LeAla+0SYBVwJOSViIhMsa1bt/KLX/yCHTt2sGXLFpqbm8nKysLj8fDaa69RWlo64vrXX389TzzxBAAlJSUcOHAgJHWN2qI2xiQD1wOfArDW9gA9I62jUR8icjlatmwZra2tzJo1i9zcXD760Y/yoQ99iKKiIgoLC1myZMmI63/uc5/jvvvuY+XKlRQWFrJ+/fqQ1DXqNKfGmEJgO3AYpzW9B7jfWtt+0XLbgG0ASbnz1rZUngpJgSJy5dM0pxOf5jQKWAN831q7GmgHHr54IWvtdmttkbW2yBMdPfbKRURkSMEEdTlQbq19J/DvHTjBLSIik2DUoLbWngfOGWMWB166GacbZIR1QlCZiEwr02Ue+/F8zmDHUf8d8IQxJho4Ddw3SiljLkREpq/Y2Fjq6+vJyMgIy/A2t7DWUl9fT2xs7JjWCyqorbX7gCE7uYdcfkwliMh0l5eXR3l5ObW1tVNdStjFxsaSl5c3pnXCcwm5klpExsDj8TB37typLsO1NM2piIjL6VZcIiIuF54WtZJaRCRkdOMAERGXUx+1iIjLqY9aRMTl1EctIuJyalGLiLic+qhFRFwuPC1qXZooIhIyalGLiLic+qhFRFxOoz5ERFxOLWoREZfTyUQREZcLW4va71dYi4iEQthGfXj9/nBtWkRkWglfUPvUohYRCYWwBXVPr1rUIiKhEMYWtYJaRCQU1KIWEXG5oO5Cbow5C7QCPqDXWls02jpqUYuIhEZQQR1wk7W2LtiFexTUIiIhEb4+6l6N+hARCYVgg9oCvzfG7DHGbBtqAWPMNmNMsTGmGNSiFhEJlWC7PjZYayuNMVnAK8aYo9baXQMXsNZuB7YDxOQutOqjFhEJjaBa1NbaysDXGuBZYP1o6yioRURCY9SgNsYkGGOS+p4DtwAlo62noBYRCY1guj6ygWeNMX3L/9xa+9JoK/XoZKKISEiMGtTW2tPAqrFuWC1qEZHQ0CXkIiIup0vIRURcTi1qERGXC1+LWvNRi4iEhFrUIiIuF8a5PhTUIiKhoBa1iIjLhSWojYFutahFREIiLEEdYQwdPb5wbFpEZNoJW1C39/SGY9MiItNOmIIaOtWiFhEJiTC2qBXUIiKhEJagjowwdHSr60NEJBTUohYRcbmw9VF36GSiiEhIhCeoIzQ8T0QkVMI3jlp91CIiIRG+rg+vD79fM+iJiExU2FrU1kJXr7o/REQmKmx91ADt3QpqEZGJClvXB+jqRBGRUAhb1weg+T5EREIg6KA2xkQaY/YaY54fdaOBoG7TyA8RkQkbS4v6fuBIMAtGBfo+mju84yhJREQGCiqojTF5wG3Aj4JZPjIQ1I0dPeMuTEREHMG2qL8J/A9g2Nu2GGO2GWOKjTHFTY0NADR3qkUtIjJRowa1MeZ2oMZau2ek5ay12621RdbaoqzMGURGGJrU9SEiMmHBtKg3AHcYY84CvwDeZ4x5fLSVUuI86voQEQmBUYPaWvsla22etbYA2Ar80Vr7sdHWS4330KSuDxGRCQvLOGqA1DiPRn2IiIRA1FgWttbuBHYGs2xafDRVzV3jKElERAYKW4s6KzmW6hYFtYjIRIUtqPPS4qhv79GdXkREJihsQT0nPR6Acw2d4dqFiMi0EPagLmvoCNcuRESmBQW1iIjLhW94XryHpJgozimoRUQmJGxBbYxhdnq8WtQiIhMUtqAGp/ujtL49nLsQEbnihTeoM+I519ipu5GLiExAWIN6fmYCPb1+dX+IiExAWIN62cwUAEoqm8O5GxGRK1pYg3pRdhKeSENJRUs4dyMickULa1BHR0WwKDuJQ2pRi4iMW1iDGmDFrBQOVjRjrU4oioiMR9iDemVeKk0dXp
1QFBEZp7AH9bqCNAD+dLw23LsSEbkihT2oF2YnsTArkef3V4V7VyIiV6SwBzXA7StnsvtsA5VNmvJURGSsJiWo71o9E4Dn9lVOxu5ERK4okxLU+RkJrCtI48ndZfh0ObmIyJhMSlADfGbjXMoaOnjhoPqqRUTGYtSgNsbEGmN2G2P2G2MOGWMeGc+Oblmaw/zMBL6385TGVIuIjEEwLepu4H3W2lVAIbDJGHPNmHcUYfj8jQs4UtXCCwfPj3V1EZFpa9Sgto62wD89gce4msR3rZ7F4uwk/u33x+jp9Y9nEyIi005QfdTGmEhjzD6gBnjFWvvOEMtsM8YUG2OKa2uHvrglMsLw8OYlnKlrZ9vPitUFIiIShKCC2lrrs9YWAnnAemPM8iGW2W6tLbLWFmVmZg67rZuWZLFpWQ47j9XyVz8p1thqEZFRjGnUh7W2CdgJbJrITr/zkdU88P6F/OFoDZ/+8bt0eX0T2ZyIyBUtmFEfmcaY1MDzOOD9wNGJ7DQqMoIH3r+IH36iiKPnW9m6/W3drVxEZBjBtKhzgdeMMQeAd3H6qJ8Pxc4/sDSbr9y1nBPVrXzoO69TUtEMlfvgtX+B2uOh2IWIyGXPhOOEXlFRkS0uLg56+ePVrXzqsd30+Pw8v66EnLceASzkrIAVfwHL74GUvJDXKSLiFsaYPdbaoqHem7QrE0eyKDuJn/3V1fj8lmt3LuJ7a3+L75Z/gchoeOUf4RvL4LFN8O6PoL1uqssVEZlUrmhR9znX0MH/efEILxw8z4YFGfzHJ9cR23IWSp6Bkh1QexRMJMy/CZZvgSW3QWxyyOsXEZlsI7WoXRXUfX7y5ln+6TeHWJydxFfvXk5RQTpYC9WHnMA++DQ0l0FULCz6oBPaC28BT2wIP4WIyOS57IIa4PkDlTy04wDtPT4+dV0Bn94wlzkZ8c6b1sK53U5oH3oW2mshJhmu+pDTnz33BoiMCsEnERGZHJdlUAOU1rfz1d8d4ZUj1fSVmZkUw6euK+Dj1+aTHOsBXy+c+ROUPA1HfgvdLZCQCfNugrQCSMuH1DmQmg/JsxTgIuJKl21Q9zlT5wT2yZpWunv9VDV3AWAMfOGmBfz1DfNJjIkCbxecfAUO7oCKPdBSAXbAnCIm0hk9kjonEOAFga+BME/MhghXnF8VkWnmsg/qi+0ta+Sp4nJeOFhFc6eXzKQY/u0vVnHDoosuXfd5obkcmkqhsdT52lTW/7ytevDykTEDQjwf5l4PizdDVEzYPouICFyBQT3Qa8dq+NLTBznf0kXh7FR+fN86UuOjg1vZ2wlN5wJBfnZAoJdBwxnoboa4dFh5L6z+mDOuW0QkDK7ooAZo6fLyz789zK/2lAPw4fVzuHFxJh9cljP+jfp9cPo12Ps4HP0d+HogZyWs/jis2ALx6SGqXkRkGgR1n13Ha/nSMwepCMzIt3xWMtcvzOSvb5hPSpxn/BvuaHD6vfc9DlX7nQtxltwGhR9zxnRHRIboE4jIdDVtgrrP3rJG/tevS2jq8F4I7b+5YT4PbVqMMWZiGz9/EPY+AQd+CZ0NzkiSVVuh8KOQMT8E1YvIdDTtgnqgPxyp5sndZbx6pAaA+OhIblqSxRfft5DFOUnj33BvNxx/yekaOfmqM7okf4MT2EvvhJjEEH0CEZkOpnVQA1hr+fnuMp569xz7y5svvJ6REM3dq2fxdzcvnFjXSEsV7H8S9j0B9SchOhGW3eX0Z8++2hlHKCIygmkf1AN1eX3843MlNLR7efVI//C81XNS+ejV+dyxaiZREYaIiHGEq7Vw7h2nlX3oWehpg5lrYOODsOR2jdEWkWEpqEfw3L4K/vXFo/ispbql+8Lr8zIT+MDSbN63OIvs5Fhmp8cTOZbw7mmHA0/Bm9+GhtOQsRA2PgAr7oWoIIcPisi0oaAOgrWWPx6t4Zm9FbR0eqlv6+FwVcuF9/Mz4rl79SyunpvBirwU50rIYPh9cPjX8Po3nBORyXlw3R
dgzScgOiFMn0ZELjcK6nH6w5FqHnr6IHVt3YNej4wwxHsimZ+VyCN3LMMCv9lXSX17Nyeq29i8PIe/XD+brKQBs/lZCyf/AK9/HUrfcC6kufpvYP1nNSZbRBTUodDS5WXnsVpioiL47f5Knj9QNeLyi7OT+Pd7V7F8Vsqlb5a947Swj7/onHhc+ym49m8heWZ4ihcR11NQh0lVcyef/Wkx182fwV/92VxS4jxER0bw8qFq/v6pfXT0+FicncQjdy5jVV4qcdEXXRhTfQhe/6Yz819EpDMee8MDEx+PbS201TiXwjeVQnyGM2+JLswRcS0F9RQ4WdPGx//jnQsz/QGsn5vOQ5uW0Nbdy8KsRGamxjlvNJ6FN/8fvPcz51L1pXfCn/095K4aeuPWQmdjYH6Ssksnm2oqg96uweskzYRVfwmrPgKZi8LymUVk/BTUU6ipo4dn3qtg14ladh6rHfTezUuy+Oz187hmXgYAXY1VRL37KFF7HnPm1Z5/s9PKbq/rD+S+SaN6WgfvKDZ18Mx/qfnO85TZUHfcGed94hWwPshbB6s+DMv/HOLSJutQiMgIFNQusae0kV3Haylr6CDCGP54tJrGDu8ly2VEdXGv/T3bol8izTY5L3oS+kN44M0Q+sI5doi+8Iu1VsPBp2Dfz6HmsDOt65LboPAjzo0WQnlTBb8f6k9AeTGUvwuNZ2DpXc6+NG2syCUmFNTGmNnAT4EcwA9st9Z+a6R1FNTB6fL6+N+/Lrkw698Hl2VTkJFAS1cvT+4uI4Ye5ptKvAm5dHlSuHt1HnevyWPujASstVjL+C/MqdrvBPbBp5xulMSc/q6RrCVj32ZHQ38oVxRD+R5nmliAmBRIzHSu2kzMgWs/D2vv042JRQaYaFDnArnW2veMMUnAHuAua+3h4dZRUIfG4coWvv+nU+w8WkN7Ty/+wH9VUmwUrV29ACzISiQ1zkNXr4/3LcnmCzctIDpqDFdA9nbD8ZedrpHjLztdIzPXOC3f5fcMPXTQ54XqkkAwB8K54ZTznomArGWQVxR4rHMu9jHGuWXa69+A0zudvwDWfdYZopiYeek+RKaZkHZ9GGOeA75jrX1luGUU1KFnreXlQ+d5+r0KDpY3097TS2ePjxV5KRytaqXT67uwbEqchz9fM4uPXp3PgqwxTA7VVgMHf+W0tKtLnOlcF2+GlVudk5zl7zq3OKvc23+yMjHbCeO+UM4tHH1Cqoo9zmiXI7917iS/5uNw7RecLhyRaSpkQW2MKQB2AcuttS0XvbcN2AYwZ86ctaWlpeOtV8aoob0HT6ThP984y9dfOU6E4ULre2ZKLDOSYiit7+C7H1nDdfMzgusuqTrQ3zXSUe+8FhnjjEQZGMwpeeOfdKruBLzxTdj/S2f2wRVbnOGJ2UvHtz2Ry1hIgtoYkwj8CfiqtfaZkZZVi3rqHaps5gd/Os1v9lcOej0mKoJ5mYl09/pYlZfKZ/9sHktnjtBX3NsDZ/8L4lIhe0V45ilproC3vgt7fgzedli02RmeOHt96Pcl4lITDmpjjAd4HnjZWvv10ZZXULuL328pLm1kb1kjLx06z96ypkHvXzMvnXMNnVQ0dbJ6Tip3Fc5i/dx0zjV0cK6xk+vmZ/CtV0+QlRzDnYUzWTYzhVhPGC6e6WiA3T+Edx51bsqQvwE2/j0suFlTxcoVb6InEw3wE6DBWvtAMDtUULub1+cnKsLQ0tnLd3eeZPuu0wBER0UQacyg/u7hXDsvg03Lc7hnbV7wE1QFq6cd3vupcxFQS4XTkt/4gDO8L5RDCEfj90H9KajaB5X7nK/VJRCTDGkFziN9buB54KvmbZFxmmhQbwT+CziIMzwP4H9aa18Ybh0F9eWlvbuXmtbuC8P+3jnTwNdePMqGBRnMSY/Hb2FOejzpCdG8fbqeJ3eXca6hc1Cgz0qNY+OCGaTEe/D7LWkJ0dy2IpeCGROYIbC3xzm5+cY3nYt2EjJhxqLBwdgXlPEZE2
t1+33OPir3OUMXq/Y5/fTeduf9qFjIXu7cid7b4dylvvEstNcM3k5syqW1pc11nifP0mX8Mixd8CIh19rl5d9ePsbhqhasheLSxmGXXZufRmZiDPdtKGBeZiIvlVTR5fUzJyOeGxZlEuuJxOvz44kcZlih3w/HfgdHX3AunGk4A23nBy8TnRQIx4LB4ZhW4FydGTngDj6+Xqg71t9KrtrvTEHr7XDe98Q7gZxb6Jw8nVkIMxYP3ZrvbnMCu/GsU1vj2UCIn3GuIPX39i8b4QlcqDTHaXnHpgzzSBvwPFkXCE0TCmqZFDWtXXh9lpzkWI5UtfDY62d4Zm/FmLYxb0YC6wrSuW9jAUtyRjjJ2dMRuKT+bH/rti8oG0vBN2BqWhPpjE5Jn+sEa3VJ//BCTwLkrnRCeWYgmGcsCk3L1++D5vL+2vrqbCqDrubAo2lwmA8lKu7SMI+OB4zzV4SJ6H8+4lcGrzPUdgc+4lKdX4C6M9GkUFDLlNt9poFv/eE4i7KTuH5hJmfq2qls6uSlQ+cpb+xkfmYCpfUd9Pov/X68bUUunV4fi7KT+MS1+f2TWQ3g91uMgcffKSM5JoIs00i2t4p5UbWDgzwqdkAoFzozFU5ld4S1Tkv+QnBf/Gga+vWeDsA661t///NBXxnm9cA6vV3OnDIjMk6r/kKApw5+nrMC8q9z/krQCd8JUVDLZcNaS3ljJ4+/U8oP/nR6yGWSYqJYNzedtPhofrPfabF7fZb46Eg6egafCF2Vl8LmFbn87kAVGxfOYMWsFG5anEVMVAR+a7Fwocul1+cnarjulyuV3+eEdd8vgM5hfjEM9eioh95OZzvJeU5g518HBRshY4GCe4wU1HLZqmzqpK6tm7T4aJo7vfx8dxm7jtdS3th5YZmUOA/NnV5uW5lLYnQU96zNo7Kpk1/tOccbJ+tH3cdNizMpLm2krbuXDfNnsGp2CoWz01ibn0ZavAdrA70FCp7B/H5ncq/SN527FpW+2X9yNSEzENwbnK9ZyybeheLzBqbzHdCN1HDG+YskdU7/SJy0fOccRVzaZfXLQkEtV5wur483TtZxw6LMEVvBzZ1eSuvbyU2Jw1rLq0dqON/Sxdun69l9poEIA/kZCeQkx5KZFMN7ZY0XfgkMvMITICsphoSYKM7UtXPbilxW5KVQkJHAiyVVLJ+Zgs9aDlY0MyMhmrSEaIrPNmIMnK5tJyclloc3L6EoP+3KDXxrneGMpa87oX32DWhxJhwjNgXmXNcf3rkrB5/g7dPdOviE7MCvzeXOXDR9omKdYPbEOQHecdEv5b4TzGn5A0K8oH/WSU8sbqKgFhmD1i4vhypbeONkHfvLm/H2+slMimHvuUYyE2Pw+izHqlvp6fWPvrEhrJqdyqc3FHD7ypljurO9tRZjDNZaurz+S+8Y5EZNZYNb3PUnndc9Cc6VpznLnel3+04Etw+es524tAEjeC76mpgzuJXe3eqcSG4823+iue/kclPpEDfTyB0c4AO3nzBj0lvjCmqRELPWcrqunYb2HhKio5iRGI3PWrKSYmnq6MEYgyfSEOeJJMIYmju9/HpfBU8Vl9PW7eVcQyd5aXEsyUlifmYiuSmxvHmqnvjoSNITYmjscOZv+Yui2VQ2dfLCwSpeO1pLZlIM3b1+6tq6mTcjgfvfv5A7C2eNuXa/ZUy/JEKmtRrKAq3t0jeh9qgTmH1DKS8O5LjU0OzX73e6ZfqCuy/E+wK9pZLA2VdHdGKgjoJLf0Ek54XlwisFtYiL+P2Wlw6d55n3Kihv7OB0bTs9Pj8zEmMuueP9QNFREcRERpA/I56Siv7RGnlpceQkx7IgK5HEmChq27qpaemmsaOHxJgoVs9J5WRNG+3dPnafbbiwXlJMFDcuyeKOVTO5el46Pb1ODZOq7wTAVPN2OaE9VJdLU6kze2SfiKhAn3gguNPnOd0pUTHOaJphH3bE9826zyioRdyqvbuX8sZOFmYl4reWrl4/fms5cK
6ZY9WtrCtIY3FOEjFRg7s6yuo7+NHrp6lv66GsoYPj1a10B9Eds2ZOKtFREbx9uuGS95bkJHH0fCtX5SZzw6JMIgx09zpTDvj8lprWbkoqmjld1052cgybl+eSnRzL7StzmZ0eH7Jj4ip+n9PivjjAG89Aw9n+G2RMkHmkRUEtcqXr7vVRWt/BzNQ4EmOi8Pr8+PyWyAhDr886c7lc1N1xvLqVVw5XE2EMe0obOF7dRqfXR23r8C372elxdHv9REdFUNvaTXevn8gIQ1SE4QNLs1mcnURLl5e5MxJJi/fwxqk6cpJjOdfQSX17D3vLGimYkUBeWhw+v+VvbpjP8llB3ErOjfpuNN1U6gR638VEIz6GXsYk5yqoRSR4fbd6a+nyEuuJJDLCcOx8KwUzEgZNwmWt5URNG4+/XcrTe8pp7xl5Qq+UOM+F9Zs7vbR1O1dlFuWnsSArkXvW5tHrs6zMSyEy0IqPj468cBL1ih0xg/qoRWSSWGupbumm1++np9dPQ3sPb56qZ11BOrPT48hLix+0bF1bDz/6r9M8s7dixFZ8ekI0bV29ZCbFEB0VQUqch/SEaLw+PxsWzODj1+STEOpZHCeZglpEXM3vd4Y8vn6iDotzdWq318+Z+nZaOr1kJccSHRlBW7cXv4XmDqc1HhVpKK3vYElOEjcsymRORjzrCtJJjnUugsrPiA/P3OlhMFJQX96/gkTkihARYbgqN5mrcsd2Z3prLb87WMW///44P9g19JQDs1LjuG5+hnNSNNKQmRhDWUMH75U1kpUUS25KLGvz07h79Syykt11EUwftahF5IrQ0uWlvKGT/eVN9Pot1c1deH1+9pc3caiihbSEaNq7e2nt7qWn1090ZARzMuLp7PFR0dRJhIGFWUkszkni1hW53LQk85KRNuGkFrWIXPGSYz0snekZ+R6gOKNjTlS3sSAr8UK3yOnaNp7dW8HBimZePnT+wr1GZ6fHsWJWCokxUZyqbWdxThIAje09dHl9JMRE4beWhOgo0hOjSY+PpqggnaW5ySG9clRBLSLTSkxU5CXDAedlJvLfblkMQE+vnz8erWF/eRMlFc2UVLTQ0eOjrq2bo1Utg0a2zE6Po9dn8frsoIuVkmOjKCpIZ2aq061yVW4yuclxpMR76Ozxcaq2jeoWp8WfnRxLXVsPI1FQi4gMEB0VwablOWxanjPo9b45z+vaevD6/JfMi97l9VHe2Ml7ZY28criakzVt7D7TwONvl024JvVRi4iEic9vOV7dyt6yJqpbuqhs6iQ5zjz/RoEAAAbGSURBVMPa/DRyU2Jp6eqly+sjLT6aq+dlqI9aRGSyRY5zNMvFptntLERELj8KahERlxs1qI0xjxljaowxJZNRkIiIDBZMi/rHwKYw1yEiIsMYNaittbuASyeuFRGRSRGyPmpjzDZjTLExpri2tnb0FUREJCghC2pr7XZrbZG1tigzMzNUmxURmfY06kNExOUU1CIiLhfM8LwngbeAxcaYcmPMZ8JfloiI9Bn1EnJr7YcnoxARERmauj5ERFxOQS0i4nIKahERl1NQi4i4nIJaRMTlFNQiIi6noBYRcTkFtYiIyymoRURcTkEtIuJyCmoREZdTUIuIuJyCWkTE5RTUIiIup6AWEXE5BbWIiMspqEVEXE5BLSLicgpqERGXU1CLiLicglpExOUU1CIiLhdUUBtjNhljjhljThpjHg53USIi0m/UoDbGRALfBTYDS4EPG2OWhrswERFxBNOiXg+ctNaettb2AL8A7gxvWSIi0icqiGVmAecG/LscuPrihYwx24BtgX92G2NKJl7epJgB1E11EUFSreGhWsPjcqoVpr7e/OHeCCaozRCv2UtesHY7sB3AGFNsrS0KurwppFrDQ7WGh2oNHzfXG0zXRzkwe8C/84DK8JQjIiIXCyao3wUWGmPmGmOiga3Ab8JbloiI9Bm168Na22uM+QLwMhAJPGatPTTKattDUdwkUa3hoV
rDQ7WGj2vrNdZe0t0sIiIuoisTRURcTkEtIuJyIQ1qN15qbow5a4w5aIzZZ4wpDryWbox5xRhzIvA1LfC6McZ8O1D/AWPMmjDX9pgxpmbgmPPx1GaM+WRg+RPGmE9Ocr1fNsZUBI7vPmPMrQPe+1Kg3mPGmA8OeD2s3yfGmNnGmNeMMUeMMYeMMfcHXnfdsR2hVtcd18A+Yo0xu40x+wP1PhJ4fa4x5p3AcfplYOABxpiYwL9PBt4vGO1zTEKtPzbGnBlwbAsDr0/5z9iwrLUheeCcaDwFzAOigf3A0lBtfwJ1nQVmXPTa/wUeDjx/GPjXwPNbgRdxxo5fA7wT5tquB9YAJeOtDUgHTge+pgWep01ivV8G/vsQyy4NfA/EAHMD3xuRk/F9AuQCawLPk4DjgXpcd2xHqNV1xzWwfwMkBp57gHcCx+wpYGvg9UeBzwWefx54NPB8K/DLkT7HJNX6Y2DLEMtP+c/YcI9Qtqgvp0vN7wR+Enj+E+CuAa//1DreBlKNMbnhKsJauwtomGBtHwResdY2WGsbgVeATZNY73DuBH5hre221p4BTuJ8j4T9+8RaW2WtfS/wvBU4gnOFreuO7Qi1DmfKjmugRmutbQv80xN4WOB9wI7A6xcf275jvgO42RhjRvgck1HrcKb8Z2w4oQzqoS41H+kbbrJY4PfGmD3GucwdINtaWwXODwqQFXjdDZ9hrLW5oeYvBP5UfKyvO2GEuia13sCf2qtxWlOuPrYX1QouPa7GmEhjzD6gBie0TgFN1treIfZ9oa7A+81AxmTVe3Gt1tq+Y/vVwLH9hjEm5uJaL6ppyn/GQhnUQV1qPgU2WGvX4Mz+97fGmOtHWNatnwGGr22qa/4+MB8oBKqAfw+8PuX1GmMSgaeBB6y1LSMtOkxNU1mra4+rtdZnrS3EuUp5PXDVCPue0novrtUYsxz4ErAEWIfTnfGQG2odSSiD2pWXmltrKwNfa4Bncb6xqvu6NAJfawKLu+EzjLW2Ka3ZWlsd+GHwAz+k/8/XKa3XGOPBCb4nrLXPBF525bEdqla3HteBrLVNwE6c/txUY0zfBXQD932hrsD7KTjdZ5Na74BaNwW6m6y1thv4T1x4bC8WyqB23aXmxpgEY0xS33PgFqAkUFffmdtPAs8Fnv8G+ETg7O81QHPfn8qTaKy1vQzcYoxJC/x5fEvgtUlxUR/+3TjHt6/erYGz/nOBhcBuJuH7JNAH+h/AEWvt1we85bpjO1ytbjyugboyjTGpgedxwPtx+tVfA7YEFrv42PYd8y3AH61zhm64zxHuWo8O+GVtcPrSBx5b1/2MAaEb9WH7z5oex+mz+odQbnuc9czDObO8HzjUVxNOH9kfgBOBr+m2/yzxdwP1HwSKwlzfkzh/1npxfmt/Zjy1AZ/GORlzErhvkuv9WaCeAzjf6LkDlv+HQL3HgM2T9X0CbMT50/QAsC/wuNWNx3aEWl13XAP7WAnsDdRVAvzjgJ+13YHj9CsgJvB6bODfJwPvzxvtc0xCrX8MHNsS4HH6R4ZM+c/YcA9dQi4i4nK6MlFExOUU1CIiLqegFhFxOQW1iIjLKahFRFxOQS0i4nIKahERl/v/1GeI0wnbi/8AAAAASUVORK5CYII=\n", 601 | "text/plain": [ 602 | "
" 603 | ] 604 | }, 605 | "metadata": {}, 606 | "output_type": "display_data" 607 | } 608 | ], 609 | "source": [ 610 | "flattenAnneal(learn,4e-3, 20, .72)" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": {}, 617 | "outputs": [], 618 | "source": [] 619 | } 620 | ], 621 | "metadata": { 622 | "kernelspec": { 623 | "display_name": "Python 3", 624 | "language": "python", 625 | "name": "python3" 626 | }, 627 | "language_info": { 628 | "codemirror_mode": { 629 | "name": "ipython", 630 | "version": 3 631 | }, 632 | "file_extension": ".py", 633 | "mimetype": "text/x-python", 634 | "name": "python", 635 | "nbconvert_exporter": "python", 636 | "pygments_lexer": "ipython3", 637 | "version": "3.7.3" 638 | } 639 | }, 640 | "nbformat": 4, 641 | "nbformat_minor": 2 642 | } 643 | -------------------------------------------------------------------------------- /DeepMemory/README.md: -------------------------------------------------------------------------------- 1 | DeepMemory is a new optimizer I came up with after blending DiffGrad + AdaMod. The core concept is to provide the optimizer with 2 | long term memory of the previous step sizes. 3 | 4 | Results in initial testing put it on par with Ranger and both Ranger and DeepMemory topped the recent testing I did with about 8 different optimizers. 5 | 6 | 7 | DeepMemory is designed to offset the weakness of many adaptive optimizers by creating a 'long term' memory of the gradients over the course of an epoch. 8 | This long term memory is averaged against the current adaptive step size generated from the current mini-batch in order to help guide the step size more optimally. 9 | 10 | DeepMemory also keeps a short term gradient buffer that was developed in diffgrad, and locks down the step size when minimal gradient change is detected. 
11 | 12 | 1/1/2020 - @lessw2020 developed the long term memory concept as a blended average (vs max throttle in AdaMod), and created and tested deep Memory 13 | credits: 14 | DiffGrad: Uses the local gradient friction clamp developed by DiffGrad, but with version 1 coded by lessw from the paper: 15 | https://github.com/shivram1987/diffGrad (S.R.Dubey et al) 16 | 17 | AdaMod - DeepMemory builds on the concepts for longer term monitoring in AdaMod (b3 concept but changed from min throttling to blended average and changed input to len_memory and size): 18 | 19 | AdaMod source and paper link - https://github.com/lancopku/AdaMod/blob/master/adamod/adamod.py 20 | 21 | modifications @lessw2020 22 | 1/1/20 = instead of b3, change to 'len_memory' and compute b3 (.99 is really 100 memory as 1-(1/100)= .99) 23 | -------------------------------------------------------------------------------- /DeepMemory/deepmemory.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.optim import Optimizer 4 | 5 | # DeepMemory is designed to offset the weakness of many adaptive optimizers by creating a 'long term' memory of the gradients over the course of an epoch. 6 | # This long term memory is averaged against the current adaptive step size generated from the current mini-batch in order to help guide the step size more optimally. 7 | 8 | # DeepMemory also keeps a short term gradient buffer that was developed in diffgrad, and locks down the step size when minimal gradient change is detected. 
class DeepMemory(Optimizer):
    """Implements the DeepMemory algorithm (DiffGrad + AdaMod concepts) with
    decoupled weight decay (arxiv.org/abs/1711.05101).

    DeepMemory keeps a 'long term' memory of the per-parameter step sizes
    (an exponential moving average, AdaMod-style) and blends it 50/50 with
    the step size computed from the current mini-batch.  It also keeps the
    previous gradient (DiffGrad-style) and throttles the step size when
    minimal gradient change is detected.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 4e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        len_memory (int, optional): length of the step-size memory; the AdaMod
            smoothing coefficient b3 is computed as 1 - 1/len_memory, so the
            memory average (with b3) is averaged with the immediate gradient
            step.  Should be close or equal to batches per epoch (default: 200)
        version (int, optional): 0 means .5 clamping rate (|diff| through a
            sigmoid), 1 = 0-1 clamping rate (signed diff, from DiffGrad)
            (default: 1)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-6)
        weight_decay (float, optional): decoupled (AdamW-style) weight decay
            (default: 0)
        debug_print (bool, optional): print per-step step sizes (default: False)
    """

    def __init__(self, params, lr=4e-3, betas=(0.9, 0.999), len_memory=200, version=1,
                 eps=1e-6, weight_decay=0, debug_print=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if version not in (0, 1):
            # Fail fast: previously an unknown version left `dfc` undefined
            # and produced a NameError deep inside step().
            raise ValueError("Invalid version: {} (expected 0 or 1)".format(version))

        # compute b3 from the memory length (.99 is really 100 memory as 1-(1/100) = .99)
        base = 1 / len_memory
        beta3 = 1 - base
        print(f"DeepMemory: length of memory is {len_memory} - this should be close or equal to batches per epoch")

        # debugging
        self.debug_print = debug_print

        if not 0.0 <= beta3 < 1.0:
            raise ValueError("Invalid len_memory parameter: {}".format(beta3))

        defaults = dict(lr=lr, betas=betas, beta3=beta3, eps=eps,
                        weight_decay=weight_decay)
        super().__init__(params, defaults)

        self.version = version

    def __setstate__(self, state):
        super().__setstate__(state)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError(
                        'DeepMemory does not support sparse gradients')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    # Exponential moving average of actual learning rates
                    state['exp_avg_lr'] = torch.zeros_like(p.data)
                    # Previous gradient (for the DiffGrad friction clamp)
                    state['previous_grad'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq, exp_avg_lr = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_lr']
                previous_grad = state['previous_grad']
                beta1, beta2 = group['betas']

                state['step'] += 1

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                # compute diffgrad coefficient (dfc): sigmoid of the gradient
                # change, used to throttle the step when gradients are stable
                if self.version == 0:
                    diff = torch.abs(previous_grad - grad)
                else:  # version 1 (validated in __init__)
                    diff = previous_grad - grad
                dfc = 1. / (1. + torch.exp(-diff))

                # clone() is required: storing a bare reference would alias
                # p.grad.data, so after the next backward() previous_grad and
                # grad would be the same tensor (diff always 0, dfc stuck at .5)
                state['previous_grad'] = grad.clone()

                # decoupled (AdamW-style) weight decay
                if group['weight_decay'] != 0:
                    p.data.add_(p.data, alpha=-group['weight_decay'] * group['lr'])

                # create long term memory of actual learning rates (from AdaMod)
                step_size = torch.full_like(denom, step_size)
                step_size.div_(denom)
                exp_avg_lr.mul_(group['beta3']).add_(step_size, alpha=1 - group['beta3'])

                if self.debug_print:
                    print(f"batch step size {step_size} and exp_avg_step {exp_avg_lr}")

                # Blend the mini-batch step size with long term memory
                step_size = step_size.add(exp_avg_lr)
                step_size = step_size.div(2.)

                # update momentum with dfc, then apply the final step
                exp_avg1 = exp_avg * dfc

                step_size.mul_(exp_avg1)

                p.data.add_(-step_size)

        return loss
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Best-Deep-Learning-Optimizers
2 | Collection of the latest, greatest, deep learning optimizers (for Pytorch) - CNN, Transformer, NLP suitable 3 |

4 | Current top performers = Have not run benchmarks lately and a lot has changed. Quick recommendations = transformer or CNN = madgrad / adahessian. For CNN only, Ranger. 5 |

6 | ## Updates - 7 | April 2021: Meet Madgrad!
Have added Madgrad with an improvement to weight decay. Madgrad is a new optimizer released by FB AI in February. In testing with transformers for image classification, madgrad blew away the various Adam variants.
8 | However, as spotted by @nestordemeure, the weight decay impl was like adam instead of adamW. 9 | In testing, AdamW style weight decay was the winner and thus the implementation here is with my modification to use AdamW style wd. 10 | 11 | Recommendations: test with
a)no weight decay, recommended by Madgrad authors and
b)weight decay at same level you would use for AdamW with this madgrad_wd version. 12 |
13 | Important: madgrad is very different than Adam variants...thus recommend you start with madgrad default lr and do quick range of lr tests. Do not just use what worked for you on your dataset with Adam(sh) lr. 14 | 15 | Modified madgrad is here: https://github.com/lessw2020/Best-Deep-Learning-Optimizers/tree/master/madgrad 16 | 17 | And original madgrad is here: https://github.com/facebookresearch/madgrad 18 | 19 | Pending work = there is a new paper discussing Stable Weight Decay as being the ultimate weight decay. Planning to implement and test with madgrad soon. 20 | 21 | August 2020 - AdaHessian, the first 'it really works and works really well' second order optimizer added: 22 | I tested AdaHessian last month on work datasets and it performed extremely well. It's like training with a guided missile compared to most other optimizers. 23 | The big caveat is you will need about 2x the normal GPU memory to run it vs running with a 'first order' optimizer. 24 | I am trying to get a Titan GPU with 24GB GPU memory just for this purpose atm. 25 | 26 | 27 | new version of Ranger with highest accuracy to date for all optimizers tested: 28 | April 11 - New version of Ranger released (20.4.11), highest score for accuracy to date. 29 |
Ranger has been upgraded to use Gradient Centralization. See: https://arxiv.org/abs/2004.01461 and github: https://github.com/Yonghongwei/Gradient-Centralization 30 | 31 | It will now use GC by default, and run it for both conv layers and fc layers. You can turn it on or off with "use_gc" at init to test out the difference on your datasets. 32 | ![](images/projected_gradient.png) 33 | (image from gc github). 34 |
The summary of gradient centralization: "GC can be viewed as a projected gradient descent method with a constrained loss function. The Lipschitzness of the constrained loss function and its gradient is better so that the training process becomes more efficient and stable." 35 |
36 | 37 | Note - for optimal accuracy, make sure you run with a flat lr for some time and then cosine descent the lr (72% - 28% descent), or if you don't have an lr framework... very comparable results by running at one rate for 75%, then stop and decrease lr, and run the remaining 25%. 38 | 39 | ## Usage - GC on by default but you can control all aspects at init: 40 | ![](images/ranger-with-gc-options.jpg) 41 |
42 | ## Ranger will print settings at first init so you can confirm optimization is set the way you want it: 43 | ![](images/ranger-init.jpg) 44 | 45 |
Future work: MARTHE, HyperAdam and other optimizers will be tested and posted if they look good. 46 | 47 |
48 | 12/27 - added DiffGrad, and unofficial version 1 support (coded from the paper). 49 |
50 | 12/28 - added Diff_RGrad = diffGrad + Rectified Adam to start off....seems to work quite well. 51 | 52 | Medium article (summary and FastAI example usage): 53 | https://medium.com/@lessw/meet-diffgrad-new-deep-learning-optimizer-that-solves-adams-overshoot-issue-ec63e28e01b2 54 | 55 | Official diffGrad paper: https://arxiv.org/abs/1909.11015v2 56 | 57 | 12/31 - AdaMod and DiffMod added. Initial SLS files added (but more work needed). 58 | 59 | 60 | In Progress:

61 | A - Parabolic Approximation Line Search: https://arxiv.org/abs/1903.11991v2 62 | 63 | B - Stochastic Line Search (SLS): pending (needs param group support) 64 | 65 | c - AvaGrad 66 | 67 | 68 | General papers of relevance: 69 | 70 | Does Adam stick close to the optimal point? https://arxiv.org/abs/1911.00289v1 71 | 72 | 73 | Probabalistic line searches for stochastic optimization (2017, matlab only but good theory work): https://arxiv.org/abs/1703.10034v2 74 | -------------------------------------------------------------------------------- /Ranger/ranger.py: -------------------------------------------------------------------------------- 1 | # Ranger deep learning optimizer - RAdam + Lookahead + Gradient Centralization, combined into one optimizer. 2 | 3 | # https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer 4 | # and/or 5 | # https://github.com/lessw2020/Best-Deep-Learning-Optimizers 6 | 7 | # Ranger has now been used to capture 12 records on the FastAI leaderboard. 8 | 9 | # This version = 20.4.11 10 | 11 | # Credits: 12 | # Gradient Centralization --> https://arxiv.org/abs/2004.01461v2 (a new optimization technique for DNNs), github: https://github.com/Yonghongwei/Gradient-Centralization 13 | # RAdam --> https://github.com/LiyuanLucasLiu/RAdam 14 | # Lookahead --> rewritten by lessw2020, but big thanks to Github @LonePatient and @RWightman for ideas from their code. 15 | # Lookahead paper --> MZhang,G Hinton https://arxiv.org/abs/1907.08610 16 | 17 | # summary of changes: 18 | # 4/11/20 - add gradient centralization option. Set new testing benchmark for accuracy with it, toggle with use_gc flag at init. 19 | # full code integration with all updates at param level instead of group, moves slow weights into state dict (from generic weights), 20 | # supports group learning rates (thanks @SHolderbach), fixes sporadic load from saved model issues. 
# changes 8/31/19 - fix references to *self*.N_sma_threshold;
# changed eps to 1e-5 as better default than 1e-8.

import math
import torch
from torch.optim.optimizer import Optimizer, required


class Ranger(Optimizer):
    """Ranger optimizer: RAdam + Lookahead + (optional) Gradient Centralization.

    Arguments:
        params: iterable of parameters to optimize, or dicts defining parameter groups.
        lr (float): learning rate (default: 1e-3).
        alpha (float): Lookahead slow-weights interpolation factor in [0, 1] (default: 0.5).
        k (int): Lookahead synchronization period, in steps (default: 6).
        N_sma_threshhold (int): RAdam variance-rectification threshold (default: 5).
            (parameter name keeps the historical double-h spelling for compatibility)
        betas (Tuple[float, float]): Adam-style coefficients (default: (.95, 0.999)).
        eps (float): term added to the denominator for numerical stability (default: 1e-5).
        weight_decay (float): L2 penalty (default: 0).
        use_gc (bool): enable Gradient Centralization (default: True).
        gc_conv_only (bool): apply GC only to conv-style tensors (dim > 3) instead of
            conv + fc (dim > 1) (default: False).
    """

    def __init__(self, params, lr=1e-3,                         # lr
                 alpha=0.5, k=6, N_sma_threshhold=5,            # Ranger options
                 betas=(.95, 0.999), eps=1e-5, weight_decay=0,  # Adam options
                 use_gc=True, gc_conv_only=False                # Gradient centralization on or off, applied to conv layers only or conv + fc layers
                 ):

        # parameter checks
        if not 0.0 <= alpha <= 1.0:
            raise ValueError(f'Invalid slow update rate: {alpha}')
        if not 1 <= k:
            raise ValueError(f'Invalid lookahead steps: {k}')
        if not lr > 0:
            raise ValueError(f'Invalid Learning Rate: {lr}')
        if not eps > 0:
            raise ValueError(f'Invalid eps: {eps}')

        # parameter comments:
        # beta1 (momentum) of .95 seems to work better than .90...
        # N_sma_threshold of 5 seems better in testing than 4.
        # In both cases, worth testing on your dataset (.90 vs .95, 4 vs 5) to make sure which works best for you.

        # prep defaults and init torch.optim base
        defaults = dict(lr=lr, alpha=alpha, k=k, step_counter=0, betas=betas,
                        N_sma_threshhold=N_sma_threshhold, eps=eps, weight_decay=weight_decay)
        super().__init__(params, defaults)

        # adjustable threshold
        self.N_sma_threshhold = N_sma_threshhold

        # look ahead params
        self.alpha = alpha
        self.k = k

        # radam buffer for state
        self.radam_buffer = [[None, None, None] for ind in range(10)]

        # gc on or off
        self.use_gc = use_gc

        # level of gradient centralization:
        # grad.dim() must exceed this for GC to apply (3 -> conv only, 1 -> conv + fc)
        self.gc_gradient_threshold = 3 if gc_conv_only else 1

        print(f"Ranger optimizer loaded. \nGradient Centralization usage = {self.use_gc}")
        if (self.use_gc and self.gc_gradient_threshold == 1):
            print(f"GC applied to both conv and fc layers")
        elif (self.use_gc and self.gc_gradient_threshold == 3):
            print(f"GC applied to conv layers only")

    def __setstate__(self, state):
        print("set state called")
        super(Ranger, self).__setstate__(state)

    def step(self, closure=None):
        """Performs a single optimization step.

        Note: `closure` is accepted for API compatibility but is intentionally NOT
        invoked here (the author's training loop passes the loss back as a float,
        not a callable). Uncomment the two lines below for standard closure behavior.
        """
        loss = None
        # if closure is not None:
        #     loss = closure()

        # Evaluate averages and grad, update param tensors
        for group in self.param_groups:

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data.float()

                if grad.is_sparse:
                    raise RuntimeError('Ranger optimizer does not support sparse gradients')

                p_data_fp32 = p.data.float()

                state = self.state[p]  # get state dict for this param

                if len(state) == 0:  # if first time to run...init dictionary with our desired entries
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)

                    # look ahead weight storage now in state dict
                    state['slow_buffer'] = torch.empty_like(p.data)
                    state['slow_buffer'].copy_(p.data)

                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)

                # begin computations
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                # GC operation for Conv layers and FC layers:
                # center the gradient over all dims except dim 0.
                # FIX: the use_gc flag was previously ignored here, so GC ran even
                # when the user passed use_gc=False at init.
                if self.use_gc and grad.dim() > self.gc_gradient_threshold:
                    grad.sub_(grad.mean(dim=tuple(range(1, grad.dim())), keepdim=True))

                state['step'] += 1

                # compute variance mov avg
                # FIX: keyword form (value=/alpha=) -- the positional-scalar overloads
                # of addcmul_/add_/addcdiv_ were removed from modern PyTorch.
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                # compute mean moving avg
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)

                buffered = self.radam_buffer[int(state['step'] % 10)]

                if state['step'] == buffered[0]:
                    N_sma, step_size = buffered[1], buffered[2]
                else:
                    buffered[0] = state['step']
                    beta2_t = beta2 ** state['step']
                    N_sma_max = 2 / (1 - beta2) - 1
                    N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
                    buffered[1] = N_sma
                    if N_sma > self.N_sma_threshhold:
                        # RAdam variance-rectified step size
                        step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
                    else:
                        # fall back to un-adapted (SGD-with-momentum-like) step size
                        step_size = 1.0 / (1 - beta1 ** state['step'])
                    buffered[2] = step_size

                if group['weight_decay'] != 0:
                    p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])

                # apply lr
                if N_sma > self.N_sma_threshhold:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size * group['lr'])
                else:
                    p_data_fp32.add_(exp_avg, alpha=-step_size * group['lr'])

                p.data.copy_(p_data_fp32)

                # integrated look ahead...
                # we do it at the param level instead of group level
                if state['step'] % group['k'] == 0:
                    slow_p = state['slow_buffer']  # get access to slow param tensor
                    slow_p.add_(p.data - slow_p, alpha=self.alpha)  # (fast weights - slow weights) * alpha
                    p.data.copy_(slow_p)  # copy interpolated weights to RAdam param tensor

        return loss

# --------------------------------------------------------------------------
# /adahessian/README.md:
# --------------------------------------------------------------------------
# adahessian is the first 'second order' optimizer that actually performs (and does so extremely well) on real data.
# The big drawback is you'll need to have about 2x the GPU memory that you would otherwise need to run.
#
# The official github for adahessian is here:
# https://github.com/amirgholami/adahessian
#
# In the implementation here, I've consolidated it into a single file import instead of the util + optim file like in the official repo to make it easier to use.
#
# Note that you have to update your training loop as below:
# # usage example:
# from adahessian import Adahessian, get_params_grad
# import torch.optim.lr_scheduler as lr_scheduler
# #
# optimizer = Adahessian(model.parameters(), lr=.15)
# scheduler = lr_scheduler.MultiStepLR(
#     optimizer,
#     [30,45], #
#     gamma=.1,
#     last_epoch=-1)
#
# #
# # config for training loop:
# #
# loss.backward(create_graph=True)
# _, gradsH = get_params_grad(model)
# optimizer.step(gradsH)
#
# --------------------------------------------------------------------------
# /adahessian/adahessian.py:
# --------------------------------------------------------------------------
#*
# @file Different utility functions
# Copyright (c) Zhewei Yao, Amir Gholami, Sheng Shen
# All rights reserved.
# This file is part of AdaHessian library.
# source: https://github.com/amirgholami/adahessian
#
# AdaHessian is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# AdaHessian is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with adahessian. If not, see .
#*

import math
import torch
from torch.optim.optimizer import Optimizer
from copy import deepcopy
import numpy as np


# imported from utils to avoid needing two imports... @lessw2020
def get_params_grad(model):
    """Return the model's trainable parameters and their gradients.

    Parameters with requires_grad=False are skipped. A parameter whose
    .grad is still None contributes the float 0. instead of a tensor;
    `param.grad + 0.` keeps the autograd graph alive for get_trace().
    """
    params = []
    grads = []
    for param in model.parameters():
        if not param.requires_grad:
            continue
        params.append(param)
        grads.append(0. if param.grad is None else param.grad + 0.)
    return params, grads


class Adahessian(Optimizer):
    """Implements Adahessian algorithm.
    It has been proposed in `ADAHESSIAN: An Adaptive Second Order Optimizer for Machine Learning`.
    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 0.15)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-4)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        hessian_power (float, optional): Hessian power (default: 1)
    """

    def __init__(self, params, lr=0.15, betas=(0.9, 0.999), eps=1e-4,
                 weight_decay=0, hessian_power=1):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError(
                "Invalid beta parameter at index 0: {}".format(
                    betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError(
                "Invalid beta parameter at index 1: {}".format(
                    betas[1]))
        if not 0.0 <= hessian_power <= 1.0:
            raise ValueError("Invalid Hessian power value: {}".format(hessian_power))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, hessian_power=hessian_power)

        super(Adahessian, self).__init__(params, defaults)

    def get_trace(self, gradsH):
        """Compute a Hutchinson estimate of the Hessian diagonal.

        Draws a Rademacher vector v (entries +/-1) per parameter and obtains the
        Hessian-vector product Hv by differentiating gradsH; |Hv * v| estimates
        the (blockwise-averaged) Hessian diagonal.

        :param gradsH: a list of torch variables (gradients carrying a graph)
        :return: a list of torch tensors, one diagonal estimate per parameter
        """

        # NOTE(review): only the first param group is used here, while step()
        # iterates all groups -- with multiple param groups hut_trace would
        # misalign; confirm callers use a single group.
        params = self.param_groups[0]['params']

        # FIX: draw v on each parameter's own device (was hard-coded
        # device='cuda', which crashed on CPU-only runs).
        v = [torch.randint_like(p, high=2, device=p.device) for p in params]
        for v_i in v:
            v_i[v_i == 0] = -1  # map {0,1} -> {-1,+1} (Rademacher)
        hvs = torch.autograd.grad(
            gradsH,
            params,
            grad_outputs=v,
            only_inputs=True,
            retain_graph=True)

        hutchinson_trace = []
        for hv, vi in zip(hvs, v):
            param_size = hv.size()
            if len(param_size) <= 2:  # for 0/1/2D tensor
                tmp_output = torch.abs(hv * vi)
                hutchinson_trace.append(tmp_output)  # Hessian diagonal block size is 1 here.
            elif len(param_size) == 4:  # Conv kernel
                tmp_output = torch.abs(torch.sum(torch.abs(
                    hv * vi), dim=[2, 3], keepdim=True)) / vi[0, 1].numel()  # Hessian diagonal block size is 9 here: torch.sum() reduces the dim 2/3.
                hutchinson_trace.append(tmp_output)
            else:
                # FIX: 3D (and 5D+) tensors were previously skipped silently,
                # which shifted every later entry of hutchinson_trace relative
                # to the params list indexed in step(). Fall back to the
                # element-wise (block size 1) estimate for those shapes.
                hutchinson_trace.append(torch.abs(hv * vi))

        return hutchinson_trace

    def step(self, gradsH, closure=None):
        """Performs a single optimization step.
        Arguments:
            gradsH: The gradient used to compute Hessian vector product.
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        # get the Hessian diagonal
        hut_trace = self.get_trace(gradsH)

        for group in self.param_groups:
            for i, p in enumerate(group['params']):
                if p.grad is None:
                    continue

                grad = deepcopy(gradsH[i].data)
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of Hessian diagonal square values
                    state['exp_hessian_diag_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_hessian_diag_sq = state['exp_avg'], state['exp_hessian_diag_sq']

                beta1, beta2 = group['betas']

                state['step'] += 1

                # Decay the first and second moment running average coefficient
                # FIX: keyword form (alpha=/value=) -- the positional-scalar
                # overloads of add_/addcmul_ were removed from modern PyTorch.
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_hessian_diag_sq.mul_(beta2).addcmul_(
                    hut_trace[i], hut_trace[i], value=1 - beta2)

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                # make the square root, and the Hessian power
                k = group['hessian_power']
                denom = (
                    (exp_hessian_diag_sq.sqrt() ** k) /
                    math.sqrt(bias_correction2) ** k).add_(
                    group['eps'])

                # make update (Adam-style step with Hessian-diagonal denominator
                # and coupled L2 weight decay)
                p.data = p.data - \
                    group['lr'] * (exp_avg / bias_correction1 / denom + group['weight_decay'] * p.data)

        return loss

# --------------------------------------------------------------------------
# /adamod/README.md:
# --------------------------------------------------------------------------
# AdaMod is a new optimizer that takes Adam but adds an exponential moving average of the adaptive learning rates.
# This ensures no large spikes during training and helps achieve faster and better convergence.
3 | 4 | Original source code and paper: https://github.com/lancopku/AdaMod 5 | 6 | DiffMod is a combination of DiffGrad + AdaMod = diffgrad. 7 | 8 | Currently DiffMod, using version 0 of DiffGrad, appears to be the best performer of all. But more testing is needed.
9 | 10 | Usage:
11 | from diffmod import DiffMod
12 | optar = partial(DiffMod,version=0)
13 | learn = Learner(data, model, metrics=[accuracy], wd=1e-3,
14 | opt_func=optar,
15 | bn_wd=False, true_wd=True,
16 | loss_func = LabelSmoothingCrossEntropy())
# --------------------------------------------------------------------------
# /adamod/adamod.py:
# --------------------------------------------------------------------------
import math
import torch
from torch.optim import Optimizer

# source - https://github.com/lancopku/AdaMod/blob/master/adamod/adamod.py
# modification - lessw2020 - use len_memory as integer lookback, convert to beta3 for easier usage


class AdaMod(Optimizer):
    """Implements AdaMod algorithm with Decoupled Weight Decay (arxiv.org/abs/1711.05101)
    It has been proposed in `Adaptive and Momental Bounds for Adaptive Learning Rate Methods`_.
    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        len_memory (int, optional): lookback window for the learning-rate EMA;
            converted internally to beta3 = 1 - 1/len_memory (default: 1000)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999),
                 len_memory=1000,  # will convert to beta3
                 eps=1e-8, weight_decay=0):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))

        beta3 = 1 - (1 / len_memory)
        print(f"AdaMod optimizer: len_memory of {len_memory} set at Beta3 of {beta3}")
        if not 0.0 <= beta3 < 1.0:
            raise ValueError("Invalid beta3 parameter: {}".format(beta3))
        defaults = dict(lr=lr, betas=betas, beta3=beta3, eps=eps,
                        weight_decay=weight_decay)
        super().__init__(params, defaults)

    def __setstate__(self, state):
        super().__setstate__(state)

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError(
                        'AdaMod does not support sparse gradients')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    # Exponential moving average of actual learning rates
                    state['exp_avg_lr'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq, exp_avg_lr = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_lr']
                beta1, beta2 = group['betas']

                state['step'] += 1

                # Decay the first and second moment running average coefficient
                # FIX: keyword form (alpha=/value=) -- the positional-scalar
                # overloads of add_/addcmul_ were removed from modern PyTorch.
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                if group['weight_decay'] != 0:
                    p.data.add_(p.data, alpha=-group['weight_decay'] * group['lr'])

                # Applies momental bounds on actual learning rates:
                # clamp the per-element rate by its own running average.
                step_size = torch.full_like(denom, step_size)
                step_size.div_(denom)
                exp_avg_lr.mul_(group['beta3']).add_(step_size, alpha=1 - group['beta3'])
                step_size = torch.min(step_size, exp_avg_lr)
                step_size.mul_(exp_avg)

                p.data.add_(-step_size)

        return loss

# --------------------------------------------------------------------------
# /adamod/diffmod.py:
# --------------------------------------------------------------------------

# source and paper link - https://github.com/lancopku/AdaMod/blob/master/adamod/adamod.py

# modifications @lessw2020 - blend diffGrad + AdaMod = diffmod.
# 1/1/20 = instead of b3, change to 'len_memory' and compute b3 (.99 is really 100 memory as 1-(1/100)= .99)


class DiffMod(Optimizer):
    """Implements AdaMod algorithm with Decoupled Weight Decay (arxiv.org/abs/1711.05101)
    blended with the diffGrad friction coefficient (dfc).
    It has been proposed in `Adaptive and Momental Bounds for Adaptive Learning Rate Methods`_.
    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        len_memory (int, optional): b3 in easier-to-use format; specify the memory
            length and beta3 = 1 - 1/len_memory is computed (default: 1000)
        version (int, optional): dfc formula -- 0 = paper version (abs diff),
            1 = signed diff, 2 = scaled abs diff with wider range (default: 0)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), len_memory=1000, version=0,
                 eps=1e-8, weight_decay=0):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        # FIX: validate version up front; unknown values previously caused a
        # NameError ('diff' undefined) in the middle of step().
        if version not in (0, 1, 2):
            raise ValueError("Invalid version: {} (expected 0, 1 or 2)".format(version))

        # compute b3
        beta3 = 1 - (1 / len_memory)
        print(f"length of memory is ", len_memory, " and b3 is thus ", beta3)

        if not 0.0 <= beta3 < 1.0:
            raise ValueError("Invalid beta3 parameter: {}".format(beta3))

        defaults = dict(lr=lr, betas=betas, beta3=beta3, eps=eps,
                        weight_decay=weight_decay)
        super().__init__(params, defaults)

        self.version = version

    def __setstate__(self, state):
        super().__setstate__(state)

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError(
                        'DiffMod does not support sparse gradients')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    # Exponential moving average of actual learning rates
                    state['exp_avg_lr'] = torch.zeros_like(p.data)
                    # Previous gradient
                    state['previous_grad'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq, exp_avg_lr = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_lr']
                previous_grad = state['previous_grad']
                beta1, beta2 = group['betas']

                state['step'] += 1

                # Decay the first and second moment running average coefficient
                # FIX: keyword form (alpha=/value=) -- the positional-scalar
                # overloads of add_/addcmul_ were removed from modern PyTorch.
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                # compute diffgrad coefficient (dfc)
                if self.version == 0:
                    diff = abs(previous_grad - grad)
                elif self.version == 1:
                    diff = previous_grad - grad
                elif self.version == 2:
                    diff = .5 * abs(previous_grad - grad)

                if self.version == 0 or self.version == 1:
                    dfc = 1. / (1. + torch.exp(-diff))
                elif self.version == 2:
                    dfc = 9. / (1. + torch.exp(-diff)) - 4  # DFC2 = 9/(1+e-(.5/g/)-4 #range .5,5

                # FIX: store a snapshot of the gradient. The original kept a
                # reference to p.grad.data, which is updated in place by the
                # training loop, so previous_grad always equalled the current
                # grad (diff == 0, dfc stuck at 0.5).
                state['previous_grad'] = grad.clone()

                if group['weight_decay'] != 0:
                    p.data.add_(p.data, alpha=-group['weight_decay'] * group['lr'])

                # Applies momental bounds on actual learning rates
                step_size = torch.full_like(denom, step_size)
                step_size.div_(denom)
                exp_avg_lr.mul_(group['beta3']).add_(step_size, alpha=1 - group['beta3'])
                step_size = torch.min(step_size, exp_avg_lr)

                # update momentum with dfc
                exp_avg1 = exp_avg * dfc

                step_size.mul_(exp_avg1)

                p.data.add_(-step_size)

        return loss

# --------------------------------------------------------------------------
# /diffgrad/README.md:
# --------------------------------------------------------------------------
# DiffGrad adjusts the step size for each parameter by comparing the current gradient vs the previous. It is designed to solve the 'Adam'
# overshoot problem, where the momentum of Adam can carry it right over the global minimum.
#
# https://github.com/shivram1987/diffGrad for original source
#
# and paper: https://arxiv.org/abs/1909.11015v2
#
# (TF version - if you are forced to use TF, here's a TF version of diffgrad:
# https://github.com/evanatyourservice/diffGrad-tf )
#
# This version adds in a version parameter: version 0 is the main one used in the paper. version 1 removes the abs value from the calculations and
# allows faster clamping.
# Use: version=1 in your optimizer params. version=0 is default.
#
# 12/27 - added DiffRGrad - this is diffGrad with Rectified Adam to start. Thus no warmup needed and diffGrad kicks in after Rectified Adam says variance is ready to go.
# (end of /diffgrad/README.md)
# Medium article and example usage: https://medium.com/@lessw/meet-diffgrad-new-deep-learning-optimizer-that-solves-adams-overshoot-issue-ec63e28e01b2
# --------------------------------------------------------------------------
# /diffgrad/diff_rgrad.py:
# --------------------------------------------------------------------------
import math
import torch
from torch.optim.optimizer import Optimizer, required

# Original source: DiffGrad: https://github.com/shivram1987/diffGrad/blob/master/diffGrad.py
# RAam: https://github.com/LiyuanLucasLiu/RAdam/blob/master/radam.py
# modifications: @lessw2020 - blend RAdam with DiffGrad and add version options
# __version__: 12.27.19


class diffRGrad(Optimizer):
    """diffGrad + Rectified Adam (RAdam).

    RAdam's variance rectification replaces the warmup; once the variance
    estimate is tractable (N_sma >= 5), the diffGrad friction coefficient
    (dfc) is applied to the momentum term.

    Arguments:
        params: iterable of parameters to optimize or dicts defining parameter groups.
        lr (float): learning rate (default: 1e-3).
        betas (Tuple[float, float]): Adam-style coefficients (default: (0.9, 0.999)).
        eps (float): denominator fuzz factor (default: 1e-8).
        version (int): dfc formula -- 0 = paper version (abs diff), 1 = signed diff,
            2 = scaled abs diff with wider range (default: 1).
        weight_decay (float): L2 penalty (default: 0).
        degenerated_to_sgd (bool): fall back to an SGD-like step while the
            variance is still intractable (default: True).
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 version=1,
                 weight_decay=0, degenerated_to_sgd=True):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        # FIX: validate version up front; unknown values previously caused a
        # NameError ('diff' undefined) in the middle of step().
        if version not in (0, 1, 2):
            raise ValueError("Invalid version: {} (expected 0, 1 or 2)".format(version))

        self.degenerated_to_sgd = degenerated_to_sgd

        self.version = version

        # groups with their own betas get their own RAdam lookup buffer
        if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict):
            for param in params:
                if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]):
                    param['buffer'] = [[None, None, None] for _ in range(10)]
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
                        buffer=[[None, None, None] for _ in range(10)])
        super(diffRGrad, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(diffRGrad, self).__setstate__(state)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('diffGRad does not support sparse gradients')

                p_data_fp32 = p.data.float()

                state = self.state[p]

                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
                    # Previous gradient
                    state['previous_grad'] = torch.zeros_like(p_data_fp32)

                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
                    state['previous_grad'] = state['previous_grad'].type_as(p_data_fp32)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                previous_grad = state['previous_grad']
                beta1, beta2 = group['betas']

                # FIX: keyword form (alpha=/value=) -- the positional-scalar
                # overloads of addcmul_/add_/addcdiv_ were removed from modern PyTorch.
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)

                state['step'] += 1

                # compute diffgrad coefficient (dfc)
                if self.version == 0:
                    diff = abs(previous_grad - grad)
                elif self.version == 1:
                    diff = previous_grad - grad
                elif self.version == 2:
                    diff = .5 * abs(previous_grad - grad)

                if self.version == 0 or self.version == 1:
                    dfc = 1. / (1. + torch.exp(-diff))
                elif self.version == 2:
                    dfc = 9. / (1. + torch.exp(-diff)) - 4  # DFC2 = 9/(1+e-(.5/g/)-4 #range .5,5

                # FIX: store a snapshot of the gradient. When p.grad is already
                # float32, .float() returns the same tensor, so the original
                # stored a live reference that the training loop overwrites in
                # place -- previous_grad then always equalled the current grad.
                state['previous_grad'] = grad.clone()

                buffered = group['buffer'][int(state['step'] % 10)]
                if state['step'] == buffered[0]:
                    N_sma, step_size = buffered[1], buffered[2]
                else:
                    buffered[0] = state['step']
                    beta2_t = beta2 ** state['step']
                    N_sma_max = 2 / (1 - beta2) - 1
                    N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
                    buffered[1] = N_sma

                    # more conservative since it's an approximated value
                    if N_sma >= 5:
                        step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
                    elif self.degenerated_to_sgd:
                        step_size = 1.0 / (1 - beta1 ** state['step'])
                    else:
                        step_size = -1  # sentinel: skip the update entirely
                    buffered[2] = step_size

                # more conservative since it's an approximated value
                if N_sma >= 5:
                    if group['weight_decay'] != 0:
                        p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])

                    denom = exp_avg_sq.sqrt().add_(group['eps'])

                    # update momentum with dfc
                    exp_avg1 = exp_avg * dfc.float()

                    p_data_fp32.addcdiv_(exp_avg1, denom, value=-step_size * group['lr'])
                    p.data.copy_(p_data_fp32)

                elif step_size > 0:
                    # variance not yet tractable: plain (un-adapted) momentum step,
                    # dfc intentionally not applied here
                    if group['weight_decay'] != 0:
                        p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])

                    p_data_fp32.add_(exp_avg, alpha=-step_size * group['lr'])
                    p.data.copy_(p_data_fp32)

        return loss

# --------------------------------------------------------------------------
# /diffgrad/diffgrad.py:
# --------------------------------------------------------------------------

import math
import torch
from torch.optim.optimizer import Optimizer
class DiffGrad(Optimizer):
    r"""Implements the diffGrad algorithm (modified from the PyTorch Adam implementation).

    Proposed in `diffGrad: An Optimization Method for Convolutional Neural Networks`_
    (https://arxiv.org/abs/1909.11015). diffGrad scales Adam's first moment by a
    "friction coefficient" (dfc) derived from the change between successive gradients.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        version (int, optional): dfc variant -- 0: sigmoid of |g_{t-1} - g_t|
            (range (0.5, 1) for growing diff), 1: sigmoid of the signed
            difference, 2: rescaled sigmoid with range (.5, 5) (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)

    .. _diffGrad: An Optimization Method for Convolutional Neural Networks:
        https://arxiv.org/abs/1909.11015
    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, version=0, weight_decay=0):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        # Fail fast: an unsupported version would otherwise surface later as a
        # NameError for `diff` deep inside step().
        if version not in (0, 1, 2):
            raise ValueError("Invalid version: {}".format(version))

        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        super().__init__(params, defaults)

        # save version
        self.version = version

    def __setstate__(self, state):
        super().__setstate__(state)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('diffGrad does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    # Previous gradient
                    state['previous_grad'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq, previous_grad = state['exp_avg'], state['exp_avg_sq'], state['previous_grad']
                beta1, beta2 = group['betas']

                state['step'] += 1

                if group['weight_decay'] != 0:
                    # Classic (Adam-style) L2 penalty folded into the gradient.
                    # `add_(tensor, alpha=...)` replaces the deprecated
                    # `add_(scalar, tensor)` overload removed in modern PyTorch.
                    grad.add_(p.data, alpha=group['weight_decay'])

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                # compute diffgrad friction coefficient (dfc)
                if self.version == 0:
                    diff = abs(previous_grad - grad)
                elif self.version == 1:
                    diff = previous_grad - grad
                else:  # version 2
                    diff = .5 * abs(previous_grad - grad)

                if self.version in (0, 1):
                    # plain sigmoid of the gradient change, range (0, 1)
                    dfc = 1. / (1. + torch.exp(-diff))
                else:
                    # DFC2 = 9/(1 + e^-(.5*|dg|)) - 4 -> range (.5, 5)
                    dfc = 9. / (1. + torch.exp(-diff)) - 4

                # Clone: storing `grad` directly would alias the live gradient
                # buffer, making previous_grad always equal the current grad
                # when gradients are written in place (diff would collapse to 0).
                state['previous_grad'] = grad.clone()

                # update momentum with dfc
                exp_avg1 = exp_avg * dfc

                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                p.data.addcdiv_(exp_avg1, denom, value=-step_size)

        return loss
    conv = nn.Conv1d(ni, no, ks, stride=stride, padding=padding, bias=bias)
    nn.init.kaiming_normal_(conv.weight)
    if bias: conv.bias.data.zero_()
    return spectral_norm(conv)


# Adapted from SelfAttention layer at https://github.com/fastai/fastai/blob/5c51f9eabf76853a89a9bc5741804d2ed4407e49/fastai/layers.py
# Inspired by https://arxiv.org/pdf/1805.08318.pdf
class SimpleSelfAttention(nn.Module):
    """Lightweight self-attention: one spectral-normalized 1x1 conv plus a
    learnable residual gate `gamma` (initialized to 0, so the layer starts
    as the identity mapping)."""

    def __init__(self, n_in:int, ks=1, sym=False):#, n_out:int):
        super().__init__()

        # single conv acting on the flattened spatial dimension
        self.conv = conv1d(n_in, n_in, ks, padding=ks//2, bias=False)

        # residual gate: output = gamma * attention + input
        self.gamma = nn.Parameter(tensor([0.]))

        self.sym = sym
        self.n_in = n_in

    def forward(self,x):
        """Apply attention over the flattened spatial positions of `x` and
        add it back to `x`, preserving the input shape."""

        if self.sym:
            # symmetry hack by https://github.com/mgrankin
            # NOTE(review): this assigns a plain tensor to `self.conv.weight`;
            # it only works because spectral_norm makes `weight` a non-Parameter
            # attribute, and the spectral-norm pre-forward hook recomputes the
            # weight anyway — confirm the symmetrization actually takes effect.
            c = self.conv.weight.view(self.n_in,self.n_in)
            c = (c + c.t())/2
            self.conv.weight = c.view(self.n_in,self.n_in,1)

        size = x.size()
        # assumes x is (batch, channels, *spatial) — TODO confirm; flattened to (B, C, N)
        x = x.view(*size[:2],-1)   # (C,N)

        # changed the order of multiplication to avoid O(N^2) complexity
        # (x*xT)*(W*x) instead of (x*(xT*(W*x)))

        convx = self.conv(x)   # (C,C) * (C,N) = (C,N)   => O(NC^2)
        xxT = torch.bmm(x,x.permute(0,2,1).contiguous())   # (C,N) * (N,C) = (C,C)   => O(NC^2)

        o = torch.bmm(xxT, convx)   # (C,C) * (C,N) = (C,N)   => O(NC^2)

        # gamma starts at 0, so initially o == x (identity)
        o = self.gamma * o + x

        return o.view(*size).contiguous()


__all__ = ['MXResNet', 'mxresnet18', 'mxresnet34', 'mxresnet50', 'mxresnet101', 'mxresnet152']

# or: ELU+init (a=0.54; gain=1.55)
# Module-level activation shared by all conv layers and residual blocks below.
act_fn = Mish() #nn.ReLU(inplace=True)

class Flatten(Module):
    """Flatten all dimensions after the batch dimension."""
    def forward(self, x): return x.view(x.size(0), -1)

def init_cnn(m):
    """Recursively initialize a CNN: zero biases, kaiming-normal conv/linear weights."""
    if getattr(m, 'bias', None) is not None: nn.init.constant_(m.bias, 0)
    if isinstance(m, (nn.Conv2d,nn.Linear)): nn.init.kaiming_normal_(m.weight)
    for l in m.children(): init_cnn(l)
def conv(ni, nf, ks=3, stride=1, bias=False):
    "A `ks` x `ks` `nn.Conv2d` with 'same'-style padding (`ks // 2`)."
    return nn.Conv2d(ni, nf, kernel_size=ks, stride=stride, padding=ks // 2, bias=bias)

def noop(x):
    "Identity function, used in place of an optional layer."
    return x

def conv_layer(ni, nf, ks=3, stride=1, zero_bn=False, act=True):
    "conv -> batchnorm (optionally zero-initialized) -> optional activation."
    norm = nn.BatchNorm2d(nf)
    # zero-init the final BN of a residual branch so the block starts near identity
    nn.init.constant_(norm.weight, 0. if zero_bn else 1.)
    modules = [conv(ni, nf, ks, stride=stride), norm]
    if act:
        modules.append(act_fn)
    return nn.Sequential(*modules)

class ResBlock(Module):
    "Residual block (basic or bottleneck) with optional SimpleSelfAttention."
    def __init__(self, expansion, ni, nh, stride=1, sa=False, sym=False):
        nf, ni = nh * expansion, ni * expansion
        if expansion == 1:
            # basic block: two 3x3 convs
            branch = [conv_layer(ni, nh, 3, stride=stride),
                      conv_layer(nh, nf, 3, zero_bn=True, act=False)]
        else:
            # bottleneck: 1x1 reduce, 3x3, 1x1 expand
            branch = [conv_layer(ni, nh, 1),
                      conv_layer(nh, nh, 3, stride=stride),
                      conv_layer(nh, nf, 1, zero_bn=True, act=False)]
        self.sa = SimpleSelfAttention(nf, ks=1, sym=sym) if sa else noop
        self.convs = nn.Sequential(*branch)
        # TODO: check whether act=True works better
        self.idconv = noop if ni == nf else conv_layer(ni, nf, 1, act=False)
        self.pool = noop if stride == 1 else nn.AvgPool2d(2, ceil_mode=True)

    def forward(self, x):
        return act_fn(self.sa(self.convs(x)) + self.idconv(self.pool(x)))

def filt_sz(recep):
    "Filter count for receptive field `recep`, capped at 64."
    return min(64, 2 ** math.floor(math.log2(recep * 0.75)))
141 | super().__init__( 142 | *stem, 143 | nn.MaxPool2d(kernel_size=3, stride=2, padding=1), 144 | *blocks, 145 | nn.AdaptiveAvgPool2d(1), Flatten(), 146 | nn.Linear(block_szs[-1]*expansion, c_out), 147 | ) 148 | init_cnn(self) 149 | 150 | def _make_layer(self, expansion, ni, nf, blocks, stride, sa=False, sym=False): 151 | return nn.Sequential( 152 | *[ResBlock(expansion, ni if i==0 else nf, nf, stride if i==0 else 1, sa if i in [blocks -1] else False,sym) 153 | for i in range(blocks)]) 154 | 155 | def mxresnet(expansion, n_layers, name, pretrained=False, **kwargs): 156 | model = MXResNet(expansion, n_layers, **kwargs) 157 | if pretrained: 158 | #model.load_state_dict(model_zoo.load_url(model_urls[name])) 159 | print("No pretrained yet for MXResNet") 160 | return model 161 | 162 | me = sys.modules[__name__] 163 | for n,e,l in [ 164 | [ 18 , 1, [2,2,2 ,2] ], 165 | [ 34 , 1, [3,4,6 ,3] ], 166 | [ 50 , 4, [3,4,6 ,3] ], 167 | [ 101, 4, [3,4,23,3] ], 168 | [ 152, 4, [3,8,36,3] ], 169 | ]: 170 | name = f'mxresnet{n}' 171 | setattr(me, name, partial(mxresnet, expansion=e, n_layers=l, name=name)) -------------------------------------------------------------------------------- /diffmod/diffmod.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.optim import Optimizer 4 | 5 | # source and paper link - https://github.com/lancopku/AdaMod/blob/master/adamod/adamod.py 6 | 7 | # modifications @lessw2020 - blend diffGrad + AdaMod = diffmod. 8 | # 1/1/20 = instead of b3, change to 'len_memory' and compute b3 (.99 is really 100 memory as 1-(1/100)= .99) 9 | 10 | 11 | class DiffMod(Optimizer): 12 | """Implements AdaMod algorithm with Decoupled Weight Decay (arxiv.org/abs/1711.05101) 13 | It has been proposed in `Adaptive and Momental Bounds for Adaptive Learning Rate Methods`_. 
class DiffMod(Optimizer):
    """Implements AdaMod with Decoupled Weight Decay (arxiv.org/abs/1711.05101),
    blended with diffGrad's gradient-change friction coefficient.

    AdaMod was proposed in `Adaptive and Momental Bounds for Adaptive Learning
    Rate Methods`_; the dfc term comes from diffGrad (arxiv.org/abs/1909.11015).

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        len_memory (int, optional): easier-to-use form of AdaMod's beta3;
            beta3 is computed as 1 - 1/len_memory (default: 1000 -> 0.999)
        version (int, optional): dfc variant -- 0: sigmoid of |dg|,
            1: sigmoid of signed dg, 2: rescaled sigmoid, range (.5, 5)
            (default: 0)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        average_step (bool, optional): average the raw and smoothed step sizes
            instead of taking their elementwise minimum (default: False)
        debug_print (bool, optional): print per-step diagnostics (default: False)
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), len_memory=1000, version=0,
                 eps=1e-8, weight_decay=0, average_step=False, debug_print=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        # Fail fast: an unsupported version would otherwise surface later as a
        # NameError for `diff` deep inside step().
        if version not in (0, 1, 2):
            raise ValueError("Invalid version: {}".format(version))

        # compute b3 from the requested memory length (e.g. 1000 -> 0.999)
        base = 1 / len_memory
        beta3 = 1 - base
        # single f-string instead of the original mix of f-string + print args
        print(f"DiffMod: length of memory is {len_memory} and b3 is thus {beta3} and base = {base}")

        # debugging / behavior flags
        self.debug_print = debug_print
        self.average_step = average_step
        if self.average_step:
            print("DiffMod: step size and exp avg step will be averaged together.")

        if not 0.0 <= beta3 < 1.0:
            raise ValueError("Invalid beta3 parameter: {}".format(beta3))

        defaults = dict(lr=lr, betas=betas, beta3=beta3, eps=eps,
                        weight_decay=weight_decay)
        super().__init__(params, defaults)

        self.version = version

    def __setstate__(self, state):
        super().__setstate__(state)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError(
                        'DiffMod does not support sparse gradients')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    # Exponential moving average of actual learning rates
                    state['exp_avg_lr'] = torch.zeros_like(p.data)
                    # Previous gradient
                    state['previous_grad'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq, exp_avg_lr = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_lr']
                previous_grad = state['previous_grad']
                beta1, beta2 = group['betas']

                state['step'] += 1

                # Decay the first and second moment running average coefficient.
                # Modern keyword overloads: the positional scalar-first forms
                # (`add_(scalar, tensor)`) were removed from PyTorch.
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                # compute diffgrad friction coefficient (dfc)
                if self.version == 0:
                    diff = abs(previous_grad - grad)
                elif self.version == 1:
                    diff = previous_grad - grad
                else:  # version 2
                    diff = .5 * abs(previous_grad - grad)

                if self.version in (0, 1):
                    dfc = 1. / (1. + torch.exp(-diff))
                else:
                    # DFC2 = 9/(1 + e^-(.5*|dg|)) - 4 -> range (.5, 5)
                    dfc = 9. / (1. + torch.exp(-diff)) - 4

                # Clone so the stored history cannot alias the live grad buffer.
                state['previous_grad'] = grad.clone()

                if group['weight_decay'] != 0:
                    # decoupled weight decay: p <- p * (1 - wd * lr)
                    # (equivalent to the original p.add_(-wd*lr, p))
                    p.data.mul_(1 - group['weight_decay'] * group['lr'])

                # Applies momental bounds on actual learning rates
                step_size = torch.full_like(denom, step_size)
                step_size.div_(denom)
                exp_avg_lr.mul_(group['beta3']).add_(step_size, alpha=1 - group['beta3'])
                if self.debug_print:
                    print(f"batch step size {step_size} and exp_avg_step {exp_avg_lr}")

                if self.average_step:
                    # blend raw and smoothed step sizes
                    step_size = (step_size + exp_avg_lr) / 2.
                else:
                    # AdaMod bound: never step farther than the smoothed rate
                    step_size = torch.min(step_size, exp_avg_lr)

                # update momentum with dfc
                exp_avg1 = exp_avg * dfc

                step_size.mul_(exp_avg1)

                p.data.add_(-step_size)

        return loss
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# modifications - 4/4/2021 @lessw2020 (decay issue spotted by @nestordemeure )
# weight decay has been implemented AdamW style instead of the original madgrad Adam style.
# in initial image classification testing, this outperformed 0 weight decay or original style weight decay.

# closure is checked if callable or not since some code passes loss directly, rather than in closure param

import math
from typing import Collection, TYPE_CHECKING, Any, Callable, Optional

import torch
import torch.optim
import collections

if TYPE_CHECKING:
    from torch.optim.optimizer import _params_t
else:
    _params_t = Any


class madgrad_wd(torch.optim.Optimizer):
    """
    MADGRAD_: A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic
    Optimization.

    .. _MADGRAD: https://arxiv.org/abs/2101.11075

    MADGRAD is a general purpose optimizer that can be used in place of SGD or
    Adam may converge faster and generalize better. Currently GPU-only.
    Typically, the same learning rate schedule that is used for SGD or Adam may
    be used. The overall learning rate is not comparable to either method and
    should be determined by a hyper-parameter sweep.

    MADGRAD requires less weight decay than other methods, often as little as
    zero. Momentum values used for SGD or Adam's beta1 should work here also.

    On sparse problems both weight_decay and momentum should be set to 0.

    Arguments:
        params (iterable):
            Iterable of parameters to optimize or dicts defining parameter groups.
        lr (float):
            Learning rate (default: 1e-2).
        momentum (float):
            Momentum value in the range [0,1) (default: 0.9).
        weight_decay (float):
            Weight decay, i.e. a L2 penalty (default: 0).
        eps (float):
            Term added to the denominator outside of the root operation to improve numerical stability. (default: 1e-6).
    """

    def __init__(
        self,
        params: _params_t,
        lr: float = 1e-2,
        momentum: float = 0.9,
        weight_decay: float = 0,
        eps: float = 1e-6,
    ):
        # message fixed to match the actual half-open check below
        if momentum < 0 or momentum >= 1:
            raise ValueError(f"Momentum {momentum} must be in the range [0,1)")
        if lr <= 0:
            raise ValueError(f"Learning rate {lr} must be positive")
        if weight_decay < 0:
            raise ValueError(f"Weight decay {weight_decay} must be non-negative")
        if eps < 0:
            raise ValueError("Eps must be non-negative")

        defaults = dict(lr=lr, eps=eps, momentum=momentum, weight_decay=weight_decay)
        super().__init__(params, defaults)

    @property
    def supports_memory_efficient_fp16(self) -> bool:
        return False

    @property
    def supports_flat_params(self) -> bool:
        return True

    def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]:
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        # Some callers pass the loss value itself rather than a closure, so only
        # call it when it is actually callable. The builtin `callable()` replaces
        # `isinstance(closure, collections.Callable)`, which raises
        # AttributeError on Python >= 3.10 (the ABC aliases were removed from
        # the `collections` namespace).
        if closure is not None and callable(closure):
            loss = closure()

        # step counter must be stored in state to ensure correct behavior under
        # optimizer sharding
        if "k" not in self.state:
            self.state["k"] = torch.tensor([0], dtype=torch.long)
        k = self.state["k"].item()

        for group in self.param_groups:
            eps = group["eps"]
            lr = group["lr"] + eps
            decay = group["weight_decay"]
            momentum = group["momentum"]

            ck = 1 - momentum
            # dual-averaging weight grows as sqrt(k+1)
            lamb = lr * math.pow(k + 1, 0.5)

            for p in group["params"]:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]

                if "grad_sum_sq" not in state:
                    state["grad_sum_sq"] = torch.zeros_like(p.data).detach()
                    state["s"] = torch.zeros_like(p.data).detach()
                    if momentum != 0:
                        state["x0"] = torch.clone(p.data).detach()

                if momentum != 0.0 and grad.is_sparse:
                    raise RuntimeError(
                        "momentum != 0 is not compatible with sparse gradients"
                    )

                grad_sum_sq = state["grad_sum_sq"]
                s = state["s"]

                # Apply weight decay - L2 / AdamW style (decoupled from the gradient)
                if decay:
                    p.data.mul_(1 - lr * decay)

                """ original impl:
                if decay != 0:
                    if grad.is_sparse:
                        raise RuntimeError("weight_decay option is not compatible with sparse gradients")

                    grad.add_(p.data, alpha=decay)
                """

                if grad.is_sparse:
                    grad = grad.coalesce()
                    grad_val = grad._values()

                    p_masked = p.sparse_mask(grad)
                    grad_sum_sq_masked = grad_sum_sq.sparse_mask(grad)
                    s_masked = s.sparse_mask(grad)

                    # Compute x_0 from other known quantities
                    rms_masked_vals = grad_sum_sq_masked._values().pow(1 / 3).add_(eps)
                    x0_masked_vals = p_masked._values().addcdiv(
                        s_masked._values(), rms_masked_vals, value=1
                    )

                    # Dense + sparse op
                    grad_sq = grad * grad
                    grad_sum_sq.add_(grad_sq, alpha=lamb)
                    grad_sum_sq_masked.add_(grad_sq, alpha=lamb)

                    rms_masked_vals = grad_sum_sq_masked._values().pow_(1 / 3).add_(eps)

                    s.add_(grad, alpha=lamb)
                    s_masked._values().add_(grad_val, alpha=lamb)

                    # update masked copy of p
                    p_kp1_masked_vals = x0_masked_vals.addcdiv(
                        s_masked._values(), rms_masked_vals, value=-1
                    )
                    # Copy updated masked p to dense p using an add operation
                    p_masked._values().add_(p_kp1_masked_vals, alpha=-1)
                    p.data.add_(p_masked, alpha=-1)
                else:
                    if momentum == 0:
                        # Compute x_0 from other known quantities
                        rms = grad_sum_sq.pow(1 / 3).add_(eps)
                        x0 = p.data.addcdiv(s, rms, value=1)
                    else:
                        x0 = state["x0"]

                    # Accumulate second moments
                    grad_sum_sq.addcmul_(grad, grad, value=lamb)
                    rms = grad_sum_sq.pow(1 / 3).add_(eps)

                    # Update s
                    s.data.add_(grad, alpha=lamb)

                    # Step
                    if momentum == 0:
                        p.data.copy_(x0.addcdiv(s, rms, value=-1))
                    else:
                        z = x0.addcdiv(s, rms, value=-1)

                        # p is a moving average of z
                        p.data.mul_(1 - ck).add_(z, alpha=ck)

        self.state["k"] += 1
        return loss
"Provides basic training and validation with `Learner`"
from .torch_core import *
from .basic_data import *
from .callback import *
from .data_block import *
from .utils.ipython import gpu_mem_restore
import inspect
from fastprogress.fastprogress import format_time, IN_NOTEBOOK
from time import time
from .sixel import plot_sixel

__all__ = ['Learner', 'LearnerCallback', 'Recorder', 'RecordOnCPU', 'fit', 'loss_batch', 'train_epoch', 'validate',
           'get_preds', 'load_learner']

# library-wide training defaults
defaults.lr = slice(3e-3)
defaults.wd = 1e-2
defaults.extra_callbacks = None
defaults.extra_callback_fns = None

def loss_batch(model:nn.Module, xb:Tensor, yb:Tensor, loss_func:OptLossFunc=None, opt:OptOptimizer=None,
               cb_handler:Optional[CallbackHandler]=None)->Tuple[Union[Tensor,int,float,str]]:
    "Calculate loss and metrics for a batch, call out to callbacks as necessary."
    # SLS integration: the optimizer is handed a closure that recomputes the
    # loss; forward/backward are driven from inside opt.step().
    cb_handler = ifnone(cb_handler, CallbackHandler())
    if not is_listy(xb): xb = [xb]
    if not is_listy(yb): yb = [yb]
    out = model(*xb)
    out = cb_handler.on_loss_begin(out)

    # no loss function -> inference mode: return detached predictions + targets
    if not loss_func: return to_detach(out), to_detach(yb[0])
    loss = loss_func(out, *yb)

    def closure():
        # re-evaluate the model on the same batch for the line-search optimizer
        out = model(*xb)
        loss = loss_func(out,*yb)
        return loss

    if opt is not None:
        # NOTE(review): assumes an SLS-style optimizer whose step() calls
        # backward itself via the closure — confirm; hence the commented-out
        # explicit backward below.
        opt.step(closure)
        loss,skip_bwd = cb_handler.on_backward_begin(loss)
        #if not skip_bwd: loss.backward()
        #if not cb_handler.on_backward_end():
        if not cb_handler.on_step_end(): opt.zero_grad()

        loss = loss_func(model(*xb),*yb) #call one more time for updating metrics from SLS

    return loss.detach().cpu()

def get_preds(model:nn.Module, dl:DataLoader, pbar:Optional[PBar]=None, cb_handler:Optional[CallbackHandler]=None,
              activ:nn.Module=None, loss_func:OptLossFunc=None, n_batch:Optional[int]=None) -> List[Tensor]:
    "Tuple of predictions and targets, and optional losses (if `loss_func`) using `dl`, max batches `n_batch`."
    # collect per-batch outputs, concatenate, then optionally append losses /
    # apply the final activation
    res = [to_float(torch.cat(o).cpu()) for o in
           zip(*validate(model, dl, cb_handler=cb_handler, pbar=pbar, average=False, n_batch=n_batch))]
    if loss_func is not None:
        with NoneReduceOnCPU(loss_func) as lf: res.append(lf(res[0], res[1]))
    if activ is not None: res[0] = activ(res[0])
    return res

def validate(model:nn.Module, dl:DataLoader, loss_func:OptLossFunc=None, cb_handler:Optional[CallbackHandler]=None,
             pbar:Optional[PBar]=None, average=True, n_batch:Optional[int]=None)->Iterator[Tuple[Union[Tensor,int],...]]:
    "Calculate `loss_func` of `model` on `dl` in evaluation mode."
    model.eval()
    with torch.no_grad():
        val_losses,nums = [],[]
        if cb_handler: cb_handler.set_dl(dl)
        for xb,yb in progress_bar(dl, parent=pbar, leave=(pbar is not None)):
            if cb_handler: xb, yb = cb_handler.on_batch_begin(xb, yb, train=False)
            val_loss = loss_batch(model, xb, yb, loss_func, cb_handler=cb_handler)
            val_losses.append(val_loss)
            if not is_listy(yb): yb = [yb]
            # track batch sizes so the average below is sample-weighted
            nums.append(first_el(yb).shape[0])
            if cb_handler and cb_handler.on_batch_end(val_losses[-1]): break
            if n_batch and (len(nums)>=n_batch): break
        nums = np.array(nums, dtype=np.float32)
        if average: return (to_np(torch.stack(val_losses)) * nums).sum() / nums.sum()
        else:       return val_losses

def train_epoch(model:nn.Module, dl:DataLoader, opt:optim.Optimizer, loss_func:LossFunction)->None:
    "Simple training of `model` for 1 epoch of `dl` using optim `opt` and loss function `loss_func`."
    # minimal loop without callbacks; NOTE: bypasses the SLS closure mechanism
    # used in loss_batch above
    model.train()
    for xb,yb in dl:
        loss = loss_func(model(xb), yb)
        loss.backward()
        opt.step()
        opt.zero_grad()

@dataclass
class BasicLearner():
    # minimal bundle of the four objects needed by `fit`
    model:nn.Module
    loss_func:LossFunction
    opt:optim.Optimizer
    data:DataBunch

def fit(epochs:int, learn:BasicLearner, callbacks:Optional[CallbackList]=None, metrics:OptMetrics=None)->None:
    "Fit the `model` on `data` and learn using `loss_func` and `opt`."
    assert len(learn.data.train_dl) != 0, f"""Your training dataloader is empty, can't train a model.
        Use a smaller batch size (batch size={learn.data.train_dl.batch_size} for {len(learn.data.train_dl.dataset)} elements)."""
    cb_handler = CallbackHandler(callbacks, metrics)
    pbar = master_bar(range(epochs))
    cb_handler.on_train_begin(epochs, pbar=pbar, metrics=metrics)

    exception=False
    try:
        for epoch in pbar:
            learn.model.train()
            cb_handler.set_dl(learn.data.train_dl)
            cb_handler.on_epoch_begin()
            for xb,yb in progress_bar(learn.data.train_dl, parent=pbar):
                xb, yb = cb_handler.on_batch_begin(xb, yb)
                loss = loss_batch(learn.model, xb, yb, learn.loss_func, learn.opt, cb_handler)
                if cb_handler.on_batch_end(loss): break

            # validation pass unless a callback (or empty valid set) skips it
            if not cb_handler.skip_validate and not learn.data.empty_val:
                val_loss = validate(learn.model, learn.data.valid_dl, loss_func=learn.loss_func,
                                    cb_handler=cb_handler, pbar=pbar)
            else: val_loss=None
            if cb_handler.on_epoch_end(val_loss): break
    except Exception as e:
        exception = e
        raise
    finally: cb_handler.on_train_end(exception)

# maps a loss function's name to the activation that turns raw model output
# into probabilities/predictions for that loss
loss_func_name2activ = {'cross_entropy_loss': F.softmax, 'nll_loss': torch.exp, 'poisson_nll_loss': torch.exp,
                        'kl_div_loss': torch.exp, 'bce_with_logits_loss': torch.sigmoid, 'cross_entropy': F.softmax,
                        'kl_div': torch.exp, 'binary_cross_entropy_with_logits': torch.sigmoid,
                        }

def _loss_func_name2activ(name:str, axis:int=-1):
    # softmax needs an explicit axis; the other activations are elementwise
    res = loss_func_name2activ[name]
    if res == F.softmax: res = partial(F.softmax, dim=axis)
    return res
139 | loss_func = getattr(loss_func, 'func', loss_func) 140 | cls_name = camel2snake(loss_func.__class__.__name__) 141 | if cls_name == 'mix_up_loss': 142 | loss_func = loss_func.crit 143 | cls_name = camel2snake(loss_func.__class__.__name__) 144 | if cls_name in loss_func_name2activ: 145 | if cls_name == 'poisson_nll_loss' and (not getattr(loss_func, 'log_input', True)): return 146 | return _loss_func_name2activ(cls_name, axis) 147 | if getattr(loss_func,'__name__','') in loss_func_name2activ: 148 | return _loss_func_name2activ(loss_func.__name__, axis) 149 | return noop 150 | 151 | @dataclass 152 | class Learner(): 153 | "Trainer for `model` using `data` to minimize `loss_func` with optimizer `opt_func`." 154 | data:DataBunch 155 | model:nn.Module 156 | opt_func:Callable=AdamW 157 | loss_func:Callable=None 158 | metrics:Collection[Callable]=None 159 | true_wd:bool=True 160 | bn_wd:bool=True 161 | wd:Floats=defaults.wd 162 | train_bn:bool=True 163 | path:str = None 164 | model_dir:PathOrStr = 'models' 165 | callback_fns:Collection[Callable]=None 166 | callbacks:Collection[Callback]=field(default_factory=list) 167 | layer_groups:Collection[nn.Module]=None 168 | add_time:bool=True 169 | silent:bool=None 170 | def __post_init__(self)->None: 171 | "Setup path,metrics, callbacks and ensure model directory exists." 
172 | self.path = Path(ifnone(self.path, self.data.path)) 173 | self.model = self.model.to(self.data.device) 174 | self.loss_func = self.loss_func or self.data.loss_func 175 | self.metrics=listify(self.metrics) 176 | if not self.layer_groups: self.layer_groups = [nn.Sequential(*flatten_model(self.model))] 177 | self.callbacks = listify(self.callbacks) 178 | if self.silent is None: self.silent = defaults.silent 179 | self.callback_fns = [partial(Recorder, add_time=self.add_time, silent=self.silent)] + listify(self.callback_fns) 180 | if defaults.extra_callbacks is not None: self.callbacks += defaults.extra_callbacks 181 | 182 | def init(self, init): apply_init(self.model, init) 183 | 184 | def _test_writeable_path(self): 185 | path = self.path/self.model_dir 186 | try: 187 | path.mkdir(parents=True, exist_ok=True) 188 | tmp_file = get_tmp_file(path) 189 | except OSError as e: 190 | raise Exception(f"{e}\nCan't write to '{path}', set `learn.model_dir` attribute in Learner to a full libpath path that is writable") from None 191 | os.remove(tmp_file) 192 | 193 | def lr_range(self, lr:Union[float,slice])->np.ndarray: 194 | "Build differential learning rates from `lr`." 195 | if not isinstance(lr,slice): return lr 196 | if lr.start: res = even_mults(lr.start, lr.stop, len(self.layer_groups)) 197 | else: res = [lr.stop/10]*(len(self.layer_groups)-1) + [lr.stop] 198 | return np.array(res) 199 | 200 | def fit(self, epochs:int, lr:Union[Floats,slice]=defaults.lr, 201 | wd:Floats=None, callbacks:Collection[Callback]=None)->None: 202 | "Fit the model on this learner with `lr` learning rate, `wd` weight decay for `epochs` with `callbacks`." 
203 | lr = self.lr_range(lr) 204 | if wd is None: wd = self.wd 205 | if not getattr(self, 'opt', False): self.create_opt(lr, wd) 206 | else: self.opt.lr,self.opt.wd = lr,wd 207 | callbacks = [cb(self) for cb in self.callback_fns + listify(defaults.extra_callback_fns)] + listify(callbacks) 208 | fit(epochs, self, metrics=self.metrics, callbacks=self.callbacks+callbacks) 209 | 210 | def create_opt(self, lr:Floats, wd:Floats=0.)->None: 211 | "Create optimizer with `lr` learning rate and `wd` weight decay." 212 | self.opt = OptimWrapper.create(self.opt_func, lr, self.layer_groups, wd=wd, true_wd=self.true_wd, bn_wd=self.bn_wd) 213 | 214 | def split(self, split_on:SplitFuncOrIdxList)->None: 215 | "Split the model at `split_on`." 216 | if isinstance(split_on,Callable): split_on = split_on(self.model) 217 | self.layer_groups = split_model(self.model, split_on) 218 | return self 219 | 220 | def freeze_to(self, n:int)->None: 221 | "Freeze layers up to layer group `n`." 222 | if hasattr(self.model, 'reset'): self.model.reset() 223 | for g in self.layer_groups[:n]: 224 | for l in g: 225 | if not self.train_bn or not isinstance(l, bn_types): requires_grad(l, False) 226 | for g in self.layer_groups[n:]: requires_grad(g, True) 227 | self.create_opt(defaults.lr) 228 | 229 | def freeze(self)->None: 230 | "Freeze up to last layer group." 231 | assert(len(self.layer_groups)>1) 232 | self.freeze_to(-1) 233 | 234 | def unfreeze(self): 235 | "Unfreeze entire model." 236 | self.freeze_to(0) 237 | 238 | def export(self, file:PathLikeOrBinaryStream='export.pkl', destroy=False): 239 | "Export the state of the `Learner` in `self.path/file`. 
`file` can be file-like (file or buffer)" 240 | if rank_distrib(): return # don't save if slave proc 241 | args = ['opt_func', 'loss_func', 'metrics', 'true_wd', 'bn_wd', 'wd', 'train_bn', 'model_dir', 'callback_fns'] 242 | state = {a:getattr(self,a) for a in args} 243 | state['cb_state'] = {cb.__class__:cb.get_state() for cb in self.callbacks} 244 | #layer_groups -> need to find a way 245 | #TO SEE: do we save model structure and weights separately? 246 | with ModelOnCPU(self.model) as m: 247 | state['model'] = m 248 | xtra = dict(normalize=self.data.norm.keywords) if getattr(self.data, 'norm', False) else {} 249 | state['data'] = self.data.valid_ds.get_state(**xtra) 250 | state['cls'] = self.__class__ 251 | try_save(state, self.path, file) 252 | if destroy: self.destroy() 253 | 254 | def save(self, file:PathLikeOrBinaryStream=None, return_path:bool=False, with_opt:bool=True): 255 | "Save model and optimizer state (if `with_opt`) with `file` to `self.model_dir`. `file` can be file-like (file or buffer)" 256 | if is_pathlike(file): self._test_writeable_path() 257 | if rank_distrib(): return # don't save if slave proc 258 | target = self.path/self.model_dir/f'{file}.pth' if is_pathlike(file) else file 259 | if not hasattr(self, 'opt'): with_opt=False 260 | if not with_opt: state = get_model(self.model).state_dict() 261 | else: state = {'model': get_model(self.model).state_dict(), 'opt':self.opt.state_dict()} 262 | torch.save(state, target) 263 | if return_path: return target 264 | 265 | def dl(self, ds_type:DatasetType=DatasetType.Valid): 266 | "Return DataLoader for DatasetType `ds_type`." 267 | return self.data.dl(ds_type) 268 | 269 | def load(self, file:PathLikeOrBinaryStream=None, device:torch.device=None, strict:bool=True, 270 | with_opt:bool=None, purge:bool=False, remove_module:bool=False)->'Learner': 271 | "Load model and optimizer state (if `with_opt`) `file` from `self.model_dir` using `device`. 
`file` can be file-like (file or buffer)" 272 | if purge: self.purge(clear_opt=ifnone(with_opt, False)) 273 | if device is None: device = self.data.device 274 | elif isinstance(device, int): device = torch.device('cuda', device) 275 | source = self.path/self.model_dir/f'{file}.pth' if is_pathlike(file) else file 276 | distrib_barrier() 277 | state = torch.load(source, map_location=device) 278 | if set(state.keys()) == {'model', 'opt'}: 279 | model_state = state['model'] 280 | if remove_module: model_state = remove_module_load(model_state) 281 | get_model(self.model).load_state_dict(model_state, strict=strict) 282 | if ifnone(with_opt,True): 283 | if not hasattr(self, 'opt'): self.create_opt(defaults.lr, self.wd) 284 | try: self.opt.load_state_dict(state['opt']) 285 | except: pass 286 | else: 287 | if with_opt: warn("Saved filed doesn't contain an optimizer state.") 288 | if remove_module: state = remove_module_load(state) 289 | get_model(self.model).load_state_dict(state, strict=strict) 290 | del state 291 | gc.collect() 292 | return self 293 | 294 | def destroy(self): 295 | "Free the Learner internals, leaving just an empty shell that consumes no memory" 296 | 297 | class ZombieLearner(Learner): 298 | msg = "this object has been destroyed" 299 | def __getattr__(self, item): print(ZombieLearner.msg); return None 300 | def destroyed(*args, **kwargs): print(ZombieLearner.msg) 301 | 302 | attrs = [k for k in self.__dict__.keys() if not k.startswith("__")] 303 | for a in attrs: delattr(self, a) 304 | # the instance methods can still be called, but will just give a message 305 | methods = [k for k in dir(self) if not k.startswith("__") and inspect.isroutine(getattr(self, k))] 306 | for m in methods: setattr(self, m, ZombieLearner.destroyed) 307 | self.__class__ = ZombieLearner 308 | gc.collect() 309 | print("this Learner object self-destroyed - it still exists, but no longer usable") 310 | 311 | def purge(self, clear_opt:bool=True): 312 | "Purge the `Learner` of all 
cached attributes to release some GPU memory." 313 | self._test_writeable_path() 314 | attrs_all = [k for k in self.__dict__.keys() if not k.startswith("__")] 315 | attrs_pkl = ['bn_wd', 'callback_fns', 'layer_groups', 'loss_func', 'metrics', 'model', 316 | 'model_dir', 'opt_func', 'path', 'train_bn', 'true_wd', 'wd'] 317 | # +callbacks: get pickled too, but not directly 318 | attrs_keep = ['data', 'recorder'] 319 | attrs_del = list(set(attrs_all) - set(attrs_keep)) 320 | state = {a:getattr(self, a) for a in attrs_pkl} 321 | state['cb_state'] = {cb.__class__:cb.get_state() for cb in self.callbacks} 322 | if hasattr(self, 'opt'): state['opt'] = self.opt.get_state() 323 | 324 | tmp_file = get_tmp_file(self.path/self.model_dir) 325 | torch.save(state, open(tmp_file, 'wb')) 326 | for a in attrs_del: delattr(self, a) 327 | gc.collect() 328 | state = torch.load(tmp_file) 329 | os.remove(tmp_file) 330 | 331 | for a in attrs_pkl: setattr(self, a, state[a]) 332 | cb_state = state.pop('cb_state') 333 | self.callbacks = [load_callback(c,s, self) for c,s in cb_state.items()] 334 | if not clear_opt and 'opt' in state: 335 | try: self.opt = OptimWrapper.load_with_state_and_layer_group(state['opt'], self.layer_groups) 336 | except: warn("Wasn't able to properly load the optimizer state again.") 337 | del state 338 | gc.collect() 339 | return self 340 | 341 | def get_preds(self, ds_type:DatasetType=DatasetType.Valid, activ:nn.Module=None, 342 | with_loss:bool=False, n_batch:Optional[int]=None, pbar:Optional[PBar]=None) -> List[Tensor]: 343 | "Return predictions and targets on `ds_type` dataset." 
344 | lf = self.loss_func if with_loss else None 345 | activ = ifnone(activ, _loss_func2activ(self.loss_func)) 346 | if not getattr(self, 'opt', False): self.create_opt(defaults.lr, self.wd) 347 | callbacks = [cb(self) for cb in self.callback_fns + listify(defaults.extra_callback_fns)] + listify(self.callbacks) 348 | return get_preds(self.model, self.dl(ds_type), cb_handler=CallbackHandler(callbacks), 349 | activ=activ, loss_func=lf, n_batch=n_batch, pbar=pbar) 350 | 351 | def pred_batch(self, ds_type:DatasetType=DatasetType.Valid, batch:Tuple=None, reconstruct:bool=False, 352 | with_dropout:bool=False, activ:nn.Module=None) -> List[Tensor]: 353 | "Return output of the model on one batch from `ds_type` dataset." 354 | if batch is not None: xb,yb = batch 355 | else: xb,yb = self.data.one_batch(ds_type, detach=False, denorm=False) 356 | cb_handler = CallbackHandler(self.callbacks) 357 | xb,yb = cb_handler.on_batch_begin(xb,yb, train=False) 358 | activ = ifnone(activ, _loss_func2activ(self.loss_func)) 359 | with torch.no_grad(): 360 | if not with_dropout: preds = loss_batch(self.model.eval(), xb, yb, cb_handler=cb_handler) 361 | else: preds = loss_batch(self.model.eval().apply(self.apply_dropout), xb, yb, cb_handler=cb_handler) 362 | res = activ(preds[0]) 363 | if not reconstruct: return res 364 | res = res.detach().cpu() 365 | ds = self.dl(ds_type).dataset 366 | norm = getattr(self.data, 'norm', False) 367 | if norm and norm.keywords.get('do_y',False): 368 | res = self.data.denorm(res, do_x=True) 369 | return [ds.reconstruct(o) for o in res] 370 | 371 | def backward(self, item): 372 | "Pass `item` through the model and computes the gradient. Useful if `backward_hooks` are attached." 
373 | xb,yb = self.data.one_item(item) 374 | loss = loss_batch(self.model.eval(), xb, yb, self.loss_func, opt=FakeOptimizer(), 375 | cb_handler=CallbackHandler(self.callbacks)) 376 | return loss 377 | 378 | def predict(self, item:ItemBase, return_x:bool=False, batch_first:bool=True, with_dropout:bool=False, **kwargs): 379 | "Return predicted class, label and probabilities for `item`." 380 | batch = self.data.one_item(item) 381 | res = self.pred_batch(batch=batch, with_dropout=with_dropout) 382 | raw_pred,x = grab_idx(res,0,batch_first=batch_first),batch[0] 383 | norm = getattr(self.data,'norm',False) 384 | if norm: 385 | x = self.data.denorm(x) 386 | if norm.keywords.get('do_y',False): raw_pred = self.data.denorm(raw_pred) 387 | ds = self.data.single_ds 388 | pred = ds.y.analyze_pred(raw_pred, **kwargs) 389 | x = ds.x.reconstruct(grab_idx(x, 0)) 390 | y = ds.y.reconstruct(pred, x) if has_arg(ds.y.reconstruct, 'x') else ds.y.reconstruct(pred) 391 | return (x, y, pred, raw_pred) if return_x else (y, pred, raw_pred) 392 | 393 | def validate(self, dl=None, callbacks=None, metrics=None): 394 | "Validate on `dl` with potential `callbacks` and `metrics`." 395 | dl = ifnone(dl, self.data.valid_dl) 396 | metrics = ifnone(metrics, self.metrics) 397 | cb_handler = CallbackHandler(self.callbacks + ifnone(callbacks, []), metrics) 398 | cb_handler.on_train_begin(1, None, metrics); cb_handler.on_epoch_begin() 399 | val_metrics = validate(self.model, dl, self.loss_func, cb_handler) 400 | cb_handler.on_epoch_end(val_metrics) 401 | return cb_handler.state_dict['last_metrics'] 402 | 403 | def show_results(self, ds_type=DatasetType.Valid, rows:int=5, **kwargs): 404 | "Show `rows` result of predictions on `ds_type` dataset." 
405 | #TODO: get read of has_arg x and split_kwargs_by_func if possible 406 | #TODO: simplify this and refactor with pred_batch(...reconstruct=True) 407 | n_items = rows ** 2 if self.data.train_ds.x._square_show_res else rows 408 | if self.dl(ds_type).batch_size < n_items: n_items = self.dl(ds_type).batch_size 409 | ds = self.dl(ds_type).dataset 410 | self.callbacks.append(RecordOnCPU()) 411 | preds = self.pred_batch(ds_type) 412 | *self.callbacks,rec_cpu = self.callbacks 413 | x,y = rec_cpu.input,rec_cpu.target 414 | norm = getattr(self.data,'norm',False) 415 | if norm: 416 | x = self.data.denorm(x) 417 | if norm.keywords.get('do_y',False): 418 | y = self.data.denorm(y, do_x=True) 419 | preds = self.data.denorm(preds, do_x=True) 420 | analyze_kwargs,kwargs = split_kwargs_by_func(kwargs, ds.y.analyze_pred) 421 | preds = [ds.y.analyze_pred(grab_idx(preds, i), **analyze_kwargs) for i in range(n_items)] 422 | xs = [ds.x.reconstruct(grab_idx(x, i)) for i in range(n_items)] 423 | if has_arg(ds.y.reconstruct, 'x'): 424 | ys = [ds.y.reconstruct(grab_idx(y, i), x=x) for i,x in enumerate(xs)] 425 | zs = [ds.y.reconstruct(z, x=x) for z,x in zip(preds,xs)] 426 | else : 427 | ys = [ds.y.reconstruct(grab_idx(y, i)) for i in range(n_items)] 428 | zs = [ds.y.reconstruct(z) for z in preds] 429 | ds.x.show_xyzs(xs, ys, zs, **kwargs) 430 | 431 | def apply_dropout(self, m): 432 | "If a module contains 'dropout' in it's name, it will be switched to .train() mode." 433 | if 'dropout' in m.__class__.__name__.lower(): m.train() 434 | 435 | def predict_with_mc_dropout(self, item:ItemBase, with_dropout:bool=True, n_times=10, **kwargs): 436 | "Make predictions with dropout turned on for n_times (default 10)." 437 | return [self.predict(item, with_dropout=with_dropout) for _ in range(n_times)] 438 | 439 | class RecordOnCPU(Callback): 440 | "Store the `input` and `target` going through the model on the CPU." 
441 | def on_batch_begin(self, last_input,last_target,**kwargs): 442 | self.input,self.target = to_cpu(last_input),to_cpu(last_target) 443 | 444 | class LearnerCallback(Callback): 445 | "Base class for creating callbacks for a `Learner`." 446 | def __init__(self, learn): 447 | self._learn = weakref.ref(learn) 448 | self.exclude,self.not_min = ['_learn'],[] 449 | setattr(self.learn, self.cb_name, self) 450 | 451 | def __getattr__(self,k): return getattr(self.learn, k) 452 | def __setstate__(self,data:Any): self.__dict__.update(data) 453 | 454 | @property 455 | def learn(self) -> Learner: return self._learn() 456 | @learn.setter 457 | def learn(self, learn: Learner) -> None: self._learn = weakref.ref(learn) 458 | 459 | @property 460 | def cb_name(self): return camel2snake(self.__class__.__name__) 461 | 462 | class Recorder(LearnerCallback): 463 | "A `LearnerCallback` that records epoch, loss, opt and metric data during training." 464 | _order=-10 465 | def __init__(self, learn:Learner, add_time:bool=True, silent:bool=False): 466 | super().__init__(learn) 467 | if not getattr(self.learn, 'opt', False): self.learn.create_opt(defaults.lr, self.learn.wd) 468 | self.opt = self.learn.opt 469 | self.train_dl = self.learn.data.train_dl 470 | self.no_val,self.silent,self.add_time = False,silent,add_time 471 | 472 | def on_train_begin(self, pbar:PBar, metrics_names:Collection[str], **kwargs:Any)->None: 473 | "Initialize recording status at beginning of training." 
474 | self.pbar = pbar 475 | self.names = ['epoch', 'train_loss'] if self.no_val else ['epoch', 'train_loss', 'valid_loss'] 476 | self.metrics_names = metrics_names 477 | if hasattr(self, '_added_met_names'): self.metrics_names += self._added_met_names 478 | self.names += self.metrics_names 479 | if self.add_time: self.names.append('time') 480 | if not self.silent: self.pbar.write(self.names, table=True) 481 | self.losses,self.val_losses,self.lrs,self.moms,self.metrics,self.nb_batches = [],[],[],[],[],[] 482 | 483 | def on_epoch_begin(self, **kwargs:Any)->None: 484 | if self.add_time: self.start_epoch = time() 485 | 486 | def on_batch_begin(self, train, **kwargs:Any)->None: 487 | "Record learning rate and momentum at beginning of batch." 488 | if train: 489 | self.lrs.append(self.opt.lr) 490 | #if self.opt.mom is not None: 491 | #self.moms.append(self.opt.mom) 492 | 493 | def on_backward_begin(self, smooth_loss:Tensor, **kwargs:Any)->None: 494 | "Record the loss before any other callback has a chance to modify it." 495 | self.losses.append(smooth_loss) 496 | if self.pbar is not None and hasattr(self.pbar,'child'): 497 | self.pbar.child.comment = f'{smooth_loss:.4f}' 498 | 499 | def on_epoch_end(self, epoch:int, num_batch:int, smooth_loss:Tensor, 500 | last_metrics:MetricsList, **kwargs:Any)->bool: 501 | "Save epoch info: num_batch, smooth_loss, metrics." 502 | self.nb_batches.append(num_batch) 503 | if last_metrics is not None: self.val_losses.append(last_metrics[0]) 504 | else: last_metrics = [] if self.no_val else [None] 505 | if len(last_metrics) > 1: self.metrics.append(last_metrics[1:]) 506 | self.format_stats([epoch, smooth_loss] + last_metrics) 507 | 508 | def format_stats(self, stats:TensorOrNumList)->None: 509 | "Format stats before printing." 
510 | str_stats = [] 511 | for name,stat in zip(self.names,stats): 512 | str_stats.append('#na#' if stat is None else str(stat) if isinstance(stat, int) else f'{stat:.6f}') 513 | if self.add_time: str_stats.append(format_time(time() - self.start_epoch)) 514 | if not self.silent: self.pbar.write(str_stats, table=True) 515 | 516 | def add_metric_names(self, names): 517 | "Add `names` to the inner metric names." 518 | if hasattr(self, '_added_met_names'): self._added_met_names += names 519 | else: self._added_met_names = names 520 | 521 | def plot_lr(self, show_moms=False, skip_start:int=0, skip_end:int=0, return_fig:bool=None)->Optional[plt.Figure]: 522 | "Plot learning rate, `show_moms` to include momentum." 523 | lrs = self._split_list(self.lrs, skip_start, skip_end) 524 | iterations = self._split_list(range_of(self.lrs), skip_start, skip_end) 525 | if show_moms: 526 | moms = self._split_list(self.moms, skip_start, skip_end) 527 | fig, axs = plt.subplots(1,2, figsize=(12,4)) 528 | axs[0].plot(iterations, lrs) 529 | axs[0].set_xlabel('Iterations') 530 | axs[0].set_ylabel('Learning Rate') 531 | axs[1].plot(iterations, moms) 532 | axs[1].set_xlabel('Iterations') 533 | axs[1].set_ylabel('Momentum') 534 | else: 535 | fig, ax = plt.subplots() 536 | ax.plot(iterations, lrs) 537 | ax.set_xlabel('Iterations') 538 | ax.set_ylabel('Learning Rate') 539 | if ifnone(return_fig, defaults.return_fig): return fig 540 | if not IN_NOTEBOOK: plot_sixel(fig) 541 | 542 | @staticmethod 543 | def smoothen_by_spline(xs, ys, **kwargs): 544 | xs = np.arange(len(ys)) 545 | spl = scipy.interpolate.UnivariateSpline(xs, ys, **kwargs) 546 | ys = spl(xs) 547 | return ys 548 | 549 | def plot(self, skip_start:int=10, skip_end:int=5, suggestion:bool=False, return_fig:bool=None, 550 | **kwargs)->Optional[plt.Figure]: 551 | "Plot learning rate and losses, trimmed between `skip_start` and `skip_end`. 
Optionally plot and return min gradient" 552 | lrs = self._split_list(self.lrs, skip_start, skip_end) 553 | losses = self._split_list(self.losses, skip_start, skip_end) 554 | losses = [x.item() for x in losses] 555 | if 'k' in kwargs: losses = self.smoothen_by_spline(lrs, losses, **kwargs) 556 | fig, ax = plt.subplots(1,1) 557 | ax.plot(lrs, losses) 558 | ax.set_ylabel("Loss") 559 | ax.set_xlabel("Learning Rate") 560 | ax.set_xscale('log') 561 | ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%.0e')) 562 | if suggestion: 563 | try: mg = (np.gradient(np.array(losses))).argmin() 564 | except: 565 | print("Failed to compute the gradients, there might not be enough points.") 566 | return 567 | print(f"Min numerical gradient: {lrs[mg]:.2E}") 568 | ax.plot(lrs[mg],losses[mg],markersize=10,marker='o',color='red') 569 | self.min_grad_lr = lrs[mg] 570 | ml = np.argmin(losses) 571 | print(f"Min loss divided by 10: {lrs[ml]/10:.2E}") 572 | if ifnone(return_fig, defaults.return_fig): return fig 573 | if not IN_NOTEBOOK: plot_sixel(fig) 574 | 575 | def plot_losses(self, skip_start:int=0, skip_end:int=0, return_fig:bool=None)->Optional[plt.Figure]: 576 | "Plot training and validation losses." 
577 | fig, ax = plt.subplots(1,1) 578 | losses = self._split_list(self.losses, skip_start, skip_end) 579 | iterations = self._split_list(range_of(self.losses), skip_start, skip_end) 580 | ax.plot(iterations, losses, label='Train') 581 | val_iter = self._split_list_val(np.cumsum(self.nb_batches), skip_start, skip_end) 582 | val_losses = self._split_list_val(self.val_losses, skip_start, skip_end) 583 | ax.plot(val_iter, val_losses, label='Validation') 584 | ax.set_ylabel('Loss') 585 | ax.set_xlabel('Batches processed') 586 | ax.legend() 587 | if ifnone(return_fig, defaults.return_fig): return fig 588 | if not IN_NOTEBOOK: plot_sixel(fig) 589 | 590 | def plot_metrics(self, skip_start:int=0, skip_end:int=0, return_fig:bool=None)->Optional[plt.Figure]: 591 | "Plot metrics collected during training." 592 | assert len(self.metrics) != 0, "There are no metrics to plot." 593 | fig, axes = plt.subplots(len(self.metrics[0]),1,figsize=(6, 4*len(self.metrics[0]))) 594 | val_iter = self._split_list_val(np.cumsum(self.nb_batches), skip_start, skip_end) 595 | axes = axes.flatten() if len(self.metrics[0]) != 1 else [axes] 596 | for i, ax in enumerate(axes): 597 | values = [met[i] for met in self.metrics] 598 | values = self._split_list_val(values, skip_start, skip_end) 599 | ax.plot(val_iter, values) 600 | ax.set_ylabel(str(self.metrics_names[i])) 601 | ax.set_xlabel('Batches processed') 602 | if ifnone(return_fig, defaults.return_fig): return fig 603 | if not IN_NOTEBOOK: plot_sixel(fig) 604 | 605 | def _split_list(self, vals:Collection[float], skip_start:int, skip_end:int): 606 | return vals[skip_start:-skip_end] if skip_end > 0 else vals[skip_start:] 607 | 608 | def _split_list_val(self, vals:Collection[float], skip_start:int, skip_end:int): 609 | val_iter = np.cumsum(self.nb_batches) 610 | start_val = (val_iter - skip_start >= 0).nonzero()[0].min() 611 | end_val = (val_iter[-1] - val_iter - skip_end >= 0).nonzero()[0].max()+1 612 | return vals[start_val:end_val] if skip_end > 0 
else vals[start_val:] 613 | 614 | class FakeOptimizer(): 615 | def step(self): pass 616 | def zero_grad(self): pass 617 | 618 | def load_callback(class_func, state, learn:Learner): 619 | init_kwargs, others = split_kwargs_by_func(state, class_func.__init__) 620 | res = class_func(learn, **init_kwargs) if issubclass(class_func, LearnerCallback) else class_func(**init_kwargs) 621 | for k,v in others.items(): setattr(res, k, v) 622 | return res 623 | 624 | def load_learner(path:PathOrStr, file:PathLikeOrBinaryStream='export.pkl', test:ItemList=None, tfm_y=None, **db_kwargs): 625 | "Load a `Learner` object saved with `export_state` in `path/file` with empty data, optionally add `test` and load on `cpu`. `file` can be file-like (file or buffer)" 626 | source = Path(path)/file if is_pathlike(file) else file 627 | state = torch.load(source, map_location='cpu') if defaults.device == torch.device('cpu') else torch.load(source) 628 | model = state.pop('model') 629 | src = LabelLists.load_state(path, state.pop('data')) 630 | if test is not None: src.add_test(test, tfm_y=tfm_y) 631 | data = src.databunch(**db_kwargs) 632 | cb_state = state.pop('cb_state') 633 | clas_func = state.pop('cls') 634 | res = clas_func(data, model, **state) 635 | res.callback_fns = state['callback_fns'] #to avoid duplicates 636 | res.callbacks = [load_callback(c,s, res) for c,s in cb_state.items()] 637 | return res 638 | -------------------------------------------------------------------------------- /sls/callback.py: -------------------------------------------------------------------------------- 1 | "Callbacks provides extensibility to the `basic_train` loop. See `train` for examples of custom callbacks." 
2 | from .basic_data import * 3 | from .torch_core import * 4 | import torch.distributed as dist 5 | 6 | __all__ = ['AverageMetric', 'Callback', 'CallbackHandler', 'OptimWrapper', 'SmoothenValue', 'Scheduler', 'annealing_cos', 'CallbackList', 7 | 'annealing_exp', 'annealing_linear', 'annealing_no', 'annealing_poly'] 8 | 9 | class OptimWrapper(): 10 | "Basic wrapper around `opt` to simplify hyper-parameters changes." 11 | def __init__(self, opt:optim.Optimizer, wd:Floats=0., true_wd:bool=False, bn_wd:bool=True): 12 | assert not isinstance(opt, OptimWrapper) 13 | self.opt,self.true_wd,self.bn_wd = opt,true_wd,bn_wd 14 | self.opt_keys = list(self.opt.param_groups[0].keys()) 15 | self.opt_keys.remove('params') 16 | self.read_defaults() 17 | self.wd = wd 18 | 19 | @classmethod 20 | def create(cls, opt_func:Union[type,Callable], lr:Union[float,Tuple,List], layer_groups:ModuleList, wd:Floats=0., 21 | true_wd:bool=False, bn_wd:bool=True)->optim.Optimizer: 22 | "Create an `optim.Optimizer` from `opt_func` with `lr`. Set lr on `layer_groups`." 23 | split_params = split_no_wd_params(layer_groups) 24 | opt = opt_func([{'params': p, 'lr':0} for p in split_params]) 25 | opt = cls(opt, wd=wd, true_wd=true_wd, bn_wd=bn_wd) 26 | opt.lr,opt.opt_func = listify(lr, layer_groups),opt_func 27 | return opt 28 | 29 | def new(self, layer_groups:Collection[nn.Module], split_no_wd:bool=True): 30 | "Create a new `OptimWrapper` from `self` with another `layer_groups` but the same hyper-parameters." 31 | opt_func = getattr(self, 'opt_func', self.opt.__class__) 32 | res = self.create(opt_func, self.lr, layer_groups, wd=self.wd, true_wd=self.true_wd, bn_wd=self.bn_wd) 33 | res.mom,res.beta = self.mom,self.beta 34 | return res 35 | 36 | def new_with_params(self, param_groups:Collection[Collection[nn.Parameter]]): 37 | "Create a new `OptimWrapper` from `self` with another `layer_groups` but the same hyper-parameters." 
38 | opt_func = getattr(self, 'opt_func', self.opt.__class__) 39 | opt = opt_func([{'params': p, 'lr':0} for p in param_groups]) 40 | opt = self.__class__(opt, wd=self.wd, true_wd=self.true_wd, bn_wd=self.bn_wd) 41 | opt.lr,opt.opt_func,opt.mom,opt.beta = self.lr,opt_func,self.mom,self.beta 42 | return opt 43 | 44 | def __repr__(self)->str: 45 | return f'OptimWrapper over {repr(self.opt)}.\nTrue weight decay: {self.true_wd}' 46 | 47 | #Pytorch optimizer methods 48 | def step(self,closure=None)->None: 49 | "Set weight decay and step optimizer." 50 | # weight decay outside of optimizer step (AdamW) 51 | if self.true_wd: 52 | for lr,wd,pg1,pg2 in zip(self._lr,self._wd,self.opt.param_groups[::2],self.opt.param_groups[1::2]): 53 | for p in pg1['params']: p.data.mul_(1 - wd*lr) 54 | if self.bn_wd: 55 | for p in pg2['params']: p.data.mul_(1 - wd*lr) 56 | self.set_val('weight_decay', listify(0, self._wd)) 57 | self.opt.step(closure) 58 | 59 | def zero_grad(self)->None: 60 | "Clear optimizer gradients." 61 | self.opt.zero_grad() 62 | 63 | #Passthrough to the inner opt. 64 | def __getattr__(self, k:str)->Any: return getattr(self.opt, k, None) 65 | def __setstate__(self,data:Any): self.__dict__.update(data) 66 | 67 | def clear(self): 68 | "Reset the state of the inner optimizer." 
69 | sd = self.state_dict() 70 | sd['state'] = {} 71 | self.load_state_dict(sd) 72 | 73 | @property 74 | def n_params(self): return sum([len(pg['params']) for pg in self.opt.param_groups]) 75 | 76 | #Hyperparameters as properties 77 | @property 78 | def lr(self)->float: return self._lr[-1] 79 | @lr.setter 80 | def lr(self, val:float)->None: 81 | self._lr = self.set_val('lr', listify(val, self._lr)) 82 | 83 | @property 84 | def mom(self)->float:return self._mom[-1] 85 | @mom.setter 86 | def mom(self, val:float)->None: 87 | if 'momentum' in self.opt_keys: self.set_val('momentum', listify(val, self._mom)) 88 | elif 'betas' in self.opt_keys: self.set_val('betas', (listify(val, self._mom), self._beta)) 89 | self._mom = listify(val, self._mom) 90 | 91 | @property 92 | def beta(self)->float: return None if self._beta is None else self._beta[-1] 93 | @beta.setter 94 | def beta(self, val:float)->None: 95 | "Set beta (or alpha as makes sense for given optimizer)." 96 | if val is None: return 97 | if 'betas' in self.opt_keys: self.set_val('betas', (self._mom, listify(val, self._beta))) 98 | elif 'alpha' in self.opt_keys: self.set_val('alpha', listify(val, self._beta)) 99 | self._beta = listify(val, self._beta) 100 | 101 | @property 102 | def wd(self)->float: return self._wd[-1] 103 | @wd.setter 104 | def wd(self, val:float)->None: 105 | "Set weight decay." 106 | if not self.true_wd: self.set_val('weight_decay', listify(val, self._wd), bn_groups=self.bn_wd) 107 | self._wd = listify(val, self._wd) 108 | 109 | #Helper functions 110 | def read_defaults(self)->None: 111 | "Read the values inside the optimizer for the hyper-parameters." 
112 | self._beta = None 113 | if 'lr' in self.opt_keys: self._lr = self.read_val('lr') 114 | if 'momentum' in self.opt_keys: self._mom = self.read_val('momentum') 115 | if 'alpha' in self.opt_keys: self._beta = self.read_val('alpha') 116 | if 'betas' in self.opt_keys: self._mom,self._beta = self.read_val('betas') 117 | if 'weight_decay' in self.opt_keys: self._wd = self.read_val('weight_decay') 118 | reserved_names = ['params', 'lr', 'momentum', 'alpha', 'betas', 'weight_decay'] 119 | stat_names = [n for n in self.opt_keys if n not in reserved_names] 120 | self._stats = {n:self.read_val(n) for n in stat_names} 121 | 122 | def get_stat(self, name:str)->float: 123 | if name in ['lr', 'mom', 'beta', 'wd']: return getattr(self, name) 124 | else: return self._stats[name][-1] 125 | def set_stat(self, name:str, value:Union[float, Collection[float]])->None: 126 | if name in ['lr', 'mom', 'beta', 'wd']: setattr(self, name, value) 127 | else: 128 | val = listify(value, self._stats[name]) 129 | self.set_val(name, val) 130 | self._stats[name] = val 131 | 132 | def set_val(self, key:str, val:Any, bn_groups:bool=True)->Any: 133 | "Set `val` inside the optimizer dictionary at `key`." 134 | if is_tuple(val): val = [(v1,v2) for v1,v2 in zip(*val)] 135 | for v,pg1,pg2 in zip(val,self.opt.param_groups[::2],self.opt.param_groups[1::2]): 136 | pg1[key] = v 137 | if bn_groups: pg2[key] = v 138 | return val 139 | 140 | def read_val(self, key:str) -> Union[List[float],Tuple[List[float],List[float]]]: 141 | "Read a hyperparameter `key` in the optimizer dictionary." 142 | val = [pg[key] for pg in self.opt.param_groups[::2]] 143 | if is_tuple(val[0]): val = [o[0] for o in val], [o[1] for o in val] 144 | return val 145 | 146 | def get_state(self): 147 | "Return the inner state minus the layer groups." 
148 | return {'opt_state':self.opt.state_dict(), 'lr':self._lr, 'wd':self._wd, 'beta':self._beta, 'mom':self._mom, 149 | 'opt_func':self.opt_func, 'true_wd':self.true_wd, 'bn_wd':self.bn_wd} 150 | 151 | @classmethod 152 | def load_with_state_and_layer_group(cls, state:dict, layer_groups:Collection[nn.Module]): 153 | res = cls.create(state['opt_func'], state['lr'], layer_groups, wd=state['wd'], true_wd=state['true_wd'], 154 | bn_wd=state['bn_wd']) 155 | res._mom,res._beta = state['mom'],state['beta'] 156 | res.load_state_dict(state['opt_state']) 157 | return res 158 | 159 | class Callback(): 160 | "Base class for callbacks that want to record values, dynamically change learner params, etc." 161 | _order=0 162 | def on_train_begin(self, **kwargs:Any)->None: 163 | "To initialize constants in the callback." 164 | pass 165 | def on_epoch_begin(self, **kwargs:Any)->None: 166 | "At the beginning of each epoch." 167 | pass 168 | def on_batch_begin(self, **kwargs:Any)->None: 169 | "Set HP before the output and loss are computed." 170 | pass 171 | def on_loss_begin(self, **kwargs:Any)->None: 172 | "Called after forward pass but before loss has been computed." 173 | pass 174 | def on_backward_begin(self, **kwargs:Any)->None: 175 | "Called after the forward pass and the loss has been computed, but before backprop." 176 | pass 177 | def on_backward_end(self, **kwargs:Any)->None: 178 | "Called after backprop but before optimizer step. Useful for true weight decay in AdamW." 179 | pass 180 | def on_step_end(self, **kwargs:Any)->None: 181 | "Called after the step of the optimizer but before the gradients are zeroed." 182 | pass 183 | def on_batch_end(self, **kwargs:Any)->None: 184 | "Called at the end of the batch." 185 | pass 186 | def on_epoch_end(self, **kwargs:Any)->None: 187 | "Called at the end of an epoch." 188 | pass 189 | def on_train_end(self, **kwargs:Any)->None: 190 | "Useful for cleaning up things and saving files/models." 
        pass  # (closes Callback.on_train_end, whose def/docstring precede this span)
    def jump_to_epoch(self, epoch)->None:
        "To resume training at `epoch` directly."
        pass

    def get_state(self, minimal:bool=True):
        "Return the inner state of the `Callback`, `minimal` or not."
        # `exclude` / `not_min` are optional per-subclass attribute lists naming
        # fields that should never (resp. only in the non-minimal state) be kept.
        to_remove = ['exclude', 'not_min'] + getattr(self, 'exclude', []).copy()
        if minimal: to_remove += getattr(self, 'not_min', []).copy()
        return {k:v for k,v in self.__dict__.items() if k not in to_remove}

    def __repr__(self):
        # Render the class name plus the constructor arguments that are not excluded.
        attrs = func_args(self.__init__)
        to_remove = getattr(self, 'exclude', [])
        list_repr = [self.__class__.__name__] + [f'{k}: {getattr(self, k)}' for k in attrs if k != 'self' and k not in to_remove]
        return '\n'.join(list_repr)

class SmoothenValue():
    "Create a smooth moving average for a value (loss, etc) using `beta`."
    def __init__(self, beta:float):
        self.beta,self.n,self.mov_avg = beta,0,0

    def add_value(self, val:float)->None:
        "Add `val` to calculate updated smoothed value."
        self.n += 1
        self.mov_avg = self.beta * self.mov_avg + (1 - self.beta) * val
        # Bias-corrected EMA (same debiasing as Adam) so early values are not
        # dragged toward the zero initialisation.
        self.smooth = self.mov_avg / (1 - self.beta ** self.n)

CallbackList = Collection[Callback]

def _get_init_state(): return {'epoch':0, 'iteration':0, 'num_batch':0, 'skip_validate': False}

@dataclass
class CallbackHandler():
    "Manage all of the registered `callbacks` and `metrics`, smoothing loss by momentum `beta`."
    callbacks:CallbackList=None   # user callbacks, sorted by `_order`
    metrics:CallbackList=None     # metric functions or Callback instances
    beta:float=0.98               # EMA momentum for the smoothed loss

    def __post_init__(self)->None:
        "Initialize smoother and learning stats."
        self.callbacks = ifnone(self.callbacks, [])
        self.metrics = ifnone(self.metrics, [])
        # Plain metric functions are wrapped so they share the Callback API.
        self.metrics = [(met if isinstance(met, Callback) else AverageMetric(met)) for met in self.metrics]
        self.callbacks = sorted(self.callbacks, key=lambda o: getattr(o, '_order', 0))
        self.smoothener = SmoothenValue(self.beta)
        self.state_dict:Dict[str,Union[int,float,Tensor]]=_get_init_state()

    def _call_and_update(self, cb, cb_name, **kwargs)->None:
        "Call `cb_name` on `cb` and update the inner state."
        # A callback may return a dict of state updates; only keys already
        # present in `state_dict` are legal, anything else is a bug.
        new = ifnone(getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs), dict())
        for k,v in new.items():
            if k not in self.state_dict:
                raise Exception(f"{k} isn't a valid key in the state of the callbacks.")
            else: self.state_dict[k] = v

    def __call__(self, cb_name, call_mets=True, **kwargs)->None:
        "Call through to all of the `CallbackHandler` functions."
        # Metrics run first (when requested), then the regular callbacks in order.
        if call_mets:
            for met in self.metrics: self._call_and_update(met, cb_name, **kwargs)
        for cb in self.callbacks: self._call_and_update(cb, cb_name, **kwargs)

    def set_dl(self, dl:DataLoader):
        "Set the current `dl` used."
        # Swap out any dataset-callback registered for the previous dataloader.
        if hasattr(self, 'cb_dl'): self.callbacks.remove(self.cb_dl)
        if isinstance(dl.dataset, Callback):
            self.callbacks.append(dl.dataset)
            self.cb_dl = dl.dataset

    def on_train_begin(self, epochs:int, pbar:PBar, metrics:MetricFuncList)->None:
        "About to start learning."
        # (body of CallbackHandler.on_train_begin, whose def precedes this span)
        self.state_dict = _get_init_state()
        self.state_dict.update(dict(n_epochs=epochs, pbar=pbar, metrics=metrics))
        names = [(met.name if hasattr(met, 'name') else camel2snake(met.__class__.__name__)) for met in self.metrics]
        self('train_begin', metrics_names=names)
        # If a callback moved `epoch` forward (resumed run), shrink the progress
        # bar accordingly and let every callback fast-forward to that epoch.
        if self.state_dict['epoch'] != 0:
            self.state_dict['pbar'].first_bar.total -= self.state_dict['epoch']
            for cb in self.callbacks: cb.jump_to_epoch(self.state_dict['epoch'])

    def on_epoch_begin(self)->None:
        "Handle new epoch."
        self.state_dict['num_batch'],self.state_dict['stop_training'] = 0,False
        self('epoch_begin')

    def on_batch_begin(self, xb:Tensor, yb:Tensor, train:bool=True)->Tuple[Any,Any]:
        "Handle new batch `xb`,`yb` in `train` or validation."
        # Reset the per-batch skip flags; callbacks may flip them to short-circuit
        # the backward pass, the optimizer step or the gradient zeroing.
        self.state_dict.update(dict(last_input=xb, last_target=yb, train=train,
            stop_epoch=False, skip_step=False, skip_zero=False, skip_bwd=False))
        # Metrics only run during validation (call_mets is False while training).
        self('batch_begin', call_mets = not self.state_dict['train'])
        return self.state_dict['last_input'], self.state_dict['last_target']

    def on_loss_begin(self, out:Tensor)->Any:
        "Handle start of loss calculation with model output `out`."
        self.state_dict['last_output'] = out
        self('loss_begin', call_mets=False)
        return self.state_dict['last_output']

    def on_backward_begin(self, loss:Tensor)->Tuple[Any,Any]:
        "Handle gradient calculation on `loss`."
        # Track the EMA-smoothed loss on CPU so logging never holds GPU graphs.
        self.smoothener.add_value(loss.float().detach().cpu())
        self.state_dict['last_loss'], self.state_dict['smooth_loss'] = loss, self.smoothener.smooth
        self('backward_begin', call_mets=False)
        return self.state_dict['last_loss'], self.state_dict['skip_bwd']

    def on_backward_end(self)->Any:
        "Handle end of gradient calculation."
        self('backward_end', call_mets=False)
        return self.state_dict['skip_step']

    def on_step_end(self)->Any:
        "Handle end of optimization step."
        self('step_end', call_mets=False)
        return self.state_dict['skip_zero']

    def on_batch_end(self, loss:Tensor)->Any:
        "Handle end of processing one batch with `loss`."
        self.state_dict['last_loss'] = loss
        self('batch_end', call_mets = not self.state_dict['train'])
        # Iteration/batch counters only advance during training batches.
        if self.state_dict['train']:
            self.state_dict['iteration'] += 1
            self.state_dict['num_batch'] += 1
        return self.state_dict['stop_epoch']

    def on_epoch_end(self, val_loss:Tensor)->bool:
        "Epoch is done, process `val_loss`."
        self.state_dict['last_metrics'] = [val_loss] if val_loss is not None else [None]
        # Metric callbacks only contribute when validation actually ran.
        self('epoch_end', call_mets = val_loss is not None)
        self.state_dict['epoch'] += 1
        return self.state_dict['stop_training']

    def on_train_end(self, exception:Union[bool,Exception])->None:
        "Handle end of training, `exception` is an `Exception` or False if no exceptions during training."
        self('train_end', exception=exception)

    @property
    def skip_validate(self): return self.state_dict['skip_validate']

class AverageMetric(Callback):
    "Wrap a `func` in a callback for metrics computation."
    def __init__(self, func):
        # If func has a __name__ use this one else it should be a partial
        name = func.__name__ if hasattr(func, '__name__') else func.func.__name__
        self.func, self.name = func, name
        self.world = num_distrib()  # number of distributed processes (0 = not distributed)

    def on_epoch_begin(self, **kwargs):
        "Set the inner value to 0."
        self.val, self.count = 0.,0

    def on_batch_end(self, last_output, last_target, **kwargs):
        "Update metric computation with `last_output` and `last_target`."
        # (body of AverageMetric.on_batch_end, whose def precedes this span)
        if not is_listy(last_target): last_target=[last_target]
        # Weight by batch size so the epoch average stays exact even when the
        # final batch is smaller than the rest.
        self.count += first_el(last_target).size(0)
        val = self.func(last_output, *last_target)
        if self.world:
            # Distributed run: average the metric value across all processes.
            val = val.clone()
            dist.all_reduce(val, op=dist.ReduceOp.SUM)
            val /= self.world
        self.val += first_el(last_target).size(0) * val.detach().cpu()

    def on_epoch_end(self, last_metrics, **kwargs):
        "Set the final result in `last_metrics`."
        return add_metrics(last_metrics, self.val/self.count)

# Annealing schedules: each maps (start, end, pct in [0, 1]) -> value.

def annealing_no(start:Number, end:Number, pct:float)->Number:
    "No annealing, always return `start`."
    return start

def annealing_linear(start:Number, end:Number, pct:float)->Number:
    "Linearly anneal from `start` to `end` as pct goes from 0.0 to 1.0."
    return start + pct * (end-start)

def annealing_exp(start:Number, end:Number, pct:float)->Number:
    "Exponentially anneal from `start` to `end` as pct goes from 0.0 to 1.0."
    return start * (end/start) ** pct

def annealing_cos(start:Number, end:Number, pct:float)->Number:
    "Cosine anneal from `start` to `end` as pct goes from 0.0 to 1.0."
    # cos_out goes 2 -> 0 as pct goes 0 -> 1, giving a smooth half-cosine.
    cos_out = np.cos(np.pi * pct) + 1
    return end + (start-end)/2 * cos_out

def do_annealing_poly(start:Number, end:Number, pct:float, degree:Number)->Number:
    "Helper function for `annealing_poly`."
    return end + (start-end) * (1-pct)**degree

def annealing_poly(degree:Number)->Number:
    "Anneal polynomically from `start` to `end` as pct goes from 0.0 to 1.0."
    # Returns a partial with the usual (start, end, pct) annealing signature.
    return functools.partial(do_annealing_poly, degree=degree)

class Scheduler():
    "Used to \"step\" from start,end (`vals`) over `n_iter` iterations on a schedule defined by `func`"
    def __init__(self, vals:StartOptEnd, n_iter:int, func:Optional[AnnealFunc]=None):
        # A scalar `vals` means a constant schedule; a (start, end) pair
        # defaults to linear annealing unless `func` is given.
        self.start,self.end = (vals[0],vals[1]) if is_tuple(vals) else (vals,0)
        self.n_iter = max(1,n_iter)  # guard against n_iter == 0 (division below)
        if func is None: self.func = annealing_linear if is_tuple(vals) else annealing_no
        else: self.func = func
        self.n = 0

    def restart(self): self.n = 0

    def step(self)->Number:
        "Return next value along annealed schedule."
        self.n += 1
        return self.func(self.start, self.end, self.n/self.n_iter)

    @property
    def is_done(self)->bool:
        "Return `True` if schedule completed."
        return self.n >= self.n_iter

# --------------------------------------------------------------- sls/sls.py:

import torch
import copy
import time

import sls_utils as ut

class Sls(torch.optim.Optimizer):
    """Implements stochastic line search
    `paper `_.
    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        n_batches_per_epoch (int, recommended): the number of batches in an epoch
        init_step_size (float, optional): initial step size (default: 1)
        c (float, optional): armijo condition constant (default: 0.1)
        beta_b (float, optional): multiplicative factor for decreasing the step-size (default: 0.9)
        gamma (float, optional): factor used by Armijo for scaling the step-size at each line-search step (default: 2.0)
        beta_f (float, optional): factor used by Goldstein for scaling the step-size at each line-search step (default: 2.0)
        reset_option (float, optional): sets the reset option strategy (default: 1)
        eta_max (float, optional): an upper bound used by Goldstein on the step size (default: 10)
        bound_step_size (bool, optional): a flag used by Goldstein for whether to bound the step-size (default: True)
        line_search_fn (float, optional): the condition used by the line-search to find the
            step-size (default: Armijo)
    """

    def __init__(self,
                 params,
                 n_batches_per_epoch=500,
                 init_step_size=1,
                 c=0.1,
                 beta_b=0.9,
                 gamma=2.0,
                 beta_f=2.0,
                 reset_option=1,
                 eta_max=10,
                 bound_step_size=True,
                 line_search_fn="armijo"):
        # All hyper-parameters are stored per param-group via the base class.
        defaults = dict(n_batches_per_epoch=n_batches_per_epoch,
                        init_step_size=init_step_size,
                        c=c,
                        beta_b=beta_b,
                        gamma=gamma,
                        beta_f=beta_f,
                        reset_option=reset_option,
                        eta_max=eta_max,
                        bound_step_size=bound_step_size,
                        line_search_fn=line_search_fn)
        super().__init__(params, defaults)

        # Global (not per-group) bookkeeping lives in self.state.
        self.state['step'] = 0
        self.state['step_size'] = init_step_size

        # Counters of forward / backward evaluations (line search cost).
        self.state['n_forwards'] = 0
        self.state['n_backwards'] = 0

    def step(self, closure):
        """Perform one SLS step. `closure` must re-evaluate the loss
        (without calling backward) so the line search can re-query it."""
        # deterministic closure
        # NOTE(review): `seed` and `closure_deterministic` are currently unused
        # because the RNG-seeding context manager is commented out below; the
        # loss evaluations therefore are not guaranteed to be deterministic
        # (e.g. with dropout active) — confirm this is intended.
        seed = time.time()
        def closure_deterministic():
            #with ut.random_seed_torch(int(seed)):
            return closure()

        batch_step_size = self.state['step_size']

        # get loss and compute gradients
        loss = closure() #_deterministic()
        loss.backward()

        # increment # forward-backward calls
        self.state['n_forwards'] += 1
        self.state['n_backwards'] += 1

        # loop over parameter groups
        # NOTE(review): with more than one param group, only the last group's
        # step_size survives into self.state['step_size'] below.
        for group in self.param_groups:
            params = group["params"]

            # save the current parameters:
            params_current = copy.deepcopy(params)
            grad_current = ut.get_grad_list(params)

            grad_norm = ut.compute_grad_norm(grad_current)

            # Warm-restart the step size for this batch (see reset_option).
            step_size = ut.reset_step(step_size=batch_step_size,
                                      n_batches_per_epoch=group['n_batches_per_epoch'],
                                      gamma=group['gamma'],
                                      reset_option=group['reset_option'],
                                      init_step_size=group['init_step_size'])

            # only do the check if the gradient norm is big enough
            with torch.no_grad():
                if grad_norm >= 1e-8:
                    # check if condition is satisfied
                    found = 0
                    step_size_old = step_size

                    # Backtracking loop, capped at 100 trial steps.
                    for e in range(100):
                        # try a prospective step
                        ut.try_sgd_update(params, step_size, params_current, grad_current)

                        # compute the loss at the next step; no need to compute gradients.
                        loss_next = closure() #closure_deterministic()
                        self.state['n_forwards'] += 1

                        # =================================================
                        # Line search
                        if group['line_search_fn'] == "armijo":
                            armijo_results = ut.check_armijo_conditions(step_size=step_size,
                                                                        step_size_old=step_size_old,
                                                                        loss=loss,
                                                                        grad_norm=grad_norm,
                                                                        loss_next=loss_next,
                                                                        c=group['c'],
                                                                        beta_b=group['beta_b'])
                            found, step_size, step_size_old = armijo_results
                            if found == 1:
                                break

                        elif group['line_search_fn'] == "goldstein":
                            goldstein_results = ut.check_goldstein_conditions(step_size=step_size,
                                                                              loss=loss,
                                                                              grad_norm=grad_norm,
                                                                              loss_next=loss_next,
                                                                              c=group['c'],
                                                                              beta_b=group['beta_b'],
                                                                              beta_f=group['beta_f'],
                                                                              bound_step_size=group['bound_step_size'],
                                                                              eta_max=group['eta_max'])

                            found = goldstein_results["found"]
                            step_size = goldstein_results["step_size"]

                            # found == 3 means both Goldstein conditions hold.
                            if found == 3:
                                break

                    # if line search exceeds max_epochs
                    if found == 0:
                        print("line search attempts exceeded...using defaults")
                        ut.try_sgd_update(params, 1e-6, params_current, grad_current)

        # save the new step-size
        self.state['step_size'] = step_size
        self.state['step'] += 1

        return loss

# --------------------------------------------------------- sls/sls_utils.py:

import torch
import torch.cuda

import numpy as np
#import contextlib


def check_armijo_conditions(step_size, step_size_old, loss, grad_norm,
                            loss_next, c, beta_b):
    "Armijo sufficient-decrease test; shrinks the step-size on failure."
    found = 0

    # computing the new break condition
    # Armijo: accept when loss_next <= loss - step_size * c * ||grad||^2.
    break_condition = loss_next - \
        (loss - (step_size) * c * grad_norm**2)

    if (break_condition <= 0):
        found = 1

    else:
        # decrease the step-size by a
multiplicative factor 21 | step_size = step_size * beta_b 22 | 23 | return found, step_size, step_size_old 24 | 25 | def check_goldstein_conditions(step_size, loss, grad_norm, 26 | loss_next, 27 | c, beta_b, beta_f, bound_step_size, eta_max): 28 | found = 0 29 | if(loss_next <= (loss - (step_size) * c * grad_norm ** 2)): 30 | found = 1 31 | 32 | if(loss_next >= (loss - (step_size) * (1 - c) * grad_norm ** 2)): 33 | if found == 1: 34 | found = 3 # both conditions are satisfied 35 | else: 36 | found = 2 # only the curvature condition is satisfied 37 | 38 | if (found == 0): 39 | raise ValueError('Error') 40 | 41 | elif (found == 1): 42 | # step-size might be too small 43 | step_size = step_size * beta_f 44 | if bound_step_size: 45 | step_size = min(step_size, eta_max) 46 | 47 | elif (found == 2): 48 | # step-size might be too large 49 | step_size = max(step_size * beta_b, 1e-8) 50 | 51 | return {"found":found, "step_size":step_size} 52 | 53 | 54 | def reset_step(step_size, n_batches_per_epoch=None, gamma=None, reset_option=1, 55 | init_step_size=None): 56 | if reset_option == 0: 57 | pass 58 | 59 | elif reset_option == 1: 60 | step_size = step_size * gamma**(1. / n_batches_per_epoch) 61 | 62 | elif reset_option == 2: 63 | step_size = init_step_size 64 | 65 | return step_size 66 | 67 | def try_sgd_update(params, step_size, params_current, grad_current): 68 | zipped = zip(params, params_current, grad_current) 69 | 70 | for p_next, p_current, g_current in zipped: 71 | p_next.data = p_current - step_size * g_current 72 | 73 | def compute_grad_norm(grad_list): 74 | grad_norm = 0. 
75 | for g in grad_list: 76 | if g is None: 77 | continue 78 | grad_norm += torch.sum(torch.mul(g, g)) 79 | grad_norm = torch.sqrt(grad_norm) 80 | return grad_norm 81 | 82 | 83 | def get_grad_list(params): 84 | return [p.grad for p in params] 85 | 86 | #@contextlib.contextmanager 87 | def random_seed(seed): 88 | state = np.random.get_state() 89 | np.random.seed(seed) 90 | try: 91 | yield 92 | finally: 93 | np.random.set_state(state) 94 | 95 | #@contextlib.contextmanager 96 | def random_seed_torch(seed, device=0): 97 | cpu_rng_state = torch.get_rng_state() 98 | gpu_rng_state = torch.cuda.get_rng_state(0) 99 | 100 | np.random.seed(seed) 101 | torch.manual_seed(seed) 102 | torch.cuda.manual_seed_all(seed) 103 | 104 | try: 105 | yield 106 | finally: 107 | torch.set_rng_state(cpu_rng_state) 108 | torch.cuda.set_rng_state(gpu_rng_state, device) --------------------------------------------------------------------------------