├── .gitignore
├── DeepMemory
│   ├── DeepMemory-Playground.ipynb
│   ├── README.md
│   └── deepmemory.py
├── LICENSE
├── README.md
├── Ranger
│   └── ranger.py
├── adahessian
│   ├── README.md
│   └── adahessian.py
├── adamod
│   ├── README.md
│   ├── adamod.py
│   └── diffmod.py
├── diffgrad
│   ├── README.md
│   ├── diff_rgrad.py
│   ├── diffgrad-playground.ipynb
│   ├── diffgrad.py
│   └── mxresnet.py
├── diffmod
│   ├── diffmod-playground.ipynb
│   └── diffmod.py
├── images
│   ├── 1120-optimizer-testing.jpg
│   ├── projected_gradient.png
│   ├── ranger-init.jpg
│   └── ranger-with-gc-options.jpg
├── madgrad
│   └── madgrad_wd.py
└── sls
    ├── README.md
    ├── basic_train.py
    ├── callback.py
    ├── sls.py
    └── sls_utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98 | __pypackages__/
99 |
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 |
104 | # SageMath parsed files
105 | *.sage.py
106 |
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 |
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 |
120 | # Rope project settings
121 | .ropeproject
122 |
123 | # mkdocs documentation
124 | /site
125 |
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 |
131 | # Pyre type checker
132 | .pyre/
133 |
134 | # pytype static type analyzer
135 | .pytype/
136 |
137 | # Cython debug symbols
138 | cython_debug/
139 |
--------------------------------------------------------------------------------
/DeepMemory/DeepMemory-Playground.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "%load_ext autoreload\n",
10 | "%autoreload 2\n",
11 | "\n",
12 | "%matplotlib inline"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 2,
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "from fastai.script import *\n",
22 | "from fastai.vision import *\n",
23 | "from fastai.callbacks import *\n",
24 | "from fastai.distributed import *\n",
25 | "from fastai.callbacks.tracker import *\n",
26 | "\n",
27 | "torch.backends.cudnn.benchmark = True\n",
28 | "\n",
29 | "import time"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "metadata": {},
36 | "outputs": [
37 | {
38 | "data": {
39 | "text/plain": [
40 | "'1.0.57'"
41 | ]
42 | },
43 | "execution_count": 3,
44 | "metadata": {},
45 | "output_type": "execute_result"
46 | }
47 | ],
48 | "source": [
49 | "import fastai;fastai.__version__ #safety check"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 4,
55 | "metadata": {},
56 | "outputs": [
57 | {
58 | "data": {
59 | "text/plain": [
60 | "'1.2.0'"
61 | ]
62 | },
63 | "execution_count": 4,
64 | "metadata": {},
65 | "output_type": "execute_result"
66 | }
67 | ],
68 | "source": [
69 | "import torch; torch.__version__ #safety check"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 5,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "from deepmemory import DeepMemory"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 6,
84 | "metadata": {},
85 | "outputs": [
86 | {
87 | "name": "stdout",
88 | "output_type": "stream",
89 | "text": [
90 | "Mish activation loaded...\n"
91 | ]
92 | }
93 | ],
94 | "source": [
95 | "from mxresnet import *"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 7,
101 | "metadata": {},
102 | "outputs": [
103 | {
104 | "data": {
105 | "text/plain": [
106 | "PosixPath('/home/ubuntu/.fastai/data/imagenette-160')"
107 | ]
108 | },
109 | "execution_count": 7,
110 | "metadata": {},
111 | "output_type": "execute_result"
112 | }
113 | ],
114 | "source": [
115 | "path = untar_data(URLs.IMAGENETTE_160); path #optional - IMAGENETTE"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 8,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "def flattenAnneal(learn:Learner, lr:float, n_epochs:int, start_pct:float):\n",
125 | " n = len(learn.data.train_dl)\n",
126 | " anneal_start = int(n*n_epochs*start_pct)\n",
127 | " anneal_end = int(n*n_epochs) - anneal_start\n",
128 | " phases = [TrainingPhase(anneal_start).schedule_hp('lr', lr),\n",
129 | " TrainingPhase(anneal_end).schedule_hp('lr', lr, anneal=annealing_cos)]\n",
130 | " sched = GeneralScheduler(learn, phases)\n",
131 | " learn.callbacks.append(sched)\n",
132 | " learn.fit(n_epochs)\n"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 9,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "tfms = ([\n",
142 | "\n",
143 | " flip_lr(p=0.5)#,\n",
144 | " #brightness(change=(0.4,0.6)),\n",
145 | " #contrast(scale=(0.7,1.3)),\n",
146 | " #cutout(n_holes=(2,40),length=(5,30),p=.25)\n",
147 | "\n",
148 | " ], [])"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 10,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "bs=64\n",
158 | "size=128"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 11,
164 | "metadata": {},
165 | "outputs": [],
166 | "source": [
167 | "data = (ImageList.from_folder(path)\n",
168 | " .split_by_folder(valid='val')\n",
169 | " .label_from_folder()\n",
170 | " .transform(tfms=tfms,size=size) \n",
171 | " .databunch(bs=bs, num_workers=8) #windows 10 users - num_workers may need to be set to 1 or 0 (if you get pickle fork error)\n",
172 | " .presize(size, scale=(0.5, 1))\n",
173 | " .normalize(imagenet_stats))"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": 12,
179 | "metadata": {},
180 | "outputs": [
181 | {
182 | "data": {
183 | "text/plain": [
184 | "201"
185 | ]
186 | },
187 | "execution_count": 12,
188 | "metadata": {},
189 | "output_type": "execute_result"
190 | }
191 | ],
192 | "source": [
193 | "memory_size = (len(data.x)//bs);memory_size #should be equal to or close to # of batches per epoch in order to build an average step size for the dataset"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 13,
199 | "metadata": {},
200 | "outputs": [],
201 | "source": [
202 | "optar = partial(DeepMemory,betas=(.95,.999),len_memory = memory_size)"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": 14,
208 | "metadata": {},
209 | "outputs": [],
210 | "source": [
211 | "model = mxresnet50(sa=1)"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 15,
217 | "metadata": {},
218 | "outputs": [],
219 | "source": [
220 | "learn = Learner(data, model, metrics=[accuracy], wd=1e-3,\n",
221 | " opt_func=optar,\n",
222 | " bn_wd=False, true_wd=True,\n",
223 | " loss_func = LabelSmoothingCrossEntropy())"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": 16,
229 | "metadata": {},
230 | "outputs": [],
231 | "source": [
232 | "learn.callback_fns += [\n",
233 | " partial(ShowGraph),\n",
234 | " #partial(SaveModelCallback, name='model-novotest-1')\n",
235 | " ]"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": 17,
241 | "metadata": {},
242 | "outputs": [
243 | {
244 | "name": "stdout",
245 | "output_type": "stream",
246 | "text": [
247 | "DeepMemory: length of memory is 201 - this should be close or equal to batches per epoch\n"
248 | ]
249 | },
250 | {
251 | "data": {
252 | "text/html": [
253 |       "epoch | train_loss | valid_loss | accuracy | time\n",
254 |       "0 | 2.571196 | 2.547925 | 0.432000 | 00:41\n",
255 |       "1 | 2.228164 | 2.119214 | 0.604000 | 00:38\n",
256 |       "2 | 2.080257 | 1.947510 | 0.666000 | 00:37\n",
257 |       "3 | 1.979283 | 2.059222 | 0.636000 | 00:37\n",
258 |       "4 | 1.887242 | 1.752053 | 0.742000 | 00:37\n",
259 |       "5 | 1.838371 | 1.779305 | 0.732000 | 00:37\n",
260 |       "6 | 1.801879 | 1.786606 | 0.728000 | 00:37\n",
261 |       "7 | 1.728110 | 1.704001 | 0.752000 | 00:37\n",
262 |       "8 | 1.699986 | 1.714347 | 0.758000 | 00:37\n",
263 |       "9 | 1.681988 | 1.630025 | 0.784000 | 00:37\n",
264 |       "10 | 1.625826 | 1.710677 | 0.764000 | 00:37\n",
265 |       "11 | 1.583407 | 1.607164 | 0.814000 | 00:37\n",
266 |       "12 | 1.565465 | 1.577648 | 0.800000 | 00:37\n",
267 |       "13 | 1.565315 | 1.556962 | 0.806000 | 00:37\n",
268 |       "14 | 1.516048 | 1.666469 | 0.780000 | 00:38\n",
269 |       "15 | 1.475736 | 1.541728 | 0.826000 | 00:37\n",
270 |       "16 | 1.439951 | 1.482574 | 0.834000 | 00:37\n",
271 |       "17 | 1.400893 | 1.434333 | 0.858000 | 00:38\n",
272 |       "18 | 1.374915 | 1.411827 | 0.868000 | 00:38\n",
273 |       "19 | 1.330561 | 1.421291 | 0.868000 | 00:37"
406 | ],
407 | "text/plain": [
408 | ""
409 | ]
410 | },
411 | "metadata": {},
412 | "output_type": "display_data"
413 | },
414 | {
415 | "data": {
416 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXUAAAD4CAYAAAATpHZ6AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nO3deXxc5X3v8c9vFi2jxVosy5I3ycbYxgu2EY4bCCGQBTCB9Iam7itpU5Jb2pD2QnpvE0ialKTkXrqn6c1yIaVJCoWAUxoIhIQmGJOyysY23vdFkmVt1r7OzHP/OGNZNtZiW2dGGn/fr9e8dObMmTM/HUtfP3rOc55jzjlERCQ9BFJdgIiIjB+FuohIGlGoi4ikEYW6iEgaUaiLiKSRkB87nVJY5ObPm+vHrkVE0tLGjRubnHMlF7ofX0J9Wvksqqur/di1iEhaMrPD47EfX7pfNPJdRCQ1/OlTV6qLiKSETydKleoiIqngS5+6Il1EzsXAwAA1NTX09vamuhTfZWVlMXPmTMLhsC/79yXURUTORU1NDXl5eVRUVGBmqS7HN845mpubqampobKy0pfPUJ+6iKRcb28vxcXFaR3oAGZGcXGxr3+RaPSLiEwI6R7oJ/n9feqKUhGRNKKWuohc9FpbW/n2t799zu+76aabaG1t9aGi8+dTn7piXUQmj+FCPRaLjfi+5557joKCAr/KOi8a0igiF7177rmH/fv3s3z5csLhMLm5uZSVlbF582Z27NjBRz7yEY4ePUpvby933XUXd9xxBwAVFRVUV1fT2dnJjTfeyNVXX80rr7zCjBkz+MlPfkJ2dnbSvxcNaRSRCeWrz2xnR137uO7zsvJ8/uLDi4d9/YEHHmDbtm1s3ryZ9evXs2bNGrZt2zY47PDhhx+mqKiInp4errzySj760Y9SXFx82j727t3LY489xkMPPcTHPvYxfvzjH/OJT3xiXL+PsVBLXUTkDKtWrTptHPk3v/lNnnrqKQCOHj3K3r173xHqlZWVLF++HIArrriCQ4cOJa3eofxpqSvVReQ8jdSiTpacnJzB5fXr1/Of//mfvPrqq0QiEa699tqzjjPPzMwcXA4Gg/T09CSl1jNpSKOIXPTy8vLo6Og462ttbW0UFhYSiUTYtWsXr732WpKrOzfqfhGRi15xcTFXXXUVS5YsITs7m9LS0sHXbrjhBr773e+ybNkyFixYwOrVq1NY6ejM+TD8cPaCpe7I7rfHfb8ikp527tzJokWLUl1G0pzt+zWzjc65qgvdtz8XH2mcuohISqhPXUQkjSjURUTSiOZ+ERFJI5pPXUQkjailLiKSRsYU6mZWYGbrzGyXme00s9/wuzARkYksNzcXgLq6Om677bazbnPttddSXV2dzLLGfPHRPwLPO+duM7MMIDLSxk5tdRG5SJSXl7Nu3bpUlzFo1FA3s3zgGuD3AZxz/UD/iG9SpovIJPOFL3yBOXPmcOeddwJw3333YWZs2LCBEydOMDAwwP3338+tt9562vsOHTrEzTffzLZt2+jp6eH2229nx44dLFq0KCXzv4ylpT4XaAT+xcwuBzYCdznnuoZuZGZ3AHcAFJTPHe86ReRi8bN7oH6cr0ifvhRufGDETdauXcvdd989GOpPPPEEzz//PJ/73OfIz8+nqamJ1atXc8sttwx7n9HvfOc7RCIRtm7dytatW1m5cuX4fh9jMJY+9RCwEviOc24F0AXcc+ZGzrkHnXNVzrmqrOyscS5TRMRfK1asoKGhgbq6OrZs2UJhYSFlZWV88YtfZNmyZbz//e+ntraW48ePD7uPDRs2DM6hvmzZMpYtW5as8geNpaVeA9Q4515PPF/HWUJdRGRcjNKi9tNtt93GunXrqK+vZ+3atTz66KM0NjayceNGwuEwFRUVZ512d6jhWvHJMmpL3TlXDxw1swWJVdcDO0Z+zzhUJiKSZGvXruXxxx9n3bp13HbbbbS1tTFt2jTC4TAvvvgihw8fHvH911xzDY8++igA27ZtY+vWrcko+zRjHf3yJ8CjiZEvB4Db/StJRCQ1Fi9eTEdHBzNmzKCsrIyPf/zjfPjDH6aqqorly5ezcOHCEd//mc98httvv51ly5axfPlyVq1alaTKT/Fl6t3p8xa7+v3bx32/IpKeNPXuRJ96V2MaRURSQnO/iIikEc39IiITwsVycx2/v0/Npy4iKZeVlUVzc3PaB7tzjubmZrKy/LuWx58bT6f3v4uIjLOZM2dSU1NDY2NjqkvxXVZWFjNnzvRt/76EujpgRORchMNhKisrU11GWvDpxtN+7FVEREajE6UiImlEJ0pFRNKIul9ERNKITy11pbqISCqoT11EJI2o+0VEJI2opS4ikkY0oZeISBrR1LsiImlE49RFRNKITpSKiKQRnSgVEUkj6n4REUkjPnW/qK0uIpIKaqmLiKQR9amLiKSRMd35yMwOAR1ADIg656pG2l69LyIiqXEut7N7n3OuybdKRETkgulEqYhIGhlrqDvgF2a20czuONsGZnaHmVWbWbUiXUQkNcba/XKVc67OzKYBL5jZLufchqEbOOceBB4EyCybr1wXEUmBMbXUnXN1ia8NwFPAqtHeE48r10VEkm3UUDezHDPLO7kMfBDYNtr7ogp1EZGkG0v3SynwlJmd3P7fnHPPj/amuE6Wiogk3aih7pw7AFx+rjtWS11EJPl8myYgplAXEUk6hbqISBpRqIuIpBGFuohIGvEv1DX6RUQk6fwL9ZhCXUQk2dRSFxFJIz72qcf92rWIiAzDx1D3a88iIjIc30I9qpa6iEjS+RbqynQRkeRTS11EJI3411LX6BcRkaTzr6WuceoiIkmnceoiImlEc7+IiKQRhbqISBpRqIuIpBGFuohIGvFxnLpCXUQk2XwL9f6oLj4SEUk230K9Lxrza9ciIjKMMYe6mQXN7C0z++lYtu8dUEtdRCTZzqWlfhewc6wbq6UuIpJ8Ywp1M5sJrAG+N9Yd96mlLiKSdGNtqX8D+DwwbFKb2R1mVm1m1QC9aqmLiCTdqKFuZjcDDc65jSNt55x70DlX5ZyrCpippS4ikgJjaalfBdxiZoeAx4HrzOyRkd5gppa6iEgqjBrqzrl7nXMznXMVwFrgV865T4y4U7XURURSwpdx6mbQp4uPRESSLnQuGzvn1gPrR9sugNE7oO4XEZFkU0tdRCSN+BLqATN6+tVSFxFJNl9CPRgwOvqifuxaRERG4E9LPQCdfQN+7FpEREbgT0vdjM5etdRFRJLNp5a60dkXxTndKENEJJl8a6kPxJxGwIiIJJlvLXWATp0sFRFJKp9a6t7XDvWri4gklb8tdYW6iEhS+danDtChYY0iIknla0td3S8iIsnlS6iHEqHe3Nnvx+5FRGQYPoW6t9vGjj4/di8iIsPwbZbGwkiYxs5eP3YvIiLD8CXUAablZdHQrpa6iEgy+RbqJXmZNHYq1EVEksnfUFefuohIUvnY/ZJJQ0efJvUSEUkiX1vq/dE47Rq
rLiKSNL6GOkBjh0bAiIgki++h3qB+dRGRpPG1Tx10AZKISDKNGupmlmVmb5jZFjPbbmZfHcuOS/KyAIW6iEgyhcawTR9wnXOu08zCwK/N7GfOuddGelN+VoiMUEChLiKSRKOGuvPGJHYmnoYTj1HHKZoZpfmZHD3RfWEViojImI2pT93Mgma2GWgAXnDOvX6Wbe4ws2ozq25sbARgcdkUdh7rGNeCRURkeGMKdedczDm3HJgJrDKzJWfZ5kHnXJVzrqqkpASAS6fncbi5i96B2LgWLSIiZ3dOo1+cc63AeuCGsWy/oDSPuIM9x9VaFxFJhrGMfikxs4LEcjbwfmDXWHa+ck4BANWHTlxAiSIiMlZjaamXAS+a2VbgTbw+9Z+OZedlU7KZWZjNm4daLqRGEREZo1FD3Tm31Tm3wjm3zDm3xDn3tXP5gFUVRbx56MTgxF6vHWhm7r3P8sNXD2myLxGRcebbFaUnVVUU0dTZx+HmbjYdOcHaB18j7uArP9nOzf/0a7r7NeGXiMh48T3UV1UWAvDGoRYe+JnXFf9f91zHb1fNYntdO3c+ukktdhGRceJ7qM8ryaUwEubNgy1sr23j999dwYyCbP7Pf1vKR1fOZP3uRp7cWON3GSIiFwXfQ93MWD23mCc31tDVH2NmYbb3wQHjb25bxrsqi/jLZ3ZwqKnL71JERNKe76EO8IHLSgeXF5Xln/rwgPG3v3U5fbE41/7tev7gh9XJKEdEJG0lJdSvWzhtcPnyWQWnvTarKML3b78SgBd2HOd7Lx9IRkkiImkpKaFeEMnAzFvOzXznHGLvnjeVvV+/kao5hdz/7E6+/B/biMd18lRE5FwlJdQBXr/3ejb82fuGfT0cDPDg71VRYq1kvPkdPvgPLxGNxZNVnohIWhjLfOrjYlp+1qjbFOVk8NL7a4m8/AhXtO7hK09m8pWPriIrHExChSIik1/SWupjFbnuz3Af+EtuCFbzuzv+gA985Qc0tOvm1SIiYzHhQh0z7Kr/QeB3f8zcjDaezvgyn3vgGzxRfTTVlYmITHgTL9RPmncdmXe+RHZROT8MP8Dupx7g/63fl+qqREQmtIkb6gBFc8n6o19hC9fw5fAjTP3l3Sy45ynaugdSXZmIyIQ0sUMdIDOPwG//K33vuYePBl/miYyv8aGvPcY///pgqisTEZlwJn6oAwQCZF5/L6z9NxaF63km88957tmnqLz3WbbXtaW6OhGRCWNyhPpJC9eQ8YcvUlBQxOOZX2dt4Jes+eav+fP/eJvmzr5UVyciknLmx7S3VVVVrrrax3lcek7Auk/D/l/ySPR6vhr9JAOJIfd/vmYRt19VSTBg/n2+iMg4M7ONzrmqC93P5Gqpn5RdCB9/Eq66i0+EfsmG0n9g7WWZANz/7E7WfPNl6ts0tl1ELj6Ts6U+1Nvr4CefhUgx0Y89wpN1U/naMzvoGYgxf1ou/7h2BZeV54++HxGRFBqvlvrkD3WAus3wo09AVyPc8k8cLF/DXY+/xdYa7yTqtQtKmJaXya76DmYVRphRmM2ffuBSTT8gIhOGQv1MnY3w5Cfh8H/Bu/+E3vd+hV/taeZLT73NiWHGtV8+cwqY8cHLSrnz2nmYqR9eRFJDoX42sQF4/l548yGYvgyW/hZcdguvnchj17F2ZhdHWDg9nzcOtnD3jzaf9tb8rBB/+N55fGhxKTMLI2rFi0hSJS3UzWwW8ENgOhAHHnTO/eNI70lZqJ+05XF49VtQv9V7Pn0ZXHYLLLoVSi4d3CwWd7yw4zh//8Ju9hzvPG0XS2dM4fffXcHKOYVUTs0ZvDm2WvMi4odkhnoZUOac22RmecBG4CPOuR3DvSfloX5Sy0HY+QzsfBpq3vTWlSyERbd4IV+6hJN373DO8dzb9WypaWXnsXZe3tt01l1+6qpKttS0AvC7q+fwwcWlRDJCEI9Dy36o3Xjq0XIQVnwcrvk8ZOlkrYgML2XdL2b2E+D/OudeGG6bCRPqQ7XVwq6fwo6n4cgr4OJQWHmqBT9j5WDAA3T2Rak+1MJnHtlEz0DsHbsr4QTLA/u5PLCfFcEDrAgeIBL3bp7tMnKx8hWQNQV2PQs5U+H6v4DlH4fA5BxFKiL+Skmom1kFsAFY4pxrH267CRnqQ3U2egG/8xk4+BLEo5A/AxZ92GvFz14NgSF96r3tcGwz1G6kff/rhI69RaS3HoAoQXbGZ7ElPo8tbh6b45ew35UTJ8ANi6fzqbknWPDW15nStAnKlsONfw2z35Wib1xEJqqkh7qZ5QIvAV93zv37WV6/A7gDYPbs2VccPnz4QmtLjp4TsPt5r4tm3y8h1gc502DhTd6J19qN0LgbSBynorkw44pTj+lLIZzNia5+DjR18s+/Pshzb9ef8SGOWwKvcG/4McqshY75v0ns+vuYUjqHw83dzCjMJhxUC17kYpbUUDezMPBT4OfOub8fbfsJ31IfTl8H7P2F10Wz9wXIiAwJ8JVQvhIiRWPaVX80TmtPP1/+j21EMkLEneMXmw/wmdDT/GHwWWIE+Hb0Fh6KrSErO4c/ft8lzC/N5b2XluhkrMhFKJknSg34AdDinLt7LDudtKE+VDwGFjitn308HG7u4tHnX2ZN/be5vOMljtk0Hsr+FA+3LAWMopwMWrr6ASiMhLlhSRl/ct0lOKCjd4Ce/hgNHX0ca+3hhiVl3P/sDmJxx5LEaJ1X9jdzpKWbW5eXMzU3c1xrFxH/JDPUrwZeBt7GG9II8EXn3HPDvSctQj0ZDm6An90DDdvpLFvNX3E7/3ow7x2bhYPGQOzcrye4/aoKbl5WxhVzxvbXhYikji4+ShexKGz6AfzqfuhtpX/57xG/9ktkTZkGwEt7Gvn7F/aw5Wgrq+cWsagsnynZYY40d5OdEeQ3V8xgwfQ8/vdzuzjS0kVeZhgAh+Pn248DUDk1h/deWsJNS8u4sqLwrN07sbjjx5tqqD7UwvrdjVx9yVSyMoJcWVHIjUvKCAcDxJ3DgJD6/0XGnUI93XS3wPoH4M3vQWYuXPtFuPLTEAyf3/5iA+w+Usfru4/w+u5a9tS3ESJGboZRnB2gu6+flTPzmZ4X4s0DjTS2dxMknnjECBInRJxAYl09hRyIl3OMIlbPLeaLNy1i6Ywp6v8XGScK9XTVsBOevwcOrPculLr6c2BB6O+Avk7o7/S+9rWfWj7bupg/Nw3pcpkccGXsd+Xsj5ez35VTMHsJH7n+PayaXz64XTQWV4te5Bwo1NOZc7D7Z/DzL8KJs9yLNSPXe2Tmea36k8sZud7zzDzIGPJaKMsbdx8I4QJBatsGmJKTReeAo2fAmF2SSyiU4f3nEQgObjv43Aza66BpDwPHd9NXv4uu2p2UxBoImPfzE3fGUVfCIco5SDm7Y2W05VTyensxc2bNZn5pPv/9PZXML82jrWeAUMAwg4Go41e7j/PS7kYG4o68zBAtXf2sWTqdDy4sJjsY94aWxqMQzNCVuZK2FOoXg2gfNO6CcORUYIdzJs5Vqf3dbN+2CZr2smf7JvI7DzLXaimP1pLJqb8UWl
0Oh10pDggTI5R4hIkSstg71xEjZPGzfmQTBfQXzmf6vGUEShbA1EuhZAHklQ2OVIrFHd39UfKyvK6rfQ2dPL/tGCtmF7J8VgE5maHRv7e+Tmje5z2a9kLzXu/riUPeZ5Utg7LLvcf0ZZBdcKFHUy5yCnWZuOJxaK+Bpj301++ms24n8eaD7Gvspjdu9LsgWZlZdEeNqVNyyMjIpCg/h+L8HNr6IC+SRV1HlA37W2noihMlSHZWFtHeTubaMS4J1HJJoI58ugc/sicQoT48i809peyNlbHPlbPPzeAIpUTd2WfcXDgtm/z+43y+KsgVOU1Y8z76ju8mo/UA1lE3uJ3DsIJZUDwfiiq9v1qObYH22lM7K6w4FfJll3tXD+dM9esIp87gHEeboGE7FMyB2b/hdRVOlMbGJKVQl4uGc27whKxzjqe31PHK3ibqjx2Gxj3MitdwidVyidUyL3CMMmsZfG+/C1IXLOOAm0FX/jxOdPUzte8Ic+0YlVZPpp2aa7/NRTjgyr1zBvGyweXDrpQ+Mrjvw5exsCyfyqk5lOZn8dKmHRS276SgbQfUb6W4fRc5XUdOFZ4/44ygv/y0vygmPOegrQbqNnkhXrcJ6rZAn3fzGSwILjEvUlaBN73G7NVeyJevgJCukzgXCnURoHcgRn1bL8GAEckIUpyb6c3V07SXgeM7CbXsw5r2QNNub9ZMM1xhBfGiS3DFl3Aiew6vtBbyd5sckYJS+uOO/micSEaQGQXZrFlWzpsHW/hR9dHTPjcjFKA/+s4uony6WBY6wmI7RFXmERa5g8yIHcUS00z0ZBQRKltKOG9q4pxIHi4zH8vM955neV/j4TxOxDJpJ0IoK5+SogL6oo5IZtC/KSU6G88I8Le8u4kBBMJQuvjUldUzVsLUBdB2FI685k2Sd+Q1aNrjbR/M9K7EPhnys1api2oUCnWRcxXt91rJ5zFMtLW7n689s4PMcIA5xTnUnujhUHMX18wvYSAeZ1FZPrvrO9jf0EkkI8jB5m7ePNhC3DmC0W4W2hGWBA6yxA5xaeAohYEeikJ9ZEQ7yaR/9NJdgE6y6XARBkI55E0pJB7KJpKTS25uPhbO9s69DPkaC2URyIhg4QiEI7hwFrubYzy5pZmW/iDvnd7HB6bUEmnaitW95QU0AOadpzgZ3uUrvUAPZ41+oLqa4OjrcDgR8sc2eye5MW8fJ0N+9mqYMvOc/x3SmUJdZJLoi8ZoaO+jqz9KY0cf33pxH82d/Rxs6iIad4SJsqAQikJ9NDY1kUc3VWUhZkViBAc6CMe6qK0/Ti49TMvoJzjQSS49ZFk/2fSTG+wnNzBAxPoJRHvI4Oy3bxzOUVdKa+ES6nIW0Vq4lN1WSV13iIJImPfML2FOcYTczBD7GzuZW5JLRXGEnoGYdx+B0fR3e5PiHXnVexx9wxt2CzBlNsy60uuXzy/3Qj6/HPJnenMsTZZuqnGiUBeZ5GJxx676dhaU5o15TL9zjhPdA7x2oJkXdzXQ2ef9R9HU2cehZu/EcYA42dbPnDwozoxxrOkE2fQzryDANRU5LJgawvq7qenP4fOvBmnlnVNTjNWc4gj/84ML+NDiUjJDQdp7B4jGHIWRMLG4Ixiw0y9Qi0W9E6yHX8UdeRXqNmHtdYnW/BChrETAz0g8ymHKjCHPZ6Rd8CvUReQ0LV39HGvrYe7UXLIzxn6P3aMt3ext6GBeSS4dvVFmFmaTmxni1/uaCAUC7D7ewdObaynMyWDh9Hx2HmunoaOPncdO3VJhuPmJSvMzWTA9n6DB27XtVBRHCAWNzr4o+xo6uWJOIbcsm85Nc4Pk9jZQf3QfBdEmWuoOEG+rJbu3nvz+BsLdx7Hhgj+vHHKneY+cEsgtPX05pwRCGed9XJNFoS4iKTcQi/PUplq21LTy6OtHmJqbQdWcIoIBo6Gjl70NXldLPO7ICgdp7uonFvcyJyMYoD926mTzmc+HChBnKm2UWTNl1sLs0AlWFfdQMNDIzFAr+bEThHuaCEc7z/p+sgreGfa5Jd69E3JLvbuUZUQS14TkJM5N5EBwDF1M40ShLiKTxpnDUuMOggHDOcfGwyd46OUDFOVkkhE0pmSHmT4lmxWzC+gZiLG9rp3ttW2UTcnmQFMn9W29dPZF2V73zpuvZdJPibVRYm1MpZUFuT3Myeoiq6+ZrL4mpgc7KAu2kxc7QWasa/TCgxlDgj5yKvgHl3NOrXNxrxvp5BXQZy4PPh/wpvYefM17bp99bVxCPXn/DYnIRWtov7qZEbRTy1UVRVRVDD899MrZhWdd39UXJWDGhr2NhAJGXlaY2tZu7v33t8ksqGBXay87gxm0tQ3ggK7+KMU5mZzo9P5ayKKPqdZGCW3kWTerZ2Zx08IpFIaiRAJ9hGO90N8FA93eCd+BLhjo8db1tkJ7Hb3dHQRjPdhAD4FggEAw7E2xEUh8DYaGPA96I69OPg9nD3keAl4bn2OtlrqIXEz6ojH2NXTS2NHHnuMdzCqMsONYO999af9p5wWm5mYQCgToGYiREQowpyhCaX4WB5q6qG/rIRZ3tPee6ucPB42rL5nK9YtKWbO0jMKcc+vHV/eLiMg4qm/rpfpwC4eaunhlfzPg3ZayMCeDbbVtHGvrpXJqDgBLZ0wh5hzxuGN2UYRrLi3h6c11p12klpsZoi8aozgnk4JImJzMEOGgMX9aHqGgcay1l+6BGPG4o7Gjj1/86XsV6iIiyRKPOwKBkYdQdvZF2d/QyfrdjWypaeVISzdFORm8cbCFd1UWcai5i66+GJ19USqn5jAlO0wwYAQDxpN/9G71qYuIJMtogQ5e6/zyWQVcPmvkKRGGnjg+yf7ogsobpGnVRESSzM87hinURUTSiEJdRCSNKNRFRNKIQl1EJI2MGupm9rCZNZjZtmQUJCIi528sLfXvAzf4XIeIiIyDUUPdObcBaBltOxERSb1x61M3szvMrNrMqhsbG8drtyIicg7GLdSdcw8656qcc1UlJSXjtVsRETkHGv0iIpJGFOoiImlkLEMaHwNeBRaYWY2Zfdr/skRE5HyMOkujc+53klGIiIhcOHW/iIikEYW6iEgaUaiLiKQRhbqISBpRqIuIpBGFuohIGlGoi4ikEYW6iEgaUaiLiKQRhbqISBpRqIuIpBGFuohIGlGoi4ikEYW6iEgaUaiLiKQRhbqISBpRqIuIpBGFuohIGlGoi4ikEYW6iEgaUaiLiKQRhbqISBpRqIuIpJExhbqZ3WBmu81sn5nd43dRIiJyfkYNdTMLAt8CbgQuA37HzC7zuzARETl3Y2mprwL2OecOOOf6gceBW/0tS0REzkdoDNvMAI4OeV4DvOvMjczsDuCOxNM+M9t24eX5airQlOoixkB1jp/JUCNMjjonQ40wueqcMx47Gkuo21nWuXescO5B4EEAM6t2zlVdYG2+mgw1guocT5OhRpgcdU6GGmHS1VkxHvsaS/dLDTBryPOZQN14fLiIiIyvsYT6m8B8M6s0swxgLfC0v2WJiMj5GLX7xTkXNbM/Bn4OBIGHnXPbR3nbg+NRn
M8mQ42gOsfTZKgRJkedk6FGuAjrNOfe0T0uIiKTlK4oFRFJIwp1EZE0Mq6hPtGmEzCzQ2b2tpltNrPqxLoiM3vBzPYmvhYm1puZfTNR+1YzW+lTTQ+bWcPQcfznU5OZfTKx/V4z+2SS6rzPzGoTx3Ozmd005LV7E3XuNrMPDVnv28+Emc0ysxfNbKeZbTezuxLrJ9TxHKHOiXY8s8zsDTPbkqjzq4n1lWb2euLY/CgxYAIzy0w835d4vWK0+n2s8ftmdnDIsVyeWJ+y36HEZwTN7C0z+2niuf/H0jk3Lg+8k6j7gblABrAFuGy89n+eNR0Cpp6x7q+BexLL9wB/lVi+CfgZ3rj81cDrPtV0DbAS2Ha+NQFFwIHE18LEcmES6rwP+F9n2fayxL93JlCZ+DkI+v0zAZQBKxPLecCeRC0T6niOUOdEO54G5CaWw8DrieP0BLA2sf67wGcSy3cC300sr1XbmUIAAAOdSURBVAV+NFL9Ptf4feC2s2yfst+hxOf8KfBvwE8Tz30/luPZUp8s0wncCvwgsfwD4CND1v/QeV4DCsysbLw/3Dm3AWi5wJo+BLzgnGtxzp0AXgBuSEKdw7kVeNw51+ecOwjsw/t58PVnwjl3zDm3KbHcAezEuwJ6Qh3PEeocTqqOp3POdSaehhMPB1wHrEusP/N4njzO64DrzcxGqN/PGoeTst8hM5sJrAG+l3huJOFYjmeon206gZF+cJPBAb8ws43mTWMAUOqcOwbeLxswLbE+lfWfa02prPWPE3/GPnyyW2OEepJWZ+LP1RV4LbcJezzPqBMm2PFMdBdsBhrwgm4/0Oqci57lMwfrSbzeBhT7XeeZNTrnTh7LryeO5T+YWeaZNZ5RSzL+zb8BfB6IJ54Xk4RjOZ6hPqbpBJLsKufcSrwZJj9rZteMsO1ErH+4mlJV63eAecBy4Bjwd4n1Ka3TzHKBHwN3O+faR9p0mHpSVeeEO57OuZhzbjneleOrgEUjfGZK6jyzRjNbAtwLLASuxOtS+UIqazSzm4EG59zGoatH+Mxxq3M8Q33CTSfgnKtLfG0AnsL7IT1+slsl8bUhsXkq6z/XmlJSq3PueOIXKg48xKk/A1NWp5mF8YLyUefcvydWT7jjebY6J+LxPMk51wqsx+uHLjCzkxcqDv3MwXoSr0/B67JLSp1Darwh0cXlnHN9wL+Q+mN5FXCLmR3C6ya7Dq/l7v+xHMcTAiG8kw2VnDqJs3i89n8e9eQAeUOWX8HrM/sbTj+J9teJ5TWcfkLlDR9rq+D0E5DnVBNeS+Qg3gmewsRyURLqLBuy/Dm8vj6AxZx+MucA3kk9X38mEsflh8A3zlg/oY7nCHVOtONZAhQklrOBl4GbgSc5/eTenYnlz3L6yb0nRqrf5xrLhhzrbwAPTITfocRnXcupE6W+H8vxLv4mvDP7+4Ev+XGAzqGWuYmDsQXYfrIevH6qXwJ7E1+LhvwwfCtR+9tAlU91PYb3p/YA3v/Cnz6fmoBP4Z002QfcnqQ6/zVRx1a8+X+GhtKXEnXuBm5Mxs8EcDXen6Jbgc2Jx00T7XiOUOdEO57LgLcS9WwDvjLkd+mNxLF5EshMrM9KPN+XeH3uaPX7WOOvEsdyG/AIp0bIpOx3aMjnXMupUPf9WGqaABGRNKIrSkVE0ohCXUQkjSjURUTSiEJdRCSNKNRFRNKIQl1EJI0o1EVE0sj/B4N8l4hazWFYAAAAAElFTkSuQmCC\n",
417 | "text/plain": [
418 | ""
419 | ]
420 | },
421 | "metadata": {},
422 | "output_type": "display_data"
423 | }
424 | ],
425 | "source": [
426 | "flattenAnneal(learn,4e-3, 20, .72) #imagenette"
427 | ]
428 | },
429 | {
430 | "cell_type": "code",
431 | "execution_count": 23,
432 | "metadata": {},
433 | "outputs": [
434 | {
435 | "data": {
436 | "text/html": [
437 |       "epoch | train_loss | valid_loss | accuracy | time\n",
438 |       "0 | 2.783504 | 2.846070 | 0.292000 | 00:38\n",
439 |       "1 | 2.514973 | 2.656390 | 0.374000 | 00:36\n",
440 |       "2 | 2.331984 | 2.586760 | 0.400000 | 00:36\n",
441 |       "3 | 2.211190 | 2.379511 | 0.478000 | 00:36\n",
442 |       "4 | 2.086747 | 2.122940 | 0.586000 | 00:35\n",
443 |       "5 | 1.977175 | 2.192198 | 0.580000 | 00:36\n",
444 |       "6 | 1.890428 | 1.988842 | 0.636000 | 00:36\n",
445 |       "7 | 1.804230 | 2.107334 | 0.610000 | 00:36\n",
446 |       "8 | 1.743078 | 1.851416 | 0.698000 | 00:36\n",
447 |       "9 | 1.703422 | 1.778018 | 0.720000 | 00:36\n",
448 |       "10 | 1.654018 | 1.885794 | 0.682000 | 00:36\n",
449 |       "11 | 1.621382 | 1.812310 | 0.724000 | 00:36\n",
450 |       "12 | 1.593395 | 1.778166 | 0.740000 | 00:36\n",
451 |       "13 | 1.558790 | 1.775904 | 0.728000 | 00:36\n",
452 |       "14 | 1.518692 | 1.741760 | 0.744000 | 00:36\n",
453 |       "15 | 1.486559 | 1.609652 | 0.766000 | 00:36\n",
454 |       "16 | 1.413943 | 1.684022 | 0.766000 | 00:37\n",
455 |       "17 | 1.354226 | 1.612617 | 0.794000 | 00:37\n",
456 |       "18 | 1.291616 | 1.538140 | 0.816000 | 00:36\n",
457 |       "19 | 1.250432 | 1.542479 | 0.808000 | 00:36"
591 | "text/plain": [
592 | ""
593 | ]
594 | },
595 | "metadata": {},
596 | "output_type": "display_data"
597 | },
598 | {
599 | "data": {
600 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWoAAAD6CAYAAACIyQ0UAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nO3dd3hc133m8e8BMOgdRCNBAuwUK0iCVCGjYtkyKckqEaPQ3bJjJnYcS8rus5I3u4n1xM46+yRu6yLTjuIiWbZMSZYtq1iyRTOqFCg2sFeAKETvdTBz9o87IAASZQDMAJfE+3meeTCcueU3l8CLg3PPPddYaxEREfeKmOoCRERkZApqERGXU1CLiLicglpExOUU1CIiLqegFhFxuVGD2hiz2Bizb8CjxRjzwGQUJyIiYMYyjtoYEwlUAFdba0uHWy4lLd0unD8vBOWJiEwPe/bsqbPWZg71XtQYt3UzcGqkkAbImjmb4uLiMW5aRGT6MsYMm6tj7aPeCjw5zE62GWOKjTHFLS0tY9ysiIgMJ+igNsZEA3cAvxrqfWvtdmttkbW2KCkpKVT1iYhMe2NpUW8G3rPWVo+2oGYPEREJnbH0UX+YYbo9LqGkFpEx8Hq9lJeX09XVNdWlhF1sbCx5eXl4PJ6g1wkqqI0x8cAHgL8OZnnltIiMRXl5OUlJSRQUFGCMmepywsZaS319PeXl5cydOzfo9YLq+rDWdlhrM6y1zUEtH/TuRUSgq6uLjIyMKzqkAYwxZGRkjPkvh/BcmaikFpExutJDus94PmdYgtoqqUVEQkZzfYjItNfU1MT3vve9Ma9366230tTUFIaKBgtPi1oNahG5jAwX1D6fb8T1XnjhBVJTU8NV1gVjvYRcROSK8/DDD3Pq1CkKCwvxeDwkJiaSm5vLvn37OHz4MHfddRfnzp2jq6uL+++/n23btgFQUFBAcXExbW1tbN68mY0bN/Lmm28ya9YsnnvuOeLi4kJSX1iCWg1qERmvR357iMOVoZ2GYunMZP7pQ8uGff9rX/saJSUl7Nu3j507d3LbbbdRUlJyYQjdY489Rnp6Op2dnaxbt4577rmHjIyMQds4ceIETz75JD/84Q+59957efrpp/nYxz4WkvrDE9Tq+xCRy9j69esHjXP+9re/zbPPPgvAuXPnOHHixCVBPXfuXAoLCwFYu3YtZ8+eDVk96voQEVcZqeU7WRISEi4837lzJ6+++ipvvfUW8fHx3HjjjUOOg46JibnwPDIyks7OzpDVE6bheSIil4+kpCRaW1uHfK+5uZm0tDTi4+M5evQob7/99iRXF64WtZJaRC4jGRkZbNiwgeXLlxMXF0d2dvaF9zZt2sSjjz7KypUrWbx4Mddcc82k1zemO7wEa9bC5bbiREnItysiV6YjR45w1VVXTXUZk2aoz2uM2WOtLRpqeV3wIiLicmG64EV9HyIioaIWtYiIy2nUh4iIy6lFLSLicpqUSUTE5dSiFhEZh8TERAAqKyvZsmXLkMvceOONFBcXT3hfunGAiMgEzJw5kx07doR1H7oyUUQEeOihh8jPz+fzn/88AF/+8pcxxrBr1y4aGxvxer185Stf4c477xy03tmzZ7n99tspKSmhs7OT++67j8OHD3PVVVeFbL4PTXMqIu7y4sNw/mBot5mzAjZ/bcRFtm7dygMPPHAhqJ966ileeuklHnzwQZKTk6mrq+Oaa67hjjvuGPa+h9///veJj4/nwIEDHDhwgDVr1oSkfM2eJyICrF69mpqaGiorK6mtrSUtLY3c3FwefPBBdu3aRUREBBUVFVRXV5OTkzPkNnbt2sUXv/hFAFauXMnKlStDUltQQW2MSQV+BCzHaTB/2lr71nDLa9SHiIzbKC3fcNqyZQs7duzg/PnzbN26lSeeeILa2lr27NmDx+OhoKBgyClOBwrH3dSDPZn4LeAla+0SYBVwJOSViIhMsa1bt/KLX/yCHTt2sGXLFpqbm8nKysLj8fDaa69RWlo64vrXX389TzzxBAAlJSUcOHAgJHWN2qI2xiQD1wOfArDW9gA9I62jUR8icjlatmwZra2tzJo1i9zcXD760Y/yoQ99iKKiIgoLC1myZMmI63/uc5/jvvvuY+XKlRQWFrJ+/fqQ1DXqNKfGmEJgO3AYpzW9B7jfWtt+0XLbgG0ASbnz1rZUngpJgSJy5dM0pxOf5jQKWAN831q7GmgHHr54IWvtdmttkbW2yBMdPfbKRURkSMEEdTlQbq19J/DvHTjBLSIik2DUoLbWngfOGWMWB166GacbZIR1QlCZiEwr02Ue+/F8zmDHUf8d8IQxJho4Ddw3SiljLkREpq/Y2Fjq6+vJyMgIy/A2t7DWUl9fT2xs7JjWCyqorbX7gCE7uYdcfkwliMh0l5eXR3l5ObW1tVNdStjFxsaSl5c3pnXCcwm5klpExsDj8TB37typLsO1NM2piIjL6VZcIiIuF54WtZJaRCRkdOMAERGXUx+1iIjLqY9aRMTl1EctIuJyalGLiLic+qhFRFwuPC1qXZooIhIyalGLiLic+qhFRFxOoz5ERFxOLWoREZfTyUQREZcLW4va71dYi4iEQthGfXj9/nBtWkRkWglfUPvUohYRCYWwBXVPr1rUIiKhEMYWtYJaRCQU1KIWEXG5oO5Cbow5C7QCPqDXWls02jpqUYuIhEZQQR1wk7W2LtiFexTUIiIhEb4+6l6N+hARCYVgg9oCvzfG7DHGbBtqAWPMNmNMsTGmGNSiFhEJlWC7PjZYayuNMVnAK8aYo9baXQMXsNZuB7YDxOQutOqjFhEJjaBa1NbaysDXGuBZYP1o6yioRURCY9SgNsYkGGOS+p4DtwAlo62noBYRCY1guj6ygWeNMX3L/9xa+9JoK/XoZKKISEiMGtTW2tPAqrFuWC1qEZHQ0CXkIiIup0vIRURcTi1qERGXC1+LWvNRi4iEhFrUIiIuF8a5PhTUIiKhoBa1iIjLhSWojYFutahFREIiLEEdYQwdPb5wbFpEZNoJW1C39/SGY9MiItNOmIIaOtWiFhEJiTC2qBXUIiKhEJagjowwdHSr60NEJBTUohYRcbmw9VF36GSiiEhIhCeoIzQ8T0QkVMI3jlp91CIiIRG+rg+vD79fM+iJiExU2FrU1kJXr7o/REQmKmx91ADt3QpqEZGJClvXB+jqRBGRUAhb1weg+T5EREIg6KA2xkQaY/YaY54fdaOBoG7TyA8RkQkbS4v6fuBIMAtGBfo+mju84yhJREQGCiqojTF5wG3Aj4JZPjIQ1I0dPeMuTEREHMG2qL8J/A9g2Nu2GGO2GWOKjTHFTY0NADR3qkUtIjJRowa1MeZ2oMZau2ek5ay12621RdbaoqzMGURGGJrU9SEiMmHBtKg3AHcYY84CvwDeZ4x5fLSVUuI86voQEQmBUYPaWvsla22etbYA2Ar80Vr7sdHWS4330KSuDxGRCQvLOGqA1DiPRn2IiIRA1FgWttbuBHYGs2xafDRVzV3jKEl
ERAYKW4s6KzmW6hYFtYjIRIUtqPPS4qhv79GdXkREJihsQT0nPR6Acw2d4dqFiMi0EPagLmvoCNcuRESmBQW1iIjLhW94XryHpJgozimoRUQmJGxBbYxhdnq8WtQiIhMUtqAGp/ujtL49nLsQEbnihTeoM+I519ipu5GLiExAWIN6fmYCPb1+dX+IiExAWIN62cwUAEoqm8O5GxGRK1pYg3pRdhKeSENJRUs4dyMickULa1BHR0WwKDuJQ2pRi4iMW1iDGmDFrBQOVjRjrU4oioiMR9iDemVeKk0dXp1QFBEZp7AH9bqCNAD+dLw23LsSEbkihT2oF2YnsTArkef3V4V7VyIiV6SwBzXA7StnsvtsA5VNmvJURGSsJiWo71o9E4Dn9lVOxu5ERK4okxLU+RkJrCtI48ndZfh0ObmIyJhMSlADfGbjXMoaOnjhoPqqRUTGYtSgNsbEGmN2G2P2G2MOGWMeGc+Oblmaw/zMBL6385TGVIuIjEEwLepu4H3W2lVAIbDJGHPNmHcUYfj8jQs4UtXCCwfPj3V1EZFpa9Sgto62wD89gce4msR3rZ7F4uwk/u33x+jp9Y9nEyIi005QfdTGmEhjzD6gBnjFWvvOEMtsM8YUG2OKa2uHvrglMsLw8OYlnKlrZ9vPitUFIiIShKCC2lrrs9YWAnnAemPM8iGW2W6tLbLWFmVmZg67rZuWZLFpWQ47j9XyVz8p1thqEZFRjGnUh7W2CdgJbJrITr/zkdU88P6F/OFoDZ/+8bt0eX0T2ZyIyBUtmFEfmcaY1MDzOOD9wNGJ7DQqMoIH3r+IH36iiKPnW9m6/W3drVxEZBjBtKhzgdeMMQeAd3H6qJ8Pxc4/sDSbr9y1nBPVrXzoO69TUtEMlfvgtX+B2uOh2IWIyGXPhOOEXlFRkS0uLg56+ePVrXzqsd30+Pw8v66EnLceASzkrIAVfwHL74GUvJDXKSLiFsaYPdbaoqHem7QrE0eyKDuJn/3V1fj8lmt3LuJ7a3+L75Z/gchoeOUf4RvL4LFN8O6PoL1uqssVEZlUrmhR9znX0MH/efEILxw8z4YFGfzHJ9cR23IWSp6Bkh1QexRMJMy/CZZvgSW3QWxyyOsXEZlsI7WoXRXUfX7y5ln+6TeHWJydxFfvXk5RQTpYC9WHnMA++DQ0l0FULCz6oBPaC28BT2wIP4WIyOS57IIa4PkDlTy04wDtPT4+dV0Bn94wlzkZ8c6b1sK53U5oH3oW2mshJhmu+pDTnz33BoiMCsEnERGZHJdlUAOU1rfz1d8d4ZUj1fSVmZkUw6euK+Dj1+aTHOsBXy+c+ROUPA1HfgvdLZCQCfNugrQCSMuH1DmQmg/JsxTgIuJKl21Q9zlT5wT2yZpWunv9VDV3AWAMfOGmBfz1DfNJjIkCbxecfAUO7oCKPdBSAXbAnCIm0hk9kjonEOAFga+BME/MhghXnF8VkWnmsg/qi+0ta+Sp4nJeOFhFc6eXzKQY/u0vVnHDoosuXfd5obkcmkqhsdT52lTW/7ytevDykTEDQjwf5l4PizdDVEzYPouICFyBQT3Qa8dq+NLTBznf0kXh7FR+fN86UuOjg1vZ2wlN5wJBfnZAoJdBwxnoboa4dFh5L6z+mDOuW0QkDK7ooAZo6fLyz789zK/2lAPw4fVzuHFxJh9cljP+jfp9cPo12Ps4HP0d+HogZyWs/jis2ALx6SGqXkRkGgR1n13Ha/nSMwepCMzIt3xWMtcvzOSvb5hPSpxn/BvuaHD6vfc9DlX7nQtxltwGhR9zxnRHRIboE4jIdDVtgrrP3rJG/tevS2jq8F4I7b+5YT4PbVqMMWZiGz9/EPY+AQd+CZ0NzkiSVVuh8KOQMT8E1YvIdDTtgnqgPxyp5sndZbx6pAaA+OhIblqSxRfft5DFOUnj33BvNxx/yekaOfmqM7okf4MT2EvvhJjEEH0CEZkOpnVQA1hr+fnuMp569xz7y5svvJ6REM3dq2fxdzcvnFjXSEsV7H8S9j0B9SchOhGW3eX0Z8++2hlHKCIygmkf1AN1eX3843MlNLR7efVI//C81XNS+ejV+dyxaiZREYaIiHGEq7Vw7h2nlX3oWehpg5lrYOODsOR2jdEWkWEpqEfw3L4K/vXFo/ispbql+8Lr8zIT+MDSbN63OIvs5Fhmp8cTOZbw7mmHA0/Bm9+GhtOQsRA2PgAr7oWoIIcPisi0oaAOgrWWPx6t4Zm9FbR0eqlv6+FwVcuF9/Mz4rl79SyunpvBirwU50rIYPh9cPjX8Po3nBORyXlw3RdgzScgOiFMn0ZELjcK6nH6w5FqHnr6IHVt3YNej4wwxHsimZ+VyCN3LMMCv9lXSX17Nyeq29i8PIe/XD+brKQBs/lZCyf/AK9/HUrfcC6kufpvYP1nNSZbRBTUodDS5WXnsVpioiL47f5Knj9QNeLyi7OT+Pd7V7F8Vsqlb5a947Swj7/onHhc+ym49m8heWZ4ihcR11NQh0lVcyef/Wkx182fwV/92VxS4jxER0bw8qFq/v6pfXT0+FicncQjdy5jVV4qcdEXXRhTfQhe/6Yz819EpDMee8MDEx+PbS201TiXwjeVQnyGM2+JLswRcS0F9RQ4WdPGx//jnQsz/QGsn5vOQ5uW0Nbdy8KsRGamxjlvNJ6FN/8fvPcz51L1pXfCn/095K4aeuPWQmdjYH6Ssksnm2oqg96uweskzYRVfwmrPgKZi8LymUVk/BTUU6ipo4dn3qtg14ladh6rHfTezUuy+Oz187hmXgYAXY1VRL37KFF7HnPm1Z5/s9PKbq/rD+S+SaN6WgfvKDZ18Mx/qfnO85TZUHfcGed94hWwPshbB6s+DMv/HOLSJutQiMgIFNQusae0kV3Haylr6CDCGP54tJrGDu8ly2VEdXGv/T3bol8izTY5L3oS+kN44M0Q+sI5doi+8Iu1VsPBp2Dfz6HmsDOt65LboPAjzo0WQnlTBb8f6k9AeTGUvwuNZ2DpXc6+NG2syCUmFNTGmNnAT4EcwA9st9Z+a6R1FNTB6fL6+N+/Lrkw698Hl2VTkJFAS1cvT+4uI4Ye5ptKvAm5dHlSuHt1HnevyWPujASstVjL+C/MqdrvBPbBp5xulMSc/q6RrCVj32ZHQ38oVxRD+R5nmliAmBRIzHSu2kzMgWs/D2vv042JRQaYaFDnArnW2veMMUnAHuAua+3h4dZRUIfG4coWvv+nU+w8WkN7Ty/+wH9VUmwUrV29ACzISiQ1zkNXr4/3LcnmCzctIDpqDFdA9nbD8ZedrpHjLztdIzPXOC3f5fcMPXTQ54XqkkAwB8K54ZTznomArGWQVxR4rHMu9jHGuWXa69+A0zudvwDWfdYZopiYeek+RKaZkHZ9GGOeA75jrX1luGUU1KFnreXlQ+d5+r0KDpY3097TS2ePjxV5KRytaqXT67uwbEqchz9fM4uPXp3PgqwxTA7VVgMHf+W0tKtLnOlcF2+GlVudk5zl7zq3OKvc23+yMjHbCeO+UM4tHH1Cqoo9zmiXI7917iS/5uNw7RecLhyRaS
pkQW2MKQB2AcuttS0XvbcN2AYwZ86ctaWlpeOtV8aoob0HT6ThP984y9dfOU6E4ULre2ZKLDOSYiit7+C7H1nDdfMzgusuqTrQ3zXSUe+8FhnjjEQZGMwpeeOfdKruBLzxTdj/S2f2wRVbnOGJ2UvHtz2Ry1hIgtoYkwj8CfiqtfaZkZZVi3rqHaps5gd/Os1v9lcOej0mKoJ5mYl09/pYlZfKZ/9sHktnjtBX3NsDZ/8L4lIhe0V45ilproC3vgt7fgzedli02RmeOHt96Pcl4lITDmpjjAd4HnjZWvv10ZZXULuL328pLm1kb1kjLx06z96ypkHvXzMvnXMNnVQ0dbJ6Tip3Fc5i/dx0zjV0cK6xk+vmZ/CtV0+QlRzDnYUzWTYzhVhPGC6e6WiA3T+Edx51bsqQvwE2/j0suFlTxcoVb6InEw3wE6DBWvtAMDtUULub1+cnKsLQ0tnLd3eeZPuu0wBER0UQacyg/u7hXDsvg03Lc7hnbV7wE1QFq6cd3vupcxFQS4XTkt/4gDO8L5RDCEfj90H9KajaB5X7nK/VJRCTDGkFziN9buB54KvmbZFxmmhQbwT+CziIMzwP4H9aa18Ybh0F9eWlvbuXmtbuC8P+3jnTwNdePMqGBRnMSY/Hb2FOejzpCdG8fbqeJ3eXca6hc1Cgz0qNY+OCGaTEe/D7LWkJ0dy2IpeCGROYIbC3xzm5+cY3nYt2EjJhxqLBwdgXlPEZE2t1+33OPir3OUMXq/Y5/fTeduf9qFjIXu7cid7b4dylvvEstNcM3k5syqW1pc11nifP0mX8Mixd8CIh19rl5d9ePsbhqhasheLSxmGXXZufRmZiDPdtKGBeZiIvlVTR5fUzJyOeGxZlEuuJxOvz44kcZlih3w/HfgdHX3AunGk4A23nBy8TnRQIx4LB4ZhW4FydGTngDj6+Xqg71t9KrtrvTEHr7XDe98Q7gZxb6Jw8nVkIMxYP3ZrvbnMCu/GsU1vj2UCIn3GuIPX39i8b4QlcqDTHaXnHpgzzSBvwPFkXCE0TCmqZFDWtXXh9lpzkWI5UtfDY62d4Zm/FmLYxb0YC6wrSuW9jAUtyRjjJ2dMRuKT+bH/rti8oG0vBN2BqWhPpjE5Jn+sEa3VJ//BCTwLkrnRCeWYgmGcsCk3L1++D5vL+2vrqbCqDrubAo2lwmA8lKu7SMI+OB4zzV4SJ6H8+4lcGrzPUdgc+4lKdX4C6M9GkUFDLlNt9poFv/eE4i7KTuH5hJmfq2qls6uSlQ+cpb+xkfmYCpfUd9Pov/X68bUUunV4fi7KT+MS1+f2TWQ3g91uMgcffKSM5JoIs00i2t4p5UbWDgzwqdkAoFzozFU5ld4S1Tkv+QnBf/Gga+vWeDsA661t///NBXxnm9cA6vV3OnDIjMk6r/kKApw5+nrMC8q9z/krQCd8JUVDLZcNaS3ljJ4+/U8oP/nR6yGWSYqJYNzedtPhofrPfabF7fZb46Eg6egafCF2Vl8LmFbn87kAVGxfOYMWsFG5anEVMVAR+a7Fwocul1+cnarjulyuV3+eEdd8vgM5hfjEM9eioh95OZzvJeU5g518HBRshY4GCe4wU1HLZqmzqpK6tm7T4aJo7vfx8dxm7jtdS3th5YZmUOA/NnV5uW5lLYnQU96zNo7Kpk1/tOccbJ+tH3cdNizMpLm2krbuXDfNnsGp2CoWz01ibn0ZavAdrA70FCp7B/H5ncq/SN527FpW+2X9yNSEzENwbnK9ZyybeheLzBqbzHdCN1HDG+YskdU7/SJy0fOccRVzaZfXLQkEtV5wur483TtZxw6LMEVvBzZ1eSuvbyU2Jw1rLq0dqON/Sxdun69l9poEIA/kZCeQkx5KZFMN7ZY0XfgkMvMITICsphoSYKM7UtXPbilxW5KVQkJHAiyVVLJ+Zgs9aDlY0MyMhmrSEaIrPNmIMnK5tJyclloc3L6EoP+3KDXxrneGMpa87oX32DWhxJhwjNgXmXNcf3rkrB5/g7dPdOviE7MCvzeXOXDR9omKdYPbEOQHecdEv5b4TzGn5A0K8oH/WSU8sbqKgFhmD1i4vhypbeONkHfvLm/H2+slMimHvuUYyE2Pw+izHqlvp6fWPvrEhrJqdyqc3FHD7ypljurO9tRZjDNZaurz+S+8Y5EZNZYNb3PUnndc9Cc6VpznLnel3+04Etw+es524tAEjeC76mpgzuJXe3eqcSG4823+iue/kclPpEDfTyB0c4AO3nzBj0lvjCmqRELPWcrqunYb2HhKio5iRGI3PWrKSYmnq6MEYgyfSEOeJJMIYmju9/HpfBU8Vl9PW7eVcQyd5aXEsyUlifmYiuSmxvHmqnvjoSNITYmjscOZv+Yui2VQ2dfLCwSpeO1pLZlIM3b1+6tq6mTcjgfvfv5A7C2eNuXa/ZUy/JEKmtRrKAq3t0jeh9qgTmH1DKS8O5LjU0OzX73e6ZfqCuy/E+wK9pZLA2VdHdGKgjoJLf0Ek54XlwisFtYiL+P2Wlw6d55n3Kihv7OB0bTs9Pj8zEmMuueP9QNFREcRERpA/I56Siv7RGnlpceQkx7IgK5HEmChq27qpaemmsaOHxJgoVs9J5WRNG+3dPnafbbiwXlJMFDcuyeKOVTO5el46Pb1ODZOq7wTAVPN2OaE9VJdLU6kze2SfiKhAn3gguNPnOd0pUTHOaJphH3bE9826zyioRdyqvbuX8sZOFmYl4reWrl4/fms5cK6ZY9WtrCtIY3FOEjFRg7s6yuo7+NHrp6lv66GsoYPj1a10B9Eds2ZOKtFREbx9uuGS95bkJHH0fCtX5SZzw6JMIgx09zpTDvj8lprWbkoqmjld1052cgybl+eSnRzL7StzmZ0eH7Jj4ip+n9PivjjAG89Aw9n+G2RMkHmkRUEtcqXr7vVRWt/BzNQ4EmOi8Pr8+PyWyAhDr886c7lc1N1xvLqVVw5XE2EMe0obOF7dRqfXR23r8C372elxdHv9REdFUNvaTXevn8gIQ1SE4QNLs1mcnURLl5e5MxJJi/fwxqk6cpJjOdfQSX17D3vLGimYkUBeWhw+v+VvbpjP8llB3ErOjfpuNN1U6gR638VEIz6GXsYk5yqoRSR4fbd6a+nyEuuJJDLCcOx8KwUzEgZNwmWt5URNG4+/XcrTe8pp7xl5Qq+UOM+F9Zs7vbR1O1dlFuWnsSArkXvW5tHrs6zMSyEy0IqPj468cBL1ih0xg/qoRWSSWGupbumm1++np9dPQ3sPb56qZ11BOrPT48hLix+0bF1bDz/6r9M8s7dixFZ8ekI0bV29ZCbFEB0VQUqch/SEaLw+PxsWzODj1+STEOpZHCeZglpEXM3vd4Y8vn6iDotzdWq318+Z+nZaOr1kJccSHRlBW7cXv4XmDqc1HhVpKK3vYElOEjcsymRORjzrCtJJjnUugsrPiA/P3OlhMFJQX96/gkTkihARYbgqN5mrcsd2Z3prLb87WMW///44P9g19JQDs1LjuG5+hnNSNNKQmRhDWUMH75U1kpUUS25KLGvz07h79Syykt11EUwft
ahF5IrQ0uWlvKGT/eVN9Pot1c1deH1+9pc3caiihbSEaNq7e2nt7qWn1090ZARzMuLp7PFR0dRJhIGFWUkszkni1hW53LQk85KRNuGkFrWIXPGSYz0snekZ+R6gOKNjTlS3sSAr8UK3yOnaNp7dW8HBimZePnT+wr1GZ6fHsWJWCokxUZyqbWdxThIAje09dHl9JMRE4beWhOgo0hOjSY+PpqggnaW5ySG9clRBLSLTSkxU5CXDAedlJvLfblkMQE+vnz8erWF/eRMlFc2UVLTQ0eOjrq2bo1Utg0a2zE6Po9dn8frsoIuVkmOjKCpIZ2aq061yVW4yuclxpMR76Ozxcaq2jeoWp8WfnRxLXVsPI1FQi4gMEB0VwablOWxanjPo9b45z+vaevD6/JfMi97l9VHe2Ml7ZY28criakzVt7D7TwONvl024JvVRi4iEic9vOV7dyt6yJqpbuqhs6iQ5zjz/RoEAAAbGSURBVMPa/DRyU2Jp6eqly+sjLT6aq+dlqI9aRGSyRY5zNMvFptntLERELj8KahERlxs1qI0xjxljaowxJZNRkIiIDBZMi/rHwKYw1yEiIsMYNaittbuASyeuFRGRSRGyPmpjzDZjTLExpri2tnb0FUREJCghC2pr7XZrbZG1tigzMzNUmxURmfY06kNExOUU1CIiLhfM8LwngbeAxcaYcmPMZ8JfloiI9Bn1EnJr7YcnoxARERmauj5ERFxOQS0i4nIKahERl1NQi4i4nIJaRMTlFNQiIi6noBYRcTkFtYiIyymoRURcTkEtIuJyCmoREZdTUIuIuJyCWkTE5RTUIiIup6AWEXE5BbWIiMspqEVEXE5BLSLicgpqERGXU1CLiLicglpExOUU1CIiLhdUUBtjNhljjhljThpjHg53USIi0m/UoDbGRALfBTYDS4EPG2OWhrswERFxBNOiXg+ctNaettb2AL8A7gxvWSIi0icqiGVmAecG/LscuPrihYwx24BtgX92G2NKJl7epJgB1E11EUFSreGhWsPjcqoVpr7e/OHeCCaozRCv2UtesHY7sB3AGFNsrS0KurwppFrDQ7WGh2oNHzfXG0zXRzkwe8C/84DK8JQjIiIXCyao3wUWGmPmGmOiga3Ab8JbloiI9Bm168Na22uM+QLwMhAJPGatPTTKattDUdwkUa3hoVrDQ7WGj2vrNdZe0t0sIiIuoisTRURcTkEtIuJyIQ1qN15qbow5a4w5aIzZZ4wpDryWbox5xRhzIvA1LfC6McZ8O1D/AWPMmjDX9pgxpmbgmPPx1GaM+WRg+RPGmE9Ocr1fNsZUBI7vPmPMrQPe+1Kg3mPGmA8OeD2s3yfGmNnGmNeMMUeMMYeMMfcHXnfdsR2hVtcd18A+Yo0xu40x+wP1PhJ4fa4x5p3AcfplYOABxpiYwL9PBt4vGO1zTEKtPzbGnBlwbAsDr0/5z9iwrLUheeCcaDwFzAOigf3A0lBtfwJ1nQVmXPTa/wUeDjx/GPjXwPNbgRdxxo5fA7wT5tquB9YAJeOtDUgHTge+pgWep01ivV8G/vsQyy4NfA/EAHMD3xuRk/F9AuQCawLPk4DjgXpcd2xHqNV1xzWwfwMkBp57gHcCx+wpYGvg9UeBzwWefx54NPB8K/DLkT7HJNX6Y2DLEMtP+c/YcI9Qtqgvp0vN7wR+Enj+E+CuAa//1DreBlKNMbnhKsJauwtomGBtHwResdY2WGsbgVeATZNY73DuBH5hre221p4BTuJ8j4T9+8RaW2WtfS/wvBU4gnOFreuO7Qi1DmfKjmugRmutbQv80xN4WOB9wI7A6xcf275jvgO42RhjRvgck1HrcKb8Z2w4oQzqoS41H+kbbrJY4PfGmD3GucwdINtaWwXODwqQFXjdDZ9hrLW5oeYvBP5UfKyvO2GEuia13sCf2qtxWlOuPrYX1QouPa7GmEhjzD6gBie0TgFN1treIfZ9oa7A+81AxmTVe3Gt1tq+Y/vVwLH9hjEm5uJaL6ppyn/GQhnUQV1qPgU2WGvX4Mz+97fGmOtHWNatnwGGr22qa/4+MB8oBKqAfw+8PuX1GmMSgaeBB6y1LSMtOkxNU1mra4+rtdZnrS3EuUp5PXDVCPue0novrtUYsxz4ErAEWIfTnfGQG2odSSiD2pWXmltrKwNfa4Bncb6xqvu6NAJfawKLu+EzjLW2Ka3ZWlsd+GHwAz+k/8/XKa3XGOPBCb4nrLXPBF525bEdqla3HteBrLVNwE6c/txUY0zfBXQD932hrsD7KTjdZ5Na74BaNwW6m6y1thv4T1x4bC8WyqB23aXmxpgEY0xS33PgFqAkUFffmdtPAs8Fnv8G+ETg7O81QHPfn8qTaKy1vQzcYoxJC/x5fEvgtUlxUR/+3TjHt6/erYGz/nOBhcBuJuH7JNAH+h/AEWvt1we85bpjO1ytbjyugboyjTGpgedxwPtx+tVfA7YEFrv42PYd8y3AH61zhm64zxHuWo8O+GVtcPrSBx5b1/2MAaEb9WH7z5oex+mz+odQbnuc9czDObO8HzjUVxNOH9kfgBOBr+m2/yzxdwP1HwSKwlzfkzh/1npxfmt/Zjy1AZ/GORlzErhvkuv9WaCeAzjf6LkDlv+HQL3HgM2T9X0CbMT50/QAsC/wuNWNx3aEWl13XAP7WAnsDdRVAvzjgJ+13YHj9CsgJvB6bODfJwPvzxvtc0xCrX8MHNsS4HH6R4ZM+c/YcA9dQi4i4nK6MlFExOUU1CIiLqegFhFxOQW1iIjLKahFRFxOQS0i4nIKahERl/v/1GeI0wnbi/8AAAAASUVORK5CYII=\n",
601 | "text/plain": [
602 | ""
603 | ]
604 | },
605 | "metadata": {},
606 | "output_type": "display_data"
607 | }
608 | ],
609 | "source": [
610 | "flattenAnneal(learn,4e-3, 20, .72)"
611 | ]
612 | },
613 | {
614 | "cell_type": "code",
615 | "execution_count": null,
616 | "metadata": {},
617 | "outputs": [],
618 | "source": []
619 | }
620 | ],
621 | "metadata": {
622 | "kernelspec": {
623 | "display_name": "Python 3",
624 | "language": "python",
625 | "name": "python3"
626 | },
627 | "language_info": {
628 | "codemirror_mode": {
629 | "name": "ipython",
630 | "version": 3
631 | },
632 | "file_extension": ".py",
633 | "mimetype": "text/x-python",
634 | "name": "python",
635 | "nbconvert_exporter": "python",
636 | "pygments_lexer": "ipython3",
637 | "version": "3.7.3"
638 | }
639 | },
640 | "nbformat": 4,
641 | "nbformat_minor": 2
642 | }
643 |
--------------------------------------------------------------------------------
/DeepMemory/README.md:
--------------------------------------------------------------------------------
1 | DeepMemory is a new optimizer I came up with by blending DiffGrad and AdaMod. The core concept is to give the optimizer a
2 | long-term memory of the previous step sizes.
3 |
4 | Results in initial testing put it on par with Ranger; both Ranger and DeepMemory topped my recent comparison of about 8 different optimizers.
5 |
6 |
7 | DeepMemory is designed to offset a weakness of many adaptive optimizers by building a 'long-term' memory of the gradients over the course of an epoch.
8 | This long-term memory is averaged with the adaptive step size computed from the current mini-batch in order to guide the step size more optimally.
9 |
10 | DeepMemory also keeps the short-term gradient buffer developed in DiffGrad, and locks down the step size when minimal gradient change is detected.
11 |
12 | 1/1/2020 - @lessw2020 developed the long-term memory concept as a blended average (vs. the max throttle in AdaMod), and created and tested DeepMemory.
13 | Credits:
14 | DiffGrad: uses the local gradient friction clamp developed by DiffGrad (version 1 coded by @lessw2020), from the paper:
15 | https://github.com/shivram1987/diffGrad (S.R. Dubey et al.)
16 |
17 | AdaMod: DeepMemory builds on AdaMod's longer-term monitoring (the b3 concept, but changed from min throttling to a blended average, with the input changed to len_memory):
18 |
19 | AdaMod source and paper link - https://github.com/lancopku/AdaMod/blob/master/adamod/adamod.py
20 |
21 | Modifications (@lessw2020):
22 | 1/1/20 - instead of specifying b3 directly, specify 'len_memory' and b3 is computed from it (b3 = .99 corresponds to a memory of 100 steps, since 1 - (1/100) = .99)
23 |
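24 | A minimal usage sketch with plain PyTorch (the toy model, data, and training loop are placeholders invented for illustration; the lr/betas values simply mirror the defaults in deepmemory.py, and the snippet assumes the PyTorch 1.x API this repo targets):
25 |
26 | ```python
27 | import torch
28 | from torch import nn
29 | from torch.utils.data import DataLoader, TensorDataset
30 | from deepmemory import DeepMemory
31 |
32 | # Toy regression data and model, purely for illustration.
33 | data = TensorDataset(torch.randn(640, 10), torch.randn(640, 1))
34 | loader = DataLoader(data, batch_size=64)
35 | model = nn.Linear(10, 1)
36 | loss_func = nn.MSELoss()
37 |
38 | # len_memory should be close to the number of batches per epoch, so the
39 | # long-term step-size memory spans roughly one pass over the dataset.
40 | opt = DeepMemory(model.parameters(), lr=4e-3, betas=(0.9, 0.999), len_memory=len(loader))
41 |
42 | for xb, yb in loader:
43 |     opt.zero_grad()
44 |     loss = loss_func(model(xb), yb)
45 |     loss.backward()
46 |     opt.step()
47 | ```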
--------------------------------------------------------------------------------
/DeepMemory/deepmemory.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | from torch.optim import Optimizer
4 |
5 | # DeepMemory is designed to offset a weakness of many adaptive optimizers by building a 'long-term' memory of the gradients over the course of an epoch.
6 | # This long-term memory is averaged with the adaptive step size computed from the current mini-batch in order to guide the step size more optimally.
7 |
8 | # DeepMemory also keeps the short-term gradient buffer developed in DiffGrad, and locks down the step size when minimal gradient change is detected.
9 |
10 | # 1/1/2020 - @lessw2020 developed the long-term memory concept as a blended average (vs. the max throttle in AdaMod), and created and tested DeepMemory.
11 | # Credits:
12 | # DiffGrad: uses the local gradient friction clamp developed by DiffGrad (version 1 coded by @lessw2020), from the paper:
13 | # https://github.com/shivram1987/diffGrad (S.R. Dubey et al.)
14 |
15 | # AdaMod: DeepMemory builds on AdaMod's longer-term monitoring (the b3 concept, but changed from min throttling to a blended average, with the input changed to len_memory):
16 |
17 | # AdaMod source and paper link - https://github.com/lancopku/AdaMod/blob/master/adamod/adamod.py
18 |
19 | # Modifications (@lessw2020):
20 | # 1/1/20 - instead of specifying b3 directly, specify 'len_memory' and b3 is computed from it (b3 = .99 corresponds to a memory of 100 steps, since 1 - (1/100) = .99)
21 |
22 |
23 | class DeepMemory(Optimizer):
24 | """Implements DeepMemory algorithm (built upon DiffGrad and AdaMod concepts) with Decoupled Weight Decay (arxiv.org/abs/1711.05101)
25 |
26 | Arguments:
27 | params (iterable): iterable of parameters to optimize or dicts defining
28 | parameter groups
29 | lr (float, optional): learning rate (default: 1e-3)
30 | betas (Tuple[float, float], optional): coefficients used for computing
31 | running averages of gradient and its square (default: (0.9, 0.999))
32 |         len_memory (int, optional): replaces AdaMod's b3 smoothing coefficient; the long-term memory of step sizes (smoothed with the computed b3) is averaged with the immediate step size.
33 |             specify the memory length and b3 is computed from it (default: 200).
34 |         version (int, optional): 0 clamps the DiffGrad friction coefficient to the 0.5-1 range (absolute gradient change); 1 allows the full 0-1 range (signed change) (default: 1).
35 | eps (float, optional): term added to the denominator to improve
36 | numerical stability (default: 1e-8)
37 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
38 | """
39 |
40 | def __init__(self, params, lr=4e-3, betas=(0.9, 0.999), len_memory=200, version=1,
41 | eps=1e-6, weight_decay=0, debug_print=False):
42 | if not 0.0 <= lr:
43 | raise ValueError("Invalid learning rate: {}".format(lr))
44 | if not 0.0 <= eps:
45 | raise ValueError("Invalid epsilon value: {}".format(eps))
46 | if not 0.0 <= betas[0] < 1.0:
47 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
48 | if not 0.0 <= betas[1] < 1.0:
49 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
50 |
51 | #compute b3
52 | base = 1/len_memory
53 | beta3 = 1-(base)
54 | print(f"DeepMemory: length of memory is {len_memory} - this should be close or equal to batches per epoch")
55 |
56 | #debugging
57 | self.debug_print=debug_print
58 |
59 |
60 | if not 0.0 <= beta3 < 1.0:
61 | raise ValueError("Invalid len_memory parameter: {}".format(beta3))
62 |
63 | defaults = dict(lr=lr, betas=betas, beta3=beta3, eps=eps,
64 | weight_decay=weight_decay)
65 | super().__init__(params, defaults)
66 |
67 | self.version = version
68 |
69 | def __setstate__(self, state):
70 | super().__setstate__(state)
71 |
72 | def step(self, closure=None):
73 | """Performs a single optimization step.
74 | Arguments:
75 | closure (callable, optional): A closure that reevaluates the model
76 | and returns the loss.
77 | """
78 | loss = None
79 | if closure is not None:
80 | loss = closure()
81 |
82 | for group in self.param_groups:
83 | for p in group['params']:
84 | if p.grad is None:
85 | continue
86 | grad = p.grad.data
87 | if grad.is_sparse:
88 | raise RuntimeError(
89 |                         'DeepMemory does not support sparse gradients')
90 |
91 | state = self.state[p]
92 |
93 | # State initialization
94 | if len(state) == 0:
95 | state['step'] = 0
96 | # Exponential moving average of gradient values
97 | state['exp_avg'] = torch.zeros_like(p.data)
98 | # Exponential moving average of squared gradient values
99 | state['exp_avg_sq'] = torch.zeros_like(p.data)
100 | # Exponential moving average of actual learning rates
101 | state['exp_avg_lr'] = torch.zeros_like(p.data)
102 | # Previous gradient
103 | state['previous_grad'] = torch.zeros_like(p.data)
104 |
105 |
106 | exp_avg, exp_avg_sq, exp_avg_lr = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_lr']
107 | previous_grad = state['previous_grad']
108 | beta1, beta2 = group['betas']
109 |
110 | state['step'] += 1
111 |
112 | # Decay the first and second moment running average coefficient
113 | exp_avg.mul_(beta1).add_(1 - beta1, grad)
114 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
115 |
116 | denom = exp_avg_sq.sqrt().add_(group['eps'])
117 |
118 | bias_correction1 = 1 - beta1 ** state['step']
119 | bias_correction2 = 1 - beta2 ** state['step']
120 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
121 |
122 | # compute diffgrad coefficient (dfc)
123 | if self.version==0:
124 | diff = abs(previous_grad - grad)
125 |
126 | elif self.version ==1:
127 | diff = previous_grad-grad
128 |
129 |
130 | if self.version==0 or self.version==1:
131 | dfc = 1. / (1. + torch.exp(-diff))
132 |
133 |
134 |                 state['previous_grad'] = grad.clone()  # clone so we keep a snapshot of the gradient, not a reference to p.grad's reused buffer
135 |
136 | if group['weight_decay'] != 0:
137 | p.data.add_(-group['weight_decay'] * group['lr'], p.data)
138 |
139 | # create long term memory of actual learning rates (from AdaMod)
140 | step_size = torch.full_like(denom, step_size)
141 | step_size.div_(denom)
142 | exp_avg_lr.mul_(group['beta3']).add_(1 - group['beta3'], step_size)
143 |
144 | if self.debug_print:
145 | print(f"batch step size {step_size} and exp_avg_step {exp_avg_lr}")
146 |
147 | #Blend the mini-batch step size with long term memory
148 | step_size = step_size.add(exp_avg_lr)
149 | step_size = step_size.div(2.)
150 |
151 |
152 | # update momentum with dfc
153 | exp_avg1 = exp_avg * dfc
154 |
155 | step_size.mul_(exp_avg1)
156 |
157 | p.data.add_(-step_size)
158 |
159 | return loss
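160 |
161 |
162 | # Illustrative smoke test: a minimal sketch that drives one toy epoch of a linear model
163 | # with DeepMemory. The model, data, and len_memory value are invented for the example;
164 | # len_memory is set to the number of batches in the toy loop, mirroring the "batches per
165 | # epoch" guidance above. Assumes the PyTorch 1.x API used elsewhere in this repo.
166 | if __name__ == "__main__":
167 |     import torch.nn as nn
168 |     import torch.nn.functional as F
169 |
170 |     torch.manual_seed(0)
171 |     model = nn.Linear(10, 1)
172 |     opt = DeepMemory(model.parameters(), lr=4e-3, len_memory=8)
173 |     for _ in range(8):
174 |         x, y = torch.randn(64, 10), torch.randn(64, 1)
175 |         loss = F.mse_loss(model(x), y)
176 |         opt.zero_grad()
177 |         loss.backward()
178 |         opt.step()
179 |     print(f"smoke test final loss: {loss.item():.4f}")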
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Best-Deep-Learning-Optimizers
2 | A collection of the latest and greatest deep learning optimizers (for PyTorch) - suitable for CNNs, Transformers, and NLP models.
3 |
4 | Current top performers: benchmarks have not been run recently and a lot has changed, but the quick recommendation is madgrad or adahessian for transformers and CNNs, and Ranger for CNN-only work.
5 |
6 | ## Updates -
7 | April 2021: Meet Madgrad! Added Madgrad with an improvement to weight decay. Madgrad is a new optimizer released by FB AI in February. In testing with transformers for image classification, madgrad blew away the various Adam variants.
8 | However, as spotted by @nestordemeure, the weight decay implementation was Adam-style (L2 penalty) rather than AdamW-style (decoupled).
9 | In testing, AdamW-style weight decay was the winner, so the implementation here includes my modification to use AdamW-style weight decay.
10 |
11 | Recommendations: test with a) no weight decay, as recommended by the Madgrad authors, and b) weight decay at the same level you would use for AdamW, with this madgrad_wd version (see the sketch below).
12 |
13 | Important: madgrad is very different from Adam variants, so start with the madgrad default lr and do a quick range of lr tests. Do not just reuse the lr that worked for your dataset with Adam-style optimizers.
14 |
15 | Modified madgrad is here: https://github.com/lessw2020/Best-Deep-Learning-Optimizers/tree/master/madgrad
16 |
17 | And original madgrad is here: https://github.com/facebookresearch/madgrad
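A minimal sketch of the recommended A/B test. The import path, class name, and argument names here are assumptions for illustration only - check madgrad/madgrad_wd.py in this repo for the exact signature:

from madgrad_wd import madgrad_wd  # hypothetical import path - adjust to where you placed the file

# a) Madgrad authors' recommendation: no weight decay
opt_a = madgrad_wd(model.parameters(), lr=1e-2, weight_decay=0)

# b) AdamW-style weight decay at the level you would normally use for AdamW
opt_b = madgrad_wd(model.parameters(), lr=1e-2, weight_decay=1e-2)

# in both cases, run a quick lr range test from the madgrad default - do not reuse an Adam-tuned lr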
18 |
19 | Pending work: there is a new paper discussing Stable Weight Decay as possibly the best form of weight decay. Planning to implement and test it with madgrad soon.
20 |
21 | August 2020 - AdaHessian, the first 'it really works and works really well' second order optimizer added:
22 | I tested AdaHessian last month on work datasets and it performed extremely well. It's like training with a guided missile compared to most other optimizers.
23 | The big caveat is that you will need about 2x the normal GPU memory to run it compared with a 'first order' optimizer.
24 | I am trying to get a Titan GPU with 24GB of GPU memory just for this purpose at the moment.
25 |
26 |
27 | New version of Ranger with the highest accuracy to date of all optimizers tested:
28 | April 11 - New version of Ranger released (20.4.11), highest score for accuracy to date.
29 | Ranger has been upgraded to use Gradient Centralization. See: https://arxiv.org/abs/2004.01461 and github: https://github.com/Yonghongwei/Gradient-Centralization
30 |
31 | It will now use GC by default and apply it to both conv layers and fc layers. You can turn it on or off with "use_gc" at init to test the difference on your datasets.
32 | 
33 | (image from gc github).
34 | The summary of gradient centralization: "GC can be viewed as a projected gradient descent method with a constrained loss function. The Lipschitzness of the constrained loss function and its gradient is better so that the training process becomes more efficient and stable."
35 |
36 |
37 | Note - for optimal accuracy, make sure you run with a flat lr for some time and then cosine-descend the lr (72% flat / 28% descent). If you don't have an lr scheduling framework, you can get very comparable results by running at one rate for the first 72% of training, then stopping, decreasing the lr, and running the remaining 28% (see the sketch below).
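One way to implement the flat-then-cosine schedule in plain PyTorch, if you are not using a framework that provides it. This is only a sketch; `optimizer` and `total_steps` are assumed to come from your own training loop:

import math
from torch.optim.lr_scheduler import LambdaLR

def flat_then_cosine(total_steps, flat_pct=0.72):
    flat_steps = int(total_steps * flat_pct)
    def schedule(step):
        if step < flat_steps:
            return 1.0  # flat phase: keep the base lr
        # cosine descent over the remaining steps
        progress = (step - flat_steps) / max(1, total_steps - flat_steps)
        return 0.5 * (1.0 + math.cos(math.pi * progress))
    return schedule

scheduler = LambdaLR(optimizer, lr_lambda=flat_then_cosine(total_steps))
# call scheduler.step() once per batch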
38 |
39 | ## Usage - GC on by default but you can control all aspects at init:
40 | 
41 |
42 | ## Ranger will print settings at first init so you can confirm optimization is set the way you want it:
43 | 
44 |
45 | Future work: MARTHE, HyperAdam and other optimizers will be tested and posted if they look good.
46 |
47 |
48 | 12/27 - added DiffGrad, and unofficial version 1 support (coded from the paper).
49 |
50 | 12/28 - added Diff_RGrad = diffGrad + Rectified Adam to handle the start of training...seems to work quite well.
51 |
52 | Medium article (summary and FastAI example usage):
53 | https://medium.com/@lessw/meet-diffgrad-new-deep-learning-optimizer-that-solves-adams-overshoot-issue-ec63e28e01b2
54 |
55 | Official diffGrad paper: https://arxiv.org/abs/1909.11015v2
56 |
57 | 12/31 - AdaMod and DiffMod added. Initial SLS files added (but more work needed).
58 |
59 |
60 | In Progress:
61 | A - Parabolic Approximation Line Search: https://arxiv.org/abs/1903.11991v2
62 |
63 | B - Stochastic Line Search (SLS): pending (needs param group support)
64 |
65 | C - AvaGrad
66 |
67 |
68 | General papers of relevance:
69 |
70 | Does Adam stick close to the optimal point? https://arxiv.org/abs/1911.00289v1
71 |
72 |
73 | Probabilistic line searches for stochastic optimization (2017, Matlab only but good theory work): https://arxiv.org/abs/1703.10034v2
74 |
--------------------------------------------------------------------------------
/Ranger/ranger.py:
--------------------------------------------------------------------------------
1 | # Ranger deep learning optimizer - RAdam + Lookahead + Gradient Centralization, combined into one optimizer.
2 |
3 | # https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer
4 | # and/or
5 | # https://github.com/lessw2020/Best-Deep-Learning-Optimizers
6 |
7 | # Ranger has now been used to capture 12 records on the FastAI leaderboard.
8 |
9 | # This version = 20.4.11
10 |
11 | # Credits:
12 | # Gradient Centralization --> https://arxiv.org/abs/2004.01461v2 (a new optimization technique for DNNs), github: https://github.com/Yonghongwei/Gradient-Centralization
13 | # RAdam --> https://github.com/LiyuanLucasLiu/RAdam
14 | # Lookahead --> rewritten by lessw2020, but big thanks to Github @LonePatient and @RWightman for ideas from their code.
15 | # Lookahead paper --> MZhang,G Hinton https://arxiv.org/abs/1907.08610
16 |
17 | # summary of changes:
18 | # 4/11/20 - add gradient centralization option. Set new testing benchmark for accuracy with it, toggle with use_gc flag at init.
19 | # full code integration with all updates at param level instead of group, moves slow weights into state dict (from generic weights),
20 | # supports group learning rates (thanks @SHolderbach), fixes sporadic load from saved model issues.
21 | # changes 8/31/19 - fix references to *self*.N_sma_threshold;
22 | # changed eps to 1e-5 as better default than 1e-8.
23 |
24 | import math
25 | import torch
26 | from torch.optim.optimizer import Optimizer, required
27 |
28 |
29 |
30 | class Ranger(Optimizer):
31 |
32 | def __init__(self, params, lr=1e-3, # lr
33 | alpha=0.5, k=6, N_sma_threshhold=5, # Ranger options
34 | betas=(.95,0.999), eps=1e-5, weight_decay=0, # Adam options
35 | use_gc=True, gc_conv_only=False # Gradient centralization on or off, applied to conv layers only or conv + fc layers
36 | ):
37 |
38 | #parameter checks
39 | if not 0.0 <= alpha <= 1.0:
40 | raise ValueError(f'Invalid slow update rate: {alpha}')
41 | if not 1 <= k:
42 | raise ValueError(f'Invalid lookahead steps: {k}')
43 | if not lr > 0:
44 | raise ValueError(f'Invalid Learning Rate: {lr}')
45 | if not eps > 0:
46 | raise ValueError(f'Invalid eps: {eps}')
47 |
48 | #parameter comments:
49 | # beta1 (momentum) of .95 seems to work better than .90...
50 | #N_sma_threshold of 5 seems better in testing than 4.
51 | #In both cases, worth testing on your dataset (.90 vs .95, 4 vs 5) to make sure which works best for you.
52 |
53 | #prep defaults and init torch.optim base
54 | defaults = dict(lr=lr, alpha=alpha, k=k, step_counter=0, betas=betas, N_sma_threshhold=N_sma_threshhold, eps=eps, weight_decay=weight_decay)
55 | super().__init__(params,defaults)
56 |
57 | #adjustable threshold
58 | self.N_sma_threshhold = N_sma_threshhold
59 |
60 |
61 | #look ahead params
62 |
63 | self.alpha = alpha
64 | self.k = k
65 |
66 | #radam buffer for state
67 | self.radam_buffer = [[None,None,None] for ind in range(10)]
68 |
69 | #gc on or off
70 | self.use_gc=use_gc
71 |
72 | #level of gradient centralization
73 | self.gc_gradient_threshold = 3 if gc_conv_only else 1
74 |
75 |
76 | print(f"Ranger optimizer loaded. \nGradient Centralization usage = {self.use_gc}")
77 | if (self.use_gc and self.gc_gradient_threshold==1):
78 | print(f"GC applied to both conv and fc layers")
79 | elif (self.use_gc and self.gc_gradient_threshold==3):
80 | print(f"GC applied to conv layers only")
81 |
82 |
83 |
84 |
85 |
86 | def __setstate__(self, state):
87 | print("set state called")
88 | super(Ranger, self).__setstate__(state)
89 |
90 |
91 | def step(self, closure=None):
92 | loss = None
93 | #note - below is commented out b/c I have other work that passes back the loss as a float, and thus not a callable closure.
94 | #Uncomment if you need to use the actual closure...
95 |
96 | #if closure is not None:
97 | #loss = closure()
98 |
99 | #Evaluate averages and grad, update param tensors
100 | for group in self.param_groups:
101 |
102 | for p in group['params']:
103 | if p.grad is None:
104 | continue
105 | grad = p.grad.data.float()
106 |
107 | if grad.is_sparse:
108 | raise RuntimeError('Ranger optimizer does not support sparse gradients')
109 |
110 | p_data_fp32 = p.data.float()
111 |
112 | state = self.state[p] #get state dict for this param
113 |
114 | if len(state) == 0: #if first time to run...init dictionary with our desired entries
115 | #if self.first_run_check==0:
116 | #self.first_run_check=1
117 | #print("Initializing slow buffer...should not see this at load from saved model!")
118 | state['step'] = 0
119 | state['exp_avg'] = torch.zeros_like(p_data_fp32)
120 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
121 |
122 | #look ahead weight storage now in state dict
123 | state['slow_buffer'] = torch.empty_like(p.data)
124 | state['slow_buffer'].copy_(p.data)
125 |
126 | else:
127 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
128 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
129 |
130 | #begin computations
131 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
132 | beta1, beta2 = group['betas']
133 |
134 |
135 | #GC operation for Conv layers and FC layers
136 | if grad.dim() > self.gc_gradient_threshold:
137 | grad.add_(-grad.mean(dim = tuple(range(1,grad.dim())), keepdim = True))
138 |
139 |
140 |
141 | state['step'] += 1
142 |
143 | #compute variance mov avg
144 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
145 | #compute mean moving avg
146 | exp_avg.mul_(beta1).add_(1 - beta1, grad)
147 |
148 |
149 |
150 |
151 |
152 | buffered = self.radam_buffer[int(state['step'] % 10)]
153 |
154 | if state['step'] == buffered[0]:
155 | N_sma, step_size = buffered[1], buffered[2]
156 | else:
157 | buffered[0] = state['step']
158 | beta2_t = beta2 ** state['step']
159 | N_sma_max = 2 / (1 - beta2) - 1
160 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
161 | buffered[1] = N_sma
162 | if N_sma > self.N_sma_threshhold:
163 | step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
164 | else:
165 | step_size = 1.0 / (1 - beta1 ** state['step'])
166 | buffered[2] = step_size
167 |
168 |
169 | if group['weight_decay'] != 0:
170 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32)
171 |
172 | # apply lr
173 | if N_sma > self.N_sma_threshhold:
174 | denom = exp_avg_sq.sqrt().add_(group['eps'])
175 | p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom)
176 | else:
177 | p_data_fp32.add_(-step_size * group['lr'], exp_avg)
178 |
179 | p.data.copy_(p_data_fp32)
180 |
181 | #integrated look ahead...
182 | #we do it at the param level instead of group level
183 | if state['step'] % group['k'] == 0:
184 | slow_p = state['slow_buffer'] #get access to slow param tensor
185 | slow_p.add_(self.alpha, p.data - slow_p) #(fast weights - slow weights) * alpha
186 | p.data.copy_(slow_p) #copy interpolated weights to RAdam param tensor
187 |
188 | return loss
189 |
--------------------------------------------------------------------------------
/adahessian/README.md:
--------------------------------------------------------------------------------
1 | adahessian is the first 'second order' optimizer that actually performs (and does so extremely well) on real data.
2 | The big drawback is you'll need to have about 2x the GPU memory that you would otherwise need to run.
3 |
4 | The official github for adahessian is here:
5 | https://github.com/amirgholami/adahessian
6 |
7 | In the implementation here, I've consolidated it into a single-file import instead of the separate util + optim files in the official repo, to make it easier to use.
8 |
9 | Note that you have to update your training loop as below:
10 | # usage example:
11 | from adahessian import Adahessian, get_params_grad
12 | import torch.optim.lr_scheduler as lr_scheduler
13 | #
14 | optimizer = Adahessian(model.parameters(),lr=.15)
15 | scheduler = lr_scheduler.MultiStepLR(
16 | optimizer,
17 | [30,45], #
18 | gamma=.1,
19 | last_epoch=-1)
20 |
21 | #
22 | # config for training loop:
23 | #
24 | loss.backward(create_graph=True)
25 | _, gradsH = get_params_grad(model)
26 | optimizer.step(gradsH)
27 |
28 |
29 |
--------------------------------------------------------------------------------
/adahessian/adahessian.py:
--------------------------------------------------------------------------------
1 | #*
2 | # @file Different utility functions
3 | # Copyright (c) Zhewei Yao, Amir Gholami, Sheng Shen
4 | # All rights reserved.
5 | # This file is part of AdaHessian library.
6 | # source: https://github.com/amirgholami/adahessian
7 | #
8 | # AdaHessian is free software: you can redistribute it and/or modify
9 | # it under the terms of the GNU General Public License as published by
10 | # the Free Software Foundation, either version 3 of the License, or
11 | # (at your option) any later version.
12 | #
13 | # AdaHessian is distributed in the hope that it will be useful,
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | # GNU General Public License for more details.
17 | #
18 | # You should have received a copy of the GNU General Public License
19 | # along with adahessian. If not, see <http://www.gnu.org/licenses/>.
20 | #*
21 |
22 | import math
23 | import torch
24 | from torch.optim.optimizer import Optimizer
25 | from copy import deepcopy
26 | import numpy as np
27 |
28 | # imported from utils to avoid needing two imports... @lessw2020
29 | def get_params_grad(model):
30 | """
31 | get model parameters and corresponding gradients
32 | """
33 | params = []
34 | grads = []
35 | for param in model.parameters():
36 | if not param.requires_grad:
37 | continue
38 | params.append(param)
39 | grads.append(0. if param.grad is None else param.grad + 0.)
40 | return params, grads
41 |
42 |
43 | class Adahessian(Optimizer):
44 | """Implements Adahessian algorithm.
45 |     It has been proposed in `ADAHESSIAN: An Adaptive Second Order Optimizer for Machine Learning`.
46 | Arguments:
47 | params (iterable): iterable of parameters to optimize or dicts defining
48 | parameter groups
49 | lr (float, optional): learning rate (default: 0.15)
50 | betas (Tuple[float, float], optional): coefficients used for computing
51 | running averages of gradient and its square (default: (0.9, 0.999))
52 | eps (float, optional): term added to the denominator to improve
53 | numerical stability (default: 1e-4)
54 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
55 | hessian_power (float, optional): Hessian power (default: 1)
56 | """
57 |
58 | def __init__(self, params, lr=0.15, betas=(0.9, 0.999), eps=1e-4,
59 | weight_decay=0, hessian_power=1):
60 | if not 0.0 <= lr:
61 | raise ValueError("Invalid learning rate: {}".format(lr))
62 | if not 0.0 <= eps:
63 | raise ValueError("Invalid epsilon value: {}".format(eps))
64 | if not 0.0 <= betas[0] < 1.0:
65 | raise ValueError(
66 | "Invalid beta parameter at index 0: {}".format(
67 | betas[0]))
68 | if not 0.0 <= betas[1] < 1.0:
69 | raise ValueError(
70 | "Invalid beta parameter at index 1: {}".format(
71 | betas[1]))
72 | if not 0.0 <= hessian_power <= 1.0:
73 | raise ValueError("Invalid Hessian power value: {}".format(hessian_power))
74 | defaults = dict(lr=lr, betas=betas, eps=eps,
75 | weight_decay=weight_decay, hessian_power=hessian_power)
76 |
77 | super(Adahessian, self).__init__(params, defaults)
78 |
79 | def get_trace(self, gradsH):
80 | """
81 | compute the Hessian vector product with a random vector v, at the current gradient point,
82 |         i.e., compute the gradient of <gradsH, v>.
83 | :param gradsH: a list of torch variables
84 | :return: a list of torch tensors
85 | """
86 |
87 | params = self.param_groups[0]['params']
88 |
89 | v = [torch.randint_like(p, high=2, device='cuda') for p in params]
90 | for v_i in v:
91 | v_i[v_i == 0] = -1
92 | hvs = torch.autograd.grad(
93 | gradsH,
94 | params,
95 | grad_outputs=v,
96 | only_inputs=True,
97 | retain_graph=True)
98 |
99 | hutchinson_trace = []
100 | for hv, vi in zip(hvs, v):
101 | param_size = hv.size()
102 | if len(param_size) <= 2: # for 0/1/2D tensor
103 | tmp_output = torch.abs(hv * vi)
104 | hutchinson_trace.append(tmp_output) # Hessian diagonal block size is 1 here.
105 | elif len(param_size) == 4: # Conv kernel
106 | tmp_output = torch.abs(torch.sum(torch.abs(
107 | hv * vi), dim=[2, 3], keepdim=True)) / vi[0, 1].numel() # Hessian diagonal block size is 9 here: torch.sum() reduces the dim 2/3.
108 | hutchinson_trace.append(tmp_output)
109 |
110 | return hutchinson_trace
111 |
112 | def step(self, gradsH, closure=None):
113 | """Performs a single optimization step.
114 | Arguments:
115 | gradsH: The gradient used to compute Hessian vector product.
116 | closure (callable, optional): A closure that reevaluates the model
117 | and returns the loss.
118 | """
119 | loss = None
120 | if closure is not None:
121 | loss = closure()
122 |
123 | # get the Hessian diagonal
124 | hut_trace = self.get_trace(gradsH)
125 |
126 | for group in self.param_groups:
127 | for i, p in enumerate(group['params']):
128 | if p.grad is None:
129 | continue
130 |
131 | grad = deepcopy(gradsH[i].data)
132 | state = self.state[p]
133 |
134 | # State initialization
135 | if len(state) == 0:
136 | state['step'] = 0
137 | # Exponential moving average of gradient values
138 | state['exp_avg'] = torch.zeros_like(p.data)
139 | # Exponential moving average of Hessian diagonal square values
140 | state['exp_hessian_diag_sq'] = torch.zeros_like(p.data)
141 |
142 | exp_avg, exp_hessian_diag_sq = state['exp_avg'], state['exp_hessian_diag_sq']
143 |
144 | beta1, beta2 = group['betas']
145 |
146 | state['step'] += 1
147 |
148 | # Decay the first and second moment running average coefficient
149 | exp_avg.mul_(beta1).add_(1 - beta1, grad)
150 | exp_hessian_diag_sq.mul_(beta2).addcmul_(
151 | 1 - beta2, hut_trace[i], hut_trace[i])
152 |
153 | bias_correction1 = 1 - beta1 ** state['step']
154 | bias_correction2 = 1 - beta2 ** state['step']
155 |
156 | # make the square root, and the Hessian power
157 | k = group['hessian_power']
158 | denom = (
159 | (exp_hessian_diag_sq.sqrt() ** k) /
160 | math.sqrt(bias_correction2) ** k).add_(
161 | group['eps'])
162 |
163 | # make update
164 | p.data = p.data - \
165 | group['lr'] * (exp_avg / bias_correction1 / denom + group['weight_decay'] * p.data)
166 |
167 | return loss
168 |
--------------------------------------------------------------------------------
/adamod/README.md:
--------------------------------------------------------------------------------
1 | AdaMod is a new optimizer that takes Adam and adds an exponential moving average of the adaptive learning rates, which it uses as an upper bound on each step.
2 | This prevents large spikes in the effective learning rate during training and helps achieve faster and better convergence.
3 |
4 | Original source code and paper: https://github.com/lancopku/AdaMod
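A minimal init sketch for the AdaMod implementation in this folder (assuming adamod.py is importable; len_memory is converted internally to beta3 = 1 - 1/len_memory, so len_memory=1000 gives beta3 = 0.999):

from adamod import AdaMod

optimizer = AdaMod(model.parameters(), lr=1e-3, len_memory=1000, weight_decay=0)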
5 |
6 | DiffMod is a combination of DiffGrad + AdaMod = DiffMod.
7 |
8 | Currently DiffMod, using version 0 of DiffGrad, appears to be the best performer of all. But more testing is needed.
9 |
10 | Usage:
11 | from diffmod import DiffMod
12 | optar = partial(DiffMod,version=0)
13 | learn = Learner(data, model, metrics=[accuracy], wd=1e-3,
14 | opt_func=optar,
15 | bn_wd=False, true_wd=True,
16 | loss_func = LabelSmoothingCrossEntropy())
17 |
--------------------------------------------------------------------------------
/adamod/adamod.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | from torch.optim import Optimizer
4 |
5 | #source - https://github.com/lancopku/AdaMod/blob/master/adamod/adamod.py
6 | #modification - lessw2020 - use len_memory as integer lookback, convert to beta3 for easier usage
7 |
8 | class AdaMod(Optimizer):
9 | """Implements AdaMod algorithm with Decoupled Weight Decay (arxiv.org/abs/1711.05101)
10 | It has been proposed in `Adaptive and Momental Bounds for Adaptive Learning Rate Methods`_.
11 | Arguments:
12 | params (iterable): iterable of parameters to optimize or dicts defining
13 | parameter groups
14 | lr (float, optional): learning rate (default: 1e-3)
15 | betas (Tuple[float, float], optional): coefficients used for computing
16 | running averages of gradient and its square (default: (0.9, 0.999))
17 | beta3 (float, optional): smoothing coefficient for adaptive learning rates (default: 0.9999)
18 | eps (float, optional): term added to the denominator to improve
19 | numerical stability (default: 1e-8)
20 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
21 | """
22 |
23 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999),
24 | len_memory=1000, #will convert to beta3
25 | eps=1e-8, weight_decay=0):
26 | if not 0.0 <= lr:
27 | raise ValueError("Invalid learning rate: {}".format(lr))
28 | if not 0.0 <= eps:
29 | raise ValueError("Invalid epsilon value: {}".format(eps))
30 | if not 0.0 <= betas[0] < 1.0:
31 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
32 | if not 0.0 <= betas[1] < 1.0:
33 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
34 |
35 | beta3 = 1 - (1/len_memory)
36 | print(f"AdaMod optimizer: len_memory of {len_memory} set at Beta3 of {beta3}")
37 | if not 0.0 <= beta3 < 1.0:
38 | raise ValueError("Invalid beta3 parameter: {}".format(beta3))
39 | defaults = dict(lr=lr, betas=betas, beta3=beta3, eps=eps,
40 | weight_decay=weight_decay)
41 | super().__init__(params, defaults)
42 |
43 | def __setstate__(self, state):
44 | super().__setstate__(state)
45 |
46 | def step(self, closure=None):
47 | """Performs a single optimization step.
48 | Arguments:
49 | closure (callable, optional): A closure that reevaluates the model
50 | and returns the loss.
51 | """
52 | loss = None
53 | if closure is not None:
54 | loss = closure()
55 |
56 | for group in self.param_groups:
57 | for p in group['params']:
58 | if p.grad is None:
59 | continue
60 | grad = p.grad.data
61 | if grad.is_sparse:
62 | raise RuntimeError(
63 | 'AdaMod does not support sparse gradients')
64 |
65 | state = self.state[p]
66 |
67 | # State initialization
68 | if len(state) == 0:
69 | state['step'] = 0
70 | # Exponential moving average of gradient values
71 | state['exp_avg'] = torch.zeros_like(p.data)
72 | # Exponential moving average of squared gradient values
73 | state['exp_avg_sq'] = torch.zeros_like(p.data)
74 | # Exponential moving average of actual learning rates
75 | state['exp_avg_lr'] = torch.zeros_like(p.data)
76 |
77 | exp_avg, exp_avg_sq, exp_avg_lr = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_lr']
78 | beta1, beta2 = group['betas']
79 |
80 | state['step'] += 1
81 |
82 | # Decay the first and second moment running average coefficient
83 | exp_avg.mul_(beta1).add_(1 - beta1, grad)
84 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
85 |
86 | denom = exp_avg_sq.sqrt().add_(group['eps'])
87 |
88 | bias_correction1 = 1 - beta1 ** state['step']
89 | bias_correction2 = 1 - beta2 ** state['step']
90 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
91 |
92 | if group['weight_decay'] != 0:
93 | p.data.add_(-group['weight_decay'] * group['lr'], p.data)
94 |
95 | # Applies momental bounds on actual learning rates
96 | step_size = torch.full_like(denom, step_size)
97 | step_size.div_(denom)
98 | exp_avg_lr.mul_(group['beta3']).add_(1 - group['beta3'], step_size)
99 | step_size = torch.min(step_size, exp_avg_lr)
100 | step_size.mul_(exp_avg)
101 |
102 | p.data.add_(-step_size)
103 |
104 | return loss
105 |
--------------------------------------------------------------------------------
/adamod/diffmod.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | from torch.optim import Optimizer
4 |
5 | # source and paper link - https://github.com/lancopku/AdaMod/blob/master/adamod/adamod.py
6 |
7 | # modifications @lessw2020 - blend diffGrad + AdaMod = diffmod.
8 | # 1/1/20 = instead of b3, change to 'len_memory' and compute b3 (.99 is really 100 memory as 1-(1/100)= .99)
9 |
10 |
11 | class DiffMod(Optimizer):
12 | """Implements AdaMod algorithm with Decoupled Weight Decay (arxiv.org/abs/1711.05101)
13 | It has been proposed in `Adaptive and Momental Bounds for Adaptive Learning Rate Methods`_.
14 | Arguments:
15 | params (iterable): iterable of parameters to optimize or dicts defining
16 | parameter groups
17 | lr (float, optional): learning rate (default: 1e-3)
18 | betas (Tuple[float, float], optional): coefficients used for computing
19 | running averages of gradient and its square (default: (0.9, 0.999))
20 | beta3 (float, optional): smoothing coefficient for adaptive learning rates (default: 0.9999)
21 | len_memory = b3 in easier to use format. specify the memory len, b3 is computed.
22 | eps (float, optional): term added to the denominator to improve
23 | numerical stability (default: 1e-8)
24 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
25 | """
26 |
27 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), len_memory=1000, version=0,
28 | eps=1e-8, weight_decay=0):
29 | if not 0.0 <= lr:
30 | raise ValueError("Invalid learning rate: {}".format(lr))
31 | if not 0.0 <= eps:
32 | raise ValueError("Invalid epsilon value: {}".format(eps))
33 | if not 0.0 <= betas[0] < 1.0:
34 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
35 | if not 0.0 <= betas[1] < 1.0:
36 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
37 |
38 | #compute b3
39 | beta3 = 1-(1/len_memory)
40 |         print(f"length of memory is {len_memory} and b3 is thus {beta3}")
41 |
42 | if not 0.0 <= beta3 < 1.0:
43 | raise ValueError("Invalid beta3 parameter: {}".format(beta3))
44 |
45 | defaults = dict(lr=lr, betas=betas, beta3=beta3, eps=eps,
46 | weight_decay=weight_decay)
47 | super().__init__(params, defaults)
48 |
49 | self.version = version
50 |
51 | def __setstate__(self, state):
52 | super().__setstate__(state)
53 |
54 | def step(self, closure=None):
55 | """Performs a single optimization step.
56 | Arguments:
57 | closure (callable, optional): A closure that reevaluates the model
58 | and returns the loss.
59 | """
60 | loss = None
61 | if closure is not None:
62 | loss = closure()
63 |
64 | for group in self.param_groups:
65 | for p in group['params']:
66 | if p.grad is None:
67 | continue
68 | grad = p.grad.data
69 | if grad.is_sparse:
70 | raise RuntimeError(
71 | 'DiffMod does not support sparse gradients')
72 |
73 | state = self.state[p]
74 |
75 | # State initialization
76 | if len(state) == 0:
77 | state['step'] = 0
78 | # Exponential moving average of gradient values
79 | state['exp_avg'] = torch.zeros_like(p.data)
80 | # Exponential moving average of squared gradient values
81 | state['exp_avg_sq'] = torch.zeros_like(p.data)
82 | # Exponential moving average of actual learning rates
83 | state['exp_avg_lr'] = torch.zeros_like(p.data)
84 | # Previous gradient
85 | state['previous_grad'] = torch.zeros_like(p.data)
86 |
87 |
88 | exp_avg, exp_avg_sq, exp_avg_lr = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_lr']
89 | previous_grad = state['previous_grad']
90 | beta1, beta2 = group['betas']
91 |
92 | state['step'] += 1
93 |
94 | # Decay the first and second moment running average coefficient
95 | exp_avg.mul_(beta1).add_(1 - beta1, grad)
96 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
97 |
98 | denom = exp_avg_sq.sqrt().add_(group['eps'])
99 |
100 | bias_correction1 = 1 - beta1 ** state['step']
101 | bias_correction2 = 1 - beta2 ** state['step']
102 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
103 |
104 | # compute diffgrad coefficient (dfc)
105 |
106 |
107 | if self.version==0:
108 | diff = abs(previous_grad - grad)
109 | elif self.version ==1:
110 | diff = previous_grad-grad
111 | elif self.version ==2:
112 | diff = .5*abs(previous_grad - grad)
113 |
114 | if self.version==0 or self.version==1:
115 | dfc = 1. / (1. + torch.exp(-diff))
116 | elif self.version==2:
117 |                     dfc = 9. / (1. + torch.exp(-diff))-4 # DFC2 = 9/(1 + e^(-0.5*|prev_grad - grad|)) - 4, range (0.5, 5)
118 |
119 | state['previous_grad'] = grad
120 |
121 | if group['weight_decay'] != 0:
122 | p.data.add_(-group['weight_decay'] * group['lr'], p.data)
123 |
124 | # Applies momental bounds on actual learning rates
125 | step_size = torch.full_like(denom, step_size)
126 | step_size.div_(denom)
127 | exp_avg_lr.mul_(group['beta3']).add_(1 - group['beta3'], step_size)
128 | step_size = torch.min(step_size, exp_avg_lr)
129 |
130 | # update momentum with dfc
131 | exp_avg1 = exp_avg * dfc
132 |
133 | step_size.mul_(exp_avg1)
134 |
135 | p.data.add_(-step_size)
136 |
137 | return loss
--------------------------------------------------------------------------------
/diffgrad/README.md:
--------------------------------------------------------------------------------
1 | DiffGrad adjusts the step size for each parameter by comparing the current gradient with the previous one. It is designed to solve the Adam
2 | overshoot problem, where the momentum of Adam can carry it right over the global minimum.
3 |
4 | https://github.com/shivram1987/diffGrad for original source
5 |
6 | and paper: https://arxiv.org/abs/1909.11015v2
7 |
8 | (TF version - if you are forced to use TF, here's a TF version of diffgrad:
9 | https://github.com/evanatyourservice/diffGrad-tf )
10 |
11 |
12 | This version adds a version parameter: version 0 is the main one used in the paper; version 1 removes the abs value from the calculation and
13 | allows faster clamping.
14 | Use version=1 in your optimizer params; version=0 is the default (see the sketch below).
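A minimal sketch (assuming diffgrad.py is importable from your working directory):

from diffgrad import DiffGrad

optimizer = DiffGrad(model.parameters(), lr=1e-3, version=0)    # paper version (default)
# optimizer = DiffGrad(model.parameters(), lr=1e-3, version=1)  # no abs() on the gradient difference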
15 |
16 | 12/27 - added DiffRGrad - this is diffGrad combined with Rectified Adam for the start of training. Thus no warmup is needed, and diffGrad kicks in once Rectified Adam determines the variance estimate is ready.
17 |
18 | Medium article and example usage: https://medium.com/@lessw/meet-diffgrad-new-deep-learning-optimizer-that-solves-adams-overshoot-issue-ec63e28e01b2
19 |
--------------------------------------------------------------------------------
/diffgrad/diff_rgrad.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | from torch.optim.optimizer import Optimizer, required
4 |
5 | # Original source: DiffGrad: https://github.com/shivram1987/diffGrad/blob/master/diffGrad.py
6 | # RAdam: https://github.com/LiyuanLucasLiu/RAdam/blob/master/radam.py
7 | # modifications: @lessw2020 - blend RAdam with DiffGrad and add version options
8 | # __version__: 12.27.19
9 |
10 |
11 | class diffRGrad(Optimizer):
12 |
13 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
14 | version=1,
15 | weight_decay=0, degenerated_to_sgd=True):
16 | if not 0.0 <= lr:
17 | raise ValueError("Invalid learning rate: {}".format(lr))
18 | if not 0.0 <= eps:
19 | raise ValueError("Invalid epsilon value: {}".format(eps))
20 | if not 0.0 <= betas[0] < 1.0:
21 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
22 | if not 0.0 <= betas[1] < 1.0:
23 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
24 |
25 | self.degenerated_to_sgd = degenerated_to_sgd
26 |
27 | self.version = version
28 |
29 | if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict):
30 | for param in params:
31 | if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]):
32 | param['buffer'] = [[None, None, None] for _ in range(10)]
33 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)])
34 | super(diffRGrad, self).__init__(params, defaults)
35 |
36 | def __setstate__(self, state):
37 | super(diffRGrad, self).__setstate__(state)
38 |
39 | def step(self, closure=None):
40 |
41 | loss = None
42 | if closure is not None:
43 | loss = closure()
44 |
45 | for group in self.param_groups:
46 |
47 | for p in group['params']:
48 | if p.grad is None:
49 | continue
50 | grad = p.grad.data.float()
51 | if grad.is_sparse:
52 |                     raise RuntimeError('diffRGrad does not support sparse gradients')
53 |
54 | p_data_fp32 = p.data.float()
55 |
56 | state = self.state[p]
57 |
58 | if len(state) == 0:
59 | state['step'] = 0
60 | state['exp_avg'] = torch.zeros_like(p_data_fp32)
61 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
62 | # Previous gradient
63 | state['previous_grad'] = torch.zeros_like(p_data_fp32)
64 |
65 | else:
66 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
67 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
68 | state['previous_grad'] = state['previous_grad'].type_as(p_data_fp32)
69 |
70 |
71 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
72 | previous_grad = state['previous_grad']
73 | beta1, beta2 = group['betas']
74 |
75 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
76 | exp_avg.mul_(beta1).add_(1 - beta1, grad)
77 |
78 | state['step'] += 1
79 |
80 | # compute diffgrad coefficient (dfc)
81 |
82 | #print("grad = ",grad.size())
83 | #print("prev_grad = ",previous_grad.size())
84 |
85 | if self.version==0:
86 | diff = abs(previous_grad - grad)
87 | elif self.version ==1:
88 | diff = previous_grad-grad
89 | elif self.version ==2:
90 | diff = .5*abs(previous_grad - grad)
91 |
92 | if self.version==0 or self.version==1:
93 | dfc = 1. / (1. + torch.exp(-diff))
94 | elif self.version==2:
95 |                     dfc = 9. / (1. + torch.exp(-diff))-4 # DFC2 = 9/(1 + e^(-0.5*|prev_grad - grad|)) - 4, range (0.5, 5)
96 |
97 | state['previous_grad'] = grad
98 |
99 |
100 | buffered = group['buffer'][int(state['step'] % 10)]
101 | if state['step'] == buffered[0]:
102 | N_sma, step_size = buffered[1], buffered[2]
103 | else:
104 | buffered[0] = state['step']
105 | beta2_t = beta2 ** state['step']
106 | N_sma_max = 2 / (1 - beta2) - 1
107 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
108 | buffered[1] = N_sma
109 |
110 | # more conservative since it's an approximated value
111 | if N_sma >= 5:
112 | step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
113 | elif self.degenerated_to_sgd:
114 | step_size = 1.0 / (1 - beta1 ** state['step'])
115 | else:
116 | step_size = -1
117 | buffered[2] = step_size
118 |
119 |
120 |
121 |
122 | # more conservative since it's an approximated value
123 | if N_sma >= 5:
124 | if group['weight_decay'] != 0:
125 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32)
126 |
127 | denom = exp_avg_sq.sqrt().add_(group['eps'])
128 |
129 | # update momentum with dfc
130 | #print("dfc ",dfc.size())
131 | #print("exp_avg ",exp_avg.size())
132 | exp_avg1 = exp_avg * dfc.float()
133 |
134 |
135 | p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg1, denom)
136 | p.data.copy_(p_data_fp32)
137 |
138 | elif step_size > 0:
139 |
140 | #print("exp_avg in elif",exp_avg.size())
141 | if group['weight_decay'] != 0:
142 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32)
143 |
144 | p_data_fp32.add_(-step_size * group['lr'], exp_avg)
145 | p.data.copy_(p_data_fp32)
146 |
147 | return loss
148 |
--------------------------------------------------------------------------------
/diffgrad/diffgrad.py:
--------------------------------------------------------------------------------
1 |
2 | import math
3 | import torch
4 | from torch.optim.optimizer import Optimizer
5 | import numpy as np
6 | import torch.nn as nn
7 | #import torch.optim as Optimizer
8 |
9 | # Original source: https://github.com/shivram1987/diffGrad/blob/master/diffGrad.py
10 |
11 | # modifications: @lessw2020
12 |
13 |
14 | class DiffGrad(Optimizer):
15 | r"""Implements diffGrad algorithm. It is modified from the pytorch implementation of Adam.
16 | It has been proposed in `diffGrad: An Optimization Method for Convolutional Neural Networks`_.
17 | Arguments:
18 | params (iterable): iterable of parameters to optimize or dicts defining
19 | parameter groups
20 | lr (float, optional): learning rate (default: 1e-3)
21 | betas (Tuple[float, float], optional): coefficients used for computing
22 | running averages of gradient and its square (default: (0.9, 0.999))
23 | eps (float, optional): term added to the denominator to improve
24 | numerical stability (default: 1e-8)
25 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
26 | amsgrad (boolean, optional): whether to use the AMSGrad variant of this
27 | algorithm from the paper `On the Convergence of Adam and Beyond`_
28 | (default: False)
29 | .. _diffGrad: An Optimization Method for Convolutional Neural Networks:
30 | https://arxiv.org/abs/1909.11015
31 | .. _Adam\: A Method for Stochastic Optimization:
32 | https://arxiv.org/abs/1412.6980
33 | .. _On the Convergence of Adam and Beyond:
34 | https://openreview.net/forum?id=ryQu7f-RZ
35 | """
36 |
37 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, version=0, weight_decay=0):
38 | if not 0.0 <= lr:
39 | raise ValueError("Invalid learning rate: {}".format(lr))
40 | if not 0.0 <= eps:
41 | raise ValueError("Invalid epsilon value: {}".format(eps))
42 | if not 0.0 <= betas[0] < 1.0:
43 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
44 | if not 0.0 <= betas[1] < 1.0:
45 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
46 |
47 |
48 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
49 |
50 | super().__init__(params, defaults)
51 |
52 | #save version
53 | self.version = version
54 |
55 | def __setstate__(self, state):
56 | super().__setstate__(state)
57 |
58 | def step(self, closure=None):
59 | """Performs a single optimization step.
60 | Arguments:
61 | closure (callable, optional): A closure that reevaluates the model
62 | and returns the loss.
63 | """
64 | loss = None
65 | if closure is not None:
66 | loss = closure()
67 |
68 | for group in self.param_groups:
69 | for p in group['params']:
70 | if p.grad is None:
71 | continue
72 | grad = p.grad.data
73 | if grad.is_sparse:
74 | raise RuntimeError('diffGrad does not support sparse gradients, please consider SparseAdam instead')
75 |
76 | state = self.state[p]
77 |
78 | # State initialization
79 | if len(state) == 0:
80 | state['step'] = 0
81 | # Exponential moving average of gradient values
82 | state['exp_avg'] = torch.zeros_like(p.data)
83 | # Exponential moving average of squared gradient values
84 | state['exp_avg_sq'] = torch.zeros_like(p.data)
85 | # Previous gradient
86 | state['previous_grad'] = torch.zeros_like(p.data)
87 |
88 | exp_avg, exp_avg_sq, previous_grad = state['exp_avg'], state['exp_avg_sq'], state['previous_grad']
89 | beta1, beta2 = group['betas']
90 |
91 | state['step'] += 1
92 |
93 | if group['weight_decay'] != 0:
94 | grad.add_(group['weight_decay'], p.data)
95 |
96 | # Decay the first and second moment running average coefficient
97 | exp_avg.mul_(beta1).add_(1 - beta1, grad)
98 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
99 | denom = exp_avg_sq.sqrt().add_(group['eps'])
100 |
101 | bias_correction1 = 1 - beta1 ** state['step']
102 | bias_correction2 = 1 - beta2 ** state['step']
103 |
104 | # compute diffgrad coefficient (dfc)
105 |
106 |
107 | if self.version==0:
108 | diff = abs(previous_grad - grad)
109 | elif self.version ==1:
110 | diff = previous_grad-grad
111 | elif self.version ==2:
112 | diff = .5*abs(previous_grad - grad)
113 |
114 | if self.version==0 or self.version==1:
115 | dfc = 1. / (1. + torch.exp(-diff))
116 | elif self.version==2:
117 |                     dfc = 9. / (1. + torch.exp(-diff))-4 # DFC2 = 9/(1 + e^(-0.5*|prev_grad - grad|)) - 4, range (0.5, 5)
118 |
119 | state['previous_grad'] = grad
120 |
121 | # update momentum with dfc
122 | exp_avg1 = exp_avg * dfc
123 |
124 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
125 |
126 | p.data.addcdiv_(-step_size, exp_avg1, denom)
127 |
128 | return loss
--------------------------------------------------------------------------------
/diffgrad/mxresnet.py:
--------------------------------------------------------------------------------
1 | #FastAI's XResnet modified to use Mish activation function, MXResNet
2 | #https://github.com/fastai/fastai/blob/master/fastai/vision/models/xresnet.py
3 | #modified by lessw2020 - github: https://github.com/lessw2020/mish
4 |
5 |
6 | from fastai.torch_core import *
7 | import torch.nn as nn
8 | import torch,math,sys
9 | import torch.utils.model_zoo as model_zoo
10 | from functools import partial
11 | #from ...torch_core import Module
12 | from fastai.torch_core import Module
13 |
14 | import torch.nn.functional as F #(uncomment if needed,but you likely already have it)
15 |
16 |
17 | class Mish(nn.Module):
18 | def __init__(self):
19 | super().__init__()
20 | print("Mish activation loaded...")
21 |
22 | def forward(self, x):
23 |         #save ~1 second per epoch by inlining instead of assigning x = x*(...) and then returning x
24 | return x *( torch.tanh(F.softplus(x)))
25 |
26 |
27 |
28 |
29 |
30 | #Unmodified from https://github.com/fastai/fastai/blob/5c51f9eabf76853a89a9bc5741804d2ed4407e49/fastai/layers.py
31 | def conv1d(ni:int, no:int, ks:int=1, stride:int=1, padding:int=0, bias:bool=False):
32 | "Create and initialize a `nn.Conv1d` layer with spectral normalization."
33 | conv = nn.Conv1d(ni, no, ks, stride=stride, padding=padding, bias=bias)
34 | nn.init.kaiming_normal_(conv.weight)
35 | if bias: conv.bias.data.zero_()
36 | return spectral_norm(conv)
37 |
38 |
39 |
40 | # Adapted from SelfAttention layer at https://github.com/fastai/fastai/blob/5c51f9eabf76853a89a9bc5741804d2ed4407e49/fastai/layers.py
41 | # Inspired by https://arxiv.org/pdf/1805.08318.pdf
42 | class SimpleSelfAttention(nn.Module):
43 |
44 | def __init__(self, n_in:int, ks=1, sym=False):#, n_out:int):
45 | super().__init__()
46 |
47 | self.conv = conv1d(n_in, n_in, ks, padding=ks//2, bias=False)
48 |
49 | self.gamma = nn.Parameter(tensor([0.]))
50 |
51 | self.sym = sym
52 | self.n_in = n_in
53 |
54 | def forward(self,x):
55 |
56 |
57 | if self.sym:
58 | # symmetry hack by https://github.com/mgrankin
59 | c = self.conv.weight.view(self.n_in,self.n_in)
60 | c = (c + c.t())/2
61 | self.conv.weight = c.view(self.n_in,self.n_in,1)
62 |
63 | size = x.size()
64 | x = x.view(*size[:2],-1) # (C,N)
65 |
66 |         # changed the order of multiplication to avoid O(N^2) complexity
67 | # (x*xT)*(W*x) instead of (x*(xT*(W*x)))
68 |
69 | convx = self.conv(x) # (C,C) * (C,N) = (C,N) => O(NC^2)
70 | xxT = torch.bmm(x,x.permute(0,2,1).contiguous()) # (C,N) * (N,C) = (C,C) => O(NC^2)
71 |
72 | o = torch.bmm(xxT, convx) # (C,C) * (C,N) = (C,N) => O(NC^2)
73 |
74 | o = self.gamma * o + x
75 |
76 |
77 | return o.view(*size).contiguous()
78 |
79 |
80 |
81 |
82 |
83 | __all__ = ['MXResNet', 'mxresnet18', 'mxresnet34', 'mxresnet50', 'mxresnet101', 'mxresnet152']
84 |
85 | # or: ELU+init (a=0.54; gain=1.55)
86 | act_fn = Mish() #nn.ReLU(inplace=True)
87 |
88 | class Flatten(Module):
89 | def forward(self, x): return x.view(x.size(0), -1)
90 |
91 | def init_cnn(m):
92 | if getattr(m, 'bias', None) is not None: nn.init.constant_(m.bias, 0)
93 | if isinstance(m, (nn.Conv2d,nn.Linear)): nn.init.kaiming_normal_(m.weight)
94 | for l in m.children(): init_cnn(l)
95 |
96 | def conv(ni, nf, ks=3, stride=1, bias=False):
97 | return nn.Conv2d(ni, nf, kernel_size=ks, stride=stride, padding=ks//2, bias=bias)
98 |
99 | def noop(x): return x
100 |
101 | def conv_layer(ni, nf, ks=3, stride=1, zero_bn=False, act=True):
102 | bn = nn.BatchNorm2d(nf)
103 | nn.init.constant_(bn.weight, 0. if zero_bn else 1.)
104 | layers = [conv(ni, nf, ks, stride=stride), bn]
105 | if act: layers.append(act_fn)
106 | return nn.Sequential(*layers)
107 |
108 | class ResBlock(Module):
109 | def __init__(self, expansion, ni, nh, stride=1,sa=False, sym=False):
110 | nf,ni = nh*expansion,ni*expansion
111 | layers = [conv_layer(ni, nh, 3, stride=stride),
112 | conv_layer(nh, nf, 3, zero_bn=True, act=False)
113 | ] if expansion == 1 else [
114 | conv_layer(ni, nh, 1),
115 | conv_layer(nh, nh, 3, stride=stride),
116 | conv_layer(nh, nf, 1, zero_bn=True, act=False)
117 | ]
118 | self.sa = SimpleSelfAttention(nf,ks=1,sym=sym) if sa else noop
119 | self.convs = nn.Sequential(*layers)
120 | # TODO: check whether act=True works better
121 | self.idconv = noop if ni==nf else conv_layer(ni, nf, 1, act=False)
122 | self.pool = noop if stride==1 else nn.AvgPool2d(2, ceil_mode=True)
123 |
124 | def forward(self, x): return act_fn(self.sa(self.convs(x)) + self.idconv(self.pool(x)))
125 |
126 | def filt_sz(recep): return min(64, 2**math.floor(math.log2(recep*0.75)))
127 |
128 | class MXResNet(nn.Sequential):
129 | def __init__(self, expansion, layers, c_in=3, c_out=1000, sa = False, sym= False):
130 | stem = []
131 | sizes = [c_in,32,64,64] #modified per Grankin
132 | for i in range(3):
133 | stem.append(conv_layer(sizes[i], sizes[i+1], stride=2 if i==0 else 1))
134 | #nf = filt_sz(c_in*9)
135 | #stem.append(conv_layer(c_in, nf, stride=2 if i==1 else 1))
136 | #c_in = nf
137 |
138 | block_szs = [64//expansion,64,128,256,512]
139 | blocks = [self._make_layer(expansion, block_szs[i], block_szs[i+1], l, 1 if i==0 else 2, sa = sa if i in[len(layers)-4] else False, sym=sym)
140 | for i,l in enumerate(layers)]
141 | super().__init__(
142 | *stem,
143 | nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
144 | *blocks,
145 | nn.AdaptiveAvgPool2d(1), Flatten(),
146 | nn.Linear(block_szs[-1]*expansion, c_out),
147 | )
148 | init_cnn(self)
149 |
150 | def _make_layer(self, expansion, ni, nf, blocks, stride, sa=False, sym=False):
151 | return nn.Sequential(
152 | *[ResBlock(expansion, ni if i==0 else nf, nf, stride if i==0 else 1, sa if i in [blocks -1] else False,sym)
153 | for i in range(blocks)])
154 |
155 | def mxresnet(expansion, n_layers, name, pretrained=False, **kwargs):
156 | model = MXResNet(expansion, n_layers, **kwargs)
157 | if pretrained:
158 | #model.load_state_dict(model_zoo.load_url(model_urls[name]))
159 | print("No pretrained yet for MXResNet")
160 | return model
161 |
162 | me = sys.modules[__name__]
163 | for n,e,l in [
164 | [ 18 , 1, [2,2,2 ,2] ],
165 | [ 34 , 1, [3,4,6 ,3] ],
166 | [ 50 , 4, [3,4,6 ,3] ],
167 | [ 101, 4, [3,4,23,3] ],
168 | [ 152, 4, [3,8,36,3] ],
169 | ]:
170 | name = f'mxresnet{n}'
171 | setattr(me, name, partial(mxresnet, expansion=e, n_layers=l, name=name))
--------------------------------------------------------------------------------
/diffmod/diffmod.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | from torch.optim import Optimizer
4 |
5 | # source and paper link - https://github.com/lancopku/AdaMod/blob/master/adamod/adamod.py
6 |
7 | # modifications @lessw2020 - blend diffGrad + AdaMod = diffmod.
8 | # 1/1/20 = instead of b3, change to 'len_memory' and compute b3 (.99 is really 100 memory as 1-(1/100)= .99)
9 |
10 |
11 | class DiffMod(Optimizer):
12 | """Implements AdaMod algorithm with Decoupled Weight Decay (arxiv.org/abs/1711.05101)
13 | It has been proposed in `Adaptive and Momental Bounds for Adaptive Learning Rate Methods`_.
14 | Arguments:
15 | params (iterable): iterable of parameters to optimize or dicts defining
16 | parameter groups
17 | lr (float, optional): learning rate (default: 1e-3)
18 | betas (Tuple[float, float], optional): coefficients used for computing
19 | running averages of gradient and its square (default: (0.9, 0.999))
20 | beta3 (float, optional): smoothing coefficient for adaptive learning rates (default: 0.9999)
21 | len_memory = b3 in easier to use format. specify the memory len, b3 is computed.
22 | eps (float, optional): term added to the denominator to improve
23 | numerical stability (default: 1e-8)
24 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
25 | """
26 |
27 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), len_memory=1000, version=0,
28 | eps=1e-8, weight_decay=0, average_step=False, debug_print=False):
29 | if not 0.0 <= lr:
30 | raise ValueError("Invalid learning rate: {}".format(lr))
31 | if not 0.0 <= eps:
32 | raise ValueError("Invalid epsilon value: {}".format(eps))
33 | if not 0.0 <= betas[0] < 1.0:
34 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
35 | if not 0.0 <= betas[1] < 1.0:
36 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
37 |
38 | #compute b3
39 | base = 1/len_memory
40 | beta3 = 1-(base)
41 |         print(f"DiffMod: length of memory is {len_memory}, b3 is thus {beta3}, and base = {base}")
42 |
43 | #debugging
44 | self.debug_print=debug_print
45 | self.average_step = average_step
46 | if self.average_step==True:
47 | print(f"DiffMod: step size and exp avg step will be averaged together.")
48 |
49 | if not 0.0 <= beta3 < 1.0:
50 | raise ValueError("Invalid beta3 parameter: {}".format(beta3))
51 |
52 | defaults = dict(lr=lr, betas=betas, beta3=beta3, eps=eps,
53 | weight_decay=weight_decay)
54 | super().__init__(params, defaults)
55 |
56 | self.version = version
57 |
58 | def __setstate__(self, state):
59 | super().__setstate__(state)
60 |
61 | def step(self, closure=None):
62 | """Performs a single optimization step.
63 | Arguments:
64 | closure (callable, optional): A closure that reevaluates the model
65 | and returns the loss.
66 | """
67 | loss = None
68 | if closure is not None:
69 | loss = closure()
70 |
71 | for group in self.param_groups:
72 | for p in group['params']:
73 | if p.grad is None:
74 | continue
75 | grad = p.grad.data
76 | if grad.is_sparse:
77 | raise RuntimeError(
78 | 'DiffMod does not support sparse gradients')
79 |
80 | state = self.state[p]
81 |
82 | # State initialization
83 | if len(state) == 0:
84 | state['step'] = 0
85 | # Exponential moving average of gradient values
86 | state['exp_avg'] = torch.zeros_like(p.data)
87 | # Exponential moving average of squared gradient values
88 | state['exp_avg_sq'] = torch.zeros_like(p.data)
89 | # Exponential moving average of actual learning rates
90 | state['exp_avg_lr'] = torch.zeros_like(p.data)
91 | # Previous gradient
92 | state['previous_grad'] = torch.zeros_like(p.data)
93 |
94 |
95 | exp_avg, exp_avg_sq, exp_avg_lr = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_lr']
96 | previous_grad = state['previous_grad']
97 | beta1, beta2 = group['betas']
98 |
99 | state['step'] += 1
100 |
101 | # Decay the first and second moment running average coefficient
102 |                 exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
103 |                 exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
104 |
105 | denom = exp_avg_sq.sqrt().add_(group['eps'])
106 |
107 | bias_correction1 = 1 - beta1 ** state['step']
108 | bias_correction2 = 1 - beta2 ** state['step']
109 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
110 |
111 | # compute diffgrad coefficient (dfc)
112 |
113 |
114 | if self.version==0:
115 | diff = abs(previous_grad - grad)
116 | elif self.version ==1:
117 | diff = previous_grad-grad
118 | elif self.version ==2:
119 | diff = .5*abs(previous_grad - grad)
120 |
121 | if self.version==0 or self.version==1:
122 | dfc = 1. / (1. + torch.exp(-diff))
123 | elif self.version==2:
124 |                     dfc = 9. / (1. + torch.exp(-diff)) - 4  # DFC2 = 9/(1+e^(-diff)) - 4, range (0.5, 5)
125 |
126 | state['previous_grad'] = grad
127 |
128 | if group['weight_decay'] != 0:
129 |                     p.data.add_(p.data, alpha=-group['weight_decay'] * group['lr'])
130 |
131 | # Applies momental bounds on actual learning rates
132 | step_size = torch.full_like(denom, step_size)
133 | step_size.div_(denom)
134 |                 exp_avg_lr.mul_(group['beta3']).add_(step_size, alpha=1 - group['beta3'])
135 | if self.debug_print:
136 | print(f"batch step size {step_size} and exp_avg_step {exp_avg_lr}")
137 |
138 | if self.average_step:
139 | step_size = step_size.add(exp_avg_lr)
140 | step_size = step_size.div(2.)
141 |
142 | else:
143 | step_size = torch.min(step_size, exp_avg_lr)
144 |
145 | # update momentum with dfc
146 | exp_avg1 = exp_avg * dfc
147 |
148 | step_size.mul_(exp_avg1)
149 |
150 | p.data.add_(-step_size)
151 |
152 | return loss
--------------------------------------------------------------------------------
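A minimal usage sketch for the DiffMod optimizer above (not part of the repository). It assumes the file is importable as `diffmod.py` and that PyTorch is installed; the toy model and data below are placeholders. Note how `len_memory=1000` maps to `beta3 = 1 - 1/1000 = 0.999`, and `version` selects which diffgrad coefficient variant is used.

```python
# Hypothetical usage sketch - the model and data names are illustrative only.
import torch
import torch.nn as nn
import torch.nn.functional as F

from diffmod import DiffMod  # assumes diffmod.py is on the Python path

model = nn.Linear(10, 2)
# len_memory=1000 -> beta3 = 1 - 1/1000 = 0.999; version 0 uses dfc = sigmoid(|g_prev - g|)
opt = DiffMod(model.parameters(), lr=1e-3, betas=(0.9, 0.999), len_memory=1000, version=0)

x, y = torch.randn(32, 10), torch.randint(0, 2, (32,))
loss = F.cross_entropy(model(x), y)
loss.backward()
opt.step()
opt.zero_grad()
```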
/images/1120-optimizer-testing.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lessw2020/Best-Deep-Learning-Optimizers/070a5090a985a8e418ebbc384f90f80187ecf6ac/images/1120-optimizer-testing.jpg
--------------------------------------------------------------------------------
/images/projected_gradient.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lessw2020/Best-Deep-Learning-Optimizers/070a5090a985a8e418ebbc384f90f80187ecf6ac/images/projected_gradient.png
--------------------------------------------------------------------------------
/images/ranger-init.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lessw2020/Best-Deep-Learning-Optimizers/070a5090a985a8e418ebbc384f90f80187ecf6ac/images/ranger-init.jpg
--------------------------------------------------------------------------------
/images/ranger-with-gc-options.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lessw2020/Best-Deep-Learning-Optimizers/070a5090a985a8e418ebbc384f90f80187ecf6ac/images/ranger-with-gc-options.jpg
--------------------------------------------------------------------------------
/madgrad/madgrad_wd.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | #
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 |
6 | # modifications - 4/4/2021 @lessw2020 (decay issue spotted by @nestordemeure )
7 | # weight decay has been implemented AdamW style instead of the original madgrad Adam style.
8 | # in initial image classification testing, this outperformed 0 weight decay or original style weight decay.
9 |
10 | # closure is checked if callable or not since some code passes loss directly, rather than in closure param
11 |
12 | import math
13 | from typing import Collection, TYPE_CHECKING, Any, Callable, Optional
14 |
15 | import torch
16 | import torch.optim
17 | import collections
18 |
19 | if TYPE_CHECKING:
20 | from torch.optim.optimizer import _params_t
21 | else:
22 | _params_t = Any
23 |
24 |
25 | class madgrad_wd(torch.optim.Optimizer):
26 | """
27 | MADGRAD_: A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic
28 | Optimization.
29 |
30 | .. _MADGRAD: https://arxiv.org/abs/2101.11075
31 |
32 | MADGRAD is a general purpose optimizer that can be used in place of SGD or
33 |     Adam, and may converge faster and generalize better. Currently GPU-only.
34 | Typically, the same learning rate schedule that is used for SGD or Adam may
35 | be used. The overall learning rate is not comparable to either method and
36 | should be determined by a hyper-parameter sweep.
37 |
38 | MADGRAD requires less weight decay than other methods, often as little as
39 | zero. Momentum values used for SGD or Adam's beta1 should work here also.
40 |
41 | On sparse problems both weight_decay and momentum should be set to 0.
42 |
43 | Arguments:
44 | params (iterable):
45 | Iterable of parameters to optimize or dicts defining parameter groups.
46 | lr (float):
47 | Learning rate (default: 1e-2).
48 | momentum (float):
49 | Momentum value in the range [0,1) (default: 0.9).
50 | weight_decay (float):
51 |             Weight decay; in this variant it is applied AdamW-style (decoupled) rather than as an L2 penalty (default: 0).
52 | eps (float):
53 | Term added to the denominator outside of the root operation to improve numerical stability. (default: 1e-6).
54 | """
55 |
56 | def __init__(
57 | self,
58 | params: _params_t,
59 | lr: float = 1e-2,
60 | momentum: float = 0.9,
61 | weight_decay: float = 0,
62 | eps: float = 1e-6,
63 | ):
64 | if momentum < 0 or momentum >= 1:
65 |             raise ValueError(f"Momentum {momentum} must be in the range [0,1)")
66 | if lr <= 0:
67 | raise ValueError(f"Learning rate {lr} must be positive")
68 | if weight_decay < 0:
69 | raise ValueError(f"Weight decay {weight_decay} must be non-negative")
70 | if eps < 0:
71 |             raise ValueError(f"Eps {eps} must be non-negative")
72 |
73 | defaults = dict(lr=lr, eps=eps, momentum=momentum, weight_decay=weight_decay)
74 | super().__init__(params, defaults)
75 |
76 | @property
77 | def supports_memory_efficient_fp16(self) -> bool:
78 | return False
79 |
80 | @property
81 | def supports_flat_params(self) -> bool:
82 | return True
83 |
84 | def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]:
85 | """Performs a single optimization step.
86 |
87 | Arguments:
88 | closure (callable, optional): A closure that reevaluates the model
89 | and returns the loss.
90 | """
91 | loss = None
92 |         if closure is not None and callable(closure):  # collections.Callable was removed in Python 3.10
93 | loss = closure()
94 |
95 | # step counter must be stored in state to ensure correct behavior under
96 | # optimizer sharding
97 | if "k" not in self.state:
98 | self.state["k"] = torch.tensor([0], dtype=torch.long)
99 | k = self.state["k"].item()
100 |
101 | for group in self.param_groups:
102 | eps = group["eps"]
103 | lr = group["lr"] + eps
104 | decay = group["weight_decay"]
105 | momentum = group["momentum"]
106 |
107 | ck = 1 - momentum
108 | lamb = lr * math.pow(k + 1, 0.5)
109 |
110 | for p in group["params"]:
111 | if p.grad is None:
112 | continue
113 | grad = p.grad.data
114 | state = self.state[p]
115 |
116 | if "grad_sum_sq" not in state:
117 | state["grad_sum_sq"] = torch.zeros_like(p.data).detach()
118 | state["s"] = torch.zeros_like(p.data).detach()
119 | if momentum != 0:
120 | state["x0"] = torch.clone(p.data).detach()
121 |
122 | if momentum != 0.0 and grad.is_sparse:
123 | raise RuntimeError(
124 | "momentum != 0 is not compatible with sparse gradients"
125 | )
126 |
127 | grad_sum_sq = state["grad_sum_sq"]
128 | s = state["s"]
129 |
130 | # Apply weight decay - L2 / AdamW style
131 | if decay:
132 | p.data.mul_(1 - lr * decay)
133 |
134 | """ original impl:
135 | if decay != 0:
136 | if grad.is_sparse:
137 | raise RuntimeError("weight_decay option is not compatible with sparse gradients")
138 |
139 | grad.add_(p.data, alpha=decay)
140 | """
141 |
142 | if grad.is_sparse:
143 | grad = grad.coalesce()
144 | grad_val = grad._values()
145 |
146 | p_masked = p.sparse_mask(grad)
147 | grad_sum_sq_masked = grad_sum_sq.sparse_mask(grad)
148 | s_masked = s.sparse_mask(grad)
149 |
150 | # Compute x_0 from other known quantities
151 | rms_masked_vals = grad_sum_sq_masked._values().pow(1 / 3).add_(eps)
152 | x0_masked_vals = p_masked._values().addcdiv(
153 | s_masked._values(), rms_masked_vals, value=1
154 | )
155 |
156 | # Dense + sparse op
157 | grad_sq = grad * grad
158 | grad_sum_sq.add_(grad_sq, alpha=lamb)
159 | grad_sum_sq_masked.add_(grad_sq, alpha=lamb)
160 |
161 | rms_masked_vals = grad_sum_sq_masked._values().pow_(1 / 3).add_(eps)
162 |
163 | s.add_(grad, alpha=lamb)
164 | s_masked._values().add_(grad_val, alpha=lamb)
165 |
166 | # update masked copy of p
167 | p_kp1_masked_vals = x0_masked_vals.addcdiv(
168 | s_masked._values(), rms_masked_vals, value=-1
169 | )
170 | # Copy updated masked p to dense p using an add operation
171 | p_masked._values().add_(p_kp1_masked_vals, alpha=-1)
172 | p.data.add_(p_masked, alpha=-1)
173 | else:
174 | if momentum == 0:
175 | # Compute x_0 from other known quantities
176 | rms = grad_sum_sq.pow(1 / 3).add_(eps)
177 | x0 = p.data.addcdiv(s, rms, value=1)
178 | else:
179 | x0 = state["x0"]
180 |
181 | # Accumulate second moments
182 | grad_sum_sq.addcmul_(grad, grad, value=lamb)
183 | rms = grad_sum_sq.pow(1 / 3).add_(eps)
184 |
185 | # Update s
186 | s.data.add_(grad, alpha=lamb)
187 |
188 | # Step
189 | if momentum == 0:
190 | p.data.copy_(x0.addcdiv(s, rms, value=-1))
191 | else:
192 | z = x0.addcdiv(s, rms, value=-1)
193 |
194 | # p is a moving average of z
195 | p.data.mul_(1 - ck).add_(z, alpha=ck)
196 |
197 | self.state["k"] += 1
198 | return loss
199 |
--------------------------------------------------------------------------------
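A minimal usage sketch for `madgrad_wd` (not part of the repository), assuming the file is importable as `madgrad_wd.py`; the model and data are placeholders. As the modification note above explains, `weight_decay` here is applied AdamW-style (decoupled), and the docstring notes that MADGRAD often needs little or no decay.

```python
# Hypothetical usage sketch - model/data names are illustrative only.
import torch
import torch.nn as nn
import torch.nn.functional as F

from madgrad_wd import madgrad_wd  # assumes madgrad_wd.py is on the Python path

model = nn.Linear(10, 2)
# decoupled (AdamW-style) weight decay; the docstring notes MADGRAD is currently GPU-only,
# so for a real run move the model and data to CUDA first.
opt = madgrad_wd(model.parameters(), lr=1e-2, momentum=0.9, weight_decay=1e-4)

x, y = torch.randn(32, 10), torch.randint(0, 2, (32,))
loss = F.cross_entropy(model(x), y)
loss.backward()
opt.step()      # step() also accepts an optional closure and checks that it is callable
opt.zero_grad()
```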
/sls/README.md:
--------------------------------------------------------------------------------
1 | SLS = Stochastic Line Search.
2 |
3 | Official source and paper link: https://github.com/IssamLaradji/sls
4 |
5 | The files here represent an integration of SLS into Fast AI 1.057+.
6 |
7 | Testing and additional work in progress...
8 |
--------------------------------------------------------------------------------
/sls/basic_train.py:
--------------------------------------------------------------------------------
1 | "Provides basic training and validation with `Learner`"
2 | from .torch_core import *
3 | from .basic_data import *
4 | from .callback import *
5 | from .data_block import *
6 | from .utils.ipython import gpu_mem_restore
7 | import inspect
8 | from fastprogress.fastprogress import format_time, IN_NOTEBOOK
9 | from time import time
10 | from .sixel import plot_sixel
11 |
12 | __all__ = ['Learner', 'LearnerCallback', 'Recorder', 'RecordOnCPU', 'fit', 'loss_batch', 'train_epoch', 'validate',
13 | 'get_preds', 'load_learner']
14 |
15 | defaults.lr = slice(3e-3)
16 | defaults.wd = 1e-2
17 | defaults.extra_callbacks = None
18 | defaults.extra_callback_fns = None
19 |
20 | def loss_batch(model:nn.Module, xb:Tensor, yb:Tensor, loss_func:OptLossFunc=None, opt:OptOptimizer=None,
21 | cb_handler:Optional[CallbackHandler]=None)->Tuple[Union[Tensor,int,float,str]]:
22 | "Calculate loss and metrics for a batch, call out to callbacks as necessary."
23 | cb_handler = ifnone(cb_handler, CallbackHandler())
24 | if not is_listy(xb): xb = [xb]
25 | if not is_listy(yb): yb = [yb]
26 | out = model(*xb)
27 | out = cb_handler.on_loss_begin(out)
28 |
29 | if not loss_func: return to_detach(out), to_detach(yb[0])
30 | loss = loss_func(out, *yb)
31 |
32 | def closure():
33 | out = model(*xb)
34 | loss = loss_func(out,*yb)
35 | return loss
36 |
37 | if opt is not None:
38 | opt.step(closure)
39 | loss,skip_bwd = cb_handler.on_backward_begin(loss)
40 | #if not skip_bwd: loss.backward()
41 | #if not cb_handler.on_backward_end():
42 | if not cb_handler.on_step_end(): opt.zero_grad()
43 |
44 | loss = loss_func(model(*xb),*yb) #call one more time for updating metrics from SLS
45 |
46 | return loss.detach().cpu()
47 |
48 | def get_preds(model:nn.Module, dl:DataLoader, pbar:Optional[PBar]=None, cb_handler:Optional[CallbackHandler]=None,
49 | activ:nn.Module=None, loss_func:OptLossFunc=None, n_batch:Optional[int]=None) -> List[Tensor]:
50 | "Tuple of predictions and targets, and optional losses (if `loss_func`) using `dl`, max batches `n_batch`."
51 | res = [to_float(torch.cat(o).cpu()) for o in
52 | zip(*validate(model, dl, cb_handler=cb_handler, pbar=pbar, average=False, n_batch=n_batch))]
53 | if loss_func is not None:
54 | with NoneReduceOnCPU(loss_func) as lf: res.append(lf(res[0], res[1]))
55 | if activ is not None: res[0] = activ(res[0])
56 | return res
57 |
58 | def validate(model:nn.Module, dl:DataLoader, loss_func:OptLossFunc=None, cb_handler:Optional[CallbackHandler]=None,
59 | pbar:Optional[PBar]=None, average=True, n_batch:Optional[int]=None)->Iterator[Tuple[Union[Tensor,int],...]]:
60 | "Calculate `loss_func` of `model` on `dl` in evaluation mode."
61 | model.eval()
62 | with torch.no_grad():
63 | val_losses,nums = [],[]
64 | if cb_handler: cb_handler.set_dl(dl)
65 | for xb,yb in progress_bar(dl, parent=pbar, leave=(pbar is not None)):
66 | if cb_handler: xb, yb = cb_handler.on_batch_begin(xb, yb, train=False)
67 | val_loss = loss_batch(model, xb, yb, loss_func, cb_handler=cb_handler)
68 | val_losses.append(val_loss)
69 | if not is_listy(yb): yb = [yb]
70 | nums.append(first_el(yb).shape[0])
71 | if cb_handler and cb_handler.on_batch_end(val_losses[-1]): break
72 | if n_batch and (len(nums)>=n_batch): break
73 | nums = np.array(nums, dtype=np.float32)
74 | if average: return (to_np(torch.stack(val_losses)) * nums).sum() / nums.sum()
75 | else: return val_losses
76 |
77 | def train_epoch(model:nn.Module, dl:DataLoader, opt:optim.Optimizer, loss_func:LossFunction)->None:
78 | "Simple training of `model` for 1 epoch of `dl` using optim `opt` and loss function `loss_func`."
79 | model.train()
80 | for xb,yb in dl:
81 | loss = loss_func(model(xb), yb)
82 | loss.backward()
83 | opt.step()
84 | opt.zero_grad()
85 |
86 | @dataclass
87 | class BasicLearner():
88 | model:nn.Module
89 | loss_func:LossFunction
90 | opt:optim.Optimizer
91 | data:DataBunch
92 |
93 | def fit(epochs:int, learn:BasicLearner, callbacks:Optional[CallbackList]=None, metrics:OptMetrics=None)->None:
94 | "Fit the `model` on `data` and learn using `loss_func` and `opt`."
95 | assert len(learn.data.train_dl) != 0, f"""Your training dataloader is empty, can't train a model.
96 | Use a smaller batch size (batch size={learn.data.train_dl.batch_size} for {len(learn.data.train_dl.dataset)} elements)."""
97 | cb_handler = CallbackHandler(callbacks, metrics)
98 | pbar = master_bar(range(epochs))
99 | cb_handler.on_train_begin(epochs, pbar=pbar, metrics=metrics)
100 |
101 | exception=False
102 | try:
103 | for epoch in pbar:
104 | learn.model.train()
105 | cb_handler.set_dl(learn.data.train_dl)
106 | cb_handler.on_epoch_begin()
107 | for xb,yb in progress_bar(learn.data.train_dl, parent=pbar):
108 | xb, yb = cb_handler.on_batch_begin(xb, yb)
109 | loss = loss_batch(learn.model, xb, yb, learn.loss_func, learn.opt, cb_handler)
110 | if cb_handler.on_batch_end(loss): break
111 |
112 | if not cb_handler.skip_validate and not learn.data.empty_val:
113 | val_loss = validate(learn.model, learn.data.valid_dl, loss_func=learn.loss_func,
114 | cb_handler=cb_handler, pbar=pbar)
115 | else: val_loss=None
116 | if cb_handler.on_epoch_end(val_loss): break
117 | except Exception as e:
118 | exception = e
119 | raise
120 | finally: cb_handler.on_train_end(exception)
121 |
122 | loss_func_name2activ = {'cross_entropy_loss': F.softmax, 'nll_loss': torch.exp, 'poisson_nll_loss': torch.exp,
123 | 'kl_div_loss': torch.exp, 'bce_with_logits_loss': torch.sigmoid, 'cross_entropy': F.softmax,
124 | 'kl_div': torch.exp, 'binary_cross_entropy_with_logits': torch.sigmoid,
125 | }
126 |
127 | def _loss_func_name2activ(name:str, axis:int=-1):
128 | res = loss_func_name2activ[name]
129 | if res == F.softmax: res = partial(F.softmax, dim=axis)
130 | return res
131 |
132 | def _loss_func2activ(loss_func):
133 | if getattr(loss_func,'keywords',None):
134 | if not loss_func.keywords.get('log_input', True): return
135 | axis = getattr(loss_func, 'axis', -1)
136 | # flattened loss
137 | loss_func = getattr(loss_func, 'func', loss_func)
138 | # could have a partial inside flattened loss! Duplicate on purpose.
139 | loss_func = getattr(loss_func, 'func', loss_func)
140 | cls_name = camel2snake(loss_func.__class__.__name__)
141 | if cls_name == 'mix_up_loss':
142 | loss_func = loss_func.crit
143 | cls_name = camel2snake(loss_func.__class__.__name__)
144 | if cls_name in loss_func_name2activ:
145 | if cls_name == 'poisson_nll_loss' and (not getattr(loss_func, 'log_input', True)): return
146 | return _loss_func_name2activ(cls_name, axis)
147 | if getattr(loss_func,'__name__','') in loss_func_name2activ:
148 | return _loss_func_name2activ(loss_func.__name__, axis)
149 | return noop
150 |
151 | @dataclass
152 | class Learner():
153 | "Trainer for `model` using `data` to minimize `loss_func` with optimizer `opt_func`."
154 | data:DataBunch
155 | model:nn.Module
156 | opt_func:Callable=AdamW
157 | loss_func:Callable=None
158 | metrics:Collection[Callable]=None
159 | true_wd:bool=True
160 | bn_wd:bool=True
161 | wd:Floats=defaults.wd
162 | train_bn:bool=True
163 | path:str = None
164 | model_dir:PathOrStr = 'models'
165 | callback_fns:Collection[Callable]=None
166 | callbacks:Collection[Callback]=field(default_factory=list)
167 | layer_groups:Collection[nn.Module]=None
168 | add_time:bool=True
169 | silent:bool=None
170 | def __post_init__(self)->None:
171 | "Setup path,metrics, callbacks and ensure model directory exists."
172 | self.path = Path(ifnone(self.path, self.data.path))
173 | self.model = self.model.to(self.data.device)
174 | self.loss_func = self.loss_func or self.data.loss_func
175 | self.metrics=listify(self.metrics)
176 | if not self.layer_groups: self.layer_groups = [nn.Sequential(*flatten_model(self.model))]
177 | self.callbacks = listify(self.callbacks)
178 | if self.silent is None: self.silent = defaults.silent
179 | self.callback_fns = [partial(Recorder, add_time=self.add_time, silent=self.silent)] + listify(self.callback_fns)
180 | if defaults.extra_callbacks is not None: self.callbacks += defaults.extra_callbacks
181 |
182 | def init(self, init): apply_init(self.model, init)
183 |
184 | def _test_writeable_path(self):
185 | path = self.path/self.model_dir
186 | try:
187 | path.mkdir(parents=True, exist_ok=True)
188 | tmp_file = get_tmp_file(path)
189 | except OSError as e:
190 | raise Exception(f"{e}\nCan't write to '{path}', set `learn.model_dir` attribute in Learner to a full libpath path that is writable") from None
191 | os.remove(tmp_file)
192 |
193 | def lr_range(self, lr:Union[float,slice])->np.ndarray:
194 | "Build differential learning rates from `lr`."
195 | if not isinstance(lr,slice): return lr
196 | if lr.start: res = even_mults(lr.start, lr.stop, len(self.layer_groups))
197 | else: res = [lr.stop/10]*(len(self.layer_groups)-1) + [lr.stop]
198 | return np.array(res)
199 |
200 | def fit(self, epochs:int, lr:Union[Floats,slice]=defaults.lr,
201 | wd:Floats=None, callbacks:Collection[Callback]=None)->None:
202 | "Fit the model on this learner with `lr` learning rate, `wd` weight decay for `epochs` with `callbacks`."
203 | lr = self.lr_range(lr)
204 | if wd is None: wd = self.wd
205 | if not getattr(self, 'opt', False): self.create_opt(lr, wd)
206 | else: self.opt.lr,self.opt.wd = lr,wd
207 | callbacks = [cb(self) for cb in self.callback_fns + listify(defaults.extra_callback_fns)] + listify(callbacks)
208 | fit(epochs, self, metrics=self.metrics, callbacks=self.callbacks+callbacks)
209 |
210 | def create_opt(self, lr:Floats, wd:Floats=0.)->None:
211 | "Create optimizer with `lr` learning rate and `wd` weight decay."
212 | self.opt = OptimWrapper.create(self.opt_func, lr, self.layer_groups, wd=wd, true_wd=self.true_wd, bn_wd=self.bn_wd)
213 |
214 | def split(self, split_on:SplitFuncOrIdxList)->None:
215 | "Split the model at `split_on`."
216 | if isinstance(split_on,Callable): split_on = split_on(self.model)
217 | self.layer_groups = split_model(self.model, split_on)
218 | return self
219 |
220 | def freeze_to(self, n:int)->None:
221 | "Freeze layers up to layer group `n`."
222 | if hasattr(self.model, 'reset'): self.model.reset()
223 | for g in self.layer_groups[:n]:
224 | for l in g:
225 | if not self.train_bn or not isinstance(l, bn_types): requires_grad(l, False)
226 | for g in self.layer_groups[n:]: requires_grad(g, True)
227 | self.create_opt(defaults.lr)
228 |
229 | def freeze(self)->None:
230 | "Freeze up to last layer group."
231 | assert(len(self.layer_groups)>1)
232 | self.freeze_to(-1)
233 |
234 | def unfreeze(self):
235 | "Unfreeze entire model."
236 | self.freeze_to(0)
237 |
238 | def export(self, file:PathLikeOrBinaryStream='export.pkl', destroy=False):
239 | "Export the state of the `Learner` in `self.path/file`. `file` can be file-like (file or buffer)"
240 | if rank_distrib(): return # don't save if slave proc
241 | args = ['opt_func', 'loss_func', 'metrics', 'true_wd', 'bn_wd', 'wd', 'train_bn', 'model_dir', 'callback_fns']
242 | state = {a:getattr(self,a) for a in args}
243 | state['cb_state'] = {cb.__class__:cb.get_state() for cb in self.callbacks}
244 | #layer_groups -> need to find a way
245 | #TO SEE: do we save model structure and weights separately?
246 | with ModelOnCPU(self.model) as m:
247 | state['model'] = m
248 | xtra = dict(normalize=self.data.norm.keywords) if getattr(self.data, 'norm', False) else {}
249 | state['data'] = self.data.valid_ds.get_state(**xtra)
250 | state['cls'] = self.__class__
251 | try_save(state, self.path, file)
252 | if destroy: self.destroy()
253 |
254 | def save(self, file:PathLikeOrBinaryStream=None, return_path:bool=False, with_opt:bool=True):
255 | "Save model and optimizer state (if `with_opt`) with `file` to `self.model_dir`. `file` can be file-like (file or buffer)"
256 | if is_pathlike(file): self._test_writeable_path()
257 | if rank_distrib(): return # don't save if slave proc
258 | target = self.path/self.model_dir/f'{file}.pth' if is_pathlike(file) else file
259 | if not hasattr(self, 'opt'): with_opt=False
260 | if not with_opt: state = get_model(self.model).state_dict()
261 | else: state = {'model': get_model(self.model).state_dict(), 'opt':self.opt.state_dict()}
262 | torch.save(state, target)
263 | if return_path: return target
264 |
265 | def dl(self, ds_type:DatasetType=DatasetType.Valid):
266 | "Return DataLoader for DatasetType `ds_type`."
267 | return self.data.dl(ds_type)
268 |
269 | def load(self, file:PathLikeOrBinaryStream=None, device:torch.device=None, strict:bool=True,
270 | with_opt:bool=None, purge:bool=False, remove_module:bool=False)->'Learner':
271 | "Load model and optimizer state (if `with_opt`) `file` from `self.model_dir` using `device`. `file` can be file-like (file or buffer)"
272 | if purge: self.purge(clear_opt=ifnone(with_opt, False))
273 | if device is None: device = self.data.device
274 | elif isinstance(device, int): device = torch.device('cuda', device)
275 | source = self.path/self.model_dir/f'{file}.pth' if is_pathlike(file) else file
276 | distrib_barrier()
277 | state = torch.load(source, map_location=device)
278 | if set(state.keys()) == {'model', 'opt'}:
279 | model_state = state['model']
280 | if remove_module: model_state = remove_module_load(model_state)
281 | get_model(self.model).load_state_dict(model_state, strict=strict)
282 | if ifnone(with_opt,True):
283 | if not hasattr(self, 'opt'): self.create_opt(defaults.lr, self.wd)
284 | try: self.opt.load_state_dict(state['opt'])
285 | except: pass
286 | else:
287 |                 if with_opt: warn("Saved file doesn't contain an optimizer state.")
288 | if remove_module: state = remove_module_load(state)
289 | get_model(self.model).load_state_dict(state, strict=strict)
290 | del state
291 | gc.collect()
292 | return self
293 |
294 | def destroy(self):
295 | "Free the Learner internals, leaving just an empty shell that consumes no memory"
296 |
297 | class ZombieLearner(Learner):
298 | msg = "this object has been destroyed"
299 | def __getattr__(self, item): print(ZombieLearner.msg); return None
300 | def destroyed(*args, **kwargs): print(ZombieLearner.msg)
301 |
302 | attrs = [k for k in self.__dict__.keys() if not k.startswith("__")]
303 | for a in attrs: delattr(self, a)
304 | # the instance methods can still be called, but will just give a message
305 | methods = [k for k in dir(self) if not k.startswith("__") and inspect.isroutine(getattr(self, k))]
306 | for m in methods: setattr(self, m, ZombieLearner.destroyed)
307 | self.__class__ = ZombieLearner
308 | gc.collect()
309 |         print("this Learner object self-destroyed - it still exists, but is no longer usable")
310 |
311 | def purge(self, clear_opt:bool=True):
312 | "Purge the `Learner` of all cached attributes to release some GPU memory."
313 | self._test_writeable_path()
314 | attrs_all = [k for k in self.__dict__.keys() if not k.startswith("__")]
315 | attrs_pkl = ['bn_wd', 'callback_fns', 'layer_groups', 'loss_func', 'metrics', 'model',
316 | 'model_dir', 'opt_func', 'path', 'train_bn', 'true_wd', 'wd']
317 | # +callbacks: get pickled too, but not directly
318 | attrs_keep = ['data', 'recorder']
319 | attrs_del = list(set(attrs_all) - set(attrs_keep))
320 | state = {a:getattr(self, a) for a in attrs_pkl}
321 | state['cb_state'] = {cb.__class__:cb.get_state() for cb in self.callbacks}
322 | if hasattr(self, 'opt'): state['opt'] = self.opt.get_state()
323 |
324 | tmp_file = get_tmp_file(self.path/self.model_dir)
325 | torch.save(state, open(tmp_file, 'wb'))
326 | for a in attrs_del: delattr(self, a)
327 | gc.collect()
328 | state = torch.load(tmp_file)
329 | os.remove(tmp_file)
330 |
331 | for a in attrs_pkl: setattr(self, a, state[a])
332 | cb_state = state.pop('cb_state')
333 | self.callbacks = [load_callback(c,s, self) for c,s in cb_state.items()]
334 | if not clear_opt and 'opt' in state:
335 | try: self.opt = OptimWrapper.load_with_state_and_layer_group(state['opt'], self.layer_groups)
336 | except: warn("Wasn't able to properly load the optimizer state again.")
337 | del state
338 | gc.collect()
339 | return self
340 |
341 | def get_preds(self, ds_type:DatasetType=DatasetType.Valid, activ:nn.Module=None,
342 | with_loss:bool=False, n_batch:Optional[int]=None, pbar:Optional[PBar]=None) -> List[Tensor]:
343 | "Return predictions and targets on `ds_type` dataset."
344 | lf = self.loss_func if with_loss else None
345 | activ = ifnone(activ, _loss_func2activ(self.loss_func))
346 | if not getattr(self, 'opt', False): self.create_opt(defaults.lr, self.wd)
347 | callbacks = [cb(self) for cb in self.callback_fns + listify(defaults.extra_callback_fns)] + listify(self.callbacks)
348 | return get_preds(self.model, self.dl(ds_type), cb_handler=CallbackHandler(callbacks),
349 | activ=activ, loss_func=lf, n_batch=n_batch, pbar=pbar)
350 |
351 | def pred_batch(self, ds_type:DatasetType=DatasetType.Valid, batch:Tuple=None, reconstruct:bool=False,
352 | with_dropout:bool=False, activ:nn.Module=None) -> List[Tensor]:
353 | "Return output of the model on one batch from `ds_type` dataset."
354 | if batch is not None: xb,yb = batch
355 | else: xb,yb = self.data.one_batch(ds_type, detach=False, denorm=False)
356 | cb_handler = CallbackHandler(self.callbacks)
357 | xb,yb = cb_handler.on_batch_begin(xb,yb, train=False)
358 | activ = ifnone(activ, _loss_func2activ(self.loss_func))
359 | with torch.no_grad():
360 | if not with_dropout: preds = loss_batch(self.model.eval(), xb, yb, cb_handler=cb_handler)
361 | else: preds = loss_batch(self.model.eval().apply(self.apply_dropout), xb, yb, cb_handler=cb_handler)
362 | res = activ(preds[0])
363 | if not reconstruct: return res
364 | res = res.detach().cpu()
365 | ds = self.dl(ds_type).dataset
366 | norm = getattr(self.data, 'norm', False)
367 | if norm and norm.keywords.get('do_y',False):
368 | res = self.data.denorm(res, do_x=True)
369 | return [ds.reconstruct(o) for o in res]
370 |
371 | def backward(self, item):
372 | "Pass `item` through the model and computes the gradient. Useful if `backward_hooks` are attached."
373 | xb,yb = self.data.one_item(item)
374 | loss = loss_batch(self.model.eval(), xb, yb, self.loss_func, opt=FakeOptimizer(),
375 | cb_handler=CallbackHandler(self.callbacks))
376 | return loss
377 |
378 | def predict(self, item:ItemBase, return_x:bool=False, batch_first:bool=True, with_dropout:bool=False, **kwargs):
379 | "Return predicted class, label and probabilities for `item`."
380 | batch = self.data.one_item(item)
381 | res = self.pred_batch(batch=batch, with_dropout=with_dropout)
382 | raw_pred,x = grab_idx(res,0,batch_first=batch_first),batch[0]
383 | norm = getattr(self.data,'norm',False)
384 | if norm:
385 | x = self.data.denorm(x)
386 | if norm.keywords.get('do_y',False): raw_pred = self.data.denorm(raw_pred)
387 | ds = self.data.single_ds
388 | pred = ds.y.analyze_pred(raw_pred, **kwargs)
389 | x = ds.x.reconstruct(grab_idx(x, 0))
390 | y = ds.y.reconstruct(pred, x) if has_arg(ds.y.reconstruct, 'x') else ds.y.reconstruct(pred)
391 | return (x, y, pred, raw_pred) if return_x else (y, pred, raw_pred)
392 |
393 | def validate(self, dl=None, callbacks=None, metrics=None):
394 | "Validate on `dl` with potential `callbacks` and `metrics`."
395 | dl = ifnone(dl, self.data.valid_dl)
396 | metrics = ifnone(metrics, self.metrics)
397 | cb_handler = CallbackHandler(self.callbacks + ifnone(callbacks, []), metrics)
398 | cb_handler.on_train_begin(1, None, metrics); cb_handler.on_epoch_begin()
399 | val_metrics = validate(self.model, dl, self.loss_func, cb_handler)
400 | cb_handler.on_epoch_end(val_metrics)
401 | return cb_handler.state_dict['last_metrics']
402 |
403 | def show_results(self, ds_type=DatasetType.Valid, rows:int=5, **kwargs):
404 | "Show `rows` result of predictions on `ds_type` dataset."
405 |         #TODO: get rid of has_arg x and split_kwargs_by_func if possible
406 | #TODO: simplify this and refactor with pred_batch(...reconstruct=True)
407 | n_items = rows ** 2 if self.data.train_ds.x._square_show_res else rows
408 | if self.dl(ds_type).batch_size < n_items: n_items = self.dl(ds_type).batch_size
409 | ds = self.dl(ds_type).dataset
410 | self.callbacks.append(RecordOnCPU())
411 | preds = self.pred_batch(ds_type)
412 | *self.callbacks,rec_cpu = self.callbacks
413 | x,y = rec_cpu.input,rec_cpu.target
414 | norm = getattr(self.data,'norm',False)
415 | if norm:
416 | x = self.data.denorm(x)
417 | if norm.keywords.get('do_y',False):
418 | y = self.data.denorm(y, do_x=True)
419 | preds = self.data.denorm(preds, do_x=True)
420 | analyze_kwargs,kwargs = split_kwargs_by_func(kwargs, ds.y.analyze_pred)
421 | preds = [ds.y.analyze_pred(grab_idx(preds, i), **analyze_kwargs) for i in range(n_items)]
422 | xs = [ds.x.reconstruct(grab_idx(x, i)) for i in range(n_items)]
423 | if has_arg(ds.y.reconstruct, 'x'):
424 | ys = [ds.y.reconstruct(grab_idx(y, i), x=x) for i,x in enumerate(xs)]
425 | zs = [ds.y.reconstruct(z, x=x) for z,x in zip(preds,xs)]
426 | else :
427 | ys = [ds.y.reconstruct(grab_idx(y, i)) for i in range(n_items)]
428 | zs = [ds.y.reconstruct(z) for z in preds]
429 | ds.x.show_xyzs(xs, ys, zs, **kwargs)
430 |
431 | def apply_dropout(self, m):
432 |         "If a module contains 'dropout' in its name, it will be switched to .train() mode."
433 | if 'dropout' in m.__class__.__name__.lower(): m.train()
434 |
435 | def predict_with_mc_dropout(self, item:ItemBase, with_dropout:bool=True, n_times=10, **kwargs):
436 | "Make predictions with dropout turned on for n_times (default 10)."
437 | return [self.predict(item, with_dropout=with_dropout) for _ in range(n_times)]
438 |
439 | class RecordOnCPU(Callback):
440 | "Store the `input` and `target` going through the model on the CPU."
441 | def on_batch_begin(self, last_input,last_target,**kwargs):
442 | self.input,self.target = to_cpu(last_input),to_cpu(last_target)
443 |
444 | class LearnerCallback(Callback):
445 | "Base class for creating callbacks for a `Learner`."
446 | def __init__(self, learn):
447 | self._learn = weakref.ref(learn)
448 | self.exclude,self.not_min = ['_learn'],[]
449 | setattr(self.learn, self.cb_name, self)
450 |
451 | def __getattr__(self,k): return getattr(self.learn, k)
452 | def __setstate__(self,data:Any): self.__dict__.update(data)
453 |
454 | @property
455 | def learn(self) -> Learner: return self._learn()
456 | @learn.setter
457 | def learn(self, learn: Learner) -> None: self._learn = weakref.ref(learn)
458 |
459 | @property
460 | def cb_name(self): return camel2snake(self.__class__.__name__)
461 |
462 | class Recorder(LearnerCallback):
463 | "A `LearnerCallback` that records epoch, loss, opt and metric data during training."
464 | _order=-10
465 | def __init__(self, learn:Learner, add_time:bool=True, silent:bool=False):
466 | super().__init__(learn)
467 | if not getattr(self.learn, 'opt', False): self.learn.create_opt(defaults.lr, self.learn.wd)
468 | self.opt = self.learn.opt
469 | self.train_dl = self.learn.data.train_dl
470 | self.no_val,self.silent,self.add_time = False,silent,add_time
471 |
472 | def on_train_begin(self, pbar:PBar, metrics_names:Collection[str], **kwargs:Any)->None:
473 | "Initialize recording status at beginning of training."
474 | self.pbar = pbar
475 | self.names = ['epoch', 'train_loss'] if self.no_val else ['epoch', 'train_loss', 'valid_loss']
476 | self.metrics_names = metrics_names
477 | if hasattr(self, '_added_met_names'): self.metrics_names += self._added_met_names
478 | self.names += self.metrics_names
479 | if self.add_time: self.names.append('time')
480 | if not self.silent: self.pbar.write(self.names, table=True)
481 | self.losses,self.val_losses,self.lrs,self.moms,self.metrics,self.nb_batches = [],[],[],[],[],[]
482 |
483 | def on_epoch_begin(self, **kwargs:Any)->None:
484 | if self.add_time: self.start_epoch = time()
485 |
486 | def on_batch_begin(self, train, **kwargs:Any)->None:
487 | "Record learning rate and momentum at beginning of batch."
488 | if train:
489 | self.lrs.append(self.opt.lr)
490 | #if self.opt.mom is not None:
491 | #self.moms.append(self.opt.mom)
492 |
493 | def on_backward_begin(self, smooth_loss:Tensor, **kwargs:Any)->None:
494 | "Record the loss before any other callback has a chance to modify it."
495 | self.losses.append(smooth_loss)
496 | if self.pbar is not None and hasattr(self.pbar,'child'):
497 | self.pbar.child.comment = f'{smooth_loss:.4f}'
498 |
499 | def on_epoch_end(self, epoch:int, num_batch:int, smooth_loss:Tensor,
500 | last_metrics:MetricsList, **kwargs:Any)->bool:
501 | "Save epoch info: num_batch, smooth_loss, metrics."
502 | self.nb_batches.append(num_batch)
503 | if last_metrics is not None: self.val_losses.append(last_metrics[0])
504 | else: last_metrics = [] if self.no_val else [None]
505 | if len(last_metrics) > 1: self.metrics.append(last_metrics[1:])
506 | self.format_stats([epoch, smooth_loss] + last_metrics)
507 |
508 | def format_stats(self, stats:TensorOrNumList)->None:
509 | "Format stats before printing."
510 | str_stats = []
511 | for name,stat in zip(self.names,stats):
512 | str_stats.append('#na#' if stat is None else str(stat) if isinstance(stat, int) else f'{stat:.6f}')
513 | if self.add_time: str_stats.append(format_time(time() - self.start_epoch))
514 | if not self.silent: self.pbar.write(str_stats, table=True)
515 |
516 | def add_metric_names(self, names):
517 | "Add `names` to the inner metric names."
518 | if hasattr(self, '_added_met_names'): self._added_met_names += names
519 | else: self._added_met_names = names
520 |
521 | def plot_lr(self, show_moms=False, skip_start:int=0, skip_end:int=0, return_fig:bool=None)->Optional[plt.Figure]:
522 | "Plot learning rate, `show_moms` to include momentum."
523 | lrs = self._split_list(self.lrs, skip_start, skip_end)
524 | iterations = self._split_list(range_of(self.lrs), skip_start, skip_end)
525 | if show_moms:
526 | moms = self._split_list(self.moms, skip_start, skip_end)
527 | fig, axs = plt.subplots(1,2, figsize=(12,4))
528 | axs[0].plot(iterations, lrs)
529 | axs[0].set_xlabel('Iterations')
530 | axs[0].set_ylabel('Learning Rate')
531 | axs[1].plot(iterations, moms)
532 | axs[1].set_xlabel('Iterations')
533 | axs[1].set_ylabel('Momentum')
534 | else:
535 | fig, ax = plt.subplots()
536 | ax.plot(iterations, lrs)
537 | ax.set_xlabel('Iterations')
538 | ax.set_ylabel('Learning Rate')
539 | if ifnone(return_fig, defaults.return_fig): return fig
540 | if not IN_NOTEBOOK: plot_sixel(fig)
541 |
542 | @staticmethod
543 | def smoothen_by_spline(xs, ys, **kwargs):
544 | xs = np.arange(len(ys))
545 | spl = scipy.interpolate.UnivariateSpline(xs, ys, **kwargs)
546 | ys = spl(xs)
547 | return ys
548 |
549 | def plot(self, skip_start:int=10, skip_end:int=5, suggestion:bool=False, return_fig:bool=None,
550 | **kwargs)->Optional[plt.Figure]:
551 | "Plot learning rate and losses, trimmed between `skip_start` and `skip_end`. Optionally plot and return min gradient"
552 | lrs = self._split_list(self.lrs, skip_start, skip_end)
553 | losses = self._split_list(self.losses, skip_start, skip_end)
554 | losses = [x.item() for x in losses]
555 | if 'k' in kwargs: losses = self.smoothen_by_spline(lrs, losses, **kwargs)
556 | fig, ax = plt.subplots(1,1)
557 | ax.plot(lrs, losses)
558 | ax.set_ylabel("Loss")
559 | ax.set_xlabel("Learning Rate")
560 | ax.set_xscale('log')
561 | ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%.0e'))
562 | if suggestion:
563 | try: mg = (np.gradient(np.array(losses))).argmin()
564 | except:
565 | print("Failed to compute the gradients, there might not be enough points.")
566 | return
567 | print(f"Min numerical gradient: {lrs[mg]:.2E}")
568 | ax.plot(lrs[mg],losses[mg],markersize=10,marker='o',color='red')
569 | self.min_grad_lr = lrs[mg]
570 | ml = np.argmin(losses)
571 | print(f"Min loss divided by 10: {lrs[ml]/10:.2E}")
572 | if ifnone(return_fig, defaults.return_fig): return fig
573 | if not IN_NOTEBOOK: plot_sixel(fig)
574 |
575 | def plot_losses(self, skip_start:int=0, skip_end:int=0, return_fig:bool=None)->Optional[plt.Figure]:
576 | "Plot training and validation losses."
577 | fig, ax = plt.subplots(1,1)
578 | losses = self._split_list(self.losses, skip_start, skip_end)
579 | iterations = self._split_list(range_of(self.losses), skip_start, skip_end)
580 | ax.plot(iterations, losses, label='Train')
581 | val_iter = self._split_list_val(np.cumsum(self.nb_batches), skip_start, skip_end)
582 | val_losses = self._split_list_val(self.val_losses, skip_start, skip_end)
583 | ax.plot(val_iter, val_losses, label='Validation')
584 | ax.set_ylabel('Loss')
585 | ax.set_xlabel('Batches processed')
586 | ax.legend()
587 | if ifnone(return_fig, defaults.return_fig): return fig
588 | if not IN_NOTEBOOK: plot_sixel(fig)
589 |
590 | def plot_metrics(self, skip_start:int=0, skip_end:int=0, return_fig:bool=None)->Optional[plt.Figure]:
591 | "Plot metrics collected during training."
592 | assert len(self.metrics) != 0, "There are no metrics to plot."
593 | fig, axes = plt.subplots(len(self.metrics[0]),1,figsize=(6, 4*len(self.metrics[0])))
594 | val_iter = self._split_list_val(np.cumsum(self.nb_batches), skip_start, skip_end)
595 | axes = axes.flatten() if len(self.metrics[0]) != 1 else [axes]
596 | for i, ax in enumerate(axes):
597 | values = [met[i] for met in self.metrics]
598 | values = self._split_list_val(values, skip_start, skip_end)
599 | ax.plot(val_iter, values)
600 | ax.set_ylabel(str(self.metrics_names[i]))
601 | ax.set_xlabel('Batches processed')
602 | if ifnone(return_fig, defaults.return_fig): return fig
603 | if not IN_NOTEBOOK: plot_sixel(fig)
604 |
605 | def _split_list(self, vals:Collection[float], skip_start:int, skip_end:int):
606 | return vals[skip_start:-skip_end] if skip_end > 0 else vals[skip_start:]
607 |
608 | def _split_list_val(self, vals:Collection[float], skip_start:int, skip_end:int):
609 | val_iter = np.cumsum(self.nb_batches)
610 | start_val = (val_iter - skip_start >= 0).nonzero()[0].min()
611 | end_val = (val_iter[-1] - val_iter - skip_end >= 0).nonzero()[0].max()+1
612 | return vals[start_val:end_val] if skip_end > 0 else vals[start_val:]
613 |
614 | class FakeOptimizer():
615 | def step(self): pass
616 | def zero_grad(self): pass
617 |
618 | def load_callback(class_func, state, learn:Learner):
619 | init_kwargs, others = split_kwargs_by_func(state, class_func.__init__)
620 | res = class_func(learn, **init_kwargs) if issubclass(class_func, LearnerCallback) else class_func(**init_kwargs)
621 | for k,v in others.items(): setattr(res, k, v)
622 | return res
623 |
624 | def load_learner(path:PathOrStr, file:PathLikeOrBinaryStream='export.pkl', test:ItemList=None, tfm_y=None, **db_kwargs):
625 | "Load a `Learner` object saved with `export_state` in `path/file` with empty data, optionally add `test` and load on `cpu`. `file` can be file-like (file or buffer)"
626 | source = Path(path)/file if is_pathlike(file) else file
627 | state = torch.load(source, map_location='cpu') if defaults.device == torch.device('cpu') else torch.load(source)
628 | model = state.pop('model')
629 | src = LabelLists.load_state(path, state.pop('data'))
630 | if test is not None: src.add_test(test, tfm_y=tfm_y)
631 | data = src.databunch(**db_kwargs)
632 | cb_state = state.pop('cb_state')
633 | clas_func = state.pop('cls')
634 | res = clas_func(data, model, **state)
635 | res.callback_fns = state['callback_fns'] #to avoid duplicates
636 | res.callbacks = [load_callback(c,s, res) for c,s in cb_state.items()]
637 | return res
638 |
--------------------------------------------------------------------------------
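The key change in `loss_batch` above is that it builds a `closure` returning the loss and hands it to `opt.step(closure)` instead of calling `loss.backward()` itself, because a line-search optimizer needs to re-evaluate the loss at candidate step sizes. The sketch below illustrates this closure-driven pattern using `torch.optim.LBFGS`, which is simply a readily available closure-based optimizer; it is not the Sls optimizer itself, whose closure (as shown above) only returns the loss and leaves the backward pass to the optimizer. Model and data names are placeholders.

```python
# Illustration of a closure-driven optimizer step (LBFGS as a stand-in, not Sls itself).
import torch
import torch.nn as nn
import torch.nn.functional as F

model = nn.Linear(10, 2)
opt = torch.optim.LBFGS(model.parameters(), lr=0.1)

xb, yb = torch.randn(32, 10), torch.randint(0, 2, (32,))

def closure():
    # LBFGS expects the closure to zero grads, compute the loss, backprop, and return the loss.
    # Sls only needs the loss returned; it handles the backward pass internally.
    opt.zero_grad()
    loss = F.cross_entropy(model(xb), yb)
    loss.backward()
    return loss

opt.step(closure)  # the optimizer may call the closure several times within one step
```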
/sls/callback.py:
--------------------------------------------------------------------------------
1 | "Callbacks provides extensibility to the `basic_train` loop. See `train` for examples of custom callbacks."
2 | from .basic_data import *
3 | from .torch_core import *
4 | import torch.distributed as dist
5 |
6 | __all__ = ['AverageMetric', 'Callback', 'CallbackHandler', 'OptimWrapper', 'SmoothenValue', 'Scheduler', 'annealing_cos', 'CallbackList',
7 | 'annealing_exp', 'annealing_linear', 'annealing_no', 'annealing_poly']
8 |
9 | class OptimWrapper():
10 |     "Basic wrapper around `opt` to simplify hyper-parameter changes."
11 | def __init__(self, opt:optim.Optimizer, wd:Floats=0., true_wd:bool=False, bn_wd:bool=True):
12 | assert not isinstance(opt, OptimWrapper)
13 | self.opt,self.true_wd,self.bn_wd = opt,true_wd,bn_wd
14 | self.opt_keys = list(self.opt.param_groups[0].keys())
15 | self.opt_keys.remove('params')
16 | self.read_defaults()
17 | self.wd = wd
18 |
19 | @classmethod
20 | def create(cls, opt_func:Union[type,Callable], lr:Union[float,Tuple,List], layer_groups:ModuleList, wd:Floats=0.,
21 | true_wd:bool=False, bn_wd:bool=True)->optim.Optimizer:
22 | "Create an `optim.Optimizer` from `opt_func` with `lr`. Set lr on `layer_groups`."
23 | split_params = split_no_wd_params(layer_groups)
24 | opt = opt_func([{'params': p, 'lr':0} for p in split_params])
25 | opt = cls(opt, wd=wd, true_wd=true_wd, bn_wd=bn_wd)
26 | opt.lr,opt.opt_func = listify(lr, layer_groups),opt_func
27 | return opt
28 |
29 | def new(self, layer_groups:Collection[nn.Module], split_no_wd:bool=True):
30 | "Create a new `OptimWrapper` from `self` with another `layer_groups` but the same hyper-parameters."
31 | opt_func = getattr(self, 'opt_func', self.opt.__class__)
32 | res = self.create(opt_func, self.lr, layer_groups, wd=self.wd, true_wd=self.true_wd, bn_wd=self.bn_wd)
33 | res.mom,res.beta = self.mom,self.beta
34 | return res
35 |
36 | def new_with_params(self, param_groups:Collection[Collection[nn.Parameter]]):
37 |         "Create a new `OptimWrapper` from `self` with other `param_groups` but the same hyper-parameters."
38 | opt_func = getattr(self, 'opt_func', self.opt.__class__)
39 | opt = opt_func([{'params': p, 'lr':0} for p in param_groups])
40 | opt = self.__class__(opt, wd=self.wd, true_wd=self.true_wd, bn_wd=self.bn_wd)
41 | opt.lr,opt.opt_func,opt.mom,opt.beta = self.lr,opt_func,self.mom,self.beta
42 | return opt
43 |
44 | def __repr__(self)->str:
45 | return f'OptimWrapper over {repr(self.opt)}.\nTrue weight decay: {self.true_wd}'
46 |
47 | #Pytorch optimizer methods
48 | def step(self,closure=None)->None:
49 | "Set weight decay and step optimizer."
50 | # weight decay outside of optimizer step (AdamW)
51 | if self.true_wd:
52 | for lr,wd,pg1,pg2 in zip(self._lr,self._wd,self.opt.param_groups[::2],self.opt.param_groups[1::2]):
53 | for p in pg1['params']: p.data.mul_(1 - wd*lr)
54 | if self.bn_wd:
55 | for p in pg2['params']: p.data.mul_(1 - wd*lr)
56 | self.set_val('weight_decay', listify(0, self._wd))
57 | self.opt.step(closure)
58 |
59 | def zero_grad(self)->None:
60 | "Clear optimizer gradients."
61 | self.opt.zero_grad()
62 |
63 | #Passthrough to the inner opt.
64 | def __getattr__(self, k:str)->Any: return getattr(self.opt, k, None)
65 | def __setstate__(self,data:Any): self.__dict__.update(data)
66 |
67 | def clear(self):
68 | "Reset the state of the inner optimizer."
69 | sd = self.state_dict()
70 | sd['state'] = {}
71 | self.load_state_dict(sd)
72 |
73 | @property
74 | def n_params(self): return sum([len(pg['params']) for pg in self.opt.param_groups])
75 |
76 | #Hyperparameters as properties
77 | @property
78 | def lr(self)->float: return self._lr[-1]
79 | @lr.setter
80 | def lr(self, val:float)->None:
81 | self._lr = self.set_val('lr', listify(val, self._lr))
82 |
83 | @property
84 | def mom(self)->float:return self._mom[-1]
85 | @mom.setter
86 | def mom(self, val:float)->None:
87 | if 'momentum' in self.opt_keys: self.set_val('momentum', listify(val, self._mom))
88 | elif 'betas' in self.opt_keys: self.set_val('betas', (listify(val, self._mom), self._beta))
89 | self._mom = listify(val, self._mom)
90 |
91 | @property
92 | def beta(self)->float: return None if self._beta is None else self._beta[-1]
93 | @beta.setter
94 | def beta(self, val:float)->None:
95 | "Set beta (or alpha as makes sense for given optimizer)."
96 | if val is None: return
97 | if 'betas' in self.opt_keys: self.set_val('betas', (self._mom, listify(val, self._beta)))
98 | elif 'alpha' in self.opt_keys: self.set_val('alpha', listify(val, self._beta))
99 | self._beta = listify(val, self._beta)
100 |
101 | @property
102 | def wd(self)->float: return self._wd[-1]
103 | @wd.setter
104 | def wd(self, val:float)->None:
105 | "Set weight decay."
106 | if not self.true_wd: self.set_val('weight_decay', listify(val, self._wd), bn_groups=self.bn_wd)
107 | self._wd = listify(val, self._wd)
108 |
109 | #Helper functions
110 | def read_defaults(self)->None:
111 | "Read the values inside the optimizer for the hyper-parameters."
112 | self._beta = None
113 | if 'lr' in self.opt_keys: self._lr = self.read_val('lr')
114 | if 'momentum' in self.opt_keys: self._mom = self.read_val('momentum')
115 | if 'alpha' in self.opt_keys: self._beta = self.read_val('alpha')
116 | if 'betas' in self.opt_keys: self._mom,self._beta = self.read_val('betas')
117 | if 'weight_decay' in self.opt_keys: self._wd = self.read_val('weight_decay')
118 | reserved_names = ['params', 'lr', 'momentum', 'alpha', 'betas', 'weight_decay']
119 | stat_names = [n for n in self.opt_keys if n not in reserved_names]
120 | self._stats = {n:self.read_val(n) for n in stat_names}
121 |
122 | def get_stat(self, name:str)->float:
123 | if name in ['lr', 'mom', 'beta', 'wd']: return getattr(self, name)
124 | else: return self._stats[name][-1]
125 | def set_stat(self, name:str, value:Union[float, Collection[float]])->None:
126 | if name in ['lr', 'mom', 'beta', 'wd']: setattr(self, name, value)
127 | else:
128 | val = listify(value, self._stats[name])
129 | self.set_val(name, val)
130 | self._stats[name] = val
131 |
132 | def set_val(self, key:str, val:Any, bn_groups:bool=True)->Any:
133 | "Set `val` inside the optimizer dictionary at `key`."
134 | if is_tuple(val): val = [(v1,v2) for v1,v2 in zip(*val)]
135 | for v,pg1,pg2 in zip(val,self.opt.param_groups[::2],self.opt.param_groups[1::2]):
136 | pg1[key] = v
137 | if bn_groups: pg2[key] = v
138 | return val
139 |
140 | def read_val(self, key:str) -> Union[List[float],Tuple[List[float],List[float]]]:
141 | "Read a hyperparameter `key` in the optimizer dictionary."
142 | val = [pg[key] for pg in self.opt.param_groups[::2]]
143 | if is_tuple(val[0]): val = [o[0] for o in val], [o[1] for o in val]
144 | return val
145 |
146 | def get_state(self):
147 | "Return the inner state minus the layer groups."
148 | return {'opt_state':self.opt.state_dict(), 'lr':self._lr, 'wd':self._wd, 'beta':self._beta, 'mom':self._mom,
149 | 'opt_func':self.opt_func, 'true_wd':self.true_wd, 'bn_wd':self.bn_wd}
150 |
151 | @classmethod
152 | def load_with_state_and_layer_group(cls, state:dict, layer_groups:Collection[nn.Module]):
153 | res = cls.create(state['opt_func'], state['lr'], layer_groups, wd=state['wd'], true_wd=state['true_wd'],
154 | bn_wd=state['bn_wd'])
155 | res._mom,res._beta = state['mom'],state['beta']
156 | res.load_state_dict(state['opt_state'])
157 | return res
158 |
159 | class Callback():
160 | "Base class for callbacks that want to record values, dynamically change learner params, etc."
161 | _order=0
162 | def on_train_begin(self, **kwargs:Any)->None:
163 | "To initialize constants in the callback."
164 | pass
165 | def on_epoch_begin(self, **kwargs:Any)->None:
166 | "At the beginning of each epoch."
167 | pass
168 | def on_batch_begin(self, **kwargs:Any)->None:
169 | "Set HP before the output and loss are computed."
170 | pass
171 | def on_loss_begin(self, **kwargs:Any)->None:
172 | "Called after forward pass but before loss has been computed."
173 | pass
174 | def on_backward_begin(self, **kwargs:Any)->None:
175 | "Called after the forward pass and the loss has been computed, but before backprop."
176 | pass
177 | def on_backward_end(self, **kwargs:Any)->None:
178 | "Called after backprop but before optimizer step. Useful for true weight decay in AdamW."
179 | pass
180 | def on_step_end(self, **kwargs:Any)->None:
181 | "Called after the step of the optimizer but before the gradients are zeroed."
182 | pass
183 | def on_batch_end(self, **kwargs:Any)->None:
184 | "Called at the end of the batch."
185 | pass
186 | def on_epoch_end(self, **kwargs:Any)->None:
187 | "Called at the end of an epoch."
188 | pass
189 | def on_train_end(self, **kwargs:Any)->None:
190 | "Useful for cleaning up things and saving files/models."
191 | pass
192 | def jump_to_epoch(self, epoch)->None:
193 | "To resume training at `epoch` directly."
194 | pass
195 |
196 | def get_state(self, minimal:bool=True):
197 | "Return the inner state of the `Callback`, `minimal` or not."
198 | to_remove = ['exclude', 'not_min'] + getattr(self, 'exclude', []).copy()
199 | if minimal: to_remove += getattr(self, 'not_min', []).copy()
200 | return {k:v for k,v in self.__dict__.items() if k not in to_remove}
201 |
202 | def __repr__(self):
203 | attrs = func_args(self.__init__)
204 | to_remove = getattr(self, 'exclude', [])
205 | list_repr = [self.__class__.__name__] + [f'{k}: {getattr(self, k)}' for k in attrs if k != 'self' and k not in to_remove]
206 | return '\n'.join(list_repr)
207 |
208 | class SmoothenValue():
209 | "Create a smooth moving average for a value (loss, etc) using `beta`."
210 | def __init__(self, beta:float):
211 | self.beta,self.n,self.mov_avg = beta,0,0
212 |
213 | def add_value(self, val:float)->None:
214 | "Add `val` to calculate updated smoothed value."
215 | self.n += 1
216 | self.mov_avg = self.beta * self.mov_avg + (1 - self.beta) * val
217 | self.smooth = self.mov_avg / (1 - self.beta ** self.n)
218 |
219 | CallbackList = Collection[Callback]
220 |
221 | def _get_init_state(): return {'epoch':0, 'iteration':0, 'num_batch':0, 'skip_validate': False}
222 |
223 | @dataclass
224 | class CallbackHandler():
225 | "Manage all of the registered `callbacks` and `metrics`, smoothing loss by momentum `beta`."
226 | callbacks:CallbackList=None
227 | metrics:CallbackList=None
228 | beta:float=0.98
229 |
230 | def __post_init__(self)->None:
231 | "Initialize smoother and learning stats."
232 | self.callbacks = ifnone(self.callbacks, [])
233 | self.metrics = ifnone(self.metrics, [])
234 | self.metrics = [(met if isinstance(met, Callback) else AverageMetric(met)) for met in self.metrics]
235 | self.callbacks = sorted(self.callbacks, key=lambda o: getattr(o, '_order', 0))
236 | self.smoothener = SmoothenValue(self.beta)
237 | self.state_dict:Dict[str,Union[int,float,Tensor]]=_get_init_state()
238 |
239 | def _call_and_update(self, cb, cb_name, **kwargs)->None:
240 | "Call `cb_name` on `cb` and update the inner state."
241 | new = ifnone(getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs), dict())
242 | for k,v in new.items():
243 | if k not in self.state_dict:
244 | raise Exception(f"{k} isn't a valid key in the state of the callbacks.")
245 | else: self.state_dict[k] = v
246 |
247 | def __call__(self, cb_name, call_mets=True, **kwargs)->None:
248 |         "Call through to all of the `CallbackHandler` functions."
249 | if call_mets:
250 | for met in self.metrics: self._call_and_update(met, cb_name, **kwargs)
251 | for cb in self.callbacks: self._call_and_update(cb, cb_name, **kwargs)
252 |
253 | def set_dl(self, dl:DataLoader):
254 | "Set the current `dl` used."
255 | if hasattr(self, 'cb_dl'): self.callbacks.remove(self.cb_dl)
256 | if isinstance(dl.dataset, Callback):
257 | self.callbacks.append(dl.dataset)
258 | self.cb_dl = dl.dataset
259 |
260 | def on_train_begin(self, epochs:int, pbar:PBar, metrics:MetricFuncList)->None:
261 | "About to start learning."
262 | self.state_dict = _get_init_state()
263 | self.state_dict.update(dict(n_epochs=epochs, pbar=pbar, metrics=metrics))
264 | names = [(met.name if hasattr(met, 'name') else camel2snake(met.__class__.__name__)) for met in self.metrics]
265 | self('train_begin', metrics_names=names)
266 | if self.state_dict['epoch'] != 0:
267 | self.state_dict['pbar'].first_bar.total -= self.state_dict['epoch']
268 | for cb in self.callbacks: cb.jump_to_epoch(self.state_dict['epoch'])
269 |
270 | def on_epoch_begin(self)->None:
271 | "Handle new epoch."
272 | self.state_dict['num_batch'],self.state_dict['stop_training'] = 0,False
273 | self('epoch_begin')
274 |
275 | def on_batch_begin(self, xb:Tensor, yb:Tensor, train:bool=True)->Tuple[Any,Any]:
276 | "Handle new batch `xb`,`yb` in `train` or validation."
277 | self.state_dict.update(dict(last_input=xb, last_target=yb, train=train,
278 | stop_epoch=False, skip_step=False, skip_zero=False, skip_bwd=False))
279 | self('batch_begin', call_mets = not self.state_dict['train'])
280 | return self.state_dict['last_input'], self.state_dict['last_target']
281 |
282 | def on_loss_begin(self, out:Tensor)->Any:
283 | "Handle start of loss calculation with model output `out`."
284 | self.state_dict['last_output'] = out
285 | self('loss_begin', call_mets=False)
286 | return self.state_dict['last_output']
287 |
288 | def on_backward_begin(self, loss:Tensor)->Tuple[Any,Any]:
289 | "Handle gradient calculation on `loss`."
290 | self.smoothener.add_value(loss.float().detach().cpu())
291 | self.state_dict['last_loss'], self.state_dict['smooth_loss'] = loss, self.smoothener.smooth
292 | self('backward_begin', call_mets=False)
293 | return self.state_dict['last_loss'], self.state_dict['skip_bwd']
294 |
295 | def on_backward_end(self)->Any:
296 | "Handle end of gradient calculation."
297 | self('backward_end', call_mets=False)
298 | return self.state_dict['skip_step']
299 |
300 | def on_step_end(self)->Any:
301 | "Handle end of optimization step."
302 | self('step_end', call_mets=False)
303 | return self.state_dict['skip_zero']
304 |
305 | def on_batch_end(self, loss:Tensor)->Any:
306 | "Handle end of processing one batch with `loss`."
307 | self.state_dict['last_loss'] = loss
308 | self('batch_end', call_mets = not self.state_dict['train'])
309 | if self.state_dict['train']:
310 | self.state_dict['iteration'] += 1
311 | self.state_dict['num_batch'] += 1
312 | return self.state_dict['stop_epoch']
313 |
314 | def on_epoch_end(self, val_loss:Tensor)->bool:
315 | "Epoch is done, process `val_loss`."
316 | self.state_dict['last_metrics'] = [val_loss] if val_loss is not None else [None]
317 | self('epoch_end', call_mets = val_loss is not None)
318 | self.state_dict['epoch'] += 1
319 | return self.state_dict['stop_training']
320 |
321 | def on_train_end(self, exception:Union[bool,Exception])->None:
322 | "Handle end of training, `exception` is an `Exception` or False if no exceptions during training."
323 | self('train_end', exception=exception)
324 |
325 | @property
326 | def skip_validate(self): return self.state_dict['skip_validate']
327 |
328 | class AverageMetric(Callback):
329 | "Wrap a `func` in a callback for metrics computation."
330 | def __init__(self, func):
331 | # If func has a __name__, use it; otherwise func should be a partial
332 | name = func.__name__ if hasattr(func, '__name__') else func.func.__name__
333 | self.func, self.name = func, name
334 | self.world = num_distrib()
335 |
336 | def on_epoch_begin(self, **kwargs):
337 | "Set the inner value to 0."
338 | self.val, self.count = 0.,0
339 |
340 | def on_batch_end(self, last_output, last_target, **kwargs):
341 | "Update metric computation with `last_output` and `last_target`."
342 | if not is_listy(last_target): last_target=[last_target]
343 | self.count += first_el(last_target).size(0)
344 | val = self.func(last_output, *last_target)
345 | if self.world:
346 | val = val.clone()
347 | dist.all_reduce(val, op=dist.ReduceOp.SUM)
348 | val /= self.world
349 | self.val += first_el(last_target).size(0) * val.detach().cpu()
350 |
351 | def on_epoch_end(self, last_metrics, **kwargs):
352 | "Set the final result in `last_metrics`."
353 | return add_metrics(last_metrics, self.val/self.count)
354 |
355 | def annealing_no(start:Number, end:Number, pct:float)->Number:
356 | "No annealing, always return `start`."
357 | return start
358 | def annealing_linear(start:Number, end:Number, pct:float)->Number:
359 | "Linearly anneal from `start` to `end` as pct goes from 0.0 to 1.0."
360 | return start + pct * (end-start)
361 | def annealing_exp(start:Number, end:Number, pct:float)->Number:
362 | "Exponentially anneal from `start` to `end` as pct goes from 0.0 to 1.0."
363 | return start * (end/start) ** pct
364 | def annealing_cos(start:Number, end:Number, pct:float)->Number:
365 | "Cosine anneal from `start` to `end` as pct goes from 0.0 to 1.0."
366 | cos_out = np.cos(np.pi * pct) + 1
367 | return end + (start-end)/2 * cos_out
368 |
369 | def do_annealing_poly(start:Number, end:Number, pct:float, degree:Number)->Number:
370 | "Helper function for `anneal_poly`."
371 | return end + (start-end) * (1-pct)**degree
372 | def annealing_poly(degree:Number)->Number:
373 | "Anneal polynomically from `start` to `end` as pct goes from 0.0 to 1.0."
374 | return functools.partial(do_annealing_poly, degree=degree)
375 |
376 | class Scheduler():
377 | "Used to \"step\" from start,end (`vals`) over `n_iter` iterations on a schedule defined by `func`"
378 | def __init__(self, vals:StartOptEnd, n_iter:int, func:Optional[AnnealFunc]=None):
379 | self.start,self.end = (vals[0],vals[1]) if is_tuple(vals) else (vals,0)
380 | self.n_iter = max(1,n_iter)
381 | if func is None: self.func = annealing_linear if is_tuple(vals) else annealing_no
382 | else: self.func = func
383 | self.n = 0
384 |
385 | def restart(self): self.n = 0
386 |
387 | def step(self)->Number:
388 | "Return next value along annealed schedule."
389 | self.n += 1
390 | return self.func(self.start, self.end, self.n/self.n_iter)
391 |
392 | @property
393 | def is_done(self)->bool:
394 | "Return `True` if schedule completed."
395 | return self.n >= self.n_iter
396 |
397 |
--------------------------------------------------------------------------------
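
The annealing helpers and `Scheduler` above are the building blocks for hyper-parameter schedules: a `Scheduler` holds a start/end pair, an iteration count, and an annealing function, and every `step()` returns the next value along that curve until `is_done`. A minimal usage sketch, assuming `Scheduler`, `annealing_cos` and `annealing_linear` from the callback module above are importable (the learning-rate range and iteration count are arbitrary illustrative choices):

    # cosine decay from 1e-2 down to 1e-4 over 100 iterations
    sched = Scheduler(vals=(1e-2, 1e-4), n_iter=100, func=annealing_cos)

    lrs = []
    while not sched.is_done:
        lrs.append(sched.step())   # next value along the cosine curve

    # lrs[0] is just below 1e-2 and lrs[-1] lands on 1e-4; passing
    # func=annealing_linear gives a straight-line decay instead, and a
    # scalar `vals` with no func yields a constant schedule (annealing_no)

Swapping the annealing function is the only change needed to move between the schedules defined above.
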
/sls/sls.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import copy
3 | import time
4 |
5 | import sls_utils as ut
6 |
7 | class Sls(torch.optim.Optimizer):
8 | """Implements stochastic line search
9 | `paper `_.
10 | Arguments:
11 | params (iterable): iterable of parameters to optimize or dicts defining
12 | parameter groups
13 | n_batches_per_epoch (int, recommended): the number of batches in an epoch
14 | init_step_size (float, optional): initial step size (default: 1)
15 | c (float, optional): armijo condition constant (default: 0.1)
16 | beta_b (float, optional): multiplicative factor for decreasing the step-size (default: 0.9)
17 | gamma (float, optional): factor used by Armijo for scaling the step-size at each line-search step (default: 2.0)
18 | beta_f (float, optional): factor used by Goldstein for scaling the step-size at each line-search step (default: 2.0)
19 | reset_option (int, optional): sets the step-size reset strategy (default: 1)
20 | eta_max (float, optional): an upper bound used by Goldstein on the step size (default: 10)
21 | bound_step_size (bool, optional): a flag used by Goldstein for whether to bound the step-size (default: True)
22 | line_search_fn (str, optional): the condition used by the line search to find the
23 | step-size (default: "armijo")
24 | """
25 |
26 | def __init__(self,
27 | params,
28 | n_batches_per_epoch=500,
29 | init_step_size=1,
30 | c=0.1,
31 | beta_b=0.9,
32 | gamma=2.0,
33 | beta_f=2.0,
34 | reset_option=1,
35 | eta_max=10,
36 | bound_step_size=True,
37 | line_search_fn="armijo"):
38 | defaults = dict(n_batches_per_epoch=n_batches_per_epoch,
39 | init_step_size=init_step_size,
40 | c=c,
41 | beta_b=beta_b,
42 | gamma=gamma,
43 | beta_f=beta_f,
44 | reset_option=reset_option,
45 | eta_max=eta_max,
46 | bound_step_size=bound_step_size,
47 | line_search_fn=line_search_fn)
48 | super().__init__(params, defaults)
49 |
50 | self.state['step'] = 0
51 | self.state['step_size'] = init_step_size
52 |
53 | self.state['n_forwards'] = 0
54 | self.state['n_backwards'] = 0
55 |
56 | def step(self, closure):
57 | # deterministic closure
58 | seed = time.time()
59 | def closure_deterministic():
60 | #with ut.random_seed_torch(int(seed)):
61 | return closure()
62 |
63 | batch_step_size = self.state['step_size']
64 |
65 | # get loss and compute gradients
66 | loss = closure() #_deterministic()
67 | loss.backward()
68 |
69 | # increment # forward-backward calls
70 | self.state['n_forwards'] += 1
71 | self.state['n_backwards'] += 1
72 |
73 | # loop over parameter groups
74 | for group in self.param_groups:
75 | params = group["params"]
76 |
77 | # save the current parameters:
78 | params_current = copy.deepcopy(params)
79 | grad_current = ut.get_grad_list(params)
80 |
81 | grad_norm = ut.compute_grad_norm(grad_current)
82 |
83 | step_size = ut.reset_step(step_size=batch_step_size,
84 | n_batches_per_epoch=group['n_batches_per_epoch'],
85 | gamma=group['gamma'],
86 | reset_option=group['reset_option'],
87 | init_step_size=group['init_step_size'])
88 |
89 | # only do the check if the gradient norm is big enough
90 | with torch.no_grad():
91 | if grad_norm >= 1e-8:
92 | # check if condition is satisfied
93 | found = 0
94 | step_size_old = step_size
95 |
96 | for e in range(100):
97 | # try a prospective step
98 | ut.try_sgd_update(params, step_size, params_current, grad_current)
99 |
100 | # compute the loss at the next step; no need to compute gradients.
101 | loss_next = closure() #closure_deterministic()
102 | self.state['n_forwards'] += 1
103 |
104 | # =================================================
105 | # Line search
106 | if group['line_search_fn'] == "armijo":
107 | armijo_results = ut.check_armijo_conditions(step_size=step_size,
108 | step_size_old=step_size_old,
109 | loss=loss,
110 | grad_norm=grad_norm,
111 | loss_next=loss_next,
112 | c=group['c'],
113 | beta_b=group['beta_b'])
114 | found, step_size, step_size_old = armijo_results
115 | if found == 1:
116 | break
117 |
118 | elif group['line_search_fn'] == "goldstein":
119 | goldstein_results = ut.check_goldstein_conditions(step_size=step_size,
120 | loss=loss,
121 | grad_norm=grad_norm,
122 | loss_next=loss_next,
123 | c=group['c'],
124 | beta_b=group['beta_b'],
125 | beta_f=group['beta_f'],
126 | bound_step_size=group['bound_step_size'],
127 | eta_max=group['eta_max'])
128 |
129 | found = goldstein_results["found"]
130 | step_size = goldstein_results["step_size"]
131 |
132 | if found == 3:
133 | break
134 |
135 | # if the line search exhausted its maximum number of attempts, fall back to a tiny step
136 | if found == 0:
137 | print("line search attempts exceeded...using defaults")
138 | ut.try_sgd_update(params, 1e-6, params_current, grad_current)
139 |
140 | # save the new step-size
141 | self.state['step_size'] = step_size
142 | self.state['step'] += 1
143 |
144 | return loss
--------------------------------------------------------------------------------
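
Unlike optimizers whose `step()` takes no arguments, `Sls.step()` needs a closure that recomputes the forward pass: it calls the closure once to obtain the loss, runs `backward()` itself, then re-invokes the closure under `torch.no_grad()` to evaluate trial step sizes during backtracking. A rough usage sketch, assuming `sls.py` above is importable; the dataset, model and loss below are synthetic placeholders, not part of the repository:

    import torch
    import torch.nn as nn
    from torch.utils.data import DataLoader, TensorDataset
    from sls import Sls   # the optimizer defined above

    # --- placeholder data and model, purely for illustration ---
    ds = TensorDataset(torch.randn(256, 10), torch.randint(0, 2, (256,)))
    train_loader = DataLoader(ds, batch_size=32, shuffle=True)
    model = nn.Linear(10, 2)
    loss_fn = nn.CrossEntropyLoss()

    opt = Sls(model.parameters(), n_batches_per_epoch=len(train_loader))

    for xb, yb in train_loader:
        opt.zero_grad()

        def closure():
            # forward pass only: Sls.step() calls backward() once itself
            # and re-evaluates this closure to probe candidate step sizes
            return loss_fn(model(xb), yb)

        loss = opt.step(closure)

Note that the closure must not call `backward()`; the backward pass is handled inside `step()`, and `zero_grad()` is called explicitly before each step.
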
/sls/sls_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.cuda
3 |
4 | import numpy as np
5 | #import contextlib
6 |
7 |
8 | def check_armijo_conditions(step_size, step_size_old, loss, grad_norm,
9 | loss_next, c, beta_b):
10 | found = 0
11 |
12 | # computing the new break condition
13 | break_condition = loss_next - \
14 | (loss - (step_size) * c * grad_norm**2)
15 |
16 | if (break_condition <= 0):
17 | found = 1
18 |
19 | else:
20 | # decrease the step-size by a multiplicative factor
21 | step_size = step_size * beta_b
22 |
23 | return found, step_size, step_size_old
24 |
25 | def check_goldstein_conditions(step_size, loss, grad_norm,
26 | loss_next,
27 | c, beta_b, beta_f, bound_step_size, eta_max):
28 | found = 0
29 | if(loss_next <= (loss - (step_size) * c * grad_norm ** 2)):
30 | found = 1
31 |
32 | if(loss_next >= (loss - (step_size) * (1 - c) * grad_norm ** 2)):
33 | if found == 1:
34 | found = 3 # both conditions are satisfied
35 | else:
36 | found = 2 # only the curvature condition is satisfied
37 |
38 | if (found == 0):
39 | raise ValueError('Goldstein line search: neither condition was satisfied')
40 |
41 | elif (found == 1):
42 | # step-size might be too small
43 | step_size = step_size * beta_f
44 | if bound_step_size:
45 | step_size = min(step_size, eta_max)
46 |
47 | elif (found == 2):
48 | # step-size might be too large
49 | step_size = max(step_size * beta_b, 1e-8)
50 |
51 | return {"found":found, "step_size":step_size}
52 |
53 |
54 | def reset_step(step_size, n_batches_per_epoch=None, gamma=None, reset_option=1,
55 | init_step_size=None):
56 | if reset_option == 0:
57 | pass
58 |
59 | elif reset_option == 1:
60 | step_size = step_size * gamma**(1. / n_batches_per_epoch)
61 |
62 | elif reset_option == 2:
63 | step_size = init_step_size
64 |
65 | return step_size
66 |
67 | def try_sgd_update(params, step_size, params_current, grad_current):
68 | zipped = zip(params, params_current, grad_current)
69 |
70 | for p_next, p_current, g_current in zipped:
71 | p_next.data = p_current - step_size * g_current
72 |
73 | def compute_grad_norm(grad_list):
74 | grad_norm = 0.
75 | for g in grad_list:
76 | if g is None:
77 | continue
78 | grad_norm += torch.sum(torch.mul(g, g))
79 | grad_norm = torch.sqrt(grad_norm)
80 | return grad_norm
81 |
82 |
83 | def get_grad_list(params):
84 | return [p.grad for p in params]
85 |
86 | #@contextlib.contextmanager  # NOTE: re-enable this decorator (and the contextlib import above) to use random_seed as a context manager
87 | def random_seed(seed):
88 | state = np.random.get_state()
89 | np.random.seed(seed)
90 | try:
91 | yield
92 | finally:
93 | np.random.set_state(state)
94 |
95 | #@contextlib.contextmanager  # NOTE: re-enable this decorator to use random_seed_torch as a context manager
96 | def random_seed_torch(seed, device=0):
97 | cpu_rng_state = torch.get_rng_state()
98 | gpu_rng_state = torch.cuda.get_rng_state(device)
99 |
100 | np.random.seed(seed)
101 | torch.manual_seed(seed)
102 | torch.cuda.manual_seed_all(seed)
103 |
104 | try:
105 | yield
106 | finally:
107 | torch.set_rng_state(cpu_rng_state)
108 | torch.cuda.set_rng_state(gpu_rng_state, device)
--------------------------------------------------------------------------------
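
The Armijo check above implements the sufficient-decrease (backtracking) test: a trial `step_size` is accepted when `loss_next <= loss - c * step_size * grad_norm**2`, and otherwise shrunk by `beta_b` and tried again from the original parameters. A small self-contained sketch on a one-dimensional quadratic, assuming `sls_utils` above is importable (the function `f(w) = 0.5*w**2`, the starting point, and the constants are arbitrary illustrative choices):

    import torch
    import sls_utils as ut   # the utilities shown above

    # f(w) = 0.5 * w**2, so grad f(w) = w; start away from the minimum
    w = torch.tensor(4.0)
    loss = 0.5 * w ** 2
    grad_norm = w.abs()

    step_size = 10.0                  # deliberately far too large
    for _ in range(100):
        # the trial point is always taken from the ORIGINAL w,
        # mirroring the inner loop of Sls.step()
        w_next = w - step_size * w
        loss_next = 0.5 * w_next ** 2
        found, step_size, _ = ut.check_armijo_conditions(
            step_size=step_size, step_size_old=step_size,
            loss=loss, grad_norm=grad_norm, loss_next=loss_next,
            c=0.1, beta_b=0.9)
        if found == 1:
            break

    # step_size is backtracked from 10 down to roughly 1.67, the first
    # value for which loss_next <= loss - c * step_size * grad_norm**2 holds

The Goldstein variant adds the mirror-image upper test (`loss_next >= loss - (1 - c) * step_size * grad_norm**2`), so the step size can also be grown by `beta_f`, bounded above by `eta_max` when `bound_step_size` is set.
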