├── .coveragerc ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.rst ├── docs ├── Makefile ├── _static │ ├── rosenbrock-nag.png │ ├── rosenbrock-nag.py │ └── style-tweaks.css ├── _templates │ └── gitwidgets.html ├── conf.py ├── guide.rst ├── index.rst ├── make.bat ├── reference.rst └── requirements.txt ├── downhill ├── __init__.py ├── adaptive.py ├── base.py ├── dataset.py ├── first_order.py └── util.py ├── examples ├── mnist-sparse-factorization.py ├── rosenbrock-100d.py ├── rosenbrock-2d.py └── rosenbrock.py ├── setup.cfg ├── setup.py └── test ├── adaptive_test.py ├── base_test.py ├── dataset_test.py ├── downhill_test.py ├── first_order_test.py └── util.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | omit = 3 | */python?.?/* 4 | */site-packages/nose/* 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | docs/generated/ 53 | 54 | # PyBuilder 55 | target/ 56 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | cache: apt 2 | sudo: false 3 | language: python 4 | python: 5 | - "2.7" 6 | - "3.4" 7 | addons: 8 | apt: 9 | packages: 10 | - libatlas-dev 11 | - libatlas-base-dev 12 | - liblapack-dev 13 | - gfortran 14 | before_install: 15 | - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh 16 | - bash miniconda.sh -b -p $HOME/miniconda 17 | - export PATH="$HOME/miniconda/bin:$PATH" 18 | install: 19 | - conda install --yes python=$TRAVIS_PYTHON_VERSION pip numpy scipy pandas 20 | - pip install pytest-pep8 pytest-cov python-coveralls 21 | - python setup.py develop 22 | script: 23 | - THEANO_FLAGS=floatX=float32 py.test -v --pep8 --cov=downhill --cov-report=term-missing 24 | after_success: 25 | - coveralls 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014-2015 lmjohns3 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, 
subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://travis-ci.org/lmjohns3/downhill.svg 2 | .. image:: https://coveralls.io/repos/lmjohns3/downhill/badge.svg 3 | :target: https://coveralls.io/r/lmjohns3/downhill 4 | .. image:: http://depsy.org/api/package/pypi/downhill/badge.svg 5 | :target: http://depsy.org/package/python/downhill 6 | 7 | ============ 8 | ``DOWNHILL`` 9 | ============ 10 | 11 | The ``downhill`` package provides algorithms for minimizing scalar loss 12 | functions that are defined using Theano_. 13 | 14 | Several optimization algorithms are included: 15 | 16 | - ADADELTA_ 17 | - ADAGRAD_ 18 | - Adam_ 19 | - `Equilibrated SGD`_ 20 | - `Nesterov's Accelerated Gradient`_ 21 | - RMSProp_ 22 | - `Resilient Backpropagation`_ 23 | - `Stochastic Gradient Descent`_ 24 | 25 | All algorithms permit the use of regular or Nesterov-style momentum as well. 26 | 27 | .. _Theano: http://deeplearning.net/software/theano/ 28 | 29 | .. _Stochastic Gradient Descent: http://downhill.readthedocs.org/en/stable/generated/downhill.first_order.SGD.html 30 | .. _Nesterov's Accelerated Gradient: http://downhill.readthedocs.org/en/stable/generated/downhill.first_order.NAG.html 31 | .. _Resilient Backpropagation: http://downhill.readthedocs.org/en/stable/generated/downhill.adaptive.RProp.html 32 | .. _ADAGRAD: http://downhill.readthedocs.org/en/stable/generated/downhill.adaptive.ADAGRAD.html 33 | .. _RMSProp: http://downhill.readthedocs.org/en/stable/generated/downhill.adaptive.RMSProp.html 34 | .. _ADADELTA: http://downhill.readthedocs.org/en/stable/generated/downhill.adaptive.ADADELTA.html 35 | .. _Adam: http://downhill.readthedocs.org/en/stable/generated/downhill.adaptive.Adam.html 36 | .. _Equilibrated SGD: http://downhill.readthedocs.org/en/stable/generated/downhill.adaptive.ESGD.html 37 | 38 | Quick Start: Matrix Factorization 39 | ================================= 40 | 41 | Let's say you have 100 samples of 1000-dimensional data, and you want to 42 | represent your data as 100 coefficients in a 10-dimensional basis. This is 43 | pretty straightforward to model using Theano: you can use a matrix 44 | multiplication as the data model, a squared-error term for optimization, and a 45 | sparse regularizer to encourage small coefficient values. 46 | 47 | Once you have constructed an expression for the loss, you can optimize it with a 48 | single call to ``downhill.minimize``: 49 | 50 | .. 
code:: python 51 | 52 | import downhill 53 | import numpy as np 54 | import theano 55 | import theano.tensor as TT 56 | 57 | FLOAT = 'df'[theano.config.floatX == 'float32'] 58 | 59 | def rand(a, b): 60 | return np.random.randn(a, b).astype(FLOAT) 61 | 62 | A, B, K = 20, 5, 3 63 | 64 | # Set up a matrix factorization problem to optimize. 65 | u = theano.shared(rand(A, K), name='u') 66 | v = theano.shared(rand(K, B), name='v') 67 | z = TT.matrix() 68 | err = TT.sqr(z - TT.dot(u, v)) 69 | loss = err.mean() + abs(u).mean() + (v * v).mean() 70 | 71 | # Minimize the regularized loss with respect to a data matrix. 72 | y = np.dot(rand(A, K), rand(K, B)) + rand(A, B) 73 | 74 | # Monitor during optimization. 75 | monitors = (('err', err.mean()), 76 | ('|u|<0.1', (abs(u) < 0.1).mean()), 77 | ('|v|<0.1', (abs(v) < 0.1).mean())) 78 | 79 | downhill.minimize( 80 | loss=loss, 81 | train=[y], 82 | patience=0, 83 | batch_size=A, # Process y as a single batch. 84 | max_gradient_norm=1, # Prevent gradient explosion! 85 | learning_rate=0.1, 86 | monitors=monitors, 87 | monitor_gradients=True) 88 | 89 | # Print out the optimized coefficients u and basis v. 90 | print('u =', u.get_value()) 91 | print('v =', v.get_value()) 92 | 93 | If you prefer to maintain more control over your model during optimization, 94 | downhill provides an iterative optimization interface: 95 | 96 | .. code:: python 97 | 98 | opt = downhill.build(algo='rmsprop', 99 | loss=loss, 100 | monitors=monitors, 101 | monitor_gradients=True) 102 | 103 | for metrics, _ in opt.iterate(train=[[y]], 104 | patience=0, 105 | batch_size=A, 106 | max_gradient_norm=1, 107 | learning_rate=0.1): 108 | print(metrics) 109 | 110 | If that's still not enough, you can just plain ask downhill for the updates to 111 | your model variables and do everything else yourself: 112 | 113 | .. code:: python 114 | 115 | updates = downhill.build('rmsprop', loss).get_updates( 116 | batch_size=A, max_gradient_norm=1, learning_rate=0.1) 117 | func = theano.function([z], loss, updates=list(updates)) 118 | for _ in range(100): 119 | print(func(y)) # Evaluate func and apply variable updates. 120 | 121 | More Information 122 | ================ 123 | 124 | Source: http://github.com/lmjohns3/downhill 125 | 126 | Documentation: http://downhill.readthedocs.org 127 | 128 | Mailing list: https://groups.google.com/forum/#!forum/downhill-users 129 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make <target>' where <target> is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " zip to make standalone HTML files and zip them up" 23 | @echo " dirhtml to make HTML files named index.html in directories" 24 | @echo " singlehtml to make a single large HTML file" 25 | @echo " pickle to make pickle files" 26 | @echo " json to make JSON files" 27 | @echo " htmlhelp to make HTML files and a HTML help project" 28 | @echo " qthelp to make HTML files and a qthelp project" 29 | @echo " devhelp to make HTML files and a Devhelp project" 30 | @echo " epub to make an epub" 31 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 32 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 33 | @echo " text to make text files" 34 | @echo " man to make manual pages" 35 | @echo " texinfo to make Texinfo files" 36 | @echo " info to make Texinfo files and run them through makeinfo" 37 | @echo " gettext to make PO message catalogs" 38 | @echo " changes to make an overview of all changed/added/deprecated items" 39 | @echo " linkcheck to check all external links for integrity" 40 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 41 | 42 | clean: 43 | -rm -rf $(BUILDDIR)/* 44 | -rm docs.zip 45 | 46 | zip: html 47 | cd $(BUILDDIR)/html && zip -r docs.zip . && mv docs.zip ../.. 48 | 49 | html: 50 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 53 | 54 | dirhtml: 55 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 56 | @echo 57 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 58 | 59 | singlehtml: 60 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 61 | @echo 62 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 63 | 64 | pickle: 65 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 66 | @echo 67 | @echo "Build finished; now you can process the pickle files." 68 | 69 | json: 70 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 71 | @echo 72 | @echo "Build finished; now you can process the JSON files." 73 | 74 | htmlhelp: 75 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 76 | @echo 77 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 78 | ".hhp project file in $(BUILDDIR)/htmlhelp." 79 | 80 | qthelp: 81 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 82 | @echo 83 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 84 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 85 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/downhill.qhcp" 86 | @echo "To view the help file:" 87 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/downhill.qhc" 88 | 89 | devhelp: 90 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 91 | @echo 92 | @echo "Build finished." 93 | @echo "To view the help file:" 94 | @echo "# mkdir -p $$HOME/.local/share/devhelp/downhill" 95 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/downhill" 96 | @echo "# devhelp" 97 | 98 | epub: 99 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 100 | @echo 101 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
102 | 103 | latex: 104 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 105 | @echo 106 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 107 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 108 | "(use \`make latexpdf' here to do that automatically)." 109 | 110 | latexpdf: 111 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 112 | @echo "Running LaTeX files through pdflatex..." 113 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 114 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 115 | 116 | text: 117 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 118 | @echo 119 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 120 | 121 | man: 122 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 123 | @echo 124 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 125 | 126 | texinfo: 127 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 128 | @echo 129 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 130 | @echo "Run \`make' in that directory to run these through makeinfo" \ 131 | "(use \`make info' here to do that automatically)." 132 | 133 | info: 134 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 135 | @echo "Running Texinfo files through makeinfo..." 136 | make -C $(BUILDDIR)/texinfo info 137 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 138 | 139 | gettext: 140 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 141 | @echo 142 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 143 | 144 | changes: 145 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 146 | @echo 147 | @echo "The overview file is in $(BUILDDIR)/changes." 148 | 149 | linkcheck: 150 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 151 | @echo 152 | @echo "Link check complete; look for any errors in the above output " \ 153 | "or in $(BUILDDIR)/linkcheck/output.txt." 154 | 155 | doctest: 156 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 157 | @echo "Testing of doctests in the sources finished, look at the " \ 158 | "results in $(BUILDDIR)/doctest/output.txt." 
159 | -------------------------------------------------------------------------------- /docs/_static/rosenbrock-nag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmjohns3/downhill/42111ab03b5e6fa47b7bf7c7cb5caa402f10ce6d/docs/_static/rosenbrock-nag.png -------------------------------------------------------------------------------- /docs/_static/rosenbrock-nag.py: -------------------------------------------------------------------------------- 1 | import downhill 2 | import matplotlib.pyplot as plt 3 | from mpl_toolkits.mplot3d import Axes3D 4 | import numpy as np 5 | import theano 6 | 7 | x = theano.shared(np.array([-1, 0], 'f'), name='x') 8 | 9 | opt = downhill.build( 10 | 'nag', 11 | loss=(100 * (x[1:] - x[:-1] ** 2) ** 2 + (1 - x[:-1]) ** 2).sum(), 12 | params=[x], 13 | inputs=[], 14 | monitors=[('x', x[:-1].sum()), ('y', x[1:].sum())], 15 | monitor_gradients=True) 16 | 17 | xs, ys, loss = [], [], [] 18 | for tm, _ in opt.iterate([[]], 19 | learning_rate=0.001, 20 | momentum=0.95, 21 | max_gradient_norm=100): 22 | xs.append(tm['x']) 23 | ys.append(tm['y']) 24 | loss.append(tm['loss']) 25 | if len(loss) == 300: 26 | break 27 | 28 | ax = plt.axes(projection='3d') 29 | 30 | c = '#d62728' 31 | ax.plot(xs, ys, zs=loss, linestyle='-', 32 | marker='o', color=c, mec=c, mfc='none', 33 | lw=3, mew=0.5, markersize=7, alpha=0.7) 34 | 35 | X, Y = np.meshgrid(np.linspace(-1.1, 1.1, 127), np.linspace(-0.5, 1.7, 127)) 36 | Z = 100 * (Y - X ** 2) ** 2 + (1 - X) ** 2 37 | ax.plot_surface(X, Y, Z, cmap='YlGnBu', lw=0, rstride=4, cstride=4, alpha=0.9) 38 | ax.plot_wireframe(X, Y, Z, lw=0.5, rstride=4, cstride=4, color='#333333', alpha=0.7) 39 | ax.plot([1], [1], zs=[1], marker='x', mew=3, markersize=10, color='#111111') 40 | 41 | ax.set_xlim(-1.1, 1.1) 42 | ax.set_ylim(-0.5, 1.7) 43 | ax.view_init(azim=10, elev=45) 44 | 45 | ax.w_xaxis.set_pane_color((1, 1, 1, 1)) 46 | ax.w_yaxis.set_pane_color((1, 1, 1, 1)) 47 | ax.w_zaxis.set_pane_color((1, 1, 1, 1)) 48 | 49 | plt.savefig('rosenbrock-nag.png') 50 | plt.show() 51 | -------------------------------------------------------------------------------- /docs/_static/style-tweaks.css: -------------------------------------------------------------------------------- 1 | a, a:visited { color: #258; } 2 | a tt, a:visited tt, a:active tt { color: #258; } 3 | 4 | .banana { float: right; max-width: 45%; } 5 | .banana img { width: 100%; } 6 | 7 | pre { font-size: 0.9rem; line-height: 1.25; } 8 | span.pre { background: #eee; font-size: 0.95rem; padding: 0.1rem 0.2rem; } 9 | 10 | a.internal span.pre { 11 | background: inherit; 12 | font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; 13 | font-size: inherit; 14 | padding: inherit; 15 | } 16 | 17 | th.field-name { background: #ffd; } 18 | 19 | dl.method dt { background: #def; } 20 | -------------------------------------------------------------------------------- /docs/_templates/gitwidgets.html: -------------------------------------------------------------------------------- 1 |
2 | 3 |
4 | 5 | 6 | 7 |
8 |
9 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | import better 2 | 3 | extensions = [ 4 | 'sphinx.ext.autodoc', 5 | 'sphinx.ext.autosummary', 6 | 'sphinx.ext.intersphinx', 7 | 'sphinx.ext.mathjax', 8 | 'sphinx.ext.viewcode', 9 | 'numpydoc', 10 | ] 11 | autosummary_generate = True 12 | autodoc_default_flags = ['members'] 13 | numpydoc_show_class_members = False 14 | numpydoc_show_inherited_class_members = True 15 | source_suffix = '.rst' 16 | source_encoding = 'utf-8-sig' 17 | master_doc = 'index' 18 | project = u'Downhill' 19 | copyright = u'2015, Leif Johnson' 20 | version = '0.5' 21 | release = '0.5.0pre' 22 | exclude_patterns = ['_build'] 23 | templates_path = ['_templates'] 24 | pygments_style = 'tango' 25 | 26 | html_theme = 'better' 27 | html_theme_path = [better.better_theme_path] 28 | html_theme_options = dict( 29 | rightsidebar=False, 30 | inlinecss='', 31 | cssfiles=['_static/style-tweaks.css'], 32 | showheader=True, 33 | showrelbartop=True, 34 | showrelbarbottom=True, 35 | linktotheme=True, 36 | sidebarwidth='15rem', 37 | textcolor='#111', 38 | headtextcolor='#333', 39 | footertextcolor='#333', 40 | ga_ua='', 41 | ga_domain='', 42 | ) 43 | html_short_title = 'Home' 44 | html_static_path = ['_static'] 45 | 46 | 47 | def h(xs): 48 | return ['{}.html'.format(x) for x in xs.split()] 49 | 50 | html_sidebars = { 51 | 'index': h('gitwidgets globaltoc sourcelink searchbox'), 52 | '**': h('gitwidgets localtoc sourcelink searchbox'), 53 | } 54 | 55 | intersphinx_mapping = { 56 | 'python': ('https://docs.python.org/3.4/', None), 57 | 'numpy': ('http://docs.scipy.org/doc/numpy/', None), 58 | 'scipy': ('http://docs.scipy.org/doc/scipy/reference/', None), 59 | } 60 | -------------------------------------------------------------------------------- /docs/guide.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | User Guide 3 | ========== 4 | 5 | You are probably reading this guide because you have a problem. 6 | 7 | There are many problems in the world, and many ways of thinking about solving 8 | them. Happily, some---many---problems can be described mathematically using a 9 | "loss" function, which takes a potential solution for your problem and returns a 10 | single number indicating how terrible that solution is. 11 | 12 | If you can express your problem using a loss function, then it's possible---even 13 | likely---that you can then use a computer to solve your problem for you. This is 14 | what ``downhill`` does: given a computational formulation of a loss, the 15 | optimization routines in ``downhill`` can compute a series of ever-better 16 | solutions to your problem. 17 | 18 | This guide describes how that works. 19 | 20 | .. _creating-loss: 21 | 22 | Creating a Loss 23 | =============== 24 | 25 | Many types of problems can be formulated in terms of a scalar `"loss" function`_ 26 | that ought to be minimized. The "loss" for a problem: 27 | 28 | - is computed with respect to a potential solution to a problem, and 29 | - is a scalar quantity---just a single number. 30 | 31 | A few examples of problems and their associated losses might include: 32 | 33 | - Categorizing pictures into "elephants" versus "acrobats"; the loss might be 34 | the number of mistakes that are made on a given set of test pictures. 
35 | - Allocating funds to provide a given set of public services; the loss might be 36 | the monetary cost of the budget. 37 | - Computing the actions of a robot to achieve a goal; the loss might be the 38 | total energy consumed. 39 | 40 | This guide will use linear regression as a running example. Suppose you've made 41 | some measurements of, say, the sizes and prices of various houses for sale where 42 | you live. You want to describe the relationship between the size (let's 43 | represent it as :math:`x_i`) and the price (:math:`y_i`) by fitting a line to 44 | the measurements you've made. 45 | 46 | So you need to take the data points that you collected and somehow use them to 47 | compute a slope :math:`m` and an intercept :math:`b` such that the resulting 48 | line :math:`y = m x + b` passes as closely as possible to your data points. In 49 | this example, the loss :math:`\mathcal{L}` might be expressed as the sum of the 50 | squared differences between the values on the line and the observed data: 51 | 52 | .. math:: 53 | \mathcal{L}(m,b) = \sum_{i=1}^N ( m x_i + b - y_i )^2 54 | 55 | .. _"loss" function: https://en.wikipedia.org/wiki/Loss_function 56 | 57 | Using Theano 58 | ------------ 59 | 60 | Well, you've formulated a loss for this regression problem. Now it's time to use 61 | ``downhill`` to minimize it, right? 62 | 63 | Not so fast ... the ``downhill`` package provides routines for optimizing scalar 64 | loss functions, but there's a catch: the loss functions must be defined using 65 | Theano_, a Python framework for describing computation graphs. Theano takes a 66 | bit of getting used to, but we'll walk through the linear regression example 67 | here; if you're curious, there are also lots of good tutorials_ on the Theano 68 | site. 69 | 70 | To use Theano with ``downhill``, you need to define `shared variables`_ for each 71 | of the parameters in your model, and `symbolic inputs`_ for the data that you'll 72 | use to evaluate your loss. We'll start with the shared variables:: 73 | 74 | import downhill 75 | import numpy as np 76 | import theano 77 | import theano.tensor as TT 78 | 79 | m = theano.shared(np.ones((1, ), 'f'), name='m') 80 | b = theano.shared(np.zeros((1, ), 'f'), name='b') 81 | 82 | This sets up a one-element vector containing 1 for :math:`m`, and a one-element 83 | vector containing 0 for :math:`b`. The values contained inside these shared variables will be adjusted 84 | automatically by the optimization algorithms in ``downhill``. 85 | 86 | Next, you need to define symbols that represent the data needed to compute 87 | the loss:: 88 | 89 | x = TT.vector('x') 90 | y = TT.vector('y') 91 | 92 | These symbolic vectors represent the inputs---the house sizes :math:`[x_1 \dots 93 | x_N]` and prices :math:`[y_1 \dots y_N]`---needed to compute the loss. Finally, 94 | having created all of these symbolic variables, you can define the loss itself:: 95 | 96 | loss = TT.sqr(m * x + b - y).sum() 97 | 98 | This tells Theano to multiply the data vector ``x`` by the value stored in the 99 | shared ``m`` variable, add the value stored in the shared ``b`` variable, and 100 | then subtract the data vector ``y``. Then that vector gets squared elementwise, 101 | and all of the components of the result get summed up to produce the loss. 102 | 103 | Note that none of these operations have actually been computed; instead, you've 104 | instructed Theano *how* to compute the loss, if you were to give it some values 105 | for ``x`` and ``y``. This is the bizarre thing about Theano: it looks like 106 | you're computing things, but you're actually just telling the computer how to 107 | compute things in the future. 108 |
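If you want to see this deferred computation in action, you can compile the loss into a callable and evaluate it on concrete arrays. This step is purely illustrative, since ``downhill`` compiles the functions it needs for you; the sketch also assumes that Theano's ``floatX`` is ``float32``, so that the example arrays match the shared variables defined above::

    # Compile the symbolic loss into an executable function of x and y.
    f = theano.function([x, y], loss)

    # Only now does any computation happen, using the current values of m and b.
    print(f(np.array([1, 2], 'f'), np.array([3, 4], 'f')))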
109 | .. _Theano: http://deeplearning.net/software/theano/ 110 | .. _tutorials: http://deeplearning.net/software/theano/tutorial/index.html 111 | .. _shared variables: http://deeplearning.net/software/theano/tutorial/examples.html#using-shared-variables 112 | .. _symbolic inputs: http://deeplearning.net/software/theano/tutorial/adding.html 113 | 114 | .. _minimizing-loss: 115 | 116 | Minimizing a Loss 117 | ================= 118 | 119 | The ``downhill`` package provides a single high-level function, 120 | :func:`downhill.minimize`, that can be used as a black-box optimizer for losses. 121 | In addition, there are lower-level calls that provide more control over the 122 | interaction between your code and ``downhill``. First, we'll look at the 123 | high-level minimize function, then we'll talk about what happens under the hood. 124 | 125 | Once you've defined your loss using Theano, you can minimize it with a single 126 | function call. Here, we'll minimize the loss defined above:: 127 | 128 | downhill.minimize(loss, [sizes, prices], inputs=[x, y]) 129 | 130 | You just specify the loss to minimize, provide some data to use for computing 131 | the loss, and identify the symbolic inputs that the loss requires. The 132 | ``downhill`` code will select an optimization algorithm (the default is 133 | currently :class:`RMSProp <downhill.adaptive.RMSProp>`), identify shared 134 | variables in the loss that need optimization, and run the optimization process 135 | to completion. After the minimization has finished, the shared variables in your 136 | loss will be updated to their optimal values. You can retrieve their values 137 | using any of the methods of `shared variables`_:: 138 | 139 | m_value, b_value = m.get_value(), b.get_value() 140 | 141 | There is much to say about providing data---see :ref:`providing-data` for more 142 | information---but briefly, the data you will need to provide is typically a list 143 | of ``numpy`` arrays of the measurements you've made for your problem. For the 144 | house price regression example, the arrays for house size and house price might 145 | be set up like this:: 146 | 147 | sizes = np.array([1200, 2013, 8129, 2431, 2211]) 148 | prices = np.array([103020, 203310, 3922013, 224321, 449020]) 149 | 150 | .. _training-validation: 151 | 152 | Training and Validation 153 | ----------------------- 154 | 155 | You might have noticed that the formulation of the loss given at the top of this 156 | guide contains a sum over all of the data points that you've observed 157 | :math:`(x_i, y_i)`. (For the house price example, these data are stored in the 158 | ``sizes`` and ``prices`` arrays.) This is a very common state of affairs for 159 | many problems: the loss is computed from observed data. 160 | 161 | But for a typical regression problem, it's not feasible or even possible to 162 | gather *all* of the relevant data---either it's too expensive to do that, or 163 | there might be new data created in the future that you don't have any way of 164 | predicting. 165 | 166 | Given this paucity of data, you're running a risk in using a stochastic 167 | optimizer to solve your problem: the data that you *have* collected might not be 168 | representative of the data that you *haven't* collected!
If the data you 169 | collected are quite different from the "true" data out there in the world, then 170 | when you optimize your loss, the optimal model might be skewed toward your 171 | dataset, and your model might not perform well on new, "unseen" data. 172 | 173 | This problem is generally referred to as overfitting_ and is a risk with many 174 | types of models. Generally the risk of overfitting increases with the complexity 175 | of your model, and also increases when you don't have a lot of data. 176 | 177 | There are many ways to combat overfitting: 178 | 179 | - You can tighten your belt and gather more data, which increases the chance 180 | that the data you do have will be representative of data you don't yet have. 181 | 182 | - You can regularize_ your loss; this tends to encourage some solutions to your 183 | problem (e.g., solutions with small parameter values) and discourage others 184 | (e.g., solutions that "memorize" outliers). 185 | 186 | - You can also set aside a bit of the data you've collected as a "validation" 187 | set. You can use this set to stop the optimization process when the 188 | performance of your model on the validation set stops improving---this is 189 | known as "early stopping." 190 | 191 | Collecting more data is almost always a good idea, as long as you can afford to 192 | do so (whether in terms of time, monetary cost, etc.)---but ``downhill`` can't 193 | help you with that. And while it can often be a good idea to incorporate 194 | regularizers into your loss, doing so is something of an art and remains outside 195 | the scope of ``downhill``. 196 | 197 | .. _overfitting: https://en.wikipedia.org/wiki/Overfitting 198 | .. _regularize: https://en.wikipedia.org/wiki/Regularization_(mathematics) 199 | 200 | .. _early-stopping: 201 | 202 | Early Stopping 203 | -------------- 204 | 205 | The algorithms in ``downhill`` implement the "early stopping" regularization 206 | method. To take advantage of it, just provide a second set of data when 207 | minimizing your loss:: 208 | 209 | downhill.minimize(loss, [sizes, prices], [valid_sizes, valid_prices]) 210 | 211 | Here we'll assume that you've gathered another few sizes and prices and put them 212 | in a new pair of ``numpy`` arrays. In practice, the validation dataset can also 213 | just be a small bit (10% or so) of the training data you've collected. Either 214 | way, it's important to make sure the validation data is disjoint from the 215 | training data, so that the validation loss gives an honest estimate of your 216 | model's performance on unseen data. The idea is that you want to use a small 217 | part of the data you've gathered as a sort of canary_ to predict how well your 218 | model will perform when you actually take it out into the world and use it. 219 | 220 | .. _canary: https://en.wikipedia.org/wiki/Animal_sentinel#Historical_examples 221 | 222 | The early stopping method will cause optimization to halt when the loss stops 223 | improving on the validation dataset. If you do not specify a validation dataset, 224 | the training dataset will also be used for validation, which effectively 225 | disables early stopping---that is, optimization will halt whenever the loss 226 | computed on the training dataset stops improving. 227 | 228 | To understand this better, we'll take a look at the lower-level API provided by 229 | ``downhill``. 230 |
231 | .. _iterative-optimization: 232 | 233 | Iterative Optimization 234 | ---------------------- 235 | 236 | The :func:`downhill.minimize` function is actually just a wrapper that performs 237 | a few common lower-level tasks to optimize your loss. These tasks include: 238 | 239 | - creating :class:`datasets <downhill.dataset.Dataset>` to wrap your data, 240 | - creating an :class:`Optimizer <downhill.base.Optimizer>`, and 241 | - running the optimizer to completion. 242 | 243 | You can perform these tasks yourself to retain more control over the 244 | optimization process, but even if you don't, it's useful to follow the process 245 | to know how it works. In practice it can often be useful to call the 246 | :func:`iterate() <downhill.base.Optimizer.iterate>` method yourself, because it 247 | gives you access to the state of the optimizer at each step. 248 | 249 | To learn more about this, have a look at the following example:: 250 | 251 | opt = downhill.build('rmsprop', loss=loss, inputs=[x, y]) 252 | train = downhill.Dataset([sizes, prices]) 253 | valid = downhill.Dataset([valid_sizes, valid_prices]) 254 | for tm, vm in opt.iterate(train, valid): 255 | print('training loss:', tm['loss']) 256 | print('most recent validation loss:', vm['loss']) 257 | 258 | This code constructs an :class:`Optimizer <downhill.base.Optimizer>` object 259 | (specifically, an :class:`RMSProp optimizer <downhill.adaptive.RMSProp>`), wraps 260 | the input data with a :class:`Dataset <downhill.dataset.Dataset>`, and then 261 | steps through the optimization process iteratively. 262 | 263 | Notice that after each iteration, the optimizer yields a pair of dictionaries to 264 | the caller: the first dictionary contains measured values of the loss on the 265 | training data during that iteration, and the second contains measured values of 266 | the loss on the validation data. 267 | 268 | The keys and values in each of these dictionaries give the costs and monitors 269 | that are computed during optimization. There will always be a ``'loss'`` key in 270 | each dictionary that gives the value of the loss function being optimized. In 271 | addition, any :ref:`monitor values <monitoring>` that were defined when creating 272 | the optimizer will also be provided in these dictionaries. 273 | 274 | .. _batches-epochs: 275 | 276 | Batches and Epochs 277 | ------------------ 278 | 279 | During each iteration, the optimizer instance processes training data in small 280 | pieces called "mini-batches"; each mini-batch is used to compute a gradient 281 | estimate for the loss, and the parameters are updated by a small amount. After a 282 | fixed number of mini-batches have been processed, the ``iterate`` method yields 283 | the loss dictionaries to the caller. 284 | 285 | Each group of parameter updates processed during a single iteration is called an 286 | "epoch." After a fixed number of epochs have taken place, the loss is evaluated 287 | using a fixed number of mini-batches from the validation dataset; the resulting 288 | validation dictionary is then yielded unchanged with each subsequent epoch until 289 | the next validation happens. 290 | 291 | Optimization epochs continue to occur, with occasional validations, until the 292 | loss on the validation dataset has failed to make sufficient progress for long 293 | enough; at that point, optimization halts. 294 | 295 | There are a number of hyperparameters involved in this process, which can be 296 | tuned for the best performance on your problem.
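To make the cadence concrete: if each epoch processes 10 mini-batches and a validation happens every 5 epochs, then ``iterate`` yields once per 10 parameter updates, and the validation dictionary it yields is recomputed once per 50 parameter updates.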
There are many different settings for mini-batch optimization and 305 | validation, many optimization algorithms are available, and there are also 306 | several common learning hyperparameters that might require tuning. 307 | 308 | .. _batch-parameters: 309 | 310 | Batch Parameters 311 | ---------------- 312 | 313 | All algorithms in ``downhill`` provide early stopping and use :ref:`epoch-based 314 | optimization ` as described above. This process is controlled by 315 | a number of parameters that can be tweaked for your optimization problem. 316 | 317 | The size of a minibatch is controlled using the ``batch_size`` parameter when 318 | you create a :class:`Dataset `. To build mini-batches 319 | containing 3 pieces of data, for example:: 320 | 321 | train = downhill.Dataset([sizes, prices], batch_size=3) 322 | 323 | If you call the high-level :func:`downhill.minimize` method directly, you can 324 | pass ``batch_size`` to it directly:: 325 | 326 | downhill.minimize(loss, [sizes, prices], batch_size=3) 327 | 328 | The number of mini-batches that are processed during a single training epoch is 329 | controlled by the ``iteration_size`` parameter when constructing a ``Dataset``:: 330 | 331 | train = downhill.Dataset([sizes, prices], iteration_size=10) 332 | 333 | This will ensure that one iteration loop over the training dataset will produce 334 | 10 mini-batches. If you have fewer than ``batch_size`` times ``iteration_size`` 335 | pieces of data, the ``Dataset`` class will loop over your data multiple times to 336 | ensure that the desired number of batches is processed. (The ``Dataset`` class 337 | also handles shuffling your data as needed during iteration, to avoid issues 338 | that can come up when presenting data to the model in a fixed order.) 339 | 340 | If you call the high-level :func:`downhill.minimize` method, the numbers of 341 | training and validation mini-batches processed per epoch are set using the 342 | ``train_batches`` and ``valid_batches`` parameters, respectively:: 343 | 344 | downhill.minimize(..., train_batches=10, valid_batches=8) 345 | 346 | Finally, a validation takes place after a fixed number of training epochs have 347 | happened. This number is set using the ``validate_every`` parameter; for 348 | example, to validate the loss every 5 training epochs:: 349 | 350 | downhill.minimize(..., validate_every=5) 351 | 352 | If you are processing data using the lower-level API, the ``validate_every`` 353 | parameter is passed directly to :func:`iterate() 354 | `:: 355 | 356 | for tm, vm in opt.iterate(..., validate_every=5): 357 | # ... 358 | 359 | .. _patience-improvement: 360 | 361 | Patience and Improvement 362 | ------------------------ 363 | 364 | The training process halts if there is "insufficient" progress on the validation 365 | loss for "long enough." The precise meanings of these terms are given by the 366 | ``min_improvement`` and ``patience`` parameters, respectively. 367 | 368 | The ``min_improvement`` parameter specifies the minimum relative improvement of 369 | the validation loss that counts as progress in the optimization. If 370 | ``min_improvement`` is set to 0, for example, then any positive improvement in 371 | the validation loss will count as progress, while if ``min_improvement`` is set 372 | to 0.1, then the validation loss must improve by 10% relative to the current 373 | best validation loss before the validation attempt counts as progress. 
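As a worked example, suppose the best validation loss seen so far is 2.0 and ``min_improvement`` is set to 0.1. A subsequent validation then counts as progress only if it produces a loss below 1.8, that is, a relative improvement of at least 10%.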
374 | 375 | The ``patience`` parameter specifies the number of failed validation attempts 376 | that you are willing to tolerate before halting the optimization. If ``patience`` is 377 | set to 0, for instance, then optimization will halt as soon as a validation 378 | attempt fails to make ``min_improvement`` relative loss improvement over the 379 | best validation loss so far. If ``patience`` is set to 3, then optimization will 380 | continue through three failed validation attempts, but if the fourth validation 381 | attempt fails, then optimization will halt. 382 | 383 | These parameters can be set either on a call to the high-level 384 | :func:`downhill.minimize` function:: 385 | 386 | downhill.minimize(..., patience=3, min_improvement=0.1) 387 | 388 | or when calling :func:`iterate() <downhill.base.Optimizer.iterate>`:: 389 | 390 | for tm, vm in opt.iterate(..., patience=3, min_improvement=0.1): 391 | # ... 392 | 393 | .. _algorithm: 394 | 395 | Optimization Algorithms 396 | ----------------------- 397 | 398 | The following algorithms are currently available in ``downhill``: 399 | 400 | - ``'adadelta'`` --- :class:`ADADELTA <downhill.adaptive.ADADELTA>` 401 | - ``'adagrad'`` --- :class:`ADAGRAD <downhill.adaptive.ADAGRAD>` 402 | - ``'adam'`` --- :class:`Adam <downhill.adaptive.Adam>` 403 | - ``'esgd'`` --- :class:`Equilibrated SGD <downhill.adaptive.ESGD>` 404 | - ``'nag'`` --- :class:`Nesterov's Accelerated Gradient <downhill.first_order.NAG>` 405 | - ``'rmsprop'`` --- :class:`RMSProp <downhill.adaptive.RMSProp>` 406 | - ``'rprop'`` --- :class:`Resilient Backpropagation <downhill.adaptive.RProp>` 407 | - ``'sgd'`` --- :class:`Stochastic Gradient Descent <downhill.first_order.SGD>` 408 | 409 | To select an algorithm, specify its name using the ``algo`` keyword argument:: 410 | 411 | downhill.minimize(..., algo='adadelta') 412 | 413 | or pass the algorithm name to build an :class:`Optimizer 414 | <downhill.base.Optimizer>` instance:: 415 | 416 | opt = downhill.build('adadelta', ...) 417 | 418 | Different algorithms have different performance characteristics, different 419 | numbers of hyperparameters to tune, and different suitability for particular 420 | problems. In general, several of the adaptive procedures seem to work well 421 | across different problems, particularly :class:`Adam <downhill.adaptive.Adam>`, 422 | :class:`ADADELTA <downhill.adaptive.ADADELTA>`, and :class:`RMSProp 423 | <downhill.adaptive.RMSProp>`. :class:`NAG <downhill.first_order.NAG>` also seems 424 | to work quite well, but can sometimes take longer to converge. 425 | 426 | Many of these algorithms, being based on stochastic gradient descent, rely on a 427 | common set of hyperparameters that control the speed of convergence and the 428 | reliability of the optimization process over time; these parameters are 429 | discussed next. 430 | 431 | .. _learning-rate: 432 | 433 | Learning Rate 434 | ------------- 435 | 436 | Most stochastic gradient optimization methods make small parameter updates based 437 | on the local gradient of the loss at each step in the optimization procedure. 438 | Intuitively, parameters in a model are updated by subtracting a small portion of 439 | the local derivative from the current parameter value. Mathematically, this is 440 | written as: 441 | 442 | .. math:: 443 | 444 | \theta_{t+1} = \theta_t - \alpha \left. 
445 | \frac{\partial\mathcal{L}}{\partial\theta} \right|_{\theta_t} 446 | 447 | where :math:`\mathcal{L}` is the loss function being optimized, :math:`\theta` 448 | is the value of a parameter in the model (e.g., :math:`m` or :math:`b` for the 449 | regression problem) at optimization step :math:`t`, :math:`\alpha` is the 450 | learning rate, and :math:`\frac{\partial\mathcal{L}}{\partial\theta}` (also 451 | often written :math:`\nabla_{\theta_t}\mathcal{L}`) is the partial derivative of 452 | the loss with respect to the parameters, evaluated at the current value of those 453 | parameters. 454 | 455 | The learning rate :math:`\alpha` specifies the scale of these parameter updates 456 | with respect to the magnitude of the gradient. Almost all stochastic optimizers 457 | use a fixed learning rate parameter. 458 | 459 | In ``downhill``, the learning rate is passed as a keyword argument to 460 | :func:`downhill.minimize`:: 461 | 462 | downhill.minimize(..., learning_rate=0.1) 463 | 464 | Often the learning rate is set to a very small value---many approaches seem to 465 | start with values around 1e-4. If the learning rate is too large, the 466 | optimization procedure might "bounce around" in the loss landscape because the 467 | parameter steps are too large. If the learning rate is too small, the 468 | optimization procedure might not make progress quickly enough to make 469 | optimization practical. 470 | 471 | .. _momentum: 472 | 473 | Momentum 474 | -------- 475 | 476 | Momentum is a common technique in stochastic gradient optimization algorithms 477 | that seems to accelerate the optimization process in most cases. Intuitively, 478 | momentum avoids "jitter" in the parameters during optimization by smoothing the 479 | estimates of the local gradient information over time. In practice a momentum 480 | method maintains a "velocity" of the most recent parameter steps and combines 481 | these recent individual steps together when making a parameter update. 482 | Mathematically, this is written: 483 | 484 | .. math:: 485 | 486 | \begin{eqnarray*} 487 | \nu_{t+1} &=& \mu \nu_t - \alpha \left. \frac{\partial\mathcal{L}}{\partial\theta} \right|_{\theta_t} \\ 488 | \theta_{t+1} &=& \theta_t + \nu_{t+1} 489 | \end{eqnarray*} 490 | 491 | where the symbols are the same as above, and additionally :math:`\nu` describes 492 | the "velocity" of parameter :math:`\theta`, and :math:`\mu` is the momentum 493 | hyperparameter. The gradient computations using momentum are exactly the same as 494 | when not using momentum; the only difference is the accumulation of recent 495 | updates in the "velocity." 496 | 497 | In ``downhill``, the momentum value is passed as a keyword argument to 498 | :func:`downhill.minimize`:: 499 | 500 | downhill.minimize(..., momentum=0.9) 501 | 502 | Typically momentum is set to a value in :math:`[0, 1)`---when set to 0, momentum 503 | is disabled, and when set to values near 1, the momentum is very high, requiring 504 | several consecutive parameter updates in the same direction to change the 505 | parameter velocity. 506 | 507 | In many problems it is useful to set the momentum to a surprisingly large value, 508 | sometimes even to values greater than 0.9. Such values can be especially 509 | effective with a relatively small learning rate. 510 | 511 | If the momentum is set too low, then parameter updates will be more noisy and 512 | optimization might take longer to converge, but if the momentum is set too high, 513 | the optimization process might diverge entirely. 
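To make the velocity bookkeeping concrete, here is the same update rule written out in plain ``numpy``. This is only a sketch for intuition: ``downhill`` performs the equivalent computation symbolically inside its optimizers, and the function name here is illustrative::

    import numpy as np

    def momentum_step(theta, velocity, grad, alpha=0.001, mu=0.9):
        # The velocity accumulates a decaying sum of recent gradient steps ...
        velocity = mu * velocity - alpha * grad
        # ... and the parameters move along the velocity, not the raw gradient.
        return theta + velocity, velocity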
514 | 515 | Nesterov Momentum 516 | ----------------- 517 | 518 | A newer momentum technique, originally proposed by Y. Nesterov, has been shown 519 | to outperform "traditional" momentum in many settings. This technique 520 | effectively amounts to computing the momentum value at a 521 | different location in the parameter space, namely the location where the 522 | momentum value would have placed the parameter after the current update: 523 | 524 | .. math:: 525 | \begin{eqnarray*} 526 | \nu_{t+1} &=& \mu \nu_t - \alpha \left. 527 | \frac{\partial\mathcal{L}}{\partial\theta}\right|_{\theta_t + \mu\nu_t} \\ 528 | \theta_{t+1} &=& \theta_t + \nu_{t+1} 529 | \end{eqnarray*} 530 | 531 | Note that the partial derivative is evaluated at :math:`\theta_t + \mu\nu_t` 532 | instead of at :math:`\theta_t`. The intuitive rationale for this change is that 533 | if the momentum would have produced an "overshoot," then the gradient at this 534 | overshot parameter value would point backwards, toward the previous parameter 535 | value, which would thus help correct oscillations during optimization. 536 | 537 | To use Nesterov-style momentum, use either the :class:`NAG 538 | <downhill.first_order.NAG>` optimizer (which uses plain stochastic gradient 539 | descent with Nesterov momentum), or specify ``nesterov=True`` in addition to 540 | providing a nonzero ``momentum`` value when minimizing your loss:: 541 | 542 | downhill.minimize(..., momentum=0.9, nesterov=True) 543 | 544 | .. _gradient-clipping: 545 | 546 | Gradient Clipping 547 | ----------------- 548 | 549 | Sometimes during the execution of a stochastic optimization routine---and 550 | particularly at the start of optimization, when the problem parameters are far 551 | from their optimal values---the gradient of the loss with respect to the 552 | parameters can be extremely large. In these cases, taking a step that is 553 | proportional to the magnitude of the gradient can actually be harmful, resulting 554 | in an unpredictable parameter change. 555 | 556 | To prevent this from happening, but still preserve the iterative loss 557 | improvements when parameters are in a region with "more reasonable" gradient 558 | magnitudes, ``downhill`` implements two forms of "gradient clipping." 559 | 560 | The first gradient truncation method rescales the entire gradient vector if its 561 | L2 norm exceeds some threshold. This is accomplished using the 562 | ``max_gradient_norm`` hyperparameter:: 563 | 564 | downhill.minimize(..., max_gradient_norm=1) 565 | 566 | The second gradient truncation method clips each element of the gradient vector 567 | individually. This is accomplished using the ``max_gradient_elem`` 568 | hyperparameter:: 569 | 570 | downhill.minimize(..., max_gradient_elem=1) 571 | 572 | In both cases, extremely large gradients are rescaled to avoid steps that are 573 | too large: norm-based rescaling preserves the direction of the gradient exactly, 574 | while element-wise clipping may alter it slightly. Gradients with values smaller 575 | than the thresholds (presumably, gradients near an optimum will be small) are 576 | not affected, so the usual strategy of taking small steps proportional to the gradient is preserved near a solution.
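The effect of the two truncation rules is easy to state in plain ``numpy``. This sketch is illustrative only; ``downhill`` applies the equivalent rescaling symbolically when it builds its update expressions::

    import numpy as np

    def clip_norm(grad, max_norm):
        # Rescale the whole gradient if its L2 norm exceeds the threshold.
        norm = np.sqrt((grad * grad).sum())
        return grad * max_norm / norm if norm > max_norm else grad

    def clip_elem(grad, max_elem):
        # Clip each element of the gradient independently.
        return np.clip(grad, -max_elem, max_elem)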
577 | 578 | .. _providing-data: 579 | 580 | Providing Data 581 | ============== 582 | 583 | As described above, you'll often need to provide data to ``downhill`` so that 584 | you can compute the loss and optimize the parameters for your problem. There are 585 | two ways of passing data to ``downhill``: using arrays and using callables. 586 | 587 | .. _data-using-arrays: 588 | 589 | Using Arrays 590 | ------------ 591 | 592 | A fairly typical use case for optimizing a loss for a small-ish problem is to 593 | construct a ``numpy`` array containing the data you have:: 594 | 595 | dataset = np.load(filename) 596 | downhill.minimize(..., train=dataset) 597 | 598 | Sometimes the data available for optimizing a loss exceeds the available 599 | resources (e.g., memory) on the computer at hand. There are several ways of 600 | handling this type of situation. If your data are already in a ``numpy`` array 601 | stored on disk, you might want to try loading the array using ``mmap``:: 602 | 603 | dataset = np.load(filename, mmap_mode='r') 604 | downhill.minimize(..., train=dataset) 605 | 606 | Alternatively, you might want to load just part of the data and train on that, 607 | then load another part and train on it:: 608 | 609 | for filename in filenames: 610 | dataset = np.load(filename, mmap_mode='r') 611 | downhill.minimize(..., train=dataset) 612 | 613 | Finally, you can potentially handle large datasets by using a callable to 614 | provide data to the optimization algorithm. 615 | 616 | .. _data-using-callables: 617 | 618 | Using Callables 619 | --------------- 620 | 621 | Instead of an array of data, you can provide a callable for a :class:`Dataset 622 | <downhill.dataset.Dataset>`. This callable must take no arguments and must 623 | return a list of ``numpy`` arrays of the proper shape for your loss. 624 | 625 | During minimization, the callable will be invoked every time the optimization 626 | algorithm requires a batch of training (or validation) data. Therefore, your 627 | callable should return at least one array containing a batch of data; if your 628 | model requires multiple arrays per batch (e.g., if you are minimizing a loss 629 | that requires some "input" data as well as some "output" data), then your 630 | callable should return a list containing the correct number of arrays (e.g., an 631 | array of "inputs" and the corresponding "outputs"). 632 | 633 | For example, this code defines a ``batch()`` helper that could be used for a 634 | loss that needs one input. The callable chooses a random dataset and a random 635 | offset for each batch:: 636 | 637 | SOURCES = 'foo.npy', 'bar.npy', 'baz.npy' 638 | BATCH_SIZE = 64 639 | 640 | def batch(): 641 | X = np.load(np.random.choice(SOURCES), mmap_mode='r') 642 | # Choose an offset that leaves room for a full batch. 643 | i = np.random.randint(len(X) - BATCH_SIZE + 1) 644 | return X[i:i+BATCH_SIZE] 645 | 646 | downhill.minimize(..., train=batch) 647 | 648 | If you need to maintain more state than is reasonable from a single closure, you 649 | can also encapsulate the callable inside a class. Just make sure instances of 650 | the class are callable by defining the ``__call__`` method.
For example, this 651 | class loads data from a series of ``numpy`` arrays on disk, but only loads one 652 | of the on-disk arrays into memory at a given time:: 653 | 654 | class Loader: 655 | def __init__(self, sources=('foo.npy', 'bar.npy', 'baz.npy'), batch_size=64): 656 | self.sources = sources 657 | self.batch_size = batch_size 658 | self.src = -1 659 | self.idx = 0 660 | self.X = () 661 | 662 | def __call__(self): 663 | if self.idx + self.batch_size > len(self.X): 664 | self.idx = 0 665 | self.src = (self.src + 1) % len(self.sources) 666 | self.X = np.load(self.sources[self.src], mmap_mode='r') 667 | try: 668 | return self.X[self.idx:self.idx+self.batch_size] 669 | finally: 670 | self.idx += self.batch_size 671 | 672 | downhill.minimize(..., train=Loader()) 673 | 674 | There are almost limitless possibilities for using callables to interface with 675 | the optimization process. 676 | 677 | .. _monitoring: 678 | 679 | Monitoring 680 | ========== 681 | 682 | Sometimes while optimizing a loss, it can be helpful to "see inside" the model. 683 | In a model with a sparsity regularizer, for example, having some idea of the 684 | current sparsity of the model can help diagnose when the model is "too sparse." 685 | 686 | In ``downhill`` you can provide a series of *monitors* during optimization that 687 | satisfy this need. Monitors must be a series of named Theano expressions that 688 | evaluate to scalars; these can be provided as a dictionary that maps names to 689 | expressions, or as a list of (name, expression) ordered pairs. 690 | 691 | Suppose you want to monitor the slope and intercept values that your model is 692 | computing as it works its way through the house price modeling task. You can 693 | provide monitors for these quantities as follows:: 694 | 695 | downhill.minimize( 696 | loss, 697 | [sizes, prices], 698 | inputs=[x, y], 699 | monitors=[ 700 | ('m', m.sum()), 701 | ('b', b.sum()), 702 | ]) 703 | 704 | The Theano expressions here are sums because the ``m`` and ``b`` shared 705 | variables actually hold arrays (each containing a single element in this 706 | example). (This also helps generalize the regression loss to situations where 707 | you might have multiple independent variables, like house size and number of 708 | bedrooms.) If you preferred to provide the monitor values as a dictionary, it 709 | would look like:: 710 | 711 | downhill.minimize( 712 | loss, 713 | [sizes, prices], 714 | inputs=[x, y], 715 | monitors=dict(m=m.sum(), b=b.sum())) 716 | 717 | Note that if you construct an :class:`Optimizer <downhill.base.Optimizer>` 718 | directly, then you need to pass the monitors when you create your optimizer 719 | instance:: 720 | 721 | opt = downhill.build( 722 | 'nag', loss=loss, inputs=[x, y], 723 | monitors=dict(m=m.sum(), b=b.sum())) 724 | 725 | Gradients 726 | --------- 727 | 728 | Sometimes when setting parameters like ``learning_rate`` and 729 | ``max_gradient_norm``, it can be quite useful to see how large the gradients of 730 | your model are. These quantities can be included in the monitors easily by 731 | setting the ``monitor_gradients`` flag:: 732 | 733 | downhill.minimize( 734 | loss, 735 | [sizes, prices], 736 | inputs=[x, y], 737 | monitor_gradients=True) 738 | 739 | This will include one monitor for each parameter in your model, indicating the 740 | squared L2 norm of the gradient (averaged across mini-batches).
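When you drive the optimization yourself, the monitor values appear in the dictionaries yielded by :func:`iterate() <downhill.base.Optimizer.iterate>`, keyed by the monitor names. Here is a sketch, assuming the optimizer was built with the ``m`` and ``b`` monitors shown above and with ``train`` and ``valid`` datasets as in the iterative example::

    for tm, vm in opt.iterate(train, valid):
        print('slope:', tm['m'], 'intercept:', tm['b'], 'loss:', tm['loss'])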
741 | More Information 742 | ================ 743 | 744 | This concludes the ``downhill`` guide! Have a good time harnessing the power of 745 | your GPU to optimize your scalar losses! 746 | 747 | If you need more information or just want to discuss things, sign up for the 748 | `mailing list`_, and check out the project page at github_. 749 | 750 | .. _mailing list: https://groups.google.com/forum/#!forum/downhill-users 751 | .. _github: https://github.com/lmjohns3/downhill 752 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. figure:: _static/rosenbrock-nag.png 2 | :figclass: banana 3 | 4 | The ``downhill`` package provides algorithms for minimizing scalar loss 5 | functions that are defined using Theano_. 6 | 7 | Several optimization algorithms are included: 8 | 9 | - :class:`ADADELTA <downhill.adaptive.ADADELTA>` 10 | - :class:`ADAGRAD <downhill.adaptive.ADAGRAD>` 11 | - :class:`Adam <downhill.adaptive.Adam>` 12 | - :class:`Equilibrated SGD <downhill.adaptive.ESGD>` 13 | - :class:`Nesterov's Accelerated Gradient <downhill.first_order.NAG>` 14 | - :class:`RMSProp <downhill.adaptive.RMSProp>` 15 | - :class:`Resilient Backpropagation <downhill.adaptive.RProp>` 16 | - :class:`Stochastic Gradient Descent <downhill.first_order.SGD>` 17 | 18 | All algorithms permit the use of regular or Nesterov-style momentum as well. 19 | 20 | The source code for ``downhill`` lives at http://github.com/lmjohns3/downhill, 21 | the documentation lives at http://downhill.readthedocs.org, and announcements 22 | and discussion happen on the `mailing list`_. 23 | 24 | .. _Theano: http://deeplearning.net/software/theano/ 25 | .. _mailing list: https://groups.google.com/forum/#!forum/downhill-users 26 | 27 | Quick Start: Matrix Factorization 28 | ================================= 29 | 30 | Let's say you want to compute a sparse, low-rank approximation for some 31 | 1000-dimensional data that you have lying around. You can represent a batch of 32 | :math:`m` data points :math:`X \in \mathbb{R}^{m \times 1000}` as the product 33 | of a sparse coefficient matrix :math:`U \in \mathbb{R}^{m \times k}` and a 34 | low-rank basis matrix :math:`V \in \mathbb{R}^{k \times 1000}`. You might 35 | represent the loss as 36 | 37 | .. math:: 38 | 39 | \mathcal{L} = \| X - UV \|_2^2 + \alpha \| U \|_1 + \beta \| V \|_2 40 | 41 | where the first term represents the approximation error, the second represents 42 | the sparsity of the representation, and the third prevents the basis vectors 43 | from growing too large. 44 | 45 | This is pretty straightforward to model using Theano. Once you set up the 46 | appropriate variables and an expression for the loss, you can optimize the loss 47 | with respect to the variables using a single call to :func:`downhill.minimize`:: 48 | 49 | import downhill 50 | import numpy as np 51 | import theano 52 | import theano.tensor as TT 53 | 54 | FLOAT = 'df'[theano.config.floatX == 'float32'] 55 | 56 | def rand(a, b): 57 | return np.random.randn(a, b).astype(FLOAT) 58 | 59 | A, B, K = 20, 5, 3 60 | 61 | # Set up a matrix factorization problem to optimize. 62 | u = theano.shared(rand(A, K), name='u') 63 | v = theano.shared(rand(K, B), name='v') 64 | z = TT.matrix() 65 | err = TT.sqr(z - TT.dot(u, v)) 66 | loss = err.mean() + abs(u).mean() + (v * v).mean() 67 | 68 | # Minimize the regularized loss with respect to a data matrix. 69 | y = np.dot(rand(A, K), rand(K, B)) + rand(A, B) 70 | 71 | # Monitor during optimization. 72 | monitors = (('err', err.mean()), 73 | ('|u|<0.1', (abs(u) < 0.1).mean()), 74 | ('|v|<0.1', (abs(v) < 0.1).mean())) 75 | 76 | downhill.minimize( 77 | loss=loss, 78 | train=[y], 79 | patience=0, 80 | batch_size=A, # Process y as a single batch. 81 | max_gradient_norm=1, # Prevent gradient explosion!
82 |         learning_rate=0.1,
83 |         monitors=monitors,
84 |         monitor_gradients=True)
85 | 
86 |     # Print out the optimized coefficients u and basis v.
87 |     print('u =', u.get_value())
88 |     print('v =', v.get_value())
89 | 
90 | If you prefer to maintain more control over your model during optimization,
91 | ``downhill`` provides an iterative optimization interface::
92 | 
93 |     opt = downhill.build(algo='rmsprop',
94 |                          loss=loss,
95 |                          monitors=monitors,
96 |                          monitor_gradients=True)
97 | 
98 |     for metrics, _ in opt.iterate(train=[[y]],
99 |                                   patience=0,
100 |                                   batch_size=A,
101 |                                   max_gradient_norm=1,
102 |                                   learning_rate=0.1):
103 |         print(metrics)
104 | 
105 | If that's still not enough, you can just plain ask ``downhill`` for the updates
106 | to your model variables and do everything else yourself::
107 | 
108 |     updates = downhill.build('rmsprop', loss).get_updates(
109 |         batch_size=A, max_gradient_norm=1, learning_rate=0.1)
110 |     func = theano.function([z], loss, updates=list(updates))
111 |     for _ in range(100):
112 |         print(func(y))  # Evaluate func and apply variable updates.
113 | 
114 | Documentation
115 | =============
116 | 
117 | .. toctree::
118 |    :maxdepth: 2
119 | 
120 |    guide
121 |    reference
122 | 
123 | Indices and tables
124 | ==================
125 | 
126 | - :ref:`genindex`
127 | - :ref:`modindex`
128 | 
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 | 
3 | REM Command file for Sphinx documentation
4 | 
5 | if "%SPHINXBUILD%" == "" (
6 | 	set SPHINXBUILD=sphinx-build
7 | )
8 | set BUILDDIR=_build
9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
11 | if NOT "%PAPER%" == "" (
12 | 	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
13 | 	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
14 | )
15 | 
16 | if "%1" == "" goto help
17 | 
18 | if "%1" == "help" (
19 | 	:help
20 | 	echo.Please use `make ^<target^>` where ^<target^> is one of
21 | 	echo.  html       to make standalone HTML files
22 | 	echo.  dirhtml    to make HTML files named index.html in directories
23 | 	echo.  singlehtml to make a single large HTML file
24 | 	echo.  pickle     to make pickle files
25 | 	echo.  json       to make JSON files
26 | 	echo.  htmlhelp   to make HTML files and a HTML help project
27 | 	echo.  qthelp     to make HTML files and a qthelp project
28 | 	echo.  devhelp    to make HTML files and a Devhelp project
29 | 	echo.  epub       to make an epub
30 | 	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
31 | 	echo.  text       to make text files
32 | 	echo.  man        to make manual pages
33 | 	echo.  texinfo    to make Texinfo files
34 | 	echo.  gettext    to make PO message catalogs
35 | 	echo.  changes    to make an overview over all changed/added/deprecated items
36 | 	echo.  linkcheck  to check all external links for integrity
37 | 	echo.  doctest    to run all doctests embedded in the documentation if enabled
38 | 	goto end
39 | )
40 | 
41 | if "%1" == "clean" (
42 | 	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
43 | 	del /q /s %BUILDDIR%\*
44 | 	goto end
45 | )
46 | 
47 | if "%1" == "html" (
48 | 	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
49 | 	if errorlevel 1 exit /b 1
50 | 	echo.
51 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
52 | 	goto end
53 | )
54 | 
55 | if "%1" == "dirhtml" (
56 | 	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
57 | 	if errorlevel 1 exit /b 1
58 | 	echo.
59 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
60 | goto end 61 | ) 62 | 63 | if "%1" == "singlehtml" ( 64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 68 | goto end 69 | ) 70 | 71 | if "%1" == "pickle" ( 72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished; now you can process the pickle files. 76 | goto end 77 | ) 78 | 79 | if "%1" == "json" ( 80 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished; now you can process the JSON files. 84 | goto end 85 | ) 86 | 87 | if "%1" == "htmlhelp" ( 88 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can run HTML Help Workshop with the ^ 92 | .hhp project file in %BUILDDIR%/htmlhelp. 93 | goto end 94 | ) 95 | 96 | if "%1" == "qthelp" ( 97 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 98 | if errorlevel 1 exit /b 1 99 | echo. 100 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 101 | .qhcp project file in %BUILDDIR%/qthelp, like this: 102 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\theanets.qhcp 103 | echo.To view the help file: 104 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\theanets.ghc 105 | goto end 106 | ) 107 | 108 | if "%1" == "devhelp" ( 109 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 110 | if errorlevel 1 exit /b 1 111 | echo. 112 | echo.Build finished. 113 | goto end 114 | ) 115 | 116 | if "%1" == "epub" ( 117 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 118 | if errorlevel 1 exit /b 1 119 | echo. 120 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 121 | goto end 122 | ) 123 | 124 | if "%1" == "latex" ( 125 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 129 | goto end 130 | ) 131 | 132 | if "%1" == "text" ( 133 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The text files are in %BUILDDIR%/text. 137 | goto end 138 | ) 139 | 140 | if "%1" == "man" ( 141 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 145 | goto end 146 | ) 147 | 148 | if "%1" == "texinfo" ( 149 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 150 | if errorlevel 1 exit /b 1 151 | echo. 152 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 153 | goto end 154 | ) 155 | 156 | if "%1" == "gettext" ( 157 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 158 | if errorlevel 1 exit /b 1 159 | echo. 160 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 161 | goto end 162 | ) 163 | 164 | if "%1" == "changes" ( 165 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 166 | if errorlevel 1 exit /b 1 167 | echo. 168 | echo.The overview file is in %BUILDDIR%/changes. 169 | goto end 170 | ) 171 | 172 | if "%1" == "linkcheck" ( 173 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 174 | if errorlevel 1 exit /b 1 175 | echo. 176 | echo.Link check complete; look for any errors in the above output ^ 177 | or in %BUILDDIR%/linkcheck/output.txt. 
178 | goto end 179 | ) 180 | 181 | if "%1" == "doctest" ( 182 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 183 | if errorlevel 1 exit /b 1 184 | echo. 185 | echo.Testing of doctests in the sources finished, look at the ^ 186 | results in %BUILDDIR%/doctest/output.txt. 187 | goto end 188 | ) 189 | 190 | :end 191 | -------------------------------------------------------------------------------- /docs/reference.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Reference 3 | ========= 4 | 5 | .. automodule:: downhill 6 | :no-members: 7 | :no-inherited-members: 8 | 9 | .. autosummary:: 10 | :toctree: generated/ 11 | 12 | minimize 13 | 14 | Base 15 | ==== 16 | 17 | .. automodule:: downhill.base 18 | :no-members: 19 | :no-inherited-members: 20 | 21 | .. autosummary:: 22 | :toctree: generated/ 23 | 24 | build 25 | Optimizer 26 | 27 | First-Order Optimizers 28 | ====================== 29 | 30 | .. automodule:: downhill.first_order 31 | :no-members: 32 | :no-inherited-members: 33 | 34 | .. autosummary:: 35 | :toctree: generated/ 36 | 37 | SGD 38 | NAG 39 | 40 | Adaptive Optimizers 41 | =================== 42 | 43 | .. automodule:: downhill.adaptive 44 | :no-members: 45 | :no-inherited-members: 46 | 47 | .. autosummary:: 48 | :toctree: generated/ 49 | 50 | ADADELTA 51 | ADAGRAD 52 | Adam 53 | ESGD 54 | RMSProp 55 | RProp 56 | 57 | Datasets 58 | ======== 59 | 60 | .. automodule:: downhill.dataset 61 | :no-members: 62 | :no-inherited-members: 63 | 64 | .. autosummary:: 65 | :toctree: generated/ 66 | 67 | Dataset 68 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | numpydoc 2 | sphinx-better-theme 3 | -------------------------------------------------------------------------------- /downhill/__init__.py: -------------------------------------------------------------------------------- 1 | from .adaptive import * 2 | from .base import build, Optimizer 3 | from .dataset import Dataset 4 | from .first_order import * 5 | 6 | __version__ = '0.5.0pre' 7 | 8 | 9 | def minimize(loss, train, valid=None, params=None, inputs=None, algo='rmsprop', 10 | updates=(), monitors=(), monitor_gradients=False, batch_size=32, 11 | train_batches=None, valid_batches=None, **kwargs): 12 | '''Minimize a loss function with respect to some symbolic parameters. 13 | 14 | Additional keyword arguments are passed to the underlying :class:`Optimizer 15 | ` instance. 16 | 17 | Parameters 18 | ---------- 19 | loss : Theano expression 20 | Loss function to minimize. This must be a scalar-valued expression. 21 | train : :class:`Dataset `, ndarray, or callable 22 | Dataset to use for computing gradient updates. 23 | valid : :class:`Dataset `, ndarray, or callable, optional 24 | Dataset to use for validating the minimization process. The training 25 | dataset is used if this is not provided. 26 | params : list of Theano variables, optional 27 | Symbolic variables to adjust to minimize the loss. If not given, these 28 | will be computed automatically by walking the computation graph. 29 | inputs : list of Theano variables, optional 30 | Symbolic variables required to compute the loss. If not given, these 31 | will be computed automatically by walking the computation graph. 32 | algo : str, optional 33 | Name of the minimization algorithm to use. Must be one of the strings 34 | that can be passed to :func:`build`. 
Defaults to ``'rmsprop'``.
35 |     updates : list of update pairs, optional
36 |         A list of pairs providing updates for the internals of the loss
37 |         computation. Normally this is empty, but it can be provided if the loss,
38 |         for example, requires an update to an internal random number generator.
39 |     monitors : dict or sequence of (str, Theano expression) tuples, optional
40 |         Additional values to monitor during optimization. These must be provided
41 |         as either a sequence of (name, expression) tuples, or as a dictionary
42 |         mapping string names to Theano expressions.
43 |     monitor_gradients : bool, optional
44 |         If True, add monitors to log the norms of the parameter gradients during
45 |         optimization. Defaults to False.
46 |     batch_size : int, optional
47 |         Size of batches provided by datasets. Defaults to 32.
48 |     train_batches : int, optional
49 |         Number of batches of training data to iterate over during one pass of
50 |         optimization. Defaults to None, which uses the entire training dataset.
51 |     valid_batches : int, optional
52 |         Number of batches of validation data to iterate over during one pass of
53 |         validation. Defaults to None, which uses the entire validation dataset.
54 | 
55 |     Returns
56 |     -------
57 |     train_monitors : dict
58 |         A dictionary mapping monitor names to monitor values. This dictionary
59 |         will always contain the ``'loss'`` key, giving the value of the loss
60 |         evaluated on the training dataset.
61 |     valid_monitors : dict
62 |         A dictionary mapping monitor names to monitor values, evaluated on the
63 |         validation dataset. This dictionary will always contain the ``'loss'``
64 |         key, giving the value of the loss function. Because validation is not
65 |         always computed after every optimization update, these monitor values
66 |         may be "stale"; however, they will always contain the most recently
67 |         computed values.
68 |     '''
69 |     if not isinstance(train, Dataset):
70 |         train = Dataset(
71 |             train,
72 |             name='train',
73 |             batch_size=batch_size,
74 |             iteration_size=train_batches,
75 |         )
76 |     if valid is not None and not isinstance(valid, Dataset):
77 |         valid = Dataset(
78 |             valid,
79 |             name='valid',
80 |             batch_size=batch_size,
81 |             iteration_size=valid_batches,
82 |         )
83 |     return build(
84 |         algo,
85 |         loss=loss,
86 |         params=params,
87 |         inputs=inputs,
88 |         updates=updates,
89 |         monitors=monitors,
90 |         monitor_gradients=monitor_gradients,
91 |     ).minimize(train, valid, **kwargs)
92 | 
--------------------------------------------------------------------------------
/downhill/adaptive.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | '''This module defines gradient descent optimizers with adaptive learning rates.
4 | '''
5 | 
6 | import numpy as np
7 | import theano
8 | import theano.tensor as TT
9 | 
10 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
11 | 
12 | from .base import Optimizer
13 | from . import util
14 | 
15 | __all__ = ['RProp', 'RMSProp', 'ADAGRAD', 'ADADELTA', 'ESGD', 'Adam']
16 | 
17 | 
18 | class RProp(Optimizer):
19 |     r'''Resilient backpropagation optimizer.
20 | 
21 |     Parameters
22 |     ----------
23 |     rprop_increase: float, optional (default 1.01)
24 |         Increase step sizes at this rate when the gradient sign stays the same.
25 |     rprop_decrease: float, optional (default 0.99)
26 |         Decrease step sizes at this rate when the gradient sign changes.
27 |     rprop_min_step: float, optional (default 0)
28 |         Minimum step size for any parameter.
29 |     rprop_max_step: float, optional (default 100)
30 |         Maximum step size for any parameter.
31 |     momentum: float, optional (default 0)
32 |         Momentum to apply to the updates, if any. Defaults to 0 (no momentum).
33 |         Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of
34 |         momentum.
35 |     nesterov: bool, optional (default False)
36 |         Set this to ``True`` to enable Nesterov-style momentum updates, whenever
37 |         ``momentum`` is nonzero.
38 | 
39 |     Notes
40 |     -----
41 | 
42 |     The RProp method takes small steps in parameter space using local gradient
43 |     information. RProp is unlike "vanilla" first-order techniques like
44 |     :class:`SGD <downhill.first_order.SGD>`, however, because only the signs of
45 |     the gradients are taken into account when making parameter updates. That is,
46 |     the step size for each parameter is independent of the magnitude of the
47 |     gradient for that parameter.
48 | 
49 |     To accomplish this, RProp maintains a separate learning rate for every
50 |     parameter in the model, and adjusts this learning rate based on the
51 |     consistency of the sign of the gradient over time. Whenever two consecutive
52 |     gradients for a parameter have the same sign, the learning rate for that
53 |     parameter increases, and whenever the signs disagree, the learning rate
54 |     decreases. This has a similar effect to momentum-based stochastic gradient
55 |     methods but effectively maintains parameter-specific learning rates.
56 | 
57 |     .. math::
58 |         \begin{eqnarray*}
59 |         && \mbox{if } \frac{\partial\mathcal{L}}{\partial p}_{t-1}
60 |            \frac{\partial\mathcal{L}}{\partial p} > 0 \\
61 |         && \qquad \Delta_t = \min (\eta_+\Delta_{t-1}, \Delta_+) \\
62 |         && \mbox{if } \frac{\partial\mathcal{L}}{\partial p}_{t-1}
63 |            \frac{\partial\mathcal{L}}{\partial p} < 0 \\
64 |         && \qquad \Delta_t = \max (\eta_-\Delta_{t-1}, \Delta_-) \\
65 |         && \qquad \frac{\partial\mathcal{L}}{\partial p} = 0 \\
66 |         && p_{t+1} = p_t - \mbox{sgn}\left(
67 |            \frac{\partial\mathcal{L}}{\partial p}\right) \Delta_t
68 |         \end{eqnarray*}
69 | 
70 |     Here, :math:`\mbox{sgn}(\cdot)` is the sign function (i.e., returns -1 if
71 |     its argument is negative and 1 otherwise), :math:`\eta_-` and :math:`\eta_+`
72 |     are the amount to decrease (increase) the step size if the gradients
73 |     disagree (agree) in sign, and :math:`\Delta_+` and :math:`\Delta_-` are the
74 |     maximum and minimum step size.
75 | 
76 |     The implementation here is actually the "iRprop-" variant of RProp described
77 |     in Algorithm 4 from [Igel00]_. This variant resets the running gradient
78 |     estimates to zero in cases where the previous and current gradients have
79 |     switched signs.
80 | 
81 |     References
82 |     ----------
83 | 
84 |     .. [Ried92] M. Riedmiller & H. Braun. (1992) "Rprop - A Fast Adaptive
85 |        Learning Algorithm." In Proceedings of the International Symposium on
86 |        Computer and Information Science VII.
87 | 
88 |     .. [Igel00] C. Igel & M. Hüsken. (2000) "Improving the Rprop Learning
89 |        Algorithm." In Proceedings of the Second International Symposium on
90 |        Neural Computation.
91 |        http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.17.1332
92 |     '''
93 | 
94 |     def _prepare(self,
95 |                  rprop_increase=1.01,
96 |                  rprop_decrease=0.99,
97 |                  rprop_min_step=0,
98 |                  rprop_max_step=100,
99 |                  **kwargs):
100 |         self.step_increase = util.as_float(rprop_increase)
101 |         self.step_decrease = util.as_float(rprop_decrease)
102 |         self.min_step = util.as_float(rprop_min_step)
103 |         self.max_step = util.as_float(rprop_max_step)
104 |         util.log_param('rprop_increase', rprop_increase)
105 |         util.log_param('rprop_decrease', rprop_decrease)
106 |         util.log_param('rprop_min_step', rprop_min_step)
107 |         util.log_param('rprop_max_step', rprop_max_step)
108 |         super(RProp, self)._prepare(**kwargs)
109 | 
110 |     def _get_updates_for(self, param, grad):
111 |         grad_tm1 = util.shared_like(param, 'grad')
112 |         step_tm1 = util.shared_like(param, 'step', self.learning_rate.eval())
113 |         test = grad * grad_tm1
114 |         diff = TT.lt(test, 0)
115 |         steps = step_tm1 * (TT.eq(test, 0) +
116 |                             TT.gt(test, 0) * self.step_increase +
117 |                             diff * self.step_decrease)
118 |         step = TT.minimum(self.max_step, TT.maximum(self.min_step, steps))
119 |         grad = grad - diff * grad
120 |         yield param, TT.sgn(grad) * step
121 |         yield grad_tm1, grad
122 |         yield step_tm1, step
123 | 
124 | 
125 | class ADAGRAD(Optimizer):
126 |     r'''ADAGRAD optimizer.
127 | 
128 |     Parameters
129 |     ----------
130 |     rms_regularizer: float, optional (default 1e-8)
131 |         Regularize the learning rate scaling factor by this :math:`\epsilon`.
132 |     momentum: float, optional (default 0)
133 |         Momentum to apply to the updates, if any. Defaults to 0 (no momentum).
134 |         Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of
135 |         momentum.
136 |     nesterov: bool, optional (default False)
137 |         Set this to ``True`` to enable Nesterov-style momentum updates, whenever
138 |         ``momentum`` is nonzero.
139 | 
140 |     Notes
141 |     -----
142 | 
143 |     The ADAGRAD method uses the same general strategy as all first-order
144 |     stochastic gradient methods, in the sense that these methods make small
145 |     parameter adjustments iteratively using local derivative information.
146 | 
147 |     The difference with ADAGRAD is that as gradients are computed during each
148 |     parameter update, their squares are accumulated, and this accumulated value
149 |     is used to rescale the global learning rate :math:`\alpha` separately for
150 |     each parameter.
151 | 
152 |     .. math::
153 |         \begin{eqnarray*}
154 |         g_{t+1} &=& g_t + \left(\frac{\partial\mathcal{L}}{\partial p}\right)^2 \\
155 |         p_{t+1} &=& p_t - \frac{\alpha}{\sqrt{g_{t+1} + \epsilon}}
156 |         \frac{\partial\mathcal{L}}{\partial p}
157 |         \end{eqnarray*}
158 | 
159 |     Like the other adaptive learning methods, this learning method effectively
160 |     maintains a sort of parameter-specific learning rate. Unlike
161 |     :class:`RMSProp` and :class:`ADADELTA`, however, in ADAGRAD, the gradient
162 |     magnitudes accumulate throughout training, which has the effect of scaling
163 |     the learning rate for each parameter, but also effectively anneals the
164 |     learning rate overall as training progresses.
165 | 
166 |     In this implementation, the scale values are regularized (made less extreme)
167 |     by :math:`\epsilon`, which is specified using the ``rms_regularizer``
168 |     parameter.
169 | 
170 |     References
171 |     ----------
172 | 
173 |     .. [Duch10] J. Duchi, E. Hazan, & Y. Singer (2010) "Adaptive subgradient
174 |        methods for online learning and stochastic optimization." Proc. Conference
175 |        on Learning Theory (COLT).
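
    Examples
    --------
    A tiny NumPy rendition of the ADAGRAD update, as an illustrative sketch
    only (it is not part of the package API, but it mirrors the Theano
    expressions built in ``_get_updates_for``)::

        import numpy as np

        def adagrad_step(p, grad, g2, alpha=1e-4, eps=1e-8):
            # Accumulate squared gradients over the whole run.
            g2 = g2 + grad * grad
            # Rescale the global rate alpha separately per parameter.
            return p - alpha * grad / np.sqrt(g2 + eps), g2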
176 | ''' 177 | 178 | def _prepare(self, rms_regularizer=1e-8, **kwargs): 179 | self.epsilon = util.as_float(rms_regularizer) 180 | util.log_param('rms_regularizer', rms_regularizer) 181 | super(ADAGRAD, self)._prepare(**kwargs) 182 | 183 | def _get_updates_for(self, param, grad): 184 | g2_tm1 = util.shared_like(param, 'g2_acc') 185 | g2_t = g2_tm1 + grad * grad 186 | yield g2_tm1, g2_t 187 | yield param, grad * self.learning_rate / TT.sqrt(g2_t + self.epsilon) 188 | 189 | 190 | class RMSProp(Optimizer): 191 | r'''RMSProp optimizer. 192 | 193 | Parameters 194 | ---------- 195 | learning_rate: float, optional (default 1e-4) 196 | Step size to take during optimization. 197 | rms_halflife: float, optional (default 14) 198 | Compute RMS gradient values using an exponentially weighted moving 199 | average that decays with this halflife. 200 | rms_regularizer: float, optional (default 1e-8) 201 | Regularize RMS gradient values by this :math:`\epsilon`. 202 | momentum: float, optional (default 0) 203 | Momentum to apply to the updates, if any. Defaults to 0 (no momentum). 204 | Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of 205 | momentum. 206 | nesterov: bool, optional (default False) 207 | Set this to ``True`` to enable Nesterov-style momentum updates, whenever 208 | ``momentum`` is nonzero. 209 | 210 | Notes 211 | ----- 212 | 213 | The RMSProp method uses the same general strategy as all first-order 214 | stochastic gradient methods, in the sense that these methods make small 215 | parameter adjustments iteratively using local derivative information. 216 | 217 | The difference here is that as gradients are computed during each parameter 218 | update, an exponentially-weighted moving average (EWMA) of gradient 219 | magnitudes is maintained as well. At each update, the EWMA is used to 220 | compute the root-mean-square (RMS) gradient value that's been seen in the 221 | recent past. The actual gradient is normalized by this RMS scaling factor 222 | before being applied to update the parameters. Intuitively, this makes 223 | RMSProp take steps near 1 whenever the gradient is of constant magnitude, 224 | and larger steps whenever the local scale of the gradient starts to 225 | increase. 226 | 227 | .. math:: 228 | \begin{eqnarray*} 229 | f_{t+1} &=& \gamma f_t + (1 - \gamma) \frac{\partial\mathcal{L}}{\partial p} \\ 230 | g_{t+1} &=& \gamma g_t + (1 - \gamma) \left( 231 | \frac{\partial\mathcal{L}}{\partial p}\right)^2 \\ 232 | p_{t+1} &=& p_t - \frac{\alpha}{\sqrt{g_{t+1} - f_{t+1}^2 + \epsilon}} 233 | \frac{\partial\mathcal{L}}{\partial p} 234 | \end{eqnarray*} 235 | 236 | Like :class:`RProp`, this learning method effectively maintains a sort of 237 | parameter-specific momentum value, but this method takes into account both 238 | the sign and the magnitude of the gradient for each parameter. 239 | 240 | In this algorithm, RMS values are regularized (made less extreme) by 241 | :math:`\epsilon`, which is specified using the ``rms_regularizer`` keyword 242 | argument. 243 | 244 | The weight parameter :math:`\gamma` for the EWMA window is computed from the 245 | ``rms_halflife`` keyword argument, such that the actual EWMA weight varies 246 | inversely with the halflife :math:`h`: :math:`\gamma = e^{\frac{-\ln 247 | 2}{h}}`. 248 | 249 | The implementation here is taken from [Grav13]_, equations (38)--(45). 
250 |     Graves' implementation in particular seems to have introduced the
251 |     :math:`f_t` terms into the RMS computation; these terms appear to act as a
252 |     sort of momentum for the RMS values.
253 | 
254 |     References
255 |     ----------
256 | 
257 |     .. [Grav13] A. Graves. (2013) "Generating Sequences With Recurrent Neural
258 |        Networks." http://arxiv.org/abs/1308.0850
259 | 
260 |     '''
261 | 
262 |     def _prepare(self, rms_halflife=14, rms_regularizer=1e-8, **kwargs):
263 |         self.ewma = util.as_float(np.exp(-np.log(2) / rms_halflife))
264 |         self.epsilon = util.as_float(rms_regularizer)
265 |         util.log_param('rms_halflife', rms_halflife)
266 |         util.log_param('rms_regularizer', rms_regularizer)
267 |         super(RMSProp, self)._prepare(**kwargs)
268 | 
269 |     def _get_updates_for(self, param, grad):
270 |         g1_tm1 = util.shared_like(param, 'g1_ewma')
271 |         g2_tm1 = util.shared_like(param, 'g2_ewma')
272 |         g1_t = self.ewma * g1_tm1 + (1 - self.ewma) * grad
273 |         g2_t = self.ewma * g2_tm1 + (1 - self.ewma) * grad * grad
274 |         rms = TT.sqrt(g2_t - g1_t * g1_t + self.epsilon)
275 |         yield g1_tm1, g1_t
276 |         yield g2_tm1, g2_t
277 |         yield param, self.learning_rate * grad / rms
278 | 
279 | 
280 | class ADADELTA(RMSProp):
281 |     r'''ADADELTA optimizer.
282 | 
283 |     Parameters
284 |     ----------
285 |     rms_halflife: float, optional (default 14)
286 |         Compute RMS gradient values using an exponentially weighted moving
287 |         average that decays with this halflife.
288 |     rms_regularizer: float, optional (default 1e-8)
289 |         Regularize RMS gradient values by this :math:`\epsilon`.
290 |     momentum: float, optional (default 0)
291 |         Momentum to apply to the updates, if any. Defaults to 0 (no momentum).
292 |         Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of
293 |         momentum.
294 |     nesterov: bool, optional (default False)
295 |         Set this to ``True`` to enable Nesterov-style momentum updates, whenever
296 |         ``momentum`` is nonzero.
297 | 
298 |     Notes
299 |     -----
300 | 
301 |     The ADADELTA method uses the same general strategy as all first-order
302 |     stochastic gradient methods, in the sense that these methods make small
303 |     parameter adjustments iteratively using local derivative information.
304 | 
305 |     The difference with ADADELTA is that as gradients are computed during each
306 |     parameter update, an exponentially-weighted moving average (EWMA) of recent
307 |     gradient values, as well as an EWMA of recent parameter steps, are maintained
308 |     as well. The actual gradient is normalized by the ratio of the
309 |     root-mean-square (RMS) parameter step size to the RMS gradient magnitude.
310 | 
311 |     .. math::
312 |         \begin{eqnarray*}
313 |         g_{t+1} &=& \gamma g_t + (1 - \gamma) \left(
314 |            \frac{\partial\mathcal{L}}{\partial p}\right)^2 \\
315 |         v_{t+1} &=& \frac{\sqrt{x_t + \epsilon}}{\sqrt{g_{t+1} + \epsilon}}
316 |            \frac{\partial\mathcal{L}}{\partial p} \\
317 |         x_{t+1} &=& \gamma x_t + (1 - \gamma) v_{t+1}^2 \\
318 |         p_{t+1} &=& p_t - v_{t+1}
319 |         \end{eqnarray*}
320 | 
321 |     Like :class:`RProp` and the :class:`RMSProp`--:class:`ESGD` family, this
322 |     learning method effectively maintains a sort of parameter-specific momentum
323 |     value. The primary difference between this method and :class:`RMSProp` is
324 |     that ADADELTA additionally incorporates a sliding window of RMS parameter
325 |     step sizes, (somewhat) obviating the need for a learning rate parameter.
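
    As an illustrative sketch only (not part of the package API), one such
    update step in plain NumPy looks like the following, with ``g2`` and
    ``x2`` holding the two EWMA accumulators::

        import numpy as np

        def adadelta_step(p, grad, g2, x2, gamma=0.95, eps=1e-8):
            g2 = gamma * g2 + (1 - gamma) * grad * grad    # EWMA of grad**2
            delta = grad * np.sqrt(x2 + eps) / np.sqrt(g2 + eps)
            x2 = gamma * x2 + (1 - gamma) * delta * delta  # EWMA of step**2
            return p - delta, g2, x2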
326 | 327 | In this implementation, the RMS values are regularized (made less extreme) 328 | by :math:`\epsilon`, which is specified using the ``rms_regularizer`` 329 | parameter. 330 | 331 | The weight parameter :math:`\gamma` for the EWMA window is computed from the 332 | ``rms_halflife`` keyword argument, such that the actual EWMA weight varies 333 | inversely with the halflife :math:`h`: :math:`\gamma = e^{\frac{-\ln 334 | 2}{h}}`. 335 | 336 | References 337 | ---------- 338 | 339 | .. [Zeil12] M. Zeiler. (2012) "ADADELTA: An adaptive learning rate method." 340 | http://arxiv.org/abs/1212.5701 341 | ''' 342 | 343 | def _get_updates_for(self, param, grad): 344 | x2_tm1 = util.shared_like(param, 'x2_ewma') 345 | g2_tm1 = util.shared_like(param, 'g2_ewma') 346 | g2_t = self.ewma * g2_tm1 + (1 - self.ewma) * grad * grad 347 | delta = grad * TT.sqrt(x2_tm1 + self.epsilon) / TT.sqrt(g2_t + self.epsilon) 348 | x2_t = self.ewma * x2_tm1 + (1 - self.ewma) * delta * delta 349 | yield g2_tm1, g2_t 350 | yield x2_tm1, x2_t 351 | yield param, delta 352 | 353 | 354 | class ESGD(RMSProp): 355 | r'''Equilibrated SGD computes a diagonal Hessian preconditioner. 356 | 357 | Parameters 358 | ---------- 359 | hv_method: {'rop', 'lop', 'grad'}, optional 360 | The Hv (Hessian-vector) product will be computed using the given method. 361 | The default is to use 'rop'. 362 | learning_rate: float, optional (default 1e-4) 363 | Step size to take during optimization. 364 | rms_halflife: float, optional (default 14) 365 | Compute RMS gradient values using an exponentially weighted moving 366 | average that decays with this halflife. 367 | rms_regularizer: float, optional (default 1e-8) 368 | Regularize RMS gradient values by this :math:`\epsilon`. 369 | momentum: float, optional (default 0) 370 | Momentum to apply to the updates, if any. Defaults to 0 (no momentum). 371 | Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of 372 | momentum. 373 | nesterov: bool, optional (default False) 374 | Set this to ``True`` to enable Nesterov-style momentum updates, whenever 375 | ``momentum`` is nonzero. 376 | 377 | Notes 378 | ----- 379 | 380 | The ESGD method uses the same general strategy as all first-order 381 | stochastic gradient methods, in the sense that these methods make small 382 | parameter adjustments iteratively using local derivative information. 383 | 384 | The difference here is that as gradients are computed during each parameter 385 | update, an exponentially-weighted moving average (EWMA) of estimates of the 386 | diagonal of the Hessian (the matrix of second derivatives) is maintained as 387 | well. At each update, the EWMA is used to compute the root-mean-square (RMS) 388 | diagonal value that's been seen in the recent past. The actual gradient is 389 | scaled by the inverse of this diagonal preconditioner before being applied 390 | to update the parameters. Intuitively, this causes the algorithm to 391 | "reshape" the loss function in parameter space, such that directions of 392 | steep gradient (i.e., large diagonal values) and directions of shallow 393 | gradient (i.e., small diagonal values) are scaled to be approximately the 394 | same slope. 395 | 396 | The diagonal estimates are computed using a nice trick: A vector :math:`r 397 | \sim \mathcal{N}(0, 1)` consisting of standard normal values is sampled 398 | randomly at each update step, and the value of :math:`Hr` is computed 399 | symbolically. These vector values tend to approximate the diagonal of the 400 | Hessian. 
Because :math:`Hr` is itself a vector, the full Hessian :math:`H`
401 |     does not need to be computed or stored.
402 | 
403 |     .. math::
404 |         \begin{eqnarray*}
405 |         r &\sim& \mathcal{N}(0, 1) \\
406 |         Hr &=& \frac{\partial^2 \mathcal{L}}{\partial p^2}r \\
407 |         D_{t+1} &=& \gamma D_t + (1 - \gamma) (Hr)^2 \\
408 |         p_{t+1} &=& p_t - \frac{\alpha}{\sqrt{D_{t+1} + \epsilon}}
409 |         \frac{\partial\mathcal{L}}{\partial p}
410 |         \end{eqnarray*}
411 | 
412 |     Like :class:`RProp` and the :class:`ADADELTA`--:class:`RMSProp` family, this
413 |     learning method effectively maintains a sort of parameter-specific learning
414 |     rate for each parameter in the loss.
415 | 
416 |     In this implementation, :math:`\epsilon` regularizes the RMS values; it is
417 |     specified using the ``rms_regularizer`` parameter.
418 | 
419 |     The weight parameter :math:`\gamma` for the EWMA is computed from the
420 |     ``rms_halflife`` keyword argument, such that the actual EWMA weight varies
421 |     inversely with the halflife :math:`h`: :math:`\gamma = e^{\frac{-\ln
422 |     2}{h}}`.
423 | 
424 |     The primary difference between this implementation and the algorithm
425 |     described in the paper (see below) is the use of an EWMA to decay the
426 |     diagonal values over time, while in the paper the diagonal is divided by the
427 |     training iteration. The EWMA halflife should be set to something reasonably
428 |     large to ensure that this method emulates the method described in the
429 |     original paper.
430 | 
431 |     References
432 |     ----------
433 | 
434 |     .. [Daup14] Y. Dauphin, H. de Vries, J. Chung & Y. Bengio. (2014) "RMSProp
435 |        and equilibrated adaptive learning rates for non-convex optimization."
436 |        http://arxiv.org/abs/1502.04390
437 |     '''
438 | 
439 |     def __init__(self, *args, **kwargs):
440 |         self.rng = RandomStreams()
441 |         self.hv_method = kwargs.pop('hv_method', 'rop').lower()
442 |         assert self.hv_method in ('rop', 'lop', 'grad')
443 |         super(ESGD, self).__init__(*args, **kwargs)
444 | 
445 |     def _get_updates_for(self, param, grad):
446 |         D_tm1 = util.shared_like(param, 'D_ewma')
447 |         v = self.rng.normal(param.shape)
448 |         if self.hv_method == 'rop':
449 |             Hv = TT.Rop(grad, param, v)
450 |         elif self.hv_method == 'lop':
451 |             Hv = TT.Lop(grad, param, v)
452 |         elif self.hv_method == 'grad':
453 |             Hv = TT.grad(TT.sum(grad * v), param)
454 |         D_t = self.ewma * D_tm1 + (1 - self.ewma) * Hv * Hv
455 |         denom = TT.sqrt(D_t) + self.epsilon
456 |         yield D_tm1, D_t
457 |         yield param, grad * self.learning_rate / denom
458 | 
459 | 
460 | class Adam(RMSProp):
461 |     r'''Adam optimizer using unbiased gradient moment estimates.
462 | 
463 |     Parameters
464 |     ----------
465 |     learning_rate: float, optional (default 1e-4)
466 |         Step size to take during optimization.
467 |     beta1_decay: float, optional (default 1 - 1e-6)
468 |         Extend the :math:`\beta_1` halflife by this amount after every update.
469 |     beta1_halflife: float, optional (default 7)
470 |         Compute RMS gradient estimates using an exponentially weighted moving
471 |         average that decays with this halflife.
472 |     beta2_halflife: float, optional (default 69)
473 |         Compute squared-magnitude RMS gradient estimates using an exponentially
474 |         weighted moving average that decays with this halflife.
475 |     rms_regularizer: float, optional (default 1e-8)
476 |         Regularize RMS gradient values by this :math:`\epsilon`.
477 |     momentum: float, optional (default 0)
478 |         Momentum to apply to the updates, if any. Defaults to 0 (no momentum).
479 |         Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of
480 |         momentum.
481 |     nesterov: bool, optional (default False)
482 |         Set this to ``True`` to enable Nesterov-style momentum updates, whenever
483 |         ``momentum`` is nonzero.
484 | 
485 |     Notes
486 |     -----
487 | 
488 |     The Adam method uses the same general strategy as all first-order
489 |     stochastic gradient methods, in the sense that these methods make small
490 |     parameter adjustments iteratively using local derivative information.
491 | 
492 |     The difference here is that as gradients are computed during each parameter
493 |     update, exponentially-weighted moving averages (EWMAs) of (1) the first
494 |     moment of the recent gradient values and (2) the second moment of recent
495 |     gradient values are maintained as well. At each update, the step taken is
496 |     proportional to the ratio of the first moment to the second moment.
497 | 
498 |     .. math::
499 |         \begin{eqnarray*}
500 |         \beta_1^t &=& \beta_1 \lambda^{t} \\
501 |         f_{t+1} &=& \beta_1^t f_t + (1 - \beta_1^t)
502 |            \frac{\partial\mathcal{L}}{\partial\theta} \\
503 |         g_{t+1} &=& \beta_2 g_t + (1 - \beta_2)
504 |            \left(\frac{\partial\mathcal{L}}{\partial\theta}\right)^2 \\
505 |         \theta_{t+1} &=& \theta_t - \alpha
506 |            \frac{f_{t+1} / (1 - \beta_1^t)}{\sqrt{g_{t+1} / (1 - \beta_2^t)} + \epsilon}
507 |         \end{eqnarray*}
508 | 
509 |     Like all adaptive optimization algorithms, this optimizer effectively
510 |     maintains a sort of parameter-specific momentum value. It shares with
511 |     :class:`RMSProp` and :class:`ADADELTA` the idea of using an EWMA to track
512 |     recent quantities related to the stochastic gradient during optimization.
513 |     But the Adam method is unique in that it incorporates an explicit
514 |     computation to remove the bias from these estimates.
515 | 
516 |     In this implementation, :math:`\epsilon` regularizes the RMS values and is
517 |     given using the ``rms_regularizer`` keyword argument. The weight parameters
518 |     :math:`\beta_1` and :math:`\beta_2` for the first and second EWMA windows
519 |     are computed from the ``beta1_halflife`` and ``beta2_halflife`` keyword
520 |     arguments, respectively, such that the actual EWMA weight varies inversely
521 |     with the halflife :math:`h`: :math:`\gamma = e^{\frac{-\ln 2}{h}}`. The
522 |     decay :math:`\lambda` for the :math:`\beta_1` EWMA is provided by the
523 |     ``beta1_decay`` keyword argument.
524 | 
525 |     The implementation here is taken from Algorithm 1 of [King15]_.
526 | 
527 |     References
528 |     ----------
529 | 
530 |     .. [King15] D. Kingma & J. Ba. (ICLR 2015) "Adam: A Method for
531 |        Stochastic Optimization."
http://arxiv.org/abs/1412.6980 532 | ''' 533 | 534 | def _prepare(self, 535 | beta1_halflife=7, 536 | beta2_halflife=69, 537 | **kwargs): 538 | self.beta1 = util.as_float(np.exp(-np.log(2) / beta1_halflife)) 539 | self.beta2 = util.as_float(np.exp(-np.log(2) / beta2_halflife)) 540 | super(Adam, self)._prepare(**kwargs) 541 | 542 | def _get_updates_for(self, param, grad): 543 | t_tm1 = theano.shared(np.cast['float32'](0), 't') 544 | t_t = 1 + t_tm1 545 | g1_tm1 = util.shared_like(param, 'g1_ewma') 546 | g2_tm1 = util.shared_like(param, 'g2_ewma') 547 | g1_t = self.beta1 * g1_tm1 + (1 - self.beta1) * grad 548 | g2_t = self.beta2 * g2_tm1 + (1 - self.beta2) * grad * grad 549 | numer = g1_t / (1 - self.beta1 ** t_t) 550 | denom = TT.sqrt(g2_t / (1 - self.beta2 ** t_t)) 551 | yield t_tm1, t_t 552 | yield g1_tm1, g1_t 553 | yield g2_tm1, g2_t 554 | yield param, self.learning_rate * numer / (denom + self.epsilon) 555 | -------------------------------------------------------------------------------- /downhill/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | '''This module defines a base class for optimization techniques.''' 4 | 5 | import click 6 | import collections 7 | import numpy as np 8 | import theano 9 | import theano.tensor as TT 10 | import warnings 11 | 12 | from . import util 13 | 14 | 15 | def build(algo, loss, params=None, inputs=None, updates=(), monitors=(), 16 | monitor_gradients=False): 17 | '''Construct an optimizer by name. 18 | 19 | Parameters 20 | ---------- 21 | algo : str 22 | The name of the optimization algorithm to build. 23 | loss : Theano expression 24 | Loss function to minimize. This must be a scalar-valued expression. 25 | params : list of Theano variables, optional 26 | Symbolic variables to adjust to minimize the loss. If not given, these 27 | will be computed automatically by walking the computation graph. 28 | inputs : list of Theano variables, optional 29 | Symbolic variables required to compute the loss. If not given, these 30 | will be computed automatically by walking the computation graph. 31 | updates : list of update pairs, optional 32 | A list of pairs providing updates for the internal of the loss 33 | computation. Normally this is empty, but it can be provided if the loss, 34 | for example, requires an update to an internal random number generator. 35 | monitors : dict or sequence of (str, Theano expression) tuples, optional 36 | Additional values to monitor during optimization. These must be provided 37 | as either a sequence of (name, expression) tuples, or as a dictionary 38 | mapping string names to Theano expressions. 39 | monitor_gradients : bool, optional 40 | If True, add monitors to log the norms of the parameter gradients during 41 | optimization. Defaults to False. 42 | 43 | Returns 44 | ------- 45 | optimizer : :class:`Optimizer` 46 | An optimizer instance. 47 | ''' 48 | return Optimizer.build(algo, loss, params, inputs, 49 | updates=updates, monitors=monitors, 50 | monitor_gradients=monitor_gradients) 51 | 52 | 53 | class Optimizer(util.Registrar(str('Base'), (), {})): 54 | '''An optimizer computes gradient updates to iteratively optimize a loss. 55 | 56 | Attributes 57 | ---------- 58 | patience : int, optional 59 | Number of validation "failures" that we are willing to tolerate before 60 | stopping the optimization process. 
A validation failure happens whenever
61 |         the loss on the validation dataset decreases by less than
62 |         ``min_improvement`` (relative) over the previous best validation loss.
63 |         Defaults to 5.
64 |     validate_every : int, optional
65 |         Evaluate the loss on the validation dataset after making this many
66 |         passes over the training data. Defaults to 10.
67 |     min_improvement : float, optional
68 |         Insist that the validation loss must improve by this relative amount
69 |         before considering that the optimization has made progress. The
70 |         optimization process halts when ``patience`` validations have failed to
71 |         make this relative improvement. Defaults to 0; set to a larger value
72 |         (e.g., 0.01 for 1% improvement) to halt the optimization process sooner.
73 |     max_gradient_norm : float, optional
74 |         Rescale each parameter's gradient so that it has at most this L2 norm.
75 |         Set to 0 (the default) to disable norm rescaling. If
76 |         ``max_gradient_elem`` is also specified, then this has no effect.
77 |     max_gradient_elem : float, optional
78 |         Perform elementwise clipping on the magnitude of gradient values. Set to
79 |         0 (the default) to disable. If elementwise clipping is enabled, norm
80 |         rescaling (via ``max_gradient_norm``) will have no effect. Deprecated
81 |         synonyms of this parameter are "max_gradient_clip" and "gradient_clip".
82 |     learning_rate : float, optional
83 |         Many SGD-based optimization algorithms require a learning rate
84 |         hyperparameter that scales the gradient step. Defaults to 1e-4.
85 |     momentum : float, optional
86 |         Apply momentum to the parameter updates for this optimizer, with the
87 |         given strength. Typically this value ranges from 0 (no momentum) to
88 |         :math:`1 - \epsilon` (large momentum). Defaults to 0.
89 |     nesterov : bool, optional
90 |         If True, and ``momentum`` is nonzero, apply Nesterov-style momentum to
91 |         parameter updates for this optimizer. If False, and ``momentum`` is
92 |         nonzero, "regular" momentum is applied. Has no effect if ``momentum`` is
93 |         zero. See :class:`NAG <downhill.first_order.NAG>` for a description of
94 |         Nesterov momentum.
95 | 
96 |     Parameters
97 |     ----------
98 |     loss : Theano expression
99 |         Loss function to minimize. This must be a scalar-valued expression.
100 |     params : list of Theano variables, optional
101 |         Symbolic variables to adjust to minimize the loss. If not given, these
102 |         will be computed automatically by walking the computation graph.
103 |     inputs : list of Theano variables, optional
104 |         Symbolic variables required to compute the loss. If not given, these
105 |         will be computed automatically by walking the computation graph.
106 |     updates : list of update pairs, optional
107 |         A list of pairs providing updates for the internals of the loss
108 |         computation. Normally this is empty, but it can be provided if the loss,
109 |         for example, requires an update to an internal random number generator.
110 |     monitors : sequence of (str, Theano expression) tuples, optional
111 |         Additional values to monitor during optimization. These must be provided
112 |         as a sequence of (name, expression) tuples.
113 |     monitor_gradients : bool, optional
114 |         If True, add monitors to log the norms of the parameter gradients during
115 |         optimization. Defaults to False.
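
    Examples
    --------
    A minimal usage sketch (illustrative only), assuming a scalar Theano
    expression ``loss`` with a single symbolic input and a matching NumPy
    array ``data``::

        opt = downhill.build('rmsprop', loss=loss)
        for train_m, valid_m in opt.iterate([[data]], learning_rate=1e-4):
            if train_m['loss'] < 0.1:
                break

    Keyword arguments passed to :func:`iterate` set the attributes listed
    above.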
116 | ''' 117 | 118 | def __init__(self, loss, params=None, inputs=None, updates=(), monitors=(), 119 | monitor_gradients=False): 120 | inputs_, params_ = util.find_inputs_and_params(loss) 121 | 122 | self._loss = loss 123 | self._params = params or params_ 124 | self._inputs = inputs or inputs_ 125 | self._updates = updates 126 | 127 | self._shapes = [p.get_value(borrow=True).shape for p in self._params] 128 | self._counts = [np.prod(s) for s in self._shapes] 129 | self._starts = np.cumsum([0] + self._counts)[:-1] 130 | self._dtype = self._params[0].get_value().dtype 131 | 132 | self._curr_iter = 0 133 | self._best_iter = 0 134 | self._best_loss = 1e100 135 | self._best_params = [p.get_value().copy() for p in self._params] 136 | 137 | self._monitor_exprs = [self._loss] 138 | self._monitor_names = ['loss'] 139 | for name, monitor in monitors: 140 | self._monitor_names.append(name) 141 | self._monitor_exprs.append(monitor) 142 | if monitor_gradients: 143 | unnamed = 0 144 | for p, g in zip(self._params, TT.grad(self._loss, self._params)): 145 | name = p.name 146 | if not name: 147 | name = 'unnamed{}'.format(unnamed) 148 | unnamed += 1 149 | util.log('"{}" unnamed, will be "{}" internally'.format(p, name)) 150 | self._monitor_names.append('grad({})'.format(name)) 151 | self._monitor_exprs.append((g * g).sum()) 152 | 153 | def _compile(self, **kwargs): 154 | '''Compile the Theano functions for evaluating and updating our model. 155 | ''' 156 | util.log('compiling evaluation function') 157 | self.f_eval = theano.function(self._inputs, 158 | self._monitor_exprs, 159 | updates=self._updates, 160 | name='evaluation') 161 | label = self.__class__.__name__ 162 | util.log('compiling {} optimizer'.format(click.style(label, fg='red'))) 163 | updates = list(self._updates) + list(self.get_updates(**kwargs)) 164 | self.f_step = theano.function(self._inputs, 165 | self._monitor_exprs, 166 | updates=updates, 167 | name=label) 168 | 169 | def get_updates(self, **kwargs): 170 | '''Get parameter update expressions for performing optimization. 171 | 172 | Keyword arguments can be applied here to set any of the global 173 | optimizer attributes. 174 | 175 | Yields 176 | ------ 177 | updates : (parameter, expression) tuples 178 | A sequence of parameter updates to be applied during optimization. 179 | ''' 180 | self._prepare(**kwargs) 181 | for param, grad in self._differentiate(): 182 | for var, update in self._get_updates_for(param, grad): 183 | # For auxiliary variables, updates are meant to replace the 184 | # existing variable value. 185 | if var != param: 186 | yield var, update 187 | continue 188 | # If momentum is disabled, just apply the parameter delta. 189 | if self.momentum == 0: 190 | yield var, param - update 191 | continue 192 | # Momentum is enabled, so we keep track of velocity here. 193 | vel_tm1 = util.shared_like(param, 'vel') 194 | vel_t = util.as_float(self.momentum) * vel_tm1 - update 195 | if self.nesterov: 196 | # see http://arxiv.org/pdf/1212.0901v2.pdf (eq 7) and 197 | # https://github.com/lisa-lab/pylearn2/pull/136#issuecomment-10381617 198 | mom_sqr = util.as_float(self.momentum ** 2) 199 | mom_inc = util.as_float(1 + self.momentum) 200 | vel_t = mom_sqr * vel_tm1 - mom_inc * update 201 | yield vel_tm1, vel_t 202 | yield param, param + vel_t 203 | 204 | def _get_updates_for(self, param, grad): 205 | '''Generate some update pairs for the given model parameter. 
206 | 207 | Yields 208 | ------ 209 | updates : (parameter, expression) tuples 210 | A sequence of parameter updates to be applied during optimization. 211 | ''' 212 | raise NotImplementedError 213 | 214 | def _differentiate(self, params=None): 215 | '''Return a sequence of gradients for our parameters. 216 | 217 | If this optimizer has been configured with a gradient norm limit, or 218 | with elementwise gradient clipping, this method applies the appropriate 219 | rescaling and clipping operations before returning the gradient. 220 | 221 | Parameters 222 | ---------- 223 | params : list of Theano variables, optional 224 | Return the gradient with respect to these parameters. Defaults to 225 | all parameters that the optimizer knows about. 226 | 227 | Yields 228 | ------ 229 | pairs : (param, grad) tuples 230 | Generates a sequence of tuples representing each of the parameters 231 | requested and the corresponding Theano gradient expressions. 232 | ''' 233 | if params is None: 234 | params = self._params 235 | for param, grad in zip(params, TT.grad(self._loss, params)): 236 | if self.max_gradient_elem > 0: 237 | limit = util.as_float(self.max_gradient_elem) 238 | yield param, TT.clip(grad, -limit, limit) 239 | elif self.max_gradient_norm > 0: 240 | norm = TT.sqrt((grad * grad).sum()) 241 | limit = util.as_float(self.max_gradient_norm) 242 | yield param, grad * TT.minimum(1, limit / norm) 243 | else: 244 | yield param, grad 245 | 246 | def set_params(self, targets=None): 247 | '''Set the values of the parameters to the given target values. 248 | 249 | Parameters 250 | ---------- 251 | targets : sequence of ndarray, optional 252 | Arrays for setting the parameters of our model. If this is not 253 | provided, the current best parameters for this optimizer will be 254 | used. 255 | ''' 256 | if not isinstance(targets, (list, tuple)): 257 | targets = self._best_params 258 | for param, target in zip(self._params, targets): 259 | param.set_value(target) 260 | 261 | def _log(self, monitors, iteration, label='', suffix=''): 262 | '''Log the state of the optimizer on the console. 263 | 264 | Parameters 265 | ---------- 266 | monitors : OrderedDict 267 | A dictionary of monitor names mapped to values. These names and 268 | values are what is being logged. 269 | iteration : int 270 | Optimization iteration that we are logging. 271 | label : str, optional 272 | A label for the name of the optimizer creating the log line. 273 | Defaults to the name of the current class. 274 | suffix : str, optional 275 | A suffix to add to the end of the log line, if any. 276 | ''' 277 | label = label or self.__class__.__name__ 278 | fields = (('{}={:.6f}').format(k, v) for k, v in monitors.items()) 279 | util.log('{} {} {}{}'.format(label, iteration, ' '.join(fields), suffix)) 280 | 281 | def evaluate(self, dataset): 282 | '''Evaluate the current model parameters on a dataset. 283 | 284 | Parameters 285 | ---------- 286 | dataset : :class:`Dataset ` 287 | A set of data to use for evaluating the model. 288 | 289 | Returns 290 | ------- 291 | monitors : OrderedDict 292 | A dictionary mapping monitor names to values. Monitors are 293 | quantities of interest during optimization---for example, loss 294 | function, accuracy, or whatever the optimization task requires. 
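
        For example, once the optimizer's functions have been compiled by a
        prior call to :func:`iterate`, typical use looks like this sketch
        (``opt`` and ``valid`` are hypothetical names)::

            monitors = opt.evaluate(valid)
            print(monitors['loss'])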
295 | ''' 296 | if dataset is None: 297 | values = [self.f_eval()] 298 | else: 299 | values = [self.f_eval(*x) for x in dataset] 300 | monitors = zip(self._monitor_names, np.mean(values, axis=0)) 301 | return collections.OrderedDict(monitors) 302 | 303 | def _test_patience(self, monitors): 304 | '''Test whether our patience with optimization has elapsed. 305 | 306 | Parameters 307 | ---------- 308 | monitors : dict 309 | A dictionary mapping monitor names to values. The 'loss' key from 310 | this dictionary will be used to evaluate optimization progress. 311 | 312 | Returns 313 | ------- 314 | elapsed : bool 315 | True iff our patience has elapsed and the model is no longer 316 | improving. 317 | ''' 318 | self._curr_iter += 1 319 | marker = '' 320 | loss = monitors['loss'] 321 | if self._best_loss - loss > self._best_loss * self.min_improvement: 322 | self._best_loss = loss 323 | self._best_iter = self._curr_iter 324 | self._best_params = [p.get_value().copy() for p in self._params] 325 | marker = ' *' 326 | self._log(monitors, self._curr_iter - 1, 'validation', marker) 327 | return self._curr_iter - self._best_iter > self.patience 328 | 329 | def _prepare(self, **kwargs): 330 | '''Set up properties for optimization. 331 | 332 | This method can be overridden by base classes to provide parameters that 333 | are specific to a particular optimization technique (e.g., setting up a 334 | learning rate value). 335 | ''' 336 | self.learning_rate = util.as_float(kwargs.pop('learning_rate', 1e-4)) 337 | self.momentum = kwargs.pop('momentum', 0) 338 | self.nesterov = kwargs.pop('nesterov', False) 339 | self.patience = kwargs.get('patience', 5) 340 | self.validate_every = kwargs.pop('validate_every', 10) 341 | self.min_improvement = kwargs.pop('min_improvement', 0) 342 | self.max_gradient_norm = kwargs.pop('max_gradient_norm', 0) 343 | self.max_gradient_elem = kwargs.pop('max_gradient_elem', 0) 344 | 345 | util.log_param('patience', self.patience) 346 | util.log_param('validate_every', self.validate_every) 347 | util.log_param('min_improvement', self.min_improvement) 348 | util.log_param('max_gradient_norm', self.max_gradient_norm) 349 | util.log_param('max_gradient_elem', self.max_gradient_elem) 350 | util.log_param('learning_rate', self.learning_rate) 351 | util.log_param('momentum', self.momentum) 352 | util.log_param('nesterov', self.nesterov) 353 | 354 | def iterate(self, train=None, valid=None, max_updates=None, **kwargs): 355 | r'''Optimize a loss iteratively using a training and validation dataset. 356 | 357 | This method yields a series of monitor values to the caller. After every 358 | optimization epoch, a pair of monitor dictionaries is generated: one 359 | evaluated on the training dataset during the epoch, and another 360 | evaluated on the validation dataset at the most recent validation epoch. 361 | 362 | The validation monitors might not be updated during every optimization 363 | iteration; in this case, the most recent validation monitors will be 364 | yielded along with the training monitors. 365 | 366 | Additional keyword arguments supplied here will set the global 367 | optimizer attributes. 368 | 369 | Parameters 370 | ---------- 371 | train : sequence or :class:`Dataset ` 372 | A set of training data for computing updates to model parameters. 373 | valid : sequence or :class:`Dataset ` 374 | A set of validation data for computing monitor values and 375 | determining when the loss has stopped improving. Defaults to the 376 | training data. 
377 | max_updates : int, optional 378 | If specified, halt optimization after this many gradient updates 379 | have been processed. If not provided, uses early stopping to decide 380 | when to halt. 381 | 382 | Yields 383 | ------ 384 | train_monitors : dict 385 | A dictionary mapping monitor names to values, evaluated on the 386 | training dataset. 387 | valid_monitors : dict 388 | A dictionary containing monitor values evaluated on the validation 389 | dataset. 390 | ''' 391 | self._compile(**kwargs) 392 | 393 | if valid is None: 394 | valid = train 395 | iteration = 0 396 | training = validation = None 397 | while max_updates is None or iteration < max_updates: 398 | if not iteration % self.validate_every: 399 | try: 400 | validation = self.evaluate(valid) 401 | except KeyboardInterrupt: 402 | util.log('interrupted!') 403 | break 404 | if self._test_patience(validation): 405 | util.log('patience elapsed!') 406 | break 407 | try: 408 | training = self._step(train) 409 | except KeyboardInterrupt: 410 | util.log('interrupted!') 411 | break 412 | iteration += 1 413 | self._log(training, iteration) 414 | yield training, validation 415 | self.set_params('best') 416 | 417 | def minimize(self, *args, **kwargs): 418 | '''Optimize our loss exhaustively. 419 | 420 | This method is a thin wrapper over the :func:`iterate` method. It simply 421 | exhausts the iterative optimization process and returns the final 422 | monitor values. 423 | 424 | Returns 425 | ------- 426 | train_monitors : dict 427 | A dictionary mapping monitor names to values, evaluated on the 428 | training dataset. 429 | valid_monitors : dict 430 | A dictionary containing monitor values evaluated on the validation 431 | dataset. 432 | ''' 433 | monitors = None 434 | for monitors in self.iterate(*args, **kwargs): 435 | pass 436 | return monitors 437 | 438 | def _step(self, dataset): 439 | '''Advance the state of the optimizer by one step. 440 | 441 | Parameters 442 | ---------- 443 | dataset : :class:`Dataset ` 444 | A dataset for optimizing the model. 445 | 446 | Returns 447 | ------- 448 | train_monitors : dict 449 | A dictionary mapping monitor names to values. 450 | ''' 451 | if dataset is None: 452 | values = [self.f_step()] 453 | else: 454 | values = [self.f_step(*x) for x in dataset] 455 | return collections.OrderedDict( 456 | zip(self._monitor_names, np.mean(values, axis=0))) 457 | -------------------------------------------------------------------------------- /downhill/dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | r'''This module contains a class for handling batched datasets. 4 | 5 | In many optimization tasks, parameters must be updated by optimizing them with 6 | respect to estimates of a loss function. The loss function for many problems is 7 | estimated using a set of data that we have measured. 8 | ''' 9 | 10 | import collections 11 | import numpy as np 12 | import theano 13 | 14 | from . import util 15 | 16 | 17 | class Dataset: 18 | '''This class handles batching and shuffling a dataset. 19 | 20 | In ``downhill``, losses are optimized using sets of data collected from the 21 | problem that generated the loss. 
22 | 23 | During optimization, data are grouped into "mini-batches"---that is, chunks 24 | that are larger than 1 sample and smaller than the entire set of samples; 25 | typically the size of a mini-batch is between 10 and 100, but the specific 26 | setting can be varied depending on your model, hardware, dataset, and so 27 | forth. These mini-batches must be presented to the optimization algorithm in 28 | pseudo-random order to match the underlying stochasticity assumptions of 29 | many optimization algorithms. This class handles the process of grouping 30 | data into mini-batches as well as iterating and shuffling these mini-batches 31 | dynamically as the dataset is consumed by the optimization algorithm. 32 | 33 | For many tasks, a dataset is obtained as a large block of sample data, which 34 | in Python is normally assembled as a ``numpy`` ndarray. To use this class on 35 | such a dataset, just pass in a list or tuple containing ``numpy`` arrays; 36 | the number of these arrays must match the number of inputs that your loss 37 | computation requires. 38 | 39 | There are some cases when a suitable set of training data would be 40 | prohibitively expensive to assemble in memory as a single ``numpy`` array. 41 | To handle these cases, this class can also handle a dataset that is provided 42 | via a Python callable. For more information on using callables to provide 43 | data to your model, see :ref:`data-using-callables`. 44 | 45 | Parameters 46 | ---------- 47 | inputs : callable or list of ndarray/sparse matrix/DataFrame/theano shared var 48 | One or more sets of data. 49 | 50 | If this parameter is callable, then mini-batches will be obtained by 51 | calling the callable with no arguments; the callable is expected to 52 | return a tuple of ndarray-like objects that will be suitable for 53 | optimizing the loss at hand. 54 | 55 | If this parameter is a list (or a tuple), it must contain array-like 56 | objects: ``numpy.ndarray``, ``scipy.sparse.csc_matrix``, 57 | ``scipy.sparse.csr_matrix``, ``pandas.DataFrame`` or ``theano.shared``. 58 | These are assumed to contain data for computing the loss, so the length 59 | of this tuple or list should match the number of inputs required by the 60 | loss computation. If multiple arrays are provided, their lengths along 61 | the axis given by the ``axis`` parameter (defaults to 0) must match. 62 | 63 | name : str, optional 64 | A string that is used to describe this dataset. Usually something like 65 | 'test' or 'train'. 66 | 67 | batch_size : int, optional 68 | The size of the mini-batches to create from the data sequences. If this 69 | is negative or zero, all data in the dataset will be used in one batch. 70 | Defaults to 32. This parameter has no effect if ``inputs`` is callable. 71 | 72 | iteration_size : int, optional 73 | The number of batches to yield for each call to iterate(). Defaults to 74 | the length of the data divided by batch_size. If ``inputs`` is a 75 | callable, this defaults to ``len(inputs)`` when the callable defines a 76 | length, and to 100 otherwise. 77 | 78 | axis : int, optional 79 | The axis along which to split the data arrays, if the first parameter is 80 | given as one or more ndarrays. If not provided, defaults to 0. 81 | 82 | rng : :class:`numpy.random.RandomState` or int, optional 83 | A random number generator, or an integer seed for a random number 84 | generator. If not provided, the random number generator will be created 85 | with an automatically chosen seed.
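Examples
--------
A minimal sketch (the array shape and batch size here are illustrative
only): wrapping a single ``numpy`` array with ``batch_size=10`` yields
ten mini-batches of ten rows each on every pass over the data::

    import numpy as np
    import downhill

    ds = downhill.Dataset([np.random.randn(100, 5)], batch_size=10)
    for (batch,) in ds:
        print(batch.shape)  # prints (10, 5) ten times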
86 | ''' 87 | 88 | _count = 0 89 | 90 | def __init__(self, inputs, name=None, batch_size=32, iteration_size=None, 91 | axis=0, rng=None): 92 | self.name = name or 'dataset{}'.format(Dataset._count) 93 | Dataset._count += 1 94 | self.batch_size = batch_size 95 | self.iteration_size = iteration_size 96 | self.rng = rng 97 | if rng is None or isinstance(rng, int): 98 | self.rng = np.random.RandomState(rng) 99 | 100 | self._inputs = None 101 | self._slices = None 102 | self._callable = None 103 | 104 | if isinstance(inputs, collections.Callable): 105 | self._init_callable(inputs) 106 | else: 107 | self._init_arrays(inputs, axis) 108 | 109 | def _init_callable(self, inputs): 110 | self._callable = inputs 111 | if not self.iteration_size: 112 | try: 113 | self.iteration_size = len(inputs) 114 | except (TypeError, AttributeError): # has no len 115 | self.iteration_size = 100 116 | util.log('{0.name}: {0.iteration_size} mini-batches from callable', self) 117 | 118 | def _init_arrays(self, inputs, axis=0): 119 | if not isinstance(inputs, (tuple, list)): 120 | inputs = (inputs, ) 121 | 122 | shapes = [] 123 | self._inputs = [] 124 | for i, x in enumerate(inputs): 125 | self._inputs.append(x) 126 | if isinstance(x, np.ndarray): 127 | shapes.append(x.shape) 128 | continue 129 | if isinstance(x, theano.compile.SharedVariable): 130 | shapes.append(x.get_value(borrow=True).shape) 131 | continue 132 | if 'pandas.' in str(type(x)): # hacky but prevents a global import 133 | import pandas as pd 134 | if isinstance(x, (pd.Series, pd.DataFrame)): 135 | shapes.append(x.shape) 136 | continue 137 | if 'scipy.sparse.' in str(type(x)): # same here 138 | import scipy.sparse as ss 139 | if isinstance(x, (ss.csr.csr_matrix, ss.csc.csc_matrix)): 140 | shapes.append(x.shape) 141 | continue 142 | raise ValueError( 143 | 'input {} (type {}) must be numpy.ndarray, theano.shared, ' 144 | 'a scipy.sparse matrix, or pandas.{{Series,DataFrame}}'.format(i, type(x))) 145 | 146 | L = shapes[0][axis] 147 | assert all(L == s[axis] for s in shapes), \ 148 | 'shapes do not match along axis {}: {}'.format( 149 | axis, '; '.join(str(s) for s in shapes)) 150 | 151 | B = L if self.batch_size <= 0 else self.batch_size 152 | 153 | self._index = 0 154 | self._slices = [] 155 | for i in range(0, L, B): 156 | where = [] 157 | for shape in shapes: 158 | slices = [slice(None) for _ in shape] 159 | slices[axis] = slice(i, min(L, i + B)) 160 | where.append(tuple(slices)) 161 | self._slices.append(where) 162 | 163 | self.shuffle() 164 | 165 | if not self.iteration_size: 166 | self.iteration_size = len(self._slices) 167 | 168 | util.log('{0.name}: {0.iteration_size} of {1} mini-batches from {2}', 169 | self, len(self._slices), '; '.join(str(s) for s in shapes)) 170 | 171 | def __iter__(self): 172 | return self.iterate(True) 173 | 174 | def shuffle(self): 175 | '''Shuffle the batches in the dataset. 176 | 177 | If this dataset was constructed using a callable, this method has no 178 | effect. 179 | ''' 180 | if self._slices is not None: 181 | self.rng.shuffle(self._slices) 182 | 183 | def iterate(self, shuffle=True): 184 | '''Iterate over batches in the dataset. 185 | 186 | This method generates ``iteration_size`` batches from the dataset and 187 | then returns. 188 | 189 | Parameters 190 | ---------- 191 | shuffle : bool, optional 192 | Shuffle the batches in this dataset if the iteration reaches the end 193 | of the batch list. Defaults to True.
194 | 195 | Yields 196 | ------ 197 | batches : data batches 198 | A sequence of batches---often from a training, validation, or test 199 | dataset. 200 | ''' 201 | for _ in range(self.iteration_size): 202 | if self._callable is not None: 203 | yield self._callable() 204 | else: 205 | yield self._next_batch(shuffle) 206 | 207 | def _next_batch(self, shuffle=True): 208 | batch = [x.iloc[i] if hasattr(x, 'iloc') else x[i] 209 | for x, i in zip(self._inputs, self._slices[self._index])] 210 | self._index += 1 211 | if self._index >= len(self._slices): 212 | if shuffle: 213 | self.shuffle() 214 | self._index = 0 215 | return batch 216 | -------------------------------------------------------------------------------- /downhill/first_order.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | '''This module defines first-order gradient descent optimizers.''' 4 | 5 | from .base import Optimizer 6 | 7 | __all__ = ['SGD', 'NAG'] 8 | 9 | 10 | class SGD(Optimizer): 11 | r'''Basic optimization using stochastic gradient descent. 12 | 13 | Parameters 14 | ---------- 15 | learning_rate: float, optional (default 1e-4) 16 | Step size to take during optimization. 17 | momentum: float, optional (default 0) 18 | Momentum to apply to the updates, if any. Defaults to 0 (no momentum). 19 | Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of 20 | momentum. 21 | nesterov: bool, optional (default False) 22 | Set this to ``True`` to enable Nesterov-style momentum updates, whenever 23 | ``momentum`` is nonzero. 24 | 25 | Notes 26 | ----- 27 | 28 | A stochastic gradient trainer with momentum :math:`\mu` and learning rate 29 | :math:`\alpha` updates parameter :math:`\theta` at step :math:`t` by 30 | blending the current "velocity" :math:`v` with the current gradient 31 | :math:`\frac{\partial\mathcal{L}}{\partial\theta}`: 32 | 33 | .. math:: 34 | \begin{eqnarray*} 35 | v_{t+1} &=& \mu v_t - \alpha \frac{\partial\mathcal{L}}{\partial\theta} \\ 36 | \theta_{t+1} &=& \theta_t + v_{t+1} 37 | \end{eqnarray*} 38 | 39 | Without momentum (i.e., when :math:`\mu = 0`), these updates reduce to 40 | :math:`\theta_{t+1} = \theta_t - \alpha \frac{\partial\mathcal{L}}{\partial\theta}`, 41 | which just takes steps downhill according to the local gradient. 42 | 43 | Adding the momentum term permits the algorithm to take previous steps into 44 | account as well, which in practice is thought to have the effect 45 | of capturing some information about second-order derivatives of the loss 46 | surface. 47 | 48 | References 49 | ---------- 50 | 51 | .. [Rume86] D. E. Rumelhart, G. E. Hinton, & R. J. Williams. (1986) 52 | "Learning representations by back-propagating errors". Nature 323 53 | (6088):533–536. doi:10.1038/323533a0 54 | http://www.nature.com/nature/journal/v323/n6088/abs/323533a0.html 55 | ''' 56 | 57 | def _get_updates_for(self, param, grad): 58 | yield param, self.learning_rate * grad 59 | 60 | 61 | class NAG(SGD): 62 | r'''Stochastic gradient optimization with Nesterov momentum. 63 | 64 | This class name is an abbreviation for "Nesterov's Accelerated Gradient." 65 | Note that the ``momentum`` parameter must be given during optimization for 66 | Nesterov momentum to be employed; by default ``momentum`` is 0 and so no 67 | momentum is used. 68 | 69 | Parameters 70 | ---------- 71 | learning_rate: float, optional (default 1e-4) 72 | Step size to take during optimization.
73 | 74 | momentum: float, optional (default 0) 75 | Momentum to apply to the updates, if any. Defaults to 0 (no momentum). 76 | Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of 77 | momentum. 78 | 79 | Notes 80 | ----- 81 | 82 | The basic difference between NAG and "classical" momentum in SGD 83 | optimization approaches is that NAG computes the gradients at the position 84 | in parameter space where "classical" momentum would put us at the *next* 85 | step. In classical :class:`SGD` with momentum :math:`\mu` and learning rate 86 | :math:`\alpha`, updates to parameter :math:`p` at step :math:`t` are 87 | computed by blending the current "velocity" :math:`v` with the current 88 | gradient :math:`\frac{\partial\mathcal{L}}{\partial p}`: 89 | 90 | .. math:: 91 | \begin{eqnarray*} 92 | v_{t+1} &=& \mu v_t - \alpha \frac{\partial\mathcal{L}}{\partial p} \\ 93 | p_{t+1} &=& p_t + v_{t+1} 94 | \end{eqnarray*} 95 | 96 | In contrast, NAG adjusts the update by blending the current "velocity" with 97 | the gradient at the next step---that is, the gradient is computed at the 98 | point where the velocity would have taken us: 99 | 100 | .. math:: 101 | \begin{eqnarray*} 102 | v_{t+1} &=& \mu v_t - \alpha \left. 103 | \frac{\partial\mathcal{L}}{\partial p}\right|_{p_t + \mu v_t} \\ 104 | p_{t+1} &=& p_t + v_{t+1} 105 | \end{eqnarray*} 106 | 107 | Again, the difference here is that the gradient is computed at the place in 108 | parameter space where we would have stepped using the classical technique, 109 | in the absence of a new gradient. 110 | 111 | In theory, this helps correct for oversteps during learning: If momentum 112 | would lead us to overshoot, then the gradient at that overshot place will 113 | point backwards, toward where we came from. See [Suts13]_ for a particularly 114 | clear exposition of this idea. 115 | 116 | References 117 | ---------- 118 | .. [Suts13] I. Sutskever, J. Martens, G. Dahl, & G. Hinton. (ICML 2013) "On 119 | the importance of initialization and momentum in deep learning." 120 | http://www.cs.toronto.edu/~fritz/absps/momentum.pdf 121 | 122 | .. [Nest83] Y. Nesterov. (1983) "A method of solving a convex programming 123 | problem with convergence rate O(1/sqr(k))." Soviet Mathematics Doklady, 124 | 27:372–376. 125 | ''' 126 | 127 | def iterate(self, *args, **kwargs): 128 | kwargs['nesterov'] = True 129 | return super(NAG, self).iterate(*args, **kwargs) 130 | -------------------------------------------------------------------------------- /downhill/util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | '''A module of utility functions and other goodies.''' 4 | 5 | import click 6 | import datetime 7 | import inspect 8 | import numpy as np 9 | import theano 10 | import theano.tensor as TT 11 | 12 | 13 | class Registrar(type): 14 | '''A metaclass that builds a registry of its subclasses.''' 15 | 16 | def __init__(cls, name, bases, dct): 17 | if not hasattr(cls, '_registry'): 18 | cls._registry = {} 19 | else: 20 | cls._registry[name.lower()] = cls 21 | super(Registrar, cls).__init__(name, bases, dct) 22 | 23 | def build(cls, key, *args, **kwargs): 24 | return cls._registry[key.lower()](*args, **kwargs) 25 | 26 | def is_registered(cls, key): 27 | return key.lower() in cls._registry 28 | 29 | 30 | def shared_like(param, suffix, init=0): 31 | '''Create a Theano shared variable like an existing parameter. 
32 | 33 | Parameters 34 | ---------- 35 | param : Theano variable 36 | Theano variable to use for shape information. 37 | suffix : str 38 | Suffix to append to the parameter's name for the new variable. 39 | init : float or ndarray, optional 40 | Initial value of the shared variable. Defaults to 0. 41 | 42 | Returns 43 | ------- 44 | shared : Theano shared variable 45 | A new shared variable with the same shape and data type as ``param``. 46 | ''' 47 | return theano.shared(np.zeros_like(param.get_value()) + init, 48 | name='{}_{}'.format(param.name, suffix), 49 | broadcastable=param.broadcastable) 50 | 51 | 52 | def as_float(x): 53 | '''Cast a floating point value to a Theano ``floatX`` symbol. 54 | 55 | Parameters 56 | ---------- 57 | x : float, ndarray, or Theano expression 58 | Some quantity to cast to floating point. 59 | 60 | Returns 61 | ------- 62 | x : Theano expression 63 | A symbolic variable cast as a ``floatX`` value. 64 | ''' 65 | return TT.cast(x, theano.config.floatX) 66 | 67 | 68 | def find_inputs_and_params(node): 69 | '''Walk a computation graph and extract root variables. 70 | 71 | Parameters 72 | ---------- 73 | node : Theano expression 74 | A symbolic Theano expression to walk. 75 | 76 | Returns 77 | ------- 78 | inputs : list of Theano variables 79 | A list of candidate inputs for this graph. Inputs are nodes in the 80 | graph that have no parents and are neither shared variables nor constants. 81 | params : list of Theano shared variables 82 | A list of candidate parameters for this graph. Parameters are nodes in 83 | the graph that are shared variables. 84 | ''' 85 | queue, seen, inputs, params = [node], set(), set(), set() 86 | while queue: 87 | node = queue.pop() 88 | seen.add(node) 89 | queue.extend(p for p in node.get_parents() if p not in seen) 90 | if not node.get_parents(): 91 | if isinstance(node, theano.compile.SharedVariable): 92 | params.add(node) 93 | elif not isinstance(node, TT.Constant): 94 | inputs.add(node) 95 | return list(inputs), list(params) 96 | 97 | 98 | _detailed_callsite = False 99 | 100 | 101 | def enable_detailed_callsite_logging(): 102 | '''Enable detailed callsite logging.''' 103 | global _detailed_callsite 104 | _detailed_callsite = True 105 | 106 | 107 | def log(msg, *args, **kwargs): 108 | '''Log a message to the console. 109 | 110 | Parameters 111 | ---------- 112 | msg : str 113 | A string to display on the console. This can contain {}-style 114 | formatting commands; the remaining positional and keyword arguments 115 | will be used to fill them in. 116 | ''' 117 | now = datetime.datetime.now() 118 | module = 'downhill' 119 | if _detailed_callsite: 120 | caller = inspect.stack()[1] 121 | parts = caller.filename.replace('.py', '').split('/') 122 | module = '{}:{}'.format( 123 | '.'.join(parts[parts.index('downhill')+1:]), caller.lineno) 124 | click.echo(' '.join(( 125 | click.style(now.strftime('%Y%m%d'), fg='blue'), 126 | click.style(now.strftime('%H%M%S'), fg='cyan'), 127 | click.style(module, fg='magenta'), 128 | msg.format(*args, **kwargs), 129 | ))) 130 | 131 | 132 | def log_param(name, value): 133 | '''Log a parameter value to the console. 134 | 135 | Parameters 136 | ---------- 137 | name : str 138 | Name of the parameter being logged. 139 | value : any 140 | Value of the parameter being logged.
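For example, ``log_param('patience', 5)`` emits a console line like
``setting patience = 5``, colorized via ``click``.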
141 | ''' 142 | log('setting {} = {}', click.style(str(name)), 143 | click.style(str(value), fg='yellow')) 144 | -------------------------------------------------------------------------------- /examples/mnist-sparse-factorization.py: -------------------------------------------------------------------------------- 1 | import downhill 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import skdata.mnist 5 | import theano 6 | import theano.tensor as TT 7 | 8 | FLOAT = 'df'[theano.config.floatX == 'float32'] 9 | 10 | 11 | def load_mnist(): 12 | '''Load the MNIST digits dataset.''' 13 | mnist = skdata.mnist.dataset.MNIST() 14 | mnist.meta # trigger download if needed. 15 | 16 | def arr(n, dtype): 17 | arr = mnist.arrays[n] 18 | return arr.reshape((len(arr), -1)).astype(dtype) 19 | train_images = arr('train_images', np.float32) / 128 - 1 20 | train_labels = arr('train_labels', np.uint8) 21 | return ((train_images[:50000], train_labels[:50000, 0]), 22 | (train_images[50000:], train_labels[50000:, 0])) 23 | 24 | 25 | def plot_images(imgs, loc=111, title=None, channels=1): 26 | '''Plot an array of images. 27 | 28 | We assume that we are given a matrix of data whose shape is (n*n, s*s*c) -- 29 | that is, there are n^2 images along the first axis of the array, and each 30 | image is c squares measuring s pixels on a side. Each row of the input will 31 | be plotted as a sub-region within a single image array containing an n x n 32 | grid of images. 33 | ''' 34 | n = int(np.sqrt(len(imgs))) 35 | assert n * n == len(imgs), 'images array must contain a square number of rows!' 36 | s = int(np.sqrt(len(imgs[0]) / channels)) 37 | assert s * s == len(imgs[0]) / channels, 'images must be square!' 38 | 39 | img = np.zeros((s * n, s * n, channels), dtype=imgs[0].dtype) 40 | for i, pix in enumerate(imgs): 41 | r, c = divmod(i, n) 42 | img[r * s:(r+1) * s, c * s:(c+1) * s] = pix.reshape((s, s, channels)) 43 | 44 | img -= img.min() 45 | img /= img.max() 46 | 47 | ax = plt.gcf().add_subplot(loc) 48 | ax.xaxis.set_visible(False) 49 | ax.yaxis.set_visible(False) 50 | ax.set_frame_on(False) 51 | ax.imshow(img.squeeze(), cmap=plt.cm.gray) 52 | if title: 53 | ax.set_title(title) 54 | 55 | 56 | (t_images, t_labels), (v_images, v_labels) = load_mnist() 57 | 58 | # construct training/validation sets consisting of the fours. 
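# (a single digit class keeps this factorization demo small and makes the
# learned dictionary easier to inspect visually.)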
59 | train = t_images[t_labels == 4] 60 | valid = v_images[v_labels == 4] 61 | 62 | N = 20 63 | K = 20 64 | B = 784 65 | 66 | x = TT.matrix('x') 67 | 68 | u = theano.shared(np.random.randn(N * N, K * K).astype(FLOAT), name='u') 69 | v = theano.shared(np.random.randn(K * K, B).astype(FLOAT), name='v') 70 | 71 | err = TT.sqr(x - TT.dot(u, v)).mean() 72 | 73 | downhill.minimize( 74 | loss=err + 100 * (0.01 * abs(u).mean() + (v * v).mean()), 75 | params=[u, v], 76 | inputs=[x], 77 | train=train, 78 | valid=valid, 79 | batch_size=N * N, 80 | monitor_gradients=True, 81 | monitors=[ 82 | ('err', err), 83 | ('u<-0.5', (u < -0.5).mean()), 84 | ('u<-0.1', (u < -0.1).mean()), 85 | ('u<0.1', (u < 0.1).mean()), 86 | ('u<0.5', (u < 0.5).mean()), 87 | ], 88 | algo='sgd', 89 | max_gradient_elem=1, 90 | learning_rate=0.5, 91 | momentum=0.9, 92 | patience=3, 93 | min_improvement=0.1, 94 | ) 95 | 96 | plot_images(v.get_value(), 121) 97 | plot_images(np.dot(u.get_value(), v.get_value()), 122) 98 | plt.show() 99 | -------------------------------------------------------------------------------- /examples/rosenbrock-100d.py: -------------------------------------------------------------------------------- 1 | '''Optimization example using the 100-dimensional Rosenbrock "banana" function. 2 | 3 | This example trains up several optimization algorithms with randomly chosen 4 | hyperparameters and shows four scatter plots of the performance spectrum of each 5 | hyperparameter. 6 | 7 | This example is meant to show how optimization hyperparameters affect 8 | performance across different optimization algorithms. 9 | 10 | Due to the large number of optimizers that are evaluated in this example, it can 11 | take a good while to run. 12 | ''' 13 | 14 | import itertools 15 | import matplotlib.pyplot as plt 16 | import numpy as np 17 | 18 | import rosenbrock 19 | 20 | 21 | algos = 'NAG RMSProp Adam ADADELTA ESGD'.split() 22 | results = rosenbrock.test(algos, n=10, init=[-1] * 100, limit=1000) 23 | 24 | 25 | # Here we plot the marginal performance of each of the four 26 | # hyperparameters. These plots are intended to show how random 27 | # hyperparameter selection still gives a decent picture of how the different 28 | # algorithms perform.
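# each panel below scatters the loss reached by each run against one of the
# four hyperparameters (learning rate, momentum, RMS halflife, and RMS
# regularizer), with one color per algorithm.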
29 | 30 | _, ((rate_ax, mu_ax), (half_ax, reg_ax)) = plt.subplots(2, 2) 31 | 32 | by_algo = itertools.groupby(sorted(results), lambda item: item[0][0]) 33 | for color, (algo, items) in zip(rosenbrock.COLORS, by_algo): 34 | items = list(items) 35 | values = np.zeros((len(items), 5), 'f') 36 | for i, ((_, rate, mu, half, reg), (_, _, loss)) in enumerate(items): 37 | values[i] = [rate, mu, half, reg, loss] 38 | rates, mus, halfs, regs, losses = values.T 39 | kw = dict(alpha=0.8, markersize=5, mew=2, mfc='none', mec=color) 40 | rate_ax.plot(rates, losses, 'o', label=algo, **kw) 41 | mu_ax.plot(mus, losses, 'o', label=algo, **kw) 42 | half_ax.plot(halfs, losses, 'o', label=algo, **kw) 43 | reg_ax.plot(regs, losses, 'o', label=algo, **kw) 44 | 45 | for ax in [rate_ax, mu_ax, half_ax, reg_ax]: 46 | ax.set_yscale('log') 47 | ax.set_ylim(None, 4e4) 48 | ax.xaxis.tick_bottom() 49 | ax.yaxis.tick_left() 50 | ax.spines['top'].set_color('none') 51 | ax.spines['right'].set_color('none') 52 | ax.spines['bottom'].set_position(('outward', 3)) 53 | ax.spines['left'].set_position(('outward', 3)) 54 | if ax != mu_ax: 55 | ax.set_xscale('log') 56 | 57 | rate_ax.set_ylabel('Loss') 58 | rate_ax.set_xlabel('Rate') 59 | 60 | mu_ax.set_xlabel('Momentum') 61 | mu_ax.set_xlim(-0.05, 1.05) 62 | 63 | half_ax.set_ylabel('Loss') 64 | half_ax.set_xlabel('RMS Halflife') 65 | 66 | reg_ax.set_xlabel('RMS Regularizer') 67 | 68 | plt.legend() 69 | plt.show() 70 | -------------------------------------------------------------------------------- /examples/rosenbrock-2d.py: -------------------------------------------------------------------------------- 1 | '''Optimization example using the two-dimensional Rosenbrock "banana" function. 2 | 3 | This example trains up several optimization algorithms and displays the 4 | performance of each algorithm across several different (randomly-chosen) 5 | hyperparameter settings. 6 | 7 | This example is meant to show how different optimization algorithms perform when 8 | given the same optimization problem. Many of the algorithms' performances are 9 | strongly dependent on the values of various hyperparameters, such as the 10 | learning rate and momentum values. 11 | ''' 12 | 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | 16 | import rosenbrock 17 | 18 | 19 | def by_loss(item): 20 | '''Helper for sorting optimization runs by their final loss value.''' 21 | label, (xs, ys, loss) = item 22 | return loss 23 | 24 | 25 | def make_label(loss, key): 26 | '''Create a legend label for an optimization run.''' 27 | algo, rate, mu, half, reg = key 28 | slots, args = ['{:.3f}', '{}', 'm={:.3f}'], [loss, algo, mu] 29 | if algo in 'SGD NAG RMSProp Adam ESGD'.split(): 30 | slots.append('lr={:.2e}') 31 | args.append(rate) 32 | if algo in 'RMSProp ADADELTA ESGD'.split(): 33 | slots.append('rmsh={}') 34 | args.append(half) 35 | slots.append('rmsr={:.2e}') 36 | args.append(reg) 37 | return ' '.join(slots).format(*args) 38 | 39 | 40 | # Here we run a number of rosenbrock optimization algorithms and measure their 41 | # performance. Below we plot the results. 
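# runs are sorted by final loss before plotting, so the legend below lists
# the best-performing hyperparameter settings first.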
42 | 43 | algos = 'SGD NAG RMSProp RProp Adam ADADELTA ESGD'.split() 44 | results = ((make_label(loss, key), xs, ys) 45 | for key, (xs, ys, loss) 46 | in sorted(rosenbrock.test(algos), key=by_loss)) 47 | 48 | _, ax = plt.subplots(1, 1) 49 | 50 | for color, (label, xs, ys) in zip(rosenbrock.COLORS, results): 51 | ax.plot(xs, ys, 'o-', color=color, label=label, 52 | alpha=0.8, lw=2, markersize=5, 53 | mew=1, mec=color, mfc='none') 54 | 55 | # make a contour plot of the rosenbrock function surface. 56 | X, Y = np.meshgrid(np.linspace(-1.3, 1.3, 31), np.linspace(-0.9, 1.7, 31)) 57 | Z = 100 * (Y - X ** 2) ** 2 + (1 - X) ** 2 58 | ax.plot([1], [1], 'x', mew=3, markersize=10, color='#111111') 59 | ax.contourf(X, Y, Z, np.logspace(-1, 3, 31), cmap='gray_r') 60 | 61 | ax.set_xlim(-1.3, 1.3) 62 | ax.set_ylim(-0.9, 1.7) 63 | 64 | plt.legend(loc='lower right') 65 | plt.show() 66 | -------------------------------------------------------------------------------- /examples/rosenbrock.py: -------------------------------------------------------------------------------- 1 | '''Helper functions for rosenbrock optimization examples.''' 2 | 3 | import downhill 4 | import numpy as np 5 | import theano 6 | 7 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 8 | 9 | COLORS = ('#d62728 #1f77b4 #2ca02c #9467bd #ff7f0e ' 10 | '#e377c2 #8c564b #bcbd22 #7f7f7f #17becf').split() 11 | 12 | FLOAT = 'df'[theano.config.floatX == 'float32'] 13 | 14 | 15 | def build(algo, init): 16 | '''Build and return an optimizer for the rosenbrock function. 17 | 18 | In downhill, an optimizer can be constructed using the build() top-level 19 | function. This function requires several Theano quantities such as the loss 20 | being optimized and the parameters to update during optimization. 21 | ''' 22 | x = theano.shared(np.array(init, FLOAT), name='x') 23 | n = 0.1 * RandomStreams().normal((len(init) - 1, )) 24 | monitors = [] 25 | if len(init) == 2: 26 | # this gives us access to the x and y locations during optimization. 27 | monitors.extend([('x', x[:-1].sum()), ('y', x[1:].sum())]) 28 | return downhill.build( 29 | algo, 30 | loss=(n + 100 * (x[1:] - x[:-1] ** 2) ** 2 + (1 - x[:-1]) ** 2).sum(), 31 | params=[x], 32 | monitors=monitors, 33 | monitor_gradients=True) 34 | 35 | 36 | def build_and_trace(algo, init, limit=100, **kwargs): 37 | '''Run an optimizer on the rosenbrock function. Return xs, ys, and losses. 38 | 39 | In downhill, optimization algorithms can be iterated over to progressively 40 | minimize the loss. At each iteration, the optimizer yields a dictionary of 41 | monitor values that were computed during that iteration. Here we build an 42 | optimizer and then run it for a fixed number of iterations. 43 | ''' 44 | kw = dict(min_improvement=0, patience=0, max_gradient_norm=100) 45 | kw.update(kwargs) 46 | xs, ys, loss = [], [], [] 47 | for tm, _ in build(algo, init).iterate([[]], **kw): 48 | if len(init) == 2: 49 | xs.append(tm['x']) 50 | ys.append(tm['y']) 51 | loss.append(tm['loss']) 52 | if len(loss) == limit: 53 | break 54 | # Return the optimization up to any failure of patience. 55 | return xs[:-9], ys[:-9], loss[-9] 56 | 57 | 58 | def test(algos, n=10, init=[-1.1, 0], limit=100): 59 | '''Run several optimizers for comparison. 60 | 61 | Each optimizer is run a fixed number of times with random hyperparameter 62 | values, and the results are yielded back to the caller (often stored in a 63 | dictionary). 
64 | 65 | Returns 66 | ------- 67 | results : sequence of (key, value) pairs 68 | A sequence of results from running tests. Each result contains a "key" 69 | that describes the test run and a "value" that contains the results from 70 | the run. The key is a tuple containing (a) the algorithm, (b) the 71 | learning rate, (c) the momentum, (d) the RMS halflife, and (e) the RMS 72 | regularizer. The value is a tuple containing (a) the x-values and (b) the 73 | y-values during the optimization, and (c) the loss value. (The x- and 74 | y-values are only non-empty for 2D experiments.) 75 | ''' 76 | for algo in algos: 77 | for _ in range(n): 78 | mu = max(0, np.random.uniform(0, 2) - 1) 79 | rate = np.exp(np.random.uniform(-8, -1)) 80 | half = int(np.exp(np.random.uniform(0, 4))) 81 | reg = np.exp(np.random.uniform(-12, 0)) 82 | yield (algo, rate, mu, half, reg), build_and_trace( 83 | algo, init, limit, momentum=mu, learning_rate=rate, 84 | rms_halflife=half, rms_regularizer=reg) 85 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [tool:pytest] 2 | #pep8ignore = E226,E302,E41 3 | pep8maxlinelength = 90 4 | 5 | [bdist_wheel] 6 | universal = 1 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import setuptools 3 | 4 | setuptools.setup( 5 | name='downhill', 6 | version='0.5.0pre', 7 | packages=setuptools.find_packages(), 8 | author='lmjohns3', 9 | author_email='downhill-users@googlegroups.com', 10 | description='Stochastic optimization routines for Theano', 11 | long_description=open(os.path.join( 12 | os.path.dirname(os.path.abspath(__file__)), 'README.rst')).read(), 13 | license='MIT', 14 | url='http://github.com/lmjohns3/downhill', 15 | keywords=('adadelta ' 16 | 'adam ' 17 | 'esgd ' 18 | 'gradient-descent ' 19 | 'nesterov ' 20 | 'optimization ' 21 | 'rmsprop ' 22 | 'sgd ' 23 | 'theano ' 24 | ), 25 | install_requires=['theano', 'click'], 26 | classifiers=[ 27 | 'Development Status :: 4 - Beta', 28 | 'Intended Audience :: Science/Research', 29 | 'License :: OSI Approved :: MIT License', 30 | 'Operating System :: OS Independent', 31 | 'Topic :: Scientific/Engineering', 32 | ], 33 | ) 34 | -------------------------------------------------------------------------------- /test/adaptive_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import util 4 | 5 | 6 | class TestESGD: 7 | def test_rosen(self): 8 | util.assert_progress(*util.build_rosen('esgd'), learning_rate=1e-6) 9 | 10 | def test_factor(self): 11 | util.assert_progress(*util.build_factor('esgd'), learning_rate=1e-6) 12 | 13 | def test_default_params(self): 14 | opt, data = util.build_rosen('esgd') 15 | assert opt.hv_method == 'rop' 16 | for _ in opt.iterate(data): 17 | assert np.allclose(opt.learning_rate.eval(), 1e-4) 18 | assert np.allclose(opt.ewma.eval(), np.exp(-np.log(2) / 14)) 19 | assert np.allclose(opt.epsilon.eval(), 1e-8) 20 | break 21 | 22 | def test_params(self): 23 | opt, data = util.build_rosen('esgd') 24 | opt.hv_method = 'lop' # TODO(leif): incorporate into downhill.build()?
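        # a halflife h passed as rms_halflife is stored as an ewma decay of
        # exp(-ln(2) / h); the assertions below check that conversion.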
25 | for _ in opt.iterate(data, 26 | learning_rate=0.3, 27 | rms_halflife=10, 28 | rms_regularizer=20): 29 | assert np.allclose(opt.learning_rate.eval(), 0.3) 30 | assert np.allclose(opt.ewma.eval(), np.exp(-np.log(2) / 10)) 31 | assert np.allclose(opt.epsilon.eval(), 20) 32 | break 33 | 34 | 35 | class TestRProp: 36 | def test_rosen(self): 37 | util.assert_progress(*util.build_rosen('rprop')) 38 | 39 | def test_factor(self): 40 | util.assert_progress(*util.build_factor('rprop')) 41 | 42 | def test_default_params(self): 43 | opt, data = util.build_rosen('rprop') 44 | for _ in opt.iterate(data): 45 | assert np.allclose(opt.learning_rate.eval(), 1e-4) 46 | assert np.allclose(opt.step_increase.eval(), 1.01) 47 | assert np.allclose(opt.step_decrease.eval(), 0.99) 48 | assert np.allclose(opt.min_step.eval(), 0) 49 | assert np.allclose(opt.max_step.eval(), 100) 50 | break 51 | 52 | def test_params(self): 53 | opt, data = util.build_rosen('rprop') 54 | for _ in opt.iterate(data, 55 | learning_rate=0.3, 56 | rprop_increase=22, 57 | rprop_decrease=101, 58 | rprop_min_step=50, 59 | rprop_max_step=-10): 60 | assert np.allclose(opt.learning_rate.eval(), 0.3) 61 | assert np.allclose(opt.step_increase.eval(), 22) 62 | assert np.allclose(opt.step_decrease.eval(), 101) 63 | assert np.allclose(opt.min_step.eval(), 50) 64 | assert np.allclose(opt.max_step.eval(), -10) 65 | break 66 | 67 | 68 | class TestADAGRAD: 69 | def test_rosen(self): 70 | util.assert_progress(*util.build_rosen('adagrad')) 71 | 72 | def test_factor(self): 73 | util.assert_progress(*util.build_factor('adagrad')) 74 | 75 | def test_default_params(self): 76 | opt, data = util.build_rosen('adagrad') 77 | for _ in opt.iterate(data): 78 | assert np.allclose(opt.learning_rate.eval(), 1e-4) 79 | assert np.allclose(opt.epsilon.eval(), 1e-8) 80 | break 81 | 82 | def test_params(self): 83 | opt, data = util.build_rosen('adagrad') 84 | for _ in opt.iterate(data, rms_regularizer=0.1): 85 | assert np.allclose(opt.learning_rate.eval(), 1e-4) 86 | assert np.allclose(opt.epsilon.eval(), 0.1) 87 | break 88 | 89 | 90 | class TestRMSProp: 91 | def test_rosen(self): 92 | util.assert_progress(*util.build_rosen('rmsprop')) 93 | 94 | def test_factor(self): 95 | util.assert_progress(*util.build_factor('rmsprop')) 96 | 97 | def test_default_params(self): 98 | opt, data = util.build_rosen('rmsprop') 99 | for _ in opt.iterate(data): 100 | assert np.allclose(opt.learning_rate.eval(), 1e-4) 101 | assert np.allclose(opt.ewma.eval(), np.exp(-np.log(2) / 14)) 102 | assert np.allclose(opt.epsilon.eval(), 1e-8) 103 | break 104 | 105 | def test_params(self): 106 | opt, data = util.build_rosen('rmsprop') 107 | for _ in opt.iterate(data, 108 | learning_rate=0.3, 109 | rms_halflife=10, 110 | rms_regularizer=20): 111 | assert np.allclose(opt.learning_rate.eval(), 0.3) 112 | assert np.allclose(opt.ewma.eval(), np.exp(-np.log(2) / 10)) 113 | assert np.allclose(opt.epsilon.eval(), 20) 114 | break 115 | 116 | 117 | class TestADADELTA: 118 | def test_rosen(self): 119 | util.assert_progress(*util.build_rosen('adadelta')) 120 | 121 | def test_factor(self): 122 | util.assert_progress(*util.build_factor('adadelta')) 123 | 124 | def test_default_params(self): 125 | opt, data = util.build_rosen('adadelta') 126 | for _ in opt.iterate(data): 127 | assert np.allclose(opt.ewma.eval(), np.exp(-np.log(2) / 14)) 128 | assert np.allclose(opt.epsilon.eval(), 1e-8) 129 | break 130 | 131 | def test_params(self): 132 | opt, data = util.build_rosen('adadelta') 133 | for _ in opt.iterate(data, 134 | 
rms_halflife=10, 135 | rms_regularizer=20): 136 | assert np.allclose(opt.ewma.eval(), np.exp(-np.log(2) / 10)) 137 | assert np.allclose(opt.epsilon.eval(), 20) 138 | break 139 | 140 | 141 | class TestAdam: 142 | def test_rosen(self): 143 | util.assert_progress(*util.build_rosen('adam')) 144 | 145 | def test_factor(self): 146 | util.assert_progress(*util.build_factor('adam')) 147 | 148 | def test_default_params(self): 149 | opt, data = util.build_rosen('adam') 150 | for _ in opt.iterate(data): 151 | assert np.allclose(opt.learning_rate.eval(), 1e-4) 152 | assert np.allclose(opt.beta1.eval(), np.exp(-np.log(2) / 7)) 153 | assert np.allclose(opt.beta2.eval(), np.exp(-np.log(2) / 69)) 154 | assert np.allclose(opt.epsilon.eval(), 1e-8) 155 | break 156 | 157 | def test_params(self): 158 | opt, data = util.build_rosen('adam') 159 | for _ in opt.iterate(data, 160 | learning_rate=0.3, 161 | beta1_halflife=10, 162 | beta2_halflife=20, 163 | rms_regularizer=11): 164 | assert np.allclose(opt.learning_rate.eval(), 0.3) 165 | assert np.allclose(opt.beta1.eval(), np.exp(-np.log(2) / 10)) 166 | assert np.allclose(opt.beta2.eval(), np.exp(-np.log(2) / 20)) 167 | assert np.allclose(opt.epsilon.eval(), 11) 168 | break 169 | -------------------------------------------------------------------------------- /test/base_test.py: -------------------------------------------------------------------------------- 1 | import downhill 2 | import numpy as np 3 | 4 | import util 5 | 6 | 7 | class TestBuild: 8 | def test_sgd(self): 9 | assert isinstance(util.build_rosen('sgd')[0], downhill.SGD) 10 | assert isinstance(util.build_factor('sgd')[0], downhill.SGD) 11 | 12 | def test_nag(self): 13 | assert isinstance(util.build_rosen('nag')[0], downhill.NAG) 14 | 15 | def test_rprop(self): 16 | assert isinstance(util.build_rosen('RProp')[0], downhill.RProp) 17 | 18 | def test_rmsprop(self): 19 | assert isinstance(util.build_rosen('RmsProp')[0], downhill.RMSProp) 20 | 21 | def test_adadelta(self): 22 | assert isinstance(util.build_rosen('ADADELTA')[0], downhill.ADADELTA) 23 | 24 | def test_esgd(self): 25 | assert isinstance(util.build_rosen('EsGd')[0], downhill.ESGD) 26 | 27 | def test_adam(self): 28 | assert isinstance(util.build_rosen('Adam')[0], downhill.Adam) 29 | 30 | 31 | class Straight(downhill.Optimizer): 32 | def _get_updates_for(self, param, grad): 33 | yield (param, param + 1.1) 34 | 35 | 36 | class TestOptimizer: 37 | def test_rosen(self): 38 | opt, train = util.build_rosen('straight') 39 | assert isinstance(opt, Straight) 40 | 41 | # run the optimizer for three iterations. check that the x and y values 42 | # (being monitored) increase at each iteration. 43 | for i, (tm, vm) in enumerate(opt.iterate(train, max_updates=3)): 44 | assert tm['x'] >= vm['x'] 45 | assert tm['y'] >= vm['y'] 46 | assert i < 3 47 | 48 | def test_rosen_unnamed(self): 49 | opt, train = util.build_rosen('straight', name=False, monitor_gradients=True) 50 | assert isinstance(opt, Straight) 51 | 52 | # run the optimizer for three iterations. check that the x and y values 53 | # (being monitored) increase at each iteration. 54 | for i, (tm, vm) in enumerate(opt.iterate(train, max_updates=3)): 55 | assert tm['x'] >= vm['x'] 56 | assert tm['y'] >= vm['y'] 57 | # check there's a manually-named parameter in here. 58 | assert 1 == sum(1 for k in tm if 'unnamed' in k), tm 59 | assert i < 3 60 | 61 | def test_factor(self): 62 | opt, train = util.build_factor('straight') 63 | assert isinstance(opt, Straight) 64 | 65 | # run the optimizer for two iterations. 
check that the u and v values 66 | # (being monitored) are reasonable at the start. 67 | for i, (tm, vm) in enumerate(opt.iterate(train)): 68 | assert abs(vm['u<1'] - 0.001) < 1e-5 69 | assert vm['u<-1'] == 0 70 | assert vm['v<1'] == 1 71 | assert vm['v<-1'] == 0 72 | if i == 2: 73 | break 74 | 75 | def test_gradient_clip(self): 76 | opt, data = util.build_rosen('straight') 77 | for _ in opt.iterate(data, max_gradient_elem=3): 78 | assert opt.max_gradient_elem == 3 79 | break 80 | 81 | def test_set_params(self): 82 | opt, _ = util.build_rosen('straight') 83 | opt.set_params([[1, 2]]) 84 | assert np.allclose(opt._params[0].get_value(), [1, 2]) 85 | 86 | def test_set_best_params(self): 87 | opt, _ = util.build_rosen('straight') 88 | opt._best_params = [[1, 2]] 89 | opt.set_params('best') 90 | assert np.allclose(opt._params[0].get_value(), [1, 2]) 91 | -------------------------------------------------------------------------------- /test/dataset_test.py: -------------------------------------------------------------------------------- 1 | import downhill 2 | import numpy as np 3 | import theano 4 | import theano.tensor as TT 5 | 6 | 7 | def assert_size(ds, i, expected): 8 | s = ds._slices[i][0][0] 9 | assert s.stop - s.start == expected 10 | 11 | 12 | class TestDataset: 13 | def test_rng(self): 14 | ds = downhill.Dataset([np.random.randn(40, 2)], rng=4) 15 | assert ds.rng.randint(10) == 7 16 | ds = downhill.Dataset([np.random.randn(40, 2)], rng=np.random.RandomState(4)) 17 | assert ds.rng.randint(10) == 7 18 | 19 | def test_name(self): 20 | ds = downhill.Dataset([np.random.randn(40, 2)], name='foo') 21 | assert ds.name == 'foo' 22 | ds = downhill.Dataset([np.random.randn(40, 2)]) 23 | assert ds.name.startswith('dataset') 24 | assert ds.name[7:].isdigit() 25 | 26 | def test_batch_size(self): 27 | ds = downhill.Dataset([np.random.randn(40, 2)], batch_size=10, rng=4) 28 | assert len(ds._slices) == 4 29 | assert_size(ds, 0, 10) 30 | assert_size(ds, 1, 10) 31 | assert_size(ds, 2, 10) 32 | assert_size(ds, 3, 10) 33 | ds = downhill.Dataset([np.random.randn(40, 2)], batch_size=11, rng=4) 34 | assert len(ds._slices) == 4 35 | assert_size(ds, 0, 11) 36 | assert_size(ds, 1, 11) 37 | assert_size(ds, 2, 7) 38 | assert_size(ds, 3, 11) 39 | 40 | def test_batch_size_zero(self): 41 | ds = downhill.Dataset([np.random.randn(40, 2)], batch_size=0, rng=4) 42 | assert len(ds._slices) == 1 43 | assert_size(ds, 0, 40) 44 | 45 | def test_iteration_size(self): 46 | def batches_unchanged(previous): 47 | return all(a == b for a, b in zip(ds._slices, previous)) 48 | 49 | ds = downhill.Dataset([np.random.randn(40, 2)], 50 | batch_size=5, iteration_size=3) 51 | 52 | previous = list(ds._slices) 53 | c = sum(1 for _ in ds) 54 | assert c == 3, 'got {}'.format(c) 55 | assert ds._index == 3, 'got {}'.format(ds._index) 56 | assert batches_unchanged(previous) 57 | 58 | previous = list(ds._slices) 59 | c = sum(1 for _ in ds) 60 | assert c == 3 61 | assert ds._index == 6, 'got {}'.format(ds._index) 62 | assert batches_unchanged(previous) 63 | 64 | previous = list(ds._slices) 65 | c = sum(1 for _ in ds) 66 | assert c == 3 67 | assert ds._index == 1, 'got {}'.format(ds._index) 68 | assert not batches_unchanged(previous) 69 | 70 | def test_callable(self): 71 | def batches(): 72 | return 'hello' 73 | ds = downhill.Dataset(batches, iteration_size=10) 74 | assert list(ds) == ['hello'] * 10 75 | 76 | def test_callable_length(self): 77 | class Batches: 78 | called = 0 79 | 80 | def __call__(self): 81 | self.called += 1 82 | return 'hello' 
83 | 84 | def __len__(self): 85 | return 10 86 | 87 | batches = Batches() 88 | ds = downhill.Dataset(batches, iteration_size=10) 89 | assert list(ds) == ['hello'] * 10 90 | assert batches.called == 10 91 | 92 | def test_shared(self): 93 | x = theano.shared(np.random.randn(40, 2)) 94 | ds = downhill.Dataset([x], batch_size=10, rng=4) 95 | assert len(ds._slices) == 4 96 | assert_size(ds, 0, 10) 97 | assert_size(ds, 1, 10) 98 | assert_size(ds, 2, 10) 99 | assert_size(ds, 3, 10) 100 | f = list(ds)[0][0] 101 | assert isinstance(f, TT.TensorVariable), type(f) 102 | 103 | def test_pandas(self): 104 | import pandas as pd 105 | x = pd.DataFrame(np.random.randn(40, 2)) 106 | ds = downhill.Dataset([x], batch_size=10, rng=4) 107 | assert len(ds._slices) == 4 108 | assert_size(ds, 0, 10) 109 | assert_size(ds, 1, 10) 110 | assert_size(ds, 2, 10) 111 | assert_size(ds, 3, 10) 112 | f = list(ds)[0][0] 113 | assert isinstance(f, pd.DataFrame), type(f) 114 | 115 | def test_sparse_csc(self): 116 | import scipy.sparse as ss 117 | x = ss.csc_matrix(np.random.randn(40, 2)) 118 | ds = downhill.Dataset([x], batch_size=10, rng=4) 119 | assert len(ds._slices) == 4 120 | assert_size(ds, 0, 10) 121 | assert_size(ds, 1, 10) 122 | assert_size(ds, 2, 10) 123 | assert_size(ds, 3, 10) 124 | f = list(ds)[0][0] 125 | assert isinstance(f, ss.csc.csc_matrix), type(f) 126 | 127 | def test_sparse_csr(self): 128 | import scipy.sparse as ss 129 | x = ss.csr_matrix(np.random.randn(40, 2)) 130 | ds = downhill.Dataset([x], batch_size=10, rng=4) 131 | assert len(ds._slices) == 4 132 | assert_size(ds, 0, 10) 133 | assert_size(ds, 1, 10) 134 | assert_size(ds, 2, 10) 135 | assert_size(ds, 3, 10) 136 | f = list(ds)[0][0] 137 | assert isinstance(f, ss.csr.csr_matrix), type(f) 138 | 139 | def test_bad_input_type(self): 140 | try: 141 | downhill.Dataset([[1]]) 142 | assert False 143 | except ValueError: 144 | pass 145 | -------------------------------------------------------------------------------- /test/downhill_test.py: -------------------------------------------------------------------------------- 1 | import downhill 2 | import numpy as np 3 | import theano 4 | 5 | 6 | class TestMinimize: 7 | def test_minimize(self): 8 | x = theano.shared(-3 + np.zeros((2, ), 'f'), name='x') 9 | data = downhill.Dataset(np.zeros((1, 1), 'f'), batch_size=1) 10 | data._slices = [[]] 11 | downhill.minimize( 12 | (100 * (x[1:] - x[:-1] ** 2) ** 2 + (1 - x[:-1]) ** 2).sum(), 13 | data, 14 | algo='nag', 15 | learning_rate=0.001, 16 | momentum=0.9, 17 | patience=1, 18 | min_improvement=0.1, 19 | max_gradient_norm=1, 20 | ) 21 | assert np.allclose(x.get_value(), [1, 1]), x.get_value() 22 | -------------------------------------------------------------------------------- /test/first_order_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import util 4 | 5 | 6 | class TestSGD: 7 | def test_rosen(self): 8 | util.assert_progress( 9 | *util.build_rosen('sgd'), 10 | monitor_gradients=True) 11 | 12 | def test_factor(self): 13 | util.assert_progress( 14 | *util.build_factor('sgd'), 15 | max_gradient_elem=1, 16 | nesterov=False) 17 | 18 | def test_factor_nesterov(self): 19 | util.assert_progress( 20 | *util.build_factor('sgd'), 21 | max_gradient_norm=1) 22 | 23 | def test_default_params(self): 24 | opt, data = util.build_rosen('sgd') 25 | for _ in opt.iterate(data): 26 | assert opt.nesterov is False 27 | assert np.allclose(opt.learning_rate.eval(), 1e-4) 28 | assert np.allclose(opt.momentum, 0) 29 | 
assert np.allclose(opt.patience, 5) 30 | assert np.allclose(opt.min_improvement, 0) 31 | assert np.allclose(opt.max_gradient_norm, 0) 32 | assert np.allclose(opt.max_gradient_elem, 0) 33 | break 34 | 35 | def test_params(self): 36 | opt, data = util.build_rosen('sgd') 37 | for _ in opt.iterate(data, 38 | learning_rate=0.3, 39 | momentum=10, 40 | patience=20, 41 | min_improvement=0.1, 42 | max_gradient_elem=4, 43 | max_gradient_norm=5, 44 | nesterov=True): 45 | assert opt.nesterov is True 46 | assert np.allclose(opt.learning_rate.eval(), 0.3) 47 | assert np.allclose(opt.momentum, 10) 48 | assert np.allclose(opt.patience, 20) 49 | assert np.allclose(opt.min_improvement, 0.1) 50 | assert np.allclose(opt.max_gradient_norm, 5) 51 | assert np.allclose(opt.max_gradient_elem, 4) 52 | break 53 | 54 | 55 | class TestNAG: 56 | def test_rosen(self): 57 | util.assert_progress(*util.build_rosen('nag')) 58 | 59 | def test_factor(self): 60 | util.assert_progress(*util.build_factor('nag'), max_gradient_elem=1) 61 | 62 | def test_default_params(self): 63 | opt, data = util.build_rosen('nag') 64 | for _ in opt.iterate(data): 65 | assert opt.nesterov is True 66 | assert np.allclose(opt.learning_rate.eval(), 1e-4) 67 | assert np.allclose(opt.momentum, 0) 68 | assert np.allclose(opt.patience, 5) 69 | assert np.allclose(opt.min_improvement, 0) 70 | assert np.allclose(opt.max_gradient_norm, 0) 71 | assert np.allclose(opt.max_gradient_elem, 0) 72 | break 73 | 74 | def test_params(self): 75 | opt, data = util.build_rosen('nag') 76 | for _ in opt.iterate(data, 77 | learning_rate=0.3, 78 | momentum=10, 79 | patience=20, 80 | min_improvement=0.1, 81 | max_gradient_elem=4, 82 | max_gradient_norm=5, 83 | nesterov=False): 84 | assert opt.nesterov is True # nesterov always True for NAG 85 | assert np.allclose(opt.learning_rate.eval(), 0.3) 86 | assert np.allclose(opt.momentum, 10) 87 | assert np.allclose(opt.patience, 20) 88 | assert np.allclose(opt.min_improvement, 0.1) 89 | assert np.allclose(opt.max_gradient_norm, 5) 90 | assert np.allclose(opt.max_gradient_elem, 4) 91 | break 92 | -------------------------------------------------------------------------------- /test/util.py: -------------------------------------------------------------------------------- 1 | import downhill 2 | import numpy as np 3 | import theano 4 | import theano.tensor as TT 5 | 6 | 7 | def build_rosen(algo, name=True, monitor_gradients=False): 8 | x = theano.shared(-3 + np.zeros((2, ), 'f'), name='x' if name else None) 9 | return downhill.build( 10 | algo, 11 | loss=(100 * (x[1:] - x[:-1] ** 2) ** 2 + (1 - x[:-1]) ** 2).sum(), 12 | monitors=[('x', x[:-1].sum()), ('y', x[1:].sum())], 13 | monitor_gradients=monitor_gradients, 14 | ), None 15 | 16 | 17 | def build_factor(algo): 18 | a = np.arange(1000).reshape((100, 10)).astype('f') 19 | b = 0.1 + np.zeros((10, 100), 'f') 20 | 21 | x = TT.matrix('x') 22 | u = theano.shared(a, name='u') 23 | v = theano.shared(0.1 + b, name='v') 24 | return downhill.build( 25 | algo, 26 | loss=TT.sum(TT.sqr(x - TT.dot(u, v))), 27 | monitors=[ 28 | ('u<1', (u < 1).mean()), 29 | ('u<-1', (u < -1).mean()), 30 | ('v<1', (v < 1).mean()), 31 | ('v<-1', (v < -1).mean()), 32 | ]), [[np.dot(a, b) + np.random.randn(100, 100).astype('f')] 33 | for _ in range(10)] 34 | 35 | 36 | def assert_progress(opt, train, valid=None, **kwargs): 37 | mover = opt.iterate(train, valid=valid, **kwargs) 38 | train0, valid0 = next(mover) 39 | train1, valid1 = next(mover) 40 | assert train1['loss'] < valid0['loss'] # should have made progress! 
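    # validation only runs every validate_every iterations (10 by default),
    # so the second iteration re-yields the monitors computed before the first.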
41 | assert valid1['loss'] == valid0['loss'] # no new validation occurred 42 | --------------------------------------------------------------------------------
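For orientation, the pieces above compose roughly as follows. This is an illustrative sketch, not a file from the repository; the data, loss, and hyperparameter values are invented, but each call mirrors usage shown in the examples and tests above.

import numpy as np
import theano
import theano.tensor as TT

import downhill

F = theano.config.floatX

# invented data: recover w such that dot(x, w) ~= y.
xs = np.random.randn(1000, 10).astype(F)
ys = (xs.sum(axis=1, keepdims=True) + 0.1 * np.random.randn(1000, 1)).astype(F)

x = TT.matrix('x')
y = TT.matrix('y')
w = theano.shared(np.zeros((10, 1), F), name='w')

# minimize() wraps the arrays in a Dataset, builds the requested
# optimizer, and runs it with early stopping.
downhill.minimize(
    loss=TT.sqr(TT.dot(x, w) - y).mean(),
    params=[w],
    inputs=[x, y],
    train=[xs, ys],
    batch_size=32,
    algo='nag',
    learning_rate=0.1,
    momentum=0.9,
    patience=3,
    min_improvement=0.01,
)

print(w.get_value().ravel())  # entries should have moved toward 1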