├── .coveragerc ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.rst ├── docs ├── Makefile ├── _static │ ├── rosenbrock-nag.png │ ├── rosenbrock-nag.py │ └── style-tweaks.css ├── _templates │ └── gitwidgets.html ├── conf.py ├── guide.rst ├── index.rst ├── make.bat ├── reference.rst └── requirements.txt ├── downhill ├── __init__.py ├── adaptive.py ├── base.py ├── dataset.py ├── first_order.py └── util.py ├── examples ├── mnist-sparse-factorization.py ├── rosenbrock-100d.py ├── rosenbrock-2d.py └── rosenbrock.py ├── setup.cfg ├── setup.py └── test ├── adaptive_test.py ├── base_test.py ├── dataset_test.py ├── downhill_test.py ├── first_order_test.py └── util.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | omit = 3 | */python?.?/* 4 | */site-packages/nose/* 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | docs/generated/ 53 | 54 | # PyBuilder 55 | target/ 56 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | cache: apt 2 | sudo: false 3 | language: python 4 | python: 5 | - "2.7" 6 | - "3.4" 7 | addons: 8 | apt: 9 | packages: 10 | - libatlas-dev 11 | - libatlas-base-dev 12 | - liblapack-dev 13 | - gfortran 14 | before_install: 15 | - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh 16 | - bash miniconda.sh -b -p $HOME/miniconda 17 | - export PATH="$HOME/miniconda/bin:$PATH" 18 | install: 19 | - conda install --yes python=$TRAVIS_PYTHON_VERSION pip numpy scipy pandas 20 | - pip install pytest-pep8 pytest-cov python-coveralls 21 | - python setup.py develop 22 | script: 23 | - THEANO_FLAGS=floatX=float32 py.test -v --pep8 --cov=downhill --cov-report=term-missing 24 | after_success: 25 | - coveralls 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014-2015 lmjohns3 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, 
subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://travis-ci.org/lmjohns3/downhill.svg 2 | .. image:: https://coveralls.io/repos/lmjohns3/downhill/badge.svg 3 | :target: https://coveralls.io/r/lmjohns3/downhill 4 | .. image:: http://depsy.org/api/package/pypi/downhill/badge.svg 5 | :target: http://depsy.org/package/python/downhill 6 | 7 | ============ 8 | ``DOWNHILL`` 9 | ============ 10 | 11 | The ``downhill`` package provides algorithms for minimizing scalar loss 12 | functions that are defined using Theano_. 13 | 14 | Several optimization algorithms are included: 15 | 16 | - ADADELTA_ 17 | - ADAGRAD_ 18 | - Adam_ 19 | - `Equilibrated SGD`_ 20 | - `Nesterov's Accelerated Gradient`_ 21 | - RMSProp_ 22 | - `Resilient Backpropagation`_ 23 | - `Stochastic Gradient Descent`_ 24 | 25 | All algorithms permit the use of regular or Nesterov-style momentum as well. 26 | 27 | .. _Theano: http://deeplearning.net/software/theano/ 28 | 29 | .. _Stochastic Gradient Descent: http://downhill.readthedocs.org/en/stable/generated/downhill.first_order.SGD.html 30 | .. _Nesterov's Accelerated Gradient: http://downhill.readthedocs.org/en/stable/generated/downhill.first_order.NAG.html 31 | .. _Resilient Backpropagation: http://downhill.readthedocs.org/en/stable/generated/downhill.adaptive.RProp.html 32 | .. _ADAGRAD: http://downhill.readthedocs.org/en/stable/generated/downhill.adaptive.ADAGRAD.html 33 | .. _RMSProp: http://downhill.readthedocs.org/en/stable/generated/downhill.adaptive.RMSProp.html 34 | .. _ADADELTA: http://downhill.readthedocs.org/en/stable/generated/downhill.adaptive.ADADELTA.html 35 | .. _Adam: http://downhill.readthedocs.org/en/stable/generated/downhill.adaptive.Adam.html 36 | .. _Equilibrated SGD: http://downhill.readthedocs.org/en/stable/generated/downhill.adaptive.ESGD.html 37 | 38 | Quick Start: Matrix Factorization 39 | ================================= 40 | 41 | Let's say you have 100 samples of 1000-dimensional data, and you want to 42 | represent your data as 100 coefficients in a 10-dimensional basis. This is 43 | pretty straightforward to model using Theano: you can use a matrix 44 | multiplication as the data model, a squared-error term for optimization, and a 45 | sparse regularizer to encourage small coefficient values. 46 | 47 | Once you have constructed an expression for the loss, you can optimize it with a 48 | single call to ``downhill.minimize``: 49 | 50 | .. 
code:: python 51 | 52 | import downhill 53 | import numpy as np 54 | import theano 55 | import theano.tensor as TT 56 | 57 | FLOAT = 'df'[theano.config.floatX == 'float32'] 58 | 59 | def rand(a, b): 60 | return np.random.randn(a, b).astype(FLOAT) 61 | 62 | A, B, K = 20, 5, 3 63 | 64 | # Set up a matrix factorization problem to optimize. 65 | u = theano.shared(rand(A, K), name='u') 66 | v = theano.shared(rand(K, B), name='v') 67 | z = TT.matrix() 68 | err = TT.sqr(z - TT.dot(u, v)) 69 | loss = err.mean() + abs(u).mean() + (v * v).mean() 70 | 71 | # Minimize the regularized loss with respect to a data matrix. 72 | y = np.dot(rand(A, K), rand(K, B)) + rand(A, B) 73 | 74 | # Monitor during optimization. 75 | monitors = (('err', err.mean()), 76 | ('|u|<0.1', (abs(u) < 0.1).mean()), 77 | ('|v|<0.1', (abs(v) < 0.1).mean())) 78 | 79 | downhill.minimize( 80 | loss=loss, 81 | train=[y], 82 | patience=0, 83 | batch_size=A, # Process y as a single batch. 84 | max_gradient_norm=1, # Prevent gradient explosion! 85 | learning_rate=0.1, 86 | monitors=monitors, 87 | monitor_gradients=True) 88 | 89 | # Print out the optimized coefficients u and basis v. 90 | print('u =', u.get_value()) 91 | print('v =', v.get_value()) 92 | 93 | If you prefer to maintain more control over your model during optimization, 94 | downhill provides an iterative optimization interface: 95 | 96 | .. code:: python 97 | 98 | opt = downhill.build(algo='rmsprop', 99 | loss=loss, 100 | monitors=monitors, 101 | monitor_gradients=True) 102 | 103 | for metrics, _ in opt.iterate(train=[[y]], 104 | patience=0, 105 | batch_size=A, 106 | max_gradient_norm=1, 107 | learning_rate=0.1): 108 | print(metrics) 109 | 110 | If that's still not enough, you can just plain ask downhill for the updates to 111 | your model variables and do everything else yourself: 112 | 113 | .. code:: python 114 | 115 | updates = downhill.build('rmsprop', loss).get_updates( 116 | batch_size=A, max_gradient_norm=1, learning_rate=0.1) 117 | func = theano.function([z], loss, updates=list(updates)) 118 | for _ in range(100): 119 | print(func(y)) # Evaluate func and apply variable updates. 120 | 121 | More Information 122 | ================ 123 | 124 | Source: http://github.com/lmjohns3/downhill 125 | 126 | Documentation: http://downhill.readthedocs.org 127 | 128 | Mailing list: https://groups.google.com/forum/#!forum/downhill-users 129 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make <target>' where <target> is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " zip to make standalone HTML files and zip them up" 23 | @echo " dirhtml to make HTML files named index.html in directories" 24 | @echo " singlehtml to make a single large HTML file" 25 | @echo " pickle to make pickle files" 26 | @echo " json to make JSON files" 27 | @echo " htmlhelp to make HTML files and a HTML help project" 28 | @echo " qthelp to make HTML files and a qthelp project" 29 | @echo " devhelp to make HTML files and a Devhelp project" 30 | @echo " epub to make an epub" 31 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 32 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 33 | @echo " text to make text files" 34 | @echo " man to make manual pages" 35 | @echo " texinfo to make Texinfo files" 36 | @echo " info to make Texinfo files and run them through makeinfo" 37 | @echo " gettext to make PO message catalogs" 38 | @echo " changes to make an overview of all changed/added/deprecated items" 39 | @echo " linkcheck to check all external links for integrity" 40 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 41 | 42 | clean: 43 | -rm -rf $(BUILDDIR)/* 44 | -rm docs.zip 45 | 46 | zip: html 47 | cd $(BUILDDIR)/html && zip -r docs.zip . && mv docs.zip ../.. 48 | 49 | html: 50 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 53 | 54 | dirhtml: 55 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 56 | @echo 57 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 58 | 59 | singlehtml: 60 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 61 | @echo 62 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 63 | 64 | pickle: 65 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 66 | @echo 67 | @echo "Build finished; now you can process the pickle files." 68 | 69 | json: 70 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 71 | @echo 72 | @echo "Build finished; now you can process the JSON files." 73 | 74 | htmlhelp: 75 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 76 | @echo 77 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 78 | ".hhp project file in $(BUILDDIR)/htmlhelp." 79 | 80 | qthelp: 81 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 82 | @echo 83 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 84 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 85 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/downhill.qhcp" 86 | @echo "To view the help file:" 87 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/downhill.qhc" 88 | 89 | devhelp: 90 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 91 | @echo 92 | @echo "Build finished." 93 | @echo "To view the help file:" 94 | @echo "# mkdir -p $$HOME/.local/share/devhelp/downhill" 95 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/downhill" 96 | @echo "# devhelp" 97 | 98 | epub: 99 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 100 | @echo 101 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
102 | 103 | latex: 104 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 105 | @echo 106 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 107 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 108 | "(use \`make latexpdf' here to do that automatically)." 109 | 110 | latexpdf: 111 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 112 | @echo "Running LaTeX files through pdflatex..." 113 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 114 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 115 | 116 | text: 117 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 118 | @echo 119 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 120 | 121 | man: 122 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 123 | @echo 124 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 125 | 126 | texinfo: 127 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 128 | @echo 129 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 130 | @echo "Run \`make' in that directory to run these through makeinfo" \ 131 | "(use \`make info' here to do that automatically)." 132 | 133 | info: 134 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 135 | @echo "Running Texinfo files through makeinfo..." 136 | make -C $(BUILDDIR)/texinfo info 137 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 138 | 139 | gettext: 140 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 141 | @echo 142 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 143 | 144 | changes: 145 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 146 | @echo 147 | @echo "The overview file is in $(BUILDDIR)/changes." 148 | 149 | linkcheck: 150 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 151 | @echo 152 | @echo "Link check complete; look for any errors in the above output " \ 153 | "or in $(BUILDDIR)/linkcheck/output.txt." 154 | 155 | doctest: 156 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 157 | @echo "Testing of doctests in the sources finished, look at the " \ 158 | "results in $(BUILDDIR)/doctest/output.txt." 
159 | -------------------------------------------------------------------------------- /docs/_static/rosenbrock-nag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmjohns3/downhill/42111ab03b5e6fa47b7bf7c7cb5caa402f10ce6d/docs/_static/rosenbrock-nag.png -------------------------------------------------------------------------------- /docs/_static/rosenbrock-nag.py: -------------------------------------------------------------------------------- 1 | import downhill 2 | import matplotlib.pyplot as plt 3 | from mpl_toolkits.mplot3d import Axes3D 4 | import numpy as np 5 | import theano 6 | 7 | x = theano.shared(np.array([-1, 0], 'f'), name='x') 8 | 9 | opt = downhill.build( 10 | 'nag', 11 | loss=(100 * (x[1:] - x[:-1] ** 2) ** 2 + (1 - x[:-1]) ** 2).sum(), 12 | params=[x], 13 | inputs=[], 14 | monitors=[('x', x[:-1].sum()), ('y', x[1:].sum())], 15 | monitor_gradients=True) 16 | 17 | xs, ys, loss = [], [], [] 18 | for tm, _ in opt.iterate([[]], 19 | learning_rate=0.001, 20 | momentum=0.95, 21 | max_gradient_norm=100): 22 | xs.append(tm['x']) 23 | ys.append(tm['y']) 24 | loss.append(tm['loss']) 25 | if len(loss) == 300: 26 | break 27 | 28 | ax = plt.axes(projection='3d') 29 | 30 | c = '#d62728' 31 | ax.plot(xs, ys, zs=loss, linestyle='-', 32 | marker='o', color=c, mec=c, mfc='none', 33 | lw=3, mew=0.5, markersize=7, alpha=0.7) 34 | 35 | X, Y = np.meshgrid(np.linspace(-1.1, 1.1, 127), np.linspace(-0.5, 1.7, 127)) 36 | Z = 100 * (Y - X ** 2) ** 2 + (1 - X) ** 2 37 | ax.plot_surface(X, Y, Z, cmap='YlGnBu', lw=0, rstride=4, cstride=4, alpha=0.9) 38 | ax.plot_wireframe(X, Y, Z, lw=0.5, rstride=4, cstride=4, color='#333333', alpha=0.7) 39 | ax.plot([1], [1], zs=[1], marker='x', mew=3, markersize=10, color='#111111') 40 | 41 | ax.set_xlim(-1.1, 1.1) 42 | ax.set_ylim(-0.5, 1.7) 43 | ax.view_init(azim=10, elev=45) 44 | 45 | ax.w_xaxis.set_pane_color((1, 1, 1, 1)) 46 | ax.w_yaxis.set_pane_color((1, 1, 1, 1)) 47 | ax.w_zaxis.set_pane_color((1, 1, 1, 1)) 48 | 49 | plt.savefig('rosenbrock-nag.png') 50 | plt.show() 51 | -------------------------------------------------------------------------------- /docs/_static/style-tweaks.css: -------------------------------------------------------------------------------- 1 | a, a:visited { color: #258; } 2 | a tt, a:visited tt, a:active tt { color: #258; } 3 | 4 | .banana { float: right; max-width: 45%; } 5 | .banana img { width: 100%; } 6 | 7 | pre { font-size: 0.9rem; line-height: 1.25; } 8 | span.pre { background: #eee; font-size: 0.95rem; padding: 0.1rem 0.2rem; } 9 | 10 | a.internal span.pre { 11 | background: inherit; 12 | font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; 13 | font-size: inherit; 14 | padding: inherit; 15 | } 16 | 17 | th.field-name { background: #ffd; } 18 | 19 | dl.method dt { background: #def; } 20 | -------------------------------------------------------------------------------- /docs/_templates/gitwidgets.html: -------------------------------------------------------------------------------- 1 |
2 | 3 |
4 | 5 | 6 | 7 |
8 |
9 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | import better 2 | 3 | extensions = [ 4 | 'sphinx.ext.autodoc', 5 | 'sphinx.ext.autosummary', 6 | 'sphinx.ext.intersphinx', 7 | 'sphinx.ext.mathjax', 8 | 'sphinx.ext.viewcode', 9 | 'numpydoc', 10 | ] 11 | autosummary_generate = True 12 | autodoc_default_flags = ['members'] 13 | numpydoc_show_class_members = False 14 | numpydoc_show_inherited_class_members = True 15 | source_suffix = '.rst' 16 | source_encoding = 'utf-8-sig' 17 | master_doc = 'index' 18 | project = u'Downhill' 19 | copyright = u'2015, Leif Johnson' 20 | version = '0.5' 21 | release = '0.5.0pre' 22 | exclude_patterns = ['_build'] 23 | templates_path = ['_templates'] 24 | pygments_style = 'tango' 25 | 26 | html_theme = 'better' 27 | html_theme_path = [better.better_theme_path] 28 | html_theme_options = dict( 29 | rightsidebar=False, 30 | inlinecss='', 31 | cssfiles=['_static/style-tweaks.css'], 32 | showheader=True, 33 | showrelbartop=True, 34 | showrelbarbottom=True, 35 | linktotheme=True, 36 | sidebarwidth='15rem', 37 | textcolor='#111', 38 | headtextcolor='#333', 39 | footertextcolor='#333', 40 | ga_ua='', 41 | ga_domain='', 42 | ) 43 | html_short_title = 'Home' 44 | html_static_path = ['_static'] 45 | 46 | 47 | def h(xs): 48 | return ['{}.html'.format(x) for x in xs.split()] 49 | 50 | html_sidebars = { 51 | 'index': h('gitwidgets globaltoc sourcelink searchbox'), 52 | '**': h('gitwidgets localtoc sourcelink searchbox'), 53 | } 54 | 55 | intersphinx_mapping = { 56 | 'python': ('https://docs.python.org/3.4/', None), 57 | 'numpy': ('http://docs.scipy.org/doc/numpy/', None), 58 | 'scipy': ('http://docs.scipy.org/doc/scipy/reference/', None), 59 | } 60 | -------------------------------------------------------------------------------- /docs/guide.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | User Guide 3 | ========== 4 | 5 | You are probably reading this guide because you have a problem. 6 | 7 | There are many problems in the world, and many ways of thinking about solving 8 | them. Happily, some---many---problems can be described mathematically using a 9 | "loss" function, which takes a potential solution for your problem and returns a 10 | single number indicating how terrible that solution is. 11 | 12 | If you can express your problem using a loss function, then it's possible---even 13 | likely---that you can then use a computer to solve your problem for you. This is 14 | what ``downhill`` does: given a computational formulation of a loss, the 15 | optimization routines in ``downhill`` can compute a series of ever-better 16 | solutions to your problem. 17 | 18 | This guide describes how that works. 19 | 20 | .. _creating-loss: 21 | 22 | Creating a Loss 23 | =============== 24 | 25 | Many types of problems can be formulated in terms of a scalar `"loss" function`_ 26 | that ought to be minimized. The "loss" for a problem: 27 | 28 | - is computed with respect to a potential solution to a problem, and 29 | - is a scalar quantity---just a single number. 30 | 31 | A few examples of problems and their associated losses might include: 32 | 33 | - Categorizing pictures into "elephants" versus "acrobats"; the loss might be 34 | the number of mistakes that are made on a given set of test pictures. 
35 | - Allocating funds to provide a given set of public services; the loss might be 36 | the monetary cost of the budget. 37 | - Computing the actions of a robot to achieve a goal; the loss might be the 38 | total energy consumed. 39 | 40 | This guide will use linear regression as a running example. Suppose you've made 41 | some measurements of, say, the sizes and prices of various houses for sale where 42 | you live. You want to describe the relationship between the size (let's 43 | represent it as :math:`x_i`) and the price (:math:`y_i`) by fitting a line to 44 | the measurements you've made. 45 | 46 | So you need to take the data points that you collected and somehow use them to 47 | compute a slope :math:`m` and an intercept :math:`b` such that the resulting 48 | line :math:`y = m x + b` passes as closely as possible to your data points. In 49 | this example, the loss :math:`\mathcal{L}` might be expressed as the sum of the 50 | squared differences between the values on the line and the observed data: 51 | 52 | .. math:: 53 | \mathcal{L}(m,b) = \sum_{i=1}^N ( m x_i + b - y_i )^2 54 | 55 | .. _"loss" function: https://en.wikipedia.org/wiki/Loss_function 56 | 57 | Using Theano 58 | ------------ 59 | 60 | Well, you've formulated a loss for this regression problem. Now it's time to use 61 | ``downhill`` to minimize it, right? 62 | 63 | Not so fast ... the ``downhill`` package provides routines for optimizing scalar 64 | loss functions, but there's a catch: the loss functions must be defined using 65 | Theano_, a Python framework for describing computation graphs. Theano takes a 66 | bit of getting used to, but we'll walk through the linear regression example 67 | here; if you're curious, there are also lots of good tutorials_ on the Theano 68 | site. 69 | 70 | To use Theano with ``downhill``, you need to define `shared variables`_ for each 71 | of the parameters in your model, and `symbolic inputs`_ for the data that you'll 72 | use to evaluate your loss. We'll start with the shared variables:: 73 | 74 | import downhill 75 | import numpy as np 76 | import theano 77 | import theano.tensor as TT 78 | 79 | m = theano.shared(np.ones((1, ), 'f'), name='m') 80 | b = theano.shared(np.zeros((1, ), 'f'), name='b') 81 | 82 | This sets up a one-element vector containing 1 for :math:`m`, and a one-element 83 | vector containing 0 for :math:`b`. The values contained inside these shared variables will be adjusted 84 | automatically by the optimization algorithms in ``downhill``. 85 | 86 | Next, you need to define symbols that represent the data needed to compute 87 | the loss:: 88 | 89 | x = TT.vector('x') 90 | y = TT.vector('y') 91 | 92 | These symbolic vectors represent the inputs---the house sizes :math:`[x_1 \dots 93 | x_N]` and prices :math:`[y_1 \dots y_N]`---needed to compute the loss. Finally, 94 | having created all of these symbolic variables, you can define the loss itself:: 95 | 96 | loss = TT.sqr(m * x + b - y).sum() 97 | 98 | This tells Theano to multiply the data vector ``x`` by the value stored in the 99 | shared ``m`` variable, add the value stored in the shared ``b`` variable, and 100 | then subtract the data vector ``y``. Then that vector gets squared elementwise, 101 | and all of the components of the result get summed up to produce the loss. 102 | 103 | Note that none of these operations have actually been computed; instead, you've 104 | instructed Theano *how* to compute the loss, if you were to give it some values 105 | for ``x`` and ``y``. This is the bizarre thing about Theano: it looks like 106 | you're computing things, but you're actually just telling the computer how to 107 | compute things in the future. 108 |
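If you want to see this deferred computation in action, you can compile the loss into a callable and evaluate it on concrete arrays. This step is purely illustrative, since ``downhill`` compiles the functions it needs for you; the sketch also assumes that Theano's ``floatX`` is ``float32``, so that the example arrays match the shared variables defined above::

    # Compile the symbolic loss into an executable function of x and y.
    f = theano.function([x, y], loss)

    # Only now does any computation happen, using the current values of m and b.
    print(f(np.array([1, 2], 'f'), np.array([3, 4], 'f')))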
109 | .. _Theano: http://deeplearning.net/software/theano/ 110 | .. _tutorials: http://deeplearning.net/software/theano/tutorial/index.html 111 | .. _shared variables: http://deeplearning.net/software/theano/tutorial/examples.html#using-shared-variables 112 | .. _symbolic inputs: http://deeplearning.net/software/theano/tutorial/adding.html 113 | 114 | .. _minimizing-loss: 115 | 116 | Minimizing a Loss 117 | ================= 118 | 119 | The ``downhill`` package provides a single high-level function, 120 | :func:`downhill.minimize`, that can be used as a black-box optimizer for losses. 121 | In addition, there are lower-level calls that provide more control over the 122 | interaction between your code and ``downhill``. First, we'll look at the 123 | high-level minimize function, then we'll talk about what happens under the hood. 124 | 125 | Once you've defined your loss using Theano, you can minimize it with a single 126 | function call. Here, we'll minimize the loss defined above:: 127 | 128 | downhill.minimize(loss, [sizes, prices], inputs=[x, y]) 129 | 130 | You just specify the loss to minimize, provide some data to use for computing 131 | the loss, and identify the symbolic inputs that the loss requires. The 132 | ``downhill`` code will select an optimization algorithm (the default is 133 | currently :class:`RMSProp <downhill.adaptive.RMSProp>`), identify shared 134 | variables in the loss that need optimization, and run the optimization process 135 | to completion. After the minimization has finished, the shared variables in your 136 | loss will be updated to their optimal values. You can retrieve their values 137 | using any of the methods of `shared variables`_:: 138 | 139 | m_value, b_value = m.get_value(), b.get_value() 140 | 141 | There is much to say about providing data---see :ref:`providing-data` for more 142 | information---but briefly, the data you will need to provide is typically a list 143 | of ``numpy`` arrays of the measurements you've made for your problem. For the 144 | house price regression example, the arrays for house size and house price might 145 | be set up like this:: 146 | 147 | sizes = np.array([1200, 2013, 8129, 2431, 2211]) 148 | prices = np.array([103020, 203310, 3922013, 224321, 449020]) 149 | 150 | .. _training-validation: 151 | 152 | Training and Validation 153 | ----------------------- 154 | 155 | You might have noticed that the formulation of the loss given at the top of this 156 | guide contains a sum over all of the data points that you've observed 157 | :math:`(x_i, y_i)`. (For the house price example, these data are stored in the 158 | ``sizes`` and ``prices`` arrays.) This is a very common state of affairs for 159 | many problems: the loss is computed from observed data. 160 | 161 | But for a typical regression problem, it's not feasible or even possible to 162 | gather *all* of the relevant data---either it's too expensive to do that, or 163 | there might be new data created in the future that you don't have any way of 164 | predicting. 165 | 166 | Given this paucity of data, you're running a risk in using a stochastic 167 | optimizer to solve your problem: the data that you *have* collected might not be 168 | representative of the data that you *haven't* collected!
If the data you 169 | collected are quite different from the "true" data out there in the world, then 170 | when you optimize your loss, the optimal model might be skewed toward your 171 | dataset, and your model might not perform well on new, "unseen" data. 172 | 173 | This problem is generally referred to as overfitting_ and is a risk with many 174 | types of models. Generally the risk of overfitting increases with the complexity 175 | of your model, and also increases when you don't have a lot of data. 176 | 177 | There are many ways to combat overfitting: 178 | 179 | - You can tighten your belt and gather more data, which increases the chance 180 | that the data you do have will be representative of data you don't yet have. 181 | 182 | - You can regularize_ your loss; this tends to encourage some solutions to your 183 | problem (e.g., solutions with small parameter values) and discourage others 184 | (e.g., solutions that "memorize" outliers). 185 | 186 | - You can also set aside a bit of the data you've collected as a "validation" 187 | set. You can use this set to stop the optimization process when the 188 | performance of your model on the validation set stops improving---this is 189 | known as "early stopping." 190 | 191 | Collecting more data is almost always a good idea, as long as you can afford to 192 | do so (whether in terms of time, monetary cost, etc.)---but ``downhill`` can't 193 | help you with that. And while it can often be a good idea to incorporate 194 | regularizers into your loss, doing so is something of an art and remains outside 195 | the scope of ``downhill``. 196 | 197 | .. _overfitting: https://en.wikipedia.org/wiki/Overfitting 198 | .. _regularize: https://en.wikipedia.org/wiki/Regularization_(mathematics) 199 | 200 | .. _early-stopping: 201 | 202 | Early Stopping 203 | -------------- 204 | 205 | The algorithms in ``downhill`` implement the "early stopping" regularization 206 | method. To take advantage of it, just provide a second set of data when 207 | minimizing your loss:: 208 | 209 | downhill.minimize(loss, [sizes, prices], [valid_sizes, valid_prices]) 210 | 211 | Here we'll assume that you've gathered another few sizes and prices and put them 212 | in a new pair of ``numpy`` arrays. In practice, the validation dataset can also 213 | just be a small bit (10% or so) of the training data you've collected. Either 214 | way, it's important to make sure the validation data is disjoint from the 215 | training data, so that the validation loss gives an honest estimate of your 216 | model's performance on unseen data. The idea is that you want to use a small 217 | part of the data you've gathered as a sort of canary_ to predict how well your 218 | model will perform when you actually take it out into the world and use it. 219 | 220 | .. _canary: https://en.wikipedia.org/wiki/Animal_sentinel#Historical_examples 221 | 222 | The early stopping method will cause optimization to halt when the loss stops 223 | improving on the validation dataset. If you do not specify a validation dataset, 224 | the training dataset will also be used for validation, which effectively 225 | disables early stopping---that is, optimization will halt whenever the loss 226 | computed on the training dataset stops improving. 227 | 228 | To understand this better, we'll take a look at the lower-level API provided by 229 | ``downhill``. 230 |
231 | .. _iterative-optimization: 232 | 233 | Iterative Optimization 234 | ---------------------- 235 | 236 | The :func:`downhill.minimize` function is actually just a wrapper that performs 237 | a few common lower-level tasks to optimize your loss. These tasks include: 238 | 239 | - creating :class:`datasets <downhill.dataset.Dataset>` to wrap your data, 240 | - creating an :class:`Optimizer <downhill.base.Optimizer>`, and 241 | - running the optimizer to completion. 242 | 243 | You can perform these tasks yourself to retain more control over the 244 | optimization process, but even if you don't, it's useful to follow the process 245 | to know how it works. In practice it can often be useful to call the 246 | :func:`iterate() <downhill.base.Optimizer.iterate>` method yourself, because it 247 | gives you access to the state of the optimizer at each step. 248 | 249 | To learn more about this, have a look at the following example:: 250 | 251 | opt = downhill.build('rmsprop', loss=loss, inputs=[x, y]) 252 | train = downhill.Dataset([sizes, prices]) 253 | valid = downhill.Dataset([valid_sizes, valid_prices]) 254 | for tm, vm in opt.iterate(train, valid): 255 | print('training loss:', tm['loss']) 256 | print('most recent validation loss:', vm['loss']) 257 | 258 | This code constructs an :class:`Optimizer <downhill.base.Optimizer>` object 259 | (specifically, an :class:`RMSProp optimizer <downhill.adaptive.RMSProp>`), wraps 260 | the input data with a :class:`Dataset <downhill.dataset.Dataset>`, and then 261 | steps through the optimization process iteratively. 262 | 263 | Notice that after each iteration, the optimizer yields a pair of dictionaries to 264 | the caller: the first dictionary contains measured values of the loss on the 265 | training data during that iteration, and the second contains measured values of 266 | the loss on the validation data. 267 | 268 | The keys and values in each of these dictionaries give the costs and monitors 269 | that are computed during optimization. There will always be a ``'loss'`` key in 270 | each dictionary that gives the value of the loss function being optimized. In 271 | addition, any :ref:`monitor values <monitoring>` that were defined when creating 272 | the optimizer will also be provided in these dictionaries. 273 | 274 | .. _batches-epochs: 275 | 276 | Batches and Epochs 277 | ------------------ 278 | 279 | During each iteration, the optimizer instance processes training data in small 280 | pieces called "mini-batches"; each mini-batch is used to compute a gradient 281 | estimate for the loss, and the parameters are updated by a small amount. After a 282 | fixed number of mini-batches have been processed, the ``iterate`` method yields 283 | the loss dictionaries to the caller. 284 | 285 | Each group of parameter updates processed during a single iteration is called an 286 | "epoch." After a fixed number of epochs have taken place, the loss is evaluated 287 | using a fixed number of mini-batches from the validation dataset; the resulting 288 | validation dictionary is then yielded unchanged with each subsequent epoch until 289 | the next validation happens. 290 | 291 | Optimization epochs continue to occur, with occasional validations, until the 292 | loss on the validation dataset has failed to make sufficient progress for long 293 | enough; at that point, optimization halts. 294 | 295 | There are a number of hyperparameters involved in this process, which can be 296 | tuned for the best performance on your problem.
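To make the cadence concrete: if each epoch processes 10 mini-batches and a validation happens every 5 epochs, then ``iterate`` yields once per 10 parameter updates, and the validation dictionary it yields is recomputed once per 50 parameter updates.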
There are many different settings for mini-batch optimization and 305 | validation, many optimization algorithms are available, and there are also 306 | several common learning hyperparameters that might require tuning. 307 | 308 | .. _batch-parameters: 309 | 310 | Batch Parameters 311 | ---------------- 312 | 313 | All algorithms in ``downhill`` provide early stopping and use :ref:`epoch-based 314 | optimization ` as described above. This process is controlled by 315 | a number of parameters that can be tweaked for your optimization problem. 316 | 317 | The size of a minibatch is controlled using the ``batch_size`` parameter when 318 | you create a :class:`Dataset `. To build mini-batches 319 | containing 3 pieces of data, for example:: 320 | 321 | train = downhill.Dataset([sizes, prices], batch_size=3) 322 | 323 | If you call the high-level :func:`downhill.minimize` method directly, you can 324 | pass ``batch_size`` to it directly:: 325 | 326 | downhill.minimize(loss, [sizes, prices], batch_size=3) 327 | 328 | The number of mini-batches that are processed during a single training epoch is 329 | controlled by the ``iteration_size`` parameter when constructing a ``Dataset``:: 330 | 331 | train = downhill.Dataset([sizes, prices], iteration_size=10) 332 | 333 | This will ensure that one iteration loop over the training dataset will produce 334 | 10 mini-batches. If you have fewer than ``batch_size`` times ``iteration_size`` 335 | pieces of data, the ``Dataset`` class will loop over your data multiple times to 336 | ensure that the desired number of batches is processed. (The ``Dataset`` class 337 | also handles shuffling your data as needed during iteration, to avoid issues 338 | that can come up when presenting data to the model in a fixed order.) 339 | 340 | If you call the high-level :func:`downhill.minimize` method, the numbers of 341 | training and validation mini-batches processed per epoch are set using the 342 | ``train_batches`` and ``valid_batches`` parameters, respectively:: 343 | 344 | downhill.minimize(..., train_batches=10, valid_batches=8) 345 | 346 | Finally, a validation takes place after a fixed number of training epochs have 347 | happened. This number is set using the ``validate_every`` parameter; for 348 | example, to validate the loss every 5 training epochs:: 349 | 350 | downhill.minimize(..., validate_every=5) 351 | 352 | If you are processing data using the lower-level API, the ``validate_every`` 353 | parameter is passed directly to :func:`iterate() 354 | `:: 355 | 356 | for tm, vm in opt.iterate(..., validate_every=5): 357 | # ... 358 | 359 | .. _patience-improvement: 360 | 361 | Patience and Improvement 362 | ------------------------ 363 | 364 | The training process halts if there is "insufficient" progress on the validation 365 | loss for "long enough." The precise meanings of these terms are given by the 366 | ``min_improvement`` and ``patience`` parameters, respectively. 367 | 368 | The ``min_improvement`` parameter specifies the minimum relative improvement of 369 | the validation loss that counts as progress in the optimization. If 370 | ``min_improvement`` is set to 0, for example, then any positive improvement in 371 | the validation loss will count as progress, while if ``min_improvement`` is set 372 | to 0.1, then the validation loss must improve by 10% relative to the current 373 | best validation loss before the validation attempt counts as progress. 
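As a worked example, suppose the best validation loss seen so far is 2.0 and ``min_improvement`` is set to 0.1. A subsequent validation then counts as progress only if it produces a loss below 1.8, that is, a relative improvement of at least 10%.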
374 | 375 | The ``patience`` parameter specifies the number of failed validation attempts 376 | that you are willing to tolerate before halting the optimization. If ``patience`` is 377 | set to 0, for instance, then optimization will halt as soon as a validation 378 | attempt fails to make ``min_improvement`` relative loss improvement over the 379 | best validation loss so far. If ``patience`` is set to 3, then optimization will 380 | continue through three failed validation attempts, but if the fourth validation 381 | attempt fails, then optimization will halt. 382 | 383 | These parameters can be set either on a call to the high-level 384 | :func:`downhill.minimize` function:: 385 | 386 | downhill.minimize(..., patience=3, min_improvement=0.1) 387 | 388 | or when calling :func:`iterate() <downhill.base.Optimizer.iterate>`:: 389 | 390 | for tm, vm in opt.iterate(..., patience=3, min_improvement=0.1): 391 | # ... 392 | 393 | .. _algorithm: 394 | 395 | Optimization Algorithms 396 | ----------------------- 397 | 398 | The following algorithms are currently available in ``downhill``: 399 | 400 | - ``'adadelta'`` --- :class:`ADADELTA <downhill.adaptive.ADADELTA>` 401 | - ``'adagrad'`` --- :class:`ADAGRAD <downhill.adaptive.ADAGRAD>` 402 | - ``'adam'`` --- :class:`Adam <downhill.adaptive.Adam>` 403 | - ``'esgd'`` --- :class:`Equilibrated SGD <downhill.adaptive.ESGD>` 404 | - ``'nag'`` --- :class:`Nesterov's Accelerated Gradient <downhill.first_order.NAG>` 405 | - ``'rmsprop'`` --- :class:`RMSProp <downhill.adaptive.RMSProp>` 406 | - ``'rprop'`` --- :class:`Resilient Backpropagation <downhill.adaptive.RProp>` 407 | - ``'sgd'`` --- :class:`Stochastic Gradient Descent <downhill.first_order.SGD>` 408 | 409 | To select an algorithm, specify its name using the ``algo`` keyword argument:: 410 | 411 | downhill.minimize(..., algo='adadelta') 412 | 413 | or pass the algorithm name to build an :class:`Optimizer 414 | <downhill.base.Optimizer>` instance:: 415 | 416 | opt = downhill.build('adadelta', ...) 417 | 418 | Different algorithms have different performance characteristics, different 419 | numbers of hyperparameters to tune, and different suitability for particular 420 | problems. In general, several of the adaptive procedures seem to work well 421 | across different problems, particularly :class:`Adam <downhill.adaptive.Adam>`, 422 | :class:`ADADELTA <downhill.adaptive.ADADELTA>`, and :class:`RMSProp 423 | <downhill.adaptive.RMSProp>`. :class:`NAG <downhill.first_order.NAG>` also seems 424 | to work quite well, but can sometimes take longer to converge. 425 | 426 | Many of these algorithms, being based on stochastic gradient descent, rely on a 427 | common set of hyperparameters that control the speed of convergence and the 428 | reliability of the optimization process over time; these parameters are 429 | discussed next. 430 | 431 | .. _learning-rate: 432 | 433 | Learning Rate 434 | ------------- 435 | 436 | Most stochastic gradient optimization methods make small parameter updates based 437 | on the local gradient of the loss at each step in the optimization procedure. 438 | Intuitively, parameters in a model are updated by subtracting a small portion of 439 | the local derivative from the current parameter value. Mathematically, this is 440 | written as: 441 | 442 | .. math:: 443 | 444 | \theta_{t+1} = \theta_t - \alpha \left. 
445 | \frac{\partial\mathcal{L}}{\partial\theta} \right|_{\theta_t} 446 | 447 | where :math:`\mathcal{L}` is the loss function being optimized, :math:`\theta` 448 | is the value of a parameter in the model (e.g., :math:`m` or :math:`b` for the 449 | regression problem) at optimization step :math:`t`, :math:`\alpha` is the 450 | learning rate, and :math:`\frac{\partial\mathcal{L}}{\partial\theta}` (also 451 | often written :math:`\nabla_{\theta_t}\mathcal{L}`) is the partial derivative of 452 | the loss with respect to the parameters, evaluated at the current value of those 453 | parameters. 454 | 455 | The learning rate :math:`\alpha` specifies the scale of these parameter updates 456 | with respect to the magnitude of the gradient. Almost all stochastic optimizers 457 | use a fixed learning rate parameter. 458 | 459 | In ``downhill``, the learning rate is passed as a keyword argument to 460 | :func:`downhill.minimize`:: 461 | 462 | downhill.minimize(..., learning_rate=0.1) 463 | 464 | Often the learning rate is set to a very small value---many approaches seem to 465 | start with values around 1e-4. If the learning rate is too large, the 466 | optimization procedure might "bounce around" in the loss landscape because the 467 | parameter steps are too large. If the learning rate is too small, the 468 | optimization procedure might not make progress quickly enough to make 469 | optimization practical. 470 | 471 | .. _momentum: 472 | 473 | Momentum 474 | -------- 475 | 476 | Momentum is a common technique in stochastic gradient optimization algorithms 477 | that seems to accelerate the optimization process in most cases. Intuitively, 478 | momentum avoids "jitter" in the parameters during optimization by smoothing the 479 | estimates of the local gradient information over time. In practice a momentum 480 | method maintains a "velocity" of the most recent parameter steps and combines 481 | these recent individual steps together when making a parameter update. 482 | Mathematically, this is written: 483 | 484 | .. math:: 485 | 486 | \begin{eqnarray*} 487 | \nu_{t+1} &=& \mu \nu_t - \alpha \left. \frac{\partial\mathcal{L}}{\partial\theta} \right|_{\theta_t} \\ 488 | \theta_{t+1} &=& \theta_t + \nu_{t+1} 489 | \end{eqnarray*} 490 | 491 | where the symbols are the same as above, and additionally :math:`\nu` describes 492 | the "velocity" of parameter :math:`\theta`, and :math:`\mu` is the momentum 493 | hyperparameter. The gradient computations using momentum are exactly the same as 494 | when not using momentum; the only difference is the accumulation of recent 495 | updates in the "velocity." 496 | 497 | In ``downhill``, the momentum value is passed as a keyword argument to 498 | :func:`downhill.minimize`:: 499 | 500 | downhill.minimize(..., momentum=0.9) 501 | 502 | Typically momentum is set to a value in :math:`[0, 1)`---when set to 0, momentum 503 | is disabled, and when set to values near 1, the momentum is very high, requiring 504 | several consecutive parameter updates in the same direction to change the 505 | parameter velocity. 506 | 507 | In many problems it is useful to set the momentum to a surprisingly large value, 508 | sometimes even to values greater than 0.9. Such values can be especially 509 | effective with a relatively small learning rate. 510 | 511 | If the momentum is set too low, then parameter updates will be more noisy and 512 | optimization might take longer to converge, but if the momentum is set too high, 513 | the optimization process might diverge entirely. 
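To make the velocity bookkeeping concrete, here is the same update rule written out in plain ``numpy``. This is only a sketch for intuition: ``downhill`` performs the equivalent computation symbolically inside its optimizers, and the function name here is illustrative::

    import numpy as np

    def momentum_step(theta, velocity, grad, alpha=0.001, mu=0.9):
        # The velocity accumulates a decaying sum of recent gradient steps ...
        velocity = mu * velocity - alpha * grad
        # ... and the parameters move along the velocity, not the raw gradient.
        return theta + velocity, velocity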
514 | 515 | Nesterov Momentum 516 | ----------------- 517 | 518 | A newer momentum technique, originally proposed by Y. Nesterov, has been shown 519 | to outperform "traditional" momentum in many settings. This technique 520 | effectively amounts to computing the momentum value at a 521 | different location in the parameter space, namely the location where the 522 | momentum value would have placed the parameter after the current update: 523 | 524 | .. math:: 525 | \begin{eqnarray*} 526 | \nu_{t+1} &=& \mu \nu_t - \alpha \left. 527 | \frac{\partial\mathcal{L}}{\partial\theta}\right|_{\theta_t + \mu\nu_t} \\ 528 | \theta_{t+1} &=& \theta_t + \nu_{t+1} 529 | \end{eqnarray*} 530 | 531 | Note that the partial derivative is evaluated at :math:`\theta_t + \mu\nu_t` 532 | instead of at :math:`\theta_t`. The intuitive rationale for this change is that 533 | if the momentum would have produced an "overshoot," then the gradient at this 534 | overshot parameter value would point backwards, toward the previous parameter 535 | value, which would thus help correct oscillations during optimization. 536 | 537 | To use Nesterov-style momentum, use either the :class:`NAG 538 | <downhill.first_order.NAG>` optimizer (which uses plain stochastic gradient 539 | descent with Nesterov momentum), or specify ``nesterov=True`` in addition to 540 | providing a nonzero ``momentum`` value when minimizing your loss:: 541 | 542 | downhill.minimize(..., momentum=0.9, nesterov=True) 543 | 544 | .. _gradient-clipping: 545 | 546 | Gradient Clipping 547 | ----------------- 548 | 549 | Sometimes during the execution of a stochastic optimization routine---and 550 | particularly at the start of optimization, when the problem parameters are far 551 | from their optimal values---the gradient of the loss with respect to the 552 | parameters can be extremely large. In these cases, taking a step that is 553 | proportional to the magnitude of the gradient can actually be harmful, resulting 554 | in an unpredictable parameter change. 555 | 556 | To prevent this from happening, but still preserve the iterative loss 557 | improvements when parameters are in a region with "more reasonable" gradient 558 | magnitudes, ``downhill`` implements two forms of "gradient clipping." 559 | 560 | The first gradient truncation method rescales the entire gradient vector if its 561 | L2 norm exceeds some threshold. This is accomplished using the 562 | ``max_gradient_norm`` hyperparameter:: 563 | 564 | downhill.minimize(..., max_gradient_norm=1) 565 | 566 | The second gradient truncation method clips each element of the gradient vector 567 | individually. This is accomplished using the ``max_gradient_elem`` 568 | hyperparameter:: 569 | 570 | downhill.minimize(..., max_gradient_elem=1) 571 | 572 | In both cases, extremely large gradients are rescaled to avoid steps that are 573 | too large: norm-based rescaling preserves the direction of the gradient exactly, 574 | while element-wise clipping may alter it slightly. Gradients with values smaller 575 | than the thresholds (presumably, gradients near an optimum will be small) are 576 | not affected, so the usual strategy of taking small steps proportional to the gradient is preserved near a solution.
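The effect of the two truncation rules is easy to state in plain ``numpy``. This sketch is illustrative only; ``downhill`` applies the equivalent rescaling symbolically when it builds its update expressions::

    import numpy as np

    def clip_norm(grad, max_norm):
        # Rescale the whole gradient if its L2 norm exceeds the threshold.
        norm = np.sqrt((grad * grad).sum())
        return grad * max_norm / norm if norm > max_norm else grad

    def clip_elem(grad, max_elem):
        # Clip each element of the gradient independently.
        return np.clip(grad, -max_elem, max_elem)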
577 | 578 | .. _providing-data: 579 | 580 | Providing Data 581 | ============== 582 | 583 | As described above, you'll often need to provide data to ``downhill`` so that 584 | you can compute the loss and optimize the parameters for your problem. There are 585 | two ways of passing data to ``downhill``: using arrays and using callables. 586 | 587 | .. _data-using-arrays: 588 | 589 | Using Arrays 590 | ------------ 591 | 592 | A fairly typical use case for optimizing a loss for a small-ish problem is to 593 | construct a ``numpy`` array containing the data you have:: 594 | 595 | dataset = np.load(filename) 596 | downhill.minimize(..., train=dataset) 597 | 598 | Sometimes the data available for optimizing a loss exceeds the available 599 | resources (e.g., memory) on the computer at hand. There are several ways of 600 | handling this type of situation. If your data are already in a ``numpy`` array 601 | stored on disk, you might want to try loading the array using ``mmap``:: 602 | 603 | dataset = np.load(filename, mmap_mode='r') 604 | downhill.minimize(..., train=dataset) 605 | 606 | Alternatively, you might want to load just part of the data and train on that, 607 | then load another part and train on it:: 608 | 609 | for filename in filenames: 610 | dataset = np.load(filename, mmap_mode='r') 611 | downhill.minimize(..., train=dataset) 612 | 613 | Finally, you can potentially handle large datasets by using a callable to 614 | provide data to the optimization algorithm. 615 | 616 | .. _data-using-callables: 617 | 618 | Using Callables 619 | --------------- 620 | 621 | Instead of an array of data, you can provide a callable for a :class:`Dataset 622 | <downhill.dataset.Dataset>`. This callable must take no arguments and must 623 | return a list of ``numpy`` arrays of the proper shape for your loss. 624 | 625 | During minimization, the callable will be invoked every time the optimization 626 | algorithm requires a batch of training (or validation) data. Therefore, your 627 | callable should return at least one array containing a batch of data; if your 628 | model requires multiple arrays per batch (e.g., if you are minimizing a loss 629 | that requires some "input" data as well as some "output" data), then your 630 | callable should return a list containing the correct number of arrays (e.g., an 631 | array of "inputs" and the corresponding "outputs"). 632 | 633 | For example, this code defines a ``batch()`` helper that could be used for a 634 | loss that needs one input. The callable chooses a random dataset and a random 635 | offset for each batch:: 636 | 637 | SOURCES = 'foo.npy', 'bar.npy', 'baz.npy' 638 | BATCH_SIZE = 64 639 | 640 | def batch(): 641 | X = np.load(np.random.choice(SOURCES), mmap_mode='r') 642 | # Choose an offset that leaves room for a full batch. 643 | i = np.random.randint(len(X) - BATCH_SIZE + 1) 644 | return X[i:i+BATCH_SIZE] 645 | 646 | downhill.minimize(..., train=batch) 647 | 648 | If you need to maintain more state than is reasonable from a single closure, you 649 | can also encapsulate the callable inside a class. Just make sure instances of 650 | the class are callable by defining the ``__call__`` method.
For example, this 651 | class loads data from a series of ``numpy`` arrays on disk, but only loads one 652 | of the on-disk arrays into memory at a given time:: 653 | 654 | class Loader: 655 | def __init__(self, sources=('foo.npy', 'bar.npy', 'baz.npy'), batch_size=64): 656 | self.sources = sources 657 | self.batch_size = batch_size 658 | self.src = -1 659 | self.idx = 0 660 | self.X = () 661 | 662 | def __call__(self): 663 | if self.idx + self.batch_size > len(self.X): 664 | self.idx = 0 665 | self.src = (self.src + 1) % len(self.sources) 666 | self.X = np.load(self.sources[self.src], mmap_mode='r') 667 | try: 668 | return self.X[self.idx:self.idx+self.batch_size] 669 | finally: 670 | self.idx += self.batch_size 671 | 672 | downhill.minimize(..., train=Loader()) 673 | 674 | There are almost limitless possibilities for using callables to interface with 675 | the optimization process. 676 | 677 | .. _monitoring: 678 | 679 | Monitoring 680 | ========== 681 | 682 | Sometimes while optimizing a loss, it can be helpful to "see inside" the model. 683 | In a model with a sparsity regularizer, for example, having some idea of the 684 | current sparsity of the model can help diagnose when the model is "too sparse." 685 | 686 | In ``downhill`` you can provide a series of *monitors* during optimization that 687 | satisfy this need. Monitors must be a series of named Theano expressions that 688 | evaluate to scalars; these can be provided as a dictionary that maps names to 689 | expressions, or as a list of (name, expression) ordered pairs. 690 | 691 | Suppose you want to monitor the slope and intercept values that your model is 692 | computing as it works its way through the house price modeling task. You can 693 | provide monitors for these quantities as follows:: 694 | 695 | downhill.minimize( 696 | loss, 697 | [sizes, prices], 698 | inputs=[x, y], 699 | monitors=[ 700 | ('m', m.sum()), 701 | ('b', b.sum()), 702 | ]) 703 | 704 | The Theano expressions here are sums because the ``m`` and ``b`` shared 705 | variables actually hold arrays (each containing a single element in this 706 | example). (This also helps generalize the regression loss to situations where 707 | you might have multiple independent variables, like house size and number of 708 | bedrooms.) If you preferred to provide the monitor values as a dictionary, it 709 | would look like:: 710 | 711 | downhill.minimize( 712 | loss, 713 | [sizes, prices], 714 | inputs=[x, y], 715 | monitors=dict(m=m.sum(), b=b.sum())) 716 | 717 | Note that if you construct an :class:`Optimizer <downhill.base.Optimizer>` 718 | directly, then you need to pass the monitors when you create your optimizer 719 | instance:: 720 | 721 | opt = downhill.build( 722 | 'nag', loss=loss, inputs=[x, y], 723 | monitors=dict(m=m.sum(), b=b.sum())) 724 | 725 | Gradients 726 | --------- 727 | 728 | Sometimes when setting parameters like ``learning_rate`` and 729 | ``max_gradient_norm``, it can be quite useful to see how large the gradients of 730 | your model are. These quantities can be included in the monitors easily by 731 | setting the ``monitor_gradients`` flag:: 732 | 733 | downhill.minimize( 734 | loss, 735 | [sizes, prices], 736 | inputs=[x, y], 737 | monitor_gradients=True) 738 | 739 | This will include one monitor for each parameter in your model, indicating the 740 | squared L2 norm of the gradient (averaged across mini-batches).
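When you drive the optimization yourself, the monitor values appear in the dictionaries yielded by :func:`iterate() <downhill.base.Optimizer.iterate>`, keyed by the monitor names. Here is a sketch, assuming the optimizer was built with the ``m`` and ``b`` monitors shown above and with ``train`` and ``valid`` datasets as in the iterative example::

    for tm, vm in opt.iterate(train, valid):
        print('slope:', tm['m'], 'intercept:', tm['b'], 'loss:', tm['loss'])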
741 | More Information 742 | ================ 743 | 744 | This concludes the ``downhill`` guide! Have a good time harnessing the power of 745 | your GPU to optimize your scalar losses! 746 | 747 | If you need more information or just want to discuss things, sign up for the 748 | `mailing list`_, and check out the project page at github_. 749 | 750 | .. _mailing list: https://groups.google.com/forum/#!forum/downhill-users 751 | .. _github: https://github.com/lmjohns3/downhill 752 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. figure:: _static/rosenbrock-nag.png 2 | :figclass: banana 3 | 4 | The ``downhill`` package provides algorithms for minimizing scalar loss 5 | functions that are defined using Theano_. 6 | 7 | Several optimization algorithms are included: 8 | 9 | - :class:`ADADELTA <downhill.adaptive.ADADELTA>` 10 | - :class:`ADAGRAD <downhill.adaptive.ADAGRAD>` 11 | - :class:`Adam <downhill.adaptive.Adam>` 12 | - :class:`Equilibrated SGD <downhill.adaptive.ESGD>` 13 | - :class:`Nesterov's Accelerated Gradient <downhill.first_order.NAG>` 14 | - :class:`RMSProp <downhill.adaptive.RMSProp>` 15 | - :class:`Resilient Backpropagation <downhill.adaptive.RProp>` 16 | - :class:`Stochastic Gradient Descent <downhill.first_order.SGD>` 17 | 18 | All algorithms permit the use of regular or Nesterov-style momentum as well. 19 | 20 | The source code for ``downhill`` lives at http://github.com/lmjohns3/downhill, 21 | the documentation lives at http://downhill.readthedocs.org, and announcements 22 | and discussion happen on the `mailing list`_. 23 | 24 | .. _Theano: http://deeplearning.net/software/theano/ 25 | .. _mailing list: https://groups.google.com/forum/#!forum/downhill-users 26 | 27 | Quick Start: Matrix Factorization 28 | ================================= 29 | 30 | Let's say you want to compute a sparse, low-rank approximation for some 31 | 1000-dimensional data that you have lying around. You can represent a batch of 32 | :math:`m` data points :math:`X \in \mathbb{R}^{m \times 1000}` as the product 33 | of a sparse coefficient matrix :math:`U \in \mathbb{R}^{m \times k}` and a 34 | low-rank basis matrix :math:`V \in \mathbb{R}^{k \times 1000}`. You might 35 | represent the loss as 36 | 37 | .. math:: 38 | 39 | \mathcal{L} = \| X - UV \|_2^2 + \alpha \| U \|_1 + \beta \| V \|_2 40 | 41 | where the first term represents the approximation error, the second represents 42 | the sparsity of the representation, and the third prevents the basis vectors 43 | from growing too large. 44 | 45 | This is pretty straightforward to model using Theano. Once you set up the 46 | appropriate variables and an expression for the loss, you can optimize the loss 47 | with respect to the variables using a single call to :func:`downhill.minimize`:: 48 | 49 | import downhill 50 | import numpy as np 51 | import theano 52 | import theano.tensor as TT 53 | 54 | FLOAT = 'df'[theano.config.floatX == 'float32'] 55 | 56 | def rand(a, b): 57 | return np.random.randn(a, b).astype(FLOAT) 58 | 59 | A, B, K = 20, 5, 3 60 | 61 | # Set up a matrix factorization problem to optimize. 62 | u = theano.shared(rand(A, K), name='u') 63 | v = theano.shared(rand(K, B), name='v') 64 | z = TT.matrix() 65 | err = TT.sqr(z - TT.dot(u, v)) 66 | loss = err.mean() + abs(u).mean() + (v * v).mean() 67 | 68 | # Minimize the regularized loss with respect to a data matrix. 69 | y = np.dot(rand(A, K), rand(K, B)) + rand(A, B) 70 | 71 | # Monitor during optimization. 72 | monitors = (('err', err.mean()), 73 | ('|u|<0.1', (abs(u) < 0.1).mean()), 74 | ('|v|<0.1', (abs(v) < 0.1).mean())) 75 | 76 | downhill.minimize( 77 | loss=loss, 78 | train=[y], 79 | patience=0, 80 | batch_size=A, # Process y as a single batch. 81 | max_gradient_norm=1, # Prevent gradient explosion!
82 |         learning_rate=0.1,
83 |         monitors=monitors,
84 |         monitor_gradients=True)
85 | 
86 |     # Print out the optimized coefficients u and basis v.
87 |     print('u =', u.get_value())
88 |     print('v =', v.get_value())
89 | 
90 | If you prefer to maintain more control over your model during optimization,
91 | ``downhill`` provides an iterative optimization interface::
92 | 
93 |     opt = downhill.build(algo='rmsprop',
94 |                          loss=loss,
95 |                          monitors=monitors,
96 |                          monitor_gradients=True)
97 | 
98 |     for metrics, _ in opt.iterate(train=[[y]],
99 |                                   patience=0,
100 |                                   batch_size=A,
101 |                                   max_gradient_norm=1,
102 |                                   learning_rate=0.1):
103 |         print(metrics)
104 | 
105 | If that's still not enough, you can just plain ask ``downhill`` for the updates
106 | to your model variables and do everything else yourself::
107 | 
108 |     updates = downhill.build('rmsprop', loss).get_updates(
109 |         batch_size=A, max_gradient_norm=1, learning_rate=0.1)
110 |     func = theano.function([z], loss, updates=list(updates))
111 |     for _ in range(100):
112 |         print(func(y))  # Evaluate func and apply variable updates.
113 | 
114 | Documentation
115 | =============
116 | 
117 | .. toctree::
118 |    :maxdepth: 2
119 | 
120 |    guide
121 |    reference
122 | 
123 | Indices and tables
124 | ==================
125 | 
126 | - :ref:`genindex`
127 | - :ref:`modindex`
128 | 
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 | 
3 | REM Command file for Sphinx documentation
4 | 
5 | if "%SPHINXBUILD%" == "" (
6 | 	set SPHINXBUILD=sphinx-build
7 | )
8 | set BUILDDIR=_build
9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
11 | if NOT "%PAPER%" == "" (
12 | 	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
13 | 	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
14 | )
15 | 
16 | if "%1" == "" goto help
17 | 
18 | if "%1" == "help" (
19 | 	:help
20 | 	echo.Please use `make ^<target^>` where ^<target^> is one of
21 | 	echo.  html       to make standalone HTML files
22 | 	echo.  dirhtml    to make HTML files named index.html in directories
23 | 	echo.  singlehtml to make a single large HTML file
24 | 	echo.  pickle     to make pickle files
25 | 	echo.  json       to make JSON files
26 | 	echo.  htmlhelp   to make HTML files and a HTML help project
27 | 	echo.  qthelp     to make HTML files and a qthelp project
28 | 	echo.  devhelp    to make HTML files and a Devhelp project
29 | 	echo.  epub       to make an epub
30 | 	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
31 | 	echo.  text       to make text files
32 | 	echo.  man        to make manual pages
33 | 	echo.  texinfo    to make Texinfo files
34 | 	echo.  gettext    to make PO message catalogs
35 | 	echo.  changes    to make an overview over all changed/added/deprecated items
36 | 	echo.  linkcheck  to check all external links for integrity
37 | 	echo.  doctest    to run all doctests embedded in the documentation if enabled
38 | 	goto end
39 | )
40 | 
41 | if "%1" == "clean" (
42 | 	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
43 | 	del /q /s %BUILDDIR%\*
44 | 	goto end
45 | )
46 | 
47 | if "%1" == "html" (
48 | 	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
49 | 	if errorlevel 1 exit /b 1
50 | 	echo.
51 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
52 | 	goto end
53 | )
54 | 
55 | if "%1" == "dirhtml" (
56 | 	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
57 | 	if errorlevel 1 exit /b 1
58 | 	echo.
59 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
60 | goto end 61 | ) 62 | 63 | if "%1" == "singlehtml" ( 64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 68 | goto end 69 | ) 70 | 71 | if "%1" == "pickle" ( 72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished; now you can process the pickle files. 76 | goto end 77 | ) 78 | 79 | if "%1" == "json" ( 80 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished; now you can process the JSON files. 84 | goto end 85 | ) 86 | 87 | if "%1" == "htmlhelp" ( 88 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can run HTML Help Workshop with the ^ 92 | .hhp project file in %BUILDDIR%/htmlhelp. 93 | goto end 94 | ) 95 | 96 | if "%1" == "qthelp" ( 97 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 98 | if errorlevel 1 exit /b 1 99 | echo. 100 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 101 | .qhcp project file in %BUILDDIR%/qthelp, like this: 102 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\theanets.qhcp 103 | echo.To view the help file: 104 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\theanets.ghc 105 | goto end 106 | ) 107 | 108 | if "%1" == "devhelp" ( 109 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 110 | if errorlevel 1 exit /b 1 111 | echo. 112 | echo.Build finished. 113 | goto end 114 | ) 115 | 116 | if "%1" == "epub" ( 117 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 118 | if errorlevel 1 exit /b 1 119 | echo. 120 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 121 | goto end 122 | ) 123 | 124 | if "%1" == "latex" ( 125 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 129 | goto end 130 | ) 131 | 132 | if "%1" == "text" ( 133 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The text files are in %BUILDDIR%/text. 137 | goto end 138 | ) 139 | 140 | if "%1" == "man" ( 141 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 145 | goto end 146 | ) 147 | 148 | if "%1" == "texinfo" ( 149 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 150 | if errorlevel 1 exit /b 1 151 | echo. 152 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 153 | goto end 154 | ) 155 | 156 | if "%1" == "gettext" ( 157 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 158 | if errorlevel 1 exit /b 1 159 | echo. 160 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 161 | goto end 162 | ) 163 | 164 | if "%1" == "changes" ( 165 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 166 | if errorlevel 1 exit /b 1 167 | echo. 168 | echo.The overview file is in %BUILDDIR%/changes. 169 | goto end 170 | ) 171 | 172 | if "%1" == "linkcheck" ( 173 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 174 | if errorlevel 1 exit /b 1 175 | echo. 176 | echo.Link check complete; look for any errors in the above output ^ 177 | or in %BUILDDIR%/linkcheck/output.txt. 
178 | goto end 179 | ) 180 | 181 | if "%1" == "doctest" ( 182 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 183 | if errorlevel 1 exit /b 1 184 | echo. 185 | echo.Testing of doctests in the sources finished, look at the ^ 186 | results in %BUILDDIR%/doctest/output.txt. 187 | goto end 188 | ) 189 | 190 | :end 191 | -------------------------------------------------------------------------------- /docs/reference.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Reference 3 | ========= 4 | 5 | .. automodule:: downhill 6 | :no-members: 7 | :no-inherited-members: 8 | 9 | .. autosummary:: 10 | :toctree: generated/ 11 | 12 | minimize 13 | 14 | Base 15 | ==== 16 | 17 | .. automodule:: downhill.base 18 | :no-members: 19 | :no-inherited-members: 20 | 21 | .. autosummary:: 22 | :toctree: generated/ 23 | 24 | build 25 | Optimizer 26 | 27 | First-Order Optimizers 28 | ====================== 29 | 30 | .. automodule:: downhill.first_order 31 | :no-members: 32 | :no-inherited-members: 33 | 34 | .. autosummary:: 35 | :toctree: generated/ 36 | 37 | SGD 38 | NAG 39 | 40 | Adaptive Optimizers 41 | =================== 42 | 43 | .. automodule:: downhill.adaptive 44 | :no-members: 45 | :no-inherited-members: 46 | 47 | .. autosummary:: 48 | :toctree: generated/ 49 | 50 | ADADELTA 51 | ADAGRAD 52 | Adam 53 | ESGD 54 | RMSProp 55 | RProp 56 | 57 | Datasets 58 | ======== 59 | 60 | .. automodule:: downhill.dataset 61 | :no-members: 62 | :no-inherited-members: 63 | 64 | .. autosummary:: 65 | :toctree: generated/ 66 | 67 | Dataset 68 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | numpydoc 2 | sphinx-better-theme 3 | -------------------------------------------------------------------------------- /downhill/__init__.py: -------------------------------------------------------------------------------- 1 | from .adaptive import * 2 | from .base import build, Optimizer 3 | from .dataset import Dataset 4 | from .first_order import * 5 | 6 | __version__ = '0.5.0pre' 7 | 8 | 9 | def minimize(loss, train, valid=None, params=None, inputs=None, algo='rmsprop', 10 | updates=(), monitors=(), monitor_gradients=False, batch_size=32, 11 | train_batches=None, valid_batches=None, **kwargs): 12 | '''Minimize a loss function with respect to some symbolic parameters. 13 | 14 | Additional keyword arguments are passed to the underlying :class:`Optimizer 15 | ` instance. 16 | 17 | Parameters 18 | ---------- 19 | loss : Theano expression 20 | Loss function to minimize. This must be a scalar-valued expression. 21 | train : :class:`Dataset `, ndarray, or callable 22 | Dataset to use for computing gradient updates. 23 | valid : :class:`Dataset `, ndarray, or callable, optional 24 | Dataset to use for validating the minimization process. The training 25 | dataset is used if this is not provided. 26 | params : list of Theano variables, optional 27 | Symbolic variables to adjust to minimize the loss. If not given, these 28 | will be computed automatically by walking the computation graph. 29 | inputs : list of Theano variables, optional 30 | Symbolic variables required to compute the loss. If not given, these 31 | will be computed automatically by walking the computation graph. 32 | algo : str, optional 33 | Name of the minimization algorithm to use. Must be one of the strings 34 | that can be passed to :func:`build`. 
Defaults to ``'rmsprop'``.
35 |     updates : list of update pairs, optional
36 |         A list of pairs providing updates for the internals of the loss
37 |         computation. Normally this is empty, but it can be provided if the loss,
38 |         for example, requires an update to an internal random number generator.
39 |     monitors : dict or sequence of (str, Theano expression) tuples, optional
40 |         Additional values to monitor during optimization. These must be provided
41 |         as either a sequence of (name, expression) tuples, or as a dictionary
42 |         mapping string names to Theano expressions.
43 |     monitor_gradients : bool, optional
44 |         If True, add monitors to log the norms of the parameter gradients during
45 |         optimization. Defaults to False.
46 |     batch_size : int, optional
47 |         Size of batches provided by datasets. Defaults to 32.
48 |     train_batches : int, optional
49 |         Number of batches of training data to iterate over during one pass of
50 |         optimization. Defaults to None, which uses the entire training dataset.
51 |     valid_batches : int, optional
52 |         Number of batches of validation data to iterate over during one pass of
53 |         validation. Defaults to None, which uses the entire validation dataset.
54 | 
55 |     Returns
56 |     -------
57 |     train_monitors : dict
58 |         A dictionary mapping monitor names to monitor values. This dictionary
59 |         will always contain the ``'loss'`` key, giving the value of the loss
60 |         evaluated on the training dataset.
61 |     valid_monitors : dict
62 |         A dictionary mapping monitor names to monitor values, evaluated on the
63 |         validation dataset. This dictionary will always contain the ``'loss'``
64 |         key, giving the value of the loss function. Because validation is not
65 |         always computed after every optimization update, these monitor values
66 |         may be "stale"; however, they will always contain the most recently
67 |         computed values.
68 |     '''
69 |     if not isinstance(train, Dataset):
70 |         train = Dataset(
71 |             train,
72 |             name='train',
73 |             batch_size=batch_size,
74 |             iteration_size=train_batches,
75 |         )
76 |     if valid is not None and not isinstance(valid, Dataset):
77 |         valid = Dataset(
78 |             valid,
79 |             name='valid',
80 |             batch_size=batch_size,
81 |             iteration_size=valid_batches,
82 |         )
83 |     return build(
84 |         algo,
85 |         loss=loss,
86 |         params=params,
87 |         inputs=inputs,
88 |         updates=updates,
89 |         monitors=monitors,
90 |         monitor_gradients=monitor_gradients,
91 |     ).minimize(train, valid, **kwargs)
92 | 
--------------------------------------------------------------------------------
/downhill/adaptive.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | '''This module defines gradient descent optimizers with adaptive learning rates.
4 | '''
5 | 
6 | import numpy as np
7 | import theano
8 | import theano.tensor as TT
9 | 
10 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
11 | 
12 | from .base import Optimizer
13 | from . import util
14 | 
15 | __all__ = ['RProp', 'RMSProp', 'ADAGRAD', 'ADADELTA', 'ESGD', 'Adam']
16 | 
17 | 
18 | class RProp(Optimizer):
19 |     r'''Resilient backpropagation optimizer.
20 | 
21 |     Parameters
22 |     ----------
23 |     rprop_increase: float, optional (default 1.01)
24 |         Increase step sizes at this rate when the gradient sign stays the same.
25 |     rprop_decrease: float, optional (default 0.99)
26 |         Decrease step sizes at this rate when the gradient sign changes.
27 |     rprop_min_step: float, optional (default 0)
28 |         Minimum step size for any parameter.
29 |     rprop_max_step: float, optional (default 100)
30 |         Maximum step size for any parameter.
31 |     momentum: float, optional (default 0)
32 |         Momentum to apply to the updates, if any. Defaults to 0 (no momentum).
33 |         Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of
34 |         momentum.
35 |     nesterov: bool, optional (default False)
36 |         Set this to ``True`` to enable Nesterov-style momentum updates, whenever
37 |         ``momentum`` is nonzero.
38 | 
39 |     Notes
40 |     -----
41 | 
42 |     The RProp method takes small steps in parameter space using local gradient
43 |     information. RProp is unlike "vanilla" first-order techniques like
44 |     :class:`SGD <downhill.first_order.SGD>`, however, because only the signs of
45 |     the gradients are taken into account when making parameter updates. That is,
46 |     the step size for each parameter is independent of the magnitude of the
47 |     gradient for that parameter.
48 | 
49 |     To accomplish this, RProp maintains a separate learning rate for every
50 |     parameter in the model, and adjusts this learning rate based on the
51 |     consistency of the sign of the gradient over time. Whenever two consecutive
52 |     gradients for a parameter have the same sign, the learning rate for that
53 |     parameter increases, and whenever the signs disagree, the learning rate
54 |     decreases. This has a similar effect to momentum-based stochastic gradient
55 |     methods but effectively maintains parameter-specific learning rates.
56 | 
57 |     .. math::
58 |         \begin{eqnarray*}
59 |         && \mbox{if } \frac{\partial\mathcal{L}}{\partial p}_{t-1}
60 |            \frac{\partial\mathcal{L}}{\partial p} > 0 \\
61 |         && \qquad \Delta_t = \min (\eta_+\Delta_{t-1}, \Delta_+) \\
62 |         && \mbox{if } \frac{\partial\mathcal{L}}{\partial p}_{t-1}
63 |            \frac{\partial\mathcal{L}}{\partial p} < 0 \\
64 |         && \qquad \Delta_t = \max (\eta_-\Delta_{t-1}, \Delta_-) \\
65 |         && \qquad \frac{\partial\mathcal{L}}{\partial p} = 0 \\
66 |         && p_{t+1} = p_t - \mbox{sgn}\left(
67 |            \frac{\partial\mathcal{L}}{\partial p}\right) \Delta_t
68 |         \end{eqnarray*}
69 | 
70 |     Here, :math:`\mbox{sgn}(\cdot)` is the sign function (i.e., returns -1 if
71 |     its argument is negative and 1 otherwise), :math:`\eta_-` and :math:`\eta_+`
72 |     are the amount to decrease (increase) the step size if the gradients
73 |     disagree (agree) in sign, and :math:`\Delta_+` and :math:`\Delta_-` are the
74 |     maximum and minimum step size.
75 | 
76 |     The implementation here is actually the "iRprop-" variant of RProp described
77 |     in Algorithm 4 from [Igel00]_. This variant resets the running gradient
78 |     estimates to zero in cases where the previous and current gradients have
79 |     switched signs.
80 | 
81 |     References
82 |     ----------
83 | 
84 |     .. [Ried92] M. Riedmiller & H. Braun. (1992) "Rprop - A Fast Adaptive
85 |        Learning Algorithm." In Proceedings of the International Symposium on
86 |        Computer and Information Science VII.
87 | 
88 |     .. [Igel00] C. Igel & M. Hüsken. (2000) "Improving the Rprop Learning
89 |        Algorithm." In Proceedings of the Second International Symposium on
90 |        Neural Computation.
91 |        http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.17.1332
92 |     '''
93 | 
94 |     def _prepare(self,
95 |                  rprop_increase=1.01,
96 |                  rprop_decrease=0.99,
97 |                  rprop_min_step=0,
98 |                  rprop_max_step=100,
99 |                  **kwargs):
100 |         self.step_increase = util.as_float(rprop_increase)
101 |         self.step_decrease = util.as_float(rprop_decrease)
102 |         self.min_step = util.as_float(rprop_min_step)
103 |         self.max_step = util.as_float(rprop_max_step)
104 |         util.log_param('rprop_increase', rprop_increase)
105 |         util.log_param('rprop_decrease', rprop_decrease)
106 |         util.log_param('rprop_min_step', rprop_min_step)
107 |         util.log_param('rprop_max_step', rprop_max_step)
108 |         super(RProp, self)._prepare(**kwargs)
109 | 
110 |     def _get_updates_for(self, param, grad):
111 |         grad_tm1 = util.shared_like(param, 'grad')
112 |         step_tm1 = util.shared_like(param, 'step', self.learning_rate.eval())
113 |         test = grad * grad_tm1
114 |         diff = TT.lt(test, 0)
115 |         steps = step_tm1 * (TT.eq(test, 0) +
116 |                             TT.gt(test, 0) * self.step_increase +
117 |                             diff * self.step_decrease)
118 |         step = TT.minimum(self.max_step, TT.maximum(self.min_step, steps))
119 |         grad = grad - diff * grad
120 |         yield param, TT.sgn(grad) * step
121 |         yield grad_tm1, grad
122 |         yield step_tm1, step
123 | 
124 | 
125 | class ADAGRAD(Optimizer):
126 |     r'''ADAGRAD optimizer.
127 | 
128 |     Parameters
129 |     ----------
130 |     rms_regularizer: float, optional (default 1e-8)
131 |         Regularize the learning rate scaling factor by this :math:`\epsilon`.
132 |     momentum: float, optional (default 0)
133 |         Momentum to apply to the updates, if any. Defaults to 0 (no momentum).
134 |         Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of
135 |         momentum.
136 |     nesterov: bool, optional (default False)
137 |         Set this to ``True`` to enable Nesterov-style momentum updates, whenever
138 |         ``momentum`` is nonzero.
139 | 
140 |     Notes
141 |     -----
142 | 
143 |     The ADAGRAD method uses the same general strategy as all first-order
144 |     stochastic gradient methods, in the sense that these methods make small
145 |     parameter adjustments iteratively using local derivative information.
146 | 
147 |     The difference with ADAGRAD is that as gradients are computed during each
148 |     parameter update, their squares are accumulated, and this accumulated value
149 |     is used to rescale the global learning rate :math:`\alpha` separately for
150 |     each parameter.
151 | 
152 |     .. math::
153 |         \begin{eqnarray*}
154 |         g_{t+1} &=& g_t + \left(\frac{\partial\mathcal{L}}{\partial p}\right)^2 \\
155 |         p_{t+1} &=& p_t - \frac{\alpha}{\sqrt{g_{t+1} + \epsilon}}
156 |         \frac{\partial\mathcal{L}}{\partial p}
157 |         \end{eqnarray*}
158 | 
159 |     Like the other adaptive learning methods, this learning method effectively
160 |     maintains a sort of parameter-specific learning rate. Unlike
161 |     :class:`RMSProp` and :class:`ADADELTA`, however, in ADAGRAD, the gradient
162 |     magnitudes accumulate throughout training, which has the effect of scaling
163 |     the learning rate for each parameter, but also effectively anneals the
164 |     learning rate overall as training progresses.
165 | 
166 |     In this implementation, the scale values are regularized (made less extreme)
167 |     by :math:`\epsilon`, which is specified using the ``rms_regularizer``
168 |     parameter.
169 | 
170 |     References
171 |     ----------
172 | 
173 |     .. [Duch10] J. Duchi, E. Hazan, & Y. Singer (2010) "Adaptive subgradient
174 |        methods for online learning and stochastic optimization." Proc. Conference
175 |        on Learning Theory (COLT).
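
    Examples
    --------
    A tiny NumPy rendition of the ADAGRAD update, as an illustrative sketch
    only (it is not part of the package API, but it mirrors the Theano
    expressions built in ``_get_updates_for``)::

        import numpy as np

        def adagrad_step(p, grad, g2, alpha=1e-4, eps=1e-8):
            # Accumulate squared gradients over the whole run.
            g2 = g2 + grad * grad
            # Rescale the global rate alpha separately per parameter.
            return p - alpha * grad / np.sqrt(g2 + eps), g2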
176 | ''' 177 | 178 | def _prepare(self, rms_regularizer=1e-8, **kwargs): 179 | self.epsilon = util.as_float(rms_regularizer) 180 | util.log_param('rms_regularizer', rms_regularizer) 181 | super(ADAGRAD, self)._prepare(**kwargs) 182 | 183 | def _get_updates_for(self, param, grad): 184 | g2_tm1 = util.shared_like(param, 'g2_acc') 185 | g2_t = g2_tm1 + grad * grad 186 | yield g2_tm1, g2_t 187 | yield param, grad * self.learning_rate / TT.sqrt(g2_t + self.epsilon) 188 | 189 | 190 | class RMSProp(Optimizer): 191 | r'''RMSProp optimizer. 192 | 193 | Parameters 194 | ---------- 195 | learning_rate: float, optional (default 1e-4) 196 | Step size to take during optimization. 197 | rms_halflife: float, optional (default 14) 198 | Compute RMS gradient values using an exponentially weighted moving 199 | average that decays with this halflife. 200 | rms_regularizer: float, optional (default 1e-8) 201 | Regularize RMS gradient values by this :math:`\epsilon`. 202 | momentum: float, optional (default 0) 203 | Momentum to apply to the updates, if any. Defaults to 0 (no momentum). 204 | Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of 205 | momentum. 206 | nesterov: bool, optional (default False) 207 | Set this to ``True`` to enable Nesterov-style momentum updates, whenever 208 | ``momentum`` is nonzero. 209 | 210 | Notes 211 | ----- 212 | 213 | The RMSProp method uses the same general strategy as all first-order 214 | stochastic gradient methods, in the sense that these methods make small 215 | parameter adjustments iteratively using local derivative information. 216 | 217 | The difference here is that as gradients are computed during each parameter 218 | update, an exponentially-weighted moving average (EWMA) of gradient 219 | magnitudes is maintained as well. At each update, the EWMA is used to 220 | compute the root-mean-square (RMS) gradient value that's been seen in the 221 | recent past. The actual gradient is normalized by this RMS scaling factor 222 | before being applied to update the parameters. Intuitively, this makes 223 | RMSProp take steps near 1 whenever the gradient is of constant magnitude, 224 | and larger steps whenever the local scale of the gradient starts to 225 | increase. 226 | 227 | .. math:: 228 | \begin{eqnarray*} 229 | f_{t+1} &=& \gamma f_t + (1 - \gamma) \frac{\partial\mathcal{L}}{\partial p} \\ 230 | g_{t+1} &=& \gamma g_t + (1 - \gamma) \left( 231 | \frac{\partial\mathcal{L}}{\partial p}\right)^2 \\ 232 | p_{t+1} &=& p_t - \frac{\alpha}{\sqrt{g_{t+1} - f_{t+1}^2 + \epsilon}} 233 | \frac{\partial\mathcal{L}}{\partial p} 234 | \end{eqnarray*} 235 | 236 | Like :class:`RProp`, this learning method effectively maintains a sort of 237 | parameter-specific momentum value, but this method takes into account both 238 | the sign and the magnitude of the gradient for each parameter. 239 | 240 | In this algorithm, RMS values are regularized (made less extreme) by 241 | :math:`\epsilon`, which is specified using the ``rms_regularizer`` keyword 242 | argument. 243 | 244 | The weight parameter :math:`\gamma` for the EWMA window is computed from the 245 | ``rms_halflife`` keyword argument, such that the actual EWMA weight varies 246 | inversely with the halflife :math:`h`: :math:`\gamma = e^{\frac{-\ln 247 | 2}{h}}`. 248 | 249 | The implementation here is taken from [Grav13]_, equations (38)--(45). 
250 |     Graves' implementation in particular seems to have introduced the
251 |     :math:`f_t` terms into the RMS computation; these terms appear to act as a
252 |     sort of momentum for the RMS values.
253 | 
254 |     References
255 |     ----------
256 | 
257 |     .. [Grav13] A. Graves. (2013) "Generating Sequences With Recurrent Neural
258 |        Networks." http://arxiv.org/abs/1308.0850
259 | 
260 |     '''
261 | 
262 |     def _prepare(self, rms_halflife=14, rms_regularizer=1e-8, **kwargs):
263 |         self.ewma = util.as_float(np.exp(-np.log(2) / rms_halflife))
264 |         self.epsilon = util.as_float(rms_regularizer)
265 |         util.log_param('rms_halflife', rms_halflife)
266 |         util.log_param('rms_regularizer', rms_regularizer)
267 |         super(RMSProp, self)._prepare(**kwargs)
268 | 
269 |     def _get_updates_for(self, param, grad):
270 |         g1_tm1 = util.shared_like(param, 'g1_ewma')
271 |         g2_tm1 = util.shared_like(param, 'g2_ewma')
272 |         g1_t = self.ewma * g1_tm1 + (1 - self.ewma) * grad
273 |         g2_t = self.ewma * g2_tm1 + (1 - self.ewma) * grad * grad
274 |         rms = TT.sqrt(g2_t - g1_t * g1_t + self.epsilon)
275 |         yield g1_tm1, g1_t
276 |         yield g2_tm1, g2_t
277 |         yield param, self.learning_rate * grad / rms
278 | 
279 | 
280 | class ADADELTA(RMSProp):
281 |     r'''ADADELTA optimizer.
282 | 
283 |     Parameters
284 |     ----------
285 |     rms_halflife: float, optional (default 14)
286 |         Compute RMS gradient values using an exponentially weighted moving
287 |         average that decays with this halflife.
288 |     rms_regularizer: float, optional (default 1e-8)
289 |         Regularize RMS gradient values by this :math:`\epsilon`.
290 |     momentum: float, optional (default 0)
291 |         Momentum to apply to the updates, if any. Defaults to 0 (no momentum).
292 |         Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of
293 |         momentum.
294 |     nesterov: bool, optional (default False)
295 |         Set this to ``True`` to enable Nesterov-style momentum updates, whenever
296 |         ``momentum`` is nonzero.
297 | 
298 |     Notes
299 |     -----
300 | 
301 |     The ADADELTA method uses the same general strategy as all first-order
302 |     stochastic gradient methods, in the sense that these methods make small
303 |     parameter adjustments iteratively using local derivative information.
304 | 
305 |     The difference with ADADELTA is that as gradients are computed during each
306 |     parameter update, an exponentially-weighted moving average (EWMA) of recent
307 |     gradient values, as well as an EWMA of recent parameter steps, are maintained
308 |     as well. The actual gradient is normalized by the ratio of the
309 |     root-mean-square (RMS) parameter step size to the RMS gradient magnitude.
310 | 
311 |     .. math::
312 |         \begin{eqnarray*}
313 |         g_{t+1} &=& \gamma g_t + (1 - \gamma) \left(
314 |            \frac{\partial\mathcal{L}}{\partial p}\right)^2 \\
315 |         v_{t+1} &=& \frac{\sqrt{x_t + \epsilon}}{\sqrt{g_{t+1} + \epsilon}}
316 |            \frac{\partial\mathcal{L}}{\partial p} \\
317 |         x_{t+1} &=& \gamma x_t + (1 - \gamma) v_{t+1}^2 \\
318 |         p_{t+1} &=& p_t - v_{t+1}
319 |         \end{eqnarray*}
320 | 
321 |     Like :class:`RProp` and the :class:`RMSProp`--:class:`ESGD` family, this
322 |     learning method effectively maintains a sort of parameter-specific momentum
323 |     value. The primary difference between this method and :class:`RMSProp` is
324 |     that ADADELTA additionally incorporates a sliding window of RMS parameter
325 |     step sizes, (somewhat) obviating the need for a learning rate parameter.
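
    As an illustrative sketch only (not part of the package API), one such
    update step in plain NumPy looks like the following, with ``g2`` and
    ``x2`` holding the two EWMA accumulators::

        import numpy as np

        def adadelta_step(p, grad, g2, x2, gamma=0.95, eps=1e-8):
            g2 = gamma * g2 + (1 - gamma) * grad * grad    # EWMA of grad**2
            delta = grad * np.sqrt(x2 + eps) / np.sqrt(g2 + eps)
            x2 = gamma * x2 + (1 - gamma) * delta * delta  # EWMA of step**2
            return p - delta, g2, x2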
326 | 327 | In this implementation, the RMS values are regularized (made less extreme) 328 | by :math:`\epsilon`, which is specified using the ``rms_regularizer`` 329 | parameter. 330 | 331 | The weight parameter :math:`\gamma` for the EWMA window is computed from the 332 | ``rms_halflife`` keyword argument, such that the actual EWMA weight varies 333 | inversely with the halflife :math:`h`: :math:`\gamma = e^{\frac{-\ln 334 | 2}{h}}`. 335 | 336 | References 337 | ---------- 338 | 339 | .. [Zeil12] M. Zeiler. (2012) "ADADELTA: An adaptive learning rate method." 340 | http://arxiv.org/abs/1212.5701 341 | ''' 342 | 343 | def _get_updates_for(self, param, grad): 344 | x2_tm1 = util.shared_like(param, 'x2_ewma') 345 | g2_tm1 = util.shared_like(param, 'g2_ewma') 346 | g2_t = self.ewma * g2_tm1 + (1 - self.ewma) * grad * grad 347 | delta = grad * TT.sqrt(x2_tm1 + self.epsilon) / TT.sqrt(g2_t + self.epsilon) 348 | x2_t = self.ewma * x2_tm1 + (1 - self.ewma) * delta * delta 349 | yield g2_tm1, g2_t 350 | yield x2_tm1, x2_t 351 | yield param, delta 352 | 353 | 354 | class ESGD(RMSProp): 355 | r'''Equilibrated SGD computes a diagonal Hessian preconditioner. 356 | 357 | Parameters 358 | ---------- 359 | hv_method: {'rop', 'lop', 'grad'}, optional 360 | The Hv (Hessian-vector) product will be computed using the given method. 361 | The default is to use 'rop'. 362 | learning_rate: float, optional (default 1e-4) 363 | Step size to take during optimization. 364 | rms_halflife: float, optional (default 14) 365 | Compute RMS gradient values using an exponentially weighted moving 366 | average that decays with this halflife. 367 | rms_regularizer: float, optional (default 1e-8) 368 | Regularize RMS gradient values by this :math:`\epsilon`. 369 | momentum: float, optional (default 0) 370 | Momentum to apply to the updates, if any. Defaults to 0 (no momentum). 371 | Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of 372 | momentum. 373 | nesterov: bool, optional (default False) 374 | Set this to ``True`` to enable Nesterov-style momentum updates, whenever 375 | ``momentum`` is nonzero. 376 | 377 | Notes 378 | ----- 379 | 380 | The ESGD method uses the same general strategy as all first-order 381 | stochastic gradient methods, in the sense that these methods make small 382 | parameter adjustments iteratively using local derivative information. 383 | 384 | The difference here is that as gradients are computed during each parameter 385 | update, an exponentially-weighted moving average (EWMA) of estimates of the 386 | diagonal of the Hessian (the matrix of second derivatives) is maintained as 387 | well. At each update, the EWMA is used to compute the root-mean-square (RMS) 388 | diagonal value that's been seen in the recent past. The actual gradient is 389 | scaled by the inverse of this diagonal preconditioner before being applied 390 | to update the parameters. Intuitively, this causes the algorithm to 391 | "reshape" the loss function in parameter space, such that directions of 392 | steep gradient (i.e., large diagonal values) and directions of shallow 393 | gradient (i.e., small diagonal values) are scaled to be approximately the 394 | same slope. 395 | 396 | The diagonal estimates are computed using a nice trick: A vector :math:`r 397 | \sim \mathcal{N}(0, 1)` consisting of standard normal values is sampled 398 | randomly at each update step, and the value of :math:`Hr` is computed 399 | symbolically. These vector values tend to approximate the diagonal of the 400 | Hessian. 
Because :math:`Hr` is itself a vector, the full Hessian :math:`H`
401 |     does not need to be computed or stored.
402 | 
403 |     .. math::
404 |         \begin{eqnarray*}
405 |         r &\sim& \mathcal{N}(0, 1) \\
406 |         Hr &=& \frac{\partial^2 \mathcal{L}}{\partial p^2}r \\
407 |         D_{t+1} &=& \gamma D_t + (1 - \gamma) (Hr)^2 \\
408 |         p_{t+1} &=& p_t - \frac{\alpha}{\sqrt{D_{t+1} + \epsilon}}
409 |         \frac{\partial\mathcal{L}}{\partial p}
410 |         \end{eqnarray*}
411 | 
412 |     Like :class:`RProp` and the :class:`ADADELTA`--:class:`RMSProp` family, this
413 |     learning method effectively maintains a sort of parameter-specific learning
414 |     rate for each parameter in the loss.
415 | 
416 |     In this implementation, :math:`\epsilon` regularizes the RMS values; it is
417 |     specified using the ``rms_regularizer`` parameter.
418 | 
419 |     The weight parameter :math:`\gamma` for the EWMA is computed from the
420 |     ``rms_halflife`` keyword argument, such that the actual EWMA weight varies
421 |     inversely with the halflife :math:`h`: :math:`\gamma = e^{\frac{-\ln
422 |     2}{h}}`.
423 | 
424 |     The primary difference between this implementation and the algorithm
425 |     described in the paper (see below) is the use of an EWMA to decay the
426 |     diagonal values over time, while in the paper the diagonal is divided by the
427 |     training iteration. The EWMA halflife should be set to something reasonably
428 |     large to ensure that this method emulates the method described in the
429 |     original paper.
430 | 
431 |     References
432 |     ----------
433 | 
434 |     .. [Daup14] Y. Dauphin, H. de Vries, J. Chung & Y. Bengio. (2014) "RMSProp
435 |        and equilibrated adaptive learning rates for non-convex optimization."
436 |        http://arxiv.org/abs/1502.04390
437 |     '''
438 | 
439 |     def __init__(self, *args, **kwargs):
440 |         self.rng = RandomStreams()
441 |         self.hv_method = kwargs.pop('hv_method', 'rop').lower()
442 |         assert self.hv_method in ('rop', 'lop', 'grad')
443 |         super(ESGD, self).__init__(*args, **kwargs)
444 | 
445 |     def _get_updates_for(self, param, grad):
446 |         D_tm1 = util.shared_like(param, 'D_ewma')
447 |         v = self.rng.normal(param.shape)
448 |         if self.hv_method == 'rop':
449 |             Hv = TT.Rop(grad, param, v)
450 |         elif self.hv_method == 'lop':
451 |             Hv = TT.Lop(grad, param, v)
452 |         elif self.hv_method == 'grad':
453 |             Hv = TT.grad(TT.sum(grad * v), param)
454 |         D_t = self.ewma * D_tm1 + (1 - self.ewma) * Hv * Hv
455 |         denom = TT.sqrt(D_t) + self.epsilon
456 |         yield D_tm1, D_t
457 |         yield param, grad * self.learning_rate / denom
458 | 
459 | 
460 | class Adam(RMSProp):
461 |     r'''Adam optimizer using unbiased gradient moment estimates.
462 | 
463 |     Parameters
464 |     ----------
465 |     learning_rate: float, optional (default 1e-4)
466 |         Step size to take during optimization.
467 |     beta1_decay: float, optional (default 1 - 1e-6)
468 |         Extend the :math:`\beta_1` halflife by this amount after every update.
469 |     beta1_halflife: float, optional (default 7)
470 |         Compute RMS gradient estimates using an exponentially weighted moving
471 |         average that decays with this halflife.
472 |     beta2_halflife: float, optional (default 69)
473 |         Compute squared-magnitude RMS gradient estimates using an exponentially
474 |         weighted moving average that decays with this halflife.
475 |     rms_regularizer: float, optional (default 1e-8)
476 |         Regularize RMS gradient values by this :math:`\epsilon`.
477 |     momentum: float, optional (default 0)
478 |         Momentum to apply to the updates, if any. Defaults to 0 (no momentum).
479 |         Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of
480 |         momentum.
481 |     nesterov: bool, optional (default False)
482 |         Set this to ``True`` to enable Nesterov-style momentum updates, whenever
483 |         ``momentum`` is nonzero.
484 | 
485 |     Notes
486 |     -----
487 | 
488 |     The Adam method uses the same general strategy as all first-order
489 |     stochastic gradient methods, in the sense that these methods make small
490 |     parameter adjustments iteratively using local derivative information.
491 | 
492 |     The difference here is that as gradients are computed during each parameter
493 |     update, exponentially-weighted moving averages (EWMAs) of (1) the first
494 |     moment of the recent gradient values and (2) the second moment of recent
495 |     gradient values are maintained as well. At each update, the step taken is
496 |     proportional to the ratio of the first moment to the second moment.
497 | 
498 |     .. math::
499 |         \begin{eqnarray*}
500 |         \beta_1^t &=& \beta_1 \lambda^{t} \\
501 |         f_{t+1} &=& \beta_1^t f_t + (1 - \beta_1^t)
502 |            \frac{\partial\mathcal{L}}{\partial\theta} \\
503 |         g_{t+1} &=& \beta_2 g_t + (1 - \beta_2)
504 |            \left(\frac{\partial\mathcal{L}}{\partial\theta}\right)^2 \\
505 |         \theta_{t+1} &=& \theta_t - \alpha
506 |            \frac{f_{t+1} / (1 - \beta_1^t)}{\sqrt{g_{t+1} / (1 - \beta_2^t)} + \epsilon}
507 |         \end{eqnarray*}
508 | 
509 |     Like all adaptive optimization algorithms, this optimizer effectively
510 |     maintains a sort of parameter-specific momentum value. It shares with
511 |     :class:`RMSProp` and :class:`ADADELTA` the idea of using an EWMA to track
512 |     recent quantities related to the stochastic gradient during optimization.
513 |     But the Adam method is unique in that it incorporates an explicit
514 |     computation to remove the bias from these estimates.
515 | 
516 |     In this implementation, :math:`\epsilon` regularizes the RMS values and is
517 |     given using the ``rms_regularizer`` keyword argument. The weight parameters
518 |     :math:`\beta_1` and :math:`\beta_2` for the first and second EWMA windows
519 |     are computed from the ``beta1_halflife`` and ``beta2_halflife`` keyword
520 |     arguments, respectively, such that the actual EWMA weight varies inversely
521 |     with the halflife :math:`h`: :math:`\gamma = e^{\frac{-\ln 2}{h}}`. The
522 |     decay :math:`\lambda` for the :math:`\beta_1` EWMA is provided by the
523 |     ``beta1_decay`` keyword argument.
524 | 
525 |     The implementation here is taken from Algorithm 1 of [King15]_.
526 | 
527 |     References
528 |     ----------
529 | 
530 |     .. [King15] D. Kingma & J. Ba. (ICLR 2015) "Adam: A Method for
531 |        Stochastic Optimization."
http://arxiv.org/abs/1412.6980 532 | ''' 533 | 534 | def _prepare(self, 535 | beta1_halflife=7, 536 | beta2_halflife=69, 537 | **kwargs): 538 | self.beta1 = util.as_float(np.exp(-np.log(2) / beta1_halflife)) 539 | self.beta2 = util.as_float(np.exp(-np.log(2) / beta2_halflife)) 540 | super(Adam, self)._prepare(**kwargs) 541 | 542 | def _get_updates_for(self, param, grad): 543 | t_tm1 = theano.shared(np.cast['float32'](0), 't') 544 | t_t = 1 + t_tm1 545 | g1_tm1 = util.shared_like(param, 'g1_ewma') 546 | g2_tm1 = util.shared_like(param, 'g2_ewma') 547 | g1_t = self.beta1 * g1_tm1 + (1 - self.beta1) * grad 548 | g2_t = self.beta2 * g2_tm1 + (1 - self.beta2) * grad * grad 549 | numer = g1_t / (1 - self.beta1 ** t_t) 550 | denom = TT.sqrt(g2_t / (1 - self.beta2 ** t_t)) 551 | yield t_tm1, t_t 552 | yield g1_tm1, g1_t 553 | yield g2_tm1, g2_t 554 | yield param, self.learning_rate * numer / (denom + self.epsilon) 555 | -------------------------------------------------------------------------------- /downhill/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | '''This module defines a base class for optimization techniques.''' 4 | 5 | import click 6 | import collections 7 | import numpy as np 8 | import theano 9 | import theano.tensor as TT 10 | import warnings 11 | 12 | from . import util 13 | 14 | 15 | def build(algo, loss, params=None, inputs=None, updates=(), monitors=(), 16 | monitor_gradients=False): 17 | '''Construct an optimizer by name. 18 | 19 | Parameters 20 | ---------- 21 | algo : str 22 | The name of the optimization algorithm to build. 23 | loss : Theano expression 24 | Loss function to minimize. This must be a scalar-valued expression. 25 | params : list of Theano variables, optional 26 | Symbolic variables to adjust to minimize the loss. If not given, these 27 | will be computed automatically by walking the computation graph. 28 | inputs : list of Theano variables, optional 29 | Symbolic variables required to compute the loss. If not given, these 30 | will be computed automatically by walking the computation graph. 31 | updates : list of update pairs, optional 32 | A list of pairs providing updates for the internal of the loss 33 | computation. Normally this is empty, but it can be provided if the loss, 34 | for example, requires an update to an internal random number generator. 35 | monitors : dict or sequence of (str, Theano expression) tuples, optional 36 | Additional values to monitor during optimization. These must be provided 37 | as either a sequence of (name, expression) tuples, or as a dictionary 38 | mapping string names to Theano expressions. 39 | monitor_gradients : bool, optional 40 | If True, add monitors to log the norms of the parameter gradients during 41 | optimization. Defaults to False. 42 | 43 | Returns 44 | ------- 45 | optimizer : :class:`Optimizer` 46 | An optimizer instance. 47 | ''' 48 | return Optimizer.build(algo, loss, params, inputs, 49 | updates=updates, monitors=monitors, 50 | monitor_gradients=monitor_gradients) 51 | 52 | 53 | class Optimizer(util.Registrar(str('Base'), (), {})): 54 | '''An optimizer computes gradient updates to iteratively optimize a loss. 55 | 56 | Attributes 57 | ---------- 58 | patience : int, optional 59 | Number of validation "failures" that we are willing to tolerate before 60 | stopping the optimization process. 
A validation failure happens whenever
61 |         the loss on the validation dataset decreases by less than
62 |         ``min_improvement`` (relative) over the previous best validation loss.
63 |         Defaults to 5.
64 |     validate_every : int, optional
65 |         Evaluate the loss on the validation dataset after making this many
66 |         passes over the training data. Defaults to 10.
67 |     min_improvement : float, optional
68 |         Insist that the validation loss must improve by this relative amount
69 |         before considering that the optimization has made progress. The
70 |         optimization process halts when ``patience`` validations have failed to
71 |         make this relative improvement. Defaults to 0; set to a larger value
72 |         (e.g., 0.01 for 1% improvement) to halt the optimization process sooner.
73 |     max_gradient_norm : float, optional
74 |         Rescale each parameter's gradient so that it has at most this L2 norm.
75 |         Set to 0 (the default) to disable norm rescaling. If
76 |         ``max_gradient_elem`` is also specified, then this has no effect.
77 |     max_gradient_elem : float, optional
78 |         Perform elementwise clipping on the magnitude of gradient values. Set to
79 |         0 (the default) to disable. If elementwise clipping is enabled, norm
80 |         rescaling (via ``max_gradient_norm``) will have no effect. Deprecated
81 |         synonyms of this parameter are "max_gradient_clip" and "gradient_clip".
82 |     learning_rate : float, optional
83 |         Many SGD-based optimization algorithms require a learning rate
84 |         hyperparameter that scales the gradient step. Defaults to 1e-4.
85 |     momentum : float, optional
86 |         Apply momentum to the parameter updates for this optimizer, with the
87 |         given strength. Typically this value ranges from 0 (no momentum) to
88 |         :math:`1 - \epsilon` (large momentum). Defaults to 0.
89 |     nesterov : bool, optional
90 |         If True, and ``momentum`` is nonzero, apply Nesterov-style momentum to
91 |         parameter updates for this optimizer. If False, and ``momentum`` is
92 |         nonzero, "regular" momentum is applied. Has no effect if ``momentum`` is
93 |         zero. See :class:`NAG <downhill.first_order.NAG>` for a description of
94 |         Nesterov momentum.
95 | 
96 |     Parameters
97 |     ----------
98 |     loss : Theano expression
99 |         Loss function to minimize. This must be a scalar-valued expression.
100 |     params : list of Theano variables, optional
101 |         Symbolic variables to adjust to minimize the loss. If not given, these
102 |         will be computed automatically by walking the computation graph.
103 |     inputs : list of Theano variables, optional
104 |         Symbolic variables required to compute the loss. If not given, these
105 |         will be computed automatically by walking the computation graph.
106 |     updates : list of update pairs, optional
107 |         A list of pairs providing updates for the internals of the loss
108 |         computation. Normally this is empty, but it can be provided if the loss,
109 |         for example, requires an update to an internal random number generator.
110 |     monitors : sequence of (str, Theano expression) tuples, optional
111 |         Additional values to monitor during optimization. These must be provided
112 |         as a sequence of (name, expression) tuples.
113 |     monitor_gradients : bool, optional
114 |         If True, add monitors to log the norms of the parameter gradients during
115 |         optimization. Defaults to False.
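
    Examples
    --------
    A minimal usage sketch (illustrative only), assuming a scalar Theano
    expression ``loss`` with a single symbolic input and a matching NumPy
    array ``data``::

        opt = downhill.build('rmsprop', loss=loss)
        for train_m, valid_m in opt.iterate([[data]], learning_rate=1e-4):
            if train_m['loss'] < 0.1:
                break

    Keyword arguments passed to :func:`iterate` set the attributes listed
    above.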
116 | ''' 117 | 118 | def __init__(self, loss, params=None, inputs=None, updates=(), monitors=(), 119 | monitor_gradients=False): 120 | inputs_, params_ = util.find_inputs_and_params(loss) 121 | 122 | self._loss = loss 123 | self._params = params or params_ 124 | self._inputs = inputs or inputs_ 125 | self._updates = updates 126 | 127 | self._shapes = [p.get_value(borrow=True).shape for p in self._params] 128 | self._counts = [np.prod(s) for s in self._shapes] 129 | self._starts = np.cumsum([0] + self._counts)[:-1] 130 | self._dtype = self._params[0].get_value().dtype 131 | 132 | self._curr_iter = 0 133 | self._best_iter = 0 134 | self._best_loss = 1e100 135 | self._best_params = [p.get_value().copy() for p in self._params] 136 | 137 | self._monitor_exprs = [self._loss] 138 | self._monitor_names = ['loss'] 139 | for name, monitor in monitors: 140 | self._monitor_names.append(name) 141 | self._monitor_exprs.append(monitor) 142 | if monitor_gradients: 143 | unnamed = 0 144 | for p, g in zip(self._params, TT.grad(self._loss, self._params)): 145 | name = p.name 146 | if not name: 147 | name = 'unnamed{}'.format(unnamed) 148 | unnamed += 1 149 | util.log('"{}" unnamed, will be "{}" internally'.format(p, name)) 150 | self._monitor_names.append('grad({})'.format(name)) 151 | self._monitor_exprs.append((g * g).sum()) 152 | 153 | def _compile(self, **kwargs): 154 | '''Compile the Theano functions for evaluating and updating our model. 155 | ''' 156 | util.log('compiling evaluation function') 157 | self.f_eval = theano.function(self._inputs, 158 | self._monitor_exprs, 159 | updates=self._updates, 160 | name='evaluation') 161 | label = self.__class__.__name__ 162 | util.log('compiling {} optimizer'.format(click.style(label, fg='red'))) 163 | updates = list(self._updates) + list(self.get_updates(**kwargs)) 164 | self.f_step = theano.function(self._inputs, 165 | self._monitor_exprs, 166 | updates=updates, 167 | name=label) 168 | 169 | def get_updates(self, **kwargs): 170 | '''Get parameter update expressions for performing optimization. 171 | 172 | Keyword arguments can be applied here to set any of the global 173 | optimizer attributes. 174 | 175 | Yields 176 | ------ 177 | updates : (parameter, expression) tuples 178 | A sequence of parameter updates to be applied during optimization. 179 | ''' 180 | self._prepare(**kwargs) 181 | for param, grad in self._differentiate(): 182 | for var, update in self._get_updates_for(param, grad): 183 | # For auxiliary variables, updates are meant to replace the 184 | # existing variable value. 185 | if var != param: 186 | yield var, update 187 | continue 188 | # If momentum is disabled, just apply the parameter delta. 189 | if self.momentum == 0: 190 | yield var, param - update 191 | continue 192 | # Momentum is enabled, so we keep track of velocity here. 193 | vel_tm1 = util.shared_like(param, 'vel') 194 | vel_t = util.as_float(self.momentum) * vel_tm1 - update 195 | if self.nesterov: 196 | # see http://arxiv.org/pdf/1212.0901v2.pdf (eq 7) and 197 | # https://github.com/lisa-lab/pylearn2/pull/136#issuecomment-10381617 198 | mom_sqr = util.as_float(self.momentum ** 2) 199 | mom_inc = util.as_float(1 + self.momentum) 200 | vel_t = mom_sqr * vel_tm1 - mom_inc * update 201 | yield vel_tm1, vel_t 202 | yield param, param + vel_t 203 | 204 | def _get_updates_for(self, param, grad): 205 | '''Generate some update pairs for the given model parameter. 
206 | 207 | Yields 208 | ------ 209 | updates : (parameter, expression) tuples 210 | A sequence of parameter updates to be applied during optimization. 211 | ''' 212 | raise NotImplementedError 213 | 214 | def _differentiate(self, params=None): 215 | '''Return a sequence of gradients for our parameters. 216 | 217 | If this optimizer has been configured with a gradient norm limit, or 218 | with elementwise gradient clipping, this method applies the appropriate 219 | rescaling and clipping operations before returning the gradient. 220 | 221 | Parameters 222 | ---------- 223 | params : list of Theano variables, optional 224 | Return the gradient with respect to these parameters. Defaults to 225 | all parameters that the optimizer knows about. 226 | 227 | Yields 228 | ------ 229 | pairs : (param, grad) tuples 230 | Generates a sequence of tuples representing each of the parameters 231 | requested and the corresponding Theano gradient expressions. 232 | ''' 233 | if params is None: 234 | params = self._params 235 | for param, grad in zip(params, TT.grad(self._loss, params)): 236 | if self.max_gradient_elem > 0: 237 | limit = util.as_float(self.max_gradient_elem) 238 | yield param, TT.clip(grad, -limit, limit) 239 | elif self.max_gradient_norm > 0: 240 | norm = TT.sqrt((grad * grad).sum()) 241 | limit = util.as_float(self.max_gradient_norm) 242 | yield param, grad * TT.minimum(1, limit / norm) 243 | else: 244 | yield param, grad 245 | 246 | def set_params(self, targets=None): 247 | '''Set the values of the parameters to the given target values. 248 | 249 | Parameters 250 | ---------- 251 | targets : sequence of ndarray, optional 252 | Arrays for setting the parameters of our model. If this is not 253 | provided, the current best parameters for this optimizer will be 254 | used. 255 | ''' 256 | if not isinstance(targets, (list, tuple)): 257 | targets = self._best_params 258 | for param, target in zip(self._params, targets): 259 | param.set_value(target) 260 | 261 | def _log(self, monitors, iteration, label='', suffix=''): 262 | '''Log the state of the optimizer on the console. 263 | 264 | Parameters 265 | ---------- 266 | monitors : OrderedDict 267 | A dictionary of monitor names mapped to values. These names and 268 | values are what is being logged. 269 | iteration : int 270 | Optimization iteration that we are logging. 271 | label : str, optional 272 | A label for the name of the optimizer creating the log line. 273 | Defaults to the name of the current class. 274 | suffix : str, optional 275 | A suffix to add to the end of the log line, if any. 276 | ''' 277 | label = label or self.__class__.__name__ 278 | fields = (('{}={:.6f}').format(k, v) for k, v in monitors.items()) 279 | util.log('{} {} {}{}'.format(label, iteration, ' '.join(fields), suffix)) 280 | 281 | def evaluate(self, dataset): 282 | '''Evaluate the current model parameters on a dataset. 283 | 284 | Parameters 285 | ---------- 286 | dataset : :class:`Dataset ` 287 | A set of data to use for evaluating the model. 288 | 289 | Returns 290 | ------- 291 | monitors : OrderedDict 292 | A dictionary mapping monitor names to values. Monitors are 293 | quantities of interest during optimization---for example, loss 294 | function, accuracy, or whatever the optimization task requires. 
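
        For example, once the optimizer's functions have been compiled by a
        prior call to :func:`iterate`, typical use looks like this sketch
        (``opt`` and ``valid`` are hypothetical names)::

            monitors = opt.evaluate(valid)
            print(monitors['loss'])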
295 | ''' 296 | if dataset is None: 297 | values = [self.f_eval()] 298 | else: 299 | values = [self.f_eval(*x) for x in dataset] 300 | monitors = zip(self._monitor_names, np.mean(values, axis=0)) 301 | return collections.OrderedDict(monitors) 302 | 303 | def _test_patience(self, monitors): 304 | '''Test whether our patience with optimization has elapsed. 305 | 306 | Parameters 307 | ---------- 308 | monitors : dict 309 | A dictionary mapping monitor names to values. The 'loss' key from 310 | this dictionary will be used to evaluate optimization progress. 311 | 312 | Returns 313 | ------- 314 | elapsed : bool 315 | True iff our patience has elapsed and the model is no longer 316 | improving. 317 | ''' 318 | self._curr_iter += 1 319 | marker = '' 320 | loss = monitors['loss'] 321 | if self._best_loss - loss > self._best_loss * self.min_improvement: 322 | self._best_loss = loss 323 | self._best_iter = self._curr_iter 324 | self._best_params = [p.get_value().copy() for p in self._params] 325 | marker = ' *' 326 | self._log(monitors, self._curr_iter - 1, 'validation', marker) 327 | return self._curr_iter - self._best_iter > self.patience 328 | 329 | def _prepare(self, **kwargs): 330 | '''Set up properties for optimization. 331 | 332 | This method can be overridden by base classes to provide parameters that 333 | are specific to a particular optimization technique (e.g., setting up a 334 | learning rate value). 335 | ''' 336 | self.learning_rate = util.as_float(kwargs.pop('learning_rate', 1e-4)) 337 | self.momentum = kwargs.pop('momentum', 0) 338 | self.nesterov = kwargs.pop('nesterov', False) 339 | self.patience = kwargs.get('patience', 5) 340 | self.validate_every = kwargs.pop('validate_every', 10) 341 | self.min_improvement = kwargs.pop('min_improvement', 0) 342 | self.max_gradient_norm = kwargs.pop('max_gradient_norm', 0) 343 | self.max_gradient_elem = kwargs.pop('max_gradient_elem', 0) 344 | 345 | util.log_param('patience', self.patience) 346 | util.log_param('validate_every', self.validate_every) 347 | util.log_param('min_improvement', self.min_improvement) 348 | util.log_param('max_gradient_norm', self.max_gradient_norm) 349 | util.log_param('max_gradient_elem', self.max_gradient_elem) 350 | util.log_param('learning_rate', self.learning_rate) 351 | util.log_param('momentum', self.momentum) 352 | util.log_param('nesterov', self.nesterov) 353 | 354 | def iterate(self, train=None, valid=None, max_updates=None, **kwargs): 355 | r'''Optimize a loss iteratively using a training and validation dataset. 356 | 357 | This method yields a series of monitor values to the caller. After every 358 | optimization epoch, a pair of monitor dictionaries is generated: one 359 | evaluated on the training dataset during the epoch, and another 360 | evaluated on the validation dataset at the most recent validation epoch. 361 | 362 | The validation monitors might not be updated during every optimization 363 | iteration; in this case, the most recent validation monitors will be 364 | yielded along with the training monitors. 365 | 366 | Additional keyword arguments supplied here will set the global 367 | optimizer attributes. 368 | 369 | Parameters 370 | ---------- 371 | train : sequence or :class:`Dataset ` 372 | A set of training data for computing updates to model parameters. 373 | valid : sequence or :class:`Dataset ` 374 | A set of validation data for computing monitor values and 375 | determining when the loss has stopped improving. Defaults to the 376 | training data. 
377 | max_updates : int, optional 378 | If specified, halt optimization after this many gradient updates 379 | have been processed. If not provided, uses early stopping to decide 380 | when to halt. 381 | 382 | Yields 383 | ------ 384 | train_monitors : dict 385 | A dictionary mapping monitor names to values, evaluated on the 386 | training dataset. 387 | valid_monitors : dict 388 | A dictionary containing monitor values evaluated on the validation 389 | dataset. 390 | ''' 391 | self._compile(**kwargs) 392 | 393 | if valid is None: 394 | valid = train 395 | iteration = 0 396 | training = validation = None 397 | while max_updates is None or iteration < max_updates: 398 | if not iteration % self.validate_every: 399 | try: 400 | validation = self.evaluate(valid) 401 | except KeyboardInterrupt: 402 | util.log('interrupted!') 403 | break 404 | if self._test_patience(validation): 405 | util.log('patience elapsed!') 406 | break 407 | try: 408 | training = self._step(train) 409 | except KeyboardInterrupt: 410 | util.log('interrupted!') 411 | break 412 | iteration += 1 413 | self._log(training, iteration) 414 | yield training, validation 415 | self.set_params('best') 416 | 417 | def minimize(self, *args, **kwargs): 418 | '''Optimize our loss exhaustively. 419 | 420 | This method is a thin wrapper over the :func:`iterate` method. It simply 421 | exhausts the iterative optimization process and returns the final 422 | monitor values. 423 | 424 | Returns 425 | ------- 426 | train_monitors : dict 427 | A dictionary mapping monitor names to values, evaluated on the 428 | training dataset. 429 | valid_monitors : dict 430 | A dictionary containing monitor values evaluated on the validation 431 | dataset. 432 | ''' 433 | monitors = None 434 | for monitors in self.iterate(*args, **kwargs): 435 | pass 436 | return monitors 437 | 438 | def _step(self, dataset): 439 | '''Advance the state of the optimizer by one step. 440 | 441 | Parameters 442 | ---------- 443 | dataset : :class:`Dataset ` 444 | A dataset for optimizing the model. 445 | 446 | Returns 447 | ------- 448 | train_monitors : dict 449 | A dictionary mapping monitor names to values. 450 | ''' 451 | if dataset is None: 452 | values = [self.f_step()] 453 | else: 454 | values = [self.f_step(*x) for x in dataset] 455 | return collections.OrderedDict( 456 | zip(self._monitor_names, np.mean(values, axis=0))) 457 | -------------------------------------------------------------------------------- /downhill/dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | r'''This module contains a class for handling batched datasets. 4 | 5 | In many optimization tasks, parameters must be updated by optimizing them with 6 | respect to estimates of a loss function. The loss function for many problems is 7 | estimated using a set of data that we have measured. 8 | ''' 9 | 10 | import collections 11 | import numpy as np 12 | import theano 13 | 14 | from . import util 15 | 16 | 17 | class Dataset: 18 | '''This class handles batching and shuffling a dataset. 19 | 20 | In ``downhill``, losses are optimized using sets of data collected from the 21 | problem that generated the loss. 
22 | 23 | During optimization, data are grouped into "mini-batches"---that is, chunks 24 | that are larger than 1 sample and smaller than the entire set of samples; 25 | typically the size of a mini-batch is between 10 and 100, but the specific 26 | setting can be varied depending on your model, hardware, dataset, and so 27 | forth. These mini-batches must be presented to the optimization algorithm in 28 | pseudo-random order to match the underlying stochasticity assumptions of 29 | many optimization algorithms. This class handles the process of grouping 30 | data into mini-batches as well as iterating and shuffling these mini-batches 31 | dynamically as the dataset is consumed by the optimization algorithm. 32 | 33 | For many tasks, a dataset is obtained as a large block of sample data, which 34 | in Python is normally assembled as a ``numpy`` ndarray. To use this class on 35 | such a dataset, just pass in a list or tuple containing ``numpy`` arrays; 36 | the number of these arrays must match the number of inputs that your loss 37 | computation requires. 38 | 39 | There are some cases when a suitable set of training data would be 40 | prohibitively expensive to assemble in memory as a single ``numpy`` array. 41 | To handle these cases, this class can also handle a dataset that is provided 42 | via a Python callable. For more information on using callables to provide 43 | data to your model, see :ref:`data-using-callables`. 44 | 45 | Parameters 46 | ---------- 47 | inputs : callable or list of ndarray/sparse matrix/DataFrame/theano shared var 48 | One or more sets of data. 49 | 50 | If this parameter is callable, then mini-batches will be obtained by 51 | calling the callable with no arguments; the callable is expected to 52 | return a tuple of ndarray-like objects that will be suitable for 53 | optimizing the loss at hand. 54 | 55 | If this parameter is a list (or a tuple), it must contain array-like 56 | objects: ``numpy.ndarray``, ``scipy.sparse.csc_matrix``, 57 | ``scipy.sparse.csr_matrix``, ``pandas.DataFrame`` or ``theano.shared``. 58 | These are assumed to contain data for computing the loss, so the length 59 | of this tuple or list should match the number of inputs required by the 60 | loss computation. If multiple arrays are provided, their lengths along 61 | the axis given by the ``axis`` parameter (defaults to 0) must match. 62 | 63 | name : str, optional 64 | A string that is used to describe this dataset. Usually something like 65 | 'test' or 'train'. 66 | 67 | batch_size : int, optional 68 | The size of the mini-batches to create from the data sequences. If this 69 | is negative or zero, all data in the dataset will be used in one batch. 70 | Defaults to 32. This parameter has no effect if ``inputs`` is callable. 71 | 72 | iteration_size : int, optional 73 | The number of batches to yield for each call to iterate(). Defaults to 74 | the length of the data divided by batch_size. If ``inputs`` is a 75 | callable, this defaults to ``len(inputs)`` when the callable defines a 76 | length, and to 100 otherwise. 77 | 78 | axis : int, optional 79 | The axis along which to split the data arrays, if the first parameter is 80 | given as one or more ndarrays. If not provided, defaults to 0. 81 | 82 | rng : :class:`numpy.random.RandomState` or int, optional 83 | A random number generator, or an integer seed for a random number 84 | generator. If not provided, the random number generator will be created 85 | with an automatically chosen seed.
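Examples
--------
A minimal sketch (the array shape and batch size here are illustrative
only): wrapping a single ``numpy`` array with ``batch_size=10`` yields
ten mini-batches of ten rows each on every pass over the data::

    import numpy as np
    import downhill

    ds = downhill.Dataset([np.random.randn(100, 5)], batch_size=10)
    for (batch,) in ds:
        print(batch.shape)  # prints (10, 5) ten times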
86 | ''' 87 | 88 | _count = 0 89 | 90 | def __init__(self, inputs, name=None, batch_size=32, iteration_size=None, 91 | axis=0, rng=None): 92 | self.name = name or 'dataset{}'.format(Dataset._count) 93 | Dataset._count += 1 94 | self.batch_size = batch_size 95 | self.iteration_size = iteration_size 96 | self.rng = rng 97 | if rng is None or isinstance(rng, int): 98 | self.rng = np.random.RandomState(rng) 99 | 100 | self._inputs = None 101 | self._slices = None 102 | self._callable = None 103 | 104 | if isinstance(inputs, collections.Callable): 105 | self._init_callable(inputs) 106 | else: 107 | self._init_arrays(inputs, axis) 108 | 109 | def _init_callable(self, inputs): 110 | self._callable = inputs 111 | if not self.iteration_size: 112 | try: 113 | self.iteration_size = len(inputs) 114 | except (TypeError, AttributeError): # has no len 115 | self.iteration_size = 100 116 | util.log('{0.name}: {0.iteration_size} mini-batches from callable', self) 117 | 118 | def _init_arrays(self, inputs, axis=0): 119 | if not isinstance(inputs, (tuple, list)): 120 | inputs = (inputs, ) 121 | 122 | shapes = [] 123 | self._inputs = [] 124 | for i, x in enumerate(inputs): 125 | self._inputs.append(x) 126 | if isinstance(x, np.ndarray): 127 | shapes.append(x.shape) 128 | continue 129 | if isinstance(x, theano.compile.SharedVariable): 130 | shapes.append(x.get_value(borrow=True).shape) 131 | continue 132 | if 'pandas.' in str(type(x)): # hacky but prevents a global import 133 | import pandas as pd 134 | if isinstance(x, (pd.Series, pd.DataFrame)): 135 | shapes.append(x.shape) 136 | continue 137 | if 'scipy.sparse.' in str(type(x)): # same here 138 | import scipy.sparse as ss 139 | if isinstance(x, (ss.csr.csr_matrix, ss.csc.csc_matrix)): 140 | shapes.append(x.shape) 141 | continue 142 | raise ValueError( 143 | 'input {} (type {}) must be numpy.ndarray, theano.shared, ' 144 | 'a scipy.sparse matrix, or pandas.{{Series,DataFrame}}'.format(i, type(x))) 145 | 146 | L = shapes[0][axis] 147 | assert all(L == s[axis] for s in shapes), \ 148 | 'shapes do not match along axis {}: {}'.format( 149 | axis, '; '.join(str(s) for s in shapes)) 150 | 151 | B = L if self.batch_size <= 0 else self.batch_size 152 | 153 | self._index = 0 154 | self._slices = [] 155 | for i in range(0, L, B): 156 | where = [] 157 | for shape in shapes: 158 | slices = [slice(None) for _ in shape] 159 | slices[axis] = slice(i, min(L, i + B)) 160 | where.append(tuple(slices)) 161 | self._slices.append(where) 162 | 163 | self.shuffle() 164 | 165 | if not self.iteration_size: 166 | self.iteration_size = len(self._slices) 167 | 168 | util.log('{0.name}: {0.iteration_size} of {1} mini-batches from {2}', 169 | self, len(self._slices), '; '.join(str(s) for s in shapes)) 170 | 171 | def __iter__(self): 172 | return self.iterate(True) 173 | 174 | def shuffle(self): 175 | '''Shuffle the batches in the dataset. 176 | 177 | If this dataset was constructed using a callable, this method has no 178 | effect. 179 | ''' 180 | if self._slices is not None: 181 | self.rng.shuffle(self._slices) 182 | 183 | def iterate(self, shuffle=True): 184 | '''Iterate over batches in the dataset. 185 | 186 | This method generates ``iteration_size`` batches from the dataset and 187 | then returns. 188 | 189 | Parameters 190 | ---------- 191 | shuffle : bool, optional 192 | Shuffle the batches in this dataset if the iteration reaches the end 193 | of the batch list. Defaults to True.
194 | 195 | Yields 196 | ------ 197 | batches : data batches 198 | A sequence of batches---often from a training, validation, or test 199 | dataset. 200 | ''' 201 | for _ in range(self.iteration_size): 202 | if self._callable is not None: 203 | yield self._callable() 204 | else: 205 | yield self._next_batch(shuffle) 206 | 207 | def _next_batch(self, shuffle=True): 208 | batch = [x.iloc[i] if hasattr(x, 'iloc') else x[i] 209 | for x, i in zip(self._inputs, self._slices[self._index])] 210 | self._index += 1 211 | if self._index >= len(self._slices): 212 | if shuffle: 213 | self.shuffle() 214 | self._index = 0 215 | return batch 216 | -------------------------------------------------------------------------------- /downhill/first_order.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | '''This module defines first-order gradient descent optimizers.''' 4 | 5 | from .base import Optimizer 6 | 7 | __all__ = ['SGD', 'NAG'] 8 | 9 | 10 | class SGD(Optimizer): 11 | r'''Basic optimization using stochastic gradient descent. 12 | 13 | Parameters 14 | ---------- 15 | learning_rate: float, optional (default 1e-4) 16 | Step size to take during optimization. 17 | momentum: float, optional (default 0) 18 | Momentum to apply to the updates, if any. Defaults to 0 (no momentum). 19 | Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of 20 | momentum. 21 | nesterov: bool, optional (default False) 22 | Set this to ``True`` to enable Nesterov-style momentum updates, whenever 23 | ``momentum`` is nonzero. 24 | 25 | Notes 26 | ----- 27 | 28 | A stochastic gradient trainer with momentum :math:`\mu` and learning rate 29 | :math:`\alpha` updates parameter :math:`\theta` at step :math:`t` by 30 | blending the current "velocity" :math:`v` with the current gradient 31 | :math:`\frac{\partial\mathcal{L}}{\partial\theta}`: 32 | 33 | .. math:: 34 | \begin{eqnarray*} 35 | v_{t+1} &=& \mu v_t - \alpha \frac{\partial\mathcal{L}}{\partial\theta} \\ 36 | \theta_{t+1} &=& \theta_t + v_{t+1} 37 | \end{eqnarray*} 38 | 39 | Without momentum (i.e., when :math:`\mu = 0`), these updates reduce to 40 | :math:`\theta_{t+1} = \theta_t - \alpha \frac{\partial\mathcal{L}}{\partial\theta}`, 41 | which just takes steps downhill according to the local gradient. 42 | 43 | Adding the momentum term permits the algorithm to take previous steps into 44 | account as well, which in practice is thought to have the effect 45 | of capturing some information about second-order derivatives of the loss 46 | surface. 47 | 48 | References 49 | ---------- 50 | 51 | .. [Rume86] D. E. Rumelhart, G. E. Hinton, & R. J. Williams. (1986) 52 | "Learning representations by back-propagating errors". Nature 323 53 | (6088):533–536. doi:10.1038/323533a0 54 | http://www.nature.com/nature/journal/v323/n6088/abs/323533a0.html 55 | ''' 56 | 57 | def _get_updates_for(self, param, grad): 58 | yield param, self.learning_rate * grad 59 | 60 | 61 | class NAG(SGD): 62 | r'''Stochastic gradient optimization with Nesterov momentum. 63 | 64 | This class name is an abbreviation for "Nesterov's Accelerated Gradient." 65 | Note that the ``momentum`` parameter must be given during optimization for 66 | Nesterov momentum to be employed; by default ``momentum`` is 0 and so no 67 | momentum is used. 68 | 69 | Parameters 70 | ---------- 71 | learning_rate: float, optional (default 1e-4) 72 | Step size to take during optimization.
73 | 74 | momentum: float, optional (default 0) 75 | Momentum to apply to the updates, if any. Defaults to 0 (no momentum). 76 | Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of 77 | momentum. 78 | 79 | Notes 80 | ----- 81 | 82 | The basic difference between NAG and "classical" momentum in SGD 83 | optimization approaches is that NAG computes the gradients at the position 84 | in parameter space where "classical" momentum would put us at the *next* 85 | step. In classical :class:`SGD` with momentum :math:`\mu` and learning rate 86 | :math:`\alpha`, updates to parameter :math:`p` at step :math:`t` are 87 | computed by blending the current "velocity" :math:`v` with the current 88 | gradient :math:`\frac{\partial\mathcal{L}}{\partial p}`: 89 | 90 | .. math:: 91 | \begin{eqnarray*} 92 | v_{t+1} &=& \mu v_t - \alpha \frac{\partial\mathcal{L}}{\partial p} \\ 93 | p_{t+1} &=& p_t + v_{t+1} 94 | \end{eqnarray*} 95 | 96 | In contrast, NAG adjusts the update by blending the current "velocity" with 97 | the gradient at the next step---that is, the gradient is computed at the 98 | point where the velocity would have taken us: 99 | 100 | .. math:: 101 | \begin{eqnarray*} 102 | v_{t+1} &=& \mu v_t - \alpha \left. 103 | \frac{\partial\mathcal{L}}{\partial p}\right|_{p_t + \mu v_t} \\ 104 | p_{t+1} &=& p_t + v_{t+1} 105 | \end{eqnarray*} 106 | 107 | Again, the difference here is that the gradient is computed at the place in 108 | parameter space where we would have stepped using the classical technique, 109 | in the absence of a new gradient. 110 | 111 | In theory, this helps correct for oversteps during learning: If momentum 112 | would lead us to overshoot, then the gradient at that overshot place will 113 | point backwards, toward where we came from. See [Suts13]_ for a particularly 114 | clear exposition of this idea. 115 | 116 | References 117 | ---------- 118 | .. [Suts13] I. Sutskever, J. Martens, G. Dahl, & G. Hinton. (ICML 2013) "On 119 | the importance of initialization and momentum in deep learning." 120 | http://www.cs.toronto.edu/~fritz/absps/momentum.pdf 121 | 122 | .. [Nest83] Y. Nesterov. (1983) "A method of solving a convex programming 123 | problem with convergence rate O(1/sqr(k))." Soviet Mathematics Doklady, 124 | 27:372–376. 125 | ''' 126 | 127 | def iterate(self, *args, **kwargs): 128 | kwargs['nesterov'] = True 129 | return super(NAG, self).iterate(*args, **kwargs) 130 | -------------------------------------------------------------------------------- /downhill/util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | '''A module of utility functions and other goodies.''' 4 | 5 | import click 6 | import datetime 7 | import inspect 8 | import numpy as np 9 | import theano 10 | import theano.tensor as TT 11 | 12 | 13 | class Registrar(type): 14 | '''A metaclass that builds a registry of its subclasses.''' 15 | 16 | def __init__(cls, name, bases, dct): 17 | if not hasattr(cls, '_registry'): 18 | cls._registry = {} 19 | else: 20 | cls._registry[name.lower()] = cls 21 | super(Registrar, cls).__init__(name, bases, dct) 22 | 23 | def build(cls, key, *args, **kwargs): 24 | return cls._registry[key.lower()](*args, **kwargs) 25 | 26 | def is_registered(cls, key): 27 | return key.lower() in cls._registry 28 | 29 | 30 | def shared_like(param, suffix, init=0): 31 | '''Create a Theano shared variable like an existing parameter. 
32 | 33 | Parameters 34 | ---------- 35 | param : Theano variable 36 | Theano variable to use for shape information. 37 | suffix : str 38 | Suffix to append to the parameter's name for the new variable. 39 | init : float or ndarray, optional 40 | Initial value of the shared variable. Defaults to 0. 41 | 42 | Returns 43 | ------- 44 | shared : Theano shared variable 45 | A new shared variable with the same shape and data type as ``param``. 46 | ''' 47 | return theano.shared(np.zeros_like(param.get_value()) + init, 48 | name='{}_{}'.format(param.name, suffix), 49 | broadcastable=param.broadcastable) 50 | 51 | 52 | def as_float(x): 53 | '''Cast a floating point value to a Theano ``floatX`` symbol. 54 | 55 | Parameters 56 | ---------- 57 | x : float, ndarray, or Theano expression 58 | Some quantity to cast to floating point. 59 | 60 | Returns 61 | ------- 62 | x : Theano expression 63 | A symbolic variable cast as a ``floatX`` value. 64 | ''' 65 | return TT.cast(x, theano.config.floatX) 66 | 67 | 68 | def find_inputs_and_params(node): 69 | '''Walk a computation graph and extract root variables. 70 | 71 | Parameters 72 | ---------- 73 | node : Theano expression 74 | A symbolic Theano expression to walk. 75 | 76 | Returns 77 | ------- 78 | inputs : list of Theano variables 79 | A list of candidate inputs for this graph. Inputs are nodes in the 80 | graph that have no parents and are neither shared variables nor constants. 81 | params : list of Theano shared variables 82 | A list of candidate parameters for this graph. Parameters are nodes in 83 | the graph that are shared variables. 84 | ''' 85 | queue, seen, inputs, params = [node], set(), set(), set() 86 | while queue: 87 | node = queue.pop() 88 | seen.add(node) 89 | queue.extend(p for p in node.get_parents() if p not in seen) 90 | if not node.get_parents(): 91 | if isinstance(node, theano.compile.SharedVariable): 92 | params.add(node) 93 | elif not isinstance(node, TT.Constant): 94 | inputs.add(node) 95 | return list(inputs), list(params) 96 | 97 | 98 | _detailed_callsite = False 99 | 100 | 101 | def enable_detailed_callsite_logging(): 102 | '''Enable detailed callsite logging.''' 103 | global _detailed_callsite 104 | _detailed_callsite = True 105 | 106 | 107 | def log(msg, *args, **kwargs): 108 | '''Log a message to the console. 109 | 110 | Parameters 111 | ---------- 112 | msg : str 113 | A string to display on the console. This can contain {}-style 114 | formatting commands; the remaining positional and keyword arguments 115 | will be used to fill them in. 116 | ''' 117 | now = datetime.datetime.now() 118 | module = 'downhill' 119 | if _detailed_callsite: 120 | caller = inspect.stack()[1] 121 | parts = caller.filename.replace('.py', '').split('/') 122 | module = '{}:{}'.format( 123 | '.'.join(parts[parts.index('downhill')+1:]), caller.lineno) 124 | click.echo(' '.join(( 125 | click.style(now.strftime('%Y%m%d'), fg='blue'), 126 | click.style(now.strftime('%H%M%S'), fg='cyan'), 127 | click.style(module, fg='magenta'), 128 | msg.format(*args, **kwargs), 129 | ))) 130 | 131 | 132 | def log_param(name, value): 133 | '''Log a parameter value to the console. 134 | 135 | Parameters 136 | ---------- 137 | name : str 138 | Name of the parameter being logged. 139 | value : any 140 | Value of the parameter being logged.
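For example, ``log_param('patience', 5)`` emits a console line like
``setting patience = 5``, colorized via ``click``.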
141 | ''' 142 | log('setting {} = {}', click.style(str(name)), 143 | click.style(str(value), fg='yellow')) 144 | -------------------------------------------------------------------------------- /examples/mnist-sparse-factorization.py: -------------------------------------------------------------------------------- 1 | import downhill 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import skdata.mnist 5 | import theano 6 | import theano.tensor as TT 7 | 8 | FLOAT = 'df'[theano.config.floatX == 'float32'] 9 | 10 | 11 | def load_mnist(): 12 | '''Load the MNIST digits dataset.''' 13 | mnist = skdata.mnist.dataset.MNIST() 14 | mnist.meta # trigger download if needed. 15 | 16 | def arr(n, dtype): 17 | arr = mnist.arrays[n] 18 | return arr.reshape((len(arr), -1)).astype(dtype) 19 | train_images = arr('train_images', np.float32) / 128 - 1 20 | train_labels = arr('train_labels', np.uint8) 21 | return ((train_images[:50000], train_labels[:50000, 0]), 22 | (train_images[50000:], train_labels[50000:, 0])) 23 | 24 | 25 | def plot_images(imgs, loc=111, title=None, channels=1): 26 | '''Plot an array of images. 27 | 28 | We assume that we are given a matrix of data whose shape is (n*n, s*s*c) -- 29 | that is, there are n^2 images along the first axis of the array, and each 30 | image is c squares measuring s pixels on a side. Each row of the input will 31 | be plotted as a sub-region within a single image array containing an n x n 32 | grid of images. 33 | ''' 34 | n = int(np.sqrt(len(imgs))) 35 | assert n * n == len(imgs), 'images array must contain a square number of rows!' 36 | s = int(np.sqrt(len(imgs[0]) / channels)) 37 | assert s * s == len(imgs[0]) / channels, 'images must be square!' 38 | 39 | img = np.zeros((s * n, s * n, channels), dtype=imgs[0].dtype) 40 | for i, pix in enumerate(imgs): 41 | r, c = divmod(i, n) 42 | img[r * s:(r+1) * s, c * s:(c+1) * s] = pix.reshape((s, s, channels)) 43 | 44 | img -= img.min() 45 | img /= img.max() 46 | 47 | ax = plt.gcf().add_subplot(loc) 48 | ax.xaxis.set_visible(False) 49 | ax.yaxis.set_visible(False) 50 | ax.set_frame_on(False) 51 | ax.imshow(img.squeeze(), cmap=plt.cm.gray) 52 | if title: 53 | ax.set_title(title) 54 | 55 | 56 | (t_images, t_labels), (v_images, v_labels) = load_mnist() 57 | 58 | # construct training/validation sets consisting of the fours. 
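# (a single digit class keeps this factorization demo small and makes the
# learned dictionary easier to inspect visually.)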
59 | train = t_images[t_labels == 4] 60 | valid = v_images[v_labels == 4] 61 | 62 | N = 20 63 | K = 20 64 | B = 784 65 | 66 | x = TT.matrix('x') 67 | 68 | u = theano.shared(np.random.randn(N * N, K * K).astype(FLOAT), name='u') 69 | v = theano.shared(np.random.randn(K * K, B).astype(FLOAT), name='v') 70 | 71 | err = TT.sqr(x - TT.dot(u, v)).mean() 72 | 73 | downhill.minimize( 74 | loss=err + 100 * (0.01 * abs(u).mean() + (v * v).mean()), 75 | params=[u, v], 76 | inputs=[x], 77 | train=train, 78 | valid=valid, 79 | batch_size=N * N, 80 | monitor_gradients=True, 81 | monitors=[ 82 | ('err', err), 83 | ('u<-0.5', (u < -0.5).mean()), 84 | ('u<-0.1', (u < -0.1).mean()), 85 | ('u<0.1', (u < 0.1).mean()), 86 | ('u<0.5', (u < 0.5).mean()), 87 | ], 88 | algo='sgd', 89 | max_gradient_elem=1, 90 | learning_rate=0.5, 91 | momentum=0.9, 92 | patience=3, 93 | min_improvement=0.1, 94 | ) 95 | 96 | plot_images(v.get_value(), 121) 97 | plot_images(np.dot(u.get_value(), v.get_value()), 122) 98 | plt.show() 99 | -------------------------------------------------------------------------------- /examples/rosenbrock-100d.py: -------------------------------------------------------------------------------- 1 | '''Optimization example using the 100-dimensional Rosenbrock "banana" function. 2 | 3 | This example trains up several optimization algorithms with randomly chosen 4 | hyperparameters and shows four scatter plots of the performance spectrum of each 5 | hyperparameter. 6 | 7 | This example is meant to show how optimization hyperparameters affect 8 | performance across different optimization algorithms. 9 | 10 | Due to the large number of optimizers that are evaluated in this example, it can 11 | take a good while to run. 12 | ''' 13 | 14 | import itertools 15 | import matplotlib.pyplot as plt 16 | import numpy as np 17 | 18 | import rosenbrock 19 | 20 | 21 | algos = 'NAG RMSProp Adam ADADELTA ESGD'.split() 22 | results = rosenbrock.test(algos, n=10, init=[-1] * 100, limit=1000) 23 | 24 | 25 | # Here we plot the marginal performance of each of the four 26 | # hyperparameters. These plots are intended to show how random 27 | # hyperparameter selection still gives a decent picture of how the different 28 | # algorithms perform.
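# each panel below scatters the loss reached by each run against one of the
# four hyperparameters (learning rate, momentum, RMS halflife, and RMS
# regularizer), with one color per algorithm.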
29 | 30 | _, ((rate_ax, mu_ax), (half_ax, reg_ax)) = plt.subplots(2, 2) 31 | 32 | by_algo = itertools.groupby(sorted(results), lambda item: item[0][0]) 33 | for color, (algo, items) in zip(rosenbrock.COLORS, by_algo): 34 | items = list(items) 35 | values = np.zeros((len(items), 5), 'f') 36 | for i, ((_, rate, mu, half, reg), (_, _, loss)) in enumerate(items): 37 | values[i] = [rate, mu, half, reg, loss] 38 | rates, mus, halfs, regs, losses = values.T 39 | kw = dict(alpha=0.8, markersize=5, mew=2, mfc='none', mec=color) 40 | rate_ax.plot(rates, losses, 'o', label=algo, **kw) 41 | mu_ax.plot(mus, losses, 'o', label=algo, **kw) 42 | half_ax.plot(halfs, losses, 'o', label=algo, **kw) 43 | reg_ax.plot(regs, losses, 'o', label=algo, **kw) 44 | 45 | for ax in [rate_ax, mu_ax, half_ax, reg_ax]: 46 | ax.set_yscale('log') 47 | ax.set_ylim(None, 4e4) 48 | ax.xaxis.tick_bottom() 49 | ax.yaxis.tick_left() 50 | ax.spines['top'].set_color('none') 51 | ax.spines['right'].set_color('none') 52 | ax.spines['bottom'].set_position(('outward', 3)) 53 | ax.spines['left'].set_position(('outward', 3)) 54 | if ax != mu_ax: 55 | ax.set_xscale('log') 56 | 57 | rate_ax.set_ylabel('Loss') 58 | rate_ax.set_xlabel('Rate') 59 | 60 | mu_ax.set_xlabel('Momentum') 61 | mu_ax.set_xlim(-0.05, 1.05) 62 | 63 | half_ax.set_ylabel('Loss') 64 | half_ax.set_xlabel('RMS Halflife') 65 | 66 | reg_ax.set_xlabel('RMS Regularizer') 67 | 68 | plt.legend() 69 | plt.show() 70 | -------------------------------------------------------------------------------- /examples/rosenbrock-2d.py: -------------------------------------------------------------------------------- 1 | '''Optimization example using the two-dimensional Rosenbrock "banana" function. 2 | 3 | This example trains up several optimization algorithms and displays the 4 | performance of each algorithm across several different (randomly-chosen) 5 | hyperparameter settings. 6 | 7 | This example is meant to show how different optimization algorithms perform when 8 | given the same optimization problem. Many of the algorithms' performances are 9 | strongly dependent on the values of various hyperparameters, such as the 10 | learning rate and momentum values. 11 | ''' 12 | 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | 16 | import rosenbrock 17 | 18 | 19 | def by_loss(item): 20 | '''Helper for sorting optimization runs by their final loss value.''' 21 | label, (xs, ys, loss) = item 22 | return loss 23 | 24 | 25 | def make_label(loss, key): 26 | '''Create a legend label for an optimization run.''' 27 | algo, rate, mu, half, reg = key 28 | slots, args = ['{:.3f}', '{}', 'm={:.3f}'], [loss, algo, mu] 29 | if algo in 'SGD NAG RMSProp Adam ESGD'.split(): 30 | slots.append('lr={:.2e}') 31 | args.append(rate) 32 | if algo in 'RMSProp ADADELTA ESGD'.split(): 33 | slots.append('rmsh={}') 34 | args.append(half) 35 | slots.append('rmsr={:.2e}') 36 | args.append(reg) 37 | return ' '.join(slots).format(*args) 38 | 39 | 40 | # Here we run a number of rosenbrock optimization algorithms and measure their 41 | # performance. Below we plot the results. 
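# runs are sorted by final loss before plotting, so the legend below lists
# the best-performing hyperparameter settings first.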
42 | 43 | algos = 'SGD NAG RMSProp RProp Adam ADADELTA ESGD'.split() 44 | results = ((make_label(loss, key), xs, ys) 45 | for key, (xs, ys, loss) 46 | in sorted(rosenbrock.test(algos), key=by_loss)) 47 | 48 | _, ax = plt.subplots(1, 1) 49 | 50 | for color, (label, xs, ys) in zip(rosenbrock.COLORS, results): 51 | ax.plot(xs, ys, 'o-', color=color, label=label, 52 | alpha=0.8, lw=2, markersize=5, 53 | mew=1, mec=color, mfc='none') 54 | 55 | # make a contour plot of the rosenbrock function surface. 56 | X, Y = np.meshgrid(np.linspace(-1.3, 1.3, 31), np.linspace(-0.9, 1.7, 31)) 57 | Z = 100 * (Y - X ** 2) ** 2 + (1 - X) ** 2 58 | ax.plot([1], [1], 'x', mew=3, markersize=10, color='#111111') 59 | ax.contourf(X, Y, Z, np.logspace(-1, 3, 31), cmap='gray_r') 60 | 61 | ax.set_xlim(-1.3, 1.3) 62 | ax.set_ylim(-0.9, 1.7) 63 | 64 | plt.legend(loc='lower right') 65 | plt.show() 66 | -------------------------------------------------------------------------------- /examples/rosenbrock.py: -------------------------------------------------------------------------------- 1 | '''Helper functions for rosenbrock optimization examples.''' 2 | 3 | import downhill 4 | import numpy as np 5 | import theano 6 | 7 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 8 | 9 | COLORS = ('#d62728 #1f77b4 #2ca02c #9467bd #ff7f0e ' 10 | '#e377c2 #8c564b #bcbd22 #7f7f7f #17becf').split() 11 | 12 | FLOAT = 'df'[theano.config.floatX == 'float32'] 13 | 14 | 15 | def build(algo, init): 16 | '''Build and return an optimizer for the rosenbrock function. 17 | 18 | In downhill, an optimizer can be constructed using the build() top-level 19 | function. This function requires several Theano quantities such as the loss 20 | being optimized and the parameters to update during optimization. 21 | ''' 22 | x = theano.shared(np.array(init, FLOAT), name='x') 23 | n = 0.1 * RandomStreams().normal((len(init) - 1, )) 24 | monitors = [] 25 | if len(init) == 2: 26 | # this gives us access to the x and y locations during optimization. 27 | monitors.extend([('x', x[:-1].sum()), ('y', x[1:].sum())]) 28 | return downhill.build( 29 | algo, 30 | loss=(n + 100 * (x[1:] - x[:-1] ** 2) ** 2 + (1 - x[:-1]) ** 2).sum(), 31 | params=[x], 32 | monitors=monitors, 33 | monitor_gradients=True) 34 | 35 | 36 | def build_and_trace(algo, init, limit=100, **kwargs): 37 | '''Run an optimizer on the rosenbrock function. Return xs, ys, and losses. 38 | 39 | In downhill, optimization algorithms can be iterated over to progressively 40 | minimize the loss. At each iteration, the optimizer yields a dictionary of 41 | monitor values that were computed during that iteration. Here we build an 42 | optimizer and then run it for a fixed number of iterations. 43 | ''' 44 | kw = dict(min_improvement=0, patience=0, max_gradient_norm=100) 45 | kw.update(kwargs) 46 | xs, ys, loss = [], [], [] 47 | for tm, _ in build(algo, init).iterate([[]], **kw): 48 | if len(init) == 2: 49 | xs.append(tm['x']) 50 | ys.append(tm['y']) 51 | loss.append(tm['loss']) 52 | if len(loss) == limit: 53 | break 54 | # Return the optimization up to any failure of patience. 55 | return xs[:-9], ys[:-9], loss[-9] 56 | 57 | 58 | def test(algos, n=10, init=[-1.1, 0], limit=100): 59 | '''Run several optimizers for comparison. 60 | 61 | Each optimizer is run a fixed number of times with random hyperparameter 62 | values, and the results are yielded back to the caller (often stored in a 63 | dictionary). 
64 | 65 | Returns 66 | ------- 67 | results : sequence of (key, value) pairs 68 | A sequence of results from running tests. Each result contains a "key" 69 | that describes the test run and a "value" that contains the results from 70 | the run. The key is a tuple containing (a) the algorithm, (b) the 71 | learning rate, (c) the momentum, (d) the RMS halflife, and (e) the RMS 72 | regularizer. The value is a tuple containing (a) the x-values and (b) the 73 | y-values during the optimization, and (c) the loss value. (The x- and 74 | y-values are only non-empty for 2D experiments.) 75 | ''' 76 | for algo in algos: 77 | for _ in range(n): 78 | mu = max(0, np.random.uniform(0, 2) - 1) 79 | rate = np.exp(np.random.uniform(-8, -1)) 80 | half = int(np.exp(np.random.uniform(0, 4))) 81 | reg = np.exp(np.random.uniform(-12, 0)) 82 | yield (algo, rate, mu, half, reg), build_and_trace( 83 | algo, init, limit, momentum=mu, learning_rate=rate, 84 | rms_halflife=half, rms_regularizer=reg) 85 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [tool:pytest] 2 | #pep8ignore = E226,E302,E41 3 | pep8maxlinelength = 90 4 | 5 | [bdist_wheel] 6 | universal = 1 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import setuptools 3 | 4 | setuptools.setup( 5 | name='downhill', 6 | version='0.5.0pre', 7 | packages=setuptools.find_packages(), 8 | author='lmjohns3', 9 | author_email='downhill-users@googlegroups.com', 10 | description='Stochastic optimization routines for Theano', 11 | long_description=open(os.path.join( 12 | os.path.dirname(os.path.abspath(__file__)), 'README.rst')).read(), 13 | license='MIT', 14 | url='http://github.com/lmjohns3/downhill', 15 | keywords=('adadelta ' 16 | 'adam ' 17 | 'esgd ' 18 | 'gradient-descent ' 19 | 'nesterov ' 20 | 'optimization ' 21 | 'rmsprop ' 22 | 'sgd ' 23 | 'theano ' 24 | ), 25 | install_requires=['theano', 'click'], 26 | classifiers=[ 27 | 'Development Status :: 4 - Beta', 28 | 'Intended Audience :: Science/Research', 29 | 'License :: OSI Approved :: MIT License', 30 | 'Operating System :: OS Independent', 31 | 'Topic :: Scientific/Engineering', 32 | ], 33 | ) 34 | -------------------------------------------------------------------------------- /test/adaptive_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import util 4 | 5 | 6 | class TestESGD: 7 | def test_rosen(self): 8 | util.assert_progress(*util.build_rosen('esgd'), learning_rate=1e-6) 9 | 10 | def test_factor(self): 11 | util.assert_progress(*util.build_factor('esgd'), learning_rate=1e-6) 12 | 13 | def test_default_params(self): 14 | opt, data = util.build_rosen('esgd') 15 | assert opt.hv_method == 'rop' 16 | for _ in opt.iterate(data): 17 | assert np.allclose(opt.learning_rate.eval(), 1e-4) 18 | assert np.allclose(opt.ewma.eval(), np.exp(-np.log(2) / 14)) 19 | assert np.allclose(opt.epsilon.eval(), 1e-8) 20 | break 21 | 22 | def test_params(self): 23 | opt, data = util.build_rosen('esgd') 24 | opt.hv_method = 'lop' # TODO(leif): incorporate into downhill.build()?
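        # a halflife h passed as rms_halflife is stored as an ewma decay of
        # exp(-ln(2) / h); the assertions below check that conversion.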
25 | for _ in opt.iterate(data, 26 | learning_rate=0.3, 27 | rms_halflife=10, 28 | rms_regularizer=20): 29 | assert np.allclose(opt.learning_rate.eval(), 0.3) 30 | assert np.allclose(opt.ewma.eval(), np.exp(-np.log(2) / 10)) 31 | assert np.allclose(opt.epsilon.eval(), 20) 32 | break 33 | 34 | 35 | class TestRProp: 36 | def test_rosen(self): 37 | util.assert_progress(*util.build_rosen('rprop')) 38 | 39 | def test_factor(self): 40 | util.assert_progress(*util.build_factor('rprop')) 41 | 42 | def test_default_params(self): 43 | opt, data = util.build_rosen('rprop') 44 | for _ in opt.iterate(data): 45 | assert np.allclose(opt.learning_rate.eval(), 1e-4) 46 | assert np.allclose(opt.step_increase.eval(), 1.01) 47 | assert np.allclose(opt.step_decrease.eval(), 0.99) 48 | assert np.allclose(opt.min_step.eval(), 0) 49 | assert np.allclose(opt.max_step.eval(), 100) 50 | break 51 | 52 | def test_params(self): 53 | opt, data = util.build_rosen('rprop') 54 | for _ in opt.iterate(data, 55 | learning_rate=0.3, 56 | rprop_increase=22, 57 | rprop_decrease=101, 58 | rprop_min_step=50, 59 | rprop_max_step=-10): 60 | assert np.allclose(opt.learning_rate.eval(), 0.3) 61 | assert np.allclose(opt.step_increase.eval(), 22) 62 | assert np.allclose(opt.step_decrease.eval(), 101) 63 | assert np.allclose(opt.min_step.eval(), 50) 64 | assert np.allclose(opt.max_step.eval(), -10) 65 | break 66 | 67 | 68 | class TestADAGRAD: 69 | def test_rosen(self): 70 | util.assert_progress(*util.build_rosen('adagrad')) 71 | 72 | def test_factor(self): 73 | util.assert_progress(*util.build_factor('adagrad')) 74 | 75 | def test_default_params(self): 76 | opt, data = util.build_rosen('adagrad') 77 | for _ in opt.iterate(data): 78 | assert np.allclose(opt.learning_rate.eval(), 1e-4) 79 | assert np.allclose(opt.epsilon.eval(), 1e-8) 80 | break 81 | 82 | def test_params(self): 83 | opt, data = util.build_rosen('adagrad') 84 | for _ in opt.iterate(data, rms_regularizer=0.1): 85 | assert np.allclose(opt.learning_rate.eval(), 1e-4) 86 | assert np.allclose(opt.epsilon.eval(), 0.1) 87 | break 88 | 89 | 90 | class TestRMSProp: 91 | def test_rosen(self): 92 | util.assert_progress(*util.build_rosen('rmsprop')) 93 | 94 | def test_factor(self): 95 | util.assert_progress(*util.build_factor('rmsprop')) 96 | 97 | def test_default_params(self): 98 | opt, data = util.build_rosen('rmsprop') 99 | for _ in opt.iterate(data): 100 | assert np.allclose(opt.learning_rate.eval(), 1e-4) 101 | assert np.allclose(opt.ewma.eval(), np.exp(-np.log(2) / 14)) 102 | assert np.allclose(opt.epsilon.eval(), 1e-8) 103 | break 104 | 105 | def test_params(self): 106 | opt, data = util.build_rosen('rmsprop') 107 | for _ in opt.iterate(data, 108 | learning_rate=0.3, 109 | rms_halflife=10, 110 | rms_regularizer=20): 111 | assert np.allclose(opt.learning_rate.eval(), 0.3) 112 | assert np.allclose(opt.ewma.eval(), np.exp(-np.log(2) / 10)) 113 | assert np.allclose(opt.epsilon.eval(), 20) 114 | break 115 | 116 | 117 | class TestADADELTA: 118 | def test_rosen(self): 119 | util.assert_progress(*util.build_rosen('adadelta')) 120 | 121 | def test_factor(self): 122 | util.assert_progress(*util.build_factor('adadelta')) 123 | 124 | def test_default_params(self): 125 | opt, data = util.build_rosen('adadelta') 126 | for _ in opt.iterate(data): 127 | assert np.allclose(opt.ewma.eval(), np.exp(-np.log(2) / 14)) 128 | assert np.allclose(opt.epsilon.eval(), 1e-8) 129 | break 130 | 131 | def test_params(self): 132 | opt, data = util.build_rosen('adadelta') 133 | for _ in opt.iterate(data, 134 | 
rms_halflife=10, 135 | rms_regularizer=20): 136 | assert np.allclose(opt.ewma.eval(), np.exp(-np.log(2) / 10)) 137 | assert np.allclose(opt.epsilon.eval(), 20) 138 | break 139 | 140 | 141 | class TestAdam: 142 | def test_rosen(self): 143 | util.assert_progress(*util.build_rosen('adam')) 144 | 145 | def test_factor(self): 146 | util.assert_progress(*util.build_factor('adam')) 147 | 148 | def test_default_params(self): 149 | opt, data = util.build_rosen('adam') 150 | for _ in opt.iterate(data): 151 | assert np.allclose(opt.learning_rate.eval(), 1e-4) 152 | assert np.allclose(opt.beta1.eval(), np.exp(-np.log(2) / 7)) 153 | assert np.allclose(opt.beta2.eval(), np.exp(-np.log(2) / 69)) 154 | assert np.allclose(opt.epsilon.eval(), 1e-8) 155 | break 156 | 157 | def test_params(self): 158 | opt, data = util.build_rosen('adam') 159 | for _ in opt.iterate(data, 160 | learning_rate=0.3, 161 | beta1_halflife=10, 162 | beta2_halflife=20, 163 | rms_regularizer=11): 164 | assert np.allclose(opt.learning_rate.eval(), 0.3) 165 | assert np.allclose(opt.beta1.eval(), np.exp(-np.log(2) / 10)) 166 | assert np.allclose(opt.beta2.eval(), np.exp(-np.log(2) / 20)) 167 | assert np.allclose(opt.epsilon.eval(), 11) 168 | break 169 | -------------------------------------------------------------------------------- /test/base_test.py: -------------------------------------------------------------------------------- 1 | import downhill 2 | import numpy as np 3 | 4 | import util 5 | 6 | 7 | class TestBuild: 8 | def test_sgd(self): 9 | assert isinstance(util.build_rosen('sgd')[0], downhill.SGD) 10 | assert isinstance(util.build_factor('sgd')[0], downhill.SGD) 11 | 12 | def test_nag(self): 13 | assert isinstance(util.build_rosen('nag')[0], downhill.NAG) 14 | 15 | def test_rprop(self): 16 | assert isinstance(util.build_rosen('RProp')[0], downhill.RProp) 17 | 18 | def test_rmsprop(self): 19 | assert isinstance(util.build_rosen('RmsProp')[0], downhill.RMSProp) 20 | 21 | def test_adadelta(self): 22 | assert isinstance(util.build_rosen('ADADELTA')[0], downhill.ADADELTA) 23 | 24 | def test_esgd(self): 25 | assert isinstance(util.build_rosen('EsGd')[0], downhill.ESGD) 26 | 27 | def test_adam(self): 28 | assert isinstance(util.build_rosen('Adam')[0], downhill.Adam) 29 | 30 | 31 | class Straight(downhill.Optimizer): 32 | def _get_updates_for(self, param, grad): 33 | yield (param, param + 1.1) 34 | 35 | 36 | class TestOptimizer: 37 | def test_rosen(self): 38 | opt, train = util.build_rosen('straight') 39 | assert isinstance(opt, Straight) 40 | 41 | # run the optimizer for three iterations. check that the x and y values 42 | # (being monitored) increase at each iteration. 43 | for i, (tm, vm) in enumerate(opt.iterate(train, max_updates=3)): 44 | assert tm['x'] >= vm['x'] 45 | assert tm['y'] >= vm['y'] 46 | assert i < 3 47 | 48 | def test_rosen_unnamed(self): 49 | opt, train = util.build_rosen('straight', name=False, monitor_gradients=True) 50 | assert isinstance(opt, Straight) 51 | 52 | # run the optimizer for three iterations. check that the x and y values 53 | # (being monitored) increase at each iteration. 54 | for i, (tm, vm) in enumerate(opt.iterate(train, max_updates=3)): 55 | assert tm['x'] >= vm['x'] 56 | assert tm['y'] >= vm['y'] 57 | # check there's a manually-named parameter in here. 58 | assert 1 == sum(1 for k in tm if 'unnamed' in k), tm 59 | assert i < 3 60 | 61 | def test_factor(self): 62 | opt, train = util.build_factor('straight') 63 | assert isinstance(opt, Straight) 64 | 65 | # run the optimizer for two iterations. 
check that the u and v values 66 | # (being monitored) are reasonable at the start. 67 | for i, (tm, vm) in enumerate(opt.iterate(train)): 68 | assert abs(vm['u<1'] - 0.001) < 1e-5 69 | assert vm['u<-1'] == 0 70 | assert vm['v<1'] == 1 71 | assert vm['v<-1'] == 0 72 | if i == 2: 73 | break 74 | 75 | def test_gradient_clip(self): 76 | opt, data = util.build_rosen('straight') 77 | for _ in opt.iterate(data, max_gradient_elem=3): 78 | assert opt.max_gradient_elem == 3 79 | break 80 | 81 | def test_set_params(self): 82 | opt, _ = util.build_rosen('straight') 83 | opt.set_params([[1, 2]]) 84 | assert np.allclose(opt._params[0].get_value(), [1, 2]) 85 | 86 | def test_set_best_params(self): 87 | opt, _ = util.build_rosen('straight') 88 | opt._best_params = [[1, 2]] 89 | opt.set_params('best') 90 | assert np.allclose(opt._params[0].get_value(), [1, 2]) 91 | -------------------------------------------------------------------------------- /test/dataset_test.py: -------------------------------------------------------------------------------- 1 | import downhill 2 | import numpy as np 3 | import theano 4 | import theano.tensor as TT 5 | 6 | 7 | def assert_size(ds, i, expected): 8 | s = ds._slices[i][0][0] 9 | assert s.stop - s.start == expected 10 | 11 | 12 | class TestDataset: 13 | def test_rng(self): 14 | ds = downhill.Dataset([np.random.randn(40, 2)], rng=4) 15 | assert ds.rng.randint(10) == 7 16 | ds = downhill.Dataset([np.random.randn(40, 2)], rng=np.random.RandomState(4)) 17 | assert ds.rng.randint(10) == 7 18 | 19 | def test_name(self): 20 | ds = downhill.Dataset([np.random.randn(40, 2)], name='foo') 21 | assert ds.name == 'foo' 22 | ds = downhill.Dataset([np.random.randn(40, 2)]) 23 | assert ds.name.startswith('dataset') 24 | assert ds.name[7:].isdigit() 25 | 26 | def test_batch_size(self): 27 | ds = downhill.Dataset([np.random.randn(40, 2)], batch_size=10, rng=4) 28 | assert len(ds._slices) == 4 29 | assert_size(ds, 0, 10) 30 | assert_size(ds, 1, 10) 31 | assert_size(ds, 2, 10) 32 | assert_size(ds, 3, 10) 33 | ds = downhill.Dataset([np.random.randn(40, 2)], batch_size=11, rng=4) 34 | assert len(ds._slices) == 4 35 | assert_size(ds, 0, 11) 36 | assert_size(ds, 1, 11) 37 | assert_size(ds, 2, 7) 38 | assert_size(ds, 3, 11) 39 | 40 | def test_batch_size_zero(self): 41 | ds = downhill.Dataset([np.random.randn(40, 2)], batch_size=0, rng=4) 42 | assert len(ds._slices) == 1 43 | assert_size(ds, 0, 40) 44 | 45 | def test_iteration_size(self): 46 | def batches_unchanged(previous): 47 | return all(a == b for a, b in zip(ds._slices, previous)) 48 | 49 | ds = downhill.Dataset([np.random.randn(40, 2)], 50 | batch_size=5, iteration_size=3) 51 | 52 | previous = list(ds._slices) 53 | c = sum(1 for _ in ds) 54 | assert c == 3, 'got {}'.format(c) 55 | assert ds._index == 3, 'got {}'.format(ds._index) 56 | assert batches_unchanged(previous) 57 | 58 | previous = list(ds._slices) 59 | c = sum(1 for _ in ds) 60 | assert c == 3 61 | assert ds._index == 6, 'got {}'.format(ds._index) 62 | assert batches_unchanged(previous) 63 | 64 | previous = list(ds._slices) 65 | c = sum(1 for _ in ds) 66 | assert c == 3 67 | assert ds._index == 1, 'got {}'.format(ds._index) 68 | assert not batches_unchanged(previous) 69 | 70 | def test_callable(self): 71 | def batches(): 72 | return 'hello' 73 | ds = downhill.Dataset(batches, iteration_size=10) 74 | assert list(ds) == ['hello'] * 10 75 | 76 | def test_callable_length(self): 77 | class Batches: 78 | called = 0 79 | 80 | def __call__(self): 81 | self.called += 1 82 | return 'hello' 
83 | 84 | def __len__(self): 85 | return 10 86 | 87 | batches = Batches() 88 | ds = downhill.Dataset(batches, iteration_size=10) 89 | assert list(ds) == ['hello'] * 10 90 | assert batches.called == 10 91 | 92 | def test_shared(self): 93 | x = theano.shared(np.random.randn(40, 2)) 94 | ds = downhill.Dataset([x], batch_size=10, rng=4) 95 | assert len(ds._slices) == 4 96 | assert_size(ds, 0, 10) 97 | assert_size(ds, 1, 10) 98 | assert_size(ds, 2, 10) 99 | assert_size(ds, 3, 10) 100 | f = list(ds)[0][0] 101 | assert isinstance(f, TT.TensorVariable), type(f) 102 | 103 | def test_pandas(self): 104 | import pandas as pd 105 | x = pd.DataFrame(np.random.randn(40, 2)) 106 | ds = downhill.Dataset([x], batch_size=10, rng=4) 107 | assert len(ds._slices) == 4 108 | assert_size(ds, 0, 10) 109 | assert_size(ds, 1, 10) 110 | assert_size(ds, 2, 10) 111 | assert_size(ds, 3, 10) 112 | f = list(ds)[0][0] 113 | assert isinstance(f, pd.DataFrame), type(f) 114 | 115 | def test_sparse_csc(self): 116 | import scipy.sparse as ss 117 | x = ss.csc_matrix(np.random.randn(40, 2)) 118 | ds = downhill.Dataset([x], batch_size=10, rng=4) 119 | assert len(ds._slices) == 4 120 | assert_size(ds, 0, 10) 121 | assert_size(ds, 1, 10) 122 | assert_size(ds, 2, 10) 123 | assert_size(ds, 3, 10) 124 | f = list(ds)[0][0] 125 | assert isinstance(f, ss.csc.csc_matrix), type(f) 126 | 127 | def test_sparse_csr(self): 128 | import scipy.sparse as ss 129 | x = ss.csr_matrix(np.random.randn(40, 2)) 130 | ds = downhill.Dataset([x], batch_size=10, rng=4) 131 | assert len(ds._slices) == 4 132 | assert_size(ds, 0, 10) 133 | assert_size(ds, 1, 10) 134 | assert_size(ds, 2, 10) 135 | assert_size(ds, 3, 10) 136 | f = list(ds)[0][0] 137 | assert isinstance(f, ss.csr.csr_matrix), type(f) 138 | 139 | def test_bad_input_type(self): 140 | try: 141 | downhill.Dataset([[1]]) 142 | assert False 143 | except ValueError: 144 | pass 145 | -------------------------------------------------------------------------------- /test/downhill_test.py: -------------------------------------------------------------------------------- 1 | import downhill 2 | import numpy as np 3 | import theano 4 | 5 | 6 | class TestMinimize: 7 | def test_minimize(self): 8 | x = theano.shared(-3 + np.zeros((2, ), 'f'), name='x') 9 | data = downhill.Dataset(np.zeros((1, 1), 'f'), batch_size=1) 10 | data._slices = [[]] 11 | downhill.minimize( 12 | (100 * (x[1:] - x[:-1] ** 2) ** 2 + (1 - x[:-1]) ** 2).sum(), 13 | data, 14 | algo='nag', 15 | learning_rate=0.001, 16 | momentum=0.9, 17 | patience=1, 18 | min_improvement=0.1, 19 | max_gradient_norm=1, 20 | ) 21 | assert np.allclose(x.get_value(), [1, 1]), x.get_value() 22 | -------------------------------------------------------------------------------- /test/first_order_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import util 4 | 5 | 6 | class TestSGD: 7 | def test_rosen(self): 8 | util.assert_progress( 9 | *util.build_rosen('sgd'), 10 | monitor_gradients=True) 11 | 12 | def test_factor(self): 13 | util.assert_progress( 14 | *util.build_factor('sgd'), 15 | max_gradient_elem=1, 16 | nesterov=False) 17 | 18 | def test_factor_nesterov(self): 19 | util.assert_progress( 20 | *util.build_factor('sgd'), 21 | max_gradient_norm=1) 22 | 23 | def test_default_params(self): 24 | opt, data = util.build_rosen('sgd') 25 | for _ in opt.iterate(data): 26 | assert opt.nesterov is False 27 | assert np.allclose(opt.learning_rate.eval(), 1e-4) 28 | assert np.allclose(opt.momentum, 0) 29 | 
assert np.allclose(opt.patience, 5) 30 | assert np.allclose(opt.min_improvement, 0) 31 | assert np.allclose(opt.max_gradient_norm, 0) 32 | assert np.allclose(opt.max_gradient_elem, 0) 33 | break 34 | 35 | def test_params(self): 36 | opt, data = util.build_rosen('sgd') 37 | for _ in opt.iterate(data, 38 | learning_rate=0.3, 39 | momentum=10, 40 | patience=20, 41 | min_improvement=0.1, 42 | max_gradient_elem=4, 43 | max_gradient_norm=5, 44 | nesterov=True): 45 | assert opt.nesterov is True 46 | assert np.allclose(opt.learning_rate.eval(), 0.3) 47 | assert np.allclose(opt.momentum, 10) 48 | assert np.allclose(opt.patience, 20) 49 | assert np.allclose(opt.min_improvement, 0.1) 50 | assert np.allclose(opt.max_gradient_norm, 5) 51 | assert np.allclose(opt.max_gradient_elem, 4) 52 | break 53 | 54 | 55 | class TestNAG: 56 | def test_rosen(self): 57 | util.assert_progress(*util.build_rosen('nag')) 58 | 59 | def test_factor(self): 60 | util.assert_progress(*util.build_factor('nag'), max_gradient_elem=1) 61 | 62 | def test_default_params(self): 63 | opt, data = util.build_rosen('nag') 64 | for _ in opt.iterate(data): 65 | assert opt.nesterov is True 66 | assert np.allclose(opt.learning_rate.eval(), 1e-4) 67 | assert np.allclose(opt.momentum, 0) 68 | assert np.allclose(opt.patience, 5) 69 | assert np.allclose(opt.min_improvement, 0) 70 | assert np.allclose(opt.max_gradient_norm, 0) 71 | assert np.allclose(opt.max_gradient_elem, 0) 72 | break 73 | 74 | def test_params(self): 75 | opt, data = util.build_rosen('nag') 76 | for _ in opt.iterate(data, 77 | learning_rate=0.3, 78 | momentum=10, 79 | patience=20, 80 | min_improvement=0.1, 81 | max_gradient_elem=4, 82 | max_gradient_norm=5, 83 | nesterov=False): 84 | assert opt.nesterov is True # nesterov always True for NAG 85 | assert np.allclose(opt.learning_rate.eval(), 0.3) 86 | assert np.allclose(opt.momentum, 10) 87 | assert np.allclose(opt.patience, 20) 88 | assert np.allclose(opt.min_improvement, 0.1) 89 | assert np.allclose(opt.max_gradient_norm, 5) 90 | assert np.allclose(opt.max_gradient_elem, 4) 91 | break 92 | -------------------------------------------------------------------------------- /test/util.py: -------------------------------------------------------------------------------- 1 | import downhill 2 | import numpy as np 3 | import theano 4 | import theano.tensor as TT 5 | 6 | 7 | def build_rosen(algo, name=True, monitor_gradients=False): 8 | x = theano.shared(-3 + np.zeros((2, ), 'f'), name='x' if name else None) 9 | return downhill.build( 10 | algo, 11 | loss=(100 * (x[1:] - x[:-1] ** 2) ** 2 + (1 - x[:-1]) ** 2).sum(), 12 | monitors=[('x', x[:-1].sum()), ('y', x[1:].sum())], 13 | monitor_gradients=monitor_gradients, 14 | ), None 15 | 16 | 17 | def build_factor(algo): 18 | a = np.arange(1000).reshape((100, 10)).astype('f') 19 | b = 0.1 + np.zeros((10, 100), 'f') 20 | 21 | x = TT.matrix('x') 22 | u = theano.shared(a, name='u') 23 | v = theano.shared(0.1 + b, name='v') 24 | return downhill.build( 25 | algo, 26 | loss=TT.sum(TT.sqr(x - TT.dot(u, v))), 27 | monitors=[ 28 | ('u<1', (u < 1).mean()), 29 | ('u<-1', (u < -1).mean()), 30 | ('v<1', (v < 1).mean()), 31 | ('v<-1', (v < -1).mean()), 32 | ]), [[np.dot(a, b) + np.random.randn(100, 100).astype('f')] 33 | for _ in range(10)] 34 | 35 | 36 | def assert_progress(opt, train, valid=None, **kwargs): 37 | mover = opt.iterate(train, valid=valid, **kwargs) 38 | train0, valid0 = next(mover) 39 | train1, valid1 = next(mover) 40 | assert train1['loss'] < valid0['loss'] # should have made progress! 
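    # validation only runs every validate_every iterations (10 by default),
    # so the second iteration re-yields the monitors computed before the first.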
41 | assert valid1['loss'] == valid0['loss'] # no new validation occurred 42 | --------------------------------------------------------------------------------
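For orientation, the pieces above compose roughly as follows. This is an illustrative sketch, not a file from the repository; the data, loss, and hyperparameter values are invented, but each call mirrors usage shown in the examples and tests above.

import numpy as np
import theano
import theano.tensor as TT

import downhill

F = theano.config.floatX

# invented data: recover w such that dot(x, w) ~= y.
xs = np.random.randn(1000, 10).astype(F)
ys = (xs.sum(axis=1, keepdims=True) + 0.1 * np.random.randn(1000, 1)).astype(F)

x = TT.matrix('x')
y = TT.matrix('y')
w = theano.shared(np.zeros((10, 1), F), name='w')

# minimize() wraps the arrays in a Dataset, builds the requested
# optimizer, and runs it with early stopping.
downhill.minimize(
    loss=TT.sqr(TT.dot(x, w) - y).mean(),
    params=[w],
    inputs=[x, y],
    train=[xs, ys],
    batch_size=32,
    algo='nag',
    learning_rate=0.1,
    momentum=0.9,
    patience=3,
    min_improvement=0.01,
)

print(w.get_value().ravel())  # entries should have moved toward 1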