├── CITATION.cff ├── hessianlearn ├── test │ ├── __init__.py │ ├── test_randomizedEigensolver.py │ ├── test_rangeFinders.py │ ├── test_HessianlearnModel.py │ └── test_varianceBasedNystrom.py ├── version.py ├── model │ └── __init__.py ├── utilities │ ├── plotting.py │ ├── __init__.py │ ├── parameterList.py │ └── finiteDifferenceCheck.py ├── __init__.py ├── data │ ├── __init__.py │ └── lfw.py ├── problem │ ├── __init__.py │ ├── preconditioner.py │ ├── regularization.py │ └── hessian.py └── algorithms │ ├── __init__.py │ ├── globalization.py │ ├── gradientDescent.py │ ├── adam.py │ ├── optimizer.py │ ├── gmresSolver.py │ ├── inexactNewtonMINRES.py │ ├── inexactNewtonGMRES.py │ ├── minresSolver.py │ ├── rangeFinders.py │ ├── inexactNewtonCG.py │ ├── randomizedEigensolver.py │ ├── varianceBasedNystrom.py │ ├── lowRankSaddleFreeNewton.py │ └── cgSolver.py ├── ci └── run_tests.py ├── .travis.yml ├── .gitignore ├── applications ├── README.md ├── mnist │ ├── mnist_autoencoder.py │ └── mnist_vae.py └── transfer_learning │ ├── imagenet_cifar10_classification_evaluate_test.py │ ├── imagenet_cifar100_classification_evaluate_test.py │ ├── imagenet_cifar10_classification.py │ └── imagenet_cifar100_classification.py ├── LICENSE.md └── README.md /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | authors: 4 | - family-names: "O'Leary-Roseberry" 5 | given-names: "Thomas" 6 | orcid: "https://orcid.org/0000-0002-8938-7074" 7 | title: "hessianlearn: Stochastic Nonconvex Optimization in TensorFlow and keras" 8 | version: 0.1.0 9 | doi: 10.5281/zenodo.4608644 10 | date-released: 2021-03-16 11 | url: "https://github.com/tomoleary/hessianlearn" -------------------------------------------------------------------------------- /hessianlearn/test/__init__.py: -------------------------------------------------------------------------------- 1 | # This file is part of the hessianlearn package 2 | # 3 | # hessianlearn is free software: you can redistribute it and/or modify 4 | # it under the terms of the GNU Lesser General Public License as published by 5 | # the Free Software Foundation, either version 3 of the License, or any later version. 6 | # 7 | # hessianlearn is distributed in the hope that it will be useful, 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | # GNU Lesser General Public License for more details. 11 | # 12 | # You should have received a copy of the GNU Lesser General Public License 13 | # If not, see . 14 | # 15 | # Author: Tom O'Leary-Roseberry 16 | # Contact: tom.olearyroseberry@utexas.edu 17 | -------------------------------------------------------------------------------- /hessianlearn/version.py: -------------------------------------------------------------------------------- 1 | # This file is part of the hessianlearn package 2 | # 3 | # hessianlearn is free software: you can redistribute it and/or modify 4 | # it under the terms of the GNU Lesser General Public License as published by 5 | # the Free Software Foundation, either version 3 of the License, or any later version. 
#
# hessianlearn is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# If not, see http://www.gnu.org/licenses/.
#
# Author: Tom O'Leary-Roseberry
# Contact: tom.olearyroseberry@utexas.edu

# Package version as a tuple of integer components.
version_info = (0, 2, 0)
# The same version rendered as a dot-separated string.
__version__ = '.'.join(map(str, version_info))


# --------------------------------------------------------------------------------
# The remainder of this source line belongs to the next file in the listing;
# it is kept verbatim as comments so that no content is lost.
# /hessianlearn/model/__init__.py:
# --------------------------------------------------------------------------------
# This file is part of the hessianlearn package
#
# hessianlearn is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or any later version.
#
# hessianlearn is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# If not, see http://www.gnu.org/licenses/.
14 | # 15 | # Author: Tom O'Leary-Roseberry 16 | # Contact: tom.olearyroseberry@utexas.edu 17 | 18 | from .model import HessianlearnModel, HessianlearnModelSettings 19 | 20 | from .kerasModelWrapper import KerasModelWrapper, KerasModelWrapperSettings 21 | -------------------------------------------------------------------------------- /hessianlearn/utilities/plotting.py: -------------------------------------------------------------------------------- 1 | # This file is part of the hessianlearn package 2 | # 3 | # hessianlearn is free software: you can redistribute it and/or modify 4 | # it under the terms of the GNU Lesser General Public License as published by 5 | # the Free Software Foundation, either version 3 of the License, or any later version. 6 | # 7 | # hessianlearn is distributed in the hope that it will be useful, 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | # GNU Lesser General Public License for more details. 11 | # 12 | # You should have received a copy of the GNU Lesser General Public License 13 | # If not, see . 14 | # 15 | # Author: Tom O'Leary-Roseberry 16 | # Contact: tom.olearyroseberry@utexas.edu 17 | 18 | from __future__ import absolute_import, division, print_function 19 | from abc import ABC, abstractmethod 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /hessianlearn/utilities/__init__.py: -------------------------------------------------------------------------------- 1 | # This file is part of the hessianlearn package 2 | # 3 | # hessianlearn is free software: you can redistribute it and/or modify 4 | # it under the terms of the GNU Lesser General Public License as published by 5 | # the Free Software Foundation, either version 3 of the License, or any later version. 
6 | # 7 | # hessianlearn is distributed in the hope that it will be useful, 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | # GNU Lesser General Public License for more details. 11 | # 12 | # You should have received a copy of the GNU Lesser General Public License 13 | # If not, see . 14 | # 15 | # Author: Tom O'Leary-Roseberry 16 | # Contact: tom.olearyroseberry@utexas.edu 17 | 18 | from __future__ import absolute_import, division, print_function 19 | 20 | from .parameterList import ParameterList 21 | 22 | from .finiteDifferenceCheck import finite_difference_check -------------------------------------------------------------------------------- /hessianlearn/__init__.py: -------------------------------------------------------------------------------- 1 | # This file is part of the hessianlearn package 2 | # 3 | # hessianlearn is free software: you can redistribute it and/or modify 4 | # it under the terms of the GNU Lesser General Public License as published by 5 | # the Free Software Foundation, either version 3 of the License, or any later version. 6 | # 7 | # hessianlearn is distributed in the hope that it will be useful, 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | # GNU Lesser General Public License for more details. 11 | # 12 | # You should have received a copy of the GNU Lesser General Public License 13 | # If not, see . 
14 | # 15 | # Author: Tom O'Leary-Roseberry 16 | # Contact: tom.olearyroseberry@utexas.edu 17 | 18 | from __future__ import absolute_import, division, print_function 19 | 20 | from .algorithms import * 21 | 22 | from .problem import * 23 | 24 | from .model import * 25 | 26 | from .data import * 27 | 28 | from .utilities import * -------------------------------------------------------------------------------- /hessianlearn/data/__init__.py: -------------------------------------------------------------------------------- 1 | # This file is part of the hessianlearn package 2 | # 3 | # hessianlearn is free software: you can redistribute it and/or modify 4 | # it under the terms of the GNU Lesser General Public License as published by 5 | # the Free Software Foundation, either version 3 of the License, or any later version. 6 | # 7 | # hessianlearn is distributed in the hope that it will be useful, 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | # GNU Lesser General Public License for more details. 11 | # 12 | # You should have received a copy of the GNU Lesser General Public License 13 | # If not, see . 
14 | # 15 | # Author: Tom O'Leary-Roseberry 16 | # Contact: tom.olearyroseberry@utexas.edu 17 | 18 | from __future__ import absolute_import, division, print_function 19 | 20 | from .data import * 21 | 22 | # from .mnist import load_mnist 23 | 24 | # from .lfw import load_lfw 25 | 26 | # from .cifar10 import load_cifar10 27 | -------------------------------------------------------------------------------- /ci/run_tests.py: -------------------------------------------------------------------------------- 1 | # This file is part of the hessianlearn package 2 | # 3 | # hessianlearn is free software: you can redistribute it and/or modify 4 | # it under the terms of the GNU Lesser General Public License as published by 5 | # the Free Software Foundation, either version 3 of the License, or any later version. 6 | # 7 | # hessianlearn is distributed in the hope that it will be useful, 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | # GNU Lesser General Public License for more details. 11 | # 12 | # You should have received a copy of the GNU Lesser General Public License 13 | # If not, see . 
#
# Author: Tom O'Leary-Roseberry
# Contact: tom.olearyroseberry@utexas.edu

import os
import subprocess
import sys

# Test scripts executed by CI; paths are relative to the working directory
# the script is launched from.
# NOTE(review): the directory listing also shows
# hessianlearn/test/test_varianceBasedNystrom.py, which is not run here --
# confirm whether that omission is intentional.
TEST_SCRIPTS = (
    # Run randomized eigenvalue test
    'hessianlearn/test/test_randomizedEigensolver.py',
    'hessianlearn/test/test_rangeFinders.py',
    'hessianlearn/test/test_HessianlearnModel.py',
)


def run_tests(scripts=TEST_SCRIPTS):
    """
    Run each test script in its own interpreter and count the failures.

    Every script is executed even if an earlier one fails, so a single run
    reports all broken suites.

    -scripts: iterable of script paths to execute
    Returns the number of scripts that exited with a non-zero status.
    """
    failures = 0
    for script in scripts:
        # sys.executable keeps the child on the same interpreter (and hence
        # the same environment) as this runner instead of whatever `python`
        # happens to resolve to on PATH.
        returncode = subprocess.call([sys.executable, script])
        if returncode != 0:
            print('FAILED (exit code {0}): {1}'.format(returncode, script))
            failures += 1
    return failures


if __name__ == '__main__':
    # Propagate failures through the exit status; previously the status of
    # each os.system call was discarded, so CI could never go red.
    sys.exit(1 if run_tests() else 0)


# --------------------------------------------------------------------------------
# The remainder of this source line belongs to the next file in the listing;
# it is kept verbatim as comments so that no content is lost.
# /hessianlearn/problem/__init__.py:
# --------------------------------------------------------------------------------
# This file is part of the hessianlearn package
#
# hessianlearn is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or any later version.
#
# hessianlearn is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# If not, see http://www.gnu.org/licenses/.
14 | # 15 | # Author: Tom O'Leary-Roseberry 16 | # Contact: tom.olearyroseberry@utexas.edu 17 | 18 | from __future__ import absolute_import, division, print_function 19 | 20 | from .problem import Problem, ClassificationProblem, KerasModelProblem, RegressionProblem, H1RegressionProblem,\ 21 | AutoencoderProblem,VariationalAutoencoderProblem, GenerativeAdversarialNetworkProblem 22 | 23 | from .hessian import Hessian, HessianWrapper 24 | 25 | from .preconditioner import Preconditioner, IdentityPreconditioner 26 | 27 | from .regularization import Regularization, L2Regularization -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # This file is part of the hessianlearn package 2 | # 3 | # hessianlearn is free software: you can redistribute it and/or modify 4 | # it under the terms of the GNU Lesser General Public License as published by 5 | # the Free Software Foundation, either version 3 of the License, or any later version. 6 | # 7 | # hessianlearn is distributed in the hope that it will be useful, 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | # GNU Lesser General Public License for more details. 11 | # 12 | # You should have received a copy of the GNU Lesser General Public License 13 | # If not, see . 
14 | # 15 | # Author: Tom O'Leary-Roseberry 16 | # Contact: tom.olearyroseberry@utexas.edu 17 | 18 | language: python 19 | python: 20 | - "3.6" 21 | - "3.7" 22 | install: 23 | - sudo apt-get update 24 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 25 | - bash miniconda.sh -b -p $HOME/miniconda 26 | - source "$HOME/miniconda/etc/profile.d/conda.sh" 27 | - hash -r 28 | - "export PYTHONPATH=$PYTHONPATH:$(pwd)" 29 | - conda config --set always_yes yes --set changeps1 no 30 | - conda update -q conda 31 | # # Useful for debugging any issues with conda 32 | # - conda info -a 33 | # Replace dep1 dep2 ... with your dependencies 34 | - conda create -n hessianlearn2 python=$TRAVIS_PYTHON_VERSION tensorflow=2.0.0 scipy 35 | - conda activate hessianlearn2 36 | # # - python setup.py install 37 | script: 38 | # - ls -l 39 | # - ls ci/ 40 | - python ci/run_tests.py -------------------------------------------------------------------------------- /hessianlearn/test/test_randomizedEigensolver.py: -------------------------------------------------------------------------------- 1 | # This file is part of the hessianlearn package 2 | # 3 | # hessianlearn is free software: you can redistribute it and/or modify 4 | # it under the terms of the GNU Lesser General Public License as published by 5 | # the Free Software Foundation, either version 3 of the License, or any later version. 6 | # 7 | # hessianlearn is distributed in the hope that it will be useful, 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | # GNU Lesser General Public License for more details. 11 | # 12 | # You should have received a copy of the GNU Lesser General Public License 13 | # If not, see . 
14 | # 15 | # Author: Tom O'Leary-Roseberry 16 | # Contact: tom.olearyroseberry@utexas.edu 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import unittest 20 | import numpy as np 21 | import sys 22 | 23 | sys.path.append('../../') 24 | from hessianlearn import (randomized_eigensolver) 25 | 26 | class TestRandomizedEigensolver(unittest.TestCase): 27 | 28 | def test_basic(self): 29 | my_state = np.random.RandomState(seed=0) 30 | n = 100 31 | Q,_ = np.linalg.qr(my_state.randn(n,n)) 32 | d = np.concatenate((np.ones(10),np.exp(-np.arange(n-10)))) 33 | Aop = lambda x: Q@np.diag(d)@(Q.T@x) 34 | d_hl, Q_hl = randomized_eigensolver(Aop,100, 100) 35 | assert np.linalg.norm(d[:50] - d_hl[0:50]) < 1e-10 36 | error = np.linalg.norm(Q@np.diag(d)@Q.T - Q_hl@np.diag(d_hl)@Q_hl.T) 37 | assert error < 1e-10 38 | 39 | if __name__ == '__main__': 40 | unittest.main() -------------------------------------------------------------------------------- /hessianlearn/test/test_rangeFinders.py: -------------------------------------------------------------------------------- 1 | # This file is part of the hessianlearn package 2 | # 3 | # hessianlearn is free software: you can redistribute it and/or modify 4 | # it under the terms of the GNU Lesser General Public License as published by 5 | # the Free Software Foundation, either version 3 of the License, or any later version. 6 | # 7 | # hessianlearn is distributed in the hope that it will be useful, 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | # GNU Lesser General Public License for more details. 11 | # 12 | # You should have received a copy of the GNU Lesser General Public License 13 | # If not, see . 
14 | # 15 | # Author: Tom O'Leary-Roseberry 16 | # Contact: tom.olearyroseberry@utexas.edu 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import unittest 20 | import numpy as np 21 | import sys 22 | 23 | sys.path.append('../../') 24 | from hessianlearn import (block_range_finder) 25 | 26 | class TestRangeFinders(unittest.TestCase): 27 | 28 | def test_basic(self): 29 | my_state = np.random.RandomState(seed=0) 30 | n = 100 31 | Q,_ = np.linalg.qr(my_state.randn(n,n)) 32 | d = np.concatenate((np.ones(10),np.exp(-np.arange(n-10)))) 33 | Aop = lambda x: Q@np.diag(d)@(Q.T@x) 34 | 35 | Q_range = block_range_finder(Aop,100,1e-5,10) 36 | assert Q_range.shape[-1] <=40 37 | w_action = my_state.randn(100,1) 38 | action = Aop(w_action) 39 | error = np.linalg.norm(action - Q_range@(Q_range.T@ action)) 40 | print(error) 41 | assert error < 1e-5 42 | 43 | if __name__ == '__main__': 44 | unittest.main() -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Misc 7 | *.gz 8 | *.npy 9 | *.png 10 | *.h5 11 | *.DS_Store 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | -------------------------------------------------------------------------------- /hessianlearn/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | # This file is part of the hessianlearn package 2 | # 3 | # hessianlearn is free software: you can redistribute it and/or modify 4 | # it under the terms of the GNU Lesser General Public License as published by 5 | # the Free Software Foundation, either version 3 of the License, or any later version. 6 | # 7 | # hessianlearn is distributed in the hope that it will be useful, 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | # GNU Lesser General Public License for more details. 
11 | # 12 | # You should have received a copy of the GNU Lesser General Public License 13 | # If not, see . 14 | # 15 | # Author: Tom O'Leary-Roseberry 16 | # Contact: tom.olearyroseberry@utexas.edu 17 | 18 | from __future__ import absolute_import, division, print_function 19 | 20 | from .randomizedEigensolver import low_rank_hessian, randomized_eigensolver, eigensolver_from_range 21 | 22 | from .rangeFinders import block_range_finder, noise_aware_adaptive_range_finder 23 | 24 | from .varianceBasedNystrom import variance_based_nystrom 25 | 26 | from .optimizer import Optimizer, ParametersOptimizer 27 | 28 | from .cgSolver import CGSolver, ParametersCGSolver 29 | 30 | from .gmresSolver import GMRESSolver, ParametersGMRESSolver 31 | 32 | from .minresSolver import MINRESSolver, ParametersMINRESSolver 33 | 34 | from .adam import Adam, ParametersAdam 35 | 36 | from .gradientDescent import GradientDescent, ParametersGradientDescent 37 | 38 | from .inexactNewtonCG import InexactNewtonCG, ParametersInexactNewtonCG 39 | 40 | from .inexactNewtonGMRES import InexactNewtonGMRES, ParametersInexactNewtonGMRES 41 | 42 | from .inexactNewtonMINRES import InexactNewtonMINRES, ParametersInexactNewtonMINRES 43 | 44 | from .lowRankSaddleFreeNewton import LowRankSaddleFreeNewton, ParametersLowRankSaddleFreeNewton 45 | -------------------------------------------------------------------------------- /hessianlearn/utilities/parameterList.py: -------------------------------------------------------------------------------- 1 | # This file is part of the hessianlearn package 2 | # 3 | # hessianlearn is free software: you can redistribute it and/or modify 4 | # it under the terms of the GNU Lesser General Public License as published by 5 | # the Free Software Foundation, either version 3 of the License, or any later version. 
6 | # 7 | # hessianlearn is distributed in the hope that it will be useful, 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | # GNU Lesser General Public License for more details. 11 | # 12 | # You should have received a copy of the GNU Lesser General Public License 13 | # If not, see . 14 | 15 | from __future__ import absolute_import, division, print_function 16 | 17 | class ParameterList(object): 18 | """ 19 | A small abstract class for storing parameters and their description. 20 | This class will raise an exception if the key one tries to access is not present. 21 | """ 22 | def __init__(self, data): 23 | """ 24 | data is a dictionary where each value is the pair (value, description) 25 | """ 26 | self.data = data 27 | 28 | def __getitem__(self,key): 29 | if self.data.__contains__(key): 30 | return self.data[key][0] 31 | else: 32 | raise ValueError(key) 33 | 34 | def __setitem__(self,key, value): 35 | if self.data.__contains__(key): 36 | self.data[key][0] = value 37 | else: 38 | raise ValueError(key) 39 | 40 | def showMe(self, indent=""): 41 | for k in sorted(self.data.keys()): 42 | print( indent, "---") 43 | if type(self.data[k][0]) == ParameterList: 44 | print( indent, k, "(ParameterList):", self.data[k][1] ) 45 | self.data[k][0].showMe(indent+" ") 46 | else: 47 | print( indent, k, "({0}):".format(self.data[k][0]), self.data[k][1] ) 48 | 49 | print( indent, "---") -------------------------------------------------------------------------------- /hessianlearn/problem/preconditioner.py: -------------------------------------------------------------------------------- 1 | 2 | # This file is part of the hessianlearn package 3 | # 4 | # hessianlearn is free software: you can redistribute it and/or modify 5 | # it under the terms of the GNU Lesser General Public License as published by 6 | # the Free Software Foundation, either version 3 of the License, or any later version. 
7 | # 8 | # hessianlearn is distributed in the hope that it will be useful, 9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | # GNU Lesser General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Lesser General Public License 14 | # If not, see . 15 | # 16 | # Author: Tom O'Leary-Roseberry 17 | # Contact: tom.olearyroseberry@utexas.edu 18 | 19 | from __future__ import absolute_import, division, print_function 20 | import numpy as np 21 | import tensorflow as tf 22 | # tf.compat.v1.enable_eager_execution() 23 | if int(tf.__version__[0]) > 1: 24 | import tensorflow.compat.v1 as tf 25 | tf.disable_v2_behavior() 26 | # tf.enable_eager_execution() 27 | 28 | class Preconditioner(object): 29 | """ 30 | This class describes a preconditioner, currently it is empty 31 | 32 | Child class should implement method __call__ which implements 33 | the preconditioner approximation of the (Hessian) inverse 34 | """ 35 | 36 | 37 | class IdentityPreconditioner(Preconditioner): 38 | """ 39 | This class describes identity preconditioning, which means doing nothing 40 | """ 41 | def __init__(self,problem,dtype = tf.float32): 42 | """ 43 | The constructor for this class takes: 44 | -problem: hessianlearn.problem.Problem class 45 | -dtype: data type 46 | """ 47 | # Rethink this later and improve for Krylov methods. 48 | self.x = tf.placeholder(dtype,problem.gradient.shape,name='vec_for_prec_apply') 49 | 50 | 51 | def __call__(self): 52 | """ 53 | The call method simply returns vector which must be passed to 54 | the sess at runtime. self.x is a placeholder variable. 
55 | """ 56 | return self.x 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /hessianlearn/problem/regularization.py: -------------------------------------------------------------------------------- 1 | # This file is part of the hessianlearn package 2 | # 3 | # hessianlearn is free software: you can redistribute it and/or modify 4 | # it under the terms of the GNU Lesser General Public License as published by 5 | # the Free Software Foundation, either version 3 of the License, or any later version. 6 | # 7 | # hessianlearn is distributed in the hope that it will be useful, 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | # GNU Lesser General Public License for more details. 11 | # 12 | # You should have received a copy of the GNU Lesser General Public License 13 | # If not, see . 14 | # 15 | # Author: Tom O'Leary-Roseberry 16 | # Contact: tom.olearyroseberry@utexas.edu 17 | 18 | from __future__ import absolute_import, division, print_function 19 | import numpy as np 20 | import tensorflow as tf 21 | # tf.compat.v1.enable_eager_execution() 22 | if int(tf.__version__[0]) > 1: 23 | import tensorflow.compat.v1 as tf 24 | # tf.disable_v2_behavior() 25 | # tf.enable_eager_execution() 26 | 27 | from ..utilities import ParameterList 28 | from abc import ABC, abstractmethod 29 | 30 | 31 | def ParametersRegularization(dictionary = {}): 32 | parameters = dictionary 33 | parameters["gamma"] = [1e-1, "regularization parameter"] 34 | 35 | return ParameterList(parameters) 36 | 37 | class Regularization (ABC): 38 | """ 39 | This class describes the components of regularization used during training. 
40 | 41 | The child class implements the specifics during construction 42 | """ 43 | 44 | @property 45 | def cost(self): 46 | return self._cost 47 | 48 | @property 49 | def gradient(self): 50 | return self._gradient 51 | 52 | @property 53 | def Hdw(self): 54 | return self._Hdw 55 | 56 | class L2Regularization(Regularization): 57 | """ 58 | This class implements standard Tikhonov (L2) regularization 59 | with regularization parameter gamma 60 | (gamma/2)||w||^2 61 | """ 62 | def __init__(self,problem, gamma = None,parameters = ParametersRegularization(),dtype = tf.float32): 63 | """ 64 | The constructor for this class takes 65 | -problem: The description of the training problem i.e. hessianlearn.problem.Problem variant 66 | -gamma: The regularization parameter, can be found via Morozov discrepancy, trial and error etc. 67 | """ 68 | self.problem = problem 69 | self.parameters = parameters 70 | 71 | if gamma is not None: 72 | self.parameters['gamma'] = gamma 73 | 74 | self._cost = 0.5*self.parameters['gamma']*tf.reduce_sum(self.problem._flat_w*self.problem._flat_w) 75 | 76 | self._gradient = self.parameters['gamma']*self.problem._flat_w 77 | 78 | self._Hdw = self.parameters['gamma']*self.problem.dw 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /hessianlearn/algorithms/globalization.py: -------------------------------------------------------------------------------- 1 | # This file is part of the hessianlearn package 2 | # 3 | # hessianlearn is free software: you can redistribute it and/or modify 4 | # it under the terms of the GNU Lesser General Public License as published by 5 | # the Free Software Foundation, either version 3 of the License, or any later version. 
6 | # 7 | # hessianlearn is distributed in the hope that it will be useful, 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | # GNU Lesser General Public License for more details. 11 | # 12 | # You should have received a copy of the GNU Lesser General Public License 13 | # If not, see . 14 | # 15 | # Author: Tom O'Leary-Roseberry 16 | # Contact: tom.olearyroseberry@utexas.edu 17 | 18 | from __future__ import absolute_import, division, print_function 19 | import numpy as np 20 | 21 | # from ..utilities.mpiFunctions import * 22 | 23 | 24 | 25 | 26 | def ArmijoLineSearch(w_dir,w_dir_inner_g,cost_at_candidate, initial_cost, c_armijo = 1e-4 ,alpha =1.0, max_backtracking_iter = 10,comm = None): 27 | """ 28 | This function implements Armijo line search given: 29 | -w_dir: 30 | -w_dir_inner_g: 31 | -cost_at_candidate: 32 | -initial_cost: 33 | -c_armijo: 34 | -alpha: 35 | -max_backtracking_iter: 36 | -comm 37 | """ 38 | # Armijo Line Search 39 | line_search, line_search_iter = ( True, 0 ) 40 | while line_search and (line_search_iter 0.75 and on_boundary: 79 | self.radius *= 2. 
80 | # self.delta *= max(2,self.delta_hat) 81 | if rho > self.eta: 82 | accept_step = True 83 | else: 84 | accept_step = False 85 | 86 | return accept_step 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /applications/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ___ ___ ___ ___ ___ ___ 6 | /__/\ / /\ / /\ / /\ ___ / /\ /__/\ 7 | \ \:\ / /:/_ / /:/_ / /:/_ / /\ / /::\ \ \:\ 8 | \__\:\ / /:/ /\ / /:/ /\ / /:/ /\ / /:/ / /:/\:\ \ \:\ 9 | ___ / /::\ / /:/ /:/_ / /:/ /::\ / /:/ /::\ /__/::\ / /:/~/::\ _____\__\:\ 10 | /__/\ /:/\:\/__/:/ /:/ /\/__/:/ /:/\:\/__/:/ /:/\:\\__\/\:\__ /__/:/ /:/\:\/__/::::::::\ 11 | \ \:\/:/__\/\ \:\/:/ /:/\ \:\/:/~/:/\ \:\/:/~/:/ \ \:\/\\ \:\/:/__\/\ \:\~~\~~\/ 12 | \ \::/ \ \::/ /:/ \ \::/ /:/ \ \::/ /:/ \__\::/ \ \::/ \ \:\ ~~~ 13 | \ \:\ \ \:\/:/ \__\/ /:/ \__\/ /:/ /__/:/ \ \:\ \ \:\ 14 | \ \:\ \ \::/ /__/:/ /__/:/ \__\/ \ \:\ \ \:\ 15 | \__\/ \__\/ \__\/ \__\/ \__\/ \__\/ 16 | 17 | 18 | ___ ___ ___ ___ 19 | / /\ / /\ / /\ /__/\ 20 | / /:/_ / /::\ / /::\ \ \:\ 21 | ___ ___ / /:/ /\ / /:/\:\ / /:/\:\ \ \:\ 22 | /__/\ / /\ / /:/ /:/_ / /:/~/::\ / /:/~/:/ _____\__\:\ 23 | \ \:\ / /://__/:/ /:/ /\/__/:/ /:/\:\/__/:/ /:/___/__/::::::::\ 24 | \ \:\ /:/ \ \:\/:/ /:/\ \:\/:/__\/\ \:\/:::::/\ \:\~~\~~\/ 25 | \ \:\/:/ \ \::/ /:/ \ \::/ \ \::/~~~~ \ \:\ ~~~ 26 | \ \::/ \ \:\/:/ \ \:\ \ \:\ \ \:\ 27 | \__\/ \ \::/ \ \:\ \ \:\ \ \:\ 28 | \__\/ \__\/ \__\/ \__\/ 29 | 30 | 31 | 32 | # Transfer Learning 33 | 34 | * Examples of CIFAR10, CIFAR100 classification from pre-trained Imagenet ResNet50 model in `transfer_learning/` 35 | 36 | * Pre-trained model serves as well conditioned initial guess for transfer learning. In this setting Newton methods perform well due to their excellent properties in local convergence. Low Rank Saddle Free Newton is able to zero in on highly generalizable local minimizers bypassing indefinite regions. 
Below are the validation accuracies obtained with the best fixed step length for each of Adam, SGD, and LRSFN (with a fixed rank of 40). 37 | 38 |

39 | 40 |

41 | 42 | * For more information see the following manuscript 43 | 44 | - \[2\] O'Leary-Roseberry, T., Alger, N., Ghattas O., 45 | [**Low Rank Saddle Free Newton: A Scalable Method for Stochastic Nonconvex Optimization**](https://arxiv.org/abs/2002.02881). 46 | arXiv:2002.02881. 47 | ([Download](https://arxiv.org/pdf/2002.02881.pdf))
BibTeX
48 | @article{OLearyRoseberryAlgerGhattas2020,
49 |   title={Low Rank Saddle Free Newton: A Scalable Method for Stochastic Nonconvex Optimization},
50 |   author={O'Leary-Roseberry, Thomas and Alger, Nick and Ghattas, Omar},
51 |   journal={arXiv preprint arXiv:2002.02881},
52 |   year={2020}
53 | }
54 | }
class TestHessianlearnModel(unittest.TestCase):
    """
    End-to-end smoke test for HessianlearnModel: each supported optimizer
    must lower the training loss of a small dense MNIST classifier.
    """

    def test_all_optimizers(self):
        """
        Fit the same model with every optimizer and check that the last
        logged training loss is below the first logged training loss.
        """
        # Instantiate data
        (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
        # Normalize the data
        x_train = x_train.astype('float32') / 255.
        x_test = x_test.astype('float32') / 255.

        def one_hot_vectors(labels_temp):
            # Row k of the 10x10 identity is the one-hot encoding of digit k,
            # so fancy-indexing by the label array encodes all labels at once.
            return np.eye(10)[labels_temp]

        y_train = one_hot_vectors(y_train)
        y_test = one_hot_vectors(y_test)
        # Instantiate neural network
        classifier = tf.keras.Sequential([
            tf.keras.layers.Flatten(input_shape=(28, 28)),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dense(10)
        ])
        # Instantiate the problem, regularization.
        problem = ClassificationProblem(classifier,loss_type = 'cross_entropy',dtype=tf.float32)
        regularization = L2Regularization(problem,gamma =0.)
        # Instantiate the data object
        train_dict = {problem.x:x_train, problem.y_true:y_train}
        validation_dict = {problem.x:x_test, problem.y_true:y_test}
        data = Data(train_dict,32,validation_data = validation_dict,hessian_batch_size = 8)
        # Instantiate the model object
        HLModelSettings = HessianlearnModelSettings()
        HLModelSettings['max_sweeps'] = 1.
        HLModel = HessianlearnModel(problem,regularization,data,settings = HLModelSettings)

        for optimizer in ['lrsfn','adam','gd','sgd','incg']:
            HLModel.settings['optimizer'] = optimizer
            if optimizer == 'incg':
                # Inexact Newton CG is run with a smaller step length here.
                HLModel.settings['alpha'] = 1e-4
            HLModel.fit()
            first_loss = HLModel.logger['train_loss'][0]
            last_iteration = max(HLModel.logger['train_loss'].keys())
            last_loss = HLModel.logger['train_loss'][last_iteration]
            print('first loss = ',first_loss)
            print('last_loss = ',last_loss)
            # assertLess (unlike a bare assert) survives `python -O` and
            # reports both values together with the failing optimizer.
            self.assertLess(last_loss, first_loss,
                            msg = 'optimizer {0} did not reduce the training loss'.format(optimizer))
def finite_difference_check(sess,problem, feed_dict, w = None, dw=None,verbose = False):
    """
    This method implements finite difference checks for a given hessianlearn.problem.Problem
        -sess: tf.Session()
        -problem: hessianlearn.problem.Problem
        -feed_dict: data used for computation of cost, grad and hess
            (a private copy is used; the caller's dictionary is not modified)
        -w: the point the finite difference check is evaluated at
            (defaults to the weights currently held by the session)
        -dw: the direction for the finite difference check
            (defaults to a direction of all ones)
        -verbose: Boolean for printing
    Returns a dictionary with the keys 'epsilon', 'error_g' and 'error_H'
    holding the step sizes and the gradient / Hessian-action errors.
    The weights held by the session are restored before returning,
    also when an evaluation raises.
    """
    # problem.dw is added to the feed dictionary below; do that on a copy so
    # the dictionary owned by the caller is left untouched.
    feed_dict = dict(feed_dict)

    # We will need to modify w during this process so we copy
    # the initial values of w so we can replace them later
    print('Copying initial w since it will be modified during this check')
    w_array = sess.run(problem.w)

    if w is None:
        w = w_array
    else:
        # The reference loss, gradient and Hessian action must be evaluated at
        # the same point the perturbations start from.
        sess.run(problem._assign_to_w(w))

    if dw is None:
        dw = [np.ones_like(w_i) for w_i in w]

    eps = np.power(2., np.linspace(-32, 0, 33))
    error_g = np.zeros_like(eps)
    error_H = np.zeros_like(eps)

    try:
        initial_loss = sess.run(problem.loss,feed_dict)
        initial_g = sess.run(problem.gradient,feed_dict)

        feed_dict[problem.dw] = dw
        initial_gTdw = np.sum(sess.run(problem._gTdw,feed_dict))
        initial_Hdw = sess.run(problem.Hdw,feed_dict)

        if verbose:
            print('Initial loss:',initial_loss)
            print('Initial gTdw',initial_gTdw)
            print('{0:10} {1:10} {2:10} {3:10}'.format('epsilon','loss','error_g','error_H'))

        for i, eps_i in enumerate(eps):
            # Momentarily assign w + eps_i*dw
            new_w = [w_i + eps_i*dw_i for w_i,dw_i in zip(w,dw)]
            sess.run(problem._assign_to_w(new_w))
            # Evaluate new loss and calculate gradient error
            loss_plus = sess.run(problem.loss,feed_dict)
            error_g_i = np.abs( (loss_plus - initial_loss)/eps_i - initial_gTdw)
            error_g[i] = error_g_i
            # Evaluate new gradient and calculate Hessian error
            g_plus = sess.run(problem.gradient,feed_dict)
            error_H_i_ = [(g_plus_i - initial_g_i)/eps_i - initial_Hdw_i
                            for g_plus_i,initial_g_i,initial_Hdw_i in zip(g_plus,initial_g,initial_Hdw)]
            error_H_i = np.sqrt(np.sum([np.linalg.norm(e)**2 for e in error_H_i_]))
            error_H[i] = error_H_i

            if verbose:
                print('{0:1.4e} {1:1.4e} {2:1.4e} {3:1.4e}'.format(eps_i,loss_plus,error_g_i,error_H_i))
    finally:
        # The assignment only takes effect when it is run in the session;
        # merely constructing it leaves the perturbed weights in place.
        sess.run(problem._assign_to_w(w_array))
        print('Succesfully re-assigned w')

    out = {}
    out['epsilon'] = eps
    out['error_g'] = error_g
    out['error_H'] = error_H

    return out
def ParametersGradientDescent(parameters = None):
    """
    Builds the default hyperparameters for GradientDescent
        -parameters: optional dictionary that is filled in place with the
            default [value, description] pairs. A fresh dictionary is
            created when omitted so separate calls never share state.
    Returns a ParameterList wrapping the populated dictionary
    """
    if parameters is None:
        parameters = {}
    parameters['alpha']                         = [1e-3, "Initial steplength, or learning rate"]
    parameters['rel_tolerance']                 = [1e-3, "Relative convergence when sqrt(g,g)/sqrt(g_0,g_0) <= rel_tolerance"]
    parameters['abs_tolerance']                 = [1e-4,"Absolute converge when sqrt(g,g) <= abs_tolerance"]
    parameters['max_NN_evals_per_batch']        = [10000, "Scale constant for maximum neural network evaluations per datum"]
    parameters['max_NN_evals']                  = [None, "Maximum number of neural network evaluations"]
    parameters['max_backtracking_iter']         = [10, 'max backtracking iterations for line search']

    parameters['globalization']                 = [None, 'Choose from trust_region, line_search or none']
    # Reasons for convergence failure
    parameters['reasons']                       = [[], 'list of reasons for termination']

    return ParameterList(parameters)


class GradientDescent(Optimizer):
    """
    This class implements the gradient descent (and stochastic variant) optimizer
    """
    def __init__(self,problem,regularization,sess = None,feed_dict = None,parameters = None):
        """
        The constructor for this class takes:
            -problem: hessianlearn.problem.Problem
            -regularization: hessianlearn.problem.Regularization
                (an L2Regularization with gamma = 0 is used when None)
            -sess: tf.Session()
            -feed_dict: accepted for interface compatibility; not used here
            -parameters: hyperparameters dictionary
                (ParametersGradientDescent() is built per instance when None)
        """
        if parameters is None:
            # Built per instance: a default evaluated in the signature would be
            # one object shared by every optimizer created without parameters.
            parameters = ParametersGradientDescent()
        if regularization is None:
            # Imported here because this module does not import it at file level.
            from ..problem import L2Regularization
            _regularization = L2Regularization(problem,gamma = 0.0)
        else:
            _regularization = regularization
        super(GradientDescent,self).__init__(problem,_regularization,sess,parameters)

        self.grad = self.problem.gradient + self.regularization.gradient
        self._sweeps = np.zeros(2)

        self.trust_region_initialized = False
        if self.parameters['globalization'] == 'trust_region':
            self.alpha = 0.0
        else:
            self.alpha = self.parameters['alpha']

    def minimize(self,feed_dict = None):
        r"""
        Implements the gradient update:
            w-=alpha*g
        Takes the parameter:
            -feed_dict: data to be used to evaluate stochastic gradient and cost
        Raises ValueError when parameters['globalization'] is neither
        'line_search' nor None, since no other update is implemented here.
        """
        assert self.sess is not None
        assert feed_dict is not None

        g = self.sess.run(self.grad,feed_dict = feed_dict)

        globalization = self.parameters['globalization']
        if globalization == 'line_search':
            w_dir = -g
            w_dir_inner_g = np.inner(w_dir,g)
            initial_cost = self.sess.run(self.problem.loss, feed_dict)
            cost_at_candidate = lambda p : self._loss_at_candidate(p,feed_dict)
            self.alpha, line_search, line_search_iter = ArmijoLineSearch(w_dir,w_dir_inner_g,\
                                                                            cost_at_candidate, initial_cost)
            p = self.alpha*w_dir
            # Each backtracking step is counted as half a sweep.
            self._sweeps += [1+0.5*line_search_iter,0]
        elif globalization is None:
            self.alpha = self.parameters['alpha']
            p = -self.parameters['alpha']*g
            self._sweeps += [1,0]
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on p; fail early with an explicit message.
            raise ValueError("GradientDescent does not implement globalization = {0!r}; "
                                "use 'line_search' or None".format(globalization))

        self.p = p

        self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:p})
def ParametersAdam(parameters = None):
    """
    Builds the default hyperparameters for Adam
        -parameters: optional dictionary that is filled in place with the
            default [value, description] pairs. A fresh dictionary is
            created when omitted so separate calls never share state.
    Returns a ParameterList wrapping the populated dictionary
    """
    if parameters is None:
        parameters = {}
    parameters['alpha']                         = [1e-3, "Initial steplength, or learning rate"]
    parameters['beta_1']                        = [0.9, "Exponential decay rate for first moment"]
    parameters['beta_2']                        = [0.999, "Exponential decay rate for second moment"]
    parameters['epsilon']                       = [1e-7, "epsilon for denominator involving square root"]

    parameters['rel_tolerance']                 = [1e-3, "Relative convergence when sqrt(g,g)/sqrt(g_0,g_0) <= rel_tolerance"]
    parameters['abs_tolerance']                 = [1e-4,"Absolute converge when sqrt(g,g) <= abs_tolerance"]
    parameters['max_NN_evals_per_batch']        = [10000, "Scale constant for maximum neural network evaluations per datum"]
    parameters['max_NN_evals']                  = [None, "Maximum number of neural network evaluations"]

    parameters['globalization']                 = [None, 'Choose from trust_region, line_search or none']
    # Reasons for convergence failure
    parameters['reasons']                       = [[], 'list of reasons for termination']

    return ParameterList(parameters)


class Adam(Optimizer):
    """
    This class implements the Adam optimizer
    """
    def __init__(self,problem,regularization = None,sess = None,feed_dict= None,parameters = None):
        """
        The constructor for this class takes:
            -problem: hessianlearn.problem.Problem
            -regularization: hessianlearn.problem.Regularization
                (an L2Regularization with gamma = 0 is used when None)
            -sess: tf.Session()
            -feed_dict: accepted for interface compatibility; not used here
            -parameters: hyperparameters dictionary
                (ParametersAdam() is built per instance when None)
        """
        if parameters is None:
            # Built per instance: a default evaluated in the signature would be
            # one object shared by every optimizer created without parameters.
            parameters = ParametersAdam()
        if regularization is None:
            # Imported here because this module does not import it at file level.
            from ..problem import L2Regularization
            _regularization = L2Regularization(problem,gamma = 0.0)
        else:
            _regularization = regularization
        super(Adam,self).__init__(problem,_regularization,sess,parameters)

        self.grad = self.problem.gradient + self.regularization.gradient

        # First moment, second moment and last update, one entry per weight
        self.m = np.zeros(self.problem.dimension)
        self.v = np.zeros(self.problem.dimension)
        self.p = np.zeros(self.problem.dimension)

        self._iter = 0
        self._sweeps = np.zeros(2)

        self.alpha = self.parameters['alpha']

    def minimize(self,feed_dict = None):
        r"""
        This method implements one step of the Adam algorithm:
            m <- beta_1*m + (1-beta_1)*g
            v <- beta_2*v + (1-beta_2)*g*g
            w <- w - alpha*m_hat/(sqrt(v_hat) + epsilon)
        where m_hat and v_hat are the bias corrected moments.
            -feed_dict: data dictionary used to evaluate gradient
        """
        assert self.sess is not None
        assert feed_dict is not None
        self._iter += 1

        beta_1 = self.parameters['beta_1']
        beta_2 = self.parameters['beta_2']

        gradient = self.sess.run(self.grad,feed_dict = feed_dict)

        self.m = beta_1*self.m + (1-beta_1)*gradient
        m_hat = self.m / (1.0 - beta_1**self._iter)

        self.v = beta_2*self.v + (1-beta_2)*np.square(gradient)
        v_hat = self.v / (1.0 - beta_2**self._iter)
        v_root = np.sqrt(v_hat)

        # The bias correction lives in m_hat and v_hat, so the plain step
        # length is used here. Rescaling alpha by sqrt(1-beta_2^t)/(1-beta_1^t)
        # as well would apply the correction twice.
        update = -self.parameters['alpha']*m_hat/(v_root + self.parameters['epsilon'])
        self.p = update
        self._sweeps += [1,0]
        self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:update})
def dir_check(dir):
    """
    Creates the directory dir (and any missing parents) when nothing
    exists at that path yet; does nothing otherwise.
    """
    if not os.path.exists(dir):
        os.makedirs(dir)


def reporthook(count, block_size, total_size):
    """
    Progress callback for urllib.request.urlretrieve
        -count: number of blocks transferred so far
        -block_size: size of one block in bytes
        -total_size: total size in bytes (not positive when unknown)
    The call with count == 0 only records the start time in the module
    level variable start_time; later calls write one progress line.
    """
    global start_time
    if count == 0:
        start_time = time.time()
        return
    # Clamp so two calls within the clock resolution cannot divide by zero.
    duration = max(time.time() - start_time, 1e-6)
    progress_size = int(count * block_size)
    speed = int(progress_size / (1024 * duration))
    if total_size > 0:
        # The last block may overshoot the real size, so cap at 100.
        percent = min(100, int(progress_size * 100 / total_size))
    else:
        # Size unknown: avoid printing a meaningless negative percentage.
        percent = 0
    sys.stdout.write("\r...%d%%, %d MB, %d KB/s, %d seconds passed" %
                    (percent, progress_size / (1024 * 1024), speed, duration))
    sys.stdout.flush()


def load_lfw():
    """
    Loads the LFW images and labels as [images, labels].

    The cached files lfw_all_images.npy / lfw_all_labels.npy in the
    current directory are used when they can be loaded. Otherwise the
    archive lfw.tgz is downloaded if missing, extracted with tar if lfw/
    is missing, all image files are moved into lfw_all_images/, read with
    keras into an array of shape (n_files,250,250,3), and both arrays are
    saved to the cache files. The labels are the image file names.
    """
    try:
        # read from file
        images = np.load('lfw_all_images.npy')
        labels = np.load('lfw_all_labels.npy')
    except Exception:
        # Any failure to read the cache falls back to rebuilding it, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        print(80*'#')
        print('Did not load locally.')
        print(80*'#')
    else:
        print('Loaded successfully locally')
        return [images, labels]

    # write to file
    if not os.path.exists("lfw.tgz"):
        print('Downloading from source, and saving to disk.')
        print(80*'#')
        import urllib.request
        urllib.request.urlretrieve("http://vis-www.cs.umass.edu/lfw/lfw.tgz", "lfw.tgz",reporthook)
    folder_name = 'lfw/'
    if not os.path.exists(folder_name):
        import subprocess
        try:
            subprocess.run(['tar','zxvf',"lfw.tgz"], check = True)
        except (OSError, subprocess.CalledProcessError) as err:
            # Report instead of silently passing; the listing below will
            # fail with a clear error if the folder is still missing.
            print('Could not extract lfw.tgz:', err)
    import shutil
    folder_names = os.listdir(folder_name)
    n_folders = len(folder_names)
    print(n_folders,' many folder names')
    if not os.path.isdir('lfw_all_images'):
        os.mkdir('lfw_all_images')
        print('Making directory lfw_all_images/')
    for folder in os.listdir('lfw/'):
        for file in os.listdir('lfw/'+folder):
            if not os.path.isfile('lfw_all_images/'+file):
                shutil.move('lfw/'+folder+'/'+file,'lfw_all_images/')
                print('Moving ',file,'to lfw_all_images')

    file_names = os.listdir('lfw_all_images')
    n_files = len(file_names)

    images = np.empty(shape = (n_files,250,250,3))

    from keras.preprocessing import image
    for counter, file in enumerate(file_names):
        img = image.load_img('lfw_all_images/'+file)
        images[counter,:,:,:] = image.img_to_array(img)
    labels = np.array(file_names)
    assert(labels.shape[0]==images.shape[0])
    print(labels.shape)
    # images is already an ndarray; it is saved as is, without making a
    # second full copy of the data set in memory.
    np.save('lfw_all_images.npy',images)
    np.save('lfw_all_labels.npy',labels)
    print('Saved locally')
    return [images,labels]
def ParametersOptimizer(dictionary = None):
    """
    Builds the default hyperparameters shared by the optimizers
        -dictionary: optional dictionary that is filled in place with the
            default [value, description] pairs. A fresh dictionary is
            created when omitted so separate calls never share state.
    Returns a ParameterList wrapping the populated dictionary
    """
    parameters = {} if dictionary is None else dictionary
    parameters['alpha']                         = [1.0, "Initial steplength, or learning rate"]
    parameters['rel_tolerance']                 = [1e-3, "Relative convergence when sqrt(g,g)/sqrt(g_0,g_0) <= rel_tolerance"]
    parameters['abs_tolerance']                 = [1e-4,"Absolute converge when sqrt(g,g) <= abs_tolerance"]
    parameters['globalization']                 = [None, 'Choose from trust_region, line_search or none']

    return ParameterList(parameters)


class Optimizer(ABC):
    """
    This class describes the optimizer used during training

    All children must implement the method minimize, which implements
    one step of the optimizers weight update scheme
    """
    def __init__(self,problem = None,regularization = None, sess = None,parameters = None,comm = None):
        """
        The constructor for this class takes:
            -problem: hessianlearn.problem.Problem class
            -regularization: hessianlearn.problem.Regularization class
            -sess: the tf.Session() used to evaluate the computational graph
            -parameters: the dictionary of hyperparameters for the optimizer.
                (ParametersOptimizer() is built per instance when None)
            -comm: stored and exposed through the comm property
        """
        if parameters is None:
            # Built per instance: a default evaluated in the signature would be
            # one object shared by every optimizer created without parameters.
            parameters = ParametersOptimizer()
        self._problem = problem
        self._regularization = regularization
        self._sess = sess
        self._parameters = parameters
        self._sweeps = 0
        self._comm = comm
        self._iter = 0
        self.H = Hessian(problem=problem,sess=sess)

    @property
    def problem(self):
        return self._problem

    @property
    def sess(self):
        return self._sess

    @property
    def parameters(self):
        return self._parameters

    @property
    def sweeps(self):
        return self._sweeps

    @property
    def comm(self):
        return self._comm

    @property
    def iter(self):
        return self._iter

    @property
    def regularization(self):
        return self._regularization

    @property
    def set_sess(self):
        # Exposes the bound setter, so optimizer.set_sess(sess) works.
        return self._set_sess

    def _set_sess(self,sess):
        r"""
        Sets the tf.Session() on the optimizer and, when present, on its Hessian
        """
        self._sess = sess
        if hasattr(self,'H'):
            self.H._sess = sess

    def minimize(self):
        r"""
        Implements update rule for the algorithm.
        """
        raise NotImplementedError("Child class should implement method minimize")

    def initialize_trust_region(self):
        r"""
        Initializes trust region parameters
        """
        raise NotImplementedError("Child class should implement method initialize_trust_region")

    def _loss_at_candidate(self,p,feed_dict):
        """
        This method implements a function to assist with Armijo line search
            -p: candidate update to be evaluated in Armijo line search producedure
            -feed_dict: data dictionary used to evaluate cost at candidate
        Returns the loss at w + p. The weights are moved back by -p
        afterwards, also when the loss evaluation raises.
        """
        update_ops = self.problem._update_ops
        placeholder = self.problem._update_placeholder
        self.sess.run(update_ops,feed_dict = {placeholder:p})
        try:
            misfit = self.sess.run((self.problem.loss),feed_dict)
        finally:
            # Undo the trial step so the line search always resumes from the
            # unmodified weights.
            self.sess.run(update_ops,feed_dict = {placeholder:-p})
        return misfit
class Identity(object):
    """
    Callable that returns its argument unchanged
    """
    def __init__(self):
        pass

    def __call__(self, x):
        return x


def ParametersGMRESSolver(dictionary = None):
    """
    Builds the default hyperparameters for GMRESSolver
        -dictionary: optional dictionary that is filled in place with the
            default [value, description] pairs. A fresh dictionary is
            created when omitted so separate calls never share state.
    Returns a ParameterList wrapping the populated dictionary
    """
    parameters = {} if dictionary is None else dictionary
    parameters["rel_tolerance"] = [1e-9, "the relative tolerance for the stopping criterion"]
    parameters["abs_tolerance"] = [1e-12, "the absolute tolerance for the stopping criterion"]
    parameters["max_iter"]      = [20, "the maximum number of iterations"]
    parameters["zero_initial_guess"] = [True, "if True we start with a 0 "
                                        "initial guess; if False we use the x as initial guess."]
    parameters["print_level"] = [-1, "verbosity level: -1 --> no output on "
            "screen; 0 --> only final residual at convergence or reason for not not convergence"]

    parameters['coarse_tol'] = [0.5,'coarse tolerance used in calculation '
                                'of relative tolerances for E-W conditions']
    return ParameterList(parameters)


class GMRESSolver(ABC):
    """
    This class implements a GMRES solver
    """
    reason = ["Maximum Number of Iterations Reached",
                "Relative/Absolute residual less than tol",
                "Reached a negative direction",
                "Reached trust region boundary"
                ]
    def __init__(self,problem,regularization,sess = None,preconditioner = None,\
                x = None,parameters = None):
        """
        The constructor for this class takes:
            -problem: hessianlearn.problem.Problem
            -regularization: hessianlearn.problem.Regularization
            -sess: tf.Session()
            -preconditioner: accepted for interface compatibility; not used here
            -x: stored as self.x; problem.gradient is stored when None
            -parameters: hyperparameters dictionary
                (ParametersGMRESSolver() is built per instance when None)
        """
        if parameters is None:
            # Built per instance: a default evaluated in the signature would be
            # one object shared by every solver created without parameters.
            parameters = ParametersGMRESSolver()
        self.sess = sess
        self.problem = problem
        self.regularization = regularization
        if x is None:
            self.x = self.problem.gradient
        else:
            self.x = x
        self.parameters = parameters

        # Action of the regularized Hessian on the direction problem.dw
        self.Aop = self.problem.Hdw + self.regularization.Hdw

    def solve(self,b,feed_dict = None,x_0 = None):
        r"""
        Solve Ax=b by the GMRES method (scipy.sparse.linalg.gmres),
        where A is the action self.Aop evaluated on the data in feed_dict.
            -b: right hand side vector
            -feed_dict: data dictionary used to evaluate the operator
                (a private copy is used; the caller's dictionary is not modified)
            -x_0: optional initial guess; a zero initial guess is used when None
        Returns the (solution, info) pair produced by scipy's gmres.
        self.iter counts the number of callback invocations made by scipy.
        """
        assert self.sess is not None
        assert feed_dict is not None

        import inspect
        from scipy.sparse.linalg import LinearOperator, gmres

        self.iter = 0
        self.converged = False
        self.reason_id = 0

        # problem.dw is overwritten for every operator application; do that on
        # a copy so the dictionary owned by the caller is left untouched.
        feed_dict = dict(feed_dict)

        def Ap(p):
            feed_dict[self.problem.dw] = p
            return self.sess.run(self.Aop,feed_dict = feed_dict)

        n = self.problem.dimension

        A = LinearOperator((n,n), matvec=Ap)

        def update_iters(rk):
            self.iter +=1

        # scipy applies its relative tolerance to norm(b) itself, so the
        # configured tolerances are handed over unmodified (not squared and
        # not pre-multiplied by the initial residual). The relative tolerance
        # keyword was renamed from tol to rtol, so use whichever this scipy has.
        accepted = inspect.signature(gmres).parameters
        rtol_key = 'rtol' if 'rtol' in accepted else 'tol'
        solver_kwargs = {rtol_key: self.parameters["rel_tolerance"],
                            'maxiter': self.parameters["max_iter"],
                            'callback': update_iters}
        if 'atol' in accepted:
            solver_kwargs['atol'] = self.parameters["abs_tolerance"]
        if x_0 is not None:
            solver_kwargs['x0'] = x_0

        return gmres(A, b, **solver_kwargs)
# This file is part of the hessianlearn package
#
# hessianlearn is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or any later version.
#
# hessianlearn is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# If not, see .
#
# Author: Tom O'Leary-Roseberry
# Contact: tom.olearyroseberry@utexas.edu

################################################################################
# Uses some code from https://blog.keras.io/building-autoencoders-in-keras.html
################################################################################

# Example script: trains a single hidden layer MNIST autoencoder with the
# low rank saddle free Newton optimizer and then plots a few reconstructions.

import os
# The logging / OpenMP switches are set before tensorflow is imported so that
# they are already in place when the library initializes.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ["KMP_WARNINGS"] = "FALSE"
import numpy as np
import tensorflow as tf
import time
# if int(tf.__version__[0]) > 1:
# 	import tensorflow.compat.v1 as tf
# 	tf.disable_v2_behavior()
import sys
sys.path.append( os.environ.get('HESSIANLEARN_PATH', "../../"))
from hessianlearn import *

tf.set_random_seed(0)

settings = {}
# Set run specifications
# Data specs
settings['batch_size'] = 100
settings['hess_batch_size'] = 10


################################################################################
# Instantiate data

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()


# Normalize the data
x_train = x_train.astype('float32') / 255.
x_test = x_test.astype('float32') / 255.
# Reshape the data
x_train = x_train.reshape((len(x_train), np.prod(x_train.shape[1:])))
x_test = x_test.reshape((len(x_test), np.prod(x_test.shape[1:])))

# Instantiate the data object
data = Data([x_train,y_train],settings['batch_size'],test_data = [x_test,y_test],hessian_batch_size = settings['hess_batch_size'])

# settings['input_shape'] = data._input_shape
# settings['output_shape'] = data._output_shape


################################################################################
# Create the neural network in keras

encoding_dim = 32
input_img = tf.keras.layers.Input(shape=(784,))
encoded = tf.keras.layers.Dense(encoding_dim, activation='softplus')(input_img)
decoded = tf.keras.layers.Dense(784, activation='sigmoid')(encoded)
autoencoder = tf.keras.models.Model(input_img, decoded)


################################################################################
# Instantiate the problem, regularization.

problem = AutoencoderProblem(autoencoder,dtype=tf.float32)

settings['tikhonov_gamma'] = 0.0

regularization = L2Regularization(problem,gamma = settings['tikhonov_gamma'])


################################################################################
# Instantiate the model object
HLModelSettings = HessianlearnModelSettings()

HLModelSettings['optimizer'] = 'lrsfn'
HLModelSettings['alpha'] = 1e-2
HLModelSettings['globalization'] = 'line_search'
HLModelSettings['hessian_low_rank'] = 20
HLModelSettings['max_backtrack'] = 16
HLModelSettings['max_sweeps'] = 50

HLModelSettings['problem_name'] = 'mnist_ae'
HLModelSettings['record_spectrum'] = False
HLModelSettings['rq_data_size'] = 100


HLModel = HessianlearnModel(problem,regularization,data,settings = HLModelSettings)

HLModel.fit()

################################################################################
# Postprocessing with the trained autoencoder

encoder = tf.keras.models.Model(input_img, encoded)

encoded_input = tf.keras.layers.Input(shape=(encoding_dim,))

decoder_layer = autoencoder.layers[-1]

decoder = tf.keras.models.Model(encoded_input, decoder_layer(encoded_input))

encoded_imgs = encoder.predict(x_test)
decoded_imgs = decoder.predict(encoded_imgs)

# Plotting is best effort (matplotlib or a display may be missing), but the
# reason for skipping it is reported instead of being swallowed, and
# KeyboardInterrupt / SystemExit are no longer caught.
try:
    import matplotlib.pyplot as plt

    n = 10  # how many digits we will display
    plt.figure(figsize=(20, 4))
    for i in range(n):
        # display original
        ax = plt.subplot(2, n, i + 1)
        plt.imshow(x_test[i].reshape(28, 28))
        plt.gray()
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)

        # display reconstruction
        ax = plt.subplot(2, n, i + 1 + n)
        plt.imshow(decoded_imgs[i].reshape(28, 28))
        plt.gray()
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
    plt.show()
except ImportError:
    print('matplotlib is not available, skipping the reconstruction plot')
except Exception as plotting_error:
    print('Skipping the reconstruction plot:', plotting_error)
def make_random_symmetric_matrix(n,p):
    """
    Build a random symmetric n x n matrix U diag(s) U^T, where U is the
    orthogonal factor of a Gaussian matrix and s holds standard normal
    samples raised to the power p.
    """
    U, _ = np.linalg.qr(np.random.randn(n,n))
    ss = np.random.randn(n)**p
    A = np.dot(U, np.dot(np.diag(ss), U.T))
    return A


def compute_Theta_slow(Q, apply_AA):
    """
    Reference (triple loop) evaluation of Theta[i,j,k] = Q[:,i]^T A_k Q[:,j],
    where apply_AA[k] applies the k-th operator to a vector.
    """
    r = Q.shape[1]
    m = len(apply_AA)
    Theta_true = np.zeros((r, r, m))
    for i in range(r):
        for j in range(r):
            for k in range(m):
                Theta_true[i,j,k] = np.dot(Q[:,i], apply_AA[k](Q[:,j]))
    return Theta_true


def compute_rayleigh_statistics_slow(U, apply_AA):
    """
    Reference evaluation of the Rayleigh quotients U[:,i]^T A_k U[:,i].
    Returns their mean and standard deviation over the operators k, one
    entry per column of U.
    """
    m = len(apply_AA)
    r = U.shape[1]
    C = np.zeros((r, m))
    for k in range(m):
        for i in range(r):
            C[i,k] = np.dot(U[:,i], apply_AA[k](U[:,i]))

    all_mu = np.mean(C, axis=1)
    all_std = np.std(C, axis=1)
    return all_mu, all_std


class TestVarianceBasedNystrom(unittest.TestCase):
    """
    Checks the variance based Nystrom routines against slow reference
    computations on a family of randomly perturbed symmetric matrices.
    """

    def setUp(self):
        # Fixed seed so that a failure can be reproduced.
        np.random.seed(0)
        self.n = 500
        m = 50
        p = 7
        self.batch_r = 10
        randomness_factor = 0.1

        A0 = make_random_symmetric_matrix(self.n,p)
        AA = [A0 + randomness_factor * make_random_symmetric_matrix(self.n,p) for _ in range(m)]

        # Ak=Ak binds each matrix at definition time (avoids late binding).
        self.apply_AA = [lambda x, Ak=Ak: np.dot(Ak,x) for Ak in AA]

        self.A = np.sum(AA, axis=0)/m

    def test_all(self):
        # unittest assertions are used instead of bare assert statements,
        # which are stripped under python -O and report no values.
        Y = get_random_range_vectors(self.apply_AA, self.n, self.batch_r)
        Q,_ = np.linalg.qr(Y)
        Theta = compute_Theta(Q, self.apply_AA)
        Theta_true = compute_Theta_slow(Q, self.apply_AA)
        err_Theta = np.linalg.norm(Theta - Theta_true)/np.linalg.norm(Theta_true)
        print('err_Theta=', err_Theta)
        self.assertLess(err_Theta, 1e-10)

        dd, U, V = finish_computing_eigenvalue_decomposition(Q, Theta)

        A_approx = np.dot(U, np.dot(np.diag(dd), U.T))
        err_A_1 = np.linalg.norm(self.A - A_approx)/np.linalg.norm(self.A)
        print('err_A_1=', err_A_1)
        self.assertLess(err_A_1, 1.0)

        # Errors in computing statistics
        all_mu, all_std = compute_rayleigh_statistics(Theta, V)

        all_mu_true, all_std_true = compute_rayleigh_statistics_slow(U,self.apply_AA)

        err_mu = np.linalg.norm(all_mu - all_mu_true)/np.linalg.norm(all_mu_true)
        err_std = np.linalg.norm(all_std - all_std_true)/np.linalg.norm(all_std_true)

        print('err_mu=', err_mu)
        print('err_std=', err_std)
        self.assertLess(err_mu, 1e-10)
        self.assertLess(err_std, 1e-10)

        # Redo computations with better range approximation
        Y2 = get_random_range_vectors(self.apply_AA, self.n, self.batch_r)
        Y2_perp = Y2 - np.dot(Q,np.dot(Q.T, Y2))
        Q2,_ = np.linalg.qr(Y2_perp)
        Q_new = np.hstack([Q, Q2])
        err_Q_orth = np.linalg.norm(np.dot(Q_new.T, Q_new) - np.eye(Q_new.shape[1]))
        print('err_Q_orth=', err_Q_orth)
        self.assertLess(err_Q_orth, 1e-10)

        Theta_new = update_Theta(Q, Q2, Theta, self.apply_AA)

        Theta_true_new = compute_Theta_slow(Q_new, self.apply_AA)

        err_Theta_new = np.linalg.norm(Theta_new - Theta_true_new)/np.linalg.norm(Theta_true_new)
        print('err_Theta_new=', err_Theta_new)

        self.assertLess(err_Theta_new, 1e-10)

        dd_new, U_new, V_new = finish_computing_eigenvalue_decomposition(Q_new, Theta_new)
        A_approx_new = np.dot(U_new, np.dot(np.diag(dd_new), U_new.T))
        err_A_new = np.linalg.norm(self.A - A_approx_new)/np.linalg.norm(self.A)
        print('err_A_new=', err_A_new)

        # The approximation error should decrease monotonically as we increase the range
        self.assertLess(err_A_new, err_A_1)

        # Run the complete method from scratch
        # Only the "good" triple is checked; the second triple is unused
        # here and no longer shadows all_std from above.
        [dd_good, U_good, all_std_good], _all_modes = variance_based_nystrom(self.apply_AA, self.n)

        A_good_approx = np.dot(U_good, np.dot(np.diag(dd_good), U_good.T))
        err_A_good = np.linalg.norm(A_good_approx - self.A)/np.linalg.norm(self.A)
        print('err_A_good=', err_A_good)
        self.assertLess(err_A_good, 0.1)


if __name__ == '__main__':
    unittest.main()
def ParametersInexactNewtonMINRES(parameters = None):
    """
    Default hyperparameters for the inexact Newton MINRES optimizer.

    -parameters: optional dictionary that is filled in place with the
        default [value, description] pairs (existing entries under the
        same keys are overwritten). A fresh dictionary is created when
        None is passed, so separate calls no longer share one dictionary.

    Returns a ParameterList wrapping that dictionary.
    """
    if parameters is None:
        parameters = {}
    parameters['alpha']                         = [1e-1, "Initial steplength, or learning rate"]
    parameters['rel_tolerance']                 = [1e-3, "Relative convergence when sqrt(g,g)/sqrt(g_0,g_0) <= rel_tolerance"]
    parameters['abs_tolerance']                 = [1e-4,"Absolute converge when sqrt(g,g) <= abs_tolerance"]
    parameters['max_NN_evals_per_batch']        = [20, "Scale constant for maximum neural network evaluations per datum"]
    parameters['max_NN_evals']                  = [None, "Maximum number of neural network evaluations"]

    parameters['minres_parameters']             = [ ParametersMINRESSolver(),'MINRES solver parameters']
    # CG solver parameters
    parameters['cg_coarse_tol']                 = [0.5,'CG coarse solve tolerance']
    parameters['cg_max_iter']                   = [1000,'CG maximum iterations']
    parameters['eta_mode']                      = [0, 'eta mode for E-W conditions:0,1,2']
    parameters['globalization']                 = [None, 'Choose from trust_region, line_search or none']
    parameters['max_backtracking_iter']         = [10, 'max backtracking iterations for line search']

    # Reasons for convergence failure
    parameters['reasons']                       = [[], 'list of reasons for termination']

    return ParameterList(parameters)


class InexactNewtonMINRES(Optimizer):
    """
    This class implements the Inexact Newton MINRES optimizer
    """

    def __init__(self,problem,regularization = None,sess = None,parameters = None,preconditioner = None):
        """
        The constructor for this class takes:
            -problem: hessianlearn.problem.Problem
            -regularization: hessianlearn.problem.Regularization
                (L2Regularization with gamma = 0 when None)
            -sess: tf.Session()
            -parameters: hyperparameters dictionary; a fresh
                ParametersInexactNewtonMINRES() is built when None, so
                optimizers do not share one default object
            -preconditioner: hessianlearn.problem.Preconditioner
                (accepted but not used by this class)
        """
        if parameters is None:
            parameters = ParametersInexactNewtonMINRES()
        if regularization is None:
            _regularization = L2Regularization(problem,gamma = 0.0)
        else:
            _regularization = regularization
        super(InexactNewtonMINRES,self).__init__(problem,_regularization,sess,parameters)

        self._sweeps = np.zeros(2)
        self.grad = self.problem.gradient + self.regularization.gradient
        self.minres_solver = MINRESSolver(self.problem,self.regularization,\
                                            self.sess,parameters= self.parameters['minres_parameters'])
        self.alpha = 0.0

    def minimize(self,feed_dict = None,hessian_feed_dict = None):
        r"""
        Updates using inexact Newton MINRES

        -feed_dict: data used for the gradient (and the line search loss)
        -hessian_feed_dict: data used for the Hessian solve, defaults to
            feed_dict

        Raises ValueError for a globalization setting this optimizer does
        not implement; such a call used to return without updating.
        """
        assert self.sess is not None
        assert feed_dict is not None
        if hessian_feed_dict is None:
            hessian_feed_dict = feed_dict

        self.gradient = self.sess.run(self.grad,feed_dict = feed_dict)

        globalization = self.parameters['globalization']
        if globalization == 'line_search':
            w_dir,_ = self.minres_solver.solve(-self.gradient,hessian_feed_dict)
            w_dir_inner_g = np.inner(w_dir,self.gradient)
            initial_cost = self.sess.run(self.problem.loss,feed_dict = feed_dict)
            cost_at_candidate = lambda p : self._loss_at_candidate(p,feed_dict = feed_dict)
            self.alpha, line_search, line_search_iter = ArmijoLineSearch(w_dir,w_dir_inner_g,\
                                                            cost_at_candidate, initial_cost,\
                                            max_backtracking_iter = self.parameters['max_backtracking_iter'])
            update = self.alpha*w_dir
            self._sweeps += [1+0.5*line_search_iter,2*self.minres_solver.iter]
            self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:update})
        elif globalization is None or globalization in ('none','None'):
            # Fixed step length taken from the hyperparameters
            self.alpha = self.parameters['alpha']
            p,converged = self.minres_solver.solve(-self.gradient,hessian_feed_dict)
            # NOTE(review): this branch charges 4 Hessian sweeps per MINRES
            # iteration while the line search branch charges 2 -- confirm
            # which accounting is intended.
            self._sweeps += [1, 4*self.minres_solver.iter]
            self.p = p
            update = self.alpha*p
            self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:update})
        else:
            raise ValueError('InexactNewtonMINRES does not implement globalization = {!r}; '
                                'use \'line_search\' or None'.format(globalization))
def ParametersInexactNewtonGMRES(parameters = None):
    """
    Default hyperparameters for the inexact Newton GMRES optimizer.

    -parameters: optional dictionary that is filled in place with the
        default [value, description] pairs (existing entries under the
        same keys are overwritten). A fresh dictionary is created when
        None is passed, so separate calls no longer share one dictionary.

    Returns a ParameterList wrapping that dictionary.
    """
    if parameters is None:
        parameters = {}
    parameters['alpha']                         = [1e-1, "Initial steplength, or learning rate"]
    parameters['rel_tolerance']                 = [1e-3, "Relative convergence when sqrt(g,g)/sqrt(g_0,g_0) <= rel_tolerance"]
    parameters['abs_tolerance']                 = [1e-4,"Absolute converge when sqrt(g,g) <= abs_tolerance"]
    parameters['max_NN_evals_per_batch']        = [20, "Scale constant for maximum neural network evaluations per datum"]
    parameters['max_NN_evals']                  = [None, "Maximum number of neural network evaluations"]

    parameters['gmres_parameters']              = [ ParametersGMRESSolver(),'GMRES solver parameters']
    # CG solver parameters
    parameters['cg_coarse_tol']                 = [0.5,'CG coarse solve tolerance']
    parameters['cg_max_iter']                   = [1000,'CG maximum iterations']
    parameters['eta_mode']                      = [0, 'eta mode for E-W conditions:0,1,2']
    parameters['globalization']                 = [None, 'Choose from trust_region, line_search or none']
    parameters['max_backtracking_iter']         = [10, 'max backtracking iterations for line search']

    # Reasons for convergence failure
    parameters['reasons']                       = [[], 'list of reasons for termination']

    return ParameterList(parameters)


class InexactNewtonGMRES(Optimizer):
    """
    This class implements the inexact Newton GMRES optimizer
    """
    def __init__(self,problem,regularization = None,sess = None,feed_dict = None,parameters = None,preconditioner = None):
        """
        The constructor for this class takes:
            -problem: hessianlearn.problem.Problem
            -regularization: hessianlearn.problem.Regularization
                (L2Regularization with gamma = 0 when None)
            -sess: tf.Session()
            -feed_dict: accepted for signature compatibility, not used here
            -parameters: hyperparameters dictionary; a fresh
                ParametersInexactNewtonGMRES() is built when None, so
                optimizers do not share one default object
            -preconditioner: hessianlearn.problem.Preconditioner
                (accepted but not used by this class)
        """
        if parameters is None:
            parameters = ParametersInexactNewtonGMRES()
        if regularization is None:
            _regularization = L2Regularization(problem,gamma = 0.0)
        else:
            _regularization = regularization
        super(InexactNewtonGMRES,self).__init__(problem,_regularization,sess,parameters)

        self._sweeps = np.zeros(2)
        self.grad = self.problem.gradient + self.regularization.gradient
        self.gmres_solver = GMRESSolver(self.problem,self.regularization,\
                                            self.sess,parameters= self.parameters['gmres_parameters'])
        self.alpha = 0.0

    def minimize(self,feed_dict = None,hessian_feed_dict = None):
        r"""
        Updates using inexact Newton GMRES

        -feed_dict: data used for the gradient (and the line search loss)
        -hessian_feed_dict: data used for the Hessian solve, defaults to
            feed_dict

        Raises ValueError for a globalization setting this optimizer does
        not implement; such a call used to return without updating.
        """
        assert self.sess is not None
        assert feed_dict is not None
        if hessian_feed_dict is None:
            hessian_feed_dict = feed_dict

        self.gradient = self.sess.run(self.grad,feed_dict = feed_dict)

        globalization = self.parameters['globalization']
        if globalization == 'line_search':
            # The second value returned by the solver is not used here
            w_dir,_ = self.gmres_solver.solve(-self.gradient,hessian_feed_dict)
            w_dir_inner_g = np.inner(w_dir,self.gradient)
            initial_cost = self.sess.run(self.problem.loss,feed_dict = feed_dict)
            cost_at_candidate = lambda p : self._loss_at_candidate(p,feed_dict = feed_dict)
            self.alpha, line_search, line_search_iter = ArmijoLineSearch(w_dir,w_dir_inner_g,\
                                                            cost_at_candidate, initial_cost,\
                                            max_backtracking_iter = self.parameters['max_backtracking_iter'])
            update = self.alpha*w_dir
            self._sweeps += [1+0.5*line_search_iter,2*self.gmres_solver.iter]
            self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:update})
        elif globalization is None or globalization in ('none','None'):
            # Fixed step length taken from the hyperparameters
            self.alpha = self.parameters['alpha']
            p,_ = self.gmres_solver.solve(-self.gradient,hessian_feed_dict)
            self._sweeps += [1, 2*self.gmres_solver.iter]
            self.p = p
            update = self.alpha*p
            self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:update})
        else:
            raise ValueError('InexactNewtonGMRES does not implement globalization = {!r}; '
                                'use \'line_search\' or None'.format(globalization))
class Identity(object):
    """
    Identity operator: calling it returns its argument unchanged.
    """
    def __init__(self):
        pass

    def __call__(self, x):
        return x


def ParametersMINRESSolver(dictionary = None):
    """
    Default parameters for MINRESSolver.

    -dictionary: optional dictionary that is filled in place with the
        default [value, description] pairs. A fresh dictionary is created
        when None is passed, so separate calls no longer share one.

    Returns a ParameterList wrapping that dictionary.
    """
    parameters = {} if dictionary is None else dictionary
    parameters["rel_tolerance"] = [1e-9, "the relative tolerance for the stopping criterion"]
    parameters["abs_tolerance"] = [1e-12, "the absolute tolerance for the stopping criterion"]
    parameters["max_iter"]      = [20, "the maximum number of iterations"]
    # The descriptions below are single strings; the former backslash
    # continuations embedded the source indentation in the help text.
    parameters["zero_initial_guess"] = [True, "if True we start with a 0 "
                                        "initial guess; if False we use the x as initial guess."]
    parameters["print_level"] = [-1, "verbosity level: -1 --> no output on "
                                    "screen; 0 --> only final residual at convergence or reason for not not convergence"]

    parameters['coarse_tol'] = [0.5,'coarse tolerance used in calculation '
                                    'of relative tolerances for E-W conditions']
    return ParameterList(parameters)


class MINRESSolver(ABC):
    """
    This class implements a basic MINRES Solver
    """

    reason = ["Maximum Number of Iterations Reached",
                "Relative/Absolute residual less than tol",
                "Reached a negative direction",
                "Reached trust region boundary"
                ]
    def __init__(self,problem,regularization,sess = None,preconditioner = None,\
                    x = None,parameters = None):
        """
        The constructor for this class takes:
            -problem: hessianlearn.problem.Problem
            -regularization: hessianlearn.problem.Regularization
            -sess: tf.Session()
            -preconditioner: hessianlearn.problem.Preconditioner
                (accepted but not used)
            -x: stored as self.x, defaults to problem.gradient
            -parameters: solver parameters; a fresh
                ParametersMINRESSolver() is built when None
        """
        self.sess = sess
        self.problem = problem
        self.regularization = regularization
        if x is None:
            self.x = self.problem.gradient
        else:
            self.x = x
        if parameters is None:
            parameters = ParametersMINRESSolver()
        self.parameters = parameters

        # Operator whose action is evaluated in solve
        self.Aop = self.problem.Hdw + self.regularization.Hdw

    def solve(self,b,feed_dict = None,x_0 = None):
        r"""
        Solve Ax=b by the minimal residual iteration
        as defined in Iterative Methods Ed. 2 by Youssef Saad p 140

            r = b - A x,  p = A r
            alpha = (p,r)/(p,p),  x <- x + alpha r,  r <- r - alpha p

        -b: right hand side (numpy array)
        -feed_dict: data used to evaluate the operator. It is updated in
            place: the key self.problem.dw is overwritten on every
            operator application.
        -x_0: optional initial guess (None means zero)

        Returns (x, converged). The iteration stops when the squared
        residual drops below max(rel_tolerance^2 |r_0|^2, abs_tolerance^2),
        when max_iter iterations were taken, or when the next direction
        p = A r has negative curvature p^T A p < 0, in which case the
        last accepted iterate is returned.
        """
        assert self.sess is not None
        assert feed_dict is not None

        def A(v):
            feed_dict[self.problem.dw] = v
            return self.sess.run(self.Aop,feed_dict = feed_dict)

        self.iter = 0
        self.converged = False
        self.reason_id = 0

        if x_0 is None:
            x = np.zeros_like(b)
        else:
            x = np.array(x_0)
        # Initial residual
        r = b - A(x)
        # Squared tolerances, compared against the squared residual norm
        rr = np.dot(r,r)
        rtol2 = rr * self.parameters["rel_tolerance"] * self.parameters["rel_tolerance"]
        atol2 = self.parameters["abs_tolerance"] * self.parameters["abs_tolerance"]
        tol = max(rtol2, atol2)
        max_iter = self.parameters["max_iter"]

        # Covers b = 0, which used to produce a 0/0 step length
        self.converged = bool(rr < tol)
        if self.converged:
            self.reason_id = 1
            return x, self.converged

        p = A(r)
        while self.iter < max_iter:
            pp = np.dot(p,p)
            if pp == 0:
                # A r = 0: the residual cannot be reduced along r
                break
            self.iter +=1
            # Minimal residual step length; dividing by (p,p) makes the
            # residual norm non-increasing (the old code divided by a
            # stale (r,r)).
            alpha = np.dot(p,r)/pp
            # New arrays are built instead of updating in place so that x
            # still holds the previous iterate if the step is rejected.
            x_candidate = x + alpha*r
            r_candidate = r - alpha*p
            p_candidate = A(r_candidate)
            # This is the extra query of the network to see if the direction
            # is about to rescale gradient components in indefinite directions
            # towards saddle points in which case one can break before
            # updating
            if np.dot(p_candidate,A(p_candidate)) < 0:
                self.reason_id = 2
                return x, False
            x, r, p = x_candidate, r_candidate, p_candidate
            if np.dot(r,r) < tol:
                self.converged = True
                self.reason_id = 1
                break

        return x, self.converged


class Hessian(ABC):
    """
    This class implements methods for the neural network training Hessian.

    Must have a problem and a sess in order to be evaluated
    """
    def __init__(self,problem=None,sess=None):
        """
        Create a Hessian given:

            - problem: the description of the neural network training problem
                (hessianlearn.problem.Problem)
            - sess: the tf.Session() needed for evaluation at run time
        """
        self._problem = problem
        self._sess = sess

    @property
    def problem(self):
        return self._problem

    @property
    def sess(self):
        return self._sess

    @property
    def dimension(self):
        return self.problem.dimension

    @property
    def T(self):
        # NOTE(review): this returns the bound method _T, not the operator
        # itself; callers get the Hessian back via H.T(). Kept unchanged
        # for compatibility.
        return self._T

    def _T(self):
        return self

    def __mult__(self,x):
        # NOTE(review): __mult__ is not a Python operator hook, and the
        # call below omits the required feed_dict. Kept unchanged.
        return self(x)

    def _apply_columnwise(self,x,feed_dict):
        """Apply the Hessian to each column of x with single vector products."""
        HdW = np.zeros_like(x)
        for i in range(x.shape[-1]):
            feed_dict[self.problem.dw] = x[:,i]
            HdW[:,i] = self.sess.run(self.problem.Hdw,feed_dict)
        return HdW

    def _apply_padded(self,x,feed_dict):
        """Apply the blocked Hessian to x (fewer columns than the block size) by zero padding."""
        n_vectors = x.shape[-1]
        dW = np.zeros((self.problem.dimension,self.problem._hessian_block_size))
        dW[:,:n_vectors] = x
        feed_dict[self.problem._dW] = dW
        HdW = self.sess.run(self.problem.HdW,feed_dict)
        return HdW[:,:n_vectors]

    def __call__(self,x,feed_dict,verbose = False):
        """
        This method implements Hessian action, must have a problem and sess
        set before this method can be evaluated.
            -x: numpy array to be multiplied one at a time
                (a vector, or a matrix whose columns are the vectors)
            -feed_dict: data used in finite sum Hessian evaluation;
                updated in place with the direction placeholders
            -verbose: for printing

        Raises NotImplementedError (a RuntimeError) when x is neither a
        vector nor a matrix.
        """
        assert self.problem is not None
        assert self.sess is not None

        if len(x.shape) == 1:
            feed_dict[self.problem.dw] = x
            return self.sess.run(self.problem.Hdw,feed_dict)
        if len(x.shape) != 2:
            # Many different Hessian mat-vecs interpreted as a tensor?
            raise NotImplementedError('This case is not yet implemented')

        n_vectors = x.shape[-1]
        if self.problem._HdW is None:
            if verbose:
                print('Total vectors = ',n_vectors)
                print('Initializing Hessian blocking')
            self.problem._initialize_hessian_blocking(n_vectors)
        block_size = self.problem._hessian_block_size

        # When the block sizes agree
        if n_vectors == block_size:
            feed_dict[self.problem._dW] = x
            return self.sess.run(self.problem.HdW,feed_dict)

        # When the requested block size is smaller
        if n_vectors < block_size:
            # The speedup is roughly 5x, so in the case that its less
            # than 1/5 its faster to for loop around running problem.Hdw
            if n_vectors < 0.2*block_size:
                return self._apply_columnwise(x,feed_dict)
            return self._apply_padded(x,feed_dict)

        # When the requested block size is larger
        HdW = np.zeros_like(x)
        blocks, remainder = divmod(n_vectors,block_size)
        for i in range(blocks):
            columns = slice(i*block_size,(i+1)*block_size)
            feed_dict[self.problem._dW] = x[:,columns]
            HdW[:,columns] = self.sess.run(self.problem.HdW,feed_dict)
        # The last vectors are done as a for loop or a zeroed out array
        if remainder > 0:
            tail = x[:,blocks*block_size:]
            if remainder < 0.2*block_size:
                HdW[:,blocks*block_size:] = self._apply_columnwise(tail,feed_dict)
            else:
                HdW[:,blocks*block_size:] = self._apply_padded(tail,feed_dict)
        return HdW

    def quadratics(self,x,feed_dict,verbose = False):
        """
        This method implements Hessian quadratics xTHx.
        Must have self._problem and self._sess set before this method can be evaluated.
            -x: numpy array to be multiplied one at a time
                (a vector, or a matrix whose columns are the vectors)
            -feed_dict: data used in finite sum Hessian evaluation
            -verbose: for printing (shows a tqdm progress bar when tqdm
                can be imported)

        Raises NotImplementedError (a RuntimeError) when x is neither a
        vector nor a matrix.
        """
        assert self.problem is not None
        assert self.sess is not None
        if len(x.shape) == 1:
            feed_dict[self.problem.dw] = x
            return self.sess.run(self.problem.H_quadratic,feed_dict)
        if len(x.shape) != 2:
            raise NotImplementedError('This case is not yet implemented')

        number_of_quadratics = x.shape[1]
        H_quads = np.zeros(number_of_quadratics)
        indices = range(number_of_quadratics)
        if verbose:
            # Only the optional import is guarded, so failures inside the
            # evaluation loop are no longer hidden and retried.
            try:
                from tqdm import tqdm
            except ImportError:
                print('No progress bar :(')
            else:
                indices = tqdm(indices)
        for i in indices:
            feed_dict[self.problem.dw] = x[:,i]
            H_quads[i] = self.sess.run(self.problem.H_quadratic,feed_dict)
        return H_quads


class HessianWrapper:
    """
    Binds a Hessian to a fixed data dictionary so that it can be called
    with the direction(s) only.
    """

    def __init__(self,hessian,data_dictionary):
        self._hessian = hessian
        self._data_dictionary = data_dictionary

    def __call__(self,x):
        return self._hessian(x,self._data_dictionary)
#
# hessianlearn is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# If not, see .
#
# Author: Tom O'Leary-Roseberry
# Contact: tom.olearyroseberry@utexas.edu

from __future__ import absolute_import, division, print_function
import time
import sys
import numpy as np


from scipy.linalg import cholesky, eigh, solve_triangular, qr, rq


def _extend_orthonormal_basis(big_Q, Q):
    """
    Return an orthonormal basis spanning the columns of big_Q and Q.

    big_Q may be None (no basis yet), in which case Q is returned as is.
    """
    if big_Q is None:
        return Q
    # Project the new block off the existing basis before appending it.
    Q = Q - big_Q@(big_Q.T@Q)
    big_Q = np.concatenate((big_Q,Q),axis = 1)
    # This QR gets slow after many iterations, only last columns
    # need to be orthonormalized
    big_Q,_ = np.linalg.qr(big_Q)
    return big_Q


def block_range_finder(A_op,n,epsilon,block_size,verbose = False,seed = 0):
    """
    Randomized algorithm for block range finding

    Parameters:
    -----------
    A_op : {Callable} n x n symmetric matrix
            Hermitian matrix operator whose range needs to be estimated
            y = A_op(dw) is the action of A on the columns of dw
    n : size of matrix A
    epsilon : relative reduction in error; iteration stops once
            ||(I - QQ^T) A w|| < epsilon * ||A w|| for one fixed Gaussian
            probe vector w
    block_size : number of random vectors sampled per iteration (>= 1)
    verbose : print the error estimate at every iteration
    seed : seed of the numpy RandomState used for all random samples

    Returns:
    --------
    Q : range for A_op, an array with n rows and orthonormal columns

    Raises:
    -------
    ValueError : if block_size is smaller than one
    """
    # Taken from http://people.maths.ox.ac.uk/martinsson/Pubs/2015_randQB.pdf
    if block_size < 1:
        raise ValueError('block_size must be at least 1, got {}'.format(block_size))

    my_state = np.random.RandomState(seed=seed)
    w = my_state.randn(n,1)
    Action = A_op(w)
    initial_error = np.linalg.norm(Action)
    big_Q = None
    converged = False
    iteration = 0
    while not converged:
        # Sample Gaussian random matrix
        Omega = my_state.randn(n,block_size)
        # Perform QR on action
        Q,_ = np.linalg.qr(A_op(Omega))
        # Update basis
        big_Q = _extend_orthonormal_basis(big_Q,Q)
        # Error estimation
        Approximate_Error = Action - big_Q@(big_Q.T@Action)
        error = np.linalg.norm(Approximate_Error)
        converged = error < epsilon*initial_error
        iteration+=1
        if verbose:
            # Guard the ratio so a zero operator does not divide by zero.
            relative_error = error/initial_error if initial_error > 0 else float('nan')
            print('At iteration', iteration, ' error/initial_error is ',relative_error,' tolerance is ',epsilon,' converged = ',converged)
        # Once more than n vectors have been sampled the basis is complete.
        if iteration > n//block_size:
            break
    # I believe that the extra action of A_op in forming B for the QB factorization
    # is cheaper to do once after the fact, and is not needed for the matrix
    # free randomized error estimator. For this reason I just return Q, and
    # do not form B.
    return big_Q


def noise_aware_adaptive_range_finder(Hessian,hessian_feed_dict,rq_estimator_dict_list,\
                            block_size = None,noise_tolerance = 1.0,epsilon = 1e-1, max_vectors = 20, verbose = False,seed = 0):
    """
    Randomized algorithm for noise aware block range finding (N.A.A.R.F.)

    Parameters:
    -----------
    Hessian : object exposing `dimension`, a call
            Hessian(x,feed_dict,verbose = ...) returning the Hessian action
            on the columns of x, and Hessian.quadratics(x,feed_dict)
            returning one quadratic form x_i^T H x_i per column of x
    hessian_feed_dict : data dictionary used for the Hessian actions
    rq_estimator_dict_list : list of data dictionaries; each one gives an
            independent sample of the quadratics along the newest
            basis vectors
    block_size : vectors sampled per iteration, default max(1,int(0.01*n))
    noise_tolerance : iteration stops when |mean|/std of the sampled
            quadratics drops below this value for any new direction
    epsilon : absolute tolerance on the operator error estimate
    max_vectors : iteration stops once the basis has this many columns
    verbose : print progress (uses tqdm when it is installed)
    seed : seed of the numpy RandomState used for all random samples

    Returns:
    --------
    Q : range for dominant eigenmodes of Hessian

    Raises:
    -------
    TypeError : if rq_estimator_dict_list is not a list
    ValueError : if block_size is smaller than one
    """

    ###################################################################################
    if not isinstance(rq_estimator_dict_list,list):
        raise TypeError('rq_estimator_dict_list must be a list of data dictionaries')
    n = Hessian.dimension
    if block_size is None:
        # int(0.01*n) is zero for n < 100, which used to produce empty
        # sample blocks followed by a division by zero.
        block_size = max(1,int(0.01*n))
    if block_size < 1:
        raise ValueError('block_size must be at least 1, got {}'.format(block_size))
    my_state = np.random.RandomState(seed=seed)
    w = my_state.randn(n,1)

    H = lambda x: Hessian(x,hessian_feed_dict,verbose = verbose)
    Action = H(w)
    big_Q = None
    converged = False
    iteration = 0

    # Only the tqdm import is optional. Errors raised while evaluating the
    # quadratics must propagate instead of being reported as a tqdm issue.
    progress_bar = None
    if verbose:
        try:
            from tqdm import tqdm as progress_bar
        except ImportError:
            print('Issue with tqdm')

    while not converged:
        # Sample Gaussian random matrix
        Omega = my_state.randn(n,block_size)
        # Perform QR on action
        Q,_ = np.linalg.qr(H(Omega))
        # Update basis
        big_Q = _extend_orthonormal_basis(big_Q,Q)
        # Error estimation is both for operator error
        # as well as spectral noise
        # Operator error estimation
        Approximate_Error = Action - big_Q@(big_Q.T@Action)
        operator_error = np.linalg.norm(Approximate_Error)
        # Noise error estimation
        rq_direction = big_Q[:,-block_size:]
        RQ_samples = np.zeros((len(rq_estimator_dict_list),rq_direction.shape[1]))
        if progress_bar is None:
            sample_iterator = rq_estimator_dict_list
        else:
            sample_iterator = progress_bar(rq_estimator_dict_list)
        for samp_i,sample_dictionary in enumerate(sample_iterator):
            RQ_samples[samp_i] = Hessian.quadratics(rq_direction,sample_dictionary)

        # A zero standard deviation gives an infinite (or nan) ratio, which
        # is never "too noisy"; silence the numpy warning it would raise.
        with np.errstate(divide = 'ignore',invalid = 'ignore'):
            rq_snr = np.abs(np.mean(RQ_samples,axis=0))/np.std(RQ_samples,axis = 0)
        too_noisy = (rq_snr < noise_tolerance).any()
        converged = (operator_error < epsilon) or too_noisy

        iteration+=1
        if verbose:
            print('At iteration', iteration, 'operator error is ',operator_error,' convergence = ',(operator_error < epsilon))
        if big_Q.shape[-1] >= max_vectors:
            break

        if iteration > n//block_size:
            break
    # I believe that the extra action of A_op in forming B for the QB factorization
    # is cheaper to do once after the fact, and is not needed for the matrix
    # free randomized error estimator. For this reason I just return Q, and
    # do not form B.
    return big_Q


# --------------------------------------------------------------------------------
# /hessianlearn/algorithms/inexactNewtonCG.py:
# --------------------------------------------------------------------------------
# This file is part of the hessianlearn package
#
# hessianlearn is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or any later version.
#
# hessianlearn is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# If not, see .
#
# Author: Tom O'Leary-Roseberry
# Contact: tom.olearyroseberry@utexas.edu

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

from ..utilities.parameterList import ParameterList
from ..algorithms import Optimizer, CGSolver, ParametersCGSolver
from ..algorithms.globalization import ArmijoLineSearch, TrustRegion
from ..problem import L2Regularization


def ParametersInexactNewtonCG(parameters = None):
    """
    Build the hyperparameter list for the inexact Newton CG optimizer.
        -parameters: optional dictionary that is filled in place with the
            default [value, description] entries; a fresh dictionary is
            created on every call when it is omitted

    Returns a ParameterList wrapping that dictionary.
    """
    # A `{}` default would be one dictionary shared by every call.
    if parameters is None:
        parameters = {}
    parameters['alpha']                         = [1e0, "Initial steplength, or learning rate"]
    parameters['rel_tolerance']                 = [1e-3, "Relative convergence when sqrt(g,g)/sqrt(g_0,g_0) <= rel_tolerance"]
    parameters['abs_tolerance']                 = [1e-4,"Absolute converge when sqrt(g,g) <= abs_tolerance"]
    parameters['max_NN_evals_per_batch']        = [10000, "Scale constant for maximum neural network evaluations per datum"]
    parameters['max_NN_evals']                  = [None, "Maximum number of neural network evaluations"]

    parameters['cg_parameters']                 = [ ParametersCGSolver(),'CG Parameters']
    # CG solver parameters
    parameters['cg_coarse_tol']                 = [0.5,'CG coarse solve tolerance']
    parameters['cg_max_iter']                   = [10,'CG maximum iterations']
    parameters['eta_mode']                      = [0, 'eta mode for E-W conditions:0,1,2']
    parameters['globalization']                 = [None, 'Choose from trust_region, line_search or none']
    parameters['max_backtracking_iter']         = [10, 'max backtracking iterations for line search']

    # Reasons for convergence failure
    parameters['reasons']                       = [[], 'list of reasons for termination']

    return ParameterList(parameters)


class InexactNewtonCG(Optimizer):
    """
    This class implements the inexact Newton CG optimizer
    """
    def __init__(self,problem,regularization = None,sess = None,feed_dict = None,\
            parameters = None,preconditioner = None):
        """
        The constructor for this class takes:
            -problem: hessianlearn.problem.Problem
            -regularization: hessianlearn.problem.Regularization
                (defaults to L2Regularization with gamma = 0.0)
            -sess: tf.Session()
            -feed_dict: accepted for interface compatibility, not used here
            -parameters: hyperparameters dictionary
                (defaults to a fresh ParametersInexactNewtonCG())
            -preconditioner: hessianlearn.problem.Preconditioner
                (accepted for interface compatibility, not used here)
        """
        # Build the default per instance. A default evaluated in the
        # signature would be one object shared (and mutated, see
        # initialize_trust_region) by every optimizer.
        if parameters is None:
            parameters = ParametersInexactNewtonCG()
        if regularization is None:
            _regularization = L2Regularization(problem,gamma = 0.0)
        else:
            _regularization = regularization
        super(InexactNewtonCG,self).__init__(problem,_regularization,sess,parameters)

        self.grad = self.problem.gradient + self.regularization.gradient
        self.cg_solver = CGSolver(self.problem,self.regularization,self.sess,parameters= self.parameters['cg_parameters'])
        self._sweeps = np.zeros(2)
        self.trust_region_initialized = False
        if self.parameters['globalization'] == 'trust_region':
            self.initialize_trust_region()
        self.alpha = 0.0

    def initialize_trust_region(self):
        """
        Initializes trust region
        """
        if not self.parameters['globalization'] == 'trust_region':
            self.parameters['globalization'] = 'trust_region'
        self.trust_region = TrustRegion()
        self.cg_solver.initialize_trust_region(coarse_tol = self.parameters['cg_coarse_tol'])
        self.cg_solver.set_trust_region_radius(self.trust_region.radius)
        self.trust_region_initialized = True

    def minimize(self,feed_dict = None,hessian_feed_dict = None):
        r"""
        Solves using inexact Newton CG algorithm
            -feed_dict: the data dictionary used for evaluating stochastic gradients and cost
            -hessian_feed_dict: smaller data dictionary used for stochastic Hessian
                (defaults to feed_dict)
        """
        assert self.sess is not None
        assert feed_dict is not None
        if hessian_feed_dict is None:
            hessian_feed_dict = feed_dict

        gradient = self.sess.run(self.grad,feed_dict = feed_dict)

        globalization = self.parameters['globalization']
        # NOTE(review): any other value (e.g. the string 'none' suggested by
        # the parameter description) matches no branch and no step is taken.
        if globalization is None:
            # Fixed step length along the CG direction.
            self.alpha = self.parameters['alpha']
            p,on_boundary = self.cg_solver.solve(-gradient,hessian_feed_dict)
            self._sweeps += [1,2*self.cg_solver.iter]
            self.p = p
            update = self.alpha*p
            self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:update})

        elif globalization == 'line_search':
            w_dir,on_boundary = self.cg_solver.solve(-gradient,hessian_feed_dict)
            w_dir_inner_g = np.inner(w_dir,gradient)
            initial_cost = self.sess.run(self.problem.loss,feed_dict = feed_dict)
            cost_at_candidate = lambda p : self._loss_at_candidate(p,feed_dict = feed_dict)
            self.alpha, line_search, line_search_iter = ArmijoLineSearch(w_dir,w_dir_inner_g,\
                                                            cost_at_candidate, initial_cost,\
                                                    max_backtracking_iter = self.parameters['max_backtracking_iter'])
            update = self.alpha*w_dir
            self._sweeps += [1+0.5*line_search_iter,2*self.cg_solver.iter]
            self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:update})

        elif globalization == 'trust_region':
            if not self.trust_region_initialized:
                self.initialize_trust_region()
            # Set trust region radius
            self.cg_solver.set_trust_region_radius(self.trust_region.radius)
            # Solve for candidate step
            p, on_boundary = self.cg_solver.solve(-gradient,hessian_feed_dict)
            pg = np.dot(p,gradient)
            self._sweeps += [1,2*self.cg_solver.iter]
            self.p = p
            # Calculate predicted reduction. Feed the direction through a
            # copy so the caller's dictionary is left untouched.
            # NOTE(review): the curvature is evaluated on feed_dict, not on
            # hessian_feed_dict that CG used -- confirm this is intended.
            quadratic_feed_dict = dict(feed_dict)
            quadratic_feed_dict[self.cg_solver.problem.dw] = p
            Hp = self.sess.run(self.cg_solver.Aop,quadratic_feed_dict)
            pHp = np.dot(p,Hp)
            predicted_reduction = -pg-0.5*pHp
            # Calculate actual reduction
            misfit,reg = self.sess.run((self.problem.loss,self.regularization.cost),\
                                        feed_dict = feed_dict)
            cost = misfit + reg
            # Take the candidate step, then measure the cost at the new point.
            self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:p})

            misfit,reg = self.sess.run((self.problem.loss,self.regularization.cost),\
                                        feed_dict = feed_dict)
            cost_new = misfit + reg
            actual_reduction = cost - cost_new

            # Decide whether or not to accept the step
            accept_step = self.trust_region.evaluate_step(actual_reduction = actual_reduction,\
                                    predicted_reduction = predicted_reduction,on_boundary = on_boundary)
            if not accept_step:
                # Rejected: move the weights back by undoing the step.
                # This used to feed the step p into the weight assignment
                # op, which overwrote the weights with the step itself.
                # NOTE(review): presumably _update_ops adds the placeholder
                # to the weights (it is used that way above), so -p undoes
                # the step up to round-off; verify against the Problem class.
                self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:-p})


# --------------------------------------------------------------------------------
# /hessianlearn/algorithms/randomizedEigensolver.py:
# --------------------------------------------------------------------------------
# This file is part of the hessianlearn package
#
# hessianlearn is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or any later version.
#
# hessianlearn is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# If not, see .
#
# Author: Tom O'Leary-Roseberry
# Contact: tom.olearyroseberry@utexas.edu

from __future__ import absolute_import, division, print_function
import time
import sys
import numpy as np


from scipy.linalg import cholesky, eigh, solve_triangular, qr, rq


def low_rank_hessian(optimizer,feed_dict,k,p=None,verbose = False):
    """
    Rank k eigendecomposition of the Hessian exposed by optimizer.H
        -optimizer: object with H(x,feed_dict) and problem.dimension
        -feed_dict: data dictionary passed to every Hessian action
        -k: number of eigenpairs
        -p: oversampling, see randomized_eigensolver
        -verbose: for printing
    """
    H = lambda x: optimizer.H(x,feed_dict)
    n = optimizer.problem.dimension
    return randomized_eigensolver(H, n, k,p = p,verbose = verbose)


def _dominant_eigenpairs(Aop, Q, k, verbose = False):
    """
    Rayleigh-Ritz step: eigenpairs of Q^T A Q lifted back through Q,
    keeping the k eigenvalues of largest magnitude.
    """
    if verbose:
        print('Forming small square matrix')
    T = Q.T@Aop(Q)
    # Eigenvalue problem for T
    if verbose:
        print('Computing eigenvalue decomposition')
    d, V = eigh(T)
    # sort by absolute value (we want the k largest eigenvalues regardless of sign)
    sort_perm = np.abs(d).argsort()[::-1][:k]
    # Compute eigenvectors
    return d[sort_perm], np.dot(Q, V[:, sort_perm])


def randomized_eigensolver(Aop, n, k, p = None,seed = 0,verbose = False):
    """
    Randomized algorithm for Hermitian eigenvalue problems
    Returns k largest (in magnitude) eigenvalues computed using the
    randomized algorithm


    Parameters:
    -----------
    Aop : {Callable} n x n
          Hermitian matrix operator whose eigenvalues need to be estimated
          y = Aop(dw) is the action of A in the direction dw

    n : int,
          number of row/columns of the operator A

    k :  int,
        number of eigenvalues/vectors to be estimated, k <= n
    p :  int, optional
        oversampling parameter which can improve accuracy of resulting solution
        Default: int(0.01*k), i.e. no oversampling for k < 100. It is
        reduced so that k + p never exceeds n.
    seed : int, optional
        seed of the numpy RandomState that draws the Gaussian samples
    verbose : bool, optional
        print progress messages

    Returns:
    --------

    d : ndarray, (k,)
        eigenvalues arranged in descending order of magnitude
    U : ndarray, (n,k)
        eigenvectors arranged according to eigenvalues

    Raises:
    -------
    ValueError : if k is larger than n

    References:
    -----------
    .. [1] Halko, Nathan, Per-Gunnar Martinsson, and Joel A. Tropp. "Finding structure with randomness: Probabilistic algorithms for constructing approximate matrix decompositions." SIAM review 53.2 (2011): 217-288.
    Examples:
    ---------
    >>> import numpy as np
    >>> n = 100
    >>> A = np.diag(0.95**np.arange(n))
    >>> Aop = lambda dw: np.dot(A,dw)
    >>> k = 10
    >>> p = 5
    >>> lmbda, U = randomized_eigensolver(Aop, n, k, p)
    """
    # Validate before any operator action is spent on an impossible request.
    if k > n:
        raise ValueError('Cannot estimate k = {} eigenpairs of an operator of size n = {}'.format(k,n))
    if n == k:
        p = 0
    elif p is None:
        p = int(0.01*k)
    if k+p > n:
        p = n - k
    random_state = np.random.RandomState(seed=seed)
    Omega = random_state.randn(n,k+p)
    Y = Aop(Omega)

    # Orthonormal basis for the sampled range of A.
    Q,_ = qr(Y, mode = 'economic')
    return _dominant_eigenpairs(Aop, Q, k, verbose = verbose)


def eigensolver_from_range(Aop, Q,verbose = False):
    """
    Eigenpairs of a Hermitian operator restricted to a given range
    Returns all r eigenpairs of Q^T A Q, lifted back through Q


    Parameters:
    -----------
    Aop : {Callable} n x n
          Hermitian matrix operator whose eigenvalues need to be estimated
          y = Aop(dw) is the action of A in the direction dw
    Q : Array n x r
          basis for the range; the result is only meaningful as an
          eigendecomposition when its columns are orthonormal
    verbose : bool, optional
          print progress messages

    Returns:
    --------

    d : ndarray, (r,)
        eigenvalues arranged in descending order of magnitude
    U : ndarray, (n,r)
        eigenvectors arranged according to eigenvalues
    """
    return _dominant_eigenpairs(Aop, Q, Q.shape[1], verbose = verbose)


def randomized_double_pass_eigensolver(Aop, Y, k):
    """
    Randomized double pass algorithm for Hermitian eigenvalue problems

    Not available: this function always raises and is kept only so the
    name stays importable until it is reimplemented.

    Parameters:
    -----------
    Aop : {Callable} n x n
          Hermitian matrix operator whose eigenvalues need to be estimated
          y = Aop(dw) is the action of A in the direction dw
    Y = Aop(Omega) : precomputed action of Aop on Omega, a m x n Array of (presumably) sampled Gaussian or l-percent sparse random vectors (row)
    k :  int,
        number of eigenvalues/vectors to be estimated, 0 < k < m

    Raises:
    -------
    NotImplementedError : always (a subclass of Exception, which this
        function raised before)

    References:
    -----------
    .. [1] Halko, Nathan, Per-Gunnar Martinsson, and Joel A. Tropp. "Finding structure with randomness: Probabilistic algorithms for constructing approximate matrix decompositions." SIAM review 53.2 (2011): 217-288.
    .. [2] Algorithm 2 of Arvind paper
    """
    # The previous draft sat unreachable below this raise: QR of Y.T, the
    # small matrix T from a second pass of Aop over Q, eigh(T), keep the
    # k eigenvalues of largest magnitude, return the eigenvectors as rows.
    raise NotImplementedError("Need to reimplement this function")


# --------------------------------------------------------------------------------
# /applications/transfer_learning/imagenet_cifar10_classification_evaluate_test.py:
# --------------------------------------------------------------------------------
# This file is part of the hessianlearn package
#
# hessianlearn is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or any later version.
#
# hessianlearn is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# If not, see .
#
# Author: Tom O'Leary-Roseberry
# Contact: tom.olearyroseberry@utexas.edu

# Evaluation script: builds the ResNet50 based CIFAR10 classifier, optionally
# loads pickled best weights, and reports accuracy on the validation split,
# the held out test split and the full CIFAR10 test set.

import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ["KMP_WARNINGS"] = "FALSE"
# os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import pickle
import tensorflow as tf
import time, datetime
# if int(tf.__version__[0]) > 1:
# 	import tensorflow.compat.v1 as tf
# 	tf.disable_v2_behavior()


# Memory issue with GPUs
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)
# Load hessianlearn library
import sys
sys.path.append( os.environ.get('HESSIANLEARN_PATH', "../../"))
from hessianlearn import *

# Parse run specifications
from argparse import ArgumentParser

parser = ArgumentParser(add_help=True)
parser.add_argument("-optimizer", dest='optimizer',required=False, default = 'lrsfn', help="optimizer type",type=str)
parser.add_argument('-fixed_step',dest = 'fixed_step',\
                    required= False,default = 1,help='boolean for fixed step vs globalization',type = int)
parser.add_argument('-alpha',dest = 'alpha',required = False,default = 1e-4,help= 'learning rate alpha',type=float)
parser.add_argument('-hessian_low_rank',dest = 'hessian_low_rank',required= False,default = 40,help='low rank for sfn',type = int)
parser.add_argument('-record_spectrum',dest = 'record_spectrum',\
                    required= False,default = 0,help='boolean for recording spectrum',type = int)
# parser.add_argument('-weight_burn_in',dest = 'weight_burn_in',\
# 					required= False,default = 0,help='',type = int)

# parser.add_argument('-data_seed',dest = 'data_seed',\
# 					required= False,default = 0,help='',type = int)

parser.add_argument('-batch_size',dest = 'batch_size',required= False,default = 32,help='batch size',type = int)
parser.add_argument('-hess_batch_size',dest = 'hess_batch_size',required= False,default = 8,help='hess batch size',type = int)
parser.add_argument('-keras_epochs',dest = 'keras_epochs',required= False,default = 50,help='keras_epochs',type = int)
parser.add_argument("-keras_opt", dest='keras_opt',required=False, default = 'adam', help="optimizer type for keras",type=str)
parser.add_argument('-keras_alpha',dest = 'keras_alpha',required= False,default = 1e-3,help='keras learning rate',type = float)
parser.add_argument('-max_sweeps',dest = 'max_sweeps',required= False,default = 1,help='max sweeps',type = float)
parser.add_argument('-weights_file',dest = 'weights_file',required= False,default = 'None',help='weight file pickle',type = str)

args = parser.parse_args()

# TensorFlow 1.x exposes tf.set_random_seed, 2.x exposes tf.random.set_seed.
try:
    tf.set_random_seed(0)
except AttributeError:
    tf.random.set_seed(0)

# GPU Environment Details
gpu_available = tf.test.is_gpu_available()
built_with_cuda = tf.test.is_built_with_cuda()
print(80*'#')
print(('IS GPU AVAILABLE: '+str(gpu_available)).center(80))
print(('IS BUILT WITH CUDA: '+str(built_with_cuda)).center(80))
print(80*'#')

settings = {}
# Set run specifications
# Data specs
settings['batch_size'] = args.batch_size
settings['hess_batch_size'] = args.hess_batch_size


################################################################################
# Instantiate data
(x_train, y_train), (_x_test, _y_test) = tf.keras.datasets.cifar10.load_data()

# # Normalize the data
# x_train = x_train.astype('float32') / 255.
# x_test = x_test.astype('float32') / 255.

x_train = tf.keras.applications.resnet50.preprocess_input(x_train)
x_test_full = tf.keras.applications.resnet50.preprocess_input(_x_test)
# The first 2000 test images act as the validation split.
x_val = x_test_full[:2000]
x_test = x_test_full[2000:]

y_train = tf.keras.utils.to_categorical(y_train)
y_test_full = tf.keras.utils.to_categorical(_y_test)
y_val = y_test_full[:2000]
y_test = y_test_full[2000:]

################################################################################
# Create the neural network in keras

# tf.keras.backend.set_floatx('float64')

resnet_input_shape = (200,200,3)
input_tensor = tf.keras.Input(shape = resnet_input_shape)

pretrained_resnet50 = tf.keras.applications.resnet50.ResNet50(weights = 'imagenet',include_top=False,input_tensor=input_tensor)

# Freeze the first 143 layers of the pretrained network.
for layer in pretrained_resnet50.layers[:143]:
    layer.trainable = False

classifier = tf.keras.models.Sequential()
classifier.add(tf.keras.layers.Input(shape=(32,32,3)))
classifier.add(tf.keras.layers.Lambda(lambda image: tf.image.resize(image, resnet_input_shape[:2])))
classifier.add(pretrained_resnet50)
classifier.add(tf.keras.layers.Flatten())
classifier.add(tf.keras.layers.BatchNormalization())
classifier.add(tf.keras.layers.Dense(64, activation='relu'))
classifier.add(tf.keras.layers.Dropout(0.5))
classifier.add(tf.keras.layers.BatchNormalization())
classifier.add(tf.keras.layers.Dense(10, activation='softmax'))


if args.keras_opt == 'adam':
    optimizer = tf.keras.optimizers.Adam(learning_rate = args.keras_alpha,epsilon = 1e-8)
elif args.keras_opt == 'sgd':
    optimizer = tf.keras.optimizers.SGD(learning_rate=args.keras_alpha)
else:
    # A bare `raise` here has no active exception to re-raise.
    raise ValueError('Unknown -keras_opt ' + repr(args.keras_opt) + ", choose 'adam' or 'sgd'")

# NOTE(review): the last layer already applies softmax while the loss is
# configured with from_logits = True; the reported accuracy is unaffected,
# but confirm the loss setting against the training script before changing it.
classifier.compile(optimizer=optimizer,
                loss=tf.keras.losses.CategoricalCrossentropy(from_logits = True),
                metrics=['accuracy'])

loss_test_0, acc_test_0 = classifier.evaluate(x_test,y_test,verbose=2)
print('acc_test = ',acc_test_0)
loss_val_0, acc_val_0 = classifier.evaluate(x_val,y_val,verbose=2)
print('acc_val = ',acc_val_0)


# Compare by value: `is not 'None'` tests object identity with a literal.
if args.weights_file != 'None':
    try:
        # NOTE(security): pickle.load executes code embedded in the file;
        # only pass weight files from a trusted source.
        with open(args.weights_file, 'rb') as logger:
            best_weights = pickle.load(logger)['best_weights']
        for layer_name,weight in best_weights.items():
            classifier.get_layer(layer_name).set_weights(weight)
    except Exception as load_error:
        # Best effort: keep evaluating the current weights, but say why.
        print('Issue loading best weights')
        print(load_error)

loss_test_final, acc_test_final = classifier.evaluate(x_test,y_test,verbose=2)
print('acc_test final = ',acc_test_final)
loss_val_final, acc_val_final = classifier.evaluate(x_val,y_val,verbose=2)
print('acc_val final = ',acc_val_final)

################################################################################
# Evaluate again on all the data.
# The full preprocessed test set is already in memory, so reuse it instead
# of reloading CIFAR10 and preprocessing the training images a second time.
x_test = x_test_full
y_test = y_test_full

loss_test_total, acc_test_total = classifier.evaluate(x_test,y_test,verbose=2)
print(80*'#')
print('After hessianlearn training'.center(80))
print('acc_test_total = ',acc_test_total)


# --------------------------------------------------------------------------------
# /applications/transfer_learning/imagenet_cifar100_classification_evaluate_test.py:
# --------------------------------------------------------------------------------
# This file is part of the hessianlearn package
#
# hessianlearn is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or any later version.
#
# hessianlearn is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# If not, see .
#
# Author: Tom O'Leary-Roseberry
# Contact: tom.olearyroseberry@utexas.edu

# Evaluation script: builds a ResNet50-based CIFAR-100 classifier, optionally
# loads pickled "best weights" produced by a training run, and reports test /
# validation accuracy before and after loading them.

import numpy as np
import os
# These environment variables must be set before TensorFlow is imported.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ["KMP_WARNINGS"] = "FALSE"
# os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import pickle
import tensorflow as tf
import time, datetime
# if int(tf.__version__[0]) > 1:
# 	import tensorflow.compat.v1 as tf
# 	tf.disable_v2_behavior()


# Memory issue with GPUs
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)
# Load hessianlearn library
import sys
sys.path.append( os.environ.get('HESSIANLEARN_PATH', "../../"))
from hessianlearn import *

# Parse run specifications
from argparse import ArgumentParser

parser = ArgumentParser(add_help=True)
parser.add_argument("-optimizer", dest='optimizer', required=False, default='lrsfn',
                    help="optimizer type", type=str)
parser.add_argument('-fixed_step', dest='fixed_step', required=False, default=1,
                    help='boolean for fixed step vs globalization', type=int)
parser.add_argument('-alpha', dest='alpha', required=False, default=1e-4,
                    help= 'learning rate alpha', type=float)
parser.add_argument('-hessian_low_rank', dest='hessian_low_rank', required=False, default=40,
                    help='low rank for sfn', type=int)
parser.add_argument('-record_spectrum', dest='record_spectrum', required=False, default=0,
                    help='boolean for recording spectrum', type=int)
# parser.add_argument('-weight_burn_in',dest = 'weight_burn_in',\
# 					required= False,default = 0,help='',type = int)

# parser.add_argument('-data_seed',dest = 'data_seed',\
# 					required= False,default = 0,help='',type = int)

parser.add_argument('-batch_size', dest='batch_size', required=False, default=32,
                    help='batch size', type=int)
parser.add_argument('-hess_batch_size', dest='hess_batch_size', required=False, default=8,
                    help='hess batch size', type=int)
parser.add_argument('-keras_epochs', dest='keras_epochs', required=False, default=50,
                    help='keras_epochs', type=int)
parser.add_argument("-keras_opt", dest='keras_opt', required=False, default='adam',
                    help="optimizer type for keras", type=str)
parser.add_argument('-keras_alpha', dest='keras_alpha', required=False, default=1e-3,
                    help='keras learning rate', type=float)
parser.add_argument('-max_sweeps', dest='max_sweeps', required=False, default=1,
                    help='max sweeps', type=float)
# The string 'None' (not the None object) is the "no weights file" sentinel.
parser.add_argument('-weights_file', dest='weights_file', required=False, default='None',
                    help='weight file pickle', type=str)

args = parser.parse_args()

# tf.set_random_seed only exists in the TF1-style API; fall back to the TF2
# name when it is missing (and only then, instead of swallowing every error).
try:
    tf.set_random_seed(0)
except AttributeError:
    tf.random.set_seed(0)

# GPU Environment Details
gpu_availabe = tf.test.is_gpu_available()
built_with_cuda = tf.test.is_built_with_cuda()
print(80*'#')
print(('IS GPU AVAILABLE: '+str(gpu_availabe)).center(80))
print(('IS BUILT WITH CUDA: '+str(built_with_cuda)).center(80))
print(80*'#')

settings = {}
# Set run specifications
# Data specs
settings['batch_size'] = args.batch_size
settings['hess_batch_size'] = args.hess_batch_size


################################################################################
# Instantiate data
(x_train, y_train), (_x_test, _y_test) = tf.keras.datasets.cifar100.load_data()

# # Normalize the data
# x_train = x_train.astype('float32') / 255.
# x_test = x_test.astype('float32') / 255.

x_train = tf.keras.applications.resnet50.preprocess_input(x_train)
x_test_full = tf.keras.applications.resnet50.preprocess_input(_x_test)
# The first 2000 test samples are used as validation data, the rest as test data.
x_val = x_test_full[:2000]
x_test = x_test_full[2000:]

y_train = tf.keras.utils.to_categorical(y_train)
y_test_full = tf.keras.utils.to_categorical(_y_test)
y_val = y_test_full[:2000]
y_test = y_test_full[2000:]

################################################################################
# Create the neural network in keras

# tf.keras.backend.set_floatx('float64')

resnet_input_shape = (200,200,3)
input_tensor = tf.keras.Input(shape = resnet_input_shape)

pretrained_resnet50 = tf.keras.applications.resnet50.ResNet50(
    weights = 'imagenet', include_top=False, input_tensor=input_tensor)

# Freeze the first 143 layers of the pretrained network.
for layer in pretrained_resnet50.layers[:143]:
    layer.trainable = False



classifier = tf.keras.models.Sequential()
classifier.add(tf.keras.layers.Input(shape=(32,32,3)))
# Upsample the 32x32 CIFAR images to the spatial size the ResNet input expects.
classifier.add(tf.keras.layers.Lambda(lambda image: tf.image.resize(image, resnet_input_shape[:2])))
classifier.add(pretrained_resnet50)
classifier.add(tf.keras.layers.Flatten())
classifier.add(tf.keras.layers.BatchNormalization())
classifier.add(tf.keras.layers.Dense(128, activation='relu'))
classifier.add(tf.keras.layers.Dropout(0.5))
classifier.add(tf.keras.layers.BatchNormalization())
classifier.add(tf.keras.layers.Dense(100, activation='softmax'))


if args.keras_opt == 'adam':
    optimizer = tf.keras.optimizers.Adam(learning_rate = args.keras_alpha,epsilon = 1e-8)
elif args.keras_opt == 'sgd':
    optimizer = tf.keras.optimizers.SGD(learning_rate=args.keras_alpha)
else:
    # A bare `raise` here has no active exception to re-raise; report the
    # actual problem instead.
    raise ValueError("Unsupported -keras_opt %r; expected 'adam' or 'sgd'" % (args.keras_opt,))

# NOTE(review): the last layer already applies a softmax, yet the loss is
# configured with from_logits=True. This only affects the reported loss values
# (not the accuracy metric); left unchanged so the numbers stay comparable with
# existing runs -- confirm which one is intended.
classifier.compile(optimizer=optimizer,
                   loss=tf.keras.losses.CategoricalCrossentropy(from_logits = True),
                   metrics=['accuracy'])

loss_test_0, acc_test_0 = classifier.evaluate(x_test,y_test,verbose=2)
print('acc_test = ',acc_test_0)
loss_val_0, acc_val_0 = classifier.evaluate(x_val,y_val,verbose=2)
print('acc_val = ',acc_val_0)


# Compare by value: `is not 'None'` tests object identity, which is not a
# reliable way to compare strings coming from the command line.
if args.weights_file != 'None':
    # Loading stays best-effort (the script continues with the current
    # weights on failure), but the reason for the failure is now reported.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    try:
        with open(args.weights_file, 'rb') as logger:
            best_weights = pickle.load(logger)['best_weights']
        for layer_name,weight in best_weights.items():
            classifier.get_layer(layer_name).set_weights(weight)
    except Exception as err:
        print('Issue loading best weights:', err)

loss_test_final, acc_test_final = classifier.evaluate(x_test,y_test,verbose=2)
print('acc_test final = ',acc_test_final)
loss_val_final, acc_val_final = classifier.evaluate(x_val,y_val,verbose=2)
print('acc_val final = ',acc_val_final)

################################################################################
# Evaluate again on all the data.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar100.load_data()

# # Normalize the data
# x_train = x_train.astype('float32') / 255.
# x_test = x_test.astype('float32') / 255.

x_train = tf.keras.applications.resnet50.preprocess_input(x_train)
x_test = tf.keras.applications.resnet50.preprocess_input(x_test)

y_train = tf.keras.utils.to_categorical(y_train)
y_test = tf.keras.utils.to_categorical(y_test)

loss_test_total, acc_test_total = classifier.evaluate(x_test,y_test,verbose=2)
print(80*'#')
print('After hessianlearn training'.center(80))
print('acc_test_total = ',acc_test_total)
5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 
46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 
80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 
115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. 
If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /hessianlearn/algorithms/varianceBasedNystrom.py: -------------------------------------------------------------------------------- 1 | # This file is part of the hessianlearn package 2 | # 3 | # hessianlearn is free software: you can redistribute it and/or modify 4 | # it under the terms of the GNU Lesser General Public License as published by 5 | # the Free Software Foundation, either version 3 of the License, or any later version. 6 | # 7 | # hessianlearn is distributed in the hope that it will be useful, 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | # GNU Lesser General Public License for more details. 11 | # 12 | # You should have received a copy of the GNU Lesser General Public License 13 | # If not, see . 
#
# Author: Nick Alger

import numpy as np

np.random.seed(0)


def variance_based_nystrom(apply_AA, num_cols_A, oversampling_parameter=5, block_size=10,
                           std_tol=0.5, max_bad_vectors=5, max_vectors=100, verbose=True):
    """
    Computes approximate truncated eigenvalue decomposition
        A = U D U^T
    of a n x n matrix A which is given by the following sum of matrices:
        A = (A1 + A2 + ... + Am)/m.
    U is an n x r orthonormal matrix, and D = diag(dd).

    The basis is grown block by block. The iteration stops as soon as at
    least max_bad_vectors eigenvalues have a relative standard deviation
    (std of the Rayleigh quotients over the Ak, divided by |eigenvalue|)
    above std_tol, or the number of basis vectors exceeds max_vectors.
    Only eigenvalues which do not exceed std_tol are returned as "good".

    apply_AA is a list of callables, where matvecs with the matrices Ak are computed via
        apply_AA[k](x) = Ak * x.

    num_cols_A is the number of columns of A (A is n x n, num_cols_A = n)

    oversampling_parameter is the number of extra vectors used within randomized SVD;
    the trailing oversampling_parameter eigenpairs (smallest in magnitude) are
    always discarded. Must be >= 0; 0 keeps every computed eigenpair.

    block_size is the number of random vectors per group used in the randomized eigenvalue method.

    std_tol is the relative standard deviation above which an eigenvalue is considered bad.

    max_bad_vectors is the number of bad eigenvalues that terminates the iteration.
    It must be positive, otherwise the loop body never runs and there is nothing to return.

    max_vectors is the maximum rank of the truncated eigenvalue decomposition

    verbose toggles printing of the current rank after every block.

    Returns two lists
        [dd_good, U_good, all_std_good], [dd, U, all_std]
    the first restricted to the good eigenpairs, the second containing all
    eigenpairs that remain after discarding the oversampling ones.
    """
    op = oversampling_parameter
    if op < 0:
        raise ValueError('oversampling_parameter must be non-negative')
    n = num_cols_A
    m = len(apply_AA)

    Q = np.zeros((n,0))
    Theta = np.zeros((0,0,m))
    num_bad_vectors = 0
    while num_bad_vectors < max_bad_vectors:
        Q1 = Q
        Theta11 = Theta

        # New range samples, orthogonalized against the current basis.
        Y = get_random_range_vectors(apply_AA, n, block_size)
        Y_perp = Y - np.dot(Q,np.dot(Q.T, Y))
        Q2,_ = np.linalg.qr(Y_perp)
        Q2 = Q2.reshape((n,-1)) # Reshape to guard against case block_size==1
        Q = np.hstack([Q1, Q2])

        Theta = compute_or_update_Theta(Q1, Q2, Theta11, apply_AA)
        dd, U, V = finish_computing_eigenvalue_decomposition(Q, Theta)
        _, all_std = compute_rayleigh_statistics(Theta, V)

        # Number of leading eigenpairs kept after dropping the oversampling
        # ones. An explicit end index is required: the slice `[:-op]` is empty
        # when op == 0 (since -0 == 0), which would silently discard everything.
        keep = max(dd.shape[0] - op, 0)

        bad_inds = (all_std[:keep] / np.abs(dd[:keep])) > std_tol
        num_bad_vectors = np.sum(bad_inds)

        current_num_vectors = Q.shape[1]
        current_rank = current_num_vectors - op - num_bad_vectors
        if verbose:
            print('current_rank=', current_rank, ', num_bad_vectors=', num_bad_vectors)

        if current_num_vectors > max_vectors:
            break

    good_inds = np.logical_not(bad_inds)
    dd_kept = dd[:keep]
    U_kept = U[:,:keep]
    all_std_kept = all_std[:keep]
    dd_good = dd_kept[good_inds]
    U_good = U_kept[:,good_inds]
    all_std_good = all_std_kept[good_inds]
    return [dd_good, U_good, all_std_good],[dd_kept,U_kept,all_std_kept]


def get_random_range_vectors(apply_AA, num_cols_A, block_size_r,seed = 0):
    """
    Computes n x r matrix
        Y = A * Omega
    where A is an n x n matrix of the form
        A = (A1 + A2 + ... + Am)/m,
    matvecs with the matrices Ak may be computed via the function
        apply_AA[k](x) = Ak * x,
    and Omega is a random n x r matrix.

    Omega is drawn from numpy's global random state. The seed argument is
    currently unused and only kept so that existing callers keep working;
    re-seeding on every call would make every block of samples identical.
    """
    n = num_cols_A
    r = block_size_r
    m = len(apply_AA)

    Omega = np.random.randn(n, r)
    Y = np.zeros((n, r))
    # In Tensorflow:
    # z = g^T Omega
    # q = unstack(z)
    # Y = (1/m) * restack(dq_i / dw)
    for j in range(r): # These loops can be trivially parallelized
        for k in range(m):
            Y[:,j] = Y[:,j] + (1./m)*apply_AA[k](Omega[:,j])
    return Y


def compute_Theta(orthonormal_range_basis_Q, apply_AA):
    """
    Computes r x r x m 3-tensor Theta with entries
        Theta_ijk = qi^T Ak qj.
    Theta has frontal slices
        Theta_::k = Q^T Ak Q.
    """
    Q = orthonormal_range_basis_Q
    m = len(apply_AA)
    r = Q.shape[1]

    Theta = np.zeros((r, r, m))
    for j in range(r): # These loops can be trivially parallelized
        for k in range(m):
            Theta[:,j,k] = np.dot(Q.T, apply_AA[k](Q[:,j]))
    return Theta


def finish_computing_eigenvalue_decomposition(orthonormal_range_basis_Q, Theta):
    """
    Finishes computing eigenvalue decomposition
        A = U diag(dd) U^T,
    and smaller auxiliary eigenvalue decomposition
        Q^T A Q = V diag(dd) V^T
    where Q is an orthonormal basis for the range of
        A = (A1+A2+...+Am)/m,
    and Theta is the matrix with frontal slices
        Theta_::k = Q^T Ak Q.

    Returns dd, U, V with the eigenvalues dd sorted by decreasing magnitude
    and the columns of U and V ordered accordingly.
    """
    Q = orthonormal_range_basis_Q
    m = Theta.shape[-1]

    # Averaging the frontal slices gives B = Q^T A Q.
    B = (1. / m) * np.sum(Theta, axis=-1)
    dd, V = np.linalg.eigh(B)
    # Largest magnitude first (eigh returns ascending algebraic order).
    idx = np.argsort(np.abs(dd))[::-1]
    dd = dd[idx]
    V = V[:,idx]

    U = np.dot(Q, V)
    return dd, U, V


def compute_rayleigh_statistics(Theta, small_eigenvectors_V):
    """
    Computes sample mean and standard deviation of Rayleigh quotients
        all_mu[i] = mean(ui^T Ak ui)
        all_std[i] = std(ui^T Ak ui)
    where Ak is randomly chosen, and ui is the i'th eigenvector of
        A = (A1 + A2 + ... + Am)/m.
    Theta is the r x r x m 3-tensor with frontal slices
        Theta_::k = Q^T Ak Q,
    for orthonormal basis Q such that
        A =approx= Q * Q^T * A
    The columns, vi, of V are the eigenvectors of the matrix Q^T A Q, i.e.,
        Q^T A Q = V D V^T
    where D is the diagonal matrix of eigenvalues, which we do not need here.
    (Note that ui = Q * vi).
    """
    V = small_eigenvectors_V
    r = Theta.shape[0]

    # C[i, k] = vi^T Theta_::k vi = ui^T Ak ui; mean / std are taken over k.
    C = np.sum(V.reshape((r,r,-1)) * np.einsum('jki,kl->jli', Theta, V), axis=0)
    all_mu = np.mean(C, axis=1)
    all_std = np.std(C, axis=1)
    return all_mu, all_std


def update_Theta(Q1, Q2, Theta11, apply_AA):
    """
    Computes updated r x r x m 3-tensor Theta with frontal slices
        Theta_::k = Q^T Ak Q
    based on old Theta1 with frontal slices
        Theta11_::k = Q1^T Ak Q1.
    Here Q1 and Q2 are orthonormal matrices, and
        Q = [Q1, Q2]
    is also an orthonormal matrix.
    Q1 was the old range approximation for A.
    Q2 columns are more vectors to improve the range approximation.
    Q is the new range approximation.

    Only matvecs with the columns of Q2 are performed; the lower left block
    Q2^T Ak Q1 is filled with the transpose of Q1^T Ak Q2, which is exact
    only when every Ak is symmetric.
    """
    m = len(apply_AA)
    r1 = Q1.shape[1]
    r2 = Q2.shape[1]
    r = r1 + r2
    Theta12 = np.zeros((r1, r2, m))
    Theta22 = np.zeros((r2, r2, m))
    for i in range(r2): # These loops can be trivially parallelized
        for k in range(m):
            Ak_qi = apply_AA[k](Q2[:,i])
            Theta12[:,i,k] = np.dot(Q1.T, Ak_qi)
            Theta22[:,i,k] = np.dot(Q2.T, Ak_qi)

    Theta = np.zeros((r, r, m))
    Theta[:r1, :r1, :] = Theta11
    Theta[:r1, r1:, :] = Theta12
    Theta[r1:, :r1, :] = Theta12.swapaxes(0,1)
    Theta[r1:, r1:, :] = Theta22
    return Theta


def compute_or_update_Theta(Q1, Q2, Theta11, apply_AA):
    """
    Returns the tensor Theta for the basis Q = [Q1, Q2].
    If Theta11 is empty (no previous basis) Theta is computed from scratch
    for Q2, otherwise Theta11 is extended by the blocks involving Q2.
    """
    if Theta11.size == 0:
        return compute_Theta(Q2, apply_AA)
    else:
        return update_Theta(Q1, Q2, Theta11, apply_AA)
#
# Author: Tom O'Leary-Roseberry
# Contact: tom.olearyroseberry@utexas.edu

################################################################################
# Uses some code from https://blog.keras.io/building-autoencoders-in-keras.html
################################################################################

# Trains a variational autoencoder on MNIST with a hessianlearn optimizer and
# plots the learned 2D latent space afterwards.

import numpy as np
import os
import tensorflow as tf
import time
# if int(tf.__version__[0]) > 1:
# 	import tensorflow.compat.v1 as tf
# 	tf.disable_v2_behavior()
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ["KMP_WARNINGS"] = "FALSE"
import sys
sys.path.append( os.environ.get('HESSIANLEARN_PATH', "../../"))
from hessianlearn import *

# tf.set_random_seed only exists in the TF1-style API; fall back to the TF2
# name when it is missing, as the other application scripts do.
try:
    tf.set_random_seed(0)
except AttributeError:
    tf.random.set_seed(0)

settings = {}
# Set run specifications
# Data specs
settings['batch_size'] = 100
settings['hess_batch_size'] = 10


################################################################################
# Instantiate data
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()


# Normalize the data
x_train = x_train.astype('float32') / 255.
x_test = x_test.astype('float32') / 255.
# Reshape the data: flatten every image into a single vector
flattened_dimension = np.prod(x_train.shape[1:])
x_train = x_train.reshape((len(x_train), np.prod(x_train.shape[1:])))
x_test = x_test.reshape((len(x_test), np.prod(x_test.shape[1:])))

# Instantiate the data object
data = Data([x_train,y_train],settings['batch_size'],test_data = [x_test,y_test],hessian_batch_size = settings['hess_batch_size'])

# settings['input_shape'] = data._input_shape
# settings['output_shape'] = data._output_shape


################################################################################
# Build the variational autoencoder neural network model here

# network parameters
input_shape = (flattened_dimension, )
intermediate_dim = 512
latent_dim = 2

# VAE model = encoder + decoder
# build encoder model
inputs = tf.keras.layers.Input(shape=input_shape)
x_encoder = tf.keras.layers.Dense(intermediate_dim, activation='softplus')(inputs)
z_mean = tf.keras.layers.Dense(latent_dim, name='z_mean')(x_encoder)
z_log_var = tf.keras.layers.Dense(latent_dim, name='z_log_var')(x_encoder)

# reparameterization trick
# instead of sampling from Q(z|X), sample epsilon = N(0,I)
# z = z_mean + sqrt(var) * epsilon
def sampling(args):
    """Reparameterization trick by sampling from an isotropic unit Gaussian.
    # Arguments
        args (tensor): mean and log of variance of Q(z|X)
    # Returns
        z (tensor): sampled latent vector
    """
    z_mean, z_log_var = args
    batch = tf.keras.backend.shape(z_mean)[0]
    dim = tf.keras.backend.int_shape(z_mean)[1]
    # by default, random_normal has mean = 0 and std = 1.0
    epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
    # exp(0.5 * log_var) is the standard deviation
    return z_mean + tf.keras.backend.exp(0.5 * z_log_var) * epsilon
# use reparameterization trick to push the sampling out as input
# note that "output_shape" isn't necessary with the TensorFlow backend
z = tf.keras.layers.Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])

# instantiate encoder model
encoder = tf.keras.models.Model(inputs, [z_mean, z_log_var, z], name='encoder')

# build decoder model
latent_inputs = tf.keras.layers.Input(shape=(latent_dim,), name='z_sampling')
x_decoder = tf.keras.layers.Dense(intermediate_dim, activation='softplus')(latent_inputs)
outputs = tf.keras.layers.Dense(flattened_dimension, activation='sigmoid')(x_decoder)

# instantiate decoder model
decoder = tf.keras.models.Model(latent_inputs, outputs, name='decoder')

# instantiate VAE model: decode the sampled latent vector (third encoder output)
outputs = decoder(encoder(inputs)[2])
vae = tf.keras.models.Model(inputs, outputs, name='vae_mlp')



################################################################################
# Instantiate the problem, regularization.

problem = VariationalAutoencoderProblem(vae,z_mean,z_log_var,dtype=tf.float32)

settings['tikhonov_gamma'] = 1e-2
regularization = L2Regularization(problem,gamma = settings['tikhonov_gamma'])


################################################################################
# Instantiate the model object
HLModelSettings = HessianlearnModelSettings()

HLModelSettings['optimizer'] = 'lrsfn'
HLModelSettings['alpha'] = 5e-4
HLModelSettings['globalization'] = 'line_search'
HLModelSettings['hessian_low_rank'] = 20
HLModelSettings['max_backtrack'] = 16
HLModelSettings['max_sweeps'] = 50

HLModelSettings['problem_name'] = 'mnist_vae'


HLModel = HessianlearnModel(problem,regularization,data,settings = HLModelSettings)


# Can pass in an initial guess for the weights w_0 to the method fit, if desired.
HLModel.fit(w_0 = None)

################################################################################
# Post processing
import matplotlib.pyplot as plt
def plot_results(models,
                 data,
                 batch_size=128,
                 model_name="vae_mnist"):
    """Plots labels and MNIST digits as a function of the 2D latent vector
    # Arguments
        models (tuple): encoder and decoder models
        data (tuple): test data and label
        batch_size (int): prediction batch size
        model_name (string): which model is using this function
            (also used as the output directory for the saved figures)
    """

    encoder, decoder = models
    x_test, y_test = data
    os.makedirs(model_name, exist_ok=True)

    filename = os.path.join(model_name, "vae_mean.png")
    # display a 2D plot of the digit classes in the latent space
    z_mean, _, _ = encoder.predict(x_test,
                                   batch_size=batch_size)
    plt.figure(figsize=(12, 10))
    plt.scatter(z_mean[:, 0], z_mean[:, 1], c=y_test)
    plt.colorbar()
    plt.xlabel("z[0]")
    plt.ylabel("z[1]")
    plt.savefig(filename)
    plt.show()

    filename = os.path.join(model_name, "digits_over_latent.png")
    # display a 30x30 2D manifold of digits
    n = 30
    digit_size = 28
    figure = np.zeros((digit_size * n, digit_size * n))
    # linearly spaced coordinates corresponding to the 2D plot
    # of digit classes in the latent space
    grid_x = np.linspace(-4, 4, n)
    grid_y = np.linspace(-4, 4, n)[::-1]

    for i, yi in enumerate(grid_y):
        for j, xi in enumerate(grid_x):
            z_sample = np.array([[xi, yi]])
            x_decoded = decoder.predict(z_sample)
            digit = x_decoded[0].reshape(digit_size, digit_size)
            figure[i * digit_size: (i + 1) * digit_size,
                   j * digit_size: (j + 1) * digit_size] = digit

    plt.figure(figsize=(10, 10))
    # place the axis ticks at the centre of every decoded digit
    start_range = digit_size // 2
    end_range = (n - 1) * digit_size + start_range + 1
    pixel_range = np.arange(start_range, end_range, digit_size)
    sample_range_x = np.round(grid_x, 1)
    sample_range_y = np.round(grid_y, 1)
    plt.xticks(pixel_range, sample_range_x)
    plt.yticks(pixel_range, sample_range_y)
    plt.xlabel("z[0]")
    plt.ylabel("z[1]")
    plt.imshow(figure, cmap='Greys_r')
    plt.savefig(filename)
    plt.show()


models = (encoder, decoder)
# NOTE: from here on `data` is the plain (x_test, y_test) tuple, no longer the
# hessianlearn Data object used for training.
data = (x_test, y_test)
plot_results(models,
             data,
             batch_size=settings['batch_size'],
             model_name= HLModelSettings['optimizer']+'_vae_mlp')
/:/\:\/__/:/ /:/ /\/__/:/ /:/\:\/__/:/ /:/\:\\__\/\:\__ /__/:/ /:/\:\/__/::::::::\ 11 | \ \:\/:/__\/\ \:\/:/ /:/\ \:\/:/~/:/\ \:\/:/~/:/ \ \:\/\\ \:\/:/__\/\ \:\~~\~~\/ 12 | \ \::/ \ \::/ /:/ \ \::/ /:/ \ \::/ /:/ \__\::/ \ \::/ \ \:\ ~~~ 13 | \ \:\ \ \:\/:/ \__\/ /:/ \__\/ /:/ /__/:/ \ \:\ \ \:\ 14 | \ \:\ \ \::/ /__/:/ /__/:/ \__\/ \ \:\ \ \:\ 15 | \__\/ \__\/ \__\/ \__\/ \__\/ \__\/ 16 | 17 | 18 | ___ ___ ___ ___ 19 | / /\ / /\ / /\ /__/\ 20 | / /:/_ / /::\ / /::\ \ \:\ 21 | ___ ___ / /:/ /\ / /:/\:\ / /:/\:\ \ \:\ 22 | /__/\ / /\ / /:/ /:/_ / /:/~/::\ / /:/~/:/ _____\__\:\ 23 | \ \:\ / /://__/:/ /:/ /\/__/:/ /:/\:\/__/:/ /:/___/__/::::::::\ 24 | \ \:\ /:/ \ \:\/:/ /:/\ \:\/:/__\/\ \:\/:::::/\ \:\~~\~~\/ 25 | \ \:\/:/ \ \::/ /:/ \ \::/ \ \::/~~~~ \ \:\ ~~~ 26 | \ \::/ \ \:\/:/ \ \:\ \ \:\ \ \:\ 27 | \__\/ \ \::/ \ \:\ \ \:\ \ \:\ 28 | \__\/ \__\/ \__\/ \__\/ 29 | 30 | 31 | 32 | 33 | 34 | [![Build Status](https://travis-ci.com/tomoleary/hessianlearn.svg?branch=master)](https://travis-ci.com/tomoleary/hessianlearn) 35 | [![DOI](https://zenodo.org/badge/184635062.svg)](https://zenodo.org/badge/latestdoi/184635062) 36 | [![License](https://img.shields.io/github/license/tomoleary/hessianlearn)](./LICENSE.md) 37 | [![Top language](https://img.shields.io/github/languages/top/tomoleary/hessianlearn)](https://www.python.org) 38 | ![Code size](https://img.shields.io/github/languages/code-size/tomoleary/hessianlearn) 39 | [![Issues](https://img.shields.io/github/issues/tomoleary/hessianlearn)](https://github.com/tomoleary/hessianlearn/issues) 40 | [![Latest commit](https://img.shields.io/github/last-commit/tomoleary/hessianlearn)](https://github.com/tomoleary/hessianlearn/commits/master) 41 | 42 | # Hessian-based stochastic optimization in TensorFlow and keras 43 | 44 | This code implements Hessian-based stochastic optimization in TensorFlow and keras by exposing the matrix-free Hessian to users. 
The code is meant to allow for rapid-prototyping of Hessian-based algorithms via the matrix-free Hessian action, which allows users to inspect Hessian based information for stochastic nonconvex (neural network training) optimization problems. 45 | 46 | The Hessian action is exposed via matrix-vector products: 47 |

48 | 49 |

50 | 51 | and matrix-matrix products: 52 |

53 | 54 |

55 | 56 | ## Compatibility 57 | 58 | The code is compatible with TensorFlow v1 and v2, but certain features of v2 are disabled (like eager execution). This is because the Hessian matrix products in hessianlearn are implemented using `placeholders` which have been deprecated in v2. For this reason hessianlearn cannot work with data generators and other features that require eager execution. If any compatibility issues are found, please open an [issue](https://github.com/tomoleary/hessianlearn/issues). 59 | 60 | ## Usage 61 | Set the `HESSIANLEARN_PATH` environment variable 62 | 63 | Train a keras model 64 | 65 | ```python 66 | import os,sys 67 | import tensorflow as tf 68 | sys.path.append( os.environ.get('HESSIANLEARN_PATH')) 69 | from hessianlearn import * 70 | 71 | # Define keras neural network model 72 | neural_network = tf.keras.models.Model(...) 73 | # Define loss function and compile model 74 | neural_network.compile(loss = ...) 75 | 76 | ``` 77 | 78 | hessianlearn implements various training [`problem`](https://github.com/tomoleary/hessianlearn/blob/master/hessianlearn/problem/problem.py) constructs (regression, classification, autoencoders, variational autoencoders, generative adversarial networks). Instantiate a `problem`, a `data` object (which takes a dictionary with keys that correspond to the `placeholders` in `problem`) and a `regularization` 79 | 80 | ```python 81 | # Instantiate the problem (this handles the loss function, 82 | # construction of hessian and gradient etc.) 
83 | # KerasModelProblem extracts loss function and metrics from 84 | # a compiled keras model 85 | problem = KerasModelProblem(neural_network) 86 | # Instantiate the data object, this handles the train / validation split 87 | # as well as iterating during training 88 | data = Data({problem.x:x_data,problem.y_true:y_data},train_batch_size,\ 89 | validation_data_size = validation_data_size) 90 | # Instantiate the regularization: L2Regularization is Tikhonov, 91 | # gamma = 0 is no regularization 92 | regularization = L2Regularization(problem,gamma = 0) 93 | ``` 94 | 95 | Pass these objects into the `HessianlearnModel` which handles the training 96 | 97 | ```python 98 | HLModel = HessianlearnModel(problem,regularization,data) 99 | HLModel.fit() 100 | ``` 101 | 102 | ### Alternative Usage (More like Keras Interface) 103 | The example above was the original way the optimizer interface was implemented in hessianlearn, however to better mimic the keras interface and allow for more end-user rapid prototyping of the optimizer that is used to fit data, as of December 2021, the following way has been created 104 | 105 | ```python 106 | import os,sys 107 | import tensorflow as tf 108 | sys.path.append( os.environ.get('HESSIANLEARN_PATH')) 109 | from hessianlearn import * 110 | 111 | # Define keras neural network model 112 | neural_network = tf.keras.models.Model(...) 113 | # Define loss function and compile model 114 | neural_network.compile(loss = ...) 115 | # Instance keras model wrapper which deals with the 116 | # construction of the `problem` which handles the construction 117 | # of Hessian computational graph and variables 118 | HLModel = KerasModelWrapper(neural_network) 119 | # Then the end user can pass in an optimizer 120 | # (e.g. 
custom end-user optimizer) 121 | optimizer = LowRankSaddleFreeNewton # The class constructor, not an instance 122 | opt_parameters = LowRankSaddleFreeNewtonParameters() 123 | opt_parameters['hessian_low_rank'] = 40 124 | HLModel.set_optimizer(optimizer,optimizer_parameters = opt_parameters) 125 | # The data object still needs to key on to the specific computational 126 | # graph variables that data will be passed in for. 127 | # Note that data can naturally handle multiple input and output data, 128 | # in which case problem.x, problem.y_true are lists corresponding to 129 | # neural_network.inputs, neural_network.outputs 130 | problem = HLModel.problem 131 | data = Data({problem.x:x_data,problem.y_true:y_data},train_batch_size,\ 132 | validation_data_size = validation_data_size) 133 | # And finally one can call fit! 134 | HLModel.fit(data) 135 | ``` 136 | 137 | ## Examples 138 | 139 | [Tutorial 0: MNIST Autoencoder](https://github.com/tomoleary/hessianlearn/blob/master/tutorial/Tutorial%200%20MNIST%20Autoencoder.ipynb) 140 | 141 | 142 | ## Applications 143 | 144 | ### Transfer Learning 145 | 146 | * Examples of CIFAR10, CIFAR100 classification from pre-trained Imagenet ResNet50 model in `applications/transfer_learning/` 147 | 148 | * Pre-trained model serves as well conditioned initial guess for transfer learning. In this setting Newton methods perform well due to their excellent properties in local convergence. Low Rank Saddle Free Newton is able to zero in on highly generalizable local minimizers bypassing indefinite regions. Below are validation accuracies of best choices of fixed step-length for Adam, SGD and LRSFN with fixed rank of 40. 149 | 150 |

151 | 152 |

153 | 154 | # References 155 | 156 | These manuscripts motivate and use the hessianlearn library for stochastic nonconvex optimization 157 | 158 | - \[1\] O'Leary-Roseberry, T., Alger, N., Ghattas O., 159 | [**Inexact Newton Methods for Stochastic Nonconvex Optimization with Applications to Neural Network Training**](https://arxiv.org/abs/1905.06738). 160 | arXiv:1905.06738. 161 | ([Download](https://arxiv.org/pdf/1905.06738.pdf))
BibTeX
162 | @article{OLearyRoseberryAlgerGhattas2019,
163 |   title={Inexact Newton methods for stochastic nonconvex optimization with applications to neural network training},
164 |   author={O'Leary-Roseberry, Thomas and Alger, Nick and Ghattas, Omar},
165 |   journal={arXiv preprint arXiv:1905.06738},
166 |   year={2019}
167 | }
168 |
169 | 170 | - \[2\] O'Leary-Roseberry, T., Alger, N., Ghattas O., 171 | [**Low Rank Saddle Free Newton: A Scalable Method for Stochastic Nonconvex Optimization**](https://arxiv.org/abs/2002.02881). 172 | arXiv:2002.02881. 173 | ([Download](https://arxiv.org/pdf/2002.02881.pdf))
BibTeX
174 | @article{OLearyRoseberryAlgerGhattas2020,
175 |   title={Low Rank Saddle Free Newton: Algorithm and Analysis},
176 |   author={O'Leary-Roseberry, Thomas and Alger, Nick and Ghattas, Omar},
177 |   journal={arXiv preprint arXiv:2002.02881},
178 |   year={2020}
179 | }
180 |
181 | 182 | 183 | - \[3\] O'Leary-Roseberry, T., Villa, U., Chen P., Ghattas O., 184 | [**Derivative-Informed Projected Neural Networks for High-Dimensional Parametric Maps Governed by PDEs**](https://www.sciencedirect.com/science/article/pii/S0045782521005302). 185 | Computer Methods in Applied Mechanics and Engineering. Volume 388, 1 January 2022, 114199. 186 | ([Download](https://arxiv.org/pdf/2011.15110.pdf))
BibTeX
187 | @article{OLearyRoseberryVillaChenEtAl2022,
188 |   title={Derivative-informed projected neural networks for high-dimensional parametric maps governed by {PDE}s},
189 |   author={O’Leary-Roseberry, Thomas and Villa, Umberto and Chen, Peng and Ghattas, Omar},
190 |   journal={Computer Methods in Applied Mechanics and Engineering},
191 |   volume={388},
192 |   pages={114199},
193 |   year={2022},
194 |   publisher={Elsevier}
195 | }
196 |
197 | 198 | 199 | - \[4\] O'Leary-Roseberry, T., Du, X., Chaudhuri, A., Martins, J., Willcox, K., Ghattas, O., 200 | [**Adaptive Projected Residual Networks for Learning Parametric Maps from Sparse Data**](https://arxiv.org/abs/2112.07096). 201 | arXiv:2112.07096. 202 | ([Download](https://arxiv.org/pdf/2112.07096.pdf))
BibTeX
203 | @article{OLearyRoseberryDuChaudhuriEtAl2021,
204 |   title={Adaptive Projected Residual Networks for Learning Parametric Maps from Sparse Data},
205 |   author={O'Leary-Roseberry, Thomas and Du, Xiaosong and Chaudhuri, Anirban and Martins, Joaquim R. R. A. and Willcox, Karen and Ghattas, Omar},
206 |   journal={arXiv preprint arXiv:2112.07096},
207 |   year={2021}
208 | }
209 |
210 | 211 | 212 | 213 | 214 | 215 | -------------------------------------------------------------------------------- /hessianlearn/algorithms/lowRankSaddleFreeNewton.py: -------------------------------------------------------------------------------- 1 | # This file is part of the hessianlearn package 2 | # 3 | # hessianlearn is free software: you can redistribute it and/or modify 4 | # it under the terms of the GNU Lesser General Public License as published by 5 | # the Free Software Foundation, either version 3 of the License, or any later version. 6 | # 7 | # hessianlearn is distributed in the hope that it will be useful, 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | # GNU Lesser General Public License for more details. 11 | # 12 | # You should have received a copy of the GNU Lesser General Public License 13 | # If not, see . 14 | # 15 | # Author: Tom O'Leary-Roseberry 16 | # Contact: tom.olearyroseberry@utexas.edu 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | from scipy.sparse import diags 24 | import time 25 | 26 | from ..utilities.parameterList import ParameterList 27 | from ..algorithms import Optimizer 28 | from ..algorithms.globalization import ArmijoLineSearch, TrustRegion 29 | from ..algorithms.randomizedEigensolver import randomized_eigensolver, eigensolver_from_range 30 | from ..algorithms.rangeFinders import block_range_finder, noise_aware_adaptive_range_finder 31 | from ..algorithms.varianceBasedNystrom import variance_based_nystrom 32 | from ..problem import L2Regularization, HessianWrapper 33 | 34 | 35 | 36 | 37 | def ParametersLowRankSaddleFreeNewton(parameters = {}): 38 | parameters['alpha'] = [1e-3, "Initial steplength, or learning rate"] 39 | parameters['rel_tolerance'] = [1e-3, "Relative convergence when sqrt(g,g)/sqrt(g_0,g_0) <= 
rel_tolerance"] 40 | parameters['abs_tolerance'] = [1e-4,"Absolute converge when sqrt(g,g) <= abs_tolerance"] 41 | parameters['default_damping'] = [1e-3, "Levenberg-Marquardt damping when no regularization is used"] 42 | 43 | # Hessian approximation parameters 44 | parameters['range_finding'] = [None,"Range finding, if None then r = hessian_low_rank\ 45 | Choose from None, 'arf', 'naarf', 'vn'"] 46 | parameters['range_rel_error_tolerance'] = [0.1, "Error tolerance for error estimator in adaptive range finding"] 47 | parameters['range_abs_error_tolerance'] = [100, "Error tolerance for error estimator in adaptive range finding"] 48 | parameters['range_block_size'] = [20, "Block size used in range finder"] 49 | parameters['rq_samples_for_naarf'] = [100, "Number of partitions for RQ variance evaluation"] 50 | parameters['hessian_low_rank'] = [20, "Fixed rank for randomized eigenvalue decomposition"] 51 | # Variance Nystrom Parameters 52 | parameters['max_bad_vectors_nystrom'] = [5, "Number of maximum bad vectors for variance based Nystrom"] 53 | parameters['max_vectors_nystrom'] = [40, "Number of maximum vectors for variance based Nystrom"] 54 | parameters['nystrom_std_tolerance'] = [0.5, "Noise to eigenvalue ratio used for Nystrom truncation"] 55 | 56 | 57 | # Globaliziation parameters 58 | parameters['globalization'] = [None, 'Choose from trust_region, line_search, spectral_step or none'] 59 | parameters['max_backtracking_iter'] = [5, 'Max backtracking iterations for armijo line search'] 60 | parameters['spectral_step_alpha'] = [1e-2, 'Used in min condition for spectral step'] 61 | 62 | parameters['verbose'] = [False, "Printing"] 63 | parameters['record_last_rq_std'] = [False, "Record the last eigenvector RQ variance"] 64 | 65 | return ParameterList(parameters) 66 | 67 | 68 | class LowRankSaddleFreeNewton(Optimizer): 69 | """ 70 | This class implements the Low Rank Saddle Free Newton (LRSFN) algorithm 71 | """ 72 | def __init__(self,problem,regularization = None,sess 
= None,parameters = ParametersLowRankSaddleFreeNewton(),preconditioner = None): 73 | """ 74 | The constructor for this class takes: 75 | -problem: hessianlearn.problem.Problem 76 | -regularization: hessianlearn.problem.Regularization 77 | -sess: tf.Session() 78 | -parameters: hyperparameters dictionary 79 | -preconditioner: hessianlearn.problem.Preconditioner 80 | """ 81 | if regularization is None: 82 | _regularization = L2Regularization(problem,gamma = 0.0) 83 | else: 84 | _regularization = regularization 85 | super(LowRankSaddleFreeNewton,self).__init__(problem,_regularization,sess,parameters) 86 | 87 | self.grad = self.problem.gradient + self.regularization.gradient 88 | 89 | if self.parameters['globalization'] == 'trust_region': 90 | self.trust_region = TrustRegion() 91 | self._sweeps = np.zeros(2) 92 | 93 | self.alpha = 0.0 94 | self._rank = 0 95 | 96 | self._rq_std = 0.0 97 | 98 | self.eigenvalues = None 99 | 100 | @property 101 | def rank(self): 102 | return self._rank 103 | 104 | @property 105 | def rq_variance(self): 106 | return self._rq_variance 107 | 108 | 109 | 110 | 111 | def minimize(self,feed_dict = None,hessian_feed_dict = None,rq_estimator_dict = None): 112 | r""" 113 | Solves the saddle escape problem. Given a misfit (loss) Hessian operator (H) 114 | 1. H = U_r Lambda_r U_r^T 115 | 2. 
Solve [U_r |Lambda_r| U_r^T + gamma I] p = -g for p via Woodbury formula: 116 | 117 | [U_r Lambda_r U_r^T + gamma I]^{-1} = 1/gamma * I - 1/gamma * UDU^T 118 | where D = diag(|lambda_i|/(|lambda_i| + gamma)) 119 | -feed_dict: data dictionary used for evaluating gradient and cost 120 | -hessian_feed_dict: dictionary used for stochastic Hessian 121 | -rq_estimator_dict: dictionary used for RQ variance calculations 122 | 123 | """ 124 | self._iter += 1 125 | assert self.sess is not None 126 | assert feed_dict is not None 127 | 128 | assert self.parameters['range_finding'] in [None,'arf','naarf','vn'] 129 | 130 | if hessian_feed_dict is None: 131 | hessian_feed_dict = feed_dict 132 | 133 | 134 | gradient = self.sess.run(self.grad,feed_dict = feed_dict) 135 | 136 | alpha = self.parameters['alpha'] 137 | 138 | if self.parameters['range_finding'] == 'arf': 139 | H = lambda x: self.H(x,hessian_feed_dict,verbose = self.parameters['verbose']) 140 | n = self.problem.dimension 141 | # norm_g = np.linalg.norm(gradient) 142 | # tolerance = self.parameters['range_rel_error_tolerance']*norm_g 143 | tolerance = self.parameters['range_rel_error_tolerance'] 144 | Q = block_range_finder(H,n,tolerance,self.parameters['range_block_size']) 145 | self._rank = Q.shape[1] 146 | Lmbda,U = eigensolver_from_range(H,Q) 147 | 148 | elif self.parameters['range_finding'] == 'naarf': 149 | norm_g = np.linalg.norm(gradient) 150 | tolerance = self.parameters['range_rel_error_tolerance']*norm_g 151 | if rq_estimator_dict is None: 152 | rq_estimator_dict_list = self.problem._partition_dictionaries(feed_dict,self.parameters['rq_samples_for_naarf']) 153 | elif type(rq_estimator_dict) == list: 154 | rq_estimator_dict_list = rq_estimator_dict 155 | elif type(rq_estimator_dict) == dict: 156 | rq_estimator_dict_list = self.problem._partition_dictionaries(rq_estimator_dict,self.parameters['rq_samples_for_naarf']) 157 | else: 158 | raise 159 | Q = 
noise_aware_adaptive_range_finder(self.H,hessian_feed_dict,rq_estimator_dict_list,block_size = self.parameters['range_block_size'],epsilon = tolerance) 160 | self._rank = Q.shape[1] 161 | H = lambda x: self.H(x,hessian_feed_dict,verbose = self.parameters['verbose']) 162 | Lmbda,U = eigensolver_from_range(H,Q) 163 | 164 | elif self.parameters['range_finding'] == 'vn': 165 | if rq_estimator_dict is None: 166 | rq_estimator_dict_list = self.problem._partition_dictionaries(feed_dict,self.parameters['rq_samples_for_naarf']) 167 | elif type(rq_estimator_dict) == list: 168 | rq_estimator_dict_list = rq_estimator_dict 169 | elif type(rq_estimator_dict) == dict: 170 | rq_estimator_dict_list = self.problem._partition_dictionaries(rq_estimator_dict,self.parameters['rq_samples_for_naarf']) 171 | else: 172 | raise 173 | nystrom_t0 = time.time() 174 | apply_H_list = [HessianWrapper(self.H,dictionary) for dictionary in rq_estimator_dict_list] 175 | [Lmbda, U, all_std_good],[Lmbda_all,U_all,all_std] = variance_based_nystrom(apply_H_list, self.H.dimension,\ 176 | std_tol = self.parameters['nystrom_std_tolerance'],\ 177 | max_vectors = self.parameters['max_vectors_nystrom'],\ 178 | max_bad_vectors=self.parameters['max_bad_vectors_nystrom'],\ 179 | verbose = self.parameters['verbose']) 180 | self._rank = U_all.shape[1] 181 | if self.parameters['verbose']: 182 | print('Nystrom method took ',time.time() - nystrom_t0, 's') 183 | 184 | else: 185 | H = lambda x: self.H(x,hessian_feed_dict,verbose = self.parameters['verbose']) 186 | n = self.problem.dimension 187 | self._rank = self.parameters['hessian_low_rank'] 188 | Lmbda,U = randomized_eigensolver(H, n, self._rank,verbose=False) 189 | 190 | self.eigenvalues = Lmbda 191 | # Log the variance of the last eigenvector 192 | if self.parameters['record_last_rq_std'] : 193 | try: 194 | rq_direction = U[:,-1] 195 | if rq_estimator_dict is None: 196 | rq_estimator_dict_list = 
self.problem._partition_dictionaries(feed_dict,self.parameters['rq_samples_for_naarf']) 197 | elif type(rq_estimator_dict) == list: 198 | rq_estimator_dict_list = rq_estimator_dict 199 | elif type(rq_estimator_dict) == dict: 200 | rq_estimator_dict_list = self.problem._partition_dictionaries(rq_estimator_dict,self.parameters['rq_samples_for_naarf']) 201 | else: 202 | raise 203 | 204 | try: 205 | RQ_samples = np.zeros((len(rq_estimator_dict_list),rq_direction.shape[1])) 206 | except: 207 | RQ_samples = np.zeros(len(rq_estimator_dict_list)) 208 | 209 | for samp_i,sample_dictionary in enumerate(rq_estimator_dict_list): 210 | RQ_samples[samp_i] = self.H.quadratics(rq_direction,sample_dictionary) 211 | self._rq_std = np.std(RQ_samples) 212 | except: 213 | self._rq_std = None 214 | print(80*'#') 215 | print('U is [], taking gradient step, fix this later?'.center(80)) 216 | 217 | # Saddle free inversion via Woodbury 218 | if self.regularization.parameters['gamma'] < 1e-4: 219 | gamma_damping = self.parameters['default_damping'] 220 | # Using this condition instead of fixed gamma allows one to take larger step sizes 221 | # but does not appear to improve accuracy 222 | # gamma_damping = max(0.9*np.abs(Lmbda[-1]),self.parameters['default_damping']) 223 | else: 224 | gamma_damping = self.regularization.parameters['gamma'] 225 | # print('Lmbda[0] = ',Lmbda[0]) 226 | # print('Lmbda[-1] = ',Lmbda[-1]) 227 | # print('gamma_damping = ',gamma_damping) 228 | 229 | Lmbda_abs = np.abs(Lmbda) 230 | Lmbda_diags = diags(Lmbda_abs) 231 | # Build terms for Woodbury inversion 232 | D_denominator = Lmbda_abs + gamma_damping*np.ones_like(Lmbda_abs) 233 | D = np.divide(Lmbda_abs,D_denominator) 234 | # Invert by applying terms in Woodbury formula: 235 | UTg = np.dot(U.T,gradient) 236 | DUTg = np.multiply(D,UTg) 237 | UDUTg = np.dot(U,DUTg) 238 | minus_p = (gradient - UDUTg)/gamma_damping 239 | self.p = -minus_p 240 | 241 | 242 | # Globalization: compute alpha and update the weights 243 | if 
self.parameters['globalization'] is None: 244 | self.alpha = self.parameters['alpha'] 245 | self._sweeps += [1,2*self._rank] 246 | update = self.alpha*self.p 247 | self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:update}) 248 | 249 | elif self.parameters['globalization'] is 'spectral_step': 250 | # self.alpha = min(self.parameters['spectral_step_alpha'],0.1/Lmbda_abs[0]) 251 | self.alpha = min(self.parameters['spectral_step_alpha'],0.1/Lmbda_abs[0]) 252 | self._sweeps += [1,2*self._rank] 253 | update = self.alpha*self.p 254 | self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:update}) 255 | 256 | elif self.parameters['globalization'] == 'line_search': 257 | w_dir_inner_g = np.inner(self.p,gradient) 258 | initial_cost = self.sess.run(self.problem.loss,feed_dict = feed_dict) 259 | cost_at_candidate = lambda p : self._loss_at_candidate(p,feed_dict = feed_dict) 260 | self.alpha, line_search, line_search_iter = ArmijoLineSearch(self.p,w_dir_inner_g,\ 261 | cost_at_candidate, initial_cost, 262 | max_backtracking_iter = self.parameters['max_backtracking_iter']) 263 | update = self.alpha*self.p 264 | self._sweeps += [1+0.5*line_search_iter,2*self._rank] 265 | self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:update}) 266 | 267 | 268 | 269 | 270 | 271 | -------------------------------------------------------------------------------- /applications/transfer_learning/imagenet_cifar10_classification.py: -------------------------------------------------------------------------------- 1 | # This file is part of the hessianlearn package 2 | # 3 | # hessianlearn is free software: you can redistribute it and/or modify 4 | # it under the terms of the GNU Lesser General Public License as published by 5 | # the Free Software Foundation, either version 3 of the License, or any later version. 
#
# hessianlearn is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# If not, see <http://www.gnu.org/licenses/>.
#
# Author: Tom O'Leary-Roseberry
# Contact: tom.olearyroseberry@utexas.edu

# Transfer learning script: fine-tune an (optionally ImageNet pre-trained)
# ResNet50 classifier on CIFAR10, first with a keras optimizer and then with
# a hessianlearn optimizer, logging accuracies after each stage.

import numpy as np
import os
import pickle
# These environment variables are set before tensorflow is imported.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ["KMP_WARNINGS"] = "FALSE"
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import tensorflow as tf
import time
import datetime


# Memory issue with GPUs
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)
# Load hessianlearn library
import sys
sys.path.append( os.environ.get('HESSIANLEARN_PATH', "../../"))
from hessianlearn import *

# Parse run specifications
from argparse import ArgumentParser

parser = ArgumentParser(add_help=True)
parser.add_argument("-optimizer", dest='optimizer',required=False, default = 'lrsfn', help="optimizer type",type=str)
parser.add_argument('-fixed_step',dest = 'fixed_step',\
                    required= False,default = 1,help='boolean for fixed step vs globalization',type = int)
parser.add_argument('-alpha',dest = 'alpha',required = False,default = 1e-5,help= 'learning rate alpha',type=float)
parser.add_argument('-hessian_low_rank',dest = 'hessian_low_rank',required= False,default = 40,help='low rank for sfn',type = int)
parser.add_argument('-record_spectrum',dest = 'record_spectrum',\
                    required= False,default = 0,help='boolean for recording spectrum',type = int)

parser.add_argument("-resnet_weights", dest='resnet_weights',required=False, default = 'imagenet', help="initialization for network weights",type=str)

parser.add_argument('-batch_size',dest = 'batch_size',required= False,default = 32,help='batch size',type = int)
parser.add_argument('-hess_batch_size',dest = 'hess_batch_size',required= False,default = 8,help='hess batch size',type = int)
parser.add_argument('-keras_epochs',dest = 'keras_epochs',required= False,default = 50,help='keras_epochs',type = int)
parser.add_argument("-keras_opt", dest='keras_opt',required=False, default = 'adam', help="optimizer type for keras",type=str)
parser.add_argument('-keras_alpha',dest = 'keras_alpha',required= False,default = 1e-3,help='keras learning rate',type = float)
parser.add_argument('-max_sweeps',dest = 'max_sweeps',required= False,default = 2,help='max sweeps',type = float)

parser.add_argument("-loss_type", dest='loss_type',required=False, default = 'mixed', help="loss type either cross_entrop or mixed",type=str)
parser.add_argument('-seed',dest = 'seed',required= False,default = 0,help='seed',type = int)


args = parser.parse_args()

# Try the legacy seeding call first; fall back to tf.random.set_seed when the
# installed tensorflow does not provide it.
try:
    tf.set_random_seed(args.seed)
except AttributeError:
    tf.random.set_seed(args.seed)

# GPU Environment Details
gpu_available = tf.test.is_gpu_available()
built_with_cuda = tf.test.is_built_with_cuda()
print(80*'#')
print(('IS GPU AVAILABLE: '+str(gpu_available)).center(80))
print(('IS BUILT WITH CUDA: '+str(built_with_cuda)).center(80))
print(80*'#')

settings = {}
# Set run specifications
# Data specs
settings['batch_size'] = args.batch_size
settings['hess_batch_size'] = args.hess_batch_size


################################################################################
# Instantiate data
(x_train, y_train), (_x_test, _y_test) = tf.keras.datasets.cifar10.load_data()

# The images go through the ResNet50 preprocessing rather than a plain /255 scaling.
x_train = tf.keras.applications.resnet50.preprocess_input(x_train)
x_test_full = tf.keras.applications.resnet50.preprocess_input(_x_test)
# First 2000 test images are used for validation, the remainder for testing
x_val = x_test_full[:2000]
x_test = x_test_full[2000:]

y_train = tf.keras.utils.to_categorical(y_train)
y_test_full = tf.keras.utils.to_categorical(_y_test)
y_val = y_test_full[:2000]
y_test = y_test_full[2000:]

################################################################################
# Create the neural network in keras

resnet_input_shape = (200,200,3)
input_tensor = tf.keras.Input(shape = resnet_input_shape)

if args.resnet_weights == 'None':
    pretrained_resnet50 = tf.keras.applications.resnet50.ResNet50(weights = None,include_top=False,input_tensor=input_tensor)
else:
    pretrained_resnet50 = tf.keras.applications.resnet50.ResNet50(weights = 'imagenet',include_top=False,input_tensor=input_tensor)

# Freeze the first 143 layers of the ResNet; only the later layers are trained
for layer in pretrained_resnet50.layers[:143]:
    layer.trainable = False

classifier = tf.keras.models.Sequential()
classifier.add(tf.keras.layers.Input(shape=(32,32,3)))
# Upsample the 32x32 CIFAR images to the ResNet input resolution
classifier.add(tf.keras.layers.Lambda(lambda image: tf.image.resize(image, resnet_input_shape[:2])))
classifier.add(pretrained_resnet50)
classifier.add(tf.keras.layers.Flatten())
classifier.add(tf.keras.layers.BatchNormalization())
classifier.add(tf.keras.layers.Dense(64, activation='relu'))
classifier.add(tf.keras.layers.Dropout(0.5))
classifier.add(tf.keras.layers.BatchNormalization())
classifier.add(tf.keras.layers.Dense(10, activation='softmax'))


if args.keras_opt == 'adam':
    optimizer = tf.keras.optimizers.Adam(learning_rate = args.keras_alpha,epsilon = 1e-8)
elif args.keras_opt == 'sgd':
    optimizer = tf.keras.optimizers.SGD(learning_rate=args.keras_alpha)
else:
    raise ValueError("Unsupported -keras_opt '" + str(args.keras_opt) + "': choose 'adam' or 'sgd'")

# NOTE(review): both losses use from_logits = True although the last layer
# already applies a softmax. Left unchanged so that results stay comparable
# with earlier runs -- confirm whether this is intended.
if args.loss_type == 'mixed':
    def mixed(y_true, y_pred):
        """Per-sample mean squared error plus categorical cross entropy."""
        squared_difference = tf.square(y_true - y_pred)
        return tf.reduce_mean(squared_difference, axis=-1) +tf.keras.losses.CategoricalCrossentropy(from_logits = True)(y_true, y_pred)
    loss = mixed
else:
    loss = tf.keras.losses.CategoricalCrossentropy(from_logits = True)


classifier.compile(optimizer=optimizer,
              loss=loss,
              metrics=['accuracy'])


# Baseline accuracies before any training
loss_train_0, acc_train_0 = classifier.evaluate(x_train,y_train,verbose=2)
print('acc_train = ',acc_train_0)
loss_test_0, acc_test_0 = classifier.evaluate(x_test,y_test,verbose=2)
print('acc_test = ',acc_test_0)
loss_val_0, acc_val_0 = classifier.evaluate(x_val,y_val,verbose=2)
print('acc_val = ',acc_val_0)

aux_keras_data = {'loss_train_0':loss_train_0,'acc_train_0':acc_train_0,\
                  'loss_test_0':loss_test_0,'acc_test_0':acc_test_0,\
                  'loss_val_0':loss_val_0, 'acc_val_0':acc_val_0}
# The misspelled key is kept alongside the corrected one so that anything
# reading previously written pickles by that name keeps working.
aux_keras_data['acc_traun_0'] = acc_train_0

no_callback = True
if no_callback:
    callbacks = []
else:
    callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_acc',restore_best_weights = True)]

keras_directory = 'keras_logging_cifar10/'
# CSV logging
if not os.path.exists(keras_directory):
    os.makedirs(keras_directory)
keras_logger_name = keras_directory+str(datetime.date.today())+args.keras_opt+str(args.keras_alpha)+'_'+str(args.keras_epochs)+'_seed'+str(args.seed)+'.csv'
callbacks.append(tf.keras.callbacks.CSVLogger(keras_logger_name, append=True, separator=';'))

classifier.fit(x_train[:], y_train[:], epochs=args.keras_epochs,batch_size = 32,\
               callbacks = callbacks ,verbose = True,validation_data = (x_val,y_val))


# Grab the weights and check the accuracy post process
set_weights = {}

for layer in classifier.layers:
    set_weights[layer.name] = classifier.get_layer(layer.name).get_weights()

# Post process and save additional information from keras training
loss_test_keras_final, acc_test_keras_final = classifier.evaluate(x_test,y_test,verbose=2)
loss_val_keras_final, acc_val_keras_final = classifier.evaluate(x_val,y_val,verbose=2)
print(80*'#')
print('After keras training'.center(80))
print('acc_test = ',acc_test_keras_final)
print('acc_val = ',acc_val_keras_final)
aux_keras_data['loss_test_final'] = loss_test_keras_final
aux_keras_data['acc_test_final'] = acc_test_keras_final
aux_keras_data['loss_val_final'] = loss_val_keras_final
aux_keras_data['acc_val_final'] = acc_val_keras_final
# Strip the '.csv' extension from the logger name. The previous
# split('.cvs') never matched, so the extension stayed in the pickle name.
keras_aux_logger_name = os.path.splitext(keras_logger_name)[0]+'aux_data.pkl'
with open(keras_aux_logger_name,'wb') as f:
    pickle.dump(aux_keras_data,f,pickle.HIGHEST_PROTOCOL)


################################################################################
# Instantiate the data, problem, regularization.

t0_problem_construction = time.time()
problem = ClassificationProblem(classifier,loss_type=args.loss_type,dtype=tf.float32)
print('Finished constructing the problem, and it took ',time.time() - t0_problem_construction , 's')


# Instantiate the data object
data = Data({problem.x:x_train,problem.y_true:y_train},settings['batch_size'],\
    validation_data = {problem.x:x_val,problem.y_true:y_val},hessian_batch_size = settings['hess_batch_size'],seed=args.seed)

settings['tikhonov_gamma'] = 0.0

regularization = L2Regularization(problem,gamma = settings['tikhonov_gamma'])


################################################################################
# Instantiate the model object
HLModelSettings = HessianlearnModelSettings()

HLModelSettings['optimizer'] = args.optimizer
HLModelSettings['alpha'] = args.alpha
HLModelSettings['globalization'] = None
HLModelSettings['hessian_low_rank'] = args.hessian_low_rank
HLModelSettings['max_backtrack'] = 20
HLModelSettings['max_sweeps'] = args.max_sweeps
# Hand the weights obtained from the keras training over to hessianlearn
HLModelSettings['layer_weights'] = set_weights

HLModelSettings['problem_name'] = 'cifar10_resnet_classification_seed'+str(args.seed)
if args.resnet_weights == 'None':
    HLModelSettings['problem_name'] += '_random_guess'
HLModelSettings['record_spectrum'] = bool(args.record_spectrum)
HLModelSettings['rq_data_size'] = 100
HLModelSettings['printing_sweep_frequency'] = None
HLModelSettings['printing_items'] = {'time':'time','sweeps':'sweeps','Loss':'train_loss','acc train':'train_acc',\
                                    '||g||':'||g||','Loss val':'val_loss','acc val':'val_acc',\
                                    'maxacc val':'max_val_acc','alpha':'alpha'}


HLModel = HessianlearnModel(problem,regularization,data,settings = HLModelSettings)

if args.max_sweeps > 0:
    HLModel.fit()


loss_test_final, acc_test_final = classifier.evaluate(x_test,y_test,verbose=2)
loss_val_final, acc_val_final = classifier.evaluate(x_val,y_val,verbose=2)

hl_aux_data = {'loss_test_0':loss_test_0,'acc_test_0':acc_test_0,\
                'loss_val_0':loss_val_0,'acc_val_0':acc_val_0,\
                'loss_test_final':loss_test_final,'acc_test_final':acc_test_final,\
                'loss_val_final':loss_val_final,'acc_val_final':acc_val_final}

with open(HLModel.settings['problem_name']+'_logging/'+ HLModel.logger_outname +'aux_data.pkl', 'wb') as f:
    pickle.dump(hl_aux_data, f, pickle.HIGHEST_PROTOCOL)

################################################################################
# Evaluate again on all the test data. The full preprocessed test set is
# still in scope, so it is reused instead of loading and preprocessing
# CIFAR10 a second time.
loss_test_total, acc_test_total = classifier.evaluate(x_test_full,y_test_full,verbose=2)
print(80*'#')
print('After hessianlearn training'.center(80))
print('acc_test_total = ',acc_test_total)


# --------------------------------------------------------------------------------
# /applications/transfer_learning/imagenet_cifar100_classification.py:
# --------------------------------------------------------------------------------
# This file is part of the hessianlearn package
#
# hessianlearn is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or any later version.
#
# hessianlearn is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# If not, see .
#
# Author: Tom O'Leary-Roseberry
# Contact: tom.olearyroseberry@utexas.edu

# Transfer-learning driver: a ResNet50 backbone with a small dense head is
# first trained on CIFAR-100 with a keras optimizer, then the resulting weights
# are handed to a hessianlearn optimizer for further training. Accuracy/loss
# snapshots are pickled after each stage.

import numpy as np
import os
import pickle
# The environment variables are set before tensorflow is imported.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ["KMP_WARNINGS"] = "FALSE"
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import pickle
import tensorflow as tf
import time, datetime
# if int(tf.__version__[0]) > 1:
#     import tensorflow.compat.v1 as tf
#     tf.disable_v2_behavior()


# Memory issue with GPUs
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)
# Load hessianlearn library
import sys
sys.path.append( os.environ.get('HESSIANLEARN_PATH', "../../"))
from hessianlearn import *

# Parse run specifications
from argparse import ArgumentParser

parser = ArgumentParser(add_help=True)
parser.add_argument("-optimizer", dest='optimizer',required=False, default = 'lrsfn', help="optimizer type",type=str)
parser.add_argument('-fixed_step',dest = 'fixed_step',
                    required= False,default = 1,help='boolean for fixed step vs globalization',type = int)
parser.add_argument('-alpha',dest = 'alpha',required = False,default = 1e-5,help= 'learning rate alpha',type=float)
parser.add_argument('-hessian_low_rank',dest = 'hessian_low_rank',required= False,default = 40,help='low rank for sfn',type = int)
parser.add_argument('-record_spectrum',dest = 'record_spectrum',
                    required= False,default = 0,help='boolean for recording spectrum',type = int)

parser.add_argument("-resnet_weights", dest='resnet_weights',required=False, default = 'imagenet', help="initialization for network weights",type=str)

parser.add_argument('-batch_size',dest = 'batch_size',required= False,default = 32,help='batch size',type = int)
parser.add_argument('-hess_batch_size',dest = 'hess_batch_size',required= False,default = 8,help='hess batch size',type = int)
parser.add_argument('-keras_epochs',dest = 'keras_epochs',required= False,default = 50,help='keras_epochs',type = int)
parser.add_argument("-keras_opt", dest='keras_opt',required=False, default = 'adam', help="optimizer type for keras",type=str)
parser.add_argument('-keras_alpha',dest = 'keras_alpha',required= False,default = 1e-3,help='keras learning rate',type = float)
parser.add_argument('-max_sweeps',dest = 'max_sweeps',required= False,default = 2,help='max sweeps',type = float)

parser.add_argument("-loss_type", dest='loss_type',required=False, default = 'mixed', help="loss type either cross_entrop or mixed",type=str)
parser.add_argument('-seed',dest = 'seed',required= False,default = 0,help='seed',type = int)


args = parser.parse_args()

# Try the TF1 seeding entry point first and fall back to the TF2 one. Only the
# "attribute does not exist" case is handled, so any other failure surfaces.
try:
    tf.set_random_seed(args.seed)
except AttributeError:
    tf.random.set_seed(args.seed)

# GPU Environment Details
gpu_availabe = tf.test.is_gpu_available()
built_with_cuda = tf.test.is_built_with_cuda()
print(80*'#')
print(('IS GPU AVAILABLE: '+str(gpu_availabe)).center(80))
print(('IS BUILT WITH CUDA: '+str(built_with_cuda)).center(80))
print(80*'#')

settings = {}
# Set run specifications
# Data specs
settings['batch_size'] = args.batch_size
settings['hess_batch_size'] = args.hess_batch_size


################################################################################
# Instantiate data
(x_train, y_train), (_x_test, _y_test) = tf.keras.datasets.cifar100.load_data()

# # Normalize the data
# x_train = x_train.astype('float32') / 255.
# x_test = x_test.astype('float32') / 255.

# The first 2000 samples of the shipped test split are used for validation,
# the remainder for testing.
x_train = tf.keras.applications.resnet50.preprocess_input(x_train)
x_test_full = tf.keras.applications.resnet50.preprocess_input(_x_test)
x_val = x_test_full[:2000]
x_test = x_test_full[2000:]

y_train = tf.keras.utils.to_categorical(y_train)
y_test_full = tf.keras.utils.to_categorical(_y_test)
y_val = y_test_full[:2000]
y_test = y_test_full[2000:]

################################################################################
# Create the neural network in keras

# tf.keras.backend.set_floatx('float64')

resnet_input_shape = (200,200,3)
input_tensor = tf.keras.Input(shape = resnet_input_shape)

if args.resnet_weights == 'None':
    pretrained_resnet50 = tf.keras.applications.resnet50.ResNet50(weights = None,include_top=False,input_tensor=input_tensor)
else:
    pretrained_resnet50 = tf.keras.applications.resnet50.ResNet50(weights = 'imagenet',include_top=False,input_tensor=input_tensor)

# Freeze the first 143 layers of the backbone; the remaining ones stay trainable.
for layer in pretrained_resnet50.layers[:143]:
    layer.trainable = False

classifier = tf.keras.models.Sequential()
classifier.add(tf.keras.layers.Input(shape=(32,32,3)))
# Upsample the 32x32 inputs to the spatial size the backbone was built with.
classifier.add(tf.keras.layers.Lambda(lambda image: tf.image.resize(image, resnet_input_shape[:2])))
classifier.add(pretrained_resnet50)
classifier.add(tf.keras.layers.Flatten())
classifier.add(tf.keras.layers.BatchNormalization())
classifier.add(tf.keras.layers.Dense(128, activation='relu'))
classifier.add(tf.keras.layers.Dropout(0.5))
classifier.add(tf.keras.layers.BatchNormalization())
classifier.add(tf.keras.layers.Dense(100, activation='softmax'))


if args.keras_opt == 'adam':
    optimizer = tf.keras.optimizers.Adam(learning_rate = args.keras_alpha,epsilon = 1e-8)
elif args.keras_opt == 'sgd':
    optimizer = tf.keras.optimizers.SGD(learning_rate=args.keras_alpha)
else:
    # A bare `raise` here has no active exception to re-raise and would only
    # produce an unrelated RuntimeError; report the offending value instead.
    raise ValueError("Unsupported -keras_opt '"+str(args.keras_opt)+"': expected 'adam' or 'sgd'")

# NOTE(review): both losses are built with from_logits = True although the last
# layer above already applies a softmax activation. Left unchanged so results
# stay comparable with earlier runs -- confirm whether this is intended.
if args.loss_type == 'mixed':
    def mixed(y_true, y_pred):
        # Mean squared error over the class axis plus categorical cross entropy.
        squared_difference = tf.square(y_true - y_pred)
        return tf.reduce_mean(squared_difference, axis=-1) +tf.keras.losses.CategoricalCrossentropy(from_logits = True)(y_true, y_pred)
    loss = mixed
else:
    loss = tf.keras.losses.CategoricalCrossentropy(from_logits = True)


classifier.compile(optimizer=optimizer,
                   loss=loss,
                   metrics=['accuracy'])


# Baseline metrics before any training in this script.
loss_train_0, acc_train_0 = classifier.evaluate(x_train,y_train,verbose=2)
print('acc_train = ',acc_train_0)
loss_test_0, acc_test_0 = classifier.evaluate(x_test,y_test,verbose=2)
print('acc_test = ',acc_test_0)
loss_val_0, acc_val_0 = classifier.evaluate(x_val,y_val,verbose=2)
print('acc_val = ',acc_val_0)

# NOTE(review): the key 'acc_traun_0' is misspelled; it is kept byte-for-byte
# because it is part of the pickled output and may be read elsewhere.
aux_keras_data = {'loss_train_0':loss_train_0,'acc_traun_0':acc_train_0,
                  'loss_test_0':loss_test_0,'acc_test_0':acc_test_0,
                  'loss_val_0':loss_val_0, 'acc_val_0':acc_val_0}

no_callback = True
if no_callback:
    callbacks = []
else:
    callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_acc',restore_best_weights = True)]

keras_directory = 'keras_logging_cifar100/'
# CSV logging
if not os.path.exists(keras_directory):
    os.makedirs(keras_directory)
keras_logger_name = keras_directory+str(datetime.date.today())+args.keras_opt+str(args.keras_alpha)+'_'+str(args.keras_epochs)+'_seed'+str(args.seed)+'.csv'
callbacks.append(tf.keras.callbacks.CSVLogger(keras_logger_name, append=True, separator=';'))

classifier.fit(x_train[:], y_train[:], epochs=args.keras_epochs,batch_size = 32,
               callbacks = callbacks ,verbose = True,validation_data = (x_val,y_val))


# Grab the weights and check the accuracy post process
set_weights = {}

for layer in classifier.layers:
    set_weights[layer.name] = classifier.get_layer(layer.name).get_weights()

# Post process and save additional information from keras training
loss_test_keras_final, acc_test_keras_final = classifier.evaluate(x_test,y_test,verbose=2)
loss_val_keras_final, acc_val_keras_final = classifier.evaluate(x_val,y_val,verbose=2)
print(80*'#')
print('After keras training'.center(80))
print('acc_test = ',acc_test_keras_final)
print('acc_val = ',acc_val_keras_final)
aux_keras_data['loss_test_final'] = loss_test_keras_final
aux_keras_data['acc_test_final'] = acc_test_keras_final
aux_keras_data['loss_val_final'] = loss_val_keras_final
aux_keras_data['acc_val_final'] = acc_val_keras_final
# Strip the '.csv' extension of the CSV log before appending the pickle suffix.
# (The separator used to be spelled '.cvs', which never matched, so the pickle
# was written as '<name>.csvaux_data.pkl'.)
keras_aux_logger_name = keras_logger_name.split('.csv')[0]+'aux_data.pkl'
with open(keras_aux_logger_name,'wb+') as f:
    pickle.dump(aux_keras_data,f,pickle.HIGHEST_PROTOCOL)


################################################################################
# Instantiate the data, problem, regularization.

t0_problem_construction = time.time()
problem = ClassificationProblem(classifier,loss_type=args.loss_type,dtype=tf.float32)
print('Finished constructing the problem, and it took ',time.time() - t0_problem_construction , 's')


# Instantiate the data object
data = Data({problem.x:x_train,problem.y_true:y_train},settings['batch_size'],
            validation_data = {problem.x:x_val,problem.y_true:y_val},hessian_batch_size = settings['hess_batch_size'],seed=args.seed)

settings['tikhonov_gamma'] = 0.0

regularization = L2Regularization(problem,gamma = settings['tikhonov_gamma'])


################################################################################
# Instantiate the model object
HLModelSettings = HessianlearnModelSettings()

HLModelSettings['optimizer'] = args.optimizer
HLModelSettings['alpha'] = args.alpha
HLModelSettings['globalization'] = None
HLModelSettings['hessian_low_rank'] = args.hessian_low_rank
HLModelSettings['max_backtrack'] = 20
HLModelSettings['max_sweeps'] = args.max_sweeps
# Start hessianlearn from the weights obtained by the keras training above.
HLModelSettings['layer_weights'] = set_weights

HLModelSettings['problem_name'] = 'cifar100_resnet_classification_seed'+str(args.seed)
if args.resnet_weights == 'None':
    HLModelSettings['problem_name'] += '_random_guess'
HLModelSettings['record_spectrum'] = bool(args.record_spectrum)
HLModelSettings['rq_data_size'] = 100
HLModelSettings['printing_sweep_frequency'] = None
HLModelSettings['printing_items'] = {'time':'time','sweeps':'sweeps','Loss':'train_loss','acc train':'train_acc',
                                     '||g||':'||g||','Loss val':'val_loss','acc val':'val_acc',
                                     'maxacc val':'max_val_acc','alpha':'alpha'}


HLModel = HessianlearnModel(problem,regularization,data,settings = HLModelSettings)

if args.max_sweeps > 0:
    HLModel.fit()


loss_test_final, acc_test_final = classifier.evaluate(x_test,y_test,verbose=2)
loss_val_final, acc_val_final = classifier.evaluate(x_val,y_val,verbose=2)

hl_aux_data = {'loss_test_0':loss_test_0,'acc_test_0':acc_test_0,
               'loss_val_0':loss_val_0,'acc_val_0':acc_val_0,
               'loss_test_final':loss_test_final,'acc_test_final':acc_test_final,
               'loss_val_final':loss_val_final,'acc_val_final':acc_val_final}

with open(HLModel.settings['problem_name']+'_logging/'+ HLModel.logger_outname +'aux_data.pkl', 'wb+') as f:
    pickle.dump(hl_aux_data, f, pickle.HIGHEST_PROTOCOL)

################################################################################
# Evaluate again on all the data.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar100.load_data()

# # Normalize the data
# x_train = x_train.astype('float32') / 255.
# x_test = x_test.astype('float32') / 255.

x_train = tf.keras.applications.resnet50.preprocess_input(x_train)
x_test = tf.keras.applications.resnet50.preprocess_input(x_test)

y_train = tf.keras.utils.to_categorical(y_train)
y_test = tf.keras.utils.to_categorical(y_test)

# This evaluation uses the complete test split, i.e. it includes the 2000
# samples that served as validation data above.
loss_test_total, acc_test_total = classifier.evaluate(x_test,y_test,verbose=2)
print(80*'#')
print('After hessianlearn training'.center(80))
print('acc_test_total = ',acc_test_total)
6 | # 7 | # hessianlearn is distributed in the hope that it will be useful, 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | # GNU Lesser General Public License for more details. 11 | # 12 | # You should have received a copy of the GNU Lesser General Public License 13 | # If not, see . 14 | # 15 | # Author: Tom O'Leary-Roseberry 16 | # Contact: tom.olearyroseberry@utexas.edu 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | import math 22 | import numpy as np 23 | import tensorflow as tf 24 | if int(tf.__version__[0]) > 1: 25 | import tensorflow.compat.v1 as tf 26 | tf.disable_v2_behavior() 27 | 28 | from ..utilities.parameterList import ParameterList 29 | from ..algorithms import Optimizer 30 | from .. problem import IdentityPreconditioner 31 | from ..problem import L2Regularization 32 | from abc import ABC, abstractmethod 33 | 34 | class Identity(object): 35 | def __init__(self): 36 | 37 | pass 38 | 39 | def __call__(self, x): 40 | return x 41 | 42 | 43 | 44 | def ParametersCGSolver(dictionary = {}): 45 | parameters = dictionary 46 | parameters["rel_tolerance"] = [1e-9, "the relative tolerance for the stopping criterion"] 47 | parameters["abs_tolerance"] = [1e-12, "the absolute tolerance for the stopping criterion"] 48 | parameters["max_iter"] = [10, "the maximum number of iterations"] 49 | parameters["zero_initial_guess"] = [True, "if True we start with a 0 initial guess; if False we use the x as initial guess."] 50 | parameters["print_level"] = [-1, "verbosity level: -1 --> no output on screen; 0 --> only final residual at convergence or reason for not not convergence"] 51 | 52 | parameters['coarse_tol'] = [0.5,'coarse tolerance used in calculation of relative tolerances for E-W conditions'] 53 | 54 | parameters['default_damping'] = [1e-3, "Levenberg-Marquardt damping when no 
regularization is used"] 55 | return ParameterList(parameters) 56 | 57 | 58 | class CGSolver(ABC): 59 | """ 60 | This class implements a custom CG solver to be used with Inexact Newton CG 61 | """ 62 | reason = ["Maximum Number of Iterations Reached", 63 | "Relative/Absolute residual less than tol", 64 | "Reached a negative direction", 65 | "Reached trust region boundary" 66 | ] 67 | def __init__(self,problem,regularization,sess = None,Aop = None,preconditioner = None,x = None,parameters = ParametersCGSolver()): 68 | """ 69 | The constructor for this class takes: 70 | -problem: hessianlearn.problem.Problem 71 | -regularization: hessianlearn.problem.Regularization 72 | -sess: tf.Session() 73 | -Aop: matrix vector product callable 74 | -precondition: hessianlearn.problem.Preconditioner 75 | -parameters: solver hyperparameters 76 | """ 77 | self.sess = sess 78 | self.problem = problem 79 | if regularization.parameters['gamma'] < 1e-4: 80 | regularization = L2Regularization(self.problem,gamma = parameters['default_damping']) 81 | self.regularization = regularization 82 | if x is None: 83 | # self.x = tf.Variable(self.problem.gradient.initialized_value()) 84 | self.x = self.problem.gradient 85 | else: 86 | self.x = x 87 | self.parameters = parameters 88 | if Aop is None: 89 | self.Aop = self.problem.Hdw + self.regularization.Hdw 90 | else: 91 | # be careful to note what the operator requires be passed into feed_dict 92 | self.Aop = Aop 93 | # Define preconditioner 94 | if preconditioner is None: 95 | self.Minv = IdentityPreconditioner(problem,self.problem.dtype) 96 | else: 97 | self.Minv = preconditioner 98 | 99 | self.update_x = self.update_without_trust_region 100 | self.B_op = None 101 | 102 | def initialize_trust_region(self,coarse_tol = None): 103 | """ 104 | This method initializes the trust region parameters 105 | -coarse_tol: coarse tolerance 106 | """ 107 | self.update_x = self.update_with_trust_region 108 | if coarse_tol is not None: 109 | 
self.parameters['coarse_tol'] = coarse_tol 110 | 111 | def set_trust_region_radius(self,radius,operator = Identity()): 112 | """ 113 | This method sets the trust region radius when trust region is used 114 | for globalization 115 | -radius: trust region radius 116 | -operator: for use in TR calculations 117 | """ 118 | assert self.parameters['zero_initial_guess'] 119 | self.trust_region_radius_squared = radius**2 120 | self.B_op = operator 121 | 122 | def update_without_trust_region(self,x,alpha,p): 123 | """ 124 | This method updates the approximation of x^* and returns False when 125 | TR is not used 126 | -x: solution at given iteration 127 | -alpha: step length 128 | -p: search direction 129 | """ 130 | x = x + alpha*p 131 | return False, x 132 | 133 | def update_with_trust_region(self,x,alpha,p): 134 | """ 135 | This method returns a Boolean delineating whether the point was placed 136 | on the trust region boundary or not, as well as the updated x 137 | -x: solution at given iteration 138 | -alpha: step length 139 | -p: search direction 140 | """ 141 | step = x + alpha*p 142 | assert self.B_op is not None 143 | step_length = np.dot(x,self.B_op(step)) 144 | if step_length < self.trust_region_radius_squared: 145 | return False, step 146 | else: 147 | # Move the point to the boundary of the trust region 148 | Bp = self.B_op(p) 149 | xBp = np.dot(x,Bp) 150 | pBp = np.dot(p,Bp) 151 | Bx = self.B_op(x) 152 | xBx = np.dot(x,Bx) 153 | a_tau = alpha*alpha*pBp 154 | b_tau = 2* alpha * xBp 155 | c_tau = xBx - self.trust_region_radius_squared 156 | discriminant = (b_tau - 4*a_tau*c_tau) 157 | if discriminant < 0: 158 | print('Issue with the discriminant') 159 | discriminant *= -1 160 | tau = 0.5*(-b_tau + math.sqrt(discriminant))/a_tau 161 | alpha_tau = alpha*tau 162 | return True, x + alpha*p 163 | 164 | def solve(self,b,feed_dict = None,x_0 = None): 165 | r""" 166 | Solve Ax=b by the preconditioned conjugate gradients method 167 | as defined in Iterative Methods Ed. 
2 by Yousef Saad p 263 168 | -b: the right hand side 169 | -feed_dict: the data dictionary used to evaluate stochastic 170 | operators 171 | -x_0: the initial guess for CG 172 | """ 173 | assert self.sess is not None 174 | assert feed_dict is not None 175 | 176 | self.iter = 0 177 | self.converged = False 178 | self.reason_id = 0 179 | x = np.zeros_like(b) 180 | 181 | feed_dict[self.problem.dw] = x 182 | Ax_0 = self.sess.run(self.Aop,feed_dict = feed_dict) 183 | # Calculate initial residual r = Ax_0 -b 184 | r = b - Ax_0 185 | # Apply preconditioner z = M^{-1}r 186 | feed_dict[self.Minv.x] = r 187 | # fix me!!!!! Preconditioner not working for now? 188 | 189 | z = self.sess.run(self.Minv(),feed_dict = feed_dict) 190 | 191 | 192 | # Calculate p (copy array) 193 | p = z.copy() 194 | # Calculate tolerance for Eisenstat Walker conditions 195 | rz_0 = np.dot(r,z) 196 | rtol2 = rz_0 * self.parameters["rel_tolerance"] * self.parameters["rel_tolerance"] 197 | atol2 = self.parameters["abs_tolerance"] * self.parameters["abs_tolerance"] 198 | tol = max(rtol2, atol2) 199 | # Check convergence and initialize for solve: 200 | converged = (rz_0 < tol) 201 | if converged: 202 | self.converged = True 203 | self.reason_id = 1 204 | self.final_norm = math.sqrt(rz_0) 205 | if(self.parameters["print_level"] >= 0): 206 | print( self.reason[self.reason_id]) 207 | print( "Converged in ", self.iter, " iterations with final norm ", self.final_norm) 208 | return x, False 209 | # Check if the direction is negative before taking a step. 
210 | feed_dict[self.problem.dw] = p 211 | Ap = self.sess.run(self.Aop,feed_dict = feed_dict) 212 | pAp = np.dot(p,Ap) 213 | negative_direction = (pAp <= 0.0) 214 | if negative_direction: 215 | self.converged = True 216 | self.reason_id = 2 217 | x += p 218 | r -= Ap 219 | feed_dict[self.Minv.x] = r 220 | z = self.sess.run(self.Minv(),feed_dict = feed_dict) 221 | rz = np.dot(r,z) 222 | self.final_norm = math.sqrt(rz) 223 | if(self.parameters["print_level"] >= 0): 224 | print( self.reason[self.reason_id]) 225 | print( "Converged in ", self.iter, " iterations with final norm ", self.final_norm) 226 | return x, False 227 | 228 | # Loop until convergence 229 | self.iter = 1 230 | while True: 231 | # Calculate alpha 232 | alpha = rz_0/pAp 233 | 234 | # Update x 235 | on_boundary,x = self.update_x(x,alpha,p) 236 | # Update r 237 | 238 | r -= alpha*Ap 239 | # Apply preconditioner z = M^{-1}r 240 | feed_dict[self.Minv.x] = r 241 | z = self.sess.run(self.Minv(),feed_dict = feed_dict) 242 | 243 | # Calculate rz 244 | rz = np.dot(r,z) 245 | # print(self.iter,rz) 246 | # Check convergence 247 | converged = (rz < tol) 248 | if converged: 249 | self.converged = True 250 | self.reason_id = 1 251 | self.final_norm = math.sqrt(rz) 252 | if(self.parameters["print_level"] >= 0): 253 | print( self.reason[self.reason_id]) 254 | print( "Converged in ", self.iter, " iterations with final norm ", self.final_norm) 255 | break 256 | self.iter += 1 257 | if self.iter > self.parameters["max_iter"]: 258 | self.converged = False 259 | self.reason_id = 0 260 | self.final_norm = math.sqrt(rz) 261 | if(self.parameters["print_level"] >= 0): 262 | print( self.reason[self.reason_id]) 263 | print( "Not Converged. Final residual norm ", self.final_norm) 264 | break 265 | beta = rz / rz_0 266 | p = z + beta*p 267 | # Check if the direction is negative, and prepare for next iteration. 
268 | feed_dict[self.problem.dw] = p 269 | Ap = self.sess.run(self.Aop,feed_dict = feed_dict) 270 | pAp = np.dot(p,Ap) 271 | negative_direction = (pAp <= 0.0) 272 | 273 | if negative_direction: 274 | self.converged = True 275 | self.reason_id = 2 276 | self.final_norm = math.sqrt(rz) 277 | if(self.parameters["print_level"] >= 0): 278 | print( self.reason[self.reason_id]) 279 | print( "Converged in ", self.iter, " iterations with final norm ", self.final_norm) 280 | break 281 | 282 | rz_0 = rz 283 | 284 | return x, on_boundary 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | class CGSolver_scipy(ABC): 295 | """ 296 | This class implements a wrapper for the scipy CG solver 297 | """ 298 | reason = ["Maximum Number of Iterations Reached", 299 | "Relative/Absolute residual less than tol", 300 | "Reached a negative direction", 301 | "Reached trust region boundary" 302 | ] 303 | def __init__(self,problem,regularization,sess = None,Aop = None,preconditioner = None,parameters = ParametersCGSolver()): 304 | """ 305 | The constructor for this class takes 306 | -problem: hessianlearn.problem.Problem 307 | -regularization: hessianlearn.problem.Regularization 308 | -sees: tf.Session() 309 | -Aop: matrix vector product callable 310 | -preconditioner: hessianlearn.problem.Preconditioner (not currently even used) 311 | -parameters: solver hyperparameters 312 | """ 313 | self.sess = sess 314 | self.problem = problem 315 | self.regularization = regularization 316 | self.parameters = parameters 317 | if Aop is None: 318 | self.Aop = self.problem.Hdw + self.regularization.Hdw 319 | else: 320 | # be careful to note what the operator requires be passed into feed_dict 321 | self.Aop = Aop 322 | # # Define preconditioner 323 | # if preconditioner is None: 324 | # self.Minv = IdentityPreconditioner(problem,self.problem.dtype) 325 | # else: 326 | # self.Minv = preconditioner 327 | 328 | 329 | 330 | 331 | 332 | 333 | def solve(self,b,feed_dict = None,x_0 = None): 334 | r""" 335 | 
Solve Ax=b by the mines method 336 | as defined in Iterative Methods Ed. 2 by Youssef Saad p 140 337 | -b: right hand side 338 | -feed_dict: data dictionary for 339 | -x_0: initial guess 340 | """ 341 | assert self.sess is not None 342 | assert feed_dict is not None 343 | 344 | self.iter = 0 345 | self.converged = False 346 | self.reason_id = 0 347 | x = np.zeros_like(b) 348 | 349 | feed_dict[self.problem.dw] = x 350 | Ax_0 = self.sess.run(self.Aop,feed_dict = feed_dict) 351 | # Calculate initial residual r = Ax_0 -b 352 | r = b - Ax_0 353 | # Calculate tolerance for Eisenstat Walker conditions 354 | rr_0 = np.dot(r,r) 355 | rtol2 = rr_0 * self.parameters["rel_tolerance"] * self.parameters["rel_tolerance"] 356 | atol2 = self.parameters["abs_tolerance"] * self.parameters["abs_tolerance"] 357 | tol = max(rtol2, atol2) 358 | import scipy 359 | from scipy.sparse.linalg import LinearOperator 360 | 361 | def Ap(p): 362 | feed_dict[self.problem.dw] = p 363 | return self.sess.run(self.Aop,feed_dict = feed_dict) 364 | 365 | n = self.problem.dimension 366 | 367 | A = LinearOperator((n,n), matvec=Ap) 368 | 369 | # self.iter += self.parameters["max_iter"] 370 | 371 | def update_iters(rk): 372 | self.iter +=1 373 | 374 | return scipy.sparse.linalg.cg(A, b, tol=tol, maxiter=self.parameters["max_iter"],callback = update_iters) 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | --------------------------------------------------------------------------------