0.75 and on_boundary:
79 | self.radius *= 2.
80 | # self.delta *= max(2,self.delta_hat)
81 | if rho > self.eta:
82 | accept_step = True
83 | else:
84 | accept_step = False
85 |
86 | return accept_step
87 |
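# A minimal standalone sketch of the acceptance/resizing logic in the
# fragment above, assuming the standard trust-region reduction ratio
# rho = actual_reduction/predicted_reduction; the shrink branch and the
# default eta are illustrative assumptions, not taken from this file.
def tr_accept_and_resize(rho, on_boundary, radius, eta=0.05):
    if rho < 0.25:
        radius *= 0.25                 # poor model agreement: shrink the region
    elif rho > 0.75 and on_boundary:
        radius *= 2.                   # good agreement at the boundary: expand
    accept_step = rho > eta            # accept whenever the ratio beats eta
    return accept_step, radius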
88 |
89 |
90 |
--------------------------------------------------------------------------------
/applications/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | ___ ___ ___ ___ ___ ___
6 | /__/\ / /\ / /\ / /\ ___ / /\ /__/\
7 | \ \:\ / /:/_ / /:/_ / /:/_ / /\ / /::\ \ \:\
8 | \__\:\ / /:/ /\ / /:/ /\ / /:/ /\ / /:/ / /:/\:\ \ \:\
9 | ___ / /::\ / /:/ /:/_ / /:/ /::\ / /:/ /::\ /__/::\ / /:/~/::\ _____\__\:\
10 | /__/\ /:/\:\/__/:/ /:/ /\/__/:/ /:/\:\/__/:/ /:/\:\\__\/\:\__ /__/:/ /:/\:\/__/::::::::\
11 | \ \:\/:/__\/\ \:\/:/ /:/\ \:\/:/~/:/\ \:\/:/~/:/ \ \:\/\\ \:\/:/__\/\ \:\~~\~~\/
12 | \ \::/ \ \::/ /:/ \ \::/ /:/ \ \::/ /:/ \__\::/ \ \::/ \ \:\ ~~~
13 | \ \:\ \ \:\/:/ \__\/ /:/ \__\/ /:/ /__/:/ \ \:\ \ \:\
14 | \ \:\ \ \::/ /__/:/ /__/:/ \__\/ \ \:\ \ \:\
15 | \__\/ \__\/ \__\/ \__\/ \__\/ \__\/
16 |
17 |
18 | ___ ___ ___ ___
19 | / /\ / /\ / /\ /__/\
20 | / /:/_ / /::\ / /::\ \ \:\
21 | ___ ___ / /:/ /\ / /:/\:\ / /:/\:\ \ \:\
22 | /__/\ / /\ / /:/ /:/_ / /:/~/::\ / /:/~/:/ _____\__\:\
23 | \ \:\ / /://__/:/ /:/ /\/__/:/ /:/\:\/__/:/ /:/___/__/::::::::\
24 | \ \:\ /:/ \ \:\/:/ /:/\ \:\/:/__\/\ \:\/:::::/\ \:\~~\~~\/
25 | \ \:\/:/ \ \::/ /:/ \ \::/ \ \::/~~~~ \ \:\ ~~~
26 | \ \::/ \ \:\/:/ \ \:\ \ \:\ \ \:\
27 | \__\/ \ \::/ \ \:\ \ \:\ \ \:\
28 | \__\/ \__\/ \__\/ \__\/
29 |
30 |
31 |
32 | # Transfer Learning
33 |
34 | * Examples of CIFAR10 and CIFAR100 classification from a pre-trained ImageNet ResNet50 model in `transfer_learning/`
35 |
36 | * The pre-trained model serves as a well-conditioned initial guess for transfer learning. In this setting Newton methods perform well due to their excellent local convergence properties. Low Rank Saddle Free Newton (LRSFN) is able to zero in on highly generalizable local minimizers, bypassing indefinite regions (a schematic sketch of the LRSFN update follows the reference below). Below are validation accuracies for the best choices of fixed step length for Adam, SGD and LRSFN with a fixed rank of 40.
37 |
38 |
39 |
40 |
41 |
42 | * For more information see the following manuscript
43 |
44 | - \[2\] O'Leary-Roseberry, T., Alger, N., Ghattas, O.,
45 | [**Low Rank Saddle Free Newton: A Scalable Method for Stochastic Nonconvex Optimization**](https://arxiv.org/abs/2002.02881).
46 | arXiv:2002.02881.
47 | ([Download](https://arxiv.org/pdf/2002.02881.pdf)) BibTeX:
48 | @article{OLearyRoseberryAlgerGhattas2020,
49 | title={Low Rank Saddle Free Newton: Algorithm and Analysis},
50 | author={O'Leary-Roseberry, Thomas and Alger, Nick and Ghattas, Omar},
51 | journal={arXiv preprint arXiv:2002.02881},
52 | year={2020}
53 | }
55 |
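For orientation, here is a minimal numpy sketch of the LRSFN update described above. It assumes a rank-r approximate eigendecomposition H ≈ U diag(lam) U^T of the stochastic Hessian and a Levenberg-Marquardt style damping `gamma`; the function name and the damping scheme are illustrative assumptions, not the package API.

```python
import numpy as np

def lrsfn_step(g, U, lam, gamma=1e-3):
    # Saddle-free modification: take |lambda| to flip negative curvature,
    # then apply (U |Lambda| U^T + gamma*I)^{-1} to -g via the Woodbury identity.
    abs_lam = np.abs(lam)
    D = abs_lam / (abs_lam + gamma)
    return -(g - U @ (D * (U.T @ g))) / gamma
```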
56 |
57 |
58 |
59 |
60 |
--------------------------------------------------------------------------------
/hessianlearn/test/test_HessianlearnModel.py:
--------------------------------------------------------------------------------
1 | # This file is part of the hessianlearn package
2 | #
3 | # hessianlearn is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or any later version.
6 | #
7 | # hessianlearn is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | # GNU Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public License
13 | # If not, see <http://www.gnu.org/licenses/>.
14 | #
15 | # Author: Tom O'Leary-Roseberry
16 | # Contact: tom.olearyroseberry@utexas.edu
17 | from __future__ import absolute_import, division, print_function
18 |
19 | import unittest
20 | import numpy as np
21 | import os
22 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
23 | os.environ['KMP_DUPLICATE_LIB_OK']='True'
24 | os.environ["KMP_WARNINGS"] = "FALSE"
25 |
26 | import tensorflow as tf
27 | if int(tf.__version__[0]) > 1:
28 | import tensorflow.compat.v1 as tf
29 | tf.disable_v2_behavior()
30 |
31 |
32 | import sys
33 | sys.path.append('../../')
34 | from hessianlearn import (HessianlearnModel, HessianlearnModelSettings,
35 | ClassificationProblem,Data, L2Regularization)
36 |
37 | tf.set_random_seed(0)
38 |
39 | class TestHessianlearnModel(unittest.TestCase):
40 |
41 | def test_all_optimizers(self):
42 | # Instantiate data
43 | (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
44 | # Normalize the data
45 | x_train = x_train.astype('float32') / 255.
46 | x_test = x_test.astype('float32') / 255.
47 | def one_hot_vectors(labels_temp):
48 | labels = np.zeros((labels_temp.shape[0],10))
49 | for i,label in enumerate(labels_temp):
50 | labels[i,label] = 1
51 | return labels
52 | y_train = one_hot_vectors(y_train)
53 | y_test = one_hot_vectors(y_test)
54 | # Instantiate neural network
55 | classifier = tf.keras.Sequential([
56 | tf.keras.layers.Flatten(input_shape=(28, 28)),
57 | tf.keras.layers.Dense(128, activation='relu'),
58 | tf.keras.layers.Dense(10)
59 | ])
60 | # Instantiate the problem, regularization.
61 | problem = ClassificationProblem(classifier,loss_type = 'cross_entropy',dtype=tf.float32)
62 | regularization = L2Regularization(problem,gamma =0.)
63 | # Instantiate the data object
64 | train_dict = {problem.x:x_train, problem.y_true:y_train}
65 | validation_dict = {problem.x:x_test, problem.y_true:y_test}
66 | data = Data(train_dict,32,validation_data = validation_dict,hessian_batch_size = 8)
67 | # Instantiate the model object
68 | HLModelSettings = HessianlearnModelSettings()
69 | HLModelSettings['max_sweeps'] = 1.
70 | HLModel = HessianlearnModel(problem,regularization,data,settings = HLModelSettings)
71 |
72 | for optimizer in ['lrsfn','adam','gd','sgd','incg']:
73 | HLModel.settings['optimizer'] = optimizer
74 | if optimizer == 'incg':
75 | HLModel.settings['alpha'] = 1e-4
76 | HLModel.fit()
77 | first_loss = HLModel.logger['train_loss'][0]
78 | last_iteration = max(HLModel.logger['train_loss'].keys())
79 | last_loss = HLModel.logger['train_loss'][last_iteration]
80 | print('first loss = ',first_loss)
81 | print('last_loss = ',last_loss)
82 | assert last_loss < first_loss
83 |
84 |
85 | if __name__ == '__main__':
86 | unittest.main()
--------------------------------------------------------------------------------
/hessianlearn/utilities/finiteDifferenceCheck.py:
--------------------------------------------------------------------------------
1 | # This file is part of the hessianlearn package
2 | #
3 | # hessianlearn is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or any later version.
6 | #
7 | # hessianlearn is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | # GNU Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public License
13 | # If not, see <http://www.gnu.org/licenses/>.
14 | #
15 | # Author: Tom O'Leary-Roseberry
16 | # Contact: tom.olearyroseberry@utexas.edu
17 |
18 | from __future__ import absolute_import, division, print_function
19 | import numpy as np
20 | from numpy.linalg import norm
21 | # import tensorflow as tf
22 | # if int(tf.__version__[0]) > 1:
23 | # import tensorflow.compat.v1 as tf
24 | # tf.disable_v2_behavior()
25 |
26 |
27 | def finite_difference_check(sess,problem, feed_dict, w = None, dw=None,verbose = False):
28 | """
29 | This method implements finite difference checks for a given hessianlearn.problem.Problem
30 | -sess: tf.Session()
31 | -problem: hessianlearn.problem.Problem
32 | -feed_dict: data used for computation of cost, grad and hess
33 | -w: the point the finite difference check is evaluated at
34 | -dw: the direction for the finite difference check
35 | -verbose: Boolean for printing
36 | """
37 |
38 | if w is None:
39 | w = sess.run(problem.w)
40 | # w_zeros = []
41 | # for w_i in w:
42 | # w_zeros.append(np.zeros_like(w))
43 | if dw is None:
44 | dw = []
45 | for w_i in w:
46 | # print('Shape',w_i.shape)
47 | dw.append(np.ones_like(w_i))
48 | # dw = [np.ones_like(w_i) for w_i in w]
49 |
50 | eps = np.power(2., np.linspace(-32, 0, 33))
51 |
52 | initial_loss = sess.run(problem.loss,feed_dict)
53 |
54 |
55 | initial_g = sess.run(problem.gradient,feed_dict)
56 |
57 | feed_dict[problem.dw] = dw
58 | initial_gTdw = np.sum(sess.run(problem._gTdw,feed_dict))
59 |
60 | initial_Hdw = sess.run(problem.Hdw,feed_dict)
61 |
62 | error_g = np.zeros_like(eps)
63 | error_H = np.zeros_like(eps)
64 |
65 | # We will need to modify w during this process so we copy
66 | # the initial values of w so we can replace them later
67 | print('Copying initial w since it will be modified during this check')
68 | w_array = sess.run(problem.w)
69 | w_changed = True
70 |
71 | if verbose:
72 | print('Initial loss:',initial_loss)
73 | # print('Initial gradient:',initial_g)
74 | print('Initial gTdw',initial_gTdw)
75 | print('{0:10} {1:10} {2:10} {3:10}'.format('epsilon','loss','error_g','error_H'))
76 |
77 |
78 | for i in np.arange(eps.shape[0]):
79 |
80 |
81 | eps_i = eps[i]
82 | # Momentarily assign w
83 | # w_update = [eps_i*dw_i for dw_i in dw]
84 | # # w_plus = w + eps_i*dw
85 | # problem._update_w(w_update)
86 | new_w = []
87 | for w_i,dw_i in zip(w,dw):
88 | new_w.append(w_i + eps_i*dw_i)
89 | sess.run(problem._assign_to_w(new_w))
90 | #Evaluate new loss and calculate gradient error
91 | loss_plus = sess.run(problem.loss,feed_dict)
92 | error_g_i = np.abs( (loss_plus - initial_loss)/eps_i - initial_gTdw)
93 | error_g[i] = error_g_i
94 | # Evaluate new gradient and calculate Hessian error
95 | g_plus = sess.run(problem.gradient,feed_dict)
96 | error_H_i_ = []
97 | for g_plus_i,initial_g_i,initial_Hdw_i in zip(g_plus,initial_g,initial_Hdw):
98 | error_H_i_.append((g_plus_i - initial_g_i)/eps_i-initial_Hdw_i)
99 | error_H_i = np.sqrt(np.sum([np.linalg.norm(e)**2 for e in error_H_i_]))
100 | error_H[i] = error_H_i
101 |
102 | if verbose:
103 | print('{0:1.4e} {1:1.4e} {2:1.4e} {3:1.4e}'.format(eps_i,loss_plus,error_g_i,error_H_i))
104 |
105 | if w_changed:
106 | sess.run(problem._assign_to_w(w_array))
107 | print('Successfully re-assigned w')
108 |
109 | out = {}
110 | out['epsilon'] = eps
111 | out['error_g'] = error_g
112 | out['error_H'] = error_H
113 |
114 | return out
115 |
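# A self-contained numpy illustration of the identity this check relies on:
# (f(w + eps*dw) - f(w))/eps -> (g, dw) as eps -> 0. The error first shrinks
# like O(eps) and then grows again once floating point roundoff dominates,
# which is the signature to look for in the table printed above.
if __name__ == '__main__':
    f = lambda w: 0.5*np.dot(w, w)**2
    grad = lambda w: 2.0*np.dot(w, w)*w
    w = np.random.randn(5)
    dw = np.ones(5)
    gTdw = np.dot(grad(w), dw)
    for eps_i in np.power(2., np.linspace(-32, 0, 33)):
        error_g = np.abs((f(w + eps_i*dw) - f(w))/eps_i - gTdw)
        print('{0:1.4e} {1:1.4e}'.format(eps_i, error_g))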
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
--------------------------------------------------------------------------------
/hessianlearn/algorithms/gradientDescent.py:
--------------------------------------------------------------------------------
1 | # This file is part of the hessianlearn package
2 | #
3 | # hessianlearn is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or any later version.
6 | #
7 | # hessianlearn is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | # GNU Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public License
13 | # If not, see <http://www.gnu.org/licenses/>.
14 | #
15 | # Author: Tom O'Leary-Roseberry
16 | # Contact: tom.olearyroseberry@utexas.edu
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import numpy as np
23 |
24 | from ..utilities.parameterList import ParameterList
25 | from ..algorithms import Optimizer
26 | from ..algorithms.globalization import ArmijoLineSearch, TrustRegion
27 | from ..problem import L2Regularization
28 |
29 |
30 |
31 | def ParametersGradientDescent(parameters = {}):
32 | parameters['alpha'] = [1e-3, "Initial steplength, or learning rate"]
33 | parameters['rel_tolerance'] = [1e-3, "Relative convergence when sqrt(g,g)/sqrt(g_0,g_0) <= rel_tolerance"]
34 | parameters['abs_tolerance'] = [1e-4,"Absolute converge when sqrt(g,g) <= abs_tolerance"]
35 | parameters['max_NN_evals_per_batch'] = [10000, "Scale constant for maximum neural network evaluations per datum"]
36 | parameters['max_NN_evals'] = [None, "Maximum number of neural network evaluations"]
37 | parameters['max_backtracking_iter'] = [10, 'max backtracking iterations for line search']
38 |
39 | parameters['globalization'] = [None, 'Choose from trust_region, line_search or none']
40 | # Reasons for convergence failure
41 | parameters['reasons'] = [[], 'list of reasons for termination']
42 |
43 | return ParameterList(parameters)
44 |
45 |
46 | class GradientDescent(Optimizer):
47 | """
48 | This class implements the gradient descent (and stochastic variant) optimizer
49 | """
50 | def __init__(self,problem,regularization,sess = None,feed_dict = None,parameters = ParametersGradientDescent()):
51 | """
52 | The constructor for this class takes:
53 | -problem: hessianlearn.problem.Problem
54 | -regularization: hessianlearn.problem.Regularization
55 | -sess: tf.Session()
56 | -parameters: hyperparameters dictionary
57 | """
58 | if regularization is None:
59 | _regularization = L2Regularization(problem,gamma = 0.0)
60 | else:
61 | _regularization = regularization
62 | super(GradientDescent,self).__init__(problem,_regularization,sess,parameters)
63 |
64 | self.grad = self.problem.gradient + self.regularization.gradient
65 | self._sweeps = np.zeros(2)
66 |
67 | self.trust_region_initialized = False
68 | if self.parameters['globalization'] == 'trust_region':
69 | self.alpha = 0.0
70 | else:
71 | self.alpha = parameters['alpha']
72 |
73 |
74 |
75 |
76 | def minimize(self,feed_dict = None):
77 | r"""
78 | Implements the gradient update:
79 | w-=alpha*g
80 | Takes the parameter:
81 | -feed_dict: data to be used to evaluate stochastic gradient and cost
82 | """
83 | assert self.sess is not None
84 | assert feed_dict is not None
85 |
86 | g = self.sess.run(self.grad,feed_dict = feed_dict)
87 |
88 |
89 | if self.parameters['globalization'] == 'line_search':
90 | w_dir = -g
91 | w_dir_inner_g = np.inner(w_dir,g)
92 | initial_cost = self.sess.run(self.problem.loss, feed_dict)
93 | cost_at_candidate = lambda p : self._loss_at_candidate(p,feed_dict)
94 | self.alpha, line_search, line_search_iter = ArmijoLineSearch(w_dir,w_dir_inner_g,\
95 | cost_at_candidate, initial_cost)
96 | p = self.alpha*w_dir
97 | self._sweeps += [1+0.5*line_search_iter,0]
98 |
99 | else: # fixed steplength; trust region globalization is not implemented here
100 | self.alpha = self.parameters['alpha']
101 | p = -self.parameters['alpha']*g
102 | self._sweeps += [1,0]
103 |
104 | self.p = p
105 |
106 | self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:p})
107 |
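# A minimal backtracking sketch of the Armijo condition used above. This is
# an assumed form: the actual ArmijoLineSearch lives in
# algorithms/globalization and may differ in constants and return values.
# Accept alpha once f(w + alpha*p) <= f(w) + c*alpha*(g, p).
def _armijo_backtrack_sketch(cost_at_candidate, initial_cost, w_dir,
                             w_dir_inner_g, alpha=1.0, c=1e-4,
                             shrink=0.5, max_backtracking_iter=10):
    for it in range(max_backtracking_iter):
        if cost_at_candidate(alpha*w_dir) <= initial_cost + c*alpha*w_dir_inner_g:
            return alpha, True, it    # steplength, success flag, iterations used
        alpha *= shrink
    return alpha, False, max_backtracking_iter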
--------------------------------------------------------------------------------
/hessianlearn/algorithms/adam.py:
--------------------------------------------------------------------------------
1 | # This file is part of the hessianlearn package
2 | #
3 | # hessianlearn is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or any later version.
6 | #
7 | # hessianlearn is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | # GNU Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public License
13 | # If not, see <http://www.gnu.org/licenses/>.
14 | #
15 | # Author: Tom O'Leary-Roseberry
16 | # Contact: tom.olearyroseberry@utexas.edu
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import numpy as np
23 |
24 | from ..utilities.parameterList import ParameterList
25 | from ..algorithms import Optimizer
26 | from ..problem import L2Regularization
27 |
28 |
29 |
30 | def ParametersAdam(parameters = {}):
31 | parameters['alpha'] = [1e-3, "Initial steplength, or learning rate"]
32 | parameters['beta_1'] = [0.9, "Exponential decay rate for first moment"]
33 | parameters['beta_2'] = [0.999, "Exponential decay rate for second moment"]
34 | parameters['epsilon'] = [1e-7, "epsilon for denominator involving square root"]
35 |
36 | parameters['rel_tolerance'] = [1e-3, "Relative convergence when sqrt(g,g)/sqrt(g_0,g_0) <= rel_tolerance"]
37 | parameters['abs_tolerance'] = [1e-4,"Absolute converge when sqrt(g,g) <= abs_tolerance"]
38 | parameters['max_NN_evals_per_batch'] = [10000, "Scale constant for maximum neural network evaluations per datum"]
39 | parameters['max_NN_evals'] = [None, "Maximum number of neural network evaluations"]
40 |
41 | parameters['globalization'] = [None, 'Choose from trust_region, line_search or none']
42 | # Reasons for convergence failure
43 | parameters['reasons'] = [[], 'list of reasons for termination']
44 |
45 |
46 | return ParameterList(parameters)
47 |
48 |
49 | class Adam(Optimizer):
50 | """
51 | This class implements the Adam optimizer
52 | """
53 | def __init__(self,problem,regularization = None,sess = None,feed_dict= None,parameters = ParametersAdam()):
54 | """
55 | The constructor for this class takes:
56 | -problem: hessianlearn.problem.Problem
57 | -regularization: hessianlearn.problem.Regularization
58 | -sess: tf.Session()
59 | -parameters: hyperparameters dictionary
60 | """
61 | if regularization is None:
62 | _regularization = L2Regularization(problem,gamma = 0.0)
63 | else:
64 | _regularization = regularization
65 | super(Adam,self).__init__(problem,_regularization,sess,parameters)
66 |
67 | self.grad = self.problem.gradient + self.regularization.gradient
68 |
69 | self.m = np.zeros(self.problem.dimension)
70 | self.v = np.zeros(self.problem.dimension)
71 | self.p = np.zeros(self.problem.dimension)
72 |
73 | self._iter = 0
74 | self._sweeps = np.zeros(2)
75 |
76 | self.alpha = self.parameters['alpha']
77 |
78 | def minimize(self,feed_dict = None):
79 | r"""
80 | This method implements one step of the Adam algorithm:
81 | -feed_dict: data dictionary used to evaluate gradient
82 | """
83 | assert self.sess is not None
84 | assert feed_dict is not None
85 | self._iter += 1
86 |
87 | # m_hat and v_hat below are already bias corrected, so the raw learning
88 | # rate is used here; rescaling alpha as well would double-count the correction
89 | alpha = self.parameters['alpha']
89 | gradient = self.sess.run(self.grad,feed_dict = feed_dict)
90 |
91 | self.m = self.parameters['beta_1']*self.m + (1-self.parameters['beta_1'])*gradient
92 | m_hat = self.m / (1.0 - self.parameters['beta_1']**self._iter)
93 |
94 | g_sq_vec = np.square(gradient)
95 | self.v = self.parameters['beta_2']*self.v + (1-self.parameters['beta_2'])*g_sq_vec
96 | v_hat = self.v / (1.0 - self.parameters['beta_2']**self._iter)
97 | v_root = np.sqrt(v_hat)
98 |
99 |
100 | update = -alpha*m_hat/(v_root +self.parameters['epsilon'])
101 | self.p = update
102 | self._sweeps += [1,0]
103 | self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:update})
104 |
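# A quick standalone check of the bias correction used above: for a constant
# gradient g the corrected first moment m_hat recovers g exactly at every
# step, which is the point of dividing by (1 - beta_1**t).
if __name__ == '__main__':
    beta_1, g_const = 0.9, 2.5
    m = 0.0
    for t in range(1, 6):
        m = beta_1*m + (1 - beta_1)*g_const
        m_hat = m / (1.0 - beta_1**t)
        assert abs(m_hat - g_const) < 1e-12
    print('bias-corrected first moment recovers a constant gradient exactly')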
105 |
106 |
--------------------------------------------------------------------------------
/hessianlearn/data/lfw.py:
--------------------------------------------------------------------------------
1 | # This file is part of the hessianlearn package
2 | #
3 | # hessianlearn is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or any later version.
6 | #
7 | # hessianlearn is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | # GNU Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public License
13 | # If not, see <http://www.gnu.org/licenses/>.
14 | #
15 | # Author: Tom O'Leary-Roseberry
16 | # Contact: tom.olearyroseberry@utexas.edu
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import os
23 | import sys
24 |
25 |
26 | import numpy as np
27 | from scipy import signal
28 | import random
29 | from ..data.data import *
30 |
31 | import math
32 | import time
33 |
34 | # from statsmodels import robust
35 |
36 | def dir_check(dir):
37 | try:
38 | os.stat(dir)
39 | except:
40 | os.mkdir(dir)
41 |
42 | def reporthook(count, block_size, total_size):
43 | global start_time
44 | if count == 0:
45 | start_time = time.time()
46 | return
47 | duration = time.time() - start_time
48 | progress_size = int(count * block_size)
49 | speed = int(progress_size / (1024 * duration))
50 | percent = int(count * block_size * 100 / total_size)
51 | sys.stdout.write("\r...%d%%, %d MB, %d KB/s, %d seconds passed" %
52 | (percent, progress_size / (1024 * 1024), speed, duration))
53 | sys.stdout.flush()
54 |
55 |
56 |
57 | def load_lfw():
58 | try:
59 | # read from file
60 | images = np.load('lfw_all_images.npy')
61 | labels = np.load('lfw_all_labels.npy')
62 | print('Loaded successfully locally')
63 | return [images, labels]
64 |
65 | except:
66 | # write to file
67 | print(80*'#')
68 | print('Did not load locally.')
69 | print(80*'#')
70 | try:
71 | os.stat("lfw.tgz")
72 | except:
73 | print('Downloading from source, and saving to disk.')
74 | print(80*'#')
75 | import urllib.request
76 | urllib.request.urlretrieve("http://vis-www.cs.umass.edu/lfw/lfw.tgz", "lfw.tgz",reporthook)
77 | folder_name = 'lfw/'
78 | try:
79 | os.stat(folder_name)
80 | except:
81 | try:
82 | import subprocess
83 | subprocess.run(['tar','zxvf',"lfw.tgz"])
84 | except:
85 | pass
86 | import shutil
87 | folder_names = os.listdir(folder_name)
88 | n_folders = len(folder_names)
89 | print(n_folders,'folders found')
90 | if not os.path.isdir('lfw_all_images'):
91 | os.mkdir('lfw_all_images')
92 | print('Making directory lfw_all_images/')
93 | for folder in os.listdir('lfw/'):
94 | for file in os.listdir('lfw/'+folder):
95 | if not os.path.isfile('lfw_all_images/'+file):
96 | shutil.move('lfw/'+folder+'/'+file,'lfw_all_images/')
97 | print('Moving ',file,'to lfw_all_images')
98 |
99 | file_names = os.listdir('lfw_all_images')
100 | n_files = len(file_names)
101 |
102 | images = np.empty(shape = (n_files,250,250,3))
103 |
104 | from keras.preprocessing import image
105 | for file,counter in zip(file_names,range(n_files)):
106 | img = image.load_img('lfw_all_images/'+file)
107 | images[counter,:,:,:] = image.img_to_array(img)
108 | labels = np.array(file_names)
109 | assert(labels.shape[0]==images.shape[0])
110 | print(labels.shape)
111 | images = np.array(images)
112 | np.save('lfw_all_images.npy',images)
113 | np.save('lfw_all_labels.npy',labels)
114 | print('Saved locally')
115 | return [images,labels]
116 |
117 |
118 | # def view_random_pair(self):
119 | # try:
120 | # labelkey = ['Airplane','Automobile','Bird','Cat','Deer','Dog','Frog','Horse','Ship','Truck']
121 | # i = np.random.choice(range(60000))
122 | # index = self.all_data[1][i]
123 | # label = labelkey[index]
124 | # import matplotlib.pyplot as plt
125 | # fig, ax = plt.subplots(figsize = (3,3))
126 | # ax.set_title(str(label))
127 | # data = self.all_data[0][i,:,:,:].astype(np.uint8)
128 | # ax.imshow(data)
129 | # plt.show()
130 | # except:
131 | # pass
132 |
133 |
134 |
--------------------------------------------------------------------------------
/hessianlearn/algorithms/optimizer.py:
--------------------------------------------------------------------------------
1 | # This file is part of the hessianlearn package
2 | #
3 | # hessianlearn is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or any later version.
6 | #
7 | # hessianlearn is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | # GNU Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public License
13 | # If not, see <http://www.gnu.org/licenses/>.
14 | #
15 | # Author: Tom O'Leary-Roseberry
16 | # Contact: tom.olearyroseberry@utexas.edu
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | from abc import ABC, abstractmethod
23 | import numpy as np
24 |
25 | from ..utilities.parameterList import ParameterList
26 | from ..problem import Hessian
27 |
28 | def ParametersOptimizer(dictionary = {}):
29 | parameters = dictionary
30 | parameters['alpha'] = [1.0, "Initial steplength, or learning rate"]
31 | parameters['rel_tolerance'] = [1e-3, "Relative convergence when sqrt(g,g)/sqrt(g_0,g_0) <= rel_tolerance"]
32 | parameters['abs_tolerance'] = [1e-4,"Absolute converge when sqrt(g,g) <= abs_tolerance"]
33 | parameters['globalization'] = [None, 'Choose from trust_region, line_search or none']
34 |
35 |
36 | return ParameterList(parameters)
37 |
38 |
39 | class Optimizer(ABC):
40 | """
41 | This class describes the optimizer used during training
42 |
43 | All children must implement the method minimize, which implements
44 | one step of the optimizers weight update scheme
45 | """
46 | def __init__(self,problem = None,regularization = None, sess = None,parameters = ParametersOptimizer(),comm = None):
47 | """
48 | The constructor for this class takes:
49 | -problem: hessianlearn.problem.Problem class
50 | -regularization: hessianlearn.problem.Regularization class
51 | -sess: the tf.Session() used to evaluate the computational graph
52 | -parameters: the dictionary of hyperparameters for the optimizer.
53 | """
54 | self._problem = problem
55 | self._regularization = regularization
56 | self._sess = sess
57 | self._parameters = parameters
58 | self._sweeps = 0
59 | self._comm = comm
60 | self._iter = 0
61 | self.H = Hessian(problem=problem,sess=sess)
62 |
63 | @property
64 | def problem(self):
65 | return self._problem
66 |
67 | @property
68 | def sess(self):
69 | return self._sess
70 |
71 | @property
72 | def parameters(self):
73 | return self._parameters
74 |
75 | @property
76 | def sweeps(self):
77 | return self._sweeps
78 |
79 | @property
80 | def comm(self):
81 | return self._comm
82 |
83 | @property
84 | def iter(self):
85 | return self._iter
86 |
87 | @property
88 | def regularization(self):
89 | return self._regularization
90 |
91 | @property
92 | def set_sess(self):
93 | return self._set_sess
94 |
95 |
96 | def _set_sess(self,sess):
97 | r"""
98 | Sets the tf.Session()
99 | """
100 | self._sess = sess
101 | if 'H' in dir(self):
102 | self.H._sess = sess
103 |
104 | def minimize(self):
105 | r"""
106 | Implements update rule for the algorithm.
107 | """
108 | raise NotImplementedError("Child class should implement method minimize")
109 |
110 | def initialize_trust_region(self):
111 | r"""
112 | Initializes trust region parameters
113 | """
114 | raise NotImplementedError("Child class should implement method initialize_trust_region")
115 |
116 |
117 |
118 | def _loss_at_candidate(self,p,feed_dict):
119 | """
120 | This method implements a function to assist with Armijo line search
121 | -p: candidate update to be evaluated in Armijo line search producedure
122 | -feed_dict: data dictionary used to evaluate cost at candidate
123 | """
124 | self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:p})
125 | # self.sess.run(self.problem._update_w(p))
126 | misfit = self.sess.run((self.problem.loss),feed_dict)
127 | self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:-p})
128 | # self.sess.run(self.problem._update_w(-p))
129 | return misfit
130 |
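# A hypothetical minimal subclass sketch (not part of the package) showing
# the contract children must satisfy: implement minimize(), which pushes one
# additive weight update through problem._update_ops.
class _FixedStepGradientSketch(Optimizer):
    def minimize(self, feed_dict=None):
        g = self.sess.run(self.problem.gradient, feed_dict=feed_dict)
        update = -self.parameters['alpha']*g
        self.sess.run(self.problem._update_ops,
                      feed_dict={self.problem._update_placeholder: update})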
131 |
132 |
133 |
--------------------------------------------------------------------------------
/hessianlearn/algorithms/gmresSolver.py:
--------------------------------------------------------------------------------
1 | # This file is part of the hessianlearn package
2 | #
3 | # hessianlearn is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or any later version.
6 | #
7 | # hessianlearn is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | # GNU Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public License
13 | # If not, see <http://www.gnu.org/licenses/>.
14 | #
15 | # Author: Tom O'Leary-Roseberry
16 | # Contact: tom.olearyroseberry@utexas.edu
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 | import math
22 | import numpy as np
23 | import tensorflow as tf
24 | if int(tf.__version__[0]) > 1:
25 | import tensorflow.compat.v1 as tf
26 | tf.disable_v2_behavior()
27 |
28 | from ..utilities.parameterList import ParameterList
29 | from ..algorithms import Optimizer
30 | from .. problem import IdentityPreconditioner
31 | from ..problem import L2Regularization
32 | from abc import ABC, abstractmethod
33 |
34 | class Identity(object):
35 | def __init__(self):
36 |
37 | pass
38 |
39 | def __call__(self, x):
40 | return x
41 |
42 |
43 |
44 | def ParametersGMRESSolver(dictionary = {}):
45 | parameters = dictionary
46 | parameters["rel_tolerance"] = [1e-9, "the relative tolerance for the stopping criterion"]
47 | parameters["abs_tolerance"] = [1e-12, "the absolute tolerance for the stopping criterion"]
48 | parameters["max_iter"] = [20, "the maximum number of iterations"]
49 | parameters["zero_initial_guess"] = [True, "if True we start with a 0\
50 | initial guess; if False we use the x as initial guess."]
51 | parameters["print_level"] = [-1, "verbosity level: -1 --> no output on \
52 | screen; 0 --> only final residual at convergence or reason for not not convergence"]
53 |
54 | parameters['coarse_tol'] = [0.5,'coarse tolerance used in calculation \
55 | of relative tolerances for E-W conditions']
56 | return ParameterList(parameters)
57 |
58 |
59 | class GMRESSolver(ABC):
60 | """
61 | This class implements a GMRES solver
62 | """
63 | reason = ["Maximum Number of Iterations Reached",
64 | "Relative/Absolute residual less than tol",
65 | "Reached a negative direction",
66 | "Reached trust region boundary"
67 | ]
68 | def __init__(self,problem,regularization,sess = None,preconditioner = None,\
69 | x = None,parameters = ParametersGMRESSolver()):
70 | self.sess = sess
71 | self.problem = problem
72 | self.regularization = regularization
73 | if x is None:
74 | # self.x = tf.Variable(self.problem.gradient.initialized_value())
75 | self.x = self.problem.gradient
76 | else:
77 | self.x = x
78 | self.parameters = parameters
79 |
80 |
81 | self.Aop = self.problem.Hdw + self.regularization.Hdw
82 |
83 | # # Define preconditioner
84 | # if preconditioner is None:
85 | # self.Minv = IdentityPreconditioner(problem,self.problem.dtype)
86 | # else:
87 | # self.Minv = preconditioner
88 |
89 |
90 |
91 |
92 |
93 |
94 | def solve(self,b,feed_dict = None,x_0 = None):
95 | r"""
96 | Solve Ax=b by the GMRES method,
97 | here delegated to scipy.sparse.linalg.gmres
98 | """
99 | assert self.sess is not None
100 | assert feed_dict is not None
101 |
102 | self.iter = 0
103 | self.converged = False
104 | self.reason_id = 0
105 | x = np.zeros_like(b)
106 |
107 | feed_dict[self.problem.dw] = x
108 | Ax_0 = self.sess.run(self.Aop,feed_dict = feed_dict)
109 | # Calculate initial residual r = Ax_0 -b
110 | r = b - Ax_0
111 | # Calculate tolerance for Eisenstat Walker conditions
112 | rr_0 = np.dot(r,r)
113 | rtol2 = rr_0 * self.parameters["rel_tolerance"] * self.parameters["rel_tolerance"]
114 | atol2 = self.parameters["abs_tolerance"] * self.parameters["abs_tolerance"]
115 | tol = max(rtol2, atol2)
116 | import scipy
117 | from scipy.sparse.linalg import LinearOperator
118 |
119 | def Ap(p):
120 | feed_dict[self.problem.dw] = p
121 | return self.sess.run(self.Aop,feed_dict = feed_dict)
122 |
123 | n = self.problem.dimension
124 |
125 | A = LinearOperator((n,n), matvec=Ap)
126 |
127 | # self.iter += self.parameters["max_iter"]
128 |
129 | def update_iters(rk):
130 | self.iter +=1
131 |
132 | return scipy.sparse.linalg.gmres(A, b, tol=tol, maxiter=self.parameters["max_iter"],callback = update_iters)
133 |
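# A standalone sketch of the LinearOperator + gmres pattern used above, with
# a plain diagonal matrix standing in for the Hessian action (the matrix and
# names here are illustrative only). gmres returns (x, info), where info == 0
# means the requested tolerance was met.
if __name__ == '__main__':
    from scipy.sparse.linalg import LinearOperator, gmres
    n = 100
    A_mat = np.diag(np.linspace(1., 10., n))
    A_op = LinearOperator((n, n), matvec=lambda p: A_mat.dot(p))
    b = np.ones(n)
    x, info = gmres(A_op, b, maxiter=20)
    print('converged' if info == 0 else 'not converged',
          '| residual:', np.linalg.norm(b - A_mat.dot(x)))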
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
--------------------------------------------------------------------------------
/applications/mnist/mnist_autoencoder.py:
--------------------------------------------------------------------------------
1 | # This file is part of the hessianlearn package
2 | #
3 | # hessianlearn is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or any later version.
6 | #
7 | # hessianlearn is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | # GNU Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public License
13 | # If not, see <http://www.gnu.org/licenses/>.
14 | #
15 | # Author: Tom O'Leary-Roseberry
16 | # Contact: tom.olearyroseberry@utexas.edu
17 |
18 | ################################################################################
19 | # Uses some code from https://blog.keras.io/building-autoencoders-in-keras.html
20 | ################################################################################
21 |
22 | import numpy as np
23 | import os
24 | import tensorflow as tf
25 | import time
26 | # if int(tf.__version__[0]) > 1:
27 | # import tensorflow.compat.v1 as tf
28 | # tf.disable_v2_behavior()
29 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
30 | os.environ['KMP_DUPLICATE_LIB_OK']='True'
31 | os.environ["KMP_WARNINGS"] = "FALSE"
32 | import sys
33 | sys.path.append( os.environ.get('HESSIANLEARN_PATH', "../../"))
34 | from hessianlearn import *
35 |
36 | tf.set_random_seed(0)
37 |
38 | settings = {}
39 | # Set run specifications
40 | # Data specs
41 | settings['batch_size'] = 100
42 | settings['hess_batch_size'] = 10
43 |
44 |
45 | ################################################################################
46 | # Instantiate data
47 | (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
48 |
49 |
50 | # Normalize the data
51 | x_train = x_train.astype('float32') / 255.
52 | x_test = x_test.astype('float32') / 255.
53 | # Reshape the data
54 | x_train = x_train.reshape((len(x_train), np.prod(x_train.shape[1:])))
55 | x_test = x_test.reshape((len(x_test), np.prod(x_test.shape[1:])))
56 |
57 | # Instantiate the data object
58 | data = Data([x_train,y_train],settings['batch_size'],test_data = [x_test,y_test],hessian_batch_size = settings['hess_batch_size'])
59 |
60 | # settings['input_shape'] = data._input_shape
61 | # settings['output_shape'] = data._output_shape
62 |
63 |
64 | ################################################################################
65 | # Create the neural network in keras
66 |
67 | encoding_dim = 32
68 | input_img = tf.keras.layers.Input(shape=(784,))
69 | encoded = tf.keras.layers.Dense(encoding_dim, activation='softplus')(input_img)
70 | decoded = tf.keras.layers.Dense(784, activation='sigmoid')(encoded)
71 | autoencoder = tf.keras.models.Model(input_img, decoded)
72 |
73 |
74 | ################################################################################
75 | # Instantiate the problem, regularization.
76 |
77 | problem = AutoencoderProblem(autoencoder,dtype=tf.float32)
78 |
79 | settings['tikhonov_gamma'] = 0.0
80 |
81 | regularization = L2Regularization(problem,gamma = settings['tikhonov_gamma'])
82 |
83 |
84 | ################################################################################
85 | # Instantiate the model object
86 | HLModelSettings = HessianlearnModelSettings()
87 |
88 | HLModelSettings['optimizer'] = 'lrsfn'
89 | HLModelSettings['alpha'] = 1e-2
90 | HLModelSettings['globalization'] = 'line_search'
91 | HLModelSettings['hessian_low_rank'] = 20
92 | HLModelSettings['max_backtrack'] = 16
93 | HLModelSettings['max_sweeps'] = 50
94 |
95 | HLModelSettings['problem_name'] = 'mnist_ae'
96 | HLModelSettings['record_spectrum'] = False
97 | HLModelSettings['rq_data_size'] = 100
98 |
99 |
100 | HLModel = HessianlearnModel(problem,regularization,data,settings = HLModelSettings)
101 |
102 | HLModel.fit()
103 |
104 | ################################################################################
105 | # Postprocessing with the trained autoencoder
106 |
107 | encoder = tf.keras.models.Model(input_img, encoded)
108 |
109 | encoded_input = tf.keras.layers.Input(shape=(encoding_dim,))
110 |
111 | decoder_layer = autoencoder.layers[-1]
112 |
113 | decoder = tf.keras.models.Model(encoded_input, decoder_layer(encoded_input))
114 |
115 | encoded_imgs = encoder.predict(x_test)
116 | decoded_imgs = decoder.predict(encoded_imgs)
117 |
118 | try:
119 | import matplotlib.pyplot as plt
120 |
121 | n = 10 # how many digits we will display
122 | plt.figure(figsize=(20, 4))
123 | for i in range(n):
124 | # display original
125 | ax = plt.subplot(2, n, i + 1)
126 | plt.imshow(x_test[i].reshape(28, 28))
127 | plt.gray()
128 | ax.get_xaxis().set_visible(False)
129 | ax.get_yaxis().set_visible(False)
130 |
131 | # display reconstruction
132 | ax = plt.subplot(2, n, i + 1 + n)
133 | plt.imshow(decoded_imgs[i].reshape(28, 28))
134 | plt.gray()
135 | ax.get_xaxis().set_visible(False)
136 | ax.get_yaxis().set_visible(False)
137 | plt.show()
138 | except:
139 | pass
140 |
141 |
--------------------------------------------------------------------------------
/hessianlearn/test/test_varianceBasedNystrom.py:
--------------------------------------------------------------------------------
1 | # This file is part of the hessianlearn package
2 | #
3 | # hessianlearn is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or any later version.
6 | #
7 | # hessianlearn is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | # GNU Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public License
13 | # If not, see <http://www.gnu.org/licenses/>.
14 | #
15 | # Authors: Nick Alger, Tom O'Leary-Roseberry
16 | # Contact: tom.olearyroseberry@utexas.edu
17 | from __future__ import absolute_import, division, print_function
18 |
19 | import unittest
20 | import numpy as np
21 | import sys
22 |
23 | sys.path.append('../../')
24 | from hessianlearn import (variance_based_nystrom)
25 | sys.path.append('../algorithms')
26 | from varianceBasedNystrom import *
27 |
28 | def make_random_symmetric_matrix(n,p):
29 | U, _ = np.linalg.qr(np.random.randn(n,n))
30 | ss = np.random.randn(n)**p
31 | A = np.dot(U, np.dot(np.diag(ss), U.T))
32 | return A
33 |
34 |
35 | def compute_Theta_slow(Q, apply_AA):
36 | r = Q.shape[1]
37 | m = len(apply_AA)
38 | Theta_true = np.zeros((r, r, m))
39 | for i in range(r):
40 | for j in range(r):
41 | for k in range(m):
42 | Theta_true[i,j,k] = np.dot(Q[:,i], apply_AA[k](Q[:,j]))
43 | return Theta_true
44 |
45 | def compute_rayleigh_statistics_slow(U, apply_AA):
46 | m = len(apply_AA)
47 | r = U.shape[1]
48 | C = np.zeros((r, m))
49 | for k in range(m):
50 | for i in range(r):
51 | C[i,k] = np.dot(U[:,i], apply_AA[k](U[:,i]))
52 |
53 | all_mu = np.mean(C, axis=1)
54 | all_std = np.std(C, axis=1)
55 | return all_mu, all_std
56 |
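# A standalone sketch of the randomized eigendecomposition idea the helpers
# above verify (in the style of Halko et al.): project A onto an approximate
# range basis Q and diagonalize the small matrix Q^T A Q.
def randomized_eigh_sketch(A, r):
    Omega = np.random.randn(A.shape[0], r)
    Q, _ = np.linalg.qr(np.dot(A, Omega))     # approximate range of A
    T = np.dot(Q.T, np.dot(A, Q))             # small r x r projection
    dd, V = np.linalg.eigh(T)
    return dd, np.dot(Q, V)                   # approximate eigenpairs of A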
57 |
58 | class TestVarianceBasedNystrom(unittest.TestCase):
59 |
60 | def setUp(self):
61 | self.n = 500
62 | m = 50
63 | p = 7
64 | self.batch_r = 10
65 | randomness_factor = 0.1
66 |
67 | A0 = make_random_symmetric_matrix(self.n,p)
68 | AA = [A0 + randomness_factor * make_random_symmetric_matrix(self.n,p) for _ in range(m)]
69 |
70 | self.apply_AA = [lambda x, Ak=Ak: np.dot(Ak,x) for Ak in AA]
71 |
72 | self.A = np.sum(AA, axis=0)/m
73 |
74 |
75 |
76 |
77 |
78 | def test_all(self):
79 | Y = get_random_range_vectors(self.apply_AA, self.n, self.batch_r)
80 | Q,_ = np.linalg.qr(Y)
81 | Theta = compute_Theta(Q, self.apply_AA)
82 | Theta_true = compute_Theta_slow(Q, self.apply_AA)
83 | err_Theta = np.linalg.norm(Theta - Theta_true)/np.linalg.norm(Theta_true)
84 | print('err_Theta=', err_Theta)
85 | assert err_Theta < 1e-10
86 |
87 | dd, U, V = finish_computing_eigenvalue_decomposition(Q, Theta)
88 |
89 | A_approx = np.dot(U, np.dot(np.diag(dd), U.T))
90 | err_A_1 = np.linalg.norm(self.A - A_approx)/np.linalg.norm(self.A)
91 | print('err_A_1=', err_A_1)
92 | assert err_A_1 < 1.0
93 |
94 | # Errors in computing statistics
95 | all_mu, all_std = compute_rayleigh_statistics(Theta, V)
96 |
97 | all_mu_true, all_std_true = compute_rayleigh_statistics_slow(U,self.apply_AA)
98 |
99 | err_mu = np.linalg.norm(all_mu - all_mu_true)/np.linalg.norm(all_mu_true)
100 | err_std = np.linalg.norm(all_std - all_std_true)/np.linalg.norm(all_std_true)
101 |
102 | print('err_mu=', err_mu)
103 | print('err_std=', err_std)
104 | assert err_mu < 1e-10
105 | assert err_std < 1e-10
106 |
107 | # Redo computations with better range approximation
108 | Y2 = get_random_range_vectors(self.apply_AA, self.n, self.batch_r)
109 | Y2_perp = Y2 - np.dot(Q,np.dot(Q.T, Y2))
110 | Q2,_ = np.linalg.qr(Y2_perp)
111 | Q_new = np.hstack([Q, Q2])
112 | err_Q_orth = np.linalg.norm(np.dot(Q_new.T, Q_new) - np.eye(Q_new.shape[1]))
113 | print('err_Q_orth=', err_Q_orth)
114 | assert err_Q_orth < 1e-10
115 |
116 | Theta_new = update_Theta(Q, Q2, Theta, self.apply_AA)
117 |
118 | Theta_true_new = compute_Theta_slow(Q_new, self.apply_AA)
119 |
120 | err_Theta_new = np.linalg.norm(Theta_new - Theta_true_new)/np.linalg.norm(Theta_true_new)
121 | print('err_Theta_new=', err_Theta_new)
122 |
123 | assert err_Theta_new < 1e-10
124 |
125 | dd_new, U_new, V_new = finish_computing_eigenvalue_decomposition(Q_new, Theta_new)
126 | A_approx_new = np.dot(U_new, np.dot(np.diag(dd_new), U_new.T))
127 | err_A_new = np.linalg.norm(self.A - A_approx_new)/np.linalg.norm(self.A)
128 | print('err_A_new=', err_A_new)
129 |
130 | # The approximation error should decrease monotonically as we increase the range
131 | assert err_A_new < err_A_1
132 |
133 | # Run the complete method from scratch
134 |
135 | [dd_good, U_good, all_std_good], [dd_all,U_all,all_std] = variance_based_nystrom(self.apply_AA, self.n)
136 |
137 | A_good_approx = np.dot(U_good, np.dot(np.diag(dd_good), U_good.T))
138 | err_A_good = np.linalg.norm(A_good_approx - self.A)/np.linalg.norm(self.A)
139 | print('err_A_good=', err_A_good)
140 | assert err_A_good < 0.1
141 |
142 |
143 |
144 | if __name__ == '__main__':
145 | unittest.main()
--------------------------------------------------------------------------------
/hessianlearn/algorithms/inexactNewtonMINRES.py:
--------------------------------------------------------------------------------
1 | # This file is part of the hessianlearn package
2 | #
3 | # hessianlearn is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or any later version.
6 | #
7 | # hessianlearn is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | # GNU Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public License
13 | # If not, see <http://www.gnu.org/licenses/>.
14 | #
15 | # Author: Tom O'Leary-Roseberry
16 | # Contact: tom.olearyroseberry@utexas.edu
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import numpy as np
23 |
24 | from ..utilities.parameterList import ParameterList
25 | from ..algorithms import Optimizer, MINRESSolver, ParametersMINRESSolver
26 | from ..algorithms.globalization import ArmijoLineSearch, TrustRegion
27 | from ..problem import L2Regularization
28 |
29 |
30 |
31 |
32 | def ParametersInexactNewtonMINRES(parameters = {}):
33 | parameters['alpha'] = [1e-1, "Initial steplength, or learning rate"]
34 | parameters['rel_tolerance'] = [1e-3, "Relative convergence when sqrt(g,g)/sqrt(g_0,g_0) <= rel_tolerance"]
35 | parameters['abs_tolerance'] = [1e-4,"Absolute converge when sqrt(g,g) <= abs_tolerance"]
36 | parameters['max_NN_evals_per_batch'] = [20, "Scale constant for maximum neural network evaluations per datum"]
37 | parameters['max_NN_evals'] = [None, "Maximum number of neural network evaluations"]
38 |
39 | parameters['minres_parameters'] = [ ParametersMINRESSolver(),'MINRES solver parameters']
40 | # Krylov solver parameters
41 | parameters['cg_coarse_tol'] = [0.5,'CG coarse solve tolerance']
42 | parameters['cg_max_iter'] = [1000,'CG maximum iterations']
43 | parameters['eta_mode'] = [0, 'eta mode for E-W conditions:0,1,2']
44 | parameters['globalization'] = [None, 'Choose from trust_region, line_search or none']
45 | parameters['max_backtracking_iter'] = [10, 'max backtracking iterations for line search']
46 |
47 |
48 | # Reasons for convergence failure
49 | parameters['reasons'] = [[], 'list of reasons for termination']
50 |
51 |
52 | return ParameterList(parameters)
53 |
54 |
55 | class InexactNewtonMINRES(Optimizer):
56 | """
57 | This class implements the Inexact Newton MINRES optimizer
58 | """
59 |
60 | def __init__(self,problem,regularization = None,sess = None,parameters = ParametersInexactNewtonMINRES(),preconditioner = None):
61 | """
62 | The constructor for this class takes:
63 | -problem: hessianlearn.problem.Problem
64 | -regularization: hessianlearn.problem.Regularization
65 | -sess: tf.Session()
66 | -parameters: hyperparameters dictionary
67 | -preconditioner: hessianlearn.problem.Preconditioner
68 | """
69 | if regularization is None:
70 | _regularization = L2Regularization(problem,gamma = 0.0)
71 | else:
72 | _regularization = regularization
73 | super(InexactNewtonMINRES,self).__init__(problem,_regularization,sess,parameters)
74 |
75 | self._sweeps = np.zeros(2)
76 | self.grad = self.problem.gradient + self.regularization.gradient
77 | self.minres_solver = MINRESSolver(self.problem,self.regularization,\
78 | self.sess,parameters= self.parameters['minres_parameters'])
79 | self.alpha = 0.0
80 |
81 |
82 | def minimize(self,feed_dict = None,hessian_feed_dict = None):
83 | r"""
84 | Updates using inexact Newton MINRES
85 | """
86 | assert self.sess is not None
87 | assert feed_dict is not None
88 | if hessian_feed_dict is None:
89 | hessian_feed_dict = feed_dict
90 |
91 | self.gradient = self.sess.run(self.grad,feed_dict = feed_dict)
92 |
93 | if self.parameters['globalization'] == 'line_search':
94 | w_dir,_ = self.minres_solver.solve(-self.gradient,hessian_feed_dict)
95 | w_dir_inner_g = np.inner(w_dir,self.gradient)
96 | initial_cost = self.sess.run(self.problem.loss,feed_dict = feed_dict)
97 | cost_at_candidate = lambda p : self._loss_at_candidate(p,feed_dict = feed_dict)
98 | self.alpha, line_search, line_search_iter = ArmijoLineSearch(w_dir,w_dir_inner_g,\
99 | cost_at_candidate, initial_cost,\
100 | max_backtracking_iter = self.parameters['max_backtracking_iter'])
101 | update = self.alpha*w_dir
102 | self._sweeps += [1+0.5*line_search_iter,2*self.minres_solver.iter]
103 | self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:update})
104 | else: # fixed steplength; trust region globalization is not implemented here
105 | self.alpha = self.parameters['alpha']
106 | p,converged = self.minres_solver.solve(-self.gradient,hessian_feed_dict)
107 | # print(converged)
108 | # if converged:
109 | # print('Converged!')
110 | # else:
111 | # print('NOT CONVERGED!!!!!')
112 | self._sweeps += [1, 4*self.minres_solver.iter]
113 | self.p = p
114 | update = self.alpha*p
115 | self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:update})
116 |
117 |
118 |
119 |
120 |
121 |
122 |
--------------------------------------------------------------------------------
/hessianlearn/algorithms/inexactNewtonGMRES.py:
--------------------------------------------------------------------------------
1 | # This file is part of the hessianlearn package
2 | #
3 | # hessianlearn is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or any later version.
6 | #
7 | # hessianlearn is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | # GNU Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public License
13 | # If not, see <http://www.gnu.org/licenses/>.
14 | #
15 | # Author: Tom O'Leary-Roseberry
16 | # Contact: tom.olearyroseberry@utexas.edu
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import numpy as np
23 |
24 | from ..utilities.parameterList import ParameterList
25 | from ..algorithms import Optimizer, GMRESSolver, ParametersGMRESSolver
26 | from ..algorithms.globalization import ArmijoLineSearch, TrustRegion
27 | from ..problem import L2Regularization
28 |
29 |
30 |
31 |
32 | def ParametersInexactNewtonGMRES(parameters = {}):
33 | parameters['alpha'] = [1e-1, "Initial steplength, or learning rate"]
34 | parameters['rel_tolerance'] = [1e-3, "Relative convergence when sqrt(g,g)/sqrt(g_0,g_0) <= rel_tolerance"]
35 | parameters['abs_tolerance'] = [1e-4,"Absolute converge when sqrt(g,g) <= abs_tolerance"]
36 | parameters['max_NN_evals_per_batch'] = [20, "Scale constant for maximum neural network evaluations per datum"]
37 | parameters['max_NN_evals'] = [None, "Maximum number of neural network evaluations"]
38 |
39 | parameters['gmres_parameters'] = [ ParametersGMRESSolver(),'GMRES solver parameters']
40 | # Krylov solver parameters
41 | parameters['cg_coarse_tol'] = [0.5,'CG coarse solve tolerance']
42 | parameters['cg_max_iter'] = [1000,'CG maximum iterations']
43 | parameters['eta_mode'] = [0, 'eta mode for E-W conditions:0,1,2']
44 | parameters['globalization'] = [None, 'Choose from trust_region, line_search or none']
45 | parameters['max_backtracking_iter'] = [10, 'max backtracking iterations for line search']
46 |
47 |
48 | # Reasons for convergence failure
49 | parameters['reasons'] = [[], 'list of reasons for termination']
50 |
51 |
52 | return ParameterList(parameters)
53 |
54 |
55 | class InexactNewtonGMRES(Optimizer):
56 | """
57 | This class implements the inexact Newton GMRES optimizer
58 | """
59 | def __init__(self,problem,regularization = None,sess = None,feed_dict = None,parameters = ParametersInexactNewtonGMRES(),preconditioner = None):
60 | """
61 | The constructor for this class takes:
62 | -problem: hessianlearn.problem.Problem
63 | -regularization: hessianlearn.problem.Regularization
64 | -sess: tf.Session()
65 | -parameters: hyperparameters dictionary
66 | -preconditioner: hessianlearn.problem.Preconditioner
67 | """
68 | if regularization is None:
69 | _regularization = L2Regularization(problem,gamma = 0.0)
70 | else:
71 | _regularization = regularization
72 | super(InexactNewtonGMRES,self).__init__(problem,_regularization,sess,parameters)
73 |
74 | self._sweeps = np.zeros(2)
75 | self.grad = self.problem.gradient + self.regularization.gradient
76 | self.gmres_solver = GMRESSolver(self.problem,self.regularization,\
77 | self.sess,parameters= self.parameters['gmres_parameters'])
78 | self.alpha = 0.0
79 |
80 |
81 | def minimize(self,feed_dict = None,hessian_feed_dict = None):
82 | r"""
83 | Updates using inexact Newton GMRES
84 | """
85 | assert self.sess is not None
86 | assert feed_dict is not None
87 | if hessian_feed_dict is None:
88 | hessian_feed_dict = feed_dict
89 |
90 | self.gradient = self.sess.run(self.grad,feed_dict = feed_dict)
91 |
92 | if self.parameters['globalization'] == 'line_search':
93 | w_dir,on_boundary = self.gmres_solver.solve(-self.gradient,hessian_feed_dict)
94 | w_dir_inner_g = np.inner(w_dir,self.gradient)
95 | initial_cost = self.sess.run(self.problem.loss,feed_dict = feed_dict)
96 | cost_at_candidate = lambda p : self._loss_at_candidate(p,feed_dict = feed_dict)
97 | self.alpha, line_search, line_search_iter = ArmijoLineSearch(w_dir,w_dir_inner_g,\
98 | cost_at_candidate, initial_cost,\
99 | max_backtracking_iter = self.parameters['max_backtracking_iter'])
100 | update = self.alpha*w_dir
101 | self._sweeps += [1+0.5*line_search_iter,2*self.gmres_solver.iter]
102 | self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:update})
103 | else: # fixed steplength; trust region globalization is not implemented here
104 | self.alpha = self.parameters['alpha']
105 | p,converged = self.gmres_solver.solve(-self.gradient,hessian_feed_dict)
106 | # print(converged)
107 | # if converged:
108 | # print('Converged!')
109 | # else:
110 | # print('NOT CONVERGED!!!!!')
111 | self._sweeps += [1, 2*self.gmres_solver.iter]
112 | self.p = p
113 | update = self.alpha*p
114 | self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:update})
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
--------------------------------------------------------------------------------
/hessianlearn/algorithms/minresSolver.py:
--------------------------------------------------------------------------------
1 | # This file is part of the hessianlearn package
2 | #
3 | # hessianlearn is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or any later version.
6 | #
7 | # hessianlearn is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | # GNU Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public License
13 | # If not, see <http://www.gnu.org/licenses/>.
14 | #
15 | # Author: Tom O'Leary-Roseberry
16 | # Contact: tom.olearyroseberry@utexas.edu
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 | import math
22 | import numpy as np
23 | import tensorflow as tf
24 | if int(tf.__version__[0]) > 1:
25 | import tensorflow.compat.v1 as tf
26 | tf.disable_v2_behavior()
27 |
28 | from ..utilities.parameterList import ParameterList
29 | from ..algorithms import Optimizer
30 | from .. problem import IdentityPreconditioner
31 | from ..problem import L2Regularization
32 | from abc import ABC, abstractmethod
33 |
34 | class Identity(object):
35 | def __init__(self):
36 |
37 | pass
38 |
39 | def __call__(self, x):
40 | return x
41 |
42 |
43 |
44 | def ParametersMINRESSolver(dictionary = {}):
45 | parameters = dictionary
46 | parameters["rel_tolerance"] = [1e-9, "the relative tolerance for the stopping criterion"]
47 | parameters["abs_tolerance"] = [1e-12, "the absolute tolerance for the stopping criterion"]
48 | parameters["max_iter"] = [20, "the maximum number of iterations"]
49 | parameters["zero_initial_guess"] = [True, "if True we start with a 0\
50 | initial guess; if False we use the x as initial guess."]
51 | parameters["print_level"] = [-1, "verbosity level: -1 --> no output on \
52 | screen; 0 --> only final residual at convergence or reason for not not convergence"]
53 |
54 | parameters['coarse_tol'] = [0.5,'coarse tolerance used in calculation \
55 | of relative tolerances for E-W conditions']
56 | return ParameterList(parameters)
57 |
58 |
59 | class MINRESSolver(ABC):
60 | """
61 | This class implements a basic MINRES Solver
62 | """
63 |
64 | reason = ["Maximum Number of Iterations Reached",
65 | "Relative/Absolute residual less than tol",
66 | "Reached a negative direction",
67 | "Reached trust region boundary"
68 | ]
69 | def __init__(self,problem,regularization,sess = None,preconditioner = None,\
70 | x = None,parameters = ParametersMINRESSolver()):
71 | """
72 | The constructor for this class takes:
73 | -problem: hessianlearn.problem.Problem
74 | -regularization: hessianlearn.problem.Regularization
75 | -sess: tf.Session()
76 | -preconditioner: hessianlearn.problem.Preconditioner
77 | """
78 | self.sess = sess
79 | self.problem = problem
80 | self.regularization = regularization
81 | if x is None:
82 | # self.x = tf.Variable(self.problem.gradient.initialized_value())
83 | self.x = self.problem.gradient
84 | else:
85 | self.x = x
86 | self.parameters = parameters
87 |
88 |
89 | self.Aop = self.problem.Hdw + self.regularization.Hdw
90 |
91 | # # Define preconditioner
92 | # if preconditioner is None:
93 | # self.Minv = IdentityPreconditioner(problem,self.problem.dtype)
94 | # else:
95 | # self.Minv = preconditioner
96 |
97 |
98 |
99 |
100 |
101 | def solve(self,b,feed_dict = None,x_0 = None):
102 | r"""
103 | Solve Ax=b by the minimal residual iteration (a MINRES-type method)
104 | as defined in Iterative Methods for Sparse Linear Systems, 2nd ed., by Yousef Saad, p. 140
105 | """
106 | assert self.sess is not None
107 | assert feed_dict is not None
108 |
109 | self.iter = 0
110 | self.converged = False
111 | self.reason_id = 0
112 | x = np.zeros_like(b)
113 |
114 | feed_dict[self.problem.dw] = x
115 | Ax_0 = self.sess.run(self.Aop,feed_dict = feed_dict)
116 | # Calculate initial residual r = Ax_0 -b
117 | r = b - Ax_0
118 | # Calculate tolerance for Eisenstat Walker conditions
119 | rr = np.dot(r,r)
120 | rtol2 = rr * self.parameters["rel_tolerance"] * self.parameters["rel_tolerance"]
121 | atol2 = self.parameters["abs_tolerance"] * self.parameters["abs_tolerance"]
122 | tol = max(rtol2, atol2)
123 | # Wrap the session-backed Hessian apply as a SciPy LinearOperator
124 | from scipy.sparse.linalg import LinearOperator
125 |
126 | def Ap(p):
127 | feed_dict[self.problem.dw] = p
128 | return self.sess.run(self.Aop,feed_dict = feed_dict)
129 |
130 | n = self.problem.dimension
131 |
132 | A = LinearOperator((n,n), matvec=Ap)
133 |
134 | # x is still the zero initial guess from above
135 | p = A(r)
136 |
137 | converged = False
138 | while not converged:
139 | self.iter += 1
140 | alpha = np.dot(p,r)/np.dot(p,p) # minimal residual step length (p,r)/(p,p), with p = A r
141 | x_old = x.copy() # copy, so the in-place update of x below does not also change x_old
142 | x += alpha*r
143 | r -= alpha*p
144 |
145 | p = A(r)
146 | # This is the extra query of the network to see if the direction
147 | # is about to rescale gradient components in indefinite directions
148 | # towards saddle points in which case one can break before
149 | # updating
150 | pAp = np.dot(p,A(p))
151 | if pAp < 0:
152 | return x_old, converged
153 |
154 | converged = (np.dot(r,r) < tol)
155 | if self.iter >= self.parameters["max_iter"]: # bail out once the iteration budget is spent
156 | return x, converged
157 |
158 | return x, converged
159 |
160 |
161 |
162 |
163 |
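164 | # A minimal self-contained sketch of the minimal residual iteration that
165 | # MINRESSolver.solve implements (Saad, Iterative Methods for Sparse Linear
166 | # Systems, 2nd ed., Algorithm 5.3), written for a dense numpy matrix so the
167 | # update formulas can be checked without a tf.Session. Illustration only;
168 | # the solver above does not call this function.
169 | def _minimal_residual_iteration_sketch(A, b, tol = 1e-12, max_iter = 200):
170 | x = np.zeros_like(b)
171 | r = b - A@x
172 | p = A@r
173 | for _ in range(max_iter):
174 | alpha = np.dot(p,r)/np.dot(p,p) # step length (p,r)/(p,p), with p = A r
175 | x += alpha*r
176 | r -= alpha*p
177 | p = A@r
178 | if np.dot(r,r) < tol:
179 | break
180 | return x
181 | # e.g. _minimal_residual_iteration_sketch(np.diag(np.linspace(1.,10.,50)), np.ones(50))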
--------------------------------------------------------------------------------
/hessianlearn/problem/hessian.py:
--------------------------------------------------------------------------------
1 | # This file is part of the hessianlearn package
2 | #
3 | # hessianlearn is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or any later version.
6 | #
7 | # hessianlearn is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | # GNU Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public License
13 | # If not, see <http://www.gnu.org/licenses/>.
14 | #
15 | # Author: Tom O'Leary-Roseberry
16 | # Contact: tom.olearyroseberry@utexas.edu
17 |
18 |
19 |
20 | from __future__ import absolute_import, division, print_function
21 | import numpy as np
22 | # import tensorflow as tf
23 | # if int(tf.__version__[0]) > 1:
24 | # import tensorflow.compat.v1 as tf
25 | # tf.disable_v2_behavior()
26 | from abc import ABC, abstractmethod
27 |
28 |
29 | class Hessian(ABC):
30 | """
31 | This class implements methods for the neural network training Hessian.
32 |
33 | Must have a problem and a sess in order to be evaluated
34 | """
35 | def __init__(self,problem=None,sess=None):
36 | """
37 | Create a Hessian given:
38 |
39 | - problem: the description of the neural network training problem
40 | (hessianlearn.problem.Problem)
41 | - sess: the tf.Session() needed for evaluation at run time
42 | """
43 | self._problem = problem
44 | self._sess = sess
45 |
46 | @property
47 | def problem(self):
48 | return self._problem
49 | @property
50 | def sess(self):
51 | return self._sess
52 |
53 | @property
54 | def dimension(self):
55 | return self.problem.dimension
56 |
57 |
58 | @property
59 | def T(self):
60 | # The Hessian is symmetric, so the transpose is the operator itself
61 | return self
62 |
63 |
64 |
65 | def __mul__(self,x):
66 | return self(x)
67 |
68 | def __call__(self,x,feed_dict,verbose = False):
69 | """
70 | This method implements Hessian action, must have a problem and sess
71 | set before this method can be evaluated.
72 | -x: numpy array to be multiplied one at a time
73 | -feed_dict: data used in finite sum Hessian evaluation
74 | -verbose: for printing
75 | """
76 | assert self.problem is not None
77 | assert self.sess is not None
78 |
79 | if len(x.shape) == 1:
80 | feed_dict[self.problem.dw] = x
81 | return self.sess.run(self.problem.Hdw,feed_dict)
82 | elif len(x.shape) == 2:
83 | n_vectors = x.shape[-1]
84 | if self.problem._HdW is None:
85 | if verbose:
86 | print('Total vectors = ',n_vectors)
87 | print('Initializing Hessian blocking')
88 | self.problem._initialize_hessian_blocking(n_vectors)
89 | # When the block sizes agree
90 | if n_vectors == self.problem._hessian_block_size:
91 | feed_dict[self.problem._dW] = x
92 | HdW = self.sess.run(self.problem.HdW,feed_dict)
93 | return HdW
94 | # When the requested block size is smaller
95 | elif n_vectors < self.problem._hessian_block_size:
96 | # The blocked evaluation is roughly 5x faster, so when fewer than
97 | # 1/5 of a block is requested it is faster to either reinitialize the
98 | # blocking or to loop over single Hessian-vector products problem.Hdw
99 | if n_vectors < 0.2*self.problem._hessian_block_size:
100 | # Could reinitialize the blocking or just for loop
101 | # For looping for now
102 | HdW = np.zeros_like(x)
103 | for i in range(n_vectors):
104 | feed_dict[self.problem.dw] = x[:,i]
105 | HdW[:,i] = self.sess.run(self.problem.Hdw,feed_dict)
106 | return HdW
107 | else:
108 | dW = np.zeros((self.problem.dimension,self.problem._hessian_block_size))
109 | dW[:,:n_vectors] = x
110 | feed_dict[self.problem._dW] = dW
111 | HdW = self.sess.run(self.problem.HdW,feed_dict)
112 | return HdW[:,:n_vectors]
113 | # When the requested block size is larger
114 | elif n_vectors > self.problem._hessian_block_size:
115 | HdW = np.zeros_like(x)
116 | block_size = self.problem._hessian_block_size
117 | blocks, remainder = np.divmod(HdW.shape[-1],block_size)
118 | for i in range(blocks):
119 | feed_dict[self.problem._dW] = x[:,i*block_size:(i+1)*block_size]
120 | HdW[:,i*block_size:(i+1)*block_size] = self.sess.run(self.problem.HdW,feed_dict)
121 | # The last vectors are done as a for loop or a zeroed out array
122 | if remainder == 0:
123 | pass
124 | elif remainder > 0 and remainder < 0.2*self.problem._hessian_block_size:
125 | for i in range(n_vectors):
126 | feed_dict[self.problem.dw] = x[:,blocks*block_size+i]
127 | HdW[:,blocks*block_size+i] = self.sess.run(self.problem.Hdw,feed_dict)
128 | else:
129 | dW = np.zeros((self.problem.dimension,self.problem._hessian_block_size))
130 | dW[:,:remainder] = x[:,-remainder:]
131 | feed_dict[self.problem._dW] = dW
132 | HdW[:,-remainder:] = self.sess.run(self.problem.HdW,feed_dict)[:,:remainder] # blocked op on the zero-padded block; keep the filled columns
133 | return HdW
134 | else:
135 | # Many different Hessian mat-vecs interpreted as a tensor?
136 | # Higher order arrays (e.g., batches of matrices) are not supported
137 | raise NotImplementedError('Hessian action is only implemented for 1D and 2D arrays')
138 |
139 | def quadratics(self,x,feed_dict,verbose = False):
140 | """
141 | This method implements Hessian quadratics xTHx.
142 | Must have self._problem and self._sess set before this method can be evaluated.
143 | -x: numpy array to be multiplied one at a time
144 | -feed_dict: data used in finite sum Hessian evaluation
145 | -verbose: for printing
146 | """
147 | assert self.problem is not None
148 | assert self.sess is not None
149 | if len(x.shape) == 1:
150 | feed_dict[self.problem.dw] = x
151 | return self.sess.run(self.problem.H_quadratic,feed_dict)
152 | elif len(x.shape) == 2:
153 | number_of_quadratics = x.shape[1]
154 | H_quads = np.zeros(number_of_quadratics)
155 | if verbose:
156 | try:
157 | from tqdm import tqdm
158 | for i in tqdm(range(number_of_quadratics)):
159 | feed_dict[self.problem.dw] = x[:,i]
160 | H_quads[i] = self.sess.run(self.problem.H_quadratic,feed_dict)
161 | except ImportError:
162 | print('No progress bar :(')
163 | for i in range(number_of_quadratics):
164 | feed_dict[self.problem.dw] = x[:,i]
165 | H_quads[i] = self.sess.run(self.problem.H_quadratic,feed_dict)
166 | else:
167 | for i in range(number_of_quadratics):
168 | feed_dict[self.problem.dw] = x[:,i]
169 | H_quads[i] = self.sess.run(self.problem.H_quadratic,feed_dict)
170 | return H_quads
171 | else:
172 | raise NotImplementedError('Hessian quadratics are only implemented for 1D and 2D arrays')
173 |
174 |
175 | class HessianWrapper:
176 |
177 | def __init__(self,hessian,data_dictionary):
178 |
179 | self._hessian = hessian
180 | self._data_dictionary = data_dictionary
181 |
182 |
183 | def __call__(self,x):
184 | return self._hessian(x,self._data_dictionary)
185 |
--------------------------------------------------------------------------------
/hessianlearn/algorithms/rangeFinders.py:
--------------------------------------------------------------------------------
1 | # This file is part of the hessianlearn package
2 | #
3 | # hessianlearn is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or any later version.
6 | #
7 | # hessianlearn is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | # GNU Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public License
13 | # If not, see <http://www.gnu.org/licenses/>.
14 | #
15 | # Author: Tom O'Leary-Roseberry
16 | # Contact: tom.olearyroseberry@utexas.edu
17 |
18 | from __future__ import absolute_import, division, print_function
19 | import time
20 | import sys
21 | import numpy as np
22 |
23 |
24 | from scipy.linalg import cholesky, eigh, solve_triangular, qr, rq
25 |
26 | import time
27 |
28 |
29 | def block_range_finder(A_op,n,epsilon,block_size,verbose = False,seed = 0):
30 | """
31 | Randomized algorithm for block range finding
32 |
33 | Parameters:
34 | -----------
35 | A_op : {Callable} n x n symmetric matrix
36 | Hermitian matrix operator whose range needs to be estimated
37 | y = A_op(dw) is the action of A in the direction dw
38 | n : size of matrix A
39 | epsilon : relative reduction in error used as the stopping tolerance
40 | block_size : number of random vectors added per iteration
41 |
42 | Returns:
43 | --------
44 | Q : range for Aop
45 | """
46 | # Taken from http://people.maths.ox.ac.uk/martinsson/Pubs/2015_randQB.pdf
47 |
48 | my_state = np.random.RandomState(seed=seed)
49 | w = my_state.randn(n,1)
50 | Action = A_op(w)
51 | initial_error = np.linalg.norm(Action)
52 | big_Q = None
53 | converged = False
54 | iteration = 0
55 | while not converged:
56 | # Sample Gaussian random matrix
57 | Omega = my_state.randn(n,block_size)
58 | # Perform QR on action
59 | Q,_ = np.linalg.qr(A_op(Omega))
60 | # Update basis
61 | if big_Q is None:
62 | big_Q = Q
63 | else:
64 | Q -= big_Q@(big_Q.T@Q)
65 | big_Q = np.concatenate((big_Q,Q),axis = 1)
66 | # This QR gets slow after many iterations, only last columns
67 | # need to be orthonormalized
68 | big_Q,_ = np.linalg.qr(big_Q)
69 | # Error estimation
70 | Approximate_Error = Action - big_Q@(big_Q.T@Action)
71 | error = np.linalg.norm(Approximate_Error)
72 | converged = error < epsilon*initial_error
73 | iteration+=1
74 | if verbose:
75 | print('At iteration', iteration, ' error/initial_error is ',error/initial_error,' tolerance is ',epsilon,' converged = ',converged)
76 | if iteration > n//block_size:
77 | break
78 | # I believe that the extra action of A_op in forming B for the QB factorization
79 | # is cheaper to do once after the fact, and is not needed for the matrix
80 | # free randomized error estimator. For this reason I just return Q, and
81 | # do not form B.
82 | return big_Q
83 |
84 |
85 |
86 |
87 | def noise_aware_adaptive_range_finder(Hessian,hessian_feed_dict,rq_estimator_dict_list,\
88 | block_size = None,noise_tolerance = 1.0,epsilon = 1e-1, max_vectors = 20, verbose = False,seed = 0):
89 | """
90 | Randomized algorithm for noise aware block range finding (N.A.A.R.F.)
91 |
92 | Parameters:
93 | -----------
94 | Hessian : hessianlearn.problem.Hessian operator
95 | hessian_feed_dict : data dictionary used for the Hessian-vector products
96 | rq_estimator_dict_list : list of data dictionaries used to sample Rayleigh quotients
97 | block_size : number of random vectors per block (default: 1% of the dimension)
98 | noise_tolerance : signal-to-noise ratio below which directions are deemed too noisy
99 | epsilon : operator error tolerance used as the stopping criterion
100 | verbose : for printing
101 | seed : random seed
102 |
103 | Returns:
104 | --------
105 | Q : range for dominant eigenmodes of Hessian
106 | """
107 |
108 | ###################################################################################
109 | assert type(rq_estimator_dict_list) is list
110 | n = Hessian.dimension
111 | if block_size is None:
112 | block_size = int(0.01*n)
113 | my_state = np.random.RandomState(seed=seed)
114 | w = my_state.randn(n,1)
115 |
116 | H = lambda x: Hessian(x,hessian_feed_dict,verbose = verbose)
117 | Action = H(w)
118 | big_Q = None
119 | converged = False
120 | iteration = 0
121 | rq_noise = 0.
122 |
123 | while not converged:
124 | # Sample Gaussian random matrix
125 | Omega = my_state.randn(n,block_size)
126 | # Perform QR on action
127 | Q,_ = np.linalg.qr(H(Omega))
128 | # Update basis
129 | if big_Q is None:
130 | big_Q = Q
131 | else:
132 | Q -= big_Q@(big_Q.T@Q)
133 | big_Q = np.concatenate((big_Q,Q),axis = 1)
134 | # This QR gets slow after many iterations, only last columns
135 | # need to be orthonormalized
136 | big_Q,_ = np.linalg.qr(big_Q)
137 | # Error estimation is both for operator error
138 | # as well as spectral noise
139 | # Operator error estimation
140 | Approximate_Error = Action - big_Q@(big_Q.T@Action)
141 | operator_error = np.linalg.norm(Approximate_Error)
142 | # Noise error estimation
143 | rq_direction = big_Q[:,-block_size:]
144 | try:
145 | RQ_samples = np.zeros((len(rq_estimator_dict_list),rq_direction.shape[1]))
146 | except:
147 | RQ_samples = np.zeros(len(rq_estimator_dict_list))
148 | if verbose:
149 | try:
150 | from tqdm import tqdm
151 | for samp_i,sample_dictionary in enumerate(tqdm(rq_estimator_dict_list)):
152 | RQ_samples[samp_i] = Hessian.quadratics(rq_direction,sample_dictionary)
153 | except ImportError:
154 | print('Issue with tqdm')
155 | for samp_i,sample_dictionary in enumerate(rq_estimator_dict_list):
156 | RQ_samples[samp_i] = Hessian.quadratics(rq_direction,sample_dictionary)
157 | else:
158 | for samp_i,sample_dictionary in enumerate(rq_estimator_dict_list):
159 | RQ_samples[samp_i] = Hessian.quadratics(rq_direction,sample_dictionary)
160 |
161 | rq_snr = np.abs(np.mean(RQ_samples,axis=0))/np.std(RQ_samples,axis = 0)
162 | too_noisy = (rq_snr < noise_tolerance).any()
163 | converged = (operator_error < epsilon) or too_noisy
164 | # print(80*'#')
165 | # print('rq_snr = ',rq_snr)
166 | # print('rq_snr < noise_tolerance = ',rq_snr < noise_tolerance)
167 | # print('too noisy? = ',too_noisy)
168 | # print('(operator_error < epsilon) = ',(operator_error < epsilon))
169 | # print(80*'#')
170 |
171 | iteration+=1
172 | if verbose:
173 | print('At iteration', iteration, 'operator error is ',operator_error,' convergence = ',(operator_error < epsilon))
174 | if big_Q.shape[-1] >= max_vectors:
175 | break
176 |
177 | if iteration > n//block_size:
178 | break
179 | # I believe that the extra action of A_op in forming B for the QB factorization
180 | # is cheaper to do once after the fact, and is not needed for the matrix
181 | # free randomized error estimator. For this reason I just return Q, and
182 | # do not form B.
183 | return big_Q
184 |
185 |
186 |
187 |
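188 | # A minimal self-test sketch for block_range_finder (not part of the original
189 | # module), run on a small dense matrix with rapidly decaying spectrum; sizes
190 | # and tolerance are illustrative. The recovered basis should capture the
191 | # dominant range of A to the requested relative tolerance.
192 | if __name__ == '__main__':
193 | n = 200
194 | A = np.diag(0.9**np.arange(n))
195 | A_op = lambda X: A@X
196 | Q = block_range_finder(A_op, n, epsilon = 1e-2, block_size = 10, verbose = True)
197 | rel_err = np.linalg.norm(A - Q@(Q.T@A))/np.linalg.norm(A)
198 | print('columns used:', Q.shape[1], ', relative range error:', rel_err)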
--------------------------------------------------------------------------------
/hessianlearn/algorithms/inexactNewtonCG.py:
--------------------------------------------------------------------------------
1 | # This file is part of the hessianlearn package
2 | #
3 | # hessianlearn is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or any later version.
6 | #
7 | # hessianlearn is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | # GNU Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public License
13 | # If not, see <http://www.gnu.org/licenses/>.
14 | #
15 | # Author: Tom O'Leary-Roseberry
16 | # Contact: tom.olearyroseberry@utexas.edu
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import numpy as np
23 |
24 | from ..utilities.parameterList import ParameterList
25 | from ..algorithms import Optimizer, CGSolver, ParametersCGSolver
26 | from ..algorithms.globalization import ArmijoLineSearch, TrustRegion
27 | from ..problem import L2Regularization
28 |
29 |
30 |
31 |
32 | def ParametersInexactNewtonCG(parameters = {}):
33 | parameters['alpha'] = [1e0, "Initial steplength, or learning rate"]
34 | parameters['rel_tolerance'] = [1e-3, "Relative convergence when sqrt(g,g)/sqrt(g_0,g_0) <= rel_tolerance"]
35 | parameters['abs_tolerance'] = [1e-4,"Absolute convergence when sqrt(g,g) <= abs_tolerance"]
36 | parameters['max_NN_evals_per_batch'] = [10000, "Scale constant for maximum neural network evaluations per datum"]
37 | parameters['max_NN_evals'] = [None, "Maximum number of neural network evaluations"]
38 |
39 |
40 | parameters['cg_parameters'] = [ ParametersCGSolver(),'CG Parameters']
41 | # CG solver parameters
42 | parameters['cg_coarse_tol'] = [0.5,'CG coarse solve tolerance']
43 | parameters['cg_max_iter'] = [10,'CG maximum iterations']
44 | parameters['eta_mode'] = [0, 'eta mode for E-W conditions:0,1,2']
45 | parameters['globalization'] = [None, 'Choose from trust_region, line_search or none']
46 | parameters['max_backtracking_iter'] = [10, 'max backtracking iterations for line search']
47 |
48 | # Reasons for convergence failure
49 | parameters['reasons'] = [[], 'list of reasons for termination']
50 |
51 |
52 | return ParameterList(parameters)
53 |
54 |
55 |
56 |
57 |
58 |
59 | class InexactNewtonCG(Optimizer):
60 | """
61 | This class implements the inexact Newton CG optimizer
62 | """
63 | def __init__(self,problem,regularization = None,sess = None,feed_dict = None,\
64 | parameters = ParametersInexactNewtonCG(),preconditioner = None):
65 | """
66 | The constructor for this class takes:
67 | -problem: hessianlearn.problem.Problem
68 | -regularization: hessianlearn.problem.Regularization
69 | -sess: tf.Session()
70 | -parameters: hyperparameters dictionary
71 | -preconditioner: hessianlearn.problem.Preconditioner
72 | """
73 | if regularization is None:
74 | _regularization = L2Regularization(problem,gamma = 0.0)
75 | else:
76 | _regularization = regularization
77 | super(InexactNewtonCG,self).__init__(problem,_regularization,sess,parameters)
78 |
79 |
80 | self.grad = self.problem.gradient + self.regularization.gradient
81 | self.cg_solver = CGSolver(self.problem,self.regularization,self.sess,parameters= self.parameters['cg_parameters'])
82 | self._sweeps = np.zeros(2)
83 | self.trust_region_initialized = False
84 | if self.parameters['globalization'] == 'trust_region':
85 | self.initialize_trust_region()
86 | self.alpha = 0.0
87 |
88 |
89 |
90 | def initialize_trust_region(self):
91 | """
92 | Initializes trust region
93 | """
94 | if not self.parameters['globalization'] == 'trust_region':
95 | self.parameters['globalization'] = 'trust_region'
96 | self.trust_region = TrustRegion()
97 | self.cg_solver.initialize_trust_region(coarse_tol = self.parameters['cg_coarse_tol'])
98 | self.cg_solver.set_trust_region_radius(self.trust_region.radius)
99 | self.trust_region_initialized = True
100 |
101 | def minimize(self,feed_dict = None,hessian_feed_dict = None):
102 | r"""
103 | Solves using inexact Newton CG algorithm
104 | -feed_dict: the data dictionary used for evaluating stochastic gradients and cost
105 | -hessian_feed_dict: smaller data dictionary used for stochastic Hessian
106 | """
107 | assert self.sess is not None
108 | assert feed_dict is not None
109 | if hessian_feed_dict is None:
110 | hessian_feed_dict = feed_dict
111 |
112 | gradient = self.sess.run(self.grad,feed_dict = feed_dict)
113 |
114 |
115 |
116 | if self.parameters['globalization'] is None:
117 | self.alpha = self.parameters['alpha']
118 | p,on_boundary = self.cg_solver.solve(-gradient,hessian_feed_dict)
119 | self._sweeps += [1,2*self.cg_solver.iter]
120 | self.p = p
121 | update = self.alpha*p
122 | self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:update})
123 |
124 | elif self.parameters['globalization'] == 'line_search':
125 | w_dir,on_boundary = self.cg_solver.solve(-gradient,hessian_feed_dict)
126 | w_dir_inner_g = np.inner(w_dir,gradient)
127 | initial_cost = self.sess.run(self.problem.loss,feed_dict = feed_dict)
128 | cost_at_candidate = lambda p : self._loss_at_candidate(p,feed_dict = feed_dict)
129 | self.alpha, line_search, line_search_iter = ArmijoLineSearch(w_dir,w_dir_inner_g,\
130 | cost_at_candidate, initial_cost,\
131 | max_backtracking_iter = self.parameters['max_backtracking_iter'])
132 | update = self.alpha*w_dir
133 | self._sweeps += [1+0.5*line_search_iter,2*self.cg_solver.iter]
134 | self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:update})
135 |
136 | elif self.parameters['globalization'] == 'trust_region':
137 | if not self.trust_region_initialized:
138 | self.initialize_trust_region()
139 | # Set trust region radius
140 | self.cg_solver.set_trust_region_radius(self.trust_region.radius)
141 | # Solve for candidate step
142 | p, on_boundary = self.cg_solver.solve(-gradient,hessian_feed_dict)
143 | pg = np.dot(p,gradient)
144 | self._sweeps += [1,2*self.cg_solver.iter]
145 | self.p = p
146 | # Calculate predicted reduction
147 | feed_dict[self.cg_solver.problem.dw] = p
148 | Hp = self.sess.run(self.cg_solver.Aop,feed_dict)
149 | pHp = np.dot(p,Hp)
150 | predicted_reduction = -pg-0.5*pHp
151 | # Calculate actual reduction
152 | misfit,reg = self.sess.run((self.problem.loss,self.regularization.cost),\
153 | feed_dict = feed_dict)
154 | cost = misfit + reg
155 | w_copy = self.sess.run(self.problem.w)
156 | self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:p})
157 |
158 | misfit,reg = self.sess.run((self.problem.loss,self.regularization.cost),\
159 | feed_dict = feed_dict)
160 | cost_new = misfit + reg
161 | actual_reduction = cost - cost_new
162 |
163 | # Decide whether or not to accept the step
164 | accept_step = self.trust_region.evaluate_step(actual_reduction = actual_reduction,\
165 | predicted_reduction = predicted_reduction,on_boundary = on_boundary)
166 | # If the step is rejected, restore the weights saved before the update
167 | if not accept_step:
168 | self.sess.run(self.problem._assignment_ops,\
169 | feed_dict = {self.problem._assignment_placeholder:w_copy})
170 |
171 |
172 |
173 |
174 |
175 |
176 |
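177 | # A minimal sketch of the reduction bookkeeping done in the trust region
178 | # branch of minimize() above, on a quadratic f(w) = 0.5 w^T A w - b^T w where
179 | # the predicted and actual reductions coincide up to rounding, so the ratio
180 | # passed to TrustRegion.evaluate_step is ~1 and the step would be accepted.
181 | # Illustration only; the helper name and values here are not library API.
182 | def _trust_region_reduction_sketch():
183 | A = np.diag([1.0, 4.0])
184 | b = np.array([1.0, 1.0])
185 | w = np.zeros(2)
186 | g = A@w - b # gradient of the quadratic at w
187 | p = np.linalg.solve(A, -g) # exact Newton step (CG would approximate this)
188 | predicted_reduction = -np.dot(p,g) - 0.5*np.dot(p, A@p)
189 | f = lambda v: 0.5*np.dot(v, A@v) - np.dot(b, v)
190 | actual_reduction = f(w) - f(w + p)
191 | return actual_reduction/predicted_reduction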
--------------------------------------------------------------------------------
/hessianlearn/algorithms/randomizedEigensolver.py:
--------------------------------------------------------------------------------
1 | # This file is part of the hessianlearn package
2 | #
3 | # hessianlearn is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or any later version.
6 | #
7 | # hessianlearn is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | # GNU Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public License
13 | # If not, see <http://www.gnu.org/licenses/>.
14 | #
15 | # Author: Tom O'Leary-Roseberry
16 | # Contact: tom.olearyroseberry@utexas.edu
17 |
18 | from __future__ import absolute_import, division, print_function
19 | import time
20 | import sys
21 | import numpy as np
22 |
23 |
24 | from scipy.linalg import cholesky, eigh, solve_triangular, qr, rq
25 |
26 | import time
27 |
28 |
29 | def low_rank_hessian(optimizer,feed_dict,k,p=None,verbose = False):
30 | H = lambda x: optimizer.H(x,feed_dict)
31 | n = optimizer.problem.dimension
32 | return randomized_eigensolver(H, n, k,p = p,verbose = verbose)
33 |
34 |
35 | def randomized_eigensolver(Aop, n, k, p = None,seed = 0,verbose = False):
36 | """
37 | Randomized algorithm for Hermitian eigenvalue problems
38 | Returns k largest eigenvalues computed using the randomized algorithm
39 |
40 |
41 | Parameters:
42 | -----------
43 | Aop : {Callable} n x n
44 | Hermitian matrix operator whose eigenvalues need to be estimated
45 | y = Aop(dw) is the action of A in the direction dw
46 |
47 | n : int,
48 | number of row/columns of the operator A
49 |
50 | k : int,
51 | number of eigenvalues/vectors to be estimated
52 | p : int, optional
53 | oversampling parameter which can improve accuracy of resulting solution
54 | Default: int(0.01*k), clipped so that k+p <= n
55 |
56 | Returns:
57 | --------
58 |
59 | d : ndarray, (k,)
60 | eigenvalues arranged in descending order
61 | U : ndarray, (n,k)
62 | eigenvectors arranged according to eigenvalues
63 |
64 | References:
65 | -----------
66 | .. [1] Halko, Nathan, Per-Gunnar Martinsson, and Joel A. Tropp. "Finding structure with randomness: Probabilistic algorithms for constructing approximate matrix decompositions." SIAM review 53.2 (2011): 217-288.
67 | Examples:
68 | ---------
69 | >>> import numpy as np
70 | >>> n = 100
71 | >>> A = np.diag(0.95**np.arange(n))
72 | >>> Aop = lambda dw: np.dot(A,dw)
73 | >>> k = 10
74 | >>> p = 5
75 | >>> lmbda, U = randomized_eigensolver(Aop, n, k, p)
76 | """
77 | if n == k:
78 | p = 0
79 | elif p is None:
80 | p = int(0.01*k)
81 | if k+p > n:
82 | p = n - k
83 | random_state = np.random.RandomState(seed=seed)
84 | Omega = random_state.randn(n,k+p)
85 | n = Omega.shape[0]
86 |
87 | assert(n >= k )
88 |
89 | m = Omega.shape[1]
90 | Y = Aop(Omega)
91 |
92 | # print('condition number for Y = ',np.linalg.cond(Y))
93 | Q,_ = qr(Y, mode = 'economic')
94 | T = np.zeros((m,m),dtype = 'd')
95 | if verbose:
96 | print('Forming small square matrix')
97 | AQ = Aop(Q)
98 | T = Q.T@AQ
99 |
100 | # Eigenvalue problem for T
101 | if verbose:
102 | print('Computing eigenvalue decomposition')
103 | d, V = eigh(T)
104 | d_abs = np.abs(d) #sort by absolute value (we want the k largest eigenvalues regardless of sign)
105 | sort_perm = d_abs.argsort()
106 |
107 | sort_perm = sort_perm[::-1]
108 |
109 | d = d[sort_perm[0:k]]
110 | V = V[:, sort_perm[0:k]]
111 |
112 | #Compute eigenvectors
113 | U = np.dot(Q, V)
114 |
115 | return d[:k], U[:,:k]
116 |
117 |
118 | def eigensolver_from_range(Aop, Q,verbose = False):
119 | """
120 | Randomized algorithm for Hermitian eigenvalue problems
121 | Returns k largest eigenvalues computed using the randomized algorithm
122 |
123 |
124 | Parameters:
125 | -----------
126 | Aop : {Callable} n x n
127 | Hermitian matrix operator whose eigenvalues need to be estimated
128 | y = Aop(dw) is the action of A in the direction dw
129 | Q : Array n x r
130 |
131 |
132 | Returns:
133 | --------
134 |
135 | d : ndarray, (k,)
136 | eigenvalues arranged in descending order
137 | U : ndarray, (n,k)
138 | eigenvectors arranged according to eigenvalues
139 | """
140 | m = Q.shape[1]
141 | T = np.zeros((m,m),dtype = 'd')
142 | if verbose:
143 | print('Forming small square matrix')
144 | AQ = Aop(Q)
145 | T = Q.T@AQ
146 | # Eigenvalue problem for T
147 | if verbose:
148 | print('Computing eigenvalue decomposition')
149 | d, V = eigh(T)
150 | d_abs = np.abs(d) #sort by absolute value (we want the k largest eigenvalues regardless of sign)
151 | sort_perm = d_abs.argsort()
152 |
153 | sort_perm = sort_perm[::-1]
154 |
155 | d = d[sort_perm[0:m]]
156 | V = V[:, sort_perm[0:m]]
157 |
158 | #Compute eigenvectors
159 | U = np.dot(Q, V)
160 |
161 | return d[:m], U[:,:m]
162 |
163 | def randomized_double_pass_eigensolver(Aop, Y, k):
164 | """
165 | Randomized algorithm for Hermitian eigenvalue problems
166 | Returns k largest eigenvalues computed using the randomized algorithm
167 |
168 | Parameters:
169 | -----------
170 | Aop : {Callable} n x n
171 | Hermitian matrix operator whose eigenvalues need to be estimated
172 | y = Aop(dw) is the action of A in the direction dw
173 | Y = Aop(Omega) : precomputed action of Aop on Omega, a m x n Array of (presumably) sampled Gaussian or l-percent sparse random vectors (row)
174 | k : int,
175 | number of eigenvalues/vectors to be estimated, 0 < k < m
176 | Returns:
177 | --------
178 |
179 | lmbda : ndarray, (k,)
180 | eigenvalues arranged in descending order
181 | Ut : ndarray, (k, n)
182 | eigenvectors arranged according to eigenvalues, rows are eigenvectors
183 |
184 | References:
185 | -----------
186 | .. [1] Halko, Nathan, Per-Gunnar Martinsson, and Joel A. Tropp. "Finding structure with randomness: Probabilistic algorithms for constructing approximate matrix decompositions." SIAM review 53.2 (2011): 217-288.
187 | .. [2] Algorithm 2 of Arvind paper
188 | Examples:
189 | ---------
190 | >>> import numpy as np
191 | >>> n = 100
192 | >>> A = np.diag(0.95**np.arange(n))
193 | >>> Aop = lambda dw: np.dot(A,dw)
194 | >>> k = 10
195 | >>> p = 5
196 | >>> Omega = np.random.randn(n, k+p)
197 | >>> lmbda, Ut = randomized_double_pass_eigensolver(Aop, Aop(Omega).T, k)
198 | """
199 | raise NotImplementedError("Need to reimplement this function")
200 | m, n = Y.shape
201 | assert(n >= m >= k) #m = k + p ( p is the oversampling for Omega, to ensure we get a good random projection basis)
202 | Q, _ = qr(Y.T, mode='economic')
203 | T = (Aop(Q.T) @ Q).T #m foward problems , m x m small matrix
204 | # T = .5*T + .5*T.T
205 |
206 | #Eigen subproblem
207 | lmbda, V = eigh(T, turbo=True, overwrite_a=True, check_finite=False)
208 | inds = np.abs(lmbda).argsort()[::-1]
209 | lmbda = lmbda[inds[0:k]]
210 | V = V[:, inds[0:k]] #S in the original paper m x m
211 |
212 | #Compute eigenvectors
213 | Ut = (Q @ V).T
214 | return lmbda, Ut
215 |
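216 | # A minimal self-test sketch (not part of the original module) running the
217 | # docstring example for randomized_eigensolver and checking the result against
218 | # the known spectrum of the diagonal test matrix; sizes are illustrative.
219 | if __name__ == '__main__':
220 | n, k, p = 100, 10, 5
221 | A = np.diag(0.95**np.arange(n))
222 | Aop = lambda dw: np.dot(A, dw)
223 | d, U = randomized_eigensolver(Aop, n, k, p = p)
224 | print('largest recovered eigenvalue:', d[0], '(exact: 1.0)')
225 | print('orthonormality error:', np.linalg.norm(U.T@U - np.eye(k)))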
--------------------------------------------------------------------------------
/applications/transfer_learning/imagenet_cifar10_classification_evaluate_test.py:
--------------------------------------------------------------------------------
1 | # This file is part of the hessianlearn package
2 | #
3 | # hessianlearn is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or any later version.
6 | #
7 | # hessianlearn is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | # GNU Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public License
13 | # If not, see <http://www.gnu.org/licenses/>.
14 | #
15 | # Author: Tom O'Leary-Roseberry
16 | # Contact: tom.olearyroseberry@utexas.edu
17 |
18 |
19 | import numpy as np
20 | import os
21 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
22 | os.environ['KMP_DUPLICATE_LIB_OK']='True'
23 | os.environ["KMP_WARNINGS"] = "FALSE"
24 | # os.environ['CUDA_VISIBLE_DEVICES'] = '1'
25 | import pickle
26 | import tensorflow as tf
27 | import time, datetime
28 | # if int(tf.__version__[0]) > 1:
29 | # import tensorflow.compat.v1 as tf
30 | # tf.disable_v2_behavior()
31 |
32 |
33 | # Memory issue with GPUs
34 | gpu_devices = tf.config.experimental.list_physical_devices('GPU')
35 | for device in gpu_devices:
36 | tf.config.experimental.set_memory_growth(device, True)
37 | # Load hessianlearn library
38 | import sys
39 | sys.path.append( os.environ.get('HESSIANLEARN_PATH', "../../"))
40 | from hessianlearn import *
41 |
42 | # Parse run specifications
43 | from argparse import ArgumentParser
44 |
45 | parser = ArgumentParser(add_help=True)
46 | parser.add_argument("-optimizer", dest='optimizer',required=False, default = 'lrsfn', help="optimizer type",type=str)
47 | parser.add_argument('-fixed_step',dest = 'fixed_step',\
48 | required= False,default = 1,help='boolean for fixed step vs globalization',type = int)
49 | parser.add_argument('-alpha',dest = 'alpha',required = False,default = 1e-4,help= 'learning rate alpha',type=float)
50 | parser.add_argument('-hessian_low_rank',dest = 'hessian_low_rank',required= False,default = 40,help='low rank for sfn',type = int)
51 | parser.add_argument('-record_spectrum',dest = 'record_spectrum',\
52 | required= False,default = 0,help='boolean for recording spectrum',type = int)
53 | # parser.add_argument('-weight_burn_in',dest = 'weight_burn_in',\
54 | # required= False,default = 0,help='',type = int)
55 |
56 | # parser.add_argument('-data_seed',dest = 'data_seed',\
57 | # required= False,default = 0,help='',type = int)
58 |
59 | parser.add_argument('-batch_size',dest = 'batch_size',required= False,default = 32,help='batch size',type = int)
60 | parser.add_argument('-hess_batch_size',dest = 'hess_batch_size',required= False,default = 8,help='hess batch size',type = int)
61 | parser.add_argument('-keras_epochs',dest = 'keras_epochs',required= False,default = 50,help='keras_epochs',type = int)
62 | parser.add_argument("-keras_opt", dest='keras_opt',required=False, default = 'adam', help="optimizer type for keras",type=str)
63 | parser.add_argument('-keras_alpha',dest = 'keras_alpha',required= False,default = 1e-3,help='keras learning rate',type = float)
64 | parser.add_argument('-max_sweeps',dest = 'max_sweeps',required= False,default = 1,help='max sweeps',type = float)
65 | parser.add_argument('-weights_file',dest = 'weights_file',required= False,default = 'None',help='weight file pickle',type = str)
66 |
67 | args = parser.parse_args()
68 |
69 | try:
70 | tf.set_random_seed(0)
71 | except AttributeError: # tf.set_random_seed was removed in TF2
72 | tf.random.set_seed(0)
73 |
74 | # GPU Environment Details
75 | gpu_available = tf.test.is_gpu_available()
76 | built_with_cuda = tf.test.is_built_with_cuda()
77 | print(80*'#')
78 | print(('IS GPU AVAILABLE: '+str(gpu_available)).center(80))
79 | print(('IS BUILT WITH CUDA: '+str(built_with_cuda)).center(80))
80 | print(80*'#')
81 |
82 | settings = {}
83 | # Set run specifications
84 | # Data specs
85 | settings['batch_size'] = args.batch_size
86 | settings['hess_batch_size'] = args.hess_batch_size
87 |
88 |
89 | ################################################################################
90 | # Instantiate data
91 | (x_train, y_train), (_x_test, _y_test) = tf.keras.datasets.cifar10.load_data()
92 |
93 | # # Normalize the data
94 | # x_train = x_train.astype('float32') / 255.
95 | # x_test = x_test.astype('float32') / 255.
96 |
97 | x_train = tf.keras.applications.resnet50.preprocess_input(x_train)
98 | x_test_full = tf.keras.applications.resnet50.preprocess_input(_x_test)
99 | x_val = x_test_full[:2000]
100 | x_test = x_test_full[2000:]
101 |
102 | y_train = tf.keras.utils.to_categorical(y_train)
103 | y_test_full = tf.keras.utils.to_categorical(_y_test)
104 | y_val = y_test_full[:2000]
105 | y_test = y_test_full[2000:]
106 |
107 | ################################################################################
108 | # Create the neural network in keras
109 |
110 | # tf.keras.backend.set_floatx('float64')
111 |
112 | resnet_input_shape = (200,200,3)
113 | input_tensor = tf.keras.Input(shape = resnet_input_shape)
114 |
115 | pretrained_resnet50 = tf.keras.applications.resnet50.ResNet50(weights = 'imagenet',include_top=False,input_tensor=input_tensor)
116 |
117 | for layer in pretrained_resnet50.layers[:143]:
118 | layer.trainable = False
119 |
120 | classifier = tf.keras.models.Sequential()
121 | classifier.add(tf.keras.layers.Input(shape=(32,32,3)))
122 | classifier.add(tf.keras.layers.Lambda(lambda image: tf.image.resize(image, resnet_input_shape[:2])))
123 | classifier.add(pretrained_resnet50)
124 | classifier.add(tf.keras.layers.Flatten())
125 | classifier.add(tf.keras.layers.BatchNormalization())
126 | classifier.add(tf.keras.layers.Dense(64, activation='relu'))
127 | classifier.add(tf.keras.layers.Dropout(0.5))
128 | classifier.add(tf.keras.layers.BatchNormalization())
129 | classifier.add(tf.keras.layers.Dense(10, activation='softmax'))
130 |
131 |
132 | if args.keras_opt == 'adam':
133 | optimizer = tf.keras.optimizers.Adam(learning_rate = args.keras_alpha,epsilon = 1e-8)
134 | elif args.keras_opt == 'sgd':
135 | optimizer = tf.keras.optimizers.SGD(learning_rate=args.keras_alpha)
136 | else:
137 | raise ValueError('Unsupported keras_opt: ' + args.keras_opt)
138 |
139 | classifier.compile(optimizer=optimizer,
140 | loss=tf.keras.losses.CategoricalCrossentropy(), # the final layer applies softmax, so the outputs are probabilities, not logits
141 | metrics=['accuracy'])
142 |
143 | loss_test_0, acc_test_0 = classifier.evaluate(x_test,y_test,verbose=2)
144 | print('acc_test = ',acc_test_0)
145 | loss_val_0, acc_val_0 = classifier.evaluate(x_val,y_val,verbose=2)
146 | print('acc_val = ',acc_val_0)
147 |
148 |
149 | if args.weights_file != 'None':
150 | try:
151 | with open(args.weights_file, 'rb') as weights_pickle:
152 | best_weights = pickle.load(weights_pickle)['best_weights']
153 | for layer_name,weight in best_weights.items():
154 | classifier.get_layer(layer_name).set_weights(weight)
155 | except:
156 | print('Issue loading best weights')
157 |
158 | loss_test_final, acc_test_final = classifier.evaluate(x_test,y_test,verbose=2)
159 | print('acc_test final = ',acc_test_final)
160 | loss_val_final, acc_val_final = classifier.evaluate(x_val,y_val,verbose=2)
161 | print('acc_val final = ',acc_val_final)
162 |
163 | ################################################################################
164 | # Evaluate again on all the data.
165 | (x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
166 |
167 | # # Normalize the data
168 | # x_train = x_train.astype('float32') / 255.
169 | # x_test = x_test.astype('float32') / 255.
170 |
171 | x_train = tf.keras.applications.resnet50.preprocess_input(x_train)
172 | x_test = tf.keras.applications.resnet50.preprocess_input(x_test)
173 |
174 | y_train = tf.keras.utils.to_categorical(y_train)
175 | y_test = tf.keras.utils.to_categorical(y_test)
176 |
177 | loss_test_total, acc_test_total = classifier.evaluate(x_test,y_test,verbose=2)
178 | print(80*'#')
179 | print('After hessianlearn training'.center(80))
180 | print('acc_test_total = ',acc_test_total)
181 |
--------------------------------------------------------------------------------
/applications/transfer_learning/imagenet_cifar100_classification_evaluate_test.py:
--------------------------------------------------------------------------------
1 | # This file is part of the hessianlearn package
2 | #
3 | # hessianlearn is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or any later version.
6 | #
7 | # hessianlearn is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | # GNU Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public License
13 | # If not, see <http://www.gnu.org/licenses/>.
14 | #
15 | # Author: Tom O'Leary-Roseberry
16 | # Contact: tom.olearyroseberry@utexas.edu
17 |
18 |
19 | import numpy as np
20 | import os
21 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
22 | os.environ['KMP_DUPLICATE_LIB_OK']='True'
23 | os.environ["KMP_WARNINGS"] = "FALSE"
24 | # os.environ['CUDA_VISIBLE_DEVICES'] = '1'
25 | import pickle
26 | import tensorflow as tf
27 | import time, datetime
28 | # if int(tf.__version__[0]) > 1:
29 | # import tensorflow.compat.v1 as tf
30 | # tf.disable_v2_behavior()
31 |
32 |
33 | # Memory issue with GPUs
34 | gpu_devices = tf.config.experimental.list_physical_devices('GPU')
35 | for device in gpu_devices:
36 | tf.config.experimental.set_memory_growth(device, True)
37 | # Load hessianlearn library
38 | import sys
39 | sys.path.append( os.environ.get('HESSIANLEARN_PATH', "../../"))
40 | from hessianlearn import *
41 |
42 | # Parse run specifications
43 | from argparse import ArgumentParser
44 |
45 | parser = ArgumentParser(add_help=True)
46 | parser.add_argument("-optimizer", dest='optimizer',required=False, default = 'lrsfn', help="optimizer type",type=str)
47 | parser.add_argument('-fixed_step',dest = 'fixed_step',\
48 | required= False,default = 1,help='boolean for fixed step vs globalization',type = int)
49 | parser.add_argument('-alpha',dest = 'alpha',required = False,default = 1e-4,help= 'learning rate alpha',type=float)
50 | parser.add_argument('-hessian_low_rank',dest = 'hessian_low_rank',required= False,default = 40,help='low rank for sfn',type = int)
51 | parser.add_argument('-record_spectrum',dest = 'record_spectrum',\
52 | required= False,default = 0,help='boolean for recording spectrum',type = int)
53 | # parser.add_argument('-weight_burn_in',dest = 'weight_burn_in',\
54 | # required= False,default = 0,help='',type = int)
55 |
56 | # parser.add_argument('-data_seed',dest = 'data_seed',\
57 | # required= False,default = 0,help='',type = int)
58 |
59 | parser.add_argument('-batch_size',dest = 'batch_size',required= False,default = 32,help='batch size',type = int)
60 | parser.add_argument('-hess_batch_size',dest = 'hess_batch_size',required= False,default = 8,help='hess batch size',type = int)
61 | parser.add_argument('-keras_epochs',dest = 'keras_epochs',required= False,default = 50,help='keras_epochs',type = int)
62 | parser.add_argument("-keras_opt", dest='keras_opt',required=False, default = 'adam', help="optimizer type for keras",type=str)
63 | parser.add_argument('-keras_alpha',dest = 'keras_alpha',required= False,default = 1e-3,help='keras learning rate',type = float)
64 | parser.add_argument('-max_sweeps',dest = 'max_sweeps',required= False,default = 1,help='max sweeps',type = float)
65 | parser.add_argument('-weights_file',dest = 'weights_file',required= False,default = 'None',help='weight file pickle',type = str)
66 |
67 | args = parser.parse_args()
68 |
69 | try:
70 | tf.set_random_seed(0)
71 | except AttributeError: # tf.set_random_seed was removed in TF2
72 | tf.random.set_seed(0)
73 |
74 | # GPU Environment Details
75 | gpu_available = tf.test.is_gpu_available()
76 | built_with_cuda = tf.test.is_built_with_cuda()
77 | print(80*'#')
78 | print(('IS GPU AVAILABLE: '+str(gpu_available)).center(80))
79 | print(('IS BUILT WITH CUDA: '+str(built_with_cuda)).center(80))
80 | print(80*'#')
81 |
82 | settings = {}
83 | # Set run specifications
84 | # Data specs
85 | settings['batch_size'] = args.batch_size
86 | settings['hess_batch_size'] = args.hess_batch_size
87 |
88 |
89 | ################################################################################
90 | # Instantiate data
91 | (x_train, y_train), (_x_test, _y_test) = tf.keras.datasets.cifar100.load_data()
92 |
93 | # # Normalize the data
94 | # x_train = x_train.astype('float32') / 255.
95 | # x_test = x_test.astype('float32') / 255.
96 |
97 | x_train = tf.keras.applications.resnet50.preprocess_input(x_train)
98 | x_test_full = tf.keras.applications.resnet50.preprocess_input(_x_test)
99 | x_val = x_test_full[:2000]
100 | x_test = x_test_full[2000:]
101 |
102 | y_train = tf.keras.utils.to_categorical(y_train)
103 | y_test_full = tf.keras.utils.to_categorical(_y_test)
104 | y_val = y_test_full[:2000]
105 | y_test = y_test_full[2000:]
106 |
107 | ################################################################################
108 | # Create the neural network in keras
109 |
110 | # tf.keras.backend.set_floatx('float64')
111 |
112 | resnet_input_shape = (200,200,3)
113 | input_tensor = tf.keras.Input(shape = resnet_input_shape)
114 |
115 | pretrained_resnet50 = tf.keras.applications.resnet50.ResNet50(weights = 'imagenet',include_top=False,input_tensor=input_tensor)
116 |
117 | for layer in pretrained_resnet50.layers[:143]:
118 | layer.trainable = False
119 |
120 |
121 |
122 | classifier = tf.keras.models.Sequential()
123 | classifier.add(tf.keras.layers.Input(shape=(32,32,3)))
124 | classifier.add(tf.keras.layers.Lambda(lambda image: tf.image.resize(image, resnet_input_shape[:2])))
125 | classifier.add(pretrained_resnet50)
126 | classifier.add(tf.keras.layers.Flatten())
127 | classifier.add(tf.keras.layers.BatchNormalization())
128 | classifier.add(tf.keras.layers.Dense(128, activation='relu'))
129 | classifier.add(tf.keras.layers.Dropout(0.5))
130 | classifier.add(tf.keras.layers.BatchNormalization())
131 | classifier.add(tf.keras.layers.Dense(100, activation='softmax'))
132 |
133 |
134 | if args.keras_opt == 'adam':
135 | optimizer = tf.keras.optimizers.Adam(learning_rate = args.keras_alpha,epsilon = 1e-8)
136 | elif args.keras_opt == 'sgd':
137 | optimizer = tf.keras.optimizers.SGD(learning_rate=args.keras_alpha)
138 | else:
139 | raise ValueError('Unsupported keras_opt: ' + args.keras_opt)
140 |
141 | classifier.compile(optimizer=optimizer,
142 | loss=tf.keras.losses.CategoricalCrossentropy(), # the final layer applies softmax, so the outputs are probabilities, not logits
143 | metrics=['accuracy'])
144 |
145 | loss_test_0, acc_test_0 = classifier.evaluate(x_test,y_test,verbose=2)
146 | print('acc_test = ',acc_test_0)
147 | loss_val_0, acc_val_0 = classifier.evaluate(x_val,y_val,verbose=2)
148 | print('acc_val = ',acc_val_0)
149 |
150 |
151 | if args.weights_file != 'None':
152 | try:
153 | with open(args.weights_file, 'rb') as weights_pickle:
154 | best_weights = pickle.load(weights_pickle)['best_weights']
155 | for layer_name,weight in best_weights.items():
156 | classifier.get_layer(layer_name).set_weights(weight)
157 | except:
158 | print('Issue loading best weights')
159 |
160 | loss_test_final, acc_test_final = classifier.evaluate(x_test,y_test,verbose=2)
161 | print('acc_test final = ',acc_test_final)
162 | loss_val_final, acc_val_final = classifier.evaluate(x_val,y_val,verbose=2)
163 | print('acc_val final = ',acc_val_final)
164 |
165 | ################################################################################
166 | # Evaluate again on all the data.
167 | (x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar100.load_data()
168 |
169 | # # Normalize the data
170 | # x_train = x_train.astype('float32') / 255.
171 | # x_test = x_test.astype('float32') / 255.
172 |
173 | x_train = tf.keras.applications.resnet50.preprocess_input(x_train)
174 | x_test = tf.keras.applications.resnet50.preprocess_input(x_test)
175 |
176 | y_train = tf.keras.utils.to_categorical(y_train)
177 | y_test = tf.keras.utils.to_categorical(y_test)
178 |
179 | loss_test_total, acc_test_total = classifier.evaluate(x_test,y_test,verbose=2)
180 | print(80*'#')
181 | print('After hessianlearn training'.center(80))
182 | print('acc_test_total = ',acc_test_total)
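183 |
184 | # Example invocation (the weights file name is illustrative; it should point
185 | # to a pickle containing a 'best_weights' dictionary saved during training):
186 | # python imagenet_cifar100_classification_evaluate_test.py -weights_file best_weights.pkl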
183 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/hessianlearn/algorithms/varianceBasedNystrom.py:
--------------------------------------------------------------------------------
1 | # This file is part of the hessianlearn package
2 | #
3 | # hessianlearn is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or any later version.
6 | #
7 | # hessianlearn is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | # GNU Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public License
13 | # If not, see <http://www.gnu.org/licenses/>.
14 | #
15 | # Author: Nick Alger
16 |
17 | import numpy as np
18 |
19 | np.random.seed(0)
20 |
21 |
22 | def variance_based_nystrom(apply_AA, num_cols_A, oversampling_parameter=5, block_size=10,
23 | std_tol=0.5, max_bad_vectors=5, max_vectors=100, verbose=True):
24 | """
25 | Computes approximate truncated eigenvalue decomposition
26 | A = U D U^T
27 | of a n x n matrix A which is given by the following sum of matrices:
28 | A = (A1 + A2 + ... + Am)/m.
29 | U is an n x r orthonormal matrix, and D = diag(dd).
30 |
31 |     The eigenvalue decomposition is terminated when the relative standard deviation
32 |     of the eigenvalue estimates exceeds the threshold std_tol for at least
33 |     max_bad_vectors eigenvectors. Only eigenvalues which do not exceed std_tol are returned.
34 |
35 | apply_AA is a list of callables, where matvecs with the matrices Ak are computed via
36 | apply_AA[k](x) = Ak * x.
37 |
38 | num_cols_A is the number of columns of A (A is n x n, num_cols_A = n)
39 |
40 |     oversampling_parameter is the number of extra vectors used within the randomized SVD
41 |
42 | block_size is the number of random vectors per group used in the randomized eigenvalue method.
43 |
44 |     max_vectors is the maximum rank of the truncated eigenvalue decomposition
45 | """
46 | op = oversampling_parameter
47 | n = num_cols_A
48 | m = len(apply_AA)
49 |
50 | Q = np.zeros((n,0))
51 | Theta = np.zeros((0,0,m))
52 | num_bad_vectors = 0
53 | while num_bad_vectors < max_bad_vectors:
54 | Q1 = Q
55 | Theta11 = Theta
56 |
57 | Y = get_random_range_vectors(apply_AA, n, block_size)
58 | Y_perp = Y - np.dot(Q,np.dot(Q.T, Y))
59 | Q2,_ = np.linalg.qr(Y_perp)
60 | Q2 = Q2.reshape((n,-1)) # Reshape to guard against case block_size==1
61 | Q = np.hstack([Q1, Q2])
62 |
63 | Theta = compute_or_update_Theta(Q1, Q2, Theta11, apply_AA)
64 | dd, U, V = finish_computing_eigenvalue_decomposition(Q, Theta)
65 | _, all_std = compute_rayleigh_statistics(Theta, V)
66 |
67 | bad_inds = (all_std[:-op] / np.abs(dd[:-op])) > std_tol
68 | num_bad_vectors = np.sum(bad_inds)
69 |
70 | current_num_vectors = Q.shape[1]
71 | current_rank = current_num_vectors - op - num_bad_vectors
72 | if verbose:
73 | print('current_rank=', current_rank, ', num_bad_vectors=', num_bad_vectors)
74 |
75 | if current_num_vectors > max_vectors:
76 | break
77 |
78 | good_inds = np.logical_not(bad_inds)
79 | dd_good = dd[:-op][good_inds]
80 | U_good = U[:,:-op][:,good_inds]
81 | all_std_good = all_std[:-op][good_inds]
82 | return [dd_good, U_good, all_std_good],[dd[:-op],U[:,:-op],all_std[:-op]]
83 |
84 |
85 | def get_random_range_vectors(apply_AA, num_cols_A, block_size_r,seed = 0):
86 | """
87 | Computes n x r matrix
88 | Y = A * Omega
89 | where A is an n x n matrix of the form
90 | A = (A1 + A2 + ... + Am)/m,
91 | matvecs with the matrices Ak may be computed via the function
92 | apply_AA[k](x) = Ak * x,
93 | and Omega is a random n x r matrix.
94 | """
95 | n = num_cols_A
96 | r = block_size_r
97 | m = len(apply_AA)
98 |
99 | Omega = np.random.randn(n, r)
100 | Y = np.zeros((n, r))
101 | # In Tensorflow:
102 | # z = g^T Omega
103 | # q = unstack(z)
104 | # Y = (1/m) * restack(dq_i / dw)
105 | for j in range(r): # These loops can be trivially parallelized
106 | for k in range(m):
107 | Y[:,j] = Y[:,j] + (1./m)*apply_AA[k](Omega[:,j])
108 | return Y
109 |
110 |
111 | def compute_Theta(orthonormal_range_basis_Q, apply_AA):
112 | """
113 | Computes r x r x m 3-tensor Theta with entries
114 | Theta_ijk = qi^T Ak qj.
115 | Theta has frontal slices
116 | Theta_::k = Q^T Ak Q.
117 | """
118 | Q = orthonormal_range_basis_Q
119 | m = len(apply_AA)
120 | r = Q.shape[1]
121 |
122 | Theta = np.zeros((r, r, m))
123 | for j in range(r): # These loops can be trivially parallelized
124 | for k in range(m):
125 | Theta[:,j,k] = np.dot(Q.T, apply_AA[k](Q[:,j]))
126 | return Theta
127 |
128 |
129 | def finish_computing_eigenvalue_decomposition(orthonormal_range_basis_Q, Theta):
130 | """
131 | Finishes computing eigenvalue decomposition
132 | A = U diag(dd) U^T,
133 | and smaller auxiliary eigenvalue decomposition
134 | Q^T A Q = V diag(dd) V^T
135 | where Q is an orthonormal basis for the range of
136 | A = (A1+A2+...+Am)/m,
137 | and Theta is the matrix with frontal slices
138 | Theta_::k = Q^T Ak Q.
139 | """
140 | Q = orthonormal_range_basis_Q
141 | m = Theta.shape[-1]
142 |
143 | B = (1. / m) * np.sum(Theta, axis=-1)
144 | dd, V = np.linalg.eigh(B)
145 | idx = np.argsort(np.abs(dd))[::-1]
146 | dd = dd[idx]
147 | V = V[:,idx]
148 |
149 | U = np.dot(Q, V)
150 | return dd, U, V
151 |
152 |
153 | def compute_rayleigh_statistics(Theta, small_eigenvectors_V):
154 | """
155 | Computes sample mean and standard deviation of Rayleigh quotients
156 | all_mu[i] = mean(ui^T Ak ui)
157 | all_std[i] = std(ui^T Ak ui)
158 | where Ak is randomly chosen, and ui is the i'th eigenvector of
159 | A = (A1 + A2 + ... + Am)/m.
160 | Theta is the r x r x m 3-tensor with frontal slices
161 | Theta_::k = Q^T Ak Q,
162 | for orthonormal basis Q such that
163 | A =approx= Q * Q^T * A
164 | The columns, vi, of V are the eigenvectors of the matrix Q^T A Q, i.e.,
165 | Q^T A Q = V D V^T
166 | where D is the diagonal matrix of eigenvalues, which we do not need here.
167 | (Note that ui = Q * vi).
168 | """
169 | V = small_eigenvectors_V
170 | r = Theta.shape[0]
171 |
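    # C[l, i] = v_l^T (Q^T A_i Q) v_l: the Rayleigh quotient of eigenvector
    # v_l under the i'th sample matrix, computed for all l and i at once.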
172 | C = np.sum(V.reshape((r,r,-1)) * np.einsum('jki,kl->jli', Theta, V), axis=0)
173 | all_mu = np.mean(C, axis=1)
174 | all_std = np.std(C, axis=1)
175 | return all_mu, all_std
176 |
177 |
178 | def update_Theta(Q1, Q2, Theta11, apply_AA):
179 | """
180 | Computes updated r x r x m 3-tensor Theta with frontal slices
181 | Theta_::k = Q^T Ak Q
182 | based on old Theta1 with frontal slices
183 | Theta11_::k = Q1^T Ak Q1.
184 | Here Q1 and Q2 are orthonormal matrices, and
185 | Q = [Q1, Q2]
186 | is also an orthonormal matrix.
187 | Q1 was the old range approximation for A.
188 | Q2 columns are more vectors to improve the range approximation.
189 | Q is the new range approximation.
190 | """
191 | m = len(apply_AA)
192 | r1 = Q1.shape[1]
193 | r2 = Q2.shape[1]
194 | r = r1 + r2
195 | Theta12 = np.zeros((r1, r2, m))
196 | Theta22 = np.zeros((r2, r2, m))
197 | for i in range(r2): # These loops can be trivially parallelized
198 | for k in range(m):
199 | Ak_qi = apply_AA[k](Q2[:,i])
200 | Theta12[:,i,k] = np.dot(Q1.T, Ak_qi)
201 | Theta22[:,i,k] = np.dot(Q2.T, Ak_qi)
202 |
203 | Theta = np.zeros((r, r, m))
204 | Theta[:r1, :r1, :] = Theta11
205 | Theta[:r1, r1:, :] = Theta12
206 | Theta[r1:, :r1, :] = Theta12.swapaxes(0,1)
207 | Theta[r1:, r1:, :] = Theta22
208 | return Theta
209 |
210 |
211 | def compute_or_update_Theta(Q1, Q2, Theta11, apply_AA):
212 | if Theta11.size == 0:
213 | return compute_Theta(Q2, apply_AA)
214 | else:
215 | return update_Theta(Q1, Q2, Theta11, apply_AA)
216 |
217 |
218 |
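# ------------------------------------------------------------------
# Editor's sketch (not in the original file): minimal usage on m noisy
# random symmetric matrices A_k = C C^T + noise that share a common
# dominant subspace. The sizes and callables below are illustrative only.
if __name__ == '__main__':
    n, m, true_rank = 100, 30, 10
    C = np.random.randn(n, true_rank)
    noisy = [C @ C.T + 0.1 * np.random.randn(n, n) for _ in range(m)]
    AA = [0.5 * (A + A.T) for A in noisy]          # symmetrize each sample
    apply_AA = [lambda x, A=A: A @ x for A in AA]  # matvec callables
    [dd, U, stds], _ = variance_based_nystrom(apply_AA, n, verbose=False)
    print('recovered rank:', len(dd))              # expected: about true_rank
# ------------------------------------------------------------------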
--------------------------------------------------------------------------------
/applications/mnist/mnist_vae.py:
--------------------------------------------------------------------------------
1 | # This file is part of the hessianlearn package
2 | #
3 | # hessianlearn is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or any later version.
6 | #
7 | # hessianlearn is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | # GNU Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public License
13 | # If not, see <http://www.gnu.org/licenses/>.
14 | #
15 | # Author: Tom O'Leary-Roseberry
16 | # Contact: tom.olearyroseberry@utexas.edu
17 |
18 | ################################################################################
19 | # Uses some code from https://blog.keras.io/building-autoencoders-in-keras.html
20 | ################################################################################
21 |
22 | import numpy as np
23 | import os
24 | import tensorflow as tf
25 | import time
26 | # if int(tf.__version__[0]) > 1:
27 | # import tensorflow.compat.v1 as tf
28 | # tf.disable_v2_behavior()
29 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
30 | os.environ['KMP_DUPLICATE_LIB_OK']='True'
31 | os.environ["KMP_WARNINGS"] = "FALSE"
32 | import sys
33 | sys.path.append( os.environ.get('HESSIANLEARN_PATH', "../../"))
34 | from hessianlearn import *
35 |
36 | try: tf.set_random_seed(0)  # TF1 API
37 | except AttributeError: tf.random.set_seed(0)  # TF2 API
38 | settings = {}
39 | # Set run specifications
40 | # Data specs
41 | settings['batch_size'] = 100
42 | settings['hess_batch_size'] = 10
43 |
44 |
45 | ################################################################################
46 | # Instantiate data
47 | (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
48 |
49 |
50 | # Normalize the data
51 | x_train = x_train.astype('float32') / 255.
52 | x_test = x_test.astype('float32') / 255.
53 | # Reshape the data
54 | flattened_dimension = np.prod(x_train.shape[1:])
55 | x_train = x_train.reshape((len(x_train), flattened_dimension))
56 | x_test = x_test.reshape((len(x_test), flattened_dimension))
57 |
58 | # Instantiate the data object
59 | data = Data([x_train,y_train],settings['batch_size'],test_data = [x_test,y_test],hessian_batch_size = settings['hess_batch_size'])
60 |
61 | # settings['input_shape'] = data._input_shape
62 | # settings['output_shape'] = data._output_shape
63 |
64 |
65 | ################################################################################
66 | # Build the variational autoencoder neural network model here
67 |
68 | # network parameters
69 | input_shape = (flattened_dimension, )
70 | intermediate_dim = 512
71 | latent_dim = 2
72 |
73 | # VAE model = encoder + decoder
74 | # build encoder model
75 | inputs = tf.keras.layers.Input(shape=input_shape)
76 | x_encoder = tf.keras.layers.Dense(intermediate_dim, activation='softplus')(inputs)
77 | z_mean = tf.keras.layers.Dense(latent_dim, name='z_mean')(x_encoder)
78 | z_log_var = tf.keras.layers.Dense(latent_dim, name='z_log_var')(x_encoder)
79 |
80 | # reparameterization trick
81 | # instead of sampling from Q(z|X), sample epsilon = N(0,I)
82 | # z = z_mean + sqrt(var) * epsilon
83 | def sampling(args):
84 | """Reparameterization trick by sampling from an isotropic unit Gaussian.
85 | # Arguments
86 | args (tensor): mean and log of variance of Q(z|X)
87 | # Returns
88 | z (tensor): sampled latent vector
89 | """
90 | z_mean, z_log_var = args
91 | batch = tf.keras.backend.shape(z_mean)[0]
92 | dim = tf.keras.backend.int_shape(z_mean)[1]
93 | # by default, random_normal has mean = 0 and std = 1.0
94 | epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
95 | return z_mean + tf.keras.backend.exp(0.5 * z_log_var) * epsilon
96 | # use reparameterization trick to push the sampling out as input
97 | # note that "output_shape" isn't necessary with the TensorFlow backend
98 | z = tf.keras.layers.Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])
99 |
100 | # instantiate encoder model
101 | encoder = tf.keras.models.Model(inputs, [z_mean, z_log_var, z], name='encoder')
102 |
103 | # build decoder model
104 | latent_inputs = tf.keras.layers.Input(shape=(latent_dim,), name='z_sampling')
105 | x_decoder = tf.keras.layers.Dense(intermediate_dim, activation='softplus')(latent_inputs)
106 | outputs = tf.keras.layers.Dense(flattened_dimension, activation='sigmoid')(x_decoder)
107 |
108 | # instantiate decoder model
109 | decoder = tf.keras.models.Model(latent_inputs, outputs, name='decoder')
110 |
111 | # instantiate VAE model
112 | outputs = decoder(encoder(inputs)[2])
113 | vae = tf.keras.models.Model(inputs, outputs, name='vae_mlp')
114 |
115 |
116 |
117 | ################################################################################
118 | # Instantiate the problem, regularization.
119 |
120 | problem = VariationalAutoencoderProblem(vae,z_mean,z_log_var,dtype=tf.float32)
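# Note (editor's sketch of the standard formulation, not necessarily the
# exact loss assembled by VariationalAutoencoderProblem): the usual VAE
# objective combines a reconstruction term with a KL term which, for a
# Gaussian encoder, has the closed form
#   KL = -0.5 * sum(1 + z_log_var - z_mean**2 - exp(z_log_var))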
121 |
122 | settings['tikhonov_gamma'] = 1e-2
123 | regularization = L2Regularization(problem,gamma = settings['tikhonov_gamma'])
124 |
125 |
126 | ################################################################################
127 | # Instantiate the model object
128 | HLModelSettings = HessianlearnModelSettings()
129 |
130 | HLModelSettings['optimizer'] = 'lrsfn'
131 | HLModelSettings['alpha'] = 5e-4
132 | HLModelSettings['globalization'] = 'line_search'
133 | HLModelSettings['hessian_low_rank'] = 20
134 | HLModelSettings['max_backtrack'] = 16
135 | HLModelSettings['max_sweeps'] = 50
136 |
137 | HLModelSettings['problem_name'] = 'mnist_vae'
138 |
139 |
140 | HLModel = HessianlearnModel(problem,regularization,data,settings = HLModelSettings)
141 |
142 |
143 | # Can pass in an initial guess for the weights w_0 to the method fit, if desired.
144 | HLModel.fit(w_0 = None)
145 |
146 | ################################################################################
147 | # Post processing
148 | import matplotlib.pyplot as plt
149 | def plot_results(models,
150 | data,
151 | batch_size=128,
152 | model_name="vae_mnist"):
153 | """Plots labels and MNIST digits as a function of the 2D latent vector
154 | # Arguments
155 | models (tuple): encoder and decoder models
156 | data (tuple): test data and label
157 | batch_size (int): prediction batch size
158 | model_name (string): which model is using this function
159 | """
160 |
161 | encoder, decoder = models
162 | x_test, y_test = data
163 | os.makedirs(model_name, exist_ok=True)
164 |
165 | filename = os.path.join(model_name, "vae_mean.png")
166 | # display a 2D plot of the digit classes in the latent space
167 | z_mean, _, _ = encoder.predict(x_test,
168 | batch_size=batch_size)
169 | plt.figure(figsize=(12, 10))
170 | plt.scatter(z_mean[:, 0], z_mean[:, 1], c=y_test)
171 | plt.colorbar()
172 | plt.xlabel("z[0]")
173 | plt.ylabel("z[1]")
174 | plt.savefig(filename)
175 | plt.show()
176 |
177 | filename = os.path.join(model_name, "digits_over_latent.png")
178 | # display a 30x30 2D manifold of digits
179 | n = 30
180 | digit_size = 28
181 | figure = np.zeros((digit_size * n, digit_size * n))
182 | # linearly spaced coordinates corresponding to the 2D plot
183 | # of digit classes in the latent space
184 | grid_x = np.linspace(-4, 4, n)
185 | grid_y = np.linspace(-4, 4, n)[::-1]
186 |
187 | for i, yi in enumerate(grid_y):
188 | for j, xi in enumerate(grid_x):
189 | z_sample = np.array([[xi, yi]])
190 | x_decoded = decoder.predict(z_sample)
191 | digit = x_decoded[0].reshape(digit_size, digit_size)
192 | figure[i * digit_size: (i + 1) * digit_size,
193 | j * digit_size: (j + 1) * digit_size] = digit
194 |
195 | plt.figure(figsize=(10, 10))
196 | start_range = digit_size // 2
197 | end_range = (n - 1) * digit_size + start_range + 1
198 | pixel_range = np.arange(start_range, end_range, digit_size)
199 | sample_range_x = np.round(grid_x, 1)
200 | sample_range_y = np.round(grid_y, 1)
201 | plt.xticks(pixel_range, sample_range_x)
202 | plt.yticks(pixel_range, sample_range_y)
203 | plt.xlabel("z[0]")
204 | plt.ylabel("z[1]")
205 | plt.imshow(figure, cmap='Greys_r')
206 | plt.savefig(filename)
207 | plt.show()
208 |
209 |
210 | models = (encoder, decoder)
211 | data = (x_test, y_test)
212 | plot_results(models,
213 | data,
214 | batch_size=settings['batch_size'],
215 | model_name= HLModelSettings['optimizer']+'_vae_mlp')
216 |
217 |
218 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | ___ ___ ___ ___ ___ ___
6 | /__/\ / /\ / /\ / /\ ___ / /\ /__/\
7 | \ \:\ / /:/_ / /:/_ / /:/_ / /\ / /::\ \ \:\
8 | \__\:\ / /:/ /\ / /:/ /\ / /:/ /\ / /:/ / /:/\:\ \ \:\
9 | ___ / /::\ / /:/ /:/_ / /:/ /::\ / /:/ /::\ /__/::\ / /:/~/::\ _____\__\:\
10 | /__/\ /:/\:\/__/:/ /:/ /\/__/:/ /:/\:\/__/:/ /:/\:\\__\/\:\__ /__/:/ /:/\:\/__/::::::::\
11 | \ \:\/:/__\/\ \:\/:/ /:/\ \:\/:/~/:/\ \:\/:/~/:/ \ \:\/\\ \:\/:/__\/\ \:\~~\~~\/
12 | \ \::/ \ \::/ /:/ \ \::/ /:/ \ \::/ /:/ \__\::/ \ \::/ \ \:\ ~~~
13 | \ \:\ \ \:\/:/ \__\/ /:/ \__\/ /:/ /__/:/ \ \:\ \ \:\
14 | \ \:\ \ \::/ /__/:/ /__/:/ \__\/ \ \:\ \ \:\
15 | \__\/ \__\/ \__\/ \__\/ \__\/ \__\/
16 |
17 |
18 | ___ ___ ___ ___
19 | / /\ / /\ / /\ /__/\
20 | / /:/_ / /::\ / /::\ \ \:\
21 | ___ ___ / /:/ /\ / /:/\:\ / /:/\:\ \ \:\
22 | /__/\ / /\ / /:/ /:/_ / /:/~/::\ / /:/~/:/ _____\__\:\
23 | \ \:\ / /://__/:/ /:/ /\/__/:/ /:/\:\/__/:/ /:/___/__/::::::::\
24 | \ \:\ /:/ \ \:\/:/ /:/\ \:\/:/__\/\ \:\/:::::/\ \:\~~\~~\/
25 | \ \:\/:/ \ \::/ /:/ \ \::/ \ \::/~~~~ \ \:\ ~~~
26 | \ \::/ \ \:\/:/ \ \:\ \ \:\ \ \:\
27 | \__\/ \ \::/ \ \:\ \ \:\ \ \:\
28 | \__\/ \__\/ \__\/ \__\/
29 |
30 |
31 |
32 |
33 |
34 | [](https://travis-ci.com/tomoleary/hessianlearn)
35 | [](https://zenodo.org/badge/latestdoi/184635062)
36 | [](./LICENSE.md)
37 | [](https://www.python.org)
38 | 
39 | [](https://github.com/tomoleary/hessianlearn/issues)
40 | [](https://github.com/tomoleary/hessianlearn/commits/master)
41 |
42 | # Hessian-based stochastic optimization in TensorFlow and keras
43 |
44 | This code implements Hessian-based stochastic optimization in TensorFlow and keras by exposing the matrix-free action of the Hessian to users. The code is meant to allow rapid prototyping of Hessian-based algorithms: the matrix-free Hessian action lets users inspect curvature information for stochastic nonconvex (neural network training) optimization problems.
45 |
46 | The Hessian action is exposed via matrix-vector products:
47 | 
48 |     H(w) v
49 | 
50 | 
51 | and matrix-matrix products:
52 | 
53 |     H(w) V
54 | 
55 | 
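For illustration, a matrix-free Hessian-vector product can be formed with nested automatic differentiation. The sketch below is a generic TF2 eager-mode illustration of the idea (hessianlearn itself builds these products on the v1-style graph with `placeholders`); the toy loss `L(w) = sum(w^4)` is only an example.

```python
import tensorflow as tf

w = tf.Variable([1.0, 2.0])      # parameters
v = tf.constant([0.5, -1.0])     # direction vector

with tf.GradientTape() as outer:
    with tf.GradientTape() as inner:
        loss = tf.reduce_sum(w ** 4)   # toy loss L(w)
    g = inner.gradient(loss, w)        # gradient of L at w
    gv = tf.reduce_sum(g * v)          # scalar inner product (g, v)
Hv = outer.gradient(gv, w)             # Hessian action H(w) v
print(Hv.numpy())                      # here H = diag(12 w^2), so Hv = [6., -48.]
```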
56 | ## Compatibility
57 |
58 | The code is compatible with TensorFlow v1 and v2, but certain features of v2, such as eager execution, are disabled. This is because the Hessian matrix products in hessianlearn are implemented using `placeholders`, which are deprecated in v2. For this reason hessianlearn cannot work with data generators or other features that require eager execution. If any compatibility issues are found, please open an [issue](https://github.com/tomoleary/hessianlearn/issues).
59 |
60 | ## Usage
61 | Set the `HESSIANLEARN_PATH` environment variable.
62 |
63 | Train a keras model
64 |
65 | ```python
66 | import os,sys
67 | import tensorflow as tf
68 | sys.path.append( os.environ.get('HESSIANLEARN_PATH'))
69 | from hessianlearn import *
70 |
71 | # Define keras neural network model
72 | neural_network = tf.keras.models.Model(...)
73 | # Define loss function and compile model
74 | neural_network.compile(loss = ...)
75 |
76 | ```
77 |
78 | hessianlearn implements various training [`problem`](https://github.com/tomoleary/hessianlearn/blob/master/hessianlearn/problem/problem.py) constructs (regression, classification, autoencoders, variational autoencoders, generative adversarial networks). Instantiate a `problem`, a `data` object (which takes a dictionary whose keys are the corresponding `placeholders` in `problem`), and a `regularization`
79 |
80 | ```python
81 | # Instantiate the problem (this handles the loss function,
82 | # construction of hessian and gradient etc.)
83 | # KerasModelProblem extracts loss function and metrics from
84 | # a compiled keras model
85 | problem = KerasModelProblem(neural_network)
86 | # Instantiate the data object, this handles the train / validation split
87 | # as well as iterating during training
88 | data = Data({problem.x:x_data,problem.y_true:y_data},train_batch_size,\
89 | validation_data_size = validation_data_size)
90 | # Instantiate the regularization: L2Regularization is Tikhonov,
91 | # gamma = 0 is no regularization
92 | regularization = L2Regularization(problem,gamma = 0)
93 | ```
94 |
95 | Pass these objects into the `HessianlearnModel`, which handles the training:
96 |
97 | ```python
98 | HLModel = HessianlearnModel(problem,regularization,data)
99 | HLModel.fit()
100 | ```
101 |
102 | ### Alternative Usage (More like Keras Interface)
103 | The example above reflects the original optimizer interface in hessianlearn. To better mimic the keras interface, and to allow more end-user rapid prototyping of the optimizer used to fit data, the following interface was added in December 2021:
104 |
105 | ```python
106 | import os,sys
107 | import tensorflow as tf
108 | sys.path.append( os.environ.get('HESSIANLEARN_PATH'))
109 | from hessianlearn import *
110 |
111 | # Define keras neural network model
112 | neural_network = tf.keras.models.Model(...)
113 | # Define loss function and compile model
114 | neural_network.compile(loss = ...)
115 | # Instance keras model wrapper which deals with the
116 | # construction of the `problem` which handles the construction
117 | # of Hessian computational graph and variables
118 | HLModel = KerasModelWrapper(neural_network)
119 | # Then the end user can pass in an optimizer
120 | # (e.g. custom end-user optimizer)
121 | optimizer = LowRankSaddleFreeNewton # The class constructor, not an instance
122 | opt_parameters = LowRankSaddleFreeNewtonParameters()
123 | opt_parameters['hessian_low_rank'] = 40
124 | HLModel.set_optimizer(optimizer,optimizer_parameters = opt_parameters)
125 | # The data object still needs to key on to the specific computational
126 | # graph variables that data will be passed in for.
127 | # Note that data can naturally handle multiple input and output data,
128 | # in which case problem.x, problem.y_true are lists corresponding to
129 | # neural_network.inputs, neural_network.outputs
130 | problem = HLModel.problem
131 | data = Data({problem.x:x_data,problem.y_true:y_data},train_batch_size,\
132 | validation_data_size = validation_data_size)
133 | # And finally one can call fit!
134 | HLModel.fit(data)
135 | ```
136 |
137 | ## Examples
138 |
139 | [Tutorial 0: MNIST Autoencoder](https://github.com/tomoleary/hessianlearn/blob/master/tutorial/Tutorial%200%20MNIST%20Autoencoder.ipynb)
140 |
141 |
142 | ## Applications
143 |
144 | ### Transfer Learning
145 |
146 | * Examples of CIFAR10, CIFAR100 classification from pre-trained Imagenet ResNet50 model in `applications/transfer_learning/`
147 |
148 | * The pre-trained model serves as a well-conditioned initial guess for transfer learning. In this setting Newton methods perform well due to their excellent local convergence properties. Low Rank Saddle Free Newton is able to zero in on highly generalizable local minimizers, bypassing indefinite regions. Below are validation accuracies for the best choices of fixed step-length for Adam, SGD and LRSFN with a fixed rank of 40.
149 |
150 | 
151 |     (figure: validation accuracy comparison of Adam, SGD, and LRSFN)
152 | 
153 |
154 | # References
155 |
156 | These manuscripts motivate and use the hessianlearn library for stochastic nonconvex optimization
157 |
158 | - \[1\] O'Leary-Roseberry, T., Alger, N., Ghattas O.,
159 | [**Inexact Newton Methods for Stochastic Nonconvex Optimization with Applications to Neural Network Training**](https://arxiv.org/abs/1905.06738).
160 | arXiv:1905.06738.
161 | ([Download](https://arxiv.org/pdf/1905.06738.pdf)) BibTeX:
162 | @article{OLearyRoseberryAlgerGhattas2019,
163 | title={Inexact Newton methods for stochastic nonconvex optimization with applications to neural network training},
164 | author={O'Leary-Roseberry, Thomas and Alger, Nick and Ghattas, Omar},
165 | journal={arXiv preprint arXiv:1905.06738},
166 | year={2019}
167 | }
168 | 
169 |
170 | - \[2\] O'Leary-Roseberry, T., Alger, N., Ghattas O.,
171 | [**Low Rank Saddle Free Newton: A Scalable Method for Stochastic Nonconvex Optimization**](https://arxiv.org/abs/2002.02881).
172 | arXiv:2002.02881.
173 | ([Download](https://arxiv.org/pdf/2002.02881.pdf)) BibTeX:
174 | @article{OLearyRoseberryAlgerGhattas2020,
175 | title={Low Rank Saddle Free Newton: Algorithm and Analysis},
176 | author={O'Leary-Roseberry, Thomas and Alger, Nick and Ghattas, Omar},
177 | journal={arXiv preprint arXiv:2002.02881},
178 | year={2020}
179 | }
180 | 
181 |
182 |
183 | - \[3\] O'Leary-Roseberry, T., Villa, U., Chen P., Ghattas O.,
184 | [**Derivative-Informed Projected Neural Networks for High-Dimensional Parametric Maps Governed by PDEs**](https://www.sciencedirect.com/science/article/pii/S0045782521005302).
185 | Computer Methods in Applied Mechanics and Engineering. Volume 388, 1 January 2022, 114199.
186 | ([Download](https://arxiv.org/pdf/2011.15110.pdf)) BibTeX:
187 | @article{OLearyRoseberryVillaChenEtAl2022,
188 | title={Derivative-informed projected neural networks for high-dimensional parametric maps governed by {PDE}s},
189 | author={O’Leary-Roseberry, Thomas and Villa, Umberto and Chen, Peng and Ghattas, Omar},
190 | journal={Computer Methods in Applied Mechanics and Engineering},
191 | volume={388},
192 | pages={114199},
193 | year={2022},
194 | publisher={Elsevier}
195 | }
196 | 
197 |
198 |
199 | - \[4\] O'Leary-Roseberry, T., Du, X., Chaudhuri, A., Martins, J., Willcox, K., Ghattas, O.,
200 | [**Adaptive Projected Residual Networks for Learning Parametric Maps from Sparse Data**](https://arxiv.org/abs/2112.07096).
201 | arXiv:2112.07096.
202 | ([Download](https://arxiv.org/pdf/2112.07096.pdf)) BibTeX:
203 | @article{OLearyRoseberryDuChaudhuriEtAl2021,
204 | title={Adaptive Projected Residual Networks for Learning Parametric Maps from Sparse Data},
205 |         author={O'Leary-Roseberry, Thomas and Du, Xiaosong and Chaudhuri, Anirban and Martins, Joaquim R. R. A. and Willcox, Karen and Ghattas, Omar},
206 | journal={arXiv preprint arXiv:2112.07096},
207 | year={2021}
208 | }
209 | 
210 |
211 |
212 |
213 |
214 |
215 |
--------------------------------------------------------------------------------
/hessianlearn/algorithms/lowRankSaddleFreeNewton.py:
--------------------------------------------------------------------------------
1 | # This file is part of the hessianlearn package
2 | #
3 | # hessianlearn is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or any later version.
6 | #
7 | # hessianlearn is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | # GNU Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public License
13 | # If not, see <http://www.gnu.org/licenses/>.
14 | #
15 | # Author: Tom O'Leary-Roseberry
16 | # Contact: tom.olearyroseberry@utexas.edu
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import numpy as np
23 | from scipy.sparse import diags
24 | import time
25 |
26 | from ..utilities.parameterList import ParameterList
27 | from ..algorithms import Optimizer
28 | from ..algorithms.globalization import ArmijoLineSearch, TrustRegion
29 | from ..algorithms.randomizedEigensolver import randomized_eigensolver, eigensolver_from_range
30 | from ..algorithms.rangeFinders import block_range_finder, noise_aware_adaptive_range_finder
31 | from ..algorithms.varianceBasedNystrom import variance_based_nystrom
32 | from ..problem import L2Regularization, HessianWrapper
33 |
34 |
35 |
36 |
37 | def ParametersLowRankSaddleFreeNewton(parameters = {}):
38 | parameters['alpha'] = [1e-3, "Initial steplength, or learning rate"]
39 | parameters['rel_tolerance'] = [1e-3, "Relative convergence when sqrt(g,g)/sqrt(g_0,g_0) <= rel_tolerance"]
40 | parameters['abs_tolerance'] = [1e-4,"Absolute converge when sqrt(g,g) <= abs_tolerance"]
41 | parameters['default_damping'] = [1e-3, "Levenberg-Marquardt damping when no regularization is used"]
42 |
43 | # Hessian approximation parameters
44 |     parameters['range_finding'] = [None, "Range finding; if None then r = hessian_low_rank. "
45 |                                          "Choose from None, 'arf', 'naarf', 'vn'"]
46 |     parameters['range_rel_error_tolerance'] = [0.1, "Relative error tolerance for the error estimator in adaptive range finding"]
47 |     parameters['range_abs_error_tolerance'] = [100, "Absolute error tolerance for the error estimator in adaptive range finding"]
48 | parameters['range_block_size'] = [20, "Block size used in range finder"]
49 | parameters['rq_samples_for_naarf'] = [100, "Number of partitions for RQ variance evaluation"]
50 | parameters['hessian_low_rank'] = [20, "Fixed rank for randomized eigenvalue decomposition"]
51 | # Variance Nystrom Parameters
52 | parameters['max_bad_vectors_nystrom'] = [5, "Number of maximum bad vectors for variance based Nystrom"]
53 | parameters['max_vectors_nystrom'] = [40, "Number of maximum vectors for variance based Nystrom"]
54 | parameters['nystrom_std_tolerance'] = [0.5, "Noise to eigenvalue ratio used for Nystrom truncation"]
55 |
56 |
57 |     # Globalization parameters
58 | parameters['globalization'] = [None, 'Choose from trust_region, line_search, spectral_step or none']
59 | parameters['max_backtracking_iter'] = [5, 'Max backtracking iterations for armijo line search']
60 | parameters['spectral_step_alpha'] = [1e-2, 'Used in min condition for spectral step']
61 |
62 | parameters['verbose'] = [False, "Printing"]
63 | parameters['record_last_rq_std'] = [False, "Record the last eigenvector RQ variance"]
64 |
65 | return ParameterList(parameters)
66 |
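# A minimal configuration sketch (editor's addition, not in the original
# file): override a few of the hyperparameters above and construct the
# optimizer. Here `problem`, `regularization` and `sess` are assumed to
# already exist, as they do inside HessianlearnModel's training loop.
#
#     parameters = ParametersLowRankSaddleFreeNewton()
#     parameters['hessian_low_rank'] = 40        # fixed rank r
#     parameters['range_finding'] = 'vn'         # variance-based Nystrom
#     parameters['globalization'] = 'line_search'
#     optimizer = LowRankSaddleFreeNewton(problem, regularization=regularization,
#                                         sess=sess, parameters=parameters)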
67 |
68 | class LowRankSaddleFreeNewton(Optimizer):
69 | """
70 | This class implements the Low Rank Saddle Free Newton (LRSFN) algorithm
71 | """
72 | def __init__(self,problem,regularization = None,sess = None,parameters = ParametersLowRankSaddleFreeNewton(),preconditioner = None):
73 | """
74 | The constructor for this class takes:
75 | -problem: hessianlearn.problem.Problem
76 | -regularization: hessianlearn.problem.Regularization
77 | -sess: tf.Session()
78 | -parameters: hyperparameters dictionary
79 | -preconditioner: hessianlearn.problem.Preconditioner
80 | """
81 | if regularization is None:
82 | _regularization = L2Regularization(problem,gamma = 0.0)
83 | else:
84 | _regularization = regularization
85 | super(LowRankSaddleFreeNewton,self).__init__(problem,_regularization,sess,parameters)
86 |
87 | self.grad = self.problem.gradient + self.regularization.gradient
88 |
89 | if self.parameters['globalization'] == 'trust_region':
90 | self.trust_region = TrustRegion()
91 | self._sweeps = np.zeros(2)
92 |
93 | self.alpha = 0.0
94 | self._rank = 0
95 |
96 | self._rq_std = 0.0
97 |
98 | self.eigenvalues = None
99 |
100 | @property
101 | def rank(self):
102 | return self._rank
103 |
104 |     @property
105 |     def rq_std(self):
106 |         return self._rq_std
107 |
108 |
109 |
110 |
111 | def minimize(self,feed_dict = None,hessian_feed_dict = None,rq_estimator_dict = None):
112 | r"""
113 |         Solves the saddle escape problem. Given a misfit (loss) Hessian operator H:
114 |         1. Compute the low rank approximation H = U_r Lambda_r U_r^T
115 |         2. Solve [U_r |Lambda_r| U_r^T + gamma I] p = -g for p via the Woodbury formula:
116 | 
117 |         [U_r |Lambda_r| U_r^T + gamma I]^{-1} = (1/gamma) I - (1/gamma) U_r D U_r^T
118 |         where D = diag(|lambda_i| / (|lambda_i| + gamma))
119 | -feed_dict: data dictionary used for evaluating gradient and cost
120 | -hessian_feed_dict: dictionary used for stochastic Hessian
121 | -rq_estimator_dict: dictionary used for RQ variance calculations
122 |
123 | """
124 | self._iter += 1
125 | assert self.sess is not None
126 | assert feed_dict is not None
127 |
128 | assert self.parameters['range_finding'] in [None,'arf','naarf','vn']
129 |
130 | if hessian_feed_dict is None:
131 | hessian_feed_dict = feed_dict
132 |
133 |
134 | gradient = self.sess.run(self.grad,feed_dict = feed_dict)
135 |
136 | alpha = self.parameters['alpha']
137 |
138 | if self.parameters['range_finding'] == 'arf':
139 | H = lambda x: self.H(x,hessian_feed_dict,verbose = self.parameters['verbose'])
140 | n = self.problem.dimension
141 | # norm_g = np.linalg.norm(gradient)
142 | # tolerance = self.parameters['range_rel_error_tolerance']*norm_g
143 | tolerance = self.parameters['range_rel_error_tolerance']
144 | Q = block_range_finder(H,n,tolerance,self.parameters['range_block_size'])
145 | self._rank = Q.shape[1]
146 | Lmbda,U = eigensolver_from_range(H,Q)
147 |
148 | elif self.parameters['range_finding'] == 'naarf':
149 | norm_g = np.linalg.norm(gradient)
150 | tolerance = self.parameters['range_rel_error_tolerance']*norm_g
151 | if rq_estimator_dict is None:
152 | rq_estimator_dict_list = self.problem._partition_dictionaries(feed_dict,self.parameters['rq_samples_for_naarf'])
153 | elif type(rq_estimator_dict) == list:
154 | rq_estimator_dict_list = rq_estimator_dict
155 | elif type(rq_estimator_dict) == dict:
156 | rq_estimator_dict_list = self.problem._partition_dictionaries(rq_estimator_dict,self.parameters['rq_samples_for_naarf'])
157 | else:
158 |             raise TypeError('rq_estimator_dict must be None, a list or a dict')
159 | Q = noise_aware_adaptive_range_finder(self.H,hessian_feed_dict,rq_estimator_dict_list,block_size = self.parameters['range_block_size'],epsilon = tolerance)
160 | self._rank = Q.shape[1]
161 | H = lambda x: self.H(x,hessian_feed_dict,verbose = self.parameters['verbose'])
162 | Lmbda,U = eigensolver_from_range(H,Q)
163 |
164 | elif self.parameters['range_finding'] == 'vn':
165 | if rq_estimator_dict is None:
166 | rq_estimator_dict_list = self.problem._partition_dictionaries(feed_dict,self.parameters['rq_samples_for_naarf'])
167 | elif type(rq_estimator_dict) == list:
168 | rq_estimator_dict_list = rq_estimator_dict
169 | elif type(rq_estimator_dict) == dict:
170 | rq_estimator_dict_list = self.problem._partition_dictionaries(rq_estimator_dict,self.parameters['rq_samples_for_naarf'])
171 | else:
172 |             raise TypeError('rq_estimator_dict must be None, a list or a dict')
173 | nystrom_t0 = time.time()
174 | apply_H_list = [HessianWrapper(self.H,dictionary) for dictionary in rq_estimator_dict_list]
175 | [Lmbda, U, all_std_good],[Lmbda_all,U_all,all_std] = variance_based_nystrom(apply_H_list, self.H.dimension,\
176 | std_tol = self.parameters['nystrom_std_tolerance'],\
177 | max_vectors = self.parameters['max_vectors_nystrom'],\
178 | max_bad_vectors=self.parameters['max_bad_vectors_nystrom'],\
179 | verbose = self.parameters['verbose'])
180 | self._rank = U_all.shape[1]
181 | if self.parameters['verbose']:
182 | print('Nystrom method took ',time.time() - nystrom_t0, 's')
183 |
184 | else:
185 | H = lambda x: self.H(x,hessian_feed_dict,verbose = self.parameters['verbose'])
186 | n = self.problem.dimension
187 | self._rank = self.parameters['hessian_low_rank']
188 | Lmbda,U = randomized_eigensolver(H, n, self._rank,verbose=False)
189 |
190 | self.eigenvalues = Lmbda
191 | # Log the variance of the last eigenvector
192 |         if self.parameters['record_last_rq_std']:
193 | try:
194 | rq_direction = U[:,-1]
195 | if rq_estimator_dict is None:
196 | rq_estimator_dict_list = self.problem._partition_dictionaries(feed_dict,self.parameters['rq_samples_for_naarf'])
197 | elif type(rq_estimator_dict) == list:
198 | rq_estimator_dict_list = rq_estimator_dict
199 | elif type(rq_estimator_dict) == dict:
200 | rq_estimator_dict_list = self.problem._partition_dictionaries(rq_estimator_dict,self.parameters['rq_samples_for_naarf'])
201 | else:
202 |                     raise TypeError('rq_estimator_dict must be None, a list or a dict')
203 |
204 | try:
205 | RQ_samples = np.zeros((len(rq_estimator_dict_list),rq_direction.shape[1]))
206 | except:
207 | RQ_samples = np.zeros(len(rq_estimator_dict_list))
208 |
209 | for samp_i,sample_dictionary in enumerate(rq_estimator_dict_list):
210 | RQ_samples[samp_i] = self.H.quadratics(rq_direction,sample_dictionary)
211 | self._rq_std = np.std(RQ_samples)
212 | except:
213 | self._rq_std = None
214 | print(80*'#')
215 | print('U is [], taking gradient step, fix this later?'.center(80))
216 |
217 | # Saddle free inversion via Woodbury
218 | if self.regularization.parameters['gamma'] < 1e-4:
219 | gamma_damping = self.parameters['default_damping']
220 | # Using this condition instead of fixed gamma allows one to take larger step sizes
221 | # but does not appear to improve accuracy
222 | # gamma_damping = max(0.9*np.abs(Lmbda[-1]),self.parameters['default_damping'])
223 | else:
224 | gamma_damping = self.regularization.parameters['gamma']
225 | # print('Lmbda[0] = ',Lmbda[0])
226 | # print('Lmbda[-1] = ',Lmbda[-1])
227 | # print('gamma_damping = ',gamma_damping)
228 |
229 | Lmbda_abs = np.abs(Lmbda)
230 | Lmbda_diags = diags(Lmbda_abs)
231 | # Build terms for Woodbury inversion
232 | D_denominator = Lmbda_abs + gamma_damping*np.ones_like(Lmbda_abs)
233 | D = np.divide(Lmbda_abs,D_denominator)
234 | # Invert by applying terms in Woodbury formula:
235 | UTg = np.dot(U.T,gradient)
236 | DUTg = np.multiply(D,UTg)
237 | UDUTg = np.dot(U,DUTg)
238 | minus_p = (gradient - UDUTg)/gamma_damping
239 | self.p = -minus_p
240 |
241 |
242 | # Globalization: compute alpha and update the weights
243 | if self.parameters['globalization'] is None:
244 | self.alpha = self.parameters['alpha']
245 | self._sweeps += [1,2*self._rank]
246 | update = self.alpha*self.p
247 | self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:update})
248 |
249 |         elif self.parameters['globalization'] == 'spectral_step':
250 |             # Step length limited by the inverse of the largest curvature estimate
251 | self.alpha = min(self.parameters['spectral_step_alpha'],0.1/Lmbda_abs[0])
252 | self._sweeps += [1,2*self._rank]
253 | update = self.alpha*self.p
254 | self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:update})
255 |
256 | elif self.parameters['globalization'] == 'line_search':
257 | w_dir_inner_g = np.inner(self.p,gradient)
258 | initial_cost = self.sess.run(self.problem.loss,feed_dict = feed_dict)
259 | cost_at_candidate = lambda p : self._loss_at_candidate(p,feed_dict = feed_dict)
260 | self.alpha, line_search, line_search_iter = ArmijoLineSearch(self.p,w_dir_inner_g,\
261 | cost_at_candidate, initial_cost,
262 | max_backtracking_iter = self.parameters['max_backtracking_iter'])
263 | update = self.alpha*self.p
264 | self._sweeps += [1+0.5*line_search_iter,2*self._rank]
265 | self.sess.run(self.problem._update_ops,feed_dict = {self.problem._update_placeholder:update})
266 |
267 |
268 |
269 |
270 |
271 |
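# ------------------------------------------------------------------
# Editor's sketch (standalone, not part of this module): a small NumPy
# check of the Woodbury identity used in `minimize` above. For orthonormal
# U_r and D = diag(|lambda_i| / (|lambda_i| + gamma)):
#   [U_r |Lambda_r| U_r^T + gamma I]^{-1} = (1/gamma) (I - U_r D U_r^T)
#
#     import numpy as np
#     n, r, gamma = 50, 5, 1e-3
#     U, _ = np.linalg.qr(np.random.randn(n, r))
#     lmbda = np.random.randn(r)
#     A = U @ np.diag(np.abs(lmbda)) @ U.T + gamma * np.eye(n)
#     D = np.abs(lmbda) / (np.abs(lmbda) + gamma)
#     A_inv = (np.eye(n) - U @ np.diag(D) @ U.T) / gamma
#     assert np.allclose(A @ A_inv, np.eye(n))   # identity holds
# ------------------------------------------------------------------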
--------------------------------------------------------------------------------
/applications/transfer_learning/imagenet_cifar10_classification.py:
--------------------------------------------------------------------------------
1 | # This file is part of the hessianlearn package
2 | #
3 | # hessianlearn is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or any later version.
6 | #
7 | # hessianlearn is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | # GNU Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public License
13 | # If not, see <http://www.gnu.org/licenses/>.
14 | #
15 | # Author: Tom O'Leary-Roseberry
16 | # Contact: tom.olearyroseberry@utexas.edu
17 |
18 |
19 | import numpy as np
20 | import os
21 | import pickle
22 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
23 | os.environ['KMP_DUPLICATE_LIB_OK']='True'
24 | os.environ["KMP_WARNINGS"] = "FALSE"
25 | os.environ['CUDA_VISIBLE_DEVICES'] = '1'
26 | 
27 | import tensorflow as tf
28 | import time, datetime
29 | # if int(tf.__version__[0]) > 1:
30 | # import tensorflow.compat.v1 as tf
31 | # tf.disable_v2_behavior()
32 |
33 |
34 | # Memory issue with GPUs
35 | gpu_devices = tf.config.experimental.list_physical_devices('GPU')
36 | for device in gpu_devices:
37 | tf.config.experimental.set_memory_growth(device, True)
38 | # Load hessianlearn library
39 | import sys
40 | sys.path.append( os.environ.get('HESSIANLEARN_PATH', "../../"))
41 | from hessianlearn import *
42 |
43 | # Parse run specifications
44 | from argparse import ArgumentParser
45 |
46 | parser = ArgumentParser(add_help=True)
47 | parser.add_argument("-optimizer", dest='optimizer',required=False, default = 'lrsfn', help="optimizer type",type=str)
48 | parser.add_argument('-fixed_step',dest = 'fixed_step',\
49 | required= False,default = 1,help='boolean for fixed step vs globalization',type = int)
50 | parser.add_argument('-alpha',dest = 'alpha',required = False,default = 1e-5,help= 'learning rate alpha',type=float)
51 | parser.add_argument('-hessian_low_rank',dest = 'hessian_low_rank',required= False,default = 40,help='low rank for sfn',type = int)
52 | parser.add_argument('-record_spectrum',dest = 'record_spectrum',\
53 | required= False,default = 0,help='boolean for recording spectrum',type = int)
54 |
55 | parser.add_argument("-resnet_weights", dest='resnet_weights',required=False, default = 'imagenet', help="initialization for network weights",type=str)
56 |
57 | parser.add_argument('-batch_size',dest = 'batch_size',required= False,default = 32,help='batch size',type = int)
58 | parser.add_argument('-hess_batch_size',dest = 'hess_batch_size',required= False,default = 8,help='hess batch size',type = int)
59 | parser.add_argument('-keras_epochs',dest = 'keras_epochs',required= False,default = 50,help='keras_epochs',type = int)
60 | parser.add_argument("-keras_opt", dest='keras_opt',required=False, default = 'adam', help="optimizer type for keras",type=str)
61 | parser.add_argument('-keras_alpha',dest = 'keras_alpha',required= False,default = 1e-3,help='keras learning rate',type = float)
62 | parser.add_argument('-max_sweeps',dest = 'max_sweeps',required= False,default = 2,help='max sweeps',type = float)
63 |
64 | parser.add_argument("-loss_type", dest='loss_type',required=False, default = 'mixed', help="loss type: either cross_entropy or mixed",type=str)
65 | parser.add_argument('-seed',dest = 'seed',required= False,default = 0,help='seed',type = int)
66 |
67 |
68 | args = parser.parse_args()
69 |
70 | try:
71 | tf.set_random_seed(args.seed)
72 | except AttributeError:
73 | tf.random.set_seed(args.seed)
74 |
75 | # GPU Environment Details
76 | gpu_available = tf.test.is_gpu_available()
77 | built_with_cuda = tf.test.is_built_with_cuda()
78 | print(80*'#')
79 | print(('IS GPU AVAILABLE: '+str(gpu_available)).center(80))
80 | print(('IS BUILT WITH CUDA: '+str(built_with_cuda)).center(80))
81 | print(80*'#')
82 |
83 | settings = {}
84 | # Set run specifications
85 | # Data specs
86 | settings['batch_size'] = args.batch_size
87 | settings['hess_batch_size'] = args.hess_batch_size
88 |
89 |
90 | ################################################################################
91 | # Instantiate data
92 | (x_train, y_train), (_x_test, _y_test) = tf.keras.datasets.cifar10.load_data()
93 |
94 | # # Normalize the data
95 | # x_train = x_train.astype('float32') / 255.
96 | # x_test = x_test.astype('float32') / 255.
97 |
98 | x_train = tf.keras.applications.resnet50.preprocess_input(x_train)
99 | x_test_full = tf.keras.applications.resnet50.preprocess_input(_x_test)
100 | x_val = x_test_full[:2000]
101 | x_test = x_test_full[2000:]
102 |
103 | y_train = tf.keras.utils.to_categorical(y_train)
104 | y_test_full = tf.keras.utils.to_categorical(_y_test)
105 | y_val = y_test_full[:2000]
106 | y_test = y_test_full[2000:]
107 |
108 | ################################################################################
109 | # Create the neural network in keras
110 |
111 | # tf.keras.backend.set_floatx('float64')
112 |
113 | resnet_input_shape = (200,200,3)
114 | input_tensor = tf.keras.Input(shape = resnet_input_shape)
115 |
116 | if args.resnet_weights == 'None':
117 | pretrained_resnet50 = tf.keras.applications.resnet50.ResNet50(weights = None,include_top=False,input_tensor=input_tensor)
118 | else:
119 | pretrained_resnet50 = tf.keras.applications.resnet50.ResNet50(weights = 'imagenet',include_top=False,input_tensor=input_tensor)
120 |
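# Freeze the earlier ResNet50 layers (roughly everything before the final
# convolutional stage); only the remaining layers and the new head train.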
121 | for layer in pretrained_resnet50.layers[:143]:
122 | layer.trainable = False
123 |
124 | classifier = tf.keras.models.Sequential()
125 | classifier.add(tf.keras.layers.Input(shape=(32,32,3)))
126 | classifier.add(tf.keras.layers.Lambda(lambda image: tf.image.resize(image, resnet_input_shape[:2])))
127 | classifier.add(pretrained_resnet50)
128 | classifier.add(tf.keras.layers.Flatten())
129 | classifier.add(tf.keras.layers.BatchNormalization())
130 | classifier.add(tf.keras.layers.Dense(64, activation='relu'))
131 | classifier.add(tf.keras.layers.Dropout(0.5))
132 | classifier.add(tf.keras.layers.BatchNormalization())
133 | classifier.add(tf.keras.layers.Dense(10, activation='softmax'))
134 |
135 |
136 | if args.keras_opt == 'adam':
137 | optimizer = tf.keras.optimizers.Adam(learning_rate = args.keras_alpha,epsilon = 1e-8)
138 | elif args.keras_opt == 'sgd':
139 | optimizer = tf.keras.optimizers.SGD(learning_rate=args.keras_alpha)
140 | else:
141 | 	raise ValueError('keras_opt must be adam or sgd')
142 |
143 | if args.loss_type == 'mixed':
144 | def mixed(y_true, y_pred):
145 | squared_difference = tf.square(y_true - y_pred)
146 | return tf.reduce_mean(squared_difference, axis=-1) +tf.keras.losses.CategoricalCrossentropy(from_logits = True)(y_true, y_pred)
147 | loss = mixed
148 | else:
149 | loss = tf.keras.losses.CategoricalCrossentropy(from_logits = True)
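# Note: the classifier's final Dense layer applies softmax, while the
# cross entropy term is constructed with from_logits=True, so the
# network's output probabilities are treated as logits inside the loss.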
150 |
151 |
152 | classifier.compile(optimizer=optimizer,
153 | loss=loss,
154 | metrics=['accuracy'])
155 |
156 |
157 | loss_train_0, acc_train_0 = classifier.evaluate(x_train,y_train,verbose=2)
158 | print('acc_train = ',acc_train_0)
159 | loss_test_0, acc_test_0 = classifier.evaluate(x_test,y_test,verbose=2)
160 | print('acc_test = ',acc_test_0)
161 | loss_val_0, acc_val_0 = classifier.evaluate(x_val,y_val,verbose=2)
162 | print('acc_val = ',acc_val_0)
163 |
164 | aux_keras_data = {'loss_train_0':loss_train_0,'acc_train_0':acc_train_0,\
165 | 'loss_test_0':loss_test_0,'acc_test_0':acc_test_0,\
166 | 'loss_val_0':loss_val_0, 'acc_val_0':acc_val_0}
167 |
168 | no_callback = True
169 | if no_callback:
170 | callbacks = []
171 | else:
172 | callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_acc',restore_best_weights = True)]
173 |
174 | keras_directory = 'keras_logging_cifar10/'
175 | # CSV logging
176 | if not os.path.exists(keras_directory):
177 | os.makedirs(keras_directory)
178 | keras_logger_name = keras_directory+str(datetime.date.today())+args.keras_opt+str(args.keras_alpha)+'_'+str(args.keras_epochs)+'_seed'+str(args.seed)+'.csv'
179 | callbacks.append(tf.keras.callbacks.CSVLogger(keras_logger_name, append=True, separator=';'))
180 |
181 | classifier.fit(x_train[:], y_train[:], epochs=args.keras_epochs,batch_size = 32,\
182 | callbacks = callbacks ,verbose = True,validation_data = (x_val,y_val))
183 |
184 |
185 | # Grab the weights and check the accuracy post process
186 | set_weights = {}
187 |
188 | for layer in classifier.layers:
189 | set_weights[layer.name] = classifier.get_layer(layer.name).get_weights()
190 |
191 | # Post process and save additional information from keras training
192 | loss_test_keras_final, acc_test_keras_final = classifier.evaluate(x_test,y_test,verbose=2)
193 | loss_val_keras_final, acc_val_keras_final = classifier.evaluate(x_val,y_val,verbose=2)
194 | print(80*'#')
195 | print('After keras training'.center(80))
196 | print('acc_test = ',acc_test_keras_final)
197 | print('acc_val = ',acc_val_keras_final)
198 | aux_keras_data['loss_test_final'] = loss_test_keras_final
199 | aux_keras_data['acc_test_final'] = acc_test_keras_final
200 | aux_keras_data['loss_val_final'] = loss_val_keras_final
201 | aux_keras_data['acc_val_final'] = acc_val_keras_final
202 | keras_aux_logger_name = keras_logger_name.split('.csv')[0]+'aux_data.pkl'
203 | with open(keras_aux_logger_name,'wb+') as f:
204 | pickle.dump(aux_keras_data,f,pickle.HIGHEST_PROTOCOL)
205 |
206 |
207 | ################################################################################
208 | # Instantiate the data, problem, regularization.
209 |
210 | t0_problem_construction = time.time()
211 | problem = ClassificationProblem(classifier,loss_type=args.loss_type,dtype=tf.float32)
212 | print('Finished constructing the problem, and it took ',time.time() - t0_problem_construction , 's')
213 |
214 |
215 | # Instantiate the data object
216 | data = Data({problem.x:x_train,problem.y_true:y_train},settings['batch_size'],\
217 | validation_data = {problem.x:x_val,problem.y_true:y_val},hessian_batch_size = settings['hess_batch_size'],seed=args.seed)
218 |
219 | settings['tikhonov_gamma'] = 0.0
220 |
221 | regularization = L2Regularization(problem,gamma = settings['tikhonov_gamma'])
222 |
223 |
224 | ################################################################################
225 | # Instantiate the model object
226 | HLModelSettings = HessianlearnModelSettings()
227 |
228 | HLModelSettings['optimizer'] = args.optimizer
229 | HLModelSettings['alpha'] = args.alpha
230 | HLModelSettings['globalization'] = None
231 | HLModelSettings['hessian_low_rank'] = args.hessian_low_rank
232 | HLModelSettings['max_backtrack'] = 20
233 | HLModelSettings['max_sweeps'] = args.max_sweeps
234 | HLModelSettings['layer_weights'] = set_weights
235 |
236 | HLModelSettings['problem_name'] = 'cifar10_resnet_classification_seed'+str(args.seed)
237 | if args.resnet_weights == 'None':
238 | HLModelSettings['problem_name'] += '_random_guess'
239 | HLModelSettings['record_spectrum'] = bool(args.record_spectrum)
240 | HLModelSettings['rq_data_size'] = 100
241 | HLModelSettings['printing_sweep_frequency'] = None
242 | HLModelSettings['printing_items'] = {'time':'time','sweeps':'sweeps','Loss':'train_loss','acc train':'train_acc',\
243 | '||g||':'||g||','Loss val':'val_loss','acc val':'val_acc',\
244 | 'maxacc val':'max_val_acc','alpha':'alpha'}
245 |
246 |
247 | HLModel = HessianlearnModel(problem,regularization,data,settings = HLModelSettings)
248 |
249 | if args.max_sweeps > 0:
250 | HLModel.fit()
251 |
252 |
253 | loss_test_final, acc_test_final = classifier.evaluate(x_test,y_test,verbose=2)
254 | loss_val_final, acc_val_final = classifier.evaluate(x_val,y_val,verbose=2)
255 |
256 | hl_aux_data = {'loss_test_0':loss_test_0,'acc_test_0':acc_test_0,\
257 | 'loss_val_0':loss_val_0,'acc_val_0':acc_val_0,\
258 | 'loss_test_final':loss_test_final,'acc_test_final':acc_test_final,\
259 | 'loss_val_final':loss_val_final,'acc_val_final':acc_val_final}
260 |
261 | with open(HLModel.settings['problem_name']+'_logging/'+ HLModel.logger_outname +'aux_data.pkl', 'wb+') as f:
262 | pickle.dump(hl_aux_data, f, pickle.HIGHEST_PROTOCOL)
263 |
264 | ################################################################################
265 | # Evaluate again on all the data.
266 | (x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
267 |
268 | # # Normalize the data
269 | # x_train = x_train.astype('float32') / 255.
270 | # x_test = x_test.astype('float32') / 255.
271 |
272 | x_train = tf.keras.applications.resnet50.preprocess_input(x_train)
273 | x_test = tf.keras.applications.resnet50.preprocess_input(x_test)
274 |
275 | y_train = tf.keras.utils.to_categorical(y_train)
276 | y_test = tf.keras.utils.to_categorical(y_test)
277 |
278 | loss_test_total, acc_test_total = classifier.evaluate(x_test,y_test,verbose=2)
279 | print(80*'#')
280 | print('After hessianlearn training'.center(80))
281 | print('acc_test_total = ',acc_test_total)
282 |
283 |
284 |
--------------------------------------------------------------------------------
/applications/transfer_learning/imagenet_cifar100_classification.py:
--------------------------------------------------------------------------------
1 | # This file is part of the hessianlearn package
2 | #
3 | # hessianlearn is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or any later version.
6 | #
7 | # hessianlearn is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | # GNU Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public License
13 | # If not, see <http://www.gnu.org/licenses/>.
14 | #
15 | # Author: Tom O'Leary-Roseberry
16 | # Contact: tom.olearyroseberry@utexas.edu
17 |
18 |
19 | import numpy as np
20 | import os
21 | import pickle
22 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
23 | os.environ['KMP_DUPLICATE_LIB_OK']='True'
24 | os.environ["KMP_WARNINGS"] = "FALSE"
25 | os.environ['CUDA_VISIBLE_DEVICES'] = '1'
26 | 
27 | import tensorflow as tf
28 | import time, datetime
29 | # if int(tf.__version__[0]) > 1:
30 | # import tensorflow.compat.v1 as tf
31 | # tf.disable_v2_behavior()
32 |
33 |
34 | # Memory issue with GPUs
35 | gpu_devices = tf.config.experimental.list_physical_devices('GPU')
36 | for device in gpu_devices:
37 | tf.config.experimental.set_memory_growth(device, True)
38 | # Load hessianlearn library
39 | import sys
40 | sys.path.append( os.environ.get('HESSIANLEARN_PATH', "../../"))
41 | from hessianlearn import *
42 |
43 | # Parse run specifications
44 | from argparse import ArgumentParser
45 |
46 | parser = ArgumentParser(add_help=True)
47 | parser.add_argument("-optimizer", dest='optimizer',required=False, default = 'lrsfn', help="optimizer type",type=str)
48 | parser.add_argument('-fixed_step',dest = 'fixed_step',\
49 | required= False,default = 1,help='boolean for fixed step vs globalization',type = int)
50 | parser.add_argument('-alpha',dest = 'alpha',required = False,default = 1e-5,help= 'learning rate alpha',type=float)
51 | parser.add_argument('-hessian_low_rank',dest = 'hessian_low_rank',required= False,default = 40,help='low rank for sfn',type = int)
52 | parser.add_argument('-record_spectrum',dest = 'record_spectrum',\
53 | required= False,default = 0,help='boolean for recording spectrum',type = int)
54 |
55 | parser.add_argument("-resnet_weights", dest='resnet_weights',required=False, default = 'imagenet', help="initialization for network weights",type=str)
56 |
57 | parser.add_argument('-batch_size',dest = 'batch_size',required= False,default = 32,help='batch size',type = int)
58 | parser.add_argument('-hess_batch_size',dest = 'hess_batch_size',required= False,default = 8,help='hess batch size',type = int)
59 | parser.add_argument('-keras_epochs',dest = 'keras_epochs',required= False,default = 50,help='keras_epochs',type = int)
60 | parser.add_argument("-keras_opt", dest='keras_opt',required=False, default = 'adam', help="optimizer type for keras",type=str)
61 | parser.add_argument('-keras_alpha',dest = 'keras_alpha',required= False,default = 1e-3,help='keras learning rate',type = float)
62 | parser.add_argument('-max_sweeps',dest = 'max_sweeps',required= False,default = 2,help='max sweeps',type = float)
63 |
64 | parser.add_argument("-loss_type", dest='loss_type',required=False, default = 'mixed', help="loss type: either cross_entropy or mixed",type=str)
65 | parser.add_argument('-seed',dest = 'seed',required= False,default = 0,help='seed',type = int)
66 |
67 |
68 | args = parser.parse_args()
69 |
70 | try:
71 | tf.set_random_seed(args.seed)
72 | except AttributeError:
73 | tf.random.set_seed(args.seed)
74 |
75 | # GPU Environment Details
76 | gpu_available = tf.test.is_gpu_available()
77 | built_with_cuda = tf.test.is_built_with_cuda()
78 | print(80*'#')
79 | print(('IS GPU AVAILABLE: '+str(gpu_available)).center(80))
80 | print(('IS BUILT WITH CUDA: '+str(built_with_cuda)).center(80))
81 | print(80*'#')
82 |
83 | settings = {}
84 | # Set run specifications
85 | # Data specs
86 | settings['batch_size'] = args.batch_size
87 | settings['hess_batch_size'] = args.hess_batch_size
88 |
89 |
90 | ################################################################################
91 | # Instantiate data
92 | (x_train, y_train), (_x_test, _y_test) = tf.keras.datasets.cifar100.load_data()
93 |
94 | # # Normalize the data
95 | # x_train = x_train.astype('float32') / 255.
96 | # x_test = x_test.astype('float32') / 255.
97 |
98 | x_train = tf.keras.applications.resnet50.preprocess_input(x_train)
99 | x_test_full = tf.keras.applications.resnet50.preprocess_input(_x_test)
100 | x_val = x_test_full[:2000]
101 | x_test = x_test_full[2000:]
102 |
103 | y_train = tf.keras.utils.to_categorical(y_train)
104 | y_test_full = tf.keras.utils.to_categorical(_y_test)
105 | y_val = y_test_full[:2000]
106 | y_test = y_test_full[2000:]
107 |
108 | ################################################################################
109 | # Create the neural network in keras
110 |
111 | # tf.keras.backend.set_floatx('float64')
112 |
113 | resnet_input_shape = (200,200,3)
114 | input_tensor = tf.keras.Input(shape = resnet_input_shape)
115 |
116 | if args.resnet_weights == 'None':
117 | pretrained_resnet50 = tf.keras.applications.resnet50.ResNet50(weights = None,include_top=False,input_tensor=input_tensor)
118 | else:
119 | pretrained_resnet50 = tf.keras.applications.resnet50.ResNet50(weights = 'imagenet',include_top=False,input_tensor=input_tensor)
120 |
121 | for layer in pretrained_resnet50.layers[:143]:
122 | layer.trainable = False
123 |
124 | classifier = tf.keras.models.Sequential()
125 | classifier.add(tf.keras.layers.Input(shape=(32,32,3)))
126 | classifier.add(tf.keras.layers.Lambda(lambda image: tf.image.resize(image, resnet_input_shape[:2])))
127 | classifier.add(pretrained_resnet50)
128 | classifier.add(tf.keras.layers.Flatten())
129 | classifier.add(tf.keras.layers.BatchNormalization())
130 | classifier.add(tf.keras.layers.Dense(128, activation='relu'))
131 | classifier.add(tf.keras.layers.Dropout(0.5))
132 | classifier.add(tf.keras.layers.BatchNormalization())
133 | classifier.add(tf.keras.layers.Dense(100, activation='softmax'))
134 |
135 |
136 | if args.keras_opt == 'adam':
137 | optimizer = tf.keras.optimizers.Adam(learning_rate = args.keras_alpha,epsilon = 1e-8)
138 | elif args.keras_opt == 'sgd':
139 | optimizer = tf.keras.optimizers.SGD(learning_rate=args.keras_alpha)
140 | else:
141 | 	raise ValueError('Unsupported keras optimizer: '+args.keras_opt)
142 |
143 | if args.loss_type == 'mixed':
144 | 	def mixed(y_true, y_pred): # mean squared error plus categorical cross entropy
145 | 		squared_difference = tf.square(y_true - y_pred)
146 | 		return tf.reduce_mean(squared_difference, axis=-1) + tf.keras.losses.CategoricalCrossentropy(from_logits = False)(y_true, y_pred)
147 | 	loss = mixed
148 | else:
149 | 	loss = tf.keras.losses.CategoricalCrossentropy(from_logits = False) # the final layer applies softmax, so predictions are probabilities rather than logits
150 |
151 |
152 | classifier.compile(optimizer=optimizer,
153 | loss=loss,
154 | metrics=['accuracy'])
155 |
156 |
157 | loss_train_0, acc_train_0 = classifier.evaluate(x_train,y_train,verbose=2)
158 | print('acc_train = ',acc_train_0)
159 | loss_test_0, acc_test_0 = classifier.evaluate(x_test,y_test,verbose=2)
160 | print('acc_test = ',acc_test_0)
161 | loss_val_0, acc_val_0 = classifier.evaluate(x_val,y_val,verbose=2)
162 | print('acc_val = ',acc_val_0)
163 |
164 | aux_keras_data = {'loss_train_0':loss_train_0,'acc_train_0':acc_train_0,\
165 | 'loss_test_0':loss_test_0,'acc_test_0':acc_test_0,\
166 | 'loss_val_0':loss_val_0, 'acc_val_0':acc_val_0}
167 |
168 | no_callback = True
169 | if no_callback:
170 | callbacks = []
171 | else:
172 | callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_acc',restore_best_weights = True)]
173 |
174 | keras_directory = 'keras_logging_cifar100/'
175 | # CSV logging
176 | if not os.path.exists(keras_directory):
177 | os.makedirs(keras_directory)
178 | keras_logger_name = keras_directory+str(datetime.date.today())+args.keras_opt+str(args.keras_alpha)+'_'+str(args.keras_epochs)+'_seed'+str(args.seed)+'.csv'
179 | callbacks.append(tf.keras.callbacks.CSVLogger(keras_logger_name, append=True, separator=';'))
180 |
181 | classifier.fit(x_train[:], y_train[:], epochs=args.keras_epochs,batch_size = 32,\
182 | callbacks = callbacks ,verbose = True,validation_data = (x_val,y_val))
183 |
184 |
185 | # Grab the weights and check the accuracy post process
186 | set_weights = {}
187 |
188 | for layer in classifier.layers:
189 | set_weights[layer.name] = classifier.get_layer(layer.name).get_weights()
190 |
191 | # Post process and save additional information from keras training
192 | loss_test_keras_final, acc_test_keras_final = classifier.evaluate(x_test,y_test,verbose=2)
193 | loss_val_keras_final, acc_val_keras_final = classifier.evaluate(x_val,y_val,verbose=2)
194 | print(80*'#')
195 | print('After keras training'.center(80))
196 | print('acc_test = ',acc_test_keras_final)
197 | print('acc_val = ',acc_val_keras_final)
198 | aux_keras_data['loss_test_final'] = loss_test_keras_final
199 | aux_keras_data['acc_test_final'] = acc_test_keras_final
200 | aux_keras_data['loss_val_final'] = loss_val_keras_final
201 | aux_keras_data['acc_val_final'] = acc_val_keras_final
202 | keras_aux_logger_name = keras_logger_name.split('.csv')[0]+'aux_data.pkl'
203 | with open(keras_aux_logger_name,'wb+') as f:
204 | pickle.dump(aux_keras_data,f,pickle.HIGHEST_PROTOCOL)
205 |
206 |
207 | ################################################################################
208 | # Instantiate the data, problem, regularization.
209 |
210 | t0_problem_construction = time.time()
211 | problem = ClassificationProblem(classifier,loss_type=args.loss_type,dtype=tf.float32)
212 | print('Finished constructing the problem, and it took ',time.time() - t0_problem_construction , 's')
213 |
214 |
215 | # Instantiate the data object
216 | data = Data({problem.x:x_train,problem.y_true:y_train},settings['batch_size'],\
217 | validation_data = {problem.x:x_val,problem.y_true:y_val},hessian_batch_size = settings['hess_batch_size'],seed=args.seed)
218 |
219 | settings['tikhonov_gamma'] = 0.0
220 |
221 | regularization = L2Regularization(problem,gamma = settings['tikhonov_gamma'])
222 |
223 |
224 | ################################################################################
225 | # Instantiate the model object
226 | HLModelSettings = HessianlearnModelSettings()
227 |
228 | HLModelSettings['optimizer'] = args.optimizer
229 | HLModelSettings['alpha'] = args.alpha
230 | HLModelSettings['globalization'] = None
231 | HLModelSettings['hessian_low_rank'] = args.hessian_low_rank
232 | HLModelSettings['max_backtrack'] = 20
233 | HLModelSettings['max_sweeps'] = args.max_sweeps
234 | HLModelSettings['layer_weights'] = set_weights
235 |
236 | HLModelSettings['problem_name'] = 'cifar100_resnet_classification_seed'+str(args.seed)
237 | if args.resnet_weights == 'None':
238 | HLModelSettings['problem_name'] += '_random_guess'
239 | HLModelSettings['record_spectrum'] = bool(args.record_spectrum)
240 | HLModelSettings['rq_data_size'] = 100
241 | HLModelSettings['printing_sweep_frequency'] = None
242 | HLModelSettings['printing_items'] = {'time':'time','sweeps':'sweeps','Loss':'train_loss','acc train':'train_acc',\
243 | '||g||':'||g||','Loss val':'val_loss','acc val':'val_acc',\
244 | 'maxacc val':'max_val_acc','alpha':'alpha'}
245 |
246 |
247 | HLModel = HessianlearnModel(problem,regularization,data,settings = HLModelSettings)
248 |
249 | if args.max_sweeps > 0:
250 | HLModel.fit()
251 |
252 |
253 | loss_test_final, acc_test_final = classifier.evaluate(x_test,y_test,verbose=2)
254 | loss_val_final, acc_val_final = classifier.evaluate(x_val,y_val,verbose=2)
255 |
256 | hl_aux_data = {'loss_test_0':loss_test_0,'acc_test_0':acc_test_0,\
257 | 'loss_val_0':loss_val_0,'acc_val_0':acc_val_0,\
258 | 'loss_test_final':loss_test_final,'acc_test_final':acc_test_final,\
259 | 'loss_val_final':loss_val_final,'acc_val_final':acc_val_final}
260 |
261 | with open(HLModel.settings['problem_name']+'_logging/'+ HLModel.logger_outname +'aux_data.pkl', 'wb+') as f:
262 | pickle.dump(hl_aux_data, f, pickle.HIGHEST_PROTOCOL)
263 |
264 | ################################################################################
265 | # Evaluate again on all the data.
266 | (x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar100.load_data()
267 |
268 | # # Normalize the data
269 | # x_train = x_train.astype('float32') / 255.
270 | # x_test = x_test.astype('float32') / 255.
271 |
272 | x_train = tf.keras.applications.resnet50.preprocess_input(x_train)
273 | x_test = tf.keras.applications.resnet50.preprocess_input(x_test)
274 |
275 | y_train = tf.keras.utils.to_categorical(y_train)
276 | y_test = tf.keras.utils.to_categorical(y_test)
277 |
278 | loss_test_total, acc_test_total = classifier.evaluate(x_test,y_test,verbose=2)
279 | print(80*'#')
280 | print('After hessianlearn training'.center(80))
281 | print('acc_test_total = ',acc_test_total)
282 |
283 |
284 |
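285 | # Example invocations (editor's sketch; the script name below is hypothetical,
286 | # but the flags are the argparse options defined at the top of this file):
287 | #
288 | #   python cifar100_transfer_learning.py -optimizer lrsfn -alpha 1e-4 -hessian_low_rank 40 -max_sweeps 2
289 | #   python cifar100_transfer_learning.py -optimizer adam -alpha 1e-3 -max_sweeps 2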
--------------------------------------------------------------------------------
/hessianlearn/algorithms/cgSolver.py:
--------------------------------------------------------------------------------
1 | # This file is part of the hessianlearn package
2 | #
3 | # hessianlearn is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or any later version.
6 | #
7 | # hessianlearn is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 | # GNU Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public License
13 | # along with hessianlearn. If not, see <http://www.gnu.org/licenses/>.
14 | #
15 | # Author: Tom O'Leary-Roseberry
16 | # Contact: tom.olearyroseberry@utexas.edu
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 | import math
22 | import numpy as np
23 | import tensorflow as tf
24 | if int(tf.__version__[0]) > 1:
25 | import tensorflow.compat.v1 as tf
26 | tf.disable_v2_behavior()
27 |
28 | from ..utilities.parameterList import ParameterList
29 | from ..algorithms import Optimizer
30 | from ..problem import IdentityPreconditioner
31 | from ..problem import L2Regularization
32 | from abc import ABC, abstractmethod
33 |
34 | class Identity(object):
35 | def __init__(self):
36 |
37 | pass
38 |
39 | def __call__(self, x):
40 | return x
41 |
42 |
43 |
44 | def ParametersCGSolver(dictionary = {}):
45 | parameters = dictionary
46 | parameters["rel_tolerance"] = [1e-9, "the relative tolerance for the stopping criterion"]
47 | parameters["abs_tolerance"] = [1e-12, "the absolute tolerance for the stopping criterion"]
48 | parameters["max_iter"] = [10, "the maximum number of iterations"]
49 | parameters["zero_initial_guess"] = [True, "if True we start with a 0 initial guess; if False we use the x as initial guess."]
50 | parameters["print_level"] = [-1, "verbosity level: -1 --> no output on screen; 0 --> only final residual at convergence or reason for not not convergence"]
51 |
52 | parameters['coarse_tol'] = [0.5,'coarse tolerance used in calculation of relative tolerances for E-W conditions']
53 |
54 | parameters['default_damping'] = [1e-3, "Levenberg-Marquardt damping when no regularization is used"]
55 | return ParameterList(parameters)
56 |
57 |
58 | class CGSolver(ABC):
59 | """
60 | This class implements a custom CG solver to be used with Inexact Newton CG
61 | """
62 | reason = ["Maximum Number of Iterations Reached",
63 | "Relative/Absolute residual less than tol",
64 | "Reached a negative direction",
65 | "Reached trust region boundary"
66 | ]
67 | def __init__(self,problem,regularization,sess = None,Aop = None,preconditioner = None,x = None,parameters = ParametersCGSolver()):
68 | """
69 | The constructor for this class takes:
70 | -problem: hessianlearn.problem.Problem
71 | -regularization: hessianlearn.problem.Regularization
72 | -sess: tf.Session()
73 | -Aop: matrix vector product callable
74 | 		-preconditioner: hessianlearn.problem.Preconditioner
75 | -parameters: solver hyperparameters
76 | """
77 | self.sess = sess
78 | self.problem = problem
79 | if regularization.parameters['gamma'] < 1e-4:
80 | regularization = L2Regularization(self.problem,gamma = parameters['default_damping'])
81 | self.regularization = regularization
82 | if x is None:
83 | # self.x = tf.Variable(self.problem.gradient.initialized_value())
84 | self.x = self.problem.gradient
85 | else:
86 | self.x = x
87 | self.parameters = parameters
88 | if Aop is None:
89 | self.Aop = self.problem.Hdw + self.regularization.Hdw
90 | else:
91 | # be careful to note what the operator requires be passed into feed_dict
92 | self.Aop = Aop
93 | # Define preconditioner
94 | if preconditioner is None:
95 | self.Minv = IdentityPreconditioner(problem,self.problem.dtype)
96 | else:
97 | self.Minv = preconditioner
98 |
99 | self.update_x = self.update_without_trust_region
100 | self.B_op = None
101 |
102 | def initialize_trust_region(self,coarse_tol = None):
103 | """
104 | This method initializes the trust region parameters
105 | -coarse_tol: coarse tolerance
106 | """
107 | self.update_x = self.update_with_trust_region
108 | if coarse_tol is not None:
109 | self.parameters['coarse_tol'] = coarse_tol
110 |
111 | def set_trust_region_radius(self,radius,operator = Identity()):
112 | """
113 | This method sets the trust region radius when trust region is used
114 | for globalization
115 | -radius: trust region radius
116 | -operator: for use in TR calculations
117 | """
118 | assert self.parameters['zero_initial_guess']
119 | self.trust_region_radius_squared = radius**2
120 | self.B_op = operator
121 |
122 | def update_without_trust_region(self,x,alpha,p):
123 | """
124 | This method updates the approximation of x^* and returns False when
125 | TR is not used
126 | -x: solution at given iteration
127 | -alpha: step length
128 | -p: search direction
129 | """
130 | x = x + alpha*p
131 | return False, x
132 |
133 | def update_with_trust_region(self,x,alpha,p):
134 | """
135 | 		This method returns a Boolean indicating whether the point was placed
136 | on the trust region boundary or not, as well as the updated x
137 | -x: solution at given iteration
138 | -alpha: step length
139 | -p: search direction
140 | """
141 | 		step = x + alpha*p
142 | 		assert self.B_op is not None
143 | 		step_length_squared = np.dot(step,self.B_op(step)) # squared B-norm of the proposed step
144 | 		if step_length_squared < self.trust_region_radius_squared:
145 | 			return False, step
146 | 		else:
147 | 			# Move the point to the boundary: tau solves a_tau*tau^2 + b_tau*tau + c_tau = 0, i.e. ||x + tau*alpha*p||_B^2 = radius^2
148 | 			Bp = self.B_op(p)
149 | 			xBp = np.dot(x,Bp)
150 | 			pBp = np.dot(p,Bp)
151 | 			Bx = self.B_op(x)
152 | 			xBx = np.dot(x,Bx)
153 | 			a_tau = alpha*alpha*pBp
154 | 			b_tau = 2*alpha*xBp
155 | 			c_tau = xBx - self.trust_region_radius_squared
156 | 			discriminant = b_tau*b_tau - 4*a_tau*c_tau
157 | 			if discriminant < 0:
158 | 				print('Issue with the discriminant in the trust region step')
159 | 				discriminant *= -1
160 | 			tau = 0.5*(-b_tau + math.sqrt(discriminant))/a_tau
161 | 			alpha_tau = alpha*tau
162 | 			return True, x + alpha_tau*p
163 |
164 | def solve(self,b,feed_dict = None,x_0 = None):
165 | r"""
166 | Solve Ax=b by the preconditioned conjugate gradients method
167 | as defined in Iterative Methods Ed. 2 by Yousef Saad p 263
168 | -b: the right hand side
169 | -feed_dict: the data dictionary used to evaluate stochastic
170 | operators
171 | -x_0: the initial guess for CG
172 | """
173 | assert self.sess is not None
174 | assert feed_dict is not None
175 |
176 | self.iter = 0
177 | self.converged = False
178 | self.reason_id = 0
179 | x = np.zeros_like(b)
180 |
181 | feed_dict[self.problem.dw] = x
182 | Ax_0 = self.sess.run(self.Aop,feed_dict = feed_dict)
183 | 		# Calculate initial residual r = b - Ax_0
184 | r = b - Ax_0
185 | # Apply preconditioner z = M^{-1}r
186 | feed_dict[self.Minv.x] = r
187 | 		# FIXME: the preconditioner application has not been verified; the default Minv is the identity
188 |
189 | z = self.sess.run(self.Minv(),feed_dict = feed_dict)
190 |
191 |
192 | # Calculate p (copy array)
193 | p = z.copy()
194 | # Calculate tolerance for Eisenstat Walker conditions
195 | rz_0 = np.dot(r,z)
196 | rtol2 = rz_0 * self.parameters["rel_tolerance"] * self.parameters["rel_tolerance"]
197 | atol2 = self.parameters["abs_tolerance"] * self.parameters["abs_tolerance"]
198 | tol = max(rtol2, atol2)
199 | # Check convergence and initialize for solve:
200 | converged = (rz_0 < tol)
201 | if converged:
202 | self.converged = True
203 | self.reason_id = 1
204 | self.final_norm = math.sqrt(rz_0)
205 | if(self.parameters["print_level"] >= 0):
206 | print( self.reason[self.reason_id])
207 | print( "Converged in ", self.iter, " iterations with final norm ", self.final_norm)
208 | return x, False
209 | # Check if the direction is negative before taking a step.
210 | feed_dict[self.problem.dw] = p
211 | Ap = self.sess.run(self.Aop,feed_dict = feed_dict)
212 | pAp = np.dot(p,Ap)
213 | negative_direction = (pAp <= 0.0)
214 | if negative_direction:
215 | self.converged = True
216 | self.reason_id = 2
217 | x += p
218 | r -= Ap
219 | feed_dict[self.Minv.x] = r
220 | z = self.sess.run(self.Minv(),feed_dict = feed_dict)
221 | rz = np.dot(r,z)
222 | self.final_norm = math.sqrt(rz)
223 | if(self.parameters["print_level"] >= 0):
224 | print( self.reason[self.reason_id])
225 | print( "Converged in ", self.iter, " iterations with final norm ", self.final_norm)
226 | return x, False
227 |
228 | # Loop until convergence
229 | self.iter = 1
230 | while True:
231 | # Calculate alpha
232 | alpha = rz_0/pAp
233 |
234 | # Update x
235 | on_boundary,x = self.update_x(x,alpha,p)
236 | # Update r
237 |
238 | r -= alpha*Ap
239 | # Apply preconditioner z = M^{-1}r
240 | feed_dict[self.Minv.x] = r
241 | z = self.sess.run(self.Minv(),feed_dict = feed_dict)
242 |
243 | # Calculate rz
244 | rz = np.dot(r,z)
245 | # print(self.iter,rz)
246 | # Check convergence
247 | converged = (rz < tol)
248 | if converged:
249 | self.converged = True
250 | self.reason_id = 1
251 | self.final_norm = math.sqrt(rz)
252 | if(self.parameters["print_level"] >= 0):
253 | print( self.reason[self.reason_id])
254 | print( "Converged in ", self.iter, " iterations with final norm ", self.final_norm)
255 | break
256 | self.iter += 1
257 | if self.iter > self.parameters["max_iter"]:
258 | self.converged = False
259 | self.reason_id = 0
260 | self.final_norm = math.sqrt(rz)
261 | if(self.parameters["print_level"] >= 0):
262 | print( self.reason[self.reason_id])
263 | print( "Not Converged. Final residual norm ", self.final_norm)
264 | break
265 | beta = rz / rz_0
266 | p = z + beta*p
267 | # Check if the direction is negative, and prepare for next iteration.
268 | feed_dict[self.problem.dw] = p
269 | Ap = self.sess.run(self.Aop,feed_dict = feed_dict)
270 | pAp = np.dot(p,Ap)
271 | negative_direction = (pAp <= 0.0)
272 |
273 | if negative_direction:
274 | self.converged = True
275 | self.reason_id = 2
276 | self.final_norm = math.sqrt(rz)
277 | if(self.parameters["print_level"] >= 0):
278 | print( self.reason[self.reason_id])
279 | print( "Converged in ", self.iter, " iterations with final norm ", self.final_norm)
280 | break
281 |
282 | rz_0 = rz
283 |
284 | return x, on_boundary
285 |
286 |
287 |
288 |
289 |
290 |
291 |
292 |
293 |
294 | class CGSolver_scipy(ABC):
295 | """
296 | This class implements a wrapper for the scipy CG solver
297 | """
298 | reason = ["Maximum Number of Iterations Reached",
299 | "Relative/Absolute residual less than tol",
300 | "Reached a negative direction",
301 | "Reached trust region boundary"
302 | ]
303 | def __init__(self,problem,regularization,sess = None,Aop = None,preconditioner = None,parameters = ParametersCGSolver()):
304 | """
305 | The constructor for this class takes
306 | -problem: hessianlearn.problem.Problem
307 | -regularization: hessianlearn.problem.Regularization
308 | 		-sess: tf.Session()
309 | -Aop: matrix vector product callable
310 | 		-preconditioner: hessianlearn.problem.Preconditioner (not currently used)
311 | -parameters: solver hyperparameters
312 | """
313 | self.sess = sess
314 | self.problem = problem
315 | self.regularization = regularization
316 | self.parameters = parameters
317 | if Aop is None:
318 | self.Aop = self.problem.Hdw + self.regularization.Hdw
319 | else:
320 | # be careful to note what the operator requires be passed into feed_dict
321 | self.Aop = Aop
322 | # # Define preconditioner
323 | # if preconditioner is None:
324 | # self.Minv = IdentityPreconditioner(problem,self.problem.dtype)
325 | # else:
326 | # self.Minv = preconditioner
327 |
328 |
329 |
330 |
331 |
332 |
333 | def solve(self,b,feed_dict = None,x_0 = None):
334 | r"""
335 | 		Solve Ax=b by the conjugate gradient method,
336 | 		wrapping scipy.sparse.linalg.cg
337 | 		-b: right hand side
338 | 		-feed_dict: data dictionary used to evaluate the stochastic operator
339 | 		-x_0: initial guess (currently ignored; the solve starts from zero)
340 | """
341 | assert self.sess is not None
342 | assert feed_dict is not None
343 |
344 | self.iter = 0
345 | self.converged = False
346 | self.reason_id = 0
347 | x = np.zeros_like(b)
348 |
349 | feed_dict[self.problem.dw] = x
350 | Ax_0 = self.sess.run(self.Aop,feed_dict = feed_dict)
351 | 		# Calculate initial residual r = b - Ax_0
352 | 		r = b - Ax_0
353 | 		# Relative/absolute tolerances: scipy's cg terminates when
354 | 		# ||r|| <= max(tol*||b||, atol), which with the zero initial guess
355 | 		# amounts to a relative/absolute residual stopping criterion
356 | 		rel_tol = self.parameters["rel_tolerance"]
357 | 		abs_tol = self.parameters["abs_tolerance"]
358 | import scipy
359 | from scipy.sparse.linalg import LinearOperator
360 |
361 | def Ap(p):
362 | feed_dict[self.problem.dw] = p
363 | return self.sess.run(self.Aop,feed_dict = feed_dict)
364 |
365 | n = self.problem.dimension
366 |
367 | A = LinearOperator((n,n), matvec=Ap)
368 |
369 | # self.iter += self.parameters["max_iter"]
370 |
371 | def update_iters(rk):
372 | self.iter +=1
373 |
374 | 		return scipy.sparse.linalg.cg(A, b, tol=rel_tol, atol=abs_tol, maxiter=self.parameters["max_iter"], callback=update_iters) # returns (x, info) as scipy does
375 |
376 |
377 |
378 |
379 |
380 |
381 |
382 |
383 |
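384 | # A minimal NumPy reference for the preconditioned CG recurrence used in
385 | # CGSolver.solve above (Saad, Iterative Methods for Sparse Linear Systems,
386 | # 2nd ed.). This is an editor's sketch, not part of the hessianlearn API;
387 | # it is useful for sanity-checking the update formulas without a tf.Session.
388 | def pcg_reference(A_mv, b, M_inv=lambda r: r, rel_tol=1e-9, abs_tol=1e-12, max_iter=100):
389 | 	x = np.zeros_like(b)
390 | 	r = b - A_mv(x) # initial residual r = b - Ax_0
391 | 	z = M_inv(r) # apply preconditioner z = M^{-1}r
392 | 	p = z.copy()
393 | 	rz = np.dot(r,z)
394 | 	tol = max(rz*rel_tol*rel_tol, abs_tol*abs_tol)
395 | 	if rz < tol:
396 | 		return x
397 | 	Ap = A_mv(p)
398 | 	pAp = np.dot(p,Ap)
399 | 	if pAp <= 0.0: # negative direction before any step is taken
400 | 		return x + p
401 | 	for _ in range(max_iter):
402 | 		alpha = rz/pAp
403 | 		x = x + alpha*p
404 | 		r = r - alpha*Ap
405 | 		z = M_inv(r)
406 | 		rz_next = np.dot(r,z)
407 | 		if rz_next < tol: # relative/absolute residual test
408 | 			return x
409 | 		p = z + (rz_next/rz)*p
410 | 		rz = rz_next
411 | 		Ap = A_mv(p)
412 | 		pAp = np.dot(p,Ap)
413 | 		if pAp <= 0.0: # negative direction reached: stop without stepping
414 | 			return x
415 | 	return x
416 | # e.g. A = np.diag([1.,2.,3.]); x = pcg_reference(lambda v: A.dot(v), np.ones(3))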
--------------------------------------------------------------------------------