├── hessianexps ├── run_loop.sh ├── model │ ├── __init__.py │ ├── logreg.py │ ├── fullconn.py │ ├── forward.py │ └── abstractmodel.py ├── rundataplotter.py ├── rayleigh.py ├── data.py ├── config.py └── runexperiment.py ├── .gitignore └── README.md /hessianexps/run_loop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for number in {1..50} 3 | do 4 | python runexperiment.py 5 | echo $number 6 | done 7 | 8 | -------------------------------------------------------------------------------- /hessianexps/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .abstractmodel import Model, create_model 2 | from .fullconn import create_fully_connected_model 3 | from .logreg import create_logreg_model 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | */.DS_Store 3 | */*/.DS_Store 4 | 5 | .ipynb_checkpoints 6 | */.ipynb_checkpoints 7 | */*/.ipynb_checkpoints 8 | 9 | */__pycache__ 10 | */*/__pycache__ 11 | 12 | 13 | -------------------------------------------------------------------------------- /hessianexps/model/logreg.py: -------------------------------------------------------------------------------- 1 | from autograd import numpy as np 2 | 3 | from . import Model 4 | from .forward import make_logreg_predict 5 | 6 | 7 | def create_logreg_model(args, inputs, targets): 8 | """Create a Logistic Regression model. (Factory function)""" 9 | initial_params = np.ones(args.input_dim) * 0.01 10 | predict_logreg = make_logreg_predict(inputs) 11 | mdl = Model(initial_params, predict_logreg, inputs, targets) 12 | 13 | return mdl 14 | -------------------------------------------------------------------------------- /hessianexps/rundataplotter.py: -------------------------------------------------------------------------------- 1 | """ 2 | Script to visualize generated dataset. 3 | 4 | Uses the same command-line arguments as runexperiment.py. 5 | """ 6 | import matplotlib.pyplot as plt 7 | 8 | import config 9 | import data 10 | 11 | 12 | if __name__ == '__main__': 13 | args = config.parse_command_line_arguments() 14 | inputs, targets = data.generate_data(args) 15 | labels = targets.argmax(axis=1) # position of 1 for each sample 16 | plt.scatter(inputs[:, 0], inputs[:, 1], c=labels, lw=0) 17 | plt.show() 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # hessian-for-basicDL 2 | 3 | Developed in collaboration with [*vug*](https://github.com/vug). The repo can calculate the Hessian matrix and/or its spectrum for simple neural nets. It can also find the largest and smallest few eigenvalues without finding the full Hessian. 4 | 5 | ## References 6 | 7 | Some results using this code has been used in the following work: 8 | 9 | [1] Levent Sagun, Utku Evci, V. Ugur Guney, Yann Dauphin, Leon Bottou, [*Empirical Analysis of the Hessian of Over-Parametrized Neural Networks*](https://arxiv.org/abs/1706.04454) 10 | 11 | Relevant related works are: 12 | 13 | [2] Levent Sagun, Leon Bottou, Yann LeCun, [*Eigenvalues of the Hessian in Deep Learning: Singularity and Beyond*](https://arxiv.org/abs/1611.07476) 14 | 15 | [3] Pratik Chaudhari, Anna Choromanska, Stefano Soatto, Yann LeCun, Carlo Baldassi, Christian Borgs, Jennifer Chayes, Levent Sagun, Riccardo Zecchina, [*Entropy-SGD: Biasing Gradient Descent Into Wide Valleys*](https://arxiv.org/abs/1611.01838) 16 | -------------------------------------------------------------------------------- /hessianexps/model/fullconn.py: -------------------------------------------------------------------------------- 1 | from autograd import numpy as np 2 | 3 | from .forward import make_fully_connected_predict 4 | from . import Model 5 | 6 | 7 | def create_fully_connected_model(args, inputs, targets): 8 | """Create a Fully-Connected NN model. (Factory function)""" 9 | hidden_layer_sizes = args.layer_sizes 10 | param_scale = 0.1 11 | 12 | layer_sizes = [args.input_dim] + hidden_layer_sizes + [args.num_classes] 13 | init_params = initalize_parameters(layer_sizes, param_scale) 14 | predict_full_conn = make_fully_connected_predict(inputs) 15 | 16 | mdl = Model(init_params, predict_full_conn, inputs, targets) 17 | 18 | return mdl 19 | 20 | 21 | def initalize_parameters(layer_sizes, scale=0.1): 22 | """Build a list of (weights, biases) tuples, one for each layer in the net.""" 23 | params = [] 24 | for n_units_in_curr_layer, n_units_in_next_layer in zip(layer_sizes[:-1], layer_sizes[1:]): 25 | weight_matrix = scale * np.random.randn(n_units_in_curr_layer, n_units_in_next_layer) 26 | bias_vector = scale * np.random.randn(n_units_in_next_layer) 27 | params.append((weight_matrix, bias_vector)) 28 | return params 29 | -------------------------------------------------------------------------------- /hessianexps/model/forward.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module that includes makers of different prediction functions of different classification models. 3 | 4 | * They take inputs. 5 | * Create predict() function that takes classifier parameters as input 6 | * and return classifier of predictions. 7 | * make_ functions encapsulate `inputs` so that predict() is a function of single argument. 8 | """ 9 | from autograd import numpy as np 10 | 11 | 12 | def make_logreg_predict(inputs): 13 | """Return probability prediction function of Logistic Regression.""" 14 | def predict(weights, inputs=inputs): 15 | probs_label_1 = sigmoid_exp(np.dot(inputs, weights)) 16 | probs_label_1 = probs_label_1.reshape(-1, 1) 17 | probs_label_0 = 1.0 - probs_label_1 18 | probs = np.hstack([probs_label_0, probs_label_1]) 19 | return probs 20 | 21 | return predict 22 | 23 | 24 | def make_fully_connected_predict(inputs): 25 | """Return probability prediction function of Fully-Connected Neural Network.""" 26 | def predict(params, inputs=inputs, activation=relu): 27 | """ 28 | :param params: a list of (weights, bias) tuples 29 | :param activation: the activation function 30 | :return: normalized class probabilities 31 | """ 32 | inp = inputs 33 | for W, b in params: 34 | out = np.dot(inp, W) + b 35 | inp = activation(out) # activation tanh 36 | return softmax(out) 37 | 38 | return predict 39 | 40 | 41 | def softmax(mat): 42 | return np.exp(mat) / np.sum(np.exp(mat), axis=1, keepdims=True) 43 | 44 | def relu(x): 45 | return np.maximum(0, x) 46 | 47 | def sigmoid_exp(x): 48 | return 1.0 / (1.0 + np.exp(-x)) 49 | 50 | def sigmoid_tanh(x): 51 | return 0.5 * (np.tanh(x) + 1.0) 52 | -------------------------------------------------------------------------------- /hessianexps/rayleigh.py: -------------------------------------------------------------------------------- 1 | from autograd import numpy as np 2 | from scipy.optimize import fmin_l_bfgs_b 3 | 4 | 5 | def get_top_eigensystem(model, input_dim, k=1, tolerance=1e-5): 6 | """Compute the largest eigenvalue and vectors by the Rayleigh quotient minimization.""" 7 | return get_extremal_eigensystem(model, input_dim, direction='top', k=k, tolerance=tolerance) 8 | 9 | 10 | def get_bottom_eigensystem(model, input_dim, k=1, tolerance=1e-5): 11 | """Compute the smallest eigenvalue and vectors by the Rayleigh quotient minimization.""" 12 | return get_extremal_eigensystem(model, input_dim, direction='bottom', k=k, tolerance=tolerance) 13 | 14 | 15 | def get_extremal_eigensystem(model, input_dim, direction='top', k=1, tolerance=1e-5): 16 | """Compute the smallest/largest eigenvalue and vectors by the Rayleigh quotient minimization.""" 17 | def maximizer(v): 18 | # Note: i, eigenvectors and model are defined in the outer function get_extremal_eigensystem(). 19 | quot, grad_quot = _largest_rayleigh_quotient_in_subspace(v, i, eigenvectors, model) 20 | if direction == 'top': 21 | return -quot, -grad_quot 22 | else: 23 | return quot, grad_quot 24 | 25 | eigenvalues = [] 26 | eigenvectors = [] 27 | for i in range(k): 28 | v0 = np.random.normal(0, 1, len(model.params_flat)) 29 | v_max, lambda_max, _ = fmin_l_bfgs_b(maximizer, v0, pgtol=tolerance) 30 | if direction == 'top': 31 | lambda_max *= -1 32 | eigenvalues.append(lambda_max) 33 | eigenvectors.append(v_max) 34 | return eigenvalues, eigenvectors 35 | 36 | 37 | def _largest_rayleigh_quotient_in_subspace(v, i, eigvecs, model): 38 | """Compute largest Rayleigh quotient and its gradient of Hessian in subspace. 39 | 40 | Subspace is the vector space of Hessian minus the eigenspace spanned by first i eigenvectors. 41 | 42 | :param eigvecs: the list of first i eigenvectors 43 | """ 44 | if i != 0: 45 | matrix = np.concatenate((eigvecs, [v]), axis=0) 46 | q, r = np.linalg.qr(matrix.transpose()) 47 | v = q.transpose()[i] * r[i][i] 48 | return model.rayleigh_quotient(v), model.grad_rayleigh(v) 49 | -------------------------------------------------------------------------------- /hessianexps/data.py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets 2 | from sklearn import preprocessing 3 | 4 | 5 | def generate_data(args): 6 | if args.data_type == 'blob': 7 | inputs, targets = generate_blobs(2 * args.num_samples, args.input_dim, args.cov_factor, args.num_classes, args.data_seed) 8 | elif args.data_type == 'circle': 9 | inputs, targets = generate_circles(2 * args.num_samples, args.data_seed) 10 | elif args.data_type == 'moon': 11 | inputs, targets = generate_moons(2 * args.num_samples, args.data_seed) 12 | else: 13 | raise Exception('Unknown --data-type: {}'.format(args.data_type)) 14 | 15 | encoder = preprocessing.OneHotEncoder() 16 | targets_nx1 = targets.reshape(-1, 1) 17 | targets_onehot = encoder.fit_transform(targets_nx1).toarray() 18 | inputs_scaled = preprocessing.scale(inputs) 19 | 20 | inputs_train, inputs_test = inputs_scaled[:args.num_samples], inputs_scaled[args.num_samples:] 21 | targets_train, targets_test = targets_onehot[:args.num_samples], targets_onehot[args.num_samples:] 22 | return inputs_train, targets_train, inputs_test, targets_test 23 | 24 | 25 | def generate_blobs(num_samples, input_dim, cov_factor, num_classes, seed=None): 26 | """N dimensional features in two classes (Gaussian blobs).""" 27 | inputs, targets = datasets.make_blobs(n_samples=num_samples, n_features=input_dim, cluster_std=cov_factor, random_state=seed, 28 | centers=num_classes) 29 | return inputs, targets 30 | 31 | 32 | def generate_circles(num_samples, seed=None, noise=0.1, factor =0.7): 33 | """Generate 2D features in two classes (concentric circles). 34 | 35 | http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_circles.html 36 | 37 | :param noise: Standard deviation of Gaussian noise added to the data 38 | :param factor: Scale factor between inner and outer circle 39 | """ 40 | inputs, targets = datasets.make_circles(n_samples=num_samples, shuffle=True, noise=noise, random_state=seed, factor=factor) 41 | return inputs, targets 42 | 43 | 44 | def generate_moons(num_samples, seed=None, noise=0.1): 45 | """Generate 2D features in two classes (up and down crescent moons). 46 | 47 | http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_moons.html 48 | 49 | :param noise: Standard deviation of Gaussian noise added to the data 50 | """ 51 | inputs, targets = datasets.make_moons(n_samples=num_samples, shuffle=True, noise=noise, random_state=seed) 52 | return inputs, targets 53 | -------------------------------------------------------------------------------- /hessianexps/model/abstractmodel.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for abstract Model class. 3 | 4 | Different models differ only by their prediction function predict(). 5 | """ 6 | import autograd 7 | from autograd import numpy as np 8 | from autograd.util import flatten, quick_grad_check 9 | 10 | import model 11 | 12 | 13 | class Model(object): 14 | """Generic model that have methods to calculate loss, gradient etc.""" 15 | 16 | def __init__(self, params, predict, inputs, targets): 17 | """Construct a Model object given a prediction function.""" 18 | self.__params = params 19 | self.__params_flat, self.unflatten_params = flatten(self.params) 20 | self.predict = predict 21 | self.inputs = inputs 22 | self.targets = targets 23 | 24 | self.gradient = autograd.grad(self.loss) 25 | self.hessian = autograd.hessian(self.loss) 26 | self.hess_dot_vec = autograd.hessian_vector_product(self.loss) 27 | self.grad_rayleigh = autograd.grad(self.rayleigh_quotient) 28 | 29 | def loss(self, params_flat, inputs=None, targets=None): 30 | """Take unflatten parameters, flatten them and calculate log-loss.""" 31 | params = self.unflatten_params(params_flat) 32 | 33 | if inputs is None: 34 | predictions = self.predict(params) 35 | return log_loss(self.targets, predictions) 36 | 37 | predictions = self.predict(params, inputs) 38 | return log_loss(targets, predictions) 39 | 40 | @property 41 | def params(self): 42 | return self.__params 43 | 44 | @params.setter 45 | def params(self, params): 46 | self.__params = params 47 | self.__params_flat, self.unflatten_params = flatten(self.__params) 48 | 49 | @property 50 | def params_flat(self): 51 | return self.__params_flat 52 | 53 | @params_flat.setter 54 | def params_flat(self, params_flat): 55 | self.__params_flat = params_flat 56 | self.__params = self.unflatten_params(self.__params_flat) 57 | 58 | def rayleigh_quotient(self, vec): 59 | hv_val = self.hess_dot_vec(self.params_flat, vec) 60 | rq = np.dot(hv_val, vec) / np.dot(vec, vec) 61 | return rq 62 | 63 | 64 | def log_loss(targets, preds, eps=1e-15): 65 | preds = np.clip(preds, eps, 1 - eps) 66 | return - np.mean(np.sum(targets * np.log(preds), axis=1)) 67 | 68 | 69 | def create_model(args, inputs, targets): 70 | if args.classifier == 'logreg': 71 | mdl = model.create_logreg_model(args, inputs, targets) 72 | elif args.classifier == 'fullconn': 73 | mdl = model.create_fully_connected_model(args, inputs, targets) 74 | else: 75 | raise Exception('Unknown classifier type {}'.format(args.classifier)) 76 | 77 | quick_grad_check(mdl.loss, mdl.params_flat, verbose=False) 78 | return mdl 79 | -------------------------------------------------------------------------------- /hessianexps/config.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | 4 | def parse_command_line_arguments(): 5 | parser = argparse.ArgumentParser() 6 | 7 | # Data Generation 8 | parser.add_argument('--data-type', type=str, choices=['blob', 'circle', 'moon'], default='blob', 9 | help='Type of random data generation pattern.') 10 | parser.add_argument('--num-samples', type=int, default=100, 11 | help='Number of training samples') 12 | parser.add_argument('--input-dim', type=int, default=2, 13 | help='Dimension of the input space') 14 | parser.add_argument('--num-classes', type=int, default=2, 15 | help='Number of classes in generated data. ' 16 | 'Taken into account only by classifiers and data generators that support multi-class') 17 | parser.add_argument('--cov-factor', type=float, default=1., 18 | help='Multiplier for the covariance of the data') 19 | parser.add_argument('--data-seed', type=int, default=None, 20 | help='Seed for random number generation of data generators') 21 | 22 | # Model 23 | parser.add_argument('--classifier', type=str, choices=['logreg', 'fullconn'], default='logreg', 24 | help='Type of classifier. Logistic Regression, Fully-Connected NN.') 25 | parser.add_argument('--layer-sizes', nargs='*', type=int, default=[10, 5], 26 | help='Number of units in hidden layers. ' 27 | 'First layer will have --input-dim units. Last layer will have --num-classes units.') 28 | 29 | # Training 30 | parser.add_argument('--batch-size', type=int, default=0, 31 | help='Number of samples in a training batch. ' 32 | '0 means whole dataset') 33 | parser.add_argument('--learning-rate', type=float, default=0.1, 34 | help='Learning rate') 35 | parser.add_argument('--stopping-grad-norm', type=float, default=1e-4, 36 | help='Stop training if grad_norm becomes smaller than this threshold') 37 | parser.add_argument('--max-iterations', type=int, default=1000000, 38 | help='Cancel training after maximum number of iteration steps') 39 | 40 | # Results 41 | parser.add_argument('--hessian-calc-period', type=int, default=0, 42 | help='Calculate hessian at every N iteration. ' 43 | '0 means do not calculate at intermediate iterations.') 44 | parser.add_argument('--results-folder', type=str, default='/Users/leventsagun/Dropbox/calismalar/second_order_explorations_NIPS2017', 45 | help='Folder in which to put all results folders') 46 | parser.add_argument('--experiment-folder', type=str, default='defaults', 47 | help='Folder in which to write results files') 48 | 49 | # Hessian analysis 50 | parser.add_argument('--top-evals', type=int, default=2, 51 | help='Find top k evals') 52 | parser.add_argument('--bottom-evals', type=int, default=2, 53 | help='Find bottom k evals') 54 | 55 | args = parser.parse_args() 56 | 57 | args.results_folder = args.results_folder + '/' + args.experiment_folder 58 | 59 | if args.classifier == 'logreg' and args.data_type == 'blob' and args.num_classes != 2: 60 | raise Exception('LogReg for more than 2 classes is not implemented yet.') 61 | return args 62 | -------------------------------------------------------------------------------- /hessianexps/runexperiment.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from datetime import datetime 4 | 5 | import autograd.numpy as np 6 | 7 | import config 8 | import data 9 | import model 10 | from rayleigh import get_top_eigensystem, get_bottom_eigensystem 11 | 12 | 13 | def main(): 14 | args = config.parse_command_line_arguments() 15 | 16 | inputs_train, targets_train, inputs_test, targets_test = data.generate_data(args) 17 | results = { 18 | 'inputs_train': inputs_train, 19 | 'targets_train': targets_train, 20 | 'inputs_test': inputs_test, 21 | 'targets_test': targets_test 22 | } 23 | 24 | mdl = model.create_model(args, inputs_train, targets_train) 25 | 26 | train_model(args, mdl, results) 27 | 28 | 29 | def train_model(args, mdl, results): 30 | results['args'] = args 31 | init_loss = mdl.loss(mdl.params_flat) 32 | init_grad_norm = np.linalg.norm(mdl.gradient(mdl.params_flat)) 33 | print('Initial loss: {}, norm grad: {}'.format(init_loss, init_grad_norm)) 34 | results['init_full_loss'] = init_loss 35 | results['init_full_grad_norm'] = init_grad_norm 36 | 37 | results['history1'] = [] 38 | results['history1_columns'] = ['iter_no', 'batch_loss', 'batch_grad_norm', 'batch_param_norm'] 39 | results['history2'] = [] 40 | results['history2_columns'] = ['full_hessian', 'full_hessian_evals'] 41 | for iter_no in range(args.max_iterations): 42 | inputs, targets = get_batch_samples(iter_no, args, mdl) 43 | batch_loss = mdl.loss(mdl.params_flat, inputs, targets) 44 | batch_grad = mdl.gradient(mdl.params_flat, inputs, targets) 45 | batch_grad_norm = np.linalg.norm(batch_grad) 46 | batch_param_norm = np.linalg.norm(mdl.params_flat) 47 | results['history1'].append([iter_no, batch_loss, batch_grad_norm, batch_param_norm]) 48 | 49 | if iter_no == 0 or (args.hessian_calc_period != 0 and iter_no % args.hessian_calc_period == 0): 50 | hess = mdl.hessian(mdl.params_flat) 51 | evals = np.linalg.eigvalsh(hess) 52 | results['history2'].append([iter_no, hess, evals]) 53 | 54 | if batch_grad_norm <= args.stopping_grad_norm: 55 | break 56 | 57 | mdl.params_flat -= batch_grad * args.learning_rate 58 | print('{:06d} {} loss: {:.8f}, norm grad: {:.8f}'.format( 59 | iter_no, datetime.now(), batch_loss, batch_grad_norm)) 60 | 61 | final_loss = mdl.loss(mdl.params_flat) 62 | final_grad_norm = np.linalg.norm(mdl.gradient(mdl.params_flat)) 63 | print('Final loss: {}, norm grad: {}'.format(final_loss, final_grad_norm)) 64 | results['final_full_loss'] = final_loss 65 | results['final_full_grad_norm'] = final_grad_norm 66 | 67 | hess = mdl.hessian(mdl.params_flat) 68 | evals = np.linalg.eigvalsh(hess) 69 | results['history2'].append([iter_no, hess, evals]) 70 | results['final_params'] = mdl.params 71 | 72 | save_results(args, results) 73 | 74 | 75 | def get_batch_samples(iter_no, args, mdl): 76 | """Return inputs and outputs belonging to batch given iteration number.""" 77 | if args.batch_size == 0: 78 | return None, None 79 | 80 | num_batches = int(np.ceil(len(mdl.inputs) / args.batch_size)) 81 | mod_iter_no = iter_no % num_batches 82 | start = mod_iter_no * args.batch_size 83 | end = (mod_iter_no + 1) * args.batch_size 84 | inputs = mdl.inputs[start:end] 85 | targets = mdl.targets[start:end] 86 | return inputs, targets 87 | 88 | 89 | def save_results(args, results): 90 | if not os.path.exists(args.results_folder): 91 | os.makedirs(args.results_folder) 92 | now = datetime.now() 93 | now_str = now.strftime('%Y%m%d-%H%M%S-%f') 94 | filename = 'results-{}.pkl'.format(now_str) 95 | filepath = os.path.join(args.results_folder, filename) 96 | with open(filepath, 'wb') as fh: 97 | pickle.dump(results, fh) 98 | 99 | 100 | def compute_eigensystem(args, mdl): 101 | """Compute the eigenvalues of the symmetric (exact) Hessian.""" 102 | evs = np.linalg.eigvalsh(mdl.hessian(mdl.params_flat)) 103 | print('all eigenvalues {}'.format(np.array(sorted(evs)))) 104 | 105 | top_evals, top_evecs = get_top_eigensystem(mdl, args.input_dim, args.top_evals) 106 | bottom_evals, bottom_evecs = get_bottom_eigensystem(mdl, args.input_dim, args.bottom_evals) 107 | print('largest eigenvalue by Raleigh quotient {}'.format(np.array(sorted(top_evals)))) 108 | print('smallest eigenvalue by Raleigh quotient {}'.format(np.array(sorted(bottom_evals)))) 109 | 110 | srayeigs = np.array(sorted(bottom_evals) + sorted(top_evals)) 111 | seigs = np.array(sorted(evs[:args.bottom_evals]) + sorted(evs[-args.top_evals:])) 112 | print('percentage diff', np.array((srayeigs - seigs) / seigs * 100., dtype=np.int32)) 113 | 114 | 115 | if __name__ == '__main__': 116 | main() 117 | --------------------------------------------------------------------------------