├── tests ├── pytest.ini ├── __init__.py ├── utils │ ├── __init__.py │ └── shared.py ├── problems │ ├── __init__.py │ ├── convolutional.py │ ├── conv_utils.py │ └── linear.py ├── test_experiments.py ├── test_conv.py ├── test_nn.py ├── test_finders.py └── test_optimizers.py ├── requirements.txt ├── autocrit ├── nn │ ├── __init__.py │ ├── conv.py │ ├── networks.py │ └── layers.py ├── utils │ ├── __init__.py │ ├── math.py │ └── random_matrix.py ├── finders │ ├── __init__.py │ ├── base.py │ ├── gradnormmin.py │ ├── newtons.py │ └── minresQLP.py ├── __init__.py ├── defaults.py ├── optimizers.py └── experiments.py ├── .flake8 ├── requirements-dev.txt ├── README.md ├── setup.py └── .gitignore /tests/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = -m "not slow" 3 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | from . import problems 2 | 3 | __all__ = ["problems"] 4 | -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from . import shared 2 | 3 | __all__ = ["shared"] 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | autograd>=1.2 2 | numpy>=1.16 3 | pandas>=0.24 4 | scipy>=1.2 5 | -------------------------------------------------------------------------------- /tests/problems/__init__.py: -------------------------------------------------------------------------------- 1 | from . import linear 2 | 3 | __all__ = ["linear"] 4 | -------------------------------------------------------------------------------- /autocrit/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from . import layers, networks 2 | 3 | __all__ = ["layers", "networks"] 4 | -------------------------------------------------------------------------------- /autocrit/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from . import math, random_matrix 2 | 3 | __all__ = ["math", "random_matrix"] 4 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E731, E741, W504 3 | max-line-length = 100 4 | 5 | exclude = autocrit/finders/minresQLP.py 6 | -------------------------------------------------------------------------------- /autocrit/finders/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import base, gradnormmin, newtons 2 | 3 | __all__ = ["base", "gradnormmin", "newtons"] 4 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | autograd>=1.2 2 | numpy>=1.16 3 | pandas>=0.24 4 | scipy==1.1 5 | 6 | torch 7 | torchvision 8 | 9 | pyflakes>=2.1.1 10 | pytest>=4.4.1 11 | scikit-learn>=0.20.3 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # autocrit 2 | 3 | Autograd-based package for finding critical points of functions using Newton-based methods and gradient norm minimization. 4 | Specifically intended for use with neural networks, see `autocrit.nn` module. 5 | 6 | To install, clone this repository, navigate to the directory it was cloned into, and then use the command 7 | ``` 8 | pip install -e autocrit 9 | ``` 10 | 11 | For additional tools useful in the analysis of critical point-finding experiments, 12 | see the [`autocrit_tools` repo](https://github.com/charlesfrye/autocrit_tools). 13 | -------------------------------------------------------------------------------- /autocrit/__init__.py: -------------------------------------------------------------------------------- 1 | from .finders import gradnormmin, newtons 2 | from . import experiments, finders, nn, optimizers 3 | 4 | GradientNormMinimizer = gradnormmin.GradientNormMinimizer 5 | 6 | FastNewtonMR = newtons.FastNewtonMR 7 | FastNewtonTR = newtons.FastNewtonTR 8 | 9 | OptimizationExperiment = experiments.OptimizationExperiment 10 | CritFinderExperiment = experiments.CritFinderExperiment 11 | 12 | FullyConnectedNetwork = nn.networks.FullyConnected 13 | 14 | __all__ = ["finders", "optimizers", 15 | "gradnormmin", "newtons", 16 | "GradientNormMinimizer", 17 | "FastNewtonMR", "FastNewtonTR", 18 | "OptimizationExperiment", "CritFinderExperiment", 19 | "FullyConnectedNetwork"] 20 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | version = "0.0.1" 4 | 5 | with open("README.md", "r") as f: 6 | long_description = f.read() 7 | 8 | setuptools.setup( 9 | name="autocrit", 10 | version="0.0.1", 11 | author="Charles Frye", 12 | author_email="cfrye59@gmail.com", 13 | description="Critical point-finding algorithms in autograd", 14 | long_description=long_description, 15 | long_description_content_type="text/markdown", 16 | packages=setuptools.find_packages(), 17 | classifiers=[ 18 | "Programming Language :: Python :: 3.5", 19 | "Programming Language :: Python :: 3.6", 20 | "Programming Language :: Python :: 3.7", 21 | "Programming Language :: Python :: 3.8", 22 | "Operating System :: OS Independent"], 23 | ) 24 | -------------------------------------------------------------------------------- /autocrit/defaults.py: -------------------------------------------------------------------------------- 1 | """Shared default values for numerical constants/hyperparameters 2 | """ 3 | # ALPHA: learning rate 4 | DEFAULT_ALPHA = 0.1 5 | # BETA: learning rate decrease factor in BTLS 6 | DEFAULT_BETA = 0.5 7 | # RHO: scaling factor for Armijo/sufficient decrease criterion 8 | DEFAULT_RHO = 1e-4 9 | # RHO_PURE: same as RHO, but for the Pure Newton check 10 | DEFAULT_RHO_PURE = 0.5 11 | # GAMMA: scaling factor for Wolfe/sufficient 
curvature decrease criterion 12 | DEFAULT_GAMMA = 0.9 13 | # GAMMAS: "nudge" to add to diagonal in NewtonTR 14 | DEFAULT_GAMMAS = (1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6) 15 | 16 | # MOMENTUM: momentum coefficient for MomentumOptimizers 17 | DEFAULT_MOMENTUM = 0.9 18 | 19 | # MINIMIZER_PARAMS: default parameters for GNM minimizer 20 | DEFAULT_MINIMIZER_PARAMS = {"lr": DEFAULT_ALPHA} 21 | 22 | # STEP_SIZE: "learning rate" for Newton methods 23 | DEFAULT_STEP_SIZE = 1.0 24 | 25 | # RTOL, MAXIT: parameters for MRQLP step of NewtonMR 26 | DEFAULT_RTOL = 1e-10 27 | DEFAULT_MAXIT = 25 28 | -------------------------------------------------------------------------------- /tests/test_experiments.py: -------------------------------------------------------------------------------- 1 | import autograd.numpy as np 2 | 3 | import autocrit 4 | 5 | import tests.utils.shared as shared 6 | 7 | 8 | def test_OptimizationExperiment(tmpdir): 9 | """test saving, execution, and loading for default kwargs 10 | """ 11 | num_iters = 10 12 | 13 | _, network, init_theta = shared.generate_random_shallow_regression() 14 | 15 | experiment = autocrit.OptimizationExperiment(network.loss) 16 | 17 | outfile = tmpdir / "optexpt.json" 18 | experiment_test(experiment, network.loss, 19 | init_theta, num_iters, outfile) 20 | 21 | 22 | def test_CritFinderExperiment(tmpdir): 23 | """test saving, execution, and loading for default kwargs 24 | """ 25 | num_iters = 10 26 | 27 | _, network, init_theta = shared.generate_random_shallow_regression() 28 | 29 | experiment = autocrit.CritFinderExperiment(network.loss, "newtonMR") 30 | 31 | outfile = tmpdir / "cfexpt.json" 32 | experiment_test(experiment, network.loss, 33 | init_theta, num_iters, outfile) 34 | 35 | 36 | def experiment_test(experiment, f, init_theta, num_iters, outfile): 37 | thetas = experiment.run(init_theta, num_iters=num_iters) 38 | 39 | experiment.to_json(outfile) 40 | 41 | reloaded_expt = experiment.from_json(f, outfile) 42 | assert experiment.construct_dictionary() == reloaded_expt.construct_dictionary() 43 | 44 | reloaded_thetas = reloaded_expt.run(init_theta, num_iters=num_iters) 45 | assert np.array_equal(thetas, reloaded_thetas) 46 | -------------------------------------------------------------------------------- /tests/test_conv.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import autograd 4 | import autograd.numpy as np 5 | 6 | import autocrit.nn.conv 7 | 8 | CONV_KWARGS = {"axes": ([2, 3], [2, 3]), 9 | "dot_axes": ([1], [0]), 10 | "mode": "valid"} 11 | 12 | 13 | def test_accelerated_equivalence(): 14 | warnings.filterwarnings("ignore") 15 | batch = 10 16 | in_ch = 3 17 | out_ch = 16 18 | k_size = 3 19 | 20 | X = np.random.randn(batch, in_ch, 32, 32) 21 | w = np.random.randn(out_ch, in_ch, k_size, k_size) 22 | w = np.ascontiguousarray(np.transpose(w, (1, 0, 2, 3))) 23 | 24 | y = autocrit.nn.conv.convolve(X, w, accelerated=False, **CONV_KWARGS) 25 | accelerated_y = autocrit.nn.conv.convolve(X, w, accelerated=True, **CONV_KWARGS) 26 | 27 | loss_grads = loss_grad(X, w) 28 | accelerated_loss_grads = accelerated_loss_grad(X, w) 29 | 30 | assert np.allclose(y, accelerated_y),\ 31 | "accelerated output not equal to autograd output" 32 | assert np.allclose(loss_grads, accelerated_loss_grads),\ 33 | "accelerated gradients not equal to autograd gradients" 34 | 35 | 36 | def accelerated_loss(X, w): 37 | activations = autocrit.nn.conv.convolve(X, w, accelerated=True, **CONV_KWARGS) 38 | squared_activations = 
np.square(activations) 39 | return np.mean(squared_activations) 40 | 41 | 42 | def loss(X, w): 43 | activations = autocrit.nn.conv.convolve(X, w, accelerated=False, **CONV_KWARGS) 44 | squared_activations = np.square(activations) 45 | return np.mean(squared_activations) 46 | 47 | 48 | accelerated_loss_grad = autograd.grad(accelerated_loss, argnum=1) 49 | loss_grad = autograd.grad(loss, argnum=1) 50 | -------------------------------------------------------------------------------- /autocrit/utils/math.py: -------------------------------------------------------------------------------- 1 | import autograd.numpy as np 2 | 3 | XH_EPS = 1e-25 4 | RESCALE_EPS = 1e-6 5 | 6 | 7 | def rms(arr): 8 | return np.sqrt(np.mean(np.square(arr))) 9 | 10 | 11 | def relu(x): 12 | return np.where(x > 0., x, 0.) 13 | 14 | 15 | def softplus(x, lam=5.): 16 | return 1 / lam * np.log(1 + np.exp(lam * x)) 17 | 18 | 19 | def sigmoid(x): 20 | return np.where(x >= 0, _positive_sigm(x), _negative_sigm(x)) 21 | 22 | 23 | def swish(x): 24 | return np.multiply(x, sigmoid(x)) 25 | 26 | 27 | def _negative_sigm(x): 28 | expon = np.exp(-x) 29 | return 1 / (1 + expon) 30 | 31 | 32 | def _positive_sigm(x): 33 | expon = np.exp(x) 34 | return expon / (1 + expon) 35 | 36 | 37 | def mean_squared_error(x, xhat): 38 | return np.mean(np.square(x - xhat)) 39 | 40 | 41 | def softmax_cross_entropy(l, p): 42 | phat = softmax(l) 43 | return np.mean(cross_entropy(p, phat)) 44 | 45 | 46 | def softmax(x): 47 | expon = np.exp(x - np.max(x, axis=0)) 48 | return expon / np.sum(expon, axis=0) 49 | 50 | 51 | def cross_entropy(ps, qs, eps=XH_EPS): 52 | return np.einsum("ij,ij->j", ps, -np.log(qs + eps)) 53 | 54 | 55 | def logits_to_labels(logits): 56 | return np.argmax(logits, axis=0) 57 | 58 | 59 | def accuracy(yhats, ys): 60 | return np.mean(yhats == ys) 61 | 62 | 63 | def assess_accuracy(network, theta, X, Y_iis): 64 | logits = network.forward_pass(X, theta) 65 | labels = logits_to_labels(logits) 66 | return accuracy(labels, Y_iis) 67 | 68 | 69 | def pointwise_nonlinearity(parameters, x, nonlinearity): 70 | W, b = parameters 71 | return nonlinearity(np.dot(W, x) + b) 72 | 73 | 74 | def cossim(x, y): 75 | return np.dot(x.T, y) / (np.linalg.norm(x) * np.linalg.norm(y)) 76 | 77 | 78 | def rescale(arr, eps=RESCALE_EPS): 79 | return (arr - np.min(arr)) / max((np.max(arr) - np.min(arr)), eps) 80 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Generic Python 2 | # 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # dotenv 85 | .env 86 | 87 | # virtualenv 88 | .venv 89 | venv/ 90 | ENV/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | .spyproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # mkdocs documentation 100 | /site 101 | 102 | # mypy 103 | .mypy_cache/ 104 | 105 | # Vim 106 | *~ 107 | 108 | ## Project-Specific 109 | 110 | # data files 111 | *.npz 112 | *.npy 113 | *.pkl 114 | *.out 115 | 116 | # notebooks 117 | nbs/ 118 | -------------------------------------------------------------------------------- /tests/problems/convolutional.py: -------------------------------------------------------------------------------- 1 | import autograd.numpy as np 2 | 3 | from . import conv_utils 4 | 5 | 6 | class Classification(object): 7 | 8 | def __init__(self, X, Y, Y_iis): 9 | """ 10 | X : array, h x w x ch x n input observations 11 | Y : array, l x n output label onehots 12 | Y_iis : array, 1 x n output label integers 13 | """ 14 | self.X = X 15 | self.Y = Y 16 | self.Y_iis = Y_iis 17 | 18 | self.exact_solution = None 19 | 20 | def loss(self, soln): 21 | if soln is None: 22 | return 0 23 | else: 24 | raise ValueError( 25 | "loss not implemented for convolutional classification") 26 | 27 | @classmethod 28 | def generate_test_problem(cls, n, im_side=17, autocorr_scale=5.): 29 | """Generate a test convolutional linear classification // 30 | logistic regression problem using a mixture of Gaussians. 31 | 32 | The problem is to separate images with a pink power spectrum 33 | from images with a white power spectrum. 34 | 35 | Images are square with side length im_side and the pink noise 36 | images have an autocorrelation scale monotonically increasing 37 | with autocorr_scale. 
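The n requested observations are split evenly between the two classes.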
38 | """ 39 | n = n // 2 40 | 41 | X, Y, Y_iis = generate_convolutional_mog_data(n, im_side, autocorr_scale) 42 | 43 | return cls(X, Y, Y_iis) 44 | 45 | 46 | def generate_convolutional_mog_data(n, im_side=17, autocorr_scale=5.): 47 | 48 | circ_cov_mat = conv_utils.generate_iostropic_circulant_cov_2d( 49 | im_side, autocorr_scale=autocorr_scale) 50 | circ_class_samples = conv_utils.rgb_gauss_random_samples( 51 | n, cov_or_covs=circ_cov_mat) 52 | 53 | white_noise_cov_mat = np.eye(im_side ** 2) 54 | noise_class_samples = conv_utils.rgb_gauss_random_samples( 55 | n, cov_or_covs=white_noise_cov_mat) 56 | 57 | # combine batch major samples from each class 58 | inputs = np.concatenate([circ_class_samples.T, noise_class_samples.T]) 59 | 60 | # covert to images, then return to batch minor 61 | inputs = np.asarray( 62 | [conv_utils.to_im_rgb(inpt, im_side) for inpt in inputs]).T 63 | 64 | # generate one_hot and label vectors 65 | one_hots = np.hstack([np.tile(np.atleast_2d([1, 0]).T, [1, n]), 66 | np.tile(np.atleast_2d([0, 1]).T, [1, n])]) 67 | labels = np.argmax(one_hots, axis=0) 68 | 69 | return inputs, one_hots, labels 70 | -------------------------------------------------------------------------------- /autocrit/finders/base.py: -------------------------------------------------------------------------------- 1 | """Provides an abstract base class for critical-point-finding algorithms, 2 | aka finders. 3 | 4 | Primarily handles high-level API and logging. Also infers Hessian and 5 | Hessian-vector product (hvp) oracles from a zeroth-order oracle. 6 | """ 7 | import autograd 8 | import autograd.numpy as np 9 | 10 | 11 | class Finder(object): 12 | """Abstract base class for critical-point-finding algorithms (finders). 13 | """ 14 | 15 | def __init__(self, f, grad_kwargs=None, log_kwargs=None): 16 | self.f = f 17 | self.grad_f = autograd.grad(f) 18 | self.H = lambda theta: np.squeeze(autograd.hessian(f)(theta)) 19 | self.hvp = lambda theta, v: np.dot(self.H(theta), v) 20 | 21 | self.log = {} 22 | self.loggers = [] 23 | if log_kwargs is None: 24 | log_kwargs = {} 25 | self.log_kwargs = log_kwargs 26 | 27 | self.setup_logs(**log_kwargs) 28 | 29 | def run(self): 30 | raise NotImplementedError 31 | 32 | def update_logs(self, step_info): 33 | for logger in self.loggers: 34 | logger.write_log(step_info, self.log) 35 | 36 | def setup_logs(self, track_theta=False, track_f=True, track_grad_f=False, track_g=False, 37 | track_update=False): 38 | if track_theta: 39 | self.loggers.append( 40 | Logger("theta", 41 | lambda step_info: step_info["theta"])) 42 | 43 | if track_f: 44 | self.loggers.append( 45 | Logger("f_theta", 46 | lambda step_info: self.f(step_info["theta"]))) 47 | 48 | if track_grad_f: 49 | self.loggers.append( 50 | Logger("grad_theta", 51 | lambda step_info: self.grad_f(step_info["theta"]))) 52 | 53 | if track_g: 54 | self.loggers.append( 55 | Logger("g_theta", 56 | lambda step_info: 0.5 * np.sum(np.square(self.grad_f(step_info["theta"]))))) 57 | 58 | if track_update: 59 | self.loggers.append( 60 | Logger("update_direction", 61 | lambda step_info: step_info["update_direction"])) 62 | 63 | 64 | class Logger(object): 65 | 66 | def __init__(self, key, log_func): 67 | self.key = key 68 | self.log_func = log_func 69 | 70 | def write_log(self, step_info, log): 71 | if self.key not in log.keys(): 72 | log[self.key] = [] 73 | log[self.key].append(self.log_func(step_info)) 74 | -------------------------------------------------------------------------------- /autocrit/nn/conv.py: 
-------------------------------------------------------------------------------- 1 | # flake8: noqa E221, E251 2 | """Adds optional torch acceleration to autograd's convolve function. 3 | """ 4 | import autograd.scipy.signal as _autograd_signal 5 | from functools import partial 6 | import autograd.numpy as np 7 | import numpy as npo # original numpy 8 | from autograd.extend import primitive, defvjp 9 | 10 | try: 11 | import torch 12 | import torch.nn.functional as torch_F 13 | torch_accelerated = True 14 | except ImportError: 15 | torch_accelerated = False 16 | 17 | 18 | def convolve(A, B, axes=None, dot_axes=[(), ()], mode='full', accelerated=torch_accelerated): 19 | args_are_implemented = check_implemented(axes, dot_axes, mode) 20 | if accelerated and args_are_implemented: 21 | return _torch_convolve(A, B, axes=axes, dot_axes=dot_axes, mode=mode) 22 | else: 23 | return _autograd_signal.convolve(A, B, axes=axes, dot_axes=dot_axes, mode=mode) 24 | 25 | 26 | @primitive 27 | def _torch_convolve(A, B, axes=None, dot_axes=[(), ()], mode='full'): 28 | B = np.ascontiguousarray(np.transpose(B[:, :, ::-1, ::-1], (1, 0, 2, 3))) 29 | At, Bt = torch.tensor(A), torch.tensor(B) 30 | if tuple(dot_axes) == ([0], [0]): 31 | At = torch.transpose(At, 0 ,1) 32 | yt = torch_F.conv2d(Bt, At) 33 | yt = torch.flip(torch.transpose(yt, 0, 1), (-2, -1)) 34 | else: 35 | yt = torch_F.conv2d(At, Bt) 36 | return np.asarray(yt) 37 | 38 | 39 | def check_implemented(axes, dot_axes, mode): 40 | """Check whether a fast convolution with these argument values has been implemented.""" 41 | if tuple(axes) != ([2, 3], [2, 3]): 42 | return False 43 | if tuple(dot_axes) not in [([1], [0]), ([0], [0])]: 44 | return False 45 | if mode != "valid": 46 | return False 47 | 48 | return True 49 | 50 | 51 | def _torch_grad_convolve(argnum, ans, A, B, axes=None, dot_axes=[(), ()], mode='full'): 52 | assert mode in ['valid', 'full'], "Grad for mode {0} not yet implemented".format(mode) 53 | axes, shapes = _autograd_signal.parse_axes(A.shape, B.shape, axes, dot_axes, mode) 54 | if argnum == 0: 55 | _, Y = A, B 56 | _X_, _Y_ = 'A', 'B' 57 | ignore_Y = 'ignore_B' 58 | elif argnum == 1: 59 | _, Y = B, A 60 | _X_, _Y_ = 'B', 'A' 61 | ignore_Y = 'ignore_A' 62 | else: 63 | raise NotImplementedError("Can't take grad of convolve w.r.t. arg {0}".format(argnum)) 64 | 65 | if mode == 'full': 66 | new_mode = 'valid' 67 | else: 68 | if any([x_size > y_size for x_size, y_size 69 | in zip(shapes[_X_]['conv'], shapes[_Y_]['conv'])]): 70 | new_mode = 'full' 71 | else: 72 | new_mode = 'valid' 73 | 74 | def vjp(g): 75 | result = convolve(g, Y[tuple(_autograd_signal.flipped_idxs(Y.ndim, axes[_Y_]['conv']))], 76 | axes = [axes['out']['conv'], axes[_Y_]['conv']], 77 | dot_axes = [axes['out'][ignore_Y], axes[_Y_]['ignore']], 78 | mode = new_mode) 79 | new_order = npo.argsort(axes[_X_]['ignore'] + axes[_X_]['dot'] + axes[_X_]['conv']) 80 | return np.transpose(result, new_order) 81 | return vjp 82 | 83 | 84 | defvjp(_torch_convolve, partial(_torch_grad_convolve, 0), partial(_torch_grad_convolve, 1)) 85 | -------------------------------------------------------------------------------- /autocrit/finders/gradnormmin.py: -------------------------------------------------------------------------------- 1 | """Provides a Finder that performs gradient norm minimization to find critical points. 
2 | """ 3 | import json 4 | 5 | import autograd 6 | import autograd.numpy as np 7 | 8 | from .base import Finder, Logger 9 | from ..defaults import DEFAULT_ALPHA 10 | from ..optimizers import GradientDescentOptimizer, MomentumOptimizer 11 | from ..optimizers import BackTrackingLineSearchOptimizer 12 | 13 | DEFAULT_MINIMIZER_PARAMS = {"lr": DEFAULT_ALPHA} 14 | 15 | 16 | class GradientNormMinimizer(Finder): 17 | r"""Find critical points of function f by minimizing 18 | auxiliary function g where 19 | $$ 20 | g(theta) = \frac{1]{2}\lvert\nabla f(theta)\rvert^2 21 | $$ 22 | 23 | The gradient of g is the product of the hessian with the gradient. 24 | This can be more efficiently computed as a hessian-vector product. 25 | """ 26 | 27 | def __init__(self, f, log_kwargs=None, minimizer_str="gd", minimizer_params=None): 28 | Finder.__init__(self, f, log_kwargs=log_kwargs) 29 | 30 | def g(theta): 31 | return 0.5 * np.sum(np.square(self.grad_f(theta))) 32 | 33 | self.g = g 34 | self.grad_g = autograd.grad(g) 35 | self.hvp = autograd.hessian_vector_product(self.f) 36 | self.fast_grad_g = lambda x: self.hvp(x, self.grad_f(x)) 37 | 38 | self.minimizer_str = minimizer_str 39 | self.minimizer_params = minimizer_params or DEFAULT_MINIMIZER_PARAMS.copy() 40 | self.set_minimizer(minimizer_str) 41 | 42 | def run(self, init_theta, num_iters=1): 43 | theta = init_theta 44 | self.update_logs({"theta": theta}) 45 | 46 | for ii in range(num_iters): 47 | theta_new = theta + self.minimizer.update(theta) 48 | self.update_logs({"theta": theta_new}) 49 | 50 | if np.array_equal(theta, theta_new): 51 | return theta 52 | 53 | theta = theta_new 54 | 55 | return theta 56 | 57 | def setup_log(self, track_thetas=False, track_f_thetas=False, track_g_thetas=False): 58 | 59 | if track_thetas: 60 | self.loggers.append(Logger("theta", lambda step_info: step_info["theta"])) 61 | 62 | if track_f_thetas: 63 | self.loggers.append(Logger("f_theta", lambda step_info: self.f(step_info["theta"]))) 64 | 65 | if track_g_thetas: 66 | self.loggers.append(Logger("g_theta", lambda step_info: self.g(step_info["theta"]))) 67 | 68 | def set_minimizer(self, minimizer_str): 69 | if minimizer_str == "gd": 70 | self.minimizer = GradientDescentOptimizer( 71 | self.g, self.fast_grad_g, **self.minimizer_params) 72 | elif minimizer_str == "momentum": 73 | self.minimizer = MomentumOptimizer( 74 | self.g, self.fast_grad_g, **self.minimizer_params) 75 | elif minimizer_str == "btls": 76 | self.minimizer = BackTrackingLineSearchOptimizer( 77 | self.g, self.fast_grad_g, **self.minimizer_params) 78 | else: 79 | raise NotImplementedError 80 | 81 | def to_json(self, json_path): 82 | dictionary = self.construct_dictionary() 83 | with open(json_path, "w") as fp: 84 | json.write(dictionary, fp) 85 | 86 | @classmethod 87 | def from_json(cls, f, json_path): 88 | with open(json_path) as fp: 89 | dictionary = json.load(fp) 90 | return cls(f, **dictionary) 91 | 92 | def construct_dictionary(self): 93 | dictionary = {"log_kwargs": self.log_kwargs, 94 | "minimizer_str": self.minimizer_str, 95 | "minimzer_params": self.minimzer_params} 96 | return dictionary 97 | -------------------------------------------------------------------------------- /tests/test_nn.py: -------------------------------------------------------------------------------- 1 | import autograd.numpy as np 2 | import pytest 3 | 4 | import autocrit.nn as nn 5 | 6 | 7 | def test_id_net(): 8 | none_vector = np.asarray([[None]]) 9 | id_net = nn.networks.Network( 10 | (none_vector, none_vector), 11 | 
[nn.layers.PointwiseNonlinearLayer("none")]) 12 | 13 | assert id_net.forward_pass(None, np.asarray([None])) is None 14 | 15 | 16 | def test_equivalence_fc(): 17 | 18 | network_style, fc_style, data = make_network_and_fc_style() 19 | shared_theta = network_style.initialize() 20 | 21 | assert network_style.loss(shared_theta) == fc_style.loss(shared_theta) 22 | 23 | loss_val = network_style.loss(shared_theta) 24 | 25 | network_style_dict = network_style.construct_dict() 26 | fc_style_dict = fc_style.construct_dict() 27 | 28 | network_style_rebuild = nn.networks.Network(data, **network_style_dict) 29 | fc_style_rebuild = nn.networks.FullyConnected(data, **fc_style_dict) 30 | 31 | assert loss_val == network_style_rebuild.loss(shared_theta) 32 | assert loss_val == fc_style_rebuild.loss(shared_theta) 33 | 34 | 35 | def test_regularizer_l2(): 36 | scalmult = make_scalmult(regularizer_str="l2", 37 | regularization_parameter=1.) 38 | theta = scalmult.initialize() 39 | assert scalmult.loss(theta) == np.square(theta) 40 | 41 | scalmult = make_scalmult(regularizer_str="l2", 42 | regularization_parameter=0.5) 43 | theta = scalmult.initialize() 44 | assert scalmult.loss(theta) == 0.5 * np.square(theta) 45 | 46 | 47 | def test_regularizer_l1(): 48 | scalmult = make_scalmult(regularizer_str="l1", 49 | regularization_parameter=1.) 50 | theta = scalmult.initialize() 51 | assert scalmult.loss(theta) == np.abs(theta) 52 | 53 | scalmult = make_scalmult(regularizer_str="l1", 54 | regularization_parameter=-1.) 55 | theta = scalmult.initialize() 56 | assert scalmult.loss(theta) == -np.abs(theta) 57 | 58 | 59 | def test_to_from_json(tmpdir): 60 | with pytest.raises(NotImplementedError): 61 | none_vector = np.asarray([[None]]) 62 | lambda_id_net = nn.networks.Network( 63 | (none_vector, none_vector), 64 | [nn.layers.LambdaLayer(lambda x: x)]) 65 | path = tmpdir.join("lambda_id_net.json") 66 | lambda_id_net.to_json(path) 67 | 68 | network_style, fc_style, data = make_network_and_fc_style() 69 | 70 | network_style_path = tmpdir.join("network_style.json") 71 | network_style.to_json(network_style_path) 72 | 73 | fc_style_path = tmpdir.join("fc_style.json") 74 | fc_style.to_json(fc_style_path) 75 | 76 | network_style_rebuild = nn.networks.Network.from_json(data, network_style_path) 77 | fc_style_rebuild = nn.networks.FullyConnected.from_json(data, fc_style_path) 78 | 79 | shared_theta = network_style.initialize() 80 | 81 | assert network_style_rebuild.loss(shared_theta) == network_style.loss(shared_theta) 82 | assert fc_style_rebuild.loss(shared_theta) == fc_style.loss(shared_theta) 83 | 84 | 85 | def make_scalmult(**kwargs): 86 | data = (np.asarray([0]), np.asarray([0])) 87 | scalar_mult_layer = nn.layers.FCLayer(1, has_biases=False) 88 | scalmult = nn.networks.Network( 89 | data, layer_specs=[scalar_mult_layer], 90 | **kwargs) 91 | 92 | return scalmult 93 | 94 | 95 | def make_network_and_fc_style(): 96 | layer_sizes = [2, 4] 97 | data = (np.random.standard_normal((4, 1)), np.random.standard_normal((4, 1))) 98 | 99 | network_style = nn.networks.Network( 100 | data, 101 | layer_specs=[nn.layers.FCLayer(layer_size) for layer_size in layer_sizes]) 102 | 103 | fc_style = nn.networks.FullyConnected( 104 | data, 105 | layer_sizes=layer_sizes, 106 | nonlinearity_str="none") 107 | 108 | return network_style, fc_style, data 109 | -------------------------------------------------------------------------------- /tests/problems/conv_utils.py: -------------------------------------------------------------------------------- 1 | 
"""Utilities for generating gaussian random matrices 2 | with translation-invariant covariance (toroidal boundaries) 3 | and converting between "vector" and "image" representations of same. 4 | 5 | Images constructed in this fashion (with or without channel dependencies) 6 | are the natural targets of convolutional neural networks. 7 | """ 8 | import random 9 | 10 | import autograd.numpy as np 11 | import scipy 12 | 13 | from autocrit.utils.math import rescale 14 | 15 | EPS = 1e-3 16 | 17 | 18 | def rgb_gauss_random_samples(N, mean_or_means=None, cov_or_covs=None, im_sz=None): 19 | means, covs, im_sz = _handle_kwargs(mean_or_means, cov_or_covs, im_sz) 20 | 21 | samples_by_channel = np.asarray( 22 | [np.random.multivariate_normal(mean, cov, N).T 23 | for mean, cov in zip(means, covs)]) 24 | 25 | samples_batch_minor = np.moveaxis(samples_by_channel, [0, 1], [1, 0]) 26 | 27 | return samples_batch_minor 28 | 29 | 30 | def generate_iostropic_circulant_cov_2d(k, autocorr_scale=1.): 31 | """returns covariance matrix for translation-invariant, 32 | isotropic multivariate gaussian defined on a discrete torus 33 | with side length k 34 | """ 35 | isotropic_circulant_1d = generate_isotropic_circulant_2d_vector(k, autocorr_scale) 36 | 37 | circ_mat = circulant_2d_vector_to_circulant_2d_matrix(isotropic_circulant_1d) 38 | 39 | # check symmetry 40 | assert np.array_equal(circ_mat, (circ_mat.T + circ_mat) / 2) 41 | 42 | # impose PSD 43 | cov_mat = apply_damping(circ_mat) 44 | 45 | return cov_mat 46 | 47 | 48 | def to_im(vals, im_side): 49 | return np.reshape(vals, (im_side, im_side)) 50 | 51 | 52 | def from_im(im): 53 | return np.reshape(im, im.shape[0] ** 2) 54 | 55 | 56 | def to_im_rgb(rgb_vec, im_side): 57 | return np.asarray([to_im(ch_vec, im_side) for ch_vec in rgb_vec]) 58 | 59 | 60 | def apply_damping(mat, eps=EPS): 61 | eigvals = np.linalg.eigvalsh(mat) 62 | damping_coeff = np.abs(min([min(eigvals) - eps, -eps])) 63 | damped_mat = mat + damping_coeff * np.eye(mat.shape[0]) 64 | return damped_mat 65 | 66 | 67 | def generate_isotropic_circulant_2d_vector(k, autocorr_scale): 68 | gaussian = scipy.stats.multivariate_normal(mean=[0, 0]).pdf 69 | xs = ys = np.linspace(-autocorr_scale, autocorr_scale, k) 70 | Xs, Ys = np.meshgrid(xs, ys) 71 | isotropic_circulant_1d = np.asarray( 72 | [gaussian([x, y]) for x, y in zip(Xs.flatten(), Ys.flatten())]) 73 | isotropic_circulant_1d = np.roll(isotropic_circulant_1d, 74 | -np.argmax(isotropic_circulant_1d)) 75 | return isotropic_circulant_1d 76 | 77 | 78 | def circulant_2d_vector_to_circulant_2d_matrix(circulant_2d_vector): 79 | 80 | circulant_2d_matrix = np.asarray( 81 | [np.roll(circulant_2d_vector, ii) 82 | for ii in range(len(circulant_2d_vector))]) 83 | 84 | return circulant_2d_matrix 85 | 86 | 87 | def _handle_kwargs(mean_or_means, cov_or_covs, im_sz): 88 | kwargs = [mean_or_means, cov_or_covs, im_sz] 89 | assert not all([kwarg is None for kwarg in kwargs]) 90 | 91 | if im_sz is None: 92 | assert not (mean_or_means is None and cov_or_covs is None) 93 | 94 | if cov_or_covs is not None: 95 | if type(cov_or_covs) is not list: 96 | assert isinstance(cov_or_covs, np.ndarray) 97 | covs = 3 * [cov_or_covs] 98 | else: 99 | covs = cov_or_covs 100 | else: 101 | covs = None 102 | 103 | if mean_or_means is not None: 104 | if type(mean_or_means) is not list: 105 | assert isinstance(mean_or_means, np.ndarray) 106 | means = 3 * [mean_or_means] 107 | else: 108 | means = mean_or_means 109 | else: 110 | means = None 111 | 112 | if im_sz is None: 113 | if covs is not None: 
114 | im_sz = covs[0].shape[0] 115 | else: 116 | im_sz = means.shape[0] 117 | 118 | if means is None: 119 | means = 3 * [np.zeros(im_sz)] 120 | 121 | if covs is None: 122 | covs = 3 * [np.eye(im_sz)] 123 | 124 | return means, covs, im_sz 125 | 126 | 127 | def display_sample_rgb(rgbs_batch_minor, ax, im_side=None): 128 | random_rgb_vec = random.choice(rgbs_batch_minor.T) 129 | assert random_rgb_vec.shape[0] == 3 130 | if im_side is None: 131 | candidate_im_side = np.sqrt(random_rgb_vec.shape[1]) 132 | assert candidate_im_side == int(candidate_im_side) 133 | im_side = int(candidate_im_side) 134 | 135 | random_rgb_im = to_im_rgb(random_rgb_vec, im_side) 136 | if np.min(random_rgb_im) < 0: 137 | random_rgb_im = rescale(random_rgb_im) 138 | 139 | ax.imshow(random_rgb_im.T) 140 | ax.axis("off") 141 | -------------------------------------------------------------------------------- /tests/test_finders.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import autocrit 4 | import autocrit.utils.random_matrix 5 | import pytest 6 | 7 | import tests.utils.shared as shared 8 | 9 | 10 | def test_NewtonMethod(): 11 | warnings.filterwarnings("ignore") 12 | finder = autocrit.finders.newtons.NewtonMethod 13 | finder_str = "NewtonMethod" 14 | 15 | problem_str = "least squares" 16 | finder_kwargs = {} 17 | num_iters = 1 18 | 19 | random_least_squares_problem, random_init = \ 20 | shared.generate_random_least_squares() 21 | 22 | shared.convergence_test(finder, finder_str, finder_kwargs, 23 | random_least_squares_problem.loss, random_least_squares_problem, 24 | problem_str, random_init, num_iters) 25 | 26 | 27 | def test_FastNewtonMR(): 28 | warnings.filterwarnings("ignore") 29 | finder = autocrit.FastNewtonMR 30 | finder_str = "FastNewtonMR" 31 | 32 | problem_str = "least squares" 33 | finder_kwargs = {"alpha": 0.5, "beta": 0.99, "check_pure": True} 34 | num_iters = 500 35 | 36 | random_least_squares_problem, random_init = \ 37 | shared.generate_random_least_squares() 38 | 39 | shared.convergence_test(finder, finder_str, finder_kwargs, 40 | random_least_squares_problem.loss, random_least_squares_problem, 41 | problem_str, random_init, num_iters) 42 | 43 | problem_str = "shallow regression" 44 | finder_kwargs = {"alpha": 0.5, "beta": 0.9, "rho": 1e-6} 45 | num_iters = 250 46 | 47 | random_regression_problem, network, random_init = \ 48 | shared.generate_random_shallow_regression() 49 | 50 | shared.convergence_test(finder, finder_str, finder_kwargs, 51 | network.loss, random_regression_problem, problem_str, 52 | random_init, num_iters, 53 | test_soln_converge=False) 54 | 55 | 56 | def test_FastNewtonTR(): 57 | warnings.filterwarnings("ignore") 58 | finder = autocrit.FastNewtonTR 59 | finder_str = "FastNewtonTR" 60 | 61 | problem_str = "least squares" 62 | finder_kwargs = {"step_size": 0.5} 63 | num_iters = 25 64 | 65 | random_least_squares_problem, random_init = \ 66 | shared.generate_random_least_squares() 67 | 68 | shared.convergence_test(finder, finder_str, finder_kwargs, 69 | random_least_squares_problem.loss, random_least_squares_problem, 70 | problem_str, random_init, num_iters) 71 | 72 | problem_str = "shallow regression" 73 | finder_kwargs = {"step_size": 0.1} 74 | num_iters = 250 75 | 76 | random_regression_problem, network, random_init = \ 77 | shared.generate_random_shallow_regression() 78 | 79 | shared.convergence_test(finder, finder_str, finder_kwargs, 80 | network.loss, random_regression_problem, problem_str, 81 | random_init, num_iters, 
82 | test_soln_converge=False) 83 | 84 | 85 | @pytest.mark.slow 86 | def test_deep_classification(): 87 | warnings.filterwarnings("ignore") 88 | 89 | problem_str = "deep classification" 90 | 91 | finder = autocrit.FastNewtonMR 92 | finder_str = "FastNewtonMR" 93 | 94 | finder_kwargs = {"alpha": 1., "beta": 0.5, "rho": 1e-6, 95 | "check_pure": True} 96 | num_iters = 250 97 | 98 | random_classification_problem, network, random_init = \ 99 | shared.generate_random_deep_classification() 100 | 101 | shared.convergence_test(finder, finder_str, finder_kwargs, 102 | network.loss, random_classification_problem, problem_str, 103 | random_init, num_iters, 104 | test_function_converge=False, 105 | test_soln_converge=False) 106 | 107 | finder = autocrit.FastNewtonTR 108 | finder_str = "FastNewtonTR" 109 | 110 | finder_kwargs = {"step_size": 0.05} 111 | num_iters = 250 112 | 113 | random_classification_problem, network, random_init = \ 114 | shared.generate_random_deep_classification(seed=shared.SEED + 1) 115 | 116 | shared.convergence_test(finder, finder_str, finder_kwargs, 117 | network.loss, random_classification_problem, problem_str, 118 | random_init, num_iters, 119 | test_function_converge=False, 120 | test_soln_converge=False) 121 | 122 | 123 | def test_GradientNormMinimizer(): 124 | warnings.filterwarnings("ignore") 125 | finder = autocrit.GradientNormMinimizer 126 | finder_str = "GradientNormMinimizer" 127 | 128 | problem_str = "least squares" 129 | finder_kwargs = {"minimizer_str": "momentum", 130 | "minimizer_params": {"lr": 1e-2, 131 | "momentum": 0.9}} 132 | num_iters = 1000 133 | 134 | random_least_squares_problem, random_init = \ 135 | shared.generate_random_least_squares() 136 | 137 | shared.convergence_test(finder, finder_str, finder_kwargs, 138 | random_least_squares_problem.loss, random_least_squares_problem, 139 | problem_str, random_init, num_iters) 140 | -------------------------------------------------------------------------------- /tests/utils/shared.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import autograd 4 | import autograd.numpy as np 5 | 6 | import autocrit.nn as nn 7 | from autocrit.utils.random_matrix import generate_random_unit_vector 8 | import autocrit.utils.math as math 9 | import tests.problems.linear as linear 10 | import tests.problems.convolutional as convolutional 11 | 12 | SEED = 14 13 | 14 | CRITERION_STRS = ["gradient norm", "function val", "solution val"] 15 | CRITERION_VALS = [1e-5, 5e-5, 1e-5] 16 | 17 | DIM = 25 18 | 19 | K = 10 20 | L = 5 21 | M = 2 22 | N = 100 23 | 24 | FAIL_MSG = "{0} failed to converge on {1} in {2}:\n\t{3} > {4}" 25 | 26 | 27 | def convergence_test(algorithm, algorithm_str, algorithm_kwargs, 28 | loss, problem, problem_str, init, num_iters, 29 | test_function_converge=True, test_soln_converge=True): 30 | 31 | _, _, errors = evaluate(algorithm, loss, problem, init, num_iters, 32 | kwargs=algorithm_kwargs, 33 | calc_func_error=test_function_converge, 34 | calc_soln_rms_error=test_soln_converge) 35 | 36 | for criterion_str, error_val, criterion_val in zip(CRITERION_STRS, errors, CRITERION_VALS): 37 | assert_pass(algorithm_str, problem_str, criterion_str, error_val, criterion_val) 38 | 39 | 40 | def evaluate(algorithm_constructor, loss, 41 | problem, init, num_iters, 42 | kwargs=None, 43 | calc_func_error=True, 44 | calc_soln_rms_error=True): 45 | if kwargs is None: 46 | kwargs = {} 47 | 48 | algorithm = algorithm_constructor(loss, **kwargs) 49 | solution = 
algorithm.run(init, num_iters) 50 | exact_solution = problem.exact_solution 51 | 52 | grad_rms_error = math.rms(autograd.grad(loss)(solution)) 53 | errors = [grad_rms_error] 54 | 55 | if calc_func_error: 56 | func_error = loss(solution) - problem.loss(exact_solution) 57 | errors.append(func_error) 58 | 59 | if calc_soln_rms_error: 60 | soln_rms_error = math.rms(solution.ravel() - exact_solution.T.ravel()) 61 | errors.append(soln_rms_error) 62 | 63 | return solution, exact_solution, errors 64 | 65 | 66 | def assert_pass(algorithm_str, problem_str, criterion_str, error_val, criterion_val): 67 | fail_msg = FAIL_MSG.format( 68 | algorithm_str, problem_str, criterion_str, error_val, criterion_val) 69 | 70 | assert error_val <= criterion_val, fail_msg 71 | 72 | 73 | def generate_random_least_squares(dim=DIM, seed=SEED): 74 | np.random.seed(seed) 75 | random_least_squares_problem = linear.LeastSquares.\ 76 | generate_random_problem(dim=dim) 77 | random_init = generate_random_unit_vector(dim=dim) 78 | 79 | return random_least_squares_problem, random_init 80 | 81 | 82 | def generate_random_shallow_regression(k=K, l=L, n=N, seed=SEED): 83 | np.random.seed(seed) 84 | random_regression_problem = linear.Regression.\ 85 | generate_random_problem(k=k, l=l, n=n) 86 | 87 | shallow_network = nn.networks.FullyConnected( 88 | (random_regression_problem.X, random_regression_problem.Y), 89 | layer_sizes=[l], 90 | nonlinearity_str="none", 91 | has_biases=False) 92 | 93 | random_init = shallow_network.initialize() 94 | 95 | return random_regression_problem, shallow_network, random_init 96 | 97 | 98 | def generate_random_shallow_classification(k=K, m=M, n=N, seed=SEED): 99 | np.random.seed(seed) 100 | random.seed(seed) 101 | random_classification_problem = linear.Classification.\ 102 | generate_random_problem(k=k, m=m, n=n) 103 | 104 | shallow_network = nn.networks.FullyConnected( 105 | (random_classification_problem.X, random_classification_problem.Y), 106 | layer_sizes=[m], 107 | nonlinearity_str="none", 108 | has_biases=False, 109 | cost_str="softmax_cross_entropy") 110 | 111 | random_init = shallow_network.initialize() 112 | 113 | return random_classification_problem, shallow_network, random_init 114 | 115 | 116 | def generate_random_deep_classification(k=K, m=M, n=N, seed=SEED): 117 | np.random.seed(seed) 118 | random.seed(seed) 119 | random_classification_problem = linear.Classification.\ 120 | generate_random_problem(k=k, m=m, n=n) 121 | 122 | p = min(k, m) 123 | deep_network = nn.networks.FullyConnected( 124 | (random_classification_problem.X, random_classification_problem.Y), 125 | layer_sizes=[p, m], 126 | nonlinearity_str="none", 127 | has_biases=False, 128 | regularizer_str="l2", 129 | regularization_parameter=0.1, 130 | cost_str="softmax_cross_entropy") 131 | 132 | random_init = deep_network.initialize() 133 | 134 | return random_classification_problem, deep_network, random_init 135 | 136 | 137 | def generate_test_conv_classification(n=N, seed=SEED): 138 | np.random.seed(seed) 139 | random.seed(seed) 140 | test_classification_problem = convolutional.Classification.\ 141 | generate_test_problem(n=n) 142 | 143 | conv_network = nn.networks.Network( 144 | (test_classification_problem.X, test_classification_problem.Y), 145 | layer_specs=[nn.layers.ConvLayer((4, 4), 2), 146 | nn.layers.MaxPoolLayer((2, 2)), 147 | nn.layers.GlobalAvgPoolLayer(), 148 | nn.layers.SqueezeLayer()], 149 | cost_str="softmax_cross_entropy") 150 | 151 | random_init = conv_network.initialize() 152 | 153 | return 
test_classification_problem, conv_network, random_init 154 | -------------------------------------------------------------------------------- /tests/problems/linear.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import autograd 4 | import autograd.numpy as np 5 | import sklearn.linear_model 6 | 7 | from autocrit.utils import random_matrix 8 | import autocrit.utils.math as math 9 | 10 | 11 | class LeastSquares(object): 12 | EPS = 1e-20 13 | 14 | def __init__(self, A, b): 15 | self.A = A 16 | self.b = b 17 | 18 | self.grad = autograd.grad(self.loss) 19 | 20 | self.exact_solution = self.solve() 21 | 22 | assert self.loss(self.exact_solution) < self.EPS 23 | assert math.rms(self.grad(self.exact_solution)) < np.sqrt(self.EPS) 24 | 25 | def loss(self, x): 26 | return np.sum(np.square(np.dot(self.A, x) - self.b)) 27 | 28 | def solve(self): 29 | return np.dot(np.linalg.pinv(self.A), self.b) 30 | 31 | @classmethod 32 | def generate_random_problem(cls, dim=25, eps=5e-1): 33 | random_psd_matrix = random_matrix.Wishart(dim, dim) 34 | A = np.eye(dim) + eps * random_psd_matrix.M 35 | b = random_matrix.generate_random_unit_vector(dim=dim) 36 | 37 | return cls(A, b) 38 | 39 | 40 | class Regression(object): 41 | EPS = 1e-20 42 | 43 | def __init__(self, X, Y): 44 | """ 45 | X : array, k x n input observations 46 | Y : array, l x n output observations 47 | 48 | Attributes: 49 | ----------- 50 | W : array, l x k parameter matrix 51 | """ 52 | self.X = X 53 | self.Y = Y 54 | 55 | self.grad = autograd.grad(self.loss) 56 | self.H = autograd.hessian(self.loss) 57 | 58 | self.exact_solution = self.solve() 59 | 60 | assert math.rms(self.grad(self.exact_solution)) < np.sqrt(self.EPS) 61 | 62 | def loss(self, W): 63 | return np.mean(np.square(np.dot(W.T, self.X) - self.Y)) 64 | 65 | def solve(self): 66 | return np.dot( 67 | np.dot( 68 | np.linalg.pinv(np.dot(self.X, self.X.T)), 69 | self.X), 70 | self.Y.T) 71 | 72 | @classmethod 73 | def generate_random_problem(cls, k, l, n, sigma=1.): 74 | """Generate a random linear regression problem. 
75 | 76 | Parameters: 77 | ----------- 78 | 79 | k : int, dimension of X 80 | l : int, dimension of Y 81 | n : int, number of observations 82 | sigma: float, expected norm of additive noise on Y 83 | 84 | Returns: 85 | -------- 86 | 87 | regression_problem : LinearRegressionProblem, class combining loss, data, solver 88 | """ 89 | 90 | # independent gaussian vectors with approx unit norm 91 | X = 1 / np.sqrt(k) * np.random.standard_normal(size=(k, n)) 92 | X -= np.mean(X, axis=1)[:, None] 93 | 94 | # unit norm transformation with uniform orientation 95 | W = np.asarray( 96 | [random_matrix.generate_random_unit_vector(dim=k) for _ in range(l)])\ 97 | .T.squeeze() 98 | 99 | Y = np.dot(W.T, X) 100 | Y += np.sqrt(sigma / l) * np.random.standard_normal(size=Y.shape) 101 | 102 | return cls(X, Y) 103 | 104 | 105 | class Classification(object): 106 | EPS = 1e-20 107 | 108 | def __init__(self, X, Y, Y_iis): 109 | """ 110 | X : array, k x n input observations 111 | Y : array, l x n output label onehots 112 | Y_iis : array, 1 x n output label integers 113 | """ 114 | self.X = X 115 | self.Y = Y 116 | self.Y_iis = Y_iis 117 | 118 | self.grad = autograd.grad(self.loss) 119 | self.H = autograd.hessian(self.loss) 120 | 121 | # minimally regularized LogisticRegression 122 | self.sklearn_model = sklearn.linear_model.LogisticRegression( 123 | solver="sag", fit_intercept=False, C=1e4) 124 | 125 | self.exact_solution = self.solve() 126 | 127 | def loss(self, W): 128 | _W = np.hstack([-W, W]) 129 | logits = np.dot(_W.T, self.X) 130 | 131 | return math.softmax_cross_entropy(logits, self.Y) 132 | 133 | def solve(self): 134 | self.sklearn_model.fit(self.X.T, self.Y_iis) 135 | 136 | return np.atleast_2d(self.sklearn_model.coef_).T 137 | 138 | @classmethod 139 | def generate_random_problem(cls, k, m, n): 140 | """Generate a random linear classification // logistic regression 141 | problem using a mixture of Gaussians. 142 | 143 | Parameters: 144 | ----------- 145 | 146 | k : int, dimension of X 147 | m : int, number of labels 148 | n : int, number of observations 149 | 150 | Returns: 151 | -------- 152 | X : array, k x n input observations 153 | Y : array, m x n one_hot labels 154 | regression_problem : RegressionProblem, class combining loss, data, solver 155 | """ 156 | 157 | X, Y, Y_iis, mus, covs = sample_gaussian_mixture(k, m, n) 158 | 159 | return cls(X, Y, Y_iis) 160 | 161 | 162 | def sample_gaussian_mixture(k, m, n, mus=None, covs=None): 163 | mus = [random_matrix.generate_random_unit_vector(dim=k) for _ in range(m)] 164 | covs = [np.eye(k) / np.sqrt(k) for _ in range(m)] 165 | 166 | labels = list(range(m)) 167 | 168 | Y_iis = [] 169 | Y = [] 170 | X = [] 171 | one_hots = np.eye(m) 172 | 173 | for _ in range(n): 174 | y_ii = random.choice(labels) 175 | y = one_hots[y_ii] 176 | x = np.random.multivariate_normal(np.squeeze(mus[y_ii]), covs[y_ii]) 177 | 178 | Y_iis.append(y_ii) 179 | Y.append(y) 180 | X.append(x) 181 | 182 | X = np.asarray(X).T 183 | Y = np.atleast_2d(np.asarray(Y)).T 184 | Y_iis = np.asarray(Y_iis) 185 | 186 | return X, Y, Y_iis, mus, covs 187 | -------------------------------------------------------------------------------- /tests/test_optimizers.py: -------------------------------------------------------------------------------- 1 | """Tests GradientDescentOptimizer, MomentumOptimizer, and BacktrackingLineSearchOptimizer 2 | for convergence in function value, gradient norm, and solution value for linear least squares 3 | linear regression, and linear classification with a shallow network. 
4 | Additionally tests MomentumOptimizer on a linear convolutional problem. 5 | """ 6 | import warnings 7 | 8 | import autocrit 9 | import autocrit.utils.random_matrix 10 | 11 | import tests.utils.shared as shared 12 | 13 | 14 | def test_GradientDescentOptimizer(): 15 | optimizer = autocrit.optimizers.GradientDescentOptimizer 16 | optimizer_str = "GradientDescentOptimizer" 17 | 18 | problem_str = "least squares" 19 | optimizer_kwargs = {} 20 | num_iters = 1000 21 | 22 | random_least_squares_problem, random_init = \ 23 | shared.generate_random_least_squares() 24 | 25 | shared.convergence_test(optimizer, optimizer_str, optimizer_kwargs, 26 | random_least_squares_problem.loss, random_least_squares_problem, 27 | problem_str, random_init, num_iters) 28 | 29 | problem_str = "shallow regression" 30 | optimizer_kwargs = {} 31 | num_iters = 10000 32 | 33 | random_regression_problem, network, random_init = \ 34 | shared.generate_random_shallow_regression() 35 | 36 | shared.convergence_test(optimizer, optimizer_str, optimizer_kwargs, 37 | network.loss, random_regression_problem, problem_str, 38 | random_init, num_iters) 39 | 40 | problem_str = "shallow classification" 41 | optimizer_kwargs = {} 42 | num_iters = 12500 43 | 44 | random_classification_problem, network, random_init = \ 45 | shared.generate_random_shallow_classification() 46 | 47 | shared.convergence_test(optimizer, optimizer_str, optimizer_kwargs, 48 | network.loss, random_classification_problem, problem_str, 49 | random_init, num_iters, test_soln_converge=False) 50 | 51 | 52 | def test_MomentumOptimizer(): 53 | warnings.filterwarnings("ignore") 54 | optimizer = autocrit.optimizers.MomentumOptimizer 55 | optimizer_str = "MomentumOptimizer" 56 | 57 | problem_str = "least squares" 58 | optimizer_kwargs = {} 59 | num_iters = 1000 60 | 61 | random_least_squares_problem, random_init = \ 62 | shared.generate_random_least_squares() 63 | 64 | shared.convergence_test(optimizer, optimizer_str, optimizer_kwargs, 65 | random_least_squares_problem.loss, random_least_squares_problem, 66 | problem_str, random_init, num_iters) 67 | 68 | problem_str = "shallow regression" 69 | optimizer_kwargs = {} 70 | num_iters = 1000 71 | 72 | random_regression_problem, network, random_init = \ 73 | shared.generate_random_shallow_regression() 74 | 75 | shared.convergence_test(optimizer, optimizer_str, optimizer_kwargs, 76 | network.loss, random_regression_problem, problem_str, 77 | random_init, num_iters) 78 | 79 | problem_str = "shallow classification" 80 | optimizer_kwargs = {} 81 | num_iters = 1000 82 | 83 | random_classification_problem, network, random_init = \ 84 | shared.generate_random_shallow_classification() 85 | 86 | shared.convergence_test(optimizer, optimizer_str, optimizer_kwargs, 87 | network.loss, random_classification_problem, problem_str, 88 | random_init, num_iters, test_soln_converge=False) 89 | 90 | problem_str = "convolutional classification" 91 | optimizer_kwargs = {"momentum": 0.99} 92 | num_iters = 1000 93 | 94 | test_classification_problem, network, random_init = \ 95 | shared.generate_test_conv_classification() 96 | 97 | shared.convergence_test(optimizer, optimizer_str, optimizer_kwargs, 98 | network.loss, test_classification_problem, problem_str, 99 | random_init, num_iters, test_soln_converge=False) 100 | 101 | 102 | def test_BackTrackingLineSearchOptimizer(dim=25): 103 | optimizer = autocrit.optimizers.BackTrackingLineSearchOptimizer 104 | optimizer_str = "BackTrackingLineSearchOptimizer" 105 | 106 | problem_str = "least squares" 107 | 
optimizer_kwargs = {} 108 | num_iters = 1000 109 | 110 | random_least_squares_problem, random_init = \ 111 | shared.generate_random_least_squares() 112 | 113 | shared.convergence_test(optimizer, optimizer_str, optimizer_kwargs, 114 | random_least_squares_problem.loss, random_least_squares_problem, 115 | problem_str, random_init, num_iters) 116 | 117 | problem_str = "shallow regression" 118 | optimizer_kwargs = {"gamma": 1 - 1e-3} 119 | num_iters = 100 120 | 121 | random_regression_problem, network, random_init = \ 122 | shared.generate_random_shallow_regression() 123 | 124 | shared.convergence_test(optimizer, optimizer_str, optimizer_kwargs, 125 | network.loss, random_regression_problem, problem_str, 126 | random_init, num_iters) 127 | 128 | problem_str = "shallow classification" 129 | optimizer_kwargs = {"gamma": 1 - 1e-3} 130 | num_iters = 100 131 | 132 | random_classification_problem, network, random_init = \ 133 | shared.generate_random_shallow_classification() 134 | 135 | shared.convergence_test(optimizer, optimizer_str, optimizer_kwargs, 136 | network.loss, random_classification_problem, problem_str, 137 | random_init, num_iters, test_soln_converge=False) 138 | -------------------------------------------------------------------------------- /autocrit/optimizers.py: -------------------------------------------------------------------------------- 1 | """Optimization algorithms using zeroth- and first-order oracles. 2 | 3 | Includes gradient descent, momentum, and backtracking line search, 4 | using either the standard Wolfe criterion or the Roosta criterion from 5 | the paper on Newton-MR. 6 | """ 7 | import autograd 8 | import autograd.numpy as np 9 | 10 | from autocrit.defaults import DEFAULT_ALPHA, DEFAULT_MOMENTUM 11 | from autocrit.defaults import DEFAULT_BETA, DEFAULT_GAMMA, DEFAULT_RHO 12 | 13 | 14 | class FirstOrderOptimizer(object): 15 | """Abstract Base Class for optimizers with a zeroth- and first-order oracle. 16 | 17 | If no first-order oracle is provided, it is computed from the zeroth-order 18 | oracle with autograd.""" 19 | 20 | def __init__(self, f, grad_f): 21 | self.f = f 22 | if grad_f is None: 23 | self.grad_f = autograd.grad(f) 24 | else: 25 | self.grad_f = grad_f 26 | 27 | def run(self, init, num_iters): 28 | solution = np.copy(init) 29 | 30 | for _ in range(num_iters): 31 | solution += self.update(solution) 32 | 33 | return solution 34 | 35 | 36 | class GradientDescentOptimizer(FirstOrderOptimizer): 37 | """FirstOrderOptimizer that uses scaled gradients to update.""" 38 | 39 | def __init__(self, f, grad_f=None, lr=DEFAULT_ALPHA): 40 | super().__init__(f, grad_f) 41 | self.lr = lr 42 | 43 | def update(self, theta): 44 | return -self.lr * self.grad_f(theta) 45 | 46 | 47 | class MomentumOptimizer(FirstOrderOptimizer): 48 | """FirstOrderOptimizer that maintains a 'velocity' term in addition to scaled gradients. 49 | 50 | If initial velocity is not provided in init_velocity, starts at 0. 
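Each call to update computes velocity <- grad_f(theta) + momentum * velocity and returns the step -lr * velocity.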
51 | """ 52 | 53 | def __init__(self, f, grad_f=None, lr=DEFAULT_ALPHA, momentum=DEFAULT_MOMENTUM, 54 | init_velocity=None): 55 | super().__init__(f, grad_f) 56 | self.lr = lr 57 | self.momentum = momentum 58 | 59 | self.velocity = init_velocity 60 | 61 | def update(self, theta): 62 | if self.velocity is None: 63 | self.velocity = np.zeros_like(theta) 64 | self.velocity = self.grad_f(theta) + self.momentum * self.velocity 65 | update = -self.lr * self.velocity 66 | 67 | return update 68 | 69 | 70 | class BackTrackingLineSearchOptimizer(FirstOrderOptimizer): 71 | """FirstOrderOptimizer that uses line search over the gradient direction. 72 | 73 | Can use either the traditional Wolfe criterion for terminating the line search 74 | or the new critertion from Roosta et al., 2018. 75 | """ 76 | 77 | def __init__(self, f, grad_f=None, hvp=None, 78 | alpha=DEFAULT_ALPHA, beta=DEFAULT_BETA, 79 | rho=DEFAULT_RHO, gamma=None, 80 | criterion="wolfe"): 81 | 82 | super().__init__(f, grad_f) 83 | self.set_criterion(criterion, gamma) 84 | 85 | self.alpha = alpha 86 | self.beta = beta 87 | self.rho = rho 88 | 89 | self.min_step_size = self.compute_min_step_size(self.alpha, self.beta) 90 | 91 | if hvp is None: 92 | self.hvp = autograd.hessian_vector_product(self.f) 93 | else: 94 | self.hvp = hvp 95 | 96 | def update(self, theta): 97 | update_direction = -self.grad_f(theta) 98 | converged = self.check_convergence(theta, update_direction) 99 | while not converged: 100 | self.alpha *= self.beta 101 | if self.alpha <= self.min_step_size: 102 | return np.zeros_like(theta) 103 | converged = self.check_convergence(theta, update_direction) 104 | step = self.alpha * update_direction 105 | self.alpha /= self.beta 106 | return step 107 | 108 | def set_criterion(self, criterion_str, gamma): 109 | self.criterion_str = criterion_str 110 | 111 | if self.criterion_str is None: 112 | return 113 | 114 | if self.criterion_str == "roosta": 115 | self.check_convergence = self.roosta_criterion 116 | elif self.criterion_str == "wolfe": 117 | self.check_convergence = self.wolfe_criterion 118 | if gamma is None: 119 | self.gamma = DEFAULT_GAMMA 120 | else: 121 | self.gamma = gamma 122 | else: 123 | raise NotImplementedError 124 | 125 | def roosta_criterion(self, theta, update_direction): 126 | proposed_update = theta + self.alpha * update_direction 127 | updated_f = self.f(proposed_update) 128 | current_f = self.f(theta) 129 | 130 | sufficient_decrease = 2 * self.rho * self.alpha * np.dot( 131 | self.hvp(theta, update_direction).T, self.grad_f(theta)) 132 | 133 | return (updated_f <= 134 | current_f + sufficient_decrease) 135 | 136 | def wolfe_criterion(self, theta, update_direction): 137 | proposed_update = theta + self.alpha * update_direction 138 | updated_f = self.f(proposed_update) 139 | current_f = self.f(theta) 140 | 141 | current_grad = self.grad_f(theta) 142 | grad_update_product = np.dot(update_direction.T, current_grad) 143 | 144 | new_grad = self.grad_f(proposed_update) 145 | new_grad_update_product = np.dot(update_direction.T, new_grad) 146 | 147 | passed_armijo = updated_f <= current_f + self.rho * self.alpha * grad_update_product 148 | 149 | passed_curvature = -new_grad_update_product <= -self.gamma * grad_update_product 150 | 151 | return passed_armijo and passed_curvature 152 | 153 | @staticmethod 154 | def compute_min_step_size(alpha, beta): 155 | while alpha * beta != alpha: 156 | alpha *= beta 157 | return alpha 158 | -------------------------------------------------------------------------------- 
/autocrit/utils/random_matrix.py: -------------------------------------------------------------------------------- 1 | import autograd.numpy as np 2 | import scipy.integrate 3 | 4 | PRECISION = 1e-4 5 | 6 | 7 | class RandomMatrix(object): 8 | 9 | def __init__(self): 10 | self.symmetric = False 11 | return 12 | 13 | def eigvals(self): 14 | if self.symmetric: 15 | return np.linalg.eigvalsh(self.M) 16 | else: 17 | return np.linalg.eigvals(self.M) 18 | 19 | def expected_cumulative_spectral_distribution(self, lam, precision=PRECISION, accumulate=False): 20 | if lam < self.min_lam: 21 | return 0. 22 | 23 | lams = self.generate_lams(lam, precision) 24 | 25 | singular_mass = self.expected_spectral_singular_mass() 26 | density_values = [self.expected_spectral_density(lam) for lam in lams] 27 | if not accumulate: 28 | accumulated_density = scipy.integrate.trapz(density_values, lams) 29 | return ((lam >= 0) * singular_mass) + accumulated_density 30 | else: 31 | accumulated_densities = scipy.integrate.cumtrapz(density_values, lams) 32 | accumulated_masses = singular_mass * (lams[1:] >= 0) + accumulated_densities 33 | return accumulated_masses 34 | 35 | def __repr__(self): 36 | return self.M.__repr__() 37 | 38 | def display_expected_cumulative_spectral_distribution( 39 | self, ax, precision=PRECISION, **plot_kwargs): 40 | lams = self.generate_lams(self.max_lam + precision, precision) 41 | 42 | expected_csds = self.expected_cumulative_spectral_distribution( 43 | self.max_lam + precision, precision, accumulate=True) 44 | 45 | ax.plot(lams[1:], expected_csds, **plot_kwargs) 46 | 47 | return ax 48 | 49 | def generate_lams(self, lam, precision=PRECISION): 50 | return np.arange(self.min_lam - 2 * precision, lam + precision, precision) 51 | 52 | 53 | class SymmetricWigner(RandomMatrix): 54 | 55 | def __init__(self, N): 56 | super().__init__() 57 | self.symmetric = True 58 | self.generate = self.generate_symmetric_gaussian 59 | self.M = self.generate(N) 60 | self.min_lam = -2. 61 | self.max_lam = 2. 62 | 63 | @staticmethod 64 | def generate_symmetric_gaussian(N): 65 | """generate an N by N symmetric gaussian random matrix with variance 1/N 66 | """ 67 | base_matrix = SymmetricWigner.generate_gaussian(N) 68 | return (1 / np.sqrt(2)) * (base_matrix + base_matrix.T) 69 | 70 | @staticmethod 71 | def generate_gaussian(N): 72 | """generate an N by N gaussian random matrix with variance 1/N 73 | """ 74 | return 1 / np.sqrt(N) * np.random.standard_normal(size=(N, N)) 75 | 76 | def expected_spectral_singular_mass(self): 77 | return 0. 78 | 79 | def expected_spectral_density(self, lam): 80 | """Expected density for a symmetric gaussian random matrix with variance 1/N""" 81 | if lam > self.max_lam or lam < self.min_lam: 82 | return 0 83 | else: 84 | return 1 / (2 * np.pi) * np.sqrt(2 ** 2 - lam ** 2) 85 | 86 | 87 | class Wishart(RandomMatrix): 88 | 89 | def __init__(self, N, k, negative=False): 90 | super().__init__() 91 | self.symmetric = True 92 | if negative: 93 | self.generate = self.generate_negative_wishart 94 | self.sign = -1 95 | else: 96 | self.generate = self.generate_wishart 97 | self.sign = 1 98 | 99 | self.N, self.k = N, k 100 | self.M = self.generate(self.N, self.k) 101 | self.sigma = 1. 
102 | 103 | self.central_lam = self.sign * N / k 104 | self.scaling_factor = 1 / (2 * np.pi * self.sigma ** 2) 105 | 106 | self.lam_plus = self.sigma ** 2 * self.sign * (1 + np.sqrt(self.central_lam)) ** 2 107 | self.lam_minus = self.sigma ** 2 * self.sign * (1 - np.sqrt(self.central_lam)) ** 2 108 | 109 | if negative: 110 | self.max_lam = 0. 111 | self.min_lam = self.lam_plus 112 | else: 113 | self.max_lam = self.lam_plus 114 | self.min_lam = 0. 115 | 116 | self.expected_spectral_density = self.marchenkopastur_density 117 | 118 | @staticmethod 119 | def generate_wishart(N, k=1): 120 | """generate an N by N wishart random matrix with rank min(N,k) 121 | """ 122 | self_outer_product = lambda x: x.dot(x.T) 123 | random_factor = np.random.standard_normal(size=(N, k)) 124 | wishart_random_matrix = 1 / k * self_outer_product(random_factor) 125 | 126 | return wishart_random_matrix 127 | 128 | @staticmethod 129 | def generate_negative_wishart(N, k=1): 130 | """generate an N by N negative wishart random matric with rank min(N,k) 131 | """ 132 | wishart_random_matrix = Wishart.generate_wishart(N, k) 133 | negative_wishart_random_matrix = -1 * wishart_random_matrix 134 | 135 | return negative_wishart_random_matrix 136 | 137 | def marchenkopastur_density(self, lam): 138 | """the density for the non-singular portion of the marchenko-pastur distribution, 139 | as given by https://en.wikipedia.org/wiki/Marchenko-Pastur_distribution. 140 | """ 141 | 142 | # density is 0 on real half-line opposite its sign 143 | if np.sign(lam) != self.sign: 144 | return 0 145 | 146 | # that handled, we can solve as though lam were positive, since density invariant 147 | lam = np.abs(lam) 148 | lam_minus = self.sign * self.lam_minus 149 | lam_plus = self.sign * self.lam_plus 150 | 151 | if (lam > lam_minus and lam < lam_plus): 152 | unscaled_density = np.sqrt( 153 | (lam_plus - lam) * (lam - lam_minus)) / (self.central_lam * lam) 154 | return self.scaling_factor * unscaled_density 155 | else: 156 | return 0 157 | 158 | def expected_spectral_singular_mass(self): 159 | return max(1 - self.k / self.N, 0) 160 | 161 | 162 | def generate_random_unit_vector(dim=25): 163 | gauss_random_vector = np.atleast_2d(np.random.standard_normal(size=dim)).T 164 | return gauss_random_vector / np.linalg.norm(gauss_random_vector) 165 | -------------------------------------------------------------------------------- /autocrit/nn/networks.py: -------------------------------------------------------------------------------- 1 | """Provides neural networks composed of layers from nn.layers module. 2 | 3 | Generic sequential networks are implemented by the Network class, 4 | while the traditional fully-connected network is provided by FullyConnected. 5 | """ 6 | from collections import namedtuple 7 | import json 8 | 9 | import autograd 10 | from autograd import numpy as np 11 | 12 | from . 
import layers as nn_layers 13 | from autocrit.utils import random_matrix 14 | from autocrit.utils import math 15 | 16 | _LAYERS = nn_layers._LAYERS 17 | 18 | _COSTS = {"mean_squared_error": math.mean_squared_error, 19 | "softmax_cross_entropy": math.softmax_cross_entropy} 20 | 21 | 22 | def l2_regularizer(theta): 23 | return np.mean(np.square(theta)) 24 | 25 | 26 | def l1_regularizer(theta): 27 | return np.mean(np.abs(theta)) 28 | 29 | 30 | _REGULARIZERS = {"l2": l2_regularizer, 31 | "l1": l1_regularizer, 32 | "none": lambda x: 0.} 33 | 34 | 35 | Data = namedtuple("Data", ['x', 'y']) 36 | 37 | 38 | class Network(object): 39 | 40 | def __init__(self, data, layer_specs, cost_str="mean_squared_error", 41 | regularizer_str="none", regularization_parameter=0., 42 | batch_size=None): 43 | if not isinstance(data, Data): 44 | try: 45 | data = Data(x=data[0], y=data[1]) 46 | except IndexError: 47 | raise ValueError("data argument not understood") 48 | 49 | self.data = data 50 | 51 | if batch_size is None: 52 | self.batch_size = self.data.x.shape[-1] 53 | else: 54 | self.batch_size = batch_size 55 | 56 | self.cost_str = cost_str 57 | self.regularizer_str = regularizer_str 58 | 59 | self.cost = _COSTS[self.cost_str] 60 | self.regularizer = _REGULARIZERS[self.regularizer_str] 61 | 62 | self.regularization_parameter = regularization_parameter 63 | 64 | self.layer_specs = layer_specs 65 | self.layers = [] 66 | for layer_spec in self.layer_specs: 67 | if not isinstance(layer_spec, nn_layers.Layer): 68 | layer_constructor = _LAYERS[layer_spec["type"]] 69 | layer = layer_constructor(**layer_spec["params"]) 70 | else: 71 | layer = layer_spec 72 | self.layers.append(layer) 73 | 74 | self.N_params, _ = self.build() 75 | 76 | self.grad = autograd.grad(self.loss) 77 | self.hess = autograd.hessian(self.loss) 78 | 79 | def loss(self, theta): 80 | return self.loss_on_batch(self.data.x, self.data.y, theta) 81 | 82 | def loss_on_batch(self, batch_x, batch_y, theta): 83 | return (self.cost(self.forward_pass(batch_x, theta), batch_y) + 84 | self.regularization_parameter * self.regularizer(theta)) 85 | 86 | def loss_on_random_batch(self, theta, batch_size=None): 87 | """Loss on a randomly selected batch of size batch_size. 88 | Defaults to self.batch_size, which itself defaults to full-batch.
89 | """ 90 | if batch_size is None: 91 | batch_size = self.batch_size 92 | dataset_size = self.data.x.shape[-1] 93 | 94 | if dataset_size == batch_size: 95 | batch_x, batch_y = self.data.x, self.data.y 96 | else: 97 | batch_idxs = np.random.choice(dataset_size, size=batch_size) 98 | batch_x, batch_y = self.data.x[..., batch_idxs], self.data.y[..., batch_idxs] 99 | 100 | return self.loss_on_batch(batch_x, batch_y, theta) 101 | 102 | def forward_pass(self, x, theta): 103 | y = x 104 | for layer in self.layers: 105 | params = self.parser.get(theta, layer) 106 | y = layer.forward_pass(y, params) 107 | return y 108 | 109 | def build(self): 110 | self.parser = nn_layers.ParamParser() 111 | 112 | shape = self.data.x.shape 113 | for layer in self.layers: 114 | N_params, shape = layer.build(shape) 115 | self.parser.add_params(layer, (N_params)) 116 | 117 | return self.parser.N, shape 118 | 119 | def to_json(self, filename): 120 | dictionary = self.construct_dict() 121 | with open(filename, "w") as f: 122 | json.dump(dictionary, f) 123 | 124 | @classmethod 125 | def from_json(cls, data, filename): 126 | with open(filename) as f: 127 | dictionary = json.load(f) 128 | return cls(data, **dictionary) 129 | 130 | def construct_dict(self): 131 | self.layer_dicts = [layer.to_dict() for layer in self.layers] 132 | 133 | return {"layer_specs": self.layer_dicts, 134 | "cost_str": self.cost_str, 135 | "regularizer_str": self.regularizer_str, 136 | "regularization_parameter": self.regularization_parameter, 137 | "batch_size": self.batch_size} 138 | 139 | def initialize(self): 140 | return 1 / np.sqrt(self.N_params) * np.random.standard_normal(size=[self.N_params, 1]) 141 | 142 | 143 | class FullyConnected(Network): 144 | 145 | def __init__(self, data, layer_sizes, cost_str="mean_squared_error", nonlinearity_str="relu", 146 | regularizer_str="none", regularization_parameter=0., has_biases=True, 147 | batch_size=None): 148 | self.layer_sizes = layer_sizes 149 | self.has_biases = has_biases 150 | self.nonlinearity_str = nonlinearity_str 151 | layers = [] 152 | for layer_size in self.layer_sizes: 153 | assert isinstance(layer_size, int) 154 | layers.append(nn_layers.FCLayer(layer_size, self.has_biases)) 155 | layers.append(_LAYERS["pointwise_nonlinear"](self.nonlinearity_str)) 156 | 157 | if self.has_biases: 158 | self.num_biases = sum(self.layer_sizes) 159 | else: 160 | self.num_biases = 0 161 | 162 | Network.__init__(self, data, layers, cost_str, regularizer_str, 163 | regularization_parameter, batch_size=batch_size) 164 | 165 | def initialize(self, weight_kwargs=None, bias_kwargs=None): 166 | if weight_kwargs is None: 167 | weight_kwargs = {} 168 | if bias_kwargs is None: 169 | bias_kwargs = {} 170 | 171 | init_weights = self.initialize_weights(**weight_kwargs) 172 | init_biases = self.initialize_biases(**bias_kwargs) 173 | 174 | return np.atleast_2d(np.concatenate([init_weights, init_biases])).T 175 | 176 | def initialize_weights(self): 177 | in_sizes = [self.data.x.shape[0]] + self.layer_sizes[:-1] 178 | out_sizes = self.layer_sizes 179 | weight_matrices = [self.initialize_weight_matrix(in_size, out_size) 180 | for in_size, out_size in zip(in_sizes, out_sizes)] 181 | 182 | return np.concatenate([weight_matrix.ravel() 183 | for weight_matrix in weight_matrices]) 184 | 185 | def initialize_biases(self, constant=0.01): 186 | return np.asarray([constant] * self.num_biases) 187 | 188 | def initialize_weight_matrix(self, in_size, out_size): 189 | weight_matrix = 
np.asarray([random_matrix.generate_random_unit_vector(dim=in_size) 190 | for _ in range(out_size)]).squeeze() 191 | return weight_matrix 192 | 193 | def construct_dict(self): 194 | 195 | return {"layer_sizes": self.layer_sizes, 196 | "cost_str": self.cost_str, 197 | "nonlinearity_str": self.nonlinearity_str, 198 | "regularizer_str": self.regularizer_str, 199 | "regularization_parameter": self.regularization_parameter, 200 | "has_biases": self.has_biases} 201 | -------------------------------------------------------------------------------- /autocrit/experiments.py: -------------------------------------------------------------------------------- 1 | """Provides Experiment objects, which apply an optimization algorithm 2 | or a critical point-finding algorithm to a function. 3 | 4 | If the function is autograd-differentiable, the gradient oracle is 5 | computed automatically. If that's insufficient, a gradient oracle can be 6 | directly provided as the grad_f argument. 7 | """ 8 | import json 9 | import random 10 | 11 | import autograd 12 | import autograd.numpy as np 13 | 14 | from . import finders 15 | from . import nn 16 | from . import optimizers 17 | 18 | SEED = 14 19 | 20 | _NETWORK_INITS = {"fullyconnected": nn.networks.FullyConnected} 21 | 22 | _OPTIMIZERS = {"gd": optimizers.GradientDescentOptimizer, 23 | "momentum": optimizers.MomentumOptimizer, 24 | "btls": optimizers.BackTrackingLineSearchOptimizer} 25 | 26 | _FINDER_INITS = {"newtonMR": finders.newtons.FastNewtonMR, 27 | "newtonTR": finders.newtons.FastNewtonTR, 28 | "gnm": finders.gradnormmin.GradientNormMinimizer} 29 | 30 | DEFAULT_LOG_KWARGS = {"track_theta": True, "track_f": True, "track_grad_f": False} 31 | 32 | 33 | class Experiment(object): 34 | """Abstract base class for OptimizationExperiments and CritFinderExperiments. 35 | 36 | Concrete classes should implement a .run method that executes the experiment 37 | and stores the results of runs in self.runs, a list. These should be save-able 38 | into .npz format by np.savez. 39 | 40 | They should further implement a construct_dictionary method that saves 41 | all of the relevant arguments necessary for a constructor call as a dictionary 42 | that can be written to a .json file. These .json files are used to reconstruct 43 | experiments and their components. 44 | """ 45 | 46 | def __init__(self, seed=None): 47 | """ 48 | Parameters 49 | ---------- 50 | 51 | seed : int or None, default is None 52 | Seeding value for random and np.random. 53 | If None, defaults to global variable SEED. 54 | """ 55 | if seed is None: 56 | self.seed = SEED 57 | else: 58 | self.seed = seed 59 | 60 | self.runs = [] 61 | 62 | def to_json(self, filename): 63 | dictionary = self.construct_dictionary() 64 | 65 | with open(filename, "w") as f: 66 | json.dump(dictionary, f) 67 | 68 | def save_results(self, filename): 69 | results_dict = self.runs[-1] 70 | np.savez(filename, **results_dict) 71 | 72 | def construct_dictionary(self): 73 | raise NotImplementedError 74 | 75 | 76 | class OptimizationExperiment(Experiment): 77 | """Concrete Experiment that performs optimization on a function. 78 | """ 79 | 80 | def __init__(self, f, grad_f=None, optimizer_str="gd", optimizer_kwargs=None, 81 | log_kwargs=None, seed=None): 82 | """Create an OptimizationExperiment on callable f according to kwargs. 83 | 84 | Parameters 85 | ---------- 86 | 87 | f : callable 88 | Function to optimize. Should require only parameters as input. 89 | For stochastic functions, e.g. 
for stochastic gradient descent, 90 | function must perform batching. 91 | 92 | grad_f : callable or None, default is None 93 | A gradient oracle for f. If None, autograd.grad is called on f. 94 | 95 | optimizer_str : str 96 | String to key into _OPTIMIZERS. Default is "gd", which is 97 | optimizers.gradient_descent. 98 | 99 | optimizer_kwargs : dict or None, default is None 100 | A dictionary of keyword arguments for the optimizer selected with 101 | optimizer_str. See optimizers for call signatures. 102 | 103 | log_kwargs : dict or None, default is None 104 | A dictionary of keyword arguments for the log_run method, which 105 | determines which features of the run are saved. If None, 106 | DEFAULT_LOG_KWARGS is used. See log_run for details. 107 | 108 | seed : int or None, default is None 109 | Seeding value for random and np.random. 110 | If None, defaults to global variable SEED. 111 | """ 112 | Experiment.__init__(self, seed=seed) 113 | 114 | if log_kwargs is None: 115 | self.log_kwargs = DEFAULT_LOG_KWARGS.copy() 116 | else: 117 | self.log_kwargs = log_kwargs 118 | 119 | self.f = f 120 | self.grad_f = grad_f 121 | 122 | if self.grad_f is None: 123 | self.grad_f = autograd.grad(f) 124 | 125 | self.optimizer_str = optimizer_str 126 | self.optimizer = _OPTIMIZERS[self.optimizer_str] 127 | 128 | if optimizer_kwargs is None: 129 | self.optimizer_kwargs = {} 130 | else: 131 | self.optimizer_kwargs = optimizer_kwargs 132 | 133 | self.optimizer = _OPTIMIZERS[self.optimizer_str]( 134 | self.f, self.grad_f, **self.optimizer_kwargs) 135 | 136 | def run(self, init_theta, num_iters=1, seed=None): 137 | """Execute optimizer on self.f, starting with init_theta, for num_iters. 138 | 139 | Includes optional SEED argument to allow for stochastic behavior 140 | of stochastic functions f. 141 | Warning: this does not guarantee that f is non-stochastic across calls. 142 | """ 143 | if seed is None: 144 | seed = self.seed 145 | np.random.seed(seed) 146 | random.seed(seed) 147 | 148 | empty_run = {"theta": [], 149 | "f_theta": [], 150 | "grad_f_theta": [], 151 | "g_theta": []} 152 | self.runs.append(empty_run) 153 | 154 | theta = init_theta 155 | self.log_step(theta, **self.log_kwargs) 156 | 157 | for _ in range(num_iters): 158 | theta = theta + self.optimizer.update(theta) 159 | self.log_step(theta, **self.log_kwargs) 160 | 161 | return theta 162 | 163 | def log_step(self, theta, 164 | track_theta=False, track_f=False, track_grad_f=False, track_g=False): 165 | """Append selected values to run dictionary 166 | """ 167 | run = self.runs[-1] 168 | if track_theta: 169 | run["theta"].append(theta) 170 | if track_f: 171 | run["f_theta"].append(self.f(theta)) 172 | if track_grad_f: 173 | run["grad_f_theta"].append(self.grad_f(theta)) 174 | if track_g: 175 | run["g_theta"].append(0.5 * np.sum(np.square(self.grad_f(theta)))) 176 | 177 | @classmethod 178 | def from_json(cls, f, filename, grad_f=None): 179 | """Given a function and possibly a gradient oracle and the path to a .json file, 180 | creates an OptimizationExperiment on f using kwargs in the .json file. 181 | """ 182 | with open(filename) as fn: 183 | dictionary = json.load(fn) 184 | 185 | return cls(f, grad_f, **dictionary) 186 | 187 | def construct_dictionary(self): 188 | """Construct a dictionary containing necessary information for 189 | reconstructing OptimizationExperiment when combined with self.f. 190 | 191 | See OptimizationExperiment.from_json for details. 
192 | """ 193 | return {"optimizer_str": self.optimizer_str, 194 | "optimizer_kwargs": self.optimizer_kwargs, 195 | "log_kwargs": self.log_kwargs, 196 | "seed": self.seed} 197 | 198 | 199 | class CritFinderExperiment(Experiment): 200 | """Concrete Experiment that finds critical points on a function. 201 | """ 202 | 203 | def __init__(self, f, finder_str, finder_kwargs=None): 204 | """ 205 | 206 | Parameters 207 | ---------- 208 | 209 | f : callable 210 | Function to search on. Should require only parameters as input. 211 | For stochastic functions, function must perform batching. 212 | 213 | finder_str : str 214 | String to key into _FINDER_INITS. Identifies the critical point- 215 | finding algorithm to use. 216 | 217 | finder_kwargs: dict or None, default is None 218 | Dictionary with keyword arguments to provide to self.finder_init. 219 | If None, an empty dictionary is used. 220 | 221 | seed : int or None, default is None 222 | Seeding value for random and np.random. 223 | If None, defaults to global variable SEED. 224 | """ 225 | Experiment.__init__(self) 226 | self.f = f 227 | 228 | self.finder_str = finder_str 229 | 230 | if finder_kwargs is None: 231 | self.finder_kwargs = {} 232 | else: 233 | self.finder_kwargs = finder_kwargs 234 | 235 | if "log_kwargs" not in self.finder_kwargs.keys(): 236 | self.finder_kwargs.update({"log_kwargs": DEFAULT_LOG_KWARGS.copy()}) 237 | 238 | self.finder_init = _FINDER_INITS[self.finder_str] 239 | 240 | self.finder = self.finder_init(self.f, **self.finder_kwargs) 241 | 242 | def run(self, init_theta, num_iters=1, seed=None): 243 | """Execute finder on self.f, starting with init_theta, for num_iters. 244 | """ 245 | if seed is None: 246 | seed = self.seed 247 | np.random.seed(seed) 248 | random.seed(seed) 249 | 250 | self.finder.log = {} 251 | thetas = self.finder.run(init_theta, num_iters) 252 | self.runs.append(self.finder.log) 253 | return thetas 254 | 255 | @classmethod 256 | def from_json(cls, f, filename): 257 | """Given a function f and the path to a .json file, 258 | creates a CritFinderExperiment for f using kwargs in the .json file. 259 | """ 260 | with open(filename) as fn: 261 | dictionary = json.load(fn) 262 | 263 | return cls(f, **dictionary) 264 | 265 | def construct_dictionary(self): 266 | """Construct a dictionary containing necessary information for 267 | reconstructing CritFinderExperiment when combined with self.f. 268 | 269 | See CritFinderExperiment.from_json for details. 270 | """ 271 | dictionary = {"finder_kwargs": self.finder_kwargs, 272 | "finder_str": self.finder_str} 273 | return dictionary 274 | 275 | def uniform(self, thetas): 276 | """Select a theta at random from list thetas. 277 | """ 278 | return random.choice(thetas) 279 | 280 | def uniform_f(self, thetas): 281 | """Select a theta from thetas uniformly across values of self.f. 282 | 283 | This can be slow. Overwrite this method by calling freeze_uniform_f 284 | if this function needs to be called multiple times. 285 | """ 286 | return self.uniform_cd(*self.sort_and_calculate_cds(thetas, self.f)) 287 | 288 | def freeze_uniform_f(self, thetas): 289 | """Overwrites self.uniform_f with a function that has pre-computed 290 | the sorted version of thetas and the cumulative densities, supporting 291 | much faster random selection. 
292 | """ 293 | sorted_thetas, cds = self.sort_and_calculate_cds(thetas, self.f) 294 | self.uniform_f = lambda thetas: self.uniform_cd(sorted_thetas, cds) 295 | 296 | @staticmethod 297 | def sort_and_calculate_cds(thetas, f): 298 | f_thetas = [f(theta) for theta in thetas] 299 | min_f, max_f = min(f_thetas), max(f_thetas) 300 | cds = [(f_theta - min_f) / (max_f - min_f) for f_theta in f_thetas] 301 | thetas, cds = zip(*sorted(zip(thetas, cds), key=lambda tup: tup[1])) 302 | return thetas, cds 303 | 304 | @staticmethod 305 | def uniform_cd(sorted_thetas, cds): 306 | """Select randomly from sorted_thetas with respect to the cumulative 307 | density implied by cds, an equal-length list of cumulative density values 308 | for each element in sorted_thetas. 309 | """ 310 | rand_cd = random.uniform(0, 1) 311 | idx = next(filter(lambda tup: tup[1] >= rand_cd, enumerate(cds)))[0] 312 | return sorted_thetas[idx] 313 | -------------------------------------------------------------------------------- /autocrit/finders/newtons.py: -------------------------------------------------------------------------------- 1 | """Provides Newton-style methods for finding critical points. 2 | """ 3 | # import warnings 4 | 5 | import autograd 6 | import autograd.numpy as np 7 | 8 | from .minresQLP import MinresQLP as mrqlp 9 | 10 | from .base import Finder, Logger 11 | from ..defaults import DEFAULT_STEP_SIZE, DEFAULT_RTOL, DEFAULT_MAXIT 12 | from ..defaults import DEFAULT_ALPHA, DEFAULT_BETA, DEFAULT_GAMMAS, DEFAULT_RHO, DEFAULT_RHO_PURE 13 | 14 | DEFAULT_ACONDLIM = 1e7 15 | DEFAULT_MAXXNORM = 1e4 16 | DEFAULT_TRANCOND = 1e4 17 | 18 | 19 | class NewtonMethod(Finder): 20 | """Base version of Newton method for finding critical points. 21 | 22 | All Newton methods are run the same way: select an update direction or directions, 23 | and then the current value of theta and the update direction(s) are used to select an update. 24 | 25 | Those two steps are implemented here as the methods get_update_direction, 26 | which inverts the Hessian and multiplies it with the negative gradient, 27 | and select_update, which scales the result by the step_size. 28 | 29 | Additional Newton methods are defined by over-riding those two methods. 30 | """ 31 | 32 | def __init__(self, f, step_size=DEFAULT_STEP_SIZE, log_kwargs=None): 33 | Finder.__init__(self, f, log_kwargs=log_kwargs) 34 | 35 | self.step_size = step_size 36 | 37 | self.parameters = {"step_size": step_size} 38 | 39 | def run(self, init_theta, num_iters=1): 40 | theta = init_theta 41 | self.update_logs({"theta": theta, 42 | "update_direction": None, 43 | "parameters": self.parameters}) 44 | 45 | for ii in range(num_iters): 46 | 47 | update_direction = self.get_update_direction(theta) 48 | theta_new = self.select_update(theta, update_direction) 49 | 50 | self.update_logs({"theta": theta_new, 51 | "update_direction": update_direction, 52 | "parameters": self.parameters}) 53 | 54 | if np.array_equal(theta, theta_new): 55 | return theta 56 | 57 | theta = theta_new 58 | 59 | return theta 60 | 61 | def get_update_direction(self, theta): 62 | """Compute an update direction using the classic Newton-Raphson method: 63 | compute the Hessian at theta, invert it explicitly, and then apply that matrix 64 | to the negative gradient. 65 | """ 66 | update_direction = -np.linalg.inv(self.H(theta)).dot(self.grad_f(theta)) 67 | return update_direction 68 | 69 | def select_update(self, theta, update_direction): 70 | """Select the update along update direction using a fixed step size. 
71 | """ 72 | return theta + self.step_size * update_direction 73 | 74 | def squared_grad_norm(self, theta): 75 | return np.sum(np.square(self.grad_f(theta))) 76 | 77 | 78 | class NewtonPI(NewtonMethod): 79 | """Newton method that uses Moore-Penrose pseudo-inversion of the Hessian instead of 80 | classic inversion, for use in problems with singular Hessians. 81 | """ 82 | 83 | def __init__(self, f, step_size=DEFAULT_STEP_SIZE, log_kwargs=None): 84 | NewtonMethod.__init__(self, f, step_size=step_size, log_kwargs=log_kwargs) 85 | self.pinv = np.linalg.pinv 86 | 87 | def get_update_direction(self, theta): 88 | update_direction = -self.pinv(self.H(theta)).dot(self.grad_f(theta)) 89 | return update_direction 90 | 91 | 92 | class NewtonBTLS(NewtonMethod): 93 | """Newton method that uses back-tracking line search to select the update. 94 | Convergence is checked using the Roosta criterion. 95 | """ 96 | 97 | def __init__(self, f, alpha=DEFAULT_ALPHA, beta=DEFAULT_BETA, rho=DEFAULT_RHO, 98 | check_pure=False, rho_pure=DEFAULT_RHO_PURE, log_kwargs=None): 99 | NewtonMethod.__init__(self, f, log_kwargs=log_kwargs) 100 | self.alpha = alpha 101 | self.beta = beta 102 | self.rho = rho 103 | 104 | self.check_pure = check_pure 105 | self.rho_pure = rho_pure 106 | self.pure_accepted = False 107 | 108 | self.parameters.update({"alpha": self.alpha, 109 | "pure_accepted": self.pure_accepted}) 110 | 111 | self.loggers.append( 112 | Logger("alpha", 113 | lambda step_info: step_info["parameters"]["alpha"])) 114 | 115 | if self.check_pure: 116 | self.loggers.append( 117 | Logger("pure_accepted", 118 | lambda step_info: step_info["parameters"]["pure_accepted"])) 119 | 120 | self.min_step_size = self.compute_min_step_size(alpha, beta) 121 | 122 | def select_update(self, theta, update_direction): 123 | if self.check_pure and self.alpha != 1: 124 | converged = self.check_convergence(theta, update_direction, 1., self.rho_pure) 125 | if converged: 126 | self.alpha = 1. 127 | self.pure_accepted = True 128 | else: 129 | converged = False 130 | self.pure_accepted = False 131 | 132 | while not converged: 133 | converged = self.check_convergence(theta, update_direction, self.alpha, self.rho) 134 | 135 | if not converged: 136 | self.alpha *= self.beta 137 | if self.alpha <= self.min_step_size: 138 | return np.zeros_like(theta) 139 | 140 | update = theta + self.alpha * update_direction 141 | 142 | self.parameters.update( 143 | {"alpha": self.alpha, 144 | "pure_accepted": self.pure_accepted}) 145 | 146 | self.alpha = min(1., self.alpha / self.beta) 147 | return update 148 | 149 | def check_convergence(self, theta, update_direction, alpha, rho): 150 | proposed_update = theta + alpha * update_direction 151 | updated_squared_gradient_norm = self.squared_grad_norm(proposed_update) 152 | current_squared_gradient_norm = self.squared_grad_norm(theta) 153 | sufficient_decrease = 2 * rho * alpha * np.dot(self.hvp(theta, update_direction).T, 154 | self.grad_f(theta)) 155 | 156 | return (updated_squared_gradient_norm <= 157 | current_squared_gradient_norm + sufficient_decrease) 158 | 159 | @staticmethod 160 | def compute_min_step_size(alpha, beta): 161 | while alpha * beta != alpha: 162 | alpha *= beta 163 | return alpha 164 | 165 | 166 | class NewtonMR(NewtonBTLS): 167 | """Newton method that uses MRQLP to approximately compute the update direction 168 | and back-tracking line search to select the update. 
169 | """ 170 | 171 | def __init__(self, f, alpha=DEFAULT_ALPHA, beta=DEFAULT_BETA, rho=DEFAULT_RHO, 172 | check_pure=False, rho_pure=DEFAULT_RHO_PURE, 173 | rtol=DEFAULT_RTOL, maxit=DEFAULT_MAXIT, 174 | acondlim=DEFAULT_ACONDLIM, trancond=DEFAULT_TRANCOND, 175 | maxxnorm=DEFAULT_MAXXNORM, 176 | log_mrqlp=False, log_kwargs=None): 177 | NewtonBTLS.__init__(self, f, alpha, beta, rho, check_pure, rho_pure, 178 | log_kwargs=log_kwargs) 179 | self.rtol = rtol 180 | self.maxit = maxit 181 | self.acondlim = acondlim 182 | self.trancond = trancond 183 | self.maxxnorm = maxxnorm 184 | 185 | self.parameters.update({"rtol": rtol, 186 | "maxit": maxit, 187 | "acondlim": acondlim, 188 | "trancond": trancond, 189 | "maxxnorm": maxxnorm}) 190 | 191 | self.log_mrqlp = log_mrqlp 192 | 193 | if self.log_mrqlp: 194 | self.loggers.append( 195 | Logger("mrqlp_outputs", 196 | lambda step_info: step_info["parameters"]["mrqlp_outputs"])) 197 | self.parameters.update({"mrqlp_outputs": None}) 198 | 199 | def get_update_direction(self, theta): 200 | current_hvp = lambda v: self.hvp(theta, v) 201 | mrqlp_outputs = mrqlp( 202 | current_hvp, -1 * self.grad_f(theta), 203 | rtol=self.rtol, maxit=self.maxit, 204 | acondlim=self.acondlim, trancond=self.trancond, maxxnorm=self.maxxnorm) 205 | 206 | self.parameters.update({"mrqlp_outputs": mrqlp_outputs[1:]}) 207 | mr_update_direction = mrqlp_outputs[0] 208 | 209 | return mr_update_direction 210 | 211 | 212 | class FastNewtonMR(NewtonMR): 213 | """Newton method that uses MRQLP to approximately compute the update direction. 214 | Makes use of fast Hessian-vector products. 215 | """ 216 | 217 | def __init__(self, f, alpha=DEFAULT_ALPHA, beta=DEFAULT_BETA, rho=DEFAULT_RHO, 218 | check_pure=False, rho_pure=DEFAULT_RHO_PURE, 219 | rtol=DEFAULT_RTOL, maxit=DEFAULT_MAXIT, 220 | acondlim=DEFAULT_ACONDLIM, trancond=DEFAULT_TRANCOND, 221 | maxxnorm=DEFAULT_MAXXNORM, 222 | log_mrqlp=False, log_kwargs=None): 223 | NewtonMR.__init__(self, f, alpha, beta, rho, check_pure, rho_pure, 224 | rtol=rtol, maxit=maxit, acondlim=acondlim, 225 | maxxnorm=maxxnorm, trancond=trancond, 226 | log_mrqlp=log_mrqlp, log_kwargs=log_kwargs) 227 | self.hvp = autograd.hessian_vector_product(self.f) 228 | 229 | 230 | class NewtonTR(NewtonPI): 231 | """Newton method that computes a sequence of proposed updates using the pseudo-inverse of 232 | a sequence of perturbed versions of the Hessian. The perturbations are diagonal matrices with 233 | varying values gamma. Equivalent to a trust region approach. 
234 | """ 235 | 236 | def __init__(self, f, gammas=DEFAULT_GAMMAS, step_size=DEFAULT_STEP_SIZE, log_kwargs=None): 237 | NewtonPI.__init__(self, f, step_size=step_size, log_kwargs=log_kwargs) 238 | self.gammas = gammas 239 | self.Hs = [lambda theta, gamma=gamma: self.H(theta) + np.diag([gamma] * theta.shape[0]) 240 | for gamma in gammas] 241 | 242 | self.parameters.update({"gammas": gammas}) 243 | 244 | def get_update_direction(self, theta): 245 | update_directions = [] 246 | 247 | for H in self.Hs: 248 | update_directions.append(-self.pinv(H(theta)) 249 | .dot(self.grad_f(theta))) 250 | 251 | return update_directions 252 | 253 | def select_update(self, theta, update_directions): 254 | best_update = theta 255 | best_grad_norm = self.squared_grad_norm(best_update) 256 | for update_direction in update_directions: 257 | proposed_update = theta + self.step_size * update_direction 258 | if self.squared_grad_norm(proposed_update) < best_grad_norm: 259 | best_update = proposed_update 260 | 261 | return best_update 262 | 263 | 264 | class FastNewtonTR(NewtonTR): 265 | """Newton method that computes a sequence of proposed updates by applying MRQLP to 266 | a sequence of perturbed versions of the Hessian. The perturbations are diagonal matrices with 267 | varying values gamma. Equivalent to a trust region approach. 268 | Makes use of fast Hessian-vector products. 269 | """ 270 | 271 | def __init__(self, f, gammas=DEFAULT_GAMMAS, step_size=DEFAULT_STEP_SIZE, log_kwargs=None, 272 | rtol=DEFAULT_RTOL, maxit=DEFAULT_MAXIT): 273 | NewtonTR.__init__(self, f, gammas, step_size=step_size, log_kwargs=log_kwargs) 274 | self.rtol = rtol 275 | self.maxit = maxit 276 | 277 | self.hvps = [lambda theta, v, gamma=gamma: autograd.hessian_vector_product(self.f)(theta, v) + 278 | gamma * v for gamma in gammas] 279 | 280 | def get_update_direction(self, theta): 281 | update_directions = [] 282 | current_hvps = [lambda v, hvp=hvp: hvp(theta, v) for hvp in self.hvps] 283 | 284 | for current_hvp in current_hvps: 285 | mr_update_direction = mrqlp(current_hvp, -1 * self.grad_f(theta), 286 | rtol=self.rtol, maxit=self.maxit)[0] 287 | update_directions.append(mr_update_direction) 288 | 289 | return update_directions 290 | -------------------------------------------------------------------------------- /autocrit/nn/layers.py: -------------------------------------------------------------------------------- 1 | # modified from code in autograd/examples/convnet.py 2 | 3 | import autograd.numpy as np 4 | 5 | from autocrit.nn.conv import convolve, torch_accelerated 6 | from autocrit.utils import math 7 | 8 | _NONLINEARITIES = {"relu": math.relu, 9 | "sigmoid": math.sigmoid, 10 | "softplus": math.softplus, 11 | "swish": math.swish, 12 | "none": lambda x: x} 13 | 14 | 15 | class ParamParser(object): 16 | """A helper class to index into a parameter vector.""" 17 | def __init__(self): 18 | self.idxs_and_shapes = {} 19 | self.N = 0 20 | 21 | def add_params(self, name, shape): 22 | start = self.N 23 | self.N += np.prod(shape) 24 | self.idxs_and_shapes[name] = (slice(start, self.N), shape) 25 | 26 | def get(self, vect, name): 27 | idxs, shape = self.idxs_and_shapes[name] 28 | return np.reshape(vect[idxs], shape) 29 | 30 | 31 | class Layer(object): 32 | """A Layer implements two methods: 33 | forward_pass, which takes inputs and a parameter vector and returns outputs, 34 | and build, which takes the input_shape and computes the number of 35 | parameters and the shape of the outputs, optionally also 36 | using a ParamParser to track those parameters.
37 | """ 38 | 39 | def __init__(self): 40 | pass 41 | 42 | def to_batch_major(self, inputs): 43 | """Reorder [y, x, channels, batch] 44 | to [batch, channels, y, x] 45 | """ 46 | return np.moveaxis(inputs, [0, 1, 2, 3], [2, 3, 1, 0]) 47 | 48 | def to_batch_minor(self, inputs): 49 | """Reorder [batch, channels, y, x] 50 | to [y, x, channels, batch] 51 | """ 52 | return np.moveaxis(inputs, [0, 1, 2, 3], [3, 2, 0, 1]) 53 | 54 | def forward_pass(self, inputs, theta): 55 | raise NotImplementedError 56 | 57 | def build(self, input_shape): 58 | raise NotImplementedError 59 | 60 | def to_dict(self, str, params): 61 | """Convert Layer to a dictionary representation. 62 | """ 63 | return {"type": str, "params": params} 64 | 65 | 66 | class PointwiseNonlinearLayer(Layer): 67 | """Layer for applying the same nonlinear function to each node, 68 | aka pointwise. 69 | 70 | Any callable can be provided as the nonlinearity, but the layer 71 | can only be represented by a dictionary if the nonlinearity is provided 72 | as a string, used to key into the _NONLINEARITIES dictionary. 73 | """ 74 | str = "pointwise_nonlinear" 75 | 76 | def __init__(self, nonlinearity): 77 | """ 78 | Parameters 79 | ---------- 80 | nonlinearity: str or callable. pointwise nonlinear transformation. 81 | if is a str instance, used to key into _NONLINEARITIES dictionary. 82 | if is callable, directly called as function applied by this layer. 83 | it is assumed but not checked that this function doesn't change the shape. 84 | """ 85 | if isinstance(nonlinearity, str): 86 | self.nonlinearity_str = nonlinearity 87 | nonlinearity = _NONLINEARITIES[nonlinearity] 88 | else: 89 | assert callable(nonlinearity) 90 | self.nonlinearity = nonlinearity 91 | 92 | def forward_pass(self, inputs, theta): 93 | return self.nonlinearity(inputs) 94 | 95 | def build(self, input_shape): 96 | return 0, input_shape 97 | 98 | def to_dict(self): 99 | assert hasattr(self, "nonlinearity_str"), "can't save nonlinear layer without str" 100 | params = {"nonlinearity": self.nonlinearity_str} 101 | return super().to_dict(self.str, params) 102 | 103 | 104 | class FCLayer(Layer): 105 | """Layer for applying an affine transformation to the inputs. 106 | """ 107 | str = "fc" 108 | 109 | def __init__(self, out_nodes, has_biases=True): 110 | """ 111 | Parameters 112 | ---------- 113 | out_nodes: int, number of nodes in the output layer. 114 | has_biases: bool, if False, linear transform. otherwise affine. 115 | """ 116 | self.out_nodes = out_nodes 117 | self.has_biases = has_biases 118 | 119 | def forward_pass(self, inputs, theta): 120 | W = self.parser.get(theta, 'weights') 121 | if self.has_biases: 122 | b = self.parser.get(theta, 'biases') 123 | else: 124 | b = 0. 125 | activations = np.dot(W, inputs) + b 126 | return activations 127 | 128 | def build(self, input_shape): 129 | self.parser = ParamParser() 130 | self.parser.add_params('weights', (self.out_nodes, input_shape[0])) 131 | if self.has_biases: 132 | self.parser.add_params('biases', (self.out_nodes, 1)) 133 | output_shape = (self.out_nodes, 1) 134 | 135 | return self.parser.N, output_shape 136 | 137 | def to_dict(self): 138 | params = {"out_nodes": self.out_nodes, 139 | "has_biases": self.has_biases} 140 | return super().to_dict(self.str, params) 141 | 142 | 143 | class ConvLayer(Layer): 144 | """Layer for applying a valid 2D convolution to inputs. 
145 | """ 146 | str = "conv" 147 | 148 | def __init__(self, kernel_shape, out_channels, accelerated=torch_accelerated): 149 | """ 150 | Parameters 151 | ---------- 152 | kernel_shape: tuple of ints, shape of convolutional kernel 153 | out_channels: int, number of output channels aka convolutional kernels 154 | accelerated: Boolean, use pytorch acceleration, if available. See conv.py 155 | """ 156 | self.kernel_shape = kernel_shape 157 | self.out_channels = out_channels 158 | self.accelerated = accelerated 159 | 160 | def forward_pass(self, inputs, theta): 161 | weights = self.parser.get(theta, 'weights') 162 | biases = self.parser.get(theta, 'biases') 163 | inputs = self.to_batch_major(inputs) 164 | conv = convolve(inputs, weights, 165 | axes=([2, 3], [2, 3]), dot_axes=([1], [0]), 166 | mode='valid', accelerated=self.accelerated) 167 | activations = conv + biases 168 | activations = self.to_batch_minor(activations) 169 | return activations 170 | 171 | def build(self, input_shape): 172 | self.parser = ParamParser() 173 | self.parser.add_params('weights', (input_shape[-2], self.out_channels) + 174 | self.kernel_shape) 175 | self.parser.add_params('biases', (1, self.out_channels, 1, 1)) 176 | output_shape = self.conv_output_shape(input_shape[:-1], self.kernel_shape) +\ 177 | (self.out_channels, input_shape[-1]) 178 | return self.parser.N, output_shape 179 | 180 | def conv_output_shape(self, A, B): 181 | return (A[0] - B[0] + 1, A[1] - B[1] + 1) 182 | 183 | def to_dict(self): 184 | params = {"kernel_shape": self.kernel_shape, 185 | "out_channels": self.out_channels} 186 | return super().to_dict(self.str, params) 187 | 188 | 189 | class PoolLayer(Layer): 190 | """Abstract class for Layers that applying pooling: summarizing 191 | a block of values in a feature map with a single number. 192 | 193 | Pooling shapes must evenly tile inputs. 
194 | """ 195 | 196 | def __init__(self, pool_shape): 197 | """ 198 | Parameters 199 | ---------- 200 | pool_shape: tuple of ints, shape of pooling kernel 201 | """ 202 | self.pool_shape = pool_shape 203 | 204 | def forward_pass(self, inputs, theta): 205 | patches = self.to_patches(inputs) 206 | patch_means = self.pool_func(patches) 207 | patch_means = self.to_batch_minor(patch_means) 208 | return patch_means 209 | 210 | def build(self, input_shape): 211 | output_shape = self.set_output_shapes(input_shape) 212 | return 0, output_shape 213 | 214 | def to_patches(self, inputs): 215 | self.set_patch_shapes(inputs.shape) 216 | channels, batch = inputs.shape[2:] 217 | inputs = self.to_batch_major(inputs) 218 | 219 | patched_shape = inputs.shape[:2] 220 | for patch_ct, pool_shape in zip(self.patch_yx, self.pool_shape): 221 | patched_shape += (patch_ct, pool_shape) 222 | 223 | patches = inputs.reshape(patched_shape) 224 | 225 | return patches 226 | 227 | def set_patch_shapes(self, input_shape): 228 | self.input_yx = input_shape[:2] 229 | self.patch_yx = np.floor_divide(self.input_yx, self.pool_shape) 230 | self.num_patches = np.prod(self.patch_yx) 231 | 232 | def set_output_shapes(self, input_shape): 233 | self.output_shape = list(input_shape) 234 | for i in [0, 1]: 235 | assert input_shape[i] % self.pool_shape[i] == 0, \ 236 | "pool shape should tile input exactly" 237 | self.output_shape[i] = input_shape[i] // self.pool_shape[i] 238 | return self.output_shape 239 | 240 | def pool_func(self, patches): 241 | return patches 242 | 243 | def to_dict(self, str, params): 244 | return super().to_dict(str, params) 245 | 246 | 247 | class AvgPoolLayer(PoolLayer): 248 | """Applies an average pooling: computes mean of elements in pool kernel. 249 | 250 | Pooling kernel shape must evenly tile inputs. 251 | """ 252 | str = "avg_pool" 253 | 254 | def __init__(self, pool_shape): 255 | """ 256 | Parameters 257 | ---------- 258 | pool_shape: tuple of ints, shape of pooling kernel 259 | """ 260 | super().__init__(pool_shape) 261 | 262 | def pool_func(self, patches): 263 | return np.mean(np.mean(patches, axis=3), axis=4) 264 | 265 | 266 | class MaxPoolLayer(PoolLayer): 267 | """Applies maximum-based pooling: computes max of elements in pool kernel. 268 | 269 | Pooling kernel shape must evenly tile inputs. 270 | """ 271 | str = "max_pool" 272 | 273 | def __init__(self, pool_shape): 274 | """ 275 | Parameters 276 | ---------- 277 | pool_shape: tuple of ints, shape of pooling kernel 278 | """ 279 | super().__init__(pool_shape) 280 | 281 | def pool_func(self, patches): 282 | return np.max(np.max(patches, axis=3), axis=4) 283 | 284 | def to_dict(self): 285 | params = {"pool_shape": self.pool_shape} 286 | return super().to_dict(self.str, params) 287 | 288 | 289 | class GlobalAvgPoolLayer(AvgPoolLayer): 290 | """Applies global average pooling: computes the average of the 291 | entire feature map. 292 | 293 | Typically used as the last transformation before classification 294 | in an all-convolutional classification network. 295 | """ 296 | str = "global_avg_pool" 297 | 298 | def __init__(self): 299 | pass 300 | 301 | def build(self, input_shape): 302 | super().__init__(input_shape[:2]) 303 | output_shape = self.set_output_shapes(input_shape) 304 | return 0, output_shape 305 | 306 | def to_dict(self): 307 | params = {} 308 | return super().to_dict(self.str, params) 309 | 310 | 311 | class SqueezeLayer(Layer): 312 | """Removes "dummy" singleton axes from shapes. 
313 | """ 314 | str = "squeeze" 315 | 316 | def __init__(self, squeeze_axes=(0, 1)): 317 | """ 318 | Parameters 319 | ---------- 320 | 321 | squeeze_axes: tuple of ints, axes to remove 322 | """ 323 | super().__init__() 324 | self.squeeze_axes = squeeze_axes 325 | 326 | def build(self, input_shape): 327 | output_shape = [input_shape[i] for i in range(len(input_shape)) 328 | if i not in self.squeeze_axes] 329 | return 0, output_shape 330 | 331 | def forward_pass(self, inputs, theta): 332 | for axis in self.squeeze_axes: 333 | assert inputs.shape[axis] == 1 334 | return np.squeeze(inputs, axis=self.squeeze_axes) 335 | 336 | def to_dict(self): 337 | params = {"squeeze_axes": self.squeeze_axes} 338 | return super().to_dict(self.str, params) 339 | 340 | 341 | class LambdaLayer(Layer): 342 | """Layer for arbitrary functional transformations. 343 | 344 | Cannot be represented by a dictionary. 345 | """ 346 | str = "lambda" 347 | 348 | def __init__(self, lam, shape_calculator=lambda shape: shape): 349 | """ 350 | Parameters 351 | ---------- 352 | lam: callable. Functional transformation to apply. 353 | shape_calculator: callable. Computes output shape from input shape. 354 | Defaults to assuming shape does not change. 355 | """ 356 | super().__init__() 357 | self.lam = lam 358 | self.shape_calculator = shape_calculator 359 | 360 | def build(self, input_shape): 361 | output_shape = self.shape_calculator(input_shape) 362 | return 0, output_shape 363 | 364 | def forward_pass(self, inputs, theta): 365 | return self.lam(inputs) 366 | 367 | def to_dict(self): 368 | raise NotImplementedError("cannot convert LambdaLayer to dict") 369 | 370 | 371 | _layer_list = [PointwiseNonlinearLayer, 372 | FCLayer, 373 | ConvLayer, 374 | AvgPoolLayer, 375 | MaxPoolLayer, 376 | GlobalAvgPoolLayer, 377 | SqueezeLayer, 378 | LambdaLayer] 379 | 380 | _LAYERS = {layer.str: layer for layer in _layer_list} 381 | -------------------------------------------------------------------------------- /autocrit/finders/minresQLP.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jan 14 23:43:12 2018 5 | 6 | Note: 7 | This code is translated from the MATLAB version of minresQLP: 8 | http://www.stanford.edu/group/SOL/software.html 9 | 10 | Authors: 11 | Yang Liu, 12 | School of Mathematics and Physics, 13 | The University of Queensland. 14 | yang.liu15(AT)uqconnect.edu.au 15 | 16 | Farbod Roosta-Khorasani, 17 | School of Mathematics and Physics, 18 | The University of Queensland. 19 | fred.roosta(AT)uq.edu.au 20 | 21 | REFERENCES: 22 | S.-C. Choi, C. C. Paige, and M. A. Saunders, 23 | MINRES-QLP: A Krylov subspace method for indefinite or singular symmetric 24 | systems, SIAM Journal of Scientific Computing, submitted on March 7, 2010. 25 | 26 | S.-C. Choi's PhD Dissertation, Stanford University, 2006: 27 | http://www.stanford.edu/group/SOL/dissertations.html 28 | 29 | -------------------------------------------------------------------------- 30 | minresQLP: Aim to obtain the min-length solution of symmetric 31 | (possibly singular) Ax=b or min||Ax-b||. 32 | 33 | X = minresQLP(A,B) solves the system of linear equations A*X=B 34 | or the least-squares problem min norm(B-A*X) if A is singular. 35 | The N-by-N matrix A must be symmetric or Hermitian, but need not be 36 | positive definite or nonsingular. It may be double or single. 37 | The rhs vector B must have length N. 
It may be real or complex, 38 | double or single, 39 | 40 | X = minresQLP(AFUN,B) accepts a function handle AFUN instead of 41 | the matrix A. Y = AFUN(X) returns the matrix-vector product Y=A*X. 42 | In all of the following syntaxes, A can be replaced by AFUN. 43 | 44 | X = minresQLP(A,B,RTOL) specifies a stopping tolerance. 45 | If RTOL=[] or is absent, a default value is used. 46 | (Similarly for all later input parameters.) 47 | Default RTOL=1e-6. 48 | 49 | X = minresQLP(A,B,RTOL,MAXIT) 50 | specifies the maximum number of iterations. Default MAXIT=N. 51 | 52 | X = minresQLP(A,B,RTOL,MAXIT,M) 53 | uses a matrix M as preconditioner. M must be positive definite 54 | and symmetric or Hermitian. It may be a function handle MFUN 55 | such that Y=MFUN(X) returns Y=M divide X. 56 | If M=[], a preconditioner is not applied. 57 | 58 | X = minresQLP(A,B,RTOL,MAXIT,M,SHIFT) 59 | solves (A - SHIFT*I)X = B, or the corresponding least-squares problem 60 | if (A - SHIFT*I) is singular, where SHIFT is a real or complex scalar. 61 | Default SHIFT=0. 62 | 63 | X = minresQLP(A,B,RTOL,MAXIT,M,SHIFT,MAXXNORM,ACONDLIM,TRANCOND) 64 | specifies three parameters associated with singular or 65 | ill-conditioned systems (A - SHIFT*I)*X = B. 66 | 67 | MAXXNORM is an upper bound on NORM(X). 68 | Default MAXXNORM=1e7. 69 | 70 | ACONDLIM is an upper bound on ACOND, an estimate of COND(A). 71 | Default ACONDLIM=1e15. 72 | 73 | TRANCOND is a real scalar >= 1. 74 | If TRANCOND>1, a switch is made from MINRES iterations to 75 | MINRES-QLP iterationsd when ACOND >= TRANCOND. 76 | If TRANCOND=1, all iterations will be MINRES-QLP iterations. 77 | If TRANCOND=ACONDLIM, all iterations will be conventional MINRES 78 | iterations (which are slightly cheaper). 79 | Default TRANCOND=1e7. 80 | 81 | X = minresQLP(A,B,RTOL,MAXIT,M,SHIFT,MAXXNORM,ACONDLIM,TRANCOND,SHOW) 82 | specifies the printing option. 83 | If SHOW=true, an iteration log will be output. 84 | If SHOW=false, the log is suppressed. 85 | Default SHOW=true. 86 | 87 | 88 | FLAG: 89 | -1 (beta2=0) B and X are eigenvectors of (A - SHIFT*I). 90 | 0 (beta1=0) B = 0. The exact solution is X = 0. 91 | 1 X solves the compatible (possibly singular) system (A - SHIFT*I)X = B 92 | to the desired tolerance: 93 | RELRES = RNORM / (ANORM*XNORM + NORM(B)) <= RTOL, 94 | where 95 | R = B - (A - SHIFT*I)X and RNORM = norm(R). 96 | 2 X solves the incompatible (singular) system (A - SHIFT*I)X = B 97 | to the desired tolerance: 98 | RELARES = ARNORM / (ANORM * RNORM) <= RTOL, 99 | where 100 | AR = (A - SHIFT*I)R and ARNORM = NORM(AR). 101 | 3 Same as 1 with RTOL = EPS. 102 | 4 Same as 2 with RTOL = EPS. 103 | 5 X converged to an eigenvector of (A - SHIFT*I). 104 | 6 XNORM exceeded MAXXNORM. 105 | 7 ACOND exceeded ACONDLIM. 106 | 8 MAXIT iterations were performed before one of the previous 107 | conditions was satisfied. 108 | 9 The system appears to be exactly singular. XNORM does not 109 | yet exceed MAXXNORM, but would if further iterations were 110 | performed. 111 | 112 | ITER: the number of iterations performed. ITER = MITER + QLPITER. 113 | MITER: the number of conventional MINRES iterations. 114 | QLPITER: the number of MINRES-QLP iterations. 115 | 116 | RELRES & RELARES: Relative residuals for (A - SHIFT*I)X = B and the 117 | associated least-squares problem. RELRES and RELARES are 118 | defined above in the description of FLAG. 119 | 120 | ANORM: an estimate of the 2-norm of A-SHIFT*I. 121 | ACOND: an estimate of COND(A-SHIFT*I,2). 122 | XNORM: a recurred estimate of NORM(X). 
123 | AXNORM: a recurred estimate of NORM((A-SHIFT*I)X) 124 | 125 | RESVEC: a vector of estimates of NORM(R) at each iteration, 126 | including NORM(B) as the first entry. 127 | ARESVEC: a vector of estimates of NORM((A-SHIFT*I)R) at each 128 | iteration, including NORM((A-SHIFT*I)B) as the first entry. 129 | RESVEC and ARESVEC have length ITER+1. 130 | 131 | COPYRIGHT NOTICE: 132 | If you seek permission to copy and distribute translations of this 133 | software into another language, please e-mail a specific request to 134 | saunders@stanford.edu and scchoi@stanford.edu. 135 | """ 136 | 137 | import numpy as np 138 | import scipy.sparse as sp 139 | from numpy.linalg import inv, norm 140 | from scipy.sparse.linalg import cg 141 | from scipy.sparse.linalg.interface import aslinearoperator 142 | 143 | def MinresQLP(A, b, rtol, maxit, M=None, shift=None, maxxnorm=None, 144 | acondlim=None, trancond=None, show=False, rnormvec=False): 145 | 146 | #A = aslinearoperator(A) 147 | if shift is None: 148 | shift = 0 149 | if maxxnorm is None: 150 | maxxnorm = 1e7 151 | if acondlim is None: 152 | acondlim = 1e15 153 | if trancond is None: 154 | trancond = 1e7 155 | if rnormvec: 156 | resvec = [] 157 | Aresvec = [] 158 | 159 | 160 | n = len(b) 161 | b = b.reshape(n,1) 162 | r2 = b 163 | r3 = r2 164 | beta1 = norm(r2) 165 | 166 | if M is None: 167 | noprecon = True 168 | pass 169 | else: 170 | noprecon = False 171 | r3 = Precond(M, r2) 172 | beta1 = r3.T.dot(r2) #teta 173 | if beta1 <0: 174 | print('Error: "M" is indefinite!') 175 | else: 176 | beta1 = np.sqrt(beta1) 177 | 178 | ## Initialize 179 | flag0 = -2 180 | flag = -2 181 | iters = 0 182 | QLPiter = 0 183 | beta = 0 184 | tau = 0 185 | taul = 0 186 | phi = beta1 187 | betan = beta1 188 | gmin = 0 189 | cs = -1 190 | sn = 0 191 | cr1 = -1 192 | sr1 = 0 193 | cr2 = -1 194 | sr2 = 0 195 | dltan = 0 196 | eplnn = 0 197 | gama = 0 198 | gamal = 0 199 | gamal2 = 0 200 | eta = 0 201 | etal = 0 202 | etal2 = 0 203 | vepln = 0 204 | veplnl = 0 205 | veplnl2 = 0 206 | ul3 = 0 207 | ul2 = 0 208 | ul = 0 209 | u = 0 210 | rnorm = betan 211 | xnorm = 0 212 | xl2norm = 0 213 | Axnorm = 0 214 | Anorm = 0 215 | Acond = 1 216 | relres = rnorm / (beta1 + 1e-50) 217 | x = np.zeros((n,1)) 218 | w = np.zeros((n,1)) 219 | wl = np.zeros((n,1)) 220 | if rnormvec: 221 | resvec = np.append(resvec, beta1) 222 | 223 | msg = [' beta2 = 0. b and x are eigenvectors ', # -1 224 | ' beta1 = 0. 
The exact solution is x = 0 ', # 0 225 | ' A solution to Ax = b found, given rtol ', # 1 226 | ' Min-length solution for singular LS problem, given rtol', # 2 227 | ' A solution to Ax = b found, given eps ', # 3 228 | ' Min-length solution for singular LS problem, given eps ', # 4 229 | ' x has converged to an eigenvector ', # 5 230 | ' xnorm has exceeded maxxnorm ', # 6 231 | ' Acond has exceeded acondlim ', # 7 232 | ' The iteration limit was reached ', # 8 233 | ' Least-squares problem but no converged solution yet '] # 9 234 | 235 | if show: 236 | print(' ') 237 | print('Enter Minres-QLP: ') 238 | print('Min-length solution of symmetric(singular)', end=' ') 239 | print('(A-sI)x = b or min ||(A-sI)x - b||') 240 | #||Ax - b|| is ||(A-sI)x - b|| if shift != 0 here 241 | hstr1 = ' n = %8g ||Ax - b|| = %8.2e ' % (n, beta1) 242 | hstr2 = 'shift = %8.2e rtol = %8g' % (shift, rtol) 243 | hstr3 = 'maxit = %8g maxxnorm = %8.2e ' % (maxit, maxxnorm) 244 | hstr4 = 'acondlim = %8.2e trancond = %8g' % (acondlim, trancond) 245 | print(hstr1, hstr2) 246 | print(hstr3, hstr4) 247 | 248 | #b = 0 --> x = 0 skip the main loop 249 | if beta1 == 0: 250 | flag = 0 251 | 252 | while flag == flag0 and iters < maxit: 253 | #lanczos 254 | iters += 1 255 | betal = beta 256 | beta = betan 257 | v = r3/beta 258 | r3 = Ax(A, v) 259 | if shift == 0: 260 | pass 261 | else: 262 | r3 = r3 - shift*v 263 | 264 | if iters > 1: 265 | r3 = r3 - r1*beta/betal 266 | 267 | alfa = np.real(r3.T.dot(v)) 268 | r3 = r3 - r2*alfa/beta 269 | r1 = r2 270 | r2 = r3 271 | 272 | if noprecon: 273 | betan = norm(r3) 274 | if iters == 1: 275 | if betan == 0: 276 | if alfa == 0: 277 | flag = 0 278 | break 279 | else: 280 | flag = -1 281 | x = b/alfa 282 | break 283 | else: 284 | r3 = Precond(M, r2) 285 | betan = r2.T.dot(r3) 286 | if betan > 0: 287 | betan = np.sqrt(betan) 288 | else: 289 | print('Error: "M" is indefinite or singular!') 290 | pnorm = np.sqrt(betal ** 2 + alfa ** 2 + betan ** 2) 291 | 292 | #previous left rotation Q_{k-1} 293 | dbar = dltan 294 | dlta = cs*dbar + sn*alfa 295 | epln = eplnn 296 | gbar = sn*dbar - cs*alfa 297 | eplnn = sn*betan 298 | dltan = -cs*betan 299 | dlta_QLP = dlta 300 | #current left plane rotation Q_k 301 | gamal3 = gamal2 302 | gamal2 = gamal 303 | gamal = gama 304 | cs, sn, gama = SymGivens(gbar, betan) 305 | gama_tmp = gama 306 | taul2 = taul 307 | taul = tau 308 | tau = cs*phi 309 | Axnorm = np.sqrt(Axnorm ** 2 + tau ** 2) 310 | phi = sn*phi 311 | #previous right plane rotation P_{k-2,k} 312 | if iters > 2: 313 | veplnl2 = veplnl 314 | etal2 = etal 315 | etal = eta 316 | dlta_tmp = sr2*vepln - cr2*dlta 317 | veplnl = cr2*vepln + sr2*dlta 318 | dlta = dlta_tmp 319 | eta = sr2*gama 320 | gama = -cr2 *gama 321 | #current right plane rotation P{k-1,k} 322 | if iters > 1: 323 | cr1, sr1, gamal = SymGivens(gamal, dlta) 324 | vepln = sr1*gama 325 | gama = -cr1*gama 326 | 327 | #update xnorm 328 | xnorml = xnorm 329 | ul4 = ul3 330 | ul3 = ul2 331 | if iters > 2: 332 | ul2 = (taul2 - etal2*ul4 - veplnl2*ul3)/gamal2 333 | if iters > 1: 334 | ul = (taul - etal*ul3 - veplnl *ul2)/gamal 335 | xnorm_tmp = np.sqrt(xl2norm**2 + ul2**2 + ul**2) 336 | if abs(gama) > np.finfo(np.double).tiny and xnorm_tmp < maxxnorm: 337 | u = (tau - eta*ul2 - vepln*ul)/gama 338 | if np.sqrt(xnorm_tmp**2 + u**2) > maxxnorm: 339 | u = 0 340 | flag = 6 341 | else: 342 | u = 0 343 | flag = 9 344 | xl2norm = np.sqrt(xl2norm**2 + ul2**2) 345 | xnorm = np.sqrt(xl2norm**2 + ul**2 + u**2) 346 | #update w&x 347 | #Minres 348 | if (Acond < 
trancond) and flag != flag0 and QLPiter == 0: 349 | wl2 = wl 350 | wl = w 351 | w = (v - epln*wl2 - dlta_QLP*wl)/gama_tmp 352 | if xnorm < maxxnorm: 353 | x += tau*w 354 | else: 355 | flag = 6 356 | #Minres-QLP 357 | else: 358 | QLPiter += 1 359 | if QLPiter == 1: 360 | xl2 = np.zeros((n,1)) 361 | if (iters > 1): # construct w_{k-3}, w_{k-2}, w_{k-1} 362 | if iters > 3: 363 | wl2 = gamal3*wl2 + veplnl2*wl + etal*w 364 | if iters > 2: 365 | wl = gamal_QLP*wl + vepln_QLP*w 366 | w = gama_QLP*w 367 | xl2 = x - wl*ul_QLP - w*u_QLP 368 | 369 | if iters == 1: 370 | wl2 = wl 371 | wl = v*sr1 372 | w = -v*cr1 373 | elif iters == 2: 374 | wl2 = wl 375 | wl = w*cr1 + v*sr1 376 | w = w*sr1 - v*cr1 377 | else: 378 | wl2 = wl 379 | wl = w 380 | w = wl2*sr2 - v*cr2 381 | wl2 = wl2*cr2 +v*sr2 382 | v = wl*cr1 + w*sr1 383 | w = wl*sr1 - w*cr1 384 | wl = v 385 | xl2 = xl2 + wl2*ul2 386 | x = xl2 + wl*ul + w*u 387 | 388 | #next right plane rotation P{k-1,k+1} 389 | gamal_tmp = gamal 390 | cr2, sr2, gamal = SymGivens(gamal, eplnn) 391 | #transfering from Minres to Minres-QLP 392 | gamal_QLP = gamal_tmp 393 | #print('gamal_QLP=', gamal_QLP) 394 | vepln_QLP = vepln 395 | gama_QLP = gama 396 | ul_QLP = ul 397 | u_QLP = u 398 | ## Estimate various norms 399 | abs_gama = abs(gama) 400 | Anorml = Anorm 401 | Anorm = max([Anorm, pnorm, gamal, abs_gama]) 402 | if iters == 1: 403 | gmin = gama 404 | gminl = gmin 405 | elif iters > 1: 406 | gminl2 = gminl 407 | gminl = gmin 408 | gmin = min([gminl2, gamal, abs_gama]) 409 | Acondl = Acond 410 | Acond = Anorm / gmin 411 | rnorml = rnorm 412 | relresl = relres 413 | if flag != 9: 414 | rnorm = phi 415 | relres = rnorm / (Anorm * xnorm + beta1) 416 | rootl = np.sqrt(gbar ** 2 + dltan ** 2) 417 | Arnorml = rnorml * rootl 418 | relAresl = rootl / Anorm 419 | ## See if any of the stopping criteria are satisfied. 
420 | epsx = Anorm * xnorm * np.finfo(float).eps
421 | if (flag == flag0) or (flag == 9):
422 | t1 = 1 + relres
423 | t2 = 1 + relAresl
424 | if iters >= maxit:
425 | flag = 8 #exit before maxit
426 | if epsx >= beta1:
427 | flag = 5 #x = eigenvector
428 | if t2 <= 1:
429 | flag = 4 #Accurate Least Square Solution
430 | if t1 <= 1:
431 | flag = 3 #Accurate Ax = b Solution
432 | if relAresl <= rtol:
433 | flag = 2 #Trustful Least Square Solution
434 | if relres <= rtol:
435 | flag = 1 #Trustful Ax = b Solution
436 | if Acond >= acondlim:
437 | flag = 7 #Huge Acond
438 | if xnorm >= maxxnorm:
439 | flag = 6 #xnorm exceeded
440 | if flag == 2 or flag == 4 or flag == 6 or flag == 7:
441 | #possibly singular
442 | iters = iters - 1
443 | Acond = Acondl
444 | rnorm = rnorml
445 | relres = relresl
446 | else:
447 | if rnormvec:
448 | resvec = np.append(resvec, rnorm)
449 | Aresvec = np.append(Aresvec, Arnorml)
450 |
451 | if show:
452 | if iters%10 - 1 == 0:
453 | lstr = (' iter rnorm Arnorm relres ' +
454 | 'relAres Anorm Acond xnorm')
455 | print(' ')
456 | print(lstr)
457 | if QLPiter == 1:
458 | print('QLP', end='')
459 | else:
460 | print(' ', end='')
461 | lstr1 = '%8g %8.2e ' % (iters-1, rnorml)
462 | lstr2 = '%8.2e %8.2e ' % (Arnorml, relresl)
463 | lstr3 = '%8.2e %8.2e ' % (relAresl, Anorml)
464 | lstr4 = '%8.2e %8.2e ' % (Acondl, xnorml)
465 | print(lstr1, lstr2, lstr3, lstr4)
466 |
467 | #exited the main loop
468 | if show:
469 | if QLPiter == 1:
470 | print('QLP', end = '')
471 | else:
472 | print(' ', end = '')
473 | Miter = iters - QLPiter
474 |
475 | #final quantities
476 | r1 = b - Ax(A,x) + shift*x
477 | rnorm = norm(r1)
478 | Arnorm = norm(Ax(A,r1) - shift*r1)
479 | xnorm = norm(x)
480 | relres = rnorm/(Anorm*xnorm + beta1)
481 | relAres = 0
482 | if rnorm > np.finfo(np.double).tiny:
483 | relAres = Arnorm/(Anorm*rnorm)
484 |
485 | if show:
486 | if rnorm > np.finfo(np.double).tiny:
487 | lstr1 = '%8g %8.2e ' % (iters, rnorm)
488 | lstr2 = '%8.2eD %8.2e ' % (Arnorm, relres)
489 | lstr3 = '%8.2eD %8.2e ' % (relAres, Anorm)
490 | lstr4 = '%8.2e %8.2e ' % (Acond, xnorm)
491 | print(lstr1, lstr2, lstr3, lstr4)
492 | else:
493 | lstr1 = '%8g %8.2e ' % (iters, rnorm)
494 | lstr2 = '%8.2eD %8.2e ' % (Arnorm, relres)
495 | lstr3 = ' %8.2e ' % (Anorm)
496 | lstr4 = '%8.2e %8.2e ' % (Acond, xnorm)
497 | print(lstr1, lstr2, lstr3, lstr4)
498 |
499 | print(' ')
500 | print('Exit Minres-QLP: ')
501 | str1 = 'Flag = %8g %8s' % (flag, msg[int(flag + 1)])
502 | str2 = 'Iter = %8g ' % (iters)
503 | str3 = 'Minres = %8g Minres-QLP = %8g' % (Miter, QLPiter)
504 | str4 = 'relres = %8.2e relAres = %8.2e ' % (relres, relAres)
505 | str5 = 'rnorm = %8.2e Arnorm = %8.2e' % (rnorm, Arnorm)
506 | str6 = 'Anorm = %8.2e Acond = %8.2e ' % (Anorm, Acond)
507 | str7 = 'xnorm = %8.2e Axnorm = %8.2e' % (xnorm, Axnorm)
508 | print(str1)
509 | print(str2, str3)
510 | print(str4, str5)
511 | print(str6, str7)
512 |
513 | if rnormvec:
514 | Aresvec = np.append(Aresvec, Arnorm)
515 | return (x,flag,iters,Miter,QLPiter,relres,relAres,Anorm,Acond,
516 | xnorm,Axnorm,resvec,Aresvec)
517 |
518 | return (x,flag,iters,Miter,QLPiter,relres,relAres,Anorm,Acond,xnorm,Axnorm)
519 |
520 |
521 | def Ax(A, x):
522 | if callable(A):
523 | Ax = A(x)
524 | else:
525 | Ax = A.dot(x)
526 | return Ax
527 |
528 | def Precond(M, r):
529 | if callable(M):
530 | h = cg(M, r)[0].reshape(r.shape) #cg returns (solution, info); keep the column shape
531 | else:
532 | h = inv(M).dot(r)
533 | return h
534 |
535 | def SymGivens(a, b):
536 | if b == 0:
537 | if a == 0:
538 | c = 1
539 | else:
540 | c = np.sign(a) 541
| s = 0 542 | r = abs(a) 543 | elif a == 0: 544 | c = 0 545 | s = np.sign(b) 546 | r = abs(b) 547 | elif abs(b) > abs(a): 548 | t = a / b 549 | s = np.sign(b) / np.sqrt(1 + t ** 2) 550 | c = s * t 551 | r = b / s 552 | else: 553 | t = b / a 554 | c = np.sign(a) / np.sqrt(1 + t ** 2) 555 | s = c * t 556 | r = a / c 557 | return c, s, r 558 | 559 | def main(): 560 | ################## example1 #################### 561 | n=100 562 | e = np.ones((n,1)) 563 | data = np.c_[-2*e,4*e,-2*e] 564 | A = sp.spdiags(data.T, [-1,0,1],n,n).toarray() 565 | M = sp.spdiags(4*e.T, 0,n,n).toarray() 566 | b = sum(A) 567 | rtol = 1e-10 568 | maxit = 50 569 | x = MinresQLP(A,b,rtol,maxit,M,show=True) 570 | # x = MinresQLP(A,b,rtol,maxit,M,show=True,rnormvec=True) 571 | # print(x[11]) 572 | # print(x[12]) 573 | 574 | ################## example2 #################### 575 | # n=50 576 | # N=n**2 577 | # e = np.ones((n,1)) 578 | # data = np.c_[e, e, e] 579 | # B = sp.spdiags(data.T, [-1,0,1],n,n) 580 | # A_mid = np.array([]).reshape(0,0) 581 | # for i in range(n): 582 | # A_mid = sp.block_diag((A_mid, B)) 583 | # if i == 0: 584 | # A_upper = sp.hstack([sp.csr_matrix((n,n)), B]) 585 | # A_lower = sp.vstack([sp.csr_matrix((n,n)), B]) 586 | # if i > 0 and i < n-1: 587 | # A_upper = sp.block_diag((A_upper, B)) 588 | # A_lower = sp.block_diag((A_lower, B)) 589 | # if i == n-1: 590 | # A_upper = sp.vstack([A_upper, sp.csr_matrix((n,N))]) 591 | # A_lower = sp.hstack([A_lower, sp.csr_matrix((N,n))]) 592 | # A = A_upper + A_lower + A_mid 593 | # b = sum(A.toarray()) 594 | # rtol = 1e-5 595 | # x = MinresQLP(A, b, rtol, N, maxxnorm = 1e2, show = True) 596 | 597 | ################## example3 #################### 598 | # a = -10 599 | # c = -a 600 | # n = 2*c + 1 601 | # A = sp.spdiags(np.arange(a, c+1), 0, n, n) 602 | # b = np.ones((n, 1)) 603 | # rtol = 1e-6 604 | # x = MinresQLP(A, b, rtol, n, maxxnorm = 1e2, show = True) 605 | 606 | if __name__ == '__main__': 607 | main() 608 | --------------------------------------------------------------------------------
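Because the solver only touches `A` through the `Ax` helper, `A` can be supplied either as an explicit array/sparse matrix or as a matrix-free callable that returns the product with a column vector. The snippet below is an illustrative sketch, not part of the repository: it assumes the module is importable as `autocrit.finders.minresQLP`, and the names `H`, `b`, and `hessian_vector_product` are made up for the example.

```
# Illustrative sketch (not from the repository): solve H x = b using only
# matrix-vector products, the way a Hessian-vector-product operator would be used.
import numpy as np

from autocrit.finders.minresQLP import MinresQLP

n = 10
# A small symmetric, indefinite matrix standing in for a Hessian.
H = np.diag(np.linspace(-1.0, 1.0, n))

def hessian_vector_product(v):
    # v arrives as an (n, 1) column vector; return the product in the same shape.
    return H.dot(v)

b = np.ones(n)
# Positional arguments are A, b, rtol, maxit; the remaining options keep their defaults.
out = MinresQLP(hessian_vector_product, b, rtol=1e-8, maxit=100)
x, flag = out[0], out[1]

print(flag)                                      # termination reason; indexes into msg
print(np.allclose(H.dot(x), b.reshape(-1, 1)))   # True if the system was solved
```

The same call pattern works with a dense array or `scipy.sparse` matrix in place of the callable, which is how the examples in `main()` exercise the solver.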