├── README.md
├── checkStackedAECost_nonneg.m
├── feedForwardAutoencoder.m
├── initializeParameters_nonneg.m
├── loadMNISTImages.m
├── loadMNISTLabels.m
├── main.m
├── minFunc
│   ├── ArmijoBacktrack.m
│   ├── WolfeLineSearch.m
│   ├── autoGrad.m
│   ├── autoHess.m
│   ├── autoHv.m
│   ├── autoTensor.m
│   ├── callOutput.m
│   ├── conjGrad.m
│   ├── dampedUpdate.m
│   ├── example_minFunc.m
│   ├── example_minFunc_LR.m
│   ├── isLegal.m
│   ├── lbfgs.m
│   ├── lbfgsC.c
│   ├── lbfgsC.mexa64
│   ├── lbfgsC.mexglx
│   ├── lbfgsC.mexmac
│   ├── lbfgsC.mexmaci
│   ├── lbfgsC.mexmaci64
│   ├── lbfgsC.mexw32
│   ├── lbfgsC.mexw64
│   ├── lbfgsUpdate.m
│   ├── logistic
│   │   ├── LogisticDiagPrecond.m
│   │   ├── LogisticHv.m
│   │   ├── LogisticLoss.m
│   │   ├── mexutil.c
│   │   ├── mexutil.h
│   │   ├── mylogsumexp.m
│   │   ├── repmatC.c
│   │   ├── repmatC.dll
│   │   ├── repmatC.mexglx
│   │   └── repmatC.mexmac
│   ├── mchol.m
│   ├── mcholC.c
│   ├── mcholC.mexmaci64
│   ├── mcholC.mexw32
│   ├── mcholC.mexw64
│   ├── mcholinc.m
│   ├── minFunc.m
│   ├── minFunc_processInputOptions.m
│   ├── polyinterp.m
│   ├── precondDiag.m
│   ├── precondTriu.m
│   ├── precondTriuDiag.m
│   ├── rosenbrock.m
│   └── taylorModel.m
├── mnist
│   ├── t10k-images.idx3-ubyte
│   ├── t10k-labels.idx1-ubyte
│   ├── train-images.idx3-ubyte
│   └── train-labels.idx1-ubyte
├── params2stack.m
├── softmax
│   ├── computeNumericalGradient.m
│   ├── softmaxCost_nonneg.m
│   ├── softmaxPredict.m
│   └── softmaxTrain_nonneg.m
├── sparseAutoencoderCost_nonneg.m
├── stack2params.m
├── stackedAECost_nonneg.m
└── stackedAEPredict.m

/README.md:
--------------------------------------------------------------------------------
1 | # Nonnegativity-Constrained-Autoencoder-NCAE
2 | Matlab code implementing the Nonnegativity Constrained Autoencoder (NCAE) for part-based deep learning.
3 | 
4 | References:
5 | 
6 | [1] Hosseini-Asl, E.; Zurada, J.M.; Nasraoui, O., "Deep Learning of Part-Based Representation of Data Using Sparse Autoencoders With Nonnegativity Constraints," IEEE Transactions on Neural Networks and Learning Systems, vol. PP, no. 99, pp. 1-13.
7 | doi: 10.1109/TNNLS.2015.2479223
8 | URL: http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7310882&isnumber=6104215
9 | 
10 | [2] UFLDL Tutorial, http://deeplearning.stanford.edu/wiki/index.php/UFLDL_Tutorial
11 | 
12 | 
13 | 
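Reference [1] replaces the sparse autoencoder's usual L2 weight decay with a one-sided penalty that is nonzero only for negative weights, which is what pushes the learned features toward nonnegative, part-based representations. A minimal sketch of that penalty and its gradient (illustrative only; the repository's actual implementation lives in sparseAutoencoderCost_nonneg.m, and `alpha`, `cost`, `W1grad` are hypothetical names here):

    % One-sided (nonnegativity) weight penalty and its gradient:
    %   f(W) = (alpha/2) * sum_ij min(W_ij, 0)^2
    negW = min(W1, 0);                            % only negative entries contribute
    cost = cost + 0.5 * alpha * sum(negW(:).^2);
    W1grad = W1grad + alpha * negW;               % pushes negative weights toward zero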
--------------------------------------------------------------------------------
/checkStackedAECost_nonneg.m:
--------------------------------------------------------------------------------
1 | function [] = checkStackedAECost_nonneg()
2 | 
3 | % Check the gradients for the stacked autoencoder
4 | %
5 | % In general, we recommend creating files like this one to check your
6 | % gradients whenever you write a new cost function.
7 | %
8 | 
9 | %% Setup random data / small model
10 | clc
11 | inputSize = 4;
12 | hiddenSize = 5;
13 | lambda1 = 0.01;
14 | lambda2 = 0.1;
15 | data = randn(inputSize, 5);
16 | labels = [ 1 2 1 2 1 ];
17 | numClasses = 2;
18 | 
19 | stack = cell(2,1);
20 | stack{1}.w = 0.1 * randn(3, inputSize);
21 | stack{1}.b = zeros(3, 1);
22 | stack{2}.w = 0.1 * randn(hiddenSize, 3);
23 | stack{2}.b = zeros(hiddenSize, 1);
24 | softmaxTheta = 0.005 * randn(hiddenSize * numClasses, 1);
25 | 
26 | [stackparams, netconfig] = stack2params(stack);
27 | stackedAETheta = [ softmaxTheta ; stackparams ];
28 | 
29 | 
30 | [cost, grad] = stackedAECost_nonneg(stackedAETheta, inputSize, hiddenSize, ...
31 |                                     numClasses, netconfig, ...
32 |                                     lambda1, data, labels);
33 | 
34 | % Check that the numerical and analytic gradients are the same
35 | numgrad = computeNumericalGradient( @(x) stackedAECost_nonneg(x, inputSize, ...
36 |                                     hiddenSize, numClasses, netconfig, ...
37 |                                     lambda1, data, labels), ...
38 |                                     stackedAETheta);
39 | 
40 | % Use this to visually compare the gradients side by side
41 | disp([numgrad grad]);
42 | 
43 | % Compare numerically computed gradients with the ones obtained from backpropagation
44 | disp('Norm between numerical and analytical gradient (should be less than 1e-9)');
45 | diff = norm(numgrad-grad)/norm(numgrad+grad);
46 | disp(diff); % Should be small. In our implementation, these values are
47 |             % usually less than 1e-9.
48 | 
49 | % When you get this working, congratulations!
50 | 
51 | 
52 | 
--------------------------------------------------------------------------------
/feedForwardAutoencoder.m:
--------------------------------------------------------------------------------
1 | function [activation] = feedForwardAutoencoder(theta, hiddenSize, visibleSize, dropoutFraction, data)
2 | 
3 | % Compute the hidden-layer activations of a trained autoencoder;
4 | % theta is the packed parameter vector [W1(:); W2(:); b1; b2].
5 | W1 = reshape(theta(1:hiddenSize*visibleSize), hiddenSize, visibleSize);
6 | b1 = theta(2*hiddenSize*visibleSize+1:2*hiddenSize*visibleSize+hiddenSize);
7 | 
8 | 
9 | z2 = W1*data + repmat(b1,1,size(data,2));
10 | activation = sigmoid(z2);
11 | 
12 | if(dropoutFraction > 0)
13 |     activation = activation.*(1 - dropoutFraction); % scale by keep probability at feed-forward time
14 | end
15 | 
16 | 
17 | 
18 | end
19 | 
20 | 
21 | function sigm = sigmoid(x)
22 |     sigm = 1 ./ (1 + exp(-x));
23 | end
--------------------------------------------------------------------------------
/initializeParameters_nonneg.m:
--------------------------------------------------------------------------------
1 | function theta = initializeParameters_nonneg(hiddenSize, visibleSize, seed)
2 | 
3 | %% Initialize parameters randomly based on layer sizes.
4 | r  = sqrt(6) / sqrt(hiddenSize+visibleSize+1);   % we'll choose weights uniformly from the interval [0, r] (nonnegative initialization)
5 | rand('state',seed)
6 | 
7 | W1 = rand(hiddenSize, visibleSize) * r;
8 | W2 = rand(visibleSize, hiddenSize) * r;
9 | 
10 | b1 = zeros(hiddenSize, 1);
11 | b2 = zeros(visibleSize, 1);
12 | 
13 | theta = [W1(:) ; W2(:) ; b1(:) ; b2(:)];
14 | 
15 | end
16 | 
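Both files above rely on the same packing convention: theta = [W1(:); W2(:); b1; b2], which is why feedForwardAutoencoder reads b1 starting at offset 2*hiddenSize*visibleSize. A quick round-trip check (an illustrative snippet, not a file from the repository):

    hiddenSize = 3; visibleSize = 4;
    theta = initializeParameters_nonneg(hiddenSize, visibleSize, 0);
    % Unpack with the same offsets used in feedForwardAutoencoder:
    W1 = reshape(theta(1:hiddenSize*visibleSize), hiddenSize, visibleSize);
    W2 = reshape(theta(hiddenSize*visibleSize+1:2*hiddenSize*visibleSize), visibleSize, hiddenSize);
    b1 = theta(2*hiddenSize*visibleSize+1:2*hiddenSize*visibleSize+hiddenSize);
    b2 = theta(2*hiddenSize*visibleSize+hiddenSize+1:end);
    assert(all(W1(:) >= 0) && all(W2(:) >= 0))   % weights start nonnegative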
--------------------------------------------------------------------------------
/loadMNISTImages.m:
--------------------------------------------------------------------------------
1 | function images = loadMNISTImages(filename)
2 | %loadMNISTImages returns a 784 x [number of MNIST images] matrix containing
3 | %the raw MNIST images (each column is one 28x28 image, unrolled)
4 | 
5 | fp = fopen(filename, 'rb');
6 | assert(fp ~= -1, ['Could not open ', filename, '']);
7 | 
8 | magic = fread(fp, 1, 'int32', 0, 'ieee-be');
9 | assert(magic == 2051, ['Bad magic number in ', filename, '']);
10 | 
11 | numImages = fread(fp, 1, 'int32', 0, 'ieee-be');
12 | numRows = fread(fp, 1, 'int32', 0, 'ieee-be');
13 | numCols = fread(fp, 1, 'int32', 0, 'ieee-be');
14 | 
15 | images = fread(fp, inf, 'unsigned char');
16 | images = reshape(images, numCols, numRows, numImages);
17 | images = permute(images,[2 1 3]);
18 | 
19 | fclose(fp);
20 | 
21 | % Reshape to #pixels x #examples
22 | images = reshape(images, size(images, 1) * size(images, 2), size(images, 3));
23 | % Convert to double and rescale to [0,1]
24 | images = double(images) / 255;
25 | 
26 | end
27 | 
--------------------------------------------------------------------------------
/loadMNISTLabels.m:
--------------------------------------------------------------------------------
1 | function labels = loadMNISTLabels(filename)
2 | %loadMNISTLabels returns a [number of MNIST images]x1 matrix containing
3 | %the labels for the MNIST images
4 | 
5 | fp = fopen(filename, 'rb');
6 | assert(fp ~= -1, ['Could not open ', filename, '']);
7 | 
8 | magic = fread(fp, 1, 'int32', 0, 'ieee-be');
9 | assert(magic == 2049, ['Bad magic number in ', filename, '']);
10 | 
11 | numLabels = fread(fp, 1, 'int32', 0, 'ieee-be');
12 | 
13 | labels = fread(fp, inf, 'unsigned char');
14 | 
15 | assert(size(labels,1) == numLabels, 'Mismatch in label count');
16 | 
17 | fclose(fp);
18 | 
19 | end
20 | 
--------------------------------------------------------------------------------
/main.m:
--------------------------------------------------------------------------------
1 | 
2 | clc
3 | clear all
4 | close all
5 | 
6 | %% Initialize Deep Network Parameters
7 | 
8 | inputSize = 784;
9 | numClasses = 10;
10 | hiddenSizeL1 = 196;    % Layer 1 Hidden Size
11 | hiddenSizeL2 = 20;     % Layer 2 Hidden Size
12 | sparsityParam = 0.05;  % desired average activation of the hidden units.
13 | lambda = 3e-3;         % weight decay parameter
14 | beta = 3;              % weight of sparsity penalty term
15 | 
16 | inputZeroMaskedFraction = 0.0;  % denoising ratio
17 | dropoutFraction = 0.0;          % dropout ratio
18 | 
19 | %% Load data from the MNIST database
20 | 
21 | % Load MNIST database files
22 | addpath('/Datasets/MNIST')
23 | trainData = loadMNISTImages('mnist/train-images.idx3-ubyte');
24 | trainLabels = loadMNISTLabels('mnist/train-labels.idx1-ubyte');
25 | 
26 | trainLabels(trainLabels == 0) = 10; % Remap 0 to 10 since our labels need to start from 1
27 | 
28 | testData = loadMNISTImages('mnist/t10k-images.idx3-ubyte');
29 | testLabels = loadMNISTLabels('mnist/t10k-labels.idx1-ubyte');
30 | testLabels(testLabels == 0) = 10; % Remap 0 to 10 since our labels need to start from 1
31 | 
32 | 
33 | %% Train the first sparse autoencoder
34 | 
35 | 
36 | % Randomly initialize the parameters
37 | 
38 | seed = 1;
39 | sae1Theta = initializeParameters_nonneg(hiddenSizeL1, inputSize, seed);
40 | 
41 | 
42 | addpath minFunc/
43 | options.Method = 'lbfgs';
44 | options.maxIter = 400;
45 | options.display = 'on';
46 | 
47 | 
48 | 
49 | [sae1OptTheta, cost, costhistory] = minFunc( @(p) sparseAutoencoderCost_nonneg(p, ...
50 |     inputSize, hiddenSizeL1, ...
51 |     lambda, inputZeroMaskedFraction, ...
52 |     dropoutFraction, sparsityParam, ...
53 |     beta, trainData), ...
54 |     sae1Theta, options);
55 | 
56 | %% Train the second sparse autoencoder
57 | 
58 | [sae1Features] = feedForwardAutoencoder(sae1OptTheta, hiddenSizeL1, ...
59 |     inputSize, dropoutFraction, trainData);
60 | 
61 | % Randomly initialize the parameters
62 | sae2Theta = initializeParameters_nonneg(hiddenSizeL2, hiddenSizeL1, seed);
63 | 
64 | [sae2OptTheta, cost] = minFunc( @(p) sparseAutoencoderCost_nonneg(p, ...
65 |     hiddenSizeL1, hiddenSizeL2, ...
66 |     lambda, inputZeroMaskedFraction, ...
67 |     dropoutFraction, sparsityParam, ...
68 |     beta, sae1Features), ...
69 |     sae2Theta, options);
70 | 
71 | %% Train the softmax classifier
72 | 
73 | [sae2Features] = feedForwardAutoencoder(sae2OptTheta, hiddenSizeL2, ...
74 |     hiddenSizeL1, dropoutFraction, sae1Features);
75 | 
76 | % Randomly initialize the parameters
77 | rand('state',seed);
78 | saeSoftmaxTheta = 0.005 * randn(hiddenSizeL2 * numClasses, 1);
79 | 
80 | addpath softmax/
81 | 
82 | options.maxIter = 100;
83 | softmaxModel = softmaxTrain_nonneg(hiddenSizeL2, numClasses, lambda, ...
84 |     sae2Features, trainLabels, options);
85 | 
86 | saeSoftmaxOptTheta = softmaxModel.optTheta(:);
87 | 
88 | 
89 | %% Finetune softmax model
90 | 
91 | 
92 | % Initialize the stack using the parameters learned
93 | stack = cell(2,1);
94 | stack{1}.w = reshape(sae1OptTheta(1:hiddenSizeL1*inputSize), ...
95 |     hiddenSizeL1, inputSize);
96 | stack{1}.b = sae1OptTheta(2*hiddenSizeL1*inputSize+1:2*hiddenSizeL1*inputSize+hiddenSizeL1);
97 | stack{2}.w = reshape(sae2OptTheta(1:hiddenSizeL2*hiddenSizeL1), ...
98 |     hiddenSizeL2, hiddenSizeL1);
99 | stack{2}.b = sae2OptTheta(2*hiddenSizeL2*hiddenSizeL1+1:2*hiddenSizeL2*hiddenSizeL1+hiddenSizeL2);
100 | 
101 | % Initialize the parameters for the deep model
102 | [stackparams, netconfig] = stack2params(stack);
103 | stackedAETheta = [ saeSoftmaxOptTheta ; stackparams ];
104 | 
105 | 
106 | %% Check Gradient
107 | 
108 | 
109 | checkStackedAECost_nonneg()
110 | 
111 | %% Fine-tuning AE
112 | 
113 | options.Method = 'lbfgs';
114 | options.maxIter = 400;
115 | options.display = 'on';
116 | 
117 | dbstop if error
118 | [stackedAEOptTheta, cost] = minFunc( @(p) stackedAECost_nonneg(p, inputSize, hiddenSizeL2, ...
119 |     numClasses, netconfig, ...
120 |     lambda, trainData, trainLabels), ...
121 |     stackedAETheta, options);
122 | 
123 | 
124 | %% Test
125 | 
126 | 
127 | [pred] = stackedAEPredict(stackedAETheta, inputSize, hiddenSizeL2, ...
128 |     numClasses, netconfig, dropoutFraction, testData);
129 | 
130 | acc_before(seed) = mean(testLabels(:) == pred(:));
131 | fprintf('Before Finetuning Test Accuracy: %0.3f%%\n', acc_before(seed) * 100);
132 | 
133 | [pred] = stackedAEPredict(stackedAEOptTheta, inputSize, hiddenSizeL2, ...
134 |     numClasses, netconfig, dropoutFraction, testData);
135 | 
136 | acc_after(seed) = mean(testLabels(:) == pred(:));
137 | fprintf('After Finetuning Test Accuracy: %0.3f%%\n', acc_after(seed) * 100);
138 | 
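main.m flattens the pretrained stack{l}.w / stack{l}.b cells with stack2params before fine-tuning. stack2params.m and params2stack.m are not reproduced in this section, so the round trip below is a sketch of the convention their callers assume (the UFLDL-style per-layer [w(:); b(:)] packing, with netconfig recording the layer sizes needed to invert it), not those files' verbatim contents:

    % Hypothetical illustration: stackparams is expected to equal
    % [stack{1}.w(:); stack{1}.b(:); stack{2}.w(:); stack{2}.b(:)]
    [stackparams, netconfig] = stack2params(stack);
    stackBack = params2stack(stackparams, netconfig);
    assert(isequal(stackBack{1}.w, stack{1}.w));   % flattening is lossless
    assert(isequal(stackBack{2}.b, stack{2}.b));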
--------------------------------------------------------------------------------
/minFunc/ArmijoBacktrack.m:
--------------------------------------------------------------------------------
1 | function [t,x_new,f_new,g_new,funEvals,H] = ArmijoBacktrack(...
2 | x,t,d,f,fr,g,gtd,c1,LS,tolX,debug,doPlot,saveHessianComp,funObj,varargin) 3 | % 4 | % Backtracking linesearch to satisfy Armijo condition 5 | % 6 | % Inputs: 7 | % x: starting location 8 | % t: initial step size 9 | % d: descent direction 10 | % f: function value at starting location 11 | % fr: reference function value (usually funObj(x)) 12 | % gtd: directional derivative at starting location 13 | % c1: sufficient decrease parameter 14 | % debug: display debugging information 15 | % LS: type of interpolation 16 | % tolX: minimum allowable step length 17 | % doPlot: do a graphical display of interpolation 18 | % funObj: objective function 19 | % varargin: parameters of objective function 20 | % 21 | % Outputs: 22 | % t: step length 23 | % f_new: function value at x+t*d 24 | % g_new: gradient value at x+t*d 25 | % funEvals: number function evaluations performed by line search 26 | % H: Hessian at initial guess (only computed if requested 27 | 28 | % Evaluate the Objective and Gradient at the Initial Step 29 | if nargout == 6 30 | [f_new,g_new,H] = feval(funObj, x + t*d, varargin{:}); 31 | else 32 | [f_new,g_new] = feval(funObj, x + t*d, varargin{:}); 33 | end 34 | funEvals = 1; 35 | 36 | while f_new > fr + c1*t*gtd || ~isLegal(f_new) 37 | 38 | temp = t; 39 | if LS == 0 || ~isLegal(f_new) 40 | % Backtrack w/ fixed backtracking rate 41 | if debug 42 | fprintf('Fixed BT\n'); 43 | end 44 | t = 0.5*t; 45 | elseif LS == 2 && isLegal(g_new) 46 | % Backtracking w/ cubic interpolation w/ derivative 47 | if debug 48 | fprintf('Grad-Cubic BT\n'); 49 | end 50 | t = polyinterp([0 f gtd; t f_new g_new'*d],doPlot); 51 | elseif funEvals < 2 || ~isLegal(f_prev) 52 | % Backtracking w/ quadratic interpolation (no derivative at new point) 53 | if debug 54 | fprintf('Quad BT\n'); 55 | end 56 | t = polyinterp([0 f gtd; t f_new sqrt(-1)],doPlot); 57 | else%if LS == 1 58 | % Backtracking w/ cubic interpolation (no derivatives at new points) 59 | if debug 60 | fprintf('Cubic BT\n'); 61 | end 62 | t = polyinterp([0 f gtd; t f_new sqrt(-1); t_prev f_prev sqrt(-1)],doPlot); 63 | end 64 | 65 | % Adjust if change in t is too small/large 66 | 67 | if t < temp*1e-3 68 | if debug 69 | fprintf('Interpolated Value Too Small, Adjusting\n'); 70 | end 71 | t = temp*1e-3; 72 | elseif t > temp*0.6 73 | if debug 74 | fprintf('Interpolated Value Too Large, Adjusting\n'); 75 | end 76 | t = temp*0.6; 77 | end 78 | 79 | f_prev = f_new; 80 | t_prev = temp; 81 | if ~saveHessianComp && nargout == 6 82 | [f_new,g_new,H] = feval(funObj, x + t*d, varargin{:}); 83 | else 84 | [f_new,g_new] = feval(funObj, x + t*d, varargin{:}); 85 | end 86 | funEvals = funEvals+1; 87 | 88 | % Check whether step size has become too small 89 | if sum(abs(t*d)) <= tolX 90 | if debug 91 | fprintf('Backtracking Line Search Failed\n'); 92 | end 93 | t = 0; 94 | f_new = f; 95 | g_new = g; 96 | break; 97 | end 98 | end 99 | 100 | % Evaluate Hessian at new point 101 | if nargout == 6 && funEvals > 1 && saveHessianComp 102 | [f_new,g_new,H] = feval(funObj, x + t*d, varargin{:}); 103 | funEvals = funEvals+1; 104 | end 105 | 106 | x_new = x + t*d; 107 | 108 | end 109 | -------------------------------------------------------------------------------- /minFunc/WolfeLineSearch.m: -------------------------------------------------------------------------------- 1 | function [t,f_new,g_new,funEvals,H] = WolfeLineSearch(... 
2 | x,t,d,f,g,gtd,c1,c2,LS,maxLS,tolX,debug,doPlot,saveHessianComp,funObj,varargin) 3 | % 4 | % Bracketing Line Search to Satisfy Wolfe Conditions 5 | % 6 | % Inputs: 7 | % x: starting location 8 | % t: initial step size 9 | % d: descent direction 10 | % f: function value at starting location 11 | % g: gradient at starting location 12 | % gtd: directional derivative at starting location 13 | % c1: sufficient decrease parameter 14 | % c2: curvature parameter 15 | % debug: display debugging information 16 | % LS: type of interpolation 17 | % maxLS: maximum number of iterations 18 | % tolX: minimum allowable step length 19 | % doPlot: do a graphical display of interpolation 20 | % funObj: objective function 21 | % varargin: parameters of objective function 22 | % 23 | % Outputs: 24 | % t: step length 25 | % f_new: function value at x+t*d 26 | % g_new: gradient value at x+t*d 27 | % funEvals: number function evaluations performed by line search 28 | % H: Hessian at initial guess (only computed if requested 29 | 30 | % Evaluate the Objective and Gradient at the Initial Step 31 | if nargout == 5 32 | [f_new,g_new,H] = feval(funObj, x + t*d, varargin{:}); 33 | else 34 | [f_new,g_new] = feval(funObj, x + t*d, varargin{:}); 35 | end 36 | funEvals = 1; 37 | gtd_new = g_new'*d; 38 | 39 | % Bracket an Interval containing a point satisfying the 40 | % Wolfe criteria 41 | 42 | LSiter = 0; 43 | t_prev = 0; 44 | f_prev = f; 45 | g_prev = g; 46 | gtd_prev = gtd; 47 | done = 0; 48 | 49 | while LSiter < maxLS 50 | 51 | %% Bracketing Phase 52 | if ~isLegal(f_new) || ~isLegal(g_new) 53 | if 0 54 | if debug 55 | fprintf('Extrapolated into illegal region, Bisecting\n'); 56 | end 57 | t = (t + t_prev)/2; 58 | if ~saveHessianComp && nargout == 5 59 | [f_new,g_new,H] = feval(funObj, x + t*d, varargin{:}); 60 | else 61 | [f_new,g_new] = feval(funObj, x + t*d, varargin{:}); 62 | end 63 | funEvals = funEvals + 1; 64 | gtd_new = g_new'*d; 65 | LSiter = LSiter+1; 66 | continue; 67 | else 68 | if debug 69 | fprintf('Extrapolated into illegal region, switching to Armijo line-search\n'); 70 | end 71 | t = (t + t_prev)/2; 72 | % Do Armijo 73 | if nargout == 5 74 | [t,x_new,f_new,g_new,armijoFunEvals,H] = ArmijoBacktrack(... 75 | x,t,d,f,f,g,gtd,c1,max(0,min(LS-2,2)),tolX,debug,doPlot,saveHessianComp,... 76 | funObj,varargin{:}); 77 | else 78 | [t,x_new,f_new,g_new,armijoFunEvals] = ArmijoBacktrack(... 79 | x,t,d,f,f,g,gtd,c1,max(0,min(LS-2,2)),tolX,debug,doPlot,saveHessianComp,... 
80 | funObj,varargin{:}); 81 | end 82 | funEvals = funEvals + armijoFunEvals; 83 | return; 84 | end 85 | end 86 | 87 | 88 | if f_new > f + c1*t*gtd || (LSiter > 1 && f_new >= f_prev) 89 | bracket = [t_prev t]; 90 | bracketFval = [f_prev f_new]; 91 | bracketGval = [g_prev g_new]; 92 | break; 93 | elseif abs(gtd_new) <= -c2*gtd 94 | bracket = t; 95 | bracketFval = f_new; 96 | bracketGval = g_new; 97 | done = 1; 98 | break; 99 | elseif gtd_new >= 0 100 | bracket = [t_prev t]; 101 | bracketFval = [f_prev f_new]; 102 | bracketGval = [g_prev g_new]; 103 | break; 104 | end 105 | temp = t_prev; 106 | t_prev = t; 107 | minStep = t + 0.01*(t-temp); 108 | maxStep = t*10; 109 | if LS == 3 110 | if debug 111 | fprintf('Extending Braket\n'); 112 | end 113 | t = maxStep; 114 | elseif LS ==4 115 | if debug 116 | fprintf('Cubic Extrapolation\n'); 117 | end 118 | t = polyinterp([temp f_prev gtd_prev; t f_new gtd_new],doPlot,minStep,maxStep); 119 | else 120 | t = mixedExtrap(temp,f_prev,gtd_prev,t,f_new,gtd_new,minStep,maxStep,debug,doPlot); 121 | end 122 | 123 | f_prev = f_new; 124 | g_prev = g_new; 125 | gtd_prev = gtd_new; 126 | if ~saveHessianComp && nargout == 5 127 | [f_new,g_new,H] = feval(funObj, x + t*d, varargin{:}); 128 | else 129 | [f_new,g_new] = feval(funObj, x + t*d, varargin{:}); 130 | end 131 | funEvals = funEvals + 1; 132 | gtd_new = g_new'*d; 133 | LSiter = LSiter+1; 134 | end 135 | 136 | if LSiter == maxLS 137 | bracket = [0 t]; 138 | bracketFval = [f f_new]; 139 | bracketGval = [g g_new]; 140 | end 141 | 142 | %% Zoom Phase 143 | 144 | % We now either have a point satisfying the criteria, or a bracket 145 | % surrounding a point satisfying the criteria 146 | % Refine the bracket until we find a point satisfying the criteria 147 | insufProgress = 0; 148 | Tpos = 2; 149 | LOposRemoved = 0; 150 | while ~done && LSiter < maxLS 151 | 152 | % Find High and Low Points in bracket 153 | [f_LO LOpos] = min(bracketFval); 154 | HIpos = -LOpos + 3; 155 | 156 | % Compute new trial value 157 | if LS == 3 || ~isLegal(bracketFval) || ~isLegal(bracketGval) 158 | if debug 159 | fprintf('Bisecting\n'); 160 | end 161 | t = mean(bracket); 162 | elseif LS == 4 163 | if debug 164 | fprintf('Grad-Cubic Interpolation\n'); 165 | end 166 | t = polyinterp([bracket(1) bracketFval(1) bracketGval(:,1)'*d 167 | bracket(2) bracketFval(2) bracketGval(:,2)'*d],doPlot); 168 | else 169 | % Mixed Case %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 170 | nonTpos = -Tpos+3; 171 | if LOposRemoved == 0 172 | oldLOval = bracket(nonTpos); 173 | oldLOFval = bracketFval(nonTpos); 174 | oldLOGval = bracketGval(:,nonTpos); 175 | end 176 | t = mixedInterp(bracket,bracketFval,bracketGval,d,Tpos,oldLOval,oldLOFval,oldLOGval,debug,doPlot); 177 | end 178 | 179 | 180 | % Test that we are making sufficient progress 181 | if min(max(bracket)-t,t-min(bracket))/(max(bracket)-min(bracket)) < 0.1 182 | if debug 183 | fprintf('Interpolation close to boundary'); 184 | end 185 | if insufProgress || t>=max(bracket) || t <= min(bracket) 186 | if debug 187 | fprintf(', Evaluating at 0.1 away from boundary\n'); 188 | end 189 | if abs(t-max(bracket)) < abs(t-min(bracket)) 190 | t = max(bracket)-0.1*(max(bracket)-min(bracket)); 191 | else 192 | t = min(bracket)+0.1*(max(bracket)-min(bracket)); 193 | end 194 | insufProgress = 0; 195 | else 196 | if debug 197 | fprintf('\n'); 198 | end 199 | insufProgress = 1; 200 | end 201 | else 202 | insufProgress = 0; 203 | end 204 | 205 | % Evaluate new point 206 | if ~saveHessianComp && nargout == 5 207 | [f_new,g_new,H] = 
feval(funObj, x + t*d, varargin{:}); 208 | else 209 | [f_new,g_new] = feval(funObj, x + t*d, varargin{:}); 210 | end 211 | funEvals = funEvals + 1; 212 | gtd_new = g_new'*d; 213 | LSiter = LSiter+1; 214 | 215 | if f_new > f + c1*t*gtd || f_new >= f_LO 216 | % Armijo condition not satisfied or not lower than lowest 217 | % point 218 | bracket(HIpos) = t; 219 | bracketFval(HIpos) = f_new; 220 | bracketGval(:,HIpos) = g_new; 221 | Tpos = HIpos; 222 | else 223 | if abs(gtd_new) <= - c2*gtd 224 | % Wolfe conditions satisfied 225 | done = 1; 226 | elseif gtd_new*(bracket(HIpos)-bracket(LOpos)) >= 0 227 | % Old HI becomes new LO 228 | bracket(HIpos) = bracket(LOpos); 229 | bracketFval(HIpos) = bracketFval(LOpos); 230 | bracketGval(:,HIpos) = bracketGval(:,LOpos); 231 | if LS == 5 232 | if debug 233 | fprintf('LO Pos is being removed!\n'); 234 | end 235 | LOposRemoved = 1; 236 | oldLOval = bracket(LOpos); 237 | oldLOFval = bracketFval(LOpos); 238 | oldLOGval = bracketGval(:,LOpos); 239 | end 240 | end 241 | % New point becomes new LO 242 | bracket(LOpos) = t; 243 | bracketFval(LOpos) = f_new; 244 | bracketGval(:,LOpos) = g_new; 245 | Tpos = LOpos; 246 | end 247 | 248 | if ~done && abs((bracket(1)-bracket(2))*gtd_new) < tolX 249 | if debug 250 | fprintf('Line Search can not make further progress\n'); 251 | end 252 | break; 253 | end 254 | 255 | end 256 | 257 | %% 258 | if LSiter == maxLS 259 | if debug 260 | fprintf('Line Search Exceeded Maximum Line Search Iterations\n'); 261 | end 262 | end 263 | 264 | [f_LO LOpos] = min(bracketFval); 265 | t = bracket(LOpos); 266 | f_new = bracketFval(LOpos); 267 | g_new = bracketGval(:,LOpos); 268 | 269 | 270 | 271 | % Evaluate Hessian at new point 272 | if nargout == 5 && funEvals > 1 && saveHessianComp 273 | [f_new,g_new,H] = feval(funObj, x + t*d, varargin{:}); 274 | funEvals = funEvals + 1; 275 | end 276 | 277 | end 278 | 279 | 280 | %% 281 | function [t] = mixedExtrap(x0,f0,g0,x1,f1,g1,minStep,maxStep,debug,doPlot); 282 | alpha_c = polyinterp([x0 f0 g0; x1 f1 g1],doPlot,minStep,maxStep); 283 | alpha_s = polyinterp([x0 f0 g0; x1 sqrt(-1) g1],doPlot,minStep,maxStep); 284 | if alpha_c > minStep && abs(alpha_c - x1) < abs(alpha_s - x1) 285 | if debug 286 | fprintf('Cubic Extrapolation\n'); 287 | end 288 | t = alpha_c; 289 | else 290 | if debug 291 | fprintf('Secant Extrapolation\n'); 292 | end 293 | t = alpha_s; 294 | end 295 | end 296 | 297 | %% 298 | function [t] = mixedInterp(bracket,bracketFval,bracketGval,d,Tpos,oldLOval,oldLOFval,oldLOGval,debug,doPlot); 299 | 300 | % Mixed Case %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 301 | nonTpos = -Tpos+3; 302 | 303 | gtdT = bracketGval(:,Tpos)'*d; 304 | gtdNonT = bracketGval(:,nonTpos)'*d; 305 | oldLOgtd = oldLOGval'*d; 306 | if bracketFval(Tpos) > oldLOFval 307 | alpha_c = polyinterp([oldLOval oldLOFval oldLOgtd 308 | bracket(Tpos) bracketFval(Tpos) gtdT],doPlot); 309 | alpha_q = polyinterp([oldLOval oldLOFval oldLOgtd 310 | bracket(Tpos) bracketFval(Tpos) sqrt(-1)],doPlot); 311 | if abs(alpha_c - oldLOval) < abs(alpha_q - oldLOval) 312 | if debug 313 | fprintf('Cubic Interpolation\n'); 314 | end 315 | t = alpha_c; 316 | else 317 | if debug 318 | fprintf('Mixed Quad/Cubic Interpolation\n'); 319 | end 320 | t = (alpha_q + alpha_c)/2; 321 | end 322 | elseif gtdT'*oldLOgtd < 0 323 | alpha_c = polyinterp([oldLOval oldLOFval oldLOgtd 324 | bracket(Tpos) bracketFval(Tpos) gtdT],doPlot); 325 | alpha_s = polyinterp([oldLOval oldLOFval oldLOgtd 326 | bracket(Tpos) sqrt(-1) gtdT],doPlot); 327 | if abs(alpha_c - bracket(Tpos)) 
>= abs(alpha_s - bracket(Tpos))
328 |         if debug
329 |             fprintf('Cubic Interpolation\n');
330 |         end
331 |         t = alpha_c;
332 |     else
333 |         if debug
334 |             fprintf('Quad Interpolation\n');
335 |         end
336 |         t = alpha_s;
337 |     end
338 | elseif abs(gtdT) <= abs(oldLOgtd)
339 |     alpha_c = polyinterp([oldLOval oldLOFval oldLOgtd
340 |         bracket(Tpos) bracketFval(Tpos) gtdT],...
341 |         doPlot,min(bracket),max(bracket));
342 |     alpha_s = polyinterp([oldLOval sqrt(-1) oldLOgtd
343 |         bracket(Tpos) bracketFval(Tpos) gtdT],...
344 |         doPlot,min(bracket),max(bracket));
345 |     if alpha_c > min(bracket) && alpha_c < max(bracket)
346 |         if abs(alpha_c - bracket(Tpos)) < abs(alpha_s - bracket(Tpos))
347 |             if debug
348 |                 fprintf('Bounded Cubic Extrapolation\n');
349 |             end
350 |             t = alpha_c;
351 |         else
352 |             if debug
353 |                 fprintf('Bounded Secant Extrapolation\n');
354 |             end
355 |             t = alpha_s;
356 |         end
357 |     else
358 |         if debug
359 |             fprintf('Bounded Secant Extrapolation\n');
360 |         end
361 |         t = alpha_s;
362 |     end
363 | 
364 |     if bracket(Tpos) > oldLOval
365 |         t = min(bracket(Tpos) + 0.66*(bracket(nonTpos) - bracket(Tpos)),t);
366 |     else
367 |         t = max(bracket(Tpos) + 0.66*(bracket(nonTpos) - bracket(Tpos)),t);
368 |     end
369 | else
370 |     t = polyinterp([bracket(nonTpos) bracketFval(nonTpos) gtdNonT
371 |         bracket(Tpos) bracketFval(Tpos) gtdT],doPlot);
372 | end
373 | end
--------------------------------------------------------------------------------
/minFunc/autoGrad.m:
--------------------------------------------------------------------------------
1 | function [f,g] = autoGrad(x,useComplex,funObj,varargin)
2 | % [f,g] = autoGrad(x,useComplex,funObj,varargin)
3 | %
4 | % Numerically compute gradient of objective function from function values
5 | 
6 | p = length(x);
7 | mu = 1e-150;
8 | 
9 | if useComplex % Use Complex Differentials
10 |     diff = zeros(p,1);
11 |     for j = 1:p
12 |         e_j = zeros(p,1);
13 |         e_j(j) = 1;
14 |         diff(j,1) = funObj(x + mu*i*e_j,varargin{:});
15 |     end
16 | 
17 |     f = mean(real(diff));
18 |     g = imag(diff)/mu;
19 | else % Use Finite Differencing
20 |     f = funObj(x,varargin{:});
21 |     mu = 2*sqrt(1e-12)*(1+norm(x))/norm(p);
22 |     for j = 1:p
23 |         e_j = zeros(p,1);
24 |         e_j(j) = 1;
25 |         diff(j,1) = funObj(x + mu*e_j,varargin{:});
26 |     end
27 |     g = (diff-f)/mu;
28 | end
29 | 
30 | if 0 % DEBUG CODE
31 |     [fReal gReal] = funObj(x,varargin{:});
32 |     [fReal f]
33 |     [gReal g]
34 |     pause;
35 | end
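The complex-step branch above reaches near machine-precision gradients: for an analytic objective, f(x + i*mu*e_j) = f(x) + i*mu*g_j + O(mu^2), so imag(f(x + i*mu*e_j))/mu isolates g_j with no subtractive cancellation, which is why mu can be as tiny as 1e-150. A quick check (an illustrative sketch; it assumes the objective code accepts complex inputs, as this one does):

    % Verify autoGrad's complex-step gradient against a hand-derived one
    funObj = @(x) sum(exp(x).*sin(x));           % smooth analytic test objective
    x0 = [0.3; -1.2; 2.0];
    [f,g] = autoGrad(x0, 1, funObj);             % useComplex = 1
    gExact = exp(x0).*(sin(x0) + cos(x0));       % d/dx e^x sin(x) = e^x (sin(x) + cos(x))
    fprintf('max gradient error: %g\n', max(abs(g - gExact)));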
--------------------------------------------------------------------------------
/minFunc/autoHess.m:
--------------------------------------------------------------------------------
1 | function [f,g,H] = autoHess(x,useComplex,funObj,varargin)
2 | % Numerically compute Hessian of objective function from gradient values
3 | 
4 | p = length(x);
5 | 
6 | if useComplex % Use Complex Differentials
7 |     mu = 1e-150;
8 |     diff = zeros(p);
9 |     for j = 1:p
10 |         e_j = zeros(p,1);
11 |         e_j(j) = 1;
12 |         [f(j) diff(:,j)] = funObj(x + mu*i*e_j,varargin{:});
13 |     end
14 |     f = mean(real(f));
15 |     g = mean(real(diff),2);
16 |     H = imag(diff)/mu;
17 | else % Use finite differencing
18 |     mu = 2*sqrt(1e-12)*(1+norm(x))/norm(p);
19 |     [f,g] = funObj(x,varargin{:});
20 |     diff = zeros(p);
21 |     for j = 1:p
22 |         e_j = zeros(p,1);
23 |         e_j(j) = 1;
24 |         [f diff(:,j)] = funObj(x + mu*e_j,varargin{:});
25 |     end
26 |     H = (diff-repmat(g,[1 p]))/mu;
27 | end
28 | 
29 | % Make sure H is symmetric
30 | H = (H+H')/2;
31 | 
32 | if 0 % DEBUG CODE
33 |     [fReal gReal HReal] = funObj(x,varargin{:});
34 |     [fReal f]
35 |     [gReal g]
36 |     [HReal H]
37 |     pause;
38 | end
--------------------------------------------------------------------------------
/minFunc/autoHv.m:
--------------------------------------------------------------------------------
1 | function [Hv] = autoHv(v,x,g,useComplex,funObj,varargin)
2 | % Numerically compute Hessian-vector product H*v of funObj(x,varargin{:})
3 | % based on gradient values
4 | 
5 | if useComplex
6 |     mu = 1e-150i;
7 | else
8 |     mu = 2*sqrt(1e-12)*(1+norm(x))/norm(v);
9 | end
10 | [f,finDif] = funObj(x + v*mu,varargin{:});
11 | Hv = (finDif-g)/mu;
--------------------------------------------------------------------------------
/minFunc/autoTensor.m:
--------------------------------------------------------------------------------
1 | function [f,g,H,T] = autoTensor(x,useComplex,funObj,varargin)
2 | % [f,g,H,T] = autoTensor(x,useComplex,funObj,varargin)
3 | % Numerically compute Tensor of 3rd-derivatives of objective function from Hessian values
4 | 
5 | p = length(x);
6 | 
7 | if useComplex % Use Complex Differentials
8 |     mu = 1e-150;
9 |     diff = zeros(p,p,p);
10 |     for j = 1:p
11 |         e_j = zeros(p,1);
12 |         e_j(j) = 1;
13 |         [f(j) g(:,j) diff(:,:,j)] = funObj(x + mu*i*e_j,varargin{:});
14 |     end
15 |     f = mean(real(f));
16 |     g = mean(real(g),2);
17 |     H = mean(real(diff),3);
18 |     T = imag(diff)/mu;
19 | else % Use finite differencing
20 |     mu = 2*sqrt(1e-12)*(1+norm(x))/norm(p);
21 |     [f,g,H] = funObj(x,varargin{:});
22 |     diff = zeros(p,p,p);
23 |     for j = 1:p
24 |         e_j = zeros(p,1);
25 |         e_j(j) = 1;
26 |         [junk1 junk2 diff(:,:,j)] = funObj(x + mu*e_j,varargin{:});
27 |     end
28 |     T = (diff-repmat(H,[1 1 p]))/mu;
29 | end
--------------------------------------------------------------------------------
/minFunc/callOutput.m:
--------------------------------------------------------------------------------
1 | function [] = callOutput(outputFcn,x,state,i,funEvals,f,t,gtd,g,d,opt,varargin)
2 | 
3 | optimValues.iteration = i;
4 | optimValues.funccount = funEvals;
5 | optimValues.fval = f;
6 | optimValues.stepsize = t;
7 | optimValues.directionalderivative = gtd;
8 | optimValues.gradient = g;
9 | optimValues.searchdirection = d;
10 | optimValues.firstorderopt = opt;
11 | 
12 | feval(outputFcn, x,optimValues,state,varargin{:});
--------------------------------------------------------------------------------
/minFunc/conjGrad.m:
--------------------------------------------------------------------------------
1 | function [x,k,res,negCurv] = cg(A,b,optTol,maxIter,verbose,precFunc,precArgs,matrixVectFunc,matrixVectArgs)
2 | % [x,k,res,negCurv] =
3 | % cg(A,b,optTol,maxIter,verbose,precFunc,precArgs,matrixVectFunc,matrixVect
4 | % Args)
5 | % Linear Conjugate Gradient, where optionally we use
6 | % - preconditioner on vector v with precFunc(v,precArgs{:})
7 | % - matrix multiplied by vector with matrixVectFunc(v,matrixVectArgs{:})
8 | 
9 | x = zeros(size(b));
10 | r = -b;
11 | 
12 | % Apply preconditioner (if supplied)
13 | if nargin >= 7 && ~isempty(precFunc)
14 |     y = precFunc(r,precArgs{:});
15 | else
16 |     y = r;
17 | end
18 | 
19 | ry = r'*y;
20 | p = -y;
21 | k = 0;
22 | 
23 | res = norm(r);
24 | done = 0;
25 | negCurv = [];
26 | while res > optTol & k < maxIter & ~done
27 |     % Compute Matrix-vector product
28 |     if nargin >= 9
29 |         Ap = matrixVectFunc(p,matrixVectArgs{:});
30 |     else
31 |         Ap = A*p;
32 |     end
33 |     pAp = p'*Ap;
34 | 
35 |     % Check for negative Curvature
36 |     if pAp <= 1e-16
37 |         if verbose
38 |             fprintf('Negative Curvature Detected!\n');
39 |         end
40 | 
41 |         if nargout == 4
42 |             if pAp < 0
43 |                 negCurv = p;
44 |                 return
45 |             end
46 |         end
47 | 
48 |         if k == 0
49 |             if verbose
50 |                 fprintf('First-Iter, Proceeding...\n');
51 |             end
52 |             done = 1;
53 |         else
54 |             if verbose
55 |                 fprintf('Stopping\n');
56 |             end
57 |             break;
58 |         end
59 |     end
60 | 
61 |     % Conjugate Gradient
62 |     alpha = ry/(pAp);
63 |     x = x + alpha*p;
64 |     r = r + alpha*Ap;
65 | 
66 |     % If supplied, apply preconditioner
67 |     if nargin >= 7 && ~isempty(precFunc)
68 |         y = precFunc(r,precArgs{:});
69 |     else
70 |         y = r;
71 |     end
72 | 
73 |     ry_new = r'*y;
74 |     beta =
ry_new/ry; 75 | p = -y + beta*p; 76 | k = k + 1; 77 | 78 | % Update variables 79 | ry = ry_new; 80 | res = norm(r); 81 | end 82 | end 83 | -------------------------------------------------------------------------------- /minFunc/dampedUpdate.m: -------------------------------------------------------------------------------- 1 | function [old_dirs,old_stps,Hdiag,Bcompact] = lbfgsUpdate(y,s,corrections,debug,old_dirs,old_stps,Hdiag) 2 | 3 | %B0 = eye(length(y))/Hdiag; 4 | S = old_dirs(:,2:end); 5 | Y = old_stps(:,2:end); 6 | k = size(Y,2); 7 | L = zeros(k); 8 | for j = 1:k 9 | for i = j+1:k 10 | L(i,j) = S(:,i)'*Y(:,j); 11 | end 12 | end 13 | D = diag(diag(S'*Y)); 14 | N = [S/Hdiag Y]; 15 | M = [S'*S/Hdiag L;L' -D]; 16 | 17 | ys = y'*s; 18 | Bs = s/Hdiag - N*(M\(N'*s)); % Product B*s 19 | sBs = s'*Bs; 20 | 21 | eta = .02; 22 | if ys < eta*sBs 23 | if debug 24 | fprintf('Damped Update\n'); 25 | end 26 | theta = min(max(0,((1-eta)*sBs)/(sBs - ys)),1); 27 | y = theta*y + (1-theta)*Bs; 28 | end 29 | 30 | 31 | numCorrections = size(old_dirs,2); 32 | if numCorrections < corrections 33 | % Full Update 34 | old_dirs(:,numCorrections+1) = s; 35 | old_stps(:,numCorrections+1) = y; 36 | else 37 | % Limited-Memory Update 38 | old_dirs = [old_dirs(:,2:corrections) s]; 39 | old_stps = [old_stps(:,2:corrections) y]; 40 | end 41 | 42 | % Update scale of initial Hessian approximation 43 | Hdiag = (y'*s)/(y'*y); -------------------------------------------------------------------------------- /minFunc/example_minFunc.m: -------------------------------------------------------------------------------- 1 | % Runs various limited-memory solvers on 2D rosenbrock function for 25 2 | % function evaluations 3 | maxFunEvals = 25; 4 | 5 | fprintf('Result after %d evaluations of limited-memory solvers on 2D rosenbrock:\n',maxFunEvals); 6 | 7 | fprintf('---------------------------------------\n'); 8 | fprintf('x1 = %.4f, x2 = %.4f (starting point)\n',0,0); 9 | fprintf('x1 = %.4f, x2 = %.4f (optimal solution)\n',1,1); 10 | fprintf('---------------------------------------\n'); 11 | 12 | if exist('minimize') == 2 13 | % Minimize.m - conjugate gradient method 14 | x = minimize([0 0]', 'rosenbrock', -maxFunEvals); 15 | fprintf('x1 = %.4f, x2 = %.4f (minimize.m by C. 
Rasmussen)\n',x(1),x(2)); 16 | end 17 | 18 | options = []; 19 | options.display = 'none'; 20 | options.maxFunEvals = maxFunEvals; 21 | 22 | % Steepest Descent 23 | options.Method = 'sd'; 24 | x = minFunc(@rosenbrock,[0 0]',options); 25 | fprintf('x1 = %.4f, x2 = %.4f (minFunc with steepest descent)\n',x(1),x(2)); 26 | 27 | % Cyclic Steepest Descent 28 | options.Method = 'csd'; 29 | x = minFunc(@rosenbrock,[0 0]',options); 30 | fprintf('x1 = %.4f, x2 = %.4f (minFunc with cyclic steepest descent)\n',x(1),x(2)); 31 | 32 | % Barzilai & Borwein 33 | options.Method = 'bb'; 34 | options.bbType = 1; 35 | x = minFunc(@rosenbrock,[0 0]',options); 36 | fprintf('x1 = %.4f, x2 = %.4f (minFunc with spectral gradient descent)\n',x(1),x(2)); 37 | 38 | % Hessian-Free Newton 39 | options.Method = 'newton0'; 40 | x = minFunc(@rosenbrock,[0 0]',options); 41 | fprintf('x1 = %.4f, x2 = %.4f (minFunc with Hessian-free Newton)\n',x(1),x(2)); 42 | 43 | % Hessian-Free Newton w/ L-BFGS preconditioner 44 | options.Method = 'pnewton0'; 45 | x = minFunc(@rosenbrock,[0 0]',options); 46 | fprintf('x1 = %.4f, x2 = %.4f (minFunc with preconditioned Hessian-free Newton)\n',x(1),x(2)); 47 | 48 | % Conjugate Gradient 49 | options.Method = 'cg'; 50 | x = minFunc(@rosenbrock,[0 0]',options); 51 | fprintf('x1 = %.4f, x2 = %.4f (minFunc with conjugate gradient)\n',x(1),x(2)); 52 | 53 | % Scaled conjugate Gradient 54 | options.Method = 'scg'; 55 | x = minFunc(@rosenbrock,[0 0]',options); 56 | fprintf('x1 = %.4f, x2 = %.4f (minFunc with scaled conjugate gradient)\n',x(1),x(2)); 57 | 58 | % Preconditioned Conjugate Gradient 59 | options.Method = 'pcg'; 60 | x = minFunc(@rosenbrock,[0 0]',options); 61 | fprintf('x1 = %.4f, x2 = %.4f (minFunc with preconditioned conjugate gradient)\n',x(1),x(2)); 62 | 63 | % Default: L-BFGS (default) 64 | options.Method = 'lbfgs'; 65 | x = minFunc(@rosenbrock,[0 0]',options); 66 | fprintf('x1 = %.4f, x2 = %.4f (minFunc with limited-memory BFGS - default)\n',x(1),x(2)); 67 | 68 | fprintf('---------------------------------------\n'); 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /minFunc/example_minFunc_LR.m: -------------------------------------------------------------------------------- 1 | clear all 2 | 3 | nInst = 500; 4 | nVars = 100; 5 | X = [ones(nInst,1) randn(nInst,nVars-1)]; 6 | w = randn(nVars,1); 7 | y = sign(X*w); 8 | flipInd = rand(nInst,1) > .9; 9 | y(flipInd) = -y(flipInd); 10 | 11 | w_init = zeros(nVars,1); 12 | funObj = @(w)LogisticLoss(w,X,y); 13 | 14 | fprintf('Running Hessian-Free Newton w/ numerical Hessian-Vector products\n'); 15 | options.Method = 'newton0'; 16 | minFunc(@LogisticLoss,w_init,options,X,y); 17 | pause; 18 | 19 | fprintf('Running Preconditioned Hessian-Free Newton w/ numerical Hessian-Vector products (Diagonal preconditioner)\n'); 20 | options.Method = 'pnewton0'; 21 | options.precFunc = @LogisticDiagPrecond; 22 | minFunc(@LogisticLoss,w_init,options,X,y); 23 | pause; 24 | 25 | fprintf('Running Preconditioned Hessian-Free Newton w/ numerical Hessian-Vector products (L-BFGS preconditioner)\n'); 26 | options.Method = 'pnewton0'; 27 | options.precFunc = []; 28 | minFunc(@LogisticLoss,w_init,options,X,y); 29 | pause; 30 | 31 | fprintf('Running Hessian-Free Newton w/ analytic Hessian-Vector products\n'); 32 | options.Method = 'newton0'; 33 | options.HvFunc = @LogisticHv; 34 | minFunc(@LogisticLoss,w_init,options,X,y); 35 | pause; 36 | 37 | fprintf('Running Preconditioned Hessian-Free Newton w/ 
analytic Hessian-Vector products (Diagonal preconditioner)\n'); 38 | options.Method = 'pnewton0'; 39 | options.HvFunc = @LogisticHv; 40 | options.precFunc = @LogisticDiagPrecond; 41 | minFunc(@LogisticLoss,w_init,options,X,y); 42 | pause; 43 | 44 | fprintf('Running Preconditioned Hessian-Free Newton w/ analytic Hessian-Vector products (L-BFGS preconditioner)\n'); 45 | options.Method = 'pnewton0'; 46 | options.precFunc = []; 47 | options.HvFunc = @LogisticHv; 48 | minFunc(@LogisticLoss,w_init,options,X,y); 49 | pause; -------------------------------------------------------------------------------- /minFunc/isLegal.m: -------------------------------------------------------------------------------- 1 | function [legal] = isLegal(v) 2 | legal = sum(any(imag(v(:))))==0 & sum(isnan(v(:)))==0 & sum(isinf(v(:)))==0; -------------------------------------------------------------------------------- /minFunc/lbfgs.m: -------------------------------------------------------------------------------- 1 | function [d] = lbfgs(g,s,y,Hdiag) 2 | % BFGS Search Direction 3 | % 4 | % This function returns the (L-BFGS) approximate inverse Hessian, 5 | % multiplied by the gradient 6 | % 7 | % If you pass in all previous directions/sizes, it will be the same as full BFGS 8 | % If you truncate to the k most recent directions/sizes, it will be L-BFGS 9 | % 10 | % s - previous search directions (p by k) 11 | % y - previous step sizes (p by k) 12 | % g - gradient (p by 1) 13 | % Hdiag - value of initial Hessian diagonal elements (scalar) 14 | 15 | [p,k] = size(s); 16 | 17 | for i = 1:k 18 | ro(i,1) = 1/(y(:,i)'*s(:,i)); 19 | end 20 | 21 | q = zeros(p,k+1); 22 | r = zeros(p,k+1); 23 | al =zeros(k,1); 24 | be =zeros(k,1); 25 | 26 | q(:,k+1) = g; 27 | 28 | for i = k:-1:1 29 | al(i) = ro(i)*s(:,i)'*q(:,i+1); 30 | q(:,i) = q(:,i+1)-al(i)*y(:,i); 31 | end 32 | 33 | % Multiply by Initial Hessian 34 | r(:,1) = Hdiag*q(:,1); 35 | 36 | for i = 1:k 37 | be(i) = ro(i)*y(:,i)'*r(:,i); 38 | r(:,i+1) = r(:,i) + s(:,i)*(al(i)-be(i)); 39 | end 40 | d=r(:,k+1); -------------------------------------------------------------------------------- /minFunc/lbfgsC.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "mex.h" 3 | 4 | /* See lbfgs.m for details! */ 5 | /* This function may not exit gracefully on bad input! 
*/
6 | 
7 | 
8 | void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
9 | {
10 |     /* Variable Declarations */
11 | 
12 |     double *s, *y, *g, *H, *d, *ro, *alpha, *beta, *q, *r;
13 |     int nVars,nSteps,lhs_dims[2];
14 |     double temp;
15 |     int i,j;
16 | 
17 |     /* Get Input Pointers */
18 | 
19 |     g = mxGetPr(prhs[0]);
20 |     s = mxGetPr(prhs[1]);
21 |     y = mxGetPr(prhs[2]);
22 |     H = mxGetPr(prhs[3]);
23 | 
24 |     /* Compute number of variables (p), rank of update (d) */
25 | 
26 |     nVars = mxGetDimensions(prhs[1])[0];
27 |     nSteps = mxGetDimensions(prhs[1])[1];
28 | 
29 |     /* Allocated Memory for Function Variables */
30 |     ro = mxCalloc(nSteps,sizeof(double));
31 |     alpha = mxCalloc(nSteps,sizeof(double));
32 |     beta = mxCalloc(nSteps,sizeof(double));
33 |     q = mxCalloc(nVars*(nSteps+1),sizeof(double));
34 |     r = mxCalloc(nVars*(nSteps+1),sizeof(double));
35 | 
36 |     /* Set-up Output Vector */
37 | 
38 |     lhs_dims[0] = nVars;
39 |     lhs_dims[1] = 1;
40 | 
41 |     plhs[0] = mxCreateNumericArray(2,lhs_dims,mxDOUBLE_CLASS,mxREAL);
42 |     d = mxGetPr(plhs[0]);
43 | 
44 |     /* ro = 1/(y(:,i)'*s(:,i)) */
45 |     for(i=0;i=0;i--)
62 |     {
63 |         /* alpha(i) = ro(i)*s(:,i)'*q(:,i+1) */
64 |         alpha[i] = 0;
65 |         for(j=0;j
--------------------------------------------------------------------------------
/minFunc/lbfgsUpdate.m:
--------------------------------------------------------------------------------
1 | function [old_dirs,old_stps,Hdiag] = lbfgsUpdate(y,s,corrections,debug,old_dirs,old_stps,Hdiag)
2 | ys = y'*s;
3 | if ys > 1e-10
4 |     numCorrections = size(old_dirs,2);
5 |     if numCorrections < corrections
6 |         % Full Update
7 |         old_dirs(:,numCorrections+1) = s;
8 |         old_stps(:,numCorrections+1) = y;
9 |     else
10 |         % Limited-Memory Update
11 |         old_dirs = [old_dirs(:,2:corrections) s];
12 |         old_stps = [old_stps(:,2:corrections) y];
13 |     end
14 | 
15 |     % Update scale of initial Hessian approximation
16 |     Hdiag = ys/(y'*y);
17 | else
18 |     if debug
19 |         fprintf('Skipping Update\n');
20 |     end
21 | end
--------------------------------------------------------------------------------
/minFunc/logistic/LogisticDiagPrecond.m:
--------------------------------------------------------------------------------
1 | function [m] = LogisticDiagPrecond(v,w,X,y)
2 | % v(feature,1) - vector that we will apply diagonal preconditioner to
3 | % w(feature,1)
4 | % X(instance,feature)
5 | % y(instance,1)
6 | 
7 | sig = 1./(1+exp(-y.*(X*w)));
8 | 
9 | % Compute diagonals of Hessian
10 | sig = sig.*(1-sig);
11 | for i = 1:length(w)
12 |     h(i,1) = (sig.*X(:,i))'*X(:,i);
13 | end
14 | 
15 | % Apply preconditioner
16 | m = v./h;
17 | 
18 | % Exact preconditioner
19 | %H = X'*diag(sig.*(1-sig))*X;
20 | %m = H\v;
21 | 
--------------------------------------------------------------------------------
/minFunc/logistic/LogisticHv.m:
--------------------------------------------------------------------------------
1 | function [Hv] = LogisticHv(v,w,X,y)
2 | % v(feature,1) - vector that we will multiply Hessian by
3 | % w(feature,1)
4 | % X(instance,feature)
5 | % y(instance,1)
6 | 
7 | sig = 1./(1+exp(-y.*(X*w)));
8 | Hv = X.'*(sig.*(1-sig).*(X*v));
9 | 
--------------------------------------------------------------------------------
/minFunc/logistic/LogisticLoss.m:
--------------------------------------------------------------------------------
1 | function [nll,g,H,T] = LogisticLoss(w,X,y)
2 | % w(feature,1)
3 | % X(instance,feature)
4 | % y(instance,1)
5 | 
6 | [n,p] = size(X);
7 | 
8 | Xw = X*w;
9 | yXw = y.*Xw;
10 | 
11 | nll = sum(mylogsumexp([zeros(n,1) -yXw]));
12 | 
13 | if nargout > 1
14 |     if nargout > 2
15 |         sig = 1./(1+exp(-yXw));
16 |         g = -X.'*(y.*(1-sig));
17 |     else
18 |         g = -X.'*(y./(1+exp(yXw)));
19 |     end
20 | end
21 | 
22 | if nargout > 2
23 |     H = X.'*diag(sparse(sig.*(1-sig)))*X;
24 | end
25 | 
26 | if nargout > 3
27 |     T = zeros(p,p,p);
28 |     for j1 = 1:p
29 |         for j2 = 1:p
30 |             for j3 =
1:p 31 | T(j1,j2,j3) = sum(y(:).^3.*X(:,j1).*X(:,j2).*X(:,j3).*sig.*(1-sig).*(1-2*sig)); 32 | end 33 | end 34 | end 35 | end -------------------------------------------------------------------------------- /minFunc/logistic/mexutil.c: -------------------------------------------------------------------------------- 1 | #include "mexutil.h" 2 | 3 | /* Functions to create uninitialized arrays. */ 4 | 5 | mxArray *mxCreateNumericArrayE(int ndim, const int *dims, 6 | mxClassID class, mxComplexity ComplexFlag) 7 | { 8 | mxArray *a; 9 | int i, *dims1 = mxMalloc(ndim*sizeof(int)); 10 | size_t sz = 1; 11 | for(i=0;i 9 | 10 | /* repeat a block of memory rep times */ 11 | void memrep(char *dest, size_t chunk, int rep) 12 | { 13 | #if 0 14 | /* slow way */ 15 | int i; 16 | char *p = dest; 17 | for(i=1;i>1); 31 | #endif 32 | } 33 | 34 | void repmat(char *dest, const char *src, int ndim, int *destdimsize, 35 | int *dimsize, const int *dims, int *rep) 36 | { 37 | int d = ndim-1; 38 | int i, chunk; 39 | /* copy the first repetition into dest */ 40 | if(d == 0) { 41 | chunk = dimsize[0]; 42 | memcpy(dest,src,chunk); 43 | } 44 | else { 45 | /* recursively repeat each slice of src */ 46 | for(i=0;i ndimdest) ndimdest = nrep; 91 | rep = mxCalloc(ndimdest, sizeof(int)); 92 | for(i=0;i ndimdest) ndimdest = nrep; 105 | rep = mxCalloc(ndimdest, sizeof(int)); 106 | for(i=0;i ndim) memrep(dest,destdimsize[ndim-1],extra_rep); 143 | if(mxIsComplex(srcmat)) { 144 | src = (char*)mxGetPi(srcmat); 145 | dest = (char*)mxGetPi(plhs[0]); 146 | repmat(dest,src,ndim,destdimsize,dimsize,dims,rep); 147 | if(ndimdest > ndim) memrep(dest,destdimsize[ndim-1],extra_rep); 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /minFunc/logistic/repmatC.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ehosseiniasl/Nonnegativity-Constrained-Autoencoder-NCAE/219c53631d60a268ba8550796e2ae38639450861/minFunc/logistic/repmatC.dll -------------------------------------------------------------------------------- /minFunc/logistic/repmatC.mexglx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ehosseiniasl/Nonnegativity-Constrained-Autoencoder-NCAE/219c53631d60a268ba8550796e2ae38639450861/minFunc/logistic/repmatC.mexglx -------------------------------------------------------------------------------- /minFunc/logistic/repmatC.mexmac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ehosseiniasl/Nonnegativity-Constrained-Autoencoder-NCAE/219c53631d60a268ba8550796e2ae38639450861/minFunc/logistic/repmatC.mexmac -------------------------------------------------------------------------------- /minFunc/mchol.m: -------------------------------------------------------------------------------- 1 | function [l,d,perm] = mchol(A,mu) 2 | % Compute a modified LDL factorization of A 3 | % (MEX ME!) 
4 | 5 | if nargin < 2 6 | mu = 1e-12; 7 | end 8 | 9 | n = size(A,1); 10 | l = eye(n); 11 | d = zeros(n,1); 12 | perm = 1:n; 13 | 14 | for i = 1:n 15 | c(i,i) = A(i,i); 16 | end 17 | 18 | % Compute modification parameters 19 | gamma = max(abs(diag(A))); 20 | xi = max(max(abs(setdiag(A,0)))); 21 | delta = mu*max(gamma+xi,1); 22 | if n > 1 23 | beta = sqrt(max([gamma xi/sqrt(n^2-1) mu])); 24 | else 25 | beta = sqrt(max([gamma mu])); 26 | end 27 | 28 | for j = 1:n 29 | 30 | % Find q that results in Best Permutation with j 31 | [maxVal maxPos] = max(abs(diag(c(j:end,j:end)))); 32 | q = maxPos+j-1; 33 | 34 | % Permute d,c,l,a 35 | d([j q]) = d([q j]); 36 | perm([j q]) = perm([q j]); 37 | c([j q],:) = c([q j],:); 38 | c(:,[j q]) = c(:,[q j]); 39 | l([j q],:) = l([q j],:); 40 | l(:,[j q]) = l(:,[q j]); 41 | A([j q],:) = A([q j],:); 42 | A(:,[j q]) = A(:,[q j]); 43 | 44 | for s = 1:j-1 45 | l(j,s) = c(j,s)/d(s); 46 | end 47 | for i = j+1:n 48 | c(i,j) = A(i,j) - sum(l(j,1:j-1).*c(i,1:j-1)); 49 | end 50 | theta = 0; 51 | if j < n && j > 1 52 | theta = max(abs(c(j+1:n,j))); 53 | end 54 | d(j) = max([abs(c(j,j)) (theta/beta)^2 delta]); 55 | if j < n 56 | for i = j+1:n 57 | c(i,i) = c(i,i) - (c(i,j)^2)/d(j); 58 | end 59 | end 60 | end -------------------------------------------------------------------------------- /minFunc/mcholC.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "mex.h" 3 | 4 | double mymax(double x, double y) 5 | { 6 | if (x > y) 7 | return x; 8 | else 9 | return y; 10 | } 11 | 12 | double absolute(double x) 13 | { 14 | if (x >= -x) 15 | return x; 16 | else 17 | return -x; 18 | } 19 | 20 | void permuteInt(int *x, int p, int q) 21 | { 22 | int temp; 23 | temp = x[p]; 24 | x[p] = x[q]; 25 | x[q] = temp; 26 | } 27 | 28 | void permute(double *x, int p, int q) 29 | { 30 | double temp; 31 | temp = x[p]; 32 | x[p] = x[q]; 33 | x[q] = temp; 34 | } 35 | 36 | void permuteRows(double *x, int p, int q,int n) 37 | { 38 | int i; 39 | double temp; 40 | for(i = 0; i < n; i++) 41 | { 42 | temp = x[p+i*n]; 43 | x[p+i*n] = x[q+i*n]; 44 | x[q+i*n] = temp; 45 | } 46 | } 47 | 48 | void permuteCols(double *x, int p, int q,int n) 49 | { 50 | int i; 51 | double temp; 52 | for(i = 0; i < n; i++) 53 | { 54 | temp = x[i+p*n]; 55 | x[i+p*n] = x[i+q*n]; 56 | x[i+q*n] = temp; 57 | } 58 | } 59 | 60 | void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) 61 | { 62 | int n,sizL[2],sizD[2],i,j,q,s, 63 | *P; 64 | 65 | double mu,gamma,xi,delta,beta,maxVal,theta, 66 | *c, *H, *L, *D, *A; 67 | 68 | /* Input */ 69 | H = mxGetPr(prhs[0]); 70 | if (nrhs == 1) 71 | { 72 | mu = 1e-12; 73 | } 74 | else 75 | { 76 | mu = mxGetScalar(prhs[1]); 77 | } 78 | 79 | /* Compute Sizes */ 80 | n = mxGetDimensions(prhs[0])[0]; 81 | 82 | /* Form Output */ 83 | sizL[0] = n; 84 | sizL[1] = n; 85 | plhs[0] = mxCreateNumericArray(2,sizL,mxDOUBLE_CLASS,mxREAL); 86 | L = mxGetPr(plhs[0]); 87 | sizD[0] = n; 88 | sizD[1] = 1; 89 | plhs[1] = mxCreateNumericArray(2,sizD,mxDOUBLE_CLASS,mxREAL); 90 | D = mxGetPr(plhs[1]); 91 | plhs[2] = mxCreateNumericArray(2,sizD,mxINT32_CLASS,mxREAL); 92 | P = (int*)mxGetData(plhs[2]); 93 | 94 | /* Initialize */ 95 | c = mxCalloc(n*n,sizeof(double)); 96 | A = mxCalloc(n*n,sizeof(double)); 97 | 98 | for (i = 0; i < n; i++) 99 | { 100 | P[i] = i; 101 | for (j = 0;j < n; j++) 102 | { 103 | A[i+n*j] = H[i+n*j]; 104 | } 105 | } 106 | 107 | gamma = 0; 108 | for (i = 0; i < n; i++) 109 | { 110 | L[i+n*i] = 1; 111 | c[i+n*i] = A[i+n*i]; 112 | } 
113 | 114 | /* Compute modification parameters */ 115 | gamma = -1; 116 | xi = -1; 117 | for (i = 0; i < n; i++) 118 | { 119 | gamma = mymax(gamma,absolute(A[i+n*i])); 120 | for (j = 0;j < n; j++) 121 | { 122 | //printf("A(%d,%d) = %f, %f\n",i,j,A[i+n*j],absolute(A[i+n*j])); 123 | if (i != j) 124 | xi = mymax(xi,absolute(A[i+n*j])); 125 | } 126 | } 127 | delta = mu*mymax(gamma+xi,1); 128 | 129 | if (n > 1) 130 | { 131 | beta = sqrt(mymax(gamma,mymax(mu,xi/sqrt(n*n-1)))); 132 | } 133 | else 134 | { 135 | beta = sqrt(mymax(gamma,mu)); 136 | } 137 | 138 | for (j = 0; j < n; j++) 139 | { 140 | 141 | /* Find q that results in Best Permutation with j */ 142 | maxVal = -1; 143 | q = 0; 144 | for(i = j; i < n; i++) 145 | { 146 | if (absolute(c[i+n*i]) > maxVal) 147 | { 148 | maxVal = mymax(maxVal,absolute(c[i+n*i])); 149 | q = i; 150 | } 151 | } 152 | 153 | /* Permute D,c,L,A,P */ 154 | permute(D,j,q); 155 | permuteInt(P,j,q); 156 | permuteRows(c,j,q,n); 157 | permuteCols(c,j,q,n); 158 | permuteRows(L,j,q,n); 159 | permuteCols(L,j,q,n); 160 | permuteRows(A,j,q,n); 161 | permuteCols(A,j,q,n); 162 | 163 | for(s = 0; s <= j-1; s++) 164 | L[j+n*s] = c[j+n*s]/D[s]; 165 | 166 | for(i = j+1; i < n; i++) 167 | { 168 | c[i+j*n] = A[i+j*n]; 169 | for(s = 0; s <= j-1; s++) 170 | { 171 | c[i+j*n] -= L[j+n*s]*c[i+n*s]; 172 | } 173 | } 174 | 175 | theta = 0; 176 | if (j < n-1) 177 | { 178 | for(i = j+1;i < n; i++) 179 | theta = mymax(theta,absolute(c[i+n*j])); 180 | } 181 | 182 | D[j] = mymax(absolute(c[j+n*j]),mymax(delta,theta*theta/(beta*beta))); 183 | 184 | if (j < n-1) 185 | { 186 | for(i = j+1; i < n; i++) 187 | { 188 | c[i+n*i] = c[i+n*i] - c[i+n*j]*c[i+n*j]/D[j]; 189 | } 190 | } 191 | 192 | } 193 | 194 | for(i = 0; i < n; i++) 195 | P[i]++; 196 | 197 | mxFree(c); 198 | mxFree(A); 199 | } -------------------------------------------------------------------------------- /minFunc/mcholC.mexmaci64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ehosseiniasl/Nonnegativity-Constrained-Autoencoder-NCAE/219c53631d60a268ba8550796e2ae38639450861/minFunc/mcholC.mexmaci64 -------------------------------------------------------------------------------- /minFunc/mcholC.mexw32: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ehosseiniasl/Nonnegativity-Constrained-Autoencoder-NCAE/219c53631d60a268ba8550796e2ae38639450861/minFunc/mcholC.mexw32 -------------------------------------------------------------------------------- /minFunc/mcholC.mexw64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ehosseiniasl/Nonnegativity-Constrained-Autoencoder-NCAE/219c53631d60a268ba8550796e2ae38639450861/minFunc/mcholC.mexw64 -------------------------------------------------------------------------------- /minFunc/mcholinc.m: -------------------------------------------------------------------------------- 1 | function [R,tau] = mcholinc(H,verbose) 2 | % Computes Cholesky of H+tau*I, for suitably large tau that matrix is pd 3 | 4 | p = size(H,1); 5 | 6 | beta = norm(H,'fro'); 7 | if min(diag(H)) > 1e-12 8 | tau = 0; 9 | else 10 | if verbose 11 | fprintf('Small Value on Diagonal, Adjusting Hessian\n'); 12 | end 13 | tau = max(beta/2,1e-12); 14 | end 15 | while 1 16 | [R,posDef] = chol(H+tau*eye(p)); 17 | if posDef == 0 18 | break; 19 | else 20 | if verbose 21 | fprintf('Cholesky Failed, Adjusting Hessian\n'); 22 | end 23 | tau = 
max(2*tau,beta/2);
24 |     end
25 | end
26 | 
--------------------------------------------------------------------------------
/minFunc/minFunc.m:
--------------------------------------------------------------------------------
1 | function [x,f,exitflag,output] = minFunc(funObj,x0,options,varargin)
2 | % minFunc(funObj,x0,options,varargin)
3 | %
4 | % Unconstrained optimizer using a line search strategy
5 | %
6 | % Uses an interface very similar to fminunc
7 | % (it doesn't support all of the optimization toolbox options,
8 | % but supports many other options).
9 | %
10 | % It computes descent directions using one of ('Method'):
11 | % - 'sd': Steepest Descent
12 | %     (no previous information used, not recommended)
13 | % - 'csd': Cyclic Steepest Descent
14 | %     (uses previous step length for a fixed length cycle)
15 | % - 'bb': Barzilai and Borwein Gradient
16 | %     (uses only previous step)
17 | % - 'cg': Non-Linear Conjugate Gradient
18 | %     (uses only previous step and a vector beta)
19 | % - 'scg': Scaled Non-Linear Conjugate Gradient
20 | %     (uses previous step and a vector beta,
21 | %     and Hessian-vector products to initialize line search)
22 | % - 'pcg': Preconditioned Non-Linear Conjugate Gradient
23 | %     (uses only previous step and a vector beta, preconditioned version)
24 | % - 'lbfgs': Quasi-Newton with Limited-Memory BFGS Updating
25 | %     (default: uses a predetermined number of previous steps to form a
26 | %     low-rank Hessian approximation)
27 | % - 'newton0': Hessian-Free Newton
28 | %     (numerically computes Hessian-Vector products)
29 | % - 'pnewton0': Preconditioned Hessian-Free Newton
30 | %     (numerically computes Hessian-Vector products, preconditioned
31 | %     version)
32 | % - 'qnewton': Quasi-Newton Hessian approximation
33 | %     (uses dense Hessian approximation)
34 | % - 'mnewton': Newton's method with Hessian calculation after every
35 | %     user-specified number of iterations
36 | %     (needs user-supplied Hessian matrix)
37 | % - 'newton': Newton's method with Hessian calculation every iteration
38 | %     (needs user-supplied Hessian matrix)
39 | % - 'tensor': Tensor
40 | %     (needs user-supplied Hessian matrix and Tensor of 3rd partial derivatives)
41 | %
42 | % Several line search strategies are available for finding a step length satisfying
43 | % the termination criteria ('LS'):
44 | % - 0: Backtrack w/ Step Size Halving
45 | % - 1: Backtrack w/ Quadratic/Cubic Interpolation from new function values
46 | % - 2: Backtrack w/ Cubic Interpolation from new function + gradient
47 | %     values (default for 'bb' and 'sd')
48 | % - 3: Bracketing w/ Step Size Doubling and Bisection
49 | % - 4: Bracketing w/ Cubic Interpolation/Extrapolation with function +
50 | %     gradient values (default for all except 'bb' and 'sd')
51 | % - 5: Bracketing w/ Mixed Quadratic/Cubic Interpolation/Extrapolation
52 | % - 6: Use Matlab Optimization Toolbox's line search
53 | %     (requires Matlab's linesearch.m to be added to the path)
54 | %
55 | % Above, the first three find a point satisfying the Armijo conditions,
56 | % while the last four search for a point satisfying the Wolfe
57 | % conditions. If the objective function overflows, it is recommended
58 | % to use one of the first 3.
59 | % The first three can be used to perform a non-monotone
60 | % linesearch by changing the option 'Fref'.
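%
% Example usage (a minimal sketch mirroring example_minFunc.m; rosenbrock.m
% ships in this folder):
%   options = [];
%   options.Method = 'lbfgs';
%   options.MaxIter = 100;
%   options.Display = 'final';
%   x = minFunc(@rosenbrock,[0 0]',options);   % should approach [1;1]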
61 | % 62 | % Several strategies for choosing the initial step size are avaiable ('LS_init'): 63 | % - 0: Always try an initial step length of 1 (default for all except 'cg' and 'sd') 64 | % (t = 1) 65 | % - 1: Use a step similar to the previous step (default for 'cg' and 'sd') 66 | % (t = t_old*min(2,g'd/g_old'd_old)) 67 | % - 2: Quadratic Initialization using previous function value and new 68 | % function value/gradient (use this if steps tend to be very long) 69 | % (t = min(1,2*(f-f_old)/g)) 70 | % - 3: The minimum between 1 and twice the previous step length 71 | % (t = min(1,2*t) 72 | % - 4: The scaled conjugate gradient step length (may accelerate 73 | % conjugate gradient methods, but requires a Hessian-vector product) 74 | % (t = g'd/d'Hd) 75 | % 76 | % Inputs: 77 | % funObj is a function handle 78 | % x0 is a starting vector; 79 | % options is a struct containing parameters 80 | % (defaults are used for non-existent or blank fields) 81 | % all other arguments are passed to funObj 82 | % 83 | % Outputs: 84 | % x is the minimum value found 85 | % f is the function value at the minimum found 86 | % exitflag returns an exit condition 87 | % output returns a structure with other information 88 | % 89 | % Supported Input Options 90 | % Display - Level of display [ off | final | (iter) | full | excessive ] 91 | % MaxFunEvals - Maximum number of function evaluations allowed (1000) 92 | % MaxIter - Maximum number of iterations allowed (500) 93 | % TolFun - Termination tolerance on the first-order optimality (1e-5) 94 | % TolX - Termination tolerance on progress in terms of function/parameter changes (1e-9) 95 | % Method - [ sd | csd | bb | cg | scg | pcg | {lbfgs} | newton0 | pnewton0 | 96 | % qnewton | mnewton | newton | tensor ] 97 | % c1 - Sufficient Decrease for Armijo condition (1e-4) 98 | % c2 - Curvature Decrease for Wolfe conditions (.2 for cg methods, .9 otherwise) 99 | % LS_init - Line Search Initialization -see above (2 for cg/sd, 4 for scg, 0 otherwise) 100 | % LS - Line Search type -see above (2 for bb, 4 otherwise) 101 | % Fref - Setting this to a positive integer greater than 1 102 | % will use non-monotone Armijo objective in the line search. 103 | % (20 for bb, 10 for csd, 1 for all others) 104 | % numDiff - compute derivative numerically 105 | % (default: 0) (this option has a different effect for 'newton', see below) 106 | % useComplex - if 1, use complex differentials when computing numerical derivatives 107 | % to get very accurate values (default: 0) 108 | % DerivativeCheck - if 'on', computes derivatives numerically at initial 109 | % point and compares to user-supplied derivative (default: 'off') 110 | % outputFcn - function to run after each iteration (default: []). It 111 | % should have the following interface: 112 | % outputFcn(x,infoStruct,state,varargin{:}) 113 | % useMex - where applicable, use mex files to speed things up (default: 1) 114 | % 115 | % Method-specific input options: 116 | % newton: 117 | % HessianModify - type of Hessian modification for direct solvers to 118 | % use if the Hessian is not positive definite (default: 0) 119 | % 0: Minimum Euclidean norm s.t. 
eigenvalues sufficiently large 120 | % (requires eigenvalues on iterations where matrix is not pd) 121 | % 1: Start with (1/2)*||A||_F and increment until Cholesky succeeds 122 | % (an approximation to method 0, does not require eigenvalues) 123 | % 2: Modified LDL factorization 124 | % (only 1 generalized Cholesky factorization done and no eigenvalues required) 125 | % 3: Modified Spectral Decomposition 126 | % (requires eigenvalues) 127 | % 4: Modified Symmetric Indefinite Factorization 128 | % 5: Uses the eigenvector of the smallest eigenvalue as negative 129 | % curvature direction 130 | % cgSolve - use conjugate gradient instead of direct solver (default: 0) 131 | % 0: Direct Solver 132 | % 1: Conjugate Gradient 133 | % 2: Conjugate Gradient with Diagonal Preconditioner 134 | % 3: Conjugate Gradient with LBFGS Preconditioner 135 | % x: Conjugate Graident with Symmetric Successive Over Relaxation 136 | % Preconditioner with parameter x 137 | % (where x is a real number in the range [0,2]) 138 | % x: Conjugate Gradient with Incomplete Cholesky Preconditioner 139 | % with drop tolerance -x 140 | % (where x is a real negative number) 141 | % numDiff - compute Hessian numerically 142 | % (default: 0, done with complex differentials if useComplex = 1) 143 | % LS_saveHessiancomp - when on, only computes the Hessian at the 144 | % first and last iteration of the line search (default: 1) 145 | % mnewton: 146 | % HessianIter - number of iterations to use same Hessian (default: 5) 147 | % qnewton: 148 | % initialHessType - scale initial Hessian approximation (default: 1) 149 | % qnUpdate - type of quasi-Newton update (default: 3): 150 | % 0: BFGS 151 | % 1: SR1 (when it is positive-definite, otherwise BFGS) 152 | % 2: Hoshino 153 | % 3: Self-Scaling BFGS 154 | % 4: Oren's Self-Scaling Variable Metric method 155 | % 5: McCormick-Huang asymmetric update 156 | % Damped - use damped BFGS update (default: 1) 157 | % newton0/pnewton0: 158 | % HvFunc - user-supplied function that returns Hessian-vector products 159 | % (by default, these are computed numerically using autoHv) 160 | % HvFunc should have the following interface: HvFunc(v,x,varargin{:}) 161 | % useComplex - use a complex perturbation to get high accuracy 162 | % Hessian-vector products (default: 0) 163 | % (the increased accuracy can make the method much more efficient, 164 | % but gradient code must properly support complex inputs) 165 | % useNegCurv - a negative curvature direction is used as the descent 166 | % direction if one is encountered during the cg iterations 167 | % (default: 1) 168 | % precFunc (for pnewton0 only) - user-supplied preconditioner 169 | % (by default, an L-BFGS preconditioner is used) 170 | % precFunc should have the following interfact: 171 | % precFunc(v,x,varargin{:}) 172 | % lbfgs: 173 | % Corr - number of corrections to store in memory (default: 100) 174 | % (higher numbers converge faster but use more memory) 175 | % Damped - use damped update (default: 0) 176 | % pcg: 177 | % cgUpdate - type of update (default: 2) 178 | % cg/scg/pcg: 179 | % cgUpdate - type of update (default for cg/scg: 2, default for pcg: 1) 180 | % 0: Fletcher Reeves 181 | % 1: Polak-Ribiere 182 | % 2: Hestenes-Stiefel (not supported for pcg) 183 | % 3: Gilbert-Nocedal 184 | % HvFunc (for scg only)- user-supplied function that returns Hessian-vector 185 | % products 186 | % (by default, these are computed numerically using autoHv) 187 | % HvFunc should have the following interface: 188 | % HvFunc(v,x,varargin{:}) 189 | % precFunc (for 
pcg only) - user-supplied preconditioner 190 | % (by default, an L-BFGS preconditioner is used) 191 | % precFunc should have the following interface: 192 | % precFunc(v,x,varargin{:}) 193 | % bb: 194 | % bbType - type of bb step (default: 1) 195 | % 0: min_alpha ||delta_x - alpha delta_g||_2 196 | % 1: min_alpha ||alpha delta_x - delta_g||_2 197 | % 2: Conic BB 198 | % 3: Gradient method with retards 199 | % csd: 200 | % cycle - length of cycle (default: 3) 201 | % 202 | % Supported Output Options 203 | % iterations - number of iterations taken 204 | % funcCount - number of function evaluations 205 | % algorithm - algorithm used 206 | % firstorderopt - first-order optimality 207 | % message - exit message 208 | % trace.funcCount - function evaluations after each iteration 209 | % trace.fval - function value after each iteration 210 | % 211 | % Author: Mark Schmidt (2006) 212 | % Web: http://www.cs.ubc.ca/~schmidtm 213 | % 214 | % Sources (in order of how much the source material contributes): 215 | % J. Nocedal and S.J. Wright. 1999. "Numerical Optimization". Springer Verlag. 216 | % R. Fletcher. 1987. "Practical Methods of Optimization". Wiley. 217 | % J. Demmel. 1997. "Applied Numerical Linear Algebra". SIAM. 218 | % R. Barrett, M. Berry, T. Chan, J. Demmel, J. Dongarra, V. Eijkhout, R. 219 | % Pozo, C. Romine, and H. van der Vorst. 1994. "Templates for the Solution of 220 | % Linear Systems: Building Blocks for Iterative Methods". SIAM. 221 | % J. More and D. Thuente. "Line search algorithms with guaranteed 222 | % sufficient decrease". ACM Trans. Math. Softw. vol 20, 286-307, 1994. 223 | % M. Raydan. "The Barzilai and Borwein gradient method for the large 224 | % scale unconstrained minimization problem". SIAM J. Optim., 7, 26-33, 225 | % (1997). 226 | % "Mathematical Optimization". The Computational Science Education 227 | % Project. 1995. 228 | % C. Kelley. 1999. "Iterative Methods for Optimization". Frontiers in 229 | % Applied Mathematics. SIAM. 230 | 231 | if nargin < 3 232 | options = []; 233 | end 234 | 235 | % Get Parameters 236 | [verbose,verboseI,debug,doPlot,maxFunEvals,maxIter,tolFun,tolX,method,... 237 | corrections,c1,c2,LS_init,LS,cgSolve,qnUpdate,cgUpdate,initialHessType,... 238 | HessianModify,Fref,useComplex,numDiff,LS_saveHessianComp,... 239 | DerivativeCheck,Damped,HvFunc,bbType,cycle,... 240 | HessianIter,outputFcn,useMex,useNegCurv,precFunc] = ...
241 | minFunc_processInputOptions(options); 242 | 243 | if isfield(options, 'logfile') 244 | logfile = options.logfile; 245 | else 246 | logfile = []; 247 | end 248 | 249 | % Constants 250 | SD = 0; 251 | CSD = 1; 252 | BB = 2; 253 | CG = 3; 254 | PCG = 4; 255 | LBFGS = 5; 256 | QNEWTON = 6; 257 | NEWTON0 = 7; 258 | NEWTON = 8; 259 | TENSOR = 9; 260 | 261 | % Initialize 262 | p = length(x0); 263 | d = zeros(p,1); 264 | x = x0; 265 | t = 1; 266 | 267 | % If necessary, form numerical differentiation functions 268 | funEvalMultiplier = 1; 269 | if numDiff && method ~= TENSOR 270 | varargin(3:end+2) = varargin(1:end); 271 | varargin{1} = useComplex; 272 | varargin{2} = funObj; 273 | if method ~= NEWTON 274 | if debug 275 | if useComplex 276 | fprintf('Using complex differentials for gradient computation\n'); 277 | else 278 | fprintf('Using finite differences for gradient computation\n'); 279 | end 280 | end 281 | funObj = @autoGrad; 282 | else 283 | if debug 284 | if useComplex 285 | fprintf('Using complex differentials for gradient computation\n'); 286 | else 287 | fprintf('Using finite differences for gradient computation\n'); 288 | end 289 | end 290 | funObj = @autoHess; 291 | end 292 | 293 | if method == NEWTON0 && useComplex == 1 294 | if debug 295 | fprintf('Turning off the use of complex differentials\n'); 296 | end 297 | useComplex = 0; 298 | end 299 | 300 | if useComplex 301 | funEvalMultiplier = p; 302 | else 303 | funEvalMultiplier = p+1; 304 | end 305 | end 306 | 307 | % Evaluate Initial Point 308 | if method < NEWTON 309 | [f,g] = feval(funObj, x, varargin{:}); 310 | else 311 | [f,g,H] = feval(funObj, x, varargin{:}); 312 | computeHessian = 1; 313 | end 314 | funEvals = 1; 315 | 316 | if strcmp(DerivativeCheck,'on') 317 | if numDiff 318 | fprintf('Can not do derivative checking when numDiff is 1\n'); 319 | end 320 | % Check provided gradient/hessian function using numerical derivatives 321 | fprintf('Checking Gradient:\n'); 322 | [f2,g2] = autoGrad(x,useComplex,funObj,varargin{:}); 323 | 324 | fprintf('Max difference between user and numerical gradient: %f\n',max(abs(g-g2))); 325 | if max(abs(g-g2)) > 1e-4 326 | fprintf('User NumDif:\n'); 327 | [g g2] 328 | diff = abs(g-g2) 329 | pause; 330 | end 331 | 332 | if method >= NEWTON 333 | fprintf('Check Hessian:\n'); 334 | [f2,g2,H2] = autoHess(x,useComplex,funObj,varargin{:}); 335 | 336 | fprintf('Max difference between user and numerical hessian: %f\n',max(abs(H(:)-H2(:)))); 337 | if max(abs(H(:)-H2(:))) > 1e-4 338 | H 339 | H2 340 | diff = abs(H-H2) 341 | pause; 342 | end 343 | end 344 | end 345 | 346 | % Output Log 347 | if verboseI 348 | fprintf('%10s %10s %15s %15s %15s\n','Iteration','FunEvals','Step Length','Function Val','Opt Cond'); 349 | end 350 | 351 | if logfile 352 | fid = fopen(logfile, 'a'); 353 | if (fid > 0) 354 | fprintf(fid, '-- %10s %10s %15s %15s %15s\n','Iteration','FunEvals','Step Length','Function Val','Opt Cond'); 355 | fclose(fid); 356 | end 357 | end 358 | 359 | % Output Function 360 | if ~isempty(outputFcn) 361 | callOutput(outputFcn,x,'init',0,funEvals,f,[],[],g,[],sum(abs(g)),varargin{:}); 362 | end 363 | 364 | % Initialize Trace 365 | trace.fval = f; 366 | trace.funcCount = funEvals; 367 | 368 | % Check optimality of initial point 369 | if sum(abs(g)) <= tolFun 370 | exitflag=1; 371 | msg = 'Optimality Condition below TolFun'; 372 | if verbose 373 | fprintf('%s\n',msg); 374 | end 375 | if nargout > 3 376 | output = struct('iterations',0,'funcCount',1,... 
377 | 'algorithm',method,'firstorderopt',sum(abs(g)),'message',msg,'trace',trace); 378 | end 379 | return; 380 | end 381 | 382 | % Perform up to a maximum of 'maxIter' descent steps: 383 | for i = 1:maxIter 384 | 385 | % ****************** COMPUTE DESCENT DIRECTION ***************** 386 | 387 | switch method 388 | case SD % Steepest Descent 389 | d = -g; 390 | 391 | case CSD % Cyclic Steepest Descent 392 | 393 | if mod(i,cycle) == 1 % Use Steepest Descent 394 | alpha = 1; 395 | LS_init = 2; 396 | LS = 4; % Precise Line Search 397 | elseif mod(i,cycle) == mod(1+1,cycle) % Use Previous Step 398 | alpha = t; 399 | LS_init = 0; 400 | LS = 2; % Non-monotonic line search 401 | end 402 | d = -alpha*g; 403 | 404 | case BB % Steepest Descent with Barzilai and Borwein Step Length 405 | 406 | if i == 1 407 | d = -g; 408 | else 409 | y = g-g_old; 410 | s = t*d; 411 | if bbType == 0 412 | yy = y'*y; 413 | alpha = (s'*y)/(yy); 414 | if alpha <= 1e-10 || alpha > 1e10 415 | alpha = 1; 416 | end 417 | elseif bbType == 1 418 | sy = s'*y; 419 | alpha = (s'*s)/sy; 420 | if alpha <= 1e-10 || alpha > 1e10 421 | alpha = 1; 422 | end 423 | elseif bbType == 2 % Conic Interpolation ('Modified BB') 424 | sy = s'*y; 425 | ss = s'*s; 426 | alpha = ss/sy; 427 | if alpha <= 1e-10 || alpha > 1e10 428 | alpha = 1; 429 | end 430 | alphaConic = ss/(6*(myF_old - f) + 4*g'*s + 2*g_old'*s); 431 | if alphaConic > .001*alpha && alphaConic < 1000*alpha 432 | alpha = alphaConic; 433 | end 434 | elseif bbType == 3 % Gradient Method with retards (bb type 1, random selection of previous step) 435 | sy = s'*y; 436 | alpha = (s'*s)/sy; 437 | if alpha <= 1e-10 || alpha > 1e10 438 | alpha = 1; 439 | end 440 | v(1+mod(i-2,5)) = alpha; 441 | alpha = v(ceil(rand*length(v))); 442 | end 443 | d = -alpha*g; 444 | end 445 | g_old = g; 446 | myF_old = f; 447 | 448 | 449 | case CG % Non-Linear Conjugate Gradient 450 | 451 | if i == 1 452 | d = -g; % Initially use steepest descent direction 453 | else 454 | gtgo = g'*g_old; 455 | gotgo = g_old'*g_old; 456 | 457 | if cgUpdate == 0 458 | % Fletcher-Reeves 459 | beta = (g'*g)/(gotgo); 460 | elseif cgUpdate == 1 461 | % Polak-Ribiere 462 | beta = (g'*(g-g_old)) /(gotgo); 463 | elseif cgUpdate == 2 464 | % Hestenes-Stiefel 465 | beta = (g'*(g-g_old))/((g-g_old)'*d); 466 | else 467 | % Gilbert-Nocedal 468 | beta_FR = (g'*(g-g_old)) /(gotgo); 469 | beta_PR = (g'*g-gtgo)/(gotgo); 470 | beta = max(-beta_FR,min(beta_PR,beta_FR)); 471 | end 472 | 473 | d = -g + beta*d; 474 | 475 | % Restart if not a direction of sufficient descent 476 | if g'*d > -tolX 477 | if debug 478 | fprintf('Restarting CG\n'); 479 | end 480 | beta = 0; 481 | d = -g; 482 | end 483 | 484 | % Old restart rule: 485 | %if beta < 0 || abs(gtgo)/(gotgo) >= 0.1 || g'*d >= 0 486 | 487 | end 488 | g_old = g; 489 | 490 | case PCG % Preconditioned Non-Linear Conjugate Gradient 491 | 492 | % Apply preconditioner to negative gradient 493 | if isempty(precFunc) 494 | % Use L-BFGS Preconditioner 495 | if i == 1 496 | old_dirs = zeros(length(g),0); 497 | old_stps = zeros(length(g),0); 498 | Hdiag = 1; 499 | s = -g; 500 | else 501 | [old_dirs,old_stps,Hdiag] = lbfgsUpdate(g-g_old,t*d,corrections,debug,old_dirs,old_stps,Hdiag); 502 | 503 | if useMex 504 | s = lbfgsC(-g,old_dirs,old_stps,Hdiag); 505 | else 506 | s = lbfgs(-g,old_dirs,old_stps,Hdiag); 507 | end 508 | end 509 | else % User-supplied preconditioner 510 | s = precFunc(-g,x,varargin{:}); 511 | end 512 | 513 | if i == 1 514 | d = s; 515 | else 516 | 517 | if cgUpdate == 0 518 | % Preconditioned 
Fletcher-Reeves 519 | beta = (g'*s)/(g_old'*s_old); 520 | elseif cgUpdate < 3 521 | % Preconditioned Polak-Ribiere 522 | beta = (g'*(s-s_old))/(g_old'*s_old); 523 | else 524 | % Preconditioned Gilbert-Nocedal 525 | beta_FR = (g'*s)/(g_old'*s_old); 526 | beta_PR = (g'*(s-s_old))/(g_old'*s_old); 527 | beta = max(-beta_FR,min(beta_PR,beta_FR)); 528 | end 529 | d = s + beta*d; 530 | 531 | if g'*d > -tolX 532 | if debug 533 | fprintf('Restarting CG\n'); 534 | end 535 | beta = 0; 536 | d = s; 537 | end 538 | 539 | end 540 | g_old = g; 541 | s_old = s; 542 | case LBFGS % L-BFGS 543 | 544 | % Update the direction and step sizes 545 | 546 | if i == 1 547 | d = -g; % Initially use steepest descent direction 548 | old_dirs = zeros(length(g),0); 549 | old_stps = zeros(length(d),0); 550 | Hdiag = 1; 551 | else 552 | if Damped 553 | [old_dirs,old_stps,Hdiag] = dampedUpdate(g-g_old,t*d,corrections,debug,old_dirs,old_stps,Hdiag); 554 | else 555 | [old_dirs,old_stps,Hdiag] = lbfgsUpdate(g-g_old,t*d,corrections,debug,old_dirs,old_stps,Hdiag); 556 | end 557 | 558 | if useMex 559 | d = lbfgsC(-g,old_dirs,old_stps,Hdiag); 560 | else 561 | d = lbfgs(-g,old_dirs,old_stps,Hdiag); 562 | end 563 | end 564 | g_old = g; 565 | 566 | case QNEWTON % Use quasi-Newton Hessian approximation 567 | 568 | if i == 1 569 | d = -g; 570 | else 571 | % Compute difference vectors 572 | y = g-g_old; 573 | s = t*d; 574 | 575 | if i == 2 576 | % Make initial Hessian approximation 577 | if initialHessType == 0 578 | % Identity 579 | if qnUpdate <= 1 580 | R = eye(length(g)); 581 | else 582 | H = eye(length(g)); 583 | end 584 | else 585 | % Scaled Identity 586 | if debug 587 | fprintf('Scaling Initial Hessian Approximation\n'); 588 | end 589 | if qnUpdate <= 1 590 | % Use Cholesky of Hessian approximation 591 | R = sqrt((y'*y)/(y'*s))*eye(length(g)); 592 | else 593 | % Use Inverse of Hessian approximation 594 | H = eye(length(g))*(y'*s)/(y'*y); 595 | end 596 | end 597 | end 598 | 599 | if qnUpdate == 0 % Use BFGS updates 600 | Bs = R'*(R*s); 601 | if Damped 602 | eta = .02; 603 | if y'*s < eta*s'*Bs 604 | if debug 605 | fprintf('Damped Update\n'); 606 | end 607 | theta = min(max(0,((1-eta)*s'*Bs)/(s'*Bs - y'*s)),1); 608 | y = theta*y + (1-theta)*Bs; 609 | end 610 | R = cholupdate(cholupdate(R,y/sqrt(y'*s)),Bs/sqrt(s'*Bs),'-'); 611 | else 612 | if y'*s > 1e-10 613 | R = cholupdate(cholupdate(R,y/sqrt(y'*s)),Bs/sqrt(s'*Bs),'-'); 614 | else 615 | if debug 616 | fprintf('Skipping Update\n'); 617 | end 618 | end 619 | end 620 | elseif qnUpdate == 1 % Perform SR1 Update if it maintains positive-definiteness 621 | 622 | Bs = R'*(R*s); 623 | ymBs = y-Bs; 624 | if abs(s'*ymBs) >= norm(s)*norm(ymBs)*1e-8 && (s-((R\(R'\y))))'*y > 1e-10 625 | R = cholupdate(R,-ymBs/sqrt(ymBs'*s),'-'); 626 | else 627 | if debug 628 | fprintf('SR1 not positive-definite, doing BFGS Update\n'); 629 | end 630 | if Damped 631 | eta = .02; 632 | if y'*s < eta*s'*Bs 633 | if debug 634 | fprintf('Damped Update\n'); 635 | end 636 | theta = min(max(0,((1-eta)*s'*Bs)/(s'*Bs - y'*s)),1); 637 | y = theta*y + (1-theta)*Bs; 638 | end 639 | R = cholupdate(cholupdate(R,y/sqrt(y'*s)),Bs/sqrt(s'*Bs),'-'); 640 | else 641 | if y'*s > 1e-10 642 | R = cholupdate(cholupdate(R,y/sqrt(y'*s)),Bs/sqrt(s'*Bs),'-'); 643 | else 644 | if debug 645 | fprintf('Skipping Update\n'); 646 | end 647 | end 648 | end 649 | end 650 | elseif qnUpdate == 2 % Use Hoshino update 651 | v = sqrt(y'*H*y)*(s/(s'*y) - (H*y)/(y'*H*y)); 652 | phi = 1/(1 + (y'*H*y)/(s'*y)); 653 | H = H + (s*s')/(s'*y) - 
(H*y*y'*H)/(y'*H*y) + phi*v*v'; 654 | 655 | elseif qnUpdate == 3 % Self-Scaling BFGS update 656 | ys = y'*s; 657 | Hy = H*y; 658 | yHy = y'*Hy; 659 | gamma = ys/yHy; 660 | v = sqrt(yHy)*(s/ys - Hy/yHy); 661 | H = gamma*(H - Hy*Hy'/yHy + v*v') + (s*s')/ys; 662 | elseif qnUpdate == 4 % Oren's Self-Scaling Variable Metric update 663 | 664 | % Oren's method 665 | if (s'*y)/(y'*H*y) > 1 666 | phi = 1; % BFGS 667 | omega = 0; 668 | elseif (s'*(H\s))/(s'*y) < 1 669 | phi = 0; % DFP 670 | omega = 1; 671 | else 672 | phi = (s'*y)*(y'*H*y-s'*y)/((s'*(H\s))*(y'*H*y)-(s'*y)^2); 673 | omega = phi; 674 | end 675 | 676 | gamma = (1-omega)*(s'*y)/(y'*H*y) + omega*(s'*(H\s))/(s'*y); 677 | v = sqrt(y'*H*y)*(s/(s'*y) - (H*y)/(y'*H*y)); 678 | H = gamma*(H - (H*y*y'*H)/(y'*H*y) + phi*v*v') + (s*s')/(s'*y); 679 | 680 | elseif qnUpdate == 5 % McCormick-Huang asymmetric update 681 | theta = 1; 682 | phi = 0; 683 | psi = 1; 684 | omega = 0; 685 | t1 = s*(theta*s + phi*H'*y)'; 686 | t2 = (theta*s + phi*H'*y)'*y; 687 | t3 = H*y*(psi*s + omega*H'*y)'; 688 | t4 = (psi*s + omega*H'*y)'*y; 689 | H = H + t1/t2 - t3/t4; 690 | end 691 | 692 | if qnUpdate <= 1 693 | d = -R\(R'\g); 694 | else 695 | d = -H*g; 696 | end 697 | 698 | end 699 | g_old = g; 700 | 701 | case NEWTON0 % Hessian-Free Newton 702 | 703 | cgMaxIter = min(p,maxFunEvals-funEvals); 704 | cgForce = min(0.5,sqrt(norm(g)))*norm(g); 705 | 706 | % Set-up preconditioner 707 | precondFunc = []; 708 | precondArgs = []; 709 | if cgSolve == 1 710 | if isempty(precFunc) % Apply L-BFGS preconditioner 711 | if i == 1 712 | old_dirs = zeros(length(g),0); 713 | old_stps = zeros(length(g),0); 714 | Hdiag = 1; 715 | else 716 | [old_dirs,old_stps,Hdiag] = lbfgsUpdate(g-g_old,t*d,corrections,debug,old_dirs,old_stps,Hdiag); 717 | if useMex 718 | precondFunc = @lbfgsC; 719 | else 720 | precondFunc = @lbfgs; 721 | end 722 | precondArgs = {old_dirs,old_stps,Hdiag}; 723 | end 724 | g_old = g; 725 | else 726 | % Apply user-defined preconditioner 727 | precondFunc = precFunc; 728 | precondArgs = {x,varargin{:}}; 729 | end 730 | end 731 | 732 | % Solve Newton system using cg and hessian-vector products 733 | if isempty(HvFunc) 734 | % No user-supplied Hessian-vector function, 735 | % use automatic differentiation 736 | HvFun = @autoHv; 737 | HvArgs = {x,g,useComplex,funObj,varargin{:}}; 738 | else 739 | % Use user-supplid Hessian-vector function 740 | HvFun = HvFunc; 741 | HvArgs = {x,varargin{:}}; 742 | end 743 | 744 | if useNegCurv 745 | [d,cgIter,cgRes,negCurv] = conjGrad([],-g,cgForce,cgMaxIter,debug,precondFunc,precondArgs,HvFun,HvArgs); 746 | else 747 | [d,cgIter,cgRes] = conjGrad([],-g,cgForce,cgMaxIter,debug,precondFunc,precondArgs,HvFun,HvArgs); 748 | end 749 | 750 | funEvals = funEvals+cgIter; 751 | if debug 752 | fprintf('newtonCG stopped on iteration %d w/ residual %.5e\n',cgIter,cgRes); 753 | 754 | end 755 | 756 | if useNegCurv 757 | if ~isempty(negCurv) 758 | %if debug 759 | fprintf('Using negative curvature direction\n'); 760 | %end 761 | d = negCurv/norm(negCurv); 762 | d = d/sum(abs(g)); 763 | end 764 | end 765 | 766 | case NEWTON % Newton search direction 767 | 768 | if cgSolve == 0 769 | if HessianModify == 0 770 | % Attempt to perform a Cholesky factorization of the Hessian 771 | [R,posDef] = chol(H); 772 | 773 | % If the Cholesky factorization was successful, then the Hessian is 774 | % positive definite, solve the system 775 | if posDef == 0 776 | d = -R\(R'\g); 777 | 778 | else 779 | % otherwise, adjust the Hessian to be positive definite based on the 780 | % 
minimum eigenvalue, and solve with QR 781 | % (expensive, we don't want to do this very much) 782 | if debug 783 | fprintf('Adjusting Hessian\n'); 784 | end 785 | H = H + eye(length(g)) * max(0,1e-12 - min(real(eig(H)))); 786 | d = -H\g; 787 | end 788 | elseif HessianModify == 1 789 | % Modified Incomplete Cholesky 790 | R = mcholinc(H,debug); 791 | d = -R\(R'\g); 792 | elseif HessianModify == 2 793 | % Modified Generalized Cholesky 794 | if useMex 795 | [L D perm] = mcholC(H); 796 | else 797 | [L D perm] = mchol(H); 798 | end 799 | d(perm) = -L' \ ((D.^-1).*(L \ g(perm))); 800 | 801 | elseif HessianModify == 3 802 | % Modified Spectral Decomposition 803 | [V,D] = eig((H+H')/2); 804 | D = diag(D); 805 | D = max(abs(D),max(max(abs(D)),1)*1e-12); 806 | d = -V*((V'*g)./D); 807 | elseif HessianModify == 4 808 | % Modified Symmetric Indefinite Factorization 809 | [L,D,perm] = ldl(H,'vector'); 810 | [blockPos junk] = find(triu(D,1)); 811 | for diagInd = setdiff(setdiff(1:p,blockPos),blockPos+1) 812 | if D(diagInd,diagInd) < 1e-12 813 | D(diagInd,diagInd) = 1e-12; 814 | end 815 | end 816 | for blockInd = blockPos' 817 | block = D(blockInd:blockInd+1,blockInd:blockInd+1); 818 | block_a = block(1); 819 | block_b = block(2); 820 | block_d = block(4); 821 | lambda = (block_a+block_d)/2 - sqrt(4*block_b^2 + (block_a - block_d)^2)/2; 822 | D(blockInd:blockInd+1,blockInd:blockInd+1) = block+eye(2)*(lambda+1e-12); 823 | end 824 | d(perm) = -L' \ (D \ (L \ g(perm))); 825 | else 826 | % Take Newton step if Hessian is pd, 827 | % otherwise take a step with negative curvature 828 | [R,posDef] = chol(H); 829 | if posDef == 0 830 | d = -R\(R'\g); 831 | else 832 | if debug 833 | fprintf('Taking Direction of Negative Curvature\n'); 834 | end 835 | [V,D] = eig(H); 836 | u = V(:,1); 837 | d = -sign(u'*g)*u; 838 | end 839 | end 840 | 841 | else 842 | % Solve with Conjugate Gradient 843 | cgMaxIter = p; 844 | cgForce = min(0.5,sqrt(norm(g)))*norm(g); 845 | 846 | % Select Preconditioner 847 | if cgSolve == 1 848 | % No preconditioner 849 | precondFunc = []; 850 | precondArgs = []; 851 | elseif cgSolve == 2 852 | % Diagonal preconditioner 853 | precDiag = diag(H); 854 | precDiag(precDiag < 1e-12) = 1e-12 - min(precDiag); 855 | precondFunc = @precondDiag; 856 | precondArgs = {precDiag.^-1}; 857 | elseif cgSolve == 3 858 | % L-BFGS preconditioner 859 | if i == 1 860 | old_dirs = zeros(length(g),0); 861 | old_stps = zeros(length(g),0); 862 | Hdiag = 1; 863 | else 864 | [old_dirs,old_stps,Hdiag] = lbfgsUpdate(g-g_old,t*d,corrections,debug,old_dirs,old_stps,Hdiag); 865 | end 866 | g_old = g; 867 | if useMex 868 | precondFunc = @lbfgsC; 869 | else 870 | precondFunc = @lbfgs; 871 | end 872 | precondArgs = {old_dirs,old_stps,Hdiag}; 873 | elseif cgSolve > 0 874 | % Symmetric Successive Overelaxation Preconditioner 875 | omega = cgSolve; 876 | D = diag(H); 877 | D(D < 1e-12) = 1e-12 - min(D); 878 | precDiag = (omega/(2-omega))*D.^-1; 879 | precTriu = diag(D/omega) + triu(H,1); 880 | precondFunc = @precondTriuDiag; 881 | precondArgs = {precTriu,precDiag.^-1}; 882 | else 883 | % Incomplete Cholesky Preconditioner 884 | opts.droptol = -cgSolve; 885 | opts.rdiag = 1; 886 | R = cholinc(sparse(H),opts); 887 | if min(diag(R)) < 1e-12 888 | R = cholinc(sparse(H + eye*(1e-12 - min(diag(R)))),opts); 889 | end 890 | precondFunc = @precondTriu; 891 | precondArgs = {R}; 892 | end 893 | 894 | % Run cg with the appropriate preconditioner 895 | if isempty(HvFunc) 896 | % No user-supplied Hessian-vector function 897 | [d,cgIter,cgRes] = 
conjGrad(H,-g,cgForce,cgMaxIter,debug,precondFunc,precondArgs); 898 | else 899 | % Use user-supplied Hessian-vector function 900 | [d,cgIter,cgRes] = conjGrad(H,-g,cgForce,cgMaxIter,debug,precondFunc,precondArgs,HvFunc,{x,varargin{:}}); 901 | end 902 | if debug 903 | fprintf('CG stopped after %d iterations w/ residual %.5e\n',cgIter,cgRes); 904 | %funEvals = funEvals + cgIter; 905 | end 906 | end 907 | 908 | case TENSOR % Tensor Method 909 | 910 | if numDiff 911 | % Compute 3rd-order Tensor Numerically 912 | [junk1 junk2 junk3 T] = autoTensor(x,useComplex,funObj,varargin{:}); 913 | else 914 | % Use user-supplied 3rd-derivative Tensor 915 | [junk1 junk2 junk3 T] = feval(funObj, x, varargin{:}); 916 | end 917 | options_sub.Method = 'newton'; 918 | options_sub.Display = 'none'; 919 | options_sub.TolX = tolX; 920 | options_sub.TolFun = tolFun; 921 | d = minFunc(@taylorModel,zeros(p,1),options_sub,f,g,H,T); 922 | 923 | if any(abs(d) > 1e5) || all(abs(d) < 1e-5) || g'*d > -tolX 924 | if debug 925 | fprintf('Using 2nd-Order Step\n'); 926 | end 927 | [V,D] = eig((H+H')/2); 928 | D = diag(D); 929 | D = max(abs(D),max(max(abs(D)),1)*1e-12); 930 | d = -V*((V'*g)./D); 931 | else 932 | if debug 933 | fprintf('Using 3rd-Order Step\n'); 934 | end 935 | end 936 | end 937 | 938 | if ~isLegal(d) 939 | fprintf('Step direction is illegal!\n'); 940 | pause; 941 | return 942 | end 943 | 944 | % ****************** COMPUTE STEP LENGTH ************************ 945 | 946 | % Directional Derivative 947 | gtd = g'*d; 948 | 949 | % Check that progress can be made along direction 950 | if gtd > -tolX 951 | exitflag=2; 952 | msg = 'Directional Derivative below TolX'; 953 | break; 954 | end 955 | 956 | % Select Initial Guess 957 | if i == 1 958 | if method < NEWTON0 959 | t = min(1,1/sum(abs(g))); 960 | else 961 | t = 1; 962 | end 963 | else 964 | if LS_init == 0 965 | % Newton step 966 | t = 1; 967 | elseif LS_init == 1 968 | % Close to previous step length 969 | t = t*min(2,(gtd_old)/(gtd)); 970 | elseif LS_init == 2 971 | % Quadratic Initialization based on {f,g} and previous f 972 | t = min(1,2*(f-f_old)/(gtd)); 973 | elseif LS_init == 3 974 | % Double previous step length 975 | t = min(1,t*2); 976 | elseif LS_init == 4 977 | % Scaled step length if possible 978 | if isempty(HvFunc) 979 | % No user-supplied Hessian-vector function, 980 | % use automatic differentiation 981 | dHd = d'*autoHv(d,x,g,0,funObj,varargin{:}); 982 | else 983 | % Use user-supplid Hessian-vector function 984 | dHd = d'*HvFunc(d,x,varargin{:}); 985 | end 986 | 987 | funEvals = funEvals + 1; 988 | if dHd > 0 989 | t = -gtd/(dHd); 990 | else 991 | t = min(1,2*(f-f_old)/(gtd)); 992 | end 993 | end 994 | 995 | if t <= 0 996 | t = 1; 997 | end 998 | end 999 | f_old = f; 1000 | gtd_old = gtd; 1001 | 1002 | % Compute reference fr if using non-monotone objective 1003 | if Fref == 1 1004 | fr = f; 1005 | else 1006 | if i == 1 1007 | old_fvals = repmat(-inf,[Fref 1]); 1008 | end 1009 | 1010 | if i <= Fref 1011 | old_fvals(i) = f; 1012 | else 1013 | old_fvals = [old_fvals(2:end);f]; 1014 | end 1015 | fr = max(old_fvals); 1016 | end 1017 | 1018 | computeHessian = 0; 1019 | if method >= NEWTON 1020 | if HessianIter == 1 1021 | computeHessian = 1; 1022 | elseif i > 1 && mod(i-1,HessianIter) == 0 1023 | computeHessian = 1; 1024 | end 1025 | end 1026 | 1027 | % Line Search 1028 | f_old = f; 1029 | if LS < 3 % Use Armijo Bactracking 1030 | % Perform Backtracking line search 1031 | if computeHessian 1032 | [t,x,f,g,LSfunEvals,H] = 
ArmijoBacktrack(x,t,d,f,fr,g,gtd,c1,LS,tolX,debug,doPlot,LS_saveHessianComp,funObj,varargin{:}); 1033 | else 1034 | [t,x,f,g,LSfunEvals] = ArmijoBacktrack(x,t,d,f,fr,g,gtd,c1,LS,tolX,debug,doPlot,1,funObj,varargin{:}); 1035 | end 1036 | funEvals = funEvals + LSfunEvals; 1037 | 1038 | elseif LS < 6 1039 | % Find Point satisfying Wolfe 1040 | 1041 | if computeHessian 1042 | [t,f,g,LSfunEvals,H] = WolfeLineSearch(x,t,d,f,g,gtd,c1,c2,LS,25,tolX,debug,doPlot,LS_saveHessianComp,funObj,varargin{:}); 1043 | else 1044 | [t,f,g,LSfunEvals] = WolfeLineSearch(x,t,d,f,g,gtd,c1,c2,LS,25,tolX,debug,doPlot,1,funObj,varargin{:}); 1045 | end 1046 | funEvals = funEvals + LSfunEvals; 1047 | x = x + t*d; 1048 | 1049 | else 1050 | % Use Matlab optim toolbox line search 1051 | [t,f_new,fPrime_new,g_new,LSexitFlag,LSiter]=... 1052 | lineSearch({'fungrad',[],funObj},x,p,1,p,d,f,gtd,t,c1,c2,-inf,maxFunEvals-funEvals,... 1053 | tolX,[],[],[],varargin{:}); 1054 | funEvals = funEvals + LSiter; 1055 | if isempty(t) 1056 | exitflag = -2; 1057 | msg = 'Matlab LineSearch failed'; 1058 | break; 1059 | end 1060 | 1061 | if method >= NEWTON 1062 | 1063 | [f_new,g_new,H] = funObj(x + t*d,varargin{:}); 1064 | funEvals = funEvals + 1; 1065 | end 1066 | x = x + t*d; 1067 | 1068 | f = f_new; 1069 | g = g_new; 1070 | end 1071 | 1072 | % Output iteration information 1073 | if verboseI 1074 | fprintf('%10d %10d %15.5e %15.5e %15.5e\n',i,funEvals*funEvalMultiplier,t,f,sum(abs(g))); 1075 | end 1076 | 1077 | if logfile 1078 | fid = fopen(logfile, 'a'); 1079 | if (fid > 0) 1080 | fprintf(fid, '-- %10d %10d %15.5e %15.5e %15.5e\n',i,funEvals*funEvalMultiplier,t,f,sum(abs(g))); 1081 | fclose(fid); 1082 | end 1083 | end 1084 | 1085 | 1086 | % Output Function 1087 | if ~isempty(outputFcn) 1088 | callOutput(outputFcn,x,'iter',i,funEvals,f,t,gtd,g,d,sum(abs(g)),varargin{:}); 1089 | end 1090 | 1091 | % Update Trace 1092 | trace.fval(end+1,1) = f; 1093 | trace.funcCount(end+1,1) = funEvals; 1094 | 1095 | % Check Optimality Condition 1096 | if sum(abs(g)) <= tolFun 1097 | exitflag=1; 1098 | msg = 'Optimality Condition below TolFun'; 1099 | break; 1100 | end 1101 | 1102 | % ******************* Check for lack of progress ******************* 1103 | 1104 | if sum(abs(t*d)) <= tolX 1105 | exitflag=2; 1106 | msg = 'Step Size below TolX'; 1107 | break; 1108 | end 1109 | 1110 | 1111 | if abs(f-f_old) < tolX 1112 | exitflag=2; 1113 | msg = 'Function Value changing by less than TolX'; 1114 | break; 1115 | end 1116 | 1117 | % ******** Check for going over iteration/evaluation limit ******************* 1118 | 1119 | if funEvals*funEvalMultiplier > maxFunEvals 1120 | exitflag = 0; 1121 | msg = 'Exceeded Maximum Number of Function Evaluations'; 1122 | break; 1123 | end 1124 | 1125 | if i == maxIter 1126 | exitflag = 0; 1127 | msg='Exceeded Maximum Number of Iterations'; 1128 | break; 1129 | end 1130 | 1131 | end 1132 | 1133 | if verbose 1134 | fprintf('%s\n',msg); 1135 | end 1136 | if nargout > 3 1137 | output = struct('iterations',i,'funcCount',funEvals*funEvalMultiplier,... 
1138 | 'algorithm',method,'firstorderopt',sum(abs(g)),'message',msg,'trace',trace); 1139 | end 1140 | 1141 | % Output Function 1142 | if ~isempty(outputFcn) 1143 | callOutput(outputFcn,x,'done',i,funEvals,f,t,gtd,g,d,sum(abs(g)),varargin{:}); 1144 | end 1145 | 1146 | end 1147 | 1148 | -------------------------------------------------------------------------------- /minFunc/minFunc_processInputOptions.m: -------------------------------------------------------------------------------- 1 | 2 | function [verbose,verboseI,debug,doPlot,maxFunEvals,maxIter,tolFun,tolX,method,... 3 | corrections,c1,c2,LS_init,LS,cgSolve,qnUpdate,cgUpdate,initialHessType,... 4 | HessianModify,Fref,useComplex,numDiff,LS_saveHessianComp,... 5 | DerivativeCheck,Damped,HvFunc,bbType,cycle,... 6 | HessianIter,outputFcn,useMex,useNegCurv,precFunc] = ... 7 | minFunc_processInputOptions(o) 8 | 9 | % Constants 10 | SD = 0; 11 | CSD = 1; 12 | BB = 2; 13 | CG = 3; 14 | PCG = 4; 15 | LBFGS = 5; 16 | QNEWTON = 6; 17 | NEWTON0 = 7; 18 | NEWTON = 8; 19 | TENSOR = 9; 20 | 21 | verbose = 1; 22 | verboseI= 1; 23 | debug = 0; 24 | doPlot = 0; 25 | method = LBFGS; 26 | cgSolve = 0; 27 | 28 | o = toUpper(o); 29 | 30 | if isfield(o,'DISPLAY') 31 | switch(upper(o.DISPLAY)) 32 | case 0 33 | verbose = 0; 34 | verboseI = 0; 35 | case 'FINAL' 36 | verboseI = 0; 37 | case 'OFF' 38 | verbose = 0; 39 | verboseI = 0; 40 | case 'NONE' 41 | verbose = 0; 42 | verboseI = 0; 43 | case 'FULL' 44 | debug = 1; 45 | case 'EXCESSIVE' 46 | debug = 1; 47 | doPlot = 1; 48 | end 49 | end 50 | 51 | 52 | LS_init = 0; 53 | c2 = 0.9; 54 | LS = 4; 55 | Fref = 1; 56 | Damped = 0; 57 | HessianIter = 1; 58 | if isfield(o,'METHOD') 59 | m = upper(o.METHOD); 60 | switch(m) 61 | case 'TENSOR' 62 | method = TENSOR; 63 | case 'NEWTON' 64 | method = NEWTON; 65 | case 'MNEWTON' 66 | method = NEWTON; 67 | HessianIter = 5; 68 | case 'PNEWTON0' 69 | method = NEWTON0; 70 | cgSolve = 1; 71 | case 'NEWTON0' 72 | method = NEWTON0; 73 | case 'QNEWTON' 74 | method = QNEWTON; 75 | Damped = 1; 76 | case 'LBFGS' 77 | method = LBFGS; 78 | case 'BB' 79 | method = BB; 80 | LS = 2; 81 | Fref = 20; 82 | case 'PCG' 83 | method = PCG; 84 | c2 = 0.2; 85 | LS_init = 2; 86 | case 'SCG' 87 | method = CG; 88 | c2 = 0.2; 89 | LS_init = 4; 90 | case 'CG' 91 | method = CG; 92 | c2 = 0.2; 93 | LS_init = 2; 94 | case 'CSD' 95 | method = CSD; 96 | c2 = 0.2; 97 | Fref = 10; 98 | LS_init = 2; 99 | case 'SD' 100 | method = SD; 101 | LS_init = 2; 102 | end 103 | end 104 | 105 | maxFunEvals = getOpt(o,'MAXFUNEVALS',1000); 106 | maxIter = getOpt(o,'MAXITER',500); 107 | tolFun = getOpt(o,'TOLFUN',1e-5); 108 | tolX = getOpt(o,'TOLX',1e-9); 109 | corrections = getOpt(o,'CORR',100); 110 | c1 = getOpt(o,'C1',1e-4); 111 | c2 = getOpt(o,'C2',c2); 112 | LS_init = getOpt(o,'LS_INIT',LS_init); 113 | LS = getOpt(o,'LS',LS); 114 | cgSolve = getOpt(o,'CGSOLVE',cgSolve); 115 | qnUpdate = getOpt(o,'QNUPDATE',3); 116 | cgUpdate = getOpt(o,'CGUPDATE',2); 117 | initialHessType = getOpt(o,'INITIALHESSTYPE',1); 118 | HessianModify = getOpt(o,'HESSIANMODIFY',0); 119 | Fref = getOpt(o,'FREF',Fref); 120 | useComplex = getOpt(o,'USECOMPLEX',0); 121 | numDiff = getOpt(o,'NUMDIFF',0); 122 | LS_saveHessianComp = getOpt(o,'LS_SAVEHESSIANCOMP',1); 123 | DerivativeCheck = getOpt(o,'DERIVATIVECHECK',0); 124 | Damped = getOpt(o,'DAMPED',Damped); 125 | HvFunc = getOpt(o,'HVFUNC',[]); 126 | bbType = getOpt(o,'BBTYPE',0); 127 | cycle = getOpt(o,'CYCLE',3); 128 | HessianIter = getOpt(o,'HESSIANITER',HessianIter); 129 | outputFcn = 
getOpt(o,'OUTPUTFCN',[]); 130 | useMex = getOpt(o,'USEMEX',1); 131 | useNegCurv = getOpt(o,'USENEGCURV',1); 132 | precFunc = getOpt(o,'PRECFUNC',[]); 133 | end 134 | 135 | function [v] = getOpt(options,opt,default) 136 | if isfield(options,opt) 137 | if ~isempty(getfield(options,opt)) 138 | v = getfield(options,opt); 139 | else 140 | v = default; 141 | end 142 | else 143 | v = default; 144 | end 145 | end 146 | 147 | function [o] = toUpper(o) 148 | if ~isempty(o) 149 | fn = fieldnames(o); 150 | for i = 1:length(fn) 151 | o = setfield(o,upper(fn{i}),getfield(o,fn{i})); 152 | end 153 | end 154 | end -------------------------------------------------------------------------------- /minFunc/polyinterp.m: -------------------------------------------------------------------------------- 1 | function [minPos,fmin] = polyinterp(points,doPlot,xminBound,xmaxBound) 2 | % function [minPos] = polyinterp(points,doPlot,xminBound,xmaxBound) 3 | % 4 | % Minimum of interpolating polynomial based on function and derivative 5 | % values 6 | % 7 | % In can also be used for extrapolation if {xmin,xmax} are outside 8 | % the domain of the points. 9 | % 10 | % Input: 11 | % points(pointNum,[x f g]) 12 | % doPlot: set to 1 to plot, default: 0 13 | % xmin: min value that brackets minimum (default: min of points) 14 | % xmax: max value that brackets maximum (default: max of points) 15 | % 16 | % set f or g to sqrt(-1) if they are not known 17 | % the order of the polynomial is the number of known f and g values minus 1 18 | 19 | if nargin < 2 20 | doPlot = 0; 21 | end 22 | 23 | nPoints = size(points,1); 24 | order = sum(sum((imag(points(:,2:3))==0)))-1; 25 | 26 | % Code for most common case: 27 | % - cubic interpolation of 2 points 28 | % w/ function and derivative values for both 29 | % - no xminBound/xmaxBound 30 | 31 | if nPoints == 2 && order ==3 && nargin <= 2 && doPlot == 0 32 | % Solution in this case (where x2 is the farthest point): 33 | % d1 = g1 + g2 - 3*(f1-f2)/(x1-x2); 34 | % d2 = sqrt(d1^2 - g1*g2); 35 | % minPos = x2 - (x2 - x1)*((g2 + d2 - d1)/(g2 - g1 + 2*d2)); 36 | % t_new = min(max(minPos,x1),x2); 37 | [minVal minPos] = min(points(:,1)); 38 | notMinPos = -minPos+3; 39 | d1 = points(minPos,3) + points(notMinPos,3) - 3*(points(minPos,2)-points(notMinPos,2))/(points(minPos,1)-points(notMinPos,1)); 40 | d2 = sqrt(d1^2 - points(minPos,3)*points(notMinPos,3)); 41 | if isreal(d2) 42 | t = points(notMinPos,1) - (points(notMinPos,1) - points(minPos,1))*((points(notMinPos,3) + d2 - d1)/(points(notMinPos,3) - points(minPos,3) + 2*d2)); 43 | minPos = min(max(t,points(minPos,1)),points(notMinPos,1)); 44 | else 45 | minPos = mean(points(:,1)); 46 | end 47 | return; 48 | end 49 | 50 | xmin = min(points(:,1)); 51 | xmax = max(points(:,1)); 52 | 53 | % Compute Bounds of Interpolation Area 54 | if nargin < 3 55 | xminBound = xmin; 56 | end 57 | if nargin < 4 58 | xmaxBound = xmax; 59 | end 60 | 61 | % Constraints Based on available Function Values 62 | A = zeros(0,order+1); 63 | b = zeros(0,1); 64 | for i = 1:nPoints 65 | if imag(points(i,2))==0 66 | constraint = zeros(1,order+1); 67 | for j = order:-1:0 68 | constraint(order-j+1) = points(i,1)^j; 69 | end 70 | A = [A;constraint]; 71 | b = [b;points(i,2)]; 72 | end 73 | end 74 | 75 | % Constraints based on available Derivatives 76 | for i = 1:nPoints 77 | if isreal(points(i,3)) 78 | constraint = zeros(1,order+1); 79 | for j = 1:order 80 | constraint(j) = (order-j+1)*points(i,1)^(order-j); 81 | end 82 | A = [A;constraint]; 83 | b = [b;points(i,3)]; 84 | end 85 | end 
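% Descriptive note: at this point each row of A encodes one linear
% constraint on the polynomial coefficients -- p(x_i) = f_i for every
% known function value and p'(x_i) = g_i for every known derivative --
% so the coefficient vector params is recovered below by solving
% A*params = b with backslash (a least-squares fit if over-determined).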
86 | 87 | % Find interpolating polynomial 88 | params = A\b; 89 | 90 | % Compute Critical Points 91 | dParams = zeros(order,1); 92 | for i = 1:length(params)-1 93 | dParams(i) = params(i)*(order-i+1); 94 | end 95 | 96 | if any(isinf(dParams)) 97 | cp = [xminBound;xmaxBound;points(:,1)].'; 98 | else 99 | cp = [xminBound;xmaxBound;points(:,1);roots(dParams)].'; 100 | end 101 | 102 | % Test Critical Points 103 | fmin = inf; 104 | minPos = (xminBound+xmaxBound)/2; % Default to Bisection if no critical points valid 105 | for xCP = cp 106 | if imag(xCP)==0 && xCP >= xminBound && xCP <= xmaxBound 107 | fCP = polyval(params,xCP); 108 | if imag(fCP)==0 && fCP < fmin 109 | minPos = real(xCP); 110 | fmin = real(fCP); 111 | end 112 | end 113 | end 114 | % Plot Situation 115 | if doPlot 116 | figure(1); clf; hold on; 117 | 118 | % Plot Points 119 | plot(points(:,1),points(:,2),'b*'); 120 | 121 | % Plot Derivatives 122 | for i = 1:nPoints 123 | if isreal(points(i,3)) 124 | m = points(i,3); 125 | b = points(i,2) - m*points(i,1); 126 | plot([points(i,1)-.05 points(i,1)+.05],... 127 | [(points(i,1)-.05)*m+b (points(i,1)+.05)*m+b],'c.-'); 128 | end 129 | end 130 | 131 | % Plot Function 132 | x = min(xmin,xminBound)-.1:(max(xmax,xmaxBound)+.1-min(xmin,xminBound)-.1)/100:max(xmax,xmaxBound)+.1; 133 | size(x) 134 | for i = 1:length(x) 135 | f(i) = polyval(params,x(i)); 136 | end 137 | plot(x,f,'y'); 138 | axis([x(1)-.1 x(end)+.1 min(f)-.1 max(f)+.1]); 139 | 140 | % Plot Minimum 141 | plot(minPos,fmin,'g+'); 142 | if doPlot == 1 143 | pause(1); 144 | end 145 | end -------------------------------------------------------------------------------- /minFunc/precondDiag.m: -------------------------------------------------------------------------------- 1 | function [y] = precondDiag(r,D) 2 | y = D.*r; -------------------------------------------------------------------------------- /minFunc/precondTriu.m: -------------------------------------------------------------------------------- 1 | function [y] = precondUpper(r,U) 2 | y = U \ (U' \ r); -------------------------------------------------------------------------------- /minFunc/precondTriuDiag.m: -------------------------------------------------------------------------------- 1 | function [y] = precondUpper(r,U,D) 2 | y = U \ (D .* (U' \ r)); -------------------------------------------------------------------------------- /minFunc/rosenbrock.m: -------------------------------------------------------------------------------- 1 | function [f, df, ddf, dddf] = rosenbrock(x); 2 | 3 | % rosenbrock.m This function returns the function value, partial derivatives 4 | % and Hessian of the (general dimension) rosenbrock function, given by: 5 | % 6 | % f(x) = sum_{i=1:D-1} 100*(x(i+1) - x(i)^2)^2 + (1-x(i))^2 7 | % 8 | % where D is the dimension of x. The true minimum is 0 at x = (1 1 ... 1). 9 | % 10 | % Carl Edward Rasmussen, 2001-07-21. 
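% Derivation of the gradient used below: differentiating term i of the
% sum gives -400*x(i)*(x(i+1)-x(i)^2) - 2*(1-x(i)) with respect to x(i),
% while term i-1 contributes 200*(x(i)-x(i-1)^2) to df(i); the two
% vectorized assignments in the nargout > 1 branch implement exactly
% these two pieces.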
11 | 12 | D = length(x); 13 | f = sum(100*(x(2:D)-x(1:D-1).^2).^2 + (1-x(1:D-1)).^2); 14 | 15 | if nargout > 1 16 | df = zeros(D, 1); 17 | df(1:D-1) = - 400*x(1:D-1).*(x(2:D)-x(1:D-1).^2) - 2*(1-x(1:D-1)); 18 | df(2:D) = df(2:D) + 200*(x(2:D)-x(1:D-1).^2); 19 | end 20 | 21 | if nargout > 2 22 | ddf = zeros(D,D); 23 | ddf(1:D-1,1:D-1) = diag(-400*x(2:D) + 1200*x(1:D-1).^2 + 2); 24 | ddf(2:D,2:D) = ddf(2:D,2:D) + 200*eye(D-1); 25 | ddf = ddf - diag(400*x(1:D-1),1) - diag(400*x(1:D-1),-1); 26 | end 27 | 28 | if nargout > 3 29 | dddf = zeros(D,D,D); 30 | for d = 1:D 31 | if d > 1 32 | dddf(d,d-1,d-1) = -400; 33 | end 34 | if d < D 35 | dddf(d,d+1,d) = -400; 36 | dddf(d,d,d+1) = -400; 37 | dddf(d,d,d) = 2400*x(d); 38 | end 39 | end 40 | end -------------------------------------------------------------------------------- /minFunc/taylorModel.m: -------------------------------------------------------------------------------- 1 | function [f,g,H] = taylorModel(d,f,g,H,T) 2 | 3 | p = length(d); 4 | 5 | fd3 = 0; 6 | gd2 = zeros(p,1); 7 | Hd = zeros(p); 8 | for t1 = 1:p 9 | for t2 = 1:p 10 | for t3 = 1:p 11 | fd3 = fd3 + T(t1,t2,t3)*d(t1)*d(t2)*d(t3); 12 | 13 | if nargout > 1 14 | gd2(t3) = gd2(t3) + T(t1,t2,t3)*d(t1)*d(t2); 15 | end 16 | 17 | if nargout > 2 18 | Hd(t2,t3) = Hd(t2,t3) + T(t1,t2,t3)*d(t1); 19 | end 20 | end 21 | 22 | end 23 | end 24 | 25 | f = f + g'*d + (1/2)*d'*H*d + (1/6)*fd3; 26 | 27 | if nargout > 1 28 | g = g + H*d + (1/2)*gd2; 29 | end 30 | 31 | if nargout > 2 32 | H = H + Hd; 33 | end 34 | 35 | if any(abs(d) > 1e5) 36 | % We want the optimizer to stop if the solution is unbounded 37 | g = zeros(p,1); 38 | end -------------------------------------------------------------------------------- /mnist/t10k-images.idx3-ubyte: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ehosseiniasl/Nonnegativity-Constrained-Autoencoder-NCAE/219c53631d60a268ba8550796e2ae38639450861/mnist/t10k-images.idx3-ubyte -------------------------------------------------------------------------------- /mnist/t10k-labels.idx1-ubyte: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ehosseiniasl/Nonnegativity-Constrained-Autoencoder-NCAE/219c53631d60a268ba8550796e2ae38639450861/mnist/t10k-labels.idx1-ubyte -------------------------------------------------------------------------------- /mnist/train-images.idx3-ubyte: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ehosseiniasl/Nonnegativity-Constrained-Autoencoder-NCAE/219c53631d60a268ba8550796e2ae38639450861/mnist/train-images.idx3-ubyte --------------------------------------------------------------------------------
/mnist/train-labels.idx1-ubyte: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ehosseiniasl/Nonnegativity-Constrained-Autoencoder-NCAE/219c53631d60a268ba8550796e2ae38639450861/mnist/train-labels.idx1-ubyte -------------------------------------------------------------------------------- /params2stack.m: -------------------------------------------------------------------------------- 1 | function stack = params2stack(params, netconfig) 2 | 3 | % Converts a flattened parameter vector into a nice "stack" structure 4 | % for us to work with. This is useful when you're building multilayer 5 | % networks. 6 | % 7 | % stack = params2stack(params, netconfig) 8 | % 9 | % params - flattened parameter vector 10 | % netconfig - auxiliary variable containing 11 | % the configuration of the network 12 | % 13 | 14 | 15 | % Map the params (a vector into a stack of weights) 16 | depth = numel(netconfig.layersizes); 17 | stack = cell(depth,1); 18 | prevLayerSize = netconfig.inputsize; % the size of the previous layer 19 | curPos = double(1); % mark current position in parameter vector 20 | 21 | for d = 1:depth 22 | % Create layer d 23 | stack{d} = struct; 24 | 25 | % Extract weights 26 | wlen = double(netconfig.layersizes{d} * prevLayerSize); 27 | stack{d}.w = reshape(params(curPos:curPos+wlen-1), netconfig.layersizes{d}, prevLayerSize); 28 | curPos = curPos+wlen; 29 | 30 | % Extract bias 31 | blen = double(netconfig.layersizes{d}); 32 | stack{d}.b = reshape(params(curPos:curPos+blen-1), netconfig.layersizes{d}, 1); 33 | curPos = curPos+blen; 34 | 35 | % Set previous layer size 36 | prevLayerSize = netconfig.layersizes{d}; 37 | end 38 | 39 | end -------------------------------------------------------------------------------- /softmax/computeNumericalGradient.m: -------------------------------------------------------------------------------- 1 | function numgrad = computeNumericalGradient(J, theta) 2 | % numgrad = computeNumericalGradient(J, theta) 3 | % theta: a vector of parameters 4 | % J: a function that outputs a real-number. Calling y = J(theta) will return the 5 | % function value at theta. 6 | 7 | % Initialize numgrad with zeros 8 | numgrad = zeros(size(theta)); 9 | 10 | %% ---------- YOUR CODE HERE -------------------------------------- 11 | % Instructions: 12 | % Implement numerical gradient checking, and return the result in numgrad. 13 | % (See Section 2.3 of the lecture notes.) 14 | % You should write code so that numgrad(i) is (the numerical approximation to) the 15 | % partial derivative of J with respect to the i-th input argument, evaluated at theta. 16 | % I.e., numgrad(i) should be the (approximately) the partial derivative of J with 17 | % respect to theta(i). 18 | % 19 | % Hint: You will probably want to compute the elements of numgrad one at a time. 
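% The loop below implements the two-sided (central) difference
%   numgrad(i) = (J(theta + eps*e_i) - J(theta - eps*e_i)) / (2*eps)
% with eps = 1e-4, where e_i is the i-th standard basis vector; the
% central difference has O(eps^2) truncation error, versus O(eps) for
% the one-sided difference (J(theta + eps*e_i) - J(theta))/eps.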
20 | 21 | eps = 1e-4; 22 | 23 | for i = 1:size(theta,1) 24 | 25 | delta = zeros(size(theta,1),1); 26 | delta(i) = eps; 27 | theta_p = theta + delta; 28 | theta_n = theta - delta; 29 | numgrad(i,1) = (J(theta_p)-J(theta_n))./(2*eps); 30 | 31 | end 32 | 33 | 34 | 35 | 36 | 37 | 38 | %% --------------------------------------------------------------- 39 | end 40 | -------------------------------------------------------------------------------- /softmax/softmaxCost_nonneg.m: -------------------------------------------------------------------------------- 1 | function [cost, grad] = softmaxCost(theta, numClasses, inputSize, lambda, data, labels) 2 | 3 | % numClasses - the number of classes 4 | % inputSize - the size N of the input vector 5 | % lambda - weight decay parameter 6 | % data - the N x M input matrix, where each column data(:, i) corresponds to 7 | % a single test set 8 | % labels - an M x 1 matrix containing the labels corresponding for the input data 9 | % 10 | 11 | % Unroll the parameters from theta 12 | theta = reshape(theta, numClasses, inputSize); 13 | 14 | numCases = size(data, 2); 15 | 16 | groundTruth = full(sparse(labels, 1:numCases, 1)); 17 | cost = 0; 18 | 19 | thetagrad = zeros(numClasses, inputSize); 20 | 21 | %% ---------- YOUR CODE HERE -------------------------------------- 22 | % Instructions: Compute the cost and gradient for softmax regression. 23 | % You need to compute thetagrad and cost. 24 | % The groundTruth matrix might come in handy. 25 | 26 | % tmp = theta*data; 27 | 28 | prob = exp(theta*data); 29 | 30 | [r,c] = find(isinf(prob)); 31 | prob(r,c) = exp(709); % avoid Inf in prob matrix 32 | 33 | prob_norm = prob./repmat(sum(prob),numClasses,1); 34 | 35 | [r,c] = find(prob_norm == 0); 36 | prob_norm(r,c) = eps; 37 | 38 | theta_neg = zeros(size(theta,1), size(theta,2)); 39 | 40 | theta_neg(find(theta<0)) = theta(find(theta<0)); 41 | 42 | theta_neg_abs = theta_neg; 43 | theta_neg_abs(theta_neg_abs~=0)=1; 44 | 45 | weight_neg_decay = sum(sum(theta_neg.^2)) ; 46 | 47 | 48 | cost = -sum(sum(groundTruth.*log(prob_norm)))/numCases + lambda/2*weight_neg_decay; 49 | 50 | cost_acc = -sum(sum(groundTruth.*log(prob_norm)))/numCases 51 | if isnan(cost_acc) 52 | error() 53 | end 54 | 55 | % cost = -sum(sum(groundTruth.*log(prob_norm)))/numCases + lambda/2*weight_neg_decay - 0.5*lambda*(sum(theta_neg(:))); 56 | 57 | thetagrad = -1/numCases * (data*(groundTruth-prob_norm)') ; 58 | 59 | thetagrad = thetagrad' + lambda*theta_neg; 60 | 61 | % thetagrad = thetagrad' + lambda*theta_neg - 0.5*lambda*theta_neg_abs; 62 | 63 | 64 | 65 | % ------------------------------------------------------------------ 66 | % Unroll the gradient matrices into a vector for minFunc 67 | grad = [thetagrad(:)]; 68 | end 69 | 70 | -------------------------------------------------------------------------------- /softmax/softmaxPredict.m: -------------------------------------------------------------------------------- 1 | function [pred] = softmaxPredict(softmaxModel, data) 2 | 3 | % softmaxModel - model trained using softmaxTrain 4 | % data - the N x M input matrix, where each column data(:, i) corresponds to 5 | % a single test set 6 | % 7 | % Your code should produce the prediction matrix 8 | % pred, where pred(i) is argmax_c P(y(c) | x(i)). 
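% Note: the softmax denominator sum(prob) is shared by every class within
% a column, so taking the argmax of theta*data directly would give the
% same predictions; the normalized probabilities are computed below only
% for readability.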
9 | 10 | % Unroll the parameters from theta 11 | theta = softmaxModel.optTheta; % this provides a numClasses x inputSize matrix 12 | pred = zeros(1, size(data, 2)); 13 | 14 | %% ---------- YOUR CODE HERE -------------------------------------- 15 | % Instructions: Compute pred using theta assuming that the labels start 16 | % from 1. 17 | 18 | prob = exp(theta*data); 19 | prob_norm = prob./repmat(sum(prob),size(theta,1),1); 20 | 21 | [tmp,pred]=max(prob_norm); 22 | 23 | 24 | 25 | % --------------------------------------------------------------------- 26 | 27 | end 28 | 29 | -------------------------------------------------------------------------------- /softmax/softmaxTrain_nonneg.m: -------------------------------------------------------------------------------- 1 | function [softmaxModel] = softmaxTrain_nonneg(inputSize, numClasses, lambda, inputData, labels, options) 2 | %softmaxTrain Train a softmax model with the given parameters on the given 3 | % data. Returns softmaxOptTheta, a vector containing the trained parameters 4 | % for the model. 5 | % 6 | % inputSize: the size of an input vector x^(i) 7 | % numClasses: the number of classes 8 | % lambda: weight decay parameter 9 | % inputData: an N by M matrix containing the input data, such that 10 | % inputData(:, c) is the cth input 11 | % labels: M by 1 matrix containing the class labels for the 12 | % corresponding inputs. labels(c) is the class label for 13 | % the cth input 14 | % options (optional): options 15 | % options.maxIter: number of iterations to train for 16 | 17 | if ~exist('options', 'var') 18 | options = struct; 19 | end 20 | 21 | if ~isfield(options, 'maxIter') 22 | options.maxIter = 400; 23 | end 24 | 25 | % initialize parameters 26 | theta = 0.005 * randn(numClasses * inputSize, 1); 27 | 28 | % Use minFunc to minimize the function 29 | addpath minFunc/ 30 | options.Method = 'lbfgs'; % Here, we use L-BFGS to optimize our cost 31 | % function. Generally, for minFunc to work, you 32 | % need a function pointer with two outputs: the 33 | % function value and the gradient. In our problem, 34 | % softmaxCost.m satisfies this. 35 | minFuncOptions.display = 'on'; 36 | options.optTol = 1e-12; 37 | [softmaxOptTheta, cost] = minFunc( @(p) softmaxCost_nonneg(p, ... 38 | numClasses, inputSize, lambda, ... 39 | inputData, labels), ... 40 | theta, options); 41 | 42 | % Fold softmaxOptTheta into a nicer format 43 | softmaxModel.optTheta = reshape(softmaxOptTheta, numClasses, inputSize); 44 | softmaxModel.inputSize = inputSize; 45 | softmaxModel.numClasses = numClasses; 46 | 47 | end 48 | -------------------------------------------------------------------------------- /sparseAutoencoderCost_nonneg.m: -------------------------------------------------------------------------------- 1 | function [cost,grad, objhistory] = sparseAutoencoderCost_nonneg(theta, visibleSize, hiddenSize, ... 2 | lambda, inputZeroMaskedFraction, dropoutFraction, sparsityParam, beta, data) 3 | 4 | 5 | objhistory = []; 6 | 7 | W1 = reshape(theta(1:hiddenSize*visibleSize), hiddenSize, visibleSize); 8 | W2 = reshape(theta(hiddenSize*visibleSize+1:2*hiddenSize*visibleSize), visibleSize, hiddenSize); 9 | b1 = theta(2*hiddenSize*visibleSize+1:2*hiddenSize*visibleSize+hiddenSize); 10 | b2 = theta(2*hiddenSize*visibleSize+hiddenSize+1:end); 11 | 12 | % Cost and gradient variables (your code needs to compute these values). 13 | % Here, we initialize them to zeros. 
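% Note on the nonnegativity constraint used in this cost: the usual L2
% weight decay is applied only to the negative entries of W1 and W2 (the
% L2_regN term computed below), so lambda/2*L2_regN penalizes negative
% weights while leaving nonnegative weights free; together with the KL
% sparsity term beta*kl, this is what pushes the autoencoder toward
% nonnegative, part-based features.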
14 | cost = 0; 15 | W1grad = zeros(size(W1)); 16 | W2grad = zeros(size(W2)); 17 | b1grad = zeros(size(b1)); 18 | b2grad = zeros(size(b2)); 19 | 20 | %% computing delta's in output and hidden layers 21 | 22 | 23 | y = data; 24 | a1 = data; 25 | if (inputZeroMaskedFraction>0) 26 | a1 = a1.*(rand(size(a1))>inputZeroMaskedFraction); 27 | end 28 | 29 | z2 = W1*a1 + repmat(b1,1,size(a1,2)); 30 | a2 = sigmoid(z2); 31 | 32 | %dropout 33 | if(dropoutFraction > 0) 34 | dropOutMask = (rand(size(a2))>dropoutFraction); 35 | a2 = a2.*dropOutMask; 36 | end 37 | 38 | z3 = W2*a2 + repmat(b2,1,size(a2,2)); 39 | a3 = sigmoid(z3); 40 | 41 | yhat = a3; 42 | 43 | delta3 = -(y - yhat) .* (a3.*(ones(visibleSize,size(y,2))-a3)); 44 | 45 | 46 | param = sum(a2,2)./size(y,2); 47 | par = sparsityParam*ones(hiddenSize,1); 48 | sparsity = beta*(-par./param + (ones(hiddenSize,1)-par)./(ones(hiddenSize,1)-param)); 49 | sparsity = repmat(sparsity,1,size(data,2)); 50 | 51 | delta2 = (W2'*delta3 + sparsity) .* (a2.*(ones(hiddenSize,size(y,2))-a2)); 52 | 53 | if(dropoutFraction > 0) 54 | delta2 = delta2.*dropOutMask; 55 | end 56 | 57 | 58 | kl = sum(sparsityParam*log(par./param) + (1-sparsityParam)*log((ones(hiddenSize,1)-par)./(ones(hiddenSize,1)-param))); 59 | 60 | 61 | idx1 = find(W1 < 0); 62 | idx2 = find(W1 <= -1); 63 | idx3 = find(W1 >= 0); 64 | 65 | idx4 = find(W2 < 0); 66 | idx5 = find(W2 <= -1); 67 | idx6 = find(W2 >= 0); 68 | 69 | L2_regN = sum(sum(W1(idx1).^2)) + sum(sum(W2(idx4).^2)); 70 | L2_regP = sum(sum(W1(idx3).^2)) + sum(sum(W2(idx6).^2)); 71 | L1_reg = sum(abs(W1(:))) + sum(abs(W2(:))); 72 | 73 | 74 | cost = 0.5*sum(sum((y-yhat).^2))./size(y,2) + beta*kl + lambda/2*L2_regN; 75 | 76 | newobj = 0.5*sum(sum((y-yhat).^2))./size(y,2); 77 | objhistory = [objhistory newobj]; 78 | 79 | one1 = ones(size(W1)); 80 | one2 = ones(size(W2)); 81 | 82 | W1grad = delta2*(a1')./(size(y,2)); 83 | W1grad(idx1) = W1grad(idx1) + lambda*W1(idx1); 84 | 85 | W2grad = delta3*(a2')./(size(y,2)); 86 | W2grad(idx4) = W2grad(idx4) + lambda*W2(idx4); 87 | 88 | b1grad = sum(delta2,2)./(size(y,2)); 89 | 90 | b2grad = sum(delta3,2)./(size(y,2)); 91 | 92 | 93 | 94 | grad = [W1grad(:) ; W2grad(:) ; b1grad(:) ; b2grad(:)]; 95 | 96 | end 97 | 98 | 99 | 100 | function sigm = sigmoid(x) 101 | 102 | sigm = 1 ./ (1 + exp(-x)); 103 | end 104 | 105 | -------------------------------------------------------------------------------- /stack2params.m: -------------------------------------------------------------------------------- 1 | function [params, netconfig] = stack2params(stack) 2 | 3 | % Converts a "stack" structure into a flattened parameter vector and also 4 | % stores the network configuration. This is useful when working with 5 | % optimization toolboxes such as minFunc. 6 | % 7 | % [params, netconfig] = stack2params(stack) 8 | % 9 | % stack - the stack structure, where stack{1}.w = weights of first layer 10 | % stack{1}.b = weights of first layer 11 | % stack{2}.w = weights of second layer 12 | % stack{2}.b = weights of second layer 13 | % ... etc. 14 | 15 | 16 | % Setup the compressed param vector 17 | params = []; 18 | for d = 1:numel(stack) 19 | 20 | % This can be optimized. But since our stacks are relatively short, it 21 | % is okay 22 | params = [params ; stack{d}.w(:) ; stack{d}.b(:) ]; 23 | 24 | % Check that stack is of the correct form 25 | assert(size(stack{d}.w, 1) == size(stack{d}.b, 1), ... 26 | ['The bias should be a *column* vector of ' ... 
--------------------------------------------------------------------------------
/stack2params.m:
--------------------------------------------------------------------------------
function [params, netconfig] = stack2params(stack)

% Converts a "stack" structure into a flattened parameter vector and also
% stores the network configuration. This is useful when working with
% optimization toolboxes such as minFunc.
%
% [params, netconfig] = stack2params(stack)
%
% stack - the stack structure, where stack{1}.w = weights of the first layer
%         stack{1}.b = biases of the first layer
%         stack{2}.w = weights of the second layer
%         stack{2}.b = biases of the second layer
%         ... etc.

% Set up the compressed parameter vector
params = [];
for d = 1:numel(stack)

    % Growing params inside the loop could be preallocated, but since our
    % stacks are relatively short, this is okay.
    params = [params ; stack{d}.w(:) ; stack{d}.b(:) ];

    % Check that the stack is of the correct form
    assert(size(stack{d}.w, 1) == size(stack{d}.b, 1), ...
        ['The bias should be a *column* vector of ' ...
         int2str(size(stack{d}.w, 1)) 'x1']);
    if d < numel(stack)
        assert(size(stack{d}.w, 1) == size(stack{d+1}.w, 2), ...
            ['The adjacent layers L' int2str(d) ' and L' int2str(d+1) ...
             ' should have matching sizes.']);
    end

end

if nargout > 1
    % Set up netconfig
    if numel(stack) == 0
        netconfig.inputsize = 0;
        netconfig.layersizes = {};
    else
        netconfig.inputsize = size(stack{1}.w, 2);
        netconfig.layersizes = {};
        for d = 1:numel(stack)
            netconfig.layersizes = [netconfig.layersizes ; size(stack{d}.w, 1)];
        end
    end
end

end
--------------------------------------------------------------------------------
/stackedAECost_nonneg.m:
--------------------------------------------------------------------------------
function [cost, grad] = stackedAECost_nonneg(theta, inputSize, hiddenSize, ...
                                             numClasses, netconfig, ...
                                             lambda1, data, labels)

%% Unroll the parameters

% First extract the part which holds the softmax weights
softmaxTheta = reshape(theta(1:hiddenSize*numClasses), numClasses, hiddenSize);

% Extract out the "stack"
stack = params2stack(theta(hiddenSize*numClasses+1:end), netconfig);

stackgrad = cell(size(stack));
for d = 1:numel(stack)
    stackgrad{d}.w = zeros(size(stack{d}.w));
    stackgrad{d}.b = zeros(size(stack{d}.b));
end

M = size(data, 2);
groundTruth = full(sparse(labels, 1:M, 1));

%% Forward pass through the two stacked encoder layers

W1 = stack{1}.w;
W2 = stack{2}.w;
b1 = stack{1}.b;
b2 = stack{2}.b;

a1 = data;
z2 = W1*a1 + repmat(b1, 1, M);
a2 = sigmoid(z2);
z3 = W2*a2 + repmat(b2, 1, M);
a3 = sigmoid(z3);

% Softmax probabilities, clamped elementwise to keep them finite and to
% avoid log(0) in the cost. (exp(709) is just below the largest finite
% double, realmax ~ exp(709.78).)
prob = exp(softmaxTheta*a3);
prob(isinf(prob)) = exp(709);
prob_norm = prob./repmat(sum(prob), numClasses, 1);
prob_norm(prob_norm == 0) = eps;

%% Backpropagation

delta3 = -(softmaxTheta'*(groundTruth - prob_norm)) .* (a3.*(1 - a3));

delta2 = (W2'*delta3) .* (a2.*(1 - a2));

% Nonnegativity constraint: penalize only the negative weights.
idx1 = find(W1 < 0);
idx4 = find(W2 < 0);

L2_regN = sum(W1(idx1).^2) + sum(W2(idx4).^2);

stackgrad{1}.w = delta2*(a1')./M;
stackgrad{1}.w(idx1) = stackgrad{1}.w(idx1) + lambda1*W1(idx1);
stackgrad{1}.b = sum(delta2,2)./M;

stackgrad{2}.w = delta3*(a2')./M;
stackgrad{2}.w(idx4) = stackgrad{2}.w(idx4) + lambda1*W2(idx4);
stackgrad{2}.b = sum(delta3,2)./M;

idx7 = find(softmaxTheta < 0);
softmax_L2_regN = sum(softmaxTheta(idx7).^2);

softmaxThetaGrad = -1/M * (a3*(groundTruth - prob_norm)');
softmaxThetaGrad = softmaxThetaGrad';
softmaxThetaGrad(idx7) = softmaxThetaGrad(idx7) + lambda1*softmaxTheta(idx7);

cost = -sum(sum(groundTruth.*log(prob_norm)))/M ...
       + lambda1/2*softmax_L2_regN + lambda1/2*L2_regN;

if isnan(cost)
    error('stackedAECost_nonneg:nanCost', 'Cost evaluated to NaN.');
end

%% Roll gradient vector

grad = [softmaxThetaGrad(:) ; stack2params(stackgrad)];

end

function sigm = sigmoid(x)
sigm = 1 ./ (1 + exp(-x));
end
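The clamping above keeps the probabilities finite, but the standard remedy is to subtract the per-example maximum score before exponentiating. A sketch of a drop-in replacement for the four prob/prob_norm lines above (an alternative under the same variable names, not what the repository ships):

    % Numerically stable softmax: shifting each column by its max leaves the
    % probabilities unchanged because the shift cancels in the normalization.
    scores = softmaxTheta*a3;
    scores = bsxfun(@minus, scores, max(scores, [], 1));
    prob_norm = exp(scores);
    prob_norm = prob_norm./repmat(sum(prob_norm, 1), numClasses, 1);
    prob_norm = max(prob_norm, eps);   % still guard log(0) in the cost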
--------------------------------------------------------------------------------
/stackedAEPredict.m:
--------------------------------------------------------------------------------
function [pred] = stackedAEPredict(theta, inputSize, hiddenSize, numClasses, netconfig, dropoutFraction, data)

%% Unroll theta parameter

softmaxTheta = reshape(theta(1:hiddenSize*numClasses), numClasses, hiddenSize);

stack = params2stack(theta(hiddenSize*numClasses+1:end), netconfig);

%% Forward pass

W1 = stack{1}.w;
W2 = stack{2}.w;
b1 = stack{1}.b;
b2 = stack{2}.b;

a1 = data;
z2 = W1*a1 + repmat(b1, 1, size(data,2));
a2 = sigmoid(z2);
if dropoutFraction > 0
    % At test time, scale activations by the keep probability so they match
    % the expected value of the masked activations seen during training.
    a2 = a2.*(1 - dropoutFraction);
end
z3 = W2*a2 + repmat(b2, 1, size(data,2));
a3 = sigmoid(z3);
if dropoutFraction > 0
    a3 = a3.*(1 - dropoutFraction);
end

prob = exp(softmaxTheta*a3);
prob_norm = prob./repmat(sum(prob), numClasses, 1);

[~, pred] = max(prob_norm);

end

function sigm = sigmoid(x)
sigm = 1 ./ (1 + exp(-x));
end
--------------------------------------------------------------------------------
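For completeness, a sketch of how a fine-tuned model would be evaluated with stackedAEPredict; stackedAEOptTheta, testImages, and testLabels are hypothetical placeholders for the output of fine-tuning and the MNIST test-set loaders:

    % Hypothetical evaluation of a fine-tuned stacked NCAE on the test set.
    pred = stackedAEPredict(stackedAEOptTheta, inputSize, hiddenSize, ...
                            numClasses, netconfig, dropoutFraction, testImages);
    acc = mean(pred(:) == testLabels(:));
    fprintf('Test accuracy: %0.3f%%\n', acc * 100);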