├── dataset └── .keep ├── train_models └── .keep ├── minFunc_2012 ├── minFunc │ ├── precondDiag.m │ ├── precondTriu.m │ ├── precondTriuDiag.m │ ├── isLegal.m │ ├── compiled │ │ ├── lbfgsC.mexa64 │ │ ├── lbfgsC.mexglx │ │ ├── lbfgsC.mexmac │ │ ├── lbfgsC.mexmaci │ │ ├── lbfgsC.mexw32 │ │ ├── lbfgsC.mexw64 │ │ ├── mcholC.mexa64 │ │ ├── mcholC.mexglx │ │ ├── mcholC.mexmac │ │ ├── mcholC.mexw32 │ │ ├── mcholC.mexw64 │ │ ├── lbfgsAddC.mexa64 │ │ ├── lbfgsAddC.mexw64 │ │ ├── lbfgsC.mexmaci64 │ │ ├── mcholC.mexmaci64 │ │ ├── lbfgsAddC.mexmaci64 │ │ ├── lbfgsProdC.mexa64 │ │ ├── lbfgsProdC.mexw64 │ │ └── lbfgsProdC.mexmaci64 │ ├── mcholinc.m │ ├── lbfgsUpdate.m │ ├── lbfgsAdd.m │ ├── lbfgsProd.m │ ├── taylorModel.m │ ├── mex │ │ ├── lbfgsAddC.c │ │ ├── lbfgsProdC.c │ │ ├── lbfgsC.c │ │ └── mcholC.c │ ├── lbfgs.m │ ├── dampedUpdate.m │ ├── mchol.m │ ├── conjGrad.m │ ├── polyinterp.m │ ├── minFunc_processInputOptions.m │ ├── ArmijoBacktrack.m │ ├── WolfeLineSearch.m │ └── minFunc.m ├── logisticExample │ ├── LogisticHv.m │ ├── mylogsumexp.m │ ├── LogisticDiagPrecond.m │ ├── LogisticLoss.m │ └── example_minFunc_LR.m ├── mexAll.m ├── autoDif │ ├── autoHv.m │ ├── autoHess.m │ ├── derivativeCheck.m │ ├── autoGrad.m │ ├── autoTensor.m │ └── fastDerivativeCheck.m ├── rosenbrock.m ├── example_derivativeCheck.m ├── ZSL_ObjFunc.m~ ├── ZSL_ObjFunc.m └── example_minFunc.m ├── LICENSE ├── ZSL_ObjFunc_Wz.m ├── ZSL_ObjFunc_Wx.m ├── README.md ├── ZSL_Test.m ├── get_datapath.m └── ZSL_Train.m /dataset/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /train_models/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /minFunc_2012/minFunc/precondDiag.m: -------------------------------------------------------------------------------- 1 | function [y] = 
precondDiag(r,D) 2 | y = D.*r; -------------------------------------------------------------------------------- /minFunc_2012/minFunc/precondTriu.m: -------------------------------------------------------------------------------- 1 | function [y] = precondUpper(r,U) 2 | y = U \ (U' \ r); -------------------------------------------------------------------------------- /minFunc_2012/minFunc/precondTriuDiag.m: -------------------------------------------------------------------------------- 1 | function [y] = precondUpper(r,U,D) 2 | y = U \ (D .* (U' \ r)); -------------------------------------------------------------------------------- /minFunc_2012/minFunc/isLegal.m: -------------------------------------------------------------------------------- 1 | function [legal] = isLegal(v) 2 | legal = sum(any(imag(v(:))))==0 & sum(isnan(v(:)))==0 & sum(isinf(v(:)))==0; -------------------------------------------------------------------------------- /minFunc_2012/minFunc/compiled/lbfgsC.mexa64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsC.mexa64 -------------------------------------------------------------------------------- /minFunc_2012/minFunc/compiled/lbfgsC.mexglx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsC.mexglx -------------------------------------------------------------------------------- /minFunc_2012/minFunc/compiled/lbfgsC.mexmac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsC.mexmac -------------------------------------------------------------------------------- /minFunc_2012/minFunc/compiled/lbfgsC.mexmaci: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsC.mexmaci -------------------------------------------------------------------------------- /minFunc_2012/minFunc/compiled/lbfgsC.mexw32: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsC.mexw32 -------------------------------------------------------------------------------- /minFunc_2012/minFunc/compiled/lbfgsC.mexw64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsC.mexw64 -------------------------------------------------------------------------------- /minFunc_2012/minFunc/compiled/mcholC.mexa64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/mcholC.mexa64 -------------------------------------------------------------------------------- /minFunc_2012/minFunc/compiled/mcholC.mexglx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/mcholC.mexglx -------------------------------------------------------------------------------- /minFunc_2012/minFunc/compiled/mcholC.mexmac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/mcholC.mexmac -------------------------------------------------------------------------------- /minFunc_2012/minFunc/compiled/mcholC.mexw32: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/mcholC.mexw32 -------------------------------------------------------------------------------- /minFunc_2012/minFunc/compiled/mcholC.mexw64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/mcholC.mexw64 -------------------------------------------------------------------------------- /minFunc_2012/minFunc/compiled/lbfgsAddC.mexa64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsAddC.mexa64 -------------------------------------------------------------------------------- /minFunc_2012/minFunc/compiled/lbfgsAddC.mexw64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsAddC.mexw64 -------------------------------------------------------------------------------- /minFunc_2012/minFunc/compiled/lbfgsC.mexmaci64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsC.mexmaci64 -------------------------------------------------------------------------------- /minFunc_2012/minFunc/compiled/mcholC.mexmaci64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/mcholC.mexmaci64 -------------------------------------------------------------------------------- /minFunc_2012/minFunc/compiled/lbfgsAddC.mexmaci64: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsAddC.mexmaci64 -------------------------------------------------------------------------------- /minFunc_2012/minFunc/compiled/lbfgsProdC.mexa64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsProdC.mexa64 -------------------------------------------------------------------------------- /minFunc_2012/minFunc/compiled/lbfgsProdC.mexw64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsProdC.mexw64 -------------------------------------------------------------------------------- /minFunc_2012/minFunc/compiled/lbfgsProdC.mexmaci64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsProdC.mexmaci64 -------------------------------------------------------------------------------- /minFunc_2012/logisticExample/LogisticHv.m: -------------------------------------------------------------------------------- 1 | function [Hv] = LogisticHv(v,w,X,y) 2 | % v(feature,1) - vector that we will multiply Hessian by 3 | % w(feature,1) 4 | % X(instance,feature) 5 | % y(instance,1) 6 | 7 | sig = 1./(1+exp(-y.*(X*w))); 8 | Hv = X.'*(sig.*(1-sig).*(X*v)); 9 | -------------------------------------------------------------------------------- /minFunc_2012/logisticExample/mylogsumexp.m: -------------------------------------------------------------------------------- 1 | function lse = mylogsumexp(b) 2 | % does logsumexp across columns 3 | B = max(b,[],2); 4 | lse = log(sum(exp(b-repmat(B,[1 size(b,2)])),2))+B; 5 | 6 | % Old version that used repmatC 7 | %lse = log(sum(exp(b-repmatC(B,[1 size(b,2)])),2))+B; 8 | 
end -------------------------------------------------------------------------------- /minFunc_2012/mexAll.m: -------------------------------------------------------------------------------- 1 | % minFunc 2 | fprintf('Compiling minFunc files...\n'); 3 | mex -outdir minFunc/compiled minFunc/mex/mcholC.c 4 | mex -outdir minFunc/compiled minFunc/mex/lbfgsC.c 5 | mex -outdir minFunc/compiled minFunc/mex/lbfgsAddC.c 6 | mex -outdir minFunc/compiled minFunc/mex/lbfgsProdC.c 7 | 8 | -------------------------------------------------------------------------------- /minFunc_2012/autoDif/autoHv.m: -------------------------------------------------------------------------------- 1 | function [Hv] = autoHv(v,x,g,useComplex,funObj,varargin) 2 | % [Hv] = autoHv(v,x,g,useComplex,funObj,varargin) 3 | % 4 | % Numerically compute Hessian-vector product H*v of funObj(x,varargin{:}) 5 | % based on gradient values 6 | 7 | if useComplex 8 | mu = 1e-150i; 9 | else 10 | mu = 2*sqrt(1e-12)*(1+norm(x))/norm(v); 11 | end 12 | [f,finDif] = funObj(x + v*mu,varargin{:}); 13 | Hv = (finDif-g)/mu; -------------------------------------------------------------------------------- /minFunc_2012/logisticExample/LogisticDiagPrecond.m: -------------------------------------------------------------------------------- 1 | function [m] = LogisticHv(v,w,X,y) 2 | % v(feature,1) - vector that we will apply diagonal preconditioner to 3 | % w(feature,1) 4 | % X(instance,feature) 5 | % y(instance,1) 6 | 7 | sig = 1./(1+exp(-y.*(X*w))); 8 | 9 | % Compute diagonals of Hessian 10 | sig = sig.*(1-sig); 11 | for i = 1:length(w) 12 | h(i,1) = (sig.*X(:,i))'*X(:,i); 13 | end 14 | 15 | % Apply preconditioner 16 | m = v./h; 17 | 18 | % Exact preconditioner 19 | %H = X'*diag(sig.*(1-sig))*X; 20 | %m = H\v; 21 | -------------------------------------------------------------------------------- /minFunc_2012/minFunc/mcholinc.m: -------------------------------------------------------------------------------- 1 | function [R,tau] = 
mcholinc(H,verbose) 2 | % Computes Cholesky of H+tau*I, for suitably large tau that matrix is pd 3 | 4 | p = size(H,1); 5 | 6 | beta = norm(H,'fro'); 7 | if min(diag(H)) > 1e-12 8 | tau = 0; 9 | else 10 | if verbose 11 | fprintf('Small Value on Diagonal, Adjusting Hessian\n'); 12 | end 13 | tau = max(beta/2,1e-12); 14 | end 15 | while 1 16 | [R,posDef] = chol(H+tau*eye(p)); 17 | if posDef == 0 18 | break; 19 | else 20 | if verbose 21 | fprintf('Cholesky Failed, Adjusting Hessian\n'); 22 | end 23 | tau = max(2*tau,beta/2); 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /minFunc_2012/minFunc/lbfgsUpdate.m: -------------------------------------------------------------------------------- 1 | function [old_dirs,old_stps,Hdiag] = lbfgsUpdate(y,s,corrections,debug,old_dirs,old_stps,Hdiag) 2 | ys = y'*s; 3 | if ys > 1e-10 4 | numCorrections = size(old_dirs,2); 5 | if numCorrections < corrections 6 | % Full Update 7 | old_dirs(:,numCorrections+1) = s; 8 | old_stps(:,numCorrections+1) = y; 9 | else 10 | % Limited-Memory Update 11 | old_dirs = [old_dirs(:,2:corrections) s]; 12 | old_stps = [old_stps(:,2:corrections) y]; 13 | end 14 | 15 | % Update scale of initial Hessian approximation 16 | Hdiag = ys/(y'*y); 17 | else 18 | if debug 19 | fprintf('Skipping Update\n'); 20 | end 21 | end -------------------------------------------------------------------------------- /minFunc_2012/minFunc/lbfgsAdd.m: -------------------------------------------------------------------------------- 1 | function [S,Y,YS,lbfgs_start,lbfgs_end,Hdiag,skipped] = lbfgsAdd(y,s,S,Y,YS,lbfgs_start,lbfgs_end,Hdiag,useMex) 2 | ys = y'*s; 3 | skipped = 0; 4 | corrections = size(S,2); 5 | if ys > 1e-10 6 | if lbfgs_end < corrections 7 | lbfgs_end = lbfgs_end+1; 8 | if lbfgs_start ~= 1 9 | if lbfgs_start == corrections 10 | lbfgs_start = 1; 11 | else 12 | lbfgs_start = lbfgs_start+1; 13 | end 14 | end 15 | else 16 | lbfgs_start = min(2,corrections); 17 | 
lbfgs_end = 1; 18 | end 19 | 20 | if useMex 21 | lbfgsAddC(y,s,Y,S,ys,int32(lbfgs_end)); 22 | else 23 | S(:,lbfgs_end) = s; 24 | Y(:,lbfgs_end) = y; 25 | end 26 | YS(lbfgs_end) = ys; 27 | 28 | % Update scale of initial Hessian approximation 29 | Hdiag = ys/(y'*y); 30 | else 31 | skipped = 1; 32 | end -------------------------------------------------------------------------------- /minFunc_2012/minFunc/lbfgsProd.m: -------------------------------------------------------------------------------- 1 | function [d] = lbfgsProd(g,S,Y,YS,lbfgs_start,lbfgs_end,Hdiag) 2 | % BFGS Search Direction 3 | % 4 | % This function returns the (L-BFGS) approximate inverse Hessian, 5 | % multiplied by the negative gradient 6 | 7 | % Set up indexing 8 | [nVars,maxCorrections] = size(S); 9 | if lbfgs_start == 1 10 | ind = 1:lbfgs_end; 11 | nCor = lbfgs_end-lbfgs_start+1; 12 | else 13 | ind = [lbfgs_start:maxCorrections 1:lbfgs_end]; 14 | nCor = maxCorrections; 15 | end 16 | al = zeros(nCor,1); 17 | be = zeros(nCor,1); 18 | 19 | d = -g; 20 | for j = 1:length(ind) 21 | i = ind(end-j+1); 22 | al(i) = (S(:,i)'*d)/YS(i); 23 | d = d-al(i)*Y(:,i); 24 | end 25 | 26 | % Multiply by Initial Hessian 27 | d = Hdiag*d; 28 | 29 | for i = ind 30 | be(i) = (Y(:,i)'*d)/YS(i); 31 | d = d + S(:,i)*(al(i)-be(i)); 32 | end 33 | -------------------------------------------------------------------------------- /minFunc_2012/minFunc/taylorModel.m: -------------------------------------------------------------------------------- 1 | function [f,g,H] = taylorModel(d,f,g,H,T) 2 | 3 | p = length(d); 4 | 5 | fd3 = 0; 6 | gd2 = zeros(p,1); 7 | Hd = zeros(p); 8 | for t1 = 1:p 9 | for t2 = 1:p 10 | for t3 = 1:p 11 | fd3 = fd3 + T(t1,t2,t3)*d(t1)*d(t2)*d(t3); 12 | 13 | if nargout > 1 14 | gd2(t3) = gd2(t3) + T(t1,t2,t3)*d(t1)*d(t2); 15 | end 16 | 17 | if nargout > 2 18 | Hd(t2,t3) = Hd(t2,t3) + T(t1,t2,t3)*d(t1); 19 | end 20 | end 21 | 22 | end 23 | end 24 | 25 | f = f + g'*d + (1/2)*d'*H*d + (1/6)*fd3; 26 | 27 | if 
nargout > 1 28 | g = g + H*d + (1/2)*gd2; 29 | end 30 | 31 | if nargout > 2 32 | H = H + Hd; 33 | end 34 | 35 | if any(abs(d) > 1e5) 36 | % We want the optimizer to stop if the solution is unbounded 37 | g = zeros(p,1); 38 | end -------------------------------------------------------------------------------- /minFunc_2012/logisticExample/LogisticLoss.m: -------------------------------------------------------------------------------- 1 | function [nll,g,H,T] = LogisticLoss(w,X,y) 2 | % w(feature,1) 3 | % X(instance,feature) 4 | % y(instance,1) 5 | 6 | [n,p] = size(X); 7 | 8 | Xw = X*w; 9 | yXw = y.*Xw; 10 | 11 | nll = sum(mylogsumexp([zeros(n,1) -yXw])); 12 | 13 | if nargout > 1 14 | if nargout > 2 15 | sig = 1./(1+exp(-yXw)); 16 | g = -X.'*(y.*(1-sig)); 17 | else 18 | %g = -X.'*(y./(1+exp(yXw))); 19 | g = -(X.'*(y./(1+exp(yXw)))); 20 | end 21 | end 22 | 23 | if nargout > 2 24 | H = X.'*diag(sparse(sig.*(1-sig)))*X; 25 | end 26 | 27 | if nargout > 3 28 | T = zeros(p,p,p); 29 | for j1 = 1:p 30 | for j2 = 1:p 31 | for j3 = 1:p 32 | T(j1,j2,j3) = sum(y(:).^3.*X(:,j1).*X(:,j2).*X(:,j3).*sig.*(1-sig).*(1-2*sig)); 33 | end 34 | end 35 | end 36 | end -------------------------------------------------------------------------------- /minFunc_2012/rosenbrock.m: -------------------------------------------------------------------------------- 1 | function [f, df] = rosenbrock(x, y ) 2 | 3 | % rosenbrock.m This function returns the function value, partial derivatives 4 | % and Hessian of the (general dimension) rosenbrock function, given by: 5 | % 6 | % f(x) = sum_{i=1:D-1} 100*(x(i+1) - x(i)^2)^2 + (1-x(i))^2 7 | % 8 | % where D is the dimension of x. The true minimum is 0 at x = (1 1 ... 1). 9 | % 10 | % Carl Edward Rasmussen, 2001-07-21. 
11 | 12 | D = length(x); 13 | f = sum(100*(x(2:D)-x(1:D-1).^2).^2 + (1-x(1:D-1)).^2); 14 | 15 | if nargout > 1 16 | df = zeros(D, 1); 17 | df(1:D-1) = - 400*x(1:D-1).*(x(2:D)-x(1:D-1).^2) - 2*(1-x(1:D-1)); 18 | df(2:D) = df(2:D) + 200*(x(2:D)-x(1:D-1).^2); 19 | end 20 | 21 | % if nargout > 2 22 | % ddf = zeros(D,D); 23 | % ddf(1:D-1,1:D-1) = diag(-400*x(2:D) + 1200*x(1:D-1).^2 + 2); 24 | % ddf(2:D,2:D) = ddf(2:D,2:D) + 200*eye(D-1); 25 | % ddf = ddf - diag(400*x(1:D-1),1) - diag(400*x(1:D-1),-1); 26 | % end 27 | -------------------------------------------------------------------------------- /minFunc_2012/minFunc/mex/lbfgsAddC.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "mex.h" 3 | 4 | /* See lbfgsAdd.m for details */ 5 | /* This function will not exit gracefully on bad input! */ 6 | 7 | void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) 8 | { 9 | /* Variable Declarations */ 10 | 11 | double *s,*y,*S, *Y, ys; 12 | int i,j,nVars,lbfgs_end; 13 | 14 | /* Get Input Pointers */ 15 | 16 | y = mxGetPr(prhs[0]); 17 | s = mxGetPr(prhs[1]); 18 | Y = mxGetPr(prhs[2]); 19 | S = mxGetPr(prhs[3]); 20 | ys= mxGetScalar(prhs[4]); 21 | lbfgs_end = (int)mxGetScalar(prhs[5]); 22 | 23 | if (!mxIsClass(prhs[5],"int32")) 24 | mexErrMsgTxt("lbfgs_end must be int32"); 25 | 26 | /* Compute number of variables, maximum number of corrections */ 27 | 28 | nVars = mxGetDimensions(prhs[2])[0]; 29 | 30 | for(j=0;j 1e-4 23 | H 24 | H2 25 | diff = abs(H-H2) 26 | pause; 27 | end 28 | else 29 | [f,g] = funObj(x,varargin{:}); 30 | 31 | fprintf('Checking Gradient...\n'); 32 | [f2,g2] = autoGrad(x,type,funObj,varargin{:}); 33 | 34 | fprintf('Max difference between user and numerical gradient: %e\n',max(abs(g-g2))); 35 | if max(abs(g-g2)) > 1e-4 36 | fprintf('User NumDif:\n'); 37 | [g g2] 38 | diff = abs(g-g2) 39 | pause 40 | end 41 | end 42 | 43 | 
-------------------------------------------------------------------------------- /minFunc_2012/autoDif/autoGrad.m: -------------------------------------------------------------------------------- 1 | function [f,g] = autoGrad(x,type,funObj,varargin) % [f,g] = autoGrad(x,type,funObj,varargin) % % Numerically compute gradient of objective function from function values % % type = % 1 - forward-differencing (p+1 evaluations) % 2 - central-differencing (more accurate, but requires 2p evaluations) % 3 - complex-step derivative (most accurate and only requires p evaluations, but only works for certain objectives) p = length(x); if type == 1 % Use Finite Differencing f = funObj(x,varargin{:}); mu = 2*sqrt(1e-12)*(1+norm(x)); diff = zeros(p,1); for j = 1:p e_j = zeros(p,1); e_j(j) = 1; diff(j,1) = funObj(x + mu*e_j,varargin{:}); end g = (diff-f)/mu; elseif type == 3 % Use Complex Differentials mu = 1e-150; diff = zeros(p,1); for j = 1:p e_j = zeros(p,1); e_j(j) = 1; diff(j,1) = funObj(x + mu*i*e_j,varargin{:}); end f = mean(real(diff)); g = imag(diff)/mu; else % Use Central Differencing mu = 2*sqrt(1e-12)*(1+norm(x)); diff1 = zeros(p,1); diff2 = zeros(p,1); for j = 1:p e_j = zeros(p,1); e_j(j) = 1; diff1(j,1) = funObj(x + mu*e_j,varargin{:}); diff2(j,1) = funObj(x - mu*e_j,varargin{:}); end f = mean([diff1;diff2]); g = (diff1 - diff2)/(2*mu); end if 0 % DEBUG CODE [fReal gReal] = funObj(x,varargin{:}); [fReal f] [gReal g] diff pause; end -------------------------------------------------------------------------------- /minFunc_2012/autoDif/autoTensor.m: -------------------------------------------------------------------------------- 1 | function [f,g,H,T] = autoTensor(x,type,funObj,varargin) % [f,g,H,T] = autoTensor(x,type,funObj,varargin) % Numerically compute Tensor of 3rd-derivatives of objective function from Hessian values % % type = % 2 - central-differencing (Hessian evaluated at x + mu*e_j and x - mu*e_j) % 3 - complex-step derivative % otherwise - forward-differencing p = length(x); if type == 2 % Use Central Differencing mu = 2*sqrt(1e-12)*(1+norm(x)); f1 = zeros(p,1); f2 = zeros(p,1); g1 = zeros(p); g2 = zeros(p); diff1 = 
zeros(p,p,p); diff2 = zeros(p,p,p); for j = 1:p e_j = zeros(p,1); e_j(j) = 1; [f1(j) g1(:,j) diff1(:,:,j)] = funObj(x + mu*e_j,varargin{:}); [f2(j) g2(:,j) diff2(:,:,j)] = funObj(x - mu*e_j,varargin{:}); end f = mean([f1;f2]); g = mean([g1 g2],2); H = mean(cat(3,diff1,diff2),3); T = (diff1-diff2)/(2*mu); elseif type == 3 % Use Complex Differentials mu = 1e-150; f = zeros(p,1); g = zeros(p); diff = zeros(p,p,p); for j = 1:p e_j = zeros(p,1); e_j(j) = 1; [f(j) g(:,j) diff(:,:,j)] = funObj(x + mu*i*e_j,varargin{:}); end f = mean(real(f)); g = mean(real(g),2); H = mean(real(diff),3); T = imag(diff)/mu; else % Use finite differencing mu = 2*sqrt(1e-12)*(1+norm(x)); [f,g,H] = funObj(x,varargin{:}); diff = zeros(p,p,p); for j = 1:p e_j = zeros(p,1); e_j(j) = 1; [~ ~ diff(:,:,j)] = funObj(x + mu*e_j,varargin{:}); end T = (diff-repmat(H,[1 1 p]))/mu; end -------------------------------------------------------------------------------- /ZSL_ObjFunc_Wz.m: -------------------------------------------------------------------------------- 1 | function [f, df] = ZSL_ObjFunc_Wz(W_z_vec, num_Parts, c, dx, dz, W_x, X, Z, Y, ZZ_t, D_xzi, lambda1, lambda2, GPU_mode) 2 | 3 | W_z = reshape(W_z_vec, [c, dz]); 4 | 5 | dp = dx / num_Parts; 6 | W_x_t = W_x'; 7 | 8 | XX_t = X * X'; 9 | XYZ_t = X * Y * Z'; 10 | 11 | %%%% precompute multiplication 12 | Wxt_Wz = W_x' * W_z; 13 | Wxt_Wz_Z = Wxt_Wz * Z; 14 | 15 | trace_sum = 0; 16 | for i = 1:num_Parts 17 | trace_sum = trace_sum + trace( W_x_t((dp*(i-1)+1) : dp*(i),:) * W_z * full(D_xzi{i}) * W_z' * W_x_t((dp*(i-1)+1):dp*(i),:)'); 18 | end 19 | 20 | %%%% calculate loss 21 | f = norm( (X'* Wxt_Wz_Z - Y) ,'fro')^2 + lambda1 * norm( Wxt_Wz_Z ,'fro')^2 + lambda2 * trace_sum; 22 | if(GPU_mode) 23 | f = gather(f); 24 | end 25 | %%%% calculate the derivative of W_z 26 | term0 = W_x * XX_t * Wxt_Wz * ZZ_t - W_x * XYZ_t; 27 | term1 = lambda1 * W_x * Wxt_Wz * ZZ_t ; 28 | if(GPU_mode) 29 | term2 = gpuArray(zeros(c, dz)); 30 | else 31 | term2 = zeros(c, dz); 32 | end 33 | 34 | for i 
= 1:num_Parts 35 | term2 = term2 + W_x_t((dp*(i-1)+1) : dp*(i),:)'* W_x_t((dp*(i-1)+1) : dp*(i),:) * W_z * full(D_xzi{i}); 36 | end 37 | term2 = term2 * lambda2; 38 | dW_z = 2 * (term0 + term1 + term2); 39 | df = reshape(dW_z, [c*dz,1]); 40 | if(GPU_mode) 41 | df = gather(df); 42 | end 43 | 44 | end 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /ZSL_ObjFunc_Wx.m: -------------------------------------------------------------------------------- 1 | function [f, df] = ZSL_ObjFunc_Wx(W_x_vec, num_Parts, c, dx, W_z, X, Z, Y, ZZ_t, D_xzi, lambda1, lambda2, GPU_mode) 2 | % Returns the objective value f and its gradient df with respect to W_x; W_x_vec is reshaped to c-by-dx and df is returned as a (c*dx)-by-1 vector. 3 | W_x = reshape(W_x_vec, [c, dx]); 4 | 5 | dp = dx / num_Parts; 6 | W_x_t = W_x'; 7 | 8 | XX_t = X * X'; 9 | XYZ_t = X * Y * Z'; 10 | 11 | %%%% precompute multiplication 12 | Wxt_Wz = W_x' * W_z; 13 | Wxt_Wz_Z = Wxt_Wz * Z; 14 | 15 | trace_sum = 0; 16 | for i = 1 : num_Parts 17 | trace_sum = trace_sum + trace( W_x_t((dp*(i-1)+1) : dp*(i),:) * W_z * full(D_xzi{i}) * W_z' * W_x_t((dp*(i-1)+1) : dp*(i),:)'); 18 | end 19 | 20 | %%%% calculate loss 21 | f = norm((X'* Wxt_Wz_Z - Y) ,'fro')^2 + lambda1 * norm( Wxt_Wz_Z ,'fro')^2 + lambda2 * trace_sum; 22 | if(GPU_mode) 23 | f = gather(f); 24 | end 25 | 26 | %%%% calculate the derivative of W_x (the factor 2 applied in dW_x below already covers the data term, so W_z * XYZ_t' must not be doubled here) 27 | term0 = W_z * ZZ_t * Wxt_Wz' * XX_t - W_z * XYZ_t'; 28 | term1 = lambda1 * W_z * ZZ_t * Wxt_Wz'; 29 | if(GPU_mode) 30 | term2 = gpuArray(zeros(dx, c)); 31 | else 32 | term2 = zeros(dx, c); 33 | end 34 | for i = 1 : num_Parts 35 | term2((dp*(i-1)+1) : dp*(i), :) = W_x_t((dp*(i-1)+1) : dp*(i),:)* W_z * full(D_xzi{i}) * W_z'; 36 | end 37 | term2 = lambda2 * term2; 38 | 39 | dW_x = 2 * (term0 + term1 + term2'); 40 | df = reshape(dW_x, [c*dx,1]); 41 | 42 | if(GPU_mode) 43 | df = gather(df); 44 | end 45 | 46 | end 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /minFunc_2012/minFunc/mchol.m: 
-------------------------------------------------------------------------------- 1 | function [l,d,perm] = mchol(A,mu) 2 | % [l,d,perm] = mchol(A,mu) 3 | % Compute the Gill-Murray modified LDL factorization of A, 4 | % returning the factor l, the diagonal d and the permutation perm (mu defaults to 1e-12) 5 | if nargin < 2 6 | mu = 1e-12; 7 | end 8 | 9 | n = size(A,1); 10 | l = eye(n); 11 | d = zeros(n,1); 12 | perm = 1:n; 13 | 14 | for i = 1:n 15 | c(i,i) = A(i,i); 16 | end 17 | 18 | % Compute modification parameters 19 | gamma = max(abs(diag(A))); 20 | offDiag = A; offDiag(1:n+1:end) = 0; xi = max(max(abs(offDiag))); % largest off-diagonal magnitude, computed inline instead of through a separate setdiag helper 21 | delta = mu*max(gamma+xi,1); 22 | if n > 1 23 | beta = sqrt(max([gamma xi/sqrt(n^2-1) mu])); 24 | else 25 | beta = sqrt(max([gamma mu])); 26 | end 27 | 28 | for j = 1:n 29 | 30 | % Find q that results in Best Permutation with j 31 | [maxVal maxPos] = max(abs(diag(c(j:end,j:end)))); 32 | q = maxPos+j-1; 33 | 34 | % Permute d,c,l,a 35 | d([j q]) = d([q j]); 36 | perm([j q]) = perm([q j]); 37 | c([j q],:) = c([q j],:); 38 | c(:,[j q]) = c(:,[q j]); 39 | l([j q],:) = l([q j],:); 40 | l(:,[j q]) = l(:,[q j]); 41 | A([j q],:) = A([q j],:); 42 | A(:,[j q]) = A(:,[q j]); 43 | 44 | for s = 1:j-1 45 | l(j,s) = c(j,s)/d(s); 46 | end 47 | for i = j+1:n 48 | c(i,j) = A(i,j) - sum(l(j,1:j-1).*c(i,1:j-1)); 49 | end 50 | theta = 0; 51 | if j < n 52 | theta = max(abs(c(j+1:n,j))); 53 | end 54 | d(j) = max([abs(c(j,j)) (theta/beta)^2 delta]); 55 | if j < n 56 | for i = j+1:n 57 | c(i,i) = c(i,i) - (c(i,j)^2)/d(j); 58 | end 59 | end 60 | end -------------------------------------------------------------------------------- /minFunc_2012/example_derivativeCheck.m: -------------------------------------------------------------------------------- 1 | clear all 2 | 3 | nInst = 250; 4 | nVars = 10; 5 | X = randn(nInst,nVars); 6 | w = randn(nVars,1); 7 | y = sign(X*w + randn(nInst,1)); 8 | 9 | wTest = randn(nVars,1); 10 | 11 | fprintf('Testing gradient using forward-differencing...\n'); 12 | order = 1; 13 | derivativeCheck(@LogisticLoss,wTest,order,1,X,y); 14 | 15 | fprintf('Testing gradient 
function diff = fastDerivativeCheck(funObj,x,order,type,varargin)
% diff = fastDerivativeCheck(funObj,x,order,type,varargin)
%
% Cheap derivative check along ONE random sign direction d:
%   order == 1 : compares g'*d with a numerical directional derivative
%   order == 2 : compares H*d with a numerical Hessian-vector product
%
% Inputs:
%   funObj   : handle returning [f,g] (and H as third output when order == 2)
%   x        : point at which the derivatives are checked
%   order    : 1 (gradient, default) or 2 (Hessian)
%   type     : 1 forward differences, 2 central differences (default),
%              3 complex-step
%   varargin : extra arguments forwarded to funObj
%
% Output:
%   diff : maximum absolute difference between the user-supplied and the
%          numerical quantity (the value that is also printed)

% Defaults are resolved independently, so that calling with exactly three
% arguments (order given, type omitted) works.
if nargin < 3 || isempty(order)
    order = 1; % Only check gradient by default
end
if nargin < 4 || isempty(type)
    type = 2; % Use central-differencing by default
end

p = length(x);
d = sign(randn(p,1));

% Step sizes: same values as before (the former expression
% 2*sqrt(1e-12)*(1+norm(x))/(1+norm(x)) reduces to 2*sqrt(1e-12)).
muReal = 2*sqrt(1e-12);
muCplx = 1e-150;

if order == 2
    fprintf('Checking Hessian-vector product along random direction:\n');
    [f,g,H] = funObj(x,varargin{:});
    Hv = H*d;

    if type == 1 % Use Finite Differencing
        [fPlus,gPlus] = funObj(x+d*muReal,varargin{:});
        Hv2 = (gPlus-g)/muReal;
    elseif type == 3 % Use Complex Differentials
        [fCplx,gCplx] = funObj(x+d*muCplx*1i,varargin{:});
        Hv2 = imag(gCplx-g)/muCplx;
    else % Use Central Differencing
        [fPlus,gPlus] = funObj(x+d*muReal,varargin{:});
        [fMinus,gMinus] = funObj(x-d*muReal,varargin{:});
        Hv2 = (gPlus-gMinus)/(2*muReal);
    end

    diff = max(abs(Hv-Hv2));
    fprintf('Max difference between user and numerical Hessian-vector product: %e\n',diff);
else
    fprintf('Checking Gradient along random direction:\n');
    [f,g] = funObj(x,varargin{:});
    gtd = g'*d;

    if type == 1 % Use Finite Differencing
        fPlus = funObj(x+d*muReal,varargin{:});
        gtd2 = (fPlus-f)/muReal;
    elseif type == 3 % Use Complex Differentials
        [fCplx,gCplx] = funObj(x+d*muCplx*1i,varargin{:});
        gtd2 = imag(fCplx)/muCplx;
    else % Use Central Differencing
        fPlus = funObj(x+d*muReal,varargin{:});
        fMinus = funObj(x-d*muReal,varargin{:});
        gtd2 = (fPlus-fMinus)/(2*muReal);
    end

    diff = max(abs(gtd-gtd2));
    fprintf('Max difference between user and numerical directional-derivative: %e\n',diff);
end
function [x,k,res,negCurv] = cg(A,b,optTol,maxIter,verbose,precFunc,precArgs,matrixVectFunc,matrixVectArgs)
% [x,k,res,negCurv] =
% cg(A,b,optTol,maxIter,verbose,precFunc,precArgs,matrixVectFunc,matrixVectArgs)
%
% Linear conjugate gradient for A*x = b, started from x = 0.
% Optional pieces:
%   - precFunc(v,precArgs{:})             applies a preconditioner to v
%   - matrixVectFunc(v,matrixVectArgs{:}) replaces the product A*v
%
% Outputs:
%   x       : final iterate
%   k       : number of completed iterations
%   res     : norm of the final residual A*x - b
%   negCurv : direction of negative curvature; only filled in (followed by an
%             immediate return) when four outputs are requested

if nargin <= 4
    verbose = 0;
end

% Which optional callbacks are in play (decided once, up front)
havePrecond = nargin >= 7 && ~isempty(precFunc);
haveMatVec  = nargin >= 9;

x = zeros(size(b));
resid = -b;                      % residual A*x - b at x = 0

% Preconditioned residual
if havePrecond
    z = precFunc(resid,precArgs{:});
else
    z = resid;
end

rz = resid'*z;
dirn = -z;                       % first search direction
k = 0;

res = norm(resid);
stopAfterThis = 0;
negCurv = [];
while (res > optTol) && (k < maxIter) && ~stopAfterThis
    % Matrix-vector product with the current direction
    if haveMatVec
        Adir = matrixVectFunc(dirn,matrixVectArgs{:});
    else
        Adir = A*dirn;
    end
    curv = dirn'*Adir;

    % Non-positive curvature handling
    if curv <= 1e-16
        if verbose
            fprintf('Negative Curvature Detected!\n');
        end

        if nargout == 4 && curv < 0
            negCurv = dirn;
            return
        end

        if k ~= 0
            if verbose
                fprintf('Stopping\n');
            end
            break;
        end

        % Very first iteration: take this one step, then stop
        if verbose
            fprintf('First-Iter, Proceeding...\n');
        end
        stopAfterThis = 1;
    end

    % Step along the direction and update the residual
    step = rz/(curv);
    x = x + step*dirn;
    resid = resid + step*Adir;

    % Re-apply the preconditioner to the new residual
    if havePrecond
        z = precFunc(resid,precArgs{:});
    else
        z = resid;
    end

    % New conjugate direction
    rzNext = resid'*z;
    ratio = rzNext/rz;
    dirn = -z + ratio*dirn;
    k = k + 1;

    % Roll state forward
    rz = rzNext;
    res = norm(resid);
end
end
reshape(W_z_vec, [c, dz]); 9 | 10 | dp = dx/7; 11 | W_x_t = W_x'; %% W_x_transform 12 | 13 | %W_x_p = zeros(dp, c, 7); 14 | %for i = 1:7 15 | % W_x_p(:,:,i) = W_x_t((dp*(i-1)+1) : dp*(i),:); 16 | %end 17 | 18 | % % precompute multplication 19 | 20 | Wxt_Wz = W_x' * W_z; 21 | Wxt_Wz_Z = Wxt_Wz * Z; %Wxt_Wz_Z = W_x'*W_z*Z; 22 | 23 | trace_sum = 0; 24 | %D_xzi = zeros(dz,dz,7); 25 | for i = 1:7 26 | W_xz = W_x_t((dp*(i-1)+1) : dp*(i),:) * W_z; 27 | D_xzi(:,:,i) = diag([1 ./ (2*sqrt(sum((W_xz').^2,2) + 0.0001))]); %diag([1 ./ (2*normL2_by_row(W_xz'))]); 28 | trace_sum = trace_sum + trace( W_x_t((dp*(i-1)+1) : dp*(i),:) * W_z * D_xzi(:,:,i) * W_z' * W_x_t((dp*(i-1)+1) : dp*(i),:)'); 29 | end 30 | 31 | D_z = diag([1 ./ (2*sqrt(sum((W_z').^2,2) + 0.0001))]); %diag([1 ./ (2*normL2_by_row(W_z'))]); %% dz X dz 32 | % % loss function 33 | f = norm( (X'* Wxt_Wz_Z - Y) ,'fro')^2 + lambda1 * norm( Wxt_Wz_Z ,'fro')^2 +... 34 | lambda2 * trace(W_z * D_z * W_z') + lambda3 * trace_sum; 35 | 36 | % % calculate the derivative of W_x 37 | term1 = W_z * ZZ_t * Wxt_Wz' * XX_t - 2 * W_z * XYZ_t'; 38 | term2 = lambda1 * W_z * ZZ_t * Wxt_Wz'; 39 | term4 = zeros(dx, c); 40 | for i = 1:7 41 | term4((dp*(i-1)+1) : dp*(i), :) = W_x_t((dp*(i-1)+1) : dp*(i),:)* W_z * D_xzi(:,:,i) * W_z'; 42 | end 43 | term4 = lambda3 * term4; 44 | 45 | dW_x = 2 * (term1 + term2 + term4'); 46 | dW_x_vec = reshape(dW_x, [c*dx,1]); 47 | 48 | 49 | % % calculate the derivative of W_z 50 | term1 = W_x * XX_t * Wxt_Wz * ZZ_t - W_x * XYZ_t; 51 | term2 = lambda1 * W_x * Wxt_Wz * ZZ_t ; 52 | term3 = lambda2 * W_z * D_z; 53 | term4 = zeros(c, dz); 54 | for i = 1:7 55 | term4 = term4 + W_x_t((dp*(i-1)+1) : dp*(i),:)'* W_x_t((dp*(i-1)+1) : dp*(i),:) * W_z * D_xzi(:,:,i); 56 | end 57 | term4 = term4 * lambda3; 58 | dW_z = 2 * (term1 + term2 + term3 + term4); 59 | dW_z_vec = reshape(dW_z, [c*dz,1]); 60 | 61 | df = [dW_x_vec; dW_z_vec]; 62 | 63 | fprintf(['f = ', num2str(f), '\n']); 64 | end 65 | 66 | 67 | % % function value = 
normL2_by_row(M) 68 | % % ep = 0.0001; 69 | % % value = sqrt(sum(M.^2,2) + ep); 70 | % % end 71 | 72 | -------------------------------------------------------------------------------- /minFunc_2012/minFunc/mex/lbfgsProdC.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "mex.h" 3 | 4 | /* See lbfgsProd.m for details */ 5 | /* This function will not exit gracefully on bad input! */ 6 | 7 | void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) 8 | { 9 | /* Variable Declarations */ 10 | 11 | double *S, *Y, *YS, *g, Hdiag, *d, *alpha, *beta; 12 | int i,j,nVars,nCor,maxCor,lbfgs_start,lbfgs_end; 13 | 14 | /* Get Input Pointers */ 15 | 16 | g = mxGetPr(prhs[0]); 17 | S = mxGetPr(prhs[1]); 18 | Y = mxGetPr(prhs[2]); 19 | YS= mxGetPr(prhs[3]); 20 | lbfgs_start = (int)mxGetScalar(prhs[4]); 21 | lbfgs_end = (int)mxGetScalar(prhs[5]); 22 | Hdiag = mxGetScalar(prhs[6]); 23 | 24 | if (!mxIsClass(prhs[4],"int32")||!mxIsClass(prhs[5],"int32")) 25 | mexErrMsgTxt("lbfgs_start and lbfgs_end must be int32"); 26 | 27 | /* Compute number of variables, maximum number of corrections */ 28 | 29 | nVars = mxGetDimensions(prhs[1])[0]; 30 | maxCor = mxGetDimensions(prhs[1])[1]; 31 | 32 | /* Compute number of corrections available */ 33 | if (lbfgs_start == 1) 34 | nCor = lbfgs_end-lbfgs_start+1; 35 | else 36 | nCor = maxCor; 37 | 38 | /* Allocate Memory for Local Variables */ 39 | alpha = mxCalloc(nCor,sizeof(double)); 40 | beta = mxCalloc(nCor,sizeof(double)); 41 | 42 | /* Set-up Output Vector */ 43 | plhs[0] = mxCreateDoubleMatrix(nVars,1,mxREAL); 44 | d = mxGetPr(plhs[0]); 45 | 46 | for(j=0;j= 0;i--) { 50 | alpha[i] = 0; 51 | for(j=0;j= lbfgs_start-1;i--) { 59 | alpha[i] = 0; 60 | for(j=0;j> **ZSL_Test('CUBird', 'Easy', 'ATN')** ### ATN means using groundtruth part annotation 27 | Dataset: CUB2011 Easy ATN 28 | Model: trained_models/CUBird_Easy_ATN.mat 29 | Load Testing set 30 | test_acc = 43.5049% 31 | 32 
 | ---------------------------------------------------------------------- 33 | >> **ZSL_Test('CUBird', 'Easy', 'DET')** ### DET means using the detected parts instead of the ground-truth (GT) parts. 34 | Dataset: CUB2011 Easy DET 35 | Model: trained_models/CUBird_Easy_DET.mat 36 | Load Testing set 37 | test_acc = 37.5725% 38 | 39 | #### NABirds Easy/Hard split in Table3 40 | -------------------------------------------------------------------------------- 41 | >> **ZSL_Test('NABird', 'Easy')** ### Easy means category-share splitting 42 | Dataset: NABird Easy DET 43 | Model: trained_models/NABird_Easy_DET.mat 44 | Load Testing set 45 | test_acc = 30.5937% 46 | 47 | -------------------------------------------------- 48 | >> **ZSL_Test('NABird', 'Hard')** ### Hard means category-exclusive splitting 49 | Dataset: NABird Hard DET 50 | Model: trained_models/NABird_Hard_DET.mat 51 | Load Testing set 52 | test_acc = 8.1349% 53 | 54 | 55 | 56 | Training 57 | --------- 58 | >>ZSL_Train(Dataset, Splitmode, ImgFtSource, lambda1, lambda2, GPU_mode) 59 | is the command to train the model using a particular setting. 60 | % For example, ZSL_Train('CUBird', 'Easy', 'DET', 100000, 10000, true) trains on the CUBirds dataset with the Easy split, using the detected part boxes, 61 | with lambda1=100000, lambda2=10000, and GPU_mode=true (using GPU mode for training). If GPU_mode is false, the training is done on CPU. 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /minFunc_2012/minFunc/mex/lbfgsC.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "mex.h" 3 | 4 | /* See lbfgs.m for details! */ 5 | /* This function may not exit gracefully on bad input!
*/ 6 | 7 | 8 | void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) 9 | { 10 | /* Variable Declarations */ 11 | 12 | double *s, *y, *g, *H, *d, *ro, *alpha, *beta, *q, *r; 13 | int nVars,nSteps,lhs_dims[2]; 14 | double temp; 15 | int i,j; 16 | 17 | /* Get Input Pointers */ 18 | 19 | g = mxGetPr(prhs[0]); 20 | s = mxGetPr(prhs[1]); 21 | y = mxGetPr(prhs[2]); 22 | H = mxGetPr(prhs[3]); 23 | 24 | /* Compute number of variables (p), rank of update (d) */ 25 | 26 | nVars = mxGetDimensions(prhs[1])[0]; 27 | nSteps = mxGetDimensions(prhs[1])[1]; 28 | 29 | /* Allocated Memory for Function Variables */ 30 | ro = mxCalloc(nSteps,sizeof(double)); 31 | alpha = mxCalloc(nSteps,sizeof(double)); 32 | beta = mxCalloc(nSteps,sizeof(double)); 33 | q = mxCalloc(nVars*(nSteps+1),sizeof(double)); 34 | r = mxCalloc(nVars*(nSteps+1),sizeof(double)); 35 | 36 | /* Set-up Output Vector */ 37 | 38 | lhs_dims[0] = nVars; 39 | lhs_dims[1] = 1; 40 | 41 | plhs[0] = mxCreateNumericArray(2,lhs_dims,mxDOUBLE_CLASS,mxREAL); 42 | d = mxGetPr(plhs[0]); 43 | 44 | /* ro = 1/(y(:,i)'*s(:,i)) */ 45 | for(i=0;i=0;i--) 62 | { 63 | /* alpha(i) = ro(i)*s(:,i)'*q(:,i+1) */ 64 | alpha[i] = 0; 65 | for(j=0;j= xminBound && xCP <= xmaxBound 106 | fCP = polyval(params,xCP); 107 | if imag(fCP)==0 && fCP < fmin 108 | minPos = real(xCP); 109 | fmin = real(fCP); 110 | end 111 | end 112 | end 113 | 114 | % Plot Situation 115 | if doPlot 116 | clf; hold on; 117 | 118 | % Plot Points 119 | plot(points(:,1),points(:,2),'b*'); 120 | 121 | % Plot Derivatives 122 | for i = 1:nPoints 123 | if isreal(points(i,3)) 124 | m = points(i,3); 125 | b = points(i,2) - m*points(i,1); 126 | plot([points(i,1)-.05 points(i,1)+.05],... 

function [verbose,verboseI,debug,doPlot,maxFunEvals,maxIter,optTol,progTol,method,...
    corrections,c1,c2,LS_init,cgSolve,qnUpdate,cgUpdate,initialHessType,...
    HessianModify,Fref,useComplex,numDiff,LS_saveHessianComp,...
    Damped,HvFunc,bbType,cycle,...
    HessianIter,outputFcn,useMex,useNegCurv,precFunc,...
    LS_type,LS_interp,LS_multi,DerivativeCheck] = ...
    minFunc_processInputOptions(o)
% Translate the user options struct "o" into the individual settings used by
% minFunc. Field names are matched case-insensitively: every field of "o" is
% duplicated under its upper-case name and all look-ups use upper-case keys.
% Method-specific defaults (set by o.Method) can be overridden by explicit
% fields; empty fields fall back to the default.

% Constants
SD = 0;
CSD = 1;
BB = 2;
CG = 3;
PCG = 4;
LBFGS = 5;
QNEWTON = 6;
NEWTON0 = 7;
NEWTON = 8;
TENSOR = 9;

verbose = 1;
verboseI= 1;
debug = 0;
doPlot = 0;
method = LBFGS;
cgSolve = 0;

o = toUpper(o);

if isfield(o,'DISPLAY')
    switch(upper(o.DISPLAY))
        case 0
            verbose = 0;
            verboseI = 0;
        case 'FINAL'
            verboseI = 0;
        case 'OFF'
            verbose = 0;
            verboseI = 0;
        case 'NONE'
            verbose = 0;
            verboseI = 0;
        case 'FULL'
            debug = 1;
        case 'EXCESSIVE'
            debug = 1;
            doPlot = 1;
    end
end

DerivativeCheck = 0;
if isfield(o,'DERIVATIVECHECK')
    switch(upper(o.DERIVATIVECHECK))
        case 1
            DerivativeCheck = 1;
        case 'ON'
            DerivativeCheck = 1;
    end
end

% Defaults that depend on the chosen method
LS_init = 0;
LS_type = 1;
LS_interp = 2;
LS_multi = 0;
Fref = 1;
Damped = 0;
HessianIter = 1;
c2 = 0.9;
if isfield(o,'METHOD')
    m = upper(o.METHOD);
    switch(m)
        case 'TENSOR'
            method = TENSOR;
        case 'NEWTON'
            method = NEWTON;
        case 'MNEWTON'
            method = NEWTON;
            HessianIter = 5;
        case 'PNEWTON0'
            method = NEWTON0;
            cgSolve = 1;
        case 'NEWTON0'
            method = NEWTON0;
        case 'QNEWTON'
            method = QNEWTON;
            Damped = 1;
        case 'LBFGS'
            method = LBFGS;
        case 'BB'
            method = BB;
            LS_type = 0;
            Fref = 20;
        case 'PCG'
            method = PCG;
            c2 = 0.2;
            LS_init = 2;
        case 'SCG'
            method = CG;
            c2 = 0.2;
            LS_init = 4;
        case 'CG'
            method = CG;
            c2 = 0.2;
            LS_init = 2;
        case 'CSD'
            method = CSD;
            c2 = 0.2;
            Fref = 10;
            LS_init = 2;
        case 'SD'
            method = SD;
            LS_init = 2;
    end
end

maxFunEvals = getOpt(o,'MAXFUNEVALS',1000);
maxIter = getOpt(o,'MAXITER',500);
optTol = getOpt(o,'OPTTOL',1e-5);
progTol = getOpt(o,'PROGTOL',1e-9);
corrections = getOpt(o,'CORRECTIONS',100);
corrections = getOpt(o,'CORR',corrections);
c1 = getOpt(o,'C1',1e-4);
c2 = getOpt(o,'C2',c2);
LS_init = getOpt(o,'LS_INIT',LS_init);
cgSolve = getOpt(o,'CGSOLVE',cgSolve);
qnUpdate = getOpt(o,'QNUPDATE',3);
cgUpdate = getOpt(o,'CGUPDATE',2);
initialHessType = getOpt(o,'INITIALHESSTYPE',1);
HessianModify = getOpt(o,'HESSIANMODIFY',0);
Fref = getOpt(o,'FREF',Fref);
useComplex = getOpt(o,'USECOMPLEX',0);
numDiff = getOpt(o,'NUMDIFF',0);
LS_saveHessianComp = getOpt(o,'LS_SAVEHESSIANCOMP',1);
Damped = getOpt(o,'DAMPED',Damped);
HvFunc = getOpt(o,'HVFUNC',[]);
bbType = getOpt(o,'BBTYPE',0);
cycle = getOpt(o,'CYCLE',3);
HessianIter = getOpt(o,'HESSIANITER',HessianIter);
outputFcn = getOpt(o,'OUTPUTFCN',[]);
useMex = getOpt(o,'USEMEX',1);
useNegCurv = getOpt(o,'USENEGCURV',1);
precFunc = getOpt(o,'PRECFUNC',[]);
% Upper-case keys, like every other option: toUpper() guarantees the
% upper-case field exists whatever casing the caller used. The former
% mixed-case keys only matched when the caller typed exactly 'LS_type' etc.
LS_type = getOpt(o,'LS_TYPE',LS_type);
LS_interp = getOpt(o,'LS_INTERP',LS_interp);
LS_multi = getOpt(o,'LS_MULTI',LS_multi);
end

function [v] = getOpt(options,opt,default)
% Return options.(opt) when that field exists and is non-empty, else default.
if isfield(options,opt) && ~isempty(options.(opt))
    v = options.(opt);
else
    v = default;
end
end

function [o] = toUpper(o)
% Add an upper-case copy of every field of o (original fields are kept).
if ~isempty(o)
    fn = fieldnames(o);
    for i = 1:length(fn)
        o.(upper(fn{i})) = o.(fn{i});
    end
end
end
2 | x,t,d,f,fr,g,gtd,c1,LS_interp,LS_multi,progTol,debug,doPlot,saveHessianComp,funObj,varargin) 3 | % [t,x_new,f_new,g_new,funEvals,H] = ArmijoBacktrack(... 4 | % x,t,d,f,fr,g,gtd,c1,LS_interp,LS_multi,progTol,debug,doPlot,saveHessianComp,funObj,varargin) 5 | % 6 | % Backtracking linesearch to satisfy Armijo condition 7 | % 8 | % Inputs: 9 | % x: starting location 10 | % t: initial step size 11 | % d: descent direction 12 | % f: function value at starting location 13 | % fr: reference function value (usually funObj(x)) 14 | % gtd: directional derivative at starting location 15 | % c1: sufficient decrease parameter 16 | % debug: display debugging information 17 | % LS_interp: type of interpolation 18 | % progTol: minimum allowable step length 19 | % doPlot: do a graphical display of interpolation 20 | % funObj: objective function 21 | % varargin: parameters of objective function 22 | % 23 | % Outputs: 24 | % t: step length 25 | % f_new: function value at x+t*d 26 | % g_new: gradient value at x+t*d 27 | % funEvals: number function evaluations performed by line search 28 | % H: Hessian at initial guess (only computed if requested) 29 | % 30 | % recet change: LS changed to LS_interp and LS_multi 31 | 32 | % Evaluate the Objective and Gradient at the Initial Step 33 | if nargout == 6 34 | [f_new,g_new,H] = funObj(x + t*d,varargin{:}); 35 | else 36 | [f_new,g_new] = funObj(x+t*d,varargin{:}); 37 | end 38 | funEvals = 1; 39 | 40 | while f_new > fr + c1*t*gtd || ~isLegal(f_new) 41 | temp = t; 42 | 43 | if LS_interp == 0 || ~isLegal(f_new) 44 | % Ignore value of new point 45 | if debug 46 | fprintf('Fixed BT\n'); 47 | end 48 | t = 0.5*t; 49 | elseif LS_interp == 1 || ~isLegal(g_new) 50 | % Use function value at new point, but not its derivative 51 | if funEvals < 2 || LS_multi == 0 || ~isLegal(f_prev) 52 | % Backtracking w/ quadratic interpolation based on two points 53 | if debug 54 | fprintf('Quad BT\n'); 55 | end 56 | t = polyinterp([0 f gtd; t f_new 
sqrt(-1)],doPlot,0,t); 57 | else 58 | % Backtracking w/ cubic interpolation based on three points 59 | if debug 60 | fprintf('Cubic BT\n'); 61 | end 62 | t = polyinterp([0 f gtd; t f_new sqrt(-1); t_prev f_prev sqrt(-1)],doPlot,0,t); 63 | end 64 | else 65 | % Use function value and derivative at new point 66 | 67 | if funEvals < 2 || LS_multi == 0 || ~isLegal(f_prev) 68 | % Backtracking w/ cubic interpolation w/ derivative 69 | if debug 70 | fprintf('Grad-Cubic BT\n'); 71 | end 72 | t = polyinterp([0 f gtd; t f_new g_new'*d],doPlot,0,t); 73 | elseif ~isLegal(g_prev) 74 | % Backtracking w/ quartic interpolation 3 points and derivative 75 | % of two 76 | if debug 77 | fprintf('Grad-Quartic BT\n'); 78 | end 79 | t = polyinterp([0 f gtd; t f_new g_new'*d; t_prev f_prev sqrt(-1)],doPlot,0,t); 80 | else 81 | % Backtracking w/ quintic interpolation of 3 points and derivative 82 | % of two 83 | if debug 84 | fprintf('Grad-Quintic BT\n'); 85 | end 86 | t = polyinterp([0 f gtd; t f_new g_new'*d; t_prev f_prev g_prev'*d],doPlot,0,t); 87 | end 88 | end 89 | 90 | % Adjust if change in t is too small/large 91 | if t < temp*1e-3 92 | if debug 93 | fprintf('Interpolated Value Too Small, Adjusting\n'); 94 | end 95 | t = temp*1e-3; 96 | elseif t > temp*0.6 97 | if debug 98 | fprintf('Interpolated Value Too Large, Adjusting\n'); 99 | end 100 | t = temp*0.6; 101 | end 102 | 103 | % Store old point if doing three-point interpolation 104 | if LS_multi 105 | f_prev = f_new; 106 | t_prev = temp; 107 | if LS_interp == 2 108 | g_prev = g_new; 109 | end 110 | end 111 | 112 | if ~saveHessianComp && nargout == 6 113 | [f_new,g_new,H] = funObj(x + t*d,varargin{:}); 114 | else 115 | [f_new,g_new] = funObj(x + t*d,varargin{:}); 116 | end 117 | funEvals = funEvals+1; 118 | 119 | % Check whether step size has become too small 120 | if max(abs(t*d)) <= progTol 121 | if debug 122 | fprintf('Backtracking Line Search Failed\n'); 123 | end 124 | t = 0; 125 | f_new = f; 126 | g_new = g; 127 | break; 128 
| end 129 | end 130 | 131 | % Evaluate Hessian at new point 132 | if nargout == 6 && funEvals > 1 && saveHessianComp 133 | [f_new,g_new,H] = funObj(x + t*d,varargin{:}); 134 | funEvals = funEvals+1; 135 | end 136 | 137 | x_new = x + t*d; 138 | 139 | end 140 | -------------------------------------------------------------------------------- /minFunc_2012/minFunc/mex/mcholC.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "mex.h" 3 | 4 | double mymax(double x, double y) 5 | { 6 | if (x > y) 7 | return x; 8 | else 9 | return y; 10 | } 11 | 12 | double absolute(double x) 13 | { 14 | if (x >= -x) 15 | return x; 16 | else 17 | return -x; 18 | } 19 | 20 | void permuteInt(int *x, int p, int q) 21 | { 22 | int temp; 23 | temp = x[p]; 24 | x[p] = x[q]; 25 | x[q] = temp; 26 | } 27 | 28 | void permute(double *x, int p, int q) 29 | { 30 | double temp; 31 | temp = x[p]; 32 | x[p] = x[q]; 33 | x[q] = temp; 34 | } 35 | 36 | void permuteRows(double *x, int p, int q,int n) 37 | { 38 | int i; 39 | double temp; 40 | for(i = 0; i < n; i++) 41 | { 42 | temp = x[p+i*n]; 43 | x[p+i*n] = x[q+i*n]; 44 | x[q+i*n] = temp; 45 | } 46 | } 47 | 48 | void permuteCols(double *x, int p, int q,int n) 49 | { 50 | int i; 51 | double temp; 52 | for(i = 0; i < n; i++) 53 | { 54 | temp = x[i+p*n]; 55 | x[i+p*n] = x[i+q*n]; 56 | x[i+q*n] = temp; 57 | } 58 | } 59 | 60 | void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) 61 | { 62 | int n,sizL[2],sizD[2],i,j,q,s, 63 | *P; 64 | 65 | double mu,gamma,xi,delta,beta,maxVal,theta, 66 | *c, *H, *L, *D, *A; 67 | 68 | /* Input */ 69 | H = mxGetPr(prhs[0]); 70 | if (nrhs == 1) 71 | { 72 | mu = 1e-12; 73 | } 74 | else 75 | { 76 | mu = mxGetScalar(prhs[1]); 77 | } 78 | 79 | /* Compute Sizes */ 80 | n = mxGetDimensions(prhs[0])[0]; 81 | 82 | /* Form Output */ 83 | sizL[0] = n; 84 | sizL[1] = n; 85 | plhs[0] = mxCreateNumericArray(2,sizL,mxDOUBLE_CLASS,mxREAL); 86 | L = mxGetPr(plhs[0]); 
87 | sizD[0] = n; 88 | sizD[1] = 1; 89 | plhs[1] = mxCreateNumericArray(2,sizD,mxDOUBLE_CLASS,mxREAL); 90 | D = mxGetPr(plhs[1]); 91 | plhs[2] = mxCreateNumericArray(2,sizD,mxINT32_CLASS,mxREAL); 92 | P = (int*)mxGetData(plhs[2]); 93 | 94 | /* Initialize */ 95 | c = mxCalloc(n*n,sizeof(double)); 96 | A = mxCalloc(n*n,sizeof(double)); 97 | 98 | for (i = 0; i < n; i++) 99 | { 100 | P[i] = i; 101 | for (j = 0;j < n; j++) 102 | { 103 | A[i+n*j] = H[i+n*j]; 104 | } 105 | } 106 | 107 | gamma = 0; 108 | for (i = 0; i < n; i++) 109 | { 110 | L[i+n*i] = 1; 111 | c[i+n*i] = A[i+n*i]; 112 | } 113 | 114 | /* Compute modification parameters */ 115 | gamma = -1; 116 | xi = -1; 117 | for (i = 0; i < n; i++) 118 | { 119 | gamma = mymax(gamma,absolute(A[i+n*i])); 120 | for (j = 0;j < n; j++) 121 | { 122 | /*printf("A(%d,%d) = %f, %f\n",i,j,A[i+n*j],absolute(A[i+n*j]));*/ 123 | if (i != j) 124 | xi = mymax(xi,absolute(A[i+n*j])); 125 | } 126 | } 127 | delta = mu*mymax(gamma+xi,1); 128 | 129 | if (n > 1) 130 | { 131 | beta = sqrt(mymax(gamma,mymax(mu,xi/sqrt(n*n-1)))); 132 | } 133 | else 134 | { 135 | beta = sqrt(mymax(gamma,mu)); 136 | } 137 | 138 | for (j = 0; j < n; j++) 139 | { 140 | 141 | /* Find q that results in Best Permutation with j */ 142 | maxVal = -1; 143 | q = 0; 144 | for(i = j; i < n; i++) 145 | { 146 | if (absolute(c[i+n*i]) > maxVal) 147 | { 148 | maxVal = mymax(maxVal,absolute(c[i+n*i])); 149 | q = i; 150 | } 151 | } 152 | 153 | /* Permute D,c,L,A,P */ 154 | permute(D,j,q); 155 | permuteInt(P,j,q); 156 | permuteRows(c,j,q,n); 157 | permuteCols(c,j,q,n); 158 | permuteRows(L,j,q,n); 159 | permuteCols(L,j,q,n); 160 | permuteRows(A,j,q,n); 161 | permuteCols(A,j,q,n); 162 | 163 | for(s = 0; s <= j-1; s++) 164 | L[j+n*s] = c[j+n*s]/D[s]; 165 | 166 | for(i = j+1; i < n; i++) 167 | { 168 | c[i+j*n] = A[i+j*n]; 169 | for(s = 0; s <= j-1; s++) 170 | { 171 | c[i+j*n] -= L[j+n*s]*c[i+n*s]; 172 | } 173 | } 174 | 175 | theta = 0; 176 | if (j < n-1) 177 | { 178 | for(i = j+1;i < 
n; i++) 179 | theta = mymax(theta,absolute(c[i+n*j])); 180 | } 181 | 182 | D[j] = mymax(absolute(c[j+n*j]),mymax(delta,theta*theta/(beta*beta))); 183 | 184 | if (j < n-1) 185 | { 186 | for(i = j+1; i < n; i++) 187 | { 188 | c[i+n*i] = c[i+n*i] - c[i+n*j]*c[i+n*j]/D[j]; 189 | } 190 | } 191 | 192 | } 193 | 194 | for(i = 0; i < n; i++) 195 | P[i]++; 196 | 197 | mxFree(c); 198 | mxFree(A); 199 | } -------------------------------------------------------------------------------- /ZSL_Train.m: -------------------------------------------------------------------------------- 1 | function [ ] = ZSL_Train(Dateset, Splitmode, ImgFtSource, lambda1, lambda2, GPU_mode) 2 | % example 3 | % ZSL_Train('CUBird', 'Easy', 'DET', 100000, 10000, true) 4 | 5 | %gpuDevice(2) 6 | if(~exist('Dateset', 'var')) Dateset = 'CUBird'; end % {'CUBird', 'NABird'} 7 | if(~exist('Splitmode', 'var')) Splitmode = 'Easy'; end % {'Easy', 'Hard'} 8 | % feature extracted based on (1)detected boundingbox or (2)annotation. 9 | if(~exist('ImgFtSource', 'var')) ImgFtSource = 'DET'; end % {'DET', 'ATN'} 10 | if(~exist('lambda1', 'var')) lambda1 = 100000; end 11 | if(~exist('lambda2', 'var')) lambda2 = 10000; end 12 | if(~exist('GPU_mode', 'var')) GPU_mode = true; end 13 | addpath(genpath('./minFunc_2012')) 14 | %%%% set to True if continuing to train 15 | continueTrain = false; 16 | if(continueTrain) 17 | continue_weight_path = 'CUBirdResult/CUBird_Easy_Param_5_4_DET/Weight_opt_250.mat'; 18 | startLoop = 251; 19 | end 20 | 21 | path = get_datapath(Dateset, Splitmode, ImgFtSource, lambda1, lambda2, true); 22 | 23 | if(GPU_mode) fprintf('Using GPU_mode to train.\n') 24 | else fprintf('Using CPU_mode to train.\n') 25 | end 26 | 27 | %%%% prepare the data for training. 
28 | img_feat_dict = load(path.img_feat_path); 29 | text_feat_dict = load(path.text_feat_path); 30 | img_label_dict = load(path.img_label_path); 31 | data_split_dict = load(path.data_split_path); 32 | 33 | label = img_label_dict.imageClassLabels(:, 2); 34 | Data = double(img_feat_dict.cnn_feat'); 35 | 36 | ctr = data_split_dict.train_cid; 37 | cte = data_split_dict.test_cid; 38 | 39 | 40 | NumTrnClass = length(unique(ctr)); 41 | NumTstClass = length(unique(cte)); 42 | fprintf('Load training set\n') 43 | 44 | NumClass = NumTrnClass + NumTstClass; 45 | nPerClass = zeros(NumClass, 1); 46 | Id_perClass = cell(NumClass, 1); 47 | 48 | for idc = 1:NumClass 49 | Id_perClass{idc} = find(label==idc); 50 | nPerClass(idc) = sum(label==idc); 51 | end 52 | 53 | Xtr = []; ytr = []; 54 | for idc = ctr 55 | Xc = Data(Id_perClass{idc}, :); 56 | Xtr = [Xtr; Xc]; 57 | ytr = [ytr; idc*ones(size(Xc,1),1)]; 58 | end 59 | 60 | C = NumTrnClass; 61 | N = length(ytr); 62 | Y = zeros(N, C); 63 | y = zeros(N, 1); 64 | for n =1:N 65 | Y(n, :) = ctr==ytr(n); 66 | y(n) = find(ctr==ytr(n)); 67 | end 68 | 69 | X = Xtr'; 70 | Z = text_feat_dict.PredicateMatrix(ctr, :)'; 71 | d_x = size(X, 1); 72 | d_z = size(Z, 1); 73 | % Dimension of features for each part 74 | if(~exist('d_p', 'var')) d_p = 512; end 75 | % Dimension of embedding space 76 | if(~exist('m', 'var')) m = NumTrnClass; end 77 | if(strcmp(Dateset, 'CUBird')) 78 | num_Parts = 7; 79 | elseif(strcmp(Dateset, 'NABird')) 80 | num_Parts = 6; 81 | end 82 | 83 | %%%% Set parameter for training 84 | MAX_ITER = 20; %%%% Number of iterations in a loop 85 | MAX_LOOP = 300; %%%% Number of max loops. 
86 | MAX_FUNCEVL =100; %%%% 87 | 88 | options = []; 89 | options.Method = 'lbfgs'; 90 | options.Display = 'full'; 91 | options.DerivativeCheck = 'off'; 92 | options.maxFunEvals = MAX_FUNCEVL; 93 | options.MaxIter = MAX_ITER; 94 | 95 | %%%% Initialize weights 96 | if(continueTrain) 97 | load(continue_weight_path); 98 | W_init_x = W_x_opt; 99 | W_init_z = W_z_opt; 100 | disp(['Continue training from:', continue_weight_path]); 101 | else 102 | startLoop = 1; 103 | W_init_x = randn(m ,d_x); 104 | W_init_z = randn(m ,d_z); 105 | disp('Start from Random Initialization.') 106 | end 107 | 108 | if(GPU_mode) 109 | %%%% prepare gpu data for iteration: 110 | X = gpuArray(X); 111 | Z = gpuArray(Z); 112 | Y = gpuArray(Y); 113 | %%%% prepare gpu data for iteration: End 114 | end 115 | 116 | ZZ_t = Z * Z'; 117 | W_x_opt = W_init_x; 118 | W_z_opt = W_init_z; 119 | 120 | fprintf('train_acc = %1.4f%% \n', 100 * (1-get_error(X, W_x_opt, W_z_opt , Z, y))); 121 | trainWx_FLAG = false; 122 | 123 | for train_Itn = startLoop : MAX_LOOP 124 | 125 | t = clock; 126 | if(trainWx_FLAG) 127 | fprintf('\nITER %d: Training W_x\n', train_Itn); 128 | else 129 | fprintf('\nITER %d: Training W_z\n', train_Itn); 130 | end 131 | 132 | 133 | %%%% compute the D_z and D_xz 134 | D_xzi = zeros(d_z,d_z, num_Parts); 135 | W_x_t = W_init_x'; 136 | for i = 1:num_Parts 137 | W_xz = W_x_t((d_p*(i-1)+1) : d_p*(i),:) * W_init_z; 138 | D_xzi(:,:,i) = diag([1 ./ (2*sqrt(sum((W_xz').^2,2) + 0.0001))]); 139 | end 140 | 141 | if(GPU_mode) 142 | %%%% prepare gpu data inside iteration: 143 | D_xzi_cell = cell(num_Parts, 1); 144 | for i = 1:num_Parts 145 | D_xzi_cell{i} = gpuArray(sparse(D_xzi(:,:,i))); 146 | end 147 | if(trainWx_FLAG) 148 | W_init_z = gpuArray(W_init_z); 149 | else 150 | W_init_x = gpuArray(W_init_x); 151 | end 152 | else 153 | D_xzi_cell = cell(num_Parts, 1); 154 | for i = 1:num_Parts 155 | D_xzi_cell{i} = sparse(D_xzi(:,:,i)); 156 | end 157 | end 158 | 159 | fprintf('Start training using L-BFGS 
......\n') 160 | if(trainWx_FLAG) 161 | W_x_opt = minFunc(@ZSL_ObjFunc_Wx, reshape(W_init_x,[m*d_x, 1]), options, num_Parts, m, d_x, W_init_z, ... 162 | X, Z, Y, ZZ_t, D_xzi_cell, lambda1, lambda2, GPU_mode); 163 | 164 | W_x_opt = reshape(W_x_opt, [m, d_x]); 165 | W_z_opt = W_init_z; 166 | if(GPU_mode) 167 | W_z_opt = gather(W_z_opt); 168 | end 169 | else 170 | W_z_opt = minFunc(@ZSL_ObjFunc_Wz, reshape(W_init_z,[m*d_z, 1]), options, num_Parts, m, d_x, d_z, W_init_x,... 171 | X, Z, Y, ZZ_t, D_xzi_cell, lambda1, lambda2, GPU_mode); 172 | 173 | W_z_opt = reshape(W_z_opt, [m, d_z]); 174 | W_x_opt = W_init_x; 175 | if(GPU_mode) 176 | W_x_opt = gather(W_x_opt); 177 | end 178 | end 179 | trainWx_FLAG = ~trainWx_FLAG; % train W_z and W_x alternatively 180 | 181 | %%%% calculate each loss 182 | parts_Regu =0; 183 | if(lambda2) 184 | W_x_t = W_x_opt'; 185 | for i = 1:num_Parts 186 | W_xz = W_x_t((d_p*(i-1)+1) : d_p*(i),:) * W_z_opt; 187 | parts_Regu = parts_Regu + sum(sqrt(sum(W_xz.^2, 1))); 188 | end 189 | end 190 | 191 | Wxt_Wz_Z =W_x_opt' * W_z_opt * Z; 192 | 193 | f0 = norm( (X'* Wxt_Wz_Z - Y) ,'fro')^2; 194 | f1 = lambda1 * norm( Wxt_Wz_Z ,'fro')^2; 195 | f2 = lambda2 * parts_Regu; 196 | f = f0 + f1 + f2; 197 | 198 | fprintf('\nTime for loop: %f seconds.\n', etime(clock,t)); 199 | fprintf('train_acc = %1.4f%%\n', 100 * (1-get_error(X, W_x_opt, W_z_opt , Z, y))); 200 | fprintf('Total Loss: f = %f, Loss_0 = %f, Loss_1 = %f, Loss_2 = %f \n\n', f, f0, f1,f2); 201 | 202 | fid = fopen([path.repath '/results.txt'], 'a+'); 203 | fprintf(fid, 'ITER %d: train_acc = %1.4f%%\n', train_Itn, 100 * (1-get_error(X, W_x_opt, W_z_opt , Z, y))); 204 | fprintf(fid, 'Total Loss: f = %f, Loss_0 = %f, Loss_1 = %f, Loss_2 = %f \n\n', f, f0, f1,f2); 205 | fclose(fid); 206 | 207 | if(mod(train_Itn, 10) == 0) 208 | Weight_Name = sprintf([path.repath '/Weight_opt_%d'], train_Itn); 209 | save(Weight_Name, 'W_x_opt', 'W_z_opt'); 210 | end 211 | 212 | %%%% use the current weight as initialization. 
function err = get_error(X, W_x, W_z, Z, y)
% GET_ERROR  Fraction of samples whose top-scoring class is not the labelled one.
%
%   err = get_error(X, W_x, W_z, Z, y)
%
%   The score matrix is X' * W_x' * W_z * Z: one row per column of X and one
%   column per column of Z. Every sample is assigned the column index with
%   the largest score in its row, and that index is compared against y.
%
%   Inputs (shapes as used by the training script that calls this helper):
%     X   - d_x-by-N matrix, one sample per column
%     W_x - m-by-d_x matrix
%     W_z - m-by-d_z matrix
%     Z   - d_z-by-C matrix, one class per column
%     y   - N labels, each an index in 1..C into the columns of Z
%
%   Output:
%     err - number of mismatching samples divided by numel(y)

pred_score = X' * W_x' * W_z * Z;

% Reduce explicitly along dimension 2 (the class axis). The previous form,
% max(pred_score'), reduces over the samples instead whenever Z has a single
% column, because the transposed score matrix is then a row vector.
[~, pred_id] = max(pred_score, [], 2);

% pred_id is a column; force y to a column as well so that a row-shaped y
% is compared element by element instead of being expanded to a matrix.
err = sum(pred_id ~= y(:)) / numel(y);
end
% Wolfe criteria 41 | 42 | LSiter = 0; 43 | t_prev = 0; 44 | f_prev = f; 45 | g_prev = g; 46 | gtd_prev = gtd; 47 | nrmD = max(abs(d)); 48 | done = 0; 49 | 50 | while LSiter < maxLS 51 | 52 | %% Bracketing Phase 53 | if ~isLegal(f_new) || ~isLegal(g_new) 54 | if debug 55 | fprintf('Extrapolated into illegal region, switching to Armijo line-search\n'); 56 | end 57 | t = (t + t_prev)/2; 58 | % Do Armijo 59 | if nargout == 5 60 | [t,x_new,f_new,g_new,armijoFunEvals,H] = ArmijoBacktrack(... 61 | x,t,d,f,f,g,gtd,c1,LS_interp,LS_multi,progTol,debug,doPlot,saveHessianComp,... 62 | funObj,varargin{:}); 63 | else 64 | [t,x_new,f_new,g_new,armijoFunEvals] = ArmijoBacktrack(... 65 | x,t,d,f,f,g,gtd,c1,LS_interp,LS_multi,progTol,debug,doPlot,saveHessianComp,... 66 | funObj,varargin{:}); 67 | end 68 | funEvals = funEvals + armijoFunEvals; 69 | return; 70 | end 71 | 72 | 73 | if f_new > f + c1*t*gtd || (LSiter > 1 && f_new >= f_prev) 74 | bracket = [t_prev t]; 75 | bracketFval = [f_prev f_new]; 76 | bracketGval = [g_prev g_new]; 77 | break; 78 | elseif abs(gtd_new) <= -c2*gtd 79 | bracket = t; 80 | bracketFval = f_new; 81 | bracketGval = g_new; 82 | done = 1; 83 | break; 84 | elseif gtd_new >= 0 85 | bracket = [t_prev t]; 86 | bracketFval = [f_prev f_new]; 87 | bracketGval = [g_prev g_new]; 88 | break; 89 | end 90 | temp = t_prev; 91 | t_prev = t; 92 | minStep = t + 0.01*(t-temp); 93 | maxStep = t*10; 94 | if LS_interp <= 1 95 | if debug 96 | fprintf('Extending Braket\n'); 97 | end 98 | t = maxStep; 99 | elseif LS_interp == 2 100 | if debug 101 | fprintf('Cubic Extrapolation\n'); 102 | end 103 | t = polyinterp([temp f_prev gtd_prev; t f_new gtd_new],doPlot,minStep,maxStep); 104 | elseif LS_interp == 3 105 | t = mixedExtrap(temp,f_prev,gtd_prev,t,f_new,gtd_new,minStep,maxStep,debug,doPlot); 106 | end 107 | 108 | f_prev = f_new; 109 | g_prev = g_new; 110 | gtd_prev = gtd_new; 111 | if ~saveHessianComp && nargout == 5 112 | [f_new,g_new,H] = funObj(x + t*d,varargin{:}); 113 | else 
114 | [f_new,g_new] = funObj(x + t*d,varargin{:}); 115 | end 116 | funEvals = funEvals + 1; 117 | gtd_new = g_new'*d; 118 | LSiter = LSiter+1; 119 | end 120 | 121 | if LSiter == maxLS 122 | bracket = [0 t]; 123 | bracketFval = [f f_new]; 124 | bracketGval = [g g_new]; 125 | end 126 | 127 | %% Zoom Phase 128 | 129 | % We now either have a point satisfying the criteria, or a bracket 130 | % surrounding a point satisfying the criteria 131 | % Refine the bracket until we find a point satisfying the criteria 132 | insufProgress = 0; 133 | Tpos = 2; 134 | LOposRemoved = 0; 135 | while ~done && LSiter < maxLS 136 | 137 | % Find High and Low Points in bracket 138 | [f_LO LOpos] = min(bracketFval); 139 | HIpos = -LOpos + 3; 140 | 141 | % Compute new trial value 142 | if LS_interp <= 1 || ~isLegal(bracketFval) || ~isLegal(bracketGval) 143 | if debug 144 | fprintf('Bisecting\n'); 145 | end 146 | t = mean(bracket); 147 | elseif LS_interp == 2 148 | if debug 149 | fprintf('Grad-Cubic Interpolation\n'); 150 | end 151 | t = polyinterp([bracket(1) bracketFval(1) bracketGval(:,1)'*d 152 | bracket(2) bracketFval(2) bracketGval(:,2)'*d],doPlot); 153 | else 154 | % Mixed Case %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 155 | nonTpos = -Tpos+3; 156 | if LOposRemoved == 0 157 | oldLOval = bracket(nonTpos); 158 | oldLOFval = bracketFval(nonTpos); 159 | oldLOGval = bracketGval(:,nonTpos); 160 | end 161 | t = mixedInterp(bracket,bracketFval,bracketGval,d,Tpos,oldLOval,oldLOFval,oldLOGval,debug,doPlot); 162 | end 163 | 164 | 165 | % Test that we are making sufficient progress 166 | if min(max(bracket)-t,t-min(bracket))/(max(bracket)-min(bracket)) < 0.1 167 | if debug 168 | fprintf('Interpolation close to boundary'); 169 | end 170 | if insufProgress || t>=max(bracket) || t <= min(bracket) 171 | if debug 172 | fprintf(', Evaluating at 0.1 away from boundary\n'); 173 | end 174 | if abs(t-max(bracket)) < abs(t-min(bracket)) 175 | t = max(bracket)-0.1*(max(bracket)-min(bracket)); 176 | else 177 | t = 
min(bracket)+0.1*(max(bracket)-min(bracket)); 178 | end 179 | insufProgress = 0; 180 | else 181 | if debug 182 | fprintf('\n'); 183 | end 184 | insufProgress = 1; 185 | end 186 | else 187 | insufProgress = 0; 188 | end 189 | 190 | % Evaluate new point 191 | if ~saveHessianComp && nargout == 5 192 | [f_new,g_new,H] = funObj(x + t*d,varargin{:}); 193 | else 194 | [f_new,g_new] = funObj(x + t*d,varargin{:}); 195 | end 196 | funEvals = funEvals + 1; 197 | gtd_new = g_new'*d; 198 | LSiter = LSiter+1; 199 | 200 | armijo = f_new < f + c1*t*gtd; 201 | if ~armijo || f_new >= f_LO 202 | % Armijo condition not satisfied or not lower than lowest 203 | % point 204 | bracket(HIpos) = t; 205 | bracketFval(HIpos) = f_new; 206 | bracketGval(:,HIpos) = g_new; 207 | Tpos = HIpos; 208 | else 209 | if abs(gtd_new) <= - c2*gtd 210 | % Wolfe conditions satisfied 211 | done = 1; 212 | elseif gtd_new*(bracket(HIpos)-bracket(LOpos)) >= 0 213 | % Old HI becomes new LO 214 | bracket(HIpos) = bracket(LOpos); 215 | bracketFval(HIpos) = bracketFval(LOpos); 216 | bracketGval(:,HIpos) = bracketGval(:,LOpos); 217 | if LS_interp == 3 218 | if debug 219 | fprintf('LO Pos is being removed!\n'); 220 | end 221 | LOposRemoved = 1; 222 | oldLOval = bracket(LOpos); 223 | oldLOFval = bracketFval(LOpos); 224 | oldLOGval = bracketGval(:,LOpos); 225 | end 226 | end 227 | % New point becomes new LO 228 | bracket(LOpos) = t; 229 | bracketFval(LOpos) = f_new; 230 | bracketGval(:,LOpos) = g_new; 231 | Tpos = LOpos; 232 | end 233 | 234 | if ~done && abs(bracket(1)-bracket(2))*nrmD < progTol 235 | if debug 236 | fprintf('Line-search bracket has been reduced below progTol\n'); 237 | end 238 | break; 239 | end 240 | 241 | end 242 | 243 | %% 244 | if LSiter == maxLS 245 | if debug 246 | fprintf('Line Search Exceeded Maximum Line Search Iterations\n'); 247 | end 248 | end 249 | 250 | [f_LO LOpos] = min(bracketFval); 251 | t = bracket(LOpos); 252 | f_new = bracketFval(LOpos); 253 | g_new = bracketGval(:,LOpos); 254 | 255 
function [t] = mixedExtrap(x0,f0,g0,x1,f1,g1,minStep,maxStep,debug,doPlot)
% MIXEDEXTRAP  Choose the next trial step while the bracket is being extended.
%
%   Two candidate steps are requested from polyinterp for the points
%   (x0,f0,g0) and (x1,f1,g1), both with the bounds minStep/maxStep:
%     * one built from all six values, and
%     * one in which f1 is replaced by sqrt(-1).
%   NOTE(review): presumably polyinterp skips imaginary entries, so the
%   second candidate would not depend on f1 - confirm in polyinterp.m.
%
%   The first candidate is returned when it is strictly above minStep and
%   closer to x1 than the second one; otherwise the second is returned.
%
%   debug  - when true, prints which candidate was selected
%   doPlot - forwarded unchanged to polyinterp

unknown = sqrt(-1);
cubicStep  = polyinterp([x0 f0 g0; x1 f1 g1],doPlot,minStep,maxStep);
secantStep = polyinterp([x0 f0 g0; x1 unknown g1],doPlot,minStep,maxStep);

preferCubic = cubicStep > minStep && abs(cubicStep - x1) < abs(secantStep - x1);
if preferCubic
    note = 'Cubic Extrapolation\n';
    t = cubicStep;
else
    note = 'Secant Extrapolation\n';
    t = secantStep;
end
if debug
    fprintf(note);
end
end

%%
function [t] = mixedInterp(bracket,bracketFval,bracketGval,d,Tpos,oldLOval,oldLOFval,oldLOGval,debug,doPlot)
% MIXEDINTERP  Choose the next trial step inside a two-point bracket.
%
%   bracket, bracketFval, bracketGval - step lengths, function values and
%       gradients (one gradient per column) of the two bracket end points
%   d        - search direction; gradients are projected onto it
%   Tpos     - index (1 or 2) of the bracket entry used as the trial point;
%              the other entry is 3 - Tpos
%   oldLOval, oldLOFval, oldLOGval - step length, function value and
%       gradient of the reference point supplied by the caller
%   debug    - when true, prints which rule produced the step
%   doPlot   - forwarded unchanged to polyinterp
%
%   Four cases are tried in order:
%     1. trial value above the reference value
%     2. directional derivatives of trial and reference differ in sign
%     3. trial directional derivative not larger in magnitude than the
%        reference one (result is additionally limited, see below)
%     4. otherwise: a single polyinterp call on the two bracket end points
%   Entries set to sqrt(-1) mark values deliberately withheld from
%   polyinterp (NOTE(review): confirm this convention in polyinterp.m).

unknown  = sqrt(-1);
otherPos = 3 - Tpos;

% Trial end point and the directional derivatives that every case needs
stepT    = bracket(Tpos);
fvalT    = bracketFval(Tpos);
gtdT     = bracketGval(:,Tpos)'*d;
gtdOther = bracketGval(:,otherPos)'*d;
oldLOgtd = oldLOGval'*d;

if fvalT > oldLOFval
    % Case 1
    cubicStep = polyinterp([oldLOval oldLOFval oldLOgtd; stepT fvalT gtdT],doPlot);
    quadStep  = polyinterp([oldLOval oldLOFval oldLOgtd; stepT fvalT unknown],doPlot);
    if abs(cubicStep - oldLOval) < abs(quadStep - oldLOval)
        note = 'Cubic Interpolation\n';
        t = cubicStep;
    else
        note = 'Mixed Quad/Cubic Interpolation\n';
        t = (quadStep + cubicStep)/2;
    end
    if debug
        fprintf(note);
    end

elseif gtdT'*oldLOgtd < 0
    % Case 2
    cubicStep  = polyinterp([oldLOval oldLOFval oldLOgtd; stepT fvalT gtdT],doPlot);
    secantStep = polyinterp([oldLOval oldLOFval oldLOgtd; stepT unknown gtdT],doPlot);
    if abs(cubicStep - stepT) >= abs(secantStep - stepT)
        note = 'Cubic Interpolation\n';
        t = cubicStep;
    else
        note = 'Quad Interpolation\n';
        t = secantStep;
    end
    if debug
        fprintf(note);
    end

elseif abs(gtdT) <= abs(oldLOgtd)
    % Case 3: both candidates are requested with the bracket as bounds
    lowerEnd = min(bracket);
    upperEnd = max(bracket);
    cubicStep  = polyinterp([oldLOval oldLOFval oldLOgtd; stepT fvalT gtdT],...
        doPlot,lowerEnd,upperEnd);
    secantStep = polyinterp([oldLOval unknown oldLOgtd; stepT fvalT gtdT],...
        doPlot,lowerEnd,upperEnd);

    % The cubic step is used only if it is strictly inside the bracket and
    % closer to the trial point than the secant step
    cubicInside = cubicStep > lowerEnd && cubicStep < upperEnd;
    if cubicInside && abs(cubicStep - stepT) < abs(secantStep - stepT)
        note = 'Bounded Cubic Extrapolation\n';
        t = cubicStep;
    else
        note = 'Bounded Secant Extrapolation\n';
        t = secantStep;
    end
    if debug
        fprintf(note);
    end

    % Limit the step to 66% of the way from the trial point to the other end
    limitStep = stepT + 0.66*(bracket(otherPos) - stepT);
    if stepT > oldLOval
        t = min(limitStep,t);
    else
        t = max(limitStep,t);
    end

else
    % Case 4
    t = polyinterp([bracket(otherPos) bracketFval(otherPos) gtdOther; stepT fvalT gtdT],doPlot);
end
end
9 | % 10 | % It computes descent directions using one of ('Method'): 11 | % - 'sd': Steepest Descent 12 | % (no previous information used, not recommended) 13 | % - 'csd': Cyclic Steepest Descent 14 | % (uses previous step length for a fixed length cycle) 15 | % - 'bb': Barzilai and Borwein Gradient 16 | % (uses only previous step) 17 | % - 'cg': Non-Linear Conjugate Gradient 18 | % (uses only previous step and a vector beta) 19 | % - 'scg': Scaled Non-Linear Conjugate Gradient 20 | % (uses previous step and a vector beta, 21 | % and Hessian-vector products to initialize line search) 22 | % - 'pcg': Preconditioned Non-Linear Conjugate Gradient 23 | % (uses only previous step and a vector beta, preconditioned version) 24 | % - 'lbfgs': Quasi-Newton with Limited-Memory BFGS Updating 25 | % (default: uses a predetermined number of previous steps to form a 26 | % low-rank Hessian approximation) 27 | % - 'newton0': Hessian-Free Newton 28 | % (numerically computes Hessian-Vector products) 29 | % - 'pnewton0': Preconditioned Hessian-Free Newton 30 | % (numerically computes Hessian-Vector products, preconditioned 31 | % version) 32 | % - 'qnewton': Quasi-Newton Hessian approximation 33 | % (uses dense Hessian approximation) 34 | % - 'mnewton': Newton's method with Hessian calculation after every 35 | % user-specified number of iterations 36 | % (needs user-supplied Hessian matrix) 37 | % - 'newton': Newton's method with Hessian calculation every iteration 38 | % (needs user-supplied Hessian matrix) 39 | % - 'tensor': Tensor 40 | % (needs user-supplied Hessian matrix and Tensor of 3rd partial derivatives) 41 | % 42 | % Several line search strategies are available for finding a step length satisfying 43 | % the termination criteria ('LS_type') 44 | % - 0 : A backtracking line-search based on the Armijo condition (default for 'bb') 45 | % - 1 : A bracketing line-search based on the strong Wolfe conditions (default for all other methods) 46 | % - 2 : The line-search from the 
Matlab Optimization Toolbox (requires Matlab's linesearch.m to be added to the path) 47 | % 48 | % For the Armijo line-search, several interpolation strategies are available ('LS_interp'): 49 | % - 0 : Step size halving 50 | % - 1 : Polynomial interpolation using new function values 51 | % - 2 : Polynomial interpolation using new function and gradient values (default) 52 | % 53 | % When (LS_interp = 1), the default setting of (LS_multi = 0) uses quadratic interpolation, 54 | % while if (LS_multi = 1) it uses cubic interpolation if more than one point is available. 55 | % 56 | % When (LS_interp = 2), the default setting of (LS_multi = 0) uses cubic interpolation, 57 | % while if (LS_multi = 1) it uses quartic or quintic interpolation if more than one point is available 58 | % 59 | % To use the non-monotonic Armijo condition, set the 'Fref' value to the number of previous function values to store 60 | % 61 | % For the Wolfe line-search, these interpolation strategies are available ('LS_interp'): 62 | % - 0 : Step Size Doubling and Bisection 63 | % - 1 : Cubic interpolation/extrapolation using new function and gradient values (default) 64 | % - 2 : Mixed quadratic/cubic interpolation/extrapolation 65 | % 66 | % Several strategies for choosing the initial step size are available ('LS_init'): 67 | % - 0: Always try an initial step length of 1 (default for all except 'sd' and 'cg') 68 | % (t = 1) 69 | % - 1: Use a step similar to the previous step 70 | % (t = t_old*min(2,g'd/g_old'd_old)) 71 | % - 2: Quadratic Initialization using previous function value and new 72 | % function value/gradient (use this if steps tend to be very long, default for 'sd' and 'cg') 73 | % (t = min(1,2*(f-f_old)/g)) 74 | % - 3: The minimum between 1 and twice the previous step length 75 | % (t = min(1,2*t)) 76 | % - 4: The scaled conjugate gradient step length (may accelerate 77 | % conjugate gradient methods, but requires a Hessian-vector product, default for 'scg') 78 | % (t = g'd/d'Hd) 79 | 
% 80 | % Inputs: 81 | % funObj - is a function handle 82 | % x0 - is a starting vector; 83 | % options - is a struct containing parameters (defaults are used for non-existent or blank fields) 84 | % varargin{:} - all other arguments are passed as additional arguments to funObj 85 | % 86 | % Outputs: 87 | % x is the minimum value found 88 | % f is the function value at the minimum found 89 | % exitflag returns an exit condition 90 | % output returns a structure with other information 91 | % 92 | % Supported Input Options 93 | % Display - Level of display [ off | final | (iter) | full | excessive ] 94 | % MaxFunEvals - Maximum number of function evaluations allowed (1000) 95 | % MaxIter - Maximum number of iterations allowed (500) 96 | % optTol - Termination tolerance on the first-order optimality (1e-5) 97 | % progTol - Termination tolerance on progress in terms of function/parameter changes (1e-9) 98 | % Method - [ sd | csd | bb | cg | scg | pcg | {lbfgs} | newton0 | pnewton0 | 99 | % qnewton | mnewton | newton | tensor ] 100 | % c1 - Sufficient Decrease for Armijo condition (1e-4) 101 | % c2 - Curvature Decrease for Wolfe conditions (.2 for cg methods, .9 otherwise) 102 | % LS_init - Line Search Initialization - see above (2 for cg/sd, 4 for scg, 0 otherwise) 103 | % LS - Line Search type - see above (2 for bb, 4 otherwise) 104 | % Fref - Setting this to a positive integer greater than 1 105 | % will use non-monotone Armijo objective in the line search. 
106 | % (20 for bb, 10 for csd, 1 for all others) 107 | % numDiff - [ 0 | 1 | 2] compute derivatives using user-supplied function (0), 108 | % numerically using forward-differencing (1), or numerically using central-differencing (2) 109 | % (default: 0) 110 | % (this option has a different effect for 'newton', see below) 111 | % useComplex - if 1, use complex differentials if computing numerical derivatives 112 | % to get very accurate values (default: 0) 113 | % DerivativeCheck - if 'on', computes derivatives numerically at initial 114 | % point and compares to user-supplied derivative (default: 'off') 115 | % outputFcn - function to run after each iteration (default: []). It 116 | % should have the following interface: 117 | % outputFcn(x,iterationType,i,funEvals,f,t,gtd,g,d,optCond,varargin{:}); 118 | % useMex - where applicable, use mex files to speed things up (default: 1) 119 | % 120 | % Method-specific input options: 121 | % newton: 122 | % HessianModify - type of Hessian modification for direct solvers to 123 | % use if the Hessian is not positive definite (default: 0) 124 | % 0: Minimum Euclidean norm s.t. 
eigenvalues sufficiently large 125 | % (requires eigenvalues on iterations where matrix is not pd) 126 | % 1: Start with (1/2)*||A||_F and increment until Cholesky succeeds 127 | % (an approximation to method 0, does not require eigenvalues) 128 | % 2: Modified LDL factorization 129 | % (only 1 generalized Cholesky factorization done and no eigenvalues required) 130 | % 3: Modified Spectral Decomposition 131 | % (requires eigenvalues) 132 | % 4: Modified Symmetric Indefinite Factorization 133 | % 5: Uses the eigenvector of the smallest eigenvalue as negative 134 | % curvature direction 135 | % cgSolve - use conjugate gradient instead of direct solver (default: 0) 136 | % 0: Direct Solver 137 | % 1: Conjugate Gradient 138 | % 2: Conjugate Gradient with Diagonal Preconditioner 139 | % 3: Conjugate Gradient with LBFGS Preconditioner 140 | % x: Conjugate Gradient with Symmetric Successive Over Relaxation 141 | % Preconditioner with parameter x 142 | % (where x is a real number in the range [0,2]) 143 | % x: Conjugate Gradient with Incomplete Cholesky Preconditioner 144 | % with drop tolerance -x 145 | % (where x is a real negative number) 146 | % numDiff - compute Hessian numerically 147 | % (default: 0, done with complex differentials if useComplex = 1) 148 | % LS_saveHessiancomp - when on, only computes the Hessian at the 149 | % first and last iteration of the line search (default: 1) 150 | % mnewton: 151 | % HessianIter - number of iterations to use same Hessian (default: 5) 152 | % qnewton: 153 | % initialHessType - scale initial Hessian approximation (default: 1) 154 | % qnUpdate - type of quasi-Newton update (default: 3): 155 | % 0: BFGS 156 | % 1: SR1 (when it is positive-definite, otherwise BFGS) 157 | % 2: Hoshino 158 | % 3: Self-Scaling BFGS 159 | % 4: Oren's Self-Scaling Variable Metric method 160 | % 5: McCormick-Huang asymmetric update 161 | % Damped - use damped BFGS update (default: 1) 162 | % newton0/pnewton0: 163 | % HvFunc - user-supplied function 
that returns Hessian-vector products 164 | % (by default, these are computed numerically using autoHv) 165 | % HvFunc should have the following interface: HvFunc(v,x,varargin{:}) 166 | % useComplex - use a complex perturbation to get high accuracy 167 | % Hessian-vector products (default: 0) 168 | % (the increased accuracy can make the method much more efficient, 169 | % but gradient code must properly support complex inputs) 170 | % useNegCurv - a negative curvature direction is used as the descent 171 | % direction if one is encountered during the cg iterations 172 | % (default: 1) 173 | % precFunc (for pnewton0 only) - user-supplied preconditioner 174 | % (by default, an L-BFGS preconditioner is used) 175 | % precFunc should have the following interface: 176 | % precFunc(v,x,varargin{:}) 177 | % lbfgs: 178 | % Corr - number of corrections to store in memory (default: 100) 179 | % (higher numbers converge faster but use more memory) 180 | % Damped - use damped update (default: 0) 181 | % cg/scg/pcg: 182 | % cgUpdate - type of update (default for cg/scg: 2, default for pcg: 1) 183 | % 0: Fletcher Reeves 184 | % 1: Polak-Ribiere 185 | % 2: Hestenes-Stiefel (not supported for pcg) 186 | % 3: Gilbert-Nocedal 187 | % HvFunc (for scg only)- user-supplied function that returns Hessian-vector 188 | % products 189 | % (by default, these are computed numerically using autoHv) 190 | % HvFunc should have the following interface: 191 | % HvFunc(v,x,varargin{:}) 192 | % precFunc (for pcg only) - user-supplied preconditioner 193 | % (by default, an L-BFGS preconditioner is used) 194 | % precFunc should have the following interface: 195 | % precFunc(v,x,varargin{:}) 196 | % bb: 197 | % bbType - type of bb step (default: 0) 198 | % 0: min_alpha ||delta_x - alpha delta_g||_2 199 | % 1: min_alpha ||alpha delta_x - delta_g||_2 200 | % 2: Conic BB 201 | % 3: Gradient method with retards 202 | % csd: 203 | % cycle - length of cycle (default: 3) 204 | % 205 | % Supported Output Options 
206 | % iterations - number of iterations taken 207 | % funcCount - number of function evaluations 208 | % algorithm - algorithm used 209 | % firstorderopt - first-order optimality 210 | % message - exit message 211 | % trace.funccount - function evaluations after each iteration 212 | % trace.fval - function value after each iteration 213 | % 214 | % Author: Mark Schmidt (2005) 215 | % Web: http://www.di.ens.fr/~mschmidt/Software/minFunc.html 216 | % 217 | % Sources (in order of how much the source material contributes): 218 | % J. Nocedal and S.J. Wright. 1999. "Numerical Optimization". Springer Verlag. 219 | % R. Fletcher. 1987. "Practical Methods of Optimization". Wiley. 220 | % J. Demmel. 1997. "Applied Linear Algebra. SIAM. 221 | % R. Barret, M. Berry, T. Chan, J. Demmel, J. Dongarra, V. Eijkhout, R. 222 | % Pozo, C. Romine, and H. Van der Vost. 1994. "Templates for the Solution of 223 | % Linear Systems: Building Blocks for Iterative Methods". SIAM. 224 | % J. More and D. Thuente. "Line search algorithms with guaranteed 225 | % sufficient decrease". ACM Trans. Math. Softw. vol 20, 286-307, 1994. 226 | % M. Raydan. "The Barzilai and Borwein gradient method for the large 227 | % scale unconstrained minimization problem". SIAM J. Optim., 7, 26-33, 228 | % (1997). 229 | % "Mathematical Optimization". The Computational Science Education 230 | % Project. 1995. 231 | % C. Kelley. 1999. "Iterative Methods for Optimization". Frontiers in 232 | % Applied Mathematics. SIAM. 233 | 234 | if nargin < 3 235 | options = []; 236 | end 237 | 238 | % Get Parameters 239 | [verbose,verboseI,debug,doPlot,maxFunEvals,maxIter,optTol,progTol,method,... 240 | corrections,c1,c2,LS_init,cgSolve,qnUpdate,cgUpdate,initialHessType,... 241 | HessianModify,Fref,useComplex,numDiff,LS_saveHessianComp,... 242 | Damped,HvFunc,bbType,cycle,... 243 | HessianIter,outputFcn,useMex,useNegCurv,precFunc,... 244 | LS_type,LS_interp,LS_multi,checkGrad] = ... 
245 | minFunc_processInputOptions(options); 246 | 247 | % Constants 248 | SD = 0; 249 | CSD = 1; 250 | BB = 2; 251 | CG = 3; 252 | PCG = 4; 253 | LBFGS = 5; 254 | QNEWTON = 6; 255 | NEWTON0 = 7; 256 | NEWTON = 8; 257 | TENSOR = 9; 258 | 259 | % Initialize 260 | p = length(x0); 261 | d = zeros(p,1); 262 | x = x0; 263 | t = 1; 264 | 265 | % If necessary, form numerical differentiation functions 266 | funEvalMultiplier = 1; 267 | if useComplex 268 | numDiffType = 3; 269 | else 270 | numDiffType = numDiff; 271 | end 272 | if numDiff && method ~= TENSOR 273 | varargin(3:end+2) = varargin(1:end); 274 | varargin{1} = numDiffType; 275 | varargin{2} = funObj; 276 | if method ~= NEWTON 277 | if debug 278 | if useComplex 279 | fprintf('Using complex differentials for gradient computation\n'); 280 | else 281 | fprintf('Using finite differences for gradient computation\n'); 282 | end 283 | end 284 | funObj = @autoGrad; 285 | else 286 | if debug 287 | if useComplex 288 | fprintf('Using complex differentials for Hessian computation\n'); 289 | else 290 | fprintf('Using finite differences for Hessian computation\n'); 291 | end 292 | end 293 | funObj = @autoHess; 294 | end 295 | 296 | if method == NEWTON0 && useComplex == 1 297 | if debug 298 | fprintf('Turning off the use of complex differentials for Hessian-vector products\n'); 299 | end 300 | useComplex = 0; 301 | end 302 | 303 | if useComplex 304 | funEvalMultiplier = p; 305 | elseif numDiff == 2 306 | funEvalMultiplier = 2*p; 307 | else 308 | funEvalMultiplier = p+1; 309 | end 310 | end 311 | 312 | % Evaluate Initial Point 313 | if method < NEWTON 314 | [f,g] = funObj(x,varargin{:}); 315 | computeHessian = 0; 316 | else 317 | [f,g,H] = funObj(x,varargin{:}); 318 | computeHessian = 1; 319 | end 320 | funEvals = 1; 321 | 322 | % Derivative Check 323 | if checkGrad 324 | if numDiff 325 | fprintf('Can not do derivative checking when numDiff is 1\n'); 326 | pause 327 | end 328 | derivativeCheck(funObj,x,1,numDiffType,varargin{:}); % 
Checks gradient 329 | if computeHessian 330 | derivativeCheck(funObj,x,2,numDiffType,varargin{:}); 331 | end 332 | end 333 | 334 | % Output Log 335 | if verboseI 336 | fprintf('%10s %10s %15s %15s %15s\n','Iteration','FunEvals','Step Length','Function Val','Opt Cond'); 337 | end 338 | 339 | % Compute optimality of initial point 340 | optCond = max(abs(g)); 341 | 342 | if nargout > 3 343 | % Initialize Trace 344 | trace.fval = f; 345 | trace.funcCount = funEvals; 346 | trace.optCond = optCond; 347 | end 348 | 349 | % Exit if initial point is optimal 350 | if optCond <= optTol 351 | exitflag=1; 352 | msg = 'Optimality Condition below optTol'; 353 | if verbose 354 | fprintf('%s\n',msg); 355 | end 356 | if nargout > 3 357 | output = struct('iterations',0,'funcCount',1,... 358 | 'algorithm',method,'firstorderopt',max(abs(g)),'message',msg,'trace',trace); 359 | end 360 | return; 361 | end 362 | 363 | % Output Function 364 | if ~isempty(outputFcn) 365 | stop = outputFcn(x,'init',0,funEvals,f,[],[],g,[],max(abs(g)),varargin{:}); 366 | if stop 367 | exitflag=-1; 368 | msg = 'Stopped by output function'; 369 | if verbose 370 | fprintf('%s\n',msg); 371 | end 372 | if nargout > 3 373 | output = struct('iterations',0,'funcCount',1,... 
374 | 'algorithm',method,'firstorderopt',max(abs(g)),'message',msg,'trace',trace); 375 | end 376 | return; 377 | end 378 | end 379 | 380 | % Perform up to a maximum of 'maxIter' descent steps: 381 | for i = 1:maxIter 382 | 383 | % ****************** COMPUTE DESCENT DIRECTION ***************** 384 | 385 | switch method 386 | case SD % Steepest Descent 387 | d = -g; 388 | 389 | case CSD % Cyclic Steepest Descent 390 | 391 | if mod(i,cycle) == 1 % Use Steepest Descent 392 | alpha = 1; 393 | LS_init = 2; 394 | LS_type = 1; % Wolfe line search 395 | elseif mod(i,cycle) == mod(1+1,cycle) % Use Previous Step 396 | alpha = t; 397 | LS_init = 0; 398 | LS_type = 0; % Armijo line search 399 | end 400 | d = -alpha*g; 401 | 402 | case BB % Steepest Descent with Barzilai and Borwein Step Length 403 | 404 | if i == 1 405 | d = -g; 406 | else 407 | y = g-g_old; 408 | s = t*d; 409 | if bbType == 0 410 | yy = y'*y; 411 | alpha = (s'*y)/(yy); 412 | if alpha <= 1e-10 || alpha > 1e10 413 | alpha = 1; 414 | end 415 | elseif bbType == 1 416 | sy = s'*y; 417 | alpha = (s'*s)/sy; 418 | if alpha <= 1e-10 || alpha > 1e10 419 | alpha = 1; 420 | end 421 | elseif bbType == 2 % Conic Interpolation ('Modified BB') 422 | sy = s'*y; 423 | ss = s'*s; 424 | alpha = ss/sy; 425 | if alpha <= 1e-10 || alpha > 1e10 426 | alpha = 1; 427 | end 428 | alphaConic = ss/(6*(myF_old - f) + 4*g'*s + 2*g_old'*s); 429 | if alphaConic > .001*alpha && alphaConic < 1000*alpha 430 | alpha = alphaConic; 431 | end 432 | elseif bbType == 3 % Gradient Method with retards (bb type 1, random selection of previous step) 433 | sy = s'*y; 434 | alpha = (s'*s)/sy; 435 | if alpha <= 1e-10 || alpha > 1e10 436 | alpha = 1; 437 | end 438 | v(1+mod(i-2,5)) = alpha; 439 | alpha = v(ceil(rand*length(v))); 440 | end 441 | d = -alpha*g; 442 | end 443 | g_old = g; 444 | myF_old = f; 445 | 446 | 447 | case CG % Non-Linear Conjugate Gradient 448 | 449 | if i == 1 450 | d = -g; % Initially use steepest descent direction 451 | else 452 | 
gotgo = g_old'*g_old; 453 | 454 | if cgUpdate == 0 455 | % Fletcher-Reeves 456 | beta = (g'*g)/(gotgo); 457 | elseif cgUpdate == 1 458 | % Polak-Ribiere 459 | beta = (g'*(g-g_old)) /(gotgo); 460 | elseif cgUpdate == 2 461 | % Hestenes-Stiefel 462 | beta = (g'*(g-g_old))/((g-g_old)'*d); 463 | else 464 | % Gilbert-Nocedal 465 | beta_FR = (g'*(g-g_old)) /(gotgo); 466 | beta_PR = (g'*g-g'*g_old)/(gotgo); 467 | beta = max(-beta_FR,min(beta_PR,beta_FR)); 468 | end 469 | 470 | d = -g + beta*d; 471 | 472 | % Restart if not a direction of sufficient descent 473 | if g'*d > -progTol 474 | if debug 475 | fprintf('Restarting CG\n'); 476 | end 477 | beta = 0; 478 | d = -g; 479 | end 480 | 481 | % Old restart rule: 482 | %if beta < 0 || abs(gtgo)/(gotgo) >= 0.1 || g'*d >= 0 483 | 484 | end 485 | g_old = g; 486 | 487 | case PCG % Preconditioned Non-Linear Conjugate Gradient 488 | 489 | % Apply preconditioner to negative gradient 490 | if isempty(precFunc) 491 | % Use L-BFGS Preconditioner 492 | if i == 1 493 | S = zeros(p,corrections); 494 | Y = zeros(p,corrections); 495 | YS = zeros(corrections,1); 496 | lbfgs_start = 1; 497 | lbfgs_end = 0; 498 | Hdiag = 1; 499 | s = -g; 500 | else 501 | [S,Y,YS,lbfgs_start,lbfgs_end,Hdiag,skipped] = lbfgsAdd(g-g_old,t*d,S,Y,YS,lbfgs_start,lbfgs_end,Hdiag,useMex); 502 | if debug && skipped 503 | fprintf('Skipped L-BFGS updated\n'); 504 | end 505 | if useMex 506 | s = lbfgsProdC(g,S,Y,YS,int32(lbfgs_start),int32(lbfgs_end),Hdiag); 507 | else 508 | s = lbfgsProd(g,S,Y,YS,lbfgs_start,lbfgs_end,Hdiag); 509 | end 510 | end 511 | else % User-supplied preconditioner 512 | s = precFunc(-g,x,varargin{:}); 513 | end 514 | 515 | if i == 1 516 | d = s; 517 | else 518 | 519 | if cgUpdate == 0 520 | % Preconditioned Fletcher-Reeves 521 | beta = (g'*s)/(g_old'*s_old); 522 | elseif cgUpdate < 3 523 | % Preconditioned Polak-Ribiere 524 | beta = (g'*(s-s_old))/(g_old'*s_old); 525 | else 526 | % Preconditioned Gilbert-Nocedal 527 | beta_FR = 
(g'*s)/(g_old'*s_old); 528 | beta_PR = (g'*(s-s_old))/(g_old'*s_old); 529 | beta = max(-beta_FR,min(beta_PR,beta_FR)); 530 | end 531 | d = s + beta*d; 532 | 533 | if g'*d > -progTol 534 | if debug 535 | fprintf('Restarting CG\n'); 536 | end 537 | beta = 0; 538 | d = s; 539 | end 540 | 541 | end 542 | g_old = g; 543 | s_old = s; 544 | case LBFGS % L-BFGS 545 | 546 | % Update the direction and step sizes 547 | if Damped 548 | if i == 1 549 | d = -g; % Initially use steepest descent direction 550 | old_dirs = zeros(length(g),0); 551 | old_stps = zeros(length(d),0); 552 | Hdiag = 1; 553 | else 554 | [old_dirs,old_stps,Hdiag] = dampedUpdate(g-g_old,t*d,corrections,debug,old_dirs,old_stps,Hdiag); 555 | if useMex 556 | d = lbfgsC(-g,old_dirs,old_stps,Hdiag); 557 | else 558 | d = lbfgs(-g,old_dirs,old_stps,Hdiag); 559 | end 560 | end 561 | else 562 | if i == 1 563 | d = -g; % Initially use steepest descent direction 564 | S = zeros(p,corrections); 565 | Y = zeros(p,corrections); 566 | YS = zeros(corrections,1); 567 | lbfgs_start = 1; 568 | lbfgs_end = 0; 569 | Hdiag = 1; 570 | else 571 | [S,Y,YS,lbfgs_start,lbfgs_end,Hdiag,skipped] = lbfgsAdd(g-g_old,t*d,S,Y,YS,lbfgs_start,lbfgs_end,Hdiag,useMex); 572 | if debug && skipped 573 | fprintf('Skipped L-BFGS updated\n'); 574 | end 575 | if useMex 576 | d = lbfgsProdC(g,S,Y,YS,int32(lbfgs_start),int32(lbfgs_end),Hdiag); 577 | else 578 | d = lbfgsProd(g,S,Y,YS,lbfgs_start,lbfgs_end,Hdiag); 579 | end 580 | end 581 | end 582 | g_old = g; 583 | 584 | case QNEWTON % Use quasi-Newton Hessian approximation 585 | 586 | if i == 1 587 | d = -g; 588 | else 589 | % Compute difference vectors 590 | y = g-g_old; 591 | s = t*d; 592 | 593 | if i == 2 594 | % Make initial Hessian approximation 595 | if initialHessType == 0 596 | % Identity 597 | if qnUpdate <= 1 598 | R = eye(length(g)); 599 | else 600 | H = eye(length(g)); 601 | end 602 | else 603 | % Scaled Identity 604 | if debug 605 | fprintf('Scaling Initial Hessian Approximation\n'); 606 | 
end 607 | if qnUpdate <= 1 608 | % Use Cholesky of Hessian approximation 609 | R = sqrt((y'*y)/(y'*s))*eye(length(g)); 610 | else 611 | % Use Inverse of Hessian approximation 612 | H = eye(length(g))*(y'*s)/(y'*y); 613 | end 614 | end 615 | end 616 | 617 | if qnUpdate == 0 % Use BFGS updates 618 | Bs = R'*(R*s); 619 | if Damped 620 | eta = .02; 621 | if y'*s < eta*s'*Bs 622 | if debug 623 | fprintf('Damped Update\n'); 624 | end 625 | theta = min(max(0,((1-eta)*s'*Bs)/(s'*Bs - y'*s)),1); 626 | y = theta*y + (1-theta)*Bs; 627 | end 628 | R = cholupdate(cholupdate(R,y/sqrt(y'*s)),Bs/sqrt(s'*Bs),'-'); 629 | else 630 | if y'*s > 1e-10 631 | R = cholupdate(cholupdate(R,y/sqrt(y'*s)),Bs/sqrt(s'*Bs),'-'); 632 | else 633 | if debug 634 | fprintf('Skipping Update\n'); 635 | end 636 | end 637 | end 638 | elseif qnUpdate == 1 % Perform SR1 Update if it maintains positive-definiteness 639 | 640 | Bs = R'*(R*s); 641 | ymBs = y-Bs; 642 | if abs(s'*ymBs) >= norm(s)*norm(ymBs)*1e-8 && (s-((R\(R'\y))))'*y > 1e-10 643 | R = cholupdate(R,-ymBs/sqrt(ymBs'*s),'-'); 644 | else 645 | if debug 646 | fprintf('SR1 not positive-definite, doing BFGS Update\n'); 647 | end 648 | if Damped 649 | eta = .02; 650 | if y'*s < eta*s'*Bs 651 | if debug 652 | fprintf('Damped Update\n'); 653 | end 654 | theta = min(max(0,((1-eta)*s'*Bs)/(s'*Bs - y'*s)),1); 655 | y = theta*y + (1-theta)*Bs; 656 | end 657 | R = cholupdate(cholupdate(R,y/sqrt(y'*s)),Bs/sqrt(s'*Bs),'-'); 658 | else 659 | if y'*s > 1e-10 660 | R = cholupdate(cholupdate(R,y/sqrt(y'*s)),Bs/sqrt(s'*Bs),'-'); 661 | else 662 | if debug 663 | fprintf('Skipping Update\n'); 664 | end 665 | end 666 | end 667 | end 668 | elseif qnUpdate == 2 % Use Hoshino update 669 | v = sqrt(y'*H*y)*(s/(s'*y) - (H*y)/(y'*H*y)); 670 | phi = 1/(1 + (y'*H*y)/(s'*y)); 671 | H = H + (s*s')/(s'*y) - (H*y*y'*H)/(y'*H*y) + phi*v*v'; 672 | 673 | elseif qnUpdate == 3 % Self-Scaling BFGS update 674 | ys = y'*s; 675 | Hy = H*y; 676 | yHy = y'*Hy; 677 | gamma = ys/yHy; 678 | v = 
sqrt(yHy)*(s/ys - Hy/yHy); 679 | H = gamma*(H - Hy*Hy'/yHy + v*v') + (s*s')/ys; 680 | elseif qnUpdate == 4 % Oren's Self-Scaling Variable Metric update 681 | 682 | % Oren's method 683 | if (s'*y)/(y'*H*y) > 1 684 | phi = 1; % BFGS 685 | omega = 0; 686 | elseif (s'*(H\s))/(s'*y) < 1 687 | phi = 0; % DFP 688 | omega = 1; 689 | else 690 | phi = (s'*y)*(y'*H*y-s'*y)/((s'*(H\s))*(y'*H*y)-(s'*y)^2); 691 | omega = phi; 692 | end 693 | 694 | gamma = (1-omega)*(s'*y)/(y'*H*y) + omega*(s'*(H\s))/(s'*y); 695 | v = sqrt(y'*H*y)*(s/(s'*y) - (H*y)/(y'*H*y)); 696 | H = gamma*(H - (H*y*y'*H)/(y'*H*y) + phi*v*v') + (s*s')/(s'*y); 697 | 698 | elseif qnUpdate == 5 % McCormick-Huang asymmetric update 699 | theta = 1; 700 | phi = 0; 701 | psi = 1; 702 | omega = 0; 703 | t1 = s*(theta*s + phi*H'*y)'; 704 | t2 = (theta*s + phi*H'*y)'*y; 705 | t3 = H*y*(psi*s + omega*H'*y)'; 706 | t4 = (psi*s + omega*H'*y)'*y; 707 | H = H + t1/t2 - t3/t4; 708 | end 709 | 710 | if qnUpdate <= 1 711 | d = -R\(R'\g); 712 | else 713 | d = -H*g; 714 | end 715 | 716 | end 717 | g_old = g; 718 | 719 | case NEWTON0 % Hessian-Free Newton 720 | 721 | cgMaxIter = min(p,maxFunEvals-funEvals); 722 | cgForce = min(0.5,sqrt(norm(g)))*norm(g); 723 | 724 | % Set-up preconditioner 725 | precondFunc = []; 726 | precondArgs = []; 727 | if cgSolve == 1 728 | if isempty(precFunc) % Apply L-BFGS preconditioner 729 | if i == 1 730 | S = zeros(p,corrections); 731 | Y = zeros(p,corrections); 732 | YS = zeros(corrections,1); 733 | lbfgs_start = 1; 734 | lbfgs_end = 0; 735 | Hdiag = 1; 736 | else 737 | [S,Y,YS,lbfgs_start,lbfgs_end,Hdiag,skipped] = lbfgsAdd(g-g_old,t*d,S,Y,YS,lbfgs_start,lbfgs_end,Hdiag,useMex); 738 | if debug && skipped 739 | fprintf('Skipped L-BFGS updated\n'); 740 | end 741 | if useMex 742 | precondFunc = @lbfgsProdC; 743 | else 744 | precondFunc = @lbfgsProd; 745 | end 746 | precondArgs = {S,Y,YS,int32(lbfgs_start),int32(lbfgs_end),Hdiag}; 747 | end 748 | g_old = g; 749 | else 750 | % Apply user-defined 
preconditioner 751 | precondFunc = precFunc; 752 | precondArgs = {x,varargin{:}}; 753 | end 754 | end 755 | 756 | % Solve Newton system using cg and hessian-vector products 757 | if isempty(HvFunc) 758 | % No user-supplied Hessian-vector function, 759 | % use automatic differentiation 760 | HvFun = @autoHv; 761 | HvArgs = {x,g,useComplex,funObj,varargin{:}}; 762 | else 763 | % Use user-supplid Hessian-vector function 764 | HvFun = HvFunc; 765 | HvArgs = {x,varargin{:}}; 766 | end 767 | 768 | if useNegCurv 769 | [d,cgIter,cgRes,negCurv] = conjGrad([],-g,cgForce,cgMaxIter,debug,precondFunc,precondArgs,HvFun,HvArgs); 770 | else 771 | [d,cgIter,cgRes] = conjGrad([],-g,cgForce,cgMaxIter,debug,precondFunc,precondArgs,HvFun,HvArgs); 772 | end 773 | 774 | funEvals = funEvals+cgIter; 775 | if debug 776 | fprintf('newtonCG stopped on iteration %d w/ residual %.5e\n',cgIter,cgRes); 777 | 778 | end 779 | 780 | if useNegCurv 781 | if ~isempty(negCurv) 782 | %if debug 783 | fprintf('Using negative curvature direction\n'); 784 | %end 785 | d = negCurv/norm(negCurv); 786 | d = d/sum(abs(g)); 787 | end 788 | end 789 | 790 | case NEWTON % Newton search direction 791 | 792 | if cgSolve == 0 793 | if HessianModify == 0 794 | % Attempt to perform a Cholesky factorization of the Hessian 795 | [R,posDef] = chol(H); 796 | 797 | % If the Cholesky factorization was successful, then the Hessian is 798 | % positive definite, solve the system 799 | if posDef == 0 800 | d = -R\(R'\g); 801 | 802 | else 803 | % otherwise, adjust the Hessian to be positive definite based on the 804 | % minimum eigenvalue, and solve with QR 805 | % (expensive, we don't want to do this very much) 806 | if debug 807 | fprintf('Adjusting Hessian\n'); 808 | end 809 | H = H + eye(length(g)) * max(0,1e-12 - min(real(eig(H)))); 810 | d = -H\g; 811 | end 812 | elseif HessianModify == 1 813 | % Modified Incomplete Cholesky 814 | R = mcholinc(H,debug); 815 | d = -R\(R'\g); 816 | elseif HessianModify == 2 817 | % Modified 
Generalized Cholesky 818 | if useMex 819 | [L D perm] = mcholC(H); 820 | else 821 | [L D perm] = mchol(H); 822 | end 823 | d(perm) = -L' \ ((D.^-1).*(L \ g(perm))); 824 | 825 | elseif HessianModify == 3 826 | % Modified Spectral Decomposition 827 | [V,D] = eig((H+H')/2); 828 | D = diag(D); 829 | D = max(abs(D),max(max(abs(D)),1)*1e-12); 830 | d = -V*((V'*g)./D); 831 | elseif HessianModify == 4 832 | % Modified Symmetric Indefinite Factorization 833 | [L,D,perm] = ldl(H,'vector'); 834 | [blockPos junk] = find(triu(D,1)); 835 | for diagInd = setdiff(setdiff(1:p,blockPos),blockPos+1) 836 | if D(diagInd,diagInd) < 1e-12 837 | D(diagInd,diagInd) = 1e-12; 838 | end 839 | end 840 | for blockInd = blockPos' 841 | block = D(blockInd:blockInd+1,blockInd:blockInd+1); 842 | block_a = block(1); 843 | block_b = block(2); 844 | block_d = block(4); 845 | lambda = (block_a+block_d)/2 - sqrt(4*block_b^2 + (block_a - block_d)^2)/2; 846 | D(blockInd:blockInd+1,blockInd:blockInd+1) = block+eye(2)*(lambda+1e-12); 847 | end 848 | d(perm) = -L' \ (D \ (L \ g(perm))); 849 | else 850 | % Take Newton step if Hessian is pd, 851 | % otherwise take a step with negative curvature 852 | [R,posDef] = chol(H); 853 | if posDef == 0 854 | d = -R\(R'\g); 855 | else 856 | if debug 857 | fprintf('Taking Direction of Negative Curvature\n'); 858 | end 859 | [V,D] = eig(H); 860 | u = V(:,1); 861 | d = -sign(u'*g)*u; 862 | end 863 | end 864 | 865 | else 866 | % Solve with Conjugate Gradient 867 | cgMaxIter = p; 868 | cgForce = min(0.5,sqrt(norm(g)))*norm(g); 869 | 870 | % Select Preconditioner 871 | if cgSolve == 1 872 | % No preconditioner 873 | precondFunc = []; 874 | precondArgs = []; 875 | elseif cgSolve == 2 876 | % Diagonal preconditioner 877 | precDiag = diag(H); 878 | precDiag(precDiag < 1e-12) = 1e-12 - min(precDiag); 879 | precondFunc = @precondDiag; 880 | precondArgs = {precDiag.^-1}; 881 | elseif cgSolve == 3 882 | % L-BFGS preconditioner 883 | if i == 1 884 | old_dirs = zeros(length(g),0); 885 
| old_stps = zeros(length(g),0); 886 | Hdiag = 1; 887 | else 888 | [old_dirs,old_stps,Hdiag] = lbfgsUpdate(g-g_old,t*d,corrections,debug,old_dirs,old_stps,Hdiag); 889 | end 890 | g_old = g; 891 | if useMex 892 | precondFunc = @lbfgsC; 893 | else 894 | precondFunc = @lbfgs; 895 | end 896 | precondArgs = {old_dirs,old_stps,Hdiag}; 897 | elseif cgSolve > 0 898 | % Symmetric Successive Overelaxation Preconditioner 899 | omega = cgSolve; 900 | D = diag(H); 901 | D(D < 1e-12) = 1e-12 - min(D); 902 | precDiag = (omega/(2-omega))*D.^-1; 903 | precTriu = diag(D/omega) + triu(H,1); 904 | precondFunc = @precondTriuDiag; 905 | precondArgs = {precTriu,precDiag.^-1}; 906 | else 907 | % Incomplete Cholesky Preconditioner 908 | opts.droptol = -cgSolve; 909 | opts.rdiag = 1; 910 | R = cholinc(sparse(H),opts); 911 | if min(diag(R)) < 1e-12 912 | R = cholinc(sparse(H + eye*(1e-12 - min(diag(R)))),opts); 913 | end 914 | precondFunc = @precondTriu; 915 | precondArgs = {R}; 916 | end 917 | 918 | % Run cg with the appropriate preconditioner 919 | if isempty(HvFunc) 920 | % No user-supplied Hessian-vector function 921 | [d,cgIter,cgRes] = conjGrad(H,-g,cgForce,cgMaxIter,debug,precondFunc,precondArgs); 922 | else 923 | % Use user-supplied Hessian-vector function 924 | [d,cgIter,cgRes] = conjGrad(H,-g,cgForce,cgMaxIter,debug,precondFunc,precondArgs,HvFunc,{x,varargin{:}}); 925 | end 926 | if debug 927 | fprintf('CG stopped after %d iterations w/ residual %.5e\n',cgIter,cgRes); 928 | %funEvals = funEvals + cgIter; 929 | end 930 | end 931 | 932 | case TENSOR % Tensor Method 933 | 934 | if numDiff 935 | % Compute 3rd-order Tensor Numerically 936 | [junk1 junk2 junk3 T] = autoTensor(x,numDiffType,funObj,varargin{:}); 937 | else 938 | % Use user-supplied 3rd-derivative Tensor 939 | [junk1 junk2 junk3 T] = funObj(x,varargin{:}); 940 | end 941 | options_sub.Method = 'newton'; 942 | options_sub.Display = 'none'; 943 | options_sub.progTol = progTol; 944 | options_sub.optTol = optTol; 945 | d = 
minFunc(@taylorModel,zeros(p,1),options_sub,f,g,H,T); 946 | 947 | if any(abs(d) > 1e5) || all(abs(d) < 1e-5) || g'*d > -progTol 948 | if debug 949 | fprintf('Using 2nd-Order Step\n'); 950 | end 951 | [V,D] = eig((H+H')/2); 952 | D = diag(D); 953 | D = max(abs(D),max(max(abs(D)),1)*1e-12); 954 | d = -V*((V'*g)./D); 955 | else 956 | if debug 957 | fprintf('Using 3rd-Order Step\n'); 958 | end 959 | end 960 | end 961 | 962 | if ~isLegal(d) 963 | fprintf('Step direction is illegal!\n'); 964 | pause; 965 | return 966 | end 967 | 968 | % ****************** COMPUTE STEP LENGTH ************************ 969 | 970 | % Directional Derivative 971 | gtd = g'*d; 972 | 973 | % Check that progress can be made along direction 974 | if gtd > -progTol 975 | exitflag=2; 976 | msg = 'Directional Derivative below progTol'; 977 | break; 978 | end 979 | 980 | % Select Initial Guess 981 | if i == 1 982 | if method < NEWTON0 983 | t = min(1,1/sum(abs(g))); 984 | else 985 | t = 1; 986 | end 987 | else 988 | if LS_init == 0 989 | % Newton step 990 | t = 1; 991 | elseif LS_init == 1 992 | % Close to previous step length 993 | t = t*min(2,(gtd_old)/(gtd)); 994 | elseif LS_init == 2 995 | % Quadratic Initialization based on {f,g} and previous f 996 | t = min(1,2*(f-f_old)/(gtd)); 997 | elseif LS_init == 3 998 | % Double previous step length 999 | t = min(1,t*2); 1000 | elseif LS_init == 4 1001 | % Scaled step length if possible 1002 | if isempty(HvFunc) 1003 | % No user-supplied Hessian-vector function, 1004 | % use automatic differentiation 1005 | dHd = d'*autoHv(d,x,g,0,funObj,varargin{:}); 1006 | else 1007 | % Use user-supplid Hessian-vector function 1008 | dHd = d'*HvFunc(d,x,varargin{:}); 1009 | end 1010 | 1011 | funEvals = funEvals + 1; 1012 | if dHd > 0 1013 | t = -gtd/(dHd); 1014 | else 1015 | t = min(1,2*(f-f_old)/(gtd)); 1016 | end 1017 | end 1018 | 1019 | if t <= 0 1020 | t = 1; 1021 | end 1022 | end 1023 | f_old = f; 1024 | gtd_old = gtd; 1025 | 1026 | % Compute reference fr if using 
non-monotone objective 1027 | if Fref == 1 1028 | fr = f; 1029 | else 1030 | if i == 1 1031 | old_fvals = repmat(-inf,[Fref 1]); 1032 | end 1033 | 1034 | if i <= Fref 1035 | old_fvals(i) = f; 1036 | else 1037 | old_fvals = [old_fvals(2:end);f]; 1038 | end 1039 | fr = max(old_fvals); 1040 | end 1041 | 1042 | computeHessian = 0; 1043 | if method >= NEWTON 1044 | if HessianIter == 1 1045 | computeHessian = 1; 1046 | elseif i > 1 && mod(i-1,HessianIter) == 0 1047 | computeHessian = 1; 1048 | end 1049 | end 1050 | 1051 | % Line Search 1052 | f_old = f; 1053 | if LS_type == 0 % Use Armijo Bactracking 1054 | % Perform Backtracking line search 1055 | if computeHessian 1056 | [t,x,f,g,LSfunEvals,H] = ArmijoBacktrack(x,t,d,f,fr,g,gtd,c1,LS_interp,LS_multi,progTol,debug,doPlot,LS_saveHessianComp,funObj,varargin{:}); 1057 | else 1058 | [t,x,f,g,LSfunEvals] = ArmijoBacktrack(x,t,d,f,fr,g,gtd,c1,LS_interp,LS_multi,progTol,debug,doPlot,1,funObj,varargin{:}); 1059 | end 1060 | funEvals = funEvals + LSfunEvals; 1061 | 1062 | elseif LS_type == 1 % Find Point satisfying Wolfe conditions 1063 | 1064 | if computeHessian 1065 | [t,f,g,LSfunEvals,H] = WolfeLineSearch(x,t,d,f,g,gtd,c1,c2,LS_interp,LS_multi,25,progTol,debug,doPlot,LS_saveHessianComp,funObj,varargin{:}); 1066 | else 1067 | [t,f,g,LSfunEvals] = WolfeLineSearch(x,t,d,f,g,gtd,c1,c2,LS_interp,LS_multi,25,progTol,debug,doPlot,1,funObj,varargin{:}); 1068 | end 1069 | funEvals = funEvals + LSfunEvals; 1070 | x = x + t*d; 1071 | 1072 | else 1073 | % Use Matlab optim toolbox line search 1074 | [t,f_new,fPrime_new,g_new,LSexitFlag,LSiter]=... 1075 | lineSearch({'fungrad',[],funObj},x,p,1,p,d,f,gtd,t,c1,c2,-inf,maxFunEvals-funEvals,... 
1076 | progTol,[],[],[],varargin{:}); 1077 | funEvals = funEvals + LSiter; 1078 | if isempty(t) 1079 | exitflag = -2; 1080 | msg = 'Matlab LineSearch failed'; 1081 | break; 1082 | end 1083 | 1084 | if method >= NEWTON 1085 | [f_new,g_new,H] = funObj(x + t*d,varargin{:}); 1086 | funEvals = funEvals + 1; 1087 | end 1088 | x = x + t*d; 1089 | f = f_new; 1090 | g = g_new; 1091 | end 1092 | 1093 | % Compute Optimality Condition 1094 | optCond = max(abs(g)); 1095 | 1096 | % Output iteration information 1097 | if verboseI 1098 | fprintf('%10d %10d %15.5e %15.5e %15.5e\n',i,funEvals*funEvalMultiplier,t,f,optCond); 1099 | end 1100 | 1101 | if nargout > 3 1102 | % Update Trace 1103 | trace.fval(end+1,1) = f; 1104 | trace.funcCount(end+1,1) = funEvals; 1105 | trace.optCond(end+1,1) = optCond; 1106 | end 1107 | 1108 | % Output Function 1109 | if ~isempty(outputFcn) 1110 | stop = outputFcn(x,'iter',i,funEvals,f,t,gtd,g,d,optCond,varargin{:}); 1111 | if stop 1112 | exitflag=-1; 1113 | msg = 'Stopped by output function'; 1114 | break; 1115 | end 1116 | end 1117 | 1118 | % Check Optimality Condition 1119 | if optCond <= optTol 1120 | exitflag=1; 1121 | msg = 'Optimality Condition below optTol'; 1122 | break; 1123 | end 1124 | 1125 | % ******************* Check for lack of progress ******************* 1126 | 1127 | if max(abs(t*d)) <= progTol 1128 | exitflag=2; 1129 | msg = 'Step Size below progTol'; 1130 | break; 1131 | end 1132 | 1133 | 1134 | if abs(f-f_old) < progTol 1135 | exitflag=2; 1136 | msg = 'Function Value changing by less than progTol'; 1137 | break; 1138 | end 1139 | 1140 | % ******** Check for going over iteration/evaluation limit ******************* 1141 | 1142 | if funEvals*funEvalMultiplier >= maxFunEvals 1143 | exitflag = 0; 1144 | msg = 'Reached Maximum Number of Function Evaluations'; 1145 | break; 1146 | end 1147 | 1148 | if i == maxIter 1149 | exitflag = 0; 1150 | msg='Reached Maximum Number of Iterations'; 1151 | break; 1152 | end 1153 | 1154 | end 1155 | 
1156 | if verbose 1157 | fprintf('%s\n',msg); 1158 | end 1159 | if nargout > 3 1160 | output = struct('iterations',i,'funcCount',funEvals*funEvalMultiplier,... 1161 | 'algorithm',method,'firstorderopt',max(abs(g)),'message',msg,'trace',trace); 1162 | end 1163 | 1164 | % Output Function 1165 | if ~isempty(outputFcn) 1166 | outputFcn(x,'done',i,funEvals,f,t,gtd,g,d,max(abs(g)),varargin{:}); 1167 | end 1168 | 1169 | end 1170 | 1171 | --------------------------------------------------------------------------------