├── dataset
    └── .keep
├── train_models
    └── .keep
├── minFunc_2012
    ├── minFunc
    │   ├── precondDiag.m
    │   ├── precondTriu.m
    │   ├── precondTriuDiag.m
    │   ├── isLegal.m
    │   ├── compiled
    │   │   ├── lbfgsC.mexa64
    │   │   ├── lbfgsC.mexglx
    │   │   ├── lbfgsC.mexmac
    │   │   ├── lbfgsC.mexmaci
    │   │   ├── lbfgsC.mexw32
    │   │   ├── lbfgsC.mexw64
    │   │   ├── mcholC.mexa64
    │   │   ├── mcholC.mexglx
    │   │   ├── mcholC.mexmac
    │   │   ├── mcholC.mexw32
    │   │   ├── mcholC.mexw64
    │   │   ├── lbfgsAddC.mexa64
    │   │   ├── lbfgsAddC.mexw64
    │   │   ├── lbfgsC.mexmaci64
    │   │   ├── mcholC.mexmaci64
    │   │   ├── lbfgsAddC.mexmaci64
    │   │   ├── lbfgsProdC.mexa64
    │   │   ├── lbfgsProdC.mexw64
    │   │   └── lbfgsProdC.mexmaci64
    │   ├── mcholinc.m
    │   ├── lbfgsUpdate.m
    │   ├── lbfgsAdd.m
    │   ├── lbfgsProd.m
    │   ├── taylorModel.m
    │   ├── mex
    │   │   ├── lbfgsAddC.c
    │   │   ├── lbfgsProdC.c
    │   │   ├── lbfgsC.c
    │   │   └── mcholC.c
    │   ├── lbfgs.m
    │   ├── dampedUpdate.m
    │   ├── mchol.m
    │   ├── conjGrad.m
    │   ├── polyinterp.m
    │   ├── minFunc_processInputOptions.m
    │   ├── ArmijoBacktrack.m
    │   ├── WolfeLineSearch.m
    │   └── minFunc.m
    ├── logisticExample
    │   ├── LogisticHv.m
    │   ├── mylogsumexp.m
    │   ├── LogisticDiagPrecond.m
    │   ├── LogisticLoss.m
    │   └── example_minFunc_LR.m
    ├── mexAll.m
    ├── autoDif
    │   ├── autoHv.m
    │   ├── autoHess.m
    │   ├── derivativeCheck.m
    │   ├── autoGrad.m
    │   ├── autoTensor.m
    │   └── fastDerivativeCheck.m
    ├── rosenbrock.m
    ├── example_derivativeCheck.m
    ├── ZSL_ObjFunc.m~
    ├── ZSL_ObjFunc.m
    └── example_minFunc.m
├── LICENSE
├── ZSL_ObjFunc_Wz.m
├── ZSL_ObjFunc_Wx.m
├── README.md
├── ZSL_Test.m
├── get_datapath.m
└── ZSL_Train.m


/dataset/.keep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/train_models/.keep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/precondDiag.m:
--------------------------------------------------------------------------------
1 | function [y] = precondDiag(r,D)
2 | % Diagonal preconditioner: returns the element-wise product of D and r.
3 | %   r - vector to precondition
4 | %   D - element-wise scaling applied to r
5 | y = r .* D;


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/precondTriu.m:
--------------------------------------------------------------------------------
1 | function [y] = precondTriu(r,U)
2 | % Apply the preconditioner (U'*U)^{-1} to r using two linear solves.
3 | %   r - right-hand side vector
4 | %   U - square factor (presumably upper triangular, e.g. from chol -- confirm with caller)
5 | % The declared name now matches the file name; MATLAB dispatches on the
6 | % file name, so existing callers are unaffected.
7 | y = U \ (U' \ r);


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/precondTriuDiag.m:
--------------------------------------------------------------------------------
1 | function [y] = precondTriuDiag(r,U,D)
2 | % Apply the preconditioner U^{-1} * diag(D) * U'^{-1} to r.
3 | %   r - right-hand side vector
4 | %   U - square factor (presumably upper triangular -- confirm with caller)
5 | %   D - element-wise scaling applied between the two solves
6 | % The declared name now matches the file name; MATLAB dispatches on the
7 | % file name, so existing callers are unaffected.
8 | y = U \ (D .* (U' \ r));


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/isLegal.m:
--------------------------------------------------------------------------------
1 | function [legal] = isLegal(v)
2 | % True when v contains no entry with a nonzero imaginary part, no NaN and no Inf.
3 | flat = v(:);
4 | hasImag = any(imag(flat));
5 | hasNaN = any(isnan(flat));
6 | hasInf = any(isinf(flat));
7 | legal = ~hasImag && ~hasNaN && ~hasInf;


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/compiled/lbfgsC.mexa64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsC.mexa64


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/compiled/lbfgsC.mexglx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsC.mexglx


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/compiled/lbfgsC.mexmac:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsC.mexmac


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/compiled/lbfgsC.mexmaci:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsC.mexmaci


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/compiled/lbfgsC.mexw32:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsC.mexw32


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/compiled/lbfgsC.mexw64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsC.mexw64


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/compiled/mcholC.mexa64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/mcholC.mexa64


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/compiled/mcholC.mexglx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/mcholC.mexglx


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/compiled/mcholC.mexmac:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/mcholC.mexmac


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/compiled/mcholC.mexw32:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/mcholC.mexw32


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/compiled/mcholC.mexw64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/mcholC.mexw64


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/compiled/lbfgsAddC.mexa64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsAddC.mexa64


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/compiled/lbfgsAddC.mexw64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsAddC.mexw64


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/compiled/lbfgsC.mexmaci64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsC.mexmaci64


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/compiled/mcholC.mexmaci64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/mcholC.mexmaci64


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/compiled/lbfgsAddC.mexmaci64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsAddC.mexmaci64


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/compiled/lbfgsProdC.mexa64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsProdC.mexa64


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/compiled/lbfgsProdC.mexw64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsProdC.mexw64


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/compiled/lbfgsProdC.mexmaci64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EthanZhu90/ZSL_PP_CVPR17/HEAD/minFunc_2012/minFunc/compiled/lbfgsProdC.mexmaci64


--------------------------------------------------------------------------------
/minFunc_2012/logisticExample/LogisticHv.m:
--------------------------------------------------------------------------------
1 | function [Hv] = LogisticHv(v,w,X,y)
2 | % v(feature,1) - vector that we will multiply Hessian by
3 | % w(feature,1)
4 | % X(instance,feature)
5 | % y(instance,1)
6 | 
7 | sig = 1./(1+exp(-y.*(X*w)));
8 | Hv = X.'*(sig.*(1-sig).*(X*v));
9 | 


--------------------------------------------------------------------------------
/minFunc_2012/logisticExample/mylogsumexp.m:
--------------------------------------------------------------------------------
1 | function lse = mylogsumexp(b)
2 | % does logsumexp across columns
3 | B = max(b,[],2);
4 | lse = log(sum(exp(b-repmat(B,[1 size(b,2)])),2))+B;
5 | 
6 | % Old version that used repmatC
7 | %lse = log(sum(exp(b-repmatC(B,[1 size(b,2)])),2))+B;
8 | end


--------------------------------------------------------------------------------
/minFunc_2012/mexAll.m:
--------------------------------------------------------------------------------
1 | % minFunc
2 | % Compile every C source under minFunc/mex into minFunc/compiled.
3 | % Function-form calls are used; they are equivalent to the command form.
4 | fprintf('Compiling minFunc files...\n');
5 | mex('-outdir','minFunc/compiled','minFunc/mex/mcholC.c');
6 | mex('-outdir','minFunc/compiled','minFunc/mex/lbfgsC.c');
7 | mex('-outdir','minFunc/compiled','minFunc/mex/lbfgsAddC.c');
8 | mex('-outdir','minFunc/compiled','minFunc/mex/lbfgsProdC.c');


--------------------------------------------------------------------------------
/minFunc_2012/autoDif/autoHv.m:
--------------------------------------------------------------------------------
 1 | function [Hv] = autoHv(v,x,g,useComplex,funObj,varargin)
 2 | % [Hv] = autoHv(v,x,g,useComplex,funObj,varargin)
 3 | %
 4 | % Numerically compute Hessian-vector product H*v of funObj(x,varargin{:})
 5 | %  based on gradient values
 6 | %
 7 | %   v          - direction to multiply the Hessian by
 8 | %   x          - point at which the Hessian is approximated
 9 | %   g          - gradient of funObj at x
10 | %   useComplex - true: complex step of size 1e-150i; false: forward difference
11 | %   funObj     - handle returning [f,g] for funObj(x,varargin{:})
12 | 
13 | if useComplex
14 |     mu = 1e-150i;
15 | else
16 |     normV = norm(v);
17 |     if normV == 0
18 |         % H*0 is 0; the step size below would be Inf and the perturbed
19 |         % point NaN, so answer directly
20 |         Hv = zeros(size(g));
21 |         return;
22 |     end
23 |     mu = 2*sqrt(1e-12)*(1+norm(x))/normV;
24 | end
25 | [~,finDif] = funObj(x + v*mu,varargin{:});
26 | Hv = (finDif-g)/mu;


--------------------------------------------------------------------------------
/minFunc_2012/logisticExample/LogisticDiagPrecond.m:
--------------------------------------------------------------------------------
 1 | function [m] = LogisticDiagPrecond(v,w,X,y)
 2 | % Apply a diagonal preconditioner built from the diagonal of the
 3 | % logistic-loss Hessian X'*diag(sig.*(1-sig))*X.
 4 | %
 5 | % v(feature,1) - vector that we will apply diagonal preconditioner to
 6 | % w(feature,1)
 7 | % X(instance,feature)
 8 | % y(instance,1)
 9 | %
10 | % The declared name now matches the file name (it was LogisticHv);
11 | % MATLAB dispatches on the file name, so callers are unaffected.
12 | 
13 | sig = 1./(1+exp(-y.*(X*w)));
14 | 
15 | % Compute diagonals of Hessian: h(i) = (sig.*X(:,i))'*X(:,i) for every
16 | % column at once (the conj calls keep the conjugate-transpose semantics)
17 | sig = sig.*(1-sig);
18 | h = (conj(X).*X).'*conj(sig);
19 | 
20 | % Apply preconditioner
21 | m = v./h;
22 | 
23 | % Exact preconditioner
24 | %H = X'*diag(sig.*(1-sig))*X;
25 | %m = H\v;


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/mcholinc.m:
--------------------------------------------------------------------------------
 1 | function [R,tau] = mcholinc(H,verbose)
 2 | % Computes Cholesky of H+tau*I, for suitably large tau that matrix is pd
 3 | %
 4 | %   H       - square matrix to factor
 5 | %   verbose - (optional, default 0) print a message whenever tau is adjusted
 6 | %   R       - Cholesky factor returned by chol(H+tau*eye(p))
 7 | %   tau     - multiple of the identity that was added (0 if none was needed)
 8 | %
 9 | % Raises mcholinc:nonFinite when the factorization fails and the shift
10 | % cannot grow (H contains NaN/Inf or tau overflowed); the previous code
11 | % looped forever in that situation.
12 | 
13 | if nargin < 2
14 |     verbose = 0;
15 | end
16 | 
17 | p = size(H,1);
18 | 
19 | beta = norm(H,'fro');
20 | if min(diag(H)) > 1e-12
21 |     tau = 0;
22 | else
23 |     if verbose
24 |         fprintf('Small Value on Diagonal, Adjusting Hessian\n');
25 |     end
26 |     tau = max(beta/2,1e-12);
27 | end
28 | while 1
29 |     [R,posDef] = chol(H+tau*eye(p));
30 |     if posDef == 0
31 |         break;
32 |     else
33 |         if verbose
34 |             fprintf('Cholesky Failed, Adjusting Hessian\n');
35 |         end
36 |         if ~isfinite(beta) || ~isfinite(tau)
37 |             % No finite shift can repair the matrix; stop instead of spinning
38 |             error('mcholinc:nonFinite','Cannot make H positive definite: H or tau is not finite');
39 |         end
40 |         tau = max(2*tau,beta/2);
41 |     end
42 | end


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/lbfgsUpdate.m:
--------------------------------------------------------------------------------
 1 | function [old_dirs,old_stps,Hdiag] = lbfgsUpdate(y,s,corrections,debug,old_dirs,old_stps,Hdiag)
 2 | % Append the pair (s,y) as a new column of old_dirs/old_stps, keeping at
 3 | % most `corrections` columns, and rescale Hdiag to (y'*s)/(y'*y).
 4 | % When y'*s is not greater than 1e-10 all three outputs are returned
 5 | % unchanged (a message is printed if debug is set).
 6 | curvature = y'*s;
 7 | if ~(curvature > 1e-10)
 8 |     if debug
 9 |         fprintf('Skipping Update\n');
10 |     end
11 |     return;
12 | end
13 | 
14 | nStored = size(old_dirs,2);
15 | if nStored >= corrections
16 |     % Limited-Memory Update: drop the first column, append the new pair
17 |     old_dirs = horzcat(old_dirs(:,2:corrections), s);
18 |     old_stps = horzcat(old_stps(:,2:corrections), y);
19 | else
20 |     % Full Update: there is still room, write into the next column
21 |     old_dirs(:,nStored+1) = s;
22 |     old_stps(:,nStored+1) = y;
23 | end
24 | 
25 | % Update scale of initial Hessian approximation
26 | Hdiag = curvature/(y'*y);


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/lbfgsAdd.m:
--------------------------------------------------------------------------------
 1 | function [S,Y,YS,lbfgs_start,lbfgs_end,Hdiag,skipped] = lbfgsAdd(y,s,S,Y,YS,lbfgs_start,lbfgs_end,Hdiag,useMex)
 2 | % Store a new (s,y) pair in the fixed-size arrays S and Y, which are
 3 | % indexed as a circular buffer through lbfgs_start and lbfgs_end.
 4 | %
 5 | % The pair is stored only when y'*s exceeds 1e-10; otherwise every input
 6 | % is returned unchanged and skipped is set to 1.
 7 | %
 8 | %   y, s        - new pair, written to column lbfgs_end of Y and S
 9 | %   S, Y        - storage arrays; size(S,2) is the buffer capacity
10 | %   YS          - YS(k) holds y'*s for the pair stored in column k
11 | %   lbfgs_start - first column of the stored range (presumably the oldest pair -- confirm against lbfgsProd)
12 | %   lbfgs_end   - column the newest pair is written to
13 | %   Hdiag       - replaced by (y'*s)/(y'*y) when the pair is stored
14 | %   useMex      - when true, lbfgsAddC is called instead of assigning columns
15 | ys = y'*s;
16 | skipped = 0;
17 | corrections = size(S,2);
18 | if ys > 1e-10
19 | 	if lbfgs_end < corrections
20 | 		% Room after the current end: advance it
21 | 		lbfgs_end = lbfgs_end+1;
22 | 		% start == 1 is left alone; otherwise start advances with end, wrapping to 1
23 | 		if lbfgs_start ~= 1
24 | 			if lbfgs_start == corrections
25 | 				lbfgs_start = 1;
26 | 			else
27 | 				lbfgs_start = lbfgs_start+1;
28 | 			end
29 | 		end
30 | 	else
31 | 		% End is at the last column: wrap it to column 1 and move start to
32 | 		% column 2 (or 1 when the buffer has a single column)
33 | 		lbfgs_start = min(2,corrections);
34 | 		lbfgs_end = 1;
35 | 	end
36 | 	
37 | 	if useMex
38 | 		% No outputs are captured: lbfgsAddC writes s and y into column
39 | 		% lbfgs_end of S and Y in place.
40 | 		% NOTE(review): relies on the MEX routine mutating its input arrays.
41 | 		lbfgsAddC(y,s,Y,S,ys,int32(lbfgs_end));
42 | 	else
43 | 		S(:,lbfgs_end) = s;
44 | 		Y(:,lbfgs_end) = y;
45 | 	end
46 | 	YS(lbfgs_end) = ys;
47 | 	
48 | 	% Update scale of initial Hessian approximation
49 | 	Hdiag = ys/(y'*y);
50 | else
51 | 	% Curvature too small: leave the buffer untouched
52 | 	skipped = 1;
53 | end


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/lbfgsProd.m:
--------------------------------------------------------------------------------
 1 | function [d] = lbfgsProd(g,S,Y,YS,lbfgs_start,lbfgs_end,Hdiag)
 2 | % BFGS Search Direction
 3 | %
 4 | % This function returns the (L-BFGS) approximate inverse Hessian,
 5 | % multiplied by the negative gradient
 6 | %
 7 | %   g                     - gradient
 8 | %   S, Y, YS              - stored pairs and their products y'*s, one per column
 9 | %   lbfgs_start/lbfgs_end - first and last column of the stored range;
10 | %                           the range wraps around when lbfgs_start is not 1
11 | %   Hdiag                 - scalar scaling applied between the two passes
12 | 
13 | % Column visiting order, from lbfgs_start to lbfgs_end
14 | maxCorrections = size(S,2);
15 | if lbfgs_start ~= 1
16 | 	order = [lbfgs_start:maxCorrections 1:lbfgs_end];
17 | else
18 | 	order = 1:lbfgs_end;
19 | end
20 | alpha = zeros(maxCorrections,1);
21 | 
22 | % First pass: walk the columns in reverse order
23 | d = -g;
24 | for k = fliplr(order)
25 | 	alpha(k) = (S(:,k)'*d)/YS(k);
26 | 	d = d-alpha(k)*Y(:,k);
27 | end
28 | 
29 | % Multiply by Initial Hessian
30 | d = Hdiag*d;
31 | 
32 | % Second pass: walk the columns in forward order
33 | for k = order
34 | 	bk = (Y(:,k)'*d)/YS(k);
35 | 	d = d + S(:,k)*(alpha(k)-bk);
36 | end


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/taylorModel.m:
--------------------------------------------------------------------------------
 1 | function [f,g,H] = taylorModel(d,f,g,H,T)
 2 | % Evaluate a third-order Taylor model at displacement d.
 3 | %   d       - displacement vector
 4 | %   f, g, H - value, gradient and Hessian at the expansion point
 5 | %   T       - p-by-p-by-p array of third-derivative coefficients
 6 | % Returns the model value and, when requested, the model gradient and
 7 | % Hessian at d.
 8 | 
 9 | n = length(d);
10 | wantGrad = nargout > 1;
11 | wantHess = nargout > 2;
12 | 
13 | cubic = 0;
14 | gQuad = zeros(n,1);
15 | HLin = zeros(n);
16 | for a = 1:n
17 |     for b = 1:n
18 |         for c = 1:n
19 |             Tabc = T(a,b,c);
20 |             cubic = cubic + Tabc*d(a)*d(b)*d(c);
21 |             if wantGrad
22 |                 gQuad(c) = gQuad(c) + Tabc*d(a)*d(b);
23 |             end
24 |             if wantHess
25 |                 HLin(b,c) = HLin(b,c) + Tabc*d(a);
26 |             end
27 |         end
28 |     end
29 | end
30 | 
31 | % The value uses the incoming g and H, so it is computed before they change
32 | f = f + g'*d + (1/2)*d'*H*d + (1/6)*cubic;
33 | 
34 | % The gradient uses the incoming H, so it is updated before H
35 | if wantGrad
36 |     g = g + H*d + (1/2)*gQuad;
37 | end
38 | 
39 | if wantHess
40 |     H = H + HLin;
41 | end
42 | 
43 | if any(abs(d) > 1e5)
44 |     % We want the optimizer to stop if the solution is unbounded
45 |     g = zeros(n,1);
46 | end


--------------------------------------------------------------------------------
/minFunc_2012/logisticExample/LogisticLoss.m:
--------------------------------------------------------------------------------
 1 | function [nll,g,H,T] = LogisticLoss(w,X,y)
 2 | % Logistic negative log-likelihood and, on request, its gradient g,
 3 | % Hessian H and third-derivative tensor T.
 4 | % w(feature,1)
 5 | % X(instance,feature)
 6 | % y(instance,1)
 7 | 
 8 | [n,p] = size(X);
 9 | 
10 | yXw = y.*(X*w);
11 | 
12 | % sum over instances of log(1+exp(-y.*(X*w))), evaluated with mylogsumexp
13 | nll = sum(mylogsumexp([zeros(n,1) -yXw]));
14 | 
15 | if nargout == 2
16 |     % Gradient only: no need for the sigmoid values
17 |     %g = -X.'*(y./(1+exp(yXw)));
18 |     g = -(X.'*(y./(1+exp(yXw))));
19 | elseif nargout > 2
20 |     % The sigmoid values are shared by the gradient, Hessian and tensor
21 |     sig = 1./(1+exp(-yXw));
22 |     g = -X.'*(y.*(1-sig));
23 |     H = X.'*diag(sparse(sig.*(1-sig)))*X;
24 | 
25 |     if nargout > 3
26 |         yCubed = y(:).^3;
27 |         T = zeros(p,p,p);
28 |         for a = 1:p
29 |             for b = 1:p
30 |                 for c = 1:p
31 |                     T(a,b,c) = sum(yCubed.*X(:,a).*X(:,b).*X(:,c).*sig.*(1-sig).*(1-2*sig));
32 |                 end
33 |             end
34 |         end
35 |     end
36 | end


--------------------------------------------------------------------------------
/minFunc_2012/rosenbrock.m:
--------------------------------------------------------------------------------
 1 | function [f, df] = rosenbrock(x, y )
 2 | 
 3 | % rosenbrock.m This function returns the function value and partial
 4 | % derivatives of the (general dimension) rosenbrock function, given by:
 5 | %
 6 | %       f(x) = sum_{i=1:D-1} 100*(x(i+1) - x(i)^2)^2 + (1-x(i))^2 
 7 | %
 8 | % where D is the dimension of x. The true minimum is 0 at x = (1 1 ... 1).
 9 | % The second argument y is not used. The Hessian code is kept below,
10 | % commented out.
11 | %
12 | % Carl Edward Rasmussen, 2001-07-21.
13 | 
14 | D = length(x);
15 | head = x(1:D-1);
16 | tail = x(2:D);
17 | resid = tail-head.^2;
18 | 
19 | f = sum(100*resid.^2 + (1-head).^2);
20 | 
21 | if nargout > 1
22 |   df = zeros(D, 1);
23 |   % contribution of each term to its own x(i) ...
24 |   df(1:D-1) = - 400*head.*resid - 2*(1-head);
25 |   % ... and to the following x(i+1)
26 |   df(2:D) = df(2:D) + 200*resid;
27 | end
28 | 
29 | % if nargout > 2
30 | %   ddf = zeros(D,D);
31 | %   ddf(1:D-1,1:D-1) = diag(-400*x(2:D) + 1200*x(1:D-1).^2 + 2);
32 | %   ddf(2:D,2:D) = ddf(2:D,2:D) + 200*eye(D-1);
33 | %   ddf = ddf - diag(400*x(1:D-1),1) - diag(400*x(1:D-1),-1);
34 | % end


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/mex/lbfgsAddC.c:
--------------------------------------------------------------------------------
 1 | #include <math.h>
 2 | #include "mex.h"
 3 | 
 4 | /* See lbfgsAdd.m for details */
 5 | /*
 6 |  * lbfgsAddC(y,s,Y,S,ys,lbfgs_end)
 7 |  *
 8 |  * Copies the vectors s and y into column lbfgs_end (1-based, int32) of the
 9 |  * arrays S and Y. The arrays are written in place; nothing is returned.
10 |  * ys is accepted for interface compatibility but is not used here.
11 |  *
12 |  * The argument count, the class of lbfgs_end, the array sizes and the
13 |  * column index are validated before any write, so bad input raises a
14 |  * MATLAB error instead of writing outside the arrays. Inputs are assumed
15 |  * to be real double arrays, as before.
16 |  */
17 | 
18 | void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
19 | {
20 | 	/* Variable Declarations */
21 | 	
22 | 	double *s,*y,*S,*Y;
23 | 	mwSize j,nVars,nCols,offset;
24 | 	int lbfgs_end;
25 | 	
26 | 	/* Validate before touching any pointer */
27 | 	
28 | 	if (nrhs < 6)
29 | 		mexErrMsgTxt("lbfgsAddC expects 6 inputs: y,s,Y,S,ys,lbfgs_end");
30 | 	
31 | 	if (!mxIsClass(prhs[5],"int32"))
32 | 		mexErrMsgTxt("lbfgs_end must be int32");
33 | 	
34 | 	/* Get Input Pointers */
35 | 	
36 | 	y = mxGetPr(prhs[0]);
37 | 	s = mxGetPr(prhs[1]);
38 | 	Y = mxGetPr(prhs[2]);
39 | 	S = mxGetPr(prhs[3]);
40 | 	lbfgs_end = (int)mxGetScalar(prhs[5]);
41 | 	
42 | 	/* Compute number of variables, maximum number of corrections */
43 | 	
44 | 	nVars = mxGetM(prhs[2]);
45 | 	nCols = mxGetN(prhs[2]);
46 | 	
47 | 	/* Both arrays are indexed with the same column stride */
48 | 	if (mxGetM(prhs[3]) != nVars)
49 | 		mexErrMsgTxt("S and Y must have the same number of rows");
50 | 	
51 | 	if (mxGetNumberOfElements(prhs[0]) < nVars || mxGetNumberOfElements(prhs[1]) < nVars)
52 | 		mexErrMsgTxt("y and s must have at least as many elements as Y has rows");
53 | 	
54 | 	if (lbfgs_end < 1 || (mwSize)lbfgs_end > nCols || (mwSize)lbfgs_end > mxGetN(prhs[3]))
55 | 		mexErrMsgTxt("lbfgs_end is outside the columns of S and Y");
56 | 	
57 | 	/* Column-major offset of the destination column */
58 | 	offset = nVars*((mwSize)lbfgs_end-1);
59 | 	
60 | 	for(j=0;j<nVars;j++) {
61 | 		S[j+offset] = s[j];
62 | 		Y[j+offset] = y[j];
63 | 	}
64 | }


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/lbfgs.m:
--------------------------------------------------------------------------------
 1 | function [d] = lbfgs(g,s,y,Hdiag)
 2 | % BFGS Search Direction
 3 | %
 4 | % This function returns the (L-BFGS) approximate inverse Hessian,
 5 | % multiplied by the gradient
 6 | %
 7 | % If you pass in all previous directions/sizes, it will be the same as full BFGS
 8 | % If you truncate to the k most recent directions/sizes, it will be L-BFGS
 9 | %
10 | % s - previous search directions (p by k)
11 | % y - previous step sizes (p by k)
12 | % g - gradient (p by 1)
13 | % Hdiag - value of initial Hessian diagonal elements (scalar)
14 | %
15 | % Only the current vector of each pass is kept; the result matches the
16 | % version that stored every intermediate vector in p-by-(k+1) matrices.
17 | 
18 | k = size(s,2);
19 | 
20 | ro = zeros(k,1);
21 | for i = 1:k
22 |     ro(i) = 1/(y(:,i)'*s(:,i));
23 | end
24 | 
25 | al = zeros(k,1);
26 | 
27 | % First pass, from the last stored pair to the first
28 | q = g(:);
29 | for i = k:-1:1
30 |     al(i) = ro(i)*s(:,i)'*q;
31 |     q = q-al(i)*y(:,i);
32 | end
33 | 
34 | % Multiply by Initial Hessian
35 | d = Hdiag*q;
36 | 
37 | % Second pass, from the first stored pair to the last
38 | for i = 1:k
39 |     be = ro(i)*y(:,i)'*d;
40 |     d = d + s(:,i)*(al(i)-be);
41 | end


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Yizhe (Ethan) Zhu
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/minFunc_2012/autoDif/autoHess.m:
--------------------------------------------------------------------------------
1 | function [f,g,H] = autoHess(x,type,funObj,varargin)% Numerically compute Hessian of objective function from gradient valuesp = length(x);if type == 1	% Use finite differencing	mu = 2*sqrt(1e-12)*(1+norm(x));		[f,g] = funObj(x,varargin{:});	diff = zeros(p);	for j = 1:p		e_j = zeros(p,1);		e_j(j) = 1;		[f diff(:,j)] = funObj(x + mu*e_j,varargin{:});	end	H = (diff-repmat(g,[1 p]))/mu;elseif type == 3 % Use Complex Differentials	mu = 1e-150;		diff = zeros(p);	for j = 1:p		e_j = zeros(p,1);		e_j(j) = 1;		[f(j) diff(:,j)] = funObj(x + mu*i*e_j,varargin{:});	end	f = mean(real(f));	g = mean(real(diff),2);	H = imag(diff)/mu;else % Use central differencing	mu = 2*sqrt(1e-12)*(1+norm(x));	f1 = zeros(p,1);	f2 = zeros(p,1);	diff1 = zeros(p);	diff2 = zeros(p);	for j = 1:p		e_j = zeros(p,1);		e_j(j) = 1;		[f1(j) diff1(:,j)] = funObj(x + mu*e_j,varargin{:});		[f2(j) diff2(:,j)] = funObj(x - mu*e_j,varargin{:});	end	f = mean([f1;f2]);	g = mean([diff1 diff2],2);	H = (diff1-diff2)/(2*mu);end% Make sure H is symmetricH = (H+H')/2;if 0 % DEBUG CODE	[fReal gReal HReal] = funObj(x,varargin{:});	[fReal f]	[gReal g]	[HReal H]	pause;end


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/dampedUpdate.m:
--------------------------------------------------------------------------------
 1 | function [old_dirs,old_stps,Hdiag,Bcompact] = dampedUpdate(y,s,corrections,debug,old_dirs,old_stps,Hdiag)
 2 | % Damped variant of lbfgsUpdate: before storing the pair (s,y), y is
 3 | % blended with B*s whenever y'*s < eta*s'*B*s (eta = 0.02), where B*s is
 4 | % formed from the stored pairs and Hdiag.
 5 | %
 6 | %   y, s               - new pair
 7 | %   corrections        - maximum number of stored columns
 8 | %   debug              - print a message when damping is applied
 9 | %   old_dirs, old_stps - stored s and y columns (updated and returned)
10 | %   Hdiag              - scaling, replaced by (y'*s)/(y'*y) with the final y
11 | %
12 | % The declared name now matches the file name (it was lbfgsUpdate);
13 | % MATLAB dispatches on the file name, so callers are unaffected.
14 | % NOTE(review): Bcompact is never assigned, so requesting a fourth output
15 | % raises an error -- confirm what it was meant to hold.
16 | 
17 | %B0 = eye(length(y))/Hdiag;
18 | % The first stored column is left out of the product
19 | S = old_dirs(:,2:end);
20 | Y = old_stps(:,2:end);
21 | 
22 | % L(i,j) = S(:,i)'*Y(:,j) for i > j, and D = diag(S(:,i)'*Y(:,i)), both
23 | % read off a single product instead of a double loop of dot products
24 | SY = S'*Y;
25 | L = tril(SY,-1);
26 | D = diag(diag(SY));
27 | N = [S/Hdiag Y];
28 | M = [S'*S/Hdiag L;L' -D];
29 | 
30 | ys = y'*s;
31 | Bs = s/Hdiag - N*(M\(N'*s)); % Product B*s
32 | sBs = s'*Bs;
33 | 
34 | eta = .02;
35 | if ys < eta*sBs
36 |     if debug
37 |         fprintf('Damped Update\n');
38 |     end
39 |     % theta is clamped to [0,1]; y becomes a blend of y and B*s
40 |     theta = min(max(0,((1-eta)*sBs)/(sBs - ys)),1);
41 |     y = theta*y + (1-theta)*Bs;
42 | end
43 | 
44 | 
45 | numCorrections = size(old_dirs,2);
46 | if numCorrections < corrections
47 |     % Full Update
48 |     old_dirs(:,numCorrections+1) = s;
49 |     old_stps(:,numCorrections+1) = y;
50 | else
51 |     % Limited-Memory Update
52 |     old_dirs = [old_dirs(:,2:corrections) s];
53 |     old_stps = [old_stps(:,2:corrections) y];
54 | end
55 | 
56 | % Update scale of initial Hessian approximation
57 | Hdiag = (y'*s)/(y'*y);


--------------------------------------------------------------------------------
/minFunc_2012/autoDif/derivativeCheck.m:
--------------------------------------------------------------------------------
 1 | function diff = derivativeCheck(funObj,x,order,type,varargin)
 2 | % diff = derivativeCheck(funObj,x,order,type,varargin)
 3 | %
 4 | % Compare the user-supplied gradient (order = 1, default) or Hessian
 5 | % (order = 2) of funObj at x against a numerical estimate, and return the
 6 | % element-wise absolute difference.
 7 | %
 8 | % type = 1 (simple forward-difference)
 9 | % type = 2 (central differencing - default)
10 | % type = 3 (complex-step derivative)
11 | %
12 | % When the largest difference exceeds 1e-4 the compared quantities are
13 | % displayed and execution pauses.
14 | 
15 | % The two defaults are independent: passing order without type must still
16 | % select central differencing
17 | if nargin < 3
18 | 	order = 1; % Only check gradient by default
19 | end
20 | if nargin < 4
21 | 	type = 2; % Use central-differencing by default
22 | end
23 | 
24 | if order == 2
25 | 	[f,g,H] = funObj(x,varargin{:});
26 | 	
27 | 	fprintf('Checking Hessian...\n');
28 | 	[f2,g2,H2] = autoHess(x,type,funObj,varargin{:});
29 | 	
30 | 	% Always assigned, so the output exists even when the check passes
31 | 	diff = abs(H-H2);
32 | 	fprintf('Max difference between user and numerical hessian: %e\n',max(diff(:)));
33 | 	if max(diff(:)) > 1e-4
34 | 		H
35 | 		H2
36 | 		diff
37 | 		pause;
38 | 	end
39 | else
40 | 	[f,g] = funObj(x,varargin{:});
41 | 	
42 | 	fprintf('Checking Gradient...\n');
43 | 	[f2,g2] = autoGrad(x,type,funObj,varargin{:});
44 | 	
45 | 	% Always assigned, so the output exists even when the check passes
46 | 	diff = abs(g-g2);
47 | 	fprintf('Max difference between user and numerical gradient: %e\n',max(diff));
48 | 	if max(diff) > 1e-4
49 | 		fprintf('User NumDif:\n');
50 | 		[g g2]
51 | 		diff
52 | 		pause
53 | 	end
54 | end


--------------------------------------------------------------------------------
/minFunc_2012/autoDif/autoGrad.m:
--------------------------------------------------------------------------------
1 | function [f,g] = autoGrad(x,type,funObj,varargin)% [f,g] = autoGrad(x,useComplex,funObj,varargin)%% Numerically compute gradient of objective function from function values%% type =%     1 - forward-differencing (p+1 evaluations)%     2 - central-differencing (more accurate, but requires 2p evaluations)%     3 - complex-step derivative (most accurate and only requires p evaluations, but only works for certain objectives)p = length(x);if type == 1 % Use Finite Differencing	f = funObj(x,varargin{:});	mu = 2*sqrt(1e-12)*(1+norm(x));	diff = zeros(p,1);	for j = 1:p		e_j = zeros(p,1);		e_j(j) = 1;		diff(j,1) = funObj(x + mu*e_j,varargin{:});	end	g = (diff-f)/mu;elseif type == 3 % Use Complex Differentials	mu = 1e-150;	diff = zeros(p,1);	for j = 1:p		e_j = zeros(p,1);		e_j(j) = 1;		diff(j,1) = funObj(x + mu*i*e_j,varargin{:});	end		f = mean(real(diff));	g = imag(diff)/mu;else % Use Central Differencing	mu = 2*sqrt(1e-12)*(1+norm(x));	diff1 = zeros(p,1);	diff2 = zeros(p,1);	for j = 1:p		e_j = zeros(p,1);		e_j(j) = 1;		diff1(j,1) = funObj(x + mu*e_j,varargin{:});		diff2(j,1) = funObj(x - mu*e_j,varargin{:});	end	f = mean([diff1;diff2]);	g = (diff1 - diff2)/(2*mu);endif 0 % DEBUG CODE	[fReal gReal] = funObj(x,varargin{:});	[fReal f]	[gReal g]	diff	pause;end


--------------------------------------------------------------------------------
/minFunc_2012/autoDif/autoTensor.m:
--------------------------------------------------------------------------------
1 | function [f,g,H,T] = autoTensor(x,type,funObj,varargin)% [f,g,H,T] = autoTensor(x,useComplex,funObj,varargin)% Numerically compute Tensor of 3rd-derivatives of objective function from Hessian valuesp = length(x);if type == 2	mu = 2*sqrt(1e-12)*(1+norm(x));    	f1 = zeros(p,1);	f2 = zeros(p,2);	g1 = zeros(p);	g2 = zeros(p);    diff = zeros(p,p,p);    for j = 1:p        e_j = zeros(p,1);        e_j(j) = 1;        [f1(j) g1(:,j) diff1(:,:,j)] = funObj(x + mu*e_j,varargin{:});        [f2(j) g2(:,j) diff2(:,:,j)] = funObj(x + mu*e_j,varargin{:});	end	f = mean([f1;f2]);	g = mean([g1 g2],2);	H = mean(cat(3,diff1,diff2),3);	T = (diff1-diff2)/(2*mu);elseif type == 3 % Use Complex Differentials    mu = 1e-150;	f = zeros(p,1);	g = zeros(p);    diff = zeros(p,p,p);    for j = 1:p        e_j = zeros(p,1);        e_j(j) = 1;        [f(j) g(:,j) diff(:,:,j)] = funObj(x + mu*i*e_j,varargin{:});    end    f = mean(real(f));    g = mean(real(g),2);    H = mean(real(diff),3);    T = imag(diff)/mu;else % Use finite differencing    mu = 2*sqrt(1e-12)*(1+norm(x));        [f,g,H] = funObj(x,varargin{:});    diff = zeros(p,p,p);    for j = 1:p        e_j = zeros(p,1);        e_j(j) = 1;        [~ ~ diff(:,:,j)] = funObj(x + mu*e_j,varargin{:});    end    T = (diff-repmat(H,[1 1 p]))/mu;end


--------------------------------------------------------------------------------
/ZSL_ObjFunc_Wz.m:
--------------------------------------------------------------------------------
 1 | function [f, df] = ZSL_ObjFunc_Wz(W_z_vec, num_Parts,  c, dx, dz, W_x, X, Z, Y, ZZ_t,  D_xzi, lambda1, lambda2, GPU_mode)
 2 | % Objective value and gradient with respect to W_z (W_x held fixed).
 3 | %
 4 | %   f = ||X'*W_x'*W_z*Z - Y||_F^2 + lambda1*||W_x'*W_z*Z||_F^2
 5 | %       + lambda2 * sum_i trace(A_i*W_z*D_i*W_z'*A_i')
 6 | %
 7 | % where A_i is the i-th block of dx/num_Parts rows of W_x' and
 8 | % D_i = full(D_xzi{i}).
 9 | %
10 | %   W_z_vec  - W_z flattened; reshaped here to c-by-dz
11 | %   ZZ_t     - used in place of Z*Z' in the gradient (presumably precomputed by the caller)
12 | %   GPU_mode - when true, the accumulator is a gpuArray and f/df are gathered
13 | %   df       - gradient flattened to (c*dz)-by-1
14 | 
15 | W_z = reshape(W_z_vec, [c, dz]);
16 | 
17 | partDim = dx / num_Parts;
18 | W_x_t = W_x';
19 | 
20 | XX_t = X * X';
21 | XYZ_t = X * Y * Z';
22 | 
23 | %%%% precompute multiplication
24 | Wxt_Wz = W_x' * W_z;
25 | Wxt_Wz_Z = Wxt_Wz * Z;
26 | 
27 | %%%% per-part regularizer: trace for the loss, matrix for the gradient
28 | if(GPU_mode)
29 |     partGrad = gpuArray(zeros(c, dz));
30 | else
31 |     partGrad = zeros(c, dz);
32 | end
33 | traceAccum = 0;
34 | for k = 1:num_Parts
35 |     rows = (partDim*(k-1)+1) : partDim*(k);
36 |     traceAccum = traceAccum + trace( W_x_t(rows,:) * W_z * full(D_xzi{k}) * W_z' * W_x_t(rows,:)');
37 |     partGrad = partGrad + W_x_t(rows,:)'*  W_x_t(rows,:) * W_z * full(D_xzi{k});
38 | end
39 | 
40 | %%%% calculate loss
41 | fitLoss = norm( (X'* Wxt_Wz_Z - Y) ,'fro')^2;
42 | normPenalty = norm( Wxt_Wz_Z ,'fro')^2;
43 | f = fitLoss + lambda1 * normPenalty + lambda2 * traceAccum;
44 | if(GPU_mode)
45 |     f = gather(f);
46 | end
47 | 
48 | %%%% calculate the derivative of W_z
49 | fitGrad = W_x * XX_t * Wxt_Wz * ZZ_t - W_x * XYZ_t;
50 | normGrad = lambda1 * W_x * Wxt_Wz * ZZ_t ;
51 | partGrad = partGrad * lambda2;
52 | dW_z = 2 * (fitGrad + normGrad + partGrad);
53 | df = reshape(dW_z, [c*dz,1]);
54 | if(GPU_mode)
55 |     df = gather(df);
56 | end
57 | 
58 | end


--------------------------------------------------------------------------------
/ZSL_ObjFunc_Wx.m:
--------------------------------------------------------------------------------
 1 | function [f, df] = ZSL_ObjFunc_Wx(W_x_vec, num_Parts, c, dx, W_z, X, Z, Y, ZZ_t,  D_xzi, lambda1, lambda2, GPU_mode)
 2 | 
 3 | W_x = reshape(W_x_vec, [c, dx]); 
 4 | 
 5 | dp = dx / num_Parts; 
 6 | W_x_t = W_x'; 
 7 | 
 8 | XX_t = X * X'; 
 9 | XYZ_t = X * Y * Z'; 
10 | 
11 | %%%% precompute multplication
12 | Wxt_Wz = W_x' * W_z; 
13 | Wxt_Wz_Z = Wxt_Wz * Z; 
14 | 
15 | trace_sum = 0; 
16 | for i = 1 : num_Parts
17 |     trace_sum = trace_sum + trace( W_x_t((dp*(i-1)+1) : dp*(i),:) * W_z * full(D_xzi{i}) * W_z' * W_x_t((dp*(i-1)+1) : dp*(i),:)'); 
18 | end
19 | 
20 | %%%% calculate loss
21 | f = norm((X'* Wxt_Wz_Z - Y) ,'fro')^2 + lambda1 * norm( Wxt_Wz_Z ,'fro')^2  + lambda2 * trace_sum; 
22 | if(GPU_mode)
23 |     f = gather(f); 
24 | end
25 | 
26 | %%%% calculate the derivative of W_x
27 | term0 = W_z * ZZ_t * Wxt_Wz' * XX_t - 2 * W_z * XYZ_t'; 
28 | term1 = lambda1 * W_z * ZZ_t * Wxt_Wz'; 
29 | if(GPU_mode)
30 |     term2 = gpuArray(zeros(dx, c)); 
31 | else 
32 |     term2 = zeros(dx, c);
33 | end
34 | for i = 1 : num_Parts
35 |     term2((dp*(i-1)+1) : dp*(i), :) =   W_x_t((dp*(i-1)+1) : dp*(i),:)* W_z * full(D_xzi{i}) * W_z'; 
36 | end
37 | term2 = lambda2 * term2; 
38 | 
39 | dW_x = 2 * (term0 + term1 + term2');
40 | df = reshape(dW_x, [c*dx,1]); 
41 | 
42 | if(GPU_mode)
43 |     df = gather(df); 
44 | end
45 | 
46 | end
47 | 
48 | 
49 | 
50 | 


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/mchol.m:
--------------------------------------------------------------------------------
 1 | function [l,d,perm] = mchol(A,mu)
 2 | % [l,d,perm] = mchol(A,mu)
 3 | % Compute the Gill-Murray modified LDL factorization of A,
 4 | 
 5 | if nargin < 2
 6 |     mu = 1e-12;
 7 | end
 8 | 
 9 | n = size(A,1);
10 | l = eye(n);
11 | d = zeros(n,1);
12 | perm = 1:n;
13 | 
14 | for i = 1:n
15 |     c(i,i) = A(i,i);
16 | end
17 | 
18 | % Compute modification parameters
19 | gamma = max(abs(diag(A)));
20 | xi = max(max(abs(setdiag(A,0))));
21 | delta = mu*max(gamma+xi,1);
22 | if n > 1
23 |     beta = sqrt(max([gamma xi/sqrt(n^2-1) mu]));
24 | else
25 |     beta = sqrt(max([gamma mu]));
26 | end
27 | 
28 | for j = 1:n
29 |     
30 |     % Find q that results in Best Permutation with j
31 |     [maxVal maxPos] = max(abs(diag(c(j:end,j:end))));
32 |     q = maxPos+j-1;
33 |     
34 |     % Permute d,c,l,a
35 |     d([j q]) = d([q j]);
36 |     perm([j q]) = perm([q j]);
37 |     c([j q],:) = c([q j],:);
38 |     c(:,[j q]) = c(:,[q j]);
39 |     l([j q],:) = l([q j],:);
40 |     l(:,[j q]) = l(:,[q j]);
41 |     A([j q],:) = A([q j],:);
42 |     A(:,[j q]) = A(:,[q j]);
43 |     
44 |     for s = 1:j-1
45 |         l(j,s) = c(j,s)/d(s);
46 |     end
47 |     for i = j+1:n
48 |         c(i,j) = A(i,j) - sum(l(j,1:j-1).*c(i,1:j-1));
49 |     end
50 |     theta = 0;
51 |     if j < n
52 |         theta = max(abs(c(j+1:n,j)));
53 |     end
54 |     d(j) = max([abs(c(j,j)) (theta/beta)^2 delta]);
55 |     if j < n
56 |         for i = j+1:n
57 |             c(i,i) = c(i,i) - (c(i,j)^2)/d(j);
58 |         end
59 |     end
60 | end


--------------------------------------------------------------------------------
/minFunc_2012/example_derivativeCheck.m:
--------------------------------------------------------------------------------
 1 | clear all
 2 | 
 3 | nInst = 250;
 4 | nVars = 10;
 5 | X = randn(nInst,nVars);
 6 | w = randn(nVars,1);
 7 | y = sign(X*w + randn(nInst,1));
 8 | 
 9 | wTest = randn(nVars,1);
10 | 
11 | fprintf('Testing gradient using forward-differencing...\n');
12 | order = 1;
13 | derivativeCheck(@LogisticLoss,wTest,order,1,X,y);
14 | 
15 | fprintf('Testing gradient using central-differencing...\n');
16 | derivativeCheck(@LogisticLoss,wTest,order,2,X,y);
17 | 
18 | fprintf('Testing gradient using complex-step derivative...\n');
19 | derivativeCheck(@LogisticLoss,wTest,order,3,X,y);
20 | 
21 | fprintf('\n\n\n');
22 | pause
23 | 
24 | fprintf('Testing Hessian using forward-differencing\n');
25 | order = 2;
26 | derivativeCheck(@LogisticLoss,wTest,order,1,X,y);
27 | 
28 | fprintf('Testing Hessian using central-differencing\n');
29 | order = 2;
30 | derivativeCheck(@LogisticLoss,wTest,order,2,X,y);
31 | 
32 | fprintf('Testing Hessian using complex-step derivative\n');
33 | order = 2;
34 | derivativeCheck(@LogisticLoss,wTest,order,3,X,y);
35 | 
36 | fprintf('\n\n\n');
37 | pause
38 | 
39 | fprintf('Testing gradient using fastDerivativeCheck...\n');
40 | order = 1;
41 | fastDerivativeCheck(@LogisticLoss,wTest,order,1,X,y);
42 | fastDerivativeCheck(@LogisticLoss,wTest,order,2,X,y);
43 | fastDerivativeCheck(@LogisticLoss,wTest,order,3,X,y);
44 | 
45 | fprintf('\n\n\n');
46 | pause
47 | 
48 | fprintf('Testing Hessian using fastDerivativeCheck...\n');
49 | order = 2;
50 | fastDerivativeCheck(@LogisticLoss,wTest,order,1,X,y);
51 | fastDerivativeCheck(@LogisticLoss,wTest,order,2,X,y);
52 | fastDerivativeCheck(@LogisticLoss,wTest,order,3,X,y);
53 | 


--------------------------------------------------------------------------------
/minFunc_2012/autoDif/fastDerivativeCheck.m:
--------------------------------------------------------------------------------
function diff = derivativeCheck(funObj,x,order,type,varargin)
% diff = fastDerivativeCheck(funObj,x,order,type,varargin)
%
% Cheap derivative check along a single random +/-1 direction d: compares
% the user-supplied directional derivative g'*d (order 1) or
% Hessian-vector product H*d (order 2) with a numerical estimate and
% prints the maximum absolute difference.
%
% Inputs:
%   funObj   - handle called as [f,g] = funObj(x,varargin{:}) or
%              [f,g,H] = funObj(x,varargin{:})
%   x        - point at which to check
%   order    - 2 checks the Hessian-vector product, anything else the
%              gradient (default 1)
%   type     - 1: forward differencing, 3: complex-step,
%              anything else: central differencing (default 2)
%   varargin - extra arguments forwarded to funObj
%
% Output:
%   diff - objective value returned by funObj at the perturbed point
%          x + mu*d (complex-valued for type 3)
%
% NOTE(review): the declared name differs from the file name
% (fastDerivativeCheck.m); MATLAB dispatches on the file name.

% The two defaults are independent: a caller that supplies order but not
% type must still get central differencing.
if nargin < 3
    order = 1; % Only check gradient by default
end
if nargin < 4
    type = 2; % Use central-differencing by default
end

p = length(x);
d = sign(randn(p,1));

if order == 2
    fprintf('Checking Hessian-vector product along random direction:\n');
    [f,g,H] = funObj(x,varargin{:});
    Hv = H*d;

    if type == 1 % Use Finite Differencing
        mu = 2*sqrt(1e-12)*(1+norm(x))/(1+norm(x));
        [diff,diffa] = funObj(x+d*mu,varargin{:});
        Hv2 = (diffa-g)/mu;
    elseif type == 3 % Use Complex Differentials
        mu = 1e-150;
        [diff,diffa] = funObj(x+d*mu*1i,varargin{:});
        Hv2 = imag(diffa-g)/mu;
    else % Use Central Differencing
        mu = 2*sqrt(1e-12)*(1+norm(x))/(1+norm(x));
        [diff1,diffa] = funObj(x+d*mu,varargin{:});
        [diff2,diffb] = funObj(x-d*mu,varargin{:});
        Hv2 = (diffa-diffb)/(2*mu);
        % Keep the output defined, matching the other branches
        diff = diff1;
    end

    fprintf('Max difference between user and numerical Hessian-vector product: %e\n',max(abs(Hv-Hv2)));
else
    fprintf('Checking Gradient along random direction:\n');
    [f,g] = funObj(x,varargin{:});
    gtd = g'*d;

    if type == 1 % Use Finite Differencing
        mu = 2*sqrt(1e-12)*(1+norm(x))/(1+norm(x));
        diff = funObj(x+d*mu,varargin{:});
        gtd2 = (diff-f)/mu;
    elseif type == 3 % Use Complex Differentials
        mu = 1e-150;
        [diff,diffa] = funObj(x+d*mu*1i,varargin{:});
        gtd2 = imag(diff)/mu;
    else % Use Central Differencing
        mu = 2*sqrt(1e-12)*(1+norm(x))/(1+norm(x));
        diff1 = funObj(x+d*mu,varargin{:});
        diff2 = funObj(x-d*mu,varargin{:});
        gtd2 = (diff1-diff2)/(2*mu);
        % Keep the output defined, matching the other branches
        diff = diff1;
    end

    fprintf('Max difference between user and numerical directional-derivative: %e\n',max(abs(gtd-gtd2)));
end


--------------------------------------------------------------------------------
/minFunc_2012/ZSL_ObjFunc.m~:
--------------------------------------------------------------------------------
 1 | function [f, df] = ZSL_ObjFunc(W, c, dx, dz, X, Z, Y, ZZ_t, XX_t, XYZ_t,  D_xzi, lambda1, lambda2, lambda3)
 2 | 
 3 | %assert(length(W) == (c*dx + c*dz)); 
 4 | 
 5 | W_x_vec = W(1:c*dx); 
 6 | W_z_vec = W(c*dx+1:end); 
 7 | W_x = reshape(W_x_vec, [c, dx]); 
 8 | W_z = reshape(W_z_vec, [c, dz]); 
 9 | 
10 | dp = dx/7; 
11 | W_x_p = zeros(dp, c, 7); 
12 | W_x_t = W_x'; 
13 | for i = 1:7
14 |     W_x_p(:,:,i) = W_x_transform((dp*(i-1)+1) : dp*(i),:); 
15 | end
16 | 
17 | % % precompute multplication
18 | 
19 | Wxt_Wz = W_x' * W_z; 
20 | Wxt_Wz_Z = Wxt_Wz * Z; %Wxt_Wz_Z = W_x'*W_z*Z; 
21 | 
22 | trace_sum = 0; 
23 | %D_xzi = zeros(dz,dz,7);
24 | for i = 1:7
25 |     W_xz = W_x_p(:,:,i) * W_z; 
26 |     D_xzi(:,:,i) = diag([1 ./ (2*sqrt(sum((W_xz').^2,2) + 0.0001))]); %diag([1 ./ (2*normL2_by_row(W_xz'))]);
27 |     trace_sum = trace_sum + trace( W_x_p(:,:,i) * W_z * D_xzi(:,:,i) * W_z' * W_x_p(:,:,i)'); 
28 | end
29 | 
30 | D_z = diag([1 ./ (2*sqrt(sum((W_z').^2,2) + 0.0001))]); %diag([1 ./ (2*normL2_by_row(W_z'))]);  %% dz X dz
31 | % % loss function 
32 | f =  norm( (X'* Wxt_Wz_Z - Y) ,'fro')^2 + lambda1 * norm( Wxt_Wz_Z ,'fro')^2 +...
33 |     lambda2 * trace(W_z * D_z * W_z') + lambda3 * trace_sum; 
34 | 
35 | % % calculate the derivative of W_x
36 | term1 = W_z * ZZ_t * Wxt_Wz' * XX_t - 2 * W_z * XYZ_t'; 
37 | term2 = lambda1 * W_z * ZZ_t * Wxt_Wz'; 
38 | term4 = zeros(dx, c); 
39 | for i = 1:7
40 |     term4((dp*(i-1)+1) : dp*(i), :) =    W_x_p(:,:,i)* W_z * D_xzi(:,:,i) * W_z'; 
41 | end
42 | term4 = lambda3 * term4; 
43 | 
44 | dW_x = 2 * (term1 + term2 + term4');
45 | dW_x_vec = reshape(dW_x, [c*dx,1]); 
46 | 
47 | 
48 | % % calculate the derivative of W_z
49 | term1 = W_x * XX_t * Wxt_Wz * ZZ_t - W_x * XYZ_t; 
50 | term2 = lambda1 * W_x * Wxt_Wz * ZZ_t ; 
51 | term3 = lambda2 * W_z * D_z; 
52 | term4 = zeros(c, dz); 
53 | for i = 1:7
54 |     term4 = term4 + W_x_p(:,:,i)'*  W_x_p(:,:,i) * W_z * D_xzi(:,:,i); 
55 | end
56 | term4 = term4 * lambda3; 
57 | dW_z = 2 * (term1 + term2 + term3 + term4);
58 | dW_z_vec = reshape(dW_z, [c*dz,1]); 
59 | 
60 | df = [dW_x_vec; dW_z_vec]; 
61 |  
62 | end
63 | 
64 | 
function value = normL2_by_row(M)
    % Smoothed Euclidean norm of each row of M: sqrt(sum of squares + 0.0001).
    % Returns one value per row (column vector).
    smoothing = 0.0001;
    rowEnergy = sum(M.^2,2);
    value = sqrt(rowEnergy + smoothing);
end
69 | 
70 | 


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/conjGrad.m:
--------------------------------------------------------------------------------
 1 | function [x,k,res,negCurv] = cg(A,b,optTol,maxIter,verbose,precFunc,precArgs,matrixVectFunc,matrixVectArgs)
 2 | % [x,k,res,negCurv] =
 3 | % cg(A,b,optTol,maxIter,verbose,precFunc,precArgs,matrixVectFunc,matrixVect
 4 | % Args)
 5 | % Linear Conjugate Gradient, where optionally we use
 6 | % - preconditioner on vector v with precFunc(v,precArgs{:})
 7 | % - matrix multipled by vector with matrixVectFunc(v,matrixVectArgs{:})
 8 | 
 9 | if nargin <= 4
10 |     verbose = 0;
11 | end
12 | 
13 | x = zeros(size(b));
14 | r = -b;
15 | 
16 | % Apply preconditioner (if supplied)
17 | if nargin >= 7 && ~isempty(precFunc)
18 |     y = precFunc(r,precArgs{:});
19 | else
20 |     y = r;
21 | end
22 | 
23 | ry = r'*y;
24 | p = -y;
25 | k = 0;
26 | 
27 | res = norm(r);
28 | done = 0;
29 | negCurv = [];
30 | while res > optTol & k < maxIter & ~done
31 |     % Compute Matrix-vector product
32 |     if nargin >= 9
33 |         Ap = matrixVectFunc(p,matrixVectArgs{:});
34 |     else
35 |         Ap = A*p;
36 |     end
37 |     pAp = p'*Ap;
38 | 
39 |     % Check for negative Curvature
40 |     if pAp <= 1e-16
41 |         if verbose
42 |             fprintf('Negative Curvature Detected!\n');
43 |         end
44 |         
45 |         if nargout == 4
46 |            if pAp < 0
47 |               negCurv = p;
48 |               return
49 |            end
50 |         end
51 |         
52 |         if k == 0
53 |             if verbose
54 |                 fprintf('First-Iter, Proceeding...\n');
55 |             end
56 |             done = 1;
57 |         else
58 |             if verbose
59 |                 fprintf('Stopping\n');
60 |             end
61 |             break;
62 |         end
63 |     end
64 | 
65 |     % Conjugate Gradient
66 |     alpha = ry/(pAp);
67 |     x = x + alpha*p;
68 |     r = r + alpha*Ap;
69 |     
70 |     % If supplied, apply preconditioner
71 |     if nargin >= 7 && ~isempty(precFunc)
72 |         y = precFunc(r,precArgs{:});
73 |     else
74 |         y = r;
75 |     end
76 |     
77 |     ry_new = r'*y;
78 |     beta = ry_new/ry;
79 |     p = -y + beta*p;
80 |     k = k + 1;
81 | 
82 |     % Update variables
83 |     ry = ry_new;
84 |     res = norm(r);
85 | end
86 | end
87 | 


--------------------------------------------------------------------------------
/minFunc_2012/ZSL_ObjFunc.m:
--------------------------------------------------------------------------------
 1 | function [f, df] = ZSL_ObjFunc(W, c, dx, dz, X, Z, Y, ZZ_t, XX_t, XYZ_t,  D_xzi, lambda1, lambda2, lambda3)
 2 | 
 3 | %assert(length(W) == (c*dx + c*dz)); 
 4 | 
 5 | W_x_vec = W(1:c*dx); 
 6 | W_z_vec = W(c*dx+1:end); 
 7 | W_x = reshape(W_x_vec, [c, dx]); 
 8 | W_z = reshape(W_z_vec, [c, dz]); 
 9 | 
10 | dp = dx/7; 
11 | W_x_t = W_x'; %% W_x_transform
12 | 
13 | %W_x_p = zeros(dp, c, 7); 
14 | %for i = 1:7
15 | %    W_x_p(:,:,i) = W_x_t((dp*(i-1)+1) : dp*(i),:); 
16 | %end
17 | 
18 | % % precompute multplication
19 | 
20 | Wxt_Wz = W_x' * W_z; 
21 | Wxt_Wz_Z = Wxt_Wz * Z; %Wxt_Wz_Z = W_x'*W_z*Z; 
22 | 
23 | trace_sum = 0; 
24 | %D_xzi = zeros(dz,dz,7);
25 | for i = 1:7
26 |     W_xz = W_x_t((dp*(i-1)+1) : dp*(i),:) * W_z; 
27 |     D_xzi(:,:,i) = diag([1 ./ (2*sqrt(sum((W_xz').^2,2) + 0.0001))]); %diag([1 ./ (2*normL2_by_row(W_xz'))]);
28 |     trace_sum = trace_sum + trace( W_x_t((dp*(i-1)+1) : dp*(i),:) * W_z * D_xzi(:,:,i) * W_z' * W_x_t((dp*(i-1)+1) : dp*(i),:)'); 
29 | end
30 | 
31 | D_z = diag([1 ./ (2*sqrt(sum((W_z').^2,2) + 0.0001))]); %diag([1 ./ (2*normL2_by_row(W_z'))]);  %% dz X dz
32 | % % loss function 
33 | f =  norm( (X'* Wxt_Wz_Z - Y) ,'fro')^2 + lambda1 * norm( Wxt_Wz_Z ,'fro')^2 +...
34 |     lambda2 * trace(W_z * D_z * W_z') + lambda3 * trace_sum; 
35 | 
36 | % % calculate the derivative of W_x
37 | term1 = W_z * ZZ_t * Wxt_Wz' * XX_t - 2 * W_z * XYZ_t'; 
38 | term2 = lambda1 * W_z * ZZ_t * Wxt_Wz'; 
39 | term4 = zeros(dx, c); 
40 | for i = 1:7
41 |     term4((dp*(i-1)+1) : dp*(i), :) =   W_x_t((dp*(i-1)+1) : dp*(i),:)* W_z * D_xzi(:,:,i) * W_z'; 
42 | end
43 | term4 = lambda3 * term4; 
44 | 
45 | dW_x = 2 * (term1 + term2 + term4');
46 | dW_x_vec = reshape(dW_x, [c*dx,1]); 
47 | 
48 | 
49 | % % calculate the derivative of W_z
50 | term1 = W_x * XX_t * Wxt_Wz * ZZ_t - W_x * XYZ_t; 
51 | term2 = lambda1 * W_x * Wxt_Wz * ZZ_t ; 
52 | term3 = lambda2 * W_z * D_z; 
53 | term4 = zeros(c, dz); 
54 | for i = 1:7
55 |     term4 = term4 + W_x_t((dp*(i-1)+1) : dp*(i),:)'*  W_x_t((dp*(i-1)+1) : dp*(i),:) * W_z * D_xzi(:,:,i); 
56 | end
57 | term4 = term4 * lambda3; 
58 | dW_z = 2 * (term1 + term2 + term3 + term4);
59 | dW_z_vec = reshape(dW_z, [c*dz,1]); 
60 | 
61 | df = [dW_x_vec; dW_z_vec]; 
62 | 
63 | fprintf(['f = ', num2str(f), '\n']);
64 | end
65 | 
66 | 
67 | % % function value = normL2_by_row(M)
68 | % %     ep = 0.0001;
69 | % %     value = sqrt(sum(M.^2,2) + ep);
70 | % % end
71 | 
72 | 


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/mex/lbfgsProdC.c:
--------------------------------------------------------------------------------
 1 | #include <math.h>
 2 | #include "mex.h"
 3 | 
 4 | /* See lbfgsProd.m for details */
 5 | /* This function will not exit gracefully on bad input! */
 6 | 
 7 | void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
 8 | {
 9 | 	/* Variable Declarations */
10 | 	
11 | 	double *S, *Y, *YS, *g, Hdiag, *d, *alpha, *beta;
12 | 	int i,j,nVars,nCor,maxCor,lbfgs_start,lbfgs_end;
13 | 	
14 | 	/* Get Input Pointers */
15 | 	
16 | 	g = mxGetPr(prhs[0]);
17 | 	S = mxGetPr(prhs[1]);
18 | 	Y = mxGetPr(prhs[2]);
19 | 	YS= mxGetPr(prhs[3]);
20 | 	lbfgs_start = (int)mxGetScalar(prhs[4]);
21 | 	lbfgs_end = (int)mxGetScalar(prhs[5]);
22 | 	Hdiag = mxGetScalar(prhs[6]);
23 | 	
24 | 	if (!mxIsClass(prhs[4],"int32")||!mxIsClass(prhs[5],"int32"))
25 | 		mexErrMsgTxt("lbfgs_start and lbfgs_end must be int32");
26 | 	
27 | 	/* Compute number of variables, maximum number of corrections */
28 | 	
29 | 	nVars = mxGetDimensions(prhs[1])[0];
30 | 	maxCor = mxGetDimensions(prhs[1])[1];
31 | 	
32 | 	/* Compute number of corrections available */
33 | 	if (lbfgs_start == 1)
34 | 		nCor = lbfgs_end-lbfgs_start+1;
35 | 	else
36 | 		nCor = maxCor;
37 | 	
38 | 	/* Allocate Memory for Local Variables */
39 | 	alpha = mxCalloc(nCor,sizeof(double));
40 | 	beta = mxCalloc(nCor,sizeof(double));
41 | 	
42 | 	/* Set-up Output Vector */
43 | 	plhs[0] = mxCreateDoubleMatrix(nVars,1,mxREAL);
44 | 	d = mxGetPr(plhs[0]);
45 | 	
46 | 	for(j=0;j<nVars;j++)
47 | 		d[j] = -g[j];
48 | 	
49 | 	for(i = lbfgs_end-1;i >= 0;i--) {
50 | 		alpha[i] = 0;
51 | 		for(j=0;j<nVars;j++)
52 | 			alpha[i] += S[j + nVars*i]*d[j];
53 | 		alpha[i] /= YS[i];
54 | 		for(j=0;j<nVars;j++)
55 | 			d[j] -= alpha[i]*Y[j + nVars*i];
56 | 	}
57 | 	if(lbfgs_start != 1) {
58 | 		for(i = maxCor-1;i >= lbfgs_start-1;i--) {
59 | 			alpha[i] = 0;
60 | 			for(j=0;j<nVars;j++)
61 | 				alpha[i] += S[j + nVars*i]*d[j];
62 | 			alpha[i] /= YS[i];
63 | 			for(j=0;j<nVars;j++)
64 | 				d[j] -= alpha[i]*Y[j + nVars*i];
65 | 		}
66 | 	}
67 | 	
68 | 	for(j=0;j<nVars;j++)
69 | 		d[j] *= Hdiag;
70 | 	
71 | 	if(lbfgs_start != 1) {
72 | 		for(i = lbfgs_start-1; i < maxCor; i++) {
73 | 			beta[i] = 0;
74 | 			for(j=0;j<nVars;j++)
75 | 				beta[i] += Y[j + nVars*i]*d[j];
76 | 			beta[i] /= YS[i];
77 | 			for(j=0;j<nVars;j++)
78 | 				d[j] += S[j+nVars*i]*(alpha[i]-beta[i]);
79 | 		}
80 | 	}
81 | 	for(i = 0; i < lbfgs_end; i++) {
82 | 		beta[i] = 0;
83 | 		for(j=0;j<nVars;j++)
84 | 			beta[i] += Y[j + nVars*i]*d[j];
85 | 		beta[i] /= YS[i];
86 | 		for(j=0;j<nVars;j++)
87 | 			d[j] += S[j+nVars*i]*(alpha[i]-beta[i]);
88 | 	}
89 | 	
90 | 	mxFree(alpha);
91 | 	mxFree(beta);
92 | 	
93 | }
94 | 


--------------------------------------------------------------------------------
/minFunc_2012/logisticExample/example_minFunc_LR.m:
--------------------------------------------------------------------------------
 1 | clear all
 2 | 
 3 | nInst = 500;
 4 | nVars = 200;
 5 | X = randn(nInst,nVars);
 6 | w = randn(nVars,1);
 7 | y = sign(X*w + randn(nInst,1));
 8 | 
 9 | w_init = zeros(nVars,1);
10 | funObj = @(w)LogisticLoss(w,X,y);
11 | 
12 | fprintf('\nRunning Steepest Descent\n');
13 | options.Method = 'sd';
14 | minFunc(@LogisticLoss,w_init,options,X,y);
15 | pause;
16 | 
17 | fprintf('\nRunning Cyclic Steepest Descent\n');
18 | options.Method = 'csd';
19 | minFunc(@LogisticLoss,w_init,options,X,y);
20 | pause;
21 | 
22 | fprintf('\nRunning Conjugate Gradient\n');
23 | options.Method = 'cg';
24 | minFunc(@LogisticLoss,w_init,options,X,y);
25 | pause;
26 | 
27 | fprintf('\nRunning Scaled Conjugate Gradient\n');
28 | options.Method = 'scg';
29 | minFunc(@LogisticLoss,w_init,options,X,y);
30 | pause;
31 | 
32 | fprintf('\nRunning Preconditioned Conjugate Gradient (Diagonal preconditioner)\n');
33 | options.Method = 'pcg';
34 | options.precFunc = @LogisticDiagPrecond;
35 | minFunc(@LogisticLoss,w_init,options,X,y);
36 | pause;
37 | 
38 | fprintf('\nRunning Preconditioned Conjugate Gradient (L-BFGS preconditioner)\n');
39 | options.Method = 'pcg';
40 | options.precFunc = [];
41 | minFunc(@LogisticLoss,w_init,options,X,y);
42 | pause;
43 | 
44 | fprintf('\nRunning Hessian-Free Newton w/ numerical Hessian-Vector products\n');
45 | options.Method = 'newton0';
46 | minFunc(@LogisticLoss,w_init,options,X,y);
47 | pause;
48 | 
49 | fprintf('\nRunning Preconditioned Hessian-Free Newton w/ numerical Hessian-Vector products (Diagonal preconditioner)\n');
50 | options.Method = 'pnewton0';
51 | options.precFunc = @LogisticDiagPrecond;
52 | minFunc(@LogisticLoss,w_init,options,X,y);
53 | pause;
54 | 
55 | fprintf('\nRunning Preconditioned Hessian-Free Newton w/ numerical Hessian-Vector products (L-BFGS preconditioner)\n');
56 | options.Method = 'pnewton0';
57 | options.precFunc = [];
58 | minFunc(@LogisticLoss,w_init,options,X,y);
59 | pause;
60 | 
61 | fprintf('\nRunning Hessian-Free Newton w/ analytic Hessian-Vector products\n');
62 | options.Method = 'newton0';
63 | options.HvFunc = @LogisticHv;
64 | minFunc(@LogisticLoss,w_init,options,X,y);
65 | pause;
66 | 
67 | fprintf('\nRunning Preconditioned Hessian-Free Newton w/ analytic Hessian-Vector products (Diagonal preconditioner)\n');
68 | options.Method = 'pnewton0';
69 | options.HvFunc = @LogisticHv;
70 | options.precFunc = @LogisticDiagPrecond;
71 | minFunc(@LogisticLoss,w_init,options,X,y);
72 | pause;
73 | 
74 | fprintf('\nRunning Preconditioned Hessian-Free Newton w/ analytic Hessian-Vector products (L-BFGS preconditioner)\n');
75 | options.Method = 'pnewton0';
76 | options.precFunc = [];
77 | options.HvFunc = @LogisticHv;
78 | minFunc(@LogisticLoss,w_init,options,X,y);
79 | pause;


--------------------------------------------------------------------------------
/minFunc_2012/example_minFunc.m:
--------------------------------------------------------------------------------
 1 | % Runs various limited-memory solvers on 2D rosenbrock function for 25
 2 | % function evaluations
 3 | addpath('./minFunc/')
 4 | addpath('./autoDif/')
 5 | addpath('./logisticExample/')
 6 | addpath('./minFunc/compiled/')
 7 | maxFunEvals = 100;
 8 | 
 9 | fprintf('Result after %d evaluations of limited-memory solvers on 2D rosenbrock:\n',maxFunEvals);
10 | 
11 | fprintf('---------------------------------------\n');
12 | fprintf('x1 = %.4f, x2 = %.4f (starting point)\n',0,0);
13 | fprintf('x1 = %.4f, x2 = %.4f (optimal solution)\n',1,1);
14 | fprintf('---------------------------------------\n');
15 | 
16 | if exist('minimize') == 2
17 |     % Minimize.m - conjugate gradient method
18 |     x = minimize([0 0]', 'rosenbrock', -maxFunEvals);
19 |     fprintf('x1 = %.4f, x2 = %.4f (minimize.m by C. Rasmussen)\n',x(1),x(2));
20 | end
21 | 
22 | options = [];
23 | options.display = 'none';
24 | options.maxFunEvals = maxFunEvals;
25 | 
26 | % Steepest Descent
27 | options.Method = 'sd';
28 | x = minFunc(@rosenbrock,[0 0]', options, 1);
29 | fprintf('x1 = %.4f, x2 = %.4f (minFunc with steepest descent)\n',x(1),x(2));
30 | 
31 | % Cyclic Steepest Descent
32 | options.Method = 'csd';
33 | x = minFunc(@rosenbrock,[0 0]',options);
34 | fprintf('x1 = %.4f, x2 = %.4f (minFunc with cyclic steepest descent)\n',x(1),x(2));
35 | 
36 | % Barzilai & Borwein
37 | options.Method = 'bb';
38 | options.bbType = 1;
39 | x = minFunc(@rosenbrock,[0 0]',options);
40 | fprintf('x1 = %.4f, x2 = %.4f (minFunc with spectral gradient descent)\n',x(1),x(2));
41 | 
42 | % Hessian-Free Newton
43 | options.Method = 'newton0';
44 | x = minFunc(@rosenbrock,[0 0]',options);
45 | fprintf('x1 = %.4f, x2 = %.4f (minFunc with Hessian-free Newton)\n',x(1),x(2));
46 | 
47 | % Hessian-Free Newton w/ L-BFGS preconditioner
48 | %options.Method = 'pnewton0';
49 | %x = minFunc(@rosenbrock,[0 0]',options);
50 | %fprintf('x1 = %.4f, x2 = %.4f (minFunc with preconditioned Hessian-free Newton)\n',x(1),x(2));
51 | 
52 | % Conjugate Gradient
53 | options.Method = 'cg';
54 | x = minFunc(@rosenbrock,[0 0]',options);
55 | fprintf('x1 = %.4f, x2 = %.4f (minFunc with conjugate gradient)\n',x(1),x(2));
56 | 
57 | % Scaled conjugate Gradient
58 | options.Method = 'scg';
59 | x = minFunc(@rosenbrock,[0 0]',options);
60 | fprintf('x1 = %.4f, x2 = %.4f (minFunc with scaled conjugate gradient)\n',x(1),x(2));
61 | 
62 | % Preconditioned Conjugate Gradient
63 | options.Method = 'pcg';
64 | x = minFunc(@rosenbrock,[0 0]',options);
65 | fprintf('x1 = %.4f, x2 = %.4f (minFunc with preconditioned conjugate gradient)\n',x(1),x(2));
66 | 
67 | % Default: L-BFGS (default)
68 | options.Method = 'lbfgs';
69 | x = minFunc(@rosenbrock,[0 0]',options);
70 | fprintf('x1 = %.4f, x2 = %.4f (minFunc with limited-memory BFGS - default)\n',x(1),x(2));
71 | 
72 | fprintf('---------------------------------------\n');
73 | 
74 | 
75 | 
76 | 
77 | 
78 | 
79 | 
80 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # ZSL_PP
Mohamed Elhoseiny*, Yizhe Zhu*, Han Zhang, Ahmed Elgammal, Link the head to the "peak": Zero Shot Learning from Noisy Text descriptions at Part Precision, CVPR, 2017
 3 | 
 4 | 
 5 | This code is implemented by Yizhe Zhu and Mohamed Elhoseiny. 
 6 | 
 7 | ## Processed feature Data:
 8 | You can download the dataset [CUB2011](https://drive.google.com/open?id=0B_8vkk7CF-pwejFFcEp2R1FfRFU) and [NABird](https://drive.google.com/open?id=0B_8vkk7CF-pwOGhpQXFUUXZlQjg).
 9 | 
10 | ## Raw wikipedia article data:
11 | Raw wikipedia article data of CUBird and NABird, as well as detailed merging information of NABird,  can be obtained [here](https://drive.google.com/open?id=0B_8vkk7CF-pwckxLQTVkcDBadGc).
12 | 
## Trained Models:
14 | [Trained models](https://drive.google.com/open?id=0B_8vkk7CF-pwMU5QQUlUOTZFblU)  reproduce the results in the paper.  
15 | 
16 | # Testing, reproducing the results in the paper
17 | ---------------------------------------------
18 | 
19 | ZSL_Test(Dataset = 'CUBird' or 'NABird', splitmode = 'Easy' or 'Hard', ImgFtSource = 'DET' or 'ATN')
20 | 
21 |    splitmode = Easy or Hard splits defined in Section 4.1 in the paper
22 | 
23 | 
#### CUBirds Easy split in Table 1
25 | --------------------------------------------------------------------------------
26 | >> **ZSL_Test('CUBird', 'Easy', 'ATN')**  ### ATN means using groundtruth part annotation  
27 | Dataset: CUB2011   Easy  ATN  
28 | Model: trained_models/CUBird_Easy_ATN.mat  
29 | Load Testing set  
30 | test_acc = 43.5049%  
31 | 
32 | ----------------------------------------------------------------------
33 |  >> **ZSL_Test('CUBird', 'Easy', 'DET')** ### DET means using the detected parts instead of GT parts.   
34 | Dataset: CUB2011   Easy  DET  
35 | Model: trained_models/CUBird_Easy_DET.mat  
36 | Load Testing set  
37 | test_acc = 37.5725%   
38 | 
39 | ####  NABirds Easy/Hard split in Table3
40 | --------------------------------------------------------------------------------
41 | >> **ZSL_Test('NABird', 'Easy')**  ### Easy means category-share splitting   
42 | Dataset: NABird   Easy  DET  
43 | Model: trained_models/NABird_Easy_DET.mat  
44 | Load Testing set  
45 | test_acc = 30.5937% 
46 | 
47 | --------------------------------------------------
>> **ZSL_Test('NABird', 'Hard')**   ### Hard means category-exclusive splitting   
49 | Dataset: NABird   Hard  DET  
50 | Model: trained_models/NABird_Hard_DET.mat  
51 | Load Testing set  
52 | test_acc = 8.1349%   
53 | 
54 | 
55 | 
56 | Training
57 | ---------
>> ZSL_Train(Dataset, Splitmode, ImgFtSource, lambda1, lambda2, GPU_mode)  
is the command to train the model using a particular setting.  
For example, ZSL_Train('CUBird', 'Easy', 'DET', 100000, 10000, true) trains on the CUBirds dataset with the Easy split, using the detected part boxes,
with lambda1=100000, lambda2=10000, and GPU_mode=true (training on the GPU). If GPU_mode is false, training is done on the CPU.
62 | 
63 | 
64 | 
65 | 
66 | 
67 | 


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/mex/lbfgsC.c:
--------------------------------------------------------------------------------
  1 | #include <math.h>
  2 | #include "mex.h"
  3 | 
  4 | /* See lbfgs.m for details! */
  5 | /* This function may not exit gracefully on bad input! */
  6 | 
  7 | 
  8 | void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
  9 | {
 10 |     /* Variable Declarations */
 11 |     
 12 |     double *s, *y, *g, *H, *d, *ro, *alpha, *beta, *q, *r;
 13 |     int nVars,nSteps,lhs_dims[2];
 14 |     double temp;
 15 |     int i,j;
 16 |     
 17 |     /* Get Input Pointers */
 18 | 	
 19 |     g = mxGetPr(prhs[0]);
 20 |     s = mxGetPr(prhs[1]);
 21 |     y = mxGetPr(prhs[2]);
 22 |     H = mxGetPr(prhs[3]);
 23 |     
 24 |     /* Compute number of variables (p), rank of update (d) */
 25 |     
 26 |     nVars = mxGetDimensions(prhs[1])[0];
 27 |     nSteps = mxGetDimensions(prhs[1])[1];
 28 |     
 29 | 	/* Allocated Memory for Function Variables */
 30 |     ro = mxCalloc(nSteps,sizeof(double));
 31 | 	alpha = mxCalloc(nSteps,sizeof(double));
 32 | 	beta = mxCalloc(nSteps,sizeof(double));
 33 | 	q = mxCalloc(nVars*(nSteps+1),sizeof(double));
 34 | 	r = mxCalloc(nVars*(nSteps+1),sizeof(double));
 35 | 	
 36 |     /* Set-up Output Vector */
 37 |     
 38 |     lhs_dims[0] = nVars;
 39 |     lhs_dims[1] = 1;
 40 |     
 41 |     plhs[0] = mxCreateNumericArray(2,lhs_dims,mxDOUBLE_CLASS,mxREAL);
 42 |     d = mxGetPr(plhs[0]);
 43 |     
 44 |     /* ro = 1/(y(:,i)'*s(:,i)) */
 45 |     for(i=0;i<nSteps;i++)
 46 |     {
 47 |         temp = 0;
 48 |         for(j=0;j<nVars;j++)
 49 |         {
 50 | 			temp += y[j+nVars*i]*s[j+nVars*i];
 51 |         }
 52 |         ro[i] = 1/temp;
 53 |     }
 54 | 	
 55 | 	/* q(:,k+1) = g */
 56 | 	for(i=0;i<nVars;i++)
 57 | 	{
 58 | 		q[i+nVars*nSteps] = g[i];
 59 | 	}
 60 | 
 61 | 	for(i=nSteps-1;i>=0;i--)
 62 | 	{
 63 | 		/* alpha(i) = ro(i)*s(:,i)'*q(:,i+1) */
 64 | 		alpha[i] = 0;
 65 | 		for(j=0;j<nVars;j++)
 66 | 		{
 67 | 			alpha[i] += s[j+nVars*i]*q[j+nVars*(i+1)]; 
 68 | 		}
 69 | 		alpha[i] *= ro[i];
 70 | 
 71 | 		/* q(:,i) = q(:,i+1)-alpha(i)*y(:,i) */
 72 | 		for(j=0;j<nVars;j++)
 73 | 		{
 74 | 			q[j+nVars*i]=q[j+nVars*(i+1)]-alpha[i]*y[j+nVars*i];
 75 | 		}
 76 | 	}
 77 | 
 78 | 	/*  r(:,1) = q(:,1) */
 79 | 	for(i=0;i<nVars;i++)
 80 | 	{
 81 | 		r[i] = H[0]*q[i];
 82 | 	}
 83 | 
 84 | 	for(i=0;i<nSteps;i++)
 85 | 	{
 86 | 		/* beta(i) = ro(i)*y(:,i)'*r(:,i) */
 87 | 		beta[i] = 0;
 88 | 		for(j=0;j<nVars;j++)
 89 | 		{
 90 | 			beta[i] += y[j+nVars*i]*r[j+nVars*i];
 91 | 		}
 92 | 		beta[i] *= ro[i];
 93 | 
 94 | 		/* r(:,i+1) = r(:,i) + s(:,i)*(alpha(i)-beta(i)) */
 95 | 		for(j=0;j<nVars;j++)
 96 | 		{
 97 | 			r[j+nVars*(i+1)]=r[j+nVars*i]+s[j+nVars*i]*(alpha[i]-beta[i]);
 98 | 		}
 99 | 	}
100 | 
101 | 	/* d = r(:,k+1) */
102 | 	for(i=0;i<nVars;i++)
103 | 	{
104 | 		d[i]=r[i+nVars*nSteps];
105 | 	}
106 | 
107 | 	/* Free Memory */
108 | 	
109 | 	mxFree(ro);
110 | 	mxFree(alpha);
111 | 	mxFree(beta);
112 | 	mxFree(q);
113 | 	mxFree(r);
114 | 	
115 | }
116 | 


--------------------------------------------------------------------------------
/ZSL_Test.m:
--------------------------------------------------------------------------------
 1 | function [ ] = ZSL_Test(Dateset, Splitmode, ImgFtSource, modelpath)
 2 | % example
 3 | % ZSL_Test('CUBird', 'Easy', 'DET', 'trained_models/CUBird_Easy_DET.mat')
 4 | 
 5 | %%% specify the setting you want to test, and path to the trained model
 6 | if(~exist('Dateset', 'var'))    Dateset = 'CUBird';  end % {'CUBird', 'NABird'}
 7 | if(~exist('Splitmode', 'var'))  Splitmode = 'Easy';  end % {'Easy', 'Hard'}
 8 | % feature extracted based on (1)detected boundingbox or (2)annotation. 
 9 | if(~exist('ImgFtSource', 'var')) ImgFtSource = 'DET'; end % {'DET', 'ATN'} 
10 | if(~exist('modelpath', 'var')) 
11 |     if(strcmp(Dateset, 'CUBird')&&strcmp(Splitmode, 'Easy')&&strcmp(ImgFtSource, 'DET'))
12 |         modelpath = 'trained_models/CUBird_Easy_DET.mat';
13 |     elseif(strcmp(Dateset, 'CUBird')&&strcmp(Splitmode, 'Easy')&&strcmp(ImgFtSource, 'ATN'))
14 |         modelpath = 'trained_models/CUBird_Easy_ATN.mat';
15 |     elseif(strcmp(Dateset, 'CUBird')&&strcmp(Splitmode, 'Hard')&&strcmp(ImgFtSource, 'DET'))
16 |         modelpath = 'trained_models/CUBird_Hard_DET.mat';
17 |     elseif(strcmp(Dateset, 'NABird')&&strcmp(Splitmode, 'Easy')&&strcmp(ImgFtSource, 'DET'))
18 |         modelpath = 'trained_models/NABird_Easy_DET.mat';
19 |     elseif(strcmp(Dateset, 'NABird')&&strcmp(Splitmode, 'Hard')&&strcmp(ImgFtSource, 'DET'))
20 |         modelpath = 'trained_models/NABird_Hard_DET.mat';
21 |     else
22 |         error('You need to provide a trained model. ')
23 |     end
24 | end
25 | 
26 | model = load(modelpath);
27 | path = get_datapath(Dateset, Splitmode, ImgFtSource, 0, 0, false);
28 | fprintf('Model: %s\n', modelpath)
29 | 
30 | %%%%  prepare the data for testing.
31 | img_feat_dict =  load(path.img_feat_path);
32 | text_feat_dict = load(path.text_feat_path); 
33 | img_label_dict = load(path.img_label_path);
34 | data_split_dict  = load(path.data_split_path); 
35 | 
36 | label = img_label_dict.imageClassLabels(:, 2);
37 | Data = double(img_feat_dict.cnn_feat');
38 | 
39 | ctr = data_split_dict.train_cid;
40 | cte = data_split_dict.test_cid;
41 | 
42 | NumTrnClass = length(unique(ctr));
43 | NumTstClass = length(unique(cte));
44 | fprintf('Load Testing set\n')
45 | 
46 | NumClass = NumTrnClass + NumTstClass;
47 | nPerClass = zeros(NumClass, 1);
48 | IdPerClass = cell(NumClass, 1);
49 | for idc = 1:NumClass
50 |     
51 |     IdPerClass{idc} = find(label==idc);
52 |     nPerClass(idc) = sum(label==idc);  
53 | end
54 | 
55 | Xte = [];yte = [];
56 | for idc = cte
57 |     Xc = Data(IdPerClass{idc}, :);
58 |     Xte = [Xte; Xc];
59 |     yte = [yte; idc*ones(size(Xc,1),1)];
60 | end
61 | 
62 | N_te = length(yte);
63 | y_te = zeros(N_te, 1); 
64 | for n =1:N_te
65 |     y_te(n) = find(cte==yte(n));
66 | end
67 | 
68 | Z_te  = text_feat_dict.PredicateMatrix(cte, :)';
69 | 
70 | %%%% Test and display
71 | fprintf('test_acc = %1.4f%%  \n', 100 * (1-get_error(Xte', model.W_x_opt, model.W_z_opt , Z_te, y_te)));
72 | 
73 | end
74 | 
function err = get_error(X, W_x, W_z, Z, y)
% GET_ERROR  Fraction of samples whose highest-scoring class differs from
% the ground truth.
%
%   X   - image features, one column per sample
%   W_x - projection applied to the image features
%   W_z - projection applied to the class features
%   Z   - class features, one column per candidate class
%   y   - ground-truth class index (column of Z) for every sample
%
%   err - number of wrong predictions divided by numel(y)

    % One row per sample, one column per candidate class.
    pred_score = X' * W_x' * W_z * Z;
    % Take the arg-max explicitly along the class dimension. max(A') would
    % reduce a single-class score matrix (a row vector after transposing)
    % to one scalar instead of one prediction per sample.
    [~, pred_id] = max(pred_score, [], 2);
    % y(:) keeps the comparison element-wise even if y is a row vector.
    err = sum(pred_id ~= y(:)) / numel(y);
end
82 | 


--------------------------------------------------------------------------------
/get_datapath.m:
--------------------------------------------------------------------------------
 1 | function path = get_datapath(Dateset, Splitmode, ImgFtSource, lambda1, lambda2, istrain) 
 2 |     path=[];
 3 |     if(strcmp(Dateset, 'CUBird'))
 4 | 
 5 |         datapath = './dataset/CUB2011';
 6 |         repath   = './CUBirdResult'; 
 7 | 
 8 |         text_feat_path = [datapath, '/11083D_TFIDF.mat'];
 9 |         img_feat_path  = [datapath, '/cnn_feat_7part_DET_ReLU.mat'];
10 |         img_label_path = [datapath, '/image_class_labels.mat'];
11 |         disp(['Dataset: CUB2011   ', Splitmode, '  ', ImgFtSource]);
12 |         if(istrain)
13 |             fprintf('Parameter  %d  %d\n', lambda1, lambda2); 
14 |         end
15 |         if(strcmp(Splitmode, 'Easy'))
16 |             data_split_path = [datapath, '/train_test_split_easy.mat'];
17 |             if(strcmp(ImgFtSource, 'DET'))
18 |                 img_feat_path  = [datapath, '/cnn_feat_7part_DET_ReLU.mat'];
19 |             elseif(strcmp(ImgFtSource, 'ATN'))
20 |                 img_feat_path  = [datapath, '/cnn_feat_7part_ATN_ReLU.mat'];
21 |             end
22 |         elseif(strcmp(Splitmode, 'Hard'))
23 |             data_split_path = [datapath, '/train_test_split_hard.mat'];
24 |             
25 |             if(strcmp(ImgFtSource, 'DET'))
26 |                 img_feat_path  = [datapath, '/cnn_feat_7part_DET_ReLU_hard.mat'];
27 |             else
28 |                 error('This setting is not available now.')
29 |             end
30 |         end
31 | 
32 |     elseif(strcmp(Dateset, 'NABird'))
33 | 
34 |         datapath = './dataset/NABird';
35 |         repath = './NABirdResult'; 
36 | 
37 |         text_feat_path = [datapath, '/13585D_TFIDF_NABird.mat'];
38 |         img_feat_path  = [datapath, '/cnn_feat_6part_DET_NABird.mat'];
39 |         img_label_path = [datapath, '/image_class_labels_NABird.mat'];
40 | 
41 |         disp(['Dataset: NABird   ',  Splitmode, '  ', ImgFtSource]);
42 |         if(istrain)
43 |             fprintf('Parameter  %d  %d\n', lambda1, lambda2); 
44 |         end
45 |         
46 |         if(strcmp(Splitmode, 'Easy'))
47 |             data_split_path = [datapath, '/train_test_split_NABird_easy.mat'];
48 |             if(strcmp(ImgFtSource, 'DET'))
49 |                 img_feat_path  = [datapath, '/cnn_feat_6part_DET_NABird.mat'];
50 |             elseif(strcmp(ImgFtSource, 'ATN'))
51 |                 img_feat_path  = [datapath, '/cnn_feat_6part_ATN_NABird.mat'];
52 |             end
53 |         elseif(strcmp(Splitmode, 'Hard'))
54 |             data_split_path = [datapath, '/train_test_split_NABird_hard.mat'];
55 |             
56 |             if(strcmp(ImgFtSource, 'DET'))
57 |                 img_feat_path  = [datapath, '/cnn_feat_6part_DET_NABird_hard.mat'];
58 |             else
59 |                 error('This setting is not available now.')
60 |             end
61 |         end
62 |     else
63 |         error('unsupported Dateset, You need prepare it first.\n');
64 |     end
65 |     
66 |     if(istrain)
67 |         if(lambda1 ~=0 ) param1 = log10(lambda1); else param1 =0; end
68 |         if(lambda2 ~=0 ) param2 = log10(lambda2); else param2 =0; end
69 |         %%%% prepare output path
70 |         if(~exist(repath, 'dir')) mkdir(repath); end
71 |         repath = sprintf('%s/%s_%s_Param_%s_%s_%s', repath, Dateset, Splitmode, num2str(param1), num2str(param2), ImgFtSource);
72 |         if(~exist(repath, 'dir')) mkdir(repath); end
73 |         disp(['Result stored in:', repath]); 
74 |         path.repath = repath; 
75 |     end
76 |     
77 |     path.text_feat_path = text_feat_path;
78 |     path.img_feat_path  = img_feat_path;
79 |     path.img_label_path = img_label_path;
80 |     path.data_split_path = data_split_path; 
81 |     
82 |     
83 | end 


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/polyinterp.m:
--------------------------------------------------------------------------------
  1 | function [minPos,fmin] = polyinterp(points,doPlot,xminBound,xmaxBound)
  2 | % function [minPos] = polyinterp(points,doPlot,xminBound,xmaxBound)
  3 | %
  4 | %   Minimum of interpolating polynomial based on function and derivative
  5 | %   values
  6 | %
  7 | %   It can also be used for extrapolation if {xmin,xmax} are outside
  8 | %   the domain of the points.
  9 | %
 10 | %   Input:
 11 | %       points(pointNum,[x f g])
 12 | %       doPlot: set to 1 to plot, default: 0
 13 | %       xmin: min value that brackets minimum (default: min of points)
 14 | %       xmax: max value that brackets maximum (default: max of points)
 15 | %
 16 | %   set f or g to sqrt(-1) if they are not known
 17 | %   the order of the polynomial is the number of known f and g values minus 1
 18 | 
 19 | if nargin < 2
 20 |     doPlot = 0;
 21 | end
 22 | 
 23 | nPoints = size(points,1);
 24 | order = sum(sum((imag(points(:,2:3))==0)))-1;
 25 | 
 26 | xmin = min(points(:,1));
 27 | xmax = max(points(:,1));
 28 | 
 29 | % Compute Bounds of Interpolation Area
 30 | if nargin < 3
 31 |     xminBound = xmin;
 32 | end
 33 | if nargin < 4
 34 |     xmaxBound = xmax;
 35 | end
 36 | 
 37 | % Code for most common case:
 38 | %   - cubic interpolation of 2 points
 39 | %       w/ function and derivative values for both
 40 | 
 41 | if nPoints == 2 && order ==3 && doPlot == 0
 42 |     % Solution in this case (where x2 is the farthest point):
 43 |     %    d1 = g1 + g2 - 3*(f1-f2)/(x1-x2);
 44 |     %    d2 = sqrt(d1^2 - g1*g2);
 45 |     %    minPos = x2 - (x2 - x1)*((g2 + d2 - d1)/(g2 - g1 + 2*d2));
 46 |     %    t_new = min(max(minPos,x1),x2);
 47 |     [minVal minPos] = min(points(:,1));
 48 |     notMinPos = -minPos+3;
 49 |     d1 = points(minPos,3) + points(notMinPos,3) - 3*(points(minPos,2)-points(notMinPos,2))/(points(minPos,1)-points(notMinPos,1));
 50 |     d2 = sqrt(d1^2 - points(minPos,3)*points(notMinPos,3));
 51 |     if isreal(d2)
 52 |         t = points(notMinPos,1) - (points(notMinPos,1) - points(minPos,1))*((points(notMinPos,3) + d2 - d1)/(points(notMinPos,3) - points(minPos,3) + 2*d2));
 53 |         minPos = min(max(t,xminBound),xmaxBound);
 54 |     else
 55 |         minPos = (xmaxBound+xminBound)/2;
 56 |     end
 57 |     return;
 58 | end
 59 | 
 60 | % Constraints Based on available Function Values
 61 | A = zeros(0,order+1);
 62 | b = zeros(0,1);
 63 | for i = 1:nPoints
 64 |     if imag(points(i,2))==0
 65 |         constraint = zeros(1,order+1);
 66 |         for j = order:-1:0
 67 |             constraint(order-j+1) = points(i,1)^j;
 68 |         end
 69 |         A = [A;constraint];
 70 |         b = [b;points(i,2)];
 71 |     end
 72 | end
 73 | 
 74 | % Constraints based on available Derivatives
 75 | for i = 1:nPoints
 76 |     if isreal(points(i,3))
 77 |         constraint = zeros(1,order+1);
 78 |         for j = 1:order
 79 |             constraint(j) = (order-j+1)*points(i,1)^(order-j);
 80 |         end
 81 |         A = [A;constraint];
 82 |         b = [b;points(i,3)];
 83 |     end
 84 | end
 85 | 
 86 | % Find interpolating polynomial
 87 | [params,ignore] = linsolve(A,b);
 88 | 
 89 | % Compute Critical Points
 90 | dParams = zeros(order,1);
 91 | for i = 1:length(params)-1
 92 |     dParams(i) = params(i)*(order-i+1);
 93 | end
 94 | 
 95 | if any(isinf(dParams))
 96 |     cp = [xminBound;xmaxBound;points(:,1)].';
 97 | else
 98 |     cp = [xminBound;xmaxBound;points(:,1);roots(dParams)].';
 99 | end
100 | 
101 | % Test Critical Points
102 | fmin = inf;
103 | minPos = (xminBound+xmaxBound)/2; % Default to Bisection if no critical points valid
104 | for xCP = cp
105 |     if imag(xCP)==0 && xCP >= xminBound && xCP <= xmaxBound
106 |         fCP = polyval(params,xCP);
107 |         if imag(fCP)==0 && fCP < fmin
108 |             minPos = real(xCP);
109 |             fmin = real(fCP);
110 |         end
111 |     end
112 | end
113 | 
114 | % Plot Situation
115 | if doPlot
116 |     clf; hold on;
117 | 
118 |     % Plot Points
119 |     plot(points(:,1),points(:,2),'b*');
120 | 
121 |     % Plot Derivatives
122 |     for i = 1:nPoints
123 |         if isreal(points(i,3))
124 |             m = points(i,3);
125 |             b = points(i,2) - m*points(i,1);
126 |             plot([points(i,1)-.05 points(i,1)+.05],...
127 |                 [(points(i,1)-.05)*m+b (points(i,1)+.05)*m+b],'c.-');
128 |         end
129 |     end
130 | 
131 |     % Plot Function
132 |     x = min(xmin,xminBound)-.1:(max(xmax,xmaxBound)+.1-min(xmin,xminBound)+.1)/100:max(xmax,xmaxBound)+.1;
133 |     for i = 1:length(x)
134 |         f(i) = polyval(params,x(i));
135 |     end
136 |     plot(x,f,'y');
137 |     axis([x(1)-.1 x(end)+.1 min(f)-.1 max(f)+.1]);
138 | 
139 |     % Plot Minimum
140 |     plot(minPos,fmin,'g+');
141 |     if doPlot == 1
142 |         pause(1);
143 |     end
144 | end


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/minFunc_processInputOptions.m:
--------------------------------------------------------------------------------
  1 | 
  2 | function [verbose,verboseI,debug,doPlot,maxFunEvals,maxIter,optTol,progTol,method,...
  3 |     corrections,c1,c2,LS_init,cgSolve,qnUpdate,cgUpdate,initialHessType,...
  4 |     HessianModify,Fref,useComplex,numDiff,LS_saveHessianComp,...
  5 |     Damped,HvFunc,bbType,cycle,...
  6 |     HessianIter,outputFcn,useMex,useNegCurv,precFunc,...
  7 |     LS_type,LS_interp,LS_multi,DerivativeCheck] = ...
  8 |     minFunc_processInputOptions(o)
  9 | 
 10 | % Constants
 11 | SD = 0;
 12 | CSD = 1;
 13 | BB = 2;
 14 | CG = 3;
 15 | PCG = 4;
 16 | LBFGS = 5;
 17 | QNEWTON = 6;
 18 | NEWTON0 = 7;
 19 | NEWTON = 8;
 20 | TENSOR = 9;
 21 | 
 22 | verbose = 1;
 23 | verboseI= 1;
 24 | debug = 0;
 25 | doPlot = 0;
 26 | method = LBFGS;
 27 | cgSolve = 0;
 28 | 
 29 | o = toUpper(o);
 30 | 
 31 | if isfield(o,'DISPLAY')
 32 |     switch(upper(o.DISPLAY))
 33 |         case 0
 34 |             verbose = 0;
 35 |             verboseI = 0;
 36 |         case 'FINAL'
 37 |             verboseI = 0;
 38 |         case 'OFF'
 39 |             verbose = 0;
 40 |             verboseI = 0;
 41 |         case 'NONE'
 42 |             verbose = 0;
 43 |             verboseI = 0;
 44 |         case 'FULL'
 45 |             debug = 1;
 46 |         case 'EXCESSIVE'
 47 |             debug = 1;
 48 |             doPlot = 1;
 49 |     end
 50 | end
 51 | 
 52 | DerivativeCheck = 0;
 53 | if isfield(o,'DERIVATIVECHECK')
 54 |     switch(upper(o.DERIVATIVECHECK))
 55 |         case 1
 56 |             DerivativeCheck = 1;
 57 |         case 'ON'
 58 |             DerivativeCheck = 1;
 59 |     end
 60 | end
 61 | 
 62 | LS_init = 0;
 63 | LS_type = 1;
 64 | LS_interp = 2;
 65 | LS_multi = 0;
 66 | Fref = 1;
 67 | Damped = 0;
 68 | HessianIter = 1;
 69 | c2 = 0.9;
 70 | if isfield(o,'METHOD')
 71 |     m = upper(o.METHOD);
 72 |     switch(m)
 73 |         case 'TENSOR'
 74 |             method = TENSOR;
 75 |         case 'NEWTON'
 76 |             method = NEWTON;
 77 |         case 'MNEWTON'
 78 |             method = NEWTON;
 79 |             HessianIter = 5;
 80 |         case 'PNEWTON0'
 81 |             method = NEWTON0;
 82 |             cgSolve = 1;
 83 |         case 'NEWTON0'
 84 |             method = NEWTON0;
 85 |         case 'QNEWTON'
 86 |             method = QNEWTON;
 87 |             Damped = 1;
 88 |         case 'LBFGS'
 89 |             method = LBFGS;
 90 |         case 'BB'
 91 |             method = BB;
 92 |             LS_type = 0;
 93 |             Fref = 20;
 94 |         case 'PCG'
 95 |             method = PCG;
 96 |             c2 = 0.2;
 97 |             LS_init = 2;
 98 |         case 'SCG'
 99 |             method = CG;
100 |             c2 = 0.2;
101 |             LS_init = 4;
102 |         case 'CG'
103 |             method = CG;
104 |             c2 = 0.2;
105 |             LS_init = 2;
106 |         case 'CSD'
107 |             method = CSD;
108 |             c2 = 0.2;
109 |             Fref = 10;
110 |             LS_init = 2;
111 |         case 'SD'
112 |             method = SD;
113 |             LS_init = 2;
114 |     end
115 | end
116 | 
117 | maxFunEvals = getOpt(o,'MAXFUNEVALS',1000);
118 | maxIter = getOpt(o,'MAXITER',500);
119 | optTol = getOpt(o,'OPTTOL',1e-5);
120 | progTol = getOpt(o,'PROGTOL',1e-9);
121 | corrections = getOpt(o,'CORRECTIONS',100);
122 | corrections = getOpt(o,'CORR',corrections);
123 | c1 = getOpt(o,'C1',1e-4);
124 | c2 = getOpt(o,'C2',c2);
125 | LS_init = getOpt(o,'LS_INIT',LS_init);
126 | cgSolve = getOpt(o,'CGSOLVE',cgSolve);
127 | qnUpdate = getOpt(o,'QNUPDATE',3);
128 | cgUpdate = getOpt(o,'CGUPDATE',2);
129 | initialHessType = getOpt(o,'INITIALHESSTYPE',1);
130 | HessianModify = getOpt(o,'HESSIANMODIFY',0);
131 | Fref = getOpt(o,'FREF',Fref);
132 | useComplex = getOpt(o,'USECOMPLEX',0);
133 | numDiff = getOpt(o,'NUMDIFF',0);
134 | LS_saveHessianComp = getOpt(o,'LS_SAVEHESSIANCOMP',1);
135 | Damped = getOpt(o,'DAMPED',Damped);
136 | HvFunc = getOpt(o,'HVFUNC',[]);
137 | bbType = getOpt(o,'BBTYPE',0);
138 | cycle = getOpt(o,'CYCLE',3);
139 | HessianIter = getOpt(o,'HESSIANITER',HessianIter);
140 | outputFcn = getOpt(o,'OUTPUTFCN',[]);
141 | useMex = getOpt(o,'USEMEX',1);
142 | useNegCurv = getOpt(o,'USENEGCURV',1);
143 | precFunc = getOpt(o,'PRECFUNC',[]);
144 | LS_type = getOpt(o,'LS_type',LS_type);
145 | LS_interp = getOpt(o,'LS_interp',LS_interp);
146 | LS_multi = getOpt(o,'LS_multi',LS_multi);
147 | end
148 | 
function [v] = getOpt(options,opt,default)
% GETOPT  Value of the field OPT in OPTIONS, or DEFAULT when that field is
% absent or holds an empty value.
v = default;
if isfield(options,opt)
    stored = options.(opt);
    if ~isempty(stored)
        v = stored;
    end
end
end
160 | 
function [o] = toUpper(o)
% TOUPPER  Add an upper-case-named copy of every field of O. The original
% fields are kept; an empty O is returned unchanged.
if isempty(o)
    return;
end
names = fieldnames(o);
for k = 1:length(names)
    key = names{k};
    o.(upper(key)) = o.(key);
end
end


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/ArmijoBacktrack.m:
--------------------------------------------------------------------------------
function [t,x_new,f_new,g_new,funEvals,H] = ArmijoBacktrack(...
    x,t,d,f,fr,g,gtd,c1,LS_interp,LS_multi,progTol,debug,doPlot,saveHessianComp,funObj,varargin)
% [t,x_new,f_new,g_new,funEvals,H] = ArmijoBacktrack(...
%    x,t,d,f,fr,g,gtd,c1,LS_interp,LS_multi,progTol,debug,doPlot,saveHessianComp,funObj,varargin)
%
% Backtracking linesearch to satisfy Armijo condition: the step t is shrunk
% until f(x+t*d) <= fr + c1*t*gtd and f(x+t*d) passes isLegal.
%
% Inputs:
%   x: starting location
%   t: initial step size
%   d: descent direction
%   f: function value at starting location
%   fr: reference function value (usually funObj(x))
%   g: gradient at starting location (returned as g_new if the search fails)
%   gtd: directional derivative at starting location
%   c1: sufficient decrease parameter
%   LS_interp: type of interpolation
%       0: halve the step, ignoring the values at the trial point
%       1: interpolate using function values only
%       otherwise: interpolate using function and gradient values
%   LS_multi: if nonzero, the previous trial point is stored and also used
%       for the interpolation
%   progTol: minimum allowable step length
%   debug: display debugging information
%   doPlot: do a graphical display of interpolation
%   saveHessianComp: if nonzero, the third output of funObj is not
%       requested while backtracking; it is requested once after the loop
%   funObj: objective function
%   varargin: parameters of objective function
%
% Outputs:
%   t: step length (0 if the search fails)
%   x_new: x+t*d
%   f_new: function value at x+t*d
%   g_new: gradient value at x+t*d
%   funEvals: number function evaluations performed by line search
%   H: third output of funObj (only computed if six outputs are requested)
%
% recent change: LS changed to LS_interp and LS_multi

% Evaluate the Objective and Gradient at the Initial Step
if nargout == 6
    [f_new,g_new,H] = funObj(x + t*d,varargin{:});
else
    [f_new,g_new] = funObj(x+t*d,varargin{:});
end
funEvals = 1;

% Backtrack until sufficient decrease holds and f_new is a legal value
while f_new > fr + c1*t*gtd || ~isLegal(f_new)
    temp = t; % step that was just tried
    
    if LS_interp == 0 || ~isLegal(f_new)
        % Ignore value of new point
        if debug
            fprintf('Fixed BT\n');
        end
        t = 0.5*t;
    elseif LS_interp == 1 || ~isLegal(g_new)
        % Use function value at new point, but not its derivative
        % (an unknown derivative is passed to polyinterp as sqrt(-1))
        if funEvals < 2 || LS_multi == 0 || ~isLegal(f_prev)
            % Backtracking w/ quadratic interpolation based on two points
            if debug
                fprintf('Quad BT\n');
            end
            t = polyinterp([0 f gtd; t f_new sqrt(-1)],doPlot,0,t);
        else
            % Backtracking w/ cubic interpolation based on three points
            if debug
                fprintf('Cubic BT\n');
            end
            t = polyinterp([0 f gtd; t f_new sqrt(-1); t_prev f_prev sqrt(-1)],doPlot,0,t);
        end
    else
        % Use function value and derivative at new point
        
        if funEvals < 2 || LS_multi == 0 || ~isLegal(f_prev)
            % Backtracking w/ cubic interpolation w/ derivative
            if debug
                fprintf('Grad-Cubic BT\n');
            end
            t = polyinterp([0 f gtd; t f_new g_new'*d],doPlot,0,t);
        elseif ~isLegal(g_prev)
            % Backtracking w/ quartic interpolation 3 points and derivative
            % of two
            if debug
                fprintf('Grad-Quartic BT\n');
            end
            t = polyinterp([0 f gtd; t f_new g_new'*d; t_prev f_prev sqrt(-1)],doPlot,0,t);
        else
            % Backtracking w/ quintic interpolation of 3 points and derivative
            % of three
            if debug
                fprintf('Grad-Quintic BT\n');
            end
            t = polyinterp([0 f gtd; t f_new g_new'*d; t_prev f_prev g_prev'*d],doPlot,0,t);
         end
    end
    
    % Adjust if change in t is too small/large
    % (the new step is kept within [1e-3, 0.6] times the previous step)
    if t < temp*1e-3
        if debug
            fprintf('Interpolated Value Too Small, Adjusting\n');
        end
        t = temp*1e-3;
    elseif t > temp*0.6
        if debug
            fprintf('Interpolated Value Too Large, Adjusting\n');
        end
        t = temp*0.6;
    end

    % Store old point if doing three-point interpolation
    if LS_multi
        f_prev = f_new;
        t_prev = temp;
        if LS_interp == 2
            g_prev = g_new;
        end
    end
    
    % Evaluate the objective at the new trial step
    if ~saveHessianComp && nargout == 6
        [f_new,g_new,H] = funObj(x + t*d,varargin{:});
    else
        [f_new,g_new] = funObj(x + t*d,varargin{:});
    end
    funEvals = funEvals+1;

    % Check whether step size has become too small
    if max(abs(t*d)) <= progTol
        if debug
            fprintf('Backtracking Line Search Failed\n');
        end
        % Give up: report a zero step and the values at the starting point
        t = 0;
        f_new = f;
        g_new = g;
        break;
    end
end

% Evaluate Hessian at new point
% (only needed if the loop ran without requesting H from funObj)
if nargout == 6 && funEvals > 1 && saveHessianComp
    [f_new,g_new,H] = funObj(x + t*d,varargin{:});
    funEvals = funEvals+1;
end

x_new = x + t*d;

end
140 | 


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/mex/mcholC.c:
--------------------------------------------------------------------------------
  1 | #include <math.h>
  2 | #include "mex.h"
  3 | 
  4 | double mymax(double x, double y)
  5 | {
  6 |     if (x > y)
  7 |         return x;
  8 |     else
  9 |         return y;
 10 | }
 11 | 
 12 | double absolute(double x)
 13 | {
 14 |     if (x >= -x)
 15 |         return x;
 16 |     else
 17 |         return -x;
 18 | }
 19 | 
 20 | void permuteInt(int *x, int p, int q)
 21 | {
 22 |     int temp;
 23 |     temp = x[p];
 24 |     x[p] = x[q];
 25 |     x[q] = temp;
 26 | }
 27 | 
 28 | void permute(double *x, int p, int q)
 29 | {
 30 |     double temp;
 31 |     temp = x[p];
 32 |     x[p] = x[q];
 33 |     x[q] = temp;
 34 | }
 35 | 
 36 | void permuteRows(double *x, int p, int q,int n)
 37 | {
 38 |     int i;
 39 |     double temp;
 40 |     for(i = 0; i < n; i++)
 41 |     {
 42 |         temp = x[p+i*n];
 43 |         x[p+i*n] = x[q+i*n];
 44 |         x[q+i*n] = temp;
 45 |     }
 46 | }
 47 | 
 48 | void permuteCols(double *x, int p, int q,int n)
 49 | {
 50 |     int i;
 51 |     double temp;
 52 |     for(i = 0; i < n; i++)
 53 |     {
 54 |         temp = x[i+p*n];
 55 |         x[i+p*n] = x[i+q*n];
 56 |         x[i+q*n] = temp;
 57 |     }
 58 | }
 59 | 
 60 | void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
 61 | {
 62 |     int n,sizL[2],sizD[2],i,j,q,s,
 63 |     *P;
 64 |     
 65 |     double mu,gamma,xi,delta,beta,maxVal,theta,
 66 |     *c,    *H, *L, *D, *A;
 67 |     
 68 |     /* Input */
 69 |     H = mxGetPr(prhs[0]);
 70 |     if (nrhs == 1)
 71 |     {
 72 |         mu = 1e-12;
 73 |     }
 74 |     else
 75 |     {
 76 |         mu = mxGetScalar(prhs[1]);
 77 |     }
 78 |     
 79 |     /* Compute Sizes */
 80 |     n = mxGetDimensions(prhs[0])[0];
 81 |     
 82 |     /* Form Output */
 83 |     sizL[0] = n;
 84 |     sizL[1] = n;
 85 |     plhs[0] = mxCreateNumericArray(2,sizL,mxDOUBLE_CLASS,mxREAL);
 86 |     L = mxGetPr(plhs[0]);
 87 |     sizD[0] = n;
 88 |     sizD[1] = 1;
 89 |     plhs[1] = mxCreateNumericArray(2,sizD,mxDOUBLE_CLASS,mxREAL);
 90 |     D = mxGetPr(plhs[1]);
 91 |     plhs[2] = mxCreateNumericArray(2,sizD,mxINT32_CLASS,mxREAL);
 92 |     P = (int*)mxGetData(plhs[2]);
 93 |     
 94 |     /* Initialize */
 95 |     c = mxCalloc(n*n,sizeof(double));
 96 |     A = mxCalloc(n*n,sizeof(double));
 97 |     
 98 |     for (i = 0; i < n; i++)
 99 |     {
100 |         P[i] = i;
101 |         for (j = 0;j < n; j++)
102 |         {
103 |             A[i+n*j] = H[i+n*j];
104 |         }
105 |     }
106 |     
107 |     gamma = 0;
108 |     for (i = 0; i < n; i++)
109 |     {
110 |         L[i+n*i] = 1;
111 |         c[i+n*i] = A[i+n*i];
112 |     }
113 |     
114 |     /* Compute modification parameters */
115 |     gamma = -1;
116 |     xi = -1;
117 |     for (i = 0; i < n; i++)
118 |     {
119 |         gamma = mymax(gamma,absolute(A[i+n*i]));
120 |         for (j = 0;j < n; j++)
121 |         {
122 |             /*printf("A(%d,%d) = %f, %f\n",i,j,A[i+n*j],absolute(A[i+n*j]));*/
123 |             if (i != j)
124 |                 xi = mymax(xi,absolute(A[i+n*j]));
125 |         }
126 |     }
127 |     delta = mu*mymax(gamma+xi,1);
128 |     
129 |     if (n > 1)
130 |     {
131 |         beta = sqrt(mymax(gamma,mymax(mu,xi/sqrt(n*n-1))));
132 |     }
133 |     else
134 |     {
135 |         beta = sqrt(mymax(gamma,mu));
136 |     }
137 |     
138 |     for (j = 0; j < n; j++)
139 |     {
140 |         
141 |     /* Find q that results in Best Permutation with j */
142 |         maxVal = -1;
143 |         q = 0;
144 |         for(i = j; i < n; i++)
145 |         {
146 |             if (absolute(c[i+n*i]) > maxVal)
147 |             {
148 |                 maxVal = mymax(maxVal,absolute(c[i+n*i]));
149 |                 q = i;
150 |             }
151 |         }
152 |         
153 |         /* Permute D,c,L,A,P */
154 |         permute(D,j,q);
155 |         permuteInt(P,j,q);
156 |         permuteRows(c,j,q,n);
157 |         permuteCols(c,j,q,n);
158 |         permuteRows(L,j,q,n);
159 |         permuteCols(L,j,q,n);
160 |         permuteRows(A,j,q,n);
161 |         permuteCols(A,j,q,n);
162 |         
163 |         for(s = 0; s <= j-1; s++)
164 |             L[j+n*s] = c[j+n*s]/D[s];
165 |         
166 |         for(i = j+1; i < n; i++)
167 |         {
168 |             c[i+j*n] = A[i+j*n];
169 |             for(s = 0; s <= j-1; s++)
170 |             {
171 |                 c[i+j*n] -= L[j+n*s]*c[i+n*s];
172 |             }
173 |         }
174 |         
175 |         theta = 0;
176 |         if (j < n-1)
177 |         {
178 |             for(i = j+1;i < n; i++)
179 |                 theta = mymax(theta,absolute(c[i+n*j]));
180 |         }
181 |         
182 |         D[j] = mymax(absolute(c[j+n*j]),mymax(delta,theta*theta/(beta*beta)));
183 |         
184 |         if (j < n-1)
185 |         {
186 |             for(i = j+1; i < n; i++)
187 |             {
188 |                 c[i+n*i] = c[i+n*i] - c[i+n*j]*c[i+n*j]/D[j];
189 |             }
190 |         }
191 |         
192 |     }
193 |     
194 |     for(i = 0; i < n; i++)
195 |         P[i]++;
196 |     
197 |     mxFree(c);
198 |     mxFree(A);
199 | }


--------------------------------------------------------------------------------
/ZSL_Train.m:
--------------------------------------------------------------------------------
  1 | function [ ] = ZSL_Train(Dateset, Splitmode, ImgFtSource, lambda1, lambda2, GPU_mode)
  2 | % example
  3 | % ZSL_Train('CUBird', 'Easy', 'DET', 100000, 10000, true)
  4 | 
  5 | %gpuDevice(2)
  6 | if(~exist('Dateset', 'var'))    Dateset = 'CUBird';  end % {'CUBird', 'NABird'}
  7 | if(~exist('Splitmode', 'var'))  Splitmode = 'Easy';  end % {'Easy', 'Hard'}
  8 | % feature extracted based on (1)detected boundingbox or (2)annotation. 
  9 | if(~exist('ImgFtSource', 'var')) ImgFtSource = 'DET'; end % {'DET', 'ATN'} 
 10 | if(~exist('lambda1', 'var'))    lambda1 = 100000;   end
 11 | if(~exist('lambda2', 'var'))    lambda2 = 10000;   end
 12 | if(~exist('GPU_mode', 'var'))      GPU_mode = true;   end
 13 | addpath(genpath('./minFunc_2012'))
 14 | %%%%  set to True if continuing to train
 15 | continueTrain = false;
 16 | if(continueTrain)
 17 |     continue_weight_path = 'CUBirdResult/CUBird_Easy_Param_5_4_DET/Weight_opt_250.mat'; 
 18 |     startLoop = 251; 
 19 | end
 20 | 
 21 | path = get_datapath(Dateset, Splitmode, ImgFtSource, lambda1, lambda2, true);
 22 | 
 23 | if(GPU_mode)  fprintf('Using GPU_mode to train.\n')
 24 | else  fprintf('Using CPU_mode to train.\n')
 25 | end
 26 | 
 27 | %%%%  prepare the data for training. 
 28 | img_feat_dict =  load(path.img_feat_path);
 29 | text_feat_dict = load(path.text_feat_path); 
 30 | img_label_dict = load(path.img_label_path);
 31 | data_split_dict  = load(path.data_split_path); 
 32 |  
 33 | label = img_label_dict.imageClassLabels(:, 2);
 34 | Data = double(img_feat_dict.cnn_feat');
 35 | 
 36 | ctr = data_split_dict.train_cid;
 37 | cte = data_split_dict.test_cid;
 38 | 
 39 | 
 40 | NumTrnClass = length(unique(ctr));
 41 | NumTstClass = length(unique(cte));
 42 | fprintf('Load training set\n')
 43 | 
 44 | NumClass = NumTrnClass + NumTstClass;
 45 | nPerClass = zeros(NumClass, 1);
 46 | Id_perClass = cell(NumClass, 1);
 47 | 
 48 | for idc = 1:NumClass
 49 |     Id_perClass{idc} = find(label==idc);
 50 |     nPerClass(idc) = sum(label==idc);
 51 | end
 52 | 
 53 | Xtr = []; ytr = [];
 54 | for idc = ctr
 55 |     Xc = Data(Id_perClass{idc}, :);
 56 |     Xtr = [Xtr; Xc];
 57 |     ytr = [ytr; idc*ones(size(Xc,1),1)];
 58 | end
 59 | 
 60 | C = NumTrnClass;
 61 | N = length(ytr);
 62 | Y = zeros(N, C);
 63 | y = zeros(N, 1);
 64 | for n =1:N
 65 |     Y(n, :) = ctr==ytr(n);
 66 |     y(n) = find(ctr==ytr(n));
 67 | end
 68 | 
 69 | X   = Xtr';
 70 | Z   = text_feat_dict.PredicateMatrix(ctr, :)';
 71 | d_x = size(X, 1);
 72 | d_z = size(Z, 1);
 73 | % Dimension of features for each part
 74 | if(~exist('d_p', 'var')) d_p = 512;  end 
 75 | % Dimension of embedding space
 76 | if(~exist('m', 'var')) m = NumTrnClass;  end 
 77 | if(strcmp(Dateset, 'CUBird')) 
 78 |     num_Parts = 7; 
 79 | elseif(strcmp(Dateset, 'NABird')) 
 80 |     num_Parts = 6; 
 81 | end
 82 | 
 83 | %%%% Set parameter for training 
 84 | MAX_ITER = 20;    %%%%   Number of iterations in a loop 
 85 | MAX_LOOP = 300;   %%%%   Number of max loops. 
 86 | MAX_FUNCEVL =100; %%%%   
 87 | 
 88 | options = [];
 89 | options.Method = 'lbfgs';
 90 | options.Display = 'full';
 91 | options.DerivativeCheck = 'off';
 92 | options.maxFunEvals = MAX_FUNCEVL;
 93 | options.MaxIter = MAX_ITER;
 94 | 
 95 | %%%% Initialize weights 
 96 | if(continueTrain)
 97 |     load(continue_weight_path);
 98 |     W_init_x = W_x_opt;
 99 |     W_init_z = W_z_opt;
100 |     disp(['Continue training from:',  continue_weight_path]); 
101 | else
102 |     startLoop = 1; 
103 |     W_init_x = randn(m ,d_x); 
104 |     W_init_z = randn(m ,d_z);
105 |     disp('Start from Random Initialization.')
106 | end
107 | 
108 | if(GPU_mode)
109 |     %%%% prepare gpu data for iteration:
110 |     X = gpuArray(X); 
111 |     Z = gpuArray(Z); 
112 |     Y = gpuArray(Y);
113 |     %%%% prepare gpu data for iteration: End 
114 | end
115 | 
116 | ZZ_t = Z * Z';
117 | W_x_opt = W_init_x;
118 | W_z_opt = W_init_z; 
119 | 
120 | fprintf('train_acc = %1.4f%%  \n',  100 * (1-get_error(X, W_x_opt, W_z_opt , Z, y)));
121 | trainWx_FLAG = false; 
122 | 
123 | for train_Itn = startLoop : MAX_LOOP
124 |     
125 |     t = clock; 
126 |     if(trainWx_FLAG)
127 |         fprintf('\nITER %d:  Training W_x\n', train_Itn);
128 |     else
129 |         fprintf('\nITER %d:  Training W_z\n', train_Itn);
130 |     end
131 |     
132 | 
133 |     %%%% compute the D_z and D_xz
134 |     D_xzi = zeros(d_z,d_z, num_Parts);
135 |     W_x_t = W_init_x';
136 |     for i = 1:num_Parts
137 |         W_xz = W_x_t((d_p*(i-1)+1) : d_p*(i),:) * W_init_z; 
138 |         D_xzi(:,:,i) = diag([1 ./ (2*sqrt(sum((W_xz').^2,2) + 0.0001))]); 
139 |     end
140 |     
141 |     if(GPU_mode)
142 |         %%%% prepare gpu data inside iteration:
143 |         D_xzi_cell = cell(num_Parts, 1); 
144 |         for i = 1:num_Parts
145 |             D_xzi_cell{i} = gpuArray(sparse(D_xzi(:,:,i))); 
146 |         end
147 |         if(trainWx_FLAG)
148 |             W_init_z = gpuArray(W_init_z); 
149 |         else
150 |             W_init_x = gpuArray(W_init_x);
151 |         end
152 |     else
153 |         D_xzi_cell = cell(num_Parts, 1); 
154 |         for i = 1:num_Parts
155 |             D_xzi_cell{i} = sparse(D_xzi(:,:,i)); 
156 |         end
157 |     end
158 |     
159 |     fprintf('Start training using L-BFGS ......\n')
160 |     if(trainWx_FLAG)
161 |         W_x_opt = minFunc(@ZSL_ObjFunc_Wx, reshape(W_init_x,[m*d_x, 1]), options, num_Parts, m, d_x, W_init_z, ...
162 |             X, Z, Y, ZZ_t, D_xzi_cell, lambda1, lambda2, GPU_mode);
163 |     
164 |         W_x_opt = reshape(W_x_opt, [m, d_x]);
165 |         W_z_opt = W_init_z; 
166 |         if(GPU_mode)
167 |             W_z_opt = gather(W_z_opt);
168 |         end
169 |     else
170 |         W_z_opt = minFunc(@ZSL_ObjFunc_Wz, reshape(W_init_z,[m*d_z, 1]), options, num_Parts, m, d_x, d_z, W_init_x,...
171 |             X, Z, Y, ZZ_t, D_xzi_cell, lambda1, lambda2, GPU_mode);
172 |        
173 |         W_z_opt = reshape(W_z_opt, [m, d_z]); 
174 |         W_x_opt = W_init_x;
175 |         if(GPU_mode)
176 |             W_x_opt = gather(W_x_opt);
177 |         end
178 |     end
179 |     trainWx_FLAG = ~trainWx_FLAG; %  train W_z and W_x alternatively 
180 |     
181 |     %%%% calculate each loss
182 |     parts_Regu =0;
183 |     if(lambda2)
184 |         W_x_t = W_x_opt'; 
185 |         for i = 1:num_Parts
186 |             W_xz = W_x_t((d_p*(i-1)+1) : d_p*(i),:) * W_z_opt; 
187 |             parts_Regu = parts_Regu + sum(sqrt(sum(W_xz.^2, 1))); 
188 |         end
189 |     end
190 |     
191 |     Wxt_Wz_Z =W_x_opt' * W_z_opt * Z;
192 |     
193 |     f0 = norm( (X'* Wxt_Wz_Z - Y) ,'fro')^2; 
194 |     f1 = lambda1 * norm( Wxt_Wz_Z ,'fro')^2; 
195 |     f2 = lambda2 * parts_Regu; 
196 |     f =  f0 + f1 + f2; 
197 |   
198 |     fprintf('\nTime for loop: %f seconds.\n', etime(clock,t)); 
199 |     fprintf('train_acc = %1.4f%%\n',  100 * (1-get_error(X, W_x_opt, W_z_opt , Z, y)));
200 |     fprintf('Total Loss: f = %f,  Loss_0 = %f,  Loss_1 = %f,  Loss_2 = %f \n\n', f, f0, f1,f2); 
201 |     
202 |     fid = fopen([path.repath '/results.txt'], 'a+');
203 |     fprintf(fid, 'ITER %d:     train_acc = %1.4f%%\n',  train_Itn, 100 * (1-get_error(X, W_x_opt, W_z_opt , Z, y)));
204 |     fprintf(fid, 'Total Loss: f = %f,  Loss_0 = %f,  Loss_1 = %f,  Loss_2 = %f \n\n', f, f0, f1,f2); 
205 |     fclose(fid);
206 |     
207 |     if(mod(train_Itn, 10) == 0)
208 |         Weight_Name = sprintf([path.repath '/Weight_opt_%d'], train_Itn);  
209 |         save(Weight_Name, 'W_x_opt', 'W_z_opt'); 
210 |     end
211 |     
212 |     %%%% use the current weight as initialization. 
213 |     W_init_z = W_z_opt; 
214 |     W_init_x = W_x_opt; 
215 | end
216 | 
217 | end
218 | 
219 | function err = get_error(X, W_x, W_z, Z, y)
220 | % GET_ERROR  Fraction of samples whose top-scoring class differs from y.
221 | %   X   : d_x-by-N features (one column per sample)
222 | %   W_x : m-by-d_x,  W_z : m-by-d_z projection matrices
223 | %   Z   : d_z-by-C class descriptions (one column per class)
224 | %   y   : N ground-truth class indices in 1..C (row or column)
225 | %   err : number of mismatches divided by N
226 |     pred_score = X' * W_x' * W_z * Z;          % N-by-C compatibility scores
227 |     % Take the arg-max along dimension 2 explicitly: max(pred_score') would
228 |     % collapse to a single scalar when there is only one class (row vector).
229 |     [~, pred_id] = max(pred_score, [], 2);
230 |     GT_id = y(:);                              % force column to match pred_id
231 |     err = sum(pred_id ~= GT_id) / numel(GT_id);
232 | end
233 | 
234 | 


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/WolfeLineSearch.m:
--------------------------------------------------------------------------------
  1 | function [t,f_new,g_new,funEvals,H] = WolfeLineSearch(...
  2 |     x,t,d,f,g,gtd,c1,c2,LS_interp,LS_multi,maxLS,progTol,debug,doPlot,saveHessianComp,funObj,varargin)
  3 | % Bracketing line search for a step length t satisfying the strong Wolfe conditions
  4 | % Inputs:
  5 | %   x: starting location
  6 | %   t: initial step size
  7 | %   d: descent direction
  8 | %   f: function value at starting location
  9 | %   g: gradient at starting location
 10 | %   gtd: directional derivative at starting location
 11 | %   c1: sufficient decrease parameter
 12 | %   c2: curvature parameter
 13 | %   LS_interp: type of interpolation (<=1: x10 step / bisection, 2: cubic, 3: mixed)
 14 | %   LS_multi: only forwarded to ArmijoBacktrack in the illegal-value fallback
 15 | %   maxLS: maximum number of iterations (one counter shared by bracketing and zoom)
 16 | %   progTol: zoom stops once bracket width * max(abs(d)) falls below this value
 17 | %   debug: display debugging information
 18 | %   doPlot: do a graphical display of interpolation (passed on to polyinterp)
 19 | %   saveHessianComp: if set, H is not requested inside the loops but once at the end
 20 | %   funObj: objective function, called as [f,g] = funObj(x,...) or [f,g,H] = funObj(x,...)
 21 | %   varargin: parameters of objective function
 22 | % Outputs:
 23 | %   t: step length
 24 | %   f_new: function value at x+t*d
 25 | %   g_new: gradient value at x+t*d
 26 | %   funEvals: number function evaluations performed by line search
 27 | %   H: Hessian returned by funObj at a trial point x+t*d (only computed if
 28 | %       requested, i.e. nargout == 5)
 29 | 
 30 | % Evaluate the objective and gradient (plus the Hessian when five outputs are requested) at the initial step
 31 | if nargout == 5
 32 |     [f_new,g_new,H] = funObj(x + t*d,varargin{:});
 33 | else
 34 |     [f_new,g_new] = funObj(x+t*d,varargin{:});
 35 | end
 36 | funEvals = 1;
 37 | gtd_new = g_new'*d;
 38 | 
 39 | % Bracket an Interval containing a point satisfying the
 40 | % Wolfe criteria
 41 | % (the *_prev variables hold the previous trial point, starting from t = 0)
 42 | LSiter = 0;
 43 | t_prev = 0;
 44 | f_prev = f;
 45 | g_prev = g;
 46 | gtd_prev = gtd;
 47 | nrmD = max(abs(d));
 48 | done = 0;
 49 | 
 50 | while LSiter < maxLS
 51 | 
 52 |     %% Bracketing Phase
 53 |     if ~isLegal(f_new) || ~isLegal(g_new)
 54 |         if debug
 55 |             fprintf('Extrapolated into illegal region, switching to Armijo line-search\n');
 56 |         end
 57 |         t = (t + t_prev)/2;
 58 |         % Run ArmijoBacktrack from the midpoint of the last two steps and return its result
 59 |         if nargout == 5
 60 |             [t,x_new,f_new,g_new,armijoFunEvals,H] = ArmijoBacktrack(...
 61 |                 x,t,d,f,f,g,gtd,c1,LS_interp,LS_multi,progTol,debug,doPlot,saveHessianComp,...
 62 |                 funObj,varargin{:});
 63 |         else
 64 |             [t,x_new,f_new,g_new,armijoFunEvals] = ArmijoBacktrack(...
 65 |                 x,t,d,f,f,g,gtd,c1,LS_interp,LS_multi,progTol,debug,doPlot,saveHessianComp,...
 66 |                 funObj,varargin{:});
 67 |         end
 68 |         funEvals = funEvals + armijoFunEvals;
 69 |         return;
 70 |     end
 71 | 
 72 |     % Classify the trial point: it closes a bracket, it is accepted (done), or the step is extended
 73 |     if f_new > f + c1*t*gtd || (LSiter > 1 && f_new >= f_prev)
 74 |         bracket = [t_prev t];
 75 |         bracketFval = [f_prev f_new];
 76 |         bracketGval = [g_prev g_new];
 77 |         break;
 78 |     elseif abs(gtd_new) <= -c2*gtd
 79 |         bracket = t;
 80 |         bracketFval = f_new;
 81 |         bracketGval = g_new;
 82 |         done = 1;
 83 |         break;
 84 |     elseif gtd_new >= 0
 85 |         bracket = [t_prev t];
 86 |         bracketFval = [f_prev f_new];
 87 |         bracketGval = [g_prev g_new];
 88 |         break;
 89 |     end
 90 |     temp = t_prev;
 91 |     t_prev = t;
 92 |     minStep = t + 0.01*(t-temp);
 93 |     maxStep = t*10;
 94 |     if LS_interp <= 1
 95 |         if debug
 96 |             fprintf('Extending Braket\n');
 97 |         end
 98 |         t = maxStep;
 99 |     elseif LS_interp == 2
100 |         if debug
101 |             fprintf('Cubic Extrapolation\n');
102 |         end
103 |         t = polyinterp([temp f_prev gtd_prev; t f_new gtd_new],doPlot,minStep,maxStep);
104 |     elseif LS_interp == 3
105 |         t = mixedExtrap(temp,f_prev,gtd_prev,t,f_new,gtd_new,minStep,maxStep,debug,doPlot);
106 |     end
107 |     % Shift the current point into the *_prev slots and evaluate the new trial step
108 |     f_prev = f_new;
109 |     g_prev = g_new;
110 |     gtd_prev = gtd_new;
111 |     if ~saveHessianComp && nargout == 5
112 |         [f_new,g_new,H] = funObj(x + t*d,varargin{:});
113 |     else
114 |         [f_new,g_new] = funObj(x + t*d,varargin{:});
115 |     end
116 |     funEvals = funEvals + 1;
117 |     gtd_new = g_new'*d;
118 |     LSiter = LSiter+1;
119 | end
120 | % Iteration budget used up while bracketing: fall back to the interval [0, t]
121 | if LSiter == maxLS
122 |     bracket = [0 t];
123 |     bracketFval = [f f_new];
124 |     bracketGval = [g g_new];
125 | end
126 | 
127 | %% Zoom Phase
128 | 
129 | % We now either have a point satisfying the criteria, or a bracket
130 | % surrounding a point satisfying the criteria
131 | % Refine the bracket until we find a point satisfying the criteria
132 | insufProgress = 0;
133 | Tpos = 2;
134 | LOposRemoved = 0;
135 | while ~done && LSiter < maxLS
136 | 
137 |     % Find High and Low Points in bracket (positions are 1 or 2, hence HIpos = 3 - LOpos)
138 |     [f_LO LOpos] = min(bracketFval);
139 |     HIpos = -LOpos + 3;
140 | 
141 |     % Compute new trial value
142 |     if LS_interp <= 1 || ~isLegal(bracketFval) || ~isLegal(bracketGval)
143 |         if debug
144 |             fprintf('Bisecting\n');
145 |         end
146 |         t = mean(bracket);
147 |     elseif LS_interp == 2
148 |         if debug
149 |             fprintf('Grad-Cubic Interpolation\n');
150 |         end
151 |         t = polyinterp([bracket(1) bracketFval(1) bracketGval(:,1)'*d
152 |             bracket(2) bracketFval(2) bracketGval(:,2)'*d],doPlot);
153 |     else
154 |         % Mixed Case %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
155 |         nonTpos = -Tpos+3;
156 |         if LOposRemoved == 0
157 |             oldLOval = bracket(nonTpos);
158 |             oldLOFval = bracketFval(nonTpos);
159 |             oldLOGval = bracketGval(:,nonTpos);
160 |         end
161 |         t = mixedInterp(bracket,bracketFval,bracketGval,d,Tpos,oldLOval,oldLOFval,oldLOGval,debug,doPlot);
162 |     end
163 | 
164 | 
165 |     % Test that we are making sufficient progress (t at least 10% of the bracket width away from both ends)
166 |     if min(max(bracket)-t,t-min(bracket))/(max(bracket)-min(bracket)) < 0.1
167 |         if debug
168 |             fprintf('Interpolation close to boundary');
169 |         end
170 |         if insufProgress || t>=max(bracket) || t <= min(bracket)
171 |             if debug
172 |                 fprintf(', Evaluating at 0.1 away from boundary\n');
173 |             end
174 |             if abs(t-max(bracket)) < abs(t-min(bracket))
175 |                 t = max(bracket)-0.1*(max(bracket)-min(bracket));
176 |             else
177 |                 t = min(bracket)+0.1*(max(bracket)-min(bracket));
178 |             end
179 |             insufProgress = 0;
180 |         else
181 |             if debug
182 |                 fprintf('\n');
183 |             end
184 |             insufProgress = 1;
185 |         end
186 |     else
187 |         insufProgress = 0;
188 |     end
189 | 
190 |     % Evaluate new point
191 |     if ~saveHessianComp && nargout == 5
192 |         [f_new,g_new,H] = funObj(x + t*d,varargin{:});
193 |     else
194 |         [f_new,g_new] = funObj(x + t*d,varargin{:});
195 |     end
196 |     funEvals = funEvals + 1;
197 |     gtd_new = g_new'*d;
198 |     LSiter = LSiter+1;
199 |     % Update the bracket with the new point; Tpos records which slot received it
200 | 	armijo = f_new < f + c1*t*gtd;
201 |     if ~armijo || f_new >= f_LO
202 |         % Armijo condition not satisfied or not lower than lowest
203 |         % point
204 |         bracket(HIpos) = t;
205 |         bracketFval(HIpos) = f_new;
206 |         bracketGval(:,HIpos) = g_new;
207 |         Tpos = HIpos;
208 |     else
209 |         if abs(gtd_new) <= - c2*gtd
210 |             % Wolfe conditions satisfied
211 |             done = 1;
212 |         elseif gtd_new*(bracket(HIpos)-bracket(LOpos)) >= 0
213 |             % Old HI becomes new LO
214 |             bracket(HIpos) = bracket(LOpos);
215 |             bracketFval(HIpos) = bracketFval(LOpos);
216 |             bracketGval(:,HIpos) = bracketGval(:,LOpos);
217 |             if LS_interp == 3
218 |                 if debug
219 |                     fprintf('LO Pos is being removed!\n');
220 |                 end
221 |                 LOposRemoved = 1;
222 |                 oldLOval = bracket(LOpos);
223 |                 oldLOFval = bracketFval(LOpos);
224 |                 oldLOGval = bracketGval(:,LOpos);
225 |             end
226 |         end
227 |         % New point becomes new LO
228 |         bracket(LOpos) = t;
229 |         bracketFval(LOpos) = f_new;
230 |         bracketGval(:,LOpos) = g_new;
231 |         Tpos = LOpos;
232 | 	end
233 | 
234 |     if ~done && abs(bracket(1)-bracket(2))*nrmD < progTol
235 |         if debug
236 |             fprintf('Line-search bracket has been reduced below progTol\n');
237 |         end
238 |         break;
239 |     end
240 | 
241 | end
242 | 
243 | %%
244 | if LSiter == maxLS
245 |     if debug
246 |         fprintf('Line Search Exceeded Maximum Line Search Iterations\n');
247 |     end
248 | end
249 | % Return the bracket point with the lowest function value
250 | [f_LO LOpos] = min(bracketFval);
251 | t = bracket(LOpos);
252 | f_new = bracketFval(LOpos);
253 | g_new = bracketGval(:,LOpos);
254 | 
255 | 
256 | 
257 | % Evaluate Hessian at new point (it was not requested inside the loops when saveHessianComp is set)
258 | if nargout == 5 && funEvals > 1 && saveHessianComp
259 |     [f_new,g_new,H] = funObj(x + t*d,varargin{:});
260 |     funEvals = funEvals + 1;
261 | end
262 | 
263 | end
264 | 
265 | 
266 | %%
267 | function [t] = mixedExtrap(x0,f0,g0,x1,f1,g1,minStep,maxStep,debug,doPlot);
268 | % Extrapolation step: choose between a cubic and a secant candidate, both
269 | % obtained from polyinterp with minStep/maxStep forwarded as its limits.
270 | % sqrt(-1) marks f1 as unused for the secant fit (assumed polyinterp convention).
271 | cubicStep = polyinterp([x0 f0 g0; x1 f1 g1],doPlot,minStep,maxStep);
272 | secantStep = polyinterp([x0 f0 g0; x1 sqrt(-1) g1],doPlot,minStep,maxStep);
273 | cubicIsCloser = abs(cubicStep - x1) < abs(secantStep - x1);
274 | if ~(cubicStep > minStep && cubicIsCloser)
275 |     t = secantStep;
276 |     if debug, fprintf('Secant Extrapolation\n'); end
277 | else
278 |     t = cubicStep;
279 |     if debug, fprintf('Cubic Extrapolation\n'); end
280 | end
281 | end
282 | 
283 | %%
284 | function [t] = mixedInterp(bracket,bracketFval,bracketGval,d,Tpos,oldLOval,oldLOFval,oldLOGval,debug,doPlot);
285 | % Safeguarded trial step for the zoom phase when LS_interp selects the mixed
286 | % strategy.  Three points take part: the bracket end that was updated last
287 | % (index Tpos), the other bracket end, and the remembered low point
288 | % (oldLOval, oldLOFval, oldLOGval).  Gradients are reduced to directional
289 | % derivatives along d, and every candidate step comes from polyinterp;
290 | % sqrt(-1) entries mark a value as not supplied (assumed polyinterp convention).
291 | 
292 | otherPos = 3 - Tpos;
293 | tT = bracket(Tpos);
294 | fT = bracketFval(Tpos);
295 | slopeT = bracketGval(:,Tpos)'*d;
296 | slopeOther = bracketGval(:,otherPos)'*d;
297 | slopeLO = oldLOGval'*d;
298 | loRow = [oldLOval oldLOFval slopeLO];
299 | 
300 | if fT > oldLOFval
301 |     % Newest point is higher than the old low point.
302 |     stepCubic = polyinterp([loRow; tT fT slopeT],doPlot);
303 |     stepQuad = polyinterp([loRow; tT fT sqrt(-1)],doPlot);
304 |     if abs(stepCubic - oldLOval) < abs(stepQuad - oldLOval)
305 |         if debug
306 |             fprintf('Cubic Interpolation\n');
307 |         end
308 |         t = stepCubic;
309 |     else
310 |         if debug
311 |             fprintf('Mixed Quad/Cubic Interpolation\n');
312 |         end
313 |         t = (stepQuad + stepCubic)/2;
314 |     end
315 | elseif slopeT'*slopeLO < 0
316 |     % Directional derivatives have opposite signs.
317 |     stepCubic = polyinterp([loRow; tT fT slopeT],doPlot);
318 |     stepSecant = polyinterp([loRow; tT sqrt(-1) slopeT],doPlot);
319 |     if abs(stepCubic - tT) >= abs(stepSecant - tT)
320 |         if debug
321 |             fprintf('Cubic Interpolation\n');
322 |         end
323 |         t = stepCubic;
324 |     else
325 |         if debug
326 |             fprintf('Quad Interpolation\n');
327 |         end
328 |         t = stepSecant;
329 |     end
330 | elseif abs(slopeT) <= abs(slopeLO)
331 |     % Slope magnitude did not grow: extrapolate within the bracket limits.
332 |     lo = min(bracket);
333 |     hi = max(bracket);
334 |     stepCubic = polyinterp([loRow; tT fT slopeT],doPlot,lo,hi);
335 |     stepSecant = polyinterp([oldLOval sqrt(-1) slopeLO; tT fT slopeT],doPlot,lo,hi);
336 |     cubicInside = stepCubic > lo && stepCubic < hi;
337 |     if cubicInside && abs(stepCubic - tT) < abs(stepSecant - tT)
338 |         if debug
339 |             fprintf('Bounded Cubic Extrapolation\n');
340 |         end
341 |         t = stepCubic;
342 |     else
343 |         if debug
344 |             fprintf('Bounded Secant Extrapolation\n');
345 |         end
346 |         t = stepSecant;
347 |     end
348 |     % Limit t using the point 0.66 of the way from bracket(Tpos) to the other end.
349 |     cap = tT + 0.66*(bracket(otherPos) - tT);
350 |     if tT > oldLOval
351 |         t = min(cap,t);
352 |     else
353 |         t = max(cap,t);
354 |     end
355 | else
356 |     % Remaining case: interpolate through the two bracket ends.
357 |     t = polyinterp([bracket(otherPos) bracketFval(otherPos) slopeOther; tT fT slopeT],doPlot);
358 | end
359 | end


--------------------------------------------------------------------------------
/minFunc_2012/minFunc/minFunc.m:
--------------------------------------------------------------------------------
   1 | function [x,f,exitflag,output] = minFunc(funObj,x0,options,varargin)
   2 | % [x,f,exitflag,output] = minFunc(funObj,x0,options,varargin)
   3 | %
   4 | % Unconstrained optimizer using a line search strategy
   5 | %
   6 | % Uses an interface very similar to fminunc
   7 | %   (it doesn't support all of the optimization toolbox options,
   8 | %       but supports many other options).
   9 | %
  10 | % It computes descent directions using one of ('Method'):
  11 | %   - 'sd': Steepest Descent
  12 | %       (no previous information used, not recommended)
  13 | %   - 'csd': Cyclic Steepest Descent
  14 | %       (uses previous step length for a fixed length cycle)
  15 | %   - 'bb': Barzilai and Borwein Gradient
  16 | %       (uses only previous step)
  17 | %   - 'cg': Non-Linear Conjugate Gradient
  18 | %       (uses only previous step and a vector beta)
  19 | %   - 'scg': Scaled Non-Linear Conjugate Gradient
  20 | %       (uses previous step and a vector beta, 
  21 | %           and Hessian-vector products to initialize line search)
  22 | %   - 'pcg': Preconditioned Non-Linear Conjugate Gradient
  23 | %       (uses only previous step and a vector beta, preconditioned version)
  24 | %   - 'lbfgs': Quasi-Newton with Limited-Memory BFGS Updating
  25 | %       (default: uses a predetermined number of previous steps to form a 
  26 | %           low-rank Hessian approximation)
  27 | %   - 'newton0': Hessian-Free Newton
  28 | %       (numerically computes Hessian-Vector products)
  29 | %   - 'pnewton0': Preconditioned Hessian-Free Newton 
  30 | %       (numerically computes Hessian-Vector products, preconditioned
  31 | %       version)
  32 | %   - 'qnewton': Quasi-Newton Hessian approximation
  33 | %       (uses dense Hessian approximation)
  34 | %   - 'mnewton': Newton's method with Hessian calculation after every
  35 | %   user-specified number of iterations
  36 | %       (needs user-supplied Hessian matrix)
  37 | %   - 'newton': Newton's method with Hessian calculation every iteration
  38 | %       (needs user-supplied Hessian matrix)
  39 | %   - 'tensor': Tensor
  40 | %       (needs user-supplied Hessian matrix and Tensor of 3rd partial derivatives)
  41 | %
  42 | % Several line search strategies are available for finding a step length satisfying
  43 | %   the termination criteria ('LS_type')
  44 | %   - 0 : A backtracking line-search based on the Armijo condition (default for 'bb')
  45 | %   - 1 : A bracketing line-search based on the strong Wolfe conditions (default for all other methods)
  46 | %   - 2 : The line-search from the Matlab Optimization Toolbox (requires Matlab's linesearch.m to be added to the path)
  47 | %
  48 | % For the Armijo line-search, several interpolation strategies are available ('LS_interp'):
  49 | %   - 0 : Step size halving
  50 | %   - 1 : Polynomial interpolation using new function values
  51 | %   - 2 : Polynomial interpolation using new function and gradient values (default)
  52 | %
  53 | % When (LS_interp = 1), the default setting of (LS_multi = 0) uses quadratic interpolation,
  54 | % while if (LS_multi = 1) it uses cubic interpolation if more than one point are available.
  55 | %
  56 | % When (LS_interp = 2), the default setting of (LS_multi = 0) uses cubic interpolation,
  57 | % while if (LS_multi = 1) it uses quartic or quintic interpolation if more than one point are available
  58 | %
  59 | % To use the non-monotonic Armijo condition, set the 'Fref' value to the number of previous function values to store
  60 | %
  61 | % For the Wolfe line-search, these interpolation strategies are available ('LS_interp'):
  62 | %   - 0 : Step Size Doubling and Bisection
  63 | %   - 1 : Cubic interpolation/extrapolation using new function and gradient values (default)
  64 | %   - 2 : Mixed quadratic/cubic interpolation/extrapolation
  65 | %
  66 | % Several strategies for choosing the initial step size are available ('LS_init'):
  67 | %   - 0: Always try an initial step length of 1 (default for all except 'sd' and 'cg')
  68 | %       (t = 1)
  69 | %   - 1: Use a step similar to the previous step
  70 | %       (t = t_old*min(2,g'd/g_old'd_old))
  71 | %   - 2: Quadratic Initialization using previous function value and new
  72 | %   function value/gradient (use this if steps tend to be very long, default for 'sd' and 'cg')
  73 | %       (t = min(1,2*(f-f_old)/g))
  74 | %   - 3: The minimum between 1 and twice the previous step length
  75 | %       (t = min(1,2*t))
  76 | %   - 4: The scaled conjugate gradient step length (may accelerate
  77 | %   conjugate gradient methods, but requires a Hessian-vector product, default for 'scg')
  78 | %       (t = g'd/d'Hd)
  79 | %
  80 | % Inputs:
  81 | %   funObj - is a function handle
  82 | %   x0 - is a starting vector;
  83 | %   options - is a struct containing parameters (defaults are used for non-existent or blank fields)
  84 | %   varargin{:} - all other arguments are passed as additional arguments to funObj
  85 | %
  86 | % Outputs:
  87 | %   x is the minimum value found
  88 | %   f is the function value at the minimum found
  89 | %   exitflag returns an exit condition
  90 | %   output returns a structure with other information
  91 | %
  92 | % Supported Input Options
  93 | %   Display - Level of display [ off | final | (iter) | full | excessive ]
  94 | %   MaxFunEvals - Maximum number of function evaluations allowed (1000)
  95 | %   MaxIter - Maximum number of iterations allowed (500)
  96 | %   optTol - Termination tolerance on the first-order optimality (1e-5)
  97 | %   progTol - Termination tolerance on progress in terms of function/parameter changes (1e-9)
  98 | %   Method - [ sd | csd | bb | cg | scg | pcg | {lbfgs} | newton0 | pnewton0 |
  99 | %       qnewton | mnewton | newton | tensor ]
 100 | %   c1 - Sufficient Decrease for Armijo condition (1e-4)
 101 | %   c2 - Curvature Decrease for Wolfe conditions (.2 for cg methods, .9 otherwise)
 102 | %   LS_init - Line Search Initialization - see above (2 for cg/sd, 4 for scg, 0 otherwise)
 103 | %   LS - Line Search type - see above (2 for bb, 4 otherwise)
 104 | %   Fref - Setting this to a positive integer greater than 1
 105 | %       will use non-monotone Armijo objective in the line search.
 106 | %       (20 for bb, 10 for csd, 1 for all others)
 107 | %   numDiff - [ 0 | 1 | 2] compute derivatives using user-supplied function (0),
 108 | %       numerically user forward-differencing (1), or numerically using central-differencing (2)
 109 | %       (default: 0) 
 110 | %       (this option has a different effect for 'newton', see below)
 111 | %   useComplex - if 1, use complex differentials if computing numerical derivatives
 112 | %       to get very accurate values (default: 0)
 113 | %   DerivativeCheck - if 'on', computes derivatives numerically at initial
 114 | %       point and compares to user-supplied derivative (default: 'off')
 115 | %   outputFcn - function to run after each iteration (default: []).  It
 116 | %       should have the following interface:
 117 | %       outputFcn(x,iterationType,i,funEvals,f,t,gtd,g,d,optCond,varargin{:});
 118 | %   useMex - where applicable, use mex files to speed things up (default: 1)
 119 | %
 120 | % Method-specific input options:
 121 | %   newton:
 122 | %       HessianModify - type of Hessian modification for direct solvers to
 123 | %       use if the Hessian is not positive definite (default: 0)
 124 | %           0: Minimum Euclidean norm s.t. eigenvalues sufficiently large
 125 | %           (requires eigenvalues on iterations where matrix is not pd)
 126 | %           1: Start with (1/2)*||A||_F and increment until Cholesky succeeds
 127 | %           (an approximation to method 0, does not require eigenvalues)
 128 | %           2: Modified LDL factorization
 129 | %           (only 1 generalized Cholesky factorization done and no eigenvalues required)
 130 | %           3: Modified Spectral Decomposition
 131 | %           (requires eigenvalues)
 132 | %           4: Modified Symmetric Indefinite Factorization
 133 | %           5: Uses the eigenvector of the smallest eigenvalue as negative
 134 | %           curvature direction
 135 | %       cgSolve - use conjugate gradient instead of direct solver (default: 0)
 136 | %           0: Direct Solver
 137 | %           1: Conjugate Gradient
 138 | %           2: Conjugate Gradient with Diagonal Preconditioner
 139 | %           3: Conjugate Gradient with LBFGS Preconditioner
 140 | %           x: Conjugate Gradient with Symmetric Successive Over Relaxation
 141 | %           Preconditioner with parameter x
 142 | %               (where x is a real number in the range [0,2])
 143 | %           x: Conjugate Gradient with Incomplete Cholesky Preconditioner
 144 | %           with drop tolerance -x
 145 | %               (where x is a real negative number)
 146 | %       numDiff - compute Hessian numerically
 147 | %                 (default: 0, done with complex differentials if useComplex = 1)
 148 | %       LS_saveHessiancomp - when on, only computes the Hessian at the
 149 | %       first and last iteration of the line search (default: 1)
 150 | %   mnewton:
 151 | %       HessianIter - number of iterations to use same Hessian (default: 5)
 152 | %   qnewton:
 153 | %       initialHessType - scale initial Hessian approximation (default: 1)
 154 | %       qnUpdate - type of quasi-Newton update (default: 3):
 155 | %           0: BFGS
 156 | %           1: SR1 (when it is positive-definite, otherwise BFGS)
 157 | %           2: Hoshino
 158 | %           3: Self-Scaling BFGS
 159 | %           4: Oren's Self-Scaling Variable Metric method 
 160 | %           5: McCormick-Huang asymmetric update
 161 | %       Damped - use damped BFGS update (default: 1)
 162 | %   newton0/pnewton0:
 163 | %       HvFunc - user-supplied function that returns Hessian-vector products
 164 | %           (by default, these are computed numerically using autoHv)
 165 | %           HvFunc should have the following interface: HvFunc(v,x,varargin{:})
 166 | %       useComplex - use a complex perturbation to get high accuracy
 167 | %           Hessian-vector products (default: 0)
 168 | %           (the increased accuracy can make the method much more efficient,
 169 | %               but gradient code must properly support complex inputs)
 170 | %       useNegCurv - a negative curvature direction is used as the descent
 171 | %           direction if one is encountered during the cg iterations
 172 | %           (default: 1)
 173 | %       precFunc (for pnewton0 only) - user-supplied preconditioner
 174 | %           (by default, an L-BFGS preconditioner is used)
 175 | %           precFunc should have the following interface:
 176 | %           precFunc(v,x,varargin{:})
 177 | %   lbfgs:
 178 | %       Corr - number of corrections to store in memory (default: 100)
 179 | %           (higher numbers converge faster but use more memory)
 180 | %       Damped - use damped update (default: 0)
 181 | %   cg/scg/pcg:
 182 | %       cgUpdate - type of update (default for cg/scg: 2, default for pcg: 1)
 183 | %           0: Fletcher Reeves
 184 | %           1: Polak-Ribiere
 185 | %           2: Hestenes-Stiefel (not supported for pcg)
 186 | %           3: Gilbert-Nocedal
 187 | %       HvFunc (for scg only)- user-supplied function that returns Hessian-vector 
 188 | %           products
 189 | %           (by default, these are computed numerically using autoHv)
 190 | %           HvFunc should have the following interface:
 191 | %           HvFunc(v,x,varargin{:})
 192 | %       precFunc (for pcg only) - user-supplied preconditioner
 193 | %           (by default, an L-BFGS preconditioner is used)
 194 | %           precFunc should have the following interface:
 195 | %           precFunc(v,x,varargin{:})
 196 | %   bb:
 197 | %       bbType - type of bb step (default: 0)
 198 | %           0: min_alpha ||delta_x - alpha delta_g||_2
 199 | %           1: min_alpha ||alpha delta_x - delta_g||_2
 200 | %           2: Conic BB
 201 | %           3: Gradient method with retards
 202 | %   csd:
 203 | %       cycle - length of cycle (default: 3)
 204 | %
 205 | % Supported Output Options
 206 | %   iterations - number of iterations taken
 207 | %   funcCount - number of function evaluations
 208 | %   algorithm - algorithm used
 209 | %   firstorderopt - first-order optimality
 210 | %   message - exit message
 211 | %   trace.funccount - function evaluations after each iteration
 212 | %   trace.fval - function value after each iteration
 213 | %
 214 | % Author: Mark Schmidt (2005)
 215 | % Web: http://www.di.ens.fr/~mschmidt/Software/minFunc.html
 216 | %
 217 | % Sources (in order of how much the source material contributes):
 218 | %   J. Nocedal and S.J. Wright.  1999.  "Numerical Optimization".  Springer Verlag.
 219 | %   R. Fletcher.  1987.  "Practical Methods of Optimization".  Wiley.
 220 | %   J. Demmel.  1997.  "Applied Linear Algebra.  SIAM.
 221 | %   R. Barret, M. Berry, T. Chan, J. Demmel, J. Dongarra, V. Eijkhout, R.
 222 | %   Pozo, C. Romine, and H. Van der Vost.  1994.  "Templates for the Solution of
 223 | %   Linear Systems: Building Blocks for Iterative Methods".  SIAM.
 224 | %   J. More and D. Thuente.  "Line search algorithms with guaranteed
 225 | %   sufficient decrease".  ACM Trans. Math. Softw. vol 20, 286-307, 1994.
 226 | %   M. Raydan.  "The Barzilai and Borwein gradient method for the large
 227 | %   scale unconstrained minimization problem".  SIAM J. Optim., 7, 26-33,
 228 | %   (1997).
 229 | %   "Mathematical Optimization".  The Computational Science Education
 230 | %   Project.  1995.
 231 | %   C. Kelley.  1999.  "Iterative Methods for Optimization".  Frontiers in
 232 | %   Applied Mathematics.  SIAM.
 233 | 
 234 | if nargin < 3
 235 |     options = [];
 236 | end
 237 | 
 238 | % Get Parameters
 239 | [verbose,verboseI,debug,doPlot,maxFunEvals,maxIter,optTol,progTol,method,...
 240 |     corrections,c1,c2,LS_init,cgSolve,qnUpdate,cgUpdate,initialHessType,...
 241 |     HessianModify,Fref,useComplex,numDiff,LS_saveHessianComp,...
 242 |     Damped,HvFunc,bbType,cycle,...
 243 |     HessianIter,outputFcn,useMex,useNegCurv,precFunc,...
 244 |     LS_type,LS_interp,LS_multi,checkGrad] = ...
 245 |     minFunc_processInputOptions(options);
 246 | 
 247 | % Constants
 248 | SD = 0;
 249 | CSD = 1;
 250 | BB = 2;
 251 | CG = 3;
 252 | PCG = 4;
 253 | LBFGS = 5;
 254 | QNEWTON = 6;
 255 | NEWTON0 = 7;
 256 | NEWTON = 8;
 257 | TENSOR = 9;
 258 | 
 259 | % Initialize
 260 | p = length(x0);
 261 | d = zeros(p,1);
 262 | x = x0;
 263 | t = 1;
 264 | 
 265 | % If necessary, form numerical differentiation functions
 266 | funEvalMultiplier = 1;
 267 | if useComplex
 268 | 	numDiffType = 3;
 269 | else
 270 | 	numDiffType = numDiff;
 271 | end
 272 | if numDiff && method ~= TENSOR
 273 |     varargin(3:end+2) = varargin(1:end);
 274 | 	varargin{1} = numDiffType;
 275 | 	varargin{2} = funObj;
 276 |     if method ~= NEWTON
 277 |         if debug
 278 |             if useComplex
 279 |                 fprintf('Using complex differentials for gradient computation\n');
 280 | 			else
 281 |                 fprintf('Using finite differences for gradient computation\n');
 282 |             end
 283 |         end
 284 |         funObj = @autoGrad;
 285 |     else
 286 |         if debug
 287 |             if useComplex
 288 |                 fprintf('Using complex differentials for Hessian computation\n');
 289 |             else
 290 |                 fprintf('Using finite differences for Hessian computation\n');
 291 |             end
 292 |         end
 293 |         funObj = @autoHess;
 294 |     end
 295 | 
 296 |     if method == NEWTON0 && useComplex == 1
 297 |         if debug
 298 |             fprintf('Turning off the use of complex differentials for Hessian-vector products\n');
 299 |         end
 300 |         useComplex = 0;
 301 |     end
 302 | 
 303 |     if useComplex
 304 |         funEvalMultiplier = p;
 305 | 	elseif numDiff == 2
 306 | 		funEvalMultiplier = 2*p;
 307 | 	else
 308 |         funEvalMultiplier = p+1;
 309 |     end
 310 | end
 311 | 
 312 | % Evaluate Initial Point
 313 | if method < NEWTON
 314 |     [f,g] = funObj(x,varargin{:});
 315 |     computeHessian = 0;
 316 | else
 317 |     [f,g,H] = funObj(x,varargin{:});
 318 |     computeHessian = 1;
 319 | end
 320 | funEvals = 1;
 321 | 
 322 | % Derivative Check
 323 | if checkGrad
 324 | 	if numDiff
 325 | 		fprintf('Can not do derivative checking when numDiff is 1\n');
 326 | 		pause
 327 | 	end
 328 | 	derivativeCheck(funObj,x,1,numDiffType,varargin{:}); % Checks gradient
 329 | 	if computeHessian
 330 | 		derivativeCheck(funObj,x,2,numDiffType,varargin{:});
 331 | 	end
 332 | end
 333 | 
 334 | % Output Log
 335 | if verboseI
 336 |     fprintf('%10s %10s %15s %15s %15s\n','Iteration','FunEvals','Step Length','Function Val','Opt Cond');
 337 | end
 338 | 
 339 | % Compute optimality of initial point
 340 | optCond = max(abs(g));
 341 | 
 342 | if nargout > 3
 343 | 	% Initialize Trace
 344 | 	trace.fval = f;
 345 | 	trace.funcCount = funEvals;
 346 | 	trace.optCond = optCond;
 347 | end
 348 | 
 349 | % Exit if initial point is optimal
 350 | if optCond <= optTol
 351 |     exitflag=1;
 352 |     msg = 'Optimality Condition below optTol';
 353 |     if verbose
 354 |         fprintf('%s\n',msg);
 355 |     end
 356 |     if nargout > 3
 357 |         output = struct('iterations',0,'funcCount',1,...
 358 |             'algorithm',method,'firstorderopt',max(abs(g)),'message',msg,'trace',trace);
 359 |     end
 360 |     return;
 361 | end
 362 | 
 363 | % Output Function
 364 | if ~isempty(outputFcn)
 365 |     stop = outputFcn(x,'init',0,funEvals,f,[],[],g,[],max(abs(g)),varargin{:});
 366 | 	if stop
 367 | 		exitflag=-1;
 368 | 		msg = 'Stopped by output function';
 369 | 		if verbose
 370 | 			fprintf('%s\n',msg);
 371 | 		end
 372 | 		if nargout > 3
 373 | 			output = struct('iterations',0,'funcCount',1,...
 374 | 				'algorithm',method,'firstorderopt',max(abs(g)),'message',msg,'trace',trace);
 375 | 		end
 376 | 		return;
 377 | 	end
 378 | end
 379 | 
 380 | % Perform up to a maximum of 'maxIter' descent steps:
 381 | for i = 1:maxIter
 382 | 
 383 |     % ****************** COMPUTE DESCENT DIRECTION *****************
 384 | 
 385 |     switch method
 386 |         case SD % Steepest Descent
 387 |             d = -g;
 388 | 
 389 |         case CSD % Cyclic Steepest Descent
 390 | 
 391 |             if mod(i,cycle) == 1 % Use Steepest Descent
 392 |                 alpha = 1;
 393 |                 LS_init = 2;
 394 |                 LS_type = 1; % Wolfe line search
 395 |             elseif mod(i,cycle) == mod(1+1,cycle) % Use Previous Step
 396 |                 alpha = t;
 397 |                 LS_init = 0;
 398 |                 LS_type = 0; % Armijo line search
 399 |             end
 400 |             d = -alpha*g;
 401 | 
 402 |         case BB % Steepest Descent with Barzilai and Borwein Step Length
 403 | 
 404 |             if i == 1
 405 |                 d = -g;
 406 |             else
 407 |                 y = g-g_old;
 408 |                 s = t*d;
 409 |                 if bbType == 0
 410 |                     yy = y'*y;
 411 |                     alpha = (s'*y)/(yy);
 412 |                     if alpha <= 1e-10 || alpha > 1e10
 413 |                         alpha = 1;
 414 |                     end
 415 |                 elseif bbType == 1
 416 |                     sy = s'*y;
 417 |                     alpha = (s'*s)/sy;
 418 |                     if alpha <= 1e-10 || alpha > 1e10
 419 |                         alpha = 1;
 420 |                     end
 421 |                 elseif bbType == 2 % Conic Interpolation ('Modified BB')
 422 |                     sy = s'*y;
 423 |                     ss = s'*s;
 424 |                     alpha = ss/sy;
 425 |                     if alpha <= 1e-10 || alpha > 1e10
 426 |                         alpha = 1;
 427 |                     end
 428 |                     alphaConic = ss/(6*(myF_old - f) + 4*g'*s + 2*g_old'*s);
 429 |                     if alphaConic > .001*alpha && alphaConic < 1000*alpha
 430 |                         alpha = alphaConic;
 431 |                     end
 432 |                 elseif bbType == 3 % Gradient Method with retards (bb type 1, random selection of previous step)
 433 |                     sy = s'*y;
 434 |                     alpha = (s'*s)/sy;
 435 |                     if alpha <= 1e-10 || alpha > 1e10
 436 |                         alpha = 1;
 437 |                     end
 438 |                     v(1+mod(i-2,5)) = alpha;
 439 |                     alpha = v(ceil(rand*length(v)));
 440 |                 end
 441 |                 d = -alpha*g;
 442 |             end
 443 |             g_old = g;
 444 |             myF_old = f;
 445 | 
 446 | 
 447 |         case CG % Non-Linear Conjugate Gradient
 448 | 
 449 |             if i == 1
 450 |                 d = -g; % Initially use steepest descent direction
 451 |             else
 452 |                 gotgo = g_old'*g_old;
 453 | 
 454 |                 if cgUpdate == 0
 455 |                     % Fletcher-Reeves
 456 |                     beta = (g'*g)/(gotgo);
 457 |                 elseif cgUpdate == 1
 458 |                     % Polak-Ribiere
 459 |                     beta = (g'*(g-g_old)) /(gotgo);
 460 |                 elseif cgUpdate == 2
 461 |                     % Hestenes-Stiefel
 462 |                     beta = (g'*(g-g_old))/((g-g_old)'*d);
 463 |                 else
 464 |                     % Gilbert-Nocedal
 465 |                     beta_FR = (g'*(g-g_old)) /(gotgo);
 466 |                     beta_PR = (g'*g-g'*g_old)/(gotgo);
 467 |                     beta = max(-beta_FR,min(beta_PR,beta_FR));
 468 |                 end
 469 | 
 470 |                 d = -g + beta*d;
 471 | 
 472 |                 % Restart if not a direction of sufficient descent
 473 |                 if g'*d > -progTol
 474 |                     if debug
 475 |                         fprintf('Restarting CG\n');
 476 |                     end
 477 |                     beta = 0;
 478 |                     d = -g;
 479 |                 end
 480 | 
 481 |                 % Old restart rule:
 482 |                 %if beta < 0 || abs(gtgo)/(gotgo) >= 0.1 || g'*d >= 0
 483 | 
 484 |             end
 485 |             g_old = g;
 486 | 
 487 |         case PCG % Preconditioned Non-Linear Conjugate Gradient
 488 | 
 489 | 			% Apply preconditioner to negative gradient
 490 | 			if isempty(precFunc)
 491 | 				% Use L-BFGS Preconditioner
 492 | 				if i == 1
 493 | 					S = zeros(p,corrections);
 494 | 					Y = zeros(p,corrections);
 495 | 					YS = zeros(corrections,1);
 496 | 					lbfgs_start = 1;
 497 | 					lbfgs_end = 0;
 498 | 					Hdiag = 1;
 499 | 					s = -g;
 500 | 				else
 501 | 					[S,Y,YS,lbfgs_start,lbfgs_end,Hdiag,skipped] = lbfgsAdd(g-g_old,t*d,S,Y,YS,lbfgs_start,lbfgs_end,Hdiag,useMex);
 502 | 					if debug && skipped
 503 | 						fprintf('Skipped L-BFGS updated\n');
 504 | 					end
 505 | 					if useMex
 506 | 						s = lbfgsProdC(g,S,Y,YS,int32(lbfgs_start),int32(lbfgs_end),Hdiag);
 507 | 					else
 508 | 						s = lbfgsProd(g,S,Y,YS,lbfgs_start,lbfgs_end,Hdiag);
 509 | 					end
 510 | 				end
 511 | 			else % User-supplied preconditioner
 512 | 				s = precFunc(-g,x,varargin{:});
 513 | 			end
 514 | 			
 515 | 			if i == 1
 516 | 				d = s;
 517 | 			else
 518 | 				
 519 | 				if cgUpdate == 0
 520 | 					% Preconditioned Fletcher-Reeves
 521 | 					beta = (g'*s)/(g_old'*s_old);
 522 | 				elseif cgUpdate < 3
 523 | 					% Preconditioned Polak-Ribiere
 524 | 					beta = (g'*(s-s_old))/(g_old'*s_old);
 525 | 				else
 526 |                     % Preconditioned Gilbert-Nocedal
 527 |                     beta_FR = (g'*s)/(g_old'*s_old);
 528 |                     beta_PR = (g'*(s-s_old))/(g_old'*s_old);
 529 |                     beta = max(-beta_FR,min(beta_PR,beta_FR));
 530 |                 end
 531 |                 d = s + beta*d;
 532 | 
 533 |                 if g'*d > -progTol
 534 |                     if debug
 535 |                         fprintf('Restarting CG\n');
 536 |                     end
 537 |                     beta = 0;
 538 |                     d = s;
 539 |                 end
 540 | 
 541 |             end
 542 |             g_old = g;
 543 |             s_old = s;
 544 |         case LBFGS % L-BFGS
 545 | 
 546 |             % Update the direction and step sizes
 547 | 			if Damped
 548 | 				if i == 1
 549 | 					d = -g; % Initially use steepest descent direction
 550 | 					old_dirs = zeros(length(g),0);
 551 | 					old_stps = zeros(length(d),0);
 552 | 					Hdiag = 1;
 553 | 				else
 554 | 					[old_dirs,old_stps,Hdiag] = dampedUpdate(g-g_old,t*d,corrections,debug,old_dirs,old_stps,Hdiag);
 555 | 					if useMex
 556 | 						d = lbfgsC(-g,old_dirs,old_stps,Hdiag);
 557 | 					else
 558 | 						d = lbfgs(-g,old_dirs,old_stps,Hdiag);
 559 | 					end
 560 | 				end
 561 | 			else
 562 | 				if i == 1
 563 | 					d = -g; % Initially use steepest descent direction
 564 | 					S = zeros(p,corrections);
 565 | 					Y = zeros(p,corrections);
 566 | 					YS = zeros(corrections,1);
 567 | 					lbfgs_start = 1;
 568 | 					lbfgs_end = 0;
 569 | 					Hdiag = 1;
 570 | 				else
 571 | 					[S,Y,YS,lbfgs_start,lbfgs_end,Hdiag,skipped] = lbfgsAdd(g-g_old,t*d,S,Y,YS,lbfgs_start,lbfgs_end,Hdiag,useMex);
 572 | 					if debug && skipped
 573 | 						fprintf('Skipped L-BFGS updated\n');
 574 | 					end
 575 | 					if useMex
 576 | 						d = lbfgsProdC(g,S,Y,YS,int32(lbfgs_start),int32(lbfgs_end),Hdiag);
 577 | 					else
 578 | 						d = lbfgsProd(g,S,Y,YS,lbfgs_start,lbfgs_end,Hdiag);
 579 | 					end
 580 | 				end
 581 | 			end
 582 | 			g_old = g;
 583 | 
 584 |         case QNEWTON % Use quasi-Newton Hessian approximation
 585 | 
 586 |             if i == 1
 587 |                 d = -g;
 588 |             else
 589 |                 % Compute difference vectors
 590 |                 y = g-g_old;
 591 |                 s = t*d;
 592 | 
 593 |                 if i == 2
 594 |                     % Make initial Hessian approximation
 595 |                     if initialHessType == 0
 596 |                         % Identity
 597 |                         if qnUpdate <= 1
 598 |                             R = eye(length(g));
 599 |                         else
 600 |                             H = eye(length(g));
 601 |                         end
 602 |                     else
 603 |                         % Scaled Identity
 604 |                         if debug
 605 |                             fprintf('Scaling Initial Hessian Approximation\n');
 606 |                         end
 607 |                         if qnUpdate <= 1
 608 |                             % Use Cholesky of Hessian approximation
 609 |                             R = sqrt((y'*y)/(y'*s))*eye(length(g));
 610 |                         else
 611 |                             % Use Inverse of Hessian approximation
 612 |                             H = eye(length(g))*(y'*s)/(y'*y);
 613 |                         end
 614 |                     end
 615 |                 end
 616 | 
 617 |                 if qnUpdate == 0 % Use BFGS updates
 618 |                     Bs = R'*(R*s);
 619 |                     if Damped
 620 |                         eta = .02;
 621 |                         if y'*s < eta*s'*Bs
 622 |                             if debug
 623 |                                 fprintf('Damped Update\n');
 624 |                             end
 625 |                             theta = min(max(0,((1-eta)*s'*Bs)/(s'*Bs - y'*s)),1);
 626 |                             y = theta*y + (1-theta)*Bs;
 627 |                         end
 628 |                         R = cholupdate(cholupdate(R,y/sqrt(y'*s)),Bs/sqrt(s'*Bs),'-');
 629 |                     else
 630 |                         if y'*s > 1e-10
 631 |                             R = cholupdate(cholupdate(R,y/sqrt(y'*s)),Bs/sqrt(s'*Bs),'-');
 632 |                         else
 633 |                             if debug
 634 |                                 fprintf('Skipping Update\n');
 635 |                             end
 636 |                         end
 637 |                     end
 638 |                 elseif qnUpdate == 1 % Perform SR1 Update if it maintains positive-definiteness
 639 | 
 640 |                     Bs = R'*(R*s);
 641 |                     ymBs = y-Bs;
 642 |                     if abs(s'*ymBs) >= norm(s)*norm(ymBs)*1e-8 && (s-((R\(R'\y))))'*y > 1e-10
 643 |                         R = cholupdate(R,-ymBs/sqrt(ymBs'*s),'-');
 644 |                     else
 645 |                         if debug
 646 |                             fprintf('SR1 not positive-definite, doing BFGS Update\n');
 647 |                         end
 648 |                         if Damped
 649 |                             eta = .02;
 650 |                             if y'*s < eta*s'*Bs
 651 |                                 if debug
 652 |                                     fprintf('Damped Update\n');
 653 |                                 end
 654 |                                 theta = min(max(0,((1-eta)*s'*Bs)/(s'*Bs - y'*s)),1);
 655 |                                 y = theta*y + (1-theta)*Bs;
 656 |                             end
 657 |                             R = cholupdate(cholupdate(R,y/sqrt(y'*s)),Bs/sqrt(s'*Bs),'-');
 658 |                         else
 659 |                             if y'*s > 1e-10
 660 |                                 R = cholupdate(cholupdate(R,y/sqrt(y'*s)),Bs/sqrt(s'*Bs),'-');
 661 |                             else
 662 |                                 if debug
 663 |                                     fprintf('Skipping Update\n');
 664 |                                 end
 665 |                             end
 666 |                         end
 667 |                     end
 668 |                 elseif qnUpdate == 2 % Use Hoshino update
 669 |                     v = sqrt(y'*H*y)*(s/(s'*y) - (H*y)/(y'*H*y));
 670 |                     phi = 1/(1 + (y'*H*y)/(s'*y));
 671 |                     H = H + (s*s')/(s'*y) - (H*y*y'*H)/(y'*H*y) + phi*v*v';
 672 | 
 673 |                 elseif qnUpdate == 3 % Self-Scaling BFGS update
 674 |                     ys = y'*s;
 675 |                     Hy = H*y;
 676 |                     yHy = y'*Hy;
 677 |                     gamma = ys/yHy;
 678 |                     v = sqrt(yHy)*(s/ys - Hy/yHy);
 679 |                     H = gamma*(H - Hy*Hy'/yHy + v*v') + (s*s')/ys;
 680 |                 elseif qnUpdate == 4 % Oren's Self-Scaling Variable Metric update
 681 | 
 682 |                     % Oren's method
 683 |                     if (s'*y)/(y'*H*y) > 1
 684 |                         phi = 1; % BFGS
 685 |                         omega = 0;
 686 |                     elseif (s'*(H\s))/(s'*y) < 1
 687 |                         phi = 0; % DFP
 688 |                         omega = 1;
 689 |                     else
 690 |                         phi = (s'*y)*(y'*H*y-s'*y)/((s'*(H\s))*(y'*H*y)-(s'*y)^2);
 691 |                         omega = phi;
 692 |                     end
 693 | 
 694 |                     gamma = (1-omega)*(s'*y)/(y'*H*y) + omega*(s'*(H\s))/(s'*y);
 695 |                     v = sqrt(y'*H*y)*(s/(s'*y) - (H*y)/(y'*H*y));
 696 |                     H = gamma*(H - (H*y*y'*H)/(y'*H*y) + phi*v*v') + (s*s')/(s'*y);
 697 | 
 698 |                 elseif qnUpdate == 5 % McCormick-Huang asymmetric update
 699 |                     theta = 1;
 700 |                     phi = 0;
 701 |                     psi = 1;
 702 |                     omega = 0;
 703 |                     t1 = s*(theta*s + phi*H'*y)';
 704 |                     t2 = (theta*s + phi*H'*y)'*y;
 705 |                     t3 = H*y*(psi*s + omega*H'*y)';
 706 |                     t4 = (psi*s + omega*H'*y)'*y;
 707 |                     H = H + t1/t2 - t3/t4;
 708 |                 end
 709 | 
 710 |                 if qnUpdate <= 1
 711 |                     d = -R\(R'\g);
 712 |                 else
 713 |                     d = -H*g;
 714 |                 end
 715 | 
 716 |             end
 717 |             g_old = g;
 718 | 
 719 |         case NEWTON0 % Hessian-Free Newton
 720 | 
 721 |             cgMaxIter = min(p,maxFunEvals-funEvals);
 722 |             cgForce = min(0.5,sqrt(norm(g)))*norm(g);
 723 | 
 724 |             % Set-up preconditioner
 725 |             precondFunc = [];
 726 |             precondArgs = [];
 727 | 			if cgSolve == 1
 728 | 				if isempty(precFunc) % Apply L-BFGS preconditioner
 729 | 					if i == 1
 730 | 						S = zeros(p,corrections);
 731 | 						Y = zeros(p,corrections);
 732 | 						YS = zeros(corrections,1);
 733 | 						lbfgs_start = 1;
 734 | 						lbfgs_end = 0;
 735 | 						Hdiag = 1;
 736 | 					else
 737 | 						[S,Y,YS,lbfgs_start,lbfgs_end,Hdiag,skipped] = lbfgsAdd(g-g_old,t*d,S,Y,YS,lbfgs_start,lbfgs_end,Hdiag,useMex);
 738 | 						if debug && skipped
 739 | 							fprintf('Skipped L-BFGS updated\n');
 740 | 						end
 741 | 						if useMex
 742 | 							precondFunc = @lbfgsProdC;
 743 | 						else
 744 | 							precondFunc = @lbfgsProd;
 745 | 						end
 746 | 						precondArgs = {S,Y,YS,int32(lbfgs_start),int32(lbfgs_end),Hdiag};
 747 | 					end
 748 | 					g_old = g;
 749 | 				else
 750 | 					% Apply user-defined preconditioner
 751 | 					precondFunc = precFunc;
 752 | 					precondArgs = {x,varargin{:}};
 753 | 				end
 754 | 			end
 755 | 
 756 |             % Solve Newton system using cg and hessian-vector products
 757 |             if isempty(HvFunc)
 758 |                 % No user-supplied Hessian-vector function,
 759 |                 % use automatic differentiation
 760 |                 HvFun = @autoHv;
 761 |                 HvArgs = {x,g,useComplex,funObj,varargin{:}};
 762 |             else
 763 |                 % Use user-supplied Hessian-vector function
 764 |                 HvFun = HvFunc;
 765 |                 HvArgs = {x,varargin{:}};
 766 |             end
 767 |             
 768 |             if useNegCurv
 769 |                 [d,cgIter,cgRes,negCurv] = conjGrad([],-g,cgForce,cgMaxIter,debug,precondFunc,precondArgs,HvFun,HvArgs);
 770 |             else
 771 |                 [d,cgIter,cgRes] = conjGrad([],-g,cgForce,cgMaxIter,debug,precondFunc,precondArgs,HvFun,HvArgs);
 772 |             end
 773 | 
 774 |             funEvals = funEvals+cgIter;
 775 |             if debug
 776 |                 fprintf('newtonCG stopped on iteration %d w/ residual %.5e\n',cgIter,cgRes);
 777 | 
 778 |             end
 779 | 
 780 |             if useNegCurv
 781 |                 if ~isempty(negCurv)
 782 |                     %if debug
 783 |                     fprintf('Using negative curvature direction\n');
 784 |                     %end
 785 |                     d = negCurv/norm(negCurv);
 786 |                     d = d/sum(abs(g));
 787 |                 end
 788 |             end
 789 | 
 790 |         case NEWTON % Newton search direction
 791 | 
 792 |             if cgSolve == 0
 793 |                 if HessianModify == 0
 794 |                     % Attempt to perform a Cholesky factorization of the Hessian
 795 |                     [R,posDef] = chol(H);
 796 | 
 797 |                     % If the Cholesky factorization was successful, then the Hessian is
 798 |                     % positive definite, solve the system
 799 |                     if posDef == 0
 800 |                         d = -R\(R'\g);
 801 | 
 802 |                     else
 803 |                         % otherwise, adjust the Hessian to be positive definite based on the
 804 |                         % minimum eigenvalue, and solve with QR
 805 |                         % (expensive, we don't want to do this very much)
 806 |                         if debug
 807 |                             fprintf('Adjusting Hessian\n');
 808 |                         end
 809 |                         H = H + eye(length(g)) * max(0,1e-12 - min(real(eig(H))));
 810 |                         d = -H\g;
 811 |                     end
 812 |                 elseif HessianModify == 1
 813 |                     % Modified Incomplete Cholesky
 814 |                     R = mcholinc(H,debug);
 815 |                     d = -R\(R'\g);
 816 |                 elseif HessianModify == 2
 817 |                     % Modified Generalized Cholesky
 818 |                     if useMex
 819 |                         [L D perm] = mcholC(H);
 820 |                     else
 821 |                         [L D perm] = mchol(H);
 822 |                     end
 823 |                     d(perm) = -L' \ ((D.^-1).*(L \ g(perm)));
 824 | 
 825 |                 elseif HessianModify == 3
 826 |                     % Modified Spectral Decomposition
 827 |                     [V,D] = eig((H+H')/2);
 828 |                     D = diag(D);
 829 |                     D = max(abs(D),max(max(abs(D)),1)*1e-12);
 830 |                     d = -V*((V'*g)./D);
 831 |                 elseif HessianModify == 4
 832 |                     % Modified Symmetric Indefinite Factorization
 833 |                     [L,D,perm] = ldl(H,'vector');
 834 |                     [blockPos junk] = find(triu(D,1));
 835 |                     for diagInd = setdiff(setdiff(1:p,blockPos),blockPos+1)
 836 |                         if D(diagInd,diagInd) < 1e-12
 837 |                             D(diagInd,diagInd) = 1e-12;
 838 |                         end
 839 |                     end
 840 |                     for blockInd = blockPos'
 841 |                         block = D(blockInd:blockInd+1,blockInd:blockInd+1);
 842 |                         block_a = block(1);
 843 |                         block_b = block(2);
 844 |                         block_d = block(4);
 845 |                         lambda = (block_a+block_d)/2 - sqrt(4*block_b^2 + (block_a - block_d)^2)/2;
 846 |                         D(blockInd:blockInd+1,blockInd:blockInd+1) = block+eye(2)*(lambda+1e-12);
 847 |                     end
 848 |                     d(perm) = -L' \ (D \ (L \ g(perm)));
 849 |                 else
 850 |                     % Take Newton step if Hessian is pd,
 851 |                     % otherwise take a step with negative curvature
 852 |                     [R,posDef] = chol(H);
 853 |                     if posDef == 0
 854 |                         d = -R\(R'\g);
 855 |                     else
 856 |                         if debug
 857 |                             fprintf('Taking Direction of Negative Curvature\n');
 858 |                         end
 859 |                         [V,D] = eig(H);
 860 |                         u = V(:,1);
 861 |                         d = -sign(u'*g)*u;
 862 |                     end
 863 |                 end
 864 | 
 865 |             else
 866 |                 % Solve with Conjugate Gradient
 867 |                 cgMaxIter = p;
 868 |                 cgForce = min(0.5,sqrt(norm(g)))*norm(g);
 869 | 
 870 |                 % Select Preconditioner
 871 |                 if cgSolve == 1
 872 |                     % No preconditioner
 873 |                     precondFunc = [];
 874 |                     precondArgs = [];
 875 |                 elseif cgSolve == 2
 876 |                     % Diagonal preconditioner
 877 |                     precDiag = diag(H);
 878 |                     precDiag(precDiag < 1e-12) = 1e-12 - min(precDiag);
 879 |                     precondFunc = @precondDiag;
 880 |                     precondArgs = {precDiag.^-1};
 881 |                 elseif cgSolve == 3
 882 |                     % L-BFGS preconditioner
 883 |                     if i == 1
 884 |                         old_dirs = zeros(length(g),0);
 885 |                         old_stps = zeros(length(g),0);
 886 |                         Hdiag = 1;
 887 |                     else
 888 |                         [old_dirs,old_stps,Hdiag] = lbfgsUpdate(g-g_old,t*d,corrections,debug,old_dirs,old_stps,Hdiag);
 889 |                     end
 890 |                     g_old = g;
 891 |                     if useMex
 892 |                         precondFunc = @lbfgsC;
 893 |                     else
 894 |                         precondFunc = @lbfgs;
 895 |                     end
 896 |                     precondArgs = {old_dirs,old_stps,Hdiag};
 897 |                 elseif cgSolve > 0
 898 |                     % Symmetric Successive Over-Relaxation Preconditioner
 899 |                     omega = cgSolve;
 900 |                     D = diag(H);
 901 |                     D(D < 1e-12) = 1e-12 - min(D);
 902 |                     precDiag = (omega/(2-omega))*D.^-1;
 903 |                     precTriu = diag(D/omega) + triu(H,1);
 904 |                     precondFunc = @precondTriuDiag;
 905 |                     precondArgs = {precTriu,precDiag.^-1};
 906 |                 else
 907 |                     % Incomplete Cholesky Preconditioner
 908 |                     opts.droptol = -cgSolve;
 909 |                     opts.rdiag = 1;
 910 |                     R = cholinc(sparse(H),opts);
 911 |                     if min(diag(R)) < 1e-12
 912 |                         R = cholinc(sparse(H + eye*(1e-12 - min(diag(R)))),opts);
 913 |                     end
 914 |                     precondFunc = @precondTriu;
 915 |                     precondArgs = {R};
 916 |                 end
 917 | 
 918 |                 % Run cg with the appropriate preconditioner
 919 |                 if isempty(HvFunc)
 920 |                     % No user-supplied Hessian-vector function
 921 |                     [d,cgIter,cgRes] = conjGrad(H,-g,cgForce,cgMaxIter,debug,precondFunc,precondArgs);
 922 |                 else
 923 |                     % Use user-supplied Hessian-vector function
 924 |                     [d,cgIter,cgRes] = conjGrad(H,-g,cgForce,cgMaxIter,debug,precondFunc,precondArgs,HvFunc,{x,varargin{:}});
 925 |                 end
 926 |                 if debug
 927 |                     fprintf('CG stopped after %d iterations w/ residual %.5e\n',cgIter,cgRes);
 928 |                     %funEvals = funEvals + cgIter;
 929 |                 end
 930 |             end
 931 | 
 932 |         case TENSOR % Tensor Method
 933 | 
 934 |             if numDiff
 935 |                 % Compute 3rd-order Tensor Numerically
 936 |                 [junk1 junk2 junk3 T] = autoTensor(x,numDiffType,funObj,varargin{:});
 937 |             else
 938 |                 % Use user-supplied 3rd-derivative Tensor
 939 |                 [junk1 junk2 junk3 T] = funObj(x,varargin{:});
 940 |             end
 941 |             options_sub.Method = 'newton';
 942 |             options_sub.Display = 'none';
 943 |             options_sub.progTol = progTol;
 944 |             options_sub.optTol = optTol;
 945 |             d = minFunc(@taylorModel,zeros(p,1),options_sub,f,g,H,T);
 946 | 
 947 |             if any(abs(d) > 1e5) || all(abs(d) < 1e-5) || g'*d > -progTol
 948 |                 if debug
 949 |                     fprintf('Using 2nd-Order Step\n');
 950 |                 end
 951 |                 [V,D] = eig((H+H')/2);
 952 |                 D = diag(D);
 953 |                 D = max(abs(D),max(max(abs(D)),1)*1e-12);
 954 |                 d = -V*((V'*g)./D);
 955 |             else
 956 |                 if debug
 957 |                     fprintf('Using 3rd-Order Step\n');
 958 |                 end
 959 |             end
 960 |     end
 961 | 
 962 |     if ~isLegal(d)
 963 |         fprintf('Step direction is illegal!\n');
 964 |         pause;
 965 |         return
 966 |     end
 967 | 
 968 |     % ****************** COMPUTE STEP LENGTH ************************
 969 | 
 970 |     % Directional Derivative
 971 |     gtd = g'*d;
 972 | 
 973 |     % Check that progress can be made along direction
 974 |     if gtd > -progTol
 975 |         exitflag=2;
 976 |         msg = 'Directional Derivative below progTol';
 977 |         break;
 978 |     end
 979 | 
 980 |     % Select Initial Guess
 981 |     if i == 1
 982 |         if method < NEWTON0
 983 |             t = min(1,1/sum(abs(g)));
 984 |         else
 985 |             t = 1;
 986 |         end
 987 |     else
 988 |         if LS_init == 0
 989 |             % Newton step
 990 |             t = 1;
 991 |         elseif LS_init == 1
 992 |             % Close to previous step length
 993 |             t = t*min(2,(gtd_old)/(gtd));
 994 |         elseif LS_init == 2
 995 |             % Quadratic Initialization based on {f,g} and previous f
 996 |             t = min(1,2*(f-f_old)/(gtd));
 997 |         elseif LS_init == 3
 998 |             % Double previous step length
 999 |             t = min(1,t*2);
1000 |         elseif LS_init == 4
1001 |             % Scaled step length if possible
1002 |             if isempty(HvFunc)
1003 |                 % No user-supplied Hessian-vector function,
1004 |                 % use automatic differentiation
1005 |                 dHd = d'*autoHv(d,x,g,0,funObj,varargin{:});
1006 |             else
1007 |                 % Use user-supplied Hessian-vector function
1008 |                 dHd = d'*HvFunc(d,x,varargin{:});
1009 |             end
1010 | 
1011 |             funEvals = funEvals + 1;
1012 |             if dHd > 0
1013 |                 t = -gtd/(dHd);
1014 |             else
1015 |                 t = min(1,2*(f-f_old)/(gtd));
1016 |             end
1017 |         end
1018 | 
1019 |         if t <= 0
1020 |             t = 1;
1021 |         end
1022 |     end
1023 |     f_old = f;
1024 |     gtd_old = gtd;
1025 | 
1026 |     % Compute reference fr if using non-monotone objective
1027 |     if Fref == 1
1028 |         fr = f;
1029 |     else
1030 |         if i == 1
1031 |             old_fvals = repmat(-inf,[Fref 1]);
1032 |         end
1033 | 
1034 |         if i <= Fref
1035 |             old_fvals(i) = f;
1036 |         else
1037 |             old_fvals = [old_fvals(2:end);f];
1038 |         end
1039 |         fr = max(old_fvals);
1040 |     end
1041 | 
1042 |     computeHessian = 0;
1043 |     if method >= NEWTON
1044 |         if HessianIter == 1
1045 |             computeHessian = 1;
1046 |         elseif i > 1 && mod(i-1,HessianIter) == 0
1047 |             computeHessian = 1;
1048 |         end
1049 |     end
1050 | 
1051 |     % Line Search
1052 |     f_old = f;
1053 |     if LS_type == 0 % Use Armijo Bactracking
1054 |         % Perform Backtracking line search
1055 |         if computeHessian
1056 |             [t,x,f,g,LSfunEvals,H] = ArmijoBacktrack(x,t,d,f,fr,g,gtd,c1,LS_interp,LS_multi,progTol,debug,doPlot,LS_saveHessianComp,funObj,varargin{:});
1057 |         else
1058 |             [t,x,f,g,LSfunEvals] = ArmijoBacktrack(x,t,d,f,fr,g,gtd,c1,LS_interp,LS_multi,progTol,debug,doPlot,1,funObj,varargin{:});
1059 |         end
1060 |         funEvals = funEvals + LSfunEvals;
1061 | 
1062 |     elseif LS_type == 1 % Find Point satisfying Wolfe conditions
1063 | 
1064 |         if computeHessian
1065 |             [t,f,g,LSfunEvals,H] = WolfeLineSearch(x,t,d,f,g,gtd,c1,c2,LS_interp,LS_multi,25,progTol,debug,doPlot,LS_saveHessianComp,funObj,varargin{:});
1066 |         else
1067 |             [t,f,g,LSfunEvals] = WolfeLineSearch(x,t,d,f,g,gtd,c1,c2,LS_interp,LS_multi,25,progTol,debug,doPlot,1,funObj,varargin{:});
1068 |         end
1069 |         funEvals = funEvals + LSfunEvals;
1070 |         x = x + t*d;
1071 | 
1072 |     else
1073 |         % Use Matlab optim toolbox line search
1074 |         [t,f_new,fPrime_new,g_new,LSexitFlag,LSiter]=...
1075 |             lineSearch({'fungrad',[],funObj},x,p,1,p,d,f,gtd,t,c1,c2,-inf,maxFunEvals-funEvals,...
1076 |             progTol,[],[],[],varargin{:});
1077 |         funEvals = funEvals + LSiter;
1078 |         if isempty(t)
1079 |             exitflag = -2;
1080 |             msg = 'Matlab LineSearch failed';
1081 |             break;
1082 |         end
1083 | 
1084 |         if method >= NEWTON
1085 |             [f_new,g_new,H] = funObj(x + t*d,varargin{:});
1086 |             funEvals = funEvals + 1;
1087 |         end
1088 |         x = x + t*d;
1089 |         f = f_new;
1090 |         g = g_new;
1091 | 	end
1092 | 
1093 | 	% Compute Optimality Condition
1094 | 	optCond = max(abs(g));
1095 | 	
1096 |     % Output iteration information
1097 |     if verboseI
1098 |         fprintf('%10d %10d %15.5e %15.5e %15.5e\n',i,funEvals*funEvalMultiplier,t,f,optCond);
1099 |     end
1100 | 
1101 |     if nargout > 3
1102 |     % Update Trace
1103 |     trace.fval(end+1,1) = f;
1104 |     trace.funcCount(end+1,1) = funEvals;
1105 | 	trace.optCond(end+1,1) = optCond;
1106 | 	end
1107 | 
1108 | 	% Output Function
1109 | 	if ~isempty(outputFcn)
1110 | 		stop = outputFcn(x,'iter',i,funEvals,f,t,gtd,g,d,optCond,varargin{:});
1111 | 		if stop
1112 | 			exitflag=-1;
1113 | 			msg = 'Stopped by output function';
1114 | 			break;
1115 | 		end
1116 | 	end
1117 | 	
1118 |     % Check Optimality Condition
1119 |     if optCond <= optTol
1120 |         exitflag=1;
1121 |         msg = 'Optimality Condition below optTol';
1122 |         break;
1123 |     end
1124 | 
1125 |     % ******************* Check for lack of progress *******************
1126 | 
1127 |     if max(abs(t*d)) <= progTol
1128 |         exitflag=2;
1129 |         msg = 'Step Size below progTol';
1130 |         break;
1131 |     end
1132 | 
1133 | 
1134 |     if abs(f-f_old) < progTol
1135 |         exitflag=2;
1136 |         msg = 'Function Value changing by less than progTol';
1137 |         break;
1138 |     end
1139 | 
1140 |     % ******** Check for going over iteration/evaluation limit *******************
1141 | 
1142 |     if funEvals*funEvalMultiplier >= maxFunEvals
1143 |         exitflag = 0;
1144 |         msg = 'Reached Maximum Number of Function Evaluations';
1145 |         break;
1146 |     end
1147 | 
1148 |     if i == maxIter
1149 |         exitflag = 0;
1150 |         msg='Reached Maximum Number of Iterations';
1151 |         break;
1152 |     end
1153 | 
1154 | end
1155 | 
1156 | if verbose
1157 |     fprintf('%s\n',msg);
1158 | end
1159 | if nargout > 3
1160 |     output = struct('iterations',i,'funcCount',funEvals*funEvalMultiplier,...
1161 |         'algorithm',method,'firstorderopt',max(abs(g)),'message',msg,'trace',trace);
1162 | end
1163 | 
1164 | % Output Function
1165 | if ~isempty(outputFcn)
1166 |      outputFcn(x,'done',i,funEvals,f,t,gtd,g,d,max(abs(g)),varargin{:});
1167 |  end
1168 | 
1169 | end
1170 | 
1171 | 


--------------------------------------------------------------------------------