├── .gitignore ├── LICENSE ├── README.md ├── VERSION ├── algorithms └── zeroSR1.m ├── paperExperiments ├── Lasso │ ├── Contents.m │ ├── README.md │ ├── cummin.m │ ├── fminunc_wrapper.m │ ├── proj_Rplus_weighted.m │ ├── proj_box_weighted.m │ ├── prox_l1_rank1.m │ ├── runTestsForPaper.m │ ├── test4.png │ ├── test5.png │ ├── zeroSR1.m │ └── zeroSR1_noLinesearch.m ├── README.md └── groupLasso │ ├── Algorithms │ ├── FISTA.py │ ├── ForwardBackwardSplitting.py │ ├── MFZeroSR1_ProximalGradient.py │ ├── SpaRSA.py │ ├── TsengZerosSR1_ProximalGradient.py │ ├── ZeroSR1_ProximalGradient.py │ └── __init__.py │ ├── README.md │ ├── clib │ ├── Makefile │ ├── mymath.cpp │ └── mymath.h │ ├── data_group_lasso.npy │ ├── mymath.py │ └── test_groupLasso.py ├── proxes ├── Contents.m ├── proj_rank1_Rplus.m ├── proj_rank1_box.m ├── proj_rank1_linf.m ├── prox_rank1_generic.m ├── prox_rank1_hinge.m ├── prox_rank1_l1.m └── prox_rank1_l1pos.m ├── setup_zeroSR1.m ├── smoothFunctions ├── normSquaredFunction.m └── quadraticFunction.m ├── tests ├── computeReferenceSolution.m ├── getReferenceSolution.m ├── reference_solutions │ └── simple_001.mat ├── solution_via_cvx.m ├── test_prox_accuracy.m ├── test_prox_speed.m ├── test_prox_speed.png └── test_solver_simple.m └── utilities ├── Contents.m ├── cummin.m ├── emphasizeRecent.m ├── fminunc_wrapper.m └── rng.m /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | *.pyc 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Stephen Becker and Jalal Fadili 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of University of Paris 6 - Pierre and Marie Curie, ENSICAEN, 15 | GREYC, CNRS, or IBM Research, nor the names of their 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # zeroSR1 toolbox 2 | 3 | The zeroSR1 toolbox implements the algorithm from 'A quasi-Newton proximal splitting method' by 4 | Stephen Becker and Jalal Fadili, which appeared in [NIPS 2012](http://nips.cc/). The paper is available at [arXiv 1206.1156](http://arxiv.org/abs/1206.1156). 5 | 6 | (Update, January 2018, we have an extended paper [On Quasi-Newton Forward--Backward Splitting: Proximal Calculus and Convergence](https://arxiv.org/abs/1801.08691) by Stephen Becker, Jalal Fadili and Peter Ochs) 7 | 8 | Briefly, the algorithm follows the standard proximal-gradient method, but allows a scaled prox. This enables us to use a limited-memory SR1 method (similar to L-BFGS). 9 | 10 | The algorithm solves problems of the form min\_x f(x) + h(x) where f is differentiable (more precisely, with a Lipschitz gradient) and h is one of the following (see the paper): 11 | 12 | Available "h" | Cost for input of size "n" 13 | ------------- | ------------- 14 | l1 norm | O( n log n) 15 | non-negativity constraints | O( n log n) 16 | l1 and non-negativity | O( n log n) 17 | box constraints | O( n log n ) 18 | l\_infinity norm constraint | O( n log n ) 19 | [hinge loss](http://en.wikipedia.org/wiki/Hinge_loss) | O( n log n ) 20 | 21 | The algorithm compares favorably with other methods, including [L-BFGS-B](http://www.mathworks.com/matlabcentral/fileexchange/35104-lbfgsb-l-bfgs-b-mex-wrapper). 22 | 23 | This toolbox currently implements in the following languages 24 | 25 | * Matlab 26 | * Octave 27 | 28 | Further releases may target these languages: 29 | 30 | * Python 31 | * R 32 | * C++ 33 | 34 | # Installation 35 | For Matlab, there is no installation necessary. Every time you run a new Matlab session, run the `setup_zeroSR1.m` file and it will add the correct paths. 
36 | 37 | Run `tests/test_solver_simple.m` to see how to solve a typical problem 38 | 39 | # Structure 40 | In each folder, see the `Contents.m` file for more information 41 | ### Algorithms 42 | This includes the zeroSR1 algorithm as well as implementations of FISTA and other proximal-gradient methods 43 | 44 | ### Proxes 45 | The scaled diagonal + rank-1 prox operators for various "g" functions 46 | 47 | ### SmoothFunctions 48 | These are pre-made wrappers for the various smooth "f" functions. The files here with the `_splitting` suffix are intended for use with any method that requires forming the augmented variable "x\_aug = (x\_pos, x\_neg)". For example, this approach is used when using L-BFGS-B (which only allows box constraints, such as x\_pos >= 0, x\_neg <= 0) to solve the LASSO problem. 49 | 50 | ### Utilities 51 | Helper files 52 | 53 | ### Tests 54 | Verify the algorithm and proxes are working correctly. This uses [CVX](http://cvxr.com/cvx) to verify; if this is not installed on your system, then it relies on precomputed solutions stored in a subdirectory. 55 | 56 | ### paperExperiments 57 | Recreates the experiments in the 2018 paper 58 | 59 | # Authors 60 | The original authors are Stephen Becker, Jalal Fadili and Peter Ochs. Further contributions are welcome. 
61 | 62 | ## Citing 63 | This software is provided free of charge, but we request that if you use this for an academic paper, please cite the following work: 64 | 65 | bibtex entry: 66 | 67 | @inproceedings{quasiNewtonNIPS, 68 | author = {Becker, Stephen and Fadili, Jalal}, 69 | title = {A quasi-{N}ewton proximal splitting method}, 70 | booktitle = {Neural Information Processing Systems (NIPS)}, 71 | year = {2012} 72 | } 73 | 74 | @article{quasiNewtonSIOPT, 75 | author = {Becker, Stephen and Fadili, Jalal and Ochs, Peter}, 76 | title = {On Quasi-{N}ewton Forward-Backward Splitting: Proximal Calculus and Convergence}, 77 | journal = {SIAM Journal on Optimization}, 78 | volume = {29}, 79 | number = {4}, 80 | pages = {2445-2481}, 81 | year = {2019}, 82 | doi = {10.1137/18M1167152}, 83 | URL = {https://doi.org/10.1137/18M1167152}, 84 | eprint = {https://doi.org/10.1137/18M1167152} 85 | } 86 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | v0.1 2 | Spring 2014 3 | -------------------------------------------------------------------------------- /algorithms/zeroSR1.m: -------------------------------------------------------------------------------- 1 | function [xk,nit, errStruct, defaultOpts, stepsizes] = zeroSR1(fcn,grad,h,prox,opts) 2 | % ZEROSR1 Solves smooth + nonsmooth/constrained optimization problems 3 | % [xk,nit, errStruct, outOpts] = zeroSR1(f,grad_f,h,prox_h,opts) 4 | % 5 | % This uses the zero-memory SR1 method (quasi-Newton) to solve: 6 | % 7 | % min_x f(x) + h(x) 8 | % 9 | % where 10 | % 'f' calculates f(x), 'grad_f' calculates the gradient of f at x, 11 | % and h(x) is a non-smooth term that can be infinite-valued (a constraint), 12 | % so long as you present a function 'prox' that computes diagional plus 13 | % rank-1 projections. 
The 'prox' function should accept at least three inputs: 14 | % 15 | % if 'grad_f' is empty, then we assume the 'f' function is actually 16 | % computing both f and grad_f (e.g., just f if nargout=1, and 17 | % f and grad_f if nargout=2). This method is often preferable 18 | % since you can re-use computation 19 | % 20 | % 'h' is the non-smooth function, and prox_h is a function with 21 | % 3 or 4 inputs that returns: 22 | % y = prox_h( x0 , d, v, ) 23 | % where 24 | % y = argmin_x h(x) + 1/2||x-x0||^2_B 25 | % and 26 | % B = inv(H) = inv( diag(D) + v*v' ) 27 | % or, for the case with 4 arguments, y = prox_h( x0, d, v, sigma ) 28 | % then B = inv( diag(D) + sigma*v*v' ) where sigma should be +1 or -1 29 | % The 4 argument case only matters when opts.SR1=true and opts.BB_type=1 30 | % or opts.SR1=true, opts.BB_type=1 and opts.SR1_diagWeight > 1 31 | % 32 | % If 'prox_h' isn't provided or is [], it defaults to the identity mapping, which corresponds 33 | % to the case when h=0. 34 | % 35 | % 'prox_h' is mean to be given by something like prox_rank1_l1 36 | % e.g., 37 | % prox = @(x0,d,v) prox_rank1_l1( x0, d, v, lambda ); 38 | % or, for 4 arguments, 39 | % prox = @(x0,d,v,varargin) prox_rank1_l1( x0, d, v, lambda, [], varargin{:} ); 40 | % 41 | % "opts" is a structure with additional options. To see their default values, 42 | % call this function with no input arguments. 
43 | % 44 | % .tol is a tolerance for relative variation 45 | % .nmax is max # of allowed iterations 46 | % .verbose can be either 0 (no output), 1 (every iteration), or n 47 | % If 'n' is an integer greater than 1, output will be written 48 | % every n iterations 49 | % .x0 50 | % starting vector 51 | % .N 52 | % size of primal domain (only necessary if x0 wasn't provided) 53 | % 54 | % .SR1 if true, uses the zero-memory SR1 method (default) 55 | % if false, uses gradient descent/forward-backward method 56 | % (or variant, such as BB stepsizes as in the SPG method) 57 | % .SR1_diagWeight is a scalar > 0 that controls the weight of the 58 | % BB stepsize, and is usually between 0 and 1. 59 | % If set to exactly 1, then the rank 1 term is exactly zero 60 | % .BB 61 | % use the Barzilai-Borwein scalar stepsize (by default, true) 62 | % .BB_type = 1 uses the longer of the B-B steps 63 | % .BB_type = 2 uses the shorter of the steps 64 | % with 0SR1, BB_type=1 is not possible 65 | % BB_type=2 is used, and is scaled by 0 < opts.SR1_diagWeight < 1 66 | % 67 | % .errFcn can be an arbitrary function that calculates an error metric 68 | % on the primal variable at every iteration. 69 | % 70 | % 71 | % Output "errStruct" contains three or four columns: 72 | % (1) objective function 73 | % (2) norm of gradient 74 | % (3) stepsize 75 | % (4) error (i.e. the output of errFcn, if provided) 76 | % 77 | % Stephen Becker and Jalal Fadili, Nov 24 2011 -- Dec 2012 78 | % Copied from zeroSR1.m Dec 11 2012 79 | % Feb 28 2014, unnesting all functions to make compatible with octave. 
80 | % 81 | % See also proximalGradient.m 82 | 83 | 84 | 85 | % ----------------------------------------------------------------- 86 | % ------------ Boring initializations ----------------------------- 87 | % ------------ for understanding the algorithm, skip ahead -------- 88 | % ------------ to where it says "Begin algorithm"------------------ 89 | % ----------------------------------------------------------------- 90 | 91 | if nargin == 0 || nargout >= 4 92 | RECORD_OPTS = true; 93 | % defaultOpts = []; 94 | else 95 | RECORD_OPTS = false; 96 | end 97 | 98 | if nargin < 3 || isempty(h) 99 | if nargin >= 4 && ~isempty(prox) 100 | warning('zeroSR1:h_not_provided','Found prox_h but not h itself. Setting h=0, prox=I'); 101 | prox = @(x,varargin) x; 102 | end 103 | h = @(x) 0; 104 | end 105 | if nargin < 4 || isempty(prox), prox = @(x,varargin) x; end 106 | if nargin < 5, opts = []; end 107 | 108 | setOptsSubFcn(); % zero out any persistent variables 109 | setOpts = @(varargin) setOptsSubFcn( RECORD_OPTS, opts, varargin{:} ); 110 | % Usage: setOpts( field, default, mn, mx, emptyOK (default:false) ); 111 | 112 | fid = setOpts('fid', 1 ); % print output to the screen or a file 113 | myDisp = @(str) fprintf(fid,'%s\n', str ); 114 | tol = setOpts( 'tol', 1e-6 ); 115 | grad_tol= setOpts( 'grad_tol', tol ); 116 | nmax = setOpts( 'nmax', 1000 ); 117 | errFcn = setOpts( 'errFcn', [] ); 118 | VERBOSE = setOpts( 'verbose', false ); 119 | if isinf(VERBOSE), VERBOSE = false; end 120 | maxStag = setOpts( 'maxStag', 10 ); % force very high accuracy 121 | xk = setOpts( 'x0', [], [], [], true ); 122 | N = setOpts( 'N', length(xk) ); 123 | if N==0 && nargin > 0, error('for now, must specify opts.N = N'); end 124 | if isempty(xk), xk = zeros(N,1); end 125 | damped = setOpts('damped',false); % 1=no damping, .01 = very tiny step 126 | 127 | % -- Options that concern the stepsize -- 128 | SR1 = setOpts( 'SR1', true ); 129 | BFGS = setOpts( 'BFGS', false ); 130 | if SR1 && BFGS 131 | 
error('zeroSR1:conflictingArgs','Cannot set SR1 and BFGS to both be true'); 132 | end 133 | BB = setOpts( 'BB', SR1 || BFGS ); 134 | if isfield(opts,'L') && isempty(opts.L) && ~BB 135 | warning('zeroSR1:noGoodStepsize','Without Lipschitz constant nor BB stepsize nor line search, bad things will happen'); 136 | end 137 | L = setOpts( 'L', 1, 0 ); % Lipschitz constant, e.g. norm(A)^2 138 | 139 | SIGMA = +1; % used for SR1 feature 140 | % Default BB stepsize. type "1" is longer and usually faster 141 | BB_type = setOpts('BB_type',2*(SR1||BFGS) + 1*(~(SR1||BFGS)) ); 142 | if (SR1||BFGS) && BB_type == 1 143 | % warning('zeroSR1:badBB_parameter','With zero-memory SR1, BB_type must be set to 2. Forcing BB_type = 2 and continuing'); 144 | % BB_type = 2; 145 | 146 | warning('zeroSR1:experimental','With zero-memory SR1, BB_type=1 is an untested feature'); 147 | SIGMA = -1; 148 | end 149 | if SR1 150 | defaultWeight = 0.8*(BB_type==2) + 1.0*(BB_type==1); 151 | else 152 | defaultWeight = 1; 153 | end 154 | SR1_diagWeight = setOpts( 'SR1_diagWeight', defaultWeight ); 155 | if SR1 && BB_type == 2 && SR1_diagWeight > 1 156 | SIGMA = -1; 157 | end 158 | 159 | % ------------ Scan options for capitalization issues, etc. ------- 160 | [defaultOpts,opts] = setOpts(); 161 | if nargin == 0 162 | disp('Default options:'); 163 | disp( defaultOpts ); 164 | end 165 | if ~isempty(fieldnames(opts)) 166 | disp('Error detected! 
I didn''t recognize these options:'); 167 | disp( opts ); 168 | error('Bad options'); 169 | end 170 | if nargin == 0 , return; end 171 | 172 | % ------------ Initializations and such --------------------------- 173 | xk_old = xk; 174 | % gradient = zeros(N,1); 175 | getGradient = @(varargin) getGradientFcn(fcn,grad, varargin{:}); 176 | fxold = Inf; 177 | t = 1/L; % initial stepsize 178 | stepsizes = zeros(nmax,1 + (SR1||BFGS)); % records some statisics 179 | if ~isempty(errFcn) 180 | if ~isa(errFcn,'function_handle') 181 | error('errFcn must be a function'); 182 | end 183 | errStruct = zeros( nmax, 4 ); % f, norm(gx), step, err 184 | else 185 | errStruct = zeros( nmax, 3 ); % f, norm(gx), step 186 | end 187 | skipBB = false; 188 | stag = 0; 189 | 190 | 191 | gradient = getGradient(xk); 192 | gradient_old = gradient; 193 | f_xk = []; 194 | 195 | % ----------------------------------------------------------------- 196 | % ------------ Begin algorithm ------------------------------------ 197 | % ----------------------------------------------------------------- 198 | for nit = 1:nmax 199 | 200 | % Do this at end now, so we can get fcn value for free 201 | % gradient_old = gradient; 202 | % gradient = grad(xk); 203 | 204 | % "sk" and "yk" are the vectors that will give us quasi-Newton 205 | % information (and also used in BB step, since that can be 206 | % seen as a quasi-Newton method) 207 | sk = xk - xk_old; 208 | yk = gradient - gradient_old; % Following notation in Nocedal/Wright 209 | if nit > 1 && norm(yk) < 1e-13 210 | warning('zeroSR1:zeroChangeInGradient','gradient isn''t changing , try changing opts.L'); 211 | yk = []; 212 | skipBB = true; 213 | end 214 | 215 | 216 | % --------------------------------------------------------------------- 217 | % -- Find an initial stepsize -- 218 | % --------------------------------------------------------------------- 219 | % t_old = t; 220 | if BB && nit > 1 && ~skipBB 221 | switch BB_type 222 | case 1 223 | t = 
(norm(sk)^2)/(sk'*yk); % eq (1.6) in Dai/Fletcher. This is longer 224 | case 2 225 | t = sk'*yk/( norm(yk)^2 ); % eq (1.7) in Dai/Fletcher. This is shorter 226 | end 227 | if t < 1e-14 % t < 0 should not happen on convex problem! 228 | myDisp('Curvature condition violated!'); 229 | stag = Inf; 230 | end 231 | if SR1 || BFGS 232 | % we cannot take a full BB step, otherwise we exactly satisfy the secant 233 | % equation, and there is no need for a rank-1 correction. 234 | t = SR1_diagWeight*t; % SR1_diagWeights is a scalar less than 1 like 0.6 235 | end 236 | H0 = @(x) t*x; 237 | diagH = t*ones(N,1); 238 | else 239 | t = 1/L; 240 | H0 = @(x) t*x; % diagonal portion of inverse Hessian 241 | diagH = t*ones(N,1); 242 | end 243 | skipBB = false; 244 | stepsizes(nit,1) = t; 245 | 246 | 247 | 248 | % --------------------------------------------------------------------- 249 | % -- Quasi-Newton -- Requries: H0, and builds H 250 | % --------------------------------------------------------------------- 251 | if SR1 && nit > 1 && ~isempty(yk) 252 | gs = yk'*sk; 253 | % gHg = yk'*(diagH.*yk); % not needed any more 254 | if gs < 0 255 | myDisp('Serious curvature condition problem!'); 256 | stag = Inf; 257 | end 258 | H0 = @(x) diagH.*x; 259 | vk = sk - H0(yk); 260 | vkyk = vk'*yk; 261 | SIGMA_LOCAL = sign( vkyk ); 262 | %if SIGMA*vkyk <= 0 263 | if SIGMA_LOCAL*vkyk <= 0 264 | myDisp('Warning: violated curvature conditions'); 265 | % This should only happen if we took an exact B-B step, which we don't. 266 | vk = []; 267 | H = H0; 268 | stepsizes(nit,2) = 0; 269 | else 270 | vk = vk/sqrt( SIGMA_LOCAL*vkyk ); 271 | % And at last, our rank-1 approximation of the inverse Hessian. 272 | H = @(x) H0(x) + SIGMA_LOCAL*(vk*(vk'*x)); 273 | % The (inverse) secant equation is B*sk = yk(=y), or Hy=s 274 | % N.B. We can make a rank-1 approx. of the Hessian too; see the full 275 | % version of the code. 
276 | 277 | stepsizes(nit,2) = vk'*vk; 278 | end 279 | elseif BFGS && nit > 1 && ~isempty(yk) 280 | gs = yk'*sk; 281 | rho= 1/gs; 282 | if gs < 0 283 | myDisp('Serious curvature condition problem!'); 284 | stag = Inf; 285 | end 286 | H0 = @(x) diagH.*x; 287 | 288 | tauBB = sk'*yk/( norm(yk)^2); 289 | uk = sk/2 + H0(sk)/(2*tauBB) - H0(yk); 290 | % if H0 is tauBB*I (e.g., gamma=1), then vk = sk - H0(yk). 291 | 292 | 293 | stepsizes(nit,2) = uk'*uk; 294 | 295 | vk = [sk-uk, sk+uk]*sqrt(rho/2); % rank 2! 296 | SIGMA_LOCAL = [-1,1]; 297 | 298 | H = @(x) H0(x) + vk*( diag(SIGMA_LOCAL)*(vk'*x) ); 299 | 300 | %fprintf('DEBUG: %.2e\n', norm( H(yk) - sk ) ); 301 | 302 | else 303 | SIGMA_LOCAL = SIGMA; 304 | H = H0; 305 | vk= []; 306 | end 307 | 308 | 309 | % --------------------------------------------------------------------- 310 | % -- Make the proximal update ----------------------------------------- 311 | % --------------------------------------------------------------------- 312 | p = H(-gradient); % Scaled descent direction. H includes the stepsize 313 | xk_old = xk; 314 | if ~isequal(SIGMA_LOCAL,1) 315 | if damped 316 | xk = xk + damped*(prox( xk_old + p, diagH, vk, SIGMA_LOCAL )-xk); 317 | else 318 | xk = prox( xk_old + p, diagH, vk, SIGMA_LOCAL ); 319 | end 320 | else 321 | if damped 322 | xk = xk + damped*(prox( xk_old + p, diagH, vk )-xk); 323 | else 324 | xk = prox( xk_old + p, diagH, vk ); % proximal step 325 | end 326 | 327 | end 328 | 329 | norm_grad = norm( xk - xk_old ); 330 | if any(isnan(xk)) || norm(xk) > 1e10 331 | stag = Inf; % will cause it to break 332 | xk = xk_old; 333 | myDisp('Prox algorithm failed, probably due to numerical cancellations'); 334 | end 335 | 336 | % --------------------------------------------------------------------- 337 | % -- The rest of the code is boring. The algorithmic stuff is done. 
--- 338 | % --------------------------------------------------------------------- 339 | % -- record function values -- 340 | % --------------------------------------------------------------------- 341 | gradient_old = gradient; 342 | [gradient,f_xk] = getGradient(xk); % can be cheaper if user provided a nice fcn 343 | fx = f_xk + h(xk); 344 | % fx = fcn(xk) + h(xk); 345 | df = abs(fx - fxold)/abs(fxold); 346 | fxold = fx; 347 | 348 | if (df < tol) || ( t < 1e-10 ) || (isnan(fx) ) || norm_grad < grad_tol 349 | stag = stag + 1; 350 | end 351 | 352 | if VERBOSE && (~rem(nit,VERBOSE) || stag>maxStag ) 353 | fprintf(fid,'Iter: %5d, f: % 7.3e, df: %.2e, ||grad||: %.2e, step %.2e\n',... 354 | nit,fx,df, norm_grad, t); 355 | end 356 | 357 | errStruct(nit,1) = fx; 358 | errStruct(nit,2) = norm_grad; 359 | errStruct(nit,3) = t; 360 | if ~isempty(errFcn) 361 | errStruct(nit,4) = errFcn( xk ); 362 | if VERBOSE && (~rem(nit,VERBOSE) || stag>maxStag ) 363 | fprintf(fid,'\b, err %.2e\n', errStruct(nit,4) ); 364 | end 365 | end 366 | 367 | 368 | if stag > maxStag 369 | if VERBOSE, myDisp('Quitting (e.g. 
reached tolerence)...'); end 370 | break; 371 | end 372 | 373 | end 374 | 375 | if nit == nmax && VERBOSE, myDisp('Maxed out iteration limit'); end 376 | if nit < nmax 377 | errStruct = errStruct( 1:nit, : ); 378 | stepsizes = stepsizes( 1:nit, : ); 379 | end 380 | 381 | end % end of main routine 382 | 383 | function [gradientValue,fcnValue] = getGradientFcn( fcn, gradient, x, str ) 384 | % The user can either specify fcn and gradient separately, 385 | % or they can specify them both in a single function (also called fcn) 386 | % This latter option is triggered whenever gradient=[] 387 | if nargin < 4, str = []; end 388 | if isempty(gradient) 389 | [fcnValue,gradientValue] = fcn(x); 390 | else 391 | gradientValue = gradient(x); 392 | if nargout > 1 393 | if strcmpi(str,'fcn_optional') 394 | fcnValue = []; 395 | else 396 | fcnValue = fcn(x); 397 | end 398 | end 399 | end 400 | end 401 | 402 | function varargout = setOptsSubFcn(RECORD_OPTS, opts, field, default, mn, mx, emptyOK ) 403 | persistent defaultOpts 404 | persistent updatedOpts 405 | if nargin <= 2 406 | % non-standard usage 407 | varargout{1} = defaultOpts; 408 | varargout{2} = updatedOpts; 409 | defaultOpts = []; 410 | updatedOpts = []; 411 | return; 412 | end 413 | if isempty( updatedOpts ), updatedOpts = opts; end 414 | 415 | % if emptyOK is false, then values of opts.field=[] are not allowed and 416 | % are instead set to the default value 417 | if nargin < 7 || isempty(emptyOK), emptyOK = false; end 418 | if ~isfield( opts, field ) || (isempty(opts.(field)) && ~emptyOK ) 419 | opts.(field) = default; 420 | end 421 | out = opts.(field); 422 | varargout{1} = out; 423 | if nargin >= 5 && ~isempty(mn) && any(out < mn), error('Value is too small'); end 424 | if nargin >= 6 && ~isempty(mx) && any(out > mx), error('Value is too large'); end 425 | if isfield( updatedOpts, field ) 426 | updatedOpts = rmfield( updatedOpts, field ); % so we can do a check later 427 | end 428 | if RECORD_OPTS 429 | 
defaultOpts.(field) = out; 430 | end 431 | end 432 | -------------------------------------------------------------------------------- /paperExperiments/Lasso/Contents.m: -------------------------------------------------------------------------------- 1 | % FIG2_LASSO 2 | % Recreates Fig 6.1 from https://arxiv.org/pdf/1801.08691.pdf 3 | % 4 | % Main Files 5 | % runTestsForPaper - Script to run all the tests 6 | % 7 | % Helper Files 8 | % zeroSR1 - [xk,nit, errStruct, outOpts] = zeroSR1(f,g,proj,opts) 9 | % zeroSR1_noLinesearch - Solves smooth + nonsmooth/constrained optimization problems 10 | % fminunc_wrapper - wrapper for objective and gradient 11 | % proj_box_weighted - Projection onto box constraints 12 | % prox_l1_rank1 - Prox of l1 with diagonal + rank-1 metric 13 | % proj_Rplus_weighted - Projection onto x>=0 with diagonal + rank-1 14 | % cummin - Cumulative minimum 15 | % 16 | % Feb 1 2018 -------------------------------------------------------------------------------- /paperExperiments/Lasso/README.md: -------------------------------------------------------------------------------- 1 | # zeroSR1: Lasso Experiments 2 | 3 | This folder contains the Matlab code to run the Lasso experiments. 4 | The version of code used here may be slightly different than the updated algorithms in the main repository. 5 | 6 | Some third-party packages (not provided, though we list the URLs) are required if you want to compare with the other solvers mentioned in the paper. 7 | 8 | In the code, "test 4" is Fig 6.1 (left) from our [2018 paper](https://arxiv.org/pdf/1801.08691.pdf) (similar to Fig 1.a from our [2012 paper](https://arxiv.org/pdf/1206.1156.pdf)) 9 | 10 | Similarly, "test 6" is Fig 6.1 (right) from our [2018 paper](https://arxiv.org/pdf/1801.08691.pdf) (similar to Fig 1.b from our [2012 paper](https://arxiv.org/pdf/1206.1156.pdf)) 11 | 12 | ## Third party packages 13 | If you install these, make sure to add them to the Matlab path. 
You can follow the example `addpath` commands that we used. 14 | 15 | ### L-BFGS-B 16 | We wrote our own Matlab wrapper for this (using the L-BFGS-B 3.0 Fortran 17 | code). You can download it from: https://github.com/stephenbeckr/L-BFGS-B-C 18 | 19 | Unpack it somewhere and run `lbfgsb_C/Matlab/compile_mex.m` 20 | 21 | ### ASA 22 | See http://users.clas.ufl.edu/hager/papers/Software/ 23 | 24 | as of 2013, they have [ver 3.0](http://users.clas.ufl.edu/hager/papers/CG/Archive/ASA_CG-3.0.tar.gz) but their older [ver 2.2](http://users.clas.ufl.edu/hager/papers/CG/Archive/ASA_CG-2.2.tar.gz) is still online. 25 | 26 | You also need the Matlab interface; we wrote this ourselves, and it can be downloaded from [Mathworks file exchange no 35814](https://www.mathworks.com/matlabcentral/fileexchange/35814-mex-interface-for-bound-constrained-optimization-via-asa) (it will also download the main C source code for you) 27 | 28 | If you download the Matlab interface, run the `test_ASA.m` script and it will download the ASA code that it needs. 29 | 30 | ### CGIST 31 | Get CGIST from their [website](http://tag7.web.rice.edu/CGIST.html) or [direct link to .zip file](http://tag7.web.rice.edu/CGIST_files/cgist.zip). 32 | 33 | ### FPC 34 | Get FPC AS from [their website](http://www.caam.rice.edu/~optimization/L1/FPC_AS/request-for-downloading-fpc_as.html) 35 | 36 | ### L1General package, with PSSas and OWL 37 | Get the L1General2 code from [Mark Schmidt's software website](https://www.cs.ubc.ca/~schmidtm/Software/thesis.html) or [direct link to thesis.zip](https://www.cs.ubc.ca/~schmidtm/Software/thesis.zip). 38 | 39 | Note: you need to compile mex files for this (for the lbfgs subroutine) 40 | For compilation, try: `minFunc/mexAll.m` 41 | 42 | We noticed that line 13 in `lbfgsC.c` declared `int nVars,nSteps,lhs_dims[2];` and for us, this threw a warning at compile-time and an error at run-time. 
One fix is to remove the `lhs_dims[2]` from that line and instead add a new line with: `size_t lhs_dims[2];` 43 | 44 | ## Output 45 | 46 | Running test4 should give something like this: 47 | 48 | ![Test 4 results](test4.png?raw=true) 49 | 50 | Running test5 should give something like this: 51 | 52 | ![Test 5 results](test5.png?raw=true) 53 | 54 | ## Authors 55 | The authors are Stephen Becker, Jalal Fadili and Peter Ochs. 56 | 57 | This README is from Feb 1 2018. Thanks to https://stackedit.io/app for editing markup 58 | -------------------------------------------------------------------------------- /paperExperiments/Lasso/cummin.m: -------------------------------------------------------------------------------- 1 | function x = cummin(x) 2 | % y = cummin(x) 3 | % finds the cumulative minimum of x 4 | % e.g. y_i = min( x_i, y_{i-1} ) 5 | 6 | if numel(x) > length(x) 7 | error('input must be a vector'); 8 | end 9 | for k = 2:length(x) 10 | x(k) = min( x(k), x(k-1) ); 11 | end -------------------------------------------------------------------------------- /paperExperiments/Lasso/fminunc_wrapper.m: -------------------------------------------------------------------------------- 1 | function [f,g,h] = fminunc_wrapper(x,F,G,H, errFcn,extraFcn) 2 | % [f,g,h] = fminunc_wrapper( x, F, G, H, errFcn ) 3 | % for use with Matlab's "fminunc" 4 | % 5 | % [fHist,errHist] = fminunc_wrapper() 6 | % will return the function history 7 | % (and error history as well, if errFcn was provided) 8 | % and reset the history to zero. 
9 | persistent errHist fcnHist nCalls 10 | if nargin == 0 11 | f = fcnHist(1:nCalls); 12 | g = errHist(1:nCalls); 13 | fcnHist = []; 14 | errHist = []; 15 | nCalls = 0; 16 | return; 17 | end 18 | if isempty( fcnHist ) 19 | [errHist,fcnHist] = deal( zeros(100,1) ); 20 | end 21 | 22 | f = F(x); 23 | % Record this: 24 | nCalls = nCalls + 1; 25 | if length( errHist ) < nCalls 26 | % allocate more memory 27 | errHist(end:2*end) = 0; 28 | fcnHist(end:2*end) = 0; 29 | end 30 | fcnHist(nCalls) = f; 31 | if nargin >= 6 && ~isempty(extraFcn) 32 | % this is used when we want to record the objective function 33 | % for something non-smooth, and this routine is used only for the smooth 34 | % part. So for recording purposes, add in the nonsmooth part 35 | % But do NOT return it as a function value or it will mess up the 36 | % optimization algorithm. 37 | fcnHist(nCalls) = f + extraFcn(x); 38 | end 39 | 40 | if nargin > 2 && nargout > 1 41 | g = G(x); 42 | end 43 | if nargin > 3 && ~isempty(H) && nargout > 2 44 | h = H(x); 45 | end 46 | 47 | % and if error is requested... 48 | if nargin >= 5 && ~isempty( errFcn) 49 | errHist(nCalls) = errFcn(x); 50 | end -------------------------------------------------------------------------------- /paperExperiments/Lasso/proj_Rplus_weighted.m: -------------------------------------------------------------------------------- 1 | function [x,lambda,cnt, sEst] = proj_Rplus_weighted( x0, D, L, scale, linTerm ) 2 | % x = proj_Rplus_weighted( x0, D, L ) or 3 | % x = proj_Rplus_weighted( x0, D, L, scale ) or 4 | % x = proj_Rplus_weighted( x0, D, L, scale, c ) 5 | % returns the solution 6 | % x = argmin 1/2||x-x0||^2_{Q,2}' + subject to scale*x >= 0 7 | % where 8 | % ||x-x0||^2_{Q,2} = < x - x0, Q*(x-x0) > 9 | % and 10 | % inv(Q) = D + L*L' is a diagonal + rank-1 matrix 11 | % with D = diag(d) > 0 is psd. 12 | % 13 | % Only the sign of the "scale" input has an effect (if scale < 0, 14 | % the the constraints become x <= 0 instead of x >= 0 ). 
15 | % 16 | % The algorithm takes O( n*log(n) ) running time. 17 | % 18 | % [x,lambda] = ... 19 | % also returns a dual vector lambda 20 | % 21 | % [x,lambda,iter] = ... 22 | % also returns the number of iterations ( iter <= ceil( log_2(n) + 1 ) ). 23 | % 24 | % [x,lambda,iter,s] = ... 25 | % also returns the scalar dual variable 's' 26 | % 27 | % Stephen Becker, Dec 1 2011. srbecker@caltech.edu 28 | % March 13, changing order of (... linTerm, scale ) to (... scale, linTerm ) 29 | 30 | 31 | VERBOSE = false; 32 | if nargin < 3, L = []; end % this is just scalar stuff then... 33 | if nargin < 5, linTerm = []; end 34 | if nargin < 4 || isempty(scale), scale = 1; end 35 | 36 | if isvector(D), d = D; %D = diag(d); 37 | else d = diag(D); end 38 | if any( d < 0 ), error('Diagonal term must be positive'); end 39 | % make sure everything is a column vector 40 | if size(x0,2) > 1, x0 = x0.'; end 41 | if size(L,2) > 1, L = L.' ; end 42 | 43 | 44 | 45 | % -- If the user doesn't specify L, then it's a standard projection -- 46 | if isempty(L) 47 | if ~isempty( linTerm ) 48 | error('Can''t handle that case yet. Shouldn''t be too difficult though...'); 49 | else 50 | x = max( x0, 0 ); 51 | lambda = []; 52 | cnt = 1; 53 | sEst = 0; 54 | return; 55 | end 56 | end 57 | 58 | 59 | 60 | 61 | RESCALE = false; 62 | if scale == 0 63 | error('Cannot handle lambda = 0'); 64 | elseif scale < 0 65 | RESCALE = true; 66 | x0 = -x0; 67 | end 68 | 69 | R = L; 70 | N = length(x0); 71 | 72 | if nargin >= 4 && ~isempty( linTerm ) 73 | % We can incorporate this (i.e. 
"c" in the equation above, 74 | % but not the same "c" used below) into the x0 term: 75 | if size(linTerm,2) > 1, linTerm = linTerm.'; end 76 | if RESCALE, linTerm = -linTerm; end 77 | x0 = x0 - (d.*linTerm + L*(R'*linTerm) ); 78 | 79 | end 80 | 81 | 82 | % from now on, "lambda" refers to the dual vector 83 | % We will find a strictly complementary solution (x,lambda) such 84 | % that x >= 0, lambda >= 0, and = 0 85 | 86 | % sList = -x0./L; 87 | % sList = sort(sList); % +/- inf are OK. 88 | sList = unique( -x0./L ); % remove duplicate +/- infinities 89 | sListInf = [ -Inf; sList; Inf ]; 90 | 91 | S = [ sList(1)-1; (sList + circshift(sList,-1))/2 ]; 92 | S(end) = sList(end) + 1; 93 | % so the element of S are right in the middle: no boundary points, 94 | % ensuring strict complementarity 95 | 96 | % Thus, we have defined the active set for both x and lambda 97 | DONE = false; 98 | mn = 0; % inclusive 99 | mx = length(sList); % inclusive 100 | maxIt = ceil( log2(mx) ) + 1; 101 | 102 | A = -R./d; 103 | B = A.*L; 104 | A = A.*x0; 105 | 106 | for cnt = 0:maxIt % should take logN iterations, or fewer 107 | 108 | k = round( (mn+mx)/2 ); % pick the next entry 109 | s = S(k+1); % i.e. sList(k-1) < s < sList(k) 110 | 111 | % T = find( x0 + s*L > 0); 112 | Tc = find( x0 + s*L < 0); % we never have y + s = 0, by design of S 113 | 114 | % support of lambda is now well defined in terms of 's'. 
115 | % a = -R(Tc)'*( x0(Tc)./d(Tc) ); 116 | % b = -R(Tc)'*( L(Tc)./d(Tc) ); 117 | % alternatively, compute them this way (might be faster): 118 | a = sum( A(Tc) ); 119 | b = sum( B(Tc) ); 120 | sEst = a/(1-b); 121 | 122 | % find bounds: 123 | lb = sListInf( k+1 ); 124 | ub = sListInf( k+2 ); 125 | 126 | % debugging: verify that these are indeed the correct bounds: 127 | % OK = ( s > lb ) && ( s < ub ); 128 | % if ~OK, disp('Violated bounds!'); error('Problem!'); end 129 | 130 | if sEst < lb 131 | str = 'v'; 132 | % reduce the upper bound 133 | mx = k; 134 | elseif sEst > ub 135 | str = '^'; 136 | % increase the lower bound 137 | mn = k; 138 | else 139 | str = '-'; 140 | DONE = true; 141 | end 142 | if VERBOSE, fprintf('k=%2d, [%6.1f, %6.1f], sEst is %6.1f: %s\n',k,lb,ub, sEst, str ); end 143 | if DONE, break; end 144 | end 145 | assert( cnt < maxIt, 'rank-1 prox algorithm failed to converge'); 146 | 147 | T = find( x0 + s*L > 0); 148 | x = zeros(N,1); 149 | x(T) = x0(T) + sEst*L(T); 150 | if nargout > 1 151 | lambda = zeros(N,1); 152 | lambda(Tc) = -(x0(Tc) + sEst*L(Tc) )./d(Tc); 153 | end 154 | 155 | if RESCALE 156 | x = -x; 157 | if nargout > 1 158 | lambda = -lambda; 159 | end 160 | end 161 | -------------------------------------------------------------------------------- /paperExperiments/Lasso/proj_box_weighted.m: -------------------------------------------------------------------------------- 1 | function [x,lambda,cnt, sEst] = proj_box_weighted( x0, D, L, lwr, upr, linTerm ) 2 | % x = proj_box_weighted( x0, D, L, lwr, upr ) or 3 | % x = proj_box_weighted( x0, D, L, lwr, upr, c ) 4 | % returns the solution 5 | % x = argmin 1/2||x-x0||^2_{Q,2}' + subject to lwr <= x <= upr 6 | % where 7 | % ||x-x0||^2_{Q,2} = < x - x0, Q*(x-x0) > 8 | % and 9 | % inv(Q) = D + L*L' is a diagonal + rank-1 matrix 10 | % with D = diag(d) > 0 is psd. 
11 | % 12 | % Only the sign of the "scale" input has an effect (if scale < 0, 13 | % the the constraints become x <= 0 instead of x >= 0 ). 14 | % 15 | % The algorithm takes O( n*log(n) ) running time. 16 | % 17 | % [x,lambda] = ... 18 | % also returns a dual vector lambda 19 | % 20 | % [x,lambda,iter] = ... 21 | % also returns the number of iterations ( iter <= ceil( log_2(n) + 1 ) ). 22 | % 23 | % [x,lambda,iter,s] = ... 24 | % also returns the scalar dual variable 's' 25 | % 26 | % Stephen Becker, Jun 12 2012. srbecker@alumni.caltech.edu 27 | 28 | 29 | VERBOSE = false; 30 | if nargin < 3, L = []; end % this is just scalar stuff then... 31 | if nargin < 6, linTerm = []; elseif ~isempty(linTerm) 32 | error('cannot yet handle this case'); 33 | end 34 | % if nargin < 6 || isempty(scale), scale = 1; end 35 | scale=1; 36 | 37 | if isvector(D), d = D; %D = diag(d); 38 | else d = diag(D); end 39 | if any( d < 0 ), error('Diagonal term must be positive'); end 40 | % make sure everything is a column vector 41 | if size(x0,2) > 1, x0 = x0.'; end 42 | if size(L,2) > 1, L = L.' ; end 43 | 44 | 45 | % -- If the user doesn't specify L, then it's a standard projection -- 46 | if isempty(L) 47 | if ~isempty( linTerm ) 48 | error('Can''t handle that case yet. Shouldn''t be too difficult though...'); 49 | else 50 | x = max( x0, lwr ); 51 | x = min( x, upr ); 52 | lambda = []; 53 | cnt = 1; 54 | sEst = 0; 55 | return; 56 | end 57 | end 58 | 59 | 60 | RESCALE = false; 61 | if scale == 0 62 | error('Cannot handle lambda = 0'); 63 | elseif scale < 0 64 | RESCALE = true; 65 | x0 = -x0; 66 | end 67 | 68 | R = L; 69 | N = length(x0); 70 | 71 | % if nargin >= 6 && ~isempty( linTerm ) 72 | % % We can incorporate this (i.e. 
"c" in the equation above, 73 | % % but not the same "c" used below) into the x0 term: 74 | % if size(linTerm,2) > 1, linTerm = linTerm.'; end 75 | % if RESCALE, linTerm = -linTerm; end 76 | % x0 = x0 - (d.*linTerm + L*(R'*linTerm) ); 77 | % 78 | % end 79 | 80 | 81 | % from now on, "lambda" refers to the dual vector 82 | % We will find a strictly complementary solution (x,lambda) such 83 | % that x >= 0, lambda >= 0, and = 0 84 | 85 | % sList = unique( -x0./L ); % remove duplicate +/- infinities 86 | sList = unique( [(lwr-x0)./L; (upr-x0)./L ]); 87 | sListInf = [ -Inf; sList; Inf ]; 88 | 89 | S = [ sList(1)-1; (sList + circshift(sList,-1))/2 ]; 90 | S(end) = sList(end) + 1; 91 | % so the element of S are right in the middle: no boundary points, 92 | % ensuring strict complementarity 93 | 94 | % Thus, we have defined the active set for both x and lambda 95 | DONE = false; 96 | mn = 0; % inclusive 97 | mx = length(sList); % inclusive 98 | maxIt = ceil( log2(mx) ) + 1; 99 | 100 | A = -R./d; 101 | B = A.*L; 102 | A1 = A.*(x0-lwr); 103 | A2 = A.*(x0-upr); 104 | 105 | for cnt = 0:maxIt % should take logN iterations, or fewer 106 | 107 | k = round( (mn+mx)/2 ); % pick the next entry 108 | s = S(k+1); % i.e. sList(k-1) < s < sList(k) 109 | 110 | % T = find( x0 + s*L > 0); 111 | % Tc = find( x0 + s*L < 0); % we never have y + s = 0, by design of S 112 | Tc1 = find( x0 + s*L < lwr ); 113 | Tc2 = find( x0 + s*L > upr ); 114 | 115 | % support of lambda is now well defined in terms of 's'. 
116 | % a = -R(Tc)'*( x0(Tc)./d(Tc) ); 117 | % b = -R(Tc)'*( L(Tc)./d(Tc) ); 118 | % alternatively, compute them this way (might be faster): 119 | % a = sum( A(Tc) ); 120 | % b = sum( B(Tc) ); 121 | a = sum( A1(Tc2) ) + sum( A2(Tc2) ); 122 | b = sum( B(Tc1) ) + sum( B(Tc2) ); 123 | sEst = a/(1-b); 124 | 125 | % find bounds: 126 | lb = sListInf( k+1 ); 127 | ub = sListInf( k+2 ); 128 | 129 | % debugging: verify that these are indeed the correct bounds: 130 | % OK = ( s > lb ) && ( s < ub ); 131 | % if ~OK, disp('Violated bounds!'); error('Problem!'); end 132 | 133 | if sEst < lb 134 | str = 'v'; 135 | % reduce the upper bound 136 | mx = k; 137 | elseif sEst > ub 138 | str = '^'; 139 | % increase the lower bound 140 | mn = k; 141 | else 142 | str = '-'; 143 | DONE = true; 144 | end 145 | if VERBOSE, fprintf('k=%2d, [%6.1f, %6.1f], sEst is %6.1f: %s\n',k,lb,ub, sEst, str ); end 146 | if DONE, break; end 147 | end 148 | assert( cnt < maxIt, 'rank-1 prox algorithm failed to converge'); 149 | 150 | % T = find( x0 + s*L > 0); 151 | % x = zeros(N,1); 152 | % x(T) = x0(T) + sEst*L(T); 153 | 154 | x = x0 + sEst*L; 155 | x = min( max(x,lwr), upr ); 156 | 157 | % if nargout > 1 158 | % lambda = zeros(N,1); 159 | % lambda(Tc) = -(x0(Tc) + sEst*L(Tc) )./d(Tc); 160 | % end 161 | 162 | % if RESCALE 163 | % x = -x; 164 | % if nargout > 1 165 | % lambda = -lambda; 166 | % end 167 | % end 168 | -------------------------------------------------------------------------------- /paperExperiments/Lasso/prox_l1_rank1.m: -------------------------------------------------------------------------------- 1 | function [x,cBest,cnt] = prox_l1_rank1( x0, D, L, lambda, linTerm ) 2 | % x = prox_l1_weighted( x0, D, u ) or 3 | % x = prox_l1_weighted( x0, D, u, lambda ) or 4 | % x = prox_l1_weighted( x0, D, u, lambda, c ) 5 | % returns the solution 6 | % x = argmin lambda*||x||_1 + 1/2||x-x0||^2_{B,2}' + 7 | % where 8 | % ||x-x0||^2_{B,2} = < x - x0, B*(x-x0) > 9 | % and 10 | % H = inv(B) = D + u*u' is 
a diagonal + rank-1 matrix 11 | % with D = diag(d) > 0 is positive definite. 12 | % 13 | % The algorithm takes O( n*log(n) ) running time. 14 | % Inputs must be real, not complex. 15 | % 16 | % [x,c] = ... 17 | % also returns c, where 18 | % x = shrink( x0 - c*u, d ); 19 | % 20 | % [x,c,iter] = ... 21 | % also returns the number of iterations ( iter <= ceil( log_2(n) + 1 ) ). 22 | % 23 | % Stephen Becker, Dec 10 2010 -- April 2012. stephen.beckr@gmail.com 24 | 25 | % Modified May 27 2011 to handle the "c" term 26 | % Modified Nov 24 2011 to handle the "lambda" term and be more efficient 27 | % Modified Feb 29 2012 to be more accurate when L has many zeros 28 | % Modified Mar 13 2012 to allow vector "lambda" term. 29 | 30 | % Note about the code: the documentation refers to H = D + u*u' 31 | % The code uses the notation H = D + L*R' (L for left, R for right) 32 | % In general, we need R=L (so that H is positive definite), 33 | % but we keep the "R" notation because it makes the derivation more clear, 34 | % and we don't always have R=L after we remove zero terms from L. 35 | 36 | 37 | % -------- Preprocess --------------------- 38 | 39 | if isvector(D), d = D; % D = diag(d); 40 | else d = diag(D); end 41 | if any( d < 0 ), error('Diagonal term must be positive'); end 42 | % make sure everything is a column vector 43 | if size(x0,2) > 1, x0 = x0.'; end 44 | if size(L,2) > 1, L = L.' 
; end 45 | 46 | if nargin >= 5 && ~isempty( linTerm ) 47 | if size(linTerm,2) > 1, linTerm = linTerm.'; end 48 | x0 = x0 - (d.*linTerm + L*(L'*linTerm) ); 49 | end 50 | 51 | if nargin < 4 || isempty(lambda), lambda = 1; end 52 | if numel(lambda)>1 53 | % rescale 54 | if size(lambda,2) > 1, lambda = lambda.'; end 55 | if size(lambda,2) > 1 56 | lambda = diag(lambda); 57 | end 58 | d = lambda.*d; 59 | elseif lambda ~= 1 60 | % rescale 61 | d = lambda*d; 62 | L = sqrt(lambda)*L; 63 | end 64 | if isscalar(d), d = d*ones(size(x0)); end 65 | N = length(x0); 66 | 67 | % Now, we can pretend lambda=1 and linTerm=0, since they have been accounted for 68 | 69 | shrinkVec = @(x,d) sign(x).*max( abs(x) - d, 0 ); 70 | 71 | % If there is no low-rank term: 72 | if nargin < 3 || isempty(L) 73 | x = shrinkVec( x0, d ); 74 | cBest = 0; 75 | cnt = 0; 76 | return; 77 | end 78 | 79 | % Account for cases when L has many zeros... 80 | nonzeroL = find( abs(L) > 100*eps ); 81 | if length(nonzeroL) < N 82 | L_HAS_ZEROS = true; 83 | 84 | % and reduce the rest of it to a smaller problem: 85 | old_L = L; 86 | old_x0 = x0; 87 | old_d = d; 88 | x0 = x0(nonzeroL); 89 | d = d(nonzeroL); 90 | L = L(nonzeroL); 91 | else 92 | L_HAS_ZEROS = false; 93 | end 94 | R = L; 95 | if numel(lambda)>1 % For diagonal lambda 96 | if L_HAS_ZEROS 97 | R = lambda(nonzeroL).*R; 98 | else 99 | R = lambda.*R; 100 | end 101 | end 102 | 103 | c1 = (x0+d)./L; % if x_i < 0 104 | c2 = (x0-d)./L; % if x_i > 0 105 | c = [c1,c2]; 106 | cList = sort(c(:)); % list of break-points. 
107 | offset = 1e0; 108 | cList2 = [ cList(1)-offset; cList + [diff(cList)/2;offset] ]; % look in-between stuff 109 | cListInf = [-Inf; cList; +Inf ]; 110 | sL = sign(L); 111 | 112 | sLc1 = sL.*c1; % precompute 113 | sLc2 = sL.*c2; 114 | 115 | NN = length(cList2); 116 | % Keep track of counters: 117 | mn = 1; 118 | mx = NN; 119 | cnt = 0; 120 | j = round( (mn+mx)/2 ); 121 | 122 | % This loop would be nice in a mex file 123 | % (we want to do the "sort" in Matlab, since Matlab has a great sort function) 124 | 125 | % -------- Main loop --------------------- 126 | maxIt = NN+3; 127 | while cnt < maxIt % should never max out, but just in case of infinite loop due to coding error... 128 | cnt = cnt + 1; 129 | ci = cList2(j); 130 | 131 | % -- Step 1: estimate the support 132 | dx = ( sL*ci < sLc2 ) - ( sL*ci > sLc1 ); 133 | 134 | 135 | Tc = ~dx; 136 | T = ~~dx; 137 | alpha = R(T)'*dx(T); 138 | 139 | invA_vec = 1./d(Tc); % precompute for speed 140 | 141 | 142 | u = L(Tc); 143 | v = R(Tc); % since lambda nonscalar, we may have u ~= v 144 | 145 | vv = invA_vec.*v; 146 | zc = 1 + vv'*u; 147 | % QQ = invA - invA*u*v'*invA/zc; % conceptually, this is what we do, but this is slow numerically 148 | % dxTc = QQ*(x0(Tc) - alpha*L(Tc) ); 149 | 150 | % Make the above faster: 151 | yy = x0(Tc) - alpha*L(Tc); 152 | dxTc = invA_vec.*(yy - u*(vv'*yy)/zc); 153 | 154 | dx(Tc) = dxTc; 155 | cEst = R'*dx; % based on this support, this is our estimate of the shrinkage scalar 156 | 157 | % Test if this shrinkage scalar is permissible 158 | if cEst < cListInf(j) 159 | % We need to decrease the value of c 160 | mx = j; 161 | elseif cEst > cListInf(j+1) 162 | % We need to increase the value of c 163 | mn = j; 164 | else 165 | % The support is acceptable! 
166 | cBest = cEst; 167 | if any( abs(dxTc) > 1 ) 168 | disp('Weird behavior: bad subgradient'); 169 | cBest = NaN; 170 | end 171 | break; 172 | end 173 | 174 | % Next direction: 175 | % j = round( (mn+mx)/2 ); 176 | if mx > mn + 1 177 | j = round( (mn+mx)/2 ); 178 | else 179 | % There are only two left, [mn mn+1] 180 | if j == mn 181 | j = mn+1; 182 | else 183 | j = mn; 184 | end 185 | end 186 | 187 | 188 | end 189 | assert( cnt < maxIt, 'rank-1 prox algorithm failed to converge'); 190 | if isnan(cBest) 191 | warning('Found NaN','prox_l1_weighted:failed'); 192 | x = NaN; 193 | else 194 | % Account for cases when L has many zeros... 195 | if L_HAS_ZEROS 196 | x = shrinkVec( old_x0 - cBest*old_L, old_d ); % for 197 | else 198 | % In this case, I didn't waste the memory to copy L, x0 and d 199 | x = shrinkVec( x0 - (cBest)*L, d ); 200 | end 201 | end 202 | -------------------------------------------------------------------------------- /paperExperiments/Lasso/runTestsForPaper.m: -------------------------------------------------------------------------------- 1 | %{ 2 | For the paper, we ran tests 4 and 5 3 | test 4 is Fig 6.1 (left) from https://arxiv.org/pdf/1801.08691.pdf 4 | (similar to Fig 1.a from https://arxiv.org/pdf/1206.1156.pdf ) 5 | 6 | test 5 is Fig 6.1 (right) from https://arxiv.org/pdf/1801.08691.pdf 7 | (similar to Fig 1.b from https://arxiv.org/pdf/1206.1156.pdf ) 8 | 9 | We compare with 3rd party codes, but we don't redistribute their code, so 10 | we have documented where we got their code from and you are free to install 11 | their code and compare. 12 | 13 | -- Stephen Becker, Feb 2018 14 | %} 15 | %% L-BFGS-B 16 | %{ 17 | We wrote our own Matlab wrapper for this (using the L-BFGS-B 3.0 Fortran 18 | code). 
You can download it from: https://github.com/stephenbeckr/L-BFGS-B-C 19 | 20 | Unpack it somewhere and run lbfgsb_C/Matlab/compile_mex.m 21 | %} 22 | addpath ~/Repos/lbfgsb_C/Matlab 23 | %% ASA 24 | %{ 25 | http://users.clas.ufl.edu/hager/papers/Software/ 26 | as of 2013, they have v 3.0 27 | http://users.clas.ufl.edu/hager/papers/CG/Archive/ASA_CG-3.0.tar.gz 28 | but old code is still online at: 29 | http://users.clas.ufl.edu/hager/papers/CG/Archive/ASA_CG-2.2.tar.gz 30 | (old link was http://www.math.ufl.edu/~hager/papers/CG/Archive/ASA_CG-2.2.tar.gz, that's bad now) 31 | You also need the Matlab interface; we wrote this ourself, and it can be 32 | downloaded from Mathworks (it will also download the main C source code for you) 33 | https://www.mathworks.com/matlabcentral/fileexchange/35814-mex-interface-for-bound-constrained-optimization-via-asa 34 | 35 | If you download the Matlab interface, run the test_ASA.m script 36 | and it will downlaoad the ASA code that it needs. 37 | 38 | %} 39 | addpath('~/Documents/MATLAB/packages/ASA_CG_matlabWrapper'); 40 | %% CGIST 41 | %{ 42 | Get CGIST from: 43 | http://tag7.web.rice.edu/CGIST.html 44 | or http://tag7.web.rice.edu/CGIST_files/cgist.zip 45 | %} 46 | addpath('~/Documents/MATLAB/packages/cgist'); 47 | %% FPC 48 | %{ 49 | Get FPC AS from: 50 | http://www.caam.rice.edu/~optimization/L1/FPC_AS/request-for-downloading-fpc_as.html 51 | %} 52 | addpath('~/Documents/MATLAB/packages/FPC_AS_v1.21/src'); 53 | %% L1General and PSSas 54 | %{ 55 | Get the L1General2 code from 56 | https://www.cs.ubc.ca/~schmidtm/Software/thesis.html 57 | or https://www.cs.ubc.ca/~schmidtm/Software/thesis.zip 58 | 59 | Note: you need to compile mex files for this (for the lbfgs subroutine) 60 | For compilation, try: SchmidtThesis/minFunc/mexAll.m 61 | 62 | 2018, line 13 in lbfgsC.c, " int nVars,nSteps,lhs_dims[2];" 63 | With Matlab R2017b, this causes problems. 
Remove the lhs_dims[2] and add 64 | a new line with: " size_t lhs_dims[2];" 65 | 66 | %} 67 | addpath ~/Documents/MATLAB/packages/SchmidtThesis/L1General2/ 68 | addpath ~/Documents/MATLAB/packages/SchmidtThesis/misc/ 69 | addpath ~/Documents/MATLAB/packages/SchmidtThesis/minFunc/ 70 | 71 | %% Setup a problem 72 | 73 | randn('state',234213); rand('state',2342343); 74 | 75 | % --- fcn setup --- 76 | TEST = 4; 77 | % TEST = 5; 78 | switch TEST 79 | case 4 80 | % compressed sensing... 81 | N = 3000; % any larger than 5000 and it takes a while to get the norm(A) 82 | lambda = .1; 83 | A = randn(N/2,N); 84 | b = randn(size(A,1),1); 85 | Q = A'*A; c = A'*b; 86 | 87 | case 5 88 | % See Fletcher's paper 89 | n = 13; 90 | N = n^3; 91 | fprintf('N is %d\n', N ); 92 | lambda = 1; 93 | 94 | I = eye(n); 95 | BDG = -( diag( ones(n-1,1), 1 ) + diag( ones(n-1,1), -1 ) ); 96 | T = 6*I + BDG; 97 | 98 | W = kron( I, T ) + kron( BDG, I ); 99 | Q = kron( I, W ) + kron( BDG, eye(n^2) ); 100 | 101 | sigma = 20; a1 = 0.4; a2 = 0.7; a3 = 0.5; 102 | pdeSol = @(x,y,z) x.*(x-1).*y.*(y-1).*z.*(z-1).*exp( ... 103 | -.5*sigma^2*( (x-a1).^2 + (y-a2).^2 + (z-a3).^3 ) ); 104 | % Find rhs c = Q*u, where u is the solution above 105 | h = 1/(n+1); 106 | grd = h:h:(1-h); % interior points 107 | [X,Y,Z] = meshgrid(grd); 108 | u_pde = pdeSol( X, Y, Z ); 109 | c = Q*vec(u_pde); 110 | fprintf('||c||_inf is %g\n', norm(c,Inf) ); 111 | 112 | A = chol(Q); % has small condition number, e.g. 
8, and is upper bi-diagonal 113 | b = (A')\c; 114 | end 115 | 116 | %% More setup 117 | 118 | % --- Plotting and such --- 119 | 120 | NAMES = {}; 121 | OBJECTIVES = {}; 122 | TIMES = {}; 123 | % ------------------------- 124 | if size(A,1) < size(Q,1) 125 | if issparse(Q), normQ = normest(A*A'); else normQ = norm(A*A'); end 126 | else 127 | if issparse(Q), normQ = normest(Q); else normQ = norm(Q); end 128 | end 129 | lambdaVect = lambda*ones(N,1); 130 | fcn = @(w) w'*(Q*w)/2 - c'*w + lambda*norm(w,1); 131 | 132 | % NOTE: the non-standard form (not |Ax-b|, rather ) 133 | fcnSimple = @(w) w'*(Q*w)/2 - c'*w; 134 | gradSimple = @(w) Q*w - c; % doesn't include non-smooth portion 135 | % for L-BFGS-B, we will add to gradSimple, since we have made new smooth terms 136 | 137 | % for SR1 138 | prox = @(x0,d,l) prox_l1_rank1( x0, d, l, lambda ); 139 | 140 | % Setup operators for L-BFGS-B 141 | pos = @(w) w(1:N,:); 142 | neg = @(w) w(N+1:2*N,:); 143 | dbl = @(gg) [gg;-gg]; 144 | lambdaVect2 = [lambdaVect;lambdaVect]; 145 | fcn2 = @(w) fcnSimple( pos(w) - neg(w) ) + lambdaVect2'*w; 146 | grad2 = @(w) dbl(gradSimple(pos(w)-neg(w))) + lambdaVect2; 147 | 148 | 149 | %% SR1 150 | disp('Solving via SR1 with l1 constraint ...'); 151 | % fcn and grad are defined above now... 
152 | 153 | opts = struct('N',N,'verbose',50,'nmax',4000,'tol',1e-14); 154 | % opts.x0 = .1*ones(N,1); % use this for SR1 versions 155 | % opts.nmax = 5; 156 | opts.BB = true; 157 | % opts.theta = []; opts.restart=6; % use [] for FISTA 158 | opts.theta = 1; opts.SR1 = true; 159 | opts.SR1_diagWeight=0.8; 160 | 161 | opts.L = normQ; 162 | 163 | opts.backtrack = false; 164 | 165 | tic 166 | % The code I used for the 2012 tests 167 | % [xk,nit, errStruct,optsOut] = zeroSR1(fcn,gradSimple,prox,opts); 168 | 169 | % Dec '12, try our simplified code: 170 | opts = rmfield(opts,{'theta','backtrack'}); 171 | [xk,nit, errStruct,optsOut] = zeroSR1_noLinesearch(fcn,gradSimple,prox,opts); 172 | 173 | tm = toc; 174 | NAMES{end+1} = '0-mem SR1'; 175 | OBJECTIVES{end+1} = errStruct(:,1); 176 | TIMES{end+1} = tm; 177 | %% and run our code, but choose FISTA... 178 | 179 | opts.BB = true; 180 | opts.theta = []; opts.restart=1000; % use [] for FISTA 181 | % opts.theta = 1; 182 | opts.SR1 = false; 183 | 184 | opts.backtrack = true; 185 | 186 | tic 187 | [xk,nit, errStruct,optsOut] = zeroSR1(fcn,gradSimple,prox,opts); 188 | tm = toc; 189 | NAMES{end+1} = 'FISTA w/ BB'; % with linesearch 190 | OBJECTIVES{end+1} = errStruct(:,1); 191 | TIMES{end+1} = tm; 192 | %% and run our code, but choose BB... 193 | 194 | opts.BB = true; 195 | opts.theta = 1; 196 | opts.SR1 = false; 197 | 198 | opts.backtrack = true; 199 | 200 | tic 201 | [xk,nit, errStruct,optsOut] = zeroSR1(fcn,gradSimple,prox,opts); 202 | tm = toc; 203 | NAMES{end+1} = 'SPG/SpaRSA'; % with linesearch 204 | OBJECTIVES{end+1} = errStruct(:,1); 205 | TIMES{end+1} = tm; 206 | %% Run L-BFGS-B 207 | if ~exist('lbfgsb','file') 208 | disp('Cannot find L-BFGS-B on your path, so skipping this test'); 209 | else 210 | %{ 211 | Solve min L(x) + lambda*||x||_1 by formulating as: 212 | min_{z,y} L(z-y) + ones(2N,1)'*[z,y] 213 | s.t. 214 | z,y >= 0. i.e. 
"x" is z - y 215 | 216 | 217 | if we switch to simple x >= 0 formulation, then it solves it in 2 steps!! 218 | 219 | %} 220 | disp('Solving via L-BFGS-B...'); 221 | 222 | tic 223 | fun = @(x)fminunc_wrapper( x, fcn2, grad2); 224 | opts = struct( 'factr', 1e4, 'pgtol', 1e-12, 'm', 10, 'maxIts', 20000, 'maxTotalIts',1e6 ); 225 | opts.printEvery = 100; 226 | opts.factr = 1e1; % more accurate soln 227 | if N > 200 228 | opts.factr = 1e-2; 229 | opts.pgtol = 1e-14; 230 | end 231 | % opts.factr = 1e7; % default 232 | [x2, ~, info] = lbfgsb(fun, zeros(2*N,1), inf(2*N,1), opts ); 233 | x = pos(x2) - neg(x2); 234 | tm = toc; 235 | 236 | NAMES{end+1} = 'L-BFGS-B'; 237 | OBJECTIVES{end+1} = info.err(:,1); 238 | TIMES{end+1} = tm; 239 | end 240 | %% Run ASA 241 | if ~exist('asa_wrapper','file') 242 | disp('Cannot find ASA on your path, so skipping this test'); 243 | else 244 | 245 | % param = struct('A',A,'b',b); 246 | % param = struct('A',A,'b',b,'lambda',lambda); % No, I am not using this format... 247 | % an alternative way: 248 | 249 | % param = struct('Q',Q,'c',-c,'lambda',lambda); 250 | param = struct('Q',[Q,-Q;-Q,Q],'c',-[c;-c]+lambdaVect2,'offset',0); 251 | param.maxits = 1e6; 252 | 253 | % if isfield( MAXITS, 'ASA' ) && ~isempty( MAXITS.ASA ) 254 | % param.maxits = min( param.maxits, MAXITS.ASA ); 255 | % end 256 | 257 | % add some options (these are optional). See driver1.c for examples, 258 | % and see asa_user.h for all possible values 259 | [opts,CGopts] = deal(struct('PrintParms',false)); 260 | opts.PrintParms = 0; 261 | opts.PrintFinal = 1; 262 | opts.PrintLevel = 0; 263 | opts.StopFac = 1e-9; 264 | 265 | % zero-out the counters 266 | asa_quadratic_fcnGrad(); 267 | 268 | lo = zeros(2*N,1); 269 | hi = inf(2*N,1); 270 | % x0 = ones(2*N,1); 271 | x0 = zeros(2*N,1); 272 | % run the function 273 | disp('starting...'); 274 | tic 275 | [x2,status,statistics] = asa_wrapper( x0, lo, hi,'asa_quadratic_fcn',... 
276 | 'asa_quadratic_grad', 'asa_quadratic_fcnGrad', opts, CGopts, param); 277 | tm = toc; 278 | x = pos(x2) - neg(x2); 279 | % View the function values 280 | [fcnHistory] = asa_quadratic_fcnGrad(); 281 | 282 | NAMES{end+1} = 'ASA'; 283 | OBJECTIVES{end+1} = fcnHistory; 284 | TIMES{end+1} = tm; 285 | end 286 | %% Run PSSas and OWN (stuff from L1General toolbox) 287 | if ~exist('L1General2_PSSas','file') || ~exist('L1General2_OWL','file') 288 | disp('Cannot find PSSas or OWL and L1General on your path, so skipping this test'); 289 | else 290 | 291 | gOptions = []; 292 | gOptions.maxIter = 4000; 293 | gOptions.verbose = 1; % Set to 0 to turn off output 294 | gOptions.corrections = 10; % for L-BFGS 295 | gOptions.optTol = 1e-14; 296 | gOptions.progTol = 1e-15; 297 | 298 | % funObj = @(x)fminunc_wrapper( x, fcn, gradSimple); 299 | funObj = @(x)fminunc_wrapper( x, fcnSimple, gradSimple,[]); 300 | % This works well for error, but not for objective fcn value, 301 | % since this is only the smooth portion. So we need to add in 302 | % a non-smooth term that gets added just to the history. 
303 | extraFcn = @(x) lambda*norm(x,1); 304 | funObj = @(x)fminunc_wrapper( x, fcnSimple, gradSimple,[],[],extraFcn); 305 | 306 | 307 | w_init = zeros(N,1); 308 | 309 | fprintf('\nProjected Scaled Sub-Gradient (Active-Set variant)\n'); 310 | options = gOptions; 311 | 312 | fminunc_wrapper(); 313 | tic 314 | [wk,objectiveValues] = L1General2_PSSas(funObj,w_init,lambdaVect,options); 315 | tm = toc; 316 | if isempty( objectiveValues ) % it stopped on first iter 317 | % do it again, with larger starting guess 318 | w_init = ones(N,1); 319 | fminunc_wrapper(); 320 | tic 321 | [wk,objectiveValues] = L1General2_PSSas(funObj,w_init,lambdaVect,options); 322 | tm = toc; 323 | end 324 | [fcnHistory,errHistory] =fminunc_wrapper(); 325 | NAMES{end+1} = 'PSSas'; 326 | OBJECTIVES{end+1} = fcnHistory; 327 | TIMES{end+1} = tm; 328 | 329 | 330 | % And re-run for the OWL code 331 | fminunc_wrapper(); 332 | tic 333 | wk = L1General2_OWL(funObj,w_init,lambdaVect,options); 334 | tm = toc; 335 | [fcnHistory,errHistory] =fminunc_wrapper(); 336 | NAMES{end+1} = 'OWL'; 337 | OBJECTIVES{end+1} = fcnHistory; 338 | TIMES{end+1} = tm; 339 | 340 | end 341 | %% run cgist 342 | if ~exist('cgist','file') 343 | disp('Cannot find cgist on your path, so skipping this test'); 344 | else 345 | % solves ||Ax-f||^2 + lambda*|x|_1 346 | % So, from /2 -c'*x format, we have 347 | % 348 | regularizer = 'l1'; 349 | opts = []; 350 | opts.tol = 1e-8; 351 | opts.record_objective = true; 352 | opts.record_iterates = false; % big! 
353 | opts.errFcn = []; 354 | tic 355 | [xk, multCount, subgradientNorm, out] = cgist(A,[],b,lambda,regularizer,opts); 356 | tm = toc; 357 | % need to subtract norm(b)^2/2 to get objective fcn to line up 358 | out.objectives = out.objectives - norm(b)^2/2; 359 | 360 | NAMES{end+1} = 'CGIST'; 361 | OBJECTIVES{end+1} = out.objectives; 362 | TIMES{end+1} = tm; 363 | end 364 | %% run FPC-AS 365 | if ~exist('FPC_AS','file') 366 | disp('Cannot find FPC_AS on your path, so skipping this test'); 367 | else 368 | % v 1.1, 10/2008 Zaiwen Wen 369 | % 370 | % For some reason, need to give it some negatives... (-x vs +x) 371 | 372 | opts = []; 373 | opts.gtol = 1e-9; % a termination option of FPC_AS; see manual 374 | opts.mxitr = 6e3; 375 | opts.sub_mxitr = 80; % # of sub-space iterations (max) 376 | opts.lbfgs_m = 5; % storage 377 | opts.record = 0; % -1,0,1 378 | opts.PrintOptions = 0; 379 | % opts.scale_A = 1; 380 | M = []; 381 | % M = 10*eye(N); 382 | sc = 1; 383 | tic 384 | [x, out] = FPC_AS(N,-A/sc,b/sc,lambda/sqrt(sc),M,opts); 385 | tm = toc; 386 | out.fcnHist = out.fcnHist - norm(b)^2/2; 387 | NAMES{end+1} = 'FPC-AS'; 388 | OBJECTIVES{end+1} = out.fcnHist; 389 | TIMES{end+1} = tm; 390 | 391 | end 392 | %% PLOT EVERYTHING 393 | figure(1); clf; 394 | 395 | obj_best = Inf; 396 | for k = 1:length(OBJECTIVES) 397 | obj_best = min(obj_best, min( OBJECTIVES{k}) ); 398 | end 399 | 400 | for k = 1:length(NAMES) 401 | tGrid = linspace(0,TIMES{k},length(OBJECTIVES{k})); 402 | h=semilogy( tGrid, cummin( OBJECTIVES{k} - obj_best) ); 403 | 404 | set(h,'linewidth',2); 405 | 406 | hold all 407 | end 408 | legend(NAMES) 409 | xlabel('time in seconds','fontsize',18); 410 | ylabel('objective value error','fontsize',18); 411 | set(gca,'fontsize',18) 412 | switch TEST 413 | case 4 414 | title('Fig 6.1 (left) from https://arxiv.org/pdf/1801.08691.pdf'); 415 | xlim([0,110]); 416 | ylim([1e-8,1e4]); 417 | case 5 418 | title('Fig 6.1 (right) from https://arxiv.org/pdf/1801.08691.pdf'); 419 | 
xlim([0,2.5]); 420 | ylim([1e-8,1e9]); 421 | end 422 | -------------------------------------------------------------------------------- /paperExperiments/Lasso/test4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/zeroSR1/ca0bcb8bf7746c1aa14dc32ffbdba23708293990/paperExperiments/Lasso/test4.png -------------------------------------------------------------------------------- /paperExperiments/Lasso/test5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/zeroSR1/ca0bcb8bf7746c1aa14dc32ffbdba23708293990/paperExperiments/Lasso/test5.png -------------------------------------------------------------------------------- /paperExperiments/Lasso/zeroSR1_noLinesearch.m: -------------------------------------------------------------------------------- 1 | function [xk,nit, errStruct, defaultOpts, stepsizes] = zeroSR1_noLinesearch(fcn,grad,prox,opts) 2 | % ZEROSR1_NOLINESEARCH Solves smooth + nonsmooth/constrained optimization problems 3 | % [xk,nit, errStruct, outOpts] = zeroSR1_noLinesearch(f,g,proj,opts) 4 | % 5 | % This uses the zero-memory SR1 method (quasi-Newton) to solve: 6 | % 7 | % min_x f(x) + h(x) 8 | % 9 | % where 10 | % 'f' calculates f(x), 'g' calculates the gradient of f at x, 11 | % and h(x) is a non-smooth term that can be infinite-valued (a constraint), 12 | % so long as you present a function 'prox' that computes diagional plus 13 | % rank-1 projections. The 'prox' function should accept at least three inputs: 14 | % 15 | % y = prox( x0 , d, v, ) 16 | % where 17 | % y = argmin_x h(x) + 1/2||x-x0||^2_B 18 | % where 19 | % B = inv(H) = inv( diag(D) + v*v' ) 20 | % 21 | % If 'prox' isn't provided or is [], it defaults to the identity mapping, which corresponds 22 | % to the case when h=0. 23 | % 24 | % "opts" is a structure with additional options. 
To see their default values, 25 | % call this function with no input arguments. 26 | % 27 | % .tol is a tolerance for relative variation 28 | % .nmax is max # of allowed iterations 29 | % .verbose can be either 0 (no output), 1 (every iteration), or n 30 | % If 'n' is an integer greater than 1, output will be written 31 | % every n iterations 32 | % .x0 33 | % starting vector 34 | % .N 35 | % size of primal domain (only necessary of x0 wasn't provided) 36 | % 37 | % .SR1 if true, uses the zero-memory SR1 method 38 | % if false, uses gradient descent/forward-backward method 39 | % (or variant, such as FISTA, or BB stepsizes as in the SPG method) 40 | % .BB 41 | % use the Barzilai-Borwein scalar stepsize 42 | % 43 | % .errFcn can be an arbitrary function that calculates an error metric 44 | % on the primal variable at every iteration. 45 | % 46 | % 47 | % Output "errStruct" contains three or four columns: 48 | % (1) objective function 49 | % (2) norm of gradient 50 | % (3) stepsize 51 | % (4) error (i.e. the output of errFcn, if provided) 52 | % 53 | % Stephen Becker and Jalal Fadili, Nov 24 2011 -- Dec 2012 54 | % Copied from zeroSR1.m Dec 11 2012 55 | % (zeroSR1.m is the "full version of the code with more bells and 56 | % whistles, and also allows Nesterov acceleration and over-relaxation. 57 | % This version is designed to have more human readable source-code. 
) 58 | % See also zeroSR1.m 59 | 60 | 61 | 62 | % ----------------------------------------------------------------- 63 | % ------------ Boring initializations ----------------------------- 64 | % ------------ for understanding the algorithm, skip ahead -------- 65 | % ------------ to where it says "Begin algorithm"------------------ 66 | % ----------------------------------------------------------------- 67 | 68 | if nargin == 0 || nargout >= 4 69 | RECORD_OPTS = true; 70 | defaultOpts = []; 71 | else 72 | RECORD_OPTS = false; 73 | end 74 | 75 | if nargin < 3 || isempty(prox), prox = @(x,diag,v) x; end 76 | if nargin < 4, opts = []; end 77 | 78 | function out = setOpts( field, default, mn, mx ) 79 | if ~isfield( opts, field ) 80 | opts.(field) = default; 81 | end 82 | out = opts.(field); 83 | if nargin >= 3 && ~isempty(mn) && any(out < mn), error('Value is too small'); end 84 | if nargin >= 4 && ~isempty(mx) && any(out > mx), error('Value is too large'); end 85 | opts = rmfield( opts, field ); % so we can do a check later 86 | if RECORD_OPTS 87 | defaultOpts.(field) = out; 88 | end 89 | end 90 | 91 | 92 | fid = setOpts( 'fid', 1 ); % print output to the screen or a file 93 | myDisp = @(str) fprintf(fid,'%s\n', str ); 94 | tol = setOpts( 'tol', 1e-6 ); 95 | grad_tol= setOpts( 'grad_tol', tol ); 96 | nmax = setOpts( 'nmax', 1000 ); 97 | errFcn = setOpts( 'errFcn', [] ); 98 | VERBOSE = setOpts( 'verbose', false ); 99 | if isinf(VERBOSE), VERBOSE = false; end 100 | maxStag = setOpts( 'maxStag', 10 ); % force very high accuracy 101 | xk = setOpts( 'x0', [] ); 102 | N = setOpts( 'N', length(xk) ); 103 | if N==0 && nargin > 0, error('for now, must specify opts.N = N'); end 104 | if isempty(xk), xk = zeros(N,1); end 105 | 106 | % -- Options that concern the stepsize -- 107 | L = setOpts( 'L', 1, 0 ); % Lipschitz constant, e.g. 
norm(A)^2 108 | SR1 = setOpts( 'SR1', false ); 109 | SR1_diagWeight = setOpts( 'SR1_diagWeight', 0.8 ); 110 | BB = setOpts( 'BB', SR1 ); 111 | 112 | if SR1, BB_type = setOpts('BB_type',2); 113 | else, BB_type = setOpts('BB_type',1); % faster, generally 114 | end 115 | if SR1 && BB_type == 1 116 | warning('With zero-memory SR1, BB_type must be set to 2. Forcing BB_type = 2 and continuing','zeroSR1:BB_warn'); 117 | BB_type = 2; 118 | end 119 | % ------------ Scan options for capitalization issues, etc. ------- 120 | if nargin == 0 121 | disp('Default options:'); 122 | disp( defaultOpts ); 123 | end 124 | if ~isempty(fieldnames(opts)) 125 | disp('Error detected! I didn''t recognize these options:'); 126 | disp( opts ); 127 | error('Bad options'); 128 | end 129 | if nargin == 0 , return; end 130 | 131 | % ------------ Initializations and such --------------------------- 132 | xk_old = xk; 133 | gradient = zeros(N,1); 134 | fxold = Inf; 135 | t = 1/L; % initial stepsize 136 | stepsizes = zeros(nmax,1 + SR1); % records some statisics 137 | if ~isempty(errFcn) 138 | if ~isa(errFcn,'function_handle') 139 | error('errFcn must be a function'); 140 | end 141 | errStruct = zeros( nmax, 4 ); % f, norm(gx), step, err 142 | else 143 | errStruct = zeros( nmax, 3 ); % f, norm(gx), step 144 | end 145 | skipBB = false; 146 | stag = 0; 147 | 148 | % ----------------------------------------------------------------- 149 | % ------------ Begin algorithm ------------------------------------ 150 | % ----------------------------------------------------------------- 151 | for nit = 1:nmax 152 | 153 | gradient_old = gradient; 154 | gradient = grad(xk); 155 | 156 | % "sk" and "gk" are the vectors that will give us quasi-Newton 157 | % information (and also used in BB step, since that can be 158 | % seen as a quasi-Newton method) 159 | sk = xk - xk_old; 160 | gk = gradient - gradient_old; % this is "yk" in Nocedal/Wright 161 | if nit > 1 && norm(gk) < 1e-13 162 | warning('gradient isn''t 
changing , try changing opts.L','specialSR1:zeroChangeInGradient'); 163 | gk = []; 164 | skipBB = true; 165 | end 166 | 167 | 168 | % --------------------------------------------------------------------- 169 | % -- Find an initial stepsize -- 170 | % --------------------------------------------------------------------- 171 | t_old = t; 172 | if BB && nit > 1 && ~skipBB 173 | switch BB_type 174 | case 1 175 | t = (norm(sk)^2)/(sk'*gk); % eq (1.6) in Dai/Fletcher. This is longer 176 | case 2 177 | t = sk'*gk/( norm(gk)^2 ); % eq (1.7) in Dai/Fletcher. This is shorter 178 | end 179 | if t < 1e-14 % t < 0 should not happen on convex problem! 180 | myDisp('Curvature condition violated!'); 181 | stag = Inf; 182 | end 183 | if SR1 184 | % we cannot take a full BB step, otherwise we exactly satisfy the secant 185 | % equation, and there is no need for a rank-1 correction. 186 | t = SR1_diagWeight*t; % SR1_diagWeights is a scalar less than 1 like 0.6 187 | end 188 | H0 = @(x) t*x; 189 | diagH = t*ones(N,1); 190 | else 191 | t = 1/L; 192 | H0 = @(x) t*x; % diagonal portion of inverse Hessian 193 | diagH = t*ones(N,1); 194 | end 195 | skipBB = false; 196 | stepsizes(nit,1) = t; 197 | 198 | 199 | 200 | % --------------------------------------------------------------------- 201 | % -- Quasi-Newton -- Requries: H0, and builds H 202 | % --------------------------------------------------------------------- 203 | if SR1 && nit > 1 && ~isempty(gk) 204 | gs = gk'*sk; 205 | gHg = gk'*(diagH.*gk); 206 | if gs < 0, myDisp('Serious curvature condition problem!'); stag = Inf; end 207 | H0 = @(x) diagH.*x; 208 | vk = sk - H0(gk); 209 | if vk'*gk <= 0 210 | myDisp('Warning: violated curvature conditions'); 211 | % This should only happen if we took an exact B-B step, which we don't. 212 | vk = []; 213 | H = H0; 214 | else 215 | vk = vk/sqrt( vk'*gk ); 216 | % And at last, our rank-1 approximation of the inverse Hessian. 
217 | H = @(x) H0(x) + vk*(vk'*x); 218 | % The (inverse) secant equation is B*sk = gk(=y), or Hy=s 219 | % N.B. We can make a rank-1 approx. of the Hessian too; see the full 220 | % version of the code. 221 | end 222 | stepsizes(nit,2) = vk'*vk; 223 | else 224 | H = H0; 225 | vk= []; 226 | end 227 | 228 | 229 | % --------------------------------------------------------------------- 230 | % -- Make the proximal update ----------------------------------------- 231 | % --------------------------------------------------------------------- 232 | p = H(-gradient); % Scaled descent direction. H includes the stepsize 233 | xk_old = xk; 234 | xk = prox( xk_old + p, diagH, vk ); % proximal step 235 | norm_grad = norm( xk - xk_old ); 236 | if any(isnan(xk)) || norm(xk) > 1e10 237 | stag = Inf; % will cause it to break 238 | xk = xk_old; 239 | myDisp('Prox algorithm failed, probably due to numerical cancellations'); 240 | end 241 | 242 | % --------------------------------------------------------------------- 243 | % -- The rest of the code is boring. The algorithmic stuff is done. --- 244 | % --------------------------------------------------------------------- 245 | % -- record function values -- 246 | % --------------------------------------------------------------------- 247 | fx = fcn(xk); 248 | df = abs(fx - fxold)/abs(fxold); 249 | fxold = fx; 250 | 251 | printf('Iter: %5d, f: %.3e, df: %.2e, ||grad||: %.2e, step %.2e\n',... 252 | nit,fx,df, norm_grad, t); 253 | 254 | errStruct(nit,1) = fx; 255 | errStruct(nit,2) = norm_grad; 256 | errStruct(nit,3) = t; 257 | if ~isempty(errFcn) 258 | errStruct(nit,4) = errFcn( xk ); 259 | printf('\b, err %.2e\n', errStruct(nit,4) ); 260 | end 261 | 262 | if (df < tol) || ( t < 1e-10 ) || (isnan(fx) ) || norm_grad < grad_tol 263 | stag = stag + 1; 264 | end 265 | if stag > maxStag 266 | if VERBOSE, myDisp('Quitting (e.g. 
reached tolerence)...'); end 267 | break; 268 | end 269 | 270 | end 271 | 272 | if nit == nmax && VERBOSE, myDisp('Maxed out iteration limit'); end 273 | if nit < nmax 274 | errStruct = errStruct( 1:nit, : ); 275 | stepsizes = stepsizes( 1:nit, : ); 276 | printf('Iter: %5d, f: %.3e, df: %.2e, ||grad||: %.2e, step %.2e\n',... 277 | nit,fx,df, norm_grad, t); 278 | if ~isempty(errFcn) 279 | printf('\b, err %.2e\n', errStruct(nit,4) ); 280 | end 281 | end 282 | 283 | % --------------------------------------------------------------------- 284 | % Nested functions: 285 | % --------------------------------------------------------------------- 286 | function printf(varargin) 287 | if VERBOSE 288 | if VERBOSE > 1 289 | if ~rem(nit,VERBOSE) 290 | fprintf(fid,varargin{:}); 291 | end 292 | else 293 | fprintf(fid,varargin{:}); 294 | end 295 | end 296 | end 297 | 298 | 299 | end % end of main routine 300 | -------------------------------------------------------------------------------- /paperExperiments/README.md: -------------------------------------------------------------------------------- 1 | # zeroSR1 toolbox: Experiments 2 | 3 | 4 | This folder contains the Matlab and Python code needed to reproduce the figures from our 2018 paper. 5 | The version of code used here may be slightly different than the updated algorithms in the main repository. 6 | 7 | Some third-party packages (not provided, though we list the URLs) are required if you want to compare with the other solvers mentioned in the paper. 
def fista(model, oracle, options, tol, maxiter, check):
    """FISTA for min_x h(x) := g(x) + f(x).

    f: convex, continuously differentiable with L-Lipschitz gradient.
    g: convex, simple (prox available).

    Update step:
        t_{k+1} = (1 + sqrt(1 + 4 t_k^2)) / 2,  beta_k = (t_k - 1) / t_{k+1}
        y^k     = x^k + beta_k * (x^k - x^{k-1})
        x^{k+1} = prox_{alpha*g}(y^k - alpha * grad f(y^k))

    Parameters
    ----------
    model : dict
        Problem data; only model['N'] is read here (when 'storePoints' is on).
    oracle : dict
        Callables 'fun_f', 'fun_g', 'grad_f', 'prox_g', 'residual', each
        taking (x, ..., model, options).
    options : dict
        Required: 'stepsize' (alpha = 1/L) and 'init' (starting point).
        Optional flags: 'storeResidual', 'storeTime', 'storePoints',
        'storeObjective', 'storeBeta'.  Backtracking is enabled by
        'backtrackingMaxiter' > 1 together with 'backtrackingFactor' in (0,1).
    tol : float
        Tolerance threshold for the (normalized) residual.
    maxiter : int
        Maximal number of iterations.
    check : int
        Print progress every 'check' iterations.

    Returns
    -------
    dict
        'sol', 'breakvalue' (1: iteration limit reached, 2: residual below
        tol, 3: backtracking exhausted) plus any requested 'seq_*' arrays.
    """
    # Default taping options (same mutation of `options` as before).
    for key in ('storeResidual', 'storeTime', 'storePoints',
                'storeObjective', 'storeBeta'):
        options.setdefault(key, False)

    # Backtracking configuration; a single inner iteration means a fixed
    # step size is used throughout.
    backtrackingMaxiter = 1
    backtrackingFactor = 1.0
    if 'backtrackingMaxiter' in options:
        backtrackingMaxiter = options['backtrackingMaxiter']
        backtrackingFactor = options['backtrackingFactor']

    # Oracle functions.
    fun_f = oracle['fun_f']
    fun_g = oracle['fun_g']
    grad_f = oracle['grad_f']
    prox_g = oracle['prox_g']
    residual = oracle['residual']

    alpha = options['stepsize']

    # Initialization.
    x_kp1 = options['init']
    x_k = x_kp1.copy()
    t_k = 1.0
    f_kp1 = fun_f(x_kp1, model, options)
    h_kp1 = f_kp1 + fun_g(x_kp1, model, options)
    res0 = residual(x_kp1, 1.0, model, options)

    # Taping.
    if options['storeResidual']:
        seq_res = zeros(maxiter + 1)
        seq_res[0] = 1
    if options['storeTime']:
        seq_time = zeros(maxiter + 1)
        seq_time[0] = 0
    if options['storePoints']:
        seq_x = zeros((model['N'], maxiter + 1))
        seq_x[:, 0] = x_kp1
    if options['storeObjective']:
        seq_obj = zeros(maxiter + 1)
        seq_obj[0] = h_kp1
    if options['storeBeta']:
        seq_beta = zeros(maxiter)
    time = 0

    # Solve.
    breakvalue = 1
    for it in range(1, maxiter + 1):

        stime = clock.time()

        # Overrelaxation parameter and extrapolated point.
        t_kp1 = 0.5 * (1.0 + sqrt(1.0 + 4.0 * t_k ** 2))
        beta = (t_k - 1.0) / t_kp1
        t_k = t_kp1
        y_k = x_kp1 + beta * (x_kp1 - x_k)
        x_k = x_kp1.copy()
        # Objective value is a scalar: plain rebinding suffices (a built-in
        # float has no .copy(), which the previous code would call).
        f_k = f_kp1

        # Gradient at the extrapolated point.
        grad_k = grad_f(y_k, model, options)

        for iterbt in range(backtrackingMaxiter):

            # Forward (gradient) step followed by backward (prox) step.
            x_kp1 = prox_g(y_k - alpha * grad_k, alpha, model, options)

            # New value of the smooth part of the objective.
            f_kp1 = fun_f(x_kp1, model, options)

            if backtrackingMaxiter == 1:  # fixed step size: no line search
                break

            # Sufficient-decrease test for the backtracking line search.
            dx = x_kp1 - y_k
            Delta = sum(grad_k * dx) + 0.5 / alpha * sum(dx ** 2)
            if f_kp1 < f_k + Delta + 1e-8:
                if iterbt == 0:
                    # First try succeeded: dare a longer step next time.
                    alpha = alpha / backtrackingFactor
                break
            else:
                alpha = alpha * backtrackingFactor
                if iterbt + 1 == backtrackingMaxiter:
                    breakvalue = 3

        # Full objective value and breaking condition.
        h_kp1 = f_kp1 + fun_g(x_kp1, model, options)
        res = residual(x_kp1, res0, model, options)
        if res < tol:
            breakvalue = 2

        # Taping (the timer excludes this bookkeeping).
        time = time + (clock.time() - stime)
        if options['storeResidual']:
            seq_res[it] = res
        if options['storeTime']:
            seq_time[it] = time
        if options['storePoints']:
            seq_x[:, it] = x_kp1
        if options['storeObjective']:
            seq_obj[it] = h_kp1
        if options['storeBeta']:
            seq_beta[it - 1] = beta

        # Progress report; print() keeps this Python-2/3 compatible.
        if it % check == 0:
            print('iter: %d, time: %5f, alpha: %f, beta: %f, res: %f'
                  % (it, time, alpha, beta, res))

        # Handle breaking condition.
        if breakvalue == 2:
            print('Tolerance value reached!!!')
            break
        elif breakvalue == 3:
            print('Not enough backtracking iterations!!!')
            break

    # Return results.
    output = {
        'sol': x_kp1,
        'breakvalue': breakvalue
    }
    if options['storeResidual']:
        output['seq_res'] = seq_res
    if options['storeTime']:
        output['seq_time'] = seq_time
    if options['storePoints']:
        output['seq_x'] = seq_x
    if options['storeObjective']:
        output['seq_obj'] = seq_obj
    if options['storeBeta']:
        output['seq_beta'] = seq_beta
    return output
def fbs(model, oracle, options, tol, maxiter, check):
    """Forward-backward splitting for min_x h(x) := g(x) + f(x).

    f: continuously differentiable with L-Lipschitz gradient.
    g: convex, simple (prox available).

    Update step:
        x^{k+1} = prox_{alpha*g}(x^k - alpha * grad f(x^k)),
        alpha in (0, 2/L), or chosen by backtracking.

    Parameters
    ----------
    model : dict
        Problem data; only model['N'] is read here (when 'storePoints' is on).
    oracle : dict
        Callables 'fun_f', 'fun_g', 'grad_f', 'prox_g', 'residual', each
        taking (x, ..., model, options).
    options : dict
        Required: 'stepsize' (alpha) and 'init' (starting point).
        Optional flags: 'storeResidual', 'storeTime', 'storePoints',
        'storeObjective'.  Backtracking is enabled by
        'backtrackingMaxiter' > 1 together with 'backtrackingFactor' in (0,1).
    tol : float
        Tolerance threshold for the (normalized) residual.
    maxiter : int
        Maximal number of iterations.
    check : int
        Print progress every 'check' iterations.

    Returns
    -------
    dict
        'sol', 'breakvalue' (1: iteration limit reached, 2: residual below
        tol, 3: backtracking exhausted) plus any requested 'seq_*' arrays.
    """
    # Default taping options (same mutation of `options` as before).
    for key in ('storeResidual', 'storeTime', 'storePoints', 'storeObjective'):
        options.setdefault(key, False)

    # Backtracking configuration; a single inner iteration means a fixed
    # step size is used throughout.
    backtrackingMaxiter = 1
    backtrackingFactor = 1.0
    if 'backtrackingMaxiter' in options:
        backtrackingMaxiter = options['backtrackingMaxiter']
        backtrackingFactor = options['backtrackingFactor']

    # Oracle functions.
    fun_f = oracle['fun_f']
    fun_g = oracle['fun_g']
    grad_f = oracle['grad_f']
    prox_g = oracle['prox_g']
    residual = oracle['residual']

    alpha = options['stepsize']

    # Initialization.
    x_kp1 = options['init']
    f_kp1 = fun_f(x_kp1, model, options)
    h_kp1 = f_kp1 + fun_g(x_kp1, model, options)
    res0 = residual(x_kp1, 1.0, model, options)

    # Taping.
    if options['storeResidual']:
        seq_res = zeros(maxiter + 1)
        seq_res[0] = 1
    if options['storeTime']:
        seq_time = zeros(maxiter + 1)
        seq_time[0] = 0
    if options['storePoints']:
        seq_x = zeros((model['N'], maxiter + 1))
        seq_x[:, 0] = x_kp1
    if options['storeObjective']:
        seq_obj = zeros(maxiter + 1)
        seq_obj[0] = h_kp1
    time = 0

    # Solve.
    breakvalue = 1
    for it in range(1, maxiter + 1):

        stime = clock.time()

        # Keep the previous iterate and objective value; these are scalars /
        # arrays respectively, and a built-in float has no .copy(), so the
        # scalar is simply rebound.
        x_k = x_kp1.copy()
        f_k = f_kp1

        # Gradient at the current iterate.
        grad_k = grad_f(x_k, model, options)

        for iterbt in range(backtrackingMaxiter):

            # Forward (gradient) step followed by backward (prox) step.
            x_kp1 = prox_g(x_k - alpha * grad_k, alpha, model, options)

            # New value of the smooth part of the objective.
            f_kp1 = fun_f(x_kp1, model, options)

            if backtrackingMaxiter == 1:  # fixed step size: no line search
                break

            # Sufficient-decrease test for the backtracking line search.
            dx = x_kp1 - x_k
            Delta = sum(grad_k * dx) + 0.5 / alpha * sum(dx ** 2)
            if f_kp1 < f_k + Delta + 1e-8:
                if iterbt == 0:
                    # First try succeeded: dare a longer step next time.
                    alpha = alpha / backtrackingFactor
                break
            else:
                alpha = alpha * backtrackingFactor
                if iterbt + 1 == backtrackingMaxiter:
                    breakvalue = 3

        # Full objective value and breaking condition.
        h_kp1 = f_kp1 + fun_g(x_kp1, model, options)
        res = residual(x_kp1, res0, model, options)
        if res < tol:
            breakvalue = 2

        # Taping (the timer excludes this bookkeeping).
        time = time + (clock.time() - stime)
        if options['storeResidual']:
            seq_res[it] = res
        if options['storeTime']:
            seq_time[it] = time
        if options['storePoints']:
            seq_x[:, it] = x_kp1
        if options['storeObjective']:
            seq_obj[it] = h_kp1

        # Progress report; print() keeps this Python-2/3 compatible.
        if it % check == 0:
            print('iter: %d, time: %5f, alpha: %f, res: %f'
                  % (it, time, alpha, res))

        # Handle breaking condition.
        if breakvalue == 2:
            print('Tolerance value reached!!!')
            break
        elif breakvalue == 3:
            print('Not enough backtracking iterations!!!')
            break

    # Return results.
    output = {
        'sol': x_kp1,
        'breakvalue': breakvalue
    }
    if options['storeResidual']:
        output['seq_res'] = seq_res
    if options['storeTime']:
        output['seq_time'] = seq_time
    if options['storePoints']:
        output['seq_x'] = seq_x
    if options['storeObjective']:
        output['seq_obj'] = seq_obj
    return output
def mfzeroSR1_pg(model, oracle, options, tol, maxiter, check):
    """Monotone zero-memory SR1 proximal quasi-Newton method for
    min_x h(x) := g(x) + f(x).

    Each iteration computes two candidates and keeps the one with the
    smaller objective value (monotone safeguard):

      * a FISTA step z^{k+1} from an extrapolated point, and
      * a proximal quasi-Newton step v^{k+1} with respect to the rank-1
        metric B^k = (1/alpha)*Id - sigma_k * u^k (u^k)', whose inverse is
        applied via the Sherman--Morrison formula.

    See Section 3.3.1 of P. Ochs and T. Pock, "Adaptive FISTA",
    arXiv:1711.04343 (2017).

    Parameters
    ----------
    model : dict
        Problem data; only model['N'] is read here (when 'storePoints' is on).
    oracle : dict
        Callables 'fun_f', 'fun_g', 'grad_f', 'prox_g', 'residual'.
        Note: 'prox_g' here takes six arguments,
        prox_g(x, diag, u, sigma, model, options) — a diagonal +/- rank-1
        proximal mapping.
    options : dict
        Required: 'stepsize' (alpha = 1/L) and 'init' (starting point).
        Optional flags: 'storeResidual', 'storeTime', 'storePoints',
        'storeObjective'.
    tol : float
        Tolerance threshold for the (normalized) residual.
    maxiter : int
        Maximal number of iterations.
    check : int
        Print progress every 'check' iterations.

    Returns
    -------
    dict
        'sol', 'breakvalue' (1: iteration limit reached, 2: residual below
        tol, 5: indefinite metric encountered — iteration continues with the
        diagonal metric) plus any requested 'seq_*' arrays.
    """
    # Default taping options (same mutation of `options` as before).
    for key in ('storeResidual', 'storeTime', 'storePoints', 'storeObjective'):
        options.setdefault(key, False)

    # Oracle functions.
    fun_f = oracle['fun_f']
    fun_g = oracle['fun_g']
    grad_f = oracle['grad_f']
    prox_g = oracle['prox_g']
    residual = oracle['residual']

    alpha = options['stepsize']

    # Initialization.
    x_kp1 = options['init']
    x_k = x_kp1.copy()
    z_kp1 = zeros(x_k.shape)         # FISTA candidate (read at iteration 1)
    u_k = zeros(x_k.shape)           # rank-1 direction (zero until iter > 1)
    grad_k = zeros(x_k.shape)        # copied into grad_km1 at iteration 1
    one = np.ones(x_k.shape)         # diagonal of B_0 scaled by 1/alpha below
    t_kp1 = 1.0
    sigma_k = 0
    f_kp1 = fun_f(x_kp1, model, options)
    h_kp1 = f_kp1 + fun_g(x_kp1, model, options)
    res0 = residual(x_kp1, 1.0, model, options)

    # Taping.
    if options['storeResidual']:
        seq_res = zeros(maxiter + 1)
        seq_res[0] = 1
    if options['storeTime']:
        seq_time = zeros(maxiter + 1)
        seq_time[0] = 0
    if options['storePoints']:
        seq_x = zeros((model['N'], maxiter + 1))
        seq_x[:, 0] = x_kp1
    if options['storeObjective']:
        seq_obj = zeros(maxiter + 1)
        seq_obj[0] = h_kp1
    time = 0

    # Solve.
    breakvalue = 1
    for it in range(1, maxiter + 1):

        stime = clock.time()

        # Shift iterates; objective values are scalars, so plain rebinding
        # suffices (a built-in float has no .copy()).
        x_km1 = x_k.copy()
        x_k = x_kp1.copy()
        grad_km1 = grad_k.copy()
        t_k = t_kp1
        t_kp1 = 0.5 * (1.0 + sqrt(1.0 + 4.0 * t_k ** 2))

        # Extrapolated point built from both the FISTA candidate z and the
        # last two iterates.
        extra_y_k = x_kp1 + t_k / t_kp1 * (z_kp1 - x_k) \
            + (t_k - 1.0) / t_kp1 * (x_k - x_km1)

        # --- FISTA candidate ---
        grad_y = grad_f(extra_y_k, model, options)
        z_kp1 = prox_g(extra_y_k - alpha * grad_y, one / alpha, 0.0, 0,
                       model, options)
        h_z = fun_g(z_kp1, model, options) + fun_f(z_kp1, model, options)

        # --- zero-memory SR1 candidate ---
        grad_k = grad_f(x_k, model, options)

        # Build the rank-1 metric B^k = (1/alpha)*Id - sigma_k * u^k (u^k)'.
        sigma_k = 0
        if it > 1:
            s_k = x_k - x_km1
            y_k = grad_k - grad_km1
            u_k = s_k / alpha - y_k      # d^k = B_0 s^k - y^k, B_0 = Id/alpha
            dts = u_k.T.dot(s_k)
            if abs(dts) >= 1e-8:
                sigma_k = sign(dts)
                u_k = u_k / sqrt(abs(dts))
            if sigma_k < 0:
                # Indefinite metric: fall back to the diagonal metric and
                # flag the event (the iteration continues, as before).
                sigma_k = 0
                breakvalue = 5

        # Forward step x_k - H^k grad f(x_k) using Sherman--Morrison for the
        # rank-1 part.  The division is guarded so that sigma_k == 0 can
        # never produce 0/0 = nan when sum(u_k**2) happens to equal 1/alpha.
        if sigma_k != 0:
            denom = 1.0 / alpha - sum(u_k ** 2)
            v_kp1 = x_k - alpha * grad_k \
                - sigma_k * u_k * u_k.T.dot(alpha * grad_k) / denom
        else:
            v_kp1 = x_k - alpha * grad_k

        # Backward step with respect to the metric B^k.
        v_kp1 = prox_g(v_kp1, one / alpha, u_k, -1.0 * sigma_k, model, options)
        h_v = fun_g(v_kp1, model, options) + fun_f(v_kp1, model, options)

        # Monotone safeguard: keep the candidate with the smaller objective.
        if h_z <= h_v:
            x_kp1 = z_kp1
            h_kp1 = h_z
        else:
            x_kp1 = v_kp1
            h_kp1 = h_v

        # Breaking condition.
        res = residual(x_kp1, res0, model, options)
        if res < tol:
            breakvalue = 2

        # Taping (the timer excludes this bookkeeping).
        time = time + (clock.time() - stime)
        if options['storeResidual']:
            seq_res[it] = res
        if options['storeTime']:
            seq_time[it] = time
        if options['storePoints']:
            seq_x[:, it] = x_kp1
        if options['storeObjective']:
            seq_obj[it] = h_kp1

        # Progress report; print() keeps this Python-2/3 compatible.
        if it % check == 0:
            print('iter: %d, time: %5f, alpha: %f, res: %f'
                  % (it, time, alpha, res))

        # Handle breaking condition.  (The former breakvalue == 4 branch was
        # unreachable — that value was never assigned — and has been removed.)
        if breakvalue == 2:
            print('Tolerance value reached!!!')
            break
        elif breakvalue == 5:
            print('Metric is not positive definite!')

    # Return results.
    output = {
        'sol': x_kp1,
        'breakvalue': breakvalue
    }
    if options['storeResidual']:
        output['seq_res'] = seq_res
    if options['storeTime']:
        output['seq_time'] = seq_time
    if options['storePoints']:
        output['seq_x'] = seq_x
    if options['storeObjective']:
        output['seq_obj'] = seq_obj
    return output
Lip_max (default: 1e10) 56 | .'backtrackingMaxiter' 57 | if > 1, then backtracking is performed, which 58 | requires 'backtrackingFactor', otherwise default 59 | values are set and fixed step size is used througout 60 | default: 20 61 | .'backtrackingFactor' : eta 62 | scaling of the step size when backtracking step 63 | is successful or not; value eta>1 64 | default: 1.1 65 | .'backtrackingAcceptFactor' : sigma 66 | scaling of the sufficient descent term 67 | .'backtrackingHistory' : M 68 | how many old objective values are stored 69 | default: 0 70 | 71 | tol tolerance threshold for the residual 72 | maxiter maximal number of iterations 73 | check provide information after 'check' iterations 74 | 75 | Return: 76 | ------- 77 | output 78 | .'sol' solution of the problems 79 | .'seq_res' sequence of residual values (if activated) 80 | .'seq_time' sequence of time points (if activated) 81 | .'seq_x' sequence of iterates (if activated) 82 | .'seq_obj' sequence of objective values (if activated) 83 | .'breakvalue' code for the type of breaking condition 84 | 1: maximal number of iterations exceeded 85 | 2: breaking condition reached (residual below tol) 86 | 3: not enough backtracking iterations 87 | 88 | Reference: 89 | ---------- 90 | S.J. Wright, R.D. Nowak, and M.A.T. Figueiredo: "Sparse Reconstruction by 91 | Separable Approximation." IEEE Transactions on Signal Processing 57, 92 | No. 7:2479--93. 2009. 
93 | 94 | """ 95 | 96 | # store options 97 | if 'storeResidual' not in options: 98 | options['storeResidual'] = False; 99 | if 'storeTime' not in options: 100 | options['storeTime'] = False; 101 | if 'storePoints' not in options: 102 | options['storePoints'] = False; 103 | if 'storeObjective' not in options: 104 | options['storeObjective'] = False; 105 | 106 | # step size options 107 | Lip_min = 1e-4; 108 | Lip_max = 1e10; 109 | if 'Lipschitz_min' in options: 110 | Lip_min = options['Lipschitz_min']; 111 | if 'Lipschitz_max' in options: 112 | Lip_max = options['Lipschitz_max']; 113 | 114 | # backtracking options 115 | backtrackingMaxiter = 30; 116 | backtrackingFactor = 1.5; 117 | M = 0; 118 | sigma = 1e-4; 119 | if 'backtrackingMaxiter' in options: 120 | backtrackingMaxiter = options['backtrackingMaxiter']; 121 | if 'backtrackingFactor' in options: 122 | backtrackingFactor = options['backtrackingFactor']; 123 | if 'backtrackingAcceptFactor' in options: 124 | sigma = options['backtrackingAcceptFactor']; 125 | if 'backtrackingHistory' in options: 126 | M = options['backtrackingHistory']; 127 | 128 | # load oracle 129 | fun_f = oracle['fun_f']; 130 | fun_g = oracle['fun_g']; 131 | grad_f = oracle['grad_f']; 132 | prox_g = oracle['prox_g']; 133 | residual = oracle['residual']; 134 | 135 | # initialization 136 | Lip = 1.0; # dummy value here 137 | x_kp1 = options['init']; 138 | x_k = x_kp1.copy(); 139 | f_kp1 = fun_f(x_kp1, model, options); 140 | h_kp1 = f_kp1 + fun_g(x_kp1, model, options); 141 | grad_k = grad_f(x_k, model, options); 142 | res0 = residual(x_kp1, 1.0, model, options); 143 | hist_h = -1e10*np.ones(M+1); 144 | 145 | # taping 146 | if options['storeResidual'] == True: 147 | seq_res = zeros(maxiter+1); 148 | seq_res[0] = 1; 149 | if options['storeTime'] == True: 150 | seq_time = zeros(maxiter+1); 151 | seq_time[0] = 0; 152 | if options['storePoints'] == True: 153 | seq_x = zeros((model['N'],maxiter+1)); 154 | seq_x[:,0] = x_kp1; 155 | if 
options['storeObjective'] == True: 156 | seq_obj = zeros(maxiter+1); 157 | seq_obj[0] = h_kp1; 158 | time = 0; 159 | 160 | # solve 161 | breakvalue = 1; 162 | for iter in range(1,maxiter+1): 163 | 164 | stime = clock.time(); 165 | 166 | # update variables 167 | x_km1 = x_k.copy(); 168 | x_k = x_kp1.copy(); 169 | h_k = h_kp1.copy(); 170 | f_k = f_kp1.copy(); 171 | grad_km1 = grad_k.copy(); 172 | hist_h[iter%(M+1)] = h_kp1; 173 | max_h = np.amax(hist_h); 174 | 175 | # compute gradient 176 | grad_k = grad_f(x_k, model, options); 177 | 178 | # compute Barzilai--Borwein step length 179 | if iter>0: 180 | s_k = x_k - x_km1; 181 | y_k = grad_k - grad_km1; 182 | nrm = np.dot(s_k.T, s_k); 183 | if nrm>0: 184 | Lip = np.maximum(Lip_min, np.minimum(Lip_max, \ 185 | np.dot(s_k.T, y_k)/np.dot(s_k.T, s_k) )); 186 | else: 187 | Lip = Lip_max; 188 | 189 | 190 | for iterbt in range(0,backtrackingMaxiter): 191 | 192 | # forward step 193 | x_kp1 = x_k - 1.0/Lip*grad_k; 194 | 195 | # backward step 196 | x_kp1 = prox_g(x_kp1, 1.0/Lip, model, options); 197 | 198 | # compute new value of smooth part of objective 199 | h_kp1 = fun_f(x_kp1, model, options) + fun_g(x_kp1, model, options); 200 | 201 | # no backtracking 202 | if backtrackingMaxiter == 1: 203 | break; 204 | 205 | # check backtracking breaking condition 206 | Delta = -0.5*sigma*Lip*sum((x_kp1 - x_k)**2); 207 | if (h_kp1 < max_h + Delta + 1e-8): 208 | break; 209 | else: 210 | Lip = Lip*backtrackingFactor; 211 | if (iterbt+1 == backtrackingMaxiter): 212 | breakvalue = 3; 213 | 214 | # check breaking condition 215 | res = residual(x_kp1, res0, model, options); 216 | if res < tol: 217 | breakvalue = 2; 218 | 219 | # tape residual 220 | time = time + (clock.time() - stime); 221 | if options['storeResidual'] == True: 222 | seq_res[iter] = res; 223 | if options['storeTime'] == True: 224 | seq_time[iter] = time; 225 | if options['storePoints'] == True: 226 | seq_x[:,iter] = x_kp1; 227 | if options['storeObjective'] == True: 228 | 
seq_obj[iter] = h_kp1; 229 | 230 | # print info 231 | if (iter % check == 0): 232 | print 'iter: %d, time: %5f, Lip: %f, res: %f' % (iter, time, Lip, res); 233 | 234 | 235 | # handle breaking condition 236 | if breakvalue == 2: 237 | print('Tolerance value reached!!!'); 238 | break; 239 | elif breakvalue == 3: 240 | print('Not enough backtracking iterations!!!'); 241 | break; 242 | 243 | 244 | # return results 245 | output = { 246 | 'sol': x_kp1, 247 | 'breakvalue': breakvalue 248 | } 249 | 250 | if options['storeResidual'] == True: 251 | output['seq_res'] = seq_res; 252 | if options['storeTime'] == True: 253 | output['seq_time'] = seq_time; 254 | if options['storePoints'] == True: 255 | output['seq_x'] = seq_x; 256 | if options['storeObjective'] == True: 257 | output['seq_obj'] = seq_obj; 258 | 259 | return output; 260 | 261 | -------------------------------------------------------------------------------- /paperExperiments/groupLasso/Algorithms/TsengZerosSR1_ProximalGradient.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy import zeros, sqrt, sign 3 | import time as clock 4 | 5 | def tseng_zeroSR1_pg(model, oracle, options, tol, maxiter, check): 6 | """ 7 | 8 | Tseng-like Proximal Quasi-Newton algorithm for solving: 9 | 10 | min_{x} h(x); h(x):= g(x) + f(x) 11 | 12 | Update step: See Section 3.3.2 in 13 | 14 | P. Ochs and T. Pock: "Adaptive Fista" ArXiv:1711.04343 [Math], November 12, 2017. 15 | 16 | 17 | Properties: 18 | ----------- 19 | f convex quadratic function with L-Lipschitz continuous gradient 20 | g simple 21 | alpha in (0,1/L) 22 | 23 | Assumption: 24 | ----------- 25 | 26 | y^{k} = B^{k}*s^{k} (secant equation for f) 27 | Holds exactly, when f is quadratic. 
def tseng_zeroSR1_pg(model, oracle, options, tol, maxiter, check):
    """
    Tseng-like proximal quasi-Newton (zero-memory SR1) algorithm for

        min_{x} h(x),   h(x) := g(x) + f(x),

    following Section 3.3.2 of P. Ochs, T. Pock: "Adaptive FISTA",
    arXiv:1711.04343, 2017.

    f is assumed convex quadratic with L-Lipschitz gradient, so the
    secant equation y^k = B^k s^k holds exactly; g is simple.  The
    metric is B^k = (1/alpha)*Id - sigma_k*u_k*u_k' built from

        s_k     = x^k - z^k
        y_k     = grad f(x^k) - grad f(z^k)
        u_k     = s_k/alpha - y_k, normalised by sqrt(|<u_k, s_k>|)
        sigma_k = sign(<u_k, s_k>) in {-1, 0, +1}

    and its inverse is applied via the Sherman--Morrison formula.
    prox_g must support the diagonal +/- rank-1 metric; it is called
    as prox_g(x, diag, u, sigma, model, options).

    Parameter:
    ----------
    model    problem data, passed through to all oracle callbacks
    oracle   dict with 'fun_f', 'fun_g', 'grad_f', 'prox_g', 'residual'
    options  required: 'stepsize' (alpha = 1/L), 'init'
             optional flags: 'storeResidual', 'storeTime',
             'storePoints', 'storeObjective'
    tol      tolerance threshold for the residual
    maxiter  maximal number of iterations
    check    print progress every 'check' iterations

    Return:
    -------
    output   dict with 'sol' and 'breakvalue'
             (1: maxiter exceeded, 2: residual below tol,
              5: SR1 metric was indefinite at some iterate; the
                 diagonal metric was used there instead),
             plus the activated tapes
    """

    # default the optional taping flags
    for flag in ('storeResidual', 'storeTime', 'storePoints', 'storeObjective'):
        if flag not in options:
            options[flag] = False

    # load oracle
    fun_f = oracle['fun_f']
    fun_g = oracle['fun_g']
    grad_f = oracle['grad_f']
    prox_g = oracle['prox_g']
    residual = oracle['residual']

    # load parameter
    alpha = options['stepsize']

    # initialization
    x_kp1 = options['init']
    x_k = x_kp1.copy()
    # NOTE(review): z is initialised to zero rather than to the starting
    # point, matching the reference implementation -- verify intended.
    z_kp1 = zeros(x_k.shape)
    s_k = zeros(x_k.shape)
    y_k = zeros(x_k.shape)
    u_k = zeros(x_k.shape)
    one = np.ones(x_k.shape)
    theta_kp1 = 1.0
    sigma_k = 0
    f_kp1 = fun_f(x_kp1, model, options)
    h_kp1 = f_kp1 + fun_g(x_kp1, model, options)
    res0 = residual(x_kp1, 1.0, model, options)

    # taping
    if options['storeResidual']:
        seq_res = zeros(maxiter+1)
        seq_res[0] = 1
    if options['storeTime']:
        seq_time = zeros(maxiter+1)
        seq_time[0] = 0
    if options['storePoints']:
        seq_x = zeros((model['N'], maxiter+1))
        seq_x[:, 0] = x_kp1
    if options['storeObjective']:
        seq_obj = zeros(maxiter+1)
        seq_obj[0] = h_kp1
    time = 0

    # solve
    breakvalue = 1
    for iter in range(1, maxiter+1):

        stime = clock.time()

        # update variables
        x_k = x_kp1.copy()
        z_k = z_kp1.copy()
        theta_k = theta_kp1

        theta_kp1 = 0.5*(sqrt(theta_k**4 + 4.0*theta_k**2) - theta_k**2)

        # extrapolation
        pre_y_k = (1.0-theta_k)*x_k + theta_k*z_k

        # FISTA step on z (diagonal metric, no rank-1 part)
        alpha_theta = alpha/theta_k
        grad_k = grad_f(pre_y_k, model, options)
        z_kp1 = z_k - alpha_theta*grad_k
        z_kp1 = prox_g(z_kp1, one/alpha_theta, 0.0, 0, model, options)

        # post-combination (kept from the method description; the
        # iterate x_kp1 below is computed from x_k directly)
        v_kp1 = (1-theta_k)*x_k + theta_k*z_kp1

        # gradients for the SR1 secant pair
        grad_k = grad_f(x_k, model, options)
        grad_z_k = grad_f(z_k, model, options)

        # build rank-1 metric B^k = (1/alpha)*Id - sigma_k*u_k*u_k'
        sigma_k = 0
        if iter > 1:
            s_k = x_k - z_k
            y_k = grad_k - grad_z_k

            u_k = s_k/alpha - y_k
            dts = u_k.T.dot(s_k)
            if abs(dts) >= 1e-8:
                sigma_k = sign(dts)
                u_k = u_k/sqrt(abs(dts))

            if sigma_k < 0:
                # indefinite metric: fall back to the diagonal metric
                sigma_k = 0
                breakvalue = 5

        # forward step x^{k+1} = x^k - (B^k)^{-1} grad f(x^k),
        # inverse applied via Sherman--Morrison
        x_kp1 = x_k - alpha*grad_k \
            - sigma_k*(u_k.dot(u_k.T.dot(alpha*grad_k)))/(1.0/alpha - sum(u_k**2))

        # backward step w.r.t. the metric B^k
        x_kp1 = prox_g(x_kp1, one/alpha, u_k, -1.0*sigma_k, model, options)

        # new objective value (bug fix: was stored into a separate
        # h_xkp1 variable, so the tape never saw the updated value)
        h_kp1 = fun_g(x_kp1, model, options) + fun_f(x_kp1, model, options)

        # check breaking condition
        res = residual(x_kp1, res0, model, options)
        if res < tol:
            breakvalue = 2

        # tape residual
        time = time + (clock.time() - stime)
        if options['storeResidual']:
            seq_res[iter] = res
        if options['storeTime']:
            seq_time[iter] = time
        if options['storePoints']:
            seq_x[:, iter] = x_kp1
        if options['storeObjective']:
            seq_obj[iter] = h_kp1

        # print info (parenthesized print works in Python 2 and 3)
        if iter % check == 0:
            print('iter: %d, time: %5f, alpha: %f, res: %f' % (iter, time, alpha, res))

        # handle breaking condition
        if breakvalue == 2:
            print('Tolerance value reached!!!')
            break
        elif breakvalue == 5:
            print('Metric is not positive definite!')

    # return results
    output = {
        'sol': x_kp1,
        'breakvalue': breakvalue
    }
    if options['storeResidual']:
        output['seq_res'] = seq_res
    if options['storeTime']:
        output['seq_time'] = seq_time
    if options['storePoints']:
        output['seq_x'] = seq_x
    if options['storeObjective']:
        output['seq_obj'] = seq_obj

    return output
def zeroSR1_pg(model, oracle, options, tol, maxiter, check):
    """
    Line-search proximal quasi-Newton (zero-memory SR1) algorithm for

        min_{x} h(x),   h(x) := g(x) + f(x).

    Update step:

        z^k     = prox_{g}^{B^k}(x^k - H^k grad f(x^k))
        x^{k+1} = x^k + eta_k*(z^k - x^k)

    where eta_k is found by an Armijo-like backtracking line search

        h(x^{k+1}) <= h(x^k) + gamma*eta_k*Delta_k,
        Delta_k = <grad f(x^k), z^k - x^k> + 1/(2*tau)*||z^k - x^k||^2.

    The metric is B^k = (1/tau)*Id - sigma_k*u_k*u_k' built from the
    SR1 secant pair

        s_k     = x^k - x^{k-1}
        y_k     = grad f(x^k) - grad f(x^{k-1})
        u_k     = s_k - tau*y_k, normalised by sqrt(|<u_k, y_k>|)
        sigma_k = sign(<u_k, y_k>) in {-1, 0, +1}

    with tau a safeguarded (scaled) BB2 step size.  Its inverse is
    applied via the Sherman--Morrison formula.  prox_g must support
    the diagonal +/- rank-1 metric; it is called as
    prox_g(x, diag, u, sigma, model, options).

    Parameter:
    ----------
    model    problem data, passed through to all oracle callbacks
    oracle   dict with 'fun_f', 'fun_g', 'grad_f', 'prox_g', 'residual'
    options  required: 'stepsize' (tau0 = 1/L), 'gamma' in (0,1),
             'init', 'eta0' (initial line-search parameter),
             'delta' (line-search shrink factor),
             'lineSearchMaxiter' (0 disables the line search)
             optional flags: 'storeResidual', 'storeTime',
             'storePoints', 'storeObjective', 'storeBeta'
    tol      tolerance threshold for the residual
    maxiter  maximal number of iterations
    check    print progress every 'check' iterations

    Return:
    -------
    output   dict with 'sol' and 'breakvalue'
             (1: maxiter exceeded or line search exhausted,
              2: residual below tol,
              5: SR1 metric was indefinite at some iterate),
             plus the activated tapes (incl. 'seq_beta')
    """

    # default the optional taping flags
    for flag in ('storeResidual', 'storeTime', 'storePoints',
                 'storeObjective', 'storeBeta'):
        if flag not in options:
            options[flag] = False

    # load oracle
    fun_f = oracle['fun_f']
    fun_g = oracle['fun_g']
    grad_f = oracle['grad_f']
    prox_g = oracle['prox_g']
    residual = oracle['residual']

    # load parameter
    Lip = 1/options['stepsize']
    tau0 = options['stepsize']
    gamma = options['gamma']
    eta0 = options['eta0']
    delta = options['delta']
    lineSearchMaxiter = options['lineSearchMaxiter']
    tau_scaling = 0.8  # safety factor applied to the BB2 step size

    # initialization
    x_kp1 = options['init']
    x_k = x_kp1.copy()
    z_k = zeros(x_k.shape)
    s_k = zeros(x_k.shape)
    y_k = zeros(x_k.shape)
    u_k = zeros(x_k.shape)
    one = np.ones(x_k.shape)
    grad_k = zeros(x_k.shape)
    eta = eta0
    sigma_k = 0
    f_kp1 = fun_f(x_kp1, model, options)
    h_kp1 = f_kp1 + fun_g(x_kp1, model, options)
    res0 = residual(x_kp1, 1.0, model, options)

    # taping
    if options['storeResidual']:
        seq_res = zeros(maxiter+1)
        seq_res[0] = 1
    if options['storeTime']:
        seq_time = zeros(maxiter+1)
        seq_time[0] = 0
    if options['storePoints']:
        seq_x = zeros((model['N'], maxiter+1))
        seq_x[:, 0] = x_kp1
    if options['storeObjective']:
        seq_obj = zeros(maxiter+1)
        seq_obj[0] = h_kp1
    if options['storeBeta']:
        seq_beta = zeros(maxiter)
    time = 0

    # solve
    breakvalue = 1
    for iter in range(1, maxiter+1):

        stime = clock.time()

        # shift iterates; the objective value is a scalar, so plain
        # assignment suffices (no .copy(), which fails on floats)
        x_km1 = x_k.copy()
        x_k = x_kp1.copy()
        grad_km1 = grad_k.copy()
        h_k = h_kp1

        # compute gradient
        grad_k = grad_f(x_k, model, options)

        # build rank-1 metric B^k = (1/tau)*Id - sigma_k*u_k*u_k'
        sigma_k = 0
        tau = tau0
        if iter > 1:
            s_k = x_k - x_km1
            y_k = grad_k - grad_km1

            # safeguarded BB2 step size tau = 0.8*<s,y>/<y,y>
            nrm_yk = np.dot(y_k.T, y_k)
            if nrm_yk > 1e-8:
                tau = tau_scaling*np.dot(s_k.T, y_k)/nrm_yk

            H0 = tau

            u_k = s_k - H0*y_k
            dts = u_k.T.dot(y_k)
            if abs(dts) >= 1e-8:
                sigma_k = sign(dts)
                u_k = u_k/sqrt(abs(dts))

            if sigma_k < 0:
                # indefinite metric: fall back to the diagonal metric
                sigma_k = 0
                breakvalue = 5

        # forward step z^k = x^k - (B^k)^{-1} grad f(x^k),
        # inverse applied via Sherman--Morrison
        z_k = x_k - tau*grad_k \
            - sigma_k*(u_k.dot(u_k.T.dot(tau*grad_k)))/(1.0/tau - sum(u_k**2))

        # backward step w.r.t. the metric B^k
        z_k = prox_g(z_k, one/tau, u_k, -1.0*sigma_k, model, options)

        # predicted-decrease term for the line search
        Delta = 0
        if lineSearchMaxiter > 0:
            dx = z_k - x_k
            Delta = sum(grad_k*dx) + 0.5/tau*sum(dx**2)
        else:
            # no line search: accept the full step (bug fix: the
            # objective is now re-evaluated so the tape and h_k stay
            # consistent with the accepted iterate)
            eta = 1
            x_kp1 = z_k
            h_kp1 = fun_f(x_kp1, model, options) + fun_g(x_kp1, model, options)

        # line search
        for iterls in range(0, lineSearchMaxiter):

            # trial point
            x_kp1 = x_k + eta*(z_k - x_k)

            # compute new objective value
            h_kp1 = fun_f(x_kp1, model, options) + fun_g(x_kp1, model, options)

            # a single trial step is accepted unconditionally
            if lineSearchMaxiter <= 1:
                break

            # Armijo-like acceptance test
            if h_kp1 < h_k + eta*gamma*Delta + 1e-8:
                if iterls == 0:
                    # immediate success: restart from the initial eta
                    eta = eta0
                break
            else:
                eta = eta*delta
                if iterls+1 == lineSearchMaxiter:
                    breakvalue = 3

        # check breaking condition
        res = residual(x_kp1, res0, model, options)
        if res < tol:
            breakvalue = 2

        # tape residual
        time = time + (clock.time() - stime)
        if options['storeResidual']:
            seq_res[iter] = res
        if options['storeTime']:
            seq_time[iter] = time
        if options['storePoints']:
            seq_x[:, iter] = x_kp1
        if options['storeObjective']:
            seq_obj[iter] = h_kp1
        if options['storeBeta']:
            # diagnostic extrapolation parameter; NOTE(review): the
            # denominator <Md, s_k> may vanish -- confirm upstream use
            Md = Lip*s_k - y_k
            if iter > 1:
                beta = np.dot(Md.T, z_k - x_k)/np.dot(Md.T, s_k)
            else:
                beta = 0
            seq_beta[iter-1] = beta

        # print info (parenthesized print works in Python 2 and 3)
        if iter % check == 0:
            print('iter: %d, time: %5f, Lip: %f, eta: %f, res: %f' % (iter, time, Lip, eta, res))

        # handle breaking condition
        if breakvalue == 2:
            print('Tolerance value reached!!!')
            break
        elif breakvalue == 3:
            # line search exhausted: warn but keep iterating,
            # as in the reference implementation
            print('Not enough backtracking iterations!!!')
            breakvalue = 1
        elif breakvalue == 5:
            print('Metric is not positive definite!')

    # return results
    output = {
        'sol': x_kp1,
        'breakvalue': breakvalue
    }
    if options['storeResidual']:
        output['seq_res'] = seq_res
    if options['storeTime']:
        output['seq_time'] = seq_time
    if options['storePoints']:
        output['seq_x'] = seq_x
    if options['storeObjective']:
        output['seq_obj'] = seq_obj
    if options['storeBeta']:
        output['seq_beta'] = seq_beta

    return output
4 | 5 | -------------------------------------------------------------------------------- /paperExperiments/groupLasso/README.md: -------------------------------------------------------------------------------- 1 | # ZeroSR1 GroupLasso Experiment 2 | 3 | The code is written in Python and uses a simple C-interface for solving the rank-1 proximal mapping more efficiently. The file `test_groupLasso.py` reproduces the code for the GroupLasso Experiment from the 2018 paper. 4 | 5 | # Installation 6 | * Go to the folder `clib` and compile `mymath.cpp` using the `Makefile` that is provided in that folder. 7 | * Then, you can run `python test_groupLasso.py` from the folder `zeroSR1/paperExperiments/groupLasso/`. 8 | 9 | # Problem 10 | The optimization problem is generated and solved in `test_groupLasso.py` using several methods. 11 | 12 | ## Usage 13 | In order to measure the error to the optimal value, set the flag `compute_optimal_value = True`, which runs (by default) FISTA with 50000 iterations and writes the optimal value to the file `data_group_lasso.npy`. Once this run finished, set `compute_optimal_value = False` and evaluate the implemented algorithms. 14 | 15 | ## Implemented Algorithms 16 | * Forward-Backward Splitting 17 | * FISTA 18 | * Zero SR1 Proximal Quasi-Newton (with rank-1 prox implemented in C) 19 | * Monotone Fast Zero SR1 Proximal Quasi-Newton 20 | * Tseng Fast Zero SR1 Proximal Quasi-Newton 21 | * Sparse Reconstruction by Separable Approximation 22 | 23 | Rank-1 proximal mappings are implemented in C (see folder `clib`). 24 | 25 | ## Parameters 26 | For the parameters, we refer to `test_groupLasso.py` and the implementations in the folder `Algorithms`. 
27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /paperExperiments/groupLasso/clib/Makefile: -------------------------------------------------------------------------------- 1 | default: 2 | g++ -fPIC mymath.cpp -shared -o mymath.so 3 | -------------------------------------------------------------------------------- /paperExperiments/groupLasso/clib/mymath.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include "mymath.h" 6 | #include 7 | #include 8 | 9 | 10 | 11 | 12 | 13 | class Prox_rk1_generic 14 | { 15 | public: 16 | Prox_rk1_generic( 17 | double* _x, // solution of the proximal mapping 18 | double* _x0, // proximal center (const) 19 | double* _d, // diagonal of the diagonal part of the metric 20 | double* _u, // rank1 part of the metrix 21 | double _sigma, // sign of the rank1 part of the metric 22 | int _N) // dimension of the problem 23 | { 24 | x = _x; 25 | x0 = _x0; 26 | d = _d; 27 | u = _u; 28 | sigma = _sigma; 29 | N = _N; 30 | // allocate memory for an auxiliary variable 31 | x_tilde = new double[N]; 32 | oneside_shift = 1e10; 33 | } 34 | ~Prox_rk1_generic() 35 | { 36 | delete[] x_tilde; 37 | } 38 | 39 | void solve() 40 | { 41 | // get breakpoints 42 | std::vector bpts; 43 | get_breakpoints(bpts); 44 | 45 | // sort list of breakpoints 46 | sort( bpts.begin(), bpts.end() ); 47 | bpts.erase( unique( bpts.begin(), bpts.end() ), bpts.end() ); 48 | int nbpts = bpts.size(); 49 | 50 | // Now, we search for the interval between two (adjacent) breakpoints 51 | // that contains the root of $p(a) := a - = 0$, where x(a) 52 | // is the prox evaluated at a. 53 | // The algorithmic strategy is binary search / bisectioning, which can be 54 | // done, since p(a) is monotonically increasing. 
55 | int idx_la = 0; // index of left interval border 56 | int idx_ra = nbpts-1; // index of right interval border 57 | double la, ra; // left and right interval borders 58 | 59 | if (nbpts == 0) 60 | { 61 | //std::cout << "Find root in (-infty,infty)" << std::endl; 62 | find_root(-oneside_shift,oneside_shift); 63 | return; 64 | } 65 | 66 | // check left border 67 | if (value(bpts[idx_la]) > 0) 68 | { 69 | // The zero of p(a) is in (-\infty,bpts(idx_la)]. 70 | //std::cout << "Find root in (-infty," << bpts[idx_la] << ")" << std::endl; 71 | find_root(bpts[idx_la]-oneside_shift,bpts[idx_la]); 72 | return; 73 | } 74 | 75 | // check right border 76 | if (value(bpts[idx_ra]) < 0) 77 | { 78 | // The zero of p(a) is in [bpts(idx_ra),+\infty) 79 | //std::cout << "Find root in (" << bpts[idx_ra] << ",infty)" << std::endl; 80 | find_root(bpts[idx_ra],bpts[idx_ra]+oneside_shift); 81 | return; 82 | } 83 | 84 | // find interval with zero of p(a) 85 | int maxiter = (int)(ceil(log(nbpts)/log(2.0))+1); 86 | int j; 87 | for (int i=0; i& bpts) = 0; 115 | 116 | // computes a - dot(u.T, x(a)-x0) 117 | double value (double a) 118 | { 119 | for (int i=0; i 1e-8) 197 | { 198 | std::cout << "WARNING! Rank1 prox could not be solved accurately. 
Error: " 199 | << err << std::endl; 200 | } 201 | 202 | } 203 | 204 | virtual void prox_diag(double* x_tilde) = 0; 205 | virtual void get_breakpoints(std::vector& bpts) = 0; 206 | 207 | }; 208 | 209 | 210 | 211 | 212 | 213 | 214 | //////////////////////////////////////////////////////////////////////////////// 215 | //////////////////////////////////////////////////////////////////////////////// 216 | //////////////////////////////////////////////////////////////////////////////// 217 | //////////////////////////////////////////////////////////////////////////////// 218 | //////////////////////////////////////////////////////////////////////////////// 219 | //////////////////////////////////////////////////////////////////////////////// 220 | 221 | 222 | 223 | 224 | 225 | 226 | class Prox_rk1_generic_PS : public Prox_rk1_generic 227 | { 228 | public: 229 | Prox_rk1_generic_PS( 230 | double* _x, // solution of the proximal mapping 231 | double* _x0, // proximal center (const) 232 | double* _d, // diagonal of the diagonal part of the metric 233 | double* _u, // rank1 part of the metrix 234 | double _sigma, // sign of the rank1 part of the metric 235 | int _N) // dimension of the problem 236 | : Prox_rk1_generic(_x,_x0,_d,_u,_sigma,_N) 237 | { 238 | use_a_init = false; 239 | }; 240 | 241 | 242 | 243 | void find_root(double la, double ra) 244 | { 245 | 246 | // initialization 247 | double a = 0.0; 248 | if (use_a_init) 249 | { 250 | a = a_init; 251 | } 252 | a = fmax(la, fmin(ra, a)); 253 | double tau = 1.0; 254 | double pa, dp_da; 255 | for (int iter=0; iter<20; ++iter) 256 | { 257 | pa = value(a); 258 | if (fabs(pa) < 1e-8) break; // breaking condition 259 | dp_da = derivative(a); 260 | a = a - tau*pa/dp_da; 261 | tau = tau*0.95; 262 | } 263 | // sanity check 264 | double err = value(a); // Warning: This also modifies the output! 265 | if (fabs(err) > 1e-8) 266 | { 267 | std::cout << "WARNING! Rank1 prox could not be solved accurately. 
Error: " 268 | << err << std::endl; 269 | } 270 | if (use_a_init) 271 | { 272 | a_init = a; 273 | } 274 | 275 | } 276 | virtual double derivative(double a) = 0; 277 | 278 | virtual void prox_diag(double* x_tilde) = 0; 279 | virtual void get_breakpoints(std::vector& bpts) = 0; 280 | 281 | // use this to do warm start in find_root 282 | bool use_a_init; 283 | double a_init; 284 | 285 | 286 | 287 | }; 288 | 289 | 290 | 291 | class Prox_rk1_groupl2l1 : public Prox_rk1_generic_PS 292 | { 293 | public: 294 | Prox_rk1_groupl2l1( 295 | double* _x, // solution of the proximal mapping 296 | double* _x0, // proximal center (const) 297 | double* _d, // diagonal of the diagonal part of the metric 298 | double* _u, // rank1 part of the metrix 299 | double _sigma, // sign of the rank1 part of the metric 300 | int _N) // dimension of the problem 301 | : Prox_rk1_generic_PS(_x,_x0,_d,_u,_sigma,_N) { }; 302 | 303 | 304 | double derivative(double a) 305 | { 306 | int j,k; 307 | double da = 0.0; 308 | double da_b; 309 | double d_b; 310 | double nrm; 311 | double nrm_db_inv; 312 | double dot_xu_b; 313 | 314 | for (k=0; k 327 | 328 | if (nrm > 1.0/d_b) 329 | { 330 | da_b = 0.0; 331 | for (j=B[k]; j/(d_b*|x_b|) 336 | // 337 | da_b = (1.0-nrm_db_inv)*u[j]/d_b 338 | + (x_tilde[j]*dot_xu_b)*nrm_db_inv/nrm; 339 | 340 | da += u[j]*da_b; // = 341 | } 342 | } 343 | } 344 | da = 1.0 + sigma*da; // = 1.0 + sigma* 345 | 346 | return da; 347 | } 348 | 349 | void prox_diag(double* x_tilde) 350 | { 351 | 352 | double tmp; 353 | for (int k=0; k& bpts) 382 | { 383 | // find breakpoints 384 | bpts.reserve(2*lenB); 385 | double AA, BB, CC; 386 | double d_b; 387 | double dis; 388 | for (int i=0; i+1 0: 400 | file = open("GroupLasso_conv_"+nams[i]+"_time.dat", "w"); 401 | for j in range(0,maxiter,1): 402 | file.write("%f %.12f\n" %(ts[i][j], rs[i][j])); 403 | file.close(); 404 | file = open("GroupLasso_conv_"+nams[i]+"_iter.dat", "w"); 405 | for j in range(0,maxiter,1): 406 | file.write("%d %.12f\n" %(j, 
function varargout = proj_rank1_Rplus(varargin)
% PROJ_RANK1_RPLUS returns the scaled proximity operator for non-negativity constraints
%
%   x = proj_rank1_Rplus( x0, D, u )
% solves
%   x = argmin_{x} h(x) + 1/2||x-x0||^2_{V}
% with
%   V^{-1} = D + u*u'   (or diag(D) + u*u' if D is a vector),
% where h(x) is the indicator function of the set { x : x >= 0 }.
% "D" must be diagonal and positive; "u" can be any vector.
%
% Variants:
%   x = proj_rank1_Rplus( x0, D, u, lambda, linTerm, sigma, inverse )
% solves
%   x = argmin_{x} h(lambda.*x) + 1/2||x-x0||^2_{V} + linTerm'*x
% with either V^{-1} = D + sigma*u*u' if "inverse" is true (default)
% or          V      = D + sigma*u*u' if "inverse" is false,
% where "sigma" is +1 (default) or -1, and "lambda" must be non-zero.
%
% Stephen Becker, Feb 26 2014, stephen.beckr@gmail.com
% Reference: "A quasi-Newton proximal splitting method" by S. Becker
% and J. Fadili, NIPS 2012, http://arxiv.org/abs/1206.1156
%
% See also prox_rank1_generic.m

% Elementwise projection onto the non-negative orthant.  Because this
% is a projection, the scaling argument t has no effect, and the only
% breakpoint of the prox sits at the origin.
prox         = @(x,t) max(0, x);
prox_brk_pts = @(s) 0;

% All argument handling is delegated to the generic rank-1 prox solver.
[varargout{1:nargout}] = prox_rank1_generic( prox, prox_brk_pts, varargin{:} );
10 | % 11 | % Here, h(x) is the indicator function of the set 12 | % { x : lwr <= x <= upr } 13 | % (Set any component of lwr to -Inf and upr to +Inf to effectively 14 | % ignore those particular constraints) 15 | % 16 | % There are also variants: 17 | % x = proj_rank1_box( lwr, upr, x0, D, u, lambda, linTerm, sigma, inverse) 18 | % returns 19 | % x = argmin_{x} h(lambda.*x) + 1/2||x-x0||^2_{V} + linTerm'*x 20 | % and 21 | % either V^{-1} = D + sigma*u*u' if "inverse" is true (default) 22 | % or V = D + sigma*u*u' if "inverse" is false 23 | % and in both cases, "sigma" is either +1 (default) or -1. 24 | % "lambda" should be non-zero 25 | % 26 | % Note that UNLIKE prox_rank1_l1.m and other functions, the calling 27 | % sequence is slightly different, since you must pass in "lwr" and "upr" 28 | % 29 | % Stephen Becker, Feb 26 2014, stephen.beckr@gmail.com 30 | % Reference: "A quasi-Newton proximal splitting method" by S. Becker and J. Fadili 31 | % NIPS 2012, http://arxiv.org/abs/1206.1156 32 | % 33 | % See also prox_rank1_generic.m 34 | 35 | prox = @(x,t) max( min(upr,x), lwr ); 36 | prox_brk_pts = @(s) [lwr,upr]; % since projection, scaling has no effect 37 | 38 | [varargout{1:nargout}] = prox_rank1_generic( prox, prox_brk_pts, varargin{:} ); -------------------------------------------------------------------------------- /proxes/proj_rank1_linf.m: -------------------------------------------------------------------------------- 1 | function varargout = proj_rank1_linf(varargin) 2 | % PROJ_RANK1_LINF returns the scaled proximity operator for l_infinity norm constraints 3 | % 4 | % x = proj_rank1_linf( x0, D, u ) 5 | % where 6 | % x = argmin_{x} h(x) + 1/2||x-x0||^2_{V} 7 | % and 8 | % V^{-1} = D + u*u' (or diag(D) + u*u' if D is a vector) 9 | % "D" must be diagonal and positive. "u" can be any vector. 
10 | % 11 | % Here, h(x) is the indicator function of the l_infinity ball, i.e., 12 | % { x | norm(x,inf) <= 1 } 13 | % To scale the ball, just use the scaling parameter "lambda" (see below) 14 | % 15 | % There are also variants: 16 | % x = proj_rank1_linf( x0, D, u, lambda, linTerm, sigma, inverse) 17 | % returns 18 | % x = argmin_{x} h(lambda.*x) + 1/2||x-x0||^2_{V} + linTerm'*x 19 | % and 20 | % either V^{-1} = D + sigma*u*u' if "inverse" is true (default) 21 | % or V = D + sigma*u*u' if "inverse" is false 22 | % and in both cases, "sigma" is either +1 (default) or -1. 23 | % "lambda" should be non-zero 24 | % 25 | % Stephen Becker, Feb 26 2014, stephen.beckr@gmail.com 26 | % Reference: "A quasi-Newton proximal splitting method" by S. Becker and J. Fadili 27 | % NIPS 2012, http://arxiv.org/abs/1206.1156 28 | % 29 | % See also prox_rank1_generic.m 30 | 31 | prox = @(x,t) sign(x).*min( 1, abs(x) ); 32 | prox_brk_pts = @(s) [-ones(size(s)),ones(size(s))]; % since projection, scaling has no effect 33 | 34 | [varargout{1:nargout}] = prox_rank1_generic( prox, prox_brk_pts, varargin{:} ); -------------------------------------------------------------------------------- /proxes/prox_rank1_generic.m: -------------------------------------------------------------------------------- 1 | function [x,a,cnt] = prox_rank1_generic( prox, prox_brk_pts, x0, D, u, lambda, linTerm, plusminus, INVERT ) 2 | % PROX_RANK1_GENERIC returns the scaled proximity operator for a generic function h 3 | % (provided the generic function is separable and has a piece-wise linear prox) 4 | % This function is intended be used as follows: 5 | % 6 | % (1) Instantiate: 7 | % scaledProx = @(varargin) prox_rank1_generic( prox, prox_brk_pts,varargin{:}) 8 | % where 'prox' and 'prox_brk_pts' implicitly define the function h 9 | % i.e., prox(x0,t) = argmin_{x} t*h(x) + 1/2||x-x0||^2 10 | % and 11 | % prox_brk_pts(t) returns a row-vector with the break points 12 | % that specify where t*h(x) is piecewise 
linear 13 | % (this is if h(x) = [ h_1(x_1); ... ; h_n(x_n) ]. If instead not 14 | % all the h_i are identical, prox_brk_pts(t) should return 15 | % a matrix). 16 | % See the examples below because prox_brk_pts must allow a vector "t" 17 | % so you must define this appropriately. 18 | % 19 | % (2) Call the "scaledProx" function, which has signature: 20 | % x = scaledProx( x0, D, u ) 21 | % where 22 | % x = argmin_{x} h(x) + 1/2||x-x0||^2_{V} 23 | % and 24 | % V^{-1} = D + u*u' (or diag(D) + u*u' if D is a vector) 25 | % "D" must be diagonal and positive. "u" can be any vector. 26 | % 27 | % There are also variants: 28 | % 29 | % x = scaledProx( x0, D, u, lambda, linTerm, sigma, inverse) 30 | % returns 31 | % x = argmin_{x} h(lambda.*x) + 1/2||x-x0||^2_{V} + linTerm'*x 32 | % and 33 | % either V^{-1} = D + sigma*u*u' if "inverse" is true (default) 34 | % or V = D + sigma*u*u' if "inverse" is false 35 | % and in both cases, "sigma" is either +1 (default) or -1. 36 | % "lambda" should be non-zero 37 | % 38 | % Examples: 39 | % 1. if h(x) = ||x||_1 then 40 | % prox = @(x,t) sign(x).*max(0, abs(x) - t ); 41 | % prox_brk_pts = @(t) [-t,t]; 42 | % 2. if h(x) is the indicator function of the set { x : x >= 0}, then 43 | % prox = @(x,t) max(0, x); 44 | % prox_brk_pts = @(t) 0; 45 | % 3. if h(x) is the indicator function of the set { x : lwr <= x <= upr } 46 | % where lwr and upr are vectors, then 47 | % prox = @(x,t) max( min(upr,x), lwr ); 48 | % prox_brk_pts = @(t) [lwr,upr]; (Note: this is a matrix) 49 | % 4. if h(x) is the hinge-loss h(x) = max( 1-x, 0 ), then 50 | % prox = @(x,t) 1 + (x-1).*( x > 1 ) + (x + t - 1).*( x + t < 1 ); 51 | % prox_brk_pts = @(t)[ones(size(t)), 1-t]; 52 | % 5. 
if h(x) is the indicator function of the l_infinity ball, then 53 | % prox = @(x,t) sign(x).*min( 1, abs(x) ); 54 | % prox_brk_pts = @(t) [-ones(size(t)),ones(size(t))]; 55 | % 56 | % 57 | % Stephen Becker, Feb 26 2014, stephen.beckr@gmail.com 58 | % Reference: "A quasi-Newton proximal splitting method" by S. Becker and J. Fadili 59 | % NIPS 2012, http://arxiv.org/abs/1206.1156 60 | 61 | PRINT = false; % set to "true" for debugging purposes 62 | if PRINT 63 | dispp = @disp; 64 | printf = @fprintf; 65 | else 66 | dispp = @(varargin) 1; 67 | printf = @(varargin) 1; 68 | end 69 | dispp(' '); 70 | 71 | n = length(x0); 72 | if nargin < 5 || isempty(u), u = 0; end 73 | if nargin < 6, lambda = []; end 74 | if nargin < 7, linTerm = []; end 75 | if nargin < 8 || isempty(plusminus), plusminus = 1; end 76 | assert( plusminus==-1 | plusminus==+1 ) 77 | if nargin < 9 || isempty(INVERT), INVERT = true; end 78 | 79 | if size(D,2) > 1, d = diag(D); else d = D; end % extract diagonal part 80 | if any( d < 0 ), error('D must only have strictly positive entries'); end 81 | 82 | if all( u==0 ) 83 | % Just a diagonal scaling, so this code is overkill, 84 | % but we should be able to handle it for consistency. 85 | NO_U = true; 86 | else 87 | NO_U = false; 88 | if numel(u) > length(u) 89 | error('u must be a vector, not a matrix'); 90 | end 91 | end 92 | 93 | % Now, V > 0 (i.e., V is positive definite) iff V^{-1} exists and V^{-1} > 0 94 | % So V^{-1} > 0 is automatically true if sigma = + 1 95 | % If sigma = -1, then it could be indefinite or semidefinite 96 | % 97 | % It is possible to check all eigenvalues in O(n^2) rather than O(n^3) 98 | % but it's not particularly simple to implement. 
99 | % See http://www.stat.uchicago.edu/~lekheng/courses/309f10/modified.pdf 100 | % Golub, 1973, "Some Modified Matrix Eigenvalue Problems" 101 | % http://epubs.siam.org/doi/abs/10.1137/1015032 102 | % But in the special case when D is a scaled identity, checking is very easy: 103 | if plusminus < 0 && all( d==d(1) ) 104 | minE = d(1) + plusminus*norm(u)^2; 105 | if minE <= 0, error('The scaling matrix is not positive definite'); end 106 | end 107 | 108 | % this comes from the Sherman-Morrison-Woodbury formula: 109 | if NO_U 110 | uinv = 0; 111 | else 112 | uinv = (u./d)/sqrt(1+u'*(u./d)); 113 | end 114 | % In all cases, we find prox_h^V, but how we define V 115 | % in terms of d and u depends on "INVERT" 116 | if INVERT 117 | % So V^{-1} = diag(d) + sigma*u*u' 118 | % and V = diag(1./d) - sigma*uinv*uinv'; 119 | Vinv = @(y) d.*y + plusminus*(u'*y)*u; 120 | 121 | % The code below expects V = diag(dd) + sigma*uu*uu', so... 122 | dd = 1./d; 123 | uu = uinv; 124 | plusminus = -plusminus; 125 | 126 | % The code also requires uu./dd and 1./dd, so define these here 127 | % ud = uu./dd; 128 | ud = u/sqrt(1+u'*(u./d)); % more accurate? % 6.01e-3 error 129 | dInv = 1./dd; 130 | else 131 | % Here, V = diag(d) + sigma*u*u' 132 | % and V^{-1} = diag(1./d) - sigma*uinv*uinv'; 133 | Vinv = @(y) y./d - plusminus*(uinv'*y)*uinv; 134 | 135 | % The code below expects V = diag(dd) + sigma*uu*uu', so... 136 | dd = d; 137 | uu = u; 138 | %plusminus = plusminus; 139 | 140 | % The code also requires uu./dd and 1./dd, so define these here 141 | ud = uu./dd; 142 | dInv = 1./dd; 143 | end 144 | if NO_U, uu = 0; ud = 0; end % any value, since we won't use them... 145 | if ~isempty(lambda) 146 | % We make a change of variables, e.g., x <-- lambda*.x 147 | % change x0 <-- lambda.*x0, linTerm <-- linTerm./lambda 148 | % and V <-- diag(1./lambda)*V*diag(1./lambda). Because V is defined 149 | % implicitly, and depends on INVERT, this is a bit of a headache. 
150 | % We'll do some changes here, and some later in the code 151 | % e.g., combine linTerm and V scaling so we don't have to redefine Vinv 152 | if any(lambda==0), error('scaling factor lambda must be non-zero'); end 153 | % note that lambda < 0 should be OK 154 | x0 = lambda.*x0; 155 | 156 | % Scale V = diag(dd) + sigma*uu*uu' by V <-- diag(1./lambda)*V*diag(1./lambda) 157 | dd = dd./(lambda.^2); 158 | uu = uu./lambda; 159 | ud = ud.*lambda; 160 | dInv = 1./dd; 161 | end 162 | 163 | t = prox_brk_pts(1./dd); 164 | if size(t,1) < n 165 | if size(t,1) > 1 166 | error('"prox_brk_pts" should return a ROW VECTOR of break points'); 167 | end 168 | % otherwise, assume each component identical, so scale 169 | t = repmat(t,n,1); 170 | end 171 | if ~isempty(linTerm) && norm(linTerm)>=0 172 | if isempty(lambda) 173 | x0 = x0 - Vinv(linTerm); 174 | else 175 | % V is scaled V <-- diag(1./lambda)*V*diag(1./lambda) 176 | % so Vinv is scaled the opposite. 177 | % linTerm is scaled linTerm <== linTerm./lambda 178 | x0 = x0 - lambda.*Vinv(linTerm); 179 | end 180 | end 181 | 182 | % The main heart: 183 | X = @(a) prox( x0 - plusminus*a*ud, dInv ); 184 | 185 | % Early return if we have only a diagonal scaling... 186 | if NO_U 187 | % in this case, "alpha" is irrelevant 188 | x = prox( x0, dInv ); 189 | if ~isempty(lambda) 190 | % Undo the scaling of x <-- lambda.*x 191 | x = x./lambda; 192 | end 193 | return; 194 | end 195 | 196 | brk_pts = bsxfun(@times,plusminus*(dd./uu), bsxfun(@minus,x0,t) ); 197 | brk_pts = unique(brk_pts(:)); % will sort and remove duplicates 198 | brk_pts = brk_pts(~isinf(brk_pts)); % in case lwr/upr=Inf for box 199 | 200 | 201 | % p(a) = a + dot(u, y - prox_{1/d_i}( y_i - a u_i/d_i) ) 202 | % Then p is strictly increasing. We want a root of this: p(a) = 0 203 | 204 | % Defined above for numerical reasons... 
205 | % ud = uu./dd; 206 | % dInv = 1./dd; 207 | 208 | 209 | % Main for-loop: 210 | % "lower bound" are "a" for which p <= 0 211 | % "upper bound" are "a" for which p >= 0 212 | % if a is increasing, so is p(a) (double-check for both plusminus cases ) 213 | lwrBnd = 0; 214 | uprBnd = length(brk_pts) + 1; 215 | iMax = ceil( log2(length(brk_pts)) ) + 1; 216 | for i = 1:iMax 217 | if uprBnd - lwrBnd <= 1 218 | dispp('Bounds are too close; breaking'); 219 | break; 220 | end 221 | j = round(mean([lwrBnd,uprBnd])); 222 | printf('j is %d (bounds were [%d,%d])\n', j, lwrBnd,uprBnd ); 223 | if j==lwrBnd 224 | dispp('j==lwrBnd, so increasing'); 225 | j = j+1; 226 | elseif j==uprBnd 227 | dispp('j==uprBnd, so increasing'); 228 | j = j-1; 229 | end 230 | 231 | a = brk_pts(j); 232 | x = X(a); % the prox 233 | p = a + dot(uu,x0-x); 234 | 235 | if p > 0 236 | uprBnd = j; 237 | elseif p < 0 238 | lwrBnd = j; 239 | end 240 | if PRINT 241 | % Don't rely on redefinition of printf, 242 | % since then we would still calculate find(~x) 243 | % which is slow 244 | printf('i=%2d, a = %6.3f, p = %8.3f, zeros ', i, a, p ); 245 | if n < 100, printf('%d ', find(~x) ); end 246 | % printf('; nonzeros ');printf('%d ', find(x) ); 247 | printf('\n'); 248 | end 249 | end 250 | cnt = i; % number of iterations we took 251 | 252 | % Now, determine linear part, which we infer from two points. 253 | % If lwr/upr bounds are infinite, we take special care 254 | % e.g., we make a new "a" slightly lower/bigger, and use this 255 | % to extract linear part. 
256 | if lwrBnd == 0 257 | a2 = brk_pts( uprBnd ); 258 | a1 = a2 - 10; % arbitrary 259 | aBounds = [-Inf,a2]; 260 | elseif uprBnd == length(brk_pts) + 1; 261 | a1 = brk_pts( lwrBnd ); 262 | a2 = a1 + 10; % arbitrary 263 | aBounds = [a1,Inf]; 264 | else 265 | % In general case, we can infer linear part from the two break points 266 | a1 = brk_pts( lwrBnd ); 267 | a2 = brk_pts( uprBnd ); 268 | aBounds = [a1,a2]; 269 | end 270 | x1 = X(a1); 271 | x2 = X(a2); 272 | dx = (x2 - x1)/(a2-a1); 273 | % Thus for a in (a1,a2), x(a) = x1 + (a-a1)*dx 274 | % Solve 0 = a + dot( uu, y - (x1 + (a-a1)*dx ) ) 275 | % = a + dot(uu,y - x1 + a1*dx ) - a*dot(uu,dx) 276 | % so: 277 | a = dot( uu, x0 - x1 + a1*dx)/( -1 + dot(uu,dx) ); 278 | if a < aBounds(1) || a > aBounds(2), error('alpha is not in the correct range'); end 279 | % If we were not linear, we could do a root-finding algorithm, e.g., 280 | % a = fzero( @(a) a+dot(uu,x0-X(a)), a ); 281 | 282 | % Now, the solution: 283 | x = X(a); 284 | 285 | if ~isempty(lambda) 286 | % Undo the scaling of x <-- lambda.*x 287 | x = x./lambda; 288 | end 289 | 290 | printf('Took %d of %d iterations, lwrBnd is %d/%d \n', i, iMax, lwrBnd,length( brk_pts ) ); 291 | -------------------------------------------------------------------------------- /proxes/prox_rank1_hinge.m: -------------------------------------------------------------------------------- 1 | function varargout = prox_rank1_hinge(varargin) 2 | % PROX_RANK1_HINGE returns the scaled proximity operator for the hinge loss 3 | % 4 | % x = prox_rank1_hinge( x0, D, u ) 5 | % where 6 | % x = argmin_{x} h(x) + 1/2||x-x0||^2_{V} 7 | % and 8 | % V^{-1} = D + u*u' (or diag(D) + u*u' if D is a vector) 9 | % "D" must be diagonal and positive. "u" can be any vector. 
10 | % 11 | % Here, h(x) = sum(max(0,1-x)), a.k.a., the hinge-loss 12 | % 13 | % There are also variants: 14 | % x = prox_rank1_hinge( x0, D, u, lambda, linTerm, sigma, inverse) 15 | % returns 16 | % x = argmin_{x} h(lambda.*x) + 1/2||x-x0||^2_{V} + linTerm'*x 17 | % and 18 | % either V^{-1} = D + sigma*u*u' if "inverse" is true (default) 19 | % or V = D + sigma*u*u' if "inverse" is false 20 | % and in both cases, "sigma" is either +1 (default) or -1. 21 | % "lambda" should be non-zero 22 | % 23 | % Stephen Becker, Feb 26 2014, stephen.beckr@gmail.com 24 | % Reference: "A quasi-Newton proximal splitting method" by S. Becker and J. Fadili 25 | % NIPS 2012, http://arxiv.org/abs/1206.1156 26 | % 27 | % See also prox_rank1_generic.m 28 | 29 | prox = @(x,t) 1 + (x-1).*( x > 1 ) + (x + t - 1).*( x + t < 1 ); 30 | prox_brk_pts = @(s)[ones(size(s)), 1-s]; 31 | 32 | [varargout{1:nargout}] = prox_rank1_generic( prox, prox_brk_pts, varargin{:} ); -------------------------------------------------------------------------------- /proxes/prox_rank1_l1.m: -------------------------------------------------------------------------------- 1 | function varargout = prox_rank1_l1(varargin) 2 | % PROX_RANK1_L1 returns the scaled proximity operator for the l1 norm 3 | % 4 | % x = prox_rank1_l1( x0, D, u ) 5 | % where 6 | % x = argmin_{x} h(x) + 1/2||x-x0||^2_{V} 7 | % and 8 | % V^{-1} = D + u*u' (or diag(D) + u*u' if D is a vector) 9 | % "D" must be diagonal and positive. "u" can be any vector. 10 | % 11 | % Here, h(x) = ||x||_1 (the "l-1" norm) 12 | % 13 | % There are also variants: 14 | % x = prox_rank1_l1( x0, D, u, lambda, linTerm, sigma, inverse) 15 | % returns 16 | % x = argmin_{x} h(lambda.*x) + 1/2||x-x0||^2_{V} + linTerm'*x 17 | % and 18 | % either V^{-1} = D + sigma*u*u' if "inverse" is true (default) 19 | % or V = D + sigma*u*u' if "inverse" is false 20 | % and in both cases, "sigma" is either +1 (default) or -1. 
21 | % "lambda" should be non-zero 22 | % 23 | % Stephen Becker, Feb 26 2014, stephen.beckr@gmail.com 24 | % Reference: "A quasi-Newton proximal splitting method" by S. Becker and J. Fadili 25 | % NIPS 2012, http://arxiv.org/abs/1206.1156 26 | % 27 | % See also prox_rank1_generic.m 28 | 29 | prox = @(x,t) sign(x).*max(0, abs(x) - t ); 30 | prox_brk_pts = @(s) [-s,s]; 31 | 32 | [varargout{1:nargout}] = prox_rank1_generic( prox, prox_brk_pts, varargin{:} ); -------------------------------------------------------------------------------- /proxes/prox_rank1_l1pos.m: -------------------------------------------------------------------------------- 1 | function varargout = prox_rank1_l1pos(varargin) 2 | % PROX_RANK1_L1POS returns the scaled proximity operator for the l1 norm 3 | % with non-negativity constraints 4 | % 5 | % x = prox_rank1_l1pos( x0, D, u ) 6 | % where 7 | % x = argmin_{x} h(x) + 1/2||x-x0||^2_{V} 8 | % and 9 | % V^{-1} = D + u*u' (or diag(D) + u*u' if D is a vector) 10 | % "D" must be diagonal and positive. "u" can be any vector. 11 | % 12 | % Here, h(x) = ||x||_1 + the indicator function of the set { x : x >= 0 } 13 | % 14 | % There are also variants: 15 | % x = prox_rank1_l1pos( x0, D, u, lambda, linTerm, sigma, inverse) 16 | % returns 17 | % x = argmin_{x} h(lambda.*x) + 1/2||x-x0||^2_{V} + linTerm'*x 18 | % and 19 | % either V^{-1} = D + sigma*u*u' if "inverse" is true (default) 20 | % or V = D + sigma*u*u' if "inverse" is false 21 | % and in both cases, "sigma" is either +1 (default) or -1. 22 | % "lambda" should be non-zero 23 | % 24 | % Stephen Becker, Feb 26 2014, stephen.beckr@gmail.com 25 | % Reference: "A quasi-Newton proximal splitting method" by S. Becker and J. 
Fadili 26 | % NIPS 2012, http://arxiv.org/abs/1206.1156 27 | % 28 | % See also prox_rank1_generic.m, prox_rank1_l1.m, proj_rank1_Rplus.m 29 | 30 | prox = @(x,t) max(0, x - t ); 31 | prox_brk_pts = @(s) [s]; 32 | 33 | [varargout{1:nargout}] = prox_rank1_generic( prox, prox_brk_pts, varargin{:} ); -------------------------------------------------------------------------------- /setup_zeroSR1.m: -------------------------------------------------------------------------------- 1 | function setup_zeroSR1 2 | % SETUP_ZEROSR1 Adds the zeroSR1 toolbox to the path 3 | 4 | baseDirectory = fileparts(mfilename('fullpath')); 5 | addpath(genpath(baseDirectory)); 6 | 7 | % and make a variable in the main workspace 8 | % assignin('base','ZEROSR1ROOT', baseDirectory ); 9 | 10 | % Make it global so it will not be removed by "clear" statements 11 | % (though "clear all" will still remove it) 12 | evalin('base', sprintf('global ZEROSR1ROOT; ZEROSR1ROOT=''%s'';',baseDirectory) ); -------------------------------------------------------------------------------- /smoothFunctions/normSquaredFunction.m: -------------------------------------------------------------------------------- 1 | function [f,g,h] = normSquaredFunction(x,A,At,b,c,errFcn,extraFcn, constant) 2 | % f = normSquaredFunction(x,A,At,b,c,errFcn,extraFcn, constant) 3 | % returns the objective function 'f' 4 | % to f(x) = .5||Ax-b||_2^2 + c'*x + constant 5 | % [f,g,h] = ... 6 | % return the gradient and Hessian as well 7 | % 8 | % "A" can be a matrix (in which case set At=[], since it is ignored) 9 | % or it can be a function handle to compute the matrix-vector product 10 | % (in which case "At" should be a function handle to compute 11 | % the transposed-matrix - vector product ) 12 | % 13 | % By default, b=0 and c=0. Set any inputs to [] to use default values. 
14 | % 15 | % [fHist,errHist] = normSquaredFunction() 16 | % will return the function history 17 | % (and error history as well, if errFcn was provided) 18 | % and reset the history to zero. 19 | % "fHist" is a record of f + extraFcn 20 | % (this is intended to be used where extraFcn is the non-smooth term "h") 21 | % 22 | % This function is (almost*) mathematically (not computationally) equivalent 23 | % to quadraticFunction( x, Q, c ) where 24 | % Q = A'*A and c = A'*b. 25 | % (*almost equivalent since there is a constant value difference in 26 | % the objective function; you can use "constant" to change this) 27 | % 28 | % The Lipschitz constant of the gradient is 29 | % the squared spectral norm of A, i.e., norm(A)^2 30 | % 31 | % 32 | % March 4 2014, Stephen Becker, stephen.beckr@gmail.com 33 | % 34 | % See also quadraticFunction.m 35 | 36 | persistent errHist fcnHist nCalls 37 | if nargin == 0 38 | f = fcnHist(1:nCalls); 39 | g = errHist(1:nCalls); 40 | fcnHist = []; 41 | errHist = []; 42 | nCalls = 0; 43 | return; 44 | end 45 | if isempty( fcnHist ) 46 | [errHist,fcnHist] = deal( zeros(100,1) ); 47 | end 48 | 49 | error(nargchk(2,8,nargin,'struct')); 50 | if nargin < 4 || isempty(b), b = 0; end 51 | if nargin >= 5 && ~isempty(c) 52 | cx = dot(c(:),x(:) ); 53 | else 54 | cx = 0; 55 | c = 0; 56 | end 57 | if nargin < 8 || isempty(constant), constant = 0; end 58 | if isa(A,'function_handle') 59 | Ax = A(x); 60 | else 61 | Ax = A*x; 62 | end 63 | res = Ax - b; 64 | f = .5*norm(res(:))^2 + cx + constant; 65 | 66 | % Record this: 67 | nCalls = nCalls + 1; 68 | if length( errHist ) < nCalls 69 | % allocate more memory 70 | errHist(end:2*end) = 0; 71 | fcnHist(end:2*end) = 0; 72 | end 73 | fcnHist(nCalls) = f; 74 | if nargin >= 7 && ~isempty(extraFcn) 75 | % this is used when we want to record the objective function 76 | % for something non-smooth, and this routine is used only for the smooth 77 | % part. 
So for recording purposes, add in the nonsmooth part 78 | % But do NOT return it as a function value or it will mess up the 79 | % optimization algorithm. 80 | fcnHist(nCalls) = f + extraFcn(x); 81 | end 82 | 83 | if nargout > 1 84 | if isa(A,'function_handle') 85 | if isempty( At ) 86 | error('If "A" is given implicitly, we need "At" to compute the gradient'); 87 | end 88 | g = At( res ) + c; 89 | else 90 | g = A'*res + c; 91 | end 92 | end 93 | if nargout > 2 94 | if isa(A,'function_handle') 95 | error('Function is only known implicitly, so cannot provide Hessian easily'); 96 | end 97 | h = A'*A; 98 | end 99 | 100 | % and if error is requested... 101 | if nargin >= 6 && ~isempty( errFcn) 102 | errHist(nCalls) = errFcn(x); 103 | end -------------------------------------------------------------------------------- /smoothFunctions/quadraticFunction.m: -------------------------------------------------------------------------------- 1 | function [f,g,h] = quadraticFunction(x,Q,c,errFcn,extraFcn, constant) 2 | % f = quadraticFunction(x,Q,c, errFcn,extraFcn,constant) 3 | % returns the objective function 'f' 4 | % to f(x) = .5 - + constant 5 | % [f,g,h] = ... 6 | % return the gradient and Hessian as well 7 | % 8 | % "Q" can be a matrix (and it should be Hermitian positive semi-definite) 9 | % or it can be a function handle to compute the matrix-vector product 10 | % 11 | % [fHist,errHist] = quadraticFunction() 12 | % will return the function history 13 | % (and error history as well, if errFcn was provided) 14 | % and reset the history to zero. 15 | % "fHist" is a record of f + extraFcn 16 | % (this is intended to be used where extraFcn is the non-smooth term "h") 17 | % 18 | % This function is (almost*) mathematically (not computationally) equivalent 19 | % to normSquaredFunction( x, A, b ) where 20 | % Q = A'*A and c = A'*b. 
21 | % (*almost equivalent since there is a constant value difference in 22 | % the objective function) 23 | % 24 | % The Lipschitz constant of the gradient is the spectral norm of Q, i.e., norm(Q) 25 | % 26 | % Feb 19 2013, Stephen Becker, stephen.beckr@gmail.com 27 | % 28 | % See also normSquaredFunction.m 29 | 30 | persistent errHist fcnHist nCalls 31 | if nargin == 0 32 | f = fcnHist(1:nCalls); 33 | g = errHist(1:nCalls); 34 | fcnHist = []; 35 | errHist = []; 36 | nCalls = 0; 37 | return; 38 | end 39 | if isempty( fcnHist ) 40 | [errHist,fcnHist] = deal( zeros(100,1) ); 41 | end 42 | 43 | 44 | % fcnSimple = @(w) w'*(Q*w)/2 - c'*w; 45 | % gradSimple = @(w) Q*w - c; % 46 | if isa(Q,'function_handle') 47 | Qx = Q(x); 48 | else 49 | Qx = Q*x; 50 | end 51 | f = (x'*Qx)/2 - c'*x; 52 | if nargin >= 6 && ~isempty(constant) 53 | f = f + constant; 54 | end 55 | 56 | % Record this: 57 | nCalls = nCalls + 1; 58 | if length( errHist ) < nCalls 59 | % allocate more memory 60 | errHist(end:2*end) = 0; 61 | fcnHist(end:2*end) = 0; 62 | end 63 | fcnHist(nCalls) = f; 64 | if nargin >= 5 && ~isempty(extraFcn) 65 | % this is used when we want to record the objective function 66 | % for something non-smooth, and this routine is used only for the smooth 67 | % part. So for recording purposes, add in the nonsmooth part 68 | % But do NOT return it as a function value or it will mess up the 69 | % optimization algorithm. 70 | fcnHist(nCalls) = f + extraFcn(x); 71 | end 72 | 73 | if nargin > 2 && nargout > 1 74 | % g = G(x); 75 | g = Qx - c; 76 | end 77 | if nargout > 2 78 | if isa(Q,'function_handle') 79 | error('Function is only known implicitly, so cannot provide Hessian easily'); 80 | end 81 | h = Q; 82 | % h = H(x); 83 | end 84 | 85 | % and if error is requested... 
86 | if nargin >= 4 && ~isempty( errFcn) 87 | errHist(nCalls) = errFcn(x); 88 | end -------------------------------------------------------------------------------- /tests/computeReferenceSolution.m: -------------------------------------------------------------------------------- 1 | % Meant to be called by getReferenceSolution.m 2 | % This file is NOT compatible with Octave 3 | 4 | fprintf('Computing reference solution via CVX\n'); 5 | cvx_precision best 6 | cvx_quiet true 7 | 8 | switch problemName 9 | case 'simple_001' 10 | cvx_begin 11 | variable xRef(N) 12 | minimize sum_square(A*xRef-b)/2 + lambda*norm(xRef,1) 13 | cvx_end 14 | case 'simple_002' % same setting, different parameters 15 | cvx_begin 16 | variable xRef(N) 17 | minimize sum_square(A*xRef-b)/2 + lambda*norm(xRef,1) 18 | cvx_end 19 | end -------------------------------------------------------------------------------- /tests/getReferenceSolution.m: -------------------------------------------------------------------------------- 1 | global ZEROSR1ROOT 2 | if exist('ZEROSR1ROOT','var') && ~isempty(ZEROSR1ROOT) 3 | refDir = fullfile(ZEROSR1ROOT,'tests','reference_solutions'); 4 | else 5 | fprintf('\n\nERROR: cannot find variable ZEROSR1ROOT\n'); 6 | fprintf('This is probably because you did not run setup_zeroSR1\n'); 7 | fprintf(' or you "cleared" variables since then. Please re-run setup-zeroSR1\n'); 8 | error('zeroSR1:cannotFindVariable','Cannot find ZEROSR1ROOT'); 9 | end 10 | 11 | fileName = fullfile(refDir,[problemName,'.mat']); 12 | 13 | if exist(fileName,'file') 14 | fprintf('Loading reference solution from file\n'); 15 | load(fileName); % loads xRef 16 | else 17 | % Compute answer 18 | % Do this in a separate file since otherwise 19 | % Octave cannot parse this. 
20 | 21 | if ~exist('cvx_begin','file') 22 | error('Did not find reference solution nor CVX'); 23 | end 24 | 25 | computeReferenceSolution; % makes xRef 26 | 27 | % and save to the file 28 | save(fileName,'xRef'); 29 | end -------------------------------------------------------------------------------- /tests/reference_solutions/simple_001.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/zeroSR1/ca0bcb8bf7746c1aa14dc32ffbdba23708293990/tests/reference_solutions/simple_001.mat -------------------------------------------------------------------------------- /tests/solution_via_cvx.m: -------------------------------------------------------------------------------- 1 | function [x,V] = solution_via_cvx(type,x0,d,u,lambda,offset,lwr,upr,pm,INV) 2 | % x = solution_via_cvx(type,x0,d,u,lambda,offset,lwr,upr,pm) 3 | % returns the solution using CVX, to serve as a reference. 4 | % 'type' can be one of 'l1', 'rplus' or 'box' 5 | % For 'box', specify lwr and upr bounds. 6 | % 7 | % This computes the weighted prox_h^V(x0) where the function "h" 8 | % is specified by "type" (and perhaps scaled with lambda, 9 | % and/or linear term ), and 10 | % 11 | % V^{-1} = diag(d) + pm*u*u' 12 | % where pm is +1 or -1 (default is +1) 13 | % i.e., V = diag(1./d) - pm*(u./d)*(u./d)'/( 1 + u'*diag(1./d)*u) 14 | % via Sherman-Morrison formula 15 | % or, in the form 16 | % x = solution_via_cvx(type,x0,d,u,lambda,offset,lwr,upr, INV) 17 | % if INV=false (default is true), 18 | % then 19 | % V = diag(d) + pm*u*u' (rather than this being inv(V) ) 20 | % 21 | % In all cases, V must be positive definite 22 | % [x,V] = solution_via_cvx(...) 
 23 | % also returns the matrix V 24 | % 25 | % Lambda is such that we really evaluate h(lambda.*x) 26 | % where lambda is a scalar or an array of the same size as x 27 | % 28 | % If CVX is not installed, or if this is called via octave 29 | % (new versions of CVX do not run on octave), 30 | % then the output is x=Inf. 31 | % 32 | % Stephen Becker, Feb 22 2014 stephen.beckr@gmail.com 33 | 34 | % TODO: if CVX is not installed, read solution from a .mat file 35 | 36 | if nargin < 10 || isempty(INV), INV = true; end 37 | if nargin < 9 || isempty(pm), pm = +1; end 38 | if nargin < 8 || isempty(upr), upr = []; end 39 | if nargin < 7 || isempty(lwr), lwr = []; end 40 | if nargin < 5 || isempty(lambda), lambda = 1; end 41 | if nargin < 4 || isempty(u), u = 0; end 42 | assert( pm==-1 | pm==+1 ); 43 | [R,L] = deal(u); 44 | n = length(x0); 45 | if nargin < 6 || isempty(offset), offset = zeros(n,1); end 46 | 47 | % Vinv = diag(d) + L*R'; 48 | % V = inv(Vinv); 49 | if INV % default 50 | Dinv = diag(1./d); 51 | if all(u==0) 52 | V = Dinv; 53 | else 54 | V = Dinv - pm*(Dinv*L)*(R'*Dinv)/( 1 + R'*Dinv*L ); 55 | end 56 | if pm ==1 57 | % There is a chance that V is not positive definite if u was too large 58 | % .0421 59 | % It is possible to check all eigenvalues in O(n^2) rather than O(n^3) 60 | % but it's not particularly simple to implement. 61 | % See http://www.stat.uchicago.edu/~lekheng/courses/309f10/modified.pdf 62 | % Golub, 1973, "Some Modified Matrix Eigenvalue Problems" 63 | % http://epubs.siam.org/doi/abs/10.1137/1015032 64 | % but... 
65 | % if the diagonal term is just a scaled identity, 66 | % then it is trivial 67 | if all( d==d(1) ) && ~all(u==0)% diagonal 68 | minE = 1/d(1) - norm(Dinv*L)^2/(1+R'*Dinv*L); 69 | else 70 | minE = min(eig(V)); 71 | end 72 | if minE <= 0 73 | error('V must be positive definite'); 74 | end 75 | end 76 | else 77 | V = diag(d) + pm*(u*u'); 78 | if pm == -1 79 | if all( d==d(1) ) % diagonal 80 | minE = d(1) - norm(u)^2; 81 | else 82 | minE = min(eig(V)); 83 | end 84 | if minE <= 0 85 | error('V must be positive definite'); 86 | end 87 | end 88 | end 89 | 90 | if exist('OCTAVE_VERSION','builtin') || ~exist('cvx_begin','file') 91 | x = Inf; 92 | return; 93 | end 94 | 95 | x = solveInCVX(type,x0,V,offset,lambda,lwr,upr); 96 | % clean it up a bit: 97 | x = x.*( abs(x) > 1e-10 ); 98 | 99 | end % end of function 100 | 101 | 102 | function x = solveInCVX(type,x0,V,offset,lambda,lwr,upr) 103 | n = length(x0); 104 | cvx_precision best 105 | cvx_quiet true 106 | % minimize lambda*norm(x,1) + 1/2*sum_square( Vsqrt*(x-x0) ) + dot(offset,x) 107 | % avoid Vsqrt=sqrtm(V) for more accurate answer: 108 | 109 | switch lower(type) 110 | case 'l1' 111 | cvx_begin 112 | variable x(n,1) 113 | minimize norm(lambda.*x,1) + 1/2*quad_form(x-x0, V ) + dot(offset,x) 114 | cvx_end 115 | case 'l1pos' 116 | cvx_begin 117 | variable x(n,1) 118 | minimize norm(lambda.*x,1) + 1/2*quad_form(x-x0, V ) + dot(offset,x) 119 | subject to 120 | lambda.*x >= 0 121 | cvx_end 122 | case 'rplus' 123 | cvx_begin 124 | variable x(n,1) 125 | minimize 1/2*quad_form(x-x0, V ) + dot(offset,x) 126 | subject to 127 | lambda.*x >= 0 128 | cvx_end 129 | case 'box' 130 | if ~all( lwr <= upr ) 131 | error('Problem is infeasible'); 132 | end 133 | % Carefully handle cases when lwr = -Inf and/or upr=+Inf 134 | set1 = ~isinf(lwr); 135 | set2 = ~isinf(upr); 136 | if length(lambda)==1, lambda = repmat(lambda,n,1); end 137 | cvx_begin 138 | variable x(n,1) 139 | minimize 1/2*quad_form(x-x0, V ) + dot(offset,x) 140 | subject to 
141 | lambda(set1).*x(set1) >= lwr(set1) 142 | lambda(set2).*x(set2) <= upr(set2) 143 | cvx_end 144 | case 'hinge' 145 | hinge = @(x) sum(max(0,1-x)); 146 | cvx_begin 147 | variable x(n,1) 148 | minimize 1/2*quad_form(x-x0,V) + dot(offset,x) + hinge(lambda.*x) 149 | cvx_end 150 | case 'linf' 151 | hinge = @(x) sum(lambda.*max(0,1-x)); 152 | cvx_begin 153 | variable x(n,1) 154 | minimize 1/2*quad_form(x-x0,V) + dot(offset,x) 155 | subject to 156 | norm(lambda.*x, inf ) <= 1 157 | cvx_end 158 | otherwise 159 | error('That type is not yet supported'); 160 | end 161 | end -------------------------------------------------------------------------------- /tests/test_prox_accuracy.m: -------------------------------------------------------------------------------- 1 | %{ 2 | Tests the accuracy of several prox operators 3 | This will run a random assortment of tests 4 | The reference solution is computed with "solution_via_cvx" 5 | which uses CVX (http://cvxr.com/). 6 | If you don't have CVX installed, then it won't work 7 | 8 | In the future, one could make a predefined test set and then 9 | precompute the answers so that CVX is not necessary... 10 | 11 | Stephen Becker, Feb 26 2014 stephen.beckr@gmail.com 12 | %} 13 | 14 | % run setup_zeroSR1.m if you haven't already 15 | clear; clc; 16 | 17 | nTests = 100; 18 | n = 1e2; % dimension of the problem 19 | 20 | myQuadForm = @(x,V) x'*(V*x); 21 | for test = 1:nTests 22 | % Make a random problem 23 | d = rand(n,1); 24 | u = 10*randn(n,1); 25 | y = randn(n,1); 26 | offset = randn(n,1); 27 | lwr = randn(n,1); % used for the box constraints 28 | upr = lwr + 2*rand(n,1); 29 | lwr(randi(n)) = -Inf; 30 | upr(randi(n)) = Inf; 31 | lambda = randn(n,1); 32 | 33 | % And sometimes turn off these features 34 | if randn(1) > 0, lambda = []; end 35 | if randn(1) > 0, offset = []; end 36 | if randn(1) > 0, d(2:end) = d(1); end 37 | if randn(1) > 0, u = 0; end % i.e., normal prox! 
38 | sigma = 1;
39 | 
40 | % Pick a solver at random
41 | solverTypes = {'l1','l1pos','Rplus','box','hinge','linf'};
42 | type = solverTypes{ randi(length(solverTypes)) };
43 | 
44 | INVERT = sign( randn(1) )+1; % sometimes specify V, sometimes specify inv(V) (value is 0 or 2, used as a logical flag)
45 | 
46 | if isempty(lambda), lambda = 1; end
47 | if isempty(offset), offset = zeros(n,1); end
48 | INFEASIBLE = 1e14;
49 | EPS = 1e-13; % feasibility tolerance
50 | switch lower(type)
51 | case 'l1'
52 | [x_cvx,V] = solution_via_cvx('l1',y,d,u,lambda,offset,[],[],sigma,INVERT);
53 | obj = @(x) norm(lambda.*x,1) + 1/2*myQuadForm(x-y,V) + dot(offset,x);
54 | % If we use prox_rank1_generic
55 | prox = @(x,t) sign(x).*max(0, abs(x) - t );
56 | prox_brk_pts = @(s) [-s,s];
57 | % or, use
58 | % scaledProx = @prox_rank1_l1;
59 | case 'l1pos'
60 | [x_cvx,V] = solution_via_cvx('l1pos',y,d,u,lambda,offset,[],[],sigma,INVERT);
61 | obj = @(x) norm(lambda.*x,1) + 1/2*myQuadForm(x-y,V) + dot(offset,x) + INFEASIBLE*any( lambda.*x < -EPS );
62 | % If we use prox_rank1_generic
63 | prox = @(x,t) max(0, x - t );
64 | prox_brk_pts = @(s) [s];
65 | case 'rplus'
66 | [x_cvx,V] = solution_via_cvx('Rplus',y,d,u,lambda,offset,[],[],sigma,INVERT);
67 | obj = @(x) 1/2*myQuadForm(x-y,V) + dot(offset,x) + INFEASIBLE*any( lambda.*x < -EPS );
68 | prox = @(x,t) max(0, x);
69 | prox_brk_pts = @(s) 0; % since projection, scaling has no effect
70 | % scaledProx = @proj_rank1_Rplus;
71 | case 'box'
72 | [x_cvx,V] = solution_via_cvx('box',y,d,u,lambda,offset,lwr,upr,sigma,INVERT);
73 | obj = @(x) 1/2*myQuadForm(x-y,V) + dot(offset,x) + ...
74 | INFEASIBLE*any( lambda.*x < lwr-EPS | lambda.*x > upr+EPS );
75 | prox = @(x,t) max( min(upr,x), lwr );
76 | prox_brk_pts = @(s) [lwr,upr]; % since projection, scaling has no effect
77 | % scaledProx = @(varargin)proj_rank1_box(lwr,upr,varargin{:});
78 | case 'hinge'
79 | hinge = @(x) sum(max(0,1-lambda.*x));
80 | [x_cvx,V] = solution_via_cvx('hinge',y,d,u,lambda,offset,[],[],sigma,INVERT);
81 | obj = @(x) hinge(x) + 1/2*myQuadForm(x-y,V) + dot(offset,x);
82 | prox = @(x,t) 1 + (x-1).*( x > 1 ) + (x + t - 1).*( x + t < 1 );
83 | prox_brk_pts = @(s)[ones(size(s)), 1-s];
84 | % scaledProx = @prox_rank1_hinge;
85 | case 'linf'
86 | [x_cvx,V] = solution_via_cvx('linf',y,d,u,lambda,offset,[],[],sigma,INVERT);
87 | obj = @(x) INFEASIBLE*(norm(lambda.*x,Inf)>1+EPS) + 1/2*myQuadForm(x-y,V) + dot(offset,x);
88 | prox = @(x,t) sign(x).*min( 1, abs(x) );
89 | prox_brk_pts = @(s) [-ones(size(s)),ones(size(s))]; % since projection, scaling has no effect
90 | % scaledProx = @proj_rank1_linf;
91 | end
92 | if all(lambda==1), lambda = []; end % turn off the feature
93 | if all(offset==0), offset = []; end % turn off the feature
94 | 
95 | scaledProx = @(varargin) prox_rank1_generic( prox, prox_brk_pts, varargin{:});
96 | x = scaledProx( y, d, u, lambda, offset, sigma, INVERT);
97 | 
98 | if any(isinf( x_cvx ))
99 | % This means either CVX is not installed or this
100 | % is running in Octave.
101 | fprintf('Test %d/%d. Solver type %s. CVX solution not available\n', ...
102 | test, nTests, type );
103 | if obj(x) > INFEASIBLE
104 | fprintf(2,'\tSolution is not feasible! Maybe due to roundoff?\n');
105 | break;
106 | end
107 | if any(isnan(x)) % FIX: 'if isnan(x)' on a vector is true only when ALL entries are NaN; use any() to catch partial NaNs
108 | fprintf(2,'\tError detected!\n');
109 | break;
110 | end
111 | else
112 | 
113 | fprintf('Test %d/%d. Solver type %s. Error is %.2e\n', ...
114 | test,nTests, type, norm( x - x_cvx )/max(1e-5,norm(x_cvx)) );
115 | fprintf('\tObjective is %.2e, for cvx is %.2e, obj(x) - obj(x_cvx) is %.2e\n', ...
116 | obj(x), obj(x_cvx), obj(x)-obj(x_cvx) ); 117 | TOLERANCE1 = 1e-3; 118 | TOLERANCE2 = 1e-6; 119 | if any(isnan(x_cvx)) 120 | if any(isnan(x)) 121 | fprintf(2,'\tBoth solutions are NaN. Hmmm...\n'); 122 | else 123 | fprintf(2,'\tCVX returned NaN, our solver did not.\n'); 124 | if obj(x) > INFEASIBLE/2 125 | fprintf(2,'\tSolution is not feasible! Maybe due to roundoff?\n'); 126 | break; 127 | end 128 | end 129 | else 130 | if obj(x_cvx) > INFEASIBLE/2 131 | fprintf(2,'\tCVX solution is not feasible!\n'); 132 | end 133 | if obj(x) > INFEASIBLE/2 134 | fprintf(2,'\tOur solution is not feasible! Maybe due to roundoff?\n'); 135 | break; 136 | end 137 | if (obj(x)-obj(x_cvx))/max(1,abs(obj(x_cvx))) < TOLERANCE2 138 | fprintf(2,'\tGOOD\n'); 139 | elseif (obj(x)-obj(x_cvx))/max(1,abs(obj(x_cvx))) < TOLERANCE1 140 | fprintf(2,'\tMARGINAL -- Loss of accuracy\n'); 141 | else 142 | fprintf(2,'\tBAD\n'); 143 | break; 144 | end 145 | end 146 | end 147 | end -------------------------------------------------------------------------------- /tests/test_prox_speed.m: -------------------------------------------------------------------------------- 1 | %{ 2 | Test the speed of the various projections, as a function of input size 3 | The paper claims it is O( n log n), so we verify that here. 4 | 5 | We test 5 proxes, and also compare to the time it takes to sort n numbers, 6 | and also compare to O(n) and O(n log n) lines. 7 | 8 | The results: the scaled prox algorithms take about 10x the time 9 | to sort n numbers. Not bad. 10 | 11 | Stephen Becker, Feb 26 2014 stephen.beckr@gmail.com 12 | %} 13 | nReps = 5; 14 | nList = logspace(2,7,6); 15 | typeList = {'l1','Rplus','box','hinge','linf','sort'}; nTypes = length(typeList); 16 | RESULTS = zeros(nTypes,length(nList),nReps); 17 | INVERT = true; % must be true for now... 
18 | for ni = 1:length(nList) 19 | n = nList(ni); 20 | fprintf('Test %d of %d: n = %d\n', ni, length(nList), n ); 21 | for ri = 1:nReps 22 | d = rand(n,1); 23 | u = 10*randn(n,1); 24 | y = randn(n,1); 25 | offset = randn(n,1); 26 | lambda = 9; 27 | lwr = randn(n,1); 28 | upr = lwr + 2*rand(n,1); 29 | 30 | for type_i = 1:nTypes 31 | type = typeList{type_i}; 32 | 33 | switch lower(type) 34 | case 'l1' 35 | prox = @(x,t) sign(x).*max(0, abs(x) - t ); 36 | prox_brk_pts = @(s) [-s,s]; 37 | case 'rplus' 38 | prox = @(x,t) max(0, x); 39 | prox_brk_pts = @(s) 0; % since projection, scaling has no effect 40 | case 'box' 41 | prox = @(x,t) max( min(upr,x), lwr ); 42 | prox_brk_pts = @(s) [lwr,upr]; % since projection, scaling has no effect 43 | case 'hinge' 44 | prox = @(x,t) 1 + (x-1).*( x > 1 ) + (x + t - 1).*( x + t < 1 ); 45 | prox_brk_pts = @(s)[ones(size(s)), 1-s]; 46 | case 'linf' 47 | prox = @(x,t) sign(x).*min( 1, abs(x) ); 48 | prox_brk_pts = @(s) [-ones(size(s)),ones(size(s))]; 49 | case 'sort' 50 | end 51 | if strcmpi(type,'sort') % a baseline measure of speed 52 | t2 = tic; 53 | x = sort( y ); 54 | tm2 = toc(t2); 55 | else 56 | scaledProx = @(varargin) prox_rank1_generic( prox, prox_brk_pts, varargin{:}); 57 | t2 = tic; 58 | x = scaledProx( y, d, u, lambda, offset, 1, INVERT); 59 | tm2 = toc(t2); 60 | end 61 | RESULTS( type_i, ni, ri ) = tm2; 62 | end 63 | end 64 | end 65 | %% Plot 66 | figure(1); clf; 67 | times = median(RESULTS,3); 68 | h=loglog( nList, times', 'o-' ); 69 | set(h(end),'marker','*') 70 | xlabel('Dimension "n" of input'); 71 | ylabel('Time to solve, in seconds'); 72 | % Add a line of n 73 | hold all 74 | ref = 3; % which point to reference 75 | loglog( nList, nList*times(1,ref)/nList(ref), '--','linewidth',2 ); 76 | loglog( nList, nList.*log2(nList)*times(1,ref)/(nList(ref).*log2(nList(ref))), '--','linewidth',2 ); 77 | loglog( nList, nList.^2*times(1,ref)/(nList(ref)^2), '--','linewidth',2 ); 78 | legend( {typeList{:}, 'O(n)','O(n log 
n)','O(n^2)'}, 'location','northwest' ) 79 | ylim([1e-3,20]); 80 | title('Time to compute the scaled prox, median of 5 runs'); 81 | %% Save as a file 82 | % set(gcf, 'PaperPositionMode', 'auto'); 83 | % print -dpng test_prox_speed.png -------------------------------------------------------------------------------- /tests/test_prox_speed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/zeroSR1/ca0bcb8bf7746c1aa14dc32ffbdba23708293990/tests/test_prox_speed.png -------------------------------------------------------------------------------- /tests/test_solver_simple.m: -------------------------------------------------------------------------------- 1 | %{ 2 | Solve a few simple problems to make sure it works 3 | Solutions are saved in reference_solutions/ 4 | 5 | For now, we only have 1 test problem 6 | 7 | Stephen Becker, March 1 2014 8 | %} 9 | 10 | PROBLEM = 1; 11 | 12 | switch PROBLEM 13 | 14 | case 1 15 | N = 12; 16 | A = hilb(N); 17 | b = ones(N,1); 18 | lambda = 1e-1; 19 | 20 | Q = A'*A; 21 | c = A'*b; 22 | normQ = norm(Q); 23 | 24 | problemName=sprintf('simple_%03d', PROBLEM ); 25 | % Call this script, which returns variable xRef 26 | getReferenceSolution; 27 | nrmXref = norm(xRef); 28 | errFcn = @(x) norm( x - xRef )/nrmXref; 29 | 30 | % prox = @(x0,d,u) prox_rank1_l1( x0, d, u, lambda ); 31 | % or, allow 4 arguments, e.g., sigma 32 | prox = @(x0,d,u,varargin) prox_rank1_l1( x0, d, u, lambda, [], varargin{:} ); 33 | h = @(x) lambda*norm(x,1); 34 | 35 | % NOTE: the non-standard form (not |Ax-b|, rather ) 36 | % The "simple" means we do NOT include the lambda term 37 | % fcnSimple = @(w) w'*(Q*w)/2 - c'*w; 38 | % gradSimple = @(w) Q*w - c; % doesn't include non-smooth portion 39 | % % for L-BFGS-B, we will add to gradSimple, since we have made new smooth terms 40 | % fcn = @(w) fcnSimple(w) + h(w); 41 | 42 | % This does all the work for you 43 | % fcnGrad = @(x) 
quadraticFunction(x,Q,c);
44 | 
45 | % Or this form, which doesn't require Q to be formed
46 | % it should be a bit more numerically stable too
47 | fcnGrad = @(x) normSquaredFunction(x,A,[],b);
48 | 
49 | end
50 | 
51 | %% Solve with zeroSR1
52 | opts = struct('N',N,'verbose',25,'nmax',4000,'tol',1e-13);
53 | opts.L = normQ; % optional
54 | opts.errFcn = errFcn;
55 | 
56 | % -- Default values usually fine --
57 | % opts.BB = true;
58 | % opts.SR1_diagWeight=0.8;
59 | 
60 | tic
61 | [xk,nit, errStruct,optsOut] = zeroSR1(fcnGrad,[],h,prox,opts);
62 | % -- You can also call it this way, but can be slower --
63 | % [xk,nit, errStruct,optsOut] = zeroSR1(fcnSimple,gradSimple,h,prox,opts);
64 | tm = toc;
65 | solverStr = 'zeroSR1';
66 | fprintf('Final error for %15s is %.2e, took %.2f seconds\n', solverStr, errFcn(xk), tm );
67 | figure(1); clf;
68 | semilogy(errStruct(:,4) );
69 | hold all
70 | emphasizeRecent
71 | 
72 | %% and same solver but with pure BB, no 0SR1
73 | opts.SR1 = false;
74 | opts.BB_type = 2;
75 | opts.BB = true; tic; % FIX: restart the stopwatch; without this, tm2 also counted the first solve and the plotting above
76 | [xk,nit, errStruct,optsOut] = zeroSR1(fcnGrad,[],h,prox,opts);
77 | tm2 = toc;
78 | solverStr = 'BB, no linesearch, i.e., basically SPG/SpaRSA';
79 | fprintf('Final error for %15s is %.2e, took %.2f seconds\n', solverStr, errFcn(xk), tm2 );
80 | semilogy(errStruct(:,4) );
81 | hold all
82 | emphasizeRecent
83 | legend('zeroSR1','standard proximal gradient');
--------------------------------------------------------------------------------
/utilities/Contents.m:
--------------------------------------------------------------------------------
1 | % UTILITIES Collection of useful functions
2 | % cummin - reports the cumulative minimum of a sequence
3 | % emphasizeRecent - Makes the most recent line-series in bold, and all the others
4 | % fminunc_wrapper - bundles together function and gradient calls
5 | % rng - Control the random number generator used by RAND, RANDI, etc.
6 | 
--------------------------------------------------------------------------------
/utilities/cummin.m:
--------------------------------------------------------------------------------
1 | function x = cummin(x)
2 | % y = cummin(x)
3 | % finds the cumulative minimum of x
4 | % e.g. y_i = min( x_i, y_{i-1} )
5 | %
6 | % Stephen Becker, 2011, stephen.beckr@gmail.com
7 | 
8 | if numel(x) > length(x)
9 | error('input must be a vector');
10 | end
11 | for k = 2:length(x)
12 | x(k) = min( x(k), x(k-1) );
13 | end
14 | 
--------------------------------------------------------------------------------
/utilities/emphasizeRecent.m:
--------------------------------------------------------------------------------
1 | function emphasizeRecent
2 | % Makes the most recent line-series in bold, and all the others
3 | % are not in bold. Call this with no arguments or outputs.
4 | % Written by Stephen Becker, stephen.beckr@gmail.com 2011
5 | 
6 | list = get(gca,'children');
7 | 
8 | % Make everything else normal width
9 | % i.e. undo any previous calls of emphasizeRecent()
10 | set( list, 'linewidth', 0.5 );
11 | 
12 | % Make most recent item in bold
13 | set( list(1), 'linewidth',2);
14 | 
--------------------------------------------------------------------------------
/utilities/fminunc_wrapper.m:
--------------------------------------------------------------------------------
1 | function [f,g,h] = fminunc_wrapper(x,F,G,H, errFcn,extraFcn)
2 | % [f,g,h] = fminunc_wrapper( x, F, G, H, errFcn )
3 | % for use with Matlab's "fminunc" and other optimization programs
4 | % with similar conventions.
5 | % Here, "x" is the current point, "F" is the objective function,
6 | % "G" is the gradient of F, and "H" is the Hessian of F.
7 | %
8 | % "errFcn", if provided, will be evaluated at x and the results
9 | % stored in the "errHist" variable.
10 | % 11 | % [fHist,errHist] = fminunc_wrapper() 12 | % will return the function history 13 | % (and error history as well, if errFcn was provided) 14 | % and reset the history to zero. 15 | % 16 | % Written by Stephen Becker, 2011, stephen.beckr@gmail.com 17 | % Feb 2015, if F is vector-valued, then the history feature 18 | % is disabled (could fix it if I need this feature) 19 | 20 | persistent errHist fcnHist nCalls 21 | if nargin == 0 22 | % we are in [fHist,errHist] = fminunc_wrapper(); mode ) 23 | f = fcnHist(1:nCalls); 24 | g = errHist(1:nCalls); 25 | fcnHist = []; 26 | errHist = []; 27 | nCalls = 0; 28 | return; 29 | end 30 | if isempty( fcnHist ) 31 | [errHist,fcnHist] = deal( zeros(100,1) ); 32 | end 33 | 34 | f = F(x); 35 | if numel(f)==1 36 | % Record this: 37 | nCalls = nCalls + 1; 38 | if length( errHist ) < nCalls 39 | % allocate more memory 40 | errHist(end:2*end) = 0; 41 | fcnHist(end:2*end) = 0; 42 | end 43 | fcnHist(nCalls) = f; 44 | if nargin >= 6 && ~isempty(extraFcn) 45 | % this is used when we want to record the objective function 46 | % for something non-smooth, and this routine is used only for the smooth 47 | % part. So for recording purposes, add in the nonsmooth part 48 | % But do NOT return it as a function value or it will mess up the 49 | % optimization algorithm. 50 | fcnHist(nCalls) = f + extraFcn(x); 51 | end 52 | end 53 | 54 | if nargin > 2 && nargout > 1 55 | g = G(x); 56 | end 57 | if nargin > 3 && ~isempty(H) && nargout > 2 58 | h = H(x); 59 | end 60 | 61 | % and if error is requested... 
62 | if nargin >= 5 && ~isempty( errFcn) 63 | if length( errHist ) < nCalls 64 | % allocate more memory 65 | errHist(end:2*end) = 0; 66 | end 67 | errHist(nCalls) = errFcn(x); 68 | end 69 | -------------------------------------------------------------------------------- /utilities/rng.m: -------------------------------------------------------------------------------- 1 | function varargout = rng(varargin) 2 | %RNG Control the random number generator used by RAND, RANDI, and RANDN (SRB version) 3 | % RNG(SD) seeds the random number generator using the non-negative 4 | % integer SD so that RAND, RANDI, and RANDN produce a predictable 5 | % sequence of numbers. 6 | % 7 | % RNG('shuffle') seeds the random number generator based on the current 8 | % time so that RAND, RANDI, and RANDN produce a different sequence of 9 | % numbers after each time you call RNG. 10 | % 11 | % RNG(SD,GENERATOR) and RNG('shuffle',GENERATOR) additionally specify the 12 | % type of the random number generator used by RAND, RANDI, and RANDN. 13 | % GENERATOR is one of: 14 | % 15 | % Generator Description 16 | % ------------------------------------------------------------------ 17 | % 'twister' Mersenne Twister 18 | % 'combRecursive' Combined Multiple Recursive 19 | % 'multFibonacci' Multiplicative Lagged Fibonacci 20 | % 'v5uniform' Legacy MATLAB 5.0 uniform generator 21 | % 'v5normal' Legacy MATLAB 5.0 normal generator 22 | % 'v4' Legacy MATLAB 4.0 generator 23 | % 24 | % RNG('default') puts the settings of the random number generator used by 25 | % RAND, RANDI, and RANDN to their default values so that they produce the 26 | % same random numbers as if you restarted MATLAB. In this release, the 27 | % default settings are the Mersenne Twister with seed 0. 28 | % 29 | % S = RNG returns the current settings of the random number generator 30 | % used by RAND, RANDI, and RANDN. The settings are returned in a 31 | % structure S with fields 'Type', 'Seed', and 'State'. 
32 | % 33 | % RNG(S) restores the settings of the random number generator used by 34 | % RAND, RANDI, and RANDN back to the values captured previously by 35 | % S = RNG. 36 | % 37 | % S = RNG(...) additionally returns the previous settings of the random 38 | % number generator used by RAND, RANDI, and RANDN before changing the 39 | % seed, generator type or the settings. 40 | % 41 | % Example 1: 42 | % s = rng % get the current generator settings 43 | % x = rand(1,5) % RAND generates some values 44 | % rng(s) % restore the generator settings 45 | % y = rand(1,5) % generate the same values so x and y are equal 46 | % 47 | % Example 2: 48 | % oldS = rng(0,'v5uniform') % use legacy generator 49 | % x = rand % legacy startup value .9501 50 | % rng(oldS) % restore the old settings 51 | % 52 | % See Updating Your Random Number Generator Syntax to use RNG to replace 53 | % RAND or RANDN with the 'seed', 'state', or 'twister' inputs. 54 | % 55 | % MODIFIED BY STEPHEN BECKER 56 | % See also RAND, RANDI, RANDN, RandStream, NOW. 57 | 58 | 59 | % See Choosing a Random Number Generator for details on these generators. 60 | 61 | % Copyright 2010 The MathWorks, Inc. 62 | % $Revision: 1.1.6.1 $ $Date: 2010/10/25 16:06:38 $ 63 | 64 | persistent do_once 65 | % 2014, rng is not builtin, it's in a package, so be careful: 66 | C = which('rng','-all'); 67 | if isempty( do_once ), do_once = 0; end 68 | if size(C,1) > 1 && do_once < size(C,1) 69 | do_once = do_once + 1; 70 | % add this directory to the very top of the path so it shadows this 71 | % file... 
72 | addpath(fileparts( C{end} ) ) 73 | % disp('Re-run your code; the path to rng has been fixed'); 74 | [varargout{1:nargout}] = rng( varargin{:} ); 75 | return; 76 | end 77 | 78 | if exist('rng','builtin') 79 | [varargout{1:nargout}] = builtin('rng',varargin{:} ); 80 | return; 81 | end 82 | 83 | % if exist('rng','builtin') 84 | % switch nargin 85 | % case 0 86 | % if nargout > 0 87 | % settings = builtin('rng'); 88 | % else 89 | % builtin('rng'); 90 | % end 91 | % case 1 92 | % if nargout > 0 93 | % settings = builtin('rng',arg1); 94 | % else 95 | % builtin('rng',arg1); 96 | % end 97 | % case 2 98 | % if nargout > 0 99 | % settings = builtin('rng',arg1,arg2); 100 | % else 101 | % builtin('rng',arg1,arg2); 102 | % end 103 | % end 104 | % return; 105 | % end 106 | 107 | % -- SRB adding this -- 108 | error(nargchk(1,1,nargin)); 109 | error(nargoutchk(0,0,nargout)); 110 | arg1 = varargin{1}; 111 | % For R2008a, this doesn't work... (not sure what earliest version is) 112 | if verLessThan('matlab','7.7') 113 | randn('state',arg1); 114 | rand('state',arg1); 115 | elseif verLessThan('matlab','8') 116 | RandStream.setDefaultStream(RandStream('mt19937ar', 'seed', arg1 )); 117 | else 118 | RandStream.setGlobalStream(RandStream('mt19937ar', 'seed', arg1 )); 119 | end 120 | --------------------------------------------------------------------------------