├── .gitignore ├── LICENSE ├── README.md ├── VERSION ├── algorithms └── zeroSR1.m ├── paperExperiments ├── Lasso │ ├── Contents.m │ ├── README.md │ ├── cummin.m │ ├── fminunc_wrapper.m │ ├── proj_Rplus_weighted.m │ ├── proj_box_weighted.m │ ├── prox_l1_rank1.m │ ├── runTestsForPaper.m │ ├── test4.png │ ├── test5.png │ ├── zeroSR1.m │ └── zeroSR1_noLinesearch.m ├── README.md └── groupLasso │ ├── Algorithms │ ├── FISTA.py │ ├── ForwardBackwardSplitting.py │ ├── MFZeroSR1_ProximalGradient.py │ ├── SpaRSA.py │ ├── TsengZerosSR1_ProximalGradient.py │ ├── ZeroSR1_ProximalGradient.py │ └── __init__.py │ ├── README.md │ ├── clib │ ├── Makefile │ ├── mymath.cpp │ └── mymath.h │ ├── data_group_lasso.npy │ ├── mymath.py │ └── test_groupLasso.py ├── proxes ├── Contents.m ├── proj_rank1_Rplus.m ├── proj_rank1_box.m ├── proj_rank1_linf.m ├── prox_rank1_generic.m ├── prox_rank1_hinge.m ├── prox_rank1_l1.m └── prox_rank1_l1pos.m ├── setup_zeroSR1.m ├── smoothFunctions ├── normSquaredFunction.m └── quadraticFunction.m ├── tests ├── computeReferenceSolution.m ├── getReferenceSolution.m ├── reference_solutions │ └── simple_001.mat ├── solution_via_cvx.m ├── test_prox_accuracy.m ├── test_prox_speed.m ├── test_prox_speed.png └── test_solver_simple.m └── utilities ├── Contents.m ├── cummin.m ├── emphasizeRecent.m ├── fminunc_wrapper.m └── rng.m /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | *.pyc 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Stephen Becker and Jalal Fadili 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of University of Paris 6 - Pierre and Marie Curie, ENSICAEN, 15 | GREYC, CNRS, or IBM Research, nor the names of their 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # zeroSR1 toolbox 2 | 3 | The zeroSR1 toolbox implements the algorithm from 'A quasi-Newton proximal splitting method' by 4 | Stephen Becker and Jalal Fadili, which appeared in [NIPS 2012](http://nips.cc/). The paper is available at [arXiv 1206.1156](http://arxiv.org/abs/1206.1156). 5 | 6 | (Update, January 2018, we have an extended paper [On Quasi-Newton Forward--Backward Splitting: Proximal Calculus and Convergence](https://arxiv.org/abs/1801.08691) by Stephen Becker, Jalal Fadili and Peter Ochs) 7 | 8 | Briefly, the algorithm follows the standard proximal-gradient method, but allows a scaled prox. This enables us to use a limited-memory SR1 method (similar to L-BFGS). 9 | 10 | The algorithm solves problems of the form min\_x f(x) + h(x) where f is differentiable (more precisely, with a Lipschitz gradient) and h is one of the following (see the paper): 11 | 12 | Available "h" | Cost for input of size "n" 13 | ------------- | ------------- 14 | l1 norm | O( n log n) 15 | non-negativity constraints | O( n log n) 16 | l1 and non-negativity | O( n log n) 17 | box constraints | O( n log n ) 18 | l\_infinity norm constraint | O( n log n ) 19 | [hinge loss](http://en.wikipedia.org/wiki/Hinge_loss) | O( n log n ) 20 | 21 | The algorithm compares favorably with other methods, including [L-BFGS-B](http://www.mathworks.com/matlabcentral/fileexchange/35104-lbfgsb-l-bfgs-b-mex-wrapper). 22 | 23 | This toolbox currently implements in the following languages 24 | 25 | * Matlab 26 | * Octave 27 | 28 | Further releases may target these languages: 29 | 30 | * Python 31 | * R 32 | * C++ 33 | 34 | # Installation 35 | For Matlab, there is no installation necessary. Every time you run a new Matlab session, run the `setup_zeroSR1.m` file and it will add the correct paths. 
36 | 37 | Run `tests/test_solver_simple.m` to see how to solve a typical problem 38 | 39 | # Structure 40 | In each folder, see the `Contents.m` file for more information 41 | ### Algorithms 42 | This includes the zeroSR1 algorithm as well as implementations of FISTA and other proximal-gradient methods 43 | 44 | ### Proxes 45 | The scaled diagonal + rank-1 prox operators for various "g" functions 46 | 47 | ### SmoothFunctions 48 | These are pre-made wrappers for the various smooth "f" functions. The files here with the `_splitting` suffix are intended for use with any method that requires forming the augmented variable "x\_aug = (x\_pos, x\_neg)". For example, this approach is used when using L-BFGS-B (which only allows box constraints, such as x\_pos >= 0, x\_neg <= 0) to solve the LASSO problem. 49 | 50 | ### Utilities 51 | Helper files 52 | 53 | ### Tests 54 | Verify the algorithm and proxes are working correctly. This uses [CVX](http://cvxr.com/cvx) to verify; if this is not installed on your system, then it relies on precomputed solutions stored in a subdirectory. 55 | 56 | ### paperExperiments 57 | Recreates the experiments in the 2018 paper 58 | 59 | # Authors 60 | The original authors are Stephen Becker, Jalal Fadili and Peter Ochs. Further contributions are welcome. 
61 | 62 | ## Citing 63 | This software is provided free of charge, but we request that if you use this for an academic paper, please cite the following work: 64 | 65 | bibtex entry: 66 | 67 | @inproceedings{quasiNewtonNIPS, 68 | author = {Becker, Stephen and Fadili, Jalal}, 69 | title = {A quasi-{N}ewton proximal splitting method}, 70 | booktitle = {Neural Information Processing Systems (NIPS)}, 71 | year = {2012} 72 | } 73 | 74 | @article{quasiNewtonSIOPT, 75 | author = {Becker, Stephen and Fadili, Jalal and Ochs, Peter}, 76 | title = {On Quasi-{N}ewton Forward-Backward Splitting: Proximal Calculus and Convergence}, 77 | journal = {SIAM Journal on Optimization}, 78 | volume = {29}, 79 | number = {4}, 80 | pages = {2445-2481}, 81 | year = {2019}, 82 | doi = {10.1137/18M1167152}, 83 | URL = {https://doi.org/10.1137/18M1167152}, 84 | eprint = {https://doi.org/10.1137/18M1167152} 85 | } 86 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | v0.1 2 | Spring 2014 3 | -------------------------------------------------------------------------------- /algorithms/zeroSR1.m: -------------------------------------------------------------------------------- 1 | function [xk,nit, errStruct, defaultOpts, stepsizes] = zeroSR1(fcn,grad,h,prox,opts) 2 | % ZEROSR1 Solves smooth + nonsmooth/constrained optimization problems 3 | % [xk,nit, errStruct, outOpts] = zeroSR1(f,grad_f,h,prox_h,opts) 4 | % 5 | % This uses the zero-memory SR1 method (quasi-Newton) to solve: 6 | % 7 | % min_x f(x) + h(x) 8 | % 9 | % where 10 | % 'f' calculates f(x), 'grad_f' calculates the gradient of f at x, 11 | % and h(x) is a non-smooth term that can be infinite-valued (a constraint), 12 | % so long as you present a function 'prox' that computes diagional plus 13 | % rank-1 projections. 
The 'prox' function should accept at least three inputs: 14 | % 15 | % if 'grad_f' is empty, then we assume the 'f' function is actually 16 | % computing both f and grad_f (e.g., just f if nargout=1, and 17 | % f and grad_f if nargout=2). This method is often preferable 18 | % since you can re-use computation 19 | % 20 | % 'h' is the non-smooth function, and prox_h is a function with 21 | % 3 or 4 inputs that returns: 22 | % y = prox_h( x0 , d, v, ) 23 | % where 24 | % y = argmin_x h(x) + 1/2||x-x0||^2_B 25 | % and 26 | % B = inv(H) = inv( diag(D) + v*v' ) 27 | % or, for the case with 4 arguments, y = prox_h( x0, d, v, sigma ) 28 | % then B = inv( diag(D) + sigma*v*v' ) where sigma should be +1 or -1 29 | % The 4 argument case only matters when opts.SR1=true and opts.BB_type=1 30 | % or opts.SR1=true, opts.BB_type=1 and opts.SR1_diagWeight > 1 31 | % 32 | % If 'prox_h' isn't provided or is [], it defaults to the identity mapping, which corresponds 33 | % to the case when h=0. 34 | % 35 | % 'prox_h' is mean to be given by something like prox_rank1_l1 36 | % e.g., 37 | % prox = @(x0,d,v) prox_rank1_l1( x0, d, v, lambda ); 38 | % or, for 4 arguments, 39 | % prox = @(x0,d,v,varargin) prox_rank1_l1( x0, d, v, lambda, [], varargin{:} ); 40 | % 41 | % "opts" is a structure with additional options. To see their default values, 42 | % call this function with no input arguments. 
43 | % 44 | % .tol is a tolerance for relative variation 45 | % .nmax is max # of allowed iterations 46 | % .verbose can be either 0 (no output), 1 (every iteration), or n 47 | % If 'n' is an integer greater than 1, output will be written 48 | % every n iterations 49 | % .x0 50 | % starting vector 51 | % .N 52 | % size of primal domain (only necessary if x0 wasn't provided) 53 | % 54 | % .SR1 if true, uses the zero-memory SR1 method (default) 55 | % if false, uses gradient descent/forward-backward method 56 | % (or variant, such as BB stepsizes as in the SPG method) 57 | % .SR1_diagWeight is a scalar > 0 that controls the weight of the 58 | % BB stepsize, and is usually between 0 and 1. 59 | % If set to exactly 1, then the rank 1 term is exactly zero 60 | % .BB 61 | % use the Barzilai-Borwein scalar stepsize (by default, true) 62 | % .BB_type = 1 uses the longer of the B-B steps 63 | % .BB_type = 2 uses the shorter of the steps 64 | % with 0SR1, BB_type=1 is not possible 65 | % BB_type=2 is used, and is scaled by 0 < opts.SR1_diagWeight < 1 66 | % 67 | % .errFcn can be an arbitrary function that calculates an error metric 68 | % on the primal variable at every iteration. 69 | % 70 | % 71 | % Output "errStruct" contains three or four columns: 72 | % (1) objective function 73 | % (2) norm of gradient 74 | % (3) stepsize 75 | % (4) error (i.e. the output of errFcn, if provided) 76 | % 77 | % Stephen Becker and Jalal Fadili, Nov 24 2011 -- Dec 2012 78 | % Copied from zeroSR1.m Dec 11 2012 79 | % Feb 28 2014, unnesting all functions to make compatible with octave. 
80 | % 81 | % See also proximalGradient.m 82 | 83 | 84 | 85 | % ----------------------------------------------------------------- 86 | % ------------ Boring initializations ----------------------------- 87 | % ------------ for understanding the algorithm, skip ahead -------- 88 | % ------------ to where it says "Begin algorithm"------------------ 89 | % ----------------------------------------------------------------- 90 | 91 | if nargin == 0 || nargout >= 4 92 | RECORD_OPTS = true; 93 | % defaultOpts = []; 94 | else 95 | RECORD_OPTS = false; 96 | end 97 | 98 | if nargin < 3 || isempty(h) 99 | if nargin >= 4 && ~isempty(prox) 100 | warning('zeroSR1:h_not_provided','Found prox_h but not h itself. Setting h=0, prox=I'); 101 | prox = @(x,varargin) x; 102 | end 103 | h = @(x) 0; 104 | end 105 | if nargin < 4 || isempty(prox), prox = @(x,varargin) x; end 106 | if nargin < 5, opts = []; end 107 | 108 | setOptsSubFcn(); % zero out any persistent variables 109 | setOpts = @(varargin) setOptsSubFcn( RECORD_OPTS, opts, varargin{:} ); 110 | % Usage: setOpts( field, default, mn, mx, emptyOK (default:false) ); 111 | 112 | fid = setOpts('fid', 1 ); % print output to the screen or a file 113 | myDisp = @(str) fprintf(fid,'%s\n', str ); 114 | tol = setOpts( 'tol', 1e-6 ); 115 | grad_tol= setOpts( 'grad_tol', tol ); 116 | nmax = setOpts( 'nmax', 1000 ); 117 | errFcn = setOpts( 'errFcn', [] ); 118 | VERBOSE = setOpts( 'verbose', false ); 119 | if isinf(VERBOSE), VERBOSE = false; end 120 | maxStag = setOpts( 'maxStag', 10 ); % force very high accuracy 121 | xk = setOpts( 'x0', [], [], [], true ); 122 | N = setOpts( 'N', length(xk) ); 123 | if N==0 && nargin > 0, error('for now, must specify opts.N = N'); end 124 | if isempty(xk), xk = zeros(N,1); end 125 | damped = setOpts('damped',false); % 1=no damping, .01 = very tiny step 126 | 127 | % -- Options that concern the stepsize -- 128 | SR1 = setOpts( 'SR1', true ); 129 | BFGS = setOpts( 'BFGS', false ); 130 | if SR1 && BFGS 131 | 
error('zeroSR1:conflictingArgs','Cannot set SR1 and BFGS to both be true'); 132 | end 133 | BB = setOpts( 'BB', SR1 || BFGS ); 134 | if isfield(opts,'L') && isempty(opts.L) && ~BB 135 | warning('zeroSR1:noGoodStepsize','Without Lipschitz constant nor BB stepsize nor line search, bad things will happen'); 136 | end 137 | L = setOpts( 'L', 1, 0 ); % Lipschitz constant, e.g. norm(A)^2 138 | 139 | SIGMA = +1; % used for SR1 feature 140 | % Default BB stepsize. type "1" is longer and usually faster 141 | BB_type = setOpts('BB_type',2*(SR1||BFGS) + 1*(~(SR1||BFGS)) ); 142 | if (SR1||BFGS) && BB_type == 1 143 | % warning('zeroSR1:badBB_parameter','With zero-memory SR1, BB_type must be set to 2. Forcing BB_type = 2 and continuing'); 144 | % BB_type = 2; 145 | 146 | warning('zeroSR1:experimental','With zero-memory SR1, BB_type=1 is an untested feature'); 147 | SIGMA = -1; 148 | end 149 | if SR1 150 | defaultWeight = 0.8*(BB_type==2) + 1.0*(BB_type==1); 151 | else 152 | defaultWeight = 1; 153 | end 154 | SR1_diagWeight = setOpts( 'SR1_diagWeight', defaultWeight ); 155 | if SR1 && BB_type == 2 && SR1_diagWeight > 1 156 | SIGMA = -1; 157 | end 158 | 159 | % ------------ Scan options for capitalization issues, etc. ------- 160 | [defaultOpts,opts] = setOpts(); 161 | if nargin == 0 162 | disp('Default options:'); 163 | disp( defaultOpts ); 164 | end 165 | if ~isempty(fieldnames(opts)) 166 | disp('Error detected! 
I didn''t recognize these options:'); 167 | disp( opts ); 168 | error('Bad options'); 169 | end 170 | if nargin == 0 , return; end 171 | 172 | % ------------ Initializations and such --------------------------- 173 | xk_old = xk; 174 | % gradient = zeros(N,1); 175 | getGradient = @(varargin) getGradientFcn(fcn,grad, varargin{:}); 176 | fxold = Inf; 177 | t = 1/L; % initial stepsize 178 | stepsizes = zeros(nmax,1 + (SR1||BFGS)); % records some statisics 179 | if ~isempty(errFcn) 180 | if ~isa(errFcn,'function_handle') 181 | error('errFcn must be a function'); 182 | end 183 | errStruct = zeros( nmax, 4 ); % f, norm(gx), step, err 184 | else 185 | errStruct = zeros( nmax, 3 ); % f, norm(gx), step 186 | end 187 | skipBB = false; 188 | stag = 0; 189 | 190 | 191 | gradient = getGradient(xk); 192 | gradient_old = gradient; 193 | f_xk = []; 194 | 195 | % ----------------------------------------------------------------- 196 | % ------------ Begin algorithm ------------------------------------ 197 | % ----------------------------------------------------------------- 198 | for nit = 1:nmax 199 | 200 | % Do this at end now, so we can get fcn value for free 201 | % gradient_old = gradient; 202 | % gradient = grad(xk); 203 | 204 | % "sk" and "yk" are the vectors that will give us quasi-Newton 205 | % information (and also used in BB step, since that can be 206 | % seen as a quasi-Newton method) 207 | sk = xk - xk_old; 208 | yk = gradient - gradient_old; % Following notation in Nocedal/Wright 209 | if nit > 1 && norm(yk) < 1e-13 210 | warning('zeroSR1:zeroChangeInGradient','gradient isn''t changing , try changing opts.L'); 211 | yk = []; 212 | skipBB = true; 213 | end 214 | 215 | 216 | % --------------------------------------------------------------------- 217 | % -- Find an initial stepsize -- 218 | % --------------------------------------------------------------------- 219 | % t_old = t; 220 | if BB && nit > 1 && ~skipBB 221 | switch BB_type 222 | case 1 223 | t = 
(norm(sk)^2)/(sk'*yk); % eq (1.6) in Dai/Fletcher. This is longer 224 | case 2 225 | t = sk'*yk/( norm(yk)^2 ); % eq (1.7) in Dai/Fletcher. This is shorter 226 | end 227 | if t < 1e-14 % t < 0 should not happen on convex problem! 228 | myDisp('Curvature condition violated!'); 229 | stag = Inf; 230 | end 231 | if SR1 || BFGS 232 | % we cannot take a full BB step, otherwise we exactly satisfy the secant 233 | % equation, and there is no need for a rank-1 correction. 234 | t = SR1_diagWeight*t; % SR1_diagWeights is a scalar less than 1 like 0.6 235 | end 236 | H0 = @(x) t*x; 237 | diagH = t*ones(N,1); 238 | else 239 | t = 1/L; 240 | H0 = @(x) t*x; % diagonal portion of inverse Hessian 241 | diagH = t*ones(N,1); 242 | end 243 | skipBB = false; 244 | stepsizes(nit,1) = t; 245 | 246 | 247 | 248 | % --------------------------------------------------------------------- 249 | % -- Quasi-Newton -- Requries: H0, and builds H 250 | % --------------------------------------------------------------------- 251 | if SR1 && nit > 1 && ~isempty(yk) 252 | gs = yk'*sk; 253 | % gHg = yk'*(diagH.*yk); % not needed any more 254 | if gs < 0 255 | myDisp('Serious curvature condition problem!'); 256 | stag = Inf; 257 | end 258 | H0 = @(x) diagH.*x; 259 | vk = sk - H0(yk); 260 | vkyk = vk'*yk; 261 | SIGMA_LOCAL = sign( vkyk ); 262 | %if SIGMA*vkyk <= 0 263 | if SIGMA_LOCAL*vkyk <= 0 264 | myDisp('Warning: violated curvature conditions'); 265 | % This should only happen if we took an exact B-B step, which we don't. 266 | vk = []; 267 | H = H0; 268 | stepsizes(nit,2) = 0; 269 | else 270 | vk = vk/sqrt( SIGMA_LOCAL*vkyk ); 271 | % And at last, our rank-1 approximation of the inverse Hessian. 272 | H = @(x) H0(x) + SIGMA_LOCAL*(vk*(vk'*x)); 273 | % The (inverse) secant equation is B*sk = yk(=y), or Hy=s 274 | % N.B. We can make a rank-1 approx. of the Hessian too; see the full 275 | % version of the code. 
276 | 277 | stepsizes(nit,2) = vk'*vk; 278 | end 279 | elseif BFGS && nit > 1 && ~isempty(yk) 280 | gs = yk'*sk; 281 | rho= 1/gs; 282 | if gs < 0 283 | myDisp('Serious curvature condition problem!'); 284 | stag = Inf; 285 | end 286 | H0 = @(x) diagH.*x; 287 | 288 | tauBB = sk'*yk/( norm(yk)^2); 289 | uk = sk/2 + H0(sk)/(2*tauBB) - H0(yk); 290 | % if H0 is tauBB*I (e.g., gamma=1), then vk = sk - H0(yk). 291 | 292 | 293 | stepsizes(nit,2) = uk'*uk; 294 | 295 | vk = [sk-uk, sk+uk]*sqrt(rho/2); % rank 2! 296 | SIGMA_LOCAL = [-1,1]; 297 | 298 | H = @(x) H0(x) + vk*( diag(SIGMA_LOCAL)*(vk'*x) ); 299 | 300 | %fprintf('DEBUG: %.2e\n', norm( H(yk) - sk ) ); 301 | 302 | else 303 | SIGMA_LOCAL = SIGMA; 304 | H = H0; 305 | vk= []; 306 | end 307 | 308 | 309 | % --------------------------------------------------------------------- 310 | % -- Make the proximal update ----------------------------------------- 311 | % --------------------------------------------------------------------- 312 | p = H(-gradient); % Scaled descent direction. H includes the stepsize 313 | xk_old = xk; 314 | if ~isequal(SIGMA_LOCAL,1) 315 | if damped 316 | xk = xk + damped*(prox( xk_old + p, diagH, vk, SIGMA_LOCAL )-xk); 317 | else 318 | xk = prox( xk_old + p, diagH, vk, SIGMA_LOCAL ); 319 | end 320 | else 321 | if damped 322 | xk = xk + damped*(prox( xk_old + p, diagH, vk )-xk); 323 | else 324 | xk = prox( xk_old + p, diagH, vk ); % proximal step 325 | end 326 | 327 | end 328 | 329 | norm_grad = norm( xk - xk_old ); 330 | if any(isnan(xk)) || norm(xk) > 1e10 331 | stag = Inf; % will cause it to break 332 | xk = xk_old; 333 | myDisp('Prox algorithm failed, probably due to numerical cancellations'); 334 | end 335 | 336 | % --------------------------------------------------------------------- 337 | % -- The rest of the code is boring. The algorithmic stuff is done. 
--- 338 | % --------------------------------------------------------------------- 339 | % -- record function values -- 340 | % --------------------------------------------------------------------- 341 | gradient_old = gradient; 342 | [gradient,f_xk] = getGradient(xk); % can be cheaper if user provided a nice fcn 343 | fx = f_xk + h(xk); 344 | % fx = fcn(xk) + h(xk); 345 | df = abs(fx - fxold)/abs(fxold); 346 | fxold = fx; 347 | 348 | if (df < tol) || ( t < 1e-10 ) || (isnan(fx) ) || norm_grad < grad_tol 349 | stag = stag + 1; 350 | end 351 | 352 | if VERBOSE && (~rem(nit,VERBOSE) || stag>maxStag ) 353 | fprintf(fid,'Iter: %5d, f: % 7.3e, df: %.2e, ||grad||: %.2e, step %.2e\n',... 354 | nit,fx,df, norm_grad, t); 355 | end 356 | 357 | errStruct(nit,1) = fx; 358 | errStruct(nit,2) = norm_grad; 359 | errStruct(nit,3) = t; 360 | if ~isempty(errFcn) 361 | errStruct(nit,4) = errFcn( xk ); 362 | if VERBOSE && (~rem(nit,VERBOSE) || stag>maxStag ) 363 | fprintf(fid,'\b, err %.2e\n', errStruct(nit,4) ); 364 | end 365 | end 366 | 367 | 368 | if stag > maxStag 369 | if VERBOSE, myDisp('Quitting (e.g. 
reached tolerence)...'); end 370 | break; 371 | end 372 | 373 | end 374 | 375 | if nit == nmax && VERBOSE, myDisp('Maxed out iteration limit'); end 376 | if nit < nmax 377 | errStruct = errStruct( 1:nit, : ); 378 | stepsizes = stepsizes( 1:nit, : ); 379 | end 380 | 381 | end % end of main routine 382 | 383 | function [gradientValue,fcnValue] = getGradientFcn( fcn, gradient, x, str ) 384 | % The user can either specify fcn and gradient separately, 385 | % or they can specify them both in a single function (also called fcn) 386 | % This latter option is triggered whenever gradient=[] 387 | if nargin < 4, str = []; end 388 | if isempty(gradient) 389 | [fcnValue,gradientValue] = fcn(x); 390 | else 391 | gradientValue = gradient(x); 392 | if nargout > 1 393 | if strcmpi(str,'fcn_optional') 394 | fcnValue = []; 395 | else 396 | fcnValue = fcn(x); 397 | end 398 | end 399 | end 400 | end 401 | 402 | function varargout = setOptsSubFcn(RECORD_OPTS, opts, field, default, mn, mx, emptyOK ) 403 | persistent defaultOpts 404 | persistent updatedOpts 405 | if nargin <= 2 406 | % non-standard usage 407 | varargout{1} = defaultOpts; 408 | varargout{2} = updatedOpts; 409 | defaultOpts = []; 410 | updatedOpts = []; 411 | return; 412 | end 413 | if isempty( updatedOpts ), updatedOpts = opts; end 414 | 415 | % if emptyOK is false, then values of opts.field=[] are not allowed and 416 | % are instead set to the default value 417 | if nargin < 7 || isempty(emptyOK), emptyOK = false; end 418 | if ~isfield( opts, field ) || (isempty(opts.(field)) && ~emptyOK ) 419 | opts.(field) = default; 420 | end 421 | out = opts.(field); 422 | varargout{1} = out; 423 | if nargin >= 5 && ~isempty(mn) && any(out < mn), error('Value is too small'); end 424 | if nargin >= 6 && ~isempty(mx) && any(out > mx), error('Value is too large'); end 425 | if isfield( updatedOpts, field ) 426 | updatedOpts = rmfield( updatedOpts, field ); % so we can do a check later 427 | end 428 | if RECORD_OPTS 429 | 
defaultOpts.(field) = out; 430 | end 431 | end 432 | -------------------------------------------------------------------------------- /paperExperiments/Lasso/Contents.m: -------------------------------------------------------------------------------- 1 | % FIG2_LASSO 2 | % Recreates Fig 6.1 from https://arxiv.org/pdf/1801.08691.pdf 3 | % 4 | % Main Files 5 | % runTestsForPaper - Script to run all the tests 6 | % 7 | % Helper Files 8 | % zeroSR1 - [xk,nit, errStruct, outOpts] = zeroSR1(f,g,proj,opts) 9 | % zeroSR1_noLinesearch - Solves smooth + nonsmooth/constrained optimization problems 10 | % fminunc_wrapper - wrapper for objective and gradient 11 | % proj_box_weighted - Projection onto box constraints 12 | % prox_l1_rank1 - Prox of l1 with diagonal + rank-1 metric 13 | % proj_Rplus_weighted - Projection onto x>=0 with diagonal + rank-1 14 | % cummin - Cumulative minimum 15 | % 16 | % Feb 1 2018 -------------------------------------------------------------------------------- /paperExperiments/Lasso/README.md: -------------------------------------------------------------------------------- 1 | # zeroSR1: Lasso Experiments 2 | 3 | This folder contains the Matlab code to run the Lasso experiments. 4 | The version of code used here may be slightly different than the updated algorithms in the main repository. 5 | 6 | Some third-party packages (not provided, though we list the URLs) are required if you want to compare with the other solvers mentioned in the paper. 7 | 8 | In the code, "test 4" is Fig 6.1 (left) from our [2018 paper](https://arxiv.org/pdf/1801.08691.pdf) (similar to Fig 1.a from our [2012 paper](https://arxiv.org/pdf/1206.1156.pdf)) 9 | 10 | Similarly, "test 6" is Fig 6.1 (right) from our [2018 paper](https://arxiv.org/pdf/1801.08691.pdf) (similar to Fig 1.b from our [2012 paper](https://arxiv.org/pdf/1206.1156.pdf)) 11 | 12 | ## Third party packages 13 | If you install these, make sure to add them to the Matlab path. 
You can follow the example `addpath` commands that we used. 14 | 15 | ### L-BFGS-B 16 | We wrote our own Matlab wrapper for this (using the L-BFGS-B 3.0 Fortran 17 | code). You can download it from: https://github.com/stephenbeckr/L-BFGS-B-C 18 | 19 | Unpack it somewhere and run `lbfgsb_C/Matlab/compile_mex.m` 20 | 21 | ### ASA 22 | See http://users.clas.ufl.edu/hager/papers/Software/ 23 | 24 | as of 2013, they have [ver 3.0](http://users.clas.ufl.edu/hager/papers/CG/Archive/ASA_CG-3.0.tar.gz) but their older [ver 2.2](http://users.clas.ufl.edu/hager/papers/CG/Archive/ASA_CG-2.2.tar.gz) is still online. 25 | 26 | You also need the Matlab interface; we wrote this ourselves, and it can be downloaded from [Mathworks file exchange no 35814](https://www.mathworks.com/matlabcentral/fileexchange/35814-mex-interface-for-bound-constrained-optimization-via-asa) (it will also download the main C source code for you) 27 | 28 | If you download the Matlab interface, run the `test_ASA.m` script and it will download the ASA code that it needs. 29 | 30 | ### CGIST 31 | Get CGIST from their [website](http://tag7.web.rice.edu/CGIST.html) or [direct link to .zip file](http://tag7.web.rice.edu/CGIST_files/cgist.zip). 32 | 33 | ### FPC 34 | Get FPC AS from [their website](http://www.caam.rice.edu/~optimization/L1/FPC_AS/request-for-downloading-fpc_as.html) 35 | 36 | ### L1General package, with PSSas and OWL 37 | Get the L1General2 code from [Mark Schmidt's software website](https://www.cs.ubc.ca/~schmidtm/Software/thesis.html) or [direct link to thesis.zip](https://www.cs.ubc.ca/~schmidtm/Software/thesis.zip). 38 | 39 | Note: you need to compile mex files for this (for the lbfgs subroutine) 40 | For compilation, try: `minFunc/mexAll.m` 41 | 42 | We noticed that line 13 in `lbfgsC.c` declared `int nVars,nSteps,lhs_dims[2];` and for us, this threw a warning at compile-time and an error at run-time. 
One fix is to remove the `lhs_dims[2]` from that line and instead add a new line with: `size_t lhs_dims[2];` 43 | 44 | ## Output 45 | 46 | Running test4 should give something like this: 47 | 48 | ![Test 4 results](test4.png?raw=true) 49 | 50 | Running test5 should give something like this: 51 | 52 | ![Test 5 results](test5.png?raw=true) 53 | 54 | ## Authors 55 | The authors are Stephen Becker, Jalal Fadili and Peter Ochs. 56 | 57 | This README is from Feb 1 2018. Thanks to https://stackedit.io/app for editing markup 58 | -------------------------------------------------------------------------------- /paperExperiments/Lasso/cummin.m: -------------------------------------------------------------------------------- 1 | function x = cummin(x) 2 | % y = cummin(x) 3 | % finds the cumulative minimum of x 4 | % e.g. y_i = min( x_i, y_{i-1} ) 5 | 6 | if numel(x) > length(x) 7 | error('input must be a vector'); 8 | end 9 | for k = 2:length(x) 10 | x(k) = min( x(k), x(k-1) ); 11 | end -------------------------------------------------------------------------------- /paperExperiments/Lasso/fminunc_wrapper.m: -------------------------------------------------------------------------------- 1 | function [f,g,h] = fminunc_wrapper(x,F,G,H, errFcn,extraFcn) 2 | % [f,g,h] = fminunc_wrapper( x, F, G, H, errFcn ) 3 | % for use with Matlab's "fminunc" 4 | % 5 | % [fHist,errHist] = fminunc_wrapper() 6 | % will return the function history 7 | % (and error history as well, if errFcn was provided) 8 | % and reset the history to zero. 
9 | persistent errHist fcnHist nCalls 10 | if nargin == 0 11 | f = fcnHist(1:nCalls); 12 | g = errHist(1:nCalls); 13 | fcnHist = []; 14 | errHist = []; 15 | nCalls = 0; 16 | return; 17 | end 18 | if isempty( fcnHist ) 19 | [errHist,fcnHist] = deal( zeros(100,1) ); 20 | end 21 | 22 | f = F(x); 23 | % Record this: 24 | nCalls = nCalls + 1; 25 | if length( errHist ) < nCalls 26 | % allocate more memory 27 | errHist(end:2*end) = 0; 28 | fcnHist(end:2*end) = 0; 29 | end 30 | fcnHist(nCalls) = f; 31 | if nargin >= 6 && ~isempty(extraFcn) 32 | % this is used when we want to record the objective function 33 | % for something non-smooth, and this routine is used only for the smooth 34 | % part. So for recording purposes, add in the nonsmooth part 35 | % But do NOT return it as a function value or it will mess up the 36 | % optimization algorithm. 37 | fcnHist(nCalls) = f + extraFcn(x); 38 | end 39 | 40 | if nargin > 2 && nargout > 1 41 | g = G(x); 42 | end 43 | if nargin > 3 && ~isempty(H) && nargout > 2 44 | h = H(x); 45 | end 46 | 47 | % and if error is requested... 48 | if nargin >= 5 && ~isempty( errFcn) 49 | errHist(nCalls) = errFcn(x); 50 | end -------------------------------------------------------------------------------- /paperExperiments/Lasso/proj_Rplus_weighted.m: -------------------------------------------------------------------------------- 1 | function [x,lambda,cnt, sEst] = proj_Rplus_weighted( x0, D, L, scale, linTerm ) 2 | % x = proj_Rplus_weighted( x0, D, L ) or 3 | % x = proj_Rplus_weighted( x0, D, L, scale ) or 4 | % x = proj_Rplus_weighted( x0, D, L, scale, c ) 5 | % returns the solution 6 | % x = argmin 1/2||x-x0||^2_{Q,2}' + subject to scale*x >= 0 7 | % where 8 | % ||x-x0||^2_{Q,2} = < x - x0, Q*(x-x0) > 9 | % and 10 | % inv(Q) = D + L*L' is a diagonal + rank-1 matrix 11 | % with D = diag(d) > 0 is psd. 12 | % 13 | % Only the sign of the "scale" input has an effect (if scale < 0, 14 | % the the constraints become x <= 0 instead of x >= 0 ). 
15 | % 16 | % The algorithm takes O( n*log(n) ) running time. 17 | % 18 | % [x,lambda] = ... 19 | % also returns a dual vector lambda 20 | % 21 | % [x,lambda,iter] = ... 22 | % also returns the number of iterations ( iter <= ceil( log_2(n) + 1 ) ). 23 | % 24 | % [x,lambda,iter,s] = ... 25 | % also returns the scalar dual variable 's' 26 | % 27 | % Stephen Becker, Dec 1 2011. srbecker@caltech.edu 28 | % March 13, changing order of (... linTerm, scale ) to (... scale, linTerm ) 29 | 30 | 31 | VERBOSE = false; 32 | if nargin < 3, L = []; end % this is just scalar stuff then... 33 | if nargin < 5, linTerm = []; end 34 | if nargin < 4 || isempty(scale), scale = 1; end 35 | 36 | if isvector(D), d = D; %D = diag(d); 37 | else d = diag(D); end 38 | if any( d < 0 ), error('Diagonal term must be positive'); end 39 | % make sure everything is a column vector 40 | if size(x0,2) > 1, x0 = x0.'; end 41 | if size(L,2) > 1, L = L.' ; end 42 | 43 | 44 | 45 | % -- If the user doesn't specify L, then it's a standard projection -- 46 | if isempty(L) 47 | if ~isempty( linTerm ) 48 | error('Can''t handle that case yet. Shouldn''t be too difficult though...'); 49 | else 50 | x = max( x0, 0 ); 51 | lambda = []; 52 | cnt = 1; 53 | sEst = 0; 54 | return; 55 | end 56 | end 57 | 58 | 59 | 60 | 61 | RESCALE = false; 62 | if scale == 0 63 | error('Cannot handle lambda = 0'); 64 | elseif scale < 0 65 | RESCALE = true; 66 | x0 = -x0; 67 | end 68 | 69 | R = L; 70 | N = length(x0); 71 | 72 | if nargin >= 4 && ~isempty( linTerm ) 73 | % We can incorporate this (i.e. 
"c" in the equation above, 74 | % but not the same "c" used below) into the x0 term: 75 | if size(linTerm,2) > 1, linTerm = linTerm.'; end 76 | if RESCALE, linTerm = -linTerm; end 77 | x0 = x0 - (d.*linTerm + L*(R'*linTerm) ); 78 | 79 | end 80 | 81 | 82 | % from now on, "lambda" refers to the dual vector 83 | % We will find a strictly complementary solution (x,lambda) such 84 | % that x >= 0, lambda >= 0, and = 0 85 | 86 | % sList = -x0./L; 87 | % sList = sort(sList); % +/- inf are OK. 88 | sList = unique( -x0./L ); % remove duplicate +/- infinities 89 | sListInf = [ -Inf; sList; Inf ]; 90 | 91 | S = [ sList(1)-1; (sList + circshift(sList,-1))/2 ]; 92 | S(end) = sList(end) + 1; 93 | % so the element of S are right in the middle: no boundary points, 94 | % ensuring strict complementarity 95 | 96 | % Thus, we have defined the active set for both x and lambda 97 | DONE = false; 98 | mn = 0; % inclusive 99 | mx = length(sList); % inclusive 100 | maxIt = ceil( log2(mx) ) + 1; 101 | 102 | A = -R./d; 103 | B = A.*L; 104 | A = A.*x0; 105 | 106 | for cnt = 0:maxIt % should take logN iterations, or fewer 107 | 108 | k = round( (mn+mx)/2 ); % pick the next entry 109 | s = S(k+1); % i.e. sList(k-1) < s < sList(k) 110 | 111 | % T = find( x0 + s*L > 0); 112 | Tc = find( x0 + s*L < 0); % we never have y + s = 0, by design of S 113 | 114 | % support of lambda is now well defined in terms of 's'. 
115 | % a = -R(Tc)'*( x0(Tc)./d(Tc) ); 116 | % b = -R(Tc)'*( L(Tc)./d(Tc) ); 117 | % alternatively, compute them this way (might be faster): 118 | a = sum( A(Tc) ); 119 | b = sum( B(Tc) ); 120 | sEst = a/(1-b); 121 | 122 | % find bounds: 123 | lb = sListInf( k+1 ); 124 | ub = sListInf( k+2 ); 125 | 126 | % debugging: verify that these are indeed the correct bounds: 127 | % OK = ( s > lb ) && ( s < ub ); 128 | % if ~OK, disp('Violated bounds!'); error('Problem!'); end 129 | 130 | if sEst < lb 131 | str = 'v'; 132 | % reduce the upper bound 133 | mx = k; 134 | elseif sEst > ub 135 | str = '^'; 136 | % increase the lower bound 137 | mn = k; 138 | else 139 | str = '-'; 140 | DONE = true; 141 | end 142 | if VERBOSE, fprintf('k=%2d, [%6.1f, %6.1f], sEst is %6.1f: %s\n',k,lb,ub, sEst, str ); end 143 | if DONE, break; end 144 | end 145 | assert( cnt < maxIt, 'rank-1 prox algorithm failed to converge'); 146 | 147 | T = find( x0 + s*L > 0); 148 | x = zeros(N,1); 149 | x(T) = x0(T) + sEst*L(T); 150 | if nargout > 1 151 | lambda = zeros(N,1); 152 | lambda(Tc) = -(x0(Tc) + sEst*L(Tc) )./d(Tc); 153 | end 154 | 155 | if RESCALE 156 | x = -x; 157 | if nargout > 1 158 | lambda = -lambda; 159 | end 160 | end 161 | -------------------------------------------------------------------------------- /paperExperiments/Lasso/proj_box_weighted.m: -------------------------------------------------------------------------------- 1 | function [x,lambda,cnt, sEst] = proj_box_weighted( x0, D, L, lwr, upr, linTerm ) 2 | % x = proj_box_weighted( x0, D, L, lwr, upr ) or 3 | % x = proj_box_weighted( x0, D, L, lwr, upr, c ) 4 | % returns the solution 5 | % x = argmin 1/2||x-x0||^2_{Q,2}' + subject to lwr <= x <= upr 6 | % where 7 | % ||x-x0||^2_{Q,2} = < x - x0, Q*(x-x0) > 8 | % and 9 | % inv(Q) = D + L*L' is a diagonal + rank-1 matrix 10 | % with D = diag(d) > 0 is psd. 
11 | % 12 | % Only the sign of the "scale" input has an effect (if scale < 0, 13 | % the the constraints become x <= 0 instead of x >= 0 ). 14 | % 15 | % The algorithm takes O( n*log(n) ) running time. 16 | % 17 | % [x,lambda] = ... 18 | % also returns a dual vector lambda 19 | % 20 | % [x,lambda,iter] = ... 21 | % also returns the number of iterations ( iter <= ceil( log_2(n) + 1 ) ). 22 | % 23 | % [x,lambda,iter,s] = ... 24 | % also returns the scalar dual variable 's' 25 | % 26 | % Stephen Becker, Jun 12 2012. srbecker@alumni.caltech.edu 27 | 28 | 29 | VERBOSE = false; 30 | if nargin < 3, L = []; end % this is just scalar stuff then... 31 | if nargin < 6, linTerm = []; elseif ~isempty(linTerm) 32 | error('cannot yet handle this case'); 33 | end 34 | % if nargin < 6 || isempty(scale), scale = 1; end 35 | scale=1; 36 | 37 | if isvector(D), d = D; %D = diag(d); 38 | else d = diag(D); end 39 | if any( d < 0 ), error('Diagonal term must be positive'); end 40 | % make sure everything is a column vector 41 | if size(x0,2) > 1, x0 = x0.'; end 42 | if size(L,2) > 1, L = L.' ; end 43 | 44 | 45 | % -- If the user doesn't specify L, then it's a standard projection -- 46 | if isempty(L) 47 | if ~isempty( linTerm ) 48 | error('Can''t handle that case yet. Shouldn''t be too difficult though...'); 49 | else 50 | x = max( x0, lwr ); 51 | x = min( x, upr ); 52 | lambda = []; 53 | cnt = 1; 54 | sEst = 0; 55 | return; 56 | end 57 | end 58 | 59 | 60 | RESCALE = false; 61 | if scale == 0 62 | error('Cannot handle lambda = 0'); 63 | elseif scale < 0 64 | RESCALE = true; 65 | x0 = -x0; 66 | end 67 | 68 | R = L; 69 | N = length(x0); 70 | 71 | % if nargin >= 6 && ~isempty( linTerm ) 72 | % % We can incorporate this (i.e. 
"c" in the equation above, 73 | % % but not the same "c" used below) into the x0 term: 74 | % if size(linTerm,2) > 1, linTerm = linTerm.'; end 75 | % if RESCALE, linTerm = -linTerm; end 76 | % x0 = x0 - (d.*linTerm + L*(R'*linTerm) ); 77 | % 78 | % end 79 | 80 | 81 | % from now on, "lambda" refers to the dual vector 82 | % We will find a strictly complementary solution (x,lambda) such 83 | % that x >= 0, lambda >= 0, and = 0 84 | 85 | % sList = unique( -x0./L ); % remove duplicate +/- infinities 86 | sList = unique( [(lwr-x0)./L; (upr-x0)./L ]); 87 | sListInf = [ -Inf; sList; Inf ]; 88 | 89 | S = [ sList(1)-1; (sList + circshift(sList,-1))/2 ]; 90 | S(end) = sList(end) + 1; 91 | % so the element of S are right in the middle: no boundary points, 92 | % ensuring strict complementarity 93 | 94 | % Thus, we have defined the active set for both x and lambda 95 | DONE = false; 96 | mn = 0; % inclusive 97 | mx = length(sList); % inclusive 98 | maxIt = ceil( log2(mx) ) + 1; 99 | 100 | A = -R./d; 101 | B = A.*L; 102 | A1 = A.*(x0-lwr); 103 | A2 = A.*(x0-upr); 104 | 105 | for cnt = 0:maxIt % should take logN iterations, or fewer 106 | 107 | k = round( (mn+mx)/2 ); % pick the next entry 108 | s = S(k+1); % i.e. sList(k-1) < s < sList(k) 109 | 110 | % T = find( x0 + s*L > 0); 111 | % Tc = find( x0 + s*L < 0); % we never have y + s = 0, by design of S 112 | Tc1 = find( x0 + s*L < lwr ); 113 | Tc2 = find( x0 + s*L > upr ); 114 | 115 | % support of lambda is now well defined in terms of 's'. 
116 | % a = -R(Tc)'*( x0(Tc)./d(Tc) ); 117 | % b = -R(Tc)'*( L(Tc)./d(Tc) ); 118 | % alternatively, compute them this way (might be faster): 119 | % a = sum( A(Tc) ); 120 | % b = sum( B(Tc) ); 121 | a = sum( A1(Tc2) ) + sum( A2(Tc2) ); 122 | b = sum( B(Tc1) ) + sum( B(Tc2) ); 123 | sEst = a/(1-b); 124 | 125 | % find bounds: 126 | lb = sListInf( k+1 ); 127 | ub = sListInf( k+2 ); 128 | 129 | % debugging: verify that these are indeed the correct bounds: 130 | % OK = ( s > lb ) && ( s < ub ); 131 | % if ~OK, disp('Violated bounds!'); error('Problem!'); end 132 | 133 | if sEst < lb 134 | str = 'v'; 135 | % reduce the upper bound 136 | mx = k; 137 | elseif sEst > ub 138 | str = '^'; 139 | % increase the lower bound 140 | mn = k; 141 | else 142 | str = '-'; 143 | DONE = true; 144 | end 145 | if VERBOSE, fprintf('k=%2d, [%6.1f, %6.1f], sEst is %6.1f: %s\n',k,lb,ub, sEst, str ); end 146 | if DONE, break; end 147 | end 148 | assert( cnt < maxIt, 'rank-1 prox algorithm failed to converge'); 149 | 150 | % T = find( x0 + s*L > 0); 151 | % x = zeros(N,1); 152 | % x(T) = x0(T) + sEst*L(T); 153 | 154 | x = x0 + sEst*L; 155 | x = min( max(x,lwr), upr ); 156 | 157 | % if nargout > 1 158 | % lambda = zeros(N,1); 159 | % lambda(Tc) = -(x0(Tc) + sEst*L(Tc) )./d(Tc); 160 | % end 161 | 162 | % if RESCALE 163 | % x = -x; 164 | % if nargout > 1 165 | % lambda = -lambda; 166 | % end 167 | % end 168 | -------------------------------------------------------------------------------- /paperExperiments/Lasso/prox_l1_rank1.m: -------------------------------------------------------------------------------- 1 | function [x,cBest,cnt] = prox_l1_rank1( x0, D, L, lambda, linTerm ) 2 | % x = prox_l1_weighted( x0, D, u ) or 3 | % x = prox_l1_weighted( x0, D, u, lambda ) or 4 | % x = prox_l1_weighted( x0, D, u, lambda, c ) 5 | % returns the solution 6 | % x = argmin lambda*||x||_1 + 1/2||x-x0||^2_{B,2}' + 7 | % where 8 | % ||x-x0||^2_{B,2} = < x - x0, B*(x-x0) > 9 | % and 10 | % H = inv(B) = D + u*u' is 
a diagonal + rank-1 matrix 11 | % with D = diag(d) > 0 is positive definite. 12 | % 13 | % The algorithm takes O( n*log(n) ) running time. 14 | % Inputs must be real, not complex. 15 | % 16 | % [x,c] = ... 17 | % also returns c, where 18 | % x = shrink( x0 - c*u, d ); 19 | % 20 | % [x,c,iter] = ... 21 | % also returns the number of iterations ( iter <= ceil( log_2(n) + 1 ) ). 22 | % 23 | % Stephen Becker, Dec 10 2010 -- April 2012. stephen.beckr@gmail.com 24 | 25 | % Modified May 27 2011 to handle the "c" term 26 | % Modified Nov 24 2011 to handle the "lambda" term and be more efficient 27 | % Modified Feb 29 2012 to be more accurate when L has many zeros 28 | % Modified Mar 13 2012 to allow vector "lambda" term. 29 | 30 | % Note about the code: the documentation refers to H = D + u*u' 31 | % The code uses the notation H = D + L*R' (L for left, R for right) 32 | % In general, we need R=L (so that H is positive definite), 33 | % but we keep the "R" notation because it makes the derivation more clear, 34 | % and we don't always have R=L after we remove zero terms from L. 35 | 36 | 37 | % -------- Preprocess --------------------- 38 | 39 | if isvector(D), d = D; % D = diag(d); 40 | else d = diag(D); end 41 | if any( d < 0 ), error('Diagonal term must be positive'); end 42 | % make sure everything is a column vector 43 | if size(x0,2) > 1, x0 = x0.'; end 44 | if size(L,2) > 1, L = L.' 
; end 45 | 46 | if nargin >= 5 && ~isempty( linTerm ) 47 | if size(linTerm,2) > 1, linTerm = linTerm.'; end 48 | x0 = x0 - (d.*linTerm + L*(L'*linTerm) ); 49 | end 50 | 51 | if nargin < 4 || isempty(lambda), lambda = 1; end 52 | if numel(lambda)>1 53 | % rescale 54 | if size(lambda,2) > 1, lambda = lambda.'; end 55 | if size(lambda,2) > 1 56 | lambda = diag(lambda); 57 | end 58 | d = lambda.*d; 59 | elseif lambda ~= 1 60 | % rescale 61 | d = lambda*d; 62 | L = sqrt(lambda)*L; 63 | end 64 | if isscalar(d), d = d*ones(size(x0)); end 65 | N = length(x0); 66 | 67 | % Now, we can pretend lambda=1 and linTerm=0, since they have been accounted for 68 | 69 | shrinkVec = @(x,d) sign(x).*max( abs(x) - d, 0 ); 70 | 71 | % If there is no low-rank term: 72 | if nargin < 3 || isempty(L) 73 | x = shrinkVec( x0, d ); 74 | cBest = 0; 75 | cnt = 0; 76 | return; 77 | end 78 | 79 | % Account for cases when L has many zeros... 80 | nonzeroL = find( abs(L) > 100*eps ); 81 | if length(nonzeroL) < N 82 | L_HAS_ZEROS = true; 83 | 84 | % and reduce the rest of it to a smaller problem: 85 | old_L = L; 86 | old_x0 = x0; 87 | old_d = d; 88 | x0 = x0(nonzeroL); 89 | d = d(nonzeroL); 90 | L = L(nonzeroL); 91 | else 92 | L_HAS_ZEROS = false; 93 | end 94 | R = L; 95 | if numel(lambda)>1 % For diagonal lambda 96 | if L_HAS_ZEROS 97 | R = lambda(nonzeroL).*R; 98 | else 99 | R = lambda.*R; 100 | end 101 | end 102 | 103 | c1 = (x0+d)./L; % if x_i < 0 104 | c2 = (x0-d)./L; % if x_i > 0 105 | c = [c1,c2]; 106 | cList = sort(c(:)); % list of break-points. 
107 | offset = 1e0; 108 | cList2 = [ cList(1)-offset; cList + [diff(cList)/2;offset] ]; % look in-between stuff 109 | cListInf = [-Inf; cList; +Inf ]; 110 | sL = sign(L); 111 | 112 | sLc1 = sL.*c1; % precompute 113 | sLc2 = sL.*c2; 114 | 115 | NN = length(cList2); 116 | % Keep track of counters: 117 | mn = 1; 118 | mx = NN; 119 | cnt = 0; 120 | j = round( (mn+mx)/2 ); 121 | 122 | % This loop would be nice in a mex file 123 | % (we want to do the "sort" in Matlab, since Matlab has a great sort function) 124 | 125 | % -------- Main loop --------------------- 126 | maxIt = NN+3; 127 | while cnt < maxIt % should never max out, but just in case of infinite loop due to coding error... 128 | cnt = cnt + 1; 129 | ci = cList2(j); 130 | 131 | % -- Step 1: estimate the support 132 | dx = ( sL*ci < sLc2 ) - ( sL*ci > sLc1 ); 133 | 134 | 135 | Tc = ~dx; 136 | T = ~~dx; 137 | alpha = R(T)'*dx(T); 138 | 139 | invA_vec = 1./d(Tc); % precompute for speed 140 | 141 | 142 | u = L(Tc); 143 | v = R(Tc); % since lambda nonscalar, we may have u ~= v 144 | 145 | vv = invA_vec.*v; 146 | zc = 1 + vv'*u; 147 | % QQ = invA - invA*u*v'*invA/zc; % conceptually, this is what we do, but this is slow numerically 148 | % dxTc = QQ*(x0(Tc) - alpha*L(Tc) ); 149 | 150 | % Make the above faster: 151 | yy = x0(Tc) - alpha*L(Tc); 152 | dxTc = invA_vec.*(yy - u*(vv'*yy)/zc); 153 | 154 | dx(Tc) = dxTc; 155 | cEst = R'*dx; % based on this support, this is our estimate of the shrinkage scalar 156 | 157 | % Test if this shrinkage scalar is permissible 158 | if cEst < cListInf(j) 159 | % We need to decrease the value of c 160 | mx = j; 161 | elseif cEst > cListInf(j+1) 162 | % We need to increase the value of c 163 | mn = j; 164 | else 165 | % The support is acceptable! 
166 | cBest = cEst; 167 | if any( abs(dxTc) > 1 ) 168 | disp('Weird behavior: bad subgradient'); 169 | cBest = NaN; 170 | end 171 | break; 172 | end 173 | 174 | % Next direction: 175 | % j = round( (mn+mx)/2 ); 176 | if mx > mn + 1 177 | j = round( (mn+mx)/2 ); 178 | else 179 | % There are only two left, [mn mn+1] 180 | if j == mn 181 | j = mn+1; 182 | else 183 | j = mn; 184 | end 185 | end 186 | 187 | 188 | end 189 | assert( cnt < maxIt, 'rank-1 prox algorithm failed to converge'); 190 | if isnan(cBest) 191 | warning('Found NaN','prox_l1_weighted:failed'); 192 | x = NaN; 193 | else 194 | % Account for cases when L has many zeros... 195 | if L_HAS_ZEROS 196 | x = shrinkVec( old_x0 - cBest*old_L, old_d ); % for 197 | else 198 | % In this case, I didn't waste the memory to copy L, x0 and d 199 | x = shrinkVec( x0 - (cBest)*L, d ); 200 | end 201 | end 202 | -------------------------------------------------------------------------------- /paperExperiments/Lasso/runTestsForPaper.m: -------------------------------------------------------------------------------- 1 | %{ 2 | For the paper, we ran tests 4 and 5 3 | test 4 is Fig 6.1 (left) from https://arxiv.org/pdf/1801.08691.pdf 4 | (similar to Fig 1.a from https://arxiv.org/pdf/1206.1156.pdf ) 5 | 6 | test 5 is Fig 6.1 (right) from https://arxiv.org/pdf/1801.08691.pdf 7 | (similar to Fig 1.b from https://arxiv.org/pdf/1206.1156.pdf ) 8 | 9 | We compare with 3rd party codes, but we don't redistribute their code, so 10 | we have documented where we got their code from and you are free to install 11 | their code and compare. 12 | 13 | -- Stephen Becker, Feb 2018 14 | %} 15 | %% L-BFGS-B 16 | %{ 17 | We wrote our own Matlab wrapper for this (using the L-BFGS-B 3.0 Fortran 18 | code). 
You can download it from: https://github.com/stephenbeckr/L-BFGS-B-C 19 | 20 | Unpack it somewhere and run lbfgsb_C/Matlab/compile_mex.m 21 | %} 22 | addpath ~/Repos/lbfgsb_C/Matlab 23 | %% ASA 24 | %{ 25 | http://users.clas.ufl.edu/hager/papers/Software/ 26 | as of 2013, they have v 3.0 27 | http://users.clas.ufl.edu/hager/papers/CG/Archive/ASA_CG-3.0.tar.gz 28 | but old code is still online at: 29 | http://users.clas.ufl.edu/hager/papers/CG/Archive/ASA_CG-2.2.tar.gz 30 | (old link was http://www.math.ufl.edu/~hager/papers/CG/Archive/ASA_CG-2.2.tar.gz, that's bad now) 31 | You also need the Matlab interface; we wrote this ourself, and it can be 32 | downloaded from Mathworks (it will also download the main C source code for you) 33 | https://www.mathworks.com/matlabcentral/fileexchange/35814-mex-interface-for-bound-constrained-optimization-via-asa 34 | 35 | If you download the Matlab interface, run the test_ASA.m script 36 | and it will downlaoad the ASA code that it needs. 37 | 38 | %} 39 | addpath('~/Documents/MATLAB/packages/ASA_CG_matlabWrapper'); 40 | %% CGIST 41 | %{ 42 | Get CGIST from: 43 | http://tag7.web.rice.edu/CGIST.html 44 | or http://tag7.web.rice.edu/CGIST_files/cgist.zip 45 | %} 46 | addpath('~/Documents/MATLAB/packages/cgist'); 47 | %% FPC 48 | %{ 49 | Get FPC AS from: 50 | http://www.caam.rice.edu/~optimization/L1/FPC_AS/request-for-downloading-fpc_as.html 51 | %} 52 | addpath('~/Documents/MATLAB/packages/FPC_AS_v1.21/src'); 53 | %% L1General and PSSas 54 | %{ 55 | Get the L1General2 code from 56 | https://www.cs.ubc.ca/~schmidtm/Software/thesis.html 57 | or https://www.cs.ubc.ca/~schmidtm/Software/thesis.zip 58 | 59 | Note: you need to compile mex files for this (for the lbfgs subroutine) 60 | For compilation, try: SchmidtThesis/minFunc/mexAll.m 61 | 62 | 2018, line 13 in lbfgsC.c, " int nVars,nSteps,lhs_dims[2];" 63 | With Matlab R2017b, this causes problems. 
Remove the lhs_dims[2] and add 64 | a new line with: " size_t lhs_dims[2];" 65 | 66 | %} 67 | addpath ~/Documents/MATLAB/packages/SchmidtThesis/L1General2/ 68 | addpath ~/Documents/MATLAB/packages/SchmidtThesis/misc/ 69 | addpath ~/Documents/MATLAB/packages/SchmidtThesis/minFunc/ 70 | 71 | %% Setup a problem 72 | 73 | randn('state',234213); rand('state',2342343); 74 | 75 | % --- fcn setup --- 76 | TEST = 4; 77 | % TEST = 5; 78 | switch TEST 79 | case 4 80 | % compressed sensing... 81 | N = 3000; % any larger than 5000 and it takes a while to get the norm(A) 82 | lambda = .1; 83 | A = randn(N/2,N); 84 | b = randn(size(A,1),1); 85 | Q = A'*A; c = A'*b; 86 | 87 | case 5 88 | % See Fletcher's paper 89 | n = 13; 90 | N = n^3; 91 | fprintf('N is %d\n', N ); 92 | lambda = 1; 93 | 94 | I = eye(n); 95 | BDG = -( diag( ones(n-1,1), 1 ) + diag( ones(n-1,1), -1 ) ); 96 | T = 6*I + BDG; 97 | 98 | W = kron( I, T ) + kron( BDG, I ); 99 | Q = kron( I, W ) + kron( BDG, eye(n^2) ); 100 | 101 | sigma = 20; a1 = 0.4; a2 = 0.7; a3 = 0.5; 102 | pdeSol = @(x,y,z) x.*(x-1).*y.*(y-1).*z.*(z-1).*exp( ... 103 | -.5*sigma^2*( (x-a1).^2 + (y-a2).^2 + (z-a3).^3 ) ); 104 | % Find rhs c = Q*u, where u is the solution above 105 | h = 1/(n+1); 106 | grd = h:h:(1-h); % interior points 107 | [X,Y,Z] = meshgrid(grd); 108 | u_pde = pdeSol( X, Y, Z ); 109 | c = Q*vec(u_pde); 110 | fprintf('||c||_inf is %g\n', norm(c,Inf) ); 111 | 112 | A = chol(Q); % has small condition number, e.g. 
8, and is upper bi-diagonal 113 | b = (A')\c; 114 | end 115 | 116 | %% More setup 117 | 118 | % --- Plotting and such --- 119 | 120 | NAMES = {}; 121 | OBJECTIVES = {}; 122 | TIMES = {}; 123 | % ------------------------- 124 | if size(A,1) < size(Q,1) 125 | if issparse(Q), normQ = normest(A*A'); else normQ = norm(A*A'); end 126 | else 127 | if issparse(Q), normQ = normest(Q); else normQ = norm(Q); end 128 | end 129 | lambdaVect = lambda*ones(N,1); 130 | fcn = @(w) w'*(Q*w)/2 - c'*w + lambda*norm(w,1); 131 | 132 | % NOTE: the non-standard form (not |Ax-b|, rather ) 133 | fcnSimple = @(w) w'*(Q*w)/2 - c'*w; 134 | gradSimple = @(w) Q*w - c; % doesn't include non-smooth portion 135 | % for L-BFGS-B, we will add to gradSimple, since we have made new smooth terms 136 | 137 | % for SR1 138 | prox = @(x0,d,l) prox_l1_rank1( x0, d, l, lambda ); 139 | 140 | % Setup operators for L-BFGS-B 141 | pos = @(w) w(1:N,:); 142 | neg = @(w) w(N+1:2*N,:); 143 | dbl = @(gg) [gg;-gg]; 144 | lambdaVect2 = [lambdaVect;lambdaVect]; 145 | fcn2 = @(w) fcnSimple( pos(w) - neg(w) ) + lambdaVect2'*w; 146 | grad2 = @(w) dbl(gradSimple(pos(w)-neg(w))) + lambdaVect2; 147 | 148 | 149 | %% SR1 150 | disp('Solving via SR1 with l1 constraint ...'); 151 | % fcn and grad are defined above now... 
152 | 153 | opts = struct('N',N,'verbose',50,'nmax',4000,'tol',1e-14); 154 | % opts.x0 = .1*ones(N,1); % use this for SR1 versions 155 | % opts.nmax = 5; 156 | opts.BB = true; 157 | % opts.theta = []; opts.restart=6; % use [] for FISTA 158 | opts.theta = 1; opts.SR1 = true; 159 | opts.SR1_diagWeight=0.8; 160 | 161 | opts.L = normQ; 162 | 163 | opts.backtrack = false; 164 | 165 | tic 166 | % The code I used for the 2012 tests 167 | % [xk,nit, errStruct,optsOut] = zeroSR1(fcn,gradSimple,prox,opts); 168 | 169 | % Dec '12, try our simplified code: 170 | opts = rmfield(opts,{'theta','backtrack'}); 171 | [xk,nit, errStruct,optsOut] = zeroSR1_noLinesearch(fcn,gradSimple,prox,opts); 172 | 173 | tm = toc; 174 | NAMES{end+1} = '0-mem SR1'; 175 | OBJECTIVES{end+1} = errStruct(:,1); 176 | TIMES{end+1} = tm; 177 | %% and run our code, but choose FISTA... 178 | 179 | opts.BB = true; 180 | opts.theta = []; opts.restart=1000; % use [] for FISTA 181 | % opts.theta = 1; 182 | opts.SR1 = false; 183 | 184 | opts.backtrack = true; 185 | 186 | tic 187 | [xk,nit, errStruct,optsOut] = zeroSR1(fcn,gradSimple,prox,opts); 188 | tm = toc; 189 | NAMES{end+1} = 'FISTA w/ BB'; % with linesearch 190 | OBJECTIVES{end+1} = errStruct(:,1); 191 | TIMES{end+1} = tm; 192 | %% and run our code, but choose BB... 193 | 194 | opts.BB = true; 195 | opts.theta = 1; 196 | opts.SR1 = false; 197 | 198 | opts.backtrack = true; 199 | 200 | tic 201 | [xk,nit, errStruct,optsOut] = zeroSR1(fcn,gradSimple,prox,opts); 202 | tm = toc; 203 | NAMES{end+1} = 'SPG/SpaRSA'; % with linesearch 204 | OBJECTIVES{end+1} = errStruct(:,1); 205 | TIMES{end+1} = tm; 206 | %% Run L-BFGS-B 207 | if ~exist('lbfgsb','file') 208 | disp('Cannot find L-BFGS-B on your path, so skipping this test'); 209 | else 210 | %{ 211 | Solve min L(x) + lambda*||x||_1 by formulating as: 212 | min_{z,y} L(z-y) + ones(2N,1)'*[z,y] 213 | s.t. 214 | z,y >= 0. i.e. 
"x" is z - y 215 | 216 | 217 | if we switch to simple x >= 0 formulation, then it solves it in 2 steps!! 218 | 219 | %} 220 | disp('Solving via L-BFGS-B...'); 221 | 222 | tic 223 | fun = @(x)fminunc_wrapper( x, fcn2, grad2); 224 | opts = struct( 'factr', 1e4, 'pgtol', 1e-12, 'm', 10, 'maxIts', 20000, 'maxTotalIts',1e6 ); 225 | opts.printEvery = 100; 226 | opts.factr = 1e1; % more accurate soln 227 | if N > 200 228 | opts.factr = 1e-2; 229 | opts.pgtol = 1e-14; 230 | end 231 | % opts.factr = 1e7; % default 232 | [x2, ~, info] = lbfgsb(fun, zeros(2*N,1), inf(2*N,1), opts ); 233 | x = pos(x2) - neg(x2); 234 | tm = toc; 235 | 236 | NAMES{end+1} = 'L-BFGS-B'; 237 | OBJECTIVES{end+1} = info.err(:,1); 238 | TIMES{end+1} = tm; 239 | end 240 | %% Run ASA 241 | if ~exist('asa_wrapper','file') 242 | disp('Cannot find ASA on your path, so skipping this test'); 243 | else 244 | 245 | % param = struct('A',A,'b',b); 246 | % param = struct('A',A,'b',b,'lambda',lambda); % No, I am not using this format... 247 | % an alternative way: 248 | 249 | % param = struct('Q',Q,'c',-c,'lambda',lambda); 250 | param = struct('Q',[Q,-Q;-Q,Q],'c',-[c;-c]+lambdaVect2,'offset',0); 251 | param.maxits = 1e6; 252 | 253 | % if isfield( MAXITS, 'ASA' ) && ~isempty( MAXITS.ASA ) 254 | % param.maxits = min( param.maxits, MAXITS.ASA ); 255 | % end 256 | 257 | % add some options (these are optional). See driver1.c for examples, 258 | % and see asa_user.h for all possible values 259 | [opts,CGopts] = deal(struct('PrintParms',false)); 260 | opts.PrintParms = 0; 261 | opts.PrintFinal = 1; 262 | opts.PrintLevel = 0; 263 | opts.StopFac = 1e-9; 264 | 265 | % zero-out the counters 266 | asa_quadratic_fcnGrad(); 267 | 268 | lo = zeros(2*N,1); 269 | hi = inf(2*N,1); 270 | % x0 = ones(2*N,1); 271 | x0 = zeros(2*N,1); 272 | % run the function 273 | disp('starting...'); 274 | tic 275 | [x2,status,statistics] = asa_wrapper( x0, lo, hi,'asa_quadratic_fcn',... 
276 | 'asa_quadratic_grad', 'asa_quadratic_fcnGrad', opts, CGopts, param); 277 | tm = toc; 278 | x = pos(x2) - neg(x2); 279 | % View the function values 280 | [fcnHistory] = asa_quadratic_fcnGrad(); 281 | 282 | NAMES{end+1} = 'ASA'; 283 | OBJECTIVES{end+1} = fcnHistory; 284 | TIMES{end+1} = tm; 285 | end 286 | %% Run PSSas and OWN (stuff from L1General toolbox) 287 | if ~exist('L1General2_PSSas','file') || ~exist('L1General2_OWL','file') 288 | disp('Cannot find PSSas or OWL and L1General on your path, so skipping this test'); 289 | else 290 | 291 | gOptions = []; 292 | gOptions.maxIter = 4000; 293 | gOptions.verbose = 1; % Set to 0 to turn off output 294 | gOptions.corrections = 10; % for L-BFGS 295 | gOptions.optTol = 1e-14; 296 | gOptions.progTol = 1e-15; 297 | 298 | % funObj = @(x)fminunc_wrapper( x, fcn, gradSimple); 299 | funObj = @(x)fminunc_wrapper( x, fcnSimple, gradSimple,[]); 300 | % This works well for error, but not for objective fcn value, 301 | % since this is only the smooth portion. So we need to add in 302 | % a non-smooth term that gets added just to the history. 
303 | extraFcn = @(x) lambda*norm(x,1); 304 | funObj = @(x)fminunc_wrapper( x, fcnSimple, gradSimple,[],[],extraFcn); 305 | 306 | 307 | w_init = zeros(N,1); 308 | 309 | fprintf('\nProjected Scaled Sub-Gradient (Active-Set variant)\n'); 310 | options = gOptions; 311 | 312 | fminunc_wrapper(); 313 | tic 314 | [wk,objectiveValues] = L1General2_PSSas(funObj,w_init,lambdaVect,options); 315 | tm = toc; 316 | if isempty( objectiveValues ) % it stopped on first iter 317 | % do it again, with larger starting guess 318 | w_init = ones(N,1); 319 | fminunc_wrapper(); 320 | tic 321 | [wk,objectiveValues] = L1General2_PSSas(funObj,w_init,lambdaVect,options); 322 | tm = toc; 323 | end 324 | [fcnHistory,errHistory] =fminunc_wrapper(); 325 | NAMES{end+1} = 'PSSas'; 326 | OBJECTIVES{end+1} = fcnHistory; 327 | TIMES{end+1} = tm; 328 | 329 | 330 | % And re-run for the OWL code 331 | fminunc_wrapper(); 332 | tic 333 | wk = L1General2_OWL(funObj,w_init,lambdaVect,options); 334 | tm = toc; 335 | [fcnHistory,errHistory] =fminunc_wrapper(); 336 | NAMES{end+1} = 'OWL'; 337 | OBJECTIVES{end+1} = fcnHistory; 338 | TIMES{end+1} = tm; 339 | 340 | end 341 | %% run cgist 342 | if ~exist('cgist','file') 343 | disp('Cannot find cgist on your path, so skipping this test'); 344 | else 345 | % solves ||Ax-f||^2 + lambda*|x|_1 346 | % So, from /2 -c'*x format, we have 347 | % 348 | regularizer = 'l1'; 349 | opts = []; 350 | opts.tol = 1e-8; 351 | opts.record_objective = true; 352 | opts.record_iterates = false; % big! 
353 | opts.errFcn = []; 354 | tic 355 | [xk, multCount, subgradientNorm, out] = cgist(A,[],b,lambda,regularizer,opts); 356 | tm = toc; 357 | % need to subtract norm(b)^2/2 to get objective fcn to line up 358 | out.objectives = out.objectives - norm(b)^2/2; 359 | 360 | NAMES{end+1} = 'CGIST'; 361 | OBJECTIVES{end+1} = out.objectives; 362 | TIMES{end+1} = tm; 363 | end 364 | %% run FPC-AS 365 | if ~exist('FPC_AS','file') 366 | disp('Cannot find FPC_AS on your path, so skipping this test'); 367 | else 368 | % v 1.1, 10/2008 Zaiwen Wen 369 | % 370 | % For some reason, need to give it some negatives... (-x vs +x) 371 | 372 | opts = []; 373 | opts.gtol = 1e-9; % a termination option of FPC_AS; see manual 374 | opts.mxitr = 6e3; 375 | opts.sub_mxitr = 80; % # of sub-space iterations (max) 376 | opts.lbfgs_m = 5; % storage 377 | opts.record = 0; % -1,0,1 378 | opts.PrintOptions = 0; 379 | % opts.scale_A = 1; 380 | M = []; 381 | % M = 10*eye(N); 382 | sc = 1; 383 | tic 384 | [x, out] = FPC_AS(N,-A/sc,b/sc,lambda/sqrt(sc),M,opts); 385 | tm = toc; 386 | out.fcnHist = out.fcnHist - norm(b)^2/2; 387 | NAMES{end+1} = 'FPC-AS'; 388 | OBJECTIVES{end+1} = out.fcnHist; 389 | TIMES{end+1} = tm; 390 | 391 | end 392 | %% PLOT EVERYTHING 393 | figure(1); clf; 394 | 395 | obj_best = Inf; 396 | for k = 1:length(OBJECTIVES) 397 | obj_best = min(obj_best, min( OBJECTIVES{k}) ); 398 | end 399 | 400 | for k = 1:length(NAMES) 401 | tGrid = linspace(0,TIMES{k},length(OBJECTIVES{k})); 402 | h=semilogy( tGrid, cummin( OBJECTIVES{k} - obj_best) ); 403 | 404 | set(h,'linewidth',2); 405 | 406 | hold all 407 | end 408 | legend(NAMES) 409 | xlabel('time in seconds','fontsize',18); 410 | ylabel('objective value error','fontsize',18); 411 | set(gca,'fontsize',18) 412 | switch TEST 413 | case 4 414 | title('Fig 6.1 (left) from https://arxiv.org/pdf/1801.08691.pdf'); 415 | xlim([0,110]); 416 | ylim([1e-8,1e4]); 417 | case 5 418 | title('Fig 6.1 (right) from https://arxiv.org/pdf/1801.08691.pdf'); 419 | 
xlim([0,2.5]); 420 | ylim([1e-8,1e9]); 421 | end 422 | -------------------------------------------------------------------------------- /paperExperiments/Lasso/test4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/zeroSR1/ca0bcb8bf7746c1aa14dc32ffbdba23708293990/paperExperiments/Lasso/test4.png -------------------------------------------------------------------------------- /paperExperiments/Lasso/test5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/zeroSR1/ca0bcb8bf7746c1aa14dc32ffbdba23708293990/paperExperiments/Lasso/test5.png -------------------------------------------------------------------------------- /paperExperiments/Lasso/zeroSR1_noLinesearch.m: -------------------------------------------------------------------------------- 1 | function [xk,nit, errStruct, defaultOpts, stepsizes] = zeroSR1_noLinesearch(fcn,grad,prox,opts) 2 | % ZEROSR1_NOLINESEARCH Solves smooth + nonsmooth/constrained optimization problems 3 | % [xk,nit, errStruct, outOpts] = zeroSR1_noLinesearch(f,g,proj,opts) 4 | % 5 | % This uses the zero-memory SR1 method (quasi-Newton) to solve: 6 | % 7 | % min_x f(x) + h(x) 8 | % 9 | % where 10 | % 'f' calculates f(x), 'g' calculates the gradient of f at x, 11 | % and h(x) is a non-smooth term that can be infinite-valued (a constraint), 12 | % so long as you present a function 'prox' that computes diagional plus 13 | % rank-1 projections. The 'prox' function should accept at least three inputs: 14 | % 15 | % y = prox( x0 , d, v, ) 16 | % where 17 | % y = argmin_x h(x) + 1/2||x-x0||^2_B 18 | % where 19 | % B = inv(H) = inv( diag(D) + v*v' ) 20 | % 21 | % If 'prox' isn't provided or is [], it defaults to the identity mapping, which corresponds 22 | % to the case when h=0. 23 | % 24 | % "opts" is a structure with additional options. 
To see their default values, 25 | % call this function with no input arguments. 26 | % 27 | % .tol is a tolerance for relative variation 28 | % .nmax is max # of allowed iterations 29 | % .verbose can be either 0 (no output), 1 (every iteration), or n 30 | % If 'n' is an integer greater than 1, output will be written 31 | % every n iterations 32 | % .x0 33 | % starting vector 34 | % .N 35 | % size of primal domain (only necessary of x0 wasn't provided) 36 | % 37 | % .SR1 if true, uses the zero-memory SR1 method 38 | % if false, uses gradient descent/forward-backward method 39 | % (or variant, such as FISTA, or BB stepsizes as in the SPG method) 40 | % .BB 41 | % use the Barzilai-Borwein scalar stepsize 42 | % 43 | % .errFcn can be an arbitrary function that calculates an error metric 44 | % on the primal variable at every iteration. 45 | % 46 | % 47 | % Output "errStruct" contains three or four columns: 48 | % (1) objective function 49 | % (2) norm of gradient 50 | % (3) stepsize 51 | % (4) error (i.e. the output of errFcn, if provided) 52 | % 53 | % Stephen Becker and Jalal Fadili, Nov 24 2011 -- Dec 2012 54 | % Copied from zeroSR1.m Dec 11 2012 55 | % (zeroSR1.m is the "full version of the code with more bells and 56 | % whistles, and also allows Nesterov acceleration and over-relaxation. 57 | % This version is designed to have more human readable source-code. 
) 58 | % See also zeroSR1.m 59 | 60 | 61 | 62 | % ----------------------------------------------------------------- 63 | % ------------ Boring initializations ----------------------------- 64 | % ------------ for understanding the algorithm, skip ahead -------- 65 | % ------------ to where it says "Begin algorithm"------------------ 66 | % ----------------------------------------------------------------- 67 | 68 | if nargin == 0 || nargout >= 4 69 | RECORD_OPTS = true; 70 | defaultOpts = []; 71 | else 72 | RECORD_OPTS = false; 73 | end 74 | 75 | if nargin < 3 || isempty(prox), prox = @(x,diag,v) x; end 76 | if nargin < 4, opts = []; end 77 | 78 | function out = setOpts( field, default, mn, mx ) 79 | if ~isfield( opts, field ) 80 | opts.(field) = default; 81 | end 82 | out = opts.(field); 83 | if nargin >= 3 && ~isempty(mn) && any(out < mn), error('Value is too small'); end 84 | if nargin >= 4 && ~isempty(mx) && any(out > mx), error('Value is too large'); end 85 | opts = rmfield( opts, field ); % so we can do a check later 86 | if RECORD_OPTS 87 | defaultOpts.(field) = out; 88 | end 89 | end 90 | 91 | 92 | fid = setOpts( 'fid', 1 ); % print output to the screen or a file 93 | myDisp = @(str) fprintf(fid,'%s\n', str ); 94 | tol = setOpts( 'tol', 1e-6 ); 95 | grad_tol= setOpts( 'grad_tol', tol ); 96 | nmax = setOpts( 'nmax', 1000 ); 97 | errFcn = setOpts( 'errFcn', [] ); 98 | VERBOSE = setOpts( 'verbose', false ); 99 | if isinf(VERBOSE), VERBOSE = false; end 100 | maxStag = setOpts( 'maxStag', 10 ); % force very high accuracy 101 | xk = setOpts( 'x0', [] ); 102 | N = setOpts( 'N', length(xk) ); 103 | if N==0 && nargin > 0, error('for now, must specify opts.N = N'); end 104 | if isempty(xk), xk = zeros(N,1); end 105 | 106 | % -- Options that concern the stepsize -- 107 | L = setOpts( 'L', 1, 0 ); % Lipschitz constant, e.g. 
norm(A)^2 108 | SR1 = setOpts( 'SR1', false ); 109 | SR1_diagWeight = setOpts( 'SR1_diagWeight', 0.8 ); 110 | BB = setOpts( 'BB', SR1 ); 111 | 112 | if SR1, BB_type = setOpts('BB_type',2); 113 | else, BB_type = setOpts('BB_type',1); % faster, generally 114 | end 115 | if SR1 && BB_type == 1 116 | warning('With zero-memory SR1, BB_type must be set to 2. Forcing BB_type = 2 and continuing','zeroSR1:BB_warn'); 117 | BB_type = 2; 118 | end 119 | % ------------ Scan options for capitalization issues, etc. ------- 120 | if nargin == 0 121 | disp('Default options:'); 122 | disp( defaultOpts ); 123 | end 124 | if ~isempty(fieldnames(opts)) 125 | disp('Error detected! I didn''t recognize these options:'); 126 | disp( opts ); 127 | error('Bad options'); 128 | end 129 | if nargin == 0 , return; end 130 | 131 | % ------------ Initializations and such --------------------------- 132 | xk_old = xk; 133 | gradient = zeros(N,1); 134 | fxold = Inf; 135 | t = 1/L; % initial stepsize 136 | stepsizes = zeros(nmax,1 + SR1); % records some statisics 137 | if ~isempty(errFcn) 138 | if ~isa(errFcn,'function_handle') 139 | error('errFcn must be a function'); 140 | end 141 | errStruct = zeros( nmax, 4 ); % f, norm(gx), step, err 142 | else 143 | errStruct = zeros( nmax, 3 ); % f, norm(gx), step 144 | end 145 | skipBB = false; 146 | stag = 0; 147 | 148 | % ----------------------------------------------------------------- 149 | % ------------ Begin algorithm ------------------------------------ 150 | % ----------------------------------------------------------------- 151 | for nit = 1:nmax 152 | 153 | gradient_old = gradient; 154 | gradient = grad(xk); 155 | 156 | % "sk" and "gk" are the vectors that will give us quasi-Newton 157 | % information (and also used in BB step, since that can be 158 | % seen as a quasi-Newton method) 159 | sk = xk - xk_old; 160 | gk = gradient - gradient_old; % this is "yk" in Nocedal/Wright 161 | if nit > 1 && norm(gk) < 1e-13 162 | warning('gradient isn''t 
changing , try changing opts.L','specialSR1:zeroChangeInGradient'); 163 | gk = []; 164 | skipBB = true; 165 | end 166 | 167 | 168 | % --------------------------------------------------------------------- 169 | % -- Find an initial stepsize -- 170 | % --------------------------------------------------------------------- 171 | t_old = t; 172 | if BB && nit > 1 && ~skipBB 173 | switch BB_type 174 | case 1 175 | t = (norm(sk)^2)/(sk'*gk); % eq (1.6) in Dai/Fletcher. This is longer 176 | case 2 177 | t = sk'*gk/( norm(gk)^2 ); % eq (1.7) in Dai/Fletcher. This is shorter 178 | end 179 | if t < 1e-14 % t < 0 should not happen on convex problem! 180 | myDisp('Curvature condition violated!'); 181 | stag = Inf; 182 | end 183 | if SR1 184 | % we cannot take a full BB step, otherwise we exactly satisfy the secant 185 | % equation, and there is no need for a rank-1 correction. 186 | t = SR1_diagWeight*t; % SR1_diagWeights is a scalar less than 1 like 0.6 187 | end 188 | H0 = @(x) t*x; 189 | diagH = t*ones(N,1); 190 | else 191 | t = 1/L; 192 | H0 = @(x) t*x; % diagonal portion of inverse Hessian 193 | diagH = t*ones(N,1); 194 | end 195 | skipBB = false; 196 | stepsizes(nit,1) = t; 197 | 198 | 199 | 200 | % --------------------------------------------------------------------- 201 | % -- Quasi-Newton -- Requries: H0, and builds H 202 | % --------------------------------------------------------------------- 203 | if SR1 && nit > 1 && ~isempty(gk) 204 | gs = gk'*sk; 205 | gHg = gk'*(diagH.*gk); 206 | if gs < 0, myDisp('Serious curvature condition problem!'); stag = Inf; end 207 | H0 = @(x) diagH.*x; 208 | vk = sk - H0(gk); 209 | if vk'*gk <= 0 210 | myDisp('Warning: violated curvature conditions'); 211 | % This should only happen if we took an exact B-B step, which we don't. 212 | vk = []; 213 | H = H0; 214 | else 215 | vk = vk/sqrt( vk'*gk ); 216 | % And at last, our rank-1 approximation of the inverse Hessian. 
217 | H = @(x) H0(x) + vk*(vk'*x); 218 | % The (inverse) secant equation is B*sk = gk(=y), or Hy=s 219 | % N.B. We can make a rank-1 approx. of the Hessian too; see the full 220 | % version of the code. 221 | end 222 | stepsizes(nit,2) = vk'*vk; 223 | else 224 | H = H0; 225 | vk= []; 226 | end 227 | 228 | 229 | % --------------------------------------------------------------------- 230 | % -- Make the proximal update ----------------------------------------- 231 | % --------------------------------------------------------------------- 232 | p = H(-gradient); % Scaled descent direction. H includes the stepsize 233 | xk_old = xk; 234 | xk = prox( xk_old + p, diagH, vk ); % proximal step 235 | norm_grad = norm( xk - xk_old ); 236 | if any(isnan(xk)) || norm(xk) > 1e10 237 | stag = Inf; % will cause it to break 238 | xk = xk_old; 239 | myDisp('Prox algorithm failed, probably due to numerical cancellations'); 240 | end 241 | 242 | % --------------------------------------------------------------------- 243 | % -- The rest of the code is boring. The algorithmic stuff is done. --- 244 | % --------------------------------------------------------------------- 245 | % -- record function values -- 246 | % --------------------------------------------------------------------- 247 | fx = fcn(xk); 248 | df = abs(fx - fxold)/abs(fxold); 249 | fxold = fx; 250 | 251 | printf('Iter: %5d, f: %.3e, df: %.2e, ||grad||: %.2e, step %.2e\n',... 252 | nit,fx,df, norm_grad, t); 253 | 254 | errStruct(nit,1) = fx; 255 | errStruct(nit,2) = norm_grad; 256 | errStruct(nit,3) = t; 257 | if ~isempty(errFcn) 258 | errStruct(nit,4) = errFcn( xk ); 259 | printf('\b, err %.2e\n', errStruct(nit,4) ); 260 | end 261 | 262 | if (df < tol) || ( t < 1e-10 ) || (isnan(fx) ) || norm_grad < grad_tol 263 | stag = stag + 1; 264 | end 265 | if stag > maxStag 266 | if VERBOSE, myDisp('Quitting (e.g. 
reached tolerence)...'); end 267 | break; 268 | end 269 | 270 | end 271 | 272 | if nit == nmax && VERBOSE, myDisp('Maxed out iteration limit'); end 273 | if nit < nmax 274 | errStruct = errStruct( 1:nit, : ); 275 | stepsizes = stepsizes( 1:nit, : ); 276 | printf('Iter: %5d, f: %.3e, df: %.2e, ||grad||: %.2e, step %.2e\n',... 277 | nit,fx,df, norm_grad, t); 278 | if ~isempty(errFcn) 279 | printf('\b, err %.2e\n', errStruct(nit,4) ); 280 | end 281 | end 282 | 283 | % --------------------------------------------------------------------- 284 | % Nested functions: 285 | % --------------------------------------------------------------------- 286 | function printf(varargin) 287 | if VERBOSE 288 | if VERBOSE > 1 289 | if ~rem(nit,VERBOSE) 290 | fprintf(fid,varargin{:}); 291 | end 292 | else 293 | fprintf(fid,varargin{:}); 294 | end 295 | end 296 | end 297 | 298 | 299 | end % end of main routine 300 | -------------------------------------------------------------------------------- /paperExperiments/README.md: -------------------------------------------------------------------------------- 1 | # zeroSR1 toolbox: Experiments 2 | 3 | 4 | This folder contains the Matlab and Python code needed to reproduce the figures from our 2018 paper. 5 | The version of code used here may be slightly different than the updated algorithms in the main repository. 6 | 7 | Some third-party packages (not provided, though we list the URLs) are required if you want to compare with the other solvers mentioned in the paper. 
def fista(model, oracle, options, tol, maxiter, check):
    """FISTA for min_x h(x) := g(x) + f(x).

    f: convex, continuously differentiable with L-Lipschitz gradient.
    g: convex, simple (prox available).

    Update step:
        t_{k+1} = (1 + sqrt(1 + 4 t_k^2)) / 2,  beta_k = (t_k - 1) / t_{k+1}
        y^k     = x^k + beta_k * (x^k - x^{k-1})
        x^{k+1} = prox_{alpha*g}(y^k - alpha * grad f(y^k))

    Parameters
    ----------
    model : dict
        Problem data; only model['N'] is read here (when 'storePoints' is on).
    oracle : dict
        Callables 'fun_f', 'fun_g', 'grad_f', 'prox_g', 'residual', each
        taking (x, ..., model, options).
    options : dict
        Required: 'stepsize' (alpha = 1/L) and 'init' (starting point).
        Optional flags: 'storeResidual', 'storeTime', 'storePoints',
        'storeObjective', 'storeBeta'.  Backtracking is enabled by
        'backtrackingMaxiter' > 1 together with 'backtrackingFactor' in (0,1).
    tol : float
        Tolerance threshold for the (normalized) residual.
    maxiter : int
        Maximal number of iterations.
    check : int
        Print progress every 'check' iterations.

    Returns
    -------
    dict
        'sol', 'breakvalue' (1: iteration limit reached, 2: residual below
        tol, 3: backtracking exhausted) plus any requested 'seq_*' arrays.
    """
    # Default taping options (same mutation of `options` as before).
    for key in ('storeResidual', 'storeTime', 'storePoints',
                'storeObjective', 'storeBeta'):
        options.setdefault(key, False)

    # Backtracking configuration; a single inner iteration means a fixed
    # step size is used throughout.
    backtrackingMaxiter = 1
    backtrackingFactor = 1.0
    if 'backtrackingMaxiter' in options:
        backtrackingMaxiter = options['backtrackingMaxiter']
        backtrackingFactor = options['backtrackingFactor']

    # Oracle functions.
    fun_f = oracle['fun_f']
    fun_g = oracle['fun_g']
    grad_f = oracle['grad_f']
    prox_g = oracle['prox_g']
    residual = oracle['residual']

    alpha = options['stepsize']

    # Initialization.
    x_kp1 = options['init']
    x_k = x_kp1.copy()
    t_k = 1.0
    f_kp1 = fun_f(x_kp1, model, options)
    h_kp1 = f_kp1 + fun_g(x_kp1, model, options)
    res0 = residual(x_kp1, 1.0, model, options)

    # Taping.
    if options['storeResidual']:
        seq_res = zeros(maxiter + 1)
        seq_res[0] = 1
    if options['storeTime']:
        seq_time = zeros(maxiter + 1)
        seq_time[0] = 0
    if options['storePoints']:
        seq_x = zeros((model['N'], maxiter + 1))
        seq_x[:, 0] = x_kp1
    if options['storeObjective']:
        seq_obj = zeros(maxiter + 1)
        seq_obj[0] = h_kp1
    if options['storeBeta']:
        seq_beta = zeros(maxiter)
    time = 0

    # Solve.
    breakvalue = 1
    for it in range(1, maxiter + 1):

        stime = clock.time()

        # Overrelaxation parameter and extrapolated point.
        t_kp1 = 0.5 * (1.0 + sqrt(1.0 + 4.0 * t_k ** 2))
        beta = (t_k - 1.0) / t_kp1
        t_k = t_kp1
        y_k = x_kp1 + beta * (x_kp1 - x_k)
        x_k = x_kp1.copy()
        # Objective value is a scalar: plain rebinding suffices (a built-in
        # float has no .copy(), which the previous code would call).
        f_k = f_kp1

        # Gradient at the extrapolated point.
        grad_k = grad_f(y_k, model, options)

        for iterbt in range(backtrackingMaxiter):

            # Forward (gradient) step followed by backward (prox) step.
            x_kp1 = prox_g(y_k - alpha * grad_k, alpha, model, options)

            # New value of the smooth part of the objective.
            f_kp1 = fun_f(x_kp1, model, options)

            if backtrackingMaxiter == 1:  # fixed step size: no line search
                break

            # Sufficient-decrease test for the backtracking line search.
            dx = x_kp1 - y_k
            Delta = sum(grad_k * dx) + 0.5 / alpha * sum(dx ** 2)
            if f_kp1 < f_k + Delta + 1e-8:
                if iterbt == 0:
                    # First try succeeded: dare a longer step next time.
                    alpha = alpha / backtrackingFactor
                break
            else:
                alpha = alpha * backtrackingFactor
                if iterbt + 1 == backtrackingMaxiter:
                    breakvalue = 3

        # Full objective value and breaking condition.
        h_kp1 = f_kp1 + fun_g(x_kp1, model, options)
        res = residual(x_kp1, res0, model, options)
        if res < tol:
            breakvalue = 2

        # Taping (the timer excludes this bookkeeping).
        time = time + (clock.time() - stime)
        if options['storeResidual']:
            seq_res[it] = res
        if options['storeTime']:
            seq_time[it] = time
        if options['storePoints']:
            seq_x[:, it] = x_kp1
        if options['storeObjective']:
            seq_obj[it] = h_kp1
        if options['storeBeta']:
            seq_beta[it - 1] = beta

        # Progress report; print() keeps this Python-2/3 compatible.
        if it % check == 0:
            print('iter: %d, time: %5f, alpha: %f, beta: %f, res: %f'
                  % (it, time, alpha, beta, res))

        # Handle breaking condition.
        if breakvalue == 2:
            print('Tolerance value reached!!!')
            break
        elif breakvalue == 3:
            print('Not enough backtracking iterations!!!')
            break

    # Return results.
    output = {
        'sol': x_kp1,
        'breakvalue': breakvalue
    }
    if options['storeResidual']:
        output['seq_res'] = seq_res
    if options['storeTime']:
        output['seq_time'] = seq_time
    if options['storePoints']:
        output['seq_x'] = seq_x
    if options['storeObjective']:
        output['seq_obj'] = seq_obj
    if options['storeBeta']:
        output['seq_beta'] = seq_beta
    return output
def fbs(model, oracle, options, tol, maxiter, check):
    """Forward-backward splitting for min_x h(x) := g(x) + f(x).

    f: continuously differentiable with L-Lipschitz gradient.
    g: convex, simple (prox available).

    Update step:
        x^{k+1} = prox_{alpha*g}(x^k - alpha * grad f(x^k)),
        alpha in (0, 2/L), or chosen by backtracking.

    Parameters
    ----------
    model : dict
        Problem data; only model['N'] is read here (when 'storePoints' is on).
    oracle : dict
        Callables 'fun_f', 'fun_g', 'grad_f', 'prox_g', 'residual', each
        taking (x, ..., model, options).
    options : dict
        Required: 'stepsize' (alpha) and 'init' (starting point).
        Optional flags: 'storeResidual', 'storeTime', 'storePoints',
        'storeObjective'.  Backtracking is enabled by
        'backtrackingMaxiter' > 1 together with 'backtrackingFactor' in (0,1).
    tol : float
        Tolerance threshold for the (normalized) residual.
    maxiter : int
        Maximal number of iterations.
    check : int
        Print progress every 'check' iterations.

    Returns
    -------
    dict
        'sol', 'breakvalue' (1: iteration limit reached, 2: residual below
        tol, 3: backtracking exhausted) plus any requested 'seq_*' arrays.
    """
    # Default taping options (same mutation of `options` as before).
    for key in ('storeResidual', 'storeTime', 'storePoints', 'storeObjective'):
        options.setdefault(key, False)

    # Backtracking configuration; a single inner iteration means a fixed
    # step size is used throughout.
    backtrackingMaxiter = 1
    backtrackingFactor = 1.0
    if 'backtrackingMaxiter' in options:
        backtrackingMaxiter = options['backtrackingMaxiter']
        backtrackingFactor = options['backtrackingFactor']

    # Oracle functions.
    fun_f = oracle['fun_f']
    fun_g = oracle['fun_g']
    grad_f = oracle['grad_f']
    prox_g = oracle['prox_g']
    residual = oracle['residual']

    alpha = options['stepsize']

    # Initialization.
    x_kp1 = options['init']
    f_kp1 = fun_f(x_kp1, model, options)
    h_kp1 = f_kp1 + fun_g(x_kp1, model, options)
    res0 = residual(x_kp1, 1.0, model, options)

    # Taping.
    if options['storeResidual']:
        seq_res = zeros(maxiter + 1)
        seq_res[0] = 1
    if options['storeTime']:
        seq_time = zeros(maxiter + 1)
        seq_time[0] = 0
    if options['storePoints']:
        seq_x = zeros((model['N'], maxiter + 1))
        seq_x[:, 0] = x_kp1
    if options['storeObjective']:
        seq_obj = zeros(maxiter + 1)
        seq_obj[0] = h_kp1
    time = 0

    # Solve.
    breakvalue = 1
    for it in range(1, maxiter + 1):

        stime = clock.time()

        # Keep the previous iterate and objective value; these are scalars /
        # arrays respectively, and a built-in float has no .copy(), so the
        # scalar is simply rebound.
        x_k = x_kp1.copy()
        f_k = f_kp1

        # Gradient at the current iterate.
        grad_k = grad_f(x_k, model, options)

        for iterbt in range(backtrackingMaxiter):

            # Forward (gradient) step followed by backward (prox) step.
            x_kp1 = prox_g(x_k - alpha * grad_k, alpha, model, options)

            # New value of the smooth part of the objective.
            f_kp1 = fun_f(x_kp1, model, options)

            if backtrackingMaxiter == 1:  # fixed step size: no line search
                break

            # Sufficient-decrease test for the backtracking line search.
            dx = x_kp1 - x_k
            Delta = sum(grad_k * dx) + 0.5 / alpha * sum(dx ** 2)
            if f_kp1 < f_k + Delta + 1e-8:
                if iterbt == 0:
                    # First try succeeded: dare a longer step next time.
                    alpha = alpha / backtrackingFactor
                break
            else:
                alpha = alpha * backtrackingFactor
                if iterbt + 1 == backtrackingMaxiter:
                    breakvalue = 3

        # Full objective value and breaking condition.
        h_kp1 = f_kp1 + fun_g(x_kp1, model, options)
        res = residual(x_kp1, res0, model, options)
        if res < tol:
            breakvalue = 2

        # Taping (the timer excludes this bookkeeping).
        time = time + (clock.time() - stime)
        if options['storeResidual']:
            seq_res[it] = res
        if options['storeTime']:
            seq_time[it] = time
        if options['storePoints']:
            seq_x[:, it] = x_kp1
        if options['storeObjective']:
            seq_obj[it] = h_kp1

        # Progress report; print() keeps this Python-2/3 compatible.
        if it % check == 0:
            print('iter: %d, time: %5f, alpha: %f, res: %f'
                  % (it, time, alpha, res))

        # Handle breaking condition.
        if breakvalue == 2:
            print('Tolerance value reached!!!')
            break
        elif breakvalue == 3:
            print('Not enough backtracking iterations!!!')
            break

    # Return results.
    output = {
        'sol': x_kp1,
        'breakvalue': breakvalue
    }
    if options['storeResidual']:
        output['seq_res'] = seq_res
    if options['storeTime']:
        output['seq_time'] = seq_time
    if options['storePoints']:
        output['seq_x'] = seq_x
    if options['storeObjective']:
        output['seq_obj'] = seq_obj
    return output
def mfzeroSR1_pg(model, oracle, options, tol, maxiter, check):
    """Monotone zero-memory SR1 proximal quasi-Newton method for
    min_x h(x) := g(x) + f(x).

    Each iteration computes two candidates and keeps the one with the
    smaller objective value (monotone safeguard):

      * a FISTA step z^{k+1} from an extrapolated point, and
      * a proximal quasi-Newton step v^{k+1} with respect to the rank-1
        metric B^k = (1/alpha)*Id - sigma_k * u^k (u^k)', whose inverse is
        applied via the Sherman--Morrison formula.

    See Section 3.3.1 of P. Ochs and T. Pock, "Adaptive FISTA",
    arXiv:1711.04343 (2017).

    Parameters
    ----------
    model : dict
        Problem data; only model['N'] is read here (when 'storePoints' is on).
    oracle : dict
        Callables 'fun_f', 'fun_g', 'grad_f', 'prox_g', 'residual'.
        Note: 'prox_g' here takes six arguments,
        prox_g(x, diag, u, sigma, model, options) — a diagonal +/- rank-1
        proximal mapping.
    options : dict
        Required: 'stepsize' (alpha = 1/L) and 'init' (starting point).
        Optional flags: 'storeResidual', 'storeTime', 'storePoints',
        'storeObjective'.
    tol : float
        Tolerance threshold for the (normalized) residual.
    maxiter : int
        Maximal number of iterations.
    check : int
        Print progress every 'check' iterations.

    Returns
    -------
    dict
        'sol', 'breakvalue' (1: iteration limit reached, 2: residual below
        tol, 5: indefinite metric encountered — iteration continues with the
        diagonal metric) plus any requested 'seq_*' arrays.
    """
    # Default taping options (same mutation of `options` as before).
    for key in ('storeResidual', 'storeTime', 'storePoints', 'storeObjective'):
        options.setdefault(key, False)

    # Oracle functions.
    fun_f = oracle['fun_f']
    fun_g = oracle['fun_g']
    grad_f = oracle['grad_f']
    prox_g = oracle['prox_g']
    residual = oracle['residual']

    alpha = options['stepsize']

    # Initialization.
    x_kp1 = options['init']
    x_k = x_kp1.copy()
    z_kp1 = zeros(x_k.shape)         # FISTA candidate (read at iteration 1)
    u_k = zeros(x_k.shape)           # rank-1 direction (zero until iter > 1)
    grad_k = zeros(x_k.shape)        # copied into grad_km1 at iteration 1
    one = np.ones(x_k.shape)         # diagonal of B_0 scaled by 1/alpha below
    t_kp1 = 1.0
    sigma_k = 0
    f_kp1 = fun_f(x_kp1, model, options)
    h_kp1 = f_kp1 + fun_g(x_kp1, model, options)
    res0 = residual(x_kp1, 1.0, model, options)

    # Taping.
    if options['storeResidual']:
        seq_res = zeros(maxiter + 1)
        seq_res[0] = 1
    if options['storeTime']:
        seq_time = zeros(maxiter + 1)
        seq_time[0] = 0
    if options['storePoints']:
        seq_x = zeros((model['N'], maxiter + 1))
        seq_x[:, 0] = x_kp1
    if options['storeObjective']:
        seq_obj = zeros(maxiter + 1)
        seq_obj[0] = h_kp1
    time = 0

    # Solve.
    breakvalue = 1
    for it in range(1, maxiter + 1):

        stime = clock.time()

        # Shift iterates; objective values are scalars, so plain rebinding
        # suffices (a built-in float has no .copy()).
        x_km1 = x_k.copy()
        x_k = x_kp1.copy()
        grad_km1 = grad_k.copy()
        t_k = t_kp1
        t_kp1 = 0.5 * (1.0 + sqrt(1.0 + 4.0 * t_k ** 2))

        # Extrapolated point built from both the FISTA candidate z and the
        # last two iterates.
        extra_y_k = x_kp1 + t_k / t_kp1 * (z_kp1 - x_k) \
            + (t_k - 1.0) / t_kp1 * (x_k - x_km1)

        # --- FISTA candidate ---
        grad_y = grad_f(extra_y_k, model, options)
        z_kp1 = prox_g(extra_y_k - alpha * grad_y, one / alpha, 0.0, 0,
                       model, options)
        h_z = fun_g(z_kp1, model, options) + fun_f(z_kp1, model, options)

        # --- zero-memory SR1 candidate ---
        grad_k = grad_f(x_k, model, options)

        # Build the rank-1 metric B^k = (1/alpha)*Id - sigma_k * u^k (u^k)'.
        sigma_k = 0
        if it > 1:
            s_k = x_k - x_km1
            y_k = grad_k - grad_km1
            u_k = s_k / alpha - y_k      # d^k = B_0 s^k - y^k, B_0 = Id/alpha
            dts = u_k.T.dot(s_k)
            if abs(dts) >= 1e-8:
                sigma_k = sign(dts)
                u_k = u_k / sqrt(abs(dts))
            if sigma_k < 0:
                # Indefinite metric: fall back to the diagonal metric and
                # flag the event (the iteration continues, as before).
                sigma_k = 0
                breakvalue = 5

        # Forward step x_k - H^k grad f(x_k) using Sherman--Morrison for the
        # rank-1 part.  The division is guarded so that sigma_k == 0 can
        # never produce 0/0 = nan when sum(u_k**2) happens to equal 1/alpha.
        if sigma_k != 0:
            denom = 1.0 / alpha - sum(u_k ** 2)
            v_kp1 = x_k - alpha * grad_k \
                - sigma_k * u_k * u_k.T.dot(alpha * grad_k) / denom
        else:
            v_kp1 = x_k - alpha * grad_k

        # Backward step with respect to the metric B^k.
        v_kp1 = prox_g(v_kp1, one / alpha, u_k, -1.0 * sigma_k, model, options)
        h_v = fun_g(v_kp1, model, options) + fun_f(v_kp1, model, options)

        # Monotone safeguard: keep the candidate with the smaller objective.
        if h_z <= h_v:
            x_kp1 = z_kp1
            h_kp1 = h_z
        else:
            x_kp1 = v_kp1
            h_kp1 = h_v

        # Breaking condition.
        res = residual(x_kp1, res0, model, options)
        if res < tol:
            breakvalue = 2

        # Taping (the timer excludes this bookkeeping).
        time = time + (clock.time() - stime)
        if options['storeResidual']:
            seq_res[it] = res
        if options['storeTime']:
            seq_time[it] = time
        if options['storePoints']:
            seq_x[:, it] = x_kp1
        if options['storeObjective']:
            seq_obj[it] = h_kp1

        # Progress report; print() keeps this Python-2/3 compatible.
        if it % check == 0:
            print('iter: %d, time: %5f, alpha: %f, res: %f'
                  % (it, time, alpha, res))

        # Handle breaking condition.  (The former breakvalue == 4 branch was
        # unreachable — that value was never assigned — and has been removed.)
        if breakvalue == 2:
            print('Tolerance value reached!!!')
            break
        elif breakvalue == 5:
            print('Metric is not positive definite!')

    # Return results.
    output = {
        'sol': x_kp1,
        'breakvalue': breakvalue
    }
    if options['storeResidual']:
        output['seq_res'] = seq_res
    if options['storeTime']:
        output['seq_time'] = seq_time
    if options['storePoints']:
        output['seq_x'] = seq_x
    if options['storeObjective']:
        output['seq_obj'] = seq_obj
    return output
Lip_max (default: 1e10) 56 | .'backtrackingMaxiter' 57 | if > 1, then backtracking is performed, which 58 | requires 'backtrackingFactor', otherwise default 59 | values are set and fixed step size is used througout 60 | default: 20 61 | .'backtrackingFactor' : eta 62 | scaling of the step size when backtracking step 63 | is successful or not; value eta>1 64 | default: 1.1 65 | .'backtrackingAcceptFactor' : sigma 66 | scaling of the sufficient descent term 67 | .'backtrackingHistory' : M 68 | how many old objective values are stored 69 | default: 0 70 | 71 | tol tolerance threshold for the residual 72 | maxiter maximal number of iterations 73 | check provide information after 'check' iterations 74 | 75 | Return: 76 | ------- 77 | output 78 | .'sol' solution of the problems 79 | .'seq_res' sequence of residual values (if activated) 80 | .'seq_time' sequence of time points (if activated) 81 | .'seq_x' sequence of iterates (if activated) 82 | .'seq_obj' sequence of objective values (if activated) 83 | .'breakvalue' code for the type of breaking condition 84 | 1: maximal number of iterations exceeded 85 | 2: breaking condition reached (residual below tol) 86 | 3: not enough backtracking iterations 87 | 88 | Reference: 89 | ---------- 90 | S.J. Wright, R.D. Nowak, and M.A.T. Figueiredo: "Sparse Reconstruction by 91 | Separable Approximation." IEEE Transactions on Signal Processing 57, 92 | No. 7:2479--93. 2009. 
93 | 94 | """ 95 | 96 | # store options 97 | if 'storeResidual' not in options: 98 | options['storeResidual'] = False; 99 | if 'storeTime' not in options: 100 | options['storeTime'] = False; 101 | if 'storePoints' not in options: 102 | options['storePoints'] = False; 103 | if 'storeObjective' not in options: 104 | options['storeObjective'] = False; 105 | 106 | # step size options 107 | Lip_min = 1e-4; 108 | Lip_max = 1e10; 109 | if 'Lipschitz_min' in options: 110 | Lip_min = options['Lipschitz_min']; 111 | if 'Lipschitz_max' in options: 112 | Lip_max = options['Lipschitz_max']; 113 | 114 | # backtracking options 115 | backtrackingMaxiter = 30; 116 | backtrackingFactor = 1.5; 117 | M = 0; 118 | sigma = 1e-4; 119 | if 'backtrackingMaxiter' in options: 120 | backtrackingMaxiter = options['backtrackingMaxiter']; 121 | if 'backtrackingFactor' in options: 122 | backtrackingFactor = options['backtrackingFactor']; 123 | if 'backtrackingAcceptFactor' in options: 124 | sigma = options['backtrackingAcceptFactor']; 125 | if 'backtrackingHistory' in options: 126 | M = options['backtrackingHistory']; 127 | 128 | # load oracle 129 | fun_f = oracle['fun_f']; 130 | fun_g = oracle['fun_g']; 131 | grad_f = oracle['grad_f']; 132 | prox_g = oracle['prox_g']; 133 | residual = oracle['residual']; 134 | 135 | # initialization 136 | Lip = 1.0; # dummy value here 137 | x_kp1 = options['init']; 138 | x_k = x_kp1.copy(); 139 | f_kp1 = fun_f(x_kp1, model, options); 140 | h_kp1 = f_kp1 + fun_g(x_kp1, model, options); 141 | grad_k = grad_f(x_k, model, options); 142 | res0 = residual(x_kp1, 1.0, model, options); 143 | hist_h = -1e10*np.ones(M+1); 144 | 145 | # taping 146 | if options['storeResidual'] == True: 147 | seq_res = zeros(maxiter+1); 148 | seq_res[0] = 1; 149 | if options['storeTime'] == True: 150 | seq_time = zeros(maxiter+1); 151 | seq_time[0] = 0; 152 | if options['storePoints'] == True: 153 | seq_x = zeros((model['N'],maxiter+1)); 154 | seq_x[:,0] = x_kp1; 155 | if 
options['storeObjective'] == True: 156 | seq_obj = zeros(maxiter+1); 157 | seq_obj[0] = h_kp1; 158 | time = 0; 159 | 160 | # solve 161 | breakvalue = 1; 162 | for iter in range(1,maxiter+1): 163 | 164 | stime = clock.time(); 165 | 166 | # update variables 167 | x_km1 = x_k.copy(); 168 | x_k = x_kp1.copy(); 169 | h_k = h_kp1.copy(); 170 | f_k = f_kp1.copy(); 171 | grad_km1 = grad_k.copy(); 172 | hist_h[iter%(M+1)] = h_kp1; 173 | max_h = np.amax(hist_h); 174 | 175 | # compute gradient 176 | grad_k = grad_f(x_k, model, options); 177 | 178 | # compute Barzilai--Borwein step length 179 | if iter>0: 180 | s_k = x_k - x_km1; 181 | y_k = grad_k - grad_km1; 182 | nrm = np.dot(s_k.T, s_k); 183 | if nrm>0: 184 | Lip = np.maximum(Lip_min, np.minimum(Lip_max, \ 185 | np.dot(s_k.T, y_k)/np.dot(s_k.T, s_k) )); 186 | else: 187 | Lip = Lip_max; 188 | 189 | 190 | for iterbt in range(0,backtrackingMaxiter): 191 | 192 | # forward step 193 | x_kp1 = x_k - 1.0/Lip*grad_k; 194 | 195 | # backward step 196 | x_kp1 = prox_g(x_kp1, 1.0/Lip, model, options); 197 | 198 | # compute new value of smooth part of objective 199 | h_kp1 = fun_f(x_kp1, model, options) + fun_g(x_kp1, model, options); 200 | 201 | # no backtracking 202 | if backtrackingMaxiter == 1: 203 | break; 204 | 205 | # check backtracking breaking condition 206 | Delta = -0.5*sigma*Lip*sum((x_kp1 - x_k)**2); 207 | if (h_kp1 < max_h + Delta + 1e-8): 208 | break; 209 | else: 210 | Lip = Lip*backtrackingFactor; 211 | if (iterbt+1 == backtrackingMaxiter): 212 | breakvalue = 3; 213 | 214 | # check breaking condition 215 | res = residual(x_kp1, res0, model, options); 216 | if res < tol: 217 | breakvalue = 2; 218 | 219 | # tape residual 220 | time = time + (clock.time() - stime); 221 | if options['storeResidual'] == True: 222 | seq_res[iter] = res; 223 | if options['storeTime'] == True: 224 | seq_time[iter] = time; 225 | if options['storePoints'] == True: 226 | seq_x[:,iter] = x_kp1; 227 | if options['storeObjective'] == True: 228 | 
seq_obj[iter] = h_kp1; 229 | 230 | # print info 231 | if (iter % check == 0): 232 | print 'iter: %d, time: %5f, Lip: %f, res: %f' % (iter, time, Lip, res); 233 | 234 | 235 | # handle breaking condition 236 | if breakvalue == 2: 237 | print('Tolerance value reached!!!'); 238 | break; 239 | elif breakvalue == 3: 240 | print('Not enough backtracking iterations!!!'); 241 | break; 242 | 243 | 244 | # return results 245 | output = { 246 | 'sol': x_kp1, 247 | 'breakvalue': breakvalue 248 | } 249 | 250 | if options['storeResidual'] == True: 251 | output['seq_res'] = seq_res; 252 | if options['storeTime'] == True: 253 | output['seq_time'] = seq_time; 254 | if options['storePoints'] == True: 255 | output['seq_x'] = seq_x; 256 | if options['storeObjective'] == True: 257 | output['seq_obj'] = seq_obj; 258 | 259 | return output; 260 | 261 | -------------------------------------------------------------------------------- /paperExperiments/groupLasso/Algorithms/TsengZerosSR1_ProximalGradient.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy import zeros, sqrt, sign 3 | import time as clock 4 | 5 | def tseng_zeroSR1_pg(model, oracle, options, tol, maxiter, check): 6 | """ 7 | 8 | Tseng-like Proximal Quasi-Newton algorithm for solving: 9 | 10 | min_{x} h(x); h(x):= g(x) + f(x) 11 | 12 | Update step: See Section 3.3.2 in 13 | 14 | P. Ochs and T. Pock: "Adaptive Fista" ArXiv:1711.04343 [Math], November 12, 2017. 15 | 16 | 17 | Properties: 18 | ----------- 19 | f convex quadratic function with L-Lipschitz continuous gradient 20 | g simple 21 | alpha in (0,1/L) 22 | 23 | Assumption: 24 | ----------- 25 | 26 | y^{k} = B^{k}*s^{k} (secant equation for f) 27 | Holds exactly, when f is quadratic. 
def tseng_zeroSR1_pg(model, oracle, options, tol, maxiter, check):
    """
    Tseng-like proximal quasi-Newton (zero-memory SR1) algorithm for

        min_{x} h(x),   h(x) := g(x) + f(x),

    following Section 3.3.2 of P. Ochs, T. Pock: "Adaptive FISTA",
    arXiv:1711.04343, 2017.

    f is assumed convex quadratic with L-Lipschitz gradient, so the
    secant equation y^k = B^k s^k holds exactly; g is simple.  The
    metric is B^k = (1/alpha)*Id - sigma_k*u_k*u_k' built from

        s_k     = x^k - z^k
        y_k     = grad f(x^k) - grad f(z^k)
        u_k     = s_k/alpha - y_k, normalised by sqrt(|<u_k, s_k>|)
        sigma_k = sign(<u_k, s_k>) in {-1, 0, +1}

    and its inverse is applied via the Sherman--Morrison formula.
    prox_g must support the diagonal +/- rank-1 metric; it is called
    as prox_g(x, diag, u, sigma, model, options).

    Parameter:
    ----------
    model    problem data, passed through to all oracle callbacks
    oracle   dict with 'fun_f', 'fun_g', 'grad_f', 'prox_g', 'residual'
    options  required: 'stepsize' (alpha = 1/L), 'init'
             optional flags: 'storeResidual', 'storeTime',
             'storePoints', 'storeObjective'
    tol      tolerance threshold for the residual
    maxiter  maximal number of iterations
    check    print progress every 'check' iterations

    Return:
    -------
    output   dict with 'sol' and 'breakvalue'
             (1: maxiter exceeded, 2: residual below tol,
              5: SR1 metric was indefinite at some iterate; the
                 diagonal metric was used there instead),
             plus the activated tapes
    """

    # default the optional taping flags
    for flag in ('storeResidual', 'storeTime', 'storePoints', 'storeObjective'):
        if flag not in options:
            options[flag] = False

    # load oracle
    fun_f = oracle['fun_f']
    fun_g = oracle['fun_g']
    grad_f = oracle['grad_f']
    prox_g = oracle['prox_g']
    residual = oracle['residual']

    # load parameter
    alpha = options['stepsize']

    # initialization
    x_kp1 = options['init']
    x_k = x_kp1.copy()
    # NOTE(review): z is initialised to zero rather than to the starting
    # point, matching the reference implementation -- verify intended.
    z_kp1 = zeros(x_k.shape)
    s_k = zeros(x_k.shape)
    y_k = zeros(x_k.shape)
    u_k = zeros(x_k.shape)
    one = np.ones(x_k.shape)
    theta_kp1 = 1.0
    sigma_k = 0
    f_kp1 = fun_f(x_kp1, model, options)
    h_kp1 = f_kp1 + fun_g(x_kp1, model, options)
    res0 = residual(x_kp1, 1.0, model, options)

    # taping
    if options['storeResidual']:
        seq_res = zeros(maxiter+1)
        seq_res[0] = 1
    if options['storeTime']:
        seq_time = zeros(maxiter+1)
        seq_time[0] = 0
    if options['storePoints']:
        seq_x = zeros((model['N'], maxiter+1))
        seq_x[:, 0] = x_kp1
    if options['storeObjective']:
        seq_obj = zeros(maxiter+1)
        seq_obj[0] = h_kp1
    time = 0

    # solve
    breakvalue = 1
    for iter in range(1, maxiter+1):

        stime = clock.time()

        # update variables
        x_k = x_kp1.copy()
        z_k = z_kp1.copy()
        theta_k = theta_kp1

        theta_kp1 = 0.5*(sqrt(theta_k**4 + 4.0*theta_k**2) - theta_k**2)

        # extrapolation
        pre_y_k = (1.0-theta_k)*x_k + theta_k*z_k

        # FISTA step on z (diagonal metric, no rank-1 part)
        alpha_theta = alpha/theta_k
        grad_k = grad_f(pre_y_k, model, options)
        z_kp1 = z_k - alpha_theta*grad_k
        z_kp1 = prox_g(z_kp1, one/alpha_theta, 0.0, 0, model, options)

        # post-combination (kept from the method description; the
        # iterate x_kp1 below is computed from x_k directly)
        v_kp1 = (1-theta_k)*x_k + theta_k*z_kp1

        # gradients for the SR1 secant pair
        grad_k = grad_f(x_k, model, options)
        grad_z_k = grad_f(z_k, model, options)

        # build rank-1 metric B^k = (1/alpha)*Id - sigma_k*u_k*u_k'
        sigma_k = 0
        if iter > 1:
            s_k = x_k - z_k
            y_k = grad_k - grad_z_k

            u_k = s_k/alpha - y_k
            dts = u_k.T.dot(s_k)
            if abs(dts) >= 1e-8:
                sigma_k = sign(dts)
                u_k = u_k/sqrt(abs(dts))

            if sigma_k < 0:
                # indefinite metric: fall back to the diagonal metric
                sigma_k = 0
                breakvalue = 5

        # forward step x^{k+1} = x^k - (B^k)^{-1} grad f(x^k),
        # inverse applied via Sherman--Morrison
        x_kp1 = x_k - alpha*grad_k \
            - sigma_k*(u_k.dot(u_k.T.dot(alpha*grad_k)))/(1.0/alpha - sum(u_k**2))

        # backward step w.r.t. the metric B^k
        x_kp1 = prox_g(x_kp1, one/alpha, u_k, -1.0*sigma_k, model, options)

        # new objective value (bug fix: was stored into a separate
        # h_xkp1 variable, so the tape never saw the updated value)
        h_kp1 = fun_g(x_kp1, model, options) + fun_f(x_kp1, model, options)

        # check breaking condition
        res = residual(x_kp1, res0, model, options)
        if res < tol:
            breakvalue = 2

        # tape residual
        time = time + (clock.time() - stime)
        if options['storeResidual']:
            seq_res[iter] = res
        if options['storeTime']:
            seq_time[iter] = time
        if options['storePoints']:
            seq_x[:, iter] = x_kp1
        if options['storeObjective']:
            seq_obj[iter] = h_kp1

        # print info (parenthesized print works in Python 2 and 3)
        if iter % check == 0:
            print('iter: %d, time: %5f, alpha: %f, res: %f' % (iter, time, alpha, res))

        # handle breaking condition
        if breakvalue == 2:
            print('Tolerance value reached!!!')
            break
        elif breakvalue == 5:
            print('Metric is not positive definite!')

    # return results
    output = {
        'sol': x_kp1,
        'breakvalue': breakvalue
    }
    if options['storeResidual']:
        output['seq_res'] = seq_res
    if options['storeTime']:
        output['seq_time'] = seq_time
    if options['storePoints']:
        output['seq_x'] = seq_x
    if options['storeObjective']:
        output['seq_obj'] = seq_obj

    return output
def zeroSR1_pg(model, oracle, options, tol, maxiter, check):
    """
    Line-search proximal quasi-Newton (zero-memory SR1) algorithm for

        min_{x} h(x),   h(x) := g(x) + f(x).

    Update step:

        z^k     = prox_{g}^{B^k}(x^k - H^k grad f(x^k))
        x^{k+1} = x^k + eta_k*(z^k - x^k)

    where eta_k is found by an Armijo-like backtracking line search

        h(x^{k+1}) <= h(x^k) + gamma*eta_k*Delta_k,
        Delta_k = <grad f(x^k), z^k - x^k> + 1/(2*tau)*||z^k - x^k||^2.

    The metric is B^k = (1/tau)*Id - sigma_k*u_k*u_k' built from the
    SR1 secant pair

        s_k     = x^k - x^{k-1}
        y_k     = grad f(x^k) - grad f(x^{k-1})
        u_k     = s_k - tau*y_k, normalised by sqrt(|<u_k, y_k>|)
        sigma_k = sign(<u_k, y_k>) in {-1, 0, +1}

    with tau a safeguarded (scaled) BB2 step size.  Its inverse is
    applied via the Sherman--Morrison formula.  prox_g must support
    the diagonal +/- rank-1 metric; it is called as
    prox_g(x, diag, u, sigma, model, options).

    Parameter:
    ----------
    model    problem data, passed through to all oracle callbacks
    oracle   dict with 'fun_f', 'fun_g', 'grad_f', 'prox_g', 'residual'
    options  required: 'stepsize' (tau0 = 1/L), 'gamma' in (0,1),
             'init', 'eta0' (initial line-search parameter),
             'delta' (line-search shrink factor),
             'lineSearchMaxiter' (0 disables the line search)
             optional flags: 'storeResidual', 'storeTime',
             'storePoints', 'storeObjective', 'storeBeta'
    tol      tolerance threshold for the residual
    maxiter  maximal number of iterations
    check    print progress every 'check' iterations

    Return:
    -------
    output   dict with 'sol' and 'breakvalue'
             (1: maxiter exceeded or line search exhausted,
              2: residual below tol,
              5: SR1 metric was indefinite at some iterate),
             plus the activated tapes (incl. 'seq_beta')
    """

    # default the optional taping flags
    for flag in ('storeResidual', 'storeTime', 'storePoints',
                 'storeObjective', 'storeBeta'):
        if flag not in options:
            options[flag] = False

    # load oracle
    fun_f = oracle['fun_f']
    fun_g = oracle['fun_g']
    grad_f = oracle['grad_f']
    prox_g = oracle['prox_g']
    residual = oracle['residual']

    # load parameter
    Lip = 1/options['stepsize']
    tau0 = options['stepsize']
    gamma = options['gamma']
    eta0 = options['eta0']
    delta = options['delta']
    lineSearchMaxiter = options['lineSearchMaxiter']
    tau_scaling = 0.8  # safety factor applied to the BB2 step size

    # initialization
    x_kp1 = options['init']
    x_k = x_kp1.copy()
    z_k = zeros(x_k.shape)
    s_k = zeros(x_k.shape)
    y_k = zeros(x_k.shape)
    u_k = zeros(x_k.shape)
    one = np.ones(x_k.shape)
    grad_k = zeros(x_k.shape)
    eta = eta0
    sigma_k = 0
    f_kp1 = fun_f(x_kp1, model, options)
    h_kp1 = f_kp1 + fun_g(x_kp1, model, options)
    res0 = residual(x_kp1, 1.0, model, options)

    # taping
    if options['storeResidual']:
        seq_res = zeros(maxiter+1)
        seq_res[0] = 1
    if options['storeTime']:
        seq_time = zeros(maxiter+1)
        seq_time[0] = 0
    if options['storePoints']:
        seq_x = zeros((model['N'], maxiter+1))
        seq_x[:, 0] = x_kp1
    if options['storeObjective']:
        seq_obj = zeros(maxiter+1)
        seq_obj[0] = h_kp1
    if options['storeBeta']:
        seq_beta = zeros(maxiter)
    time = 0

    # solve
    breakvalue = 1
    for iter in range(1, maxiter+1):

        stime = clock.time()

        # shift iterates; the objective value is a scalar, so plain
        # assignment suffices (no .copy(), which fails on floats)
        x_km1 = x_k.copy()
        x_k = x_kp1.copy()
        grad_km1 = grad_k.copy()
        h_k = h_kp1

        # compute gradient
        grad_k = grad_f(x_k, model, options)

        # build rank-1 metric B^k = (1/tau)*Id - sigma_k*u_k*u_k'
        sigma_k = 0
        tau = tau0
        if iter > 1:
            s_k = x_k - x_km1
            y_k = grad_k - grad_km1

            # safeguarded BB2 step size tau = 0.8*<s,y>/<y,y>
            nrm_yk = np.dot(y_k.T, y_k)
            if nrm_yk > 1e-8:
                tau = tau_scaling*np.dot(s_k.T, y_k)/nrm_yk

            H0 = tau

            u_k = s_k - H0*y_k
            dts = u_k.T.dot(y_k)
            if abs(dts) >= 1e-8:
                sigma_k = sign(dts)
                u_k = u_k/sqrt(abs(dts))

            if sigma_k < 0:
                # indefinite metric: fall back to the diagonal metric
                sigma_k = 0
                breakvalue = 5

        # forward step z^k = x^k - (B^k)^{-1} grad f(x^k),
        # inverse applied via Sherman--Morrison
        z_k = x_k - tau*grad_k \
            - sigma_k*(u_k.dot(u_k.T.dot(tau*grad_k)))/(1.0/tau - sum(u_k**2))

        # backward step w.r.t. the metric B^k
        z_k = prox_g(z_k, one/tau, u_k, -1.0*sigma_k, model, options)

        # predicted-decrease term for the line search
        Delta = 0
        if lineSearchMaxiter > 0:
            dx = z_k - x_k
            Delta = sum(grad_k*dx) + 0.5/tau*sum(dx**2)
        else:
            # no line search: accept the full step (bug fix: the
            # objective is now re-evaluated so the tape and h_k stay
            # consistent with the accepted iterate)
            eta = 1
            x_kp1 = z_k
            h_kp1 = fun_f(x_kp1, model, options) + fun_g(x_kp1, model, options)

        # line search
        for iterls in range(0, lineSearchMaxiter):

            # trial point
            x_kp1 = x_k + eta*(z_k - x_k)

            # compute new objective value
            h_kp1 = fun_f(x_kp1, model, options) + fun_g(x_kp1, model, options)

            # a single trial step is accepted unconditionally
            if lineSearchMaxiter <= 1:
                break

            # Armijo-like acceptance test
            if h_kp1 < h_k + eta*gamma*Delta + 1e-8:
                if iterls == 0:
                    # immediate success: restart from the initial eta
                    eta = eta0
                break
            else:
                eta = eta*delta
                if iterls+1 == lineSearchMaxiter:
                    breakvalue = 3

        # check breaking condition
        res = residual(x_kp1, res0, model, options)
        if res < tol:
            breakvalue = 2

        # tape residual
        time = time + (clock.time() - stime)
        if options['storeResidual']:
            seq_res[iter] = res
        if options['storeTime']:
            seq_time[iter] = time
        if options['storePoints']:
            seq_x[:, iter] = x_kp1
        if options['storeObjective']:
            seq_obj[iter] = h_kp1
        if options['storeBeta']:
            # diagnostic extrapolation parameter; NOTE(review): the
            # denominator <Md, s_k> may vanish -- confirm upstream use
            Md = Lip*s_k - y_k
            if iter > 1:
                beta = np.dot(Md.T, z_k - x_k)/np.dot(Md.T, s_k)
            else:
                beta = 0
            seq_beta[iter-1] = beta

        # print info (parenthesized print works in Python 2 and 3)
        if iter % check == 0:
            print('iter: %d, time: %5f, Lip: %f, eta: %f, res: %f' % (iter, time, Lip, eta, res))

        # handle breaking condition
        if breakvalue == 2:
            print('Tolerance value reached!!!')
            break
        elif breakvalue == 3:
            # line search exhausted: warn but keep iterating,
            # as in the reference implementation
            print('Not enough backtracking iterations!!!')
            breakvalue = 1
        elif breakvalue == 5:
            print('Metric is not positive definite!')

    # return results
    output = {
        'sol': x_kp1,
        'breakvalue': breakvalue
    }
    if options['storeResidual']:
        output['seq_res'] = seq_res
    if options['storeTime']:
        output['seq_time'] = seq_time
    if options['storePoints']:
        output['seq_x'] = seq_x
    if options['storeObjective']:
        output['seq_obj'] = seq_obj
    if options['storeBeta']:
        output['seq_beta'] = seq_beta

    return output
4 | 5 | -------------------------------------------------------------------------------- /paperExperiments/groupLasso/README.md: -------------------------------------------------------------------------------- 1 | # ZeroSR1 GroupLasso Experiment 2 | 3 | The code is written in Python and uses a simple C-interface for solving the rank-1 proximal mapping more efficiently. The file `test_groupLasso.py` reproduces the code for the GroupLasso Experiment from the 2018 paper. 4 | 5 | # Installation 6 | * Go to the folder `clib` and compile `mymath.cpp` using the `Makefile` that is provided in that folder. 7 | * Then, you can run `python test_groupLasso.py` from the folder `zeroSR1/paperExperiments/groupLasso/`. 8 | 9 | # Problem 10 | The optimization problem is generated and solved in `test_groupLasso.py` using several methods. 11 | 12 | ## Usage 13 | In order to measure the error to the optimal value, set the flag `compute_optimal_value = True`, which runs (by default) FISTA with 50000 iterations and writes the optimal value to the file `data_group_lasso.npy`. Once this run finished, set `compute_optimal_value = False` and evaluate the implemented algorithms. 14 | 15 | ## Implemented Algorithms 16 | * Forward-Backward Splitting 17 | * FISTA 18 | * Zero SR1 Proximal Quasi-Newton (with rank-1 prox implemented in C) 19 | * Monotone Fast Zero SR1 Proximal Quasi-Newton 20 | * Tseng Fast Zero SR1 Proximal Quasi-Newton 21 | * Sparse Reconstruction by Separable Approximation 22 | 23 | Rank-1 proximal mappings are implemented in C (see folder `clib`). 24 | 25 | ## Parameters 26 | For the parameters, we refer to `test_groupLasso.py` and the implementations in the folder `Algorithms`. 
27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /paperExperiments/groupLasso/clib/Makefile: -------------------------------------------------------------------------------- 1 | default: 2 | g++ -fPIC mymath.cpp -shared -o mymath.so 3 | -------------------------------------------------------------------------------- /paperExperiments/groupLasso/clib/mymath.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include "mymath.h" 6 | #include 7 | #include 8 | 9 | 10 | 11 | 12 | 13 | class Prox_rk1_generic 14 | { 15 | public: 16 | Prox_rk1_generic( 17 | double* _x, // solution of the proximal mapping 18 | double* _x0, // proximal center (const) 19 | double* _d, // diagonal of the diagonal part of the metric 20 | double* _u, // rank1 part of the metrix 21 | double _sigma, // sign of the rank1 part of the metric 22 | int _N) // dimension of the problem 23 | { 24 | x = _x; 25 | x0 = _x0; 26 | d = _d; 27 | u = _u; 28 | sigma = _sigma; 29 | N = _N; 30 | // allocate memory for an auxiliary variable 31 | x_tilde = new double[N]; 32 | oneside_shift = 1e10; 33 | } 34 | ~Prox_rk1_generic() 35 | { 36 | delete[] x_tilde; 37 | } 38 | 39 | void solve() 40 | { 41 | // get breakpoints 42 | std::vector bpts; 43 | get_breakpoints(bpts); 44 | 45 | // sort list of breakpoints 46 | sort( bpts.begin(), bpts.end() ); 47 | bpts.erase( unique( bpts.begin(), bpts.end() ), bpts.end() ); 48 | int nbpts = bpts.size(); 49 | 50 | // Now, we search for the interval between two (adjacent) breakpoints 51 | // that contains the root of $p(a) := a - = 0$, where x(a) 52 | // is the prox evaluated at a. 53 | // The algorithmic strategy is binary search / bisectioning, which can be 54 | // done, since p(a) is monotonically increasing. 
55 | int idx_la = 0; // index of left interval border 56 | int idx_ra = nbpts-1; // index of right interval border 57 | double la, ra; // left and right interval borders 58 | 59 | if (nbpts == 0) 60 | { 61 | //std::cout << "Find root in (-infty,infty)" << std::endl; 62 | find_root(-oneside_shift,oneside_shift); 63 | return; 64 | } 65 | 66 | // check left border 67 | if (value(bpts[idx_la]) > 0) 68 | { 69 | // The zero of p(a) is in (-\infty,bpts(idx_la)]. 70 | //std::cout << "Find root in (-infty," << bpts[idx_la] << ")" << std::endl; 71 | find_root(bpts[idx_la]-oneside_shift,bpts[idx_la]); 72 | return; 73 | } 74 | 75 | // check right border 76 | if (value(bpts[idx_ra]) < 0) 77 | { 78 | // The zero of p(a) is in [bpts(idx_ra),+\infty) 79 | //std::cout << "Find root in (" << bpts[idx_ra] << ",infty)" << std::endl; 80 | find_root(bpts[idx_ra],bpts[idx_ra]+oneside_shift); 81 | return; 82 | } 83 | 84 | // find interval with zero of p(a) 85 | int maxiter = (int)(ceil(log(nbpts)/log(2.0))+1); 86 | int j; 87 | for (int i=0; i& bpts) = 0; 115 | 116 | // computes a - dot(u.T, x(a)-x0) 117 | double value (double a) 118 | { 119 | for (int i=0; i 1e-8) 197 | { 198 | std::cout << "WARNING! Rank1 prox could not be solved accurately. 
Error: " 199 | << err << std::endl; 200 | } 201 | 202 | } 203 | 204 | virtual void prox_diag(double* x_tilde) = 0; 205 | virtual void get_breakpoints(std::vector& bpts) = 0; 206 | 207 | }; 208 | 209 | 210 | 211 | 212 | 213 | 214 | //////////////////////////////////////////////////////////////////////////////// 215 | //////////////////////////////////////////////////////////////////////////////// 216 | //////////////////////////////////////////////////////////////////////////////// 217 | //////////////////////////////////////////////////////////////////////////////// 218 | //////////////////////////////////////////////////////////////////////////////// 219 | //////////////////////////////////////////////////////////////////////////////// 220 | 221 | 222 | 223 | 224 | 225 | 226 | class Prox_rk1_generic_PS : public Prox_rk1_generic 227 | { 228 | public: 229 | Prox_rk1_generic_PS( 230 | double* _x, // solution of the proximal mapping 231 | double* _x0, // proximal center (const) 232 | double* _d, // diagonal of the diagonal part of the metric 233 | double* _u, // rank1 part of the metrix 234 | double _sigma, // sign of the rank1 part of the metric 235 | int _N) // dimension of the problem 236 | : Prox_rk1_generic(_x,_x0,_d,_u,_sigma,_N) 237 | { 238 | use_a_init = false; 239 | }; 240 | 241 | 242 | 243 | void find_root(double la, double ra) 244 | { 245 | 246 | // initialization 247 | double a = 0.0; 248 | if (use_a_init) 249 | { 250 | a = a_init; 251 | } 252 | a = fmax(la, fmin(ra, a)); 253 | double tau = 1.0; 254 | double pa, dp_da; 255 | for (int iter=0; iter<20; ++iter) 256 | { 257 | pa = value(a); 258 | if (fabs(pa) < 1e-8) break; // breaking condition 259 | dp_da = derivative(a); 260 | a = a - tau*pa/dp_da; 261 | tau = tau*0.95; 262 | } 263 | // sanity check 264 | double err = value(a); // Warning: This also modifies the output! 265 | if (fabs(err) > 1e-8) 266 | { 267 | std::cout << "WARNING! Rank1 prox could not be solved accurately. 
Error: " 268 | << err << std::endl; 269 | } 270 | if (use_a_init) 271 | { 272 | a_init = a; 273 | } 274 | 275 | } 276 | virtual double derivative(double a) = 0; 277 | 278 | virtual void prox_diag(double* x_tilde) = 0; 279 | virtual void get_breakpoints(std::vector& bpts) = 0; 280 | 281 | // use this to do warm start in find_root 282 | bool use_a_init; 283 | double a_init; 284 | 285 | 286 | 287 | }; 288 | 289 | 290 | 291 | class Prox_rk1_groupl2l1 : public Prox_rk1_generic_PS 292 | { 293 | public: 294 | Prox_rk1_groupl2l1( 295 | double* _x, // solution of the proximal mapping 296 | double* _x0, // proximal center (const) 297 | double* _d, // diagonal of the diagonal part of the metric 298 | double* _u, // rank1 part of the metrix 299 | double _sigma, // sign of the rank1 part of the metric 300 | int _N) // dimension of the problem 301 | : Prox_rk1_generic_PS(_x,_x0,_d,_u,_sigma,_N) { }; 302 | 303 | 304 | double derivative(double a) 305 | { 306 | int j,k; 307 | double da = 0.0; 308 | double da_b; 309 | double d_b; 310 | double nrm; 311 | double nrm_db_inv; 312 | double dot_xu_b; 313 | 314 | for (k=0; k 327 | 328 | if (nrm > 1.0/d_b) 329 | { 330 | da_b = 0.0; 331 | for (j=B[k]; j/(d_b*|x_b|) 336 | // 337 | da_b = (1.0-nrm_db_inv)*u[j]/d_b 338 | + (x_tilde[j]*dot_xu_b)*nrm_db_inv/nrm; 339 | 340 | da += u[j]*da_b; // = 341 | } 342 | } 343 | } 344 | da = 1.0 + sigma*da; // = 1.0 + sigma* 345 | 346 | return da; 347 | } 348 | 349 | void prox_diag(double* x_tilde) 350 | { 351 | 352 | double tmp; 353 | for (int k=0; k& bpts) 382 | { 383 | // find breakpoints 384 | bpts.reserve(2*lenB); 385 | double AA, BB, CC; 386 | double d_b; 387 | double dis; 388 | for (int i=0; i+1 0: 400 | file = open("GroupLasso_conv_"+nams[i]+"_time.dat", "w"); 401 | for j in range(0,maxiter,1): 402 | file.write("%f %.12f\n" %(ts[i][j], rs[i][j])); 403 | file.close(); 404 | file = open("GroupLasso_conv_"+nams[i]+"_iter.dat", "w"); 405 | for j in range(0,maxiter,1): 406 | file.write("%d %.12f\n" %(j, 
function varargout = proj_rank1_Rplus(varargin)
% PROJ_RANK1_RPLUS returns the scaled proximity operator for non-negativity constraints
%
%   x = proj_rank1_Rplus( x0, D, u )
% solves
%   x = argmin_{x} h(x) + 1/2||x-x0||^2_{V}
% with
%   V^{-1} = D + u*u'   (or diag(D) + u*u' if D is a vector),
% where h(x) is the indicator function of the set { x : x >= 0 }.
% "D" must be diagonal and positive; "u" can be any vector.
%
% Variants:
%   x = proj_rank1_Rplus( x0, D, u, lambda, linTerm, sigma, inverse )
% solves
%   x = argmin_{x} h(lambda.*x) + 1/2||x-x0||^2_{V} + linTerm'*x
% with either V^{-1} = D + sigma*u*u' if "inverse" is true (default)
% or          V      = D + sigma*u*u' if "inverse" is false,
% where "sigma" is +1 (default) or -1, and "lambda" must be non-zero.
%
% Stephen Becker, Feb 26 2014, stephen.beckr@gmail.com
% Reference: "A quasi-Newton proximal splitting method" by S. Becker
% and J. Fadili, NIPS 2012, http://arxiv.org/abs/1206.1156
%
% See also prox_rank1_generic.m

% Elementwise projection onto the non-negative orthant.  Because this
% is a projection, the scaling argument t has no effect, and the only
% breakpoint of the prox sits at the origin.
prox         = @(x,t) max(0, x);
prox_brk_pts = @(s) 0;

% All argument handling is delegated to the generic rank-1 prox solver.
[varargout{1:nargout}] = prox_rank1_generic( prox, prox_brk_pts, varargin{:} );
10 | % 11 | % Here, h(x) is the indicator function of the set 12 | % { x : lwr <= x <= upr } 13 | % (Set any component of lwr to -Inf and upr to +Inf to effectively 14 | % ignore those particular constraints) 15 | % 16 | % There are also variants: 17 | % x = proj_rank1_box( lwr, upr, x0, D, u, lambda, linTerm, sigma, inverse) 18 | % returns 19 | % x = argmin_{x} h(lambda.*x) + 1/2||x-x0||^2_{V} + linTerm'*x 20 | % and 21 | % either V^{-1} = D + sigma*u*u' if "inverse" is true (default) 22 | % or V = D + sigma*u*u' if "inverse" is false 23 | % and in both cases, "sigma" is either +1 (default) or -1. 24 | % "lambda" should be non-zero 25 | % 26 | % Note that UNLIKE prox_rank1_l1.m and other functions, the calling 27 | % sequence is slightly different, since you must pass in "lwr" and "upr" 28 | % 29 | % Stephen Becker, Feb 26 2014, stephen.beckr@gmail.com 30 | % Reference: "A quasi-Newton proximal splitting method" by S. Becker and J. Fadili 31 | % NIPS 2012, http://arxiv.org/abs/1206.1156 32 | % 33 | % See also prox_rank1_generic.m 34 | 35 | prox = @(x,t) max( min(upr,x), lwr ); 36 | prox_brk_pts = @(s) [lwr,upr]; % since projection, scaling has no effect 37 | 38 | [varargout{1:nargout}] = prox_rank1_generic( prox, prox_brk_pts, varargin{:} ); -------------------------------------------------------------------------------- /proxes/proj_rank1_linf.m: -------------------------------------------------------------------------------- 1 | function varargout = proj_rank1_linf(varargin) 2 | % PROJ_RANK1_LINF returns the scaled proximity operator for l_infinity norm constraints 3 | % 4 | % x = proj_rank1_linf( x0, D, u ) 5 | % where 6 | % x = argmin_{x} h(x) + 1/2||x-x0||^2_{V} 7 | % and 8 | % V^{-1} = D + u*u' (or diag(D) + u*u' if D is a vector) 9 | % "D" must be diagonal and positive. "u" can be any vector. 
10 | % 11 | % Here, h(x) is the indicator function of the l_infinity ball, i.e., 12 | % { x | norm(x,inf) <= 1 } 13 | % To scale the ball, just use the scaling parameter "lambda" (see below) 14 | % 15 | % There are also variants: 16 | % x = proj_rank1_linf( x0, D, u, lambda, linTerm, sigma, inverse) 17 | % returns 18 | % x = argmin_{x} h(lambda.*x) + 1/2||x-x0||^2_{V} + linTerm'*x 19 | % and 20 | % either V^{-1} = D + sigma*u*u' if "inverse" is true (default) 21 | % or V = D + sigma*u*u' if "inverse" is false 22 | % and in both cases, "sigma" is either +1 (default) or -1. 23 | % "lambda" should be non-zero 24 | % 25 | % Stephen Becker, Feb 26 2014, stephen.beckr@gmail.com 26 | % Reference: "A quasi-Newton proximal splitting method" by S. Becker and J. Fadili 27 | % NIPS 2012, http://arxiv.org/abs/1206.1156 28 | % 29 | % See also prox_rank1_generic.m 30 | 31 | prox = @(x,t) sign(x).*min( 1, abs(x) ); 32 | prox_brk_pts = @(s) [-ones(size(s)),ones(size(s))]; % since projection, scaling has no effect 33 | 34 | [varargout{1:nargout}] = prox_rank1_generic( prox, prox_brk_pts, varargin{:} ); -------------------------------------------------------------------------------- /proxes/prox_rank1_generic.m: -------------------------------------------------------------------------------- 1 | function [x,a,cnt] = prox_rank1_generic( prox, prox_brk_pts, x0, D, u, lambda, linTerm, plusminus, INVERT ) 2 | % PROX_RANK1_GENERIC returns the scaled proximity operator for a generic function h 3 | % (provided the generic function is separable and has a piece-wise linear prox) 4 | % This function is intended be used as follows: 5 | % 6 | % (1) Instantiate: 7 | % scaledProx = @(varargin) prox_rank1_generic( prox, prox_brk_pts,varargin{:}) 8 | % where 'prox' and 'prox_brk_pts' implicitly define the function h 9 | % i.e., prox(x0,t) = argmin_{x} t*h(x) + 1/2||x-x0||^2 10 | % and 11 | % prox_brk_pts(t) returns a row-vector with the break points 12 | % that specify where t*h(x) is piecewise 
linear 13 | % (this is if h(x) = [ h_1(x_1); ... ; h_n(x_n) ]. If instead not 14 | % all the h_i are identical, prox_brk_pts(t) should return 15 | % a matrix). 16 | % See the examples below because prox_brk_pts must allow a vector "t" 17 | % so you must define this appropriately. 18 | % 19 | % (2) Call the "scaledProx" function, which has signature: 20 | % x = scaledProx( x0, D, u ) 21 | % where 22 | % x = argmin_{x} h(x) + 1/2||x-x0||^2_{V} 23 | % and 24 | % V^{-1} = D + u*u' (or diag(D) + u*u' if D is a vector) 25 | % "D" must be diagonal and positive. "u" can be any vector. 26 | % 27 | % There are also variants: 28 | % 29 | % x = scaledProx( x0, D, u, lambda, linTerm, sigma, inverse) 30 | % returns 31 | % x = argmin_{x} h(lambda.*x) + 1/2||x-x0||^2_{V} + linTerm'*x 32 | % and 33 | % either V^{-1} = D + sigma*u*u' if "inverse" is true (default) 34 | % or V = D + sigma*u*u' if "inverse" is false 35 | % and in both cases, "sigma" is either +1 (default) or -1. 36 | % "lambda" should be non-zero 37 | % 38 | % Examples: 39 | % 1. if h(x) = ||x||_1 then 40 | % prox = @(x,t) sign(x).*max(0, abs(x) - t ); 41 | % prox_brk_pts = @(t) [-t,t]; 42 | % 2. if h(x) is the indicator function of the set { x : x >= 0}, then 43 | % prox = @(x,t) max(0, x); 44 | % prox_brk_pts = @(t) 0; 45 | % 3. if h(x) is the indicator function of the set { x : lwr <= x <= upr } 46 | % where lwr and upr are vectors, then 47 | % prox = @(x,t) max( min(upr,x), lwr ); 48 | % prox_brk_pts = @(t) [lwr,upr]; (Note: this is a matrix) 49 | % 4. if h(x) is the hinge-loss h(x) = max( 1-x, 0 ), then 50 | % prox = @(x,t) 1 + (x-1).*( x > 1 ) + (x + t - 1).*( x + t < 1 ); 51 | % prox_brk_pts = @(t)[ones(size(t)), 1-t]; 52 | % 5. 
if h(x) is the indicator function of the l_infinity ball, then 53 | % prox = @(x,t) sign(x).*min( 1, abs(x) ); 54 | % prox_brk_pts = @(t) [-ones(size(t)),ones(size(t))]; 55 | % 56 | % 57 | % Stephen Becker, Feb 26 2014, stephen.beckr@gmail.com 58 | % Reference: "A quasi-Newton proximal splitting method" by S. Becker and J. Fadili 59 | % NIPS 2012, http://arxiv.org/abs/1206.1156 60 | 61 | PRINT = false; % set to "true" for debugging purposes 62 | if PRINT 63 | dispp = @disp; 64 | printf = @fprintf; 65 | else 66 | dispp = @(varargin) 1; 67 | printf = @(varargin) 1; 68 | end 69 | dispp(' '); 70 | 71 | n = length(x0); 72 | if nargin < 5 || isempty(u), u = 0; end 73 | if nargin < 6, lambda = []; end 74 | if nargin < 7, linTerm = []; end 75 | if nargin < 8 || isempty(plusminus), plusminus = 1; end 76 | assert( plusminus==-1 | plusminus==+1 ) 77 | if nargin < 9 || isempty(INVERT), INVERT = true; end 78 | 79 | if size(D,2) > 1, d = diag(D); else d = D; end % extract diagonal part 80 | if any( d < 0 ), error('D must only have strictly positive entries'); end 81 | 82 | if all( u==0 ) 83 | % Just a diagonal scaling, so this code is overkill, 84 | % but we should be able to handle it for consistency. 85 | NO_U = true; 86 | else 87 | NO_U = false; 88 | if numel(u) > length(u) 89 | error('u must be a vector, not a matrix'); 90 | end 91 | end 92 | 93 | % Now, V > 0 (i.e., V is positive definite) iff V^{-1} exists and V^{-1} > 0 94 | % So V^{-1} > 0 is automatically true if sigma = + 1 95 | % If sigma = -1, then it could be indefinite or semidefinite 96 | % 97 | % It is possible to check all eigenvalues in O(n^2) rather than O(n^3) 98 | % but it's not particularly simple to implement. 
99 | % See http://www.stat.uchicago.edu/~lekheng/courses/309f10/modified.pdf 100 | % Golub, 1973, "Some Modified Matrix Eigenvalue Problems" 101 | % http://epubs.siam.org/doi/abs/10.1137/1015032 102 | % But in the special case when D is a scaled identity, checking is very easy: 103 | if plusminus < 0 && all( d==d(1) ) 104 | minE = d(1) + plusminus*norm(u)^2; 105 | if minE <= 0, error('The scaling matrix is not positive definite'); end 106 | end 107 | 108 | % this comes from the Sherman-Morrison-Woodbury formula: 109 | if NO_U 110 | uinv = 0; 111 | else 112 | uinv = (u./d)/sqrt(1+u'*(u./d)); 113 | end 114 | % In all cases, we find prox_h^V, but how we define V 115 | % in terms of d and u depends on "INVERT" 116 | if INVERT 117 | % So V^{-1} = diag(d) + sigma*u*u' 118 | % and V = diag(1./d) - sigma*uinv*uinv'; 119 | Vinv = @(y) d.*y + plusminus*(u'*y)*u; 120 | 121 | % The code below expects V = diag(dd) + sigma*uu*uu', so... 122 | dd = 1./d; 123 | uu = uinv; 124 | plusminus = -plusminus; 125 | 126 | % The code also requires uu./dd and 1./dd, so define these here 127 | % ud = uu./dd; 128 | ud = u/sqrt(1+u'*(u./d)); % more accurate? % 6.01e-3 error 129 | dInv = 1./dd; 130 | else 131 | % Here, V = diag(d) + sigma*u*u' 132 | % and V^{-1} = diag(1./d) - sigma*uinv*uinv'; 133 | Vinv = @(y) y./d - plusminus*(uinv'*y)*uinv; 134 | 135 | % The code below expects V = diag(dd) + sigma*uu*uu', so... 136 | dd = d; 137 | uu = u; 138 | %plusminus = plusminus; 139 | 140 | % The code also requires uu./dd and 1./dd, so define these here 141 | ud = uu./dd; 142 | dInv = 1./dd; 143 | end 144 | if NO_U, uu = 0; ud = 0; end % any value, since we won't use them... 145 | if ~isempty(lambda) 146 | % We make a change of variables, e.g., x <-- lambda*.x 147 | % change x0 <-- lambda.*x0, linTerm <-- linTerm./lambda 148 | % and V <-- diag(1./lambda)*V*diag(1./lambda). Because V is defined 149 | % implicitly, and depends on INVERT, this is a bit of a headache. 
150 | % We'll do some changes here, and some later in the code 151 | % e.g., combine linTerm and V scaling so we don't have to redefine Vinv 152 | if any(lambda==0), error('scaling factor lambda must be non-zero'); end 153 | % note that lambda < 0 should be OK 154 | x0 = lambda.*x0; 155 | 156 | % Scale V = diag(dd) + sigma*uu*uu' by V <-- diag(1./lambda)*V*diag(1./lambda) 157 | dd = dd./(lambda.^2); 158 | uu = uu./lambda; 159 | ud = ud.*lambda; 160 | dInv = 1./dd; 161 | end 162 | 163 | t = prox_brk_pts(1./dd); 164 | if size(t,1) < n 165 | if size(t,1) > 1 166 | error('"prox_brk_pts" should return a ROW VECTOR of break points'); 167 | end 168 | % otherwise, assume each component identical, so scale 169 | t = repmat(t,n,1); 170 | end 171 | if ~isempty(linTerm) && norm(linTerm)>=0 172 | if isempty(lambda) 173 | x0 = x0 - Vinv(linTerm); 174 | else 175 | % V is scaled V <-- diag(1./lambda)*V*diag(1./lambda) 176 | % so Vinv is scaled the opposite. 177 | % linTerm is scaled linTerm <== linTerm./lambda 178 | x0 = x0 - lambda.*Vinv(linTerm); 179 | end 180 | end 181 | 182 | % The main heart: 183 | X = @(a) prox( x0 - plusminus*a*ud, dInv ); 184 | 185 | % Early return if we have only a diagonal scaling... 186 | if NO_U 187 | % in this case, "alpha" is irrelevant 188 | x = prox( x0, dInv ); 189 | if ~isempty(lambda) 190 | % Undo the scaling of x <-- lambda.*x 191 | x = x./lambda; 192 | end 193 | return; 194 | end 195 | 196 | brk_pts = bsxfun(@times,plusminus*(dd./uu), bsxfun(@minus,x0,t) ); 197 | brk_pts = unique(brk_pts(:)); % will sort and remove duplicates 198 | brk_pts = brk_pts(~isinf(brk_pts)); % in case lwr/upr=Inf for box 199 | 200 | 201 | % p(a) = a + dot(u, y - prox_{1/d_i}( y_i - a u_i/d_i) ) 202 | % Then p is strictly increasing. We want a root of this: p(a) = 0 203 | 204 | % Defined above for numerical reasons... 
205 | % ud = uu./dd; 206 | % dInv = 1./dd; 207 | 208 | 209 | % Main for-loop: 210 | % "lower bound" are "a" for which p <= 0 211 | % "upper bound" are "a" for which p >= 0 212 | % if a is increasing, so is p(a) (double-check for both plusminus cases ) 213 | lwrBnd = 0; 214 | uprBnd = length(brk_pts) + 1; 215 | iMax = ceil( log2(length(brk_pts)) ) + 1; 216 | for i = 1:iMax 217 | if uprBnd - lwrBnd <= 1 218 | dispp('Bounds are too close; breaking'); 219 | break; 220 | end 221 | j = round(mean([lwrBnd,uprBnd])); 222 | printf('j is %d (bounds were [%d,%d])\n', j, lwrBnd,uprBnd ); 223 | if j==lwrBnd 224 | dispp('j==lwrBnd, so increasing'); 225 | j = j+1; 226 | elseif j==uprBnd 227 | dispp('j==uprBnd, so increasing'); 228 | j = j-1; 229 | end 230 | 231 | a = brk_pts(j); 232 | x = X(a); % the prox 233 | p = a + dot(uu,x0-x); 234 | 235 | if p > 0 236 | uprBnd = j; 237 | elseif p < 0 238 | lwrBnd = j; 239 | end 240 | if PRINT 241 | % Don't rely on redefinition of printf, 242 | % since then we would still calculate find(~x) 243 | % which is slow 244 | printf('i=%2d, a = %6.3f, p = %8.3f, zeros ', i, a, p ); 245 | if n < 100, printf('%d ', find(~x) ); end 246 | % printf('; nonzeros ');printf('%d ', find(x) ); 247 | printf('\n'); 248 | end 249 | end 250 | cnt = i; % number of iterations we took 251 | 252 | % Now, determine linear part, which we infer from two points. 253 | % If lwr/upr bounds are infinite, we take special care 254 | % e.g., we make a new "a" slightly lower/bigger, and use this 255 | % to extract linear part. 
256 | if lwrBnd == 0 257 | a2 = brk_pts( uprBnd ); 258 | a1 = a2 - 10; % arbitrary 259 | aBounds = [-Inf,a2]; 260 | elseif uprBnd == length(brk_pts) + 1; 261 | a1 = brk_pts( lwrBnd ); 262 | a2 = a1 + 10; % arbitrary 263 | aBounds = [a1,Inf]; 264 | else 265 | % In general case, we can infer linear part from the two break points 266 | a1 = brk_pts( lwrBnd ); 267 | a2 = brk_pts( uprBnd ); 268 | aBounds = [a1,a2]; 269 | end 270 | x1 = X(a1); 271 | x2 = X(a2); 272 | dx = (x2 - x1)/(a2-a1); 273 | % Thus for a in (a1,a2), x(a) = x1 + (a-a1)*dx 274 | % Solve 0 = a + dot( uu, y - (x1 + (a-a1)*dx ) ) 275 | % = a + dot(uu,y - x1 + a1*dx ) - a*dot(uu,dx) 276 | % so: 277 | a = dot( uu, x0 - x1 + a1*dx)/( -1 + dot(uu,dx) ); 278 | if a < aBounds(1) || a > aBounds(2), error('alpha is not in the correct range'); end 279 | % If we were not linear, we could do a root-finding algorithm, e.g., 280 | % a = fzero( @(a) a+dot(uu,x0-X(a)), a ); 281 | 282 | % Now, the solution: 283 | x = X(a); 284 | 285 | if ~isempty(lambda) 286 | % Undo the scaling of x <-- lambda.*x 287 | x = x./lambda; 288 | end 289 | 290 | printf('Took %d of %d iterations, lwrBnd is %d/%d \n', i, iMax, lwrBnd,length( brk_pts ) ); 291 | -------------------------------------------------------------------------------- /proxes/prox_rank1_hinge.m: -------------------------------------------------------------------------------- 1 | function varargout = prox_rank1_hinge(varargin) 2 | % PROX_RANK1_HINGE returns the scaled proximity operator for the hinge loss 3 | % 4 | % x = prox_rank1_hinge( x0, D, u ) 5 | % where 6 | % x = argmin_{x} h(x) + 1/2||x-x0||^2_{V} 7 | % and 8 | % V^{-1} = D + u*u' (or diag(D) + u*u' if D is a vector) 9 | % "D" must be diagonal and positive. "u" can be any vector. 
10 | % 11 | % Here, h(x) = sum(max(0,1-x)), a.k.a., the hinge-loss 12 | % 13 | % There are also variants: 14 | % x = prox_rank1_hinge( x0, D, u, lambda, linTerm, sigma, inverse) 15 | % returns 16 | % x = argmin_{x} h(lambda.*x) + 1/2||x-x0||^2_{V} + linTerm'*x 17 | % and 18 | % either V^{-1} = D + sigma*u*u' if "inverse" is true (default) 19 | % or V = D + sigma*u*u' if "inverse" is false 20 | % and in both cases, "sigma" is either +1 (default) or -1. 21 | % "lambda" should be non-zero 22 | % 23 | % Stephen Becker, Feb 26 2014, stephen.beckr@gmail.com 24 | % Reference: "A quasi-Newton proximal splitting method" by S. Becker and J. Fadili 25 | % NIPS 2012, http://arxiv.org/abs/1206.1156 26 | % 27 | % See also prox_rank1_generic.m 28 | 29 | prox = @(x,t) 1 + (x-1).*( x > 1 ) + (x + t - 1).*( x + t < 1 ); 30 | prox_brk_pts = @(s)[ones(size(s)), 1-s]; 31 | 32 | [varargout{1:nargout}] = prox_rank1_generic( prox, prox_brk_pts, varargin{:} ); -------------------------------------------------------------------------------- /proxes/prox_rank1_l1.m: -------------------------------------------------------------------------------- 1 | function varargout = prox_rank1_l1(varargin) 2 | % PROX_RANK1_L1 returns the scaled proximity operator for the l1 norm 3 | % 4 | % x = prox_rank1_l1( x0, D, u ) 5 | % where 6 | % x = argmin_{x} h(x) + 1/2||x-x0||^2_{V} 7 | % and 8 | % V^{-1} = D + u*u' (or diag(D) + u*u' if D is a vector) 9 | % "D" must be diagonal and positive. "u" can be any vector. 10 | % 11 | % Here, h(x) = ||x||_1 (the "l-1" norm) 12 | % 13 | % There are also variants: 14 | % x = prox_rank1_l1( x0, D, u, lambda, linTerm, sigma, inverse) 15 | % returns 16 | % x = argmin_{x} h(lambda.*x) + 1/2||x-x0||^2_{V} + linTerm'*x 17 | % and 18 | % either V^{-1} = D + sigma*u*u' if "inverse" is true (default) 19 | % or V = D + sigma*u*u' if "inverse" is false 20 | % and in both cases, "sigma" is either +1 (default) or -1. 
21 | % "lambda" should be non-zero 22 | % 23 | % Stephen Becker, Feb 26 2014, stephen.beckr@gmail.com 24 | % Reference: "A quasi-Newton proximal splitting method" by S. Becker and J. Fadili 25 | % NIPS 2012, http://arxiv.org/abs/1206.1156 26 | % 27 | % See also prox_rank1_generic.m 28 | 29 | prox = @(x,t) sign(x).*max(0, abs(x) - t ); 30 | prox_brk_pts = @(s) [-s,s]; 31 | 32 | [varargout{1:nargout}] = prox_rank1_generic( prox, prox_brk_pts, varargin{:} ); -------------------------------------------------------------------------------- /proxes/prox_rank1_l1pos.m: -------------------------------------------------------------------------------- 1 | function varargout = prox_rank1_l1pos(varargin) 2 | % PROX_RANK1_L1POS returns the scaled proximity operator for the l1 norm 3 | % with non-negativity constraints 4 | % 5 | % x = prox_rank1_l1pos( x0, D, u ) 6 | % where 7 | % x = argmin_{x} h(x) + 1/2||x-x0||^2_{V} 8 | % and 9 | % V^{-1} = D + u*u' (or diag(D) + u*u' if D is a vector) 10 | % "D" must be diagonal and positive. "u" can be any vector. 11 | % 12 | % Here, h(x) = ||x||_1 + the indicator function of the set { x : x >= 0 } 13 | % 14 | % There are also variants: 15 | % x = prox_rank1_l1pos( x0, D, u, lambda, linTerm, sigma, inverse) 16 | % returns 17 | % x = argmin_{x} h(lambda.*x) + 1/2||x-x0||^2_{V} + linTerm'*x 18 | % and 19 | % either V^{-1} = D + sigma*u*u' if "inverse" is true (default) 20 | % or V = D + sigma*u*u' if "inverse" is false 21 | % and in both cases, "sigma" is either +1 (default) or -1. 22 | % "lambda" should be non-zero 23 | % 24 | % Stephen Becker, Feb 26 2014, stephen.beckr@gmail.com 25 | % Reference: "A quasi-Newton proximal splitting method" by S. Becker and J. 
Fadili 26 | % NIPS 2012, http://arxiv.org/abs/1206.1156 27 | % 28 | % See also prox_rank1_generic.m, prox_rank1_l1.m, proj_rank1_Rplus.m 29 | 30 | prox = @(x,t) max(0, x - t ); 31 | prox_brk_pts = @(s) [s]; 32 | 33 | [varargout{1:nargout}] = prox_rank1_generic( prox, prox_brk_pts, varargin{:} ); -------------------------------------------------------------------------------- /setup_zeroSR1.m: -------------------------------------------------------------------------------- 1 | function setup_zeroSR1 2 | % SETUP_ZEROSR1 Adds the zeroSR1 toolbox to the path 3 | 4 | baseDirectory = fileparts(mfilename('fullpath')); 5 | addpath(genpath(baseDirectory)); 6 | 7 | % and make a variable in the main workspace 8 | % assignin('base','ZEROSR1ROOT', baseDirectory ); 9 | 10 | % Make it global so it will not be removed by "clear" statements 11 | % (though "clear all" will still remove it) 12 | evalin('base', sprintf('global ZEROSR1ROOT; ZEROSR1ROOT=''%s'';',baseDirectory) ); -------------------------------------------------------------------------------- /smoothFunctions/normSquaredFunction.m: -------------------------------------------------------------------------------- 1 | function [f,g,h] = normSquaredFunction(x,A,At,b,c,errFcn,extraFcn, constant) 2 | % f = normSquaredFunction(x,A,At,b,c,errFcn,extraFcn, constant) 3 | % returns the objective function 'f' 4 | % to f(x) = .5||Ax-b||_2^2 + c'*x + constant 5 | % [f,g,h] = ... 6 | % return the gradient and Hessian as well 7 | % 8 | % "A" can be a matrix (in which case set At=[], since it is ignored) 9 | % or it can be a function handle to compute the matrix-vector product 10 | % (in which case "At" should be a function handle to compute 11 | % the transposed-matrix - vector product ) 12 | % 13 | % By default, b=0 and c=0. Set any inputs to [] to use default values. 
14 | % 15 | % [fHist,errHist] = normSquaredFunction() 16 | % will return the function history 17 | % (and error history as well, if errFcn was provided) 18 | % and reset the history to zero. 19 | % "fHist" is a record of f + extraFcn 20 | % (this is intended to be used where extraFcn is the non-smooth term "h") 21 | % 22 | % This function is (almost*) mathematically (not computationally) equivalent 23 | % to quadraticFunction( x, Q, c ) where 24 | % Q = A'*A and c = A'*b. 25 | % (*almost equivalent since there is a constant value difference in 26 | % the objective function; you can use "constant" to change this) 27 | % 28 | % The Lipschitz constant of the gradient is 29 | % the squared spectral norm of A, i.e., norm(A)^2 30 | % 31 | % 32 | % March 4 2014, Stephen Becker, stephen.beckr@gmail.com 33 | % 34 | % See also quadraticFunction.m 35 | 36 | persistent errHist fcnHist nCalls 37 | if nargin == 0 38 | f = fcnHist(1:nCalls); 39 | g = errHist(1:nCalls); 40 | fcnHist = []; 41 | errHist = []; 42 | nCalls = 0; 43 | return; 44 | end 45 | if isempty( fcnHist ) 46 | [errHist,fcnHist] = deal( zeros(100,1) ); 47 | end 48 | 49 | error(nargchk(2,8,nargin,'struct')); 50 | if nargin < 4 || isempty(b), b = 0; end 51 | if nargin >= 5 && ~isempty(c) 52 | cx = dot(c(:),x(:) ); 53 | else 54 | cx = 0; 55 | c = 0; 56 | end 57 | if nargin < 8 || isempty(constant), constant = 0; end 58 | if isa(A,'function_handle') 59 | Ax = A(x); 60 | else 61 | Ax = A*x; 62 | end 63 | res = Ax - b; 64 | f = .5*norm(res(:))^2 + cx + constant; 65 | 66 | % Record this: 67 | nCalls = nCalls + 1; 68 | if length( errHist ) < nCalls 69 | % allocate more memory 70 | errHist(end:2*end) = 0; 71 | fcnHist(end:2*end) = 0; 72 | end 73 | fcnHist(nCalls) = f; 74 | if nargin >= 7 && ~isempty(extraFcn) 75 | % this is used when we want to record the objective function 76 | % for something non-smooth, and this routine is used only for the smooth 77 | % part. 
So for recording purposes, add in the nonsmooth part 78 | % But do NOT return it as a function value or it will mess up the 79 | % optimization algorithm. 80 | fcnHist(nCalls) = f + extraFcn(x); 81 | end 82 | 83 | if nargout > 1 84 | if isa(A,'function_handle') 85 | if isempty( At ) 86 | error('If "A" is given implicitly, we need "At" to compute the gradient'); 87 | end 88 | g = At( res ) + c; 89 | else 90 | g = A'*res + c; 91 | end 92 | end 93 | if nargout > 2 94 | if isa(A,'function_handle') 95 | error('Function is only known implicitly, so cannot provide Hessian easily'); 96 | end 97 | h = A'*A; 98 | end 99 | 100 | % and if error is requested... 101 | if nargin >= 6 && ~isempty( errFcn) 102 | errHist(nCalls) = errFcn(x); 103 | end -------------------------------------------------------------------------------- /smoothFunctions/quadraticFunction.m: -------------------------------------------------------------------------------- 1 | function [f,g,h] = quadraticFunction(x,Q,c,errFcn,extraFcn, constant) 2 | % f = quadraticFunction(x,Q,c, errFcn,extraFcn,constant) 3 | % returns the objective function 'f' 4 | % to f(x) = .5 - + constant 5 | % [f,g,h] = ... 6 | % return the gradient and Hessian as well 7 | % 8 | % "Q" can be a matrix (and it should be Hermitian positive semi-definite) 9 | % or it can be a function handle to compute the matrix-vector product 10 | % 11 | % [fHist,errHist] = quadraticFunction() 12 | % will return the function history 13 | % (and error history as well, if errFcn was provided) 14 | % and reset the history to zero. 15 | % "fHist" is a record of f + extraFcn 16 | % (this is intended to be used where extraFcn is the non-smooth term "h") 17 | % 18 | % This function is (almost*) mathematically (not computationally) equivalent 19 | % to normSquaredFunction( x, A, b ) where 20 | % Q = A'*A and c = A'*b. 
21 | % (*almost equivalent since there is a constant value difference in 22 | % the objective function) 23 | % 24 | % The Lipschitz constant of the gradient is the spectral norm of Q, i.e., norm(Q) 25 | % 26 | % Feb 19 2013, Stephen Becker, stephen.beckr@gmail.com 27 | % 28 | % See also normSquaredFunction.m 29 | 30 | persistent errHist fcnHist nCalls 31 | if nargin == 0 32 | f = fcnHist(1:nCalls); 33 | g = errHist(1:nCalls); 34 | fcnHist = []; 35 | errHist = []; 36 | nCalls = 0; 37 | return; 38 | end 39 | if isempty( fcnHist ) 40 | [errHist,fcnHist] = deal( zeros(100,1) ); 41 | end 42 | 43 | 44 | % fcnSimple = @(w) w'*(Q*w)/2 - c'*w; 45 | % gradSimple = @(w) Q*w - c; % 46 | if isa(Q,'function_handle') 47 | Qx = Q(x); 48 | else 49 | Qx = Q*x; 50 | end 51 | f = (x'*Qx)/2 - c'*x; 52 | if nargin >= 6 && ~isempty(constant) 53 | f = f + constant; 54 | end 55 | 56 | % Record this: 57 | nCalls = nCalls + 1; 58 | if length( errHist ) < nCalls 59 | % allocate more memory 60 | errHist(end:2*end) = 0; 61 | fcnHist(end:2*end) = 0; 62 | end 63 | fcnHist(nCalls) = f; 64 | if nargin >= 5 && ~isempty(extraFcn) 65 | % this is used when we want to record the objective function 66 | % for something non-smooth, and this routine is used only for the smooth 67 | % part. So for recording purposes, add in the nonsmooth part 68 | % But do NOT return it as a function value or it will mess up the 69 | % optimization algorithm. 70 | fcnHist(nCalls) = f + extraFcn(x); 71 | end 72 | 73 | if nargin > 2 && nargout > 1 74 | % g = G(x); 75 | g = Qx - c; 76 | end 77 | if nargout > 2 78 | if isa(Q,'function_handle') 79 | error('Function is only known implicitly, so cannot provide Hessian easily'); 80 | end 81 | h = Q; 82 | % h = H(x); 83 | end 84 | 85 | % and if error is requested... 
86 | if nargin >= 4 && ~isempty( errFcn) 87 | errHist(nCalls) = errFcn(x); 88 | end -------------------------------------------------------------------------------- /tests/computeReferenceSolution.m: -------------------------------------------------------------------------------- 1 | % Meant to be called by getReferenceSolution.m 2 | % This file is NOT compatible with Octave 3 | 4 | fprintf('Computing reference solution via CVX\n'); 5 | cvx_precision best 6 | cvx_quiet true 7 | 8 | switch problemName 9 | case 'simple_001' 10 | cvx_begin 11 | variable xRef(N) 12 | minimize sum_square(A*xRef-b)/2 + lambda*norm(xRef,1) 13 | cvx_end 14 | case 'simple_002' % same setting, different parameters 15 | cvx_begin 16 | variable xRef(N) 17 | minimize sum_square(A*xRef-b)/2 + lambda*norm(xRef,1) 18 | cvx_end 19 | end -------------------------------------------------------------------------------- /tests/getReferenceSolution.m: -------------------------------------------------------------------------------- 1 | global ZEROSR1ROOT 2 | if exist('ZEROSR1ROOT','var') && ~isempty(ZEROSR1ROOT) 3 | refDir = fullfile(ZEROSR1ROOT,'tests','reference_solutions'); 4 | else 5 | fprintf('\n\nERROR: cannot find variable ZEROSR1ROOT\n'); 6 | fprintf('This is probably because you did not run setup_zeroSR1\n'); 7 | fprintf(' or you "cleared" variables since then. Please re-run setup-zeroSR1\n'); 8 | error('zeroSR1:cannotFindVariable','Cannot find ZEROSR1ROOT'); 9 | end 10 | 11 | fileName = fullfile(refDir,[problemName,'.mat']); 12 | 13 | if exist(fileName,'file') 14 | fprintf('Loading reference solution from file\n'); 15 | load(fileName); % loads xRef 16 | else 17 | % Compute answer 18 | % Do this in a separate file since otherwise 19 | % Octave cannot parse this. 
20 | 21 | if ~exist('cvx_begin','file') 22 | error('Did not find reference solution nor CVX'); 23 | end 24 | 25 | computeReferenceSolution; % makes xRef 26 | 27 | % and save to the file 28 | save(fileName,'xRef'); 29 | end -------------------------------------------------------------------------------- /tests/reference_solutions/simple_001.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/zeroSR1/ca0bcb8bf7746c1aa14dc32ffbdba23708293990/tests/reference_solutions/simple_001.mat -------------------------------------------------------------------------------- /tests/solution_via_cvx.m: -------------------------------------------------------------------------------- 1 | function [x,V] = solution_via_cvx(type,x0,d,u,lambda,offset,lwr,upr,pm,INV) 2 | % x = solution_via_cvx(type,x0,d,u,lambda,offset,lwr,upr,pm) 3 | % returns the solution using CVX, to serve as a reference. 4 | % 'type' can be one of 'l1', 'rplus' or 'box' 5 | % For 'box', specify lwr and upr bounds. 6 | % 7 | % This computes the weighted prox_h^V(x0) where the function "h" 8 | % is specified by "type" (and perhaps scaled with lambda, 9 | % and/or linear term ), and 10 | % 11 | % V^{-1} = diag(d) + pm*u*u' 12 | % where pm is +1 or -1 (default is +1) 13 | % i.e., V = diag(1./d) - pm*(u./d)*(u./d)'/( 1 + u'*diag(1./d)*u) 14 | % via Sherman-Morrison formula 15 | % or, in the form 16 | % x = solution_via_cvx(type,x0,d,u,lambda,offset,lwr,upr, INV) 17 | % if INV=false (default is true), 18 | % then 19 | % V = diag(d) + pm*u*u' (rather than this being inv(V) ) 20 | % 21 | % In all cases, V must be positive definite 22 | % [x,V] = solution_via_cvx(...) 
 23 | % also returns the matrix V 24 | % 25 | % Lambda is such that we really evaluate h(lambda.*x) 26 | % where lambda is a scalar or an array of the same size as x 27 | % 28 | % If CVX is not installed, or if this is called via octave 29 | % (new versions of CVX do not run on octave), 30 | % then the output is x=Inf. 31 | % 32 | % Stephen Becker, Feb 22 2014 stephen.beckr@gmail.com 33 | 34 | % TODO: if CVX is not installed, read solution from a .mat file 35 | 36 | if nargin < 10 || isempty(INV), INV = true; end 37 | if nargin < 9 || isempty(pm), pm = +1; end 38 | if nargin < 8 || isempty(upr), upr = []; end 39 | if nargin < 7 || isempty(lwr), lwr = []; end 40 | if nargin < 5 || isempty(lambda), lambda = 1; end 41 | if nargin < 4 || isempty(u), u = 0; end 42 | assert( pm==-1 | pm==+1 ); 43 | [R,L] = deal(u); 44 | n = length(x0); 45 | if nargin < 6 || isempty(offset), offset = zeros(n,1); end 46 | 47 | % Vinv = diag(d) + L*R'; 48 | % V = inv(Vinv); 49 | if INV % default 50 | Dinv = diag(1./d); 51 | if all(u==0) 52 | V = Dinv; 53 | else 54 | V = Dinv - pm*(Dinv*L)*(R'*Dinv)/( 1 + R'*Dinv*L ); 55 | end 56 | if pm ==1 57 | % There is a chance that V is not positive definite if u was too large 58 | % .0421 59 | % It is possible to check all eigenvalues in O(n^2) rather than O(n^3) 60 | % but it's not particularly simple to implement. 61 | % See http://www.stat.uchicago.edu/~lekheng/courses/309f10/modified.pdf 62 | % Golub, 1973, "Some Modified Matrix Eigenvalue Problems" 63 | % http://epubs.siam.org/doi/abs/10.1137/1015032 64 | % but... 
65 | % if the diagonal term is just a scaled identity, 66 | % then it is trivial 67 | if all( d==d(1) ) && ~all(u==0)% diagonal 68 | minE = 1/d(1) - norm(Dinv*L)^2/(1+R'*Dinv*L); 69 | else 70 | minE = min(eig(V)); 71 | end 72 | if minE <= 0 73 | error('V must be positive definite'); 74 | end 75 | end 76 | else 77 | V = diag(d) + pm*(u*u'); 78 | if pm == -1 79 | if all( d==d(1) ) % diagonal 80 | minE = d(1) - norm(u)^2; 81 | else 82 | minE = min(eig(V)); 83 | end 84 | if minE <= 0 85 | error('V must be positive definite'); 86 | end 87 | end 88 | end 89 | 90 | if exist('OCTAVE_VERSION','builtin') || ~exist('cvx_begin','file') 91 | x = Inf; 92 | return; 93 | end 94 | 95 | x = solveInCVX(type,x0,V,offset,lambda,lwr,upr); 96 | % clean it up a bit: 97 | x = x.*( abs(x) > 1e-10 ); 98 | 99 | end % end of function 100 | 101 | 102 | function x = solveInCVX(type,x0,V,offset,lambda,lwr,upr) 103 | n = length(x0); 104 | cvx_precision best 105 | cvx_quiet true 106 | % minimize lambda*norm(x,1) + 1/2*sum_square( Vsqrt*(x-x0) ) + dot(offset,x) 107 | % avoid Vsqrt=sqrtm(V) for more accurate answer: 108 | 109 | switch lower(type) 110 | case 'l1' 111 | cvx_begin 112 | variable x(n,1) 113 | minimize norm(lambda.*x,1) + 1/2*quad_form(x-x0, V ) + dot(offset,x) 114 | cvx_end 115 | case 'l1pos' 116 | cvx_begin 117 | variable x(n,1) 118 | minimize norm(lambda.*x,1) + 1/2*quad_form(x-x0, V ) + dot(offset,x) 119 | subject to 120 | lambda.*x >= 0 121 | cvx_end 122 | case 'rplus' 123 | cvx_begin 124 | variable x(n,1) 125 | minimize 1/2*quad_form(x-x0, V ) + dot(offset,x) 126 | subject to 127 | lambda.*x >= 0 128 | cvx_end 129 | case 'box' 130 | if ~all( lwr <= upr ) 131 | error('Problem is infeasible'); 132 | end 133 | % Carefully handle cases when lwr = -Inf and/or upr=+Inf 134 | set1 = ~isinf(lwr); 135 | set2 = ~isinf(upr); 136 | if length(lambda)==1, lambda = repmat(lambda,n,1); end 137 | cvx_begin 138 | variable x(n,1) 139 | minimize 1/2*quad_form(x-x0, V ) + dot(offset,x) 140 | subject to 
141 | lambda(set1).*x(set1) >= lwr(set1) 142 | lambda(set2).*x(set2) <= upr(set2) 143 | cvx_end 144 | case 'hinge' 145 | hinge = @(x) sum(max(0,1-x)); 146 | cvx_begin 147 | variable x(n,1) 148 | minimize 1/2*quad_form(x-x0,V) + dot(offset,x) + hinge(lambda.*x) 149 | cvx_end 150 | case 'linf' 151 | hinge = @(x) sum(lambda.*max(0,1-x)); 152 | cvx_begin 153 | variable x(n,1) 154 | minimize 1/2*quad_form(x-x0,V) + dot(offset,x) 155 | subject to 156 | norm(lambda.*x, inf ) <= 1 157 | cvx_end 158 | otherwise 159 | error('That type is not yet supported'); 160 | end 161 | end -------------------------------------------------------------------------------- /tests/test_prox_accuracy.m: -------------------------------------------------------------------------------- 1 | %{ 2 | Tests the accuracy of several prox operators 3 | This will run a random assortment of tests 4 | The reference solution is computed with "solution_via_cvx" 5 | which uses CVX (http://cvxr.com/). 6 | If you don't have CVX installed, then it won't work 7 | 8 | In the future, one could make a predefined test set and then 9 | precompute the answers so that CVX is not necessary... 10 | 11 | Stephen Becker, Feb 26 2014 stephen.beckr@gmail.com 12 | %} 13 | 14 | % run setup_zeroSR1.m if you haven't already 15 | clear; clc; 16 | 17 | nTests = 100; 18 | n = 1e2; % dimension of the problem 19 | 20 | myQuadForm = @(x,V) x'*(V*x); 21 | for test = 1:nTests 22 | % Make a random problem 23 | d = rand(n,1); 24 | u = 10*randn(n,1); 25 | y = randn(n,1); 26 | offset = randn(n,1); 27 | lwr = randn(n,1); % used for the box constraints 28 | upr = lwr + 2*rand(n,1); 29 | lwr(randi(n)) = -Inf; 30 | upr(randi(n)) = Inf; 31 | lambda = randn(n,1); 32 | 33 | % And sometimes turn off these features 34 | if randn(1) > 0, lambda = []; end 35 | if randn(1) > 0, offset = []; end 36 | if randn(1) > 0, d(2:end) = d(1); end 37 | if randn(1) > 0, u = 0; end % i.e., normal prox! 
38 | sigma = 1;
39 | 
40 | % Pick a solver at random
41 | solverTypes = {'l1','l1pos','Rplus','box','hinge','linf'};
42 | type = solverTypes{ randi(length(solverTypes)) };
43 | 
44 | INVERT = sign( randn(1) )+1; % sometimes specify V, sometimes specify inv(V) (value is 0 or 2, used as a logical flag)
45 | 
46 | if isempty(lambda), lambda = 1; end
47 | if isempty(offset), offset = zeros(n,1); end
48 | INFEASIBLE = 1e14;
49 | EPS = 1e-13; % feasibility tolerance
50 | switch lower(type)
51 | case 'l1'
52 | [x_cvx,V] = solution_via_cvx('l1',y,d,u,lambda,offset,[],[],sigma,INVERT);
53 | obj = @(x) norm(lambda.*x,1) + 1/2*myQuadForm(x-y,V) + dot(offset,x);
54 | % If we use prox_rank1_generic
55 | prox = @(x,t) sign(x).*max(0, abs(x) - t );
56 | prox_brk_pts = @(s) [-s,s];
57 | % or, use
58 | % scaledProx = @prox_rank1_l1;
59 | case 'l1pos'
60 | [x_cvx,V] = solution_via_cvx('l1pos',y,d,u,lambda,offset,[],[],sigma,INVERT);
61 | obj = @(x) norm(lambda.*x,1) + 1/2*myQuadForm(x-y,V) + dot(offset,x) + INFEASIBLE*any( lambda.*x < -EPS );
62 | % If we use prox_rank1_generic
63 | prox = @(x,t) max(0, x - t );
64 | prox_brk_pts = @(s) [s];
65 | case 'rplus'
66 | [x_cvx,V] = solution_via_cvx('Rplus',y,d,u,lambda,offset,[],[],sigma,INVERT);
67 | obj = @(x) 1/2*myQuadForm(x-y,V) + dot(offset,x) + INFEASIBLE*any( lambda.*x < -EPS );
68 | prox = @(x,t) max(0, x);
69 | prox_brk_pts = @(s) 0; % since projection, scaling has no effect
70 | % scaledProx = @proj_rank1_Rplus;
71 | case 'box'
72 | [x_cvx,V] = solution_via_cvx('box',y,d,u,lambda,offset,lwr,upr,sigma,INVERT);
73 | obj = @(x) 1/2*myQuadForm(x-y,V) + dot(offset,x) + ...
74 | INFEASIBLE*any( lambda.*x < lwr-EPS | lambda.*x > upr+EPS );
75 | prox = @(x,t) max( min(upr,x), lwr );
76 | prox_brk_pts = @(s) [lwr,upr]; % since projection, scaling has no effect
77 | % scaledProx = @(varargin)proj_rank1_box(lwr,upr,varargin{:});
78 | case 'hinge'
79 | hinge = @(x) sum(max(0,1-lambda.*x));
80 | [x_cvx,V] = solution_via_cvx('hinge',y,d,u,lambda,offset,[],[],sigma,INVERT);
81 | obj = @(x) hinge(x) + 1/2*myQuadForm(x-y,V) + dot(offset,x);
82 | prox = @(x,t) 1 + (x-1).*( x > 1 ) + (x + t - 1).*( x + t < 1 );
83 | prox_brk_pts = @(s)[ones(size(s)), 1-s];
84 | % scaledProx = @prox_rank1_hinge;
85 | case 'linf'
86 | [x_cvx,V] = solution_via_cvx('linf',y,d,u,lambda,offset,[],[],sigma,INVERT);
87 | obj = @(x) INFEASIBLE*(norm(lambda.*x,Inf)>1+EPS) + 1/2*myQuadForm(x-y,V) + dot(offset,x);
88 | prox = @(x,t) sign(x).*min( 1, abs(x) );
89 | prox_brk_pts = @(s) [-ones(size(s)),ones(size(s))]; % since projection, scaling has no effect
90 | % scaledProx = @proj_rank1_linf;
91 | end
92 | if all(lambda==1), lambda = []; end % turn off the feature
93 | if all(offset==0), offset = []; end % turn off the feature
94 | 
95 | scaledProx = @(varargin) prox_rank1_generic( prox, prox_brk_pts, varargin{:});
96 | x = scaledProx( y, d, u, lambda, offset, sigma, INVERT);
97 | 
98 | if any(isinf( x_cvx ))
99 | % This means either CVX is not installed or this
100 | % is running in Octave.
101 | fprintf('Test %d/%d. Solver type %s. CVX solution not available\n', ...
102 | test, nTests, type );
103 | if obj(x) > INFEASIBLE
104 | fprintf(2,'\tSolution is not feasible! Maybe due to roundoff?\n');
105 | break;
106 | end
107 | if any(isnan(x)) % FIX: 'if isnan(x)' on a vector is true only when ALL entries are NaN; use any() to catch partial NaNs
108 | fprintf(2,'\tError detected!\n');
109 | break;
110 | end
111 | else
112 | 
113 | fprintf('Test %d/%d. Solver type %s. Error is %.2e\n', ...
114 | test,nTests, type, norm( x - x_cvx )/max(1e-5,norm(x_cvx)) );
115 | fprintf('\tObjective is %.2e, for cvx is %.2e, obj(x) - obj(x_cvx) is %.2e\n', ...
116 | obj(x), obj(x_cvx), obj(x)-obj(x_cvx) ); 117 | TOLERANCE1 = 1e-3; 118 | TOLERANCE2 = 1e-6; 119 | if any(isnan(x_cvx)) 120 | if any(isnan(x)) 121 | fprintf(2,'\tBoth solutions are NaN. Hmmm...\n'); 122 | else 123 | fprintf(2,'\tCVX returned NaN, our solver did not.\n'); 124 | if obj(x) > INFEASIBLE/2 125 | fprintf(2,'\tSolution is not feasible! Maybe due to roundoff?\n'); 126 | break; 127 | end 128 | end 129 | else 130 | if obj(x_cvx) > INFEASIBLE/2 131 | fprintf(2,'\tCVX solution is not feasible!\n'); 132 | end 133 | if obj(x) > INFEASIBLE/2 134 | fprintf(2,'\tOur solution is not feasible! Maybe due to roundoff?\n'); 135 | break; 136 | end 137 | if (obj(x)-obj(x_cvx))/max(1,abs(obj(x_cvx))) < TOLERANCE2 138 | fprintf(2,'\tGOOD\n'); 139 | elseif (obj(x)-obj(x_cvx))/max(1,abs(obj(x_cvx))) < TOLERANCE1 140 | fprintf(2,'\tMARGINAL -- Loss of accuracy\n'); 141 | else 142 | fprintf(2,'\tBAD\n'); 143 | break; 144 | end 145 | end 146 | end 147 | end -------------------------------------------------------------------------------- /tests/test_prox_speed.m: -------------------------------------------------------------------------------- 1 | %{ 2 | Test the speed of the various projections, as a function of input size 3 | The paper claims it is O( n log n), so we verify that here. 4 | 5 | We test 5 proxes, and also compare to the time it takes to sort n numbers, 6 | and also compare to O(n) and O(n log n) lines. 7 | 8 | The results: the scaled prox algorithms take about 10x the time 9 | to sort n numbers. Not bad. 10 | 11 | Stephen Becker, Feb 26 2014 stephen.beckr@gmail.com 12 | %} 13 | nReps = 5; 14 | nList = logspace(2,7,6); 15 | typeList = {'l1','Rplus','box','hinge','linf','sort'}; nTypes = length(typeList); 16 | RESULTS = zeros(nTypes,length(nList),nReps); 17 | INVERT = true; % must be true for now... 
18 | for ni = 1:length(nList) 19 | n = nList(ni); 20 | fprintf('Test %d of %d: n = %d\n', ni, length(nList), n ); 21 | for ri = 1:nReps 22 | d = rand(n,1); 23 | u = 10*randn(n,1); 24 | y = randn(n,1); 25 | offset = randn(n,1); 26 | lambda = 9; 27 | lwr = randn(n,1); 28 | upr = lwr + 2*rand(n,1); 29 | 30 | for type_i = 1:nTypes 31 | type = typeList{type_i}; 32 | 33 | switch lower(type) 34 | case 'l1' 35 | prox = @(x,t) sign(x).*max(0, abs(x) - t ); 36 | prox_brk_pts = @(s) [-s,s]; 37 | case 'rplus' 38 | prox = @(x,t) max(0, x); 39 | prox_brk_pts = @(s) 0; % since projection, scaling has no effect 40 | case 'box' 41 | prox = @(x,t) max( min(upr,x), lwr ); 42 | prox_brk_pts = @(s) [lwr,upr]; % since projection, scaling has no effect 43 | case 'hinge' 44 | prox = @(x,t) 1 + (x-1).*( x > 1 ) + (x + t - 1).*( x + t < 1 ); 45 | prox_brk_pts = @(s)[ones(size(s)), 1-s]; 46 | case 'linf' 47 | prox = @(x,t) sign(x).*min( 1, abs(x) ); 48 | prox_brk_pts = @(s) [-ones(size(s)),ones(size(s))]; 49 | case 'sort' 50 | end 51 | if strcmpi(type,'sort') % a baseline measure of speed 52 | t2 = tic; 53 | x = sort( y ); 54 | tm2 = toc(t2); 55 | else 56 | scaledProx = @(varargin) prox_rank1_generic( prox, prox_brk_pts, varargin{:}); 57 | t2 = tic; 58 | x = scaledProx( y, d, u, lambda, offset, 1, INVERT); 59 | tm2 = toc(t2); 60 | end 61 | RESULTS( type_i, ni, ri ) = tm2; 62 | end 63 | end 64 | end 65 | %% Plot 66 | figure(1); clf; 67 | times = median(RESULTS,3); 68 | h=loglog( nList, times', 'o-' ); 69 | set(h(end),'marker','*') 70 | xlabel('Dimension "n" of input'); 71 | ylabel('Time to solve, in seconds'); 72 | % Add a line of n 73 | hold all 74 | ref = 3; % which point to reference 75 | loglog( nList, nList*times(1,ref)/nList(ref), '--','linewidth',2 ); 76 | loglog( nList, nList.*log2(nList)*times(1,ref)/(nList(ref).*log2(nList(ref))), '--','linewidth',2 ); 77 | loglog( nList, nList.^2*times(1,ref)/(nList(ref)^2), '--','linewidth',2 ); 78 | legend( {typeList{:}, 'O(n)','O(n log 
n)','O(n^2)'}, 'location','northwest' ) 79 | ylim([1e-3,20]); 80 | title('Time to compute the scaled prox, median of 5 runs'); 81 | %% Save as a file 82 | % set(gcf, 'PaperPositionMode', 'auto'); 83 | % print -dpng test_prox_speed.png -------------------------------------------------------------------------------- /tests/test_prox_speed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/zeroSR1/ca0bcb8bf7746c1aa14dc32ffbdba23708293990/tests/test_prox_speed.png -------------------------------------------------------------------------------- /tests/test_solver_simple.m: -------------------------------------------------------------------------------- 1 | %{ 2 | Solve a few simple problems to make sure it works 3 | Solutions are saved in reference_solutions/ 4 | 5 | For now, we only have 1 test problem 6 | 7 | Stephen Becker, March 1 2014 8 | %} 9 | 10 | PROBLEM = 1; 11 | 12 | switch PROBLEM 13 | 14 | case 1 15 | N = 12; 16 | A = hilb(N); 17 | b = ones(N,1); 18 | lambda = 1e-1; 19 | 20 | Q = A'*A; 21 | c = A'*b; 22 | normQ = norm(Q); 23 | 24 | problemName=sprintf('simple_%03d', PROBLEM ); 25 | % Call this script, which returns variable xRef 26 | getReferenceSolution; 27 | nrmXref = norm(xRef); 28 | errFcn = @(x) norm( x - xRef )/nrmXref; 29 | 30 | % prox = @(x0,d,u) prox_rank1_l1( x0, d, u, lambda ); 31 | % or, allow 4 arguments, e.g., sigma 32 | prox = @(x0,d,u,varargin) prox_rank1_l1( x0, d, u, lambda, [], varargin{:} ); 33 | h = @(x) lambda*norm(x,1); 34 | 35 | % NOTE: the non-standard form (not |Ax-b|, rather ) 36 | % The "simple" means we do NOT include the lambda term 37 | % fcnSimple = @(w) w'*(Q*w)/2 - c'*w; 38 | % gradSimple = @(w) Q*w - c; % doesn't include non-smooth portion 39 | % % for L-BFGS-B, we will add to gradSimple, since we have made new smooth terms 40 | % fcn = @(w) fcnSimple(w) + h(w); 41 | 42 | % This does all the work for you 43 | % fcnGrad = @(x) 
quadraticFunction(x,Q,c);
44 | 
45 | % Or this form, which doesn't require Q to be formed
46 | % it should be a bit more numerically stable too
47 | fcnGrad = @(x) normSquaredFunction(x,A,[],b);
48 | 
49 | end
50 | 
51 | %% Solve with zeroSR1
52 | opts = struct('N',N,'verbose',25,'nmax',4000,'tol',1e-13);
53 | opts.L = normQ; % optional
54 | opts.errFcn = errFcn;
55 | 
56 | % -- Default values usually fine --
57 | % opts.BB = true;
58 | % opts.SR1_diagWeight=0.8;
59 | 
60 | tic
61 | [xk,nit, errStruct,optsOut] = zeroSR1(fcnGrad,[],h,prox,opts);
62 | % -- You can also call it this way, but can be slower --
63 | % [xk,nit, errStruct,optsOut] = zeroSR1(fcnSimple,gradSimple,h,prox,opts);
64 | tm = toc;
65 | solverStr = 'zeroSR1';
66 | fprintf('Final error for %15s is %.2e, took %.2f seconds\n', solverStr, errFcn(xk), tm );
67 | figure(1); clf;
68 | semilogy(errStruct(:,4) );
69 | hold all
70 | emphasizeRecent
71 | 
72 | %% and same solver but with pure BB, no 0SR1
73 | opts.SR1 = false;
74 | opts.BB_type = 2;
75 | opts.BB = true; tic; % FIX: restart the stopwatch; without this, tm2 also counted the first solve and the plotting above
76 | [xk,nit, errStruct,optsOut] = zeroSR1(fcnGrad,[],h,prox,opts);
77 | tm2 = toc;
78 | solverStr = 'BB, no linesearch, i.e., basically SPG/SpaRSA';
79 | fprintf('Final error for %15s is %.2e, took %.2f seconds\n', solverStr, errFcn(xk), tm2 );
80 | semilogy(errStruct(:,4) );
81 | hold all
82 | emphasizeRecent
83 | legend('zeroSR1','standard proximal gradient');
--------------------------------------------------------------------------------
/utilities/Contents.m:
--------------------------------------------------------------------------------
1 | % UTILITIES Collection of useful functions
2 | % cummin - reports the cumulative minimum of a sequence
3 | % emphasizeRecent - Makes the most recent line-series in bold, and all the others
4 | % fminunc_wrapper - bundles together function and gradient calls
5 | % rng - Control the random number generator used by RAND, RANDI, etc.
6 | 
--------------------------------------------------------------------------------
/utilities/cummin.m:
--------------------------------------------------------------------------------
1 | function x = cummin(x)
2 | % y = cummin(x)
3 | % finds the cumulative minimum of x
4 | % e.g. y_i = min( x_i, y_{i-1} )
5 | %
6 | % Stephen Becker, 2011, stephen.beckr@gmail.com
7 | 
8 | if numel(x) > length(x)
9 | error('input must be a vector');
10 | end
11 | for k = 2:length(x)
12 | x(k) = min( x(k), x(k-1) );
13 | end
14 | 
--------------------------------------------------------------------------------
/utilities/emphasizeRecent.m:
--------------------------------------------------------------------------------
1 | function emphasizeRecent
2 | % Makes the most recent line-series in bold, and all the others
3 | % are not in bold. Call this with no arguments or outputs.
4 | % Written by Stephen Becker, stephen.beckr@gmail.com 2011
5 | 
6 | list = get(gca,'children');
7 | 
8 | % Make everything else normal width
9 | % i.e. undo any previous calls of emphasizeRecent()
10 | set( list, 'linewidth', 0.5 );
11 | 
12 | % Make most recent item in bold
13 | set( list(1), 'linewidth',2);
14 | 
--------------------------------------------------------------------------------
/utilities/fminunc_wrapper.m:
--------------------------------------------------------------------------------
1 | function [f,g,h] = fminunc_wrapper(x,F,G,H, errFcn,extraFcn)
2 | % [f,g,h] = fminunc_wrapper( x, F, G, H, errFcn )
3 | % for use with Matlab's "fminunc" and other optimization programs
4 | % with similar conventions.
5 | % Here, "x" is the current point, "F" is the objective function,
6 | % "G" is the gradient of F, and "H" is the Hessian of F.
7 | %
8 | % "errFcn", if provided, will be evaluated at x and the results
9 | % stored in the "errHist" variable.
10 | % 11 | % [fHist,errHist] = fminunc_wrapper() 12 | % will return the function history 13 | % (and error history as well, if errFcn was provided) 14 | % and reset the history to zero. 15 | % 16 | % Written by Stephen Becker, 2011, stephen.beckr@gmail.com 17 | % Feb 2015, if F is vector-valued, then the history feature 18 | % is disabled (could fix it if I need this feature) 19 | 20 | persistent errHist fcnHist nCalls 21 | if nargin == 0 22 | % we are in [fHist,errHist] = fminunc_wrapper(); mode ) 23 | f = fcnHist(1:nCalls); 24 | g = errHist(1:nCalls); 25 | fcnHist = []; 26 | errHist = []; 27 | nCalls = 0; 28 | return; 29 | end 30 | if isempty( fcnHist ) 31 | [errHist,fcnHist] = deal( zeros(100,1) ); 32 | end 33 | 34 | f = F(x); 35 | if numel(f)==1 36 | % Record this: 37 | nCalls = nCalls + 1; 38 | if length( errHist ) < nCalls 39 | % allocate more memory 40 | errHist(end:2*end) = 0; 41 | fcnHist(end:2*end) = 0; 42 | end 43 | fcnHist(nCalls) = f; 44 | if nargin >= 6 && ~isempty(extraFcn) 45 | % this is used when we want to record the objective function 46 | % for something non-smooth, and this routine is used only for the smooth 47 | % part. So for recording purposes, add in the nonsmooth part 48 | % But do NOT return it as a function value or it will mess up the 49 | % optimization algorithm. 50 | fcnHist(nCalls) = f + extraFcn(x); 51 | end 52 | end 53 | 54 | if nargin > 2 && nargout > 1 55 | g = G(x); 56 | end 57 | if nargin > 3 && ~isempty(H) && nargout > 2 58 | h = H(x); 59 | end 60 | 61 | % and if error is requested... 
62 | if nargin >= 5 && ~isempty( errFcn) 63 | if length( errHist ) < nCalls 64 | % allocate more memory 65 | errHist(end:2*end) = 0; 66 | end 67 | errHist(nCalls) = errFcn(x); 68 | end 69 | -------------------------------------------------------------------------------- /utilities/rng.m: -------------------------------------------------------------------------------- 1 | function varargout = rng(varargin) 2 | %RNG Control the random number generator used by RAND, RANDI, and RANDN (SRB version) 3 | % RNG(SD) seeds the random number generator using the non-negative 4 | % integer SD so that RAND, RANDI, and RANDN produce a predictable 5 | % sequence of numbers. 6 | % 7 | % RNG('shuffle') seeds the random number generator based on the current 8 | % time so that RAND, RANDI, and RANDN produce a different sequence of 9 | % numbers after each time you call RNG. 10 | % 11 | % RNG(SD,GENERATOR) and RNG('shuffle',GENERATOR) additionally specify the 12 | % type of the random number generator used by RAND, RANDI, and RANDN. 13 | % GENERATOR is one of: 14 | % 15 | % Generator Description 16 | % ------------------------------------------------------------------ 17 | % 'twister' Mersenne Twister 18 | % 'combRecursive' Combined Multiple Recursive 19 | % 'multFibonacci' Multiplicative Lagged Fibonacci 20 | % 'v5uniform' Legacy MATLAB 5.0 uniform generator 21 | % 'v5normal' Legacy MATLAB 5.0 normal generator 22 | % 'v4' Legacy MATLAB 4.0 generator 23 | % 24 | % RNG('default') puts the settings of the random number generator used by 25 | % RAND, RANDI, and RANDN to their default values so that they produce the 26 | % same random numbers as if you restarted MATLAB. In this release, the 27 | % default settings are the Mersenne Twister with seed 0. 28 | % 29 | % S = RNG returns the current settings of the random number generator 30 | % used by RAND, RANDI, and RANDN. The settings are returned in a 31 | % structure S with fields 'Type', 'Seed', and 'State'. 
32 | % 33 | % RNG(S) restores the settings of the random number generator used by 34 | % RAND, RANDI, and RANDN back to the values captured previously by 35 | % S = RNG. 36 | % 37 | % S = RNG(...) additionally returns the previous settings of the random 38 | % number generator used by RAND, RANDI, and RANDN before changing the 39 | % seed, generator type or the settings. 40 | % 41 | % Example 1: 42 | % s = rng % get the current generator settings 43 | % x = rand(1,5) % RAND generates some values 44 | % rng(s) % restore the generator settings 45 | % y = rand(1,5) % generate the same values so x and y are equal 46 | % 47 | % Example 2: 48 | % oldS = rng(0,'v5uniform') % use legacy generator 49 | % x = rand % legacy startup value .9501 50 | % rng(oldS) % restore the old settings 51 | % 52 | % See Updating Your Random Number Generator Syntax to use RNG to replace 53 | % RAND or RANDN with the 'seed', 'state', or 'twister' inputs. 54 | % 55 | % MODIFIED BY STEPHEN BECKER 56 | % See also RAND, RANDI, RANDN, RandStream, NOW. 57 | 58 | 59 | % See Choosing a Random Number Generator for details on these generators. 60 | 61 | % Copyright 2010 The MathWorks, Inc. 62 | % $Revision: 1.1.6.1 $ $Date: 2010/10/25 16:06:38 $ 63 | 64 | persistent do_once 65 | % 2014, rng is not builtin, it's in a package, so be careful: 66 | C = which('rng','-all'); 67 | if isempty( do_once ), do_once = 0; end 68 | if size(C,1) > 1 && do_once < size(C,1) 69 | do_once = do_once + 1; 70 | % add this directory to the very top of the path so it shadows this 71 | % file... 
72 | addpath(fileparts( C{end} ) ) 73 | % disp('Re-run your code; the path to rng has been fixed'); 74 | [varargout{1:nargout}] = rng( varargin{:} ); 75 | return; 76 | end 77 | 78 | if exist('rng','builtin') 79 | [varargout{1:nargout}] = builtin('rng',varargin{:} ); 80 | return; 81 | end 82 | 83 | % if exist('rng','builtin') 84 | % switch nargin 85 | % case 0 86 | % if nargout > 0 87 | % settings = builtin('rng'); 88 | % else 89 | % builtin('rng'); 90 | % end 91 | % case 1 92 | % if nargout > 0 93 | % settings = builtin('rng',arg1); 94 | % else 95 | % builtin('rng',arg1); 96 | % end 97 | % case 2 98 | % if nargout > 0 99 | % settings = builtin('rng',arg1,arg2); 100 | % else 101 | % builtin('rng',arg1,arg2); 102 | % end 103 | % end 104 | % return; 105 | % end 106 | 107 | % -- SRB adding this -- 108 | error(nargchk(1,1,nargin)); 109 | error(nargoutchk(0,0,nargout)); 110 | arg1 = varargin{1}; 111 | % For R2008a, this doesn't work... (not sure what earliest version is) 112 | if verLessThan('matlab','7.7') 113 | randn('state',arg1); 114 | rand('state',arg1); 115 | elseif verLessThan('matlab','8') 116 | RandStream.setDefaultStream(RandStream('mt19937ar', 'seed', arg1 )); 117 | else 118 | RandStream.setGlobalStream(RandStream('mt19937ar', 'seed', arg1 )); 119 | end 120 | --------------------------------------------------------------------------------