├── .gitignore ├── .dokx ├── mkdocs.yml ├── test ├── test_cg.lua ├── test_adam.lua ├── test_sgd.lua ├── test_lbfgs_w_ls.lua ├── test_adagrad.lua ├── test_rmsprop.lua ├── test_adamax.lua ├── test_adadelta.lua ├── test_cmaes.lua ├── test_logger.lua ├── l2.lua ├── test_lbfgs.lua ├── test_confusion.lua ├── rosenbrock.lua ├── test_fista.lua └── sparsecoding.lua ├── CMakeLists.txt ├── init.lua ├── optim-1.0.5-0.rockspec ├── optim-1.0.4-0.rockspec ├── optim-1.0.3-0.rockspec ├── optim-1.0.3-1.rockspec ├── checkgrad.lua ├── README.md ├── adagrad.lua ├── rmsprop.lua ├── COPYRIGHT.txt ├── adadelta.lua ├── asgd.lua ├── adamax.lua ├── adam.lua ├── nag.lua ├── sgd.lua ├── rprop.lua ├── Logger.lua ├── cg.lua ├── lswolfe.lua ├── fista.lua ├── polyinterp.lua ├── lbfgs.lua ├── cmaes.lua ├── ConfusionMatrix.lua └── doc └── index.md /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /.dokx: -------------------------------------------------------------------------------- 1 | return { 2 | githubURL = "torch/optim", 3 | exclude = {"test", "polyinterp.lua"} 4 | } 5 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: optim 2 | theme : simplex 3 | repo_url : https://github.com/torch/optim 4 | use_directory_urls : false 5 | markdown_extensions: [extra] 6 | docs_dir : doc 7 | pages: 8 | - [index.md, Optim] 9 | -------------------------------------------------------------------------------- /test/test_cg.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'optim' 3 | 4 | require 'rosenbrock' 5 | require 'l2' 6 | 7 | 8 | x = torch.Tensor(2):fill(0) 9 | x,fx,i=optim.cg(rosenbrock,x,{maxIter=50}) 10 | 11 | print() 12 | print('Rosenbrock test: compare with http://www.gatsby.ucl.ac.uk/~edward/code/minimize/example.html') 13 | print() 14 | print('Number of function evals = ',i) 15 | print('x=');print(x) 16 | print('fx=') 17 | for i=1,#fx do print(i,fx[i]); end 18 | -------------------------------------------------------------------------------- /test/test_adam.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'optim' 3 | require 'rosenbrock' 4 | require 'l2' 5 | x = torch.Tensor(2):fill(0) 6 | fx = {} 7 | config = {learningRate=0.002} 8 | for i = 1,10001 do 9 | x,f=optim.adam(rosenbrock,x,config) 10 | if (i-1)%1000 == 0 then 11 | table.insert(fx,f[1]) 12 | end 13 | end 14 | print() 15 | print('Rosenbrock test') 16 | print() 17 | print('x=');print(x) 18 | print('fx=') 19 | for i=1,#fx do print((i-1)*1000+1,fx[i]); end 20 | -------------------------------------------------------------------------------- /test/test_sgd.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'optim' 3 | 4 | require 'rosenbrock' 5 | require 'l2' 6 | 7 | x = torch.Tensor(2):fill(0) 8 | fx = {} 9 | 10 | config = {learningRate=1e-3} 11 | for i = 1,10001 do 12 | x,f=optim.sgd(rosenbrock,x,config) 13 | if (i-1)%1000 == 0 then 14 | table.insert(fx,f[1]) 15 | end 16 | end 17 | 18 | print() 19 | print('Rosenbrock test') 20 | print() 21 | print('x=');print(x) 22 | print('fx=') 23 | for i=1,#fx do print((i-1)*1000+1,fx[i]); end 24 | 
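-- Note: `config` is passed on every call above, and optim.sgd keeps its state
-- (e.g. the evaluation counter used for learning-rate decay) in that same table,
-- so it persists across the 10001 iterations.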
-------------------------------------------------------------------------------- /test/test_lbfgs_w_ls.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'optim' 3 | 4 | require 'rosenbrock' 5 | require 'l2' 6 | 7 | print('--- batch test w/ line search ---') 8 | 9 | x = torch.Tensor(2):fill(0) 10 | x,fx,i=optim.lbfgs(rosenbrock,x,{maxIter=100, lineSearch=optim.lswolfe}) 11 | 12 | print() 13 | print('Rosenbrock test') 14 | print() 15 | print('Number of function evals = ',i) 16 | print('x=');print(x) 17 | print('fx=') 18 | for i=1,#fx do print(i,fx[i]); end 19 | print() 20 | print() 21 | -------------------------------------------------------------------------------- /test/test_adagrad.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'optim' 3 | 4 | require 'rosenbrock' 5 | require 'l2' 6 | 7 | x = torch.Tensor(2):fill(0) 8 | fx = {} 9 | 10 | config = {learningRate=1e-1} 11 | for i = 1,10001 do 12 | x,f=optim.adagrad(rosenbrock,x,config) 13 | if (i-1)%1000 == 0 then 14 | table.insert(fx,f[1]) 15 | end 16 | end 17 | 18 | print() 19 | print('Rosenbrock test') 20 | print() 21 | print('x=');print(x) 22 | print('fx=') 23 | for i=1,#fx do print((i-1)*1000+1,fx[i]); end 24 | -------------------------------------------------------------------------------- /test/test_rmsprop.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'optim' 3 | 4 | require 'rosenbrock' 5 | require 'l2' 6 | 7 | x = torch.Tensor(2):fill(0) 8 | fx = {} 9 | 10 | config = {learningRate=5e-4} 11 | for i = 1,10001 do 12 | x,f=optim.rmsprop(rosenbrock,x,config) 13 | if (i-1)%1000 == 0 then 14 | table.insert(fx,f[1]) 15 | end 16 | end 17 | 18 | print() 19 | print('Rosenbrock test') 20 | print() 21 | print('x=');print(x) 22 | print('fx=') 23 | for i=1,#fx do print((i-1)*1000+1,fx[i]); end 24 | -------------------------------------------------------------------------------- /test/test_adamax.lua: -------------------------------------------------------------------------------- 1 | 2 | require 'torch' 3 | require 'optim' 4 | require 'rosenbrock' 5 | require 'l2' 6 | 7 | x = torch.Tensor(2):fill(0) 8 | fx = {} 9 | state = {} 10 | config = {} 11 | for i = 1,10001 do 12 | x,f=optim.adamax(rosenbrock,x,config,state) 13 | if (i-1)%1000 == 0 then 14 | table.insert(fx,f[1]) 15 | end 16 | end 17 | 18 | print() 19 | print('Rosenbrock test') 20 | print() 21 | print('x=');print(x) 22 | print('fx=') 23 | for i=1,#fx do print((i-1)*1000+1,fx[i]); end 24 | -------------------------------------------------------------------------------- /test/test_adadelta.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'optim' 3 | 4 | require 'rosenbrock' 5 | require 'l2' 6 | 7 | x = torch.Tensor(2):fill(0) 8 | fx = {} 9 | state = {} 10 | config = {eps=1e-10} 11 | for i = 1,10001 do 12 | x,f=optim.adadelta(rosenbrock,x,config,state) 13 | if (i-1)%1000 == 0 then 14 | table.insert(fx,f[1]) 15 | end 16 | end 17 | 18 | print() 19 | print('Rosenbrock test') 20 | print() 21 | print('x=');print(x) 22 | print('fx=') 23 | for i=1,#fx do print((i-1)*1000+1,fx[i]); end 24 | -------------------------------------------------------------------------------- /test/test_cmaes.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'optim' 3 | 4 | require 
'rosenbrock' 5 | require 'l2' 6 | 7 | -- 10-D rosenbrock 8 | x = torch.Tensor(10):fill(0) 9 | config = {maxEval=10000, sigma=0.5, verb_disp=0} 10 | 11 | -- will take some time 12 | x,fx,i=optim.cmaes(rosenbrock,x,config) 13 | 14 | 15 | print('Rosenbrock test') 16 | print() 17 | -- approx 6500 function evals expected 18 | print('Number of function evals = ',i) 19 | print('x=');print(x) 20 | print('fx=') 21 | for i=1,#fx do print(i,fx[i]); end 22 | print() 23 | print() -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR) 3 | CMAKE_POLICY(VERSION 2.6) 4 | IF(LUAROCKS_PREFIX) 5 | MESSAGE(STATUS "Installing Torch through Luarocks") 6 | STRING(REGEX REPLACE "(.*)lib/luarocks/rocks.*" "\\1" CMAKE_INSTALL_PREFIX "${LUAROCKS_PREFIX}") 7 | MESSAGE(STATUS "Prefix inferred from Luarocks: ${CMAKE_INSTALL_PREFIX}") 8 | ENDIF() 9 | FIND_PACKAGE(Torch REQUIRED) 10 | 11 | SET(src) 12 | FILE(GLOB luasrc *.lua) 13 | ADD_TORCH_PACKAGE(optim "${src}" "${luasrc}") 14 | #ADD_TORCH_DOK(dok optim "Machine Learning" "Optimization" 3.2) 15 | -------------------------------------------------------------------------------- /test/test_logger.lua: -------------------------------------------------------------------------------- 1 | require 'optim' 2 | 3 | 4 | logger_former = optim.Logger('accuracy-former.log') 5 | logger_new = optim.Logger('accuracy-new.log') 6 | 7 | logger_new:setNames({'channel 1', 'channel 2', 'channel 3'}) 8 | 9 | for i = 1, 20 do 10 | logger_former:add({['channel 1'] = 1 , ['channel 2'] = 0.1 * i, ['channel 3'] = 1 - 0.2 * i}) 11 | logger_new:add({1 , 0.1 * i, 1 - 0.2 * i}) 12 | end 13 | 14 | logger_former:style({['channel 1'] = '-' , ['channel 2'] = '-', ['channel 3'] = '-'}) 15 | logger_new:style{'-', '-', '-'} 16 | 17 | logger_former:plot() 18 | logger_new:plot() 19 | 20 | 21 | -------------------------------------------------------------------------------- /test/l2.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | -- rosenbrock.m This function returns the function value, partial derivatives 3 | -- and Hessian of the (general dimension) rosenbrock function, given by: 4 | -- 5 | -- f(x) = sum_{i=1:D-1} 100*(x(i+1) - x(i)^2)^2 + (1-x(i))^2 6 | -- 7 | -- where D is the dimension of x. The true minimum is 0 at x = (1 1 ... 1). 8 | -- 9 | -- Carl Edward Rasmussen, 2001-07-21. 
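--
-- (The header above describes the Rosenbrock function; the l2 function below is a
-- simpler test objective: f(x) = sum_i x(i)^2, with gradient df/dx = 2*x and
-- minimum 0 at x = 0.)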
10 | 11 | function l2(x) 12 | 13 | local xx = x:clone() 14 | xx:cmul(xx) 15 | local fout = xx:sum() 16 | 17 | local dx = torch.Tensor():resizeAs(x):copy(x) 18 | dx:mul(2) 19 | --print('l2 eval = ', fout) 20 | return fout,dx 21 | 22 | end -------------------------------------------------------------------------------- /init.lua: -------------------------------------------------------------------------------- 1 | 2 | require 'torch' 3 | 4 | optim = {} 5 | 6 | -- optimizations 7 | require('optim.sgd') 8 | require('optim.cg') 9 | require('optim.asgd') 10 | require('optim.nag') 11 | require('optim.fista') 12 | require('optim.lbfgs') 13 | require('optim.adagrad') 14 | require('optim.rprop') 15 | require('optim.adam') 16 | require('optim.adamax') 17 | require('optim.rmsprop') 18 | require('optim.adadelta') 19 | require('optim.cmaes') 20 | 21 | -- line search functions 22 | require('optim.lswolfe') 23 | 24 | -- helpers 25 | require('optim.polyinterp') 26 | require('optim.checkgrad') 27 | 28 | -- tools 29 | require('optim.ConfusionMatrix') 30 | require('optim.Logger') 31 | 32 | return optim 33 | -------------------------------------------------------------------------------- /optim-1.0.5-0.rockspec: -------------------------------------------------------------------------------- 1 | package = "optim" 2 | version = "1.0.5-0" 3 | 4 | source = { 5 | url = "git://github.com/torch/optim", 6 | } 7 | 8 | description = { 9 | summary = "An optimization library for Torch.", 10 | detailed = [[ 11 | This package contains several optimization routines for Torch. 12 | ]], 13 | homepage = "https://github.com/torch/optim", 14 | license = "BSD" 15 | } 16 | 17 | dependencies = { 18 | "torch >= 7.0", 19 | } 20 | 21 | build = { 22 | type = "command", 23 | build_command = [[ 24 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) 25 | ]], 26 | install_command = "cd build && $(MAKE) install" 27 | } 28 | -------------------------------------------------------------------------------- /optim-1.0.4-0.rockspec: -------------------------------------------------------------------------------- 1 | package = "optim" 2 | version = "1.0.4-0" 3 | 4 | source = { 5 | url = "git://github.com/torch/optim", 6 | tag = "1.0.4-0" 7 | } 8 | 9 | description = { 10 | summary = "An optimization library for Torch.", 11 | detailed = [[ 12 | This package contains several optimization routines for Torch. 13 | ]], 14 | homepage = "https://github.com/torch/optim", 15 | license = "BSD" 16 | } 17 | 18 | dependencies = { 19 | "torch >= 7.0", 20 | } 21 | 22 | build = { 23 | type = "command", 24 | build_command = [[ 25 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) 26 | ]], 27 | install_command = "cd build && $(MAKE) install" 28 | } 29 | -------------------------------------------------------------------------------- /optim-1.0.3-0.rockspec: -------------------------------------------------------------------------------- 1 | package = "optim" 2 | version = "1.0.3-0" 3 | 4 | source = { 5 | url = "git://github.com/torch/optim", 6 | tag = "1.0.3-0" 7 | } 8 | 9 | description = { 10 | summary = "An optimization library for Torch.", 11 | detailed = [[ 12 | This package contains several optimization routines for Torch. 
13 | ]], 14 | homepage = "https://github.com/torch/optim", 15 | license = "BSD" 16 | } 17 | 18 | dependencies = { 19 | "torch >= 7.0", 20 | "sys >= 1.0", 21 | } 22 | 23 | build = { 24 | type = "command", 25 | build_command = [[ 26 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) 27 | ]], 28 | install_command = "cd build && $(MAKE) install" 29 | } 30 | -------------------------------------------------------------------------------- /optim-1.0.3-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "optim" 2 | version = "1.0.3-1" 3 | 4 | source = { 5 | url = "git://github.com/torch/optim", 6 | tag = "1.0.3-1" 7 | } 8 | 9 | description = { 10 | summary = "An optimization library for Torch.", 11 | detailed = [[ 12 | This package contains several optimization routines for Torch. 13 | ]], 14 | homepage = "https://github.com/torch/optim", 15 | license = "BSD" 16 | } 17 | 18 | dependencies = { 19 | "torch >= 7.0", 20 | "sys >= 1.0", 21 | } 22 | 23 | build = { 24 | type = "command", 25 | build_command = [[ 26 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) 27 | ]], 28 | install_command = "cd build && $(MAKE) install" 29 | } 30 | -------------------------------------------------------------------------------- /test/test_lbfgs.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'optim' 3 | 4 | require 'rosenbrock' 5 | require 'l2' 6 | 7 | print('--- regular batch test ---') 8 | 9 | x = torch.Tensor(2):fill(0) 10 | x,fx,i=optim.lbfgs(rosenbrock,x,{maxIter=100, learningRate=1e-1}) 11 | 12 | print() 13 | print('Rosenbrock test') 14 | print() 15 | print('Number of function evals = ',i) 16 | print('x=');print(x) 17 | print('fx=') 18 | for i=1,#fx do print(i,fx[i]); end 19 | print() 20 | print() 21 | 22 | print('--- stochastic test ---') 23 | 24 | x = torch.Tensor(2):fill(0) 25 | fx = {} 26 | config = {learningRate=1e-1, maxIter=1} 27 | for i = 1,100 do 28 | x,f=optim.lbfgs(rosenbrock,x,config) 29 | table.insert(fx,f[1]) 30 | end 31 | 32 | print() 33 | print('Rosenbrock test') 34 | print() 35 | print('Number of function evals = ',i) 36 | print('x=');print(x) 37 | print('fx=') 38 | for i=1,#fx do print(i,fx[i]); end 39 | -------------------------------------------------------------------------------- /test/test_confusion.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'optim' 3 | 4 | n_feature = 3 5 | classes = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9} 6 | 7 | print'ConfusionMatrix:__init() test' 8 | cm = optim.ConfusionMatrix(#classes, classes) 9 | 10 | target = 3 11 | prediction = torch.randn(#classes) 12 | 13 | print'ConfusionMatrix:add() test' 14 | cm:add(prediction, target) 15 | cm:add(prediction, torch.randn(#classes)) 16 | 17 | batch_size = 8 18 | 19 | targets = torch.randperm(batch_size) 20 | predictions = torch.randn(batch_size, #classes) 21 | 22 | print'ConfusionMatrix:batchAdd() test' 23 | cm:batchAdd(predictions, targets) 24 | assert(cm.mat:sum() == batch_size + 2, 'missing examples') 25 | 26 | print'ConfusionMatrix:updateValids() test' 27 | cm:updateValids() 28 | 29 | print'ConfusionMatrix:__tostring__() test' 30 | print(cm) 31 | 32 | target = 0 33 | cm:add(prediction, target) 34 | 
assert(cm.mat:sum() == batch_size + 2, 'too many examples') 35 | 36 | -- FAR/FRR testing on identify matrix. FRR/FAR should be zero for identity. 37 | cm.mat = torch.eye(#classes, #classes) 38 | classFrrs, classFars, frrs, fars = cm:farFrr() 39 | assert(classFrrs:sum() + classFars:sum() == 0, "Incorrect values") 40 | -------------------------------------------------------------------------------- /checkgrad.lua: -------------------------------------------------------------------------------- 1 | --[[ An implementation of a simple numerical gradient checker. 2 | 3 | ARGS: 4 | 5 | - `opfunc` : a function that takes a single input (X), the point of 6 | evaluation, and returns f(X) and df/dX 7 | - `x` : the initial point 8 | - `eps` : the epsilon to use for the numerical check (default is 1e-7) 9 | 10 | RETURN: 11 | 12 | - `diff` : error in the gradient, should be near tol 13 | - `dC` : exact gradient at point 14 | - `dC_est` : numerically estimates gradient at point 15 | 16 | ]]-- 17 | 18 | 19 | -- function that numerically checks gradient of NCA loss: 20 | function optim.checkgrad(opfunc, x, eps) 21 | 22 | -- compute true gradient: 23 | local _,dC = opfunc(x) 24 | dC:resize(x:size()) 25 | 26 | -- compute numeric approximations to gradient: 27 | local eps = eps or 1e-7 28 | local dC_est = torch.Tensor():typeAs(dC):resizeAs(dC) 29 | for i = 1,dC:size(1) do 30 | x[i] = x[i] + eps 31 | local C1 = opfunc(x) 32 | x[i] = x[i] - 2 * eps 33 | local C2 = opfunc(x) 34 | x[i] = x[i] + eps 35 | dC_est[i] = (C1 - C2) / (2 * eps) 36 | end 37 | 38 | -- estimate error of gradient: 39 | local diff = torch.norm(dC - dC_est) / torch.norm(dC + dC_est) 40 | return diff,dC,dC_est 41 | end 42 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Optimization package 2 | 3 | This package contains several optimization routines for [Torch](https://github.com/torch/torch7/blob/master/README.md). 4 | Each optimization algorithm is based on the same interface: 5 | 6 | ```lua 7 | x*, {f}, ... = optim.method(func, x, state) 8 | ``` 9 | 10 | where: 11 | 12 | * `func`: a user-defined closure that respects this API: `f, df/dx = func(x)` 13 | * `x`: the current parameter vector (a 1D `torch.Tensor`) 14 | * `state`: a table of parameters, and state variables, dependent upon the algorithm 15 | * `x*`: the new parameter vector that minimizes `f, x* = argmin_x f(x)` 16 | * `{f}`: a table of all f values, in the order they've been evaluated 17 | (for some simple algorithms, like SGD, `#f == 1`) 18 | 19 | ## Available algorithms 20 | 21 | Please check [this file](doc/index.md) for the full list of 22 | optimization algorithms available and examples. Get also into the 23 | [`test`](test/) directory for straightforward examples using the 24 | [Rosenbrock's](test/rosenbrock.lua) function. 25 | 26 | ## Important Note 27 | 28 | The state table is used to hold the state of the algorithm. 29 | It's usually initialized once, by the user, and then passed to the optim function 30 | as a black box. 
Example: 31 | 32 | ```lua 33 | state = { 34 | learningRate = 1e-3, 35 | momentum = 0.5 36 | } 37 | 38 | for i,sample in ipairs(training_samples) do 39 | local func = function(x) 40 | -- define eval function 41 | return f,df_dx 42 | end 43 | optim.sgd(func,x,state) 44 | end 45 | ``` 46 | -------------------------------------------------------------------------------- /test/rosenbrock.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | -- rosenbrock.m This function returns the function value, partial derivatives 3 | -- and Hessian of the (general dimension) rosenbrock function, given by: 4 | -- 5 | -- f(x) = sum_{i=1:D-1} 100*(x(i+1) - x(i)^2)^2 + (1-x(i))^2 6 | -- 7 | -- where D is the dimension of x. The true minimum is 0 at x = (1 1 ... 1). 8 | -- 9 | -- Carl Edward Rasmussen, 2001-07-21. 10 | 11 | function rosenbrock(x) 12 | 13 | -- (1) compute f(x) 14 | local d = x:size(1) 15 | -- x1 = x(i)^2 16 | local x1 = x.new(d-1):copy(x:narrow(1,1,d-1)) 17 | -- x(i+1) - x(i)^2 18 | x1:cmul(x1):mul(-1):add(x:narrow(1,2,d-1)) 19 | 20 | -- 100*(x(i+1) - x(i)^2)^2 21 | x1:cmul(x1):mul(100) 22 | 23 | -- x(i) 24 | local x0 = x.new(d-1):copy(x:narrow(1,1,d-1)) 25 | -- 1-x(i) 26 | x0:mul(-1):add(1) 27 | -- (1-x(i))^2 28 | x0:cmul(x0) 29 | -- 100*(x(i+1) - x(i)^2)^2 + (1-x(i))^2 30 | x1:add(x0) 31 | local fout = x1:sum() 32 | 33 | -- (2) compute f(x)/dx 34 | local dxout = x.new():resizeAs(x):zero() 35 | -- df(1:D-1) = - 400*x(1:D-1).*(x(2:D)-x(1:D-1).^2) - 2*(1-x(1:D-1)); 36 | 37 | x1:copy(x:narrow(1,1,d-1)) 38 | x1:cmul(x1):mul(-1):add(x:narrow(1,2,d-1)):cmul(x:narrow(1,1,d-1)):mul(-400) 39 | x0:copy(x:narrow(1,1,d-1)):mul(-1):add(1):mul(-2) 40 | x1:add(x0) 41 | dxout:narrow(1,1,d-1):copy(x1) 42 | 43 | -- df(2:D) = df(2:D) + 200*(x(2:D)-x(1:D-1).^2); 44 | x0:copy(x:narrow(1,1,d-1)) 45 | x0:cmul(x0):mul(-1):add(x:narrow(1,2,d-1)):mul(200) 46 | dxout:narrow(1,2,d-1):add(x0) 47 | 48 | return fout,dxout 49 | 50 | end 51 | -------------------------------------------------------------------------------- /adagrad.lua: -------------------------------------------------------------------------------- 1 | --[[ ADAGRAD implementation for SGD 2 | 3 | ARGS: 4 | - `opfunc` : a function that takes a single input (X), the point of 5 | evaluation, and returns f(X) and df/dX 6 | - `x` : the initial point 7 | - `state` : a table describing the state of the optimizer; after each 8 | call the state is modified 9 | - `state.learningRate` : learning rate 10 | - `state.paramVariance` : vector of temporal variances of parameters 11 | - `state.weightDecay` : scalar that controls weight decay 12 | RETURN: 13 | - `x` : the new x vector 14 | - `f(x)` : the function, evaluated before the update 15 | 16 | ]] 17 | function optim.adagrad(opfunc, x, config, state) 18 | -- (0) get/update state 19 | if config == nil and state == nil then 20 | print('no state table, ADAGRAD initializing') 21 | end 22 | local config = config or {} 23 | local state = state or config 24 | local lr = config.learningRate or 1e-3 25 | local lrd = config.learningRateDecay or 0 26 | local wd = config.weightDecay or 0 27 | state.evalCounter = state.evalCounter or 0 28 | local nevals = state.evalCounter 29 | 30 | -- (1) evaluate f(x) and df/dx 31 | local fx,dfdx = opfunc(x) 32 | 33 | -- (2) weight decay with a single parameter 34 | if wd ~= 0 then 35 | dfdx:add(wd, x) 36 | end 37 | 38 | -- (3) learning rate decay (annealing) 39 | local clr = lr / (1 + nevals*lrd) 40 | 41 | -- (4) parameter update with single or 
individual learning rates 42 | if not state.paramVariance then 43 | state.paramVariance = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() 44 | state.paramStd = torch.Tensor():typeAs(x):resizeAs(dfdx) 45 | end 46 | state.paramVariance:addcmul(1,dfdx,dfdx) 47 | state.paramStd:resizeAs(state.paramVariance):copy(state.paramVariance):sqrt() 48 | x:addcdiv(-clr, dfdx,state.paramStd:add(1e-10)) 49 | 50 | -- (5) update evaluation counter 51 | state.evalCounter = state.evalCounter + 1 52 | 53 | -- return x*, f(x) before optimization 54 | return x,{fx} 55 | end 56 | -------------------------------------------------------------------------------- /rmsprop.lua: -------------------------------------------------------------------------------- 1 | --[[ An implementation of RMSprop 2 | 3 | ARGS: 4 | 5 | - 'opfunc' : a function that takes a single input (X), the point 6 | of a evaluation, and returns f(X) and df/dX 7 | - 'x' : the initial point 8 | - 'config` : a table with configuration parameters for the optimizer 9 | - 'config.learningRate' : learning rate 10 | - 'config.alpha' : smoothing constant 11 | - 'config.epsilon' : value with which to initialise m 12 | - 'config.weightDecay' : weight decay 13 | - 'state' : a table describing the state of the optimizer; 14 | after each call the state is modified 15 | - 'state.m' : leaky sum of squares of parameter gradients, 16 | - 'state.tmp' : and the square root (with epsilon smoothing) 17 | 18 | RETURN: 19 | - `x` : the new x vector 20 | - `f(x)` : the function, evaluated before the update 21 | 22 | ]] 23 | 24 | function optim.rmsprop(opfunc, x, config, state) 25 | -- (0) get/update state 26 | local config = config or {} 27 | local state = state or config 28 | local lr = config.learningRate or 1e-2 29 | local alpha = config.alpha or 0.99 30 | local epsilon = config.epsilon or 1e-8 31 | local wd = config.weightDecay or 0 32 | 33 | -- (1) evaluate f(x) and df/dx 34 | local fx, dfdx = opfunc(x) 35 | 36 | -- (2) weight decay 37 | if wd ~= 0 then 38 | dfdx:add(wd, x) 39 | end 40 | 41 | -- (3) initialize mean square values and square gradient storage 42 | if not state.m then 43 | state.m = torch.Tensor():typeAs(x):resizeAs(dfdx):fill(1) 44 | state.tmp = torch.Tensor():typeAs(x):resizeAs(dfdx) 45 | end 46 | 47 | -- (4) calculate new (leaky) mean squared values 48 | state.m:mul(alpha) 49 | state.m:addcmul(1.0-alpha, dfdx, dfdx) 50 | 51 | -- (5) perform update 52 | state.tmp:sqrt(state.m):add(epsilon) 53 | x:addcdiv(-lr, dfdx, state.tmp) 54 | 55 | -- return x*, f(x) before optimization 56 | return x, {fx} 57 | end 58 | -------------------------------------------------------------------------------- /COPYRIGHT.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) 2 | Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) 3 | Copyright (c) 2011-2013 NYU (Clement Farabet) 4 | Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) 5 | Copyright (c) 2006 Idiap Research Institute (Samy Bengio) 6 | Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) 7 | 8 | All rights reserved. 9 | 10 | Redistribution and use in source and binary forms, with or without 11 | modification, are permitted provided that the following conditions are met: 12 | 13 | 1. 
Redistributions of source code must retain the above copyright 14 | notice, this list of conditions and the following disclaimer. 15 | 16 | 2. Redistributions in binary form must reproduce the above copyright 17 | notice, this list of conditions and the following disclaimer in the 18 | documentation and/or other materials provided with the distribution. 19 | 20 | 3. Neither the names of NEC Laboratories American and IDIAP Research 21 | Institute nor the names of its contributors may be used to endorse or 22 | promote products derived from this software without specific prior 23 | written permission. 24 | 25 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 26 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 29 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 30 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 31 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 32 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 33 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 35 | POSSIBILITY OF SUCH DAMAGE. 36 | -------------------------------------------------------------------------------- /adadelta.lua: -------------------------------------------------------------------------------- 1 | --[[ ADADELTA implementation for SGD http://arxiv.org/abs/1212.5701 2 | 3 | ARGS: 4 | - `opfunc` : a function that takes a single input (X), the point of 5 | evaluation, and returns f(X) and df/dX 6 | - `x` : the initial point 7 | - `config` : a table of hyper-parameters 8 | - `config.rho` : interpolation parameter 9 | - `config.eps` : for numerical stability 10 | - `config.weightDecay` : weight decay 11 | - `state` : a table describing the state of the optimizer; after each 12 | call the state is modified 13 | - `state.paramVariance` : vector of temporal variances of parameters 14 | - `state.accDelta` : vector of accummulated delta of gradients 15 | RETURN: 16 | - `x` : the new x vector 17 | - `f(x)` : the function, evaluated before the update 18 | ]] 19 | function optim.adadelta(opfunc, x, config, state) 20 | -- (0) get/update state 21 | if config == nil and state == nil then 22 | print('no state table, ADADELTA initializing') 23 | end 24 | local config = config or {} 25 | local state = state or config 26 | local rho = config.rho or 0.9 27 | local eps = config.eps or 1e-6 28 | local wd = config.weightDecay or 0 29 | state.evalCounter = state.evalCounter or 0 30 | -- (1) evaluate f(x) and df/dx 31 | local fx,dfdx = opfunc(x) 32 | 33 | -- (2) weight decay 34 | if wd ~= 0 then 35 | dfdx:add(wd, x) 36 | end 37 | 38 | -- (3) parameter update 39 | if not state.paramVariance then 40 | state.paramVariance = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() 41 | state.paramStd = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() 42 | state.delta = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() 43 | state.accDelta = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() 44 | end 45 | state.paramVariance:mul(rho):addcmul(1-rho,dfdx,dfdx) 46 | state.paramStd:resizeAs(state.paramVariance):copy(state.paramVariance):add(eps):sqrt() 47 | 
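   -- ADADELTA step: delta = RMS(accDelta) / RMS(grad) .* grad, where both RMS
   -- terms share the same eps for numerical stability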
state.delta:resizeAs(state.paramVariance):copy(state.accDelta):add(eps):sqrt():cdiv(state.paramStd):cmul(dfdx) 48 | x:add(-1, state.delta) 49 | state.accDelta:mul(rho):addcmul(1-rho, state.delta, state.delta) 50 | -- (4) update evaluation counter 51 | state.evalCounter = state.evalCounter + 1 52 | 53 | -- return x*, f(x) before optimization 54 | return x,{fx} 55 | end 56 | -------------------------------------------------------------------------------- /asgd.lua: -------------------------------------------------------------------------------- 1 | --[[ An implementation of ASGD 2 | 3 | ASGD: 4 | 5 | x := (1 - lambda eta_t) x - eta_t df/dx(z,x) 6 | a := a + mu_t [ x - a ] 7 | 8 | eta_t = eta0 / (1 + lambda eta0 t) ^ 0.75 9 | mu_t = 1/max(1,t-t0) 10 | 11 | implements ASGD algoritm as in L.Bottou's sgd-2.0 12 | 13 | ARGS: 14 | 15 | - `opfunc` : a function that takes a single input (X), the point of 16 | evaluation, and returns f(X) and df/dX 17 | - `x` : the initial point 18 | - `state` : a table describing the state of the optimizer; after each 19 | call the state is modified 20 | - `state.eta0` : learning rate 21 | - `state.lambda` : decay term 22 | - `state.alpha` : power for eta update 23 | - `state.t0` : point at which to start averaging 24 | 25 | RETURN: 26 | - `x` : the new x vector 27 | - `f(x)` : the function, evaluated before the update 28 | - `ax` : the averaged x vector 29 | 30 | (Clement Farabet, 2012) 31 | --]] 32 | function optim.asgd(opfunc, x, config, state) 33 | -- (0) get/update state 34 | local config = config or {} 35 | local state = state or config 36 | config.eta0 = config.eta0 or 1e-4 37 | config.lambda = config.lambda or 1e-4 38 | config.alpha = config.alpha or 0.75 39 | config.t0 = config.t0 or 1e6 40 | 41 | -- (hidden state) 42 | state.eta_t = state.eta_t or config.eta0 43 | state.mu_t = state.mu_t or 1 44 | state.t = state.t or 0 45 | 46 | -- (1) evaluate f(x) and df/dx 47 | local fx,dfdx = opfunc(x) 48 | 49 | -- (2) decay term 50 | x:mul(1 - config.lambda*state.eta_t) 51 | 52 | -- (3) update x 53 | x:add(-state.eta_t, dfdx) 54 | 55 | -- (4) averaging 56 | state.ax = state.ax or torch.Tensor():typeAs(x):resizeAs(x):zero() 57 | state.tmp = state.tmp or torch.Tensor():typeAs(state.ax):resizeAs(state.ax) 58 | if state.mu_t ~= 1 then 59 | state.tmp:copy(x) 60 | state.tmp:add(-1,state.ax):mul(state.mu_t) 61 | state.ax:add(state.tmp) 62 | else 63 | state.ax:copy(x) 64 | end 65 | 66 | -- (5) update eta_t and mu_t 67 | state.t = state.t + 1 68 | state.eta_t = config.eta0 / math.pow((1 + config.lambda * config.eta0 * state.t), config.alpha) 69 | state.mu_t = 1 / math.max(1, state.t - config.t0) 70 | 71 | -- return x*, f(x) before optimization, and average(x_t0,x_t1,x_t2,...) 
72 | return x,{fx},state.ax 73 | end 74 | -------------------------------------------------------------------------------- /adamax.lua: -------------------------------------------------------------------------------- 1 | --[[ An implementation of AdaMax http://arxiv.org/pdf/1412.6980.pdf 2 | 3 | ARGS: 4 | 5 | - 'opfunc' : a function that takes a single input (X), the point 6 | of a evaluation, and returns f(X) and df/dX 7 | - 'x' : the initial point 8 | - 'config` : a table with configuration parameters for the optimizer 9 | - 'config.learningRate' : learning rate 10 | - 'config.beta1' : first moment coefficient 11 | - 'config.beta2' : second moment coefficient 12 | - 'config.epsilon' : for numerical stability 13 | - 'state' : a table describing the state of the optimizer; 14 | after each call the state is modified. 15 | 16 | RETURN: 17 | - `x` : the new x vector 18 | - `f(x)` : the function, evaluated before the update 19 | 20 | ]] 21 | 22 | function optim.adamax(opfunc, x, config, state) 23 | -- (0) get/update state 24 | local config = config or {} 25 | local state = state or config 26 | local lr = config.learningRate or 0.002 27 | 28 | local beta1 = config.beta1 or 0.9 29 | local beta2 = config.beta2 or 0.999 30 | local epsilon = config.epsilon or 1e-38 31 | local wd = config.weightDecay or 0 32 | 33 | -- (1) evaluate f(x) and df/dx 34 | local fx, dfdx = opfunc(x) 35 | 36 | -- (2) weight decay 37 | if wd ~= 0 then 38 | dfdx:add(wd, x) 39 | end 40 | 41 | -- Initialization 42 | state.t = state.t or 0 43 | -- Exponential moving average of gradient values 44 | state.m = state.m or x.new(dfdx:size()):zero() 45 | -- Exponential moving average of the infinity norm 46 | state.u = state.u or x.new(dfdx:size()):zero() 47 | -- A tmp tensor to hold the input to max() 48 | state.max = state.max or x.new(2, unpack(dfdx:size():totable())):zero() 49 | 50 | state.t = state.t + 1 51 | 52 | -- Update biased first moment estimate. 53 | state.m:mul(beta1):add(1-beta1, dfdx) 54 | -- Update the exponentially weighted infinity norm. 
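   -- u_t = max(beta2 * u_{t-1}, |g_t| + epsilon): the two candidates are stacked
   -- in state.max and reduced with an element-wise max along the first dimension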
55 | state.max[1]:copy(state.u):mul(beta2) 56 | state.max[2]:copy(dfdx):abs():add(epsilon) 57 | state.u:max(state.max, 1) 58 | 59 | local biasCorrection1 = 1 - beta1^state.t 60 | local stepSize = lr/biasCorrection1 61 | -- (2) update x 62 | x:addcdiv(-stepSize, state.m, state.u) 63 | 64 | -- return x*, f(x) before optimization 65 | return x, {fx} 66 | end 67 | -------------------------------------------------------------------------------- /adam.lua: -------------------------------------------------------------------------------- 1 | --[[ An implementation of Adam http://arxiv.org/pdf/1412.6980.pdf 2 | 3 | ARGS: 4 | 5 | - 'opfunc' : a function that takes a single input (X), the point 6 | of a evaluation, and returns f(X) and df/dX 7 | - 'x' : the initial point 8 | - 'config` : a table with configuration parameters for the optimizer 9 | - 'config.learningRate' : learning rate 10 | - 'config.beta1' : first moment coefficient 11 | - 'config.beta2' : second moment coefficient 12 | - 'config.epsilon' : for numerical stability 13 | - 'config.weightDecay' : weight decay 14 | - 'state' : a table describing the state of the optimizer; after each 15 | call the state is modified 16 | 17 | RETURN: 18 | - `x` : the new x vector 19 | - `f(x)` : the function, evaluated before the update 20 | 21 | ]] 22 | 23 | function optim.adam(opfunc, x, config, state) 24 | -- (0) get/update state 25 | local config = config or {} 26 | local state = state or config 27 | local lr = config.learningRate or 0.001 28 | 29 | local beta1 = config.beta1 or 0.9 30 | local beta2 = config.beta2 or 0.999 31 | local epsilon = config.epsilon or 1e-8 32 | local wd = config.weightDecay or 0 33 | 34 | -- (1) evaluate f(x) and df/dx 35 | local fx, dfdx = opfunc(x) 36 | 37 | -- (2) weight decay 38 | if wd ~= 0 then 39 | dfdx:add(wd, x) 40 | end 41 | 42 | -- Initialization 43 | state.t = state.t or 0 44 | -- Exponential moving average of gradient values 45 | state.m = state.m or x.new(dfdx:size()):zero() 46 | -- Exponential moving average of squared gradient values 47 | state.v = state.v or x.new(dfdx:size()):zero() 48 | -- A tmp tensor to hold the sqrt(v) + epsilon 49 | state.denom = state.denom or x.new(dfdx:size()):zero() 50 | 51 | state.t = state.t + 1 52 | 53 | -- Decay the first and second moment running average coefficient 54 | state.m:mul(beta1):add(1-beta1, dfdx) 55 | state.v:mul(beta2):addcmul(1-beta2, dfdx, dfdx) 56 | 57 | state.denom:copy(state.v):sqrt():add(epsilon) 58 | 59 | local biasCorrection1 = 1 - beta1^state.t 60 | local biasCorrection2 = 1 - beta2^state.t 61 | local stepSize = lr * math.sqrt(biasCorrection2)/biasCorrection1 62 | -- (3) update x 63 | x:addcdiv(-stepSize, state.m, state.denom) 64 | 65 | -- return x*, f(x) before optimization 66 | return x, {fx} 67 | end 68 | -------------------------------------------------------------------------------- /test/test_fista.lua: -------------------------------------------------------------------------------- 1 | 2 | require 'unsup' 3 | require 'torch' 4 | require 'gnuplot' 5 | require 'sparsecoding' 6 | 7 | -- gnuplot.setgnuplotexe('/usr/bin/gnuplot44') 8 | -- gnuplot.setgnuplotterminal('x11') 9 | 10 | function gettableval(tt,v) 11 | local x = torch.Tensor(#tt) 12 | for i=1,#tt do x[i] = tt[i][v] end 13 | return x 14 | end 15 | function doplots(v) 16 | v = v or 'F' 17 | local fistaf = torch.DiskFile('fista2.bin'):binary() 18 | local istaf = torch.DiskFile('ista2.bin'):binary() 19 | 20 | local hfista = fistaf:readObject() 21 | fistaf:close() 22 | local hista = 
istaf:readObject() 23 | istaf:close() 24 | 25 | gnuplot.figure() 26 | gnuplot.plot({'fista ' .. v,gettableval(hfista,v)},{'ista ' .. v, gettableval(hista,v)}) 27 | end 28 | 29 | seed = seed or 123 30 | if dofista == nil then 31 | dofista = true 32 | else 33 | dofista = not dofista 34 | end 35 | 36 | torch.manualSeed(seed) 37 | math.randomseed(seed) 38 | nc = 3 39 | ni = 30 40 | no = 100 41 | x = torch.Tensor(ni):zero() 42 | 43 | --- I am keeping these just to make sure random init stays same 44 | fista = unsup.LinearFistaL1(ni,no,0.1) 45 | fista = nil 46 | 47 | fistaparams = {} 48 | fistaparams.doFistaUpdate = dofista 49 | fistaparams.maxline = 10 50 | fistaparams.maxiter = 200 51 | fistaparams.verbose = true 52 | 53 | D=torch.randn(ni,no) 54 | for i=1,D:size(2) do 55 | D:select(2,i):div(D:select(2,i):std()+1e-12) 56 | end 57 | 58 | mixi = torch.Tensor(nc) 59 | mixj = torch.Tensor(nc) 60 | for i=1,nc do 61 | local ii = math.random(1,no) 62 | local cc = torch.uniform(0,1/nc) 63 | mixi[i] = ii; 64 | mixj[i] = cc; 65 | print(ii,cc) 66 | x:add(cc, D:select(2,ii)) 67 | end 68 | 69 | fista = optim.FistaL1(D,fistaparams) 70 | code,h = fista.run(x,0.1) 71 | 72 | --fista.reconstruction:addmv(0,1,D,code) 73 | rec = fista.reconstruction 74 | --code,rec,h = fista:forward(x); 75 | 76 | gnuplot.figure(1) 77 | gnuplot.plot({'data',mixi,mixj,'+'},{'code',torch.linspace(1,no,no),code,'+'}) 78 | gnuplot.title('Fista = ' .. tostring(fistaparams.doFistaUpdate)) 79 | 80 | gnuplot.figure(2) 81 | gnuplot.plot({'input',torch.linspace(1,ni,ni),x,'+-'},{'reconstruction',torch.linspace(1,ni,ni),rec,'+-'}); 82 | gnuplot.title('Reconstruction Error : ' .. x:dist(rec) .. ' ' .. 'Fista = ' .. tostring(fistaparams.doFistaUpdate)) 83 | --w2:axis(0,ni+1,-1,1) 84 | 85 | if dofista then 86 | print('Running FISTA') 87 | fname = 'fista2.bin' 88 | else 89 | print('Running ISTA') 90 | fname = 'ista2.bin' 91 | end 92 | ff = torch.DiskFile(fname,'w'):binary() 93 | ff:writeObject(h) 94 | ff:close() 95 | 96 | -------------------------------------------------------------------------------- /nag.lua: -------------------------------------------------------------------------------- 1 | ---------------------------------------------------------------------- 2 | -- An implementation of SGD adapted with features of Nesterov's 3 | -- Accelerated Gradient method, based on the paper 4 | -- On the Importance of Initialization and Momentum in Deep Learning 5 | -- Sutsveker et. 
al., ICML 2013 6 | -- 7 | -- ARGS: 8 | -- opfunc : a function that takes a single input (X), the point of 9 | -- evaluation, and returns f(X) and df/dX 10 | -- x : the initial point 11 | -- state : a table describing the state of the optimizer; after each 12 | -- call the state is modified 13 | -- state.learningRate : learning rate 14 | -- state.learningRateDecay : learning rate decay 15 | -- state.weightDecay : weight decay 16 | -- state.momentum : momentum 17 | -- state.learningRates : vector of individual learning rates 18 | -- 19 | -- RETURN: 20 | -- x : the new x vector 21 | -- f(x) : the function, evaluated before the update 22 | -- 23 | -- (Dilip Krishnan, 2013) 24 | -- 25 | 26 | function optim.nag(opfunc, x, config, state) 27 | -- (0) get/update state 28 | local config = config or {} 29 | local state = state or config 30 | local lr = config.learningRate or 1e-3 31 | local lrd = config.learningRateDecay or 0 32 | local wd = config.weightDecay or 0 33 | local mom = config.momentum or 0.9 34 | local damp = config.dampening or mom 35 | local lrs = config.learningRates 36 | state.evalCounter = state.evalCounter or 0 37 | local nevals = state.evalCounter 38 | 39 | if mom <= 0 then 40 | error('Momentum must be positive for Nesterov Accelerated Gradient') 41 | end 42 | 43 | -- (1) evaluate f(x) and df/dx 44 | -- first step in the direction of the momentum vector 45 | 46 | if state.dfdx then 47 | x:add(mom, state.dfdx) 48 | end 49 | -- then compute gradient at that point 50 | -- comment out the above line to get the original SGD 51 | local fx,dfdx = opfunc(x) 52 | 53 | -- (2) weight decay 54 | if wd ~= 0 then 55 | dfdx:add(wd, x) 56 | end 57 | 58 | -- (3) learning rate decay (annealing) 59 | local clr = lr / (1 + nevals*lrd) 60 | 61 | -- (4) apply momentum 62 | if not state.dfdx then 63 | state.dfdx = torch.Tensor():typeAs(dfdx):resizeAs(dfdx):fill(0) 64 | else 65 | state.dfdx:mul(mom) 66 | end 67 | 68 | -- (5) parameter update with single or individual learning rates 69 | if lrs then 70 | if not state.deltaParameters then 71 | state.deltaParameters = torch.Tensor():typeAs(x):resizeAs(dfdx) 72 | end 73 | state.deltaParameters:copy(lrs):cmul(dfdx) 74 | x:add(-clr, state.deltaParameters) 75 | state.dfdx:add(-clr, state.deltaParameters) 76 | else 77 | x:add(-clr, dfdx) 78 | state.dfdx:add(-clr, dfdx) 79 | end 80 | 81 | -- (6) update evaluation counter 82 | state.evalCounter = state.evalCounter + 1 83 | 84 | -- return x, f(x) before optimization 85 | return x,{fx} 86 | end 87 | -------------------------------------------------------------------------------- /sgd.lua: -------------------------------------------------------------------------------- 1 | --[[ A plain implementation of SGD 2 | 3 | ARGS: 4 | 5 | - `opfunc` : a function that takes a single input (X), the point 6 | of a evaluation, and returns f(X) and df/dX 7 | - `x` : the initial point 8 | - `config` : a table with configuration parameters for the optimizer 9 | - `config.learningRate` : learning rate 10 | - `config.learningRateDecay` : learning rate decay 11 | - `config.weightDecay` : weight decay 12 | - `config.weightDecays` : vector of individual weight decays 13 | - `config.momentum` : momentum 14 | - `config.dampening` : dampening for momentum 15 | - `config.nesterov` : enables Nesterov momentum 16 | - `config.learningRates` : vector of individual learning rates 17 | - `state` : a table describing the state of the optimizer; after each 18 | call the state is modified 19 | - `state.evalCounter` : evaluation counter (optional: 0, 
by default) 20 | 21 | RETURN: 22 | - `x` : the new x vector 23 | - `f(x)` : the function, evaluated before the update 24 | 25 | (Clement Farabet, 2012) 26 | ]] 27 | function optim.sgd(opfunc, x, config, state) 28 | -- (0) get/update state 29 | local config = config or {} 30 | local state = state or config 31 | local lr = config.learningRate or 1e-3 32 | local lrd = config.learningRateDecay or 0 33 | local wd = config.weightDecay or 0 34 | local mom = config.momentum or 0 35 | local damp = config.dampening or mom 36 | local nesterov = config.nesterov or false 37 | local lrs = config.learningRates 38 | local wds = config.weightDecays 39 | state.evalCounter = state.evalCounter or 0 40 | local nevals = state.evalCounter 41 | assert(not nesterov or (mom > 0 and damp == 0), "Nesterov momentum requires a momentum and zero dampening") 42 | 43 | -- (1) evaluate f(x) and df/dx 44 | local fx,dfdx = opfunc(x) 45 | 46 | -- (2) weight decay with single or individual parameters 47 | if wd ~= 0 then 48 | dfdx:add(wd, x) 49 | elseif wds then 50 | if not state.decayParameters then 51 | state.decayParameters = torch.Tensor():typeAs(x):resizeAs(dfdx) 52 | end 53 | state.decayParameters:copy(wds):cmul(x) 54 | dfdx:add(state.decayParameters) 55 | end 56 | 57 | -- (3) apply momentum 58 | if mom ~= 0 then 59 | if not state.dfdx then 60 | state.dfdx = torch.Tensor():typeAs(dfdx):resizeAs(dfdx):copy(dfdx) 61 | else 62 | state.dfdx:mul(mom):add(1-damp, dfdx) 63 | end 64 | if nesterov then 65 | dfdx:add(mom, state.dfdx) 66 | else 67 | dfdx = state.dfdx 68 | end 69 | end 70 | 71 | -- (4) learning rate decay (annealing) 72 | local clr = lr / (1 + nevals*lrd) 73 | 74 | -- (5) parameter update with single or individual learning rates 75 | if lrs then 76 | if not state.deltaParameters then 77 | state.deltaParameters = torch.Tensor():typeAs(x):resizeAs(dfdx) 78 | end 79 | state.deltaParameters:copy(lrs):cmul(dfdx) 80 | x:add(-clr, state.deltaParameters) 81 | else 82 | x:add(-clr, dfdx) 83 | end 84 | 85 | -- (6) update evaluation counter 86 | state.evalCounter = state.evalCounter + 1 87 | 88 | -- return x*, f(x) before optimization 89 | return x,{fx} 90 | end 91 | -------------------------------------------------------------------------------- /rprop.lua: -------------------------------------------------------------------------------- 1 | --[[ A plain implementation of RPROP 2 | 3 | ARGS: 4 | - `opfunc` : a function that takes a single input (X), the point of 5 | evaluation, and returns f(X) and df/dX 6 | - `x` : the initial point 7 | - `state` : a table describing the state of the optimizer; after each 8 | call the state is modified 9 | - `state.stepsize` : initial step size, common to all components 10 | - `state.etaplus` : multiplicative increase factor, > 1 (default 1.2) 11 | - `state.etaminus` : multiplicative decrease factor, < 1 (default 0.5) 12 | - `state.stepsizemax` : maximum stepsize allowed (default 50) 13 | - `state.stepsizemin` : minimum stepsize allowed (default 1e-6) 14 | - `state.niter` : number of iterations (default 1) 15 | 16 | RETURN: 17 | - `x` : the new x vector 18 | - `f(x)` : the function, evaluated before the update 19 | 20 | (Martin Riedmiller, Koray Kavukcuoglu 2013) 21 | --]] 22 | function optim.rprop(opfunc, x, config, state) 23 | if config == nil and state == nil then 24 | print('no state table RPROP initializing') 25 | end 26 | -- (0) get/update state 27 | local config = config or {} 28 | local state = state or config 29 | local stepsize = config.stepsize or 0.1 30 | local etaplus = 
config.etaplus or 1.2 31 | local etaminus = config.etaminus or 0.5 32 | local stepsizemax = config.stepsizemax or 50.0 33 | local stepsizemin = config.stepsizemin or 1E-06 34 | local niter = config.niter or 1 35 | 36 | local hfx = {} 37 | 38 | for i=1,niter do 39 | 40 | -- (1) evaluate f(x) and df/dx 41 | local fx,dfdx = opfunc(x) 42 | 43 | -- init temp storage 44 | if not state.delta then 45 | state.delta = dfdx.new(dfdx:size()):zero() 46 | state.stepsize = dfdx.new(dfdx:size()):fill(stepsize) 47 | state.sign = dfdx.new(dfdx:size()) 48 | state.psign = torch.ByteTensor(dfdx:size()) 49 | state.nsign = torch.ByteTensor(dfdx:size()) 50 | state.zsign = torch.ByteTensor(dfdx:size()) 51 | state.dminmax = torch.ByteTensor(dfdx:size()) 52 | if torch.type(x)=='torch.CudaTensor' then 53 | -- Push to GPU 54 | state.psign = state.psign:cuda() 55 | state.nsign = state.nsign:cuda() 56 | state.zsign = state.zsign:cuda() 57 | state.dminmax = state.dminmax:cuda() 58 | end 59 | end 60 | 61 | -- sign of derivative from last step to this one 62 | torch.cmul(state.sign, dfdx, state.delta) 63 | torch.sign(state.sign, state.sign) 64 | 65 | -- get indices of >0, <0 and ==0 entries 66 | state.sign.gt(state.psign, state.sign, 0) 67 | state.sign.lt(state.nsign, state.sign, 0) 68 | state.sign.eq(state.zsign, state.sign, 0) 69 | 70 | -- get step size updates 71 | state.sign[state.psign] = etaplus 72 | state.sign[state.nsign] = etaminus 73 | state.sign[state.zsign] = 1 74 | 75 | -- update stepsizes with step size updates 76 | state.stepsize:cmul(state.sign) 77 | 78 | -- threshold step sizes 79 | -- >50 => 50 80 | state.stepsize.gt(state.dminmax, state.stepsize, stepsizemax) 81 | state.stepsize[state.dminmax] = stepsizemax 82 | -- <1e-6 ==> 1e-6 83 | state.stepsize.lt(state.dminmax, state.stepsize, stepsizemin) 84 | state.stepsize[state.dminmax] = stepsizemin 85 | 86 | -- for dir<0, dfdx=0 87 | -- for dir>=0 dfdx=dfdx 88 | dfdx[state.nsign] = 0 89 | -- state.sign = sign(dfdx) 90 | torch.sign(state.sign,dfdx) 91 | 92 | -- update weights 93 | x:addcmul(-1,state.sign,state.stepsize) 94 | 95 | -- update state.dfdx with current dfdx 96 | state.delta:copy(dfdx) 97 | 98 | table.insert(hfx,fx) 99 | end 100 | 101 | -- return x*, f(x) before optimization 102 | return x,hfx 103 | end 104 | -------------------------------------------------------------------------------- /test/sparsecoding.lua: -------------------------------------------------------------------------------- 1 | require 'kex' 2 | 3 | -- L1 FISTA Solution 4 | -- L1 solution with a linear dictionary ||Ax-b||^2 + \lambda ||x||_1 5 | -- D : dictionary, each column is a dictionary element 6 | -- params: set of params to pass to FISTA and possibly temp allocation (**optional**) 7 | -- check unsup.FistaLS function for details. 8 | -- returns fista : a table with the following entries 9 | -- fista.run(x,lambda) : run L1 sparse coding algorithm with input x and lambda. 10 | -- The following entries will be allocated and reused by each call to fista.run(x,lambda) 11 | -- fista.reconstruction: reconstructed input. 
12 | -- fista.gradf : gradient of L2 part of the problem wrt x 13 | -- fista.code : the solution of L1 problem 14 | -- The following entries just point to data passed to fista.run(x) 15 | -- fista.input : points to the tensor 'x' used in the last fista.run(x,lambda) 16 | -- fista.lambda : the lambda value used in the last fista.run(x,lambda) 17 | function optim.FistaL1(D, params) 18 | 19 | -- this is for keeping parameters related to fista algorithm 20 | local params = params or {} 21 | -- this is for temporary variables and such 22 | local fista = {} 23 | 24 | -- related to FISTA 25 | params.L = params.L or 0.1 26 | params.Lstep = params.Lstep or 1.5 27 | params.maxiter = params.maxiter or 50 28 | params.maxline = params.maxline or 20 29 | params.errthres = params.errthres or 1e-4 30 | 31 | -- temporary stuff that might be good to keep around 32 | fista.reconstruction = torch.Tensor() 33 | fista.gradf = torch.Tensor() 34 | fista.gradg = torch.Tensor() 35 | fista.code = torch.Tensor() 36 | 37 | -- these will be assigned in run(x) 38 | -- fista.input points to the last input that was run 39 | -- fista.lambda is the lambda value from the last run 40 | fista.input = nil 41 | fista.lambda = nil 42 | 43 | -- CREATE FUNCTION CLOSURES 44 | -- smooth function 45 | fista.f = function (x,mode) 46 | 47 | local reconstruction = fista.reconstruction 48 | local input = fista.input 49 | -- ------------------- 50 | -- function evaluation 51 | if x:dim() == 1 then 52 | --print(D:size(),x:size()) 53 | reconstruction:resize(D:size(1)) 54 | reconstruction:addmv(0,1,D,x) 55 | elseif x:dim(2) then 56 | reconstruction:resize(x:size(1),D:size(1)) 57 | reconstruction:addmm(0,1,x,D:t()) 58 | end 59 | local fval = input:dist(reconstruction)^2 60 | 61 | -- ---------------------- 62 | -- derivative calculation 63 | if mode and mode:match('dx') then 64 | local gradf = fista.gradf 65 | reconstruction:add(-1,input):mul(2) 66 | gradf:resizeAs(x) 67 | if input:dim() == 1 then 68 | gradf:addmv(0,1,D:t(),reconstruction) 69 | else 70 | gradf:addmm(0,1,reconstruction, D) 71 | end 72 | --------------------------------------- 73 | -- return function value and derivative 74 | return fval, gradf, reconstruction 75 | end 76 | 77 | ------------------------ 78 | -- return function value 79 | return fval, reconstruction 80 | end 81 | 82 | -- non-smooth function L1 83 | fista.g = function (x) 84 | 85 | local fval = fista.lambda*x:norm(1) 86 | 87 | if mod and mode:match('dx') then 88 | local gradg = fista.gradg 89 | gradg:resizAs(x) 90 | gradg:sign():mul(fista.lambda) 91 | return fval,gradg 92 | end 93 | return fval 94 | end 95 | 96 | -- argmin_x Q(x,y), just shrinkage for L1 97 | fista.pl = function (x,L) 98 | x:shrinkage(fista.lambda/L) 99 | end 100 | 101 | fista.run = function(x, lam, codeinit) 102 | local code = fista.code 103 | fista.input = x 104 | fista.lambda = lam 105 | 106 | -- resize code, maybe a different number of dimensions 107 | -- fill with zeros, initial point 108 | if codeinit then 109 | code:resizeAs(codeinit) 110 | code:copy(codeinit) 111 | else 112 | if x:dim() == 1 then 113 | code:resize(D:size(2)) 114 | elseif x:dim() == 2 then 115 | code:resize(x:size(1),D:size(2)) 116 | else 117 | error(' I do not know how to handle ' .. x:dim() .. ' dimensional input') 118 | end 119 | code:fill(0) 120 | end 121 | -- return the result of unsup.FistaLS call. 
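      -- (the sparse code is written into fista.code; the second return value is the
      -- per-iteration history, used as `code, h = fista.run(x, 0.1)` in test_fista.lua)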
122 | return optim.FistaLS(fista.f, fista.g, fista.pl, fista.code, params) 123 | end 124 | 125 | return fista 126 | end 127 | 128 | -------------------------------------------------------------------------------- /Logger.lua: -------------------------------------------------------------------------------- 1 | --[[ Logger: a simple class to log symbols during training, 2 | and automate plot generation 3 | 4 | Example: 5 | logger = optim.Logger('somefile.log') -- file to save stuff 6 | 7 | for i = 1,N do -- log some symbols during 8 | train_error = ... -- training/testing 9 | test_error = ... 10 | logger:add{['training error'] = train_error, 11 | ['test error'] = test_error} 12 | end 13 | 14 | logger:style{['training error'] = '-', -- define styles for plots 15 | ['test error'] = '-'} 16 | logger:plot() -- and plot 17 | 18 | ---- OR --- 19 | 20 | logger = optim.Logger('somefile.log') -- file to save stuff 21 | logger:setNames{'training error', 'test error'} 22 | 23 | for i = 1,N do -- log some symbols during 24 | train_error = ... -- training/testing 25 | test_error = ... 26 | logger:add{train_error, test_error} 27 | end 28 | 29 | logger:style{'-', '-'} -- define styles for plots 30 | logger:plot() -- and plot 31 | 32 | ----------- 33 | 34 | logger:setlogscale(true) -- enable logscale on Y-axis 35 | logger:plot() -- and plot 36 | ]] 37 | require 'xlua' 38 | local Logger = torch.class('optim.Logger') 39 | 40 | function Logger:__init(filename, timestamp) 41 | if filename then 42 | self.name = filename 43 | os.execute('mkdir ' .. (sys.uname() ~= 'windows' and '-p ' or '') .. ' "' .. paths.dirname(filename) .. '"') 44 | if timestamp then 45 | -- append timestamp to create unique log file 46 | filename = filename .. '-'..os.date("%Y_%m_%d_%X") 47 | end 48 | self.file = io.open(filename,'w') 49 | self.epsfile = self.name .. '.eps' 50 | else 51 | self.file = io.stdout 52 | self.name = 'stdout' 53 | print(' warning: no path provided, logging to std out') 54 | end 55 | self.empty = true 56 | self.symbols = {} 57 | self.styles = {} 58 | self.names = {} 59 | self.idx = {} 60 | self.figure = nil 61 | self.showPlot = true 62 | self.plotRawCmd = nil 63 | self.defaultStyle = '+' 64 | self.logscale = false 65 | end 66 | 67 | function Logger:setNames(names) 68 | self.names = names 69 | self.empty = false 70 | self.nsymbols = #names 71 | for k,key in pairs(names) do 72 | self.file:write(key .. '\t') 73 | self.symbols[k] = {} 74 | self.styles[k] = {self.defaultStyle} 75 | self.idx[key] = k 76 | end 77 | self.file:write('\n') 78 | self.file:flush() 79 | end 80 | 81 | function Logger:add(symbols) 82 | -- (1) first time ? print symbols' names on first row 83 | if self.empty then 84 | self.empty = false 85 | self.nsymbols = #symbols 86 | for k,val in pairs(symbols) do 87 | self.file:write(k .. '\t') 88 | self.symbols[k] = {} 89 | self.styles[k] = {self.defaultStyle} 90 | self.names[k] = k 91 | end 92 | self.idx = self.names 93 | self.file:write('\n') 94 | end 95 | -- (2) print all symbols on one row 96 | for k,val in pairs(symbols) do 97 | if type(val) == 'number' then 98 | self.file:write(string.format('%11.4e',val) .. '\t') 99 | elseif type(val) == 'string' then 100 | self.file:write(val .. 
'\t') 101 | else 102 | xlua.error('can only log numbers and strings', 'Logger') 103 | end 104 | end 105 | self.file:write('\n') 106 | self.file:flush() 107 | -- (3) save symbols in internal table 108 | for k,val in pairs(symbols) do 109 | table.insert(self.symbols[k], val) 110 | end 111 | end 112 | 113 | function Logger:style(symbols) 114 | for name,style in pairs(symbols) do 115 | if type(style) == 'string' then 116 | self.styles[name] = {style} 117 | elseif type(style) == 'table' then 118 | self.styles[name] = style 119 | else 120 | xlua.error('style should be a string or a table of strings','Logger') 121 | end 122 | end 123 | end 124 | 125 | function Logger:setlogscale(value) 126 | self.logscale = value 127 | end 128 | 129 | function Logger:plot(...) 130 | if not xlua.require('gnuplot') then 131 | if not self.warned then 132 | print(' warning: cannot plot with this version of Torch') 133 | self.warned = true 134 | end 135 | return 136 | end 137 | local plotit = false 138 | local plots = {} 139 | local plotsymbol = 140 | function(name,list) 141 | if #list > 1 then 142 | local nelts = #list 143 | local plot_y = torch.Tensor(nelts) 144 | for i = 1,nelts do 145 | plot_y[i] = list[i] 146 | end 147 | for _,style in ipairs(self.styles[name]) do 148 | table.insert(plots, {self.names[name], plot_y, style}) 149 | end 150 | plotit = true 151 | end 152 | end 153 | local args = {...} 154 | if not args[1] then -- plot all symbols 155 | for name,list in pairs(self.symbols) do 156 | plotsymbol(name,list) 157 | end 158 | else -- plot given symbols 159 | for _,name in ipairs(args) do 160 | plotsymbol(self.idx[name], self.symbols[self.idx[name]]) 161 | end 162 | end 163 | if plotit then 164 | if self.showPlot then 165 | self.figure = gnuplot.figure(self.figure) 166 | if self.logscale then gnuplot.logscale('on') end 167 | gnuplot.plot(plots) 168 | if self.plotRawCmd then gnuplot.raw(self.plotRawCmd) end 169 | gnuplot.grid('on') 170 | gnuplot.title('') 171 | end 172 | if self.epsfile then 173 | os.execute('rm -f "' .. self.epsfile .. '"') 174 | local epsfig = gnuplot.epsfigure(self.epsfile) 175 | if self.logscale then gnuplot.logscale('on') end 176 | gnuplot.plot(plots) 177 | if self.plotRawCmd then gnuplot.raw(self.plotRawCmd) end 178 | gnuplot.grid('on') 179 | gnuplot.title('') 180 | gnuplot.plotflush() 181 | gnuplot.close(epsfig) 182 | end 183 | end 184 | end 185 | -------------------------------------------------------------------------------- /cg.lua: -------------------------------------------------------------------------------- 1 | --[[ 2 | 3 | This cg implementation is a rewrite of minimize.m written by Carl 4 | E. Rasmussen. It is supposed to produce exactly same results (give 5 | or take numerical accuracy due to some changed order of 6 | operations). You can compare the result on rosenbrock with minimize.m. 7 | http://www.gatsby.ucl.ac.uk/~edward/code/minimize/example.html 8 | 9 | [x fx c] = minimize([0 0]', 'rosenbrock', -25) 10 | 11 | Note that we limit the number of function evaluations only, it seems much 12 | more important in practical use. 13 | 14 | ARGS: 15 | 16 | - `opfunc` : a function that takes a single input, the point of evaluation. 17 | - `x` : the initial point 18 | - `state` : a table of parameters and temporary allocations. 
19 | - `state.maxEval` : max number of function evaluations 20 | - `state.maxIter` : max number of iterations 21 | - `state.df[0,1,2,3]` : if you pass torch.Tensor they will be used for temp storage 22 | - `state.[s,x0]` : if you pass torch.Tensor they will be used for temp storage 23 | 24 | RETURN: 25 | 26 | - `x*` : the new x vector, at the optimal point 27 | - `f` : a table of all function values where 28 | `f[1]` is the value of the function before any optimization and 29 | `f[#f]` is the final fully optimized value, at x* 30 | 31 | (Koray Kavukcuoglu, 2012) 32 | --]] 33 | function optim.cg(opfunc, x, config, state) 34 | -- parameters 35 | local config = config or {} 36 | local state = state or config 37 | local rho = config.rho or 0.01 38 | local sig = config.sig or 0.5 39 | local int = config.int or 0.1 40 | local ext = config.ext or 3.0 41 | local maxIter = config.maxIter or 20 42 | local ratio = config.ratio or 100 43 | local maxEval = config.maxEval or maxIter*1.25 44 | local red = 1 45 | 46 | local verbose = config.verbose or 0 47 | 48 | local i = 0 49 | local ls_failed = 0 50 | local fx = {} 51 | 52 | -- we need three points for the interpolation/extrapolation stuff 53 | local z1,z2,z3 = 0,0,0 54 | local d1,d2,d3 = 0,0,0 55 | local f1,f2,f3 = 0,0,0 56 | 57 | local df1 = state.df1 or x.new() 58 | local df2 = state.df2 or x.new() 59 | local df3 = state.df3 or x.new() 60 | local tdf 61 | 62 | df1:resizeAs(x) 63 | df2:resizeAs(x) 64 | df3:resizeAs(x) 65 | 66 | -- search direction 67 | local s = state.s or x.new() 68 | s:resizeAs(x) 69 | 70 | -- we need a temp storage for X 71 | local x0 = state.x0 or x.new() 72 | local f0 = 0 73 | local df0 = state.df0 or x.new() 74 | x0:resizeAs(x) 75 | df0:resizeAs(x) 76 | 77 | -- evaluate at initial point 78 | f1,tdf = opfunc(x) 79 | fx[#fx+1] = f1 80 | df1:copy(tdf) 81 | i=i+1 82 | 83 | -- initial search direction 84 | s:copy(df1):mul(-1) 85 | 86 | d1 = -s:dot(s ) -- slope 87 | z1 = red/(1-d1) -- initial step 88 | 89 | while i < math.abs(maxEval) do 90 | 91 | x0:copy(x) 92 | f0 = f1 93 | df0:copy(df1) 94 | 95 | x:add(z1,s) 96 | f2,tdf = opfunc(x) 97 | df2:copy(tdf) 98 | i=i+1 99 | d2 = df2:dot(s) 100 | f3,d3,z3 = f1,d1,-z1 -- init point 3 equal to point 1 101 | local m = math.min(maxIter,maxEval-i) 102 | local success = 0 103 | local limit = -1 104 | 105 | while true do 106 | while (f2 > f1+z1*rho*d1 or d2 > -sig*d1) and m > 0 do 107 | limit = z1 108 | if f2 > f1 then 109 | z2 = z3 - (0.5*d3*z3*z3)/(d3*z3+f2-f3) 110 | else 111 | local A = 6*(f2-f3)/z3+3*(d2+d3) 112 | local B = 3*(f3-f2)-z3*(d3+2*d2) 113 | z2 = (math.sqrt(B*B-A*d2*z3*z3)-B)/A 114 | end 115 | if z2 ~= z2 or z2 == math.huge or z2 == -math.huge then 116 | z2 = z3/2; 117 | end 118 | z2 = math.max(math.min(z2, int*z3),(1-int)*z3); 119 | z1 = z1 + z2; 120 | x:add(z2,s) 121 | f2,tdf = opfunc(x) 122 | df2:copy(tdf) 123 | i=i+1 124 | m = m - 1 125 | d2 = df2:dot(s) 126 | z3 = z3-z2; 127 | end 128 | if f2 > f1+z1*rho*d1 or d2 > -sig*d1 then 129 | break 130 | elseif d2 > sig*d1 then 131 | success = 1; 132 | break; 133 | elseif m == 0 then 134 | break; 135 | end 136 | local A = 6*(f2-f3)/z3+3*(d2+d3); 137 | local B = 3*(f3-f2)-z3*(d3+2*d2); 138 | z2 = -d2*z3*z3/(B+math.sqrt(B*B-A*d2*z3*z3)) 139 | 140 | if z2 ~= z2 or z2 == math.huge or z2 == -math.huge or z2 < 0 then 141 | if limit < -0.5 then 142 | z2 = z1 * (ext -1) 143 | else 144 | z2 = (limit-z1)/2 145 | end 146 | elseif (limit > -0.5) and (z2+z1) > limit then 147 | z2 = (limit-z1)/2 148 | elseif limit < -0.5 and (z2+z1) > z1*ext then 
149 | z2 = z1*(ext-1) 150 | elseif z2 < -z3*int then 151 | z2 = -z3*int 152 | elseif limit > -0.5 and z2 < (limit-z1)*(1-int) then 153 | z2 = (limit-z1)*(1-int) 154 | end 155 | f3=f2; d3=d2; z3=-z2; 156 | z1 = z1+z2; 157 | x:add(z2,s) 158 | 159 | f2,tdf = opfunc(x) 160 | df2:copy(tdf) 161 | i=i+1 162 | m = m - 1 163 | d2 = df2:dot(s) 164 | end 165 | if success == 1 then 166 | f1 = f2 167 | fx[#fx+1] = f1; 168 | local ss = (df2:dot(df2)-df2:dot(df1)) / df1:dot(df1) 169 | s:mul(ss) 170 | s:add(-1,df2) 171 | local tmp = df1:clone() 172 | df1:copy(df2) 173 | df2:copy(tmp) 174 | d2 = df1:dot(s) 175 | if d2> 0 then 176 | s:copy(df1) 177 | s:mul(-1) 178 | d2 = -s:dot(s) 179 | end 180 | 181 | z1 = z1 * math.min(ratio, d1/(d2-1e-320)) 182 | d1 = d2 183 | ls_failed = 0 184 | else 185 | x:copy(x0) 186 | f1 = f0 187 | df1:copy(df0) 188 | if ls_failed or i>maxEval then 189 | break 190 | end 191 | local tmp = df1:clone() 192 | df1:copy(df2) 193 | df2:copy(tmp) 194 | s:copy(df1) 195 | s:mul(-1) 196 | d1 = -s:dot(s) 197 | z1 = 1/(1-d1) 198 | ls_failed = 1 199 | end 200 | end 201 | state.df0 = df0 202 | state.df1 = df1 203 | state.df2 = df2 204 | state.df3 = df3 205 | state.x0 = x0 206 | state.s = s 207 | return x,fx,i 208 | end 209 | -------------------------------------------------------------------------------- /lswolfe.lua: -------------------------------------------------------------------------------- 1 | --[[ A Line Search satisfying the Wolfe conditions 2 | 3 | ARGS: 4 | - `opfunc` : a function (the objective) that takes a single input (X), 5 | the point of evaluation, and returns f(X) and df/dX 6 | - `x` : initial point / starting location 7 | - `t` : initial step size 8 | - `d` : descent direction 9 | - `f` : initial function value 10 | - `g` : gradient at initial location 11 | - `gtd` : directional derivative at starting location 12 | - `options.c1` : sufficient decrease parameter 13 | - `options.c2` : curvature parameter 14 | - `options.tolX` : minimum allowable step length 15 | - `options.maxIter` : maximum nb of iterations 16 | 17 | RETURN: 18 | - `f` : function value at x+t*d 19 | - `g` : gradient value at x+t*d 20 | - `x` : the next x (=x+t*d) 21 | - `t` : the step length 22 | - `lsFuncEval` : the number of function evaluations 23 | ]] 24 | function optim.lswolfe(opfunc,x,t,d,f,g,gtd,options) 25 | -- options 26 | options = options or {} 27 | local c1 = options.c1 or 1e-4 28 | local c2 = options.c2 or 0.9 29 | local tolX = options.tolX or 1e-9 30 | local maxIter = options.maxIter or 20 31 | local isverbose = options.verbose or false 32 | 33 | -- some shortcuts 34 | local abs = torch.abs 35 | local min = math.min 36 | local max = math.max 37 | local Tensor = torch.Tensor 38 | 39 | -- verbose function 40 | local function verbose(...) 41 | if isverbose then print(' ', ...) 
end 42 | end 43 | 44 | -- evaluate objective and gradient using initial step 45 | local x_init = x:clone() 46 | x:add(t,d) 47 | local f_new,g_new = opfunc(x) 48 | local lsFuncEval = 1 49 | local gtd_new = g_new * d 50 | 51 | -- bracket an interval containing a point satisfying the Wolfe 52 | -- criteria 53 | local LSiter,t_prev,done = 0,0,false 54 | local f_prev,g_prev,gtd_prev = f,g:clone(),gtd 55 | local bracket,bracketFval,bracketGval 56 | while LSiter < maxIter do 57 | -- check conditions: 58 | if (f_new > (f + c1*t*gtd)) or (LSiter > 1 and f_new >= f_prev) then 59 | bracket = Tensor{t_prev,t} 60 | bracketFval = Tensor{f_prev,f_new} 61 | bracketGval = Tensor(2,g_new:size(1)) 62 | bracketGval[1] = g_prev 63 | bracketGval[2] = g_new 64 | break 65 | 66 | elseif abs(gtd_new) <= -c2*gtd then 67 | bracket = Tensor{t} 68 | bracketFval = Tensor{f_new} 69 | bracketGval = Tensor(1,g_new:size(1)) 70 | bracketGval[1] = g_new 71 | done = true 72 | break 73 | 74 | elseif gtd_new >= 0 then 75 | bracket = Tensor{t_prev,t} 76 | bracketFval = Tensor{f_prev,f_new} 77 | bracketGval = Tensor(2,g_new:size(1)) 78 | bracketGval[1] = g_prev 79 | bracketGval[2] = g_new 80 | break 81 | 82 | end 83 | 84 | -- interpolate: 85 | local tmp = t_prev 86 | t_prev = t 87 | local minStep = t + 0.01*(t-tmp) 88 | local maxStep = t*10 89 | t = optim.polyinterp(Tensor{{tmp,f_prev,gtd_prev}, 90 | {t,f_new,gtd_new}}, 91 | minStep, maxStep) 92 | 93 | -- next step: 94 | f_prev = f_new 95 | g_prev = g_new:clone() 96 | gtd_prev = gtd_new 97 | x[{}] = x_init 98 | x:add(t,d) 99 | f_new,g_new = opfunc(x) 100 | lsFuncEval = lsFuncEval + 1 101 | gtd_new = g_new * d 102 | LSiter = LSiter + 1 103 | end 104 | 105 | -- reached max nb of iterations? 106 | if LSiter == maxIter then 107 | bracket = Tensor{0,t} 108 | bracketFval = Tensor{f,f_new} 109 | bracketGval = Tensor(2,g_new:size(1)) 110 | bracketGval[1] = g 111 | bracketGval[2] = g_new 112 | end 113 | 114 | -- zoom phase: we now have a point satisfying the criteria, or 115 | -- a bracket around it. 
We refine the bracket until we find the 116 | -- exact point satisfying the criteria 117 | local insufProgress = false 118 | local LOposRemoved = 0 119 | while not done and LSiter < maxIter do 120 | -- find high and low points in bracket 121 | local f_LO,LOpos = bracketFval:min(1) 122 | LOpos = LOpos[1] f_LO = f_LO[1] 123 | local HIpos = -LOpos+3 124 | 125 | -- compute new trial value 126 | t = optim.polyinterp(Tensor{{bracket[1],bracketFval[1],bracketGval[1]*d}, 127 | {bracket[2],bracketFval[2],bracketGval[2]*d}}) 128 | 129 | -- test what we are making sufficient progress 130 | if min(bracket:max()-t,t-bracket:min())/(bracket:max()-bracket:min()) < 0.1 then 131 | if insufProgress or t>=bracket:max() or t <= bracket:min() then 132 | if abs(t-bracket:max()) < abs(t-bracket:min()) then 133 | t = bracket:max()-0.1*(bracket:max()-bracket:min()) 134 | else 135 | t = bracket:min()+0.1*(bracket:max()-bracket:min()) 136 | end 137 | insufProgress = false 138 | else 139 | insufProgress = true 140 | end 141 | else 142 | insufProgress = false 143 | end 144 | 145 | -- Evaluate new point 146 | x[{}] = x_init 147 | x:add(t,d) 148 | f_new,g_new = opfunc(x) 149 | lsFuncEval = lsFuncEval + 1 150 | gtd_new = g_new * d 151 | LSiter = LSiter + 1 152 | if f_new > f + c1*t*gtd or f_new >= f_LO then 153 | -- Armijo condition not satisfied or not lower than lowest point 154 | bracket[HIpos] = t 155 | bracketFval[HIpos] = f_new 156 | bracketGval[HIpos] = g_new 157 | else 158 | if abs(gtd_new) <= - c2*gtd then 159 | -- Wolfe conditions satisfied 160 | done = true 161 | elseif gtd_new*(bracket[HIpos]-bracket[LOpos]) >= 0 then 162 | -- Old HI becomes new LO 163 | bracket[HIpos] = bracket[LOpos] 164 | bracketFval[HIpos] = bracketFval[LOpos] 165 | bracketGval[HIpos] = bracketGval[LOpos] 166 | end 167 | -- New point becomes new LO 168 | bracket[LOpos] = t 169 | bracketFval[LOpos] = f_new 170 | bracketGval[LOpos] = g_new 171 | end 172 | 173 | -- done? 174 | if not done and abs((bracket[1]-bracket[2])*gtd_new) < tolX then 175 | break 176 | end 177 | end 178 | 179 | -- be verbose 180 | if LSiter == maxIter then 181 | verbose('reached max number of iterations') 182 | end 183 | 184 | -- return stuff 185 | local _,LOpos = bracketFval:min(1) 186 | LOpos = LOpos[1] 187 | t = bracket[LOpos] 188 | f_new = bracketFval[LOpos] 189 | g_new = bracketGval[LOpos] 190 | x[{}] = x_init 191 | x:add(t,d) 192 | return f_new,g_new,x,t,lsFuncEval 193 | end 194 | -------------------------------------------------------------------------------- /fista.lua: -------------------------------------------------------------------------------- 1 | --[[ FISTA with backtracking line search 2 | 3 | - `f` : smooth function 4 | - `g` : non-smooth function 5 | - `pl` : minimizer of intermediate problem Q(x,y) 6 | - `xinit` : initial point 7 | - `params` : table of parameters (**optional**) 8 | - `params.L` : 1/(step size) for ISTA/FISTA iteration (0.1) 9 | - `params.Lstep` : step size multiplier at each iteration (1.5) 10 | - `params.maxiter` : max number of iterations (50) 11 | - `params.maxline` : max number of line search iterations per iteration (20) 12 | - `params.errthres`: Error thershold for convergence check (1e-4) 13 | - `params.doFistaUpdate` : true : use FISTA, false: use ISTA (true) 14 | - `params.verbose` : store each iteration solution and print detailed info (false) 15 | 16 | On output, `params` will contain these additional fields that can be reused. 17 | 18 | - `params.L` : last used L value will be written. 
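A minimal usage sketch (illustrative only; the lasso-style setup below, with placeholder `A`, `b`
and `lambda`, is not part of this package):

   local A, b, lambda = torch.randn(16, 8), torch.randn(16), 0.1
   -- smooth part f(x) = ||Ax - b||^2, returning its gradient when asked for 'dx'
   local f = function(x, mode)
      local r = torch.mv(A, x) - b
      local fval = r:dot(r)
      if mode and mode:match('dx') then
         return fval, torch.mv(A:t(), r):mul(2)
      end
      return fval
   end
   -- non-smooth part g(x) = lambda * ||x||_1
   local g = function(x) return lambda * x:norm(1) end
   -- proximal operator: in-place soft-thresholding of x with threshold lambda/L
   local pl = function(x, L)
      local t = lambda / L
      x:apply(function(v) return math.max(v - t, 0) + math.min(v + t, 0) end)
   end
   local x, history = optim.FistaLS(f, g, pl, torch.zeros(8), {maxiter = 100})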
19 |
20 | These are temporary storages needed by the algo and if the same params object is
21 | passed a second time, these same storages will be used without new allocation.
22 |
23 | - `params.xkm` : previous iteration point
24 | - `params.y` : fista iteration
25 | - `params.ply` : ply = pl(y - 1/L grad(f))
26 |
27 | Returns the solution x and history of {function evals, number of line search iterations, ...}
28 |
29 | Algorithm is published in
30 |
31 | @article{beck-fista-09,
32 | Author = {Beck, Amir and Teboulle, Marc},
33 | Journal = {SIAM J. Img. Sci.},
34 | Number = {1},
35 | Pages = {183--202},
36 | Title = {A Fast Iterative Shrinkage-Thresholding Algorithm for Linear Inverse Problems},
37 | Volume = {2},
38 | Year = {2009}}
39 | ]]
40 | function optim.FistaLS(f, g, pl, xinit, params)
41 |
42 | local params = params or {}
43 | local L = params.L or 0.1
44 | local Lstep = params.Lstep or 1.5
45 | local maxiter = params.maxiter or 50
46 | local maxline = params.maxline or 20
47 | local errthres = params.errthres or 1e-4
48 | local doFistaUpdate = params.doFistaUpdate
49 | local verbose = params.verbose
50 |
51 | -- temporary allocations
52 | params.xkm = params.xkm or torch.Tensor()
53 | params.y = params.y or torch.Tensor()
54 | params.ply = params.ply or torch.Tensor()
55 | local xkm = params.xkm -- previous iteration
56 | local y = params.y -- fista iteration
57 | local ply = params.ply -- soft shrinked y
58 |
59 | -- we start from all zeros
60 | local xk = xinit
61 | xkm:resizeAs(xk):zero()
62 | ply:resizeAs(xk):zero()
63 | y:resizeAs(xk):zero()
64 |
65 | local history = {} -- keep track of stuff
66 | local niter = 0 -- number of iterations done
67 | local converged = false -- are we done?
68 | local tk = 1 -- momentum param for FISTA
69 | local tkp = 0
70 |
71 |
72 | local gy = g(y)
73 | local fval = math.huge -- fval = f+g
74 | while not converged and niter < maxiter do
75 |
76 | -- run through smooth function (code is input, input is target)
77 | -- get derivatives from smooth function
78 | local fy,gfy = f(y,'dx')
79 | --local gfy = f(y)
80 |
81 | local fply = 0
82 | local gply = 0
83 | local Q = 0
84 |
85 | ----------------------------------------------
86 | -- do line search to find new current location starting from fista loc
87 | local nline = 0
88 | local linesearchdone = false
89 | while not linesearchdone do
90 | -- take a step in gradient direction of smooth function
91 | ply:copy(y)
92 | ply:add(-1/L,gfy)
93 |
94 | -- and solve for minimum of auxiliary problem
95 | pl(ply,L)
96 | -- this is candidate for new current iteration
97 | xk:copy(ply)
98 |
99 | -- evaluate this point F(ply)
100 | fply = f(ply)
101 |
102 | -- ply - y
103 | ply:add(-1, y)
104 | -- <ply-y , gradf(y)>
105 | local Q2 = gfy:dot(ply)
106 | -- L/2 ||beta-y||^2
107 | local Q3 = L/2 * ply:dot(ply)
108 | -- Q(beta,y) = F(y) + <beta-y , gradf(y)> + L/2||beta-y||^2 + G(beta)
109 | Q = fy + Q2 + Q3
110 |
111 | if verbose then
112 | print(string.format('nline=%d L=%g fply=%g Q=%g fy=%g Q2=%g Q3=%g',nline,L,fply,Q,fy,Q2,Q3))
113 | end
114 | -- check if F(beta) < Q(pl(y),y)
115 | if fply <= Q then --and Fply + Gply <= F then
116 | -- now evaluate G here
117 | linesearchdone = true
118 | elseif nline >= maxline then
119 | linesearchdone = true
120 | xk:copy(xkm) -- if we can't find a better point, current iter = previous iter
121 | --print('oops')
122 | else
123 | L = L * Lstep
124 | end
125 | nline = nline + 1
126 | end
127 | -- end line search
128 | ---------------------------------------------
129 |
130 | ---------------------------------------------
131 | -- FISTA 132 | --------------------------------------------- 133 | if doFistaUpdate then 134 | -- do the FISTA step 135 | tkp = (1 + math.sqrt(1 + 4*tk*tk)) / 2 136 | -- x(k-1) = x(k-1) - x(k) 137 | xkm:add(-1,xk) 138 | -- y(k+1) = x(k) + (1-t(k)/t(k+1))*(x(k-1)-x(k)) 139 | y:copy(xk) 140 | y:add( (1-tk)/tkp , xkm) 141 | -- store for next iterations 142 | -- x(k-1) = x(k) 143 | xkm:copy(xk) 144 | else 145 | y:copy(xk) 146 | end 147 | -- t(k) = t(k+1) 148 | tk = tkp 149 | fply = f(y) 150 | gply = g(y) 151 | if verbose then 152 | print(string.format('iter=%d eold=%g enew=%g',niter,fval,fply+gply)) 153 | end 154 | 155 | niter = niter + 1 156 | 157 | -- bookeeping 158 | fval = fply + gply 159 | history[niter] = {} 160 | history[niter].nline = nline 161 | history[niter].L = L 162 | history[niter].F = fval 163 | history[niter].Fply = fply 164 | history[niter].Gply = gply 165 | history[niter].Q = Q 166 | params.L = L 167 | if verbose then 168 | history[niter].xk = xk:clone() 169 | history[niter].y = y:clone() 170 | end 171 | 172 | -- are we done? 173 | if niter > 1 and math.abs(history[niter].F - history[niter-1].F) <= errthres then 174 | converged = true 175 | xinit:copy(y) 176 | return y,history 177 | end 178 | 179 | if niter >= maxiter then 180 | xinit:copy(y) 181 | return y,history 182 | end 183 | 184 | --if niter > 1 and history[niter].F > history[niter-1].F then 185 | --print(niter, 'This was supposed to be a convex function, we are going up') 186 | --converged = true 187 | --return xk,history 188 | --end 189 | end 190 | error('not supposed to be here') 191 | end 192 | 193 | -------------------------------------------------------------------------------- /polyinterp.lua: -------------------------------------------------------------------------------- 1 | local function isreal(x) 2 | return x == x 3 | end 4 | 5 | local function isnan(x) 6 | return not x == x 7 | end 8 | 9 | local function roots(c) 10 | local tol=1e-12 11 | c[torch.lt(torch.abs(c),tol)]=0 12 | 13 | local nonzero = torch.ne(c,0) 14 | if nonzero:max() == 0 then 15 | return 0 16 | end 17 | 18 | -- first non-zero 19 | local _,pos = torch.max(nonzero,1) 20 | pos = pos[1] 21 | c=c[{ {pos,-1} }] 22 | 23 | local nz = 0 24 | for i=c:size(1),1,-1 do 25 | if c[i] ~= 0 then 26 | break 27 | else 28 | nz = nz + 1 29 | end 30 | end 31 | c=c[{ {1,c:size(1)-nz} }] 32 | 33 | local n = c:size(1)-1 34 | if n == 1 then 35 | local e = torch.Tensor({{-c[2]/c[1], 0}}) 36 | if nz > 0 then 37 | return torch.cat(e, torch.zeros(nz, 2), 1) 38 | else 39 | return e 40 | end 41 | elseif n > 1 then 42 | local A = torch.diag(torch.ones(n-1),-1) 43 | A[1] = -c[{ {2,n+1} }]/c[1]; 44 | local e = torch.eig(A,'N') 45 | if nz > 0 then 46 | return torch.cat(e, torch.zeros(nz,2), 1) 47 | else 48 | return e 49 | end 50 | else 51 | return torch.zeros(nz,2) 52 | end 53 | end 54 | 55 | local function real(x) 56 | if type(x) == number then return x end 57 | return x[{ {} , 1}] 58 | end 59 | 60 | local function imag(x) 61 | if type(x) == 'number' then return 0 end 62 | if x:nDimension() == 1 then 63 | return torch.zeros(x:size(1)) 64 | else 65 | return x[{ {}, 2}] 66 | end 67 | end 68 | 69 | local function polyval(p,x) 70 | local pwr = p:size(1) 71 | if type(x) == 'number' then 72 | local val = 0 73 | p:apply(function(pc) pwr = pwr-1; val = val + pc*x^pwr; return pc end) 74 | return val 75 | else 76 | local val = x.new(x:size(1)) 77 | p:apply(function(pc) pwr = pwr-1; val:add(pc,torch.pow(x,pwr)); return pc end) 78 | return val 79 | end 80 | end 81 | 82 | 
---------------------------------------------------------------------- 83 | -- Minimum of interpolating polynomial based on function and 84 | -- derivative values 85 | -- 86 | -- ARGS: 87 | -- points : N triplets (x,f,g), must be a Tensor 88 | -- xmin : min value that brackets minimum (default: min of points) 89 | -- xmax : max value that brackets maximum (default: max of points) 90 | -- 91 | -- RETURN: 92 | -- minPos : position of minimum 93 | -- 94 | function optim.polyinterp(points,xminBound,xmaxBound) 95 | -- locals 96 | local sqrt = torch.sqrt 97 | local mean = torch.mean 98 | local Tensor = torch.Tensor 99 | local zeros = torch.zeros 100 | local max = math.max 101 | local min = math.min 102 | 103 | -- nb of points / order of polynomial 104 | local nPoints = points:size(1) 105 | local order = nPoints*2-1 106 | 107 | -- returned values 108 | local minPos 109 | 110 | -- Code for most common case: 111 | -- + cubic interpolation of 2 points w/ function and derivative values for both 112 | -- + no xminBound/xmaxBound 113 | if nPoints == 2 and order == 3 and not xminBound and not xmaxBound then 114 | -- Solution in this case (where x2 is the farthest point): 115 | -- d1 = g1 + g2 - 3*(f1-f2)/(x1-x2); 116 | -- d2 = sqrt(d1^2 - g1*g2); 117 | -- minPos = x2 - (x2 - x1)*((g2 + d2 - d1)/(g2 - g1 + 2*d2)); 118 | -- t_new = min(max(minPos,x1),x2); 119 | local minVal,minPos = points[{ {},1 }]:min(1) 120 | minVal = minVal[1] minPos = minPos[1] 121 | local notMinPos = -minPos+3; 122 | 123 | local d1 = points[{minPos,3}] + points[{notMinPos,3}] 124 | - 3*(points[{minPos,2}]-points[{notMinPos,2}]) 125 | / (points[{minPos,1}]-points[{notMinPos,1}]); 126 | local d2 = sqrt(d1^2 - points[{minPos,3}]*points[{notMinPos,3}]); 127 | 128 | if isreal(d2) then -- isreal() 129 | local t = points[{notMinPos,1}] - (points[{notMinPos,1}] 130 | - points[{minPos,1}]) * ((points[{notMinPos,3}] + d2 - d1) 131 | / (points[{notMinPos,3}] - points[{minPos,3}] + 2*d2)) 132 | 133 | minPos = min(max(t,points[{minPos,1}]),points[{notMinPos,1}]) 134 | else 135 | minPos = mean(points[{{},1}]) 136 | end 137 | return minPos 138 | end 139 | 140 | -- TODO: get the code below to work! 141 | --error(' extrapolation not implemented yet...') 142 | 143 | -- Compute Bounds of Interpolation Area 144 | local xmin = points[{{},1}]:min() 145 | local xmax = points[{{},1}]:max() 146 | xminBound = xminBound or xmin 147 | xmaxBound = xmaxBound or xmax 148 | 149 | -- Add constraints on function values 150 | local A = zeros(nPoints*2,order+1) 151 | local b = zeros(nPoints*2,1) 152 | for i = 1,nPoints do 153 | local constraint = zeros(order+1) 154 | for j = order,0,-1 do 155 | constraint[order-j+1] = points[{i,1}]^j 156 | end 157 | A[i] = constraint 158 | b[i] = points[{i,2}] 159 | end 160 | 161 | -- Add constraints based on derivatives 162 | for i = 1,nPoints do 163 | local constraint = zeros(order+1) 164 | for j = 1,order do 165 | constraint[j] = (order-j+1)*points[{i,1}]^(order-j) 166 | end 167 | A[nPoints+i] = constraint 168 | b[nPoints+i] = points[{i,3}] 169 | end 170 | 171 | -- Find interpolating polynomial 172 | local res = torch.gels(b,A) 173 | local params = res[{ {1,nPoints*2} }]:squeeze() 174 | 175 | --print(A) 176 | --print(b) 177 | --print(params) 178 | params[torch.le(torch.abs(params),1e-12)]=0 179 | 180 | -- Compute Critical Points 181 | local dParams = zeros(order); 182 | for i = 1,params:size(1)-1 do 183 | dParams[i] = params[i]*(order-i+1) 184 | end 185 | 186 | -- nan/inf? 
187 | local nans = false 188 | if torch.ne(dParams,dParams):max() > 0 or torch.eq(dParams,math.huge):max() > 0 then 189 | nans = true 190 | end 191 | -- for i = 1,dParams:size(1) do 192 | -- if dParams[i] ~= dParams[i] or dParams[i] == math.huge then 193 | -- nans = true 194 | -- break 195 | -- end 196 | -- end 197 | local cp = torch.cat(Tensor{xminBound,xmaxBound},points[{{},1}]) 198 | if not nans then 199 | local cproots = roots(dParams) 200 | local cpi = zeros(cp:size(1),2) 201 | cpi[{ {1,cp:size(1)} , 1 }] = cp 202 | cp = torch.cat(cpi,cproots,1) 203 | end 204 | 205 | --print(dParams) 206 | --print(cp) 207 | 208 | -- Test Critical Points 209 | local fmin = math.huge 210 | -- Default to Bisection if no critical points valid: 211 | minPos = (xminBound+xmaxBound)/2 212 | --print(minPos,fmin) 213 | --print(xminBound,xmaxBound) 214 | for i = 1,cp:size(1) do 215 | local xCP = cp[{ {i,i} , {} }] 216 | --print('xcp=') 217 | --print(xCP) 218 | local ixCP = imag(xCP)[1] 219 | local rxCP = real(xCP)[1] 220 | if ixCP == 0 and rxCP >= xminBound and rxCP <= xmaxBound then 221 | local fCP = polyval(params,rxCP) 222 | --print('fcp=') 223 | --print(fCP) 224 | --print(fCP < fmin) 225 | if fCP < fmin then 226 | minPos = rxCP 227 | fmin = fCP 228 | --print('u',minPos,fmin) 229 | end 230 | --print('v',minPos,fmin) 231 | end 232 | end 233 | return minPos,fmin 234 | end 235 | -------------------------------------------------------------------------------- /lbfgs.lua: -------------------------------------------------------------------------------- 1 | --[[ An implementation of L-BFGS, heavily inspired by minFunc (Mark Schmidt) 2 | 3 | This implementation of L-BFGS relies on a user-provided line 4 | search function (state.lineSearch). If this function is not 5 | provided, then a simple learningRate is used to produce fixed 6 | size steps. Fixed size steps are much less costly than line 7 | searches, and can be useful for stochastic problems. 8 | 9 | The learning rate is used even when a line search is provided. 10 | This is also useful for large-scale stochastic problems, where 11 | opfunc is a noisy approximation of f(x). In that case, the learning 12 | rate allows a reduction of confidence in the step size. 
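Two typical call patterns, as a hedged sketch (`feval` and `nSteps` are placeholders for a
user-supplied closure and loop count):

   -- deterministic / full-batch: several iterations with a Wolfe line search
   x, fhist = optim.lbfgs(feval, x, {maxIter = 100, lineSearch = optim.lswolfe})

   -- stochastic / mini-batch: one iteration per call with a fixed step size,
   -- reusing the same state table so the L-BFGS memory persists across calls
   local state = {maxIter = 1, learningRate = 1e-1}
   for i = 1, nSteps do
      x = optim.lbfgs(feval, x, state)
   end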
13 | 14 | ARGS: 15 | 16 | - `opfunc` : a function that takes a single input (X), the point of 17 | evaluation, and returns f(X) and df/dX 18 | - `x` : the initial point 19 | - `state` : a table describing the state of the optimizer; after each 20 | call the state is modified 21 | - `state.maxIter` : Maximum number of iterations allowed 22 | - `state.maxEval` : Maximum number of function evaluations 23 | - `state.tolFun` : Termination tolerance on the first-order optimality 24 | - `state.tolX` : Termination tol on progress in terms of func/param changes 25 | - `state.lineSearch` : A line search function 26 | - `state.learningRate` : If no line search provided, then a fixed step size is used 27 | 28 | RETURN: 29 | - `x*` : the new `x` vector, at the optimal point 30 | - `f` : a table of all function values: 31 | `f[1]` is the value of the function before any optimization and 32 | `f[#f]` is the final fully optimized value, at `x*` 33 | 34 | (Clement Farabet, 2012) 35 | ]] 36 | function optim.lbfgs(opfunc, x, config, state) 37 | -- get/update state 38 | local config = config or {} 39 | local state = state or config 40 | local maxIter = tonumber(config.maxIter) or 20 41 | local maxEval = tonumber(config.maxEval) or maxIter*1.25 42 | local tolFun = config.tolFun or 1e-5 43 | local tolX = config.tolX or 1e-9 44 | local nCorrection = config.nCorrection or 100 45 | local lineSearch = config.lineSearch 46 | local lineSearchOpts = config.lineSearchOptions 47 | local learningRate = config.learningRate or 1 48 | local isverbose = config.verbose or false 49 | 50 | state.funcEval = state.funcEval or 0 51 | state.nIter = state.nIter or 0 52 | 53 | -- verbose function 54 | local verbose 55 | if isverbose then 56 | verbose = function(...) print(' ', ...) end 57 | else 58 | verbose = function() end 59 | end 60 | 61 | -- import some functions 62 | local abs = math.abs 63 | local min = math.min 64 | 65 | -- evaluate initial f(x) and df/dx 66 | local f,g = opfunc(x) 67 | local f_hist = {f} 68 | local currentFuncEval = 1 69 | state.funcEval = state.funcEval + 1 70 | local p = g:size(1) 71 | 72 | -- check optimality of initial point 73 | state.tmp1 = state.tmp1 or g.new(g:size()):zero(); local tmp1 = state.tmp1 74 | tmp1:copy(g):abs() 75 | if tmp1:sum() <= tolFun then 76 | -- optimality condition below tolFun 77 | verbose('optimality condition below tolFun') 78 | return x,f_hist 79 | end 80 | 81 | if not state.dir_bufs then 82 | -- reusable buffers for y's and s's, and their histories 83 | verbose('creating recyclable direction/step/history buffers') 84 | state.dir_bufs = state.dir_bufs or g.new(nCorrection+1, p):split(1) 85 | state.stp_bufs = state.stp_bufs or g.new(nCorrection+1, p):split(1) 86 | for i=1,#state.dir_bufs do 87 | state.dir_bufs[i] = state.dir_bufs[i]:squeeze(1) 88 | state.stp_bufs[i] = state.stp_bufs[i]:squeeze(1) 89 | end 90 | end 91 | 92 | -- variables cached in state (for tracing) 93 | local d = state.d 94 | local t = state.t 95 | local old_dirs = state.old_dirs 96 | local old_stps = state.old_stps 97 | local Hdiag = state.Hdiag 98 | local g_old = state.g_old 99 | local f_old = state.f_old 100 | 101 | -- optimize for a max of maxIter iterations 102 | local nIter = 0 103 | while nIter < maxIter do 104 | -- keep track of nb of iterations 105 | nIter = nIter + 1 106 | state.nIter = state.nIter + 1 107 | 108 | ------------------------------------------------------------ 109 | -- compute gradient descent direction 110 | ------------------------------------------------------------ 111 | if 
state.nIter == 1 then 112 | d = g:clone():mul(-1) -- -g 113 | old_dirs = {} 114 | old_stps = {} 115 | Hdiag = 1 116 | else 117 | -- do lbfgs update (update memory) 118 | local y = table.remove(state.dir_bufs) -- pop 119 | local s = table.remove(state.stp_bufs) 120 | y:add(g, -1, g_old) -- g - g_old 121 | s:mul(d, t) -- d*t 122 | local ys = y:dot(s) -- y*s 123 | if ys > 1e-10 then 124 | -- updating memory 125 | if #old_dirs == nCorrection then 126 | -- shift history by one (limited-memory) 127 | local removed1 = table.remove(old_dirs, 1) 128 | local removed2 = table.remove(old_stps, 1) 129 | table.insert(state.dir_bufs, removed1) 130 | table.insert(state.stp_bufs, removed2) 131 | end 132 | 133 | -- store new direction/step 134 | table.insert(old_dirs, s) 135 | table.insert(old_stps, y) 136 | 137 | -- update scale of initial Hessian approximation 138 | Hdiag = ys / y:dot(y) -- (y*y) 139 | else 140 | -- put y and s back into the buffer pool 141 | table.insert(state.dir_bufs, y) 142 | table.insert(state.stp_bufs, s) 143 | end 144 | 145 | -- compute the approximate (L-BFGS) inverse Hessian 146 | -- multiplied by the gradient 147 | local k = #old_dirs 148 | 149 | -- need to be accessed element-by-element, so don't re-type tensor: 150 | state.ro = state.ro or torch.Tensor(nCorrection); local ro = state.ro 151 | for i = 1,k do 152 | ro[i] = 1 / old_stps[i]:dot(old_dirs[i]) 153 | end 154 | 155 | -- iteration in L-BFGS loop collapsed to use just one buffer 156 | local q = tmp1 -- reuse tmp1 for the q buffer 157 | -- need to be accessed element-by-element, so don't re-type tensor: 158 | state.al = state.al or torch.zeros(nCorrection) local al = state.al 159 | 160 | q:mul(g, -1) -- -g 161 | for i = k,1,-1 do 162 | al[i] = old_dirs[i]:dot(q) * ro[i] 163 | q:add(-al[i], old_stps[i]) 164 | end 165 | 166 | -- multiply by initial Hessian 167 | r = d -- share the same buffer, since we don't need the old d 168 | r:mul(q, Hdiag) -- q[1] * Hdiag 169 | for i = 1,k do 170 | local be_i = old_stps[i]:dot(r) * ro[i] 171 | r:add(al[i]-be_i, old_dirs[i]) 172 | end 173 | -- final direction is in r/d (same object) 174 | end 175 | g_old = g_old or g:clone() 176 | g_old:copy(g) 177 | f_old = f 178 | 179 | ------------------------------------------------------------ 180 | -- compute step length 181 | ------------------------------------------------------------ 182 | -- directional derivative 183 | local gtd = g:dot(d) -- g * d 184 | 185 | -- check that progress can be made along that direction 186 | if gtd > -tolX then 187 | break 188 | end 189 | 190 | -- reset initial guess for step size 191 | if state.nIter == 1 then 192 | tmp1:copy(g):abs() 193 | t = min(1,1/tmp1:sum()) * learningRate 194 | else 195 | t = learningRate 196 | end 197 | 198 | -- optional line search: user function 199 | local lsFuncEval = 0 200 | if lineSearch and type(lineSearch) == 'function' then 201 | -- perform line search, using user function 202 | f,g,x,t,lsFuncEval = lineSearch(opfunc,x,t,d,f,g,gtd,lineSearchOpts) 203 | table.insert(f_hist, f) 204 | else 205 | -- no line search, simply move with fixed-step 206 | x:add(t,d) 207 | if nIter ~= maxIter then 208 | -- re-evaluate function only if not in last iteration 209 | -- the reason we do this: in a stochastic setting, 210 | -- no use to re-evaluate that function here 211 | f,g = opfunc(x) 212 | lsFuncEval = 1 213 | table.insert(f_hist, f) 214 | end 215 | end 216 | 217 | -- update func eval 218 | currentFuncEval = currentFuncEval + lsFuncEval 219 | state.funcEval = state.funcEval + lsFuncEval 220 | 
221 | ------------------------------------------------------------ 222 | -- check conditions 223 | ------------------------------------------------------------ 224 | if nIter == maxIter then 225 | -- no use to run tests 226 | verbose('reached max number of iterations') 227 | break 228 | end 229 | 230 | if currentFuncEval >= maxEval then 231 | -- max nb of function evals 232 | verbose('max nb of function evals') 233 | break 234 | end 235 | 236 | tmp1:copy(g):abs() 237 | if tmp1:sum() <= tolFun then 238 | -- check optimality 239 | verbose('optimality condition below tolFun') 240 | break 241 | end 242 | 243 | tmp1:copy(d):mul(t):abs() 244 | if tmp1:sum() <= tolX then 245 | -- step size below tolX 246 | verbose('step size below tolX') 247 | break 248 | end 249 | 250 | if abs(f-f_old) < tolX then 251 | -- function value changing less than tolX 252 | verbose('function value changing less than tolX') 253 | break 254 | end 255 | end 256 | 257 | -- save state 258 | state.old_dirs = old_dirs 259 | state.old_stps = old_stps 260 | state.Hdiag = Hdiag 261 | state.g_old = g_old 262 | state.f_old = f_old 263 | state.t = t 264 | state.d = d 265 | 266 | -- return optimal x, and history of f(x) 267 | return x,f_hist,currentFuncEval 268 | end 269 | -------------------------------------------------------------------------------- /cmaes.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'math' 3 | 4 | local BestSolution = {} 5 | --[[ An implementation of `CMAES` (Covariance Matrix Adaptation Evolution Strategy), 6 | ported from https://www.lri.fr/~hansen/barecmaes2.html. 7 | 8 | Parameters 9 | ---------- 10 | ARGS: 11 | 12 | - `opfunc` : a function that takes a single input (X), the point of 13 | evaluation, and returns f(X) and df/dX. Note that df/dX is not used 14 | - `x` : the initial point 15 | - `state.sigma` 16 | float, initial step-size (standard deviation in each 17 | coordinate) 18 | - `state.maxEval` 19 | int, maximal number of function evaluations 20 | - `state.ftarget` 21 | float, target function value 22 | - `state.popsize` 23 | population size. If this is left empty, 24 | 4 + int(3 * log(|x|)) will be used 25 | - `state.ftarget` 26 | stop if fitness < ftarget 27 | - `state.verb_disp` 28 | int, display on console every verb_disp iteration, 0 for never 29 | 30 | RETURN: 31 | - `x*` : the new `x` vector, at the optimal point 32 | - `f` : a table of all function values: 33 | `f[1]` is the value of the function before any optimization and 34 | `f[#f]` is the final fully optimized value, at `x*` 35 | --]] 36 | function optim.cmaes(opfunc, x, config, state) 37 | if (x.triu == nil or x.diag == nil) then 38 | error('Unsupported Tensor ' .. x:type() .. 
" please use Float- or DoubleTensor for x") 39 | end 40 | -- process input parameters 41 | local config = config or {} 42 | local state = state or config 43 | local xmean = x:clone():view(-1) -- distribution mean, a flattened copy 44 | local N = xmean:size(1) -- number of objective variables/problem dimension 45 | local sigma = state.sigma -- coordinate wise standard deviation (step size) 46 | local ftarget = state.ftarget -- stop if fitness < ftarget 47 | local maxEval = tonumber(state.maxEval) or 1e3*N^2 48 | local objfunc = opfunc 49 | local verb_disp = state.verb_disp -- display step size 50 | local min_iterations = state.min_iterations or 1 51 | 52 | local lambda = state.popsize -- population size, offspring number 53 | -- Strategy parameter setting: Selection 54 | if state.popsize == nil then 55 | lambda = 4 + math.floor(3 * math.log(N)) 56 | end 57 | 58 | local mu = lambda / 2 -- number of parents/points for recombination 59 | local weights = torch.range(0,mu-1):apply(function(i) 60 | return math.log(mu+0.5) - math.log(i+1) end) -- recombination weights 61 | weights:div(weights:sum()) -- normalize recombination weights array 62 | local mueff = weights:sum()^2 / torch.pow(weights,2):sum() -- variance-effectiveness of sum w_i x_i 63 | weights = weights:typeAs(x) 64 | 65 | -- Strategy parameter setting: Adaptation 66 | local cc = (4 + mueff/N) / (N+4 + 2 * mueff/N) -- time constant for cumulation for C 67 | local cs = (mueff + 2) / (N + mueff + 5) -- t-const for cumulation for sigma control 68 | local c1 = 2 / ((N + 1.3)^2 + mueff) -- learning rate for rank-one update of C 69 | local cmu = math.min(1 - c1, 2 * (mueff - 2 + 1/mueff) / ((N + 2)^2 + mueff)) -- and for rank-mu update 70 | local damps = 2 * mueff/lambda + 0.3 + cs -- damping for sigma, usually close to 1 71 | 72 | -- Initialize dynamic (internal) state variables 73 | local pc = torch.Tensor(N):zero():typeAs(x) -- evolution paths for C 74 | local ps = torch.Tensor(N):zero():typeAs(x) -- evolution paths for sigma 75 | local B = torch.eye(N):typeAs(x) -- B defines the coordinate system 76 | local D = torch.Tensor(N):fill(1):typeAs(x) -- diagonal D defines the scaling 77 | local C = torch.eye(N):typeAs(x) -- covariance matrix 78 | if not pcall(function () torch.symeig(C,'V') end) then -- if error occurs trying to use symeig 79 | error('torch.symeig not available for ' .. x:type() .. 
80 | " please use Float- or DoubleTensor for x") 81 | end 82 | local candidates = torch.Tensor(lambda,N):typeAs(x) 83 | local invsqrtC = torch.eye(N):typeAs(x) -- C^-1/2 84 | local eigeneval = 0 -- tracking the update of B and D 85 | local counteval = 0 86 | local f_hist = {[1]=opfunc(x)} -- for bookkeeping output and termination 87 | local fitvals = torch.Tensor(lambda)-- fitness values 88 | local best = BestSolution.new(nil,nil,counteval) 89 | local iteration = 0 -- iteration of the optimize loop 90 | 91 | 92 | local function ask() 93 | --[[return a list of lambda candidate solutions according to 94 | m + sig * Normal(0,C) = m + sig * B * D * Normal(0,I) 95 | --]] 96 | -- Eigendecomposition: first update B, D and invsqrtC from C 97 | -- postpone in case to achieve O(N^2) 98 | if counteval - eigeneval > lambda/(c1+cmu)/C:size(1)/10 then 99 | eigeneval = counteval 100 | C = torch.triu(C) + torch.triu(C,1):t() -- enforce symmetry 101 | D, B = torch.symeig(C,'V') -- eigen decomposition, B==normalized eigenvectors, O(N^3) 102 | D = torch.sqrt(D) -- D contains standard deviations now 103 | invsqrtC = (B * torch.diag(torch.pow(D,-1)) * B:t()) 104 | end 105 | for k=1,lambda do --repeat lambda times 106 | local z = D:clone():normal(0,1):cmul(D) 107 | candidates[{k,{}}] = torch.add(xmean, (B * z) * sigma) 108 | end 109 | 110 | return candidates 111 | end 112 | 113 | 114 | local function tell(arx) 115 | --[[update the evolution paths and the distribution parameters m, 116 | sigma, and C within CMA-ES. 117 | 118 | Parameters 119 | ---------- 120 | `arx` 121 | a list of solutions, presumably from `ask()` 122 | `fitvals` 123 | the corresponding objective function values --]] 124 | -- bookkeeping, preparation 125 | counteval = counteval + lambda -- slightly artificial to do here 126 | local xold = xmean:clone() 127 | 128 | -- Sort by fitness and compute weighted mean into xmean 129 | local arindex = nil --sorted indices 130 | fitvals, arindex = torch.sort(fitvals) 131 | arx = arx:index(1, arindex[{{1, mu}}]) -- sorted candidate solutions 132 | 133 | table.insert(f_hist, fitvals[1]) --append best fitness to history 134 | best:update(arx[1], fitvals[1], counteval) 135 | 136 | xmean:zero() 137 | xmean:addmv(arx:t(), weights) --dot product 138 | 139 | -- Cumulation: update evolution paths 140 | local y = xmean - xold 141 | local z = invsqrtC * y -- == C^(-1/2) * (xnew - xold) 142 | 143 | local c = (cs * (2-cs) * mueff)^0.5 / sigma 144 | ps = ps - ps * cs + z * c -- exponential decay on ps 145 | local hsig = (torch.sum(torch.pow(ps,2)) / 146 | (1-(1-cs)^(2*counteval/lambda)) / N < 2 + 4./(N+1)) 147 | hsig = hsig and 1.0 or 0.0 --use binary numbers 148 | 149 | c = (cc * (2-cc) * mueff)^0.5 / sigma 150 | pc = pc - pc * cc + y * c * hsig -- exponential decay on pc 151 | 152 | -- Adapt covariance matrix C 153 | local c1a = c1 - (1-hsig^2) * c1 * cc * (2-cc) 154 | -- for a minor adjustment to the variance loss by hsig 155 | for i=1,N do 156 | for j=1,N do 157 | local r = torch.range(1,mu) 158 | r:apply(function(k) 159 | return weights[k] * (arx[k][i]-xold[i]) * (arx[k][j]-xold[j]) end) 160 | local Cmuij = torch.sum(r) / sigma^2 -- rank-mu update 161 | C[i][j] = C[i][j] + ((-c1a - cmu) * C[i][j] + 162 | c1 * pc[i]*pc[j] + cmu * Cmuij) 163 | end 164 | end 165 | 166 | -- Adapt step-size sigma with factor <= exp(0.6) \approx 1.82 167 | sigma = sigma * math.exp(math.min(0.6, 168 | (cs / damps) * (torch.sum(torch.pow(ps,2))/N - 1)/2)) 169 | end 170 | 171 | local function stop() 172 | --[[return satisfied termination 
conditions in a table like 173 | {'termination reason':value, ...}, for example {'tolfun':1e-12}, 174 | or the empty table {}--]] 175 | local res = {} 176 | if counteval > 0 then 177 | if counteval >= maxEval then 178 | res['evals'] = maxEval 179 | end 180 | if ftarget ~= nil and fitvals:nElement() > 0 and fitvals[1] <= ftarget then 181 | res['ftarget'] = ftarget 182 | end 183 | if torch.max(D) > 1e7 * torch.min(D) then 184 | res['condition'] = 1e7 185 | end 186 | if fitvals:nElement() > 1 and fitvals[fitvals:nElement()] - fitvals[1] < 1e-12 then 187 | res['tolfun'] = 1e-12 188 | end 189 | if sigma * torch.max(D) < 1e-11 then 190 | -- remark: max(D) >= max(diag(C))^0.5 191 | res['tolx'] = 1e-11 192 | end 193 | end 194 | return res 195 | end 196 | 197 | local function disp(verb_modulo) 198 | --[[display some iteration info--]] 199 | if verb_disp == 0 then 200 | return nil 201 | end 202 | local iteration = counteval / lambda 203 | 204 | if iteration == 1 or iteration % (10*verb_modulo) == 0 then 205 | print('evals:\t ax-ratio max(std) f-value') 206 | end 207 | if iteration <= 2 or iteration % verb_modulo == 0 then 208 | local max_std = math.sqrt(torch.max(torch.diag(C))) 209 | print(tostring(counteval).. ': ' .. 210 | string.format(' %6.1f %8.1e ', torch.max(D) / torch.min(D), sigma * max_std) 211 | .. tostring(fitvals[1])) 212 | end 213 | 214 | return nil 215 | end 216 | 217 | while next(stop()) == nil or iteration < min_iterations do 218 | iteration = iteration + 1 219 | 220 | local X = ask() -- deliver candidate solutions 221 | for i=1, lambda do 222 | -- put candidate tensor back in input shape and evaluate in opfunc 223 | local candidate = X[i]:viewAs(x) 224 | fitvals[i] = objfunc(candidate) 225 | end 226 | 227 | tell(X) 228 | disp(verb_disp) 229 | end 230 | 231 | local bestmu, f, c = best:get() 232 | if verb_disp > 0 then 233 | for k, v in pairs(stop()) do 234 | print('termination by', k, '=', v) 235 | end 236 | print('best f-value =', f) 237 | print('solution = ') 238 | print(bestmu) 239 | print('best found at iteration: ', c/lambda, ' , total iterations: ', iteration) 240 | end 241 | table.insert(f_hist, f) 242 | 243 | return bestmu, f_hist, counteval 244 | end 245 | 246 | 247 | 248 | BestSolution.__index = BestSolution 249 | function BestSolution.new(x, f, evals) 250 | local self = setmetatable({}, BestSolution) 251 | self.x = x 252 | self.f = f 253 | self.evals = evals 254 | return self 255 | end 256 | 257 | function BestSolution.update(self, arx, arf, evals) 258 | --[[initialize the best solution with `x`, `f`, and `evals`. 
259 | Better solutions have smaller `f`-values.--]] 260 | if self.f == nil or arf < self.f then 261 | self.x = arx:clone() 262 | self.f = arf 263 | self.evals = evals 264 | end 265 | return self 266 | end 267 | 268 | function BestSolution.get(self) 269 | return self.x, self.f, self.evals 270 | end 271 | -------------------------------------------------------------------------------- /ConfusionMatrix.lua: -------------------------------------------------------------------------------- 1 | --[[ A Confusion Matrix class 2 | 3 | Example: 4 | 5 | conf = optim.ConfusionMatrix( {'cat','dog','person'} ) -- new matrix 6 | conf:zero() -- reset matrix 7 | for i = 1,N do 8 | conf:add( neuralnet:forward(sample), label ) -- accumulate errors 9 | end 10 | print(conf) -- print matrix 11 | image.display(conf:render()) -- render matrix 12 | ]] 13 | local ConfusionMatrix = torch.class('optim.ConfusionMatrix') 14 | 15 | function ConfusionMatrix:__init(nclasses, classes) 16 | if type(nclasses) == 'table' then 17 | classes = nclasses 18 | nclasses = #classes 19 | end 20 | self.mat = torch.LongTensor(nclasses,nclasses):zero() 21 | self.valids = torch.FloatTensor(nclasses):zero() 22 | self.unionvalids = torch.FloatTensor(nclasses):zero() 23 | self.nclasses = nclasses 24 | self.totalValid = 0 25 | self.averageValid = 0 26 | self.classes = classes or {} 27 | -- buffers 28 | self._mat_flat = self.mat:view(-1) 29 | self._target = torch.FloatTensor() 30 | self._prediction = torch.FloatTensor() 31 | self._max = torch.FloatTensor() 32 | self._pred_idx = torch.LongTensor() 33 | self._targ_idx = torch.LongTensor() 34 | end 35 | 36 | -- takes scalar prediction and target as input 37 | function ConfusionMatrix:_add(p, t) 38 | assert(p and type(p) == 'number') 39 | assert(t and type(t) == 'number') 40 | -- non-positive values are considered missing 41 | -- and therefore ignored 42 | if t > 0 then 43 | self.mat[t][p] = self.mat[t][p] + 1 44 | end 45 | end 46 | 47 | function ConfusionMatrix:add(prediction, target) 48 | if type(prediction) == 'number' then 49 | -- comparing numbers 50 | self:_add(prediction, target) 51 | else 52 | self._prediction:resize(prediction:size()):copy(prediction) 53 | assert(prediction:dim() == 1) 54 | if type(target) == 'number' then 55 | -- prediction is a vector, then target assumed to be an index 56 | self._max:max(self._pred_idx, self._prediction, 1) 57 | self:_add(self._pred_idx[1], target) 58 | else 59 | -- both prediction and target are vectors 60 | assert(target:dim() == 1) 61 | self._target:resize(target:size()):copy(target) 62 | self._max:max(self._targ_idx, self._target, 1) 63 | self._max:max(self._pred_idx, self._prediction, 1) 64 | self:_add(self._pred_idx[1], self._targ_idx[1]) 65 | end 66 | end 67 | end 68 | 69 | function ConfusionMatrix:batchAdd(predictions, targets) 70 | local preds, targs, __ 71 | self._prediction:resize(predictions:size()):copy(predictions) 72 | if predictions:dim() == 1 then 73 | -- predictions is a vector of classes 74 | preds = self._prediction 75 | elseif predictions:dim() == 2 then 76 | -- prediction is a matrix of class likelihoods 77 | if predictions:size(2) == 1 then 78 | -- or prediction just needs flattening 79 | preds = self._prediction:select(2,1) 80 | else 81 | self._max:max(self._pred_idx, self._prediction, 2) 82 | preds = self._pred_idx:select(2,1) 83 | end 84 | else 85 | error("predictions has invalid number of dimensions") 86 | end 87 | 88 | self._target:resize(targets:size()):copy(targets) 89 | if targets:dim() == 1 then 90 | -- targets is a 
vector of classes 91 | targs = self._target 92 | elseif targets:dim() == 2 then 93 | -- targets is a matrix of one-hot rows 94 | if targets:size(2) == 1 then 95 | -- or targets just needs flattening 96 | targs = self._target:select(2,1) 97 | else 98 | self._max:max(self._targ_idx, self._target, 2) 99 | targs = self._targ_idx:select(2,1) 100 | end 101 | else 102 | error("targets has invalid number of dimensions") 103 | end 104 | 105 | -- non-positive values are considered missing and therefore ignored 106 | local mask = targs:ge(1) 107 | targs = targs[mask] 108 | preds = preds[mask] 109 | 110 | self._mat_flat = self._mat_flat or self.mat:view(-1) -- for backward compatibility 111 | 112 | preds = preds:typeAs(targs) 113 | 114 | assert(self.mat:isContiguous() and self.mat:stride(2) == 1) 115 | local indices = ((targs - 1) * self.mat:stride(1) + preds):typeAs(self.mat) 116 | local ones = torch.ones(1):typeAs(self.mat):expand(indices:size(1)) 117 | self._mat_flat:indexAdd(1, indices, ones) 118 | end 119 | 120 | function ConfusionMatrix:zero() 121 | self.mat:zero() 122 | self.valids:zero() 123 | self.unionvalids:zero() 124 | self.totalValid = 0 125 | self.averageValid = 0 126 | end 127 | 128 | local function isNaN(number) 129 | return number ~= number 130 | end 131 | 132 | function ConfusionMatrix:updateValids() 133 | local total = 0 134 | for t = 1,self.nclasses do 135 | self.valids[t] = self.mat[t][t] / self.mat:select(1,t):sum() 136 | self.unionvalids[t] = self.mat[t][t] / (self.mat:select(1,t):sum()+self.mat:select(2,t):sum()-self.mat[t][t]) 137 | total = total + self.mat[t][t] 138 | end 139 | self.totalValid = total / self.mat:sum() 140 | self.averageValid = 0 141 | self.averageUnionValid = 0 142 | local nvalids = 0 143 | local nunionvalids = 0 144 | for t = 1,self.nclasses do 145 | if not isNaN(self.valids[t]) then 146 | self.averageValid = self.averageValid + self.valids[t] 147 | nvalids = nvalids + 1 148 | end 149 | if not isNaN(self.valids[t]) and not isNaN(self.unionvalids[t]) then 150 | self.averageUnionValid = self.averageUnionValid + self.unionvalids[t] 151 | nunionvalids = nunionvalids + 1 152 | end 153 | end 154 | self.averageValid = self.averageValid / nvalids 155 | self.averageUnionValid = self.averageUnionValid / nunionvalids 156 | end 157 | 158 | -- Calculating FAR/FRR associated with the confusion matrix 159 | 160 | function ConfusionMatrix:farFrr() 161 | local cmat = self.mat 162 | local noOfClasses = cmat:size()[1] 163 | self._frrs = self._frrs or torch.zeros(noOfClasses) 164 | self._frrs:zero() 165 | self._classFrrs = self._classFrrs or torch.zeros(noOfClasses) 166 | self._classFrrs:zero() 167 | self._classFrrs:add(-1) 168 | self._fars = self._fars or torch.zeros(noOfClasses) 169 | self._fars:zero() 170 | self._classFars = self._classFars or torch.zeros(noOfClasses) 171 | self._classFars:zero() 172 | self._classFars:add(-1) 173 | local classSamplesCount = cmat:sum(2) 174 | local indx = 1 175 | for i=1,noOfClasses do 176 | if classSamplesCount[i][1] ~= 0 then 177 | self._frrs[indx] = 1 - cmat[i][i]/classSamplesCount[i][1] 178 | self._classFrrs[i] = self._frrs[indx] 179 | -- Calculating FARs 180 | local farNumerator = 0 181 | local farDenominator = 0 182 | for j=1, noOfClasses do 183 | if i ~= j then 184 | if classSamplesCount[j][1] ~= 0 then 185 | farNumerator = farNumerator + cmat[j][i]/classSamplesCount[j][1] 186 | farDenominator = farDenominator + 1 187 | end 188 | end 189 | end 190 | self._fars[indx] = farNumerator/farDenominator 191 | self._classFars[i] = 
self._fars[indx] 192 | indx = indx + 1 193 | end 194 | end 195 | indx = indx - 1 196 | local returnFrrs = self._frrs[{{1, indx}}] 197 | local returnFars = self._fars[{{1, indx}}] 198 | return self._classFrrs, self._classFars, returnFrrs, returnFars 199 | end 200 | 201 | local function log10(n) 202 | if math.log10 then 203 | return math.log10(n) 204 | else 205 | return math.log(n) / math.log(10) 206 | end 207 | end 208 | 209 | function ConfusionMatrix:__tostring__() 210 | self:updateValids() 211 | local str = {'ConfusionMatrix:\n'} 212 | local nclasses = self.nclasses 213 | table.insert(str, '[') 214 | local maxCnt = self.mat:max() 215 | local nDigits = math.max(8, 1 + math.ceil(log10(maxCnt))) 216 | for t = 1,nclasses do 217 | local pclass = self.valids[t] * 100 218 | pclass = string.format('%2.3f', pclass) 219 | if t == 1 then 220 | table.insert(str, '[') 221 | else 222 | table.insert(str, ' [') 223 | end 224 | for p = 1,nclasses do 225 | table.insert(str, string.format('%' .. nDigits .. 'd', self.mat[t][p])) 226 | end 227 | if self.classes and self.classes[1] then 228 | if t == nclasses then 229 | table.insert(str, ']] ' .. pclass .. '% \t[class: ' .. (self.classes[t] or '') .. ']\n') 230 | else 231 | table.insert(str, '] ' .. pclass .. '% \t[class: ' .. (self.classes[t] or '') .. ']\n') 232 | end 233 | else 234 | if t == nclasses then 235 | table.insert(str, ']] ' .. pclass .. '% \n') 236 | else 237 | table.insert(str, '] ' .. pclass .. '% \n') 238 | end 239 | end 240 | end 241 | table.insert(str, ' + average row correct: ' .. (self.averageValid*100) .. '% \n') 242 | table.insert(str, ' + average rowUcol correct (VOC measure): ' .. (self.averageUnionValid*100) .. '% \n') 243 | table.insert(str, ' + global correct: ' .. (self.totalValid*100) .. '%') 244 | return table.concat(str) 245 | end 246 | 247 | function ConfusionMatrix:render(sortmode, display, block, legendwidth) 248 | -- args 249 | local confusion = self.mat:double() 250 | local classes = self.classes 251 | local sortmode = sortmode or 'score' -- 'score' or 'occurrence' 252 | local block = block or 25 253 | local legendwidth = legendwidth or 200 254 | local display = display or false 255 | 256 | -- legends 257 | local legend = { 258 | ['score'] = 'Confusion matrix [sorted by scores, global accuracy = %0.3f%%, per-class accuracy = %0.3f%%]', 259 | ['occurrence'] = 'Confusiong matrix [sorted by occurences, accuracy = %0.3f%%, per-class accuracy = %0.3f%%]' 260 | } 261 | 262 | -- parse matrix / normalize / count scores 263 | local diag = torch.FloatTensor(#classes) 264 | local freqs = torch.FloatTensor(#classes) 265 | local unconf = confusion 266 | local confusion = confusion:clone() 267 | local corrects = 0 268 | local total = 0 269 | for target = 1,#classes do 270 | freqs[target] = confusion[target]:sum() 271 | corrects = corrects + confusion[target][target] 272 | total = total + freqs[target] 273 | confusion[target]:div( math.max(confusion[target]:sum(),1) ) 274 | diag[target] = confusion[target][target] 275 | end 276 | 277 | -- accuracies 278 | local accuracy = corrects / total * 100 279 | local perclass = 0 280 | local total = 0 281 | for target = 1,#classes do 282 | if confusion[target]:sum() > 0 then 283 | perclass = perclass + diag[target] 284 | total = total + 1 285 | end 286 | end 287 | perclass = perclass / total * 100 288 | freqs:div(unconf:sum()) 289 | 290 | -- sort matrix 291 | if sortmode == 'score' then 292 | _,order = torch.sort(diag,1,true) 293 | elseif sortmode == 'occurrence' then 294 | _,order = 
torch.sort(freqs,1,true) 295 | else 296 | error('sort mode must be one of: score | occurrence') 297 | end 298 | 299 | -- render matrix 300 | local render = torch.zeros(#classes*block, #classes*block) 301 | for target = 1,#classes do 302 | for prediction = 1,#classes do 303 | render[{ { (target-1)*block+1,target*block }, { (prediction-1)*block+1,prediction*block } }] = confusion[order[target]][order[prediction]] 304 | end 305 | end 306 | 307 | -- add grid 308 | for target = 1,#classes do 309 | render[{ {target*block},{} }] = 0.1 310 | render[{ {},{target*block} }] = 0.1 311 | end 312 | 313 | -- create rendering 314 | require 'image' 315 | require 'qtwidget' 316 | require 'qttorch' 317 | local win1 = qtwidget.newimage( (#render)[2]+legendwidth, (#render)[1] ) 318 | image.display{image=render, win=win1} 319 | 320 | -- add legend 321 | for i in ipairs(classes) do 322 | -- background cell 323 | win1:setcolor{r=0,g=0,b=0} 324 | win1:rectangle((#render)[2],(i-1)*block,legendwidth,block) 325 | win1:fill() 326 | 327 | -- % 328 | win1:setfont(qt.QFont{serif=false, size=fontsize}) 329 | local gscale = freqs[order[i]]/freqs:max()*0.9+0.1 --3/4 330 | win1:setcolor{r=gscale*0.5+0.2,g=gscale*0.5+0.2,b=gscale*0.8+0.2} 331 | win1:moveto((#render)[2]+10,i*block-block/3) 332 | win1:show(string.format('[%2.2f%% labels]',math.floor(freqs[order[i]]*10000+0.5)/100)) 333 | 334 | -- legend 335 | win1:setfont(qt.QFont{serif=false, size=fontsize}) 336 | local gscale = diag[order[i]]*0.8+0.2 337 | win1:setcolor{r=gscale,g=gscale,b=gscale} 338 | win1:moveto(120+(#render)[2]+10,i*block-block/3) 339 | win1:show(classes[order[i]]) 340 | 341 | for j in ipairs(classes) do 342 | -- scores 343 | local score = confusion[order[j]][order[i]] 344 | local gscale = (1-score)*(score*0.8+0.2) 345 | win1:setcolor{r=gscale,g=gscale,b=gscale} 346 | win1:moveto((i-1)*block+block/5,(j-1)*block+block*2/3) 347 | win1:show(string.format('%02.0f',math.floor(score*100+0.5))) 348 | end 349 | end 350 | 351 | -- generate tensor 352 | local t = win1:image():toTensor() 353 | 354 | -- display 355 | if display then 356 | image.display{image=t, legend=string.format(legend[sortmode],accuracy,perclass)} 357 | end 358 | 359 | -- return rendering 360 | return t 361 | end 362 | -------------------------------------------------------------------------------- /doc/index.md: -------------------------------------------------------------------------------- 1 | 2 | # Optim Package 3 | 4 | This package provides a set of optimization algorithms, which all follow 5 | a unified, closure-based API. 6 | 7 | This package is fully compatible with the [nn](http://nn.readthedocs.org) package, but can also be 8 | used to optimize arbitrary objective functions. 9 | 10 | For now, the following algorithms are provided: 11 | 12 | * [Stochastic Gradient Descent](#optim.sgd) 13 | * [Averaged Stochastic Gradient Descent](#optim.asgd) 14 | * [L-BFGS](#optim.lbfgs) 15 | * [Conjugate Gradients](#optim.cg) 16 | * [AdaDelta](#optim.adadelta) 17 | * [AdaGrad](#optim.adagrad) 18 | * [Adam](#optim.adam) 19 | * [AdaMax](#optim.adamax) 20 | * [FISTA with backtracking line search](#optim.FistaLS) 21 | * [Nesterov's Accelerated Gradient method](#optim.nag) 22 | * [RMSprop](#optim.rmsprop) 23 | * [Rprop](#optim.rprop) 24 | * [CMAES](#optim.cmaes) 25 | 26 | All these algorithms are designed to support batch optimization as 27 | well as stochastic optimization.
It's up to the user to construct an 28 | objective function that represents the batch, mini-batch, or single sample 29 | on which to evaluate the objective. 30 | 31 | Some of these algorithms support a line search, which can be passed as 32 | a function (L-BFGS), whereas others only support a learning rate (SGD). 33 | 34 | 35 | ## Overview 36 | 37 | This package contains several optimization routines for [Torch](https://github.com/torch/torch7/blob/master/README.md). 38 | Most optimization algorithms have the following interface: 39 | 40 | ```lua 41 | x*, {f}, ... = optim.method(opfunc, x, state) 42 | ``` 43 | 44 | where: 45 | 46 | * `opfunc`: a user-defined closure that respects this API: `f, df/dx = func(x)` 47 | * `x`: the current parameter vector (a 1D `torch.Tensor`) 48 | * `state`: a table of parameters, and state variables, dependent upon the algorithm 49 | * `x*`: the new parameter vector that minimizes `f, x* = argmin_x f(x)` 50 | * `{f}`: a table of all f values, in the order they've been evaluated (for some simple algorithms, like SGD, `#f == 1`) 51 | 52 | 53 | ## Example 54 | 55 | The state table is used to hold the state of the algorithm. 56 | It's usually initialized once, by the user, and then passed to the optim function 57 | as a black box. Example: 58 | 59 | ```lua 60 | state = { 61 | learningRate = 1e-3, 62 | momentum = 0.5 63 | } 64 | 65 | for i,sample in ipairs(training_samples) do 66 | local func = function(x) 67 | -- define eval function 68 | return f,df_dx 69 | end 70 | optim.sgd(func,x,state) 71 | end 72 | ``` 73 | 74 | 75 | ## Algorithms 76 | 77 | Most algorithms provided rely on a unified interface: 78 | ```lua 79 | x_new,fs = optim.method(opfunc, x, state) 80 | ``` 81 | where: 82 | x is the trainable/adjustable parameter vector, 83 | state contains both options for the algorithm and the state of the algorithm, 84 | opfunc is a closure that has the following interface: 85 | ```lua 86 | f,df_dx = opfunc(x) 87 | ``` 88 | x_new is the new parameter vector (after optimization), 89 | fs is a table containing all the values of the objective, as evaluated during 90 | the optimization procedure: fs[1] is the value before optimization, and fs[#fs] 91 | is the most optimized one (the lowest). 92 | 93 | 94 | ### [x] sgd(opfunc, x, state) 95 | 96 | An implementation of Stochastic Gradient Descent (SGD).
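Before the argument listing, here is a minimal, self-contained sketch of a call to `optim.sgd`; the quadratic objective and the option values are purely illustrative and are not package defaults:

```lua
require 'torch'
require 'optim'

-- hypothetical objective: f(x) = 0.5 * ||x - b||^2, so df/dx = x - b
local b = torch.Tensor{1, 2, 3}
local function opfunc(x)
   local diff = x - b
   return 0.5 * diff:dot(diff), diff
end

local x = torch.zeros(3)
local state = {learningRate = 1e-1, momentum = 0.9}   -- illustrative values only

local fs
for i = 1, 100 do
   -- sgd updates x in place; fs[1] is f(x) evaluated before the update
   x, fs = optim.sgd(opfunc, x, state)
end
print(x)   -- should approach b
```

Because the same `state` table is passed on every call, running quantities such as the momentum buffer persist across iterations.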
97 | 98 | Arguments: 99 | 100 | * `opfunc` : a function that takes a single input (`X`), the point of evaluation, and returns `f(X)` and `df/dX` 101 | * `x` : the initial point 102 | * `config` : a table with configuration parameters for the optimizer 103 | * `config.learningRate` : learning rate 104 | * `config.learningRateDecay` : learning rate decay 105 | * `config.weightDecay` : weight decay 106 | * `config.weightDecays` : vector of individual weight decays 107 | * `config.momentum` : momentum 108 | * `config.dampening` : dampening for momentum 109 | * `config.nesterov` : enables Nesterov momentum 110 | * `state` : a table describing the state of the optimizer; after each call the state is modified 111 | * `state.learningRates` : vector of individual learning rates 112 | 113 | Returns : 114 | 115 | * `x` : the new x vector 116 | * `f(x)` : the function, evaluated before the update 117 | 118 | 119 | ### [x] asgd(opfunc, x, state) 120 | 121 | An implementation of Averaged Stochastic Gradient Descent (ASGD): 122 | 123 | ``` 124 | x = (1 - lambda eta_t) x - eta_t df/dx(z,x) 125 | a = a + mu_t [ x - a ] 126 | 127 | eta_t = eta0 / (1 + lambda eta0 t) ^ 0.75 128 | mu_t = 1/max(1,t-t0) 129 | ``` 130 | 131 | Arguments: 132 | 133 | * `opfunc` : a function that takes a single input (`X`), the point of evaluation, and returns `f(X)` and `df/dX` 134 | * `x` : the initial point 135 | * `state` : a table describing the state of the optimizer; after each call the state is modified 136 | * `state.eta0` : learning rate 137 | * `state.lambda` : decay term 138 | * `state.alpha` : power for eta update 139 | * `state.t0` : point at which to start averaging 140 | 141 | Returns: 142 | 143 | * `x` : the new x vector 144 | * `f(x)` : the function, evaluated before the update 145 | * `ax` : the averaged x vector 146 | 147 | 148 | 149 | ### [x] lbfgs(opfunc, x, state) 150 | 151 | An implementation of L-BFGS that relies on a user-provided line 152 | search function (`state.lineSearch`). If this function is not 153 | provided, then a simple learningRate is used to produce fixed 154 | size steps. Fixed size steps are much less costly than line 155 | searches, and can be useful for stochastic problems. 156 | 157 | The learning rate is used even when a line search is provided. 158 | This is also useful for large-scale stochastic problems, where 159 | opfunc is a noisy approximation of `f(x)`. In that case, the learning 160 | rate allows a reduction of confidence in the step size.
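In batch mode a single call runs the whole optimization. Below is a minimal sketch on a hypothetical smooth objective; it assumes `optim.lswolfe` (the Wolfe line search shipped with this package) as the line search function:

```lua
require 'torch'
require 'optim'

-- hypothetical smooth objective: f(x) = (x1-1)^2 + 10*(x2-2)^2
local function opfunc(x)
   local f = (x[1] - 1)^2 + 10 * (x[2] - 2)^2
   local df_dx = torch.Tensor{2 * (x[1] - 1), 20 * (x[2] - 2)}
   return f, df_dx
end

local x = torch.zeros(2)
local state = {
   maxIter = 100,               -- cap on L-BFGS iterations
   lineSearch = optim.lswolfe   -- assumed Wolfe line search; omit to use fixed learningRate steps
}

local xstar, fs = optim.lbfgs(opfunc, x, state)
print(xstar)            -- close to (1, 2)
print(fs[1], fs[#fs])   -- objective before and after optimization
```

Dropping the `lineSearch` entry falls back to the fixed `learningRate` steps described above.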
161 | 162 | Arguments : 163 | 164 | * `opfunc` : a function that takes a single input (`X`), the point of evaluation, and returns `f(X)` and `df/dX` 165 | * `x` : the initial point 166 | * `state` : a table describing the state of the optimizer; after each call the state is modified 167 | * `state.maxIter` : Maximum number of iterations allowed 168 | * `state.maxEval` : Maximum number of function evaluations 169 | * `state.tolFun` : Termination tolerance on the first-order optimality 170 | * `state.tolX` : Termination tol on progress in terms of func/param changes 171 | * `state.lineSearch` : A line search function 172 | * `state.learningRate` : If no line search provided, then a fixed step size is used 173 | 174 | Returns : 175 | * `x*` : the new `x` vector, at the optimal point 176 | * `f` : a table of all function values: 177 | * `f[1]` is the value of the function before any optimization and 178 | * `f[#f]` is the final fully optimized value, at `x*` 179 | 180 | 181 | 182 | ### [x] cg(opfunc, x, state) 183 | 184 | An implementation of the Conjugate Gradient method which is a rewrite of 185 | `minimize.m` written by Carl E. Rasmussen. 186 | It is supposed to produce exactly the same results (give 187 | or take numerical accuracy due to some changed order of 188 | operations). You can compare the result on rosenbrock with 189 | [minimize.m](http://www.gatsby.ucl.ac.uk/~edward/code/minimize/example.html). 190 | ``` 191 | [x fx c] = minimize([0 0]', 'rosenbrock', -25) 192 | ``` 193 | 194 | Note that we only limit the number of function evaluations, as that seems much 195 | more important in practical use. 196 | 197 | Arguments : 198 | 199 | * `opfunc` : a function that takes a single input, the point of evaluation. 200 | * `x` : the initial point 201 | * `state` : a table of parameters and temporary allocations. 202 | * `state.maxEval` : max number of function evaluations 203 | * `state.maxIter` : max number of iterations 204 | * `state.df[0,1,2,3]` : if you pass torch.Tensor they will be used for temp storage 205 | * `state.[s,x0]` : if you pass torch.Tensor they will be used for temp storage 206 | 207 | Returns : 208 | 209 | * `x*` : the new x vector, at the optimal point 210 | * `f` : a table of all function values where 211 | * `f[1]` is the value of the function before any optimization and 212 | * `f[#f]` is the final fully optimized value, at x* 213 | 214 |
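The same kind of comparison can be run from Lua. A minimal sketch on a hypothetical quadratic (a `rosenbrock` closure would be called the same way):

```lua
require 'torch'
require 'optim'

-- hypothetical smooth objective: f(x) = x1^2 + 4*x2^2 - 2*x1*x2
local function opfunc(x)
   local f = x[1]^2 + 4 * x[2]^2 - 2 * x[1] * x[2]
   local df_dx = torch.Tensor{2 * x[1] - 2 * x[2], 8 * x[2] - 2 * x[1]}
   return f, df_dx
end

local x = torch.Tensor{1, 1}
-- as with minimize.m, the main budget is the number of function evaluations
local xstar, fs = optim.cg(opfunc, x, {maxEval = 25})
print(xstar)            -- close to (0, 0)
print(fs[1], fs[#fs])   -- objective before and after optimization
```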
215 | ### [x] adadelta(opfunc, x, config, state) 216 | ADADELTA implementation for SGD http://arxiv.org/abs/1212.5701 217 | 218 | Arguments : 219 | 220 | * `opfunc` : a function that takes a single input (X), the point of evaluation, and returns f(X) and df/dX 221 | * `x` : the initial point 222 | * `config` : a table of hyper-parameters 223 | * `config.rho` : interpolation parameter 224 | * `config.eps` : for numerical stability 225 | * `state` : a table describing the state of the optimizer; after each call the state is modified 226 | * `state.paramVariance` : vector of temporal variances of parameters 227 | * `state.accDelta` : vector of accumulated delta of gradients 228 | 229 | Returns : 230 | 231 | * `x` : the new x vector 232 | * `f(x)` : the function, evaluated before the update 233 | 234 | 235 | ### [x] adagrad(opfunc, x, config, state) 236 | AdaGrad implementation for SGD 237 | 238 | Arguments : 239 | 240 | * `opfunc` : a function that takes a single input (X), the point of evaluation, and returns f(X) and df/dX 241 | * `x` : the initial point 242 | * `state` : a table describing the state of the optimizer; after each call the state is modified 243 | * `state.learningRate` : learning rate 244 | * `state.paramVariance` : vector of temporal variances of parameters 245 | 246 | Returns : 247 | 248 | * `x` : the new x vector 249 | * `f(x)` : the function, evaluated before the update 250 | 251 | 252 | ### [x] adam(opfunc, x, config, state) 253 | An implementation of Adam from http://arxiv.org/pdf/1412.6980.pdf 254 | 255 | Arguments : 256 | 257 | * `opfunc` : a function that takes a single input (X), the point of evaluation, and returns f(X) and df/dX 258 | * `x` : the initial point 259 | * `config` : a table with configuration parameters for the optimizer 260 | * `config.learningRate` : learning rate 261 | * `config.beta1` : first moment coefficient 262 | * `config.beta2` : second moment coefficient 263 | * `config.epsilon` : for numerical stability 264 | * `state` : a table describing the state of the optimizer; after each call the state is modified 265 | 266 | Returns : 267 | 268 | * `x` : the new x vector 269 | * `f(x)` : the function, evaluated before the update 270 | 271 | 272 | ### [x] adamax(opfunc, x, config, state) 273 | An implementation of AdaMax http://arxiv.org/pdf/1412.6980.pdf 274 | 275 | Arguments : 276 | 277 | * `opfunc` : a function that takes a single input (X), the point of evaluation, and returns f(X) and df/dX 278 | * `x` : the initial point 279 | * `config` : a table with configuration parameters for the optimizer 280 | * `config.learningRate` : learning rate 281 | * `config.beta1` : first moment coefficient 282 | * `config.beta2` : second moment coefficient 283 | * `config.epsilon` : for numerical stability 284 | * `state` : a table describing the state of the optimizer; after each call the state is modified. 285 | 286 | Returns : 287 | 288 | * `x` : the new x vector 289 | * `f(x)` : the function, evaluated before the update 290 | 291 |
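Adam and AdaMax share the same calling pattern: hyper-parameters go in `config`, while `state` accumulates the moment estimates across calls. A minimal sketch with `optim.adam` on a hypothetical quadratic (the values are illustrative, not defaults):

```lua
require 'torch'
require 'optim'

local b = torch.Tensor{0.5, -1.5}
local function opfunc(x)               -- hypothetical objective: 0.5 * ||x - b||^2
   return 0.5 * (x - b):dot(x - b), x - b
end

local x = torch.zeros(2)
local config = {learningRate = 0.01}   -- beta1/beta2/epsilon left at their defaults
local state  = {}                      -- first/second moment estimates persist here

for i = 1, 2000 do
   x = optim.adam(opfunc, x, config, state)
end
print(x)   -- should approach b
```

Swapping `optim.adam` for `optim.adamax` in this loop requires no other change to the call.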
292 | ### [x] FistaLS(f, g, pl, xinit, params) 293 | FISTA with backtracking line search 294 | * `f` : smooth function 295 | * `g` : non-smooth function 296 | * `pl` : minimizer of intermediate problem Q(x,y) 297 | * `xinit` : initial point 298 | * `params` : table of parameters (**optional**) 299 | * `params.L` : 1/(step size) for ISTA/FISTA iteration (0.1) 300 | * `params.Lstep` : step size multiplier at each iteration (1.5) 301 | * `params.maxiter` : max number of iterations (50) 302 | * `params.maxline` : max number of line search iterations per iteration (20) 303 | * `params.errthres`: Error threshold for convergence check (1e-4) 304 | * `params.doFistaUpdate` : true : use FISTA, false: use ISTA (true) 305 | * `params.verbose` : store each iteration solution and print detailed info (false) 306 | 307 | On output, `params` will contain these additional fields that can be reused. 308 | * `params.L` : last used L value will be written. 309 | 310 | These are temporary storages needed by the algorithm, and if the same params object is 311 | passed a second time, these same storages will be used without new allocation. 312 | * `params.xkm` : previous iteration point 313 | * `params.y` : fista iteration 314 | * `params.ply` : ply = pl(y - 1/L grad(f)) 315 | 316 | Returns the solution x and a history of {function evals, number of line searches, ...} 317 | 318 | Algorithm is published in http://epubs.siam.org/doi/abs/10.1137/080716542 319 | 320 | 321 | ### [x] nag(opfunc, x, config, state) 322 | An implementation of SGD adapted with features of Nesterov's 323 | Accelerated Gradient method, based on the paper "On the Importance of Initialization and Momentum in Deep Learning" (Sutskever et al., ICML 2013).
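A minimal sketch of a call (hypothetical objective and illustrative option values; `momentum` is set explicitly since the method is built around the momentum update):

```lua
require 'torch'
require 'optim'

-- hypothetical objective: f(x) = x1^2 + 3*x2^2
local function opfunc(x)
   return x[1]^2 + 3 * x[2]^2, torch.Tensor{2 * x[1], 6 * x[2]}
end

local x = torch.Tensor{2, -2}
local config = {learningRate = 0.01, momentum = 0.9}   -- illustrative values only

for i = 1, 500 do
   x = optim.nag(opfunc, x, config)
end
print(x)   -- should approach (0, 0)
```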
324 | 325 | Arguments : 326 | 327 | * `opfunc` : a function that takes a single input (X), the point of evaluation, and returns f(X) and df/dX 328 | * `x` : the initial point 329 | * `state` : a table describing the state of the optimizer; after each call the state is modified 330 | * `state.learningRate` : learning rate 331 | * `state.learningRateDecay` : learning rate decay 332 | * `state.weightDecay` : weight decay 333 | * `state.momentum` : momentum 334 | * `state.learningRates` : vector of individual learning rates 335 | 336 | Returns : 337 | 338 | * `x` : the new x vector 339 | * `f(x)` : the function, evaluated before the update 340 | 341 | 342 | ### [x] rmsprop(opfunc, x, config, state) 343 | An implementation of RMSprop 344 | 345 | Arguments : 346 | 347 | * `opfunc` : a function that takes a single input (X), the point of evaluation, and returns f(X) and df/dX 348 | * `x` : the initial point 349 | * `config` : a table with configuration parameters for the optimizer 350 | * `config.learningRate` : learning rate 351 | * `config.alpha` : smoothing constant 352 | * `config.epsilon` : value with which to initialise m 353 | * `state` : a table describing the state of the optimizer; after each call the state is modified 354 | * `state.m` : leaky sum of squares of parameter gradients, 355 | * `state.tmp` : and the square root (with epsilon smoothing) 356 | 357 | Returns : 358 | 359 | * `x` : the new x vector 360 | * `f(x)` : the function, evaluated before the update 361 | 362 | 363 | ### [x] rprop(opfunc, x, config, state) 364 | A plain implementation of Rprop 365 | (Martin Riedmiller, Koray Kavukcuoglu 2013) 366 | 367 | Arguments : 368 | 369 | * `opfunc` : a function that takes a single input (X), the point of evaluation, and returns f(X) and df/dX 370 | * `x` : the initial point 371 | * `state` : a table describing the state of the optimizer; after each call the state is modified 372 | * `state.stepsize` : initial step size, common to all components 373 | * `state.etaplus` : multiplicative increase factor, > 1 (default 1.2) 374 | * `state.etaminus` : multiplicative decrease factor, < 1 (default 0.5) 375 | * `state.stepsizemax` : maximum stepsize allowed (default 50) 376 | * `state.stepsizemin` : minimum stepsize allowed (default 1e-6) 377 | * `state.niter` : number of iterations (default 1) 378 | 379 | Returns : 380 | 381 | * `x` : the new x vector 382 | * `f(x)` : the function, evaluated before the update 383 | 384 | 385 | 386 | 387 | 388 | ### [x] cmaes(opfunc, x, config, state) 389 | An implementation of `CMAES` (Covariance Matrix Adaptation Evolution Strategy), 390 | ported from https://www.lri.fr/~hansen/barecmaes2.html. 391 | 392 | CMAES is a stochastic, derivative-free method for heuristic global optimization of non-linear or non-convex continuous optimization problems. Note that this method will on average take many more function evaluations to converge than a gradient-based method. 393 | 394 | Arguments: 395 | 396 | * `opfunc` : a function that takes a single input (X), the point of evaluation, and returns f(X) and df/dX. Note that df/dX is not used and can be left 0 397 | * `x` : the initial point 398 | * `state.sigma` : float, initial step-size (standard deviation in each coordinate) 399 | * `state.maxEval` : int, maximal number of function evaluations 400 | * `state.ftarget` : float, target function value 401 | * `state.popsize` : population size.
If this is left empty, 4 + int(3 * log(|x|)) will be used 402 | * `state.ftarget` : stop if fitness < ftarget 403 | * `state.verb_disp` : display info on console every verb_disp iteration, 0 for never 404 | 405 | Returns: 406 | * `x*` : the new `x` vector, at the optimal point 407 | * `f` : a table of all function values: 408 | * `f[1]` is the value of the function before any optimization and 409 | * `f[#f]` is the final fully optimized value, at `x*` 410 | --------------------------------------------------------------------------------