├── .gitignore
├── doc
│   ├── logger_plot.png
│   ├── image
│   │   ├── parameterflattening.png
│   │   ├── parameterflattening.svg.png
│   │   └── parameterflattening.svg
│   ├── logger.md
│   ├── intro.md
│   └── algos.md
├── .dokx
├── mkdocs.yml
├── README.md
├── test
│   ├── test_cg.lua
│   ├── test_adam.lua
│   ├── test_sgd.lua
│   ├── test_lbfgs_w_ls.lua
│   ├── test_adagrad.lua
│   ├── test_rmsprop.lua
│   ├── test_adamax.lua
│   ├── test_adadelta.lua
│   ├── test_cmaes.lua
│   ├── test_de.lua
│   ├── test_logger.lua
│   ├── l2.lua
│   ├── test_lbfgs.lua
│   ├── test_confusion.lua
│   ├── rosenbrock.lua
│   ├── test_fista.lua
│   └── sparsecoding.lua
├── init.lua
├── CMakeLists.txt
├── optim-1.0.5-0.rockspec
├── optim-1.0.4-0.rockspec
├── optim-1.0.3-0.rockspec
├── optim-1.0.3-1.rockspec
├── checkgrad.lua
├── adagrad.lua
├── COPYRIGHT.txt
├── rmsprop.lua
├── adadelta.lua
├── asgd.lua
├── adamax.lua
├── adam.lua
├── nag.lua
├── sgd.lua
├── de.lua
├── rprop.lua
├── Logger.lua
├── cg.lua
├── lswolfe.lua
├── polyinterp.lua
├── fista.lua
├── lbfgs.lua
├── cmaes.lua
└── ConfusionMatrix.lua
/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 |
--------------------------------------------------------------------------------
/doc/logger_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/torch/optim/HEAD/doc/logger_plot.png
--------------------------------------------------------------------------------
/.dokx:
--------------------------------------------------------------------------------
1 | return {
2 | githubURL = "torch/optim",
3 | exclude = {"test", "polyinterp.lua"}
4 | }
5 |
--------------------------------------------------------------------------------
/doc/image/parameterflattening.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/torch/optim/HEAD/doc/image/parameterflattening.png
--------------------------------------------------------------------------------
/doc/image/parameterflattening.svg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/torch/optim/HEAD/doc/image/parameterflattening.svg.png
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: optim
2 | theme : simplex
3 | repo_url : https://github.com/torch/optim
4 | use_directory_urls : false
5 | markdown_extensions: [extra]
6 | docs_dir : doc
7 | pages:
8 | - [index.md, Optim]
9 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Optimization package
3 |
4 | This package contains several optimization routines and a logger for [Torch](https://github.com/torch/torch7/blob/master/README.md):
5 |
6 | * [Overview](doc/intro.md);
7 | * [Optimization algorithms](doc/algos.md);
8 | * [Logger](doc/logger.md).
9 |
--------------------------------------------------------------------------------
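
For orientation, the routines documented above all share the same calling convention: an `opfunc` returning `f(X), df/dX`, a parameter tensor, a `config` table of hyper-parameters and an optional `state` table. A minimal sketch follows; the quadratic `feval` and the hyper-parameter values are illustrative only and not part of the package.

```lua
require 'torch'
require 'optim'

-- feval: takes the current parameter vector and returns f(x) and df/dx
local function feval(x)
   local fx = 0.5 * x:dot(x)   -- f(x) = 1/2 ||x||^2
   local dfdx = x:clone()      -- df/dx = x
   return fx, dfdx
end

local x = torch.randn(10)             -- initial point
local config = {learningRate = 1e-2}  -- hyper-parameters
local state = {}                      -- optimizer state, kept across calls

for i = 1, 100 do
   -- one call = one update; returns the new x and {f(x)} evaluated before the update
   x, fx = optim.sgd(feval, x, config, state)
end
print('final f(x) =', fx[1])
```
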
/test/test_cg.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'optim'
3 |
4 | require 'rosenbrock'
5 | require 'l2'
6 |
7 |
8 | x = torch.Tensor(2):fill(0)
9 | x,fx,i=optim.cg(rosenbrock,x,{maxIter=50})
10 |
11 | print()
12 | print('Rosenbrock test: compare with http://www.gatsby.ucl.ac.uk/~edward/code/minimize/example.html')
13 | print()
14 | print('Number of function evals = ',i)
15 | print('x=');print(x)
16 | print('fx=')
17 | for i=1,#fx do print(i,fx[i]); end
18 |
--------------------------------------------------------------------------------
/test/test_adam.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'optim'
3 | require 'rosenbrock'
4 | require 'l2'
5 | x = torch.Tensor(2):fill(0)
6 | fx = {}
7 | config = {learningRate=0.002}
8 | for i = 1,10001 do
9 | x,f=optim.adam(rosenbrock,x,config)
10 | if (i-1)%1000 == 0 then
11 | table.insert(fx,f[1])
12 | end
13 | end
14 | print()
15 | print('Rosenbrock test')
16 | print()
17 | print('x=');print(x)
18 | print('fx=')
19 | for i=1,#fx do print((i-1)*1000+1,fx[i]); end
20 |
--------------------------------------------------------------------------------
/test/test_sgd.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'optim'
3 |
4 | require 'rosenbrock'
5 | require 'l2'
6 |
7 | x = torch.Tensor(2):fill(0)
8 | fx = {}
9 |
10 | config = {learningRate=1e-3}
11 | for i = 1,10001 do
12 | x,f=optim.sgd(rosenbrock,x,config)
13 | if (i-1)%1000 == 0 then
14 | table.insert(fx,f[1])
15 | end
16 | end
17 |
18 | print()
19 | print('Rosenbrock test')
20 | print()
21 | print('x=');print(x)
22 | print('fx=')
23 | for i=1,#fx do print((i-1)*1000+1,fx[i]); end
24 |
--------------------------------------------------------------------------------
/test/test_lbfgs_w_ls.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'optim'
3 |
4 | require 'rosenbrock'
5 | require 'l2'
6 |
7 | print('--- batch test w/ line search ---')
8 |
9 | x = torch.Tensor(2):fill(0)
10 | x,fx,i=optim.lbfgs(rosenbrock,x,{maxIter=100, lineSearch=optim.lswolfe})
11 |
12 | print()
13 | print('Rosenbrock test')
14 | print()
15 | print('Number of function evals = ',i)
16 | print('x=');print(x)
17 | print('fx=')
18 | for i=1,#fx do print(i,fx[i]); end
19 | print()
20 | print()
21 |
--------------------------------------------------------------------------------
/test/test_adagrad.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'optim'
3 |
4 | require 'rosenbrock'
5 | require 'l2'
6 |
7 | x = torch.Tensor(2):fill(0)
8 | fx = {}
9 |
10 | config = {learningRate=1e-1}
11 | for i = 1,10001 do
12 | x,f=optim.adagrad(rosenbrock,x,config)
13 | if (i-1)%1000 == 0 then
14 | table.insert(fx,f[1])
15 | end
16 | end
17 |
18 | print()
19 | print('Rosenbrock test')
20 | print()
21 | print('x=');print(x)
22 | print('fx=')
23 | for i=1,#fx do print((i-1)*1000+1,fx[i]); end
24 |
--------------------------------------------------------------------------------
/test/test_rmsprop.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'optim'
3 |
4 | require 'rosenbrock'
5 | require 'l2'
6 |
7 | x = torch.Tensor(2):fill(0)
8 | fx = {}
9 |
10 | config = {learningRate=5e-4}
11 | for i = 1,10001 do
12 | x,f=optim.rmsprop(rosenbrock,x,config)
13 | if (i-1)%1000 == 0 then
14 | table.insert(fx,f[1])
15 | end
16 | end
17 |
18 | print()
19 | print('Rosenbrock test')
20 | print()
21 | print('x=');print(x)
22 | print('fx=')
23 | for i=1,#fx do print((i-1)*1000+1,fx[i]); end
24 |
--------------------------------------------------------------------------------
/test/test_adamax.lua:
--------------------------------------------------------------------------------
1 |
2 | require 'torch'
3 | require 'optim'
4 | require 'rosenbrock'
5 | require 'l2'
6 |
7 | x = torch.Tensor(2):fill(0)
8 | fx = {}
9 | state = {}
10 | config = {}
11 | for i = 1,10001 do
12 | x,f=optim.adamax(rosenbrock,x,config,state)
13 | if (i-1)%1000 == 0 then
14 | table.insert(fx,f[1])
15 | end
16 | end
17 |
18 | print()
19 | print('Rosenbrock test')
20 | print()
21 | print('x=');print(x)
22 | print('fx=')
23 | for i=1,#fx do print((i-1)*1000+1,fx[i]); end
24 |
--------------------------------------------------------------------------------
/test/test_adadelta.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'optim'
3 |
4 | require 'rosenbrock'
5 | require 'l2'
6 |
7 | x = torch.Tensor(2):fill(0)
8 | fx = {}
9 | state = {}
10 | config = {eps=1e-10}
11 | for i = 1,10001 do
12 | x,f=optim.adadelta(rosenbrock,x,config,state)
13 | if (i-1)%1000 == 0 then
14 | table.insert(fx,f[1])
15 | end
16 | end
17 |
18 | print()
19 | print('Rosenbrock test')
20 | print()
21 | print('x=');print(x)
22 | print('fx=')
23 | for i=1,#fx do print((i-1)*1000+1,fx[i]); end
24 |
--------------------------------------------------------------------------------
/test/test_cmaes.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'optim'
3 |
4 | require 'rosenbrock'
5 | require 'l2'
6 |
7 | -- 10-D rosenbrock
8 | x = torch.Tensor(10):fill(0)
9 | config = {maxEval=10000, sigma=0.5, verb_disp=0}
10 |
11 | -- will take some time
12 | x,fx,i=optim.cmaes(rosenbrock,x,config)
13 |
14 |
15 | print('Rosenbrock test')
16 | print()
17 | -- approx 6500 function evals expected
18 | print('Number of function evals = ',i)
19 | print('x=');print(x)
20 | print('fx=')
21 | for i=1,#fx do print(i,fx[i]); end
22 | print()
23 | print()
--------------------------------------------------------------------------------
/test/test_de.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'optim'
3 |
4 | require 'rosenbrock'
5 | require 'l2'
6 |
7 |
8 | -- 2-D rosenbrock
9 | x = torch.Tensor(2):fill(0)
10 | config = {popsize=50, scaleFactor=0.5, crossoverRate=0.9, maxFEs=3000}
11 |
12 | -- will take some time
13 | x,fx=optim.de(rosenbrock,x,config)
14 |
15 |
16 | print('Rosenbrock test')
17 | print()
18 | -- config.maxFEs function evals are performed
19 | print('Number of function evals = ', config.maxFEs)
20 | print('x=');print(x)
21 | print('fx=')
22 | for i=1,config.maxFEs do print(i,fx[i]); end
23 | print()
24 | print()
25 |
--------------------------------------------------------------------------------
/test/test_logger.lua:
--------------------------------------------------------------------------------
1 | require 'optim'
2 |
3 |
4 | logger_former = optim.Logger('accuracy-former.log')
5 | logger_new = optim.Logger('accuracy-new.log')
6 |
7 | logger_new:setNames({'channel 1', 'channel 2', 'channel 3'})
8 |
9 | for i = 1, 20 do
10 | logger_former:add({['channel 1'] = 1 , ['channel 2'] = 0.1 * i, ['channel 3'] = 1 - 0.2 * i})
11 | logger_new:add({1 , 0.1 * i, 1 - 0.2 * i})
12 | end
13 |
14 | logger_former:style({['channel 1'] = '-' , ['channel 2'] = '-', ['channel 3'] = '-'})
15 | logger_new:style{'-', '-', '-'}
16 |
17 | logger_former:plot()
18 | logger_new:plot()
19 |
20 |
21 |
--------------------------------------------------------------------------------
/test/l2.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | -- l2.lua This function returns the function value and partial derivatives
3 | -- of the simple (general dimension) quadratic function, given by:
4 | --
5 | -- f(x) = sum_{i=1:D} x(i)^2
6 | --
7 | -- where D is the dimension of x. The true minimum is 0 at x = (0 0 ... 0).
8 | --
9 | -- Used as a simple sanity-check objective for the optimizers.
10 |
11 | function l2(x)
12 |
13 | local xx = x:clone()
14 | xx:cmul(xx)
15 | local fout = xx:sum()
16 |
17 | local dx = torch.Tensor():resizeAs(x):copy(x)
18 | dx:mul(2)
19 | --print('l2 eval = ', fout)
20 | return fout,dx
21 |
22 | end
--------------------------------------------------------------------------------
/init.lua:
--------------------------------------------------------------------------------
1 |
2 | require 'torch'
3 |
4 | optim = {}
5 |
6 | -- optimizations
7 | require('optim.sgd')
8 | require('optim.cg')
9 | require('optim.asgd')
10 | require('optim.nag')
11 | require('optim.fista')
12 | require('optim.lbfgs')
13 | require('optim.adagrad')
14 | require('optim.rprop')
15 | require('optim.adam')
16 | require('optim.adamax')
17 | require('optim.rmsprop')
18 | require('optim.adadelta')
19 | require('optim.cmaes')
20 | require('optim.de')
21 |
22 | -- line search functions
23 | require('optim.lswolfe')
24 |
25 | -- helpers
26 | require('optim.polyinterp')
27 | require('optim.checkgrad')
28 |
29 | -- tools
30 | require('optim.ConfusionMatrix')
31 | require('optim.Logger')
32 |
33 | return optim
34 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 |
2 | CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR)
3 | CMAKE_POLICY(VERSION 2.6)
4 | IF(LUAROCKS_PREFIX)
5 | MESSAGE(STATUS "Installing Torch through Luarocks")
6 | STRING(REGEX REPLACE "(.*)lib/luarocks/rocks.*" "\\1" CMAKE_INSTALL_PREFIX "${LUAROCKS_PREFIX}")
7 | MESSAGE(STATUS "Prefix inferred from Luarocks: ${CMAKE_INSTALL_PREFIX}")
8 | ENDIF()
9 | FIND_PACKAGE(Torch REQUIRED)
10 |
11 | SET(src)
12 | FILE(GLOB luasrc *.lua)
13 | ADD_TORCH_PACKAGE(optim "${src}" "${luasrc}")
14 | #ADD_TORCH_DOK(dok optim "Machine Learning" "Optimization" 3.2)
15 |
16 | INSTALL(DIRECTORY "doc" DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/optim")
17 | INSTALL(FILES "README.md" DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/optim")
18 |
--------------------------------------------------------------------------------
/optim-1.0.5-0.rockspec:
--------------------------------------------------------------------------------
1 | package = "optim"
2 | version = "1.0.5-0"
3 |
4 | source = {
5 | url = "git://github.com/torch/optim",
6 | }
7 |
8 | description = {
9 | summary = "An optimization library for Torch.",
10 | detailed = [[
11 | This package contains several optimization routines for Torch.
12 | ]],
13 | homepage = "https://github.com/torch/optim",
14 | license = "BSD"
15 | }
16 |
17 | dependencies = {
18 | "torch >= 7.0",
19 | }
20 |
21 | build = {
22 | type = "command",
23 | build_command = [[
24 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE)
25 | ]],
26 | install_command = "cd build && $(MAKE) install"
27 | }
28 |
--------------------------------------------------------------------------------
/optim-1.0.4-0.rockspec:
--------------------------------------------------------------------------------
1 | package = "optim"
2 | version = "1.0.4-0"
3 |
4 | source = {
5 | url = "git://github.com/torch/optim",
6 | tag = "1.0.4-0"
7 | }
8 |
9 | description = {
10 | summary = "An optimization library for Torch.",
11 | detailed = [[
12 | This package contains several optimization routines for Torch.
13 | ]],
14 | homepage = "https://github.com/torch/optim",
15 | license = "BSD"
16 | }
17 |
18 | dependencies = {
19 | "torch >= 7.0",
20 | }
21 |
22 | build = {
23 | type = "command",
24 | build_command = [[
25 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE)
26 | ]],
27 | install_command = "cd build && $(MAKE) install"
28 | }
29 |
--------------------------------------------------------------------------------
/optim-1.0.3-0.rockspec:
--------------------------------------------------------------------------------
1 | package = "optim"
2 | version = "1.0.3-0"
3 |
4 | source = {
5 | url = "git://github.com/torch/optim",
6 | tag = "1.0.3-0"
7 | }
8 |
9 | description = {
10 | summary = "An optimization library for Torch.",
11 | detailed = [[
12 | This package contains several optimization routines for Torch.
13 | ]],
14 | homepage = "https://github.com/torch/optim",
15 | license = "BSD"
16 | }
17 |
18 | dependencies = {
19 | "torch >= 7.0",
20 | "sys >= 1.0",
21 | }
22 |
23 | build = {
24 | type = "command",
25 | build_command = [[
26 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE)
27 | ]],
28 | install_command = "cd build && $(MAKE) install"
29 | }
30 |
--------------------------------------------------------------------------------
/optim-1.0.3-1.rockspec:
--------------------------------------------------------------------------------
1 | package = "optim"
2 | version = "1.0.3-1"
3 |
4 | source = {
5 | url = "git://github.com/torch/optim",
6 | tag = "1.0.3-1"
7 | }
8 |
9 | description = {
10 | summary = "An optimization library for Torch.",
11 | detailed = [[
12 | This package contains several optimization routines for Torch.
13 | ]],
14 | homepage = "https://github.com/torch/optim",
15 | license = "BSD"
16 | }
17 |
18 | dependencies = {
19 | "torch >= 7.0",
20 | "sys >= 1.0",
21 | }
22 |
23 | build = {
24 | type = "command",
25 | build_command = [[
26 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE)
27 | ]],
28 | install_command = "cd build && $(MAKE) install"
29 | }
30 |
--------------------------------------------------------------------------------
/test/test_lbfgs.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'optim'
3 |
4 | require 'rosenbrock'
5 | require 'l2'
6 |
7 | print('--- regular batch test ---')
8 |
9 | x = torch.Tensor(2):fill(0)
10 | x,fx,i=optim.lbfgs(rosenbrock,x,{maxIter=100, learningRate=1e-1})
11 |
12 | print()
13 | print('Rosenbrock test')
14 | print()
15 | print('Number of function evals = ',i)
16 | print('x=');print(x)
17 | print('fx=')
18 | for i=1,#fx do print(i,fx[i]); end
19 | print()
20 | print()
21 |
22 | print('--- stochastic test ---')
23 |
24 | x = torch.Tensor(2):fill(0)
25 | fx = {}
26 | config = {learningRate=1e-1, maxIter=1}
27 | for i = 1,100 do
28 | x,f=optim.lbfgs(rosenbrock,x,config)
29 | table.insert(fx,f[1])
30 | end
31 |
32 | print()
33 | print('Rosenbrock test')
34 | print()
35 | print('Number of function evals = ',i)
36 | print('x=');print(x)
37 | print('fx=')
38 | for i=1,#fx do print(i,fx[i]); end
39 |
--------------------------------------------------------------------------------
/test/test_confusion.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'optim'
3 |
4 | n_feature = 3
5 | classes = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
6 |
7 | print'ConfusionMatrix:__init() test'
8 | cm = optim.ConfusionMatrix(#classes, classes)
9 |
10 | target = 3
11 | prediction = torch.randn(#classes)
12 |
13 | print'ConfusionMatrix:add() test'
14 | cm:add(prediction, target)
15 | cm:add(prediction, torch.randn(#classes))
16 |
17 | batch_size = 8
18 |
19 | targets = torch.randperm(batch_size)
20 | predictions = torch.randn(batch_size, #classes)
21 |
22 | print'ConfusionMatrix:batchAdd() test'
23 | cm:batchAdd(predictions, targets)
24 | assert(cm.mat:sum() == batch_size + 2, 'missing examples')
25 |
26 | print'ConfusionMatrix:updateValids() test'
27 | cm:updateValids()
28 |
29 | print'ConfusionMatrix:__tostring__() test'
30 | print(cm)
31 |
32 | target = 0
33 | cm:add(prediction, target)
34 | assert(cm.mat:sum() == batch_size + 2, 'too many examples')
35 |
36 | -- FAR/FRR testing on identity matrix. FRR/FAR should be zero for identity.
37 | cm.mat = torch.eye(#classes, #classes)
38 | classFrrs, classFars, frrs, fars = cm:farFrr()
39 | assert(classFrrs:sum() + classFars:sum() == 0, "Incorrect values")
40 |
--------------------------------------------------------------------------------
/checkgrad.lua:
--------------------------------------------------------------------------------
1 | --[[ An implementation of a simple numerical gradient checker.
2 |
3 | ARGS:
4 |
5 | - `opfunc` : a function that takes a single input (X), the point of
6 | evaluation, and returns f(X) and df/dX
7 | - `x` : the initial point
8 | - `eps` : the epsilon to use for the numerical check (default is 1e-7)
9 |
10 | RETURN:
11 |
12 | - `diff` : relative error in the gradient; should be close to zero
13 | - `dC` : exact gradient at point
14 | - `dC_est` : numerically estimated gradient at point
15 |
16 | ]]--
17 |
18 |
19 | -- function that numerically checks the gradient of a loss function:
20 | function optim.checkgrad(opfunc, x, eps)
21 |
22 | -- compute true gradient:
23 | local Corg,dC = opfunc(x)
24 | dC:resize(x:size())
25 |
26 | local Ctmp -- temporary value
27 | local isTensor = torch.isTensor(Corg)
28 | if isTensor then
29 | Ctmp = Corg.new(Corg:size())
30 | end
31 |
32 | -- compute numeric approximations to gradient:
33 | local eps = eps or 1e-7
34 | local dC_est = torch.Tensor():typeAs(dC):resizeAs(dC)
35 | for i = 1,dC:size(1) do
36 | local tmp = x[i]
37 | x[i] = x[i] + eps
38 | local C1 = opfunc(x)
39 | if isTensor then
40 | Ctmp:copy(C1)
41 | C1 = Ctmp
42 | end
43 | x[i] = x[i] - 2 * eps
44 | local C2 = opfunc(x)
45 | x[i] = tmp
46 | dC_est[i] = (C1 - C2) / (2 * eps)
47 | end
48 |
49 | -- estimate error of gradient:
50 | local diff = torch.norm(dC - dC_est) / torch.norm(dC + dC_est)
51 | return diff,dC,dC_est
52 | end
53 |
--------------------------------------------------------------------------------
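
A short sketch of how `optim.checkgrad` can be exercised against the Rosenbrock objective defined in `test/rosenbrock.lua` below (assuming the `test` directory is on the Lua path, as in the test scripts):

```lua
require 'torch'
require 'optim'
require 'rosenbrock'   -- defines rosenbrock(x) -> f(x), df/dx (see test/rosenbrock.lua)

local x = torch.randn(4)
-- diff is the relative error between the analytic and numeric gradients
local diff, dC, dC_est = optim.checkgrad(rosenbrock, x, 1e-7)
print('relative gradient error =', diff)
assert(diff < 1e-4, 'gradient check failed')
```
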
/test/rosenbrock.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | -- rosenbrock.m This function returns the function value, partial derivatives
3 | -- and Hessian of the (general dimension) rosenbrock function, given by:
4 | --
5 | -- f(x) = sum_{i=1:D-1} 100*(x(i+1) - x(i)^2)^2 + (1-x(i))^2
6 | --
7 | -- where D is the dimension of x. The true minimum is 0 at x = (1 1 ... 1).
8 | --
9 | -- Carl Edward Rasmussen, 2001-07-21.
10 |
11 | function rosenbrock(x)
12 |
13 | -- (1) compute f(x)
14 | local d = x:size(1)
15 | -- x1 = x(i)^2
16 | local x1 = x.new(d-1):copy(x:narrow(1,1,d-1))
17 | -- x(i+1) - x(i)^2
18 | x1:cmul(x1):mul(-1):add(x:narrow(1,2,d-1))
19 |
20 | -- 100*(x(i+1) - x(i)^2)^2
21 | x1:cmul(x1):mul(100)
22 |
23 | -- x(i)
24 | local x0 = x.new(d-1):copy(x:narrow(1,1,d-1))
25 | -- 1-x(i)
26 | x0:mul(-1):add(1)
27 | -- (1-x(i))^2
28 | x0:cmul(x0)
29 | -- 100*(x(i+1) - x(i)^2)^2 + (1-x(i))^2
30 | x1:add(x0)
31 | local fout = x1:sum()
32 |
33 | -- (2) compute df(x)/dx
34 | local dxout = x.new():resizeAs(x):zero()
35 | -- df(1:D-1) = - 400*x(1:D-1).*(x(2:D)-x(1:D-1).^2) - 2*(1-x(1:D-1));
36 |
37 | x1:copy(x:narrow(1,1,d-1))
38 | x1:cmul(x1):mul(-1):add(x:narrow(1,2,d-1)):cmul(x:narrow(1,1,d-1)):mul(-400)
39 | x0:copy(x:narrow(1,1,d-1)):mul(-1):add(1):mul(-2)
40 | x1:add(x0)
41 | dxout:narrow(1,1,d-1):copy(x1)
42 |
43 | -- df(2:D) = df(2:D) + 200*(x(2:D)-x(1:D-1).^2);
44 | x0:copy(x:narrow(1,1,d-1))
45 | x0:cmul(x0):mul(-1):add(x:narrow(1,2,d-1)):mul(200)
46 | dxout:narrow(1,2,d-1):add(x0)
47 |
48 | return fout,dxout
49 |
50 | end
51 |
--------------------------------------------------------------------------------
/adagrad.lua:
--------------------------------------------------------------------------------
1 | --[[ ADAGRAD implementation for SGD
2 |
3 | ARGS:
4 | - `opfunc` : a function that takes a single input (X), the point of
5 | evaluation, and returns f(X) and df/dX
6 | - `x` : the initial point
7 | - `config` : a table with hyper-parameters for the optimizer:
8 | `config.learningRate`, `config.learningRateDecay`, `config.weightDecay`
9 | - `state` : a table describing the state of the optimizer; after each
10 | call the state is modified
11 | - `state.paramVariance` : vector of temporal variances of parameters
12 | RETURN:
13 | - `x` : the new x vector
14 | - `f(x)` : the function, evaluated before the update
15 |
16 | ]]
17 | function optim.adagrad(opfunc, x, config, state)
18 | -- (0) get/update state
19 | if config == nil and state == nil then
20 | print('no state table, ADAGRAD initializing')
21 | end
22 | local config = config or {}
23 | local state = state or config
24 | local lr = config.learningRate or 1e-3
25 | local lrd = config.learningRateDecay or 0
26 | local wd = config.weightDecay or 0
27 | state.evalCounter = state.evalCounter or 0
28 | local nevals = state.evalCounter
29 |
30 | -- (1) evaluate f(x) and df/dx
31 | local fx,dfdx = opfunc(x)
32 |
33 | -- (2) weight decay with a single parameter
34 | if wd ~= 0 then
35 | dfdx:add(wd, x)
36 | end
37 |
38 | -- (3) learning rate decay (annealing)
39 | local clr = lr / (1 + nevals*lrd)
40 |
41 | -- (4) parameter update with single or individual learning rates
42 | if not state.paramVariance then
43 | state.paramVariance = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
44 | state.paramStd = torch.Tensor():typeAs(x):resizeAs(dfdx)
45 | end
46 | state.paramVariance:addcmul(1,dfdx,dfdx)
47 | state.paramStd:resizeAs(state.paramVariance):copy(state.paramVariance):sqrt()
48 | x:addcdiv(-clr, dfdx,state.paramStd:add(1e-10))
49 |
50 | -- (5) update evaluation counter
51 | state.evalCounter = state.evalCounter + 1
52 |
53 | -- return x*, f(x) before optimization
54 | return x,{fx}
55 | end
56 |
--------------------------------------------------------------------------------
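
Written out, the update performed by `optim.adagrad` above is, element-wise, with `g_t` the gradient returned by `opfunc`, `v` the running `state.paramVariance`, and `n` the value of `state.evalCounter` before the call:

```latex
\begin{aligned}
\mathrm{clr} &= \frac{lr}{1 + n \cdot lrd} \\
v_t &= v_{t-1} + g_t \odot g_t \\
x_{t+1} &= x_t - \mathrm{clr} \cdot \frac{g_t}{\sqrt{v_t} + 10^{-10}}
\end{aligned}
```
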
/COPYRIGHT.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
2 | Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
3 | Copyright (c) 2011-2013 NYU (Clement Farabet)
4 | Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
5 | Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
6 | Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
7 |
8 | All rights reserved.
9 |
10 | Redistribution and use in source and binary forms, with or without
11 | modification, are permitted provided that the following conditions are met:
12 |
13 | 1. Redistributions of source code must retain the above copyright
14 | notice, this list of conditions and the following disclaimer.
15 |
16 | 2. Redistributions in binary form must reproduce the above copyright
17 | notice, this list of conditions and the following disclaimer in the
18 | documentation and/or other materials provided with the distribution.
19 |
20 | 3. Neither the names of NEC Laboratories American and IDIAP Research
21 | Institute nor the names of its contributors may be used to endorse or
22 | promote products derived from this software without specific prior
23 | written permission.
24 |
25 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
26 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
29 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 | POSSIBILITY OF SUCH DAMAGE.
36 |
--------------------------------------------------------------------------------
/rmsprop.lua:
--------------------------------------------------------------------------------
1 | --[[ An implementation of RMSprop
2 |
3 | ARGS:
4 |
5 | - `opfunc` : a function that takes a single input (X), the point
6 | of evaluation, and returns f(X) and df/dX
7 | - `x` : the initial point
8 | - `config` : a table with configuration parameters for the optimizer
9 | - `config.learningRate` : learning rate
10 | - `config.alpha` : smoothing constant
11 | - `config.epsilon` : value added to the root mean square, for numerical stability
12 | - `config.weightDecay` : weight decay
13 | - `config.initialMean` : value with which to initialise the mean square `m`
14 | - `state` : a table describing the state of the optimizer; after each call the state is modified
15 | - `state.m` : leaky sum of squares of parameter gradients,
16 | - `state.tmp` : and the square root (with epsilon smoothing)
17 |
18 | RETURN:
19 | - `x` : the new x vector
20 | - `f(x)` : the function, evaluated before the update
21 |
22 | ]]
23 |
24 | function optim.rmsprop(opfunc, x, config, state)
25 | -- (0) get/update state
26 | local config = config or {}
27 | local state = state or config
28 | local lr = config.learningRate or 1e-2
29 | local alpha = config.alpha or 0.99
30 | local epsilon = config.epsilon or 1e-8
31 | local wd = config.weightDecay or 0
32 | local mfill = config.initialMean or 0
33 |
34 | -- (1) evaluate f(x) and df/dx
35 | local fx, dfdx = opfunc(x)
36 |
37 | -- (2) weight decay
38 | if wd ~= 0 then
39 | dfdx:add(wd, x)
40 | end
41 |
42 | -- (3) initialize mean square values and square gradient storage
43 | if not state.m then
44 | state.m = torch.Tensor():typeAs(x):resizeAs(dfdx):fill(mfill)
45 | state.tmp = torch.Tensor():typeAs(x):resizeAs(dfdx)
46 | end
47 |
48 | -- (4) calculate new (leaky) mean squared values
49 | state.m:mul(alpha)
50 | state.m:addcmul(1.0-alpha, dfdx, dfdx)
51 |
52 | -- (5) perform update
53 | state.tmp:sqrt(state.m):add(epsilon)
54 | x:addcdiv(-lr, dfdx, state.tmp)
55 |
56 | -- return x*, f(x) before optimization
57 | return x, {fx}
58 | end
59 |
--------------------------------------------------------------------------------
/adadelta.lua:
--------------------------------------------------------------------------------
1 | --[[ ADADELTA implementation for SGD http://arxiv.org/abs/1212.5701
2 |
3 | ARGS:
4 | - `opfunc` : a function that takes a single input (X), the point of
5 | evaluation, and returns f(X) and df/dX
6 | - `x` : the initial point
7 | - `config` : a table of hyper-parameters
8 | - `config.rho` : interpolation parameter
9 | - `config.eps` : for numerical stability
10 | - `config.weightDecay` : weight decay
11 | - `state` : a table describing the state of the optimizer; after each
12 | call the state is modified
13 | - `state.paramVariance` : vector of temporal variances of parameters
14 | - `state.accDelta` : vector of accumulated squared parameter deltas
15 | RETURN:
16 | - `x` : the new x vector
17 | - `f(x)` : the function, evaluated before the update
18 | ]]
19 | function optim.adadelta(opfunc, x, config, state)
20 | -- (0) get/update state
21 | if config == nil and state == nil then
22 | print('no state table, ADADELTA initializing')
23 | end
24 | local config = config or {}
25 | local state = state or config
26 | local rho = config.rho or 0.9
27 | local eps = config.eps or 1e-6
28 | local wd = config.weightDecay or 0
29 | state.evalCounter = state.evalCounter or 0
30 | -- (1) evaluate f(x) and df/dx
31 | local fx,dfdx = opfunc(x)
32 |
33 | -- (2) weight decay
34 | if wd ~= 0 then
35 | dfdx:add(wd, x)
36 | end
37 |
38 | -- (3) parameter update
39 | if not state.paramVariance then
40 | state.paramVariance = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
41 | state.paramStd = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
42 | state.delta = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
43 | state.accDelta = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
44 | end
45 | state.paramVariance:mul(rho):addcmul(1-rho,dfdx,dfdx)
46 | state.paramStd:resizeAs(state.paramVariance):copy(state.paramVariance):add(eps):sqrt()
47 | state.delta:resizeAs(state.paramVariance):copy(state.accDelta):add(eps):sqrt():cdiv(state.paramStd):cmul(dfdx)
48 | x:add(-1, state.delta)
49 | state.accDelta:mul(rho):addcmul(1-rho, state.delta, state.delta)
50 | -- (4) update evaluation counter
51 | state.evalCounter = state.evalCounter + 1
52 |
53 | -- return x*, f(x) before optimization
54 | return x,{fx}
55 | end
56 |
--------------------------------------------------------------------------------
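
In the notation of the ADADELTA paper linked above, `state.paramVariance` plays the role of `E[g^2]` and `state.accDelta` the role of `E[\Delta x^2]`; the code performs, element-wise:

```latex
\begin{aligned}
E[g^2]_t &= \rho\, E[g^2]_{t-1} + (1-\rho)\, g_t^2 \\
\Delta x_t &= \frac{\sqrt{E[\Delta x^2]_{t-1} + \epsilon}}{\sqrt{E[g^2]_t + \epsilon}}\, g_t \\
x_{t+1} &= x_t - \Delta x_t \\
E[\Delta x^2]_t &= \rho\, E[\Delta x^2]_{t-1} + (1-\rho)\, \Delta x_t^2
\end{aligned}
```
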
/asgd.lua:
--------------------------------------------------------------------------------
1 | --[[ An implementation of ASGD
2 |
3 | ASGD:
4 |
5 | x := (1 - lambda eta_t) x - eta_t df/dx(z,x)
6 | a := a + mu_t [ x - a ]
7 |
8 | eta_t = eta0 / (1 + lambda eta0 t) ^ 0.75
9 | mu_t = 1/max(1,t-t0)
10 |
11 | implements the ASGD algorithm as in L. Bottou's sgd-2.0
12 |
13 | ARGS:
14 |
15 | - `opfunc` : a function that takes a single input (X), the point of
16 | evaluation, and returns f(X) and df/dX
17 | - `x` : the initial point
18 | - `state` : a table describing the state of the optimizer; after each
19 | call the state is modified
20 | - `state.eta0` : learning rate
21 | - `state.lambda` : decay term
22 | - `state.alpha` : power for eta update
23 | - `state.t0` : point at which to start averaging
24 |
25 | RETURN:
26 | - `x` : the new x vector
27 | - `f(x)` : the function, evaluated before the update
28 | - `ax` : the averaged x vector
29 |
30 | (Clement Farabet, 2012)
31 | --]]
32 | function optim.asgd(opfunc, x, config, state)
33 | -- (0) get/update state
34 | local config = config or {}
35 | local state = state or config
36 | config.eta0 = config.eta0 or 1e-4
37 | config.lambda = config.lambda or 1e-4
38 | config.alpha = config.alpha or 0.75
39 | config.t0 = config.t0 or 1e6
40 |
41 | -- (hidden state)
42 | state.eta_t = state.eta_t or config.eta0
43 | state.mu_t = state.mu_t or 1
44 | state.t = state.t or 0
45 |
46 | -- (1) evaluate f(x) and df/dx
47 | local fx,dfdx = opfunc(x)
48 |
49 | -- (2) decay term
50 | x:mul(1 - config.lambda*state.eta_t)
51 |
52 | -- (3) update x
53 | x:add(-state.eta_t, dfdx)
54 |
55 | -- (4) averaging
56 | state.ax = state.ax or torch.Tensor():typeAs(x):resizeAs(x):zero()
57 | state.tmp = state.tmp or torch.Tensor():typeAs(state.ax):resizeAs(state.ax)
58 | if state.mu_t ~= 1 then
59 | state.tmp:copy(x)
60 | state.tmp:add(-1,state.ax):mul(state.mu_t)
61 | state.ax:add(state.tmp)
62 | else
63 | state.ax:copy(x)
64 | end
65 |
66 | -- (5) update eta_t and mu_t
67 | state.t = state.t + 1
68 | state.eta_t = config.eta0 / math.pow((1 + config.lambda * config.eta0 * state.t), config.alpha)
69 | state.mu_t = 1 / math.max(1, state.t - config.t0)
70 |
71 | -- return x*, f(x) before optimization, and average(x_t0,x_t1,x_t2,...)
72 | return x,{fx},state.ax
73 | end
74 |
--------------------------------------------------------------------------------
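
A brief sketch of the averaging behaviour of `optim.asgd`: the third return value is the running (Polyak-Ruppert) average of the iterates, which starts to move away from the raw iterate once `state.t` exceeds `t0`. The Rosenbrock objective from the test directory is used purely as an example objective here:

```lua
require 'torch'
require 'optim'
require 'rosenbrock'   -- test objective, see test/rosenbrock.lua

local x = torch.zeros(2)
local state = {eta0 = 1e-3, t0 = 100}   -- averaging kicks in after 100 evaluations
local ax
for i = 1, 1000 do
   x, fx, ax = optim.asgd(rosenbrock, x, state)
end
print('f(last iterate)     =', (rosenbrock(x)))
print('f(averaged iterate) =', (rosenbrock(ax)))
```
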
/adamax.lua:
--------------------------------------------------------------------------------
1 | --[[ An implementation of AdaMax http://arxiv.org/pdf/1412.6980.pdf
2 |
3 | ARGS:
4 |
5 | - `opfunc` : a function that takes a single input (X), the point
6 | of evaluation, and returns f(X) and df/dX
7 | - `x` : the initial point
8 | - `config` : a table with configuration parameters for the optimizer
9 | - `config.learningRate` : learning rate
10 | - `config.beta1` : first moment coefficient
11 | - `config.beta2` : second moment coefficient
12 | - `config.epsilon` : for numerical stability
13 | - `state` : a table describing the state of the optimizer;
14 | after each call the state is modified.
15 |
16 | RETURN:
17 | - `x` : the new x vector
18 | - `f(x)` : the function, evaluated before the update
19 |
20 | ]]
21 |
22 | function optim.adamax(opfunc, x, config, state)
23 | -- (0) get/update state
24 | local config = config or {}
25 | local state = state or config
26 | local lr = config.learningRate or 0.002
27 |
28 | local beta1 = config.beta1 or 0.9
29 | local beta2 = config.beta2 or 0.999
30 | local epsilon = config.epsilon or 1e-38
31 | local wd = config.weightDecay or 0
32 |
33 | -- (1) evaluate f(x) and df/dx
34 | local fx, dfdx = opfunc(x)
35 |
36 | -- (2) weight decay
37 | if wd ~= 0 then
38 | dfdx:add(wd, x)
39 | end
40 |
41 | -- Initialization
42 | state.t = state.t or 0
43 | -- Exponential moving average of gradient values
44 | state.m = state.m or x.new(dfdx:size()):zero()
45 | -- Exponential moving average of the infinity norm
46 | state.u = state.u or x.new(dfdx:size()):zero()
47 | -- A tmp tensor to hold the input to max()
48 | state.max = state.max or x.new(2, unpack(dfdx:size():totable())):zero()
49 |
50 | state.t = state.t + 1
51 |
52 | -- Update biased first moment estimate.
53 | state.m:mul(beta1):add(1-beta1, dfdx)
54 | -- Update the exponentially weighted infinity norm.
55 | state.max[1]:copy(state.u):mul(beta2)
56 | state.max[2]:copy(dfdx):abs():add(epsilon)
57 | state.u:max(state.max, 1)
58 |
59 | local biasCorrection1 = 1 - beta1^state.t
60 | local stepSize = lr/biasCorrection1
61 | -- (2) update x
62 | x:addcdiv(-stepSize, state.m, state.u)
63 |
64 | -- return x*, f(x) before optimization
65 | return x, {fx}
66 | end
67 |
--------------------------------------------------------------------------------
/test/test_fista.lua:
--------------------------------------------------------------------------------
1 |
2 | require 'unsup'
3 | require 'torch'
4 | require 'gnuplot'
5 | require 'sparsecoding'
6 |
7 | -- gnuplot.setgnuplotexe('/usr/bin/gnuplot44')
8 | -- gnuplot.setgnuplotterminal('x11')
9 |
10 | function gettableval(tt,v)
11 | local x = torch.Tensor(#tt)
12 | for i=1,#tt do x[i] = tt[i][v] end
13 | return x
14 | end
15 | function doplots(v)
16 | v = v or 'F'
17 | local fistaf = torch.DiskFile('fista2.bin'):binary()
18 | local istaf = torch.DiskFile('ista2.bin'):binary()
19 |
20 | local hfista = fistaf:readObject()
21 | fistaf:close()
22 | local hista = istaf:readObject()
23 | istaf:close()
24 |
25 | gnuplot.figure()
26 | gnuplot.plot({'fista ' .. v,gettableval(hfista,v)},{'ista ' .. v, gettableval(hista,v)})
27 | end
28 |
29 | seed = seed or 123
30 | if dofista == nil then
31 | dofista = true
32 | else
33 | dofista = not dofista
34 | end
35 |
36 | torch.manualSeed(seed)
37 | math.randomseed(seed)
38 | nc = 3
39 | ni = 30
40 | no = 100
41 | x = torch.Tensor(ni):zero()
42 |
43 | --- I am keeping these just to make sure random init stays same
44 | fista = unsup.LinearFistaL1(ni,no,0.1)
45 | fista = nil
46 |
47 | fistaparams = {}
48 | fistaparams.doFistaUpdate = dofista
49 | fistaparams.maxline = 10
50 | fistaparams.maxiter = 200
51 | fistaparams.verbose = true
52 |
53 | D=torch.randn(ni,no)
54 | for i=1,D:size(2) do
55 | D:select(2,i):div(D:select(2,i):std()+1e-12)
56 | end
57 |
58 | mixi = torch.Tensor(nc)
59 | mixj = torch.Tensor(nc)
60 | for i=1,nc do
61 | local ii = math.random(1,no)
62 | local cc = torch.uniform(0,1/nc)
63 | mixi[i] = ii;
64 | mixj[i] = cc;
65 | print(ii,cc)
66 | x:add(cc, D:select(2,ii))
67 | end
68 |
69 | fista = optim.FistaL1(D,fistaparams)
70 | code,h = fista.run(x,0.1)
71 |
72 | --fista.reconstruction:addmv(0,1,D,code)
73 | rec = fista.reconstruction
74 | --code,rec,h = fista:forward(x);
75 |
76 | gnuplot.figure(1)
77 | gnuplot.plot({'data',mixi,mixj,'+'},{'code',torch.linspace(1,no,no),code,'+'})
78 | gnuplot.title('Fista = ' .. tostring(fistaparams.doFistaUpdate))
79 |
80 | gnuplot.figure(2)
81 | gnuplot.plot({'input',torch.linspace(1,ni,ni),x,'+-'},{'reconstruction',torch.linspace(1,ni,ni),rec,'+-'});
82 | gnuplot.title('Reconstruction Error : ' .. x:dist(rec) .. ' ' .. 'Fista = ' .. tostring(fistaparams.doFistaUpdate))
83 | --w2:axis(0,ni+1,-1,1)
84 |
85 | if dofista then
86 | print('Running FISTA')
87 | fname = 'fista2.bin'
88 | else
89 | print('Running ISTA')
90 | fname = 'ista2.bin'
91 | end
92 | ff = torch.DiskFile(fname,'w'):binary()
93 | ff:writeObject(h)
94 | ff:close()
95 |
96 |
--------------------------------------------------------------------------------
/adam.lua:
--------------------------------------------------------------------------------
1 | --[[ An implementation of Adam https://arxiv.org/abs/1412.6980
2 |
3 | ARGS:
4 |
5 | - `opfunc` : a function that takes a single input (X), the point
6 | of evaluation, and returns f(X) and df/dX
7 | - `x` : the initial point
8 | - `config` : a table with configuration parameters for the optimizer
9 | - `config.learningRate` : learning rate
10 | - `config.learningRateDecay` : learning rate decay
11 | - `config.beta1` : first moment coefficient
12 | - `config.beta2` : second moment coefficient
13 | - `config.epsilon` : for numerical stability
14 | - `config.weightDecay` : weight decay
15 | - `state` : a table describing the state of the optimizer; after each
16 | call the state is modified
17 |
18 | RETURN:
19 | - `x` : the new x vector
20 | - `f(x)` : the function, evaluated before the update
21 |
22 | ]]
23 |
24 | function optim.adam(opfunc, x, config, state)
25 | -- (0) get/update state
26 | local config = config or {}
27 | local state = state or config
28 | local lr = config.learningRate or 0.001
29 | local lrd = config.learningRateDecay or 0
30 |
31 | local beta1 = config.beta1 or 0.9
32 | local beta2 = config.beta2 or 0.999
33 | local epsilon = config.epsilon or 1e-8
34 | local wd = config.weightDecay or 0
35 |
36 | -- (1) evaluate f(x) and df/dx
37 | local fx, dfdx = opfunc(x)
38 |
39 | -- (2) weight decay
40 | if wd ~= 0 then
41 | dfdx:add(wd, x)
42 | end
43 |
44 | -- Initialization
45 | state.t = state.t or 0
46 | -- Exponential moving average of gradient values
47 | state.m = state.m or x.new(dfdx:size()):zero()
48 | -- Exponential moving average of squared gradient values
49 | state.v = state.v or x.new(dfdx:size()):zero()
50 | -- A tmp tensor to hold the sqrt(v) + epsilon
51 | state.denom = state.denom or x.new(dfdx:size()):zero()
52 |
53 | -- (3) learning rate decay (annealing)
54 | local clr = lr / (1 + state.t*lrd)
55 |
56 | state.t = state.t + 1
57 |
58 | -- Decay the first and second moment running average coefficient
59 | state.m:mul(beta1):add(1-beta1, dfdx)
60 | state.v:mul(beta2):addcmul(1-beta2, dfdx, dfdx)
61 |
62 | state.denom:copy(state.v):sqrt():add(epsilon)
63 |
64 | local biasCorrection1 = 1 - beta1^state.t
65 | local biasCorrection2 = 1 - beta2^state.t
66 | local stepSize = clr * math.sqrt(biasCorrection2)/biasCorrection1
67 | -- (4) update x
68 | x:addcdiv(-stepSize, state.m, state.denom)
69 |
70 | -- return x*, f(x) before optimization
71 | return x, {fx}
72 | end
73 |
--------------------------------------------------------------------------------
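
For reference, the update performed by the code above, element-wise, with `g_t` the gradient, `t` the value of `state.t` after the increment and `n = t - 1` used in the learning-rate decay; the bias corrections are folded into the step size exactly as in the code:

```latex
\begin{aligned}
m_t &= \beta_1\, m_{t-1} + (1-\beta_1)\, g_t \\
v_t &= \beta_2\, v_{t-1} + (1-\beta_2)\, g_t^2 \\
x_{t+1} &= x_t - \frac{lr}{1 + n \cdot lrd} \cdot \frac{\sqrt{1-\beta_2^{\,t}}}{1-\beta_1^{\,t}} \cdot \frac{m_t}{\sqrt{v_t} + \epsilon}
\end{aligned}
```
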
/doc/logger.md:
--------------------------------------------------------------------------------
1 |
2 | # Logger
3 |
4 | `optim` also provides logging and live plotting capabilities via the `optim.Logger()` function.
5 |
6 | Live logging is essential to monitor the *network accuracy* and *cost function* during training and testing, for spotting *under-* and *over-fitting*, for *early stopping* or just for monitoring the health of the current optimisation task.
7 |
8 |
9 | ## Logging data
10 |
11 | Let's walk through an example to see how it works.
12 |
13 | We start by initialising our logger, connected to the text file `accuracy.log`.
14 |
15 | ```lua
16 | logger = optim.Logger('accuracy.log')
17 | ```
18 |
19 | We can decide to log on it, for example, *training* and *testing accuracies*.
20 |
21 | ```lua
22 | logger:setNames{'Training acc.', 'Test acc.'}
23 | ```
24 |
25 | And now we can populate our logger randomly.
26 |
27 | ```lua
28 | for i = 1, 10 do
29 | trainAcc = math.random(0, 100)
30 | testAcc = math.random(0, 100)
31 | logger:add{trainAcc, testAcc}
32 | end
33 | ```
34 |
35 | We can `cat` `accuracy.log` and see what's in it.
36 |
37 | ```
38 | Training acc. Test acc.
39 | 7.0000e+01 5.9000e+01
40 | 7.6000e+01 8.0000e+00
41 | 6.6000e+01 3.4000e+01
42 | 7.4000e+01 4.3000e+01
43 | 5.7000e+01 1.1000e+01
44 | 5.0000e+00 9.8000e+01
45 | 7.1000e+01 1.7000e+01
46 | 9.8000e+01 2.7000e+01
47 | 3.5000e+01 4.7000e+01
48 | 6.8000e+01 5.8000e+01
49 | ```
50 |
51 | ## Visualising logs
52 |
53 | OK, cool, but how can we actually see what's going on?
54 |
55 | To have a better grasp of what's happening, we can plot our curves.
56 | We first need to specify the plotting style, choosing from:
57 |
58 | * `.` for dots
59 | * `+` for points
60 | * `-` for lines
61 | * `+-` for points and lines
62 | * `~` for using smoothed lines with cubic interpolation
63 | * `|` for using boxes
64 | * a custom string, to access the full capability of gnuplot.
65 |
66 | ```lua
67 | logger:style{'+-', '+-'}
68 | logger:plot()
69 | ```
70 |
71 | 
72 |
73 | If we'd like an interactive visualisation, we can put the `logger:plot()` instruction within the `for` loop, and the chart will be updated at every iteration.
74 |
75 | In case we'd like to prevent `gnuplot` from displaying the plots, we can set the option `logger:display(false)`.
76 | In this way, plots will be saved but not displayed.
77 | To restore the normal behaviour, use `logger:display(true)`.
78 |
79 | We can set a logarithmic *y* axis with `logger:setlogscale(true)` and reset it with `logger:setlogscale(false)`.
80 |
--------------------------------------------------------------------------------
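
The live-plotting pattern described at the end of the document (style once, then call `logger:plot()` inside the training loop) looks roughly as follows; the accuracy values are random placeholders:

```lua
require 'optim'

local logger = optim.Logger('accuracy.log')
logger:setNames{'Training acc.', 'Test acc.'}
logger:style{'+-', '+-'}
-- logger:setlogscale(true)   -- optional: logarithmic y axis
-- logger:display(false)      -- optional: save plots without showing them

for i = 1, 10 do
   local trainAcc = math.random(0, 100)
   local testAcc = math.random(0, 100)
   logger:add{trainAcc, testAcc}
   logger:plot()               -- chart is refreshed at every iteration
end
```
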
/nag.lua:
--------------------------------------------------------------------------------
1 | ----------------------------------------------------------------------
2 | -- An implementation of SGD adapted with features of Nesterov's
3 | -- Accelerated Gradient method, based on the paper
4 | -- On the Importance of Initialization and Momentum in Deep Learning
5 | -- Sutskever et al., ICML 2013
6 | --
7 | -- ARGS:
8 | -- opfunc : a function that takes a single input (X), the point of
9 | -- evaluation, and returns f(X) and df/dX
10 | -- x : the initial point
11 | -- state : a table describing the state of the optimizer; after each
12 | -- call the state is modified
13 | -- state.learningRate : learning rate
14 | -- state.learningRateDecay : learning rate decay
15 | -- state.weightDecay : weight decay
16 | -- state.momentum : momentum
17 | -- state.learningRates : vector of individual learning rates
18 | --
19 | -- RETURN:
20 | -- x : the new x vector
21 | -- f(x) : the function, evaluated before the update
22 | --
23 | -- (Dilip Krishnan, 2013)
24 | --
25 |
26 | function optim.nag(opfunc, x, config, state)
27 | -- (0) get/update state
28 | local config = config or {}
29 | local state = state or config
30 | local lr = config.learningRate or 1e-3
31 | local lrd = config.learningRateDecay or 0
32 | local wd = config.weightDecay or 0
33 | local mom = config.momentum or 0.9
34 | local damp = config.dampening or mom
35 | local lrs = config.learningRates
36 | state.evalCounter = state.evalCounter or 0
37 | local nevals = state.evalCounter
38 |
39 | if mom <= 0 then
40 | error('Momentum must be positive for Nesterov Accelerated Gradient')
41 | end
42 |
43 | -- (1) evaluate f(x) and df/dx
44 | -- first step in the direction of the momentum vector
45 |
46 | if state.dfdx then
47 | x:add(mom, state.dfdx)
48 | end
49 | -- then compute gradient at that point
50 | -- comment out the above line to get the original SGD
51 | local fx,dfdx = opfunc(x)
52 |
53 | -- (2) weight decay
54 | if wd ~= 0 then
55 | dfdx:add(wd, x)
56 | end
57 |
58 | -- (3) learning rate decay (annealing)
59 | local clr = lr / (1 + nevals*lrd)
60 |
61 | -- (4) apply momentum
62 | if not state.dfdx then
63 | state.dfdx = torch.Tensor():typeAs(dfdx):resizeAs(dfdx):fill(0)
64 | else
65 | state.dfdx:mul(mom)
66 | end
67 |
68 | -- (5) parameter update with single or individual learning rates
69 | if lrs then
70 | if not state.deltaParameters then
71 | state.deltaParameters = torch.Tensor():typeAs(x):resizeAs(dfdx)
72 | end
73 | state.deltaParameters:copy(lrs):cmul(dfdx)
74 | x:add(-clr, state.deltaParameters)
75 | state.dfdx:add(-clr, state.deltaParameters)
76 | else
77 | x:add(-clr, dfdx)
78 | state.dfdx:add(-clr, dfdx)
79 | end
80 |
81 | -- (6) update evaluation counter
82 | state.evalCounter = state.evalCounter + 1
83 |
84 | -- return x, f(x) before optimization
85 | return x,{fx}
86 | end
87 |
--------------------------------------------------------------------------------
/sgd.lua:
--------------------------------------------------------------------------------
1 | --[[ A plain implementation of SGD
2 |
3 | ARGS:
4 |
5 | - `opfunc` : a function that takes a single input (X), the point
6 | of evaluation, and returns f(X) and df/dX
7 | - `x` : the initial point
8 | - `config` : a table with configuration parameters for the optimizer
9 | - `config.learningRate` : learning rate
10 | - `config.learningRateDecay` : learning rate decay
11 | - `config.weightDecay` : weight decay
12 | - `config.weightDecays` : vector of individual weight decays
13 | - `config.momentum` : momentum
14 | - `config.dampening` : dampening for momentum
15 | - `config.nesterov` : enables Nesterov momentum
16 | - `config.learningRates` : vector of individual learning rates
17 | - `state` : a table describing the state of the optimizer; after each
18 | call the state is modified
19 | - `state.evalCounter` : evaluation counter (optional: 0, by default)
20 |
21 | RETURN:
22 | - `x` : the new x vector
23 | - `f(x)` : the function, evaluated before the update
24 |
25 | (Clement Farabet, 2012)
26 | ]]
27 | function optim.sgd(opfunc, x, config, state)
28 | -- (0) get/update state
29 | local config = config or {}
30 | local state = state or config
31 | local lr = config.learningRate or 1e-3
32 | local lrd = config.learningRateDecay or 0
33 | local wd = config.weightDecay or 0
34 | local mom = config.momentum or 0
35 | local damp = config.dampening or mom
36 | local nesterov = config.nesterov or false
37 | local lrs = config.learningRates
38 | local wds = config.weightDecays
39 | state.evalCounter = state.evalCounter or 0
40 | local nevals = state.evalCounter
41 | assert(not nesterov or (mom > 0 and damp == 0), "Nesterov momentum requires a momentum and zero dampening")
42 |
43 | -- (1) evaluate f(x) and df/dx
44 | local fx,dfdx = opfunc(x)
45 |
46 | -- (2) weight decay with single or individual parameters
47 | if wd ~= 0 then
48 | dfdx:add(wd, x)
49 | elseif wds then
50 | if not state.decayParameters then
51 | state.decayParameters = torch.Tensor():typeAs(x):resizeAs(dfdx)
52 | end
53 | state.decayParameters:copy(wds):cmul(x)
54 | dfdx:add(state.decayParameters)
55 | end
56 |
57 | -- (3) apply momentum
58 | if mom ~= 0 then
59 | if not state.dfdx then
60 | state.dfdx = torch.Tensor():typeAs(dfdx):resizeAs(dfdx):copy(dfdx)
61 | else
62 | state.dfdx:mul(mom):add(1-damp, dfdx)
63 | end
64 | if nesterov then
65 | dfdx:add(mom, state.dfdx)
66 | else
67 | dfdx = state.dfdx
68 | end
69 | end
70 |
71 | -- (4) learning rate decay (annealing)
72 | local clr = lr / (1 + nevals*lrd)
73 |
74 | -- (5) parameter update with single or individual learning rates
75 | if lrs then
76 | if not state.deltaParameters then
77 | state.deltaParameters = torch.Tensor():typeAs(x):resizeAs(dfdx)
78 | end
79 | state.deltaParameters:copy(lrs):cmul(dfdx)
80 | x:add(-clr, state.deltaParameters)
81 | else
82 | x:add(-clr, dfdx)
83 | end
84 |
85 | -- (6) update evaluation counter
86 | state.evalCounter = state.evalCounter + 1
87 |
88 | -- return x*, f(x) before optimization
89 | return x,{fx}
90 | end
91 |
--------------------------------------------------------------------------------
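
A small configuration sketch for the momentum options documented above; the assert in the code requires `momentum > 0` and `dampening == 0` whenever `nesterov` is enabled. The Rosenbrock test objective is used purely as an example:

```lua
require 'torch'
require 'optim'
require 'rosenbrock'   -- test objective, see test/rosenbrock.lua

local x = torch.zeros(2)
local config = {
   learningRate = 1e-3,
   momentum     = 0.9,
   dampening    = 0,      -- required to be 0 for Nesterov momentum
   nesterov     = true,
}
for i = 1, 1000 do
   x, fx = optim.sgd(rosenbrock, x, config)
end
print('f(x) after 1000 updates =', fx[1])
```
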
/de.lua:
--------------------------------------------------------------------------------
1 | --[[ An implementation of `DE` (Differential Evolution),
2 |
3 | ARGS:
4 |
5 | -`opfunc` : a function that takes a single input (X), the point of
6 | evaluation, and returns f(X) and df/dX. Note that df/dX is not used
7 | -`x` : the initial point
8 | -`config.popsize`: population size. If this is left empty, 10*D will be used
9 | -`config.scaleFactor`: float, usually between 0.4 and 1
10 | -`config.crossoverRate`: float, usually between 0.1 and 0.9
11 | -`config.maxFEs`: int, maximal number of function evaluations
12 |
13 | RETURN:
14 | - `x*` : the new `x` vector, at the optimal point
15 | - `f` : a table of all function values:
16 | `f[1]` is the value of the function before any optimization and
17 | `f[#f]` is the final fully optimized value, at `x*`
18 | ]]
19 |
20 | require 'torch'
21 |
22 | function optim.de(opfunc, x, config, state)
23 | -- process input parameters
24 | local config = config or {}
25 | local state = state
26 | local popsize = config.popsize -- population size
27 | local scaleFactor = config.scaleFactor -- scale factor
28 | local crossoverRate = config.crossoverRate -- crossover rate
29 | local maxFEs = tonumber(config.maxFEs) -- maximal number of function evaluations
30 | local maxRegion = config.maxRegion -- upper bound of search region
31 | local minRegion = config.minRegion -- lower bound of search region
32 | local xmean = x:clone():view(-1) -- distribution mean, a flattened copy
33 | local D = xmean:size(1) -- number of objective variables/problem dimension
34 |
35 | if config.popsize == nil then
36 | popsize = 10 * D
37 | end
38 | if config.maxRegion == nil then
39 | maxRegion = 30
40 | end
41 | if config.minRegion == nil then
42 | minRegion = -30
43 | end
44 |
45 | -- Initialize population
46 | local fx = x.new(maxFEs)
47 | local pop = x.new(popsize, D)
48 | local children = x.new(popsize, D)
49 | local fitness = x.new(popsize)
50 | local children_fitness = x.new(popsize)
51 | local fes = 1 -- number of function evaluations
52 | local best_fitness
53 | local best_solution = x.new(D)
54 |
55 | -- Initialize the population and evaluate its fitness values
56 | local gen = torch.Generator()
57 | torch.manualSeed(gen, 1)
58 |
59 | pop:uniform(gen, minRegion, maxRegion)
60 | for i = 1, popsize do
61 | fitness[i] = opfunc(pop[i])
62 | fx[fes] = fitness[i]
63 | fes = fes + 1
64 | end
65 |
66 | -- Find the best solution
67 | local index
68 | best_fitness, index = fitness:min(1)
69 | best_fitness = best_fitness[1]
70 | index = index[1]
71 | best_solution:copy(pop[index])
72 |
73 | -- Main loop
74 | while fes < maxFEs do
75 | local r1, r2
76 | for i = 1, popsize do
77 | repeat
78 | r1 = torch.random(gen, 1, popsize)
79 | until(r1 ~= i)
80 | repeat
81 | r2 = torch.random(gen, 1, popsize)
82 | until(r2 ~= r1 and r2 ~= i)
83 |
84 | local jrand = torch.random(gen, 1, D)
85 | for j = 1, D do
86 | if torch.uniform(gen, 0, 1) < crossoverRate or i == jrand then
87 | children[i][j] = best_solution[j] + scaleFactor * (pop[r1][j] - pop[r2][j])
88 | else
89 | children[i][j] = pop[i][j]
90 | end
91 | end
92 | children_fitness[i] = opfunc(children[i])
93 | fx[fes] = children_fitness[i]
94 | fes = fes + 1
95 | end
96 |
97 | for i = 1, popsize do
98 | if children_fitness[i] <= fitness[i] then
99 | pop[i]:copy(children[i])
100 | fitness[i] = children_fitness[i]
101 | if fitness[i] < best_fitness then
102 | best_fitness = fitness[i]
103 | best_solution:copy(children[i])
104 | end
105 | end
106 | end
107 | end
108 | return best_solution, fx
109 | end
110 |
--------------------------------------------------------------------------------
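
The main loop above amounts to the classic DE/best/1/bin scheme: for each member `x_i`, two distinct partners `r1, r2 ~= i` and a random dimension `jrand` are drawn, and the trial vector is

```latex
u_{i,j} =
\begin{cases}
\mathrm{best}_j + F\,\bigl(x_{r_1,j} - x_{r_2,j}\bigr) & \text{if } \mathrm{rand}() < CR \ \text{or}\ j = j_{\mathrm{rand}} \\
x_{i,j} & \text{otherwise}
\end{cases}
```

with `F = scaleFactor` and `CR = crossoverRate`; the trial `u_i` replaces `x_i` whenever `f(u_i) <= f(x_i)`.
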
/rprop.lua:
--------------------------------------------------------------------------------
1 | --[[ A plain implementation of RPROP
2 |
3 | ARGS:
4 | - `opfunc` : a function that takes a single input (X), the point of
5 | evaluation, and returns f(X) and df/dX
6 | - `x` : the initial point
7 | - `state` : a table describing the state of the optimizer; after each
8 | call the state is modified
9 | - `state.stepsize` : initial step size, common to all components
10 | - `state.etaplus` : multiplicative increase factor, > 1 (default 1.2)
11 | - `state.etaminus` : multiplicative decrease factor, < 1 (default 0.5)
12 | - `state.stepsizemax` : maximum stepsize allowed (default 50)
13 | - `state.stepsizemin` : minimum stepsize allowed (default 1e-6)
14 | - `state.niter` : number of iterations (default 1)
15 |
16 | RETURN:
17 | - `x` : the new x vector
18 | - `f(x)` : the function, evaluated before the update
19 |
20 | (Martin Riedmiller, Koray Kavukcuoglu 2013)
21 | --]]
22 | function optim.rprop(opfunc, x, config, state)
23 | if config == nil and state == nil then
24 | print('no state table RPROP initializing')
25 | end
26 | -- (0) get/update state
27 | local config = config or {}
28 | local state = state or config
29 | local stepsize = config.stepsize or 0.1
30 | local etaplus = config.etaplus or 1.2
31 | local etaminus = config.etaminus or 0.5
32 | local stepsizemax = config.stepsizemax or 50.0
33 | local stepsizemin = config.stepsizemin or 1E-06
34 | local niter = config.niter or 1
35 |
36 | local hfx = {}
37 |
38 | for i=1,niter do
39 |
40 | -- (1) evaluate f(x) and df/dx
41 | local fx,dfdx = opfunc(x)
42 |
43 | -- init temp storage
44 | if not state.delta then
45 | state.delta = dfdx.new(dfdx:size()):zero()
46 | state.stepsize = dfdx.new(dfdx:size()):fill(stepsize)
47 | state.sign = dfdx.new(dfdx:size())
48 | state.psign = torch.ByteTensor(dfdx:size())
49 | state.nsign = torch.ByteTensor(dfdx:size())
50 | state.zsign = torch.ByteTensor(dfdx:size())
51 | state.dminmax = torch.ByteTensor(dfdx:size())
52 | if torch.type(x)=='torch.CudaTensor' then
53 | -- Push to GPU
54 | state.psign = state.psign:cuda()
55 | state.nsign = state.nsign:cuda()
56 | state.zsign = state.zsign:cuda()
57 | state.dminmax = state.dminmax:cuda()
58 | end
59 | end
60 |
61 | -- sign of derivative from last step to this one
62 | torch.cmul(state.sign, dfdx, state.delta)
63 | torch.sign(state.sign, state.sign)
64 |
65 | -- get indices of >0, <0 and ==0 entries
66 | state.sign.gt(state.psign, state.sign, 0)
67 | state.sign.lt(state.nsign, state.sign, 0)
68 | state.sign.eq(state.zsign, state.sign, 0)
69 |
70 | -- get step size updates
71 | state.sign[state.psign] = etaplus
72 | state.sign[state.nsign] = etaminus
73 | state.sign[state.zsign] = 1
74 |
75 | -- update stepsizes with step size updates
76 | state.stepsize:cmul(state.sign)
77 |
78 | -- threshold step sizes
79 | -- >50 => 50
80 | state.stepsize.gt(state.dminmax, state.stepsize, stepsizemax)
81 | state.stepsize[state.dminmax] = stepsizemax
82 | -- <1e-6 ==> 1e-6
83 | state.stepsize.lt(state.dminmax, state.stepsize, stepsizemin)
84 | state.stepsize[state.dminmax] = stepsizemin
85 |
86 | -- for dir<0, dfdx=0
87 | -- for dir>=0 dfdx=dfdx
88 | dfdx[state.nsign] = 0
89 | -- state.sign = sign(dfdx)
90 | torch.sign(state.sign,dfdx)
91 |
92 | -- update weights
93 | x:addcmul(-1,state.sign,state.stepsize)
94 |
95 | -- update state.dfdx with current dfdx
96 | state.delta:copy(dfdx)
97 |
98 | table.insert(hfx,fx)
99 | end
100 |
101 | -- return x*, and a table of f(x) values from each iteration
102 | return x,hfx
103 | end
104 |
--------------------------------------------------------------------------------
/test/sparsecoding.lua:
--------------------------------------------------------------------------------
1 | require 'kex'
2 |
3 | -- L1 FISTA Solution
4 | -- L1 solution with a linear dictionary ||Ax-b||^2 + \lambda ||x||_1
5 | -- D : dictionary, each column is a dictionary element
6 | -- params: set of params to pass to FISTA and possibly temp allocation (**optional**)
7 | -- check the optim.FistaLS function for details.
8 | -- returns fista : a table with the following entries
9 | -- fista.run(x,lambda) : run L1 sparse coding algorithm with input x and lambda.
10 | -- The following entries will be allocated and reused by each call to fista.run(x,lambda)
11 | -- fista.reconstruction: reconstructed input.
12 | -- fista.gradf : gradient of L2 part of the problem wrt x
13 | -- fista.code : the solution of L1 problem
14 | -- The following entries just point to data passed to fista.run(x)
15 | -- fista.input : points to the tensor 'x' used in the last fista.run(x,lambda)
16 | -- fista.lambda : the lambda value used in the last fista.run(x,lambda)
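-- Example usage (a minimal sketch; the dictionary size and lambda below are arbitrary):
--
--    local D = torch.randn(64, 128)            -- 64-dimensional inputs, 128 dictionary elements
--    local fista = optim.FistaL1(D, {maxiter = 100})
--    local x = torch.randn(64)                 -- a single input vector
--    local code, history = fista.run(x, 0.5)   -- sparse code minimizing ||D*code - x||^2 + 0.5*||code||_1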
17 | function optim.FistaL1(D, params)
18 |
19 | -- this is for keeping parameters related to fista algorithm
20 | local params = params or {}
21 | -- this is for temporary variables and such
22 | local fista = {}
23 |
24 | -- related to FISTA
25 | params.L = params.L or 0.1
26 | params.Lstep = params.Lstep or 1.5
27 | params.maxiter = params.maxiter or 50
28 | params.maxline = params.maxline or 20
29 | params.errthres = params.errthres or 1e-4
30 |
31 | -- temporary stuff that might be good to keep around
32 | fista.reconstruction = torch.Tensor()
33 | fista.gradf = torch.Tensor()
34 | fista.gradg = torch.Tensor()
35 | fista.code = torch.Tensor()
36 |
37 | -- these will be assigned in run(x)
38 | -- fista.input points to the last input that was run
39 | -- fista.lambda is the lambda value from the last run
40 | fista.input = nil
41 | fista.lambda = nil
42 |
43 | -- CREATE FUNCTION CLOSURES
44 | -- smooth function
45 | fista.f = function (x,mode)
46 |
47 | local reconstruction = fista.reconstruction
48 | local input = fista.input
49 | -- -------------------
50 | -- function evaluation
51 | if x:dim() == 1 then
52 | --print(D:size(),x:size())
53 | reconstruction:resize(D:size(1))
54 | reconstruction:addmv(0,1,D,x)
55 | elseif x:dim() == 2 then
56 | reconstruction:resize(x:size(1),D:size(1))
57 | reconstruction:addmm(0,1,x,D:t())
58 | end
59 | local fval = input:dist(reconstruction)^2
60 |
61 | -- ----------------------
62 | -- derivative calculation
63 | if mode and mode:match('dx') then
64 | local gradf = fista.gradf
65 | reconstruction:add(-1,input):mul(2)
66 | gradf:resizeAs(x)
67 | if input:dim() == 1 then
68 | gradf:addmv(0,1,D:t(),reconstruction)
69 | else
70 | gradf:addmm(0,1,reconstruction, D)
71 | end
72 | ---------------------------------------
73 | -- return function value and derivative
74 | return fval, gradf, reconstruction
75 | end
76 |
77 | ------------------------
78 | -- return function value
79 | return fval, reconstruction
80 | end
81 |
82 | -- non-smooth function L1
83 | fista.g = function (x, mode)
84 |
85 | local fval = fista.lambda*x:norm(1)
86 |
87 | if mode and mode:match('dx') then
88 | local gradg = fista.gradg
89 | gradg:resizeAs(x):copy(x)
90 | gradg:sign():mul(fista.lambda)
91 | return fval,gradg
92 | end
93 | return fval
94 | end
95 |
96 | -- argmin_x Q(x,y), just shrinkage for L1
97 | fista.pl = function (x,L)
98 | x:shrinkage(fista.lambda/L)
99 | end
100 |
101 | fista.run = function(x, lam, codeinit)
102 | local code = fista.code
103 | fista.input = x
104 | fista.lambda = lam
105 |
106 | -- resize code, maybe a different number of dimensions
107 | -- fill with zeros, initial point
108 | if codeinit then
109 | code:resizeAs(codeinit)
110 | code:copy(codeinit)
111 | else
112 | if x:dim() == 1 then
113 | code:resize(D:size(2))
114 | elseif x:dim() == 2 then
115 | code:resize(x:size(1),D:size(2))
116 | else
117 | error(' I do not know how to handle ' .. x:dim() .. ' dimensional input')
118 | end
119 | code:fill(0)
120 | end
121 | -- return the result of the optim.FistaLS call.
122 | return optim.FistaLS(fista.f, fista.g, fista.pl, fista.code, params)
123 | end
124 |
125 | return fista
126 | end
127 |
128 |
--------------------------------------------------------------------------------
/Logger.lua:
--------------------------------------------------------------------------------
1 | --[[ Logger: a simple class to log symbols during training,
2 | and automate plot generation
3 |
4 | Example:
5 | logger = optim.Logger('somefile.log') -- file to save stuff
6 |
7 | for i = 1,N do -- log some symbols during
8 | train_error = ... -- training/testing
9 | test_error = ...
10 | logger:add{['training error'] = train_error,
11 | ['test error'] = test_error}
12 | end
13 |
14 | logger:style{['training error'] = '-', -- define styles for plots
15 | ['test error'] = '-'}
16 | logger:plot() -- and plot
17 |
18 | ---- OR ---
19 |
20 | logger = optim.Logger('somefile.log') -- file to save stuff
21 | logger:setNames{'training error', 'test error'}
22 |
23 | for i = 1,N do -- log some symbols during
24 | train_error = ... -- training/testing
25 | test_error = ...
26 | logger:add{train_error, test_error}
27 | end
28 |
29 | logger:style{'-', '-'} -- define styles for plots
30 | logger:plot() -- and plot
31 |
32 | -----------
33 |
34 | logger:setlogscale(true) -- enable logscale on Y-axis
35 | logger:plot() -- and plot
36 | ]]
37 | require 'xlua'
38 | local Logger = torch.class('optim.Logger')
39 |
40 | function Logger:__init(filename, timestamp)
41 | if filename then
42 | self.name = filename
43 | os.execute('mkdir ' .. (sys.uname() ~= 'windows' and '-p ' or '') .. ' "' .. paths.dirname(filename) .. '"')
44 | if timestamp then
45 | -- append timestamp to create unique log file
46 | filename = filename .. '-'..os.date("%Y_%m_%d_%X")
47 | end
48 | self.file = io.open(filename,'w')
49 | self.epsfile = self.name .. '.eps'
50 | else
51 | self.file = io.stdout
52 | self.name = 'stdout'
53 | print(' warning: no path provided, logging to std out')
54 | end
55 | self.empty = true
56 | self.symbols = {}
57 | self.styles = {}
58 | self.names = {}
59 | self.idx = {}
60 | self.figure = nil
61 | self.showPlot = true
62 | self.plotRawCmd = nil
63 | self.defaultStyle = '+'
64 | self.logscale = false
65 | end
66 |
67 | function Logger:setNames(names)
68 | self.names = names
69 | self.empty = false
70 | self.nsymbols = #names
71 | for k,key in pairs(names) do
72 | self.file:write(key .. '\t')
73 | self.symbols[k] = {}
74 | self.styles[k] = {self.defaultStyle}
75 | self.idx[key] = k
76 | end
77 | self.file:write('\n')
78 | self.file:flush()
79 | return self
80 | end
81 |
82 | function Logger:add(symbols)
83 | -- (1) first time ? print symbols' names on first row
84 | if self.empty then
85 | self.empty = false
86 | self.nsymbols = #symbols
87 | for k,val in pairs(symbols) do
88 | self.file:write(k .. '\t')
89 | self.symbols[k] = {}
90 | self.styles[k] = {self.defaultStyle}
91 | self.names[k] = k
92 | end
93 | self.idx = self.names
94 | self.file:write('\n')
95 | end
96 | -- (2) print all symbols on one row
97 | for k,val in pairs(symbols) do
98 | if type(val) == 'number' then
99 | self.file:write(string.format('%11.4e',val) .. '\t')
100 | elseif type(val) == 'string' then
101 | self.file:write(val .. '\t')
102 | else
103 | xlua.error('can only log numbers and strings', 'Logger')
104 | end
105 | end
106 | self.file:write('\n')
107 | self.file:flush()
108 | -- (3) save symbols in internal table
109 | for k,val in pairs(symbols) do
110 | table.insert(self.symbols[k], val)
111 | end
112 | end
113 |
114 | function Logger:style(symbols)
115 | for name,style in pairs(symbols) do
116 | if type(style) == 'string' then
117 | self.styles[name] = {style}
118 | elseif type(style) == 'table' then
119 | self.styles[name] = style
120 | else
121 | xlua.error('style should be a string or a table of strings','Logger')
122 | end
123 | end
124 | return self
125 | end
126 |
127 | function Logger:setlogscale(state)
128 | self.logscale = state
129 | end
130 |
131 | function Logger:display(state)
132 | self.showPlot = state
133 | end
134 |
135 | function Logger:plot(...)
136 | if not xlua.require('gnuplot') then
137 | if not self.warned then
138 | print(' warning: cannot plot with this version of Torch')
139 | self.warned = true
140 | end
141 | return
142 | end
143 | local plotit = false
144 | local plots = {}
145 | local plotsymbol =
146 | function(name,list)
147 | if #list > 1 then
148 | local nelts = #list
149 | local plot_y = torch.Tensor(nelts)
150 | for i = 1,nelts do
151 | plot_y[i] = list[i]
152 | end
153 | for _,style in ipairs(self.styles[name]) do
154 | table.insert(plots, {self.names[name], plot_y, style})
155 | end
156 | plotit = true
157 | end
158 | end
159 | local args = {...}
160 | if not args[1] then -- plot all symbols
161 | for name,list in pairs(self.symbols) do
162 | plotsymbol(name,list)
163 | end
164 | else -- plot given symbols
165 | for _,name in ipairs(args) do
166 | plotsymbol(self.idx[name], self.symbols[self.idx[name]])
167 | end
168 | end
169 | if plotit then
170 | if self.showPlot then
171 | self.figure = gnuplot.figure(self.figure)
172 | if self.logscale then gnuplot.logscale('on') end
173 | gnuplot.plot(plots)
174 | if self.plotRawCmd then gnuplot.raw(self.plotRawCmd) end
175 | gnuplot.grid('on')
176 | gnuplot.title('')
177 | end
178 | if self.epsfile then
179 | os.execute('rm -f "' .. self.epsfile .. '"')
180 | local epsfig = gnuplot.epsfigure(self.epsfile)
181 | if self.logscale then gnuplot.logscale('on') end
182 | gnuplot.plot(plots)
183 | if self.plotRawCmd then gnuplot.raw(self.plotRawCmd) end
184 | gnuplot.grid('on')
185 | gnuplot.title('')
186 | gnuplot.plotflush()
187 | gnuplot.close(epsfig)
188 | end
189 | end
190 | end
191 |
--------------------------------------------------------------------------------
/cg.lua:
--------------------------------------------------------------------------------
1 | --[[
2 |
3 | This cg implementation is a rewrite of minimize.m written by Carl
4 | E. Rasmussen. It is supposed to produce exactly the same results (give
5 | or take numerical accuracy due to some changed order of
6 | operations). You can compare the result on rosenbrock with minimize.m.
7 | http://www.gatsby.ucl.ac.uk/~edward/code/minimize/example.html
8 |
9 | [x fx c] = minimize([0 0]', 'rosenbrock', -25)
10 |
11 | Note that we limit only the number of function evaluations; this seems much
12 | more important in practical use.
13 |
14 | ARGS:
15 |
16 | - `opfunc` : a function that takes a single input (X), the point of evaluation, and returns f(X) and df/dX.
17 | - `x` : the initial point
18 | - `state` : a table of parameters and temporary allocations.
19 | - `state.maxEval` : max number of function evaluations
20 | - `state.maxIter` : max number of iterations
21 | - `state.df[0,1,2,3]` : if you pass torch.Tensors they will be used for temp storage
22 | - `state.[s,x0]` : if you pass torch.Tensors they will be used for temp storage
23 |
24 | RETURN:
25 |
26 | - `x*` : the new x vector, at the optimal point
27 | - `f` : a table of all function values where
28 | `f[1]` is the value of the function before any optimization and
29 | `f[#f]` is the final fully optimized value, at x*
30 |
31 | (Koray Kavukcuoglu, 2012)
32 | --]]
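-- Example usage (a minimal sketch; `opfunc` stands for any closure returning f(X) and df/dX):
--
--    local x = torch.Tensor(2):zero()
--    x, fs, nEval = optim.cg(opfunc, x, {maxIter = 25})
--    print(fs[1], fs[#fs])   -- f before and after optimization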
33 | function optim.cg(opfunc, x, config, state)
34 | -- parameters
35 | local config = config or {}
36 | local state = state or config
37 | local rho = config.rho or 0.01
38 | local sig = config.sig or 0.5
39 | local int = config.int or 0.1
40 | local ext = config.ext or 3.0
41 | local maxIter = config.maxIter or 20
42 | local ratio = config.ratio or 100
43 | local maxEval = config.maxEval or maxIter*1.25
44 | local red = 1
45 |
46 | local verbose = config.verbose or 0
47 |
48 | local i = 0
49 | local ls_failed = 0
50 | local fx = {}
51 |
52 | -- we need three points for the interpolation/extrapolation stuff
53 | local z1,z2,z3 = 0,0,0
54 | local d1,d2,d3 = 0,0,0
55 | local f1,f2,f3 = 0,0,0
56 |
57 | local df1 = state.df1 or x.new()
58 | local df2 = state.df2 or x.new()
59 | local df3 = state.df3 or x.new()
60 | local tdf
61 |
62 | df1:resizeAs(x)
63 | df2:resizeAs(x)
64 | df3:resizeAs(x)
65 |
66 | -- search direction
67 | local s = state.s or x.new()
68 | s:resizeAs(x)
69 |
70 | -- we need a temp storage for X
71 | local x0 = state.x0 or x.new()
72 | local f0 = 0
73 | local df0 = state.df0 or x.new()
74 | x0:resizeAs(x)
75 | df0:resizeAs(x)
76 |
77 | -- evaluate at initial point
78 | f1,tdf = opfunc(x)
79 | fx[#fx+1] = f1
80 | df1:copy(tdf)
81 | i=i+1
82 |
83 | -- initial search direction
84 | s:copy(df1):mul(-1)
85 |
86 | d1 = -s:dot(s) -- slope
87 | z1 = red/(1-d1) -- initial step
88 |
89 | while i < math.abs(maxEval) do
90 |
91 | x0:copy(x)
92 | f0 = f1
93 | df0:copy(df1)
94 |
95 | x:add(z1,s)
96 | f2,tdf = opfunc(x)
97 | df2:copy(tdf)
98 | i=i+1
99 | d2 = df2:dot(s)
100 | f3,d3,z3 = f1,d1,-z1 -- init point 3 equal to point 1
101 | local m = math.min(maxIter,maxEval-i)
102 | local success = 0
103 | local limit = -1
104 |
105 | while true do
106 | while (f2 > f1+z1*rho*d1 or d2 > -sig*d1) and m > 0 do
107 | limit = z1
108 | if f2 > f1 then
109 | z2 = z3 - (0.5*d3*z3*z3)/(d3*z3+f2-f3)
110 | else
111 | local A = 6*(f2-f3)/z3+3*(d2+d3)
112 | local B = 3*(f3-f2)-z3*(d3+2*d2)
113 | z2 = (math.sqrt(B*B-A*d2*z3*z3)-B)/A
114 | end
115 | if z2 ~= z2 or z2 == math.huge or z2 == -math.huge then
116 | z2 = z3/2;
117 | end
118 | z2 = math.max(math.min(z2, int*z3),(1-int)*z3);
119 | z1 = z1 + z2;
120 | x:add(z2,s)
121 | f2,tdf = opfunc(x)
122 | df2:copy(tdf)
123 | i=i+1
124 | m = m - 1
125 | d2 = df2:dot(s)
126 | z3 = z3-z2;
127 | end
128 | if f2 > f1+z1*rho*d1 or d2 > -sig*d1 then
129 | break
130 | elseif d2 > sig*d1 then
131 | success = 1;
132 | break;
133 | elseif m == 0 then
134 | break;
135 | end
136 | local A = 6*(f2-f3)/z3+3*(d2+d3);
137 | local B = 3*(f3-f2)-z3*(d3+2*d2);
138 | z2 = -d2*z3*z3/(B+math.sqrt(B*B-A*d2*z3*z3))
139 |
140 | if z2 ~= z2 or z2 == math.huge or z2 == -math.huge or z2 < 0 then
141 | if limit < -0.5 then
142 | z2 = z1 * (ext -1)
143 | else
144 | z2 = (limit-z1)/2
145 | end
146 | elseif (limit > -0.5) and (z2+z1) > limit then
147 | z2 = (limit-z1)/2
148 | elseif limit < -0.5 and (z2+z1) > z1*ext then
149 | z2 = z1*(ext-1)
150 | elseif z2 < -z3*int then
151 | z2 = -z3*int
152 | elseif limit > -0.5 and z2 < (limit-z1)*(1-int) then
153 | z2 = (limit-z1)*(1-int)
154 | end
155 | f3=f2; d3=d2; z3=-z2;
156 | z1 = z1+z2;
157 | x:add(z2,s)
158 |
159 | f2,tdf = opfunc(x)
160 | df2:copy(tdf)
161 | i=i+1
162 | m = m - 1
163 | d2 = df2:dot(s)
164 | end
165 | if success == 1 then
166 | f1 = f2
167 | fx[#fx+1] = f1;
168 | local ss = (df2:dot(df2)-df2:dot(df1)) / df1:dot(df1)
169 | s:mul(ss)
170 | s:add(-1,df2)
171 | local tmp = df1:clone()
172 | df1:copy(df2)
173 | df2:copy(tmp)
174 | d2 = df1:dot(s)
175 | if d2> 0 then
176 | s:copy(df1)
177 | s:mul(-1)
178 | d2 = -s:dot(s)
179 | end
180 |
181 | z1 = z1 * math.min(ratio, d1/(d2-1e-320))
182 | d1 = d2
183 | ls_failed = 0
184 | else
185 | x:copy(x0)
186 | f1 = f0
187 | df1:copy(df0)
188 | if ls_failed or i>maxEval then
189 | break
190 | end
191 | local tmp = df1:clone()
192 | df1:copy(df2)
193 | df2:copy(tmp)
194 | s:copy(df1)
195 | s:mul(-1)
196 | d1 = -s:dot(s)
197 | z1 = 1/(1-d1)
198 | ls_failed = 1
199 | end
200 | end
201 | state.df0 = df0
202 | state.df1 = df1
203 | state.df2 = df2
204 | state.df3 = df3
205 | state.x0 = x0
206 | state.s = s
207 | return x,fx,i
208 | end
209 |
--------------------------------------------------------------------------------
/lswolfe.lua:
--------------------------------------------------------------------------------
1 | --[[ A Line Search satisfying the Wolfe conditions
2 |
3 | ARGS:
4 | - `opfunc` : a function (the objective) that takes a single input (X),
5 | the point of evaluation, and returns f(X) and df/dX
6 | - `x` : initial point / starting location
7 | - `t` : initial step size
8 | - `d` : descent direction
9 | - `f` : initial function value
10 | - `g` : gradient at initial location
11 | - `gtd` : directional derivative at starting location
12 | - `options.c1` : sufficient decrease parameter
13 | - `options.c2` : curvature parameter
14 | - `options.tolX` : minimum allowable step length
15 | - `options.maxIter` : maximum nb of iterations
16 |
17 | RETURN:
18 | - `f` : function value at x+t*d
19 | - `g` : gradient value at x+t*d
20 | - `x` : the next x (=x+t*d)
21 | - `t` : the step length
22 | - `lsFuncEval` : the number of function evaluations
23 | ]]
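-- Example usage (a sketch): this line search is usually not called directly but plugged
-- into L-BFGS through its `lineSearch` option, e.g.
--
--    optim.lbfgs(opfunc, x, {maxIter = 100, lineSearch = optim.lswolfe})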
24 | function optim.lswolfe(opfunc,x,t,d,f,g,gtd,options)
25 | -- options
26 | options = options or {}
27 | local c1 = options.c1 or 1e-4
28 | local c2 = options.c2 or 0.9
29 | local tolX = options.tolX or 1e-9
30 | local maxIter = options.maxIter or 20
31 | local isverbose = options.verbose or false
32 |
33 | -- some shortcuts
34 | local abs = torch.abs
35 | local min = math.min
36 | local max = math.max
37 |
38 | -- verbose function
39 | local function verbose(...)
40 | if isverbose then print(' ', ...) end
41 | end
42 |
43 | -- evaluate objective and gradient using initial step
44 | local x_init = x:clone()
45 | x:add(t,d)
46 | local f_new,g_new = opfunc(x)
47 | local lsFuncEval = 1
48 | local gtd_new = g_new * d
49 |
50 | -- bracket an interval containing a point satisfying the Wolfe
51 | -- criteria
52 | local LSiter,t_prev,done = 0,0,false
53 | local f_prev,g_prev,gtd_prev = f,g:clone(),gtd
54 | local bracket,bracketFval,bracketGval
55 | while LSiter < maxIter do
56 | -- check conditions:
57 | if (f_new > (f + c1*t*gtd)) or (LSiter > 1 and f_new >= f_prev) then
58 | bracket = x.new{t_prev,t}
59 | bracketFval = x.new{f_prev,f_new}
60 | bracketGval = x.new(2,g_new:size(1))
61 | bracketGval[1] = g_prev
62 | bracketGval[2] = g_new
63 | break
64 |
65 | elseif abs(gtd_new) <= -c2*gtd then
66 | bracket = x.new{t}
67 | bracketFval = x.new{f_new}
68 | bracketGval = x.new(1,g_new:size(1))
69 | bracketGval[1] = g_new
70 | done = true
71 | break
72 |
73 | elseif gtd_new >= 0 then
74 | bracket = x.new{t_prev,t}
75 | bracketFval = x.new{f_prev,f_new}
76 | bracketGval = x.new(2,g_new:size(1))
77 | bracketGval[1] = g_prev
78 | bracketGval[2] = g_new
79 | break
80 |
81 | end
82 |
83 | -- interpolate:
84 | local tmp = t_prev
85 | t_prev = t
86 | local minStep = t + 0.01*(t-tmp)
87 | local maxStep = t*10
88 | t = optim.polyinterp(x.new{{tmp,f_prev,gtd_prev},
89 | {t,f_new,gtd_new}},
90 | minStep, maxStep)
91 |
92 | -- next step:
93 | f_prev = f_new
94 | g_prev = g_new:clone()
95 | gtd_prev = gtd_new
96 | x[{}] = x_init
97 | x:add(t,d)
98 | f_new,g_new = opfunc(x)
99 | lsFuncEval = lsFuncEval + 1
100 | gtd_new = g_new * d
101 | LSiter = LSiter + 1
102 | end
103 |
104 | -- reached max nb of iterations?
105 | if LSiter == maxIter then
106 | bracket = x.new{0,t}
107 | bracketFval = x.new{f,f_new}
108 | bracketGval = x.new(2,g_new:size(1))
109 | bracketGval[1] = g
110 | bracketGval[2] = g_new
111 | end
112 |
113 | -- zoom phase: we now have a point satisfying the criteria, or
114 | -- a bracket around it. We refine the bracket until we find the
115 | -- exact point satisfying the criteria
116 | local insufProgress = false
117 | local LOposRemoved = 0
118 | while not done and LSiter < maxIter do
119 | -- find high and low points in bracket
120 | local f_LO,LOpos = bracketFval:min(1)
121 | LOpos = LOpos[1] f_LO = f_LO[1]
122 | local HIpos = -LOpos+3
123 |
124 | -- compute new trial value
125 | t = optim.polyinterp(x.new{{bracket[1],bracketFval[1],bracketGval[1]*d},
126 | {bracket[2],bracketFval[2],bracketGval[2]*d}})
127 |
128 | -- test whether we are making sufficient progress
129 | if min(bracket:max()-t,t-bracket:min())/(bracket:max()-bracket:min()) < 0.1 then
130 | if insufProgress or t>=bracket:max() or t <= bracket:min() then
131 | if abs(t-bracket:max()) < abs(t-bracket:min()) then
132 | t = bracket:max()-0.1*(bracket:max()-bracket:min())
133 | else
134 | t = bracket:min()+0.1*(bracket:max()-bracket:min())
135 | end
136 | insufProgress = false
137 | else
138 | insufProgress = true
139 | end
140 | else
141 | insufProgress = false
142 | end
143 |
144 | -- Evaluate new point
145 | x[{}] = x_init
146 | x:add(t,d)
147 | f_new,g_new = opfunc(x)
148 | lsFuncEval = lsFuncEval + 1
149 | gtd_new = g_new * d
150 | LSiter = LSiter + 1
151 | if f_new > f + c1*t*gtd or f_new >= f_LO then
152 | -- Armijo condition not satisfied or not lower than lowest point
153 | bracket[HIpos] = t
154 | bracketFval[HIpos] = f_new
155 | bracketGval[HIpos] = g_new
156 | else
157 | if abs(gtd_new) <= - c2*gtd then
158 | -- Wolfe conditions satisfied
159 | done = true
160 | elseif gtd_new*(bracket[HIpos]-bracket[LOpos]) >= 0 then
161 | -- Old HI becomes new LO
162 | bracket[HIpos] = bracket[LOpos]
163 | bracketFval[HIpos] = bracketFval[LOpos]
164 | bracketGval[HIpos] = bracketGval[LOpos]
165 | end
166 | -- New point becomes new LO
167 | bracket[LOpos] = t
168 | bracketFval[LOpos] = f_new
169 | bracketGval[LOpos] = g_new
170 | end
171 |
172 | -- done?
173 | if not done and abs((bracket[1]-bracket[2])*gtd_new) < tolX then
174 | break
175 | end
176 | end
177 |
178 | -- be verbose
179 | if LSiter == maxIter then
180 | verbose('reached max number of iterations')
181 | end
182 |
183 | -- return stuff
184 | local _,LOpos = bracketFval:min(1)
185 | LOpos = LOpos[1]
186 | t = bracket[LOpos]
187 | f_new = bracketFval[LOpos]
188 | g_new = bracketGval[LOpos]
189 | x[{}] = x_init
190 | x:add(t,d)
191 | return f_new,g_new,x,t,lsFuncEval
192 | end
193 |
--------------------------------------------------------------------------------
/polyinterp.lua:
--------------------------------------------------------------------------------
1 | local function isreal(x)
2 | return x == x
3 | end
4 |
5 | local function isnan(x)
6 | return x ~= x
7 | end
8 |
9 | local function roots(c)
10 | local tol=1e-12
11 | c[torch.lt(torch.abs(c),tol)]=0
12 |
13 | local nonzero = torch.ne(c,0)
14 | if nonzero:max() == 0 then
15 | return 0
16 | end
17 |
18 | -- first non-zero
19 | local _,pos = torch.max(nonzero,1)
20 | pos = pos[1]
21 | c=c[{ {pos,-1} }]
22 |
23 | local nz = 0
24 | for i=c:size(1),1,-1 do
25 | if c[i] ~= 0 then
26 | break
27 | else
28 | nz = nz + 1
29 | end
30 | end
31 | c=c[{ {1,c:size(1)-nz} }]
32 |
33 | local n = c:size(1)-1
34 | if n == 1 then
35 | local e = c.new({{-c[2]/c[1], 0}})
36 | if nz > 0 then
37 | return torch.cat(e, c.new(nz, 2):zero(), 1)
38 | else
39 | return e
40 | end
41 | elseif n > 1 then
42 | local A = torch.diag(c.new(n-1):fill(1),-1)
43 | A[1] = -c[{ {2,n+1} }]/c[1];
44 | local e = torch.eig(A,'N')
45 | if nz > 0 then
46 | return torch.cat(e, c.new(nz,2):zero(), 1)
47 | else
48 | return e
49 | end
50 | else
51 | return c.new(nz,2):zero()
52 | end
53 | end
54 |
55 | local function real(x)
56 | if type(x) == 'number' then return x end
57 | return x[{ {} , 1}]
58 | end
59 |
60 | local function imag(x)
61 | if type(x) == 'number' then return 0 end
62 | if x:nDimension() == 1 then
63 | return x.new(x:size(1)):zero()
64 | else
65 | return x[{ {}, 2}]
66 | end
67 | end
68 |
69 | local function polyval(p,x)
70 | local pwr = p:size(1)
71 | if type(x) == 'number' then
72 | local val = 0
73 | p:apply(function(pc) pwr = pwr-1; val = val + pc*x^pwr; return pc end)
74 | return val
75 | else
76 | local val = x.new(x:size(1))
77 | p:apply(function(pc) pwr = pwr-1; val:add(pc,torch.pow(x,pwr)); return pc end)
78 | return val
79 | end
80 | end
81 |
82 | ----------------------------------------------------------------------
83 | -- Minimum of interpolating polynomial based on function and
84 | -- derivative values
85 | --
86 | -- ARGS:
87 | -- points : N triplets (x,f,g), must be a Tensor
88 | -- xminBound : min value that brackets the minimum (default: min of points)
89 | -- xmaxBound : max value that brackets the minimum (default: max of points)
90 | --
91 | -- RETURN:
92 | -- minPos : position of minimum
93 | --
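-- Example (a minimal sketch): cubic interpolation between two (x, f(x), f'(x)) triplets
--
--    local points = torch.Tensor{{0, 1.0, -1.0},   -- f(0) = 1,   f'(0) = -1
--                                {1, 0.5,  0.2}}   -- f(1) = 0.5, f'(1) = 0.2
--    local minPos = optim.polyinterp(points)       -- estimated minimizer in [0, 1]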
94 | function optim.polyinterp(points,xminBound,xmaxBound)
95 | -- locals
96 | local sqrt = torch.sqrt
97 | local mean = torch.mean
98 | local max = math.max
99 | local min = math.min
100 |
101 | -- nb of points / order of polynomial
102 | local nPoints = points:size(1)
103 | local order = nPoints*2-1
104 |
105 | -- returned values
106 | local minPos
107 |
108 | -- Code for most common case:
109 | -- + cubic interpolation of 2 points w/ function and derivative values for both
110 | -- + no xminBound/xmaxBound
111 | if nPoints == 2 and order == 3 and not xminBound and not xmaxBound then
112 | -- Solution in this case (where x2 is the farthest point):
113 | -- d1 = g1 + g2 - 3*(f1-f2)/(x1-x2);
114 | -- d2 = sqrt(d1^2 - g1*g2);
115 | -- minPos = x2 - (x2 - x1)*((g2 + d2 - d1)/(g2 - g1 + 2*d2));
116 | -- t_new = min(max(minPos,x1),x2);
117 | local minVal,minPos = points[{ {},1 }]:min(1)
118 | minVal = minVal[1] minPos = minPos[1]
119 | local notMinPos = -minPos+3;
120 |
121 | local d1 = points[{minPos,3}] + points[{notMinPos,3}]
122 | - 3*(points[{minPos,2}]-points[{notMinPos,2}])
123 | / (points[{minPos,1}]-points[{notMinPos,1}]);
124 | local d2 = sqrt(d1^2 - points[{minPos,3}]*points[{notMinPos,3}]);
125 |
126 | if isreal(d2) then -- isreal()
127 | local t = points[{notMinPos,1}] - (points[{notMinPos,1}]
128 | - points[{minPos,1}]) * ((points[{notMinPos,3}] + d2 - d1)
129 | / (points[{notMinPos,3}] - points[{minPos,3}] + 2*d2))
130 |
131 | minPos = min(max(t,points[{minPos,1}]),points[{notMinPos,1}])
132 | else
133 | minPos = mean(points[{{},1}])
134 | end
135 | return minPos
136 | end
137 |
138 | -- TODO: get the code below to work!
139 | --error(' extrapolation not implemented yet...')
140 |
141 | -- Compute Bounds of Interpolation Area
142 | local xmin = points[{{},1}]:min()
143 | local xmax = points[{{},1}]:max()
144 | xminBound = xminBound or xmin
145 | xmaxBound = xmaxBound or xmax
146 |
147 | -- Add constraints on function values
148 | local A = points.new(nPoints*2,order+1):zero()
149 | local b = points.new(nPoints*2,1):zero()
150 | for i = 1,nPoints do
151 | local constraint = points.new(order+1):zero()
152 | for j = order,0,-1 do
153 | constraint[order-j+1] = points[{i,1}]^j
154 | end
155 | A[i] = constraint
156 | b[i] = points[{i,2}]
157 | end
158 |
159 | -- Add constraints based on derivatives
160 | for i = 1,nPoints do
161 | local constraint = points.new(order+1):zero()
162 | for j = 1,order do
163 | constraint[j] = (order-j+1)*points[{i,1}]^(order-j)
164 | end
165 | A[nPoints+i] = constraint
166 | b[nPoints+i] = points[{i,3}]
167 | end
168 |
169 | -- Find interpolating polynomial
170 | local res = torch.gels(b,A)
171 | local params = res[{ {1,nPoints*2} }]:squeeze()
172 |
173 | params[torch.le(torch.abs(params),1e-12)]=0
174 |
175 | -- Compute Critical Points
176 | local dParams = points.new(order):zero();
177 | for i = 1,params:size(1)-1 do
178 | dParams[i] = params[i]*(order-i+1)
179 | end
180 |
181 | -- nan/inf?
182 | local nans = false
183 | if torch.ne(dParams,dParams):max() > 0 or torch.eq(dParams,math.huge):max() > 0 then
184 | nans = true
185 | end
186 |
187 | local cp = torch.cat(points.new{xminBound,xmaxBound},points[{{},1}])
188 | if not nans then
189 | local cproots = roots(dParams)
190 | local cpi = points.new(cp:size(1),2):zero()
191 | cpi[{ {1,cp:size(1)} , 1 }] = cp
192 | cp = torch.cat(cpi,cproots,1)
193 | end
194 |
195 | -- Test Critical Points
196 | local fmin = math.huge
197 | -- Default to Bisection if no critical points valid:
198 | minPos = (xminBound+xmaxBound)/2
199 | for i = 1,cp:size(1) do
200 | local xCP = cp[{ {i,i} , {} }]
201 | local ixCP = imag(xCP)[1]
202 | local rxCP = real(xCP)[1]
203 | if ixCP == 0 and rxCP >= xminBound and rxCP <= xmaxBound then
204 | local fCP = polyval(params,rxCP)
205 | if fCP < fmin then
206 | minPos = rxCP
207 | fmin = fCP
208 | end
209 | end
210 | end
211 | return minPos,fmin
212 | end
213 |
--------------------------------------------------------------------------------
/fista.lua:
--------------------------------------------------------------------------------
1 | --[[ FISTA with backtracking line search
2 |
3 | - `f` : smooth function
4 | - `g` : non-smooth function
5 | - `pl` : minimizer of intermediate problem Q(x,y)
6 | - `xinit` : initial point
7 | - `params` : table of parameters (**optional**)
8 | - `params.L` : 1/(step size) for ISTA/FISTA iteration (0.1)
9 | - `params.Lstep` : step size multiplier at each iteration (1.5)
10 | - `params.maxiter` : max number of iterations (50)
11 | - `params.maxline` : max number of line search iterations per iteration (20)
12 | - `params.errthres`: error threshold for convergence check (1e-4)
13 | - `params.doFistaUpdate` : true : use FISTA, false: use ISTA (true)
14 | - `params.verbose` : store each iteration solution and print detailed info (false)
15 |
16 | On output, `params` will contain these additional fields that can be reused.
17 |
18 | - `params.L` : last used L value will be written.
19 |
20 | These are temporary storages needed by the algorithm; if the same params object is
21 | passed a second time, the same storages will be reused without new allocation.
22 |
23 | - `params.xkm` : previous iteration point
24 | - `params.y` : fista iteration
25 | - `params.ply` : ply = pl(y - 1/L grad(f))
26 |
27 | Returns the solution x and a history of {function evals, number of line search steps, ...}
28 |
29 | Algorithm is published in
30 |
31 | @article{beck-fista-09,
32 | Author = {Beck, Amir and Teboulle, Marc},
33 | Journal = {SIAM J. Img. Sci.},
34 | Number = {1},
35 | Pages = {183--202},
36 | Title = {A Fast Iterative Shrinkage-Thresholding Algorithm for Linear Inverse Problems},
37 | Volume = {2},
38 | Year = {2009}}
39 | ]]
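-- Example (a minimal sketch): minimize 0.5*||x - b||^2 + lambda*||x||_1, where `b` and
-- `lambda` below are arbitrary placeholders:
--
--    local b, lambda = torch.randn(10), 0.1
--    local f = function(x, mode)                 -- smooth part and its gradient
--       local fval = 0.5 * x:dist(b)^2
--       if mode and mode:match('dx') then return fval, x - b end
--       return fval
--    end
--    local g = function(x) return lambda * x:norm(1) end
--    local pl = function(x, L)                   -- prox of lambda*||.||_1: soft-thresholding
--       local s = torch.abs(x):add(-lambda/L):clamp(0, math.huge)
--       x:sign():cmul(s)
--    end
--    local x0 = torch.zeros(10)
--    local xstar, history = optim.FistaLS(f, g, pl, x0, {maxiter = 100})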
40 | function optim.FistaLS(f, g, pl, xinit, params)
41 |
42 | local params = params or {}
43 | local L = params.L or 0.1
44 | local Lstep = params.Lstep or 1.5
45 | local maxiter = params.maxiter or 50
46 | local maxline = params.maxline or 20
47 | local errthres = params.errthres or 1e-4
48 | local doFistaUpdate = params.doFistaUpdate
49 | local verbose = params.verbose
50 |
51 | -- temporary allocations
52 | params.xkm = params.xkm or torch.Tensor()
53 | params.y = params.y or torch.Tensor()
54 | params.ply = params.ply or torch.Tensor()
55 | local xkm = params.xkm -- previous iteration
56 | local y = params.y -- fista iteration
57 | local ply = params.ply -- soft shrinked y
58 |
59 | -- we start from all zeros
60 | local xk = xinit
61 | xkm:resizeAs(xk):zero()
62 | ply:resizeAs(xk):zero()
63 | y:resizeAs(xk):zero()
64 |
65 | local history = {} -- keep track of stuff
66 | local niter = 0 -- number of iterations done
67 | local converged = false -- are we done?
68 | local tk = 1 -- momentum param for FISTA
69 | local tkp = 0
70 |
71 |
72 | local gy = g(y)
73 | local fval = math.huge -- fval = f+g
74 | while not converged and niter < maxiter do
75 |
76 | -- run through smooth function (code is input, input is target)
77 | -- get derivatives from smooth function
78 | local fy,gfy = f(y,'dx')
79 | --local gfy = f(y)
80 |
81 | local fply = 0
82 | local gply = 0
83 | local Q = 0
84 |
85 | ----------------------------------------------
86 | -- do line search to find new current location starting from fista loc
87 | local nline = 0
88 | local linesearchdone = false
89 | while not linesearchdone do
90 | -- take a step in gradient direction of smooth function
91 | ply:copy(y)
92 | ply:add(-1/L,gfy)
93 |
94 | -- and solve for minimum of auxiliary problem
95 | pl(ply,L)
96 | -- this is candidate for new current iteration
97 | xk:copy(ply)
98 |
99 | -- evaluate this point F(ply)
100 | fply = f(ply)
101 |
102 | -- ply - y
103 | ply:add(-1, y)
104 | --
105 | local Q2 = gfy:dot(ply)
106 | -- L/2 ||beta-y||^2
107 | local Q3 = L/2 * ply:dot(ply)
108 | -- Q(beta,y) = f(y) + <grad f(y), beta-y> + L/2 ||beta-y||^2
109 | Q = fy + Q2 + Q3
110 |
111 | if verbose then
112 | print(string.format('nline=%d L=%g fply=%g Q=%g fy=%g Q2=%g Q3=%g',nline,L,fply,Q,fy,Q2,Q3))
113 | end
114 | -- check if f(beta) <= Q(beta,y)
115 | if fply <= Q then --and Fply + Gply <= F then
116 | -- now evaluate G here
117 | linesearchdone = true
118 | elseif nline >= maxline then
119 | linesearchdone = true
120 | xk:copy(xkm) -- if we can't find a better point, current iter = previous iter
121 | --print('oops')
122 | else
123 | L = L * Lstep
124 | end
125 | nline = nline + 1
126 | end
127 | -- end line search
128 | ---------------------------------------------
129 |
130 | ---------------------------------------------
131 | -- FISTA
132 | ---------------------------------------------
133 | if doFistaUpdate then
134 | -- do the FISTA step
135 | tkp = (1 + math.sqrt(1 + 4*tk*tk)) / 2
136 | -- x(k-1) = x(k-1) - x(k)
137 | xkm:add(-1,xk)
138 | -- y(k+1) = x(k) + ((1-t(k))/t(k+1))*(x(k-1)-x(k))
139 | y:copy(xk)
140 | y:add( (1-tk)/tkp , xkm)
141 | -- store for next iterations
142 | -- x(k-1) = x(k)
143 | xkm:copy(xk)
144 | else
145 | y:copy(xk)
146 | end
147 | -- t(k) = t(k+1)
148 | tk = tkp
149 | fply = f(y)
150 | gply = g(y)
151 | if verbose then
152 | print(string.format('iter=%d eold=%g enew=%g',niter,fval,fply+gply))
153 | end
154 |
155 | niter = niter + 1
156 |
157 | -- bookkeeping
158 | fval = fply + gply
159 | history[niter] = {}
160 | history[niter].nline = nline
161 | history[niter].L = L
162 | history[niter].F = fval
163 | history[niter].Fply = fply
164 | history[niter].Gply = gply
165 | history[niter].Q = Q
166 | params.L = L
167 | if verbose then
168 | history[niter].xk = xk:clone()
169 | history[niter].y = y:clone()
170 | end
171 |
172 | -- are we done?
173 | if niter > 1 and math.abs(history[niter].F - history[niter-1].F) <= errthres then
174 | converged = true
175 | xinit:copy(y)
176 | return y,history
177 | end
178 |
179 | if niter >= maxiter then
180 | xinit:copy(y)
181 | return y,history
182 | end
183 |
184 | --if niter > 1 and history[niter].F > history[niter-1].F then
185 | --print(niter, 'This was supposed to be a convex function, we are going up')
186 | --converged = true
187 | --return xk,history
188 | --end
189 | end
190 | error('not supposed to be here')
191 | end
192 |
193 |
--------------------------------------------------------------------------------
/doc/intro.md:
--------------------------------------------------------------------------------
1 |
2 | # Overview
3 |
4 | Most optimization algorithms have the following interface:
5 |
6 | ```lua
7 | x*, {f}, ... = optim.method(opfunc, x[, config][, state])
8 | ```
9 |
10 | where:
11 |
12 | * `opfunc`: a user-defined closure that respects this API: `f, df/dx = func(x)`
13 | * `x`: the current parameter vector (a 1D `Tensor`)
14 | * `config`: a table of parameters, dependent upon the algorithm
15 | * `state`: a table of state variables, if `nil`, `config` will contain the state
16 | * `x*`: the new parameter vector that minimizes `f, x* = argmin_x f(x)`
17 | * `{f}`: a table of all `f` values, in the order they've been evaluated (for some simple algorithms, like SGD, `#f == 1`)
18 |
19 |
20 |
21 | ## Example
22 |
23 | The state table is used to hold the state of the algorithm.
24 | It's usually initialized once, by the user, and then passed to the optim function as a black box.
25 | Example:
26 |
27 | ```lua
28 | config = {
29 | learningRate = 1e-3,
30 | momentum = 0.5
31 | }
32 |
33 | for i, sample in ipairs(training_samples) do
34 | local func = function(x)
35 | -- define eval function
36 | return f, df_dx
37 | end
38 | optim.sgd(func, x, config)
39 | end
40 | ```
41 |
42 |
43 |
44 | ## Training using optim ##
45 |
46 | `optim` is a quite general optimizer, for minimizing any function with respect to a set of parameters.
47 | In our case, our function will be the loss of our network, given an input, and a set of weights.
48 | The goal of training a neural net is to optimize the weights to give the lowest loss over our validation set, by using the training set as a proxy.
49 | So, we are going to use optim to minimize the loss with respect to the weights, over our training set.
50 |
51 | To illustrate all the steps required, we will go over a simple example, where we will train a neural network on the classical XOR problem.
52 | We will feed the data to `optim` in minibatches (here, just a single minibatch), breaking the training set into chunks and feeding each minibatch to `optim`, one by one.
53 |
54 | We need to give `optim` a function that, given the current weights as its argument, outputs the loss and the
55 | derivative of the loss with respect to the weights.
56 | The function will have access to our training minibatch, and use it to calculate the loss for this minibatch.
57 | Typically, the function is defined inside our loop over batches, and therefore has access to the current minibatch data.
58 |
59 |
60 | ### Neural Network ###
61 |
62 | We create a simple neural network with one hidden layer.
63 |
64 | ```lua
65 | require 'nn'
66 |
67 | model = nn.Sequential() -- make a multi-layer perceptron
68 | inputs = 2; outputs = 1; HUs = 20 -- parameters
69 | model:add(nn.Linear(inputs, HUs))
70 | model:add(nn.Tanh())
71 | model:add(nn.Linear(HUs, outputs))
72 | ```
73 |
74 | > If we would like to train on the GPU, then we need to ship the model to *device memory* by typing `model:cuda()` after having issued `require 'cunn'`.
75 |
76 |
77 | ### Criterion ###
78 |
79 | We choose the *Mean Squared Error* loss `Criterion`:
80 |
81 | ```lua
82 | criterion = nn.MSECriterion()
83 | ```
84 |
85 | We are using an `nn.MSECriterion` because we are training on a regression task, predicting continuous (real) target values, from `-1` to `+1`.
86 | For a classification task with more than two classes, we would add an `nn.LogSoftMax` layer to the end of our network, and use an `nn.ClassNLLCriterion` loss criterion.
87 | Nevertheless, the XOR problem could be seen as a two-class classification task, associated with the `-1` and `+1` discrete outputs.
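For instance, a two-class variant of this network could look like the following sketch (hypothetical `clsModel`/`clsCriterion` names; not used in the rest of this tutorial):

```lua
clsModel = nn.Sequential()
clsModel:add(nn.Linear(inputs, HUs))
clsModel:add(nn.Tanh())
clsModel:add(nn.Linear(HUs, 2))          -- one output unit per class
clsModel:add(nn.LogSoftMax())
clsCriterion = nn.ClassNLLCriterion()    -- targets are then class indices (1 or 2), not -1/+1
```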
88 |
89 | > If we would like to train on the GPU, then we need to ship the `Criterion` to *device memory* by typing `criterion:cuda()`.
90 |
91 |
92 | ### Data set ###
93 |
94 | We will just create one minibatch of `128` examples.
95 | In your own training, you would want to break your (likely much larger) data set into multiple minibatches, of around `32` to `512` examples each.
96 |
97 | ```lua
98 | batchSize = 128
99 | batchInputs = torch.DoubleTensor(batchSize, inputs) -- or CudaTensor for GPU training
100 | batchLabels = torch.DoubleTensor(batchSize) -- or CudaTensor for GPU training
101 |
102 | for i = 1, batchSize do
103 | local input = torch.randn(2) -- normally distributed example in 2d
104 | local label
105 | if input[1] * input[2] > 0 then -- calculate label for XOR function
106 | label = -1
107 | else
108 | label = 1
109 | end
110 | batchInputs[i]:copy(input)
111 | batchLabels[i] = label
112 | end
113 | ```
114 |
115 |
116 | ### Flatten parameters ###
117 |
118 | `optim` expects the parameters that are to be optimized, and their gradients, to be one-dimensional `Tensor`s.
119 | But our network model probably contains multiple modules, typically multiple convolutional layers, and each of these layers has its own `weight` and `bias` `Tensor`s.
120 | How do we handle this?
121 |
122 | It is simple: we can call the standard method `:getParameters()`, which is defined for any network module.
123 | When we call this method, the following magic will happen:
124 |
125 | - a new `Tensor` will be created, large enough to hold all the `weight`s and `bias`es of the entire network model
126 | - the model `weight` and `bias` `Tensor`s are replaced with views onto the new contiguous parameter `Tensor`
127 | - and the exact same thing will happen for all the gradient `Tensor`s: replaced with views onto one single contiguous gradient `Tensor`
128 |
129 | We can call this method as follows:
130 |
131 | ```lua
132 | params, gradParams = model:getParameters()
133 | ```
134 |
135 | These flattened `Tensor`s have the following characteristics:
136 |
137 | - to `optim`, the parameters it needs to optimize are all contained in one single one-dimensional `Tensor`
138 | - when `optim` optimizes the parameters in this large one-dimensional `Tensor`, it is implicitly optimizing the `weight`s and `bias`es in our network model, since those are now simply views onto this large one-dimensional parameter `Tensor`
139 |
140 | It will look something like this:
141 |
142 | 
143 |
144 | > Note that flattening the parameters redefines the `weight` and `bias` `Tensor`s for all the network modules in our model.
145 | > Therefore, any pre-existing references to the original layer `weight` and `bias` `Tensor`s will no longer point to the model's parameters after flattening.
146 |
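As a quick sanity check (a sketch using the model defined above), you can verify that a layer's `weight` now shares storage with the flattened parameter `Tensor`:

```lua
params, gradParams = model:getParameters()
-- the first Linear layer's weight is now a view onto the same storage as params
print(torch.pointer(model:get(1).weight:storage()) == torch.pointer(params:storage()))
-- true
```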
147 |
148 | ### Training ###
149 |
150 | Now that we have created our model, our training set, and prepared the flattened network parameters, we can train using `optim`.
151 | `optim` provides [various training algorithms](index.md).
152 | We will use the stochastic gradient descent algorithm [SGD](index.md#x-sgdopfunc-x-state).
153 | We need to provide the learning rate, via an optimization state table:
154 |
155 | ```lua
156 | local optimState = {learningRate = 0.01}
157 | ```
158 |
159 | We define an evaluation function, inside our training loop, and use `optim.sgd` to train the system:
160 |
161 | ```lua
162 | require 'optim'
163 |
164 | for epoch = 1, 50 do
165 | -- local function we give to optim
166 | -- it takes current weights as input, and outputs the loss
167 | -- and the gradient of the loss with respect to the weights
168 | -- gradParams is calculated implicitly by calling 'backward',
169 | -- because the model's weight and bias gradient tensors
170 | -- are simply views onto gradParams
171 | function feval(params)
172 | gradParams:zero()
173 |
174 | local outputs = model:forward(batchInputs)
175 | local loss = criterion:forward(outputs, batchLabels)
176 | local dloss_doutputs = criterion:backward(outputs, batchLabels)
177 | model:backward(batchInputs, dloss_doutputs)
178 |
179 | return loss, gradParams
180 | end
181 | optim.sgd(feval, params, optimState)
182 | end
183 | ```
184 |
185 |
186 | ### Test the network ###
187 |
188 | For the prediction task, we will also typically use minibatches, although we can run prediction sample by sample too.
189 | In this example, we will predict sample by sample.
190 | To run prediction on a minibatch, simply pass in a tensor with one additional dimension, which represents the sample index.
191 |
192 | ```lua
193 | x = torch.Tensor(2)
194 | x[1] = 0.5; x[2] = 0.5; print(model:forward(x))
195 | x[1] = 0.5; x[2] = -0.5; print(model:forward(x))
196 | x[1] = -0.5; x[2] = 0.5; print(model:forward(x))
197 | x[1] = -0.5; x[2] = -0.5; print(model:forward(x))
198 | ```
199 |
200 | You should see something like:
201 |
202 | ```lua
203 | > x = torch.Tensor(2)
204 | > x[1] = 0.5; x[2] = 0.5; print(model:forward(x))
205 |
206 | -0.3490
207 | [torch.DoubleTensor of dimension 1]
208 |
209 | > x[1] = 0.5; x[2] = -0.5; print(model:forward(x))
210 |
211 | 1.0561
212 | [torch.DoubleTensor of dimension 1]
213 |
214 | > x[1] = -0.5; x[2] = 0.5; print(model:forward(x))
215 |
216 | 0.8640
217 | [torch.DoubleTensor of dimension 1]
218 |
219 | > x[1] = -0.5; x[2] = -0.5; print(model:forward(x))
220 |
221 | -0.2941
222 | [torch.DoubleTensor of dimension 1]
223 | ```
224 |
225 | If we were running on a GPU, we would probably want to predict using minibatches, because this will hide the latencies involved in transferring data from main memory to the GPU.
226 | To predict on a minibatch, we could do something like:
227 |
228 | ```lua
229 | x = torch.CudaTensor({
230 | { 0.5, 0.5},
231 | { 0.5, -0.5},
232 | {-0.5, 0.5},
233 | {-0.5, -0.5}
234 | })
235 | print(model:forward(x))
236 | ```
237 |
238 | You should see something like:
239 |
240 | ```lua
241 | > print(model:forward(x))
242 | -0.3490
243 | 1.0561
244 | 0.8640
245 | -0.2941
246 | [torch.CudaTensor of size 4]
247 | ```
248 |
249 | That's it!
250 | For minibatched prediction, the output tensor contains one value for each of our input data samples.
251 |
--------------------------------------------------------------------------------
/lbfgs.lua:
--------------------------------------------------------------------------------
1 | --[[ An implementation of L-BFGS, heavily inspired by minFunc (Mark Schmidt)
2 |
3 | This implementation of L-BFGS relies on a user-provided line
4 | search function (state.lineSearch). If this function is not
5 | provided, then a simple learningRate is used to produce fixed
6 | size steps. Fixed size steps are much less costly than line
7 | searches, and can be useful for stochastic problems.
8 |
9 | The learning rate is used even when a line search is provided.
10 | This is also useful for large-scale stochastic problems, where
11 | opfunc is a noisy approximation of f(x). In that case, the learning
12 | rate allows a reduction of confidence in the step size.
13 |
14 | ARGS:
15 |
16 | - `opfunc` : a function that takes a single input (X), the point of
17 | evaluation, and returns f(X) and df/dX
18 | - `x` : the initial point
19 | - `state` : a table describing the state of the optimizer; after each
20 | call the state is modified
21 | - `state.maxIter` : Maximum number of iterations allowed
22 | - `state.maxEval` : Maximum number of function evaluations
23 | - `state.tolFun` : Termination tolerance on the first-order optimality
24 | - `state.tolX` : Termination tol on progress in terms of func/param changes
25 | - `state.lineSearch` : A line search function
26 | - `state.learningRate` : If no line search provided, then a fixed step size is used
27 |
28 | RETURN:
29 | - `x*` : the new `x` vector, at the optimal point
30 | - `f` : a table of all function values:
31 | `f[1]` is the value of the function before any optimization and
32 | `f[#f]` is the final fully optimized value, at `x*`
33 |
34 | (Clement Farabet, 2012)
35 | ]]
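-- Example usage (a minimal sketch; `opfunc` and the problem size `n` are assumed to be defined):
--
--    local state = {maxIter = 100, lineSearch = optim.lswolfe}
--    local x = torch.zeros(n)
--    x, fs = optim.lbfgs(opfunc, x, state)
--    print(fs[1], fs[#fs])   -- f before and after optimization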
36 | function optim.lbfgs(opfunc, x, config, state)
37 | -- get/update state
38 | local config = config or {}
39 | local state = state or config
40 | local maxIter = tonumber(config.maxIter) or 20
41 | local maxEval = tonumber(config.maxEval) or maxIter*1.25
42 | local tolFun = config.tolFun or 1e-5
43 | local tolX = config.tolX or 1e-9
44 | local nCorrection = config.nCorrection or 100
45 | local lineSearch = config.lineSearch
46 | local lineSearchOpts = config.lineSearchOptions
47 | local learningRate = config.learningRate or 1
48 | local isverbose = config.verbose or false
49 |
50 | state.funcEval = state.funcEval or 0
51 | state.nIter = state.nIter or 0
52 |
53 | -- verbose function
54 | local verbose
55 | if isverbose then
56 | verbose = function(...) print(' ', ...) end
57 | else
58 | verbose = function() end
59 | end
60 |
61 | -- import some functions
62 | local abs = math.abs
63 | local min = math.min
64 |
65 | -- evaluate initial f(x) and df/dx
66 | local f,g = opfunc(x)
67 | local f_hist = {f}
68 | local currentFuncEval = 1
69 | state.funcEval = state.funcEval + 1
70 | local p = g:size(1)
71 |
72 | -- check optimality of initial point
73 | state.tmp1 = state.tmp1 or g.new(g:size()):zero(); local tmp1 = state.tmp1
74 | tmp1:copy(g):abs()
75 | if tmp1:sum() <= tolFun then
76 | -- optimality condition below tolFun
77 | verbose('optimality condition below tolFun')
78 | return x,f_hist
79 | end
80 |
81 | if not state.dir_bufs then
82 | -- reusable buffers for y's and s's, and their histories
83 | verbose('creating recyclable direction/step/history buffers')
84 | state.dir_bufs = state.dir_bufs or g.new(nCorrection+1, p):split(1)
85 | state.stp_bufs = state.stp_bufs or g.new(nCorrection+1, p):split(1)
86 | for i=1,#state.dir_bufs do
87 | state.dir_bufs[i] = state.dir_bufs[i]:squeeze(1)
88 | state.stp_bufs[i] = state.stp_bufs[i]:squeeze(1)
89 | end
90 | end
91 |
92 | -- variables cached in state (for tracing)
93 | local d = state.d
94 | local t = state.t
95 | local old_dirs = state.old_dirs
96 | local old_stps = state.old_stps
97 | local Hdiag = state.Hdiag
98 | local g_old = state.g_old
99 | local f_old = state.f_old
100 |
101 | -- optimize for a max of maxIter iterations
102 | local nIter = 0
103 | while nIter < maxIter do
104 | -- keep track of nb of iterations
105 | nIter = nIter + 1
106 | state.nIter = state.nIter + 1
107 |
108 | ------------------------------------------------------------
109 | -- compute gradient descent direction
110 | ------------------------------------------------------------
111 | if state.nIter == 1 then
112 | d = g:clone():mul(-1) -- -g
113 | old_dirs = {}
114 | old_stps = {}
115 | Hdiag = 1
116 | else
117 | -- do lbfgs update (update memory)
118 | local y = table.remove(state.dir_bufs) -- pop
119 | local s = table.remove(state.stp_bufs)
120 | y:add(g, -1, g_old) -- g - g_old
121 | s:mul(d, t) -- d*t
122 | local ys = y:dot(s) -- y*s
123 | if ys > 1e-10 then
124 | -- updating memory
125 | if #old_dirs == nCorrection then
126 | -- shift history by one (limited-memory)
127 | local removed1 = table.remove(old_dirs, 1)
128 | local removed2 = table.remove(old_stps, 1)
129 | table.insert(state.dir_bufs, removed1)
130 | table.insert(state.stp_bufs, removed2)
131 | end
132 |
133 | -- store new direction/step
134 | table.insert(old_dirs, s)
135 | table.insert(old_stps, y)
136 |
137 | -- update scale of initial Hessian approximation
138 | Hdiag = ys / y:dot(y) -- (y*y)
139 | else
140 | -- put y and s back into the buffer pool
141 | table.insert(state.dir_bufs, y)
142 | table.insert(state.stp_bufs, s)
143 | end
144 |
145 | -- compute the approximate (L-BFGS) inverse Hessian
146 | -- multiplied by the gradient
147 | local k = #old_dirs
148 |
149 | -- need to be accessed element-by-element, so don't re-type tensor:
150 | state.ro = state.ro or torch.Tensor(nCorrection); local ro = state.ro
151 | for i = 1,k do
152 | ro[i] = 1 / old_stps[i]:dot(old_dirs[i])
153 | end
154 |
155 | -- iteration in L-BFGS loop collapsed to use just one buffer
156 | local q = tmp1 -- reuse tmp1 for the q buffer
157 | -- need to be accessed element-by-element, so don't re-type tensor:
158 | state.al = state.al or torch.zeros(nCorrection) local al = state.al
159 |
160 | q:mul(g, -1) -- -g
161 | for i = k,1,-1 do
162 | al[i] = old_dirs[i]:dot(q) * ro[i]
163 | q:add(-al[i], old_stps[i])
164 | end
165 |
166 | -- multiply by initial Hessian
167 | r = d -- share the same buffer, since we don't need the old d
168 | r:mul(q, Hdiag) -- q[1] * Hdiag
169 | for i = 1,k do
170 | local be_i = old_stps[i]:dot(r) * ro[i]
171 | r:add(al[i]-be_i, old_dirs[i])
172 | end
173 | -- final direction is in r/d (same object)
174 | end
175 | g_old = g_old or g:clone()
176 | g_old:copy(g)
177 | f_old = f
178 |
179 | ------------------------------------------------------------
180 | -- compute step length
181 | ------------------------------------------------------------
182 | -- directional derivative
183 | local gtd = g:dot(d) -- g * d
184 |
185 | -- check that progress can be made along that direction
186 | if gtd > -tolX then
187 | break
188 | end
189 |
190 | -- reset initial guess for step size
191 | if state.nIter == 1 then
192 | tmp1:copy(g):abs()
193 | t = min(1,1/tmp1:sum()) * learningRate
194 | else
195 | t = learningRate
196 | end
197 |
198 | -- optional line search: user function
199 | local lsFuncEval = 0
200 | if lineSearch and type(lineSearch) == 'function' then
201 | -- perform line search, using user function
202 | f,g,x,t,lsFuncEval = lineSearch(opfunc,x,t,d,f,g,gtd,lineSearchOpts)
203 | table.insert(f_hist, f)
204 | else
205 | -- no line search, simply move with fixed-step
206 | x:add(t,d)
207 | if nIter ~= maxIter then
208 | -- re-evaluate function only if not in last iteration
209 | -- the reason we do this: in a stochastic setting,
210 | -- no use to re-evaluate that function here
211 | f,g = opfunc(x)
212 | lsFuncEval = 1
213 | table.insert(f_hist, f)
214 | end
215 | end
216 |
217 | -- update func eval
218 | currentFuncEval = currentFuncEval + lsFuncEval
219 | state.funcEval = state.funcEval + lsFuncEval
220 |
221 | ------------------------------------------------------------
222 | -- check conditions
223 | ------------------------------------------------------------
224 | if nIter == maxIter then
225 | -- no use to run tests
226 | verbose('reached max number of iterations')
227 | break
228 | end
229 |
230 | if currentFuncEval >= maxEval then
231 | -- max nb of function evals
232 | verbose('max nb of function evals')
233 | break
234 | end
235 |
236 | tmp1:copy(g):abs()
237 | if tmp1:sum() <= tolFun then
238 | -- check optimality
239 | verbose('optimality condition below tolFun')
240 | break
241 | end
242 |
243 | tmp1:copy(d):mul(t):abs()
244 | if tmp1:sum() <= tolX then
245 | -- step size below tolX
246 | verbose('step size below tolX')
247 | break
248 | end
249 |
250 | if abs(f-f_old) < tolX then
251 | -- function value changing less than tolX
252 | verbose('function value changing less than tolX')
253 | break
254 | end
255 | end
256 |
257 | -- save state
258 | state.old_dirs = old_dirs
259 | state.old_stps = old_stps
260 | state.Hdiag = Hdiag
261 | state.g_old = g_old
262 | state.f_old = f_old
263 | state.t = t
264 | state.d = d
265 |
266 | -- return optimal x, and history of f(x)
267 | return x,f_hist,currentFuncEval
268 | end
269 |
--------------------------------------------------------------------------------
/cmaes.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'math'
3 |
4 | local BestSolution = {}
5 | --[[ An implementation of `CMAES` (Covariance Matrix Adaptation Evolution Strategy),
6 | ported from https://www.lri.fr/~hansen/barecmaes2.html.
7 |
8 | Parameters
9 | ----------
10 | ARGS:
11 |
12 | - `opfunc` : a function that takes a single input (X), the point of
13 | evaluation, and returns f(X) and df/dX. Note that df/dX is not used
14 | - `x` : the initial point
15 | - `state.sigma`
16 | float, initial step-size (standard deviation in each
17 | coordinate)
18 | - `state.maxEval`
19 | int, maximal number of function evaluations
20 | - `state.ftarget`
21 | float, target function value; stop if fitness < ftarget
22 | - `state.popsize`
23 | population size. If this is left empty,
24 | 4 + int(3 * log(|x|)) will be used
27 | - `state.verb_disp`
28 | int, display on console every verb_disp iteration, 0 for never
29 |
30 | RETURN:
31 | - `x*` : the new `x` vector, at the optimal point
32 | - `f` : a table of all function values:
33 | `f[1]` is the value of the function before any optimization and
34 | `f[#f]` is the final fully optimized value, at `x*`
35 | --]]
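-- Example usage (a minimal sketch; `opfunc` stands for any closure returning f(X); the
-- gradient part of its return value is ignored):
--
--    local state = {sigma = 0.5, maxEval = 2000, verb_disp = 0}
--    local x = torch.DoubleTensor(10):fill(1)
--    x, fs = optim.cmaes(opfunc, x, state)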
36 | function optim.cmaes(opfunc, x, config, state)
37 | if (x.triu == nil or x.diag == nil) then
38 | error('Unsupported Tensor ' .. x:type() .. " please use Float- or DoubleTensor for x")
39 | end
40 | -- process input parameters
41 | local config = config or {}
42 | local state = state or config
43 | local xmean = x:clone():view(-1) -- distribution mean, a flattened copy
44 | local N = xmean:size(1) -- number of objective variables/problem dimension
45 | local sigma = state.sigma -- coordinate wise standard deviation (step size)
46 | local ftarget = state.ftarget -- stop if fitness < ftarget
47 | local maxEval = tonumber(state.maxEval) or 1e3*N^2
48 | local objfunc = opfunc
49 | local verb_disp = state.verb_disp -- display progress on console every verb_disp iterations
50 | local min_iterations = state.min_iterations or 1
51 |
52 | local lambda = state.popsize -- population size, offspring number
53 | -- Strategy parameter setting: Selection
54 | if state.popsize == nil then
55 | lambda = 4 + math.floor(3 * math.log(N))
56 | end
57 |
58 | local mu = lambda / 2 -- number of parents/points for recombination
59 | local weights = torch.range(0,mu-1):apply(function(i)
60 | return math.log(mu+0.5) - math.log(i+1) end) -- recombination weights
61 | weights:div(weights:sum()) -- normalize recombination weights array
62 | local mueff = weights:sum()^2 / torch.pow(weights,2):sum() -- variance-effectiveness of sum w_i x_i
63 | weights = weights:typeAs(x)
64 |
65 | -- Strategy parameter setting: Adaptation
66 | local cc = (4 + mueff/N) / (N+4 + 2 * mueff/N) -- time constant for cumulation for C
67 | local cs = (mueff + 2) / (N + mueff + 5) -- t-const for cumulation for sigma control
68 | local c1 = 2 / ((N + 1.3)^2 + mueff) -- learning rate for rank-one update of C
69 | local cmu = math.min(1 - c1, 2 * (mueff - 2 + 1/mueff) / ((N + 2)^2 + mueff)) -- and for rank-mu update
70 | local damps = 2 * mueff/lambda + 0.3 + cs -- damping for sigma, usually close to 1
71 |
72 | -- Initialize dynamic (internal) state variables
73 | local pc = torch.Tensor(N):zero():typeAs(x) -- evolution paths for C
74 | local ps = torch.Tensor(N):zero():typeAs(x) -- evolution paths for sigma
75 | local B = torch.eye(N):typeAs(x) -- B defines the coordinate system
76 | local D = torch.Tensor(N):fill(1):typeAs(x) -- diagonal D defines the scaling
77 | local C = torch.eye(N):typeAs(x) -- covariance matrix
78 | if not pcall(function () torch.symeig(C,'V') end) then -- if error occurs trying to use symeig
79 | error('torch.symeig not available for ' .. x:type() ..
80 | " please use Float- or DoubleTensor for x")
81 | end
82 | local candidates = torch.Tensor(lambda,N):typeAs(x)
83 | local invsqrtC = torch.eye(N):typeAs(x) -- C^-1/2
84 | local eigeneval = 0 -- tracking the update of B and D
85 | local counteval = 0
86 | local f_hist = {[1]=opfunc(x)} -- for bookkeeping output and termination
87 | local fitvals = torch.Tensor(lambda)-- fitness values
88 | local best = BestSolution.new(nil,nil,counteval)
89 | local iteration = 0 -- iteration of the optimize loop
90 |
91 |
92 | local function ask()
93 | --[[return a list of lambda candidate solutions according to
94 | m + sig * Normal(0,C) = m + sig * B * D * Normal(0,I)
95 | --]]
96 | -- Eigendecomposition: first update B, D and invsqrtC from C
97 |       -- postpone the eigendecomposition so that its amortized cost stays O(N^2) per sample
98 | if counteval - eigeneval > lambda/(c1+cmu)/C:size(1)/10 then
99 | eigeneval = counteval
100 | C = torch.triu(C) + torch.triu(C,1):t() -- enforce symmetry
101 | D, B = torch.symeig(C,'V') -- eigen decomposition, B==normalized eigenvectors, O(N^3)
102 | D = torch.sqrt(D) -- D contains standard deviations now
103 | invsqrtC = (B * torch.diag(torch.pow(D,-1)) * B:t())
104 | end
105 | for k=1,lambda do --repeat lambda times
106 | local z = D:clone():normal(0,1):cmul(D)
107 | candidates[{k,{}}] = torch.add(xmean, (B * z) * sigma)
108 | end
109 |
110 | return candidates
111 | end
112 |
113 |
114 | local function tell(arx)
115 | --[[update the evolution paths and the distribution parameters m,
116 | sigma, and C within CMA-ES.
117 |
118 | Parameters
119 | ----------
120 | `arx`
121 | a list of solutions, presumably from `ask()`
122 | `fitvals`
123 | the corresponding objective function values --]]
124 | -- bookkeeping, preparation
125 | counteval = counteval + lambda -- slightly artificial to do here
126 | local xold = xmean:clone()
127 |
128 | -- Sort by fitness and compute weighted mean into xmean
129 | local arindex = nil --sorted indices
130 | fitvals, arindex = torch.sort(fitvals)
131 | arx = arx:index(1, arindex[{{1, mu}}]) -- sorted candidate solutions
132 |
133 | table.insert(f_hist, fitvals[1]) --append best fitness to history
134 | best:update(arx[1], fitvals[1], counteval)
135 |
136 | xmean:zero()
137 | xmean:addmv(arx:t(), weights) --dot product
138 |
139 | -- Cumulation: update evolution paths
140 | local y = xmean - xold
141 | local z = invsqrtC * y -- == C^(-1/2) * (xnew - xold)
142 |
143 | local c = (cs * (2-cs) * mueff)^0.5 / sigma
144 | ps = ps - ps * cs + z * c -- exponential decay on ps
145 | local hsig = (torch.sum(torch.pow(ps,2)) /
146 | (1-(1-cs)^(2*counteval/lambda)) / N < 2 + 4./(N+1))
147 | hsig = hsig and 1.0 or 0.0 --use binary numbers
148 |
149 | c = (cc * (2-cc) * mueff)^0.5 / sigma
150 | pc = pc - pc * cc + y * c * hsig -- exponential decay on pc
151 |
152 | -- Adapt covariance matrix C
153 | local c1a = c1 - (1-hsig^2) * c1 * cc * (2-cc)
154 | -- for a minor adjustment to the variance loss by hsig
155 | for i=1,N do
156 | for j=1,N do
157 | local r = torch.range(1,mu)
158 | r:apply(function(k)
159 | return weights[k] * (arx[k][i]-xold[i]) * (arx[k][j]-xold[j]) end)
160 | local Cmuij = torch.sum(r) / sigma^2 -- rank-mu update
161 | C[i][j] = C[i][j] + ((-c1a - cmu) * C[i][j] +
162 | c1 * pc[i]*pc[j] + cmu * Cmuij)
163 | end
164 | end
165 |
166 | -- Adapt step-size sigma with factor <= exp(0.6) \approx 1.82
167 | sigma = sigma * math.exp(math.min(0.6,
168 | (cs / damps) * (torch.sum(torch.pow(ps,2))/N - 1)/2))
169 | end
170 |
171 | local function stop()
172 | --[[return satisfied termination conditions in a table like
173 | {'termination reason':value, ...}, for example {'tolfun':1e-12},
174 | or the empty table {}--]]
175 | local res = {}
176 | if counteval > 0 then
177 | if counteval >= maxEval then
178 | res['evals'] = maxEval
179 | end
180 | if ftarget ~= nil and fitvals:nElement() > 0 and fitvals[1] <= ftarget then
181 | res['ftarget'] = ftarget
182 | end
183 | if torch.max(D) > 1e7 * torch.min(D) then
184 | res['condition'] = 1e7
185 | end
186 | if fitvals:nElement() > 1 and fitvals[fitvals:nElement()] - fitvals[1] < 1e-12 then
187 | res['tolfun'] = 1e-12
188 | end
189 | if sigma * torch.max(D) < 1e-11 then
190 | -- remark: max(D) >= max(diag(C))^0.5
191 | res['tolx'] = 1e-11
192 | end
193 | end
194 | return res
195 | end
196 |
197 | local function disp(verb_modulo)
198 | --[[display some iteration info--]]
199 | if verb_disp == 0 then
200 | return nil
201 | end
202 | local iteration = counteval / lambda
203 |
204 | if iteration == 1 or iteration % (10*verb_modulo) == 0 then
205 | print('evals:\t ax-ratio max(std) f-value')
206 | end
207 | if iteration <= 2 or iteration % verb_modulo == 0 then
208 | local max_std = math.sqrt(torch.max(torch.diag(C)))
209 | print(tostring(counteval).. ': ' ..
210 | string.format(' %6.1f %8.1e ', torch.max(D) / torch.min(D), sigma * max_std)
211 | .. tostring(fitvals[1]))
212 | end
213 |
214 | return nil
215 | end
216 |
217 | while next(stop()) == nil or iteration < min_iterations do
218 | iteration = iteration + 1
219 |
220 | local X = ask() -- deliver candidate solutions
221 | for i=1, lambda do
222 | -- put candidate tensor back in input shape and evaluate in opfunc
223 | local candidate = X[i]:viewAs(x)
224 | fitvals[i] = objfunc(candidate)
225 | end
226 |
227 | tell(X)
228 | disp(verb_disp)
229 | end
230 |
231 | local bestmu, f, c = best:get()
232 | if verb_disp > 0 then
233 | for k, v in pairs(stop()) do
234 | print('termination by', k, '=', v)
235 | end
236 | print('best f-value =', f)
237 | print('solution = ')
238 | print(bestmu)
239 | print('best found at iteration: ', c/lambda, ' , total iterations: ', iteration)
240 | end
241 | table.insert(f_hist, f)
242 |
243 | return bestmu, f_hist, counteval
244 | end
245 |
246 |
247 |
248 | BestSolution.__index = BestSolution
249 | function BestSolution.new(x, f, evals)
250 | local self = setmetatable({}, BestSolution)
251 | self.x = x
252 | self.f = f
253 | self.evals = evals
254 | return self
255 | end
256 |
257 | function BestSolution.update(self, arx, arf, evals)
258 |    --[[update the best solution with `arx`, `arf`, and `evals`,
259 |    keeping it only if `arf` improves on the current best; better solutions have smaller `f`-values.--]]
260 | if self.f == nil or arf < self.f then
261 | self.x = arx:clone()
262 | self.f = arf
263 | self.evals = evals
264 | end
265 | return self
266 | end
267 |
268 | function BestSolution.get(self)
269 | return self.x, self.f, self.evals
270 | end
271 |
--------------------------------------------------------------------------------
/ConfusionMatrix.lua:
--------------------------------------------------------------------------------
1 | --[[ A Confusion Matrix class
2 |
3 | Example:
4 |
5 | conf = optim.ConfusionMatrix( {'cat','dog','person'} ) -- new matrix
6 | conf:zero() -- reset matrix
7 | for i = 1,N do
8 | conf:add( neuralnet:forward(sample), label ) -- accumulate errors
9 | end
10 | print(conf) -- print matrix
11 | image.display(conf:render()) -- render matrix
12 | ]]
13 | local ConfusionMatrix = torch.class('optim.ConfusionMatrix')
14 |
15 | function ConfusionMatrix:__init(nclasses, classes)
16 | if type(nclasses) == 'table' then
17 | classes = nclasses
18 | nclasses = #classes
19 | end
20 | self.mat = torch.LongTensor(nclasses,nclasses):zero()
21 | self.valids = torch.FloatTensor(nclasses):zero()
22 | self.unionvalids = torch.FloatTensor(nclasses):zero()
23 | self.nclasses = nclasses
24 | self.totalValid = 0
25 | self.averageValid = 0
26 | self.classes = classes or {}
27 | -- buffers
28 | self._mat_flat = self.mat:view(-1)
29 | self._target = torch.FloatTensor()
30 | self._prediction = torch.FloatTensor()
31 | self._max = torch.FloatTensor()
32 | self._pred_idx = torch.LongTensor()
33 | self._targ_idx = torch.LongTensor()
34 | end
35 |
36 | -- takes scalar prediction and target as input
37 | function ConfusionMatrix:_add(p, t)
38 | assert(p and type(p) == 'number')
39 | assert(t and type(t) == 'number')
40 | -- non-positive values are considered missing
41 | -- and therefore ignored
42 | if t > 0 then
43 | self.mat[t][p] = self.mat[t][p] + 1
44 | end
45 | end
46 |
47 | function ConfusionMatrix:add(prediction, target)
48 | if type(prediction) == 'number' then
49 | -- comparing numbers
50 | self:_add(prediction, target)
51 | else
52 | self._prediction:resize(prediction:size()):copy(prediction)
53 | assert(prediction:dim() == 1)
54 | if type(target) == 'number' then
55 | -- prediction is a vector, then target assumed to be an index
56 | self._max:max(self._pred_idx, self._prediction, 1)
57 | self:_add(self._pred_idx[1], target)
58 | else
59 | -- both prediction and target are vectors
60 | assert(target:dim() == 1)
61 | self._target:resize(target:size()):copy(target)
62 | self._max:max(self._targ_idx, self._target, 1)
63 | self._max:max(self._pred_idx, self._prediction, 1)
64 | self:_add(self._pred_idx[1], self._targ_idx[1])
65 | end
66 | end
67 | end
68 |
69 | function ConfusionMatrix:batchAdd(predictions, targets)
70 | local preds, targs, __
71 | self._prediction:resize(predictions:size()):copy(predictions)
72 | if predictions:dim() == 1 then
73 | -- predictions is a vector of classes
74 | preds = self._prediction
75 | elseif predictions:dim() == 2 then
76 | -- prediction is a matrix of class likelihoods
77 | if predictions:size(2) == 1 then
78 | -- or prediction just needs flattening
79 | preds = self._prediction:select(2,1)
80 | else
81 | self._max:max(self._pred_idx, self._prediction, 2)
82 | preds = self._pred_idx:select(2,1)
83 | end
84 | else
85 | error("predictions has invalid number of dimensions")
86 | end
87 |
88 | self._target:resize(targets:size()):copy(targets)
89 | if targets:dim() == 1 then
90 | -- targets is a vector of classes
91 | targs = self._target
92 | elseif targets:dim() == 2 then
93 | -- targets is a matrix of one-hot rows
94 | if targets:size(2) == 1 then
95 | -- or targets just needs flattening
96 | targs = self._target:select(2,1)
97 | else
98 | self._max:max(self._targ_idx, self._target, 2)
99 | targs = self._targ_idx:select(2,1)
100 | end
101 | else
102 | error("targets has invalid number of dimensions")
103 | end
104 |
105 | -- non-positive values are considered missing and therefore ignored
106 | local mask = targs:ge(1)
107 | targs = targs[mask]
108 | preds = preds[mask]
109 |
110 | self._mat_flat = self._mat_flat or self.mat:view(-1) -- for backward compatibility
111 |
112 | preds = preds:typeAs(targs)
113 |
114 | assert(self.mat:isContiguous() and self.mat:stride(2) == 1)
115 | local indices = ((targs - 1) * self.mat:stride(1) + preds):typeAs(self.mat)
116 | local ones = torch.ones(1):typeAs(self.mat):expand(indices:size(1))
117 | self._mat_flat:indexAdd(1, indices, ones)
118 | end
119 |
120 | function ConfusionMatrix:zero()
121 | self.mat:zero()
122 | self.valids:zero()
123 | self.unionvalids:zero()
124 | self.totalValid = 0
125 | self.averageValid = 0
126 | end
127 |
128 | local function isNaN(number)
129 | return number ~= number
130 | end
131 |
132 | function ConfusionMatrix:updateValids()
133 | local total = 0
134 | for t = 1,self.nclasses do
135 | self.valids[t] = self.mat[t][t] / self.mat:select(1,t):sum()
136 | self.unionvalids[t] = self.mat[t][t] / (self.mat:select(1,t):sum()+self.mat:select(2,t):sum()-self.mat[t][t])
137 | total = total + self.mat[t][t]
138 | end
139 | self.totalValid = total / self.mat:sum()
140 | self.averageValid = 0
141 | self.averageUnionValid = 0
142 | local nvalids = 0
143 | local nunionvalids = 0
144 | for t = 1,self.nclasses do
145 | if not isNaN(self.valids[t]) then
146 | self.averageValid = self.averageValid + self.valids[t]
147 | nvalids = nvalids + 1
148 | end
149 | if not isNaN(self.valids[t]) and not isNaN(self.unionvalids[t]) then
150 | self.averageUnionValid = self.averageUnionValid + self.unionvalids[t]
151 | nunionvalids = nunionvalids + 1
152 | end
153 | end
154 | self.averageValid = self.averageValid / nvalids
155 | self.averageUnionValid = self.averageUnionValid / nunionvalids
156 | end
157 |
158 | -- Calculating FAR/FRR associated with the confusion matrix
159 |
160 | function ConfusionMatrix:farFrr()
161 | local cmat = self.mat
162 | local noOfClasses = cmat:size()[1]
163 | self._frrs = self._frrs or torch.zeros(noOfClasses)
164 | self._frrs:zero()
165 | self._classFrrs = self._classFrrs or torch.zeros(noOfClasses)
166 | self._classFrrs:zero()
167 | self._classFrrs:add(-1)
168 | self._fars = self._fars or torch.zeros(noOfClasses)
169 | self._fars:zero()
170 | self._classFars = self._classFars or torch.zeros(noOfClasses)
171 | self._classFars:zero()
172 | self._classFars:add(-1)
173 | local classSamplesCount = cmat:sum(2)
174 | local indx = 1
175 | for i=1,noOfClasses do
176 | if classSamplesCount[i][1] ~= 0 then
177 | self._frrs[indx] = 1 - cmat[i][i]/classSamplesCount[i][1]
178 | self._classFrrs[i] = self._frrs[indx]
179 | -- Calculating FARs
180 | local farNumerator = 0
181 | local farDenominator = 0
182 | for j=1, noOfClasses do
183 | if i ~= j then
184 | if classSamplesCount[j][1] ~= 0 then
185 | farNumerator = farNumerator + cmat[j][i]/classSamplesCount[j][1]
186 | farDenominator = farDenominator + 1
187 | end
188 | end
189 | end
190 | self._fars[indx] = farNumerator/farDenominator
191 | self._classFars[i] = self._fars[indx]
192 | indx = indx + 1
193 | end
194 | end
195 | indx = indx - 1
196 | local returnFrrs = self._frrs[{{1, indx}}]
197 | local returnFars = self._fars[{{1, indx}}]
198 | return self._classFrrs, self._classFars, returnFrrs, returnFars
199 | end
200 |
201 | local function log10(n)
202 | if math.log10 then
203 | return math.log10(n)
204 | else
205 | return math.log(n) / math.log(10)
206 | end
207 | end
208 |
209 | function ConfusionMatrix:__tostring__()
210 | self:updateValids()
211 | local str = {'ConfusionMatrix:\n'}
212 | local nclasses = self.nclasses
213 | table.insert(str, '[')
214 | local maxCnt = self.mat:max()
215 | local nDigits = math.max(8, 1 + math.ceil(log10(maxCnt)))
216 | for t = 1,nclasses do
217 | local pclass = self.valids[t] * 100
218 | pclass = string.format('%2.3f', pclass)
219 | if t == 1 then
220 | table.insert(str, '[')
221 | else
222 | table.insert(str, ' [')
223 | end
224 | for p = 1,nclasses do
225 | table.insert(str, string.format('%' .. nDigits .. 'd', self.mat[t][p]))
226 | end
227 | if self.classes and self.classes[1] then
228 | if t == nclasses then
229 | table.insert(str, ']] ' .. pclass .. '% \t[class: ' .. (self.classes[t] or '') .. ']\n')
230 | else
231 | table.insert(str, '] ' .. pclass .. '% \t[class: ' .. (self.classes[t] or '') .. ']\n')
232 | end
233 | else
234 | if t == nclasses then
235 | table.insert(str, ']] ' .. pclass .. '% \n')
236 | else
237 | table.insert(str, '] ' .. pclass .. '% \n')
238 | end
239 | end
240 | end
241 | table.insert(str, ' + average row correct: ' .. (self.averageValid*100) .. '% \n')
242 | table.insert(str, ' + average rowUcol correct (VOC measure): ' .. (self.averageUnionValid*100) .. '% \n')
243 | table.insert(str, ' + global correct: ' .. (self.totalValid*100) .. '%')
244 | return table.concat(str)
245 | end
246 |
247 | function ConfusionMatrix:render(sortmode, display, block, legendwidth)
248 | -- args
249 | local confusion = self.mat:double()
250 | local classes = self.classes
251 | local sortmode = sortmode or 'score' -- 'score' or 'occurrence'
252 | local block = block or 25
253 | local legendwidth = legendwidth or 200
254 | local display = display or false
255 |
256 | -- legends
257 | local legend = {
258 | ['score'] = 'Confusion matrix [sorted by scores, global accuracy = %0.3f%%, per-class accuracy = %0.3f%%]',
259 | ['occurrence'] = 'Confusion matrix [sorted by occurrences, accuracy = %0.3f%%, per-class accuracy = %0.3f%%]'
260 | }
261 |
262 | -- parse matrix / normalize / count scores
263 | local diag = torch.FloatTensor(#classes)
264 | local freqs = torch.FloatTensor(#classes)
265 | local unconf = confusion
266 | local confusion = confusion:clone()
267 | local corrects = 0
268 | local total = 0
269 | for target = 1,#classes do
270 | freqs[target] = confusion[target]:sum()
271 | corrects = corrects + confusion[target][target]
272 | total = total + freqs[target]
273 | confusion[target]:div( math.max(confusion[target]:sum(),1) )
274 | diag[target] = confusion[target][target]
275 | end
276 |
277 | -- accuracies
278 | local accuracy = corrects / total * 100
279 | local perclass = 0
280 | local total = 0
281 | for target = 1,#classes do
282 | if confusion[target]:sum() > 0 then
283 | perclass = perclass + diag[target]
284 | total = total + 1
285 | end
286 | end
287 | perclass = perclass / total * 100
288 | freqs:div(unconf:sum())
289 |
290 | -- sort matrix
291 | if sortmode == 'score' then
292 | _,order = torch.sort(diag,1,true)
293 | elseif sortmode == 'occurrence' then
294 | _,order = torch.sort(freqs,1,true)
295 | else
296 | error('sort mode must be one of: score | occurrence')
297 | end
298 |
299 | -- render matrix
300 | local render = torch.zeros(#classes*block, #classes*block)
301 | for target = 1,#classes do
302 | for prediction = 1,#classes do
303 | render[{ { (target-1)*block+1,target*block }, { (prediction-1)*block+1,prediction*block } }] = confusion[order[target]][order[prediction]]
304 | end
305 | end
306 |
307 | -- add grid
308 | for target = 1,#classes do
309 | render[{ {target*block},{} }] = 0.1
310 | render[{ {},{target*block} }] = 0.1
311 | end
312 |
313 | -- create rendering
314 | require 'image'
315 | require 'qtwidget'
316 | require 'qttorch'
317 | local win1 = qtwidget.newimage( (#render)[2]+legendwidth, (#render)[1] )
318 | image.display{image=render, win=win1}
319 |
320 | -- add legend
321 | for i in ipairs(classes) do
322 | -- background cell
323 | win1:setcolor{r=0,g=0,b=0}
324 | win1:rectangle((#render)[2],(i-1)*block,legendwidth,block)
325 | win1:fill()
326 |
327 | -- %
328 | win1:setfont(qt.QFont{serif=false, size=fontsize})
329 | local gscale = freqs[order[i]]/freqs:max()*0.9+0.1 --3/4
330 | win1:setcolor{r=gscale*0.5+0.2,g=gscale*0.5+0.2,b=gscale*0.8+0.2}
331 | win1:moveto((#render)[2]+10,i*block-block/3)
332 | win1:show(string.format('[%2.2f%% labels]',math.floor(freqs[order[i]]*10000+0.5)/100))
333 |
334 | -- legend
335 | win1:setfont(qt.QFont{serif=false, size=fontsize})
336 | local gscale = diag[order[i]]*0.8+0.2
337 | win1:setcolor{r=gscale,g=gscale,b=gscale}
338 | win1:moveto(120+(#render)[2]+10,i*block-block/3)
339 | win1:show(classes[order[i]])
340 |
341 | for j in ipairs(classes) do
342 | -- scores
343 | local score = confusion[order[j]][order[i]]
344 | local gscale = (1-score)*(score*0.8+0.2)
345 | win1:setcolor{r=gscale,g=gscale,b=gscale}
346 | win1:moveto((i-1)*block+block/5,(j-1)*block+block*2/3)
347 | win1:show(string.format('%02.0f',math.floor(score*100+0.5)))
348 | end
349 | end
350 |
351 | -- generate tensor
352 | local t = win1:image():toTensor()
353 |
354 | -- display
355 | if display then
356 | image.display{image=t, legend=string.format(legend[sortmode],accuracy,perclass)}
357 | end
358 |
359 | -- return rendering
360 | return t
361 | end
362 |
--------------------------------------------------------------------------------
/doc/image/parameterflattening.svg:
--------------------------------------------------------------------------------
[SVG markup omitted: parameter flattening diagram, doc/image/parameterflattening.svg]
--------------------------------------------------------------------------------
/doc/algos.md:
--------------------------------------------------------------------------------
1 |
2 | # Optimization algorithms
3 |
4 | The following algorithms are provided:
5 |
6 | * [*Stochastic Gradient Descent*](#optim.sgd)
7 | * [*Averaged Stochastic Gradient Descent*](#optim.asgd)
8 | * [*L-BFGS*](#optim.lbfgs)
9 | * [*Conjugate Gradients*](#optim.cg)
10 | * [*AdaDelta*](#optim.adadelta)
11 | * [*AdaGrad*](#optim.adagrad)
12 | * [*Adam*](#optim.adam)
13 | * [*AdaMax*](#optim.adamax)
14 | * [*FISTA with backtracking line search*](#optim.FistaLS)
15 | * [*Nesterov's Accelerated Gradient method*](#optim.nag)
16 | * [*RMSprop*](#optim.rmsprop)
17 | * [*Rprop*](#optim.rprop)
18 | * [*CMAES*](#optim.cmaes)
19 |
20 | All these algorithms are designed to support batch optimization as well as stochastic optimization.
21 | It's up to the user to construct an objective function that represents the batch, mini-batch, or single sample on which to evaluate the objective.
22 |
23 | Some of these algorithms support a line search, which can be passed as a function (*L-BFGS*), whereas others only support a learning rate (*SGD*).
24 |
25 | General interface:
26 |
27 | ```lua
28 | x*, {f}, ... = optim.method(opfunc, x[, config][, state])
29 | ```
30 |
31 |
32 |
33 | ## sgd(opfunc, x[, config][, state])
34 |
35 | An implementation of *Stochastic Gradient Descent* (*SGD*).
36 |
37 | Arguments:
38 |
39 | * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
40 | * `x`: the initial point
41 | * `config`: a table with configuration parameters for the optimizer
42 | * `config.learningRate`: learning rate
43 | * `config.learningRateDecay`: learning rate decay
44 | * `config.weightDecay`: weight decay
45 | * `config.weightDecays`: vector of individual weight decays
46 | * `config.momentum`: momentum
47 | * `config.dampening`: dampening for momentum
48 | * `config.nesterov`: enables Nesterov momentum
49 | * `state`: a table describing the state of the optimizer; after each call the state is modified
50 | * `state.learningRates`: vector of individual learning rates
51 |
52 | Returns:
53 |
54 | * `x*`: the new x vector
55 | * `f(x)`: the function, evaluated before the update
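
A usage sketch (the quadratic `feval` closure is illustrative, not part of the package): the typical pattern keeps a single `state` table alive across calls so that momentum buffers persist.

```lua
require 'torch'
require 'optim'

local target = torch.Tensor{1, 2, 3}
local function feval(x)
   local diff = x - target
   return 0.5 * diff:dot(diff), diff         -- f(X), df/dX
end

local x = torch.zeros(3)
local config = {learningRate = 0.1, momentum = 0.9}
local state  = {}                             -- reused across iterations
for i = 1, 100 do
   local _, fs = optim.sgd(feval, x, config, state)
   -- fs[1] is f(x), evaluated before this update
end
print(x)                                      -- x is updated in place
```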
56 |
57 |
58 |
59 | ## asgd(opfunc, x[, config][, state])
60 |
61 | An implementation of *Averaged Stochastic Gradient Descent* (*ASGD*):
62 |
63 | ```lua
64 | x = (1 - lambda eta_t) x - eta_t df / dx(z, x)
65 | a = a + mu_t [ x - a ]
66 |
67 | eta_t = eta0 / (1 + lambda eta0 t) ^ 0.75
68 | mu_t = 1 / max(1, t - t0)
69 | ```
70 |
71 | Arguments:
72 |
73 | * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
74 | * `x`: the initial point
75 | * `config`: a table with configuration parameters for the optimizer
76 | * `config.eta0`: learning rate
77 | * `config.lambda`: decay term
78 | * `config.alpha`: power for eta update
79 | * `config.t0`: point at which to start averaging
80 |
81 | Returns:
82 |
83 | * `x*`: the new x vector
84 | * `f(x)`: the function, evaluated before the update
85 | * `ax`: the averaged x vector
86 |
87 |
88 |
89 | ## lbfgs(opfunc, x[, config][, state])
90 |
91 | An implementation of *L-BFGS* that relies on a user-provided line search function (`state.lineSearch`).
92 | If this function is not provided, then a simple learning rate is used to produce fixed size steps.
93 | Fixed size steps are much less costly than line searches, and can be useful for stochastic problems.
94 |
95 | The learning rate is used even when a line search is provided.
96 | This is also useful for large-scale stochastic problems, where opfunc is a noisy approximation of `f(x)`.
97 | In that case, the learning rate allows a reduction of confidence in the step size.
98 |
99 | Arguments:
100 |
101 | * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
102 | * `x`: the initial point
103 | * `config`: a table with configuration parameters for the optimizer
104 | * `config.maxIter`: Maximum number of iterations allowed
105 | * `config.maxEval`: Maximum number of function evaluations
106 | * `config.tolFun`: Termination tolerance on the first-order optimality
107 | * `config.tolX`: Termination tol on progress in terms of func/param changes
108 | * `config.lineSearch`: A line search function
109 | * `config.learningRate`: If no line search provided, then a fixed step size is used
110 |
111 | Returns:
112 | * `x*`: the new `x` vector, at the optimal point
113 | * `f`: a table of all function values:
114 | * `f[1]` is the value of the function before any optimization and
115 | * `f[#f]` is the final fully optimized value, at `x*`
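
A hedged sketch of full-batch use on an illustrative quadratic (not part of the package), plugging in `optim.lswolfe`, the Wolfe line search shipped with this package, via `config.lineSearch`:

```lua
require 'torch'
require 'optim'

local A = torch.eye(5):mul(2)
local b = torch.ones(5)
local function feval(x)
   -- f(x) = 0.5 x'Ax - b'x,  df/dx = Ax - b
   local Ax = A * x
   return 0.5 * x:dot(Ax) - b:dot(x), Ax - b
end

local x = torch.zeros(5)
local xstar, fvals = optim.lbfgs(feval, x, {maxIter = 100, lineSearch = optim.lswolfe})
print(fvals[1], fvals[#fvals])   -- objective before and after optimization
```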
116 |
117 |
118 |
119 | ## cg(opfunc, x[, config][, state])
120 |
121 | An implementation of the *Conjugate Gradient* method which is a rewrite of `minimize.m` written by Carl E. Rasmussen.
122 | It is supposed to produce exactly the same results (give or take numerical accuracy due to a changed order of operations).
123 | You can compare the result on rosenbrock with [`minimize.m`](http://www.gatsby.ucl.ac.uk/~edward/code/minimize/example.html).
124 |
125 | ```lua
126 | x, fx, c = minimize([0, 0]', 'rosenbrock', -25)
127 | ```
128 |
129 | Note that we limit only the number of function evaluations, as this seems much more important in practical use.
130 |
131 | Arguments:
132 |
133 | * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
134 | * `x`: the initial point
135 | * `config`: a table with configuration parameters for the optimizer
136 | * `config.maxEval`: max number of function evaluations
137 | * `config.maxIter`: max number of iterations
138 | * `state`: a table of parameters and temporary allocations.
139 | * `state.df[0, 1, 2, 3]`: if you pass `Tensor` they will be used for temp storage
140 | * `state.[s, x0]`: if you pass `Tensor` they will be used for temp storage
141 |
142 | Returns:
143 |
144 | * `x*`: the new `x` vector, at the optimal point
145 | * `f`: a table of all function values where
146 | * `f[1]` is the value of the function before any optimization and
147 | * `f[#f]` is the final fully optimized value, at `x*`
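
A minimal call sketch on an illustrative quadratic (not part of the package), limiting only the number of function evaluations as suggested above:

```lua
require 'torch'
require 'optim'

local function feval(x)
   local diff = x - torch.Tensor{3, -1}
   return diff:dot(diff), diff * 2           -- f(X), df/dX
end

local x0 = torch.zeros(2)
local xstar, fvals, nEvals = optim.cg(feval, x0, {maxEval = 100})
print(fvals[1], fvals[#fvals], nEvals)       -- f before, f after, evaluations used
```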
148 |
149 |
150 |
151 | ## adadelta(opfunc, x[, config][, state])
152 |
153 | *AdaDelta* implementation for *SGD* http://arxiv.org/abs/1212.5701.
154 |
155 | Arguments:
156 |
157 | * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
158 | * `x`: the initial point
159 | * `config`: a table of hyper-parameters
160 | * `config.rho`: interpolation parameter
161 | * `config.eps`: for numerical stability
162 | * `state`: a table describing the state of the optimizer; after each call the state is modified
163 | * `state.paramVariance`: vector of temporal variances of parameters
164 | * `state.accDelta`: vector of accumulated delta of gradients
165 |
166 | Returns:
167 |
168 | * `x*`: the new x vector
169 | * `f(x)`: the function, evaluated before the update
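
A brief sketch on an illustrative quadratic; note that, unlike plain *SGD*, *AdaDelta* is normally run without an explicit learning rate, only `rho` and `eps`:

```lua
require 'torch'
require 'optim'

local function feval(x)
   local diff = x - torch.Tensor{1, 2}
   return 0.5 * diff:dot(diff), diff
end

local x = torch.zeros(2)
local state = {rho = 0.9, eps = 1e-6}        -- one table used as both config and state
for i = 1, 500 do
   optim.adadelta(feval, x, state)
end
print(x)                                      -- x has moved toward the minimizer {1, 2}
```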
170 |
171 |
172 |
173 | ## adagrad(opfunc, x[, config][, state])
174 |
175 | *AdaGrad* implementation for *SGD*.
176 |
177 | Arguments:
178 |
179 | * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
180 | * `x`: the initial point
181 | * `config`: a table with configuration parameters for the optimizer
182 | * `config.learningRate`: learning rate
183 | * `config.learningRateDecay`: learning rate decay
184 | * `config.weightDecay`: weight decay coefficient for regularization
185 | * `state`: a table describing the state of the optimizer; after each call the state is modified
186 | * `state.paramVariance`: vector of temporal variances of parameters
187 |
188 | Returns:
189 |
190 | * `x*`: the new `x` vector
191 | * `f(x)`: the function, evaluated before the update
192 |
193 |
194 |
195 | ## adam(opfunc, x[, config][, state])
196 |
197 | An implementation of *Adam* from http://arxiv.org/pdf/1412.6980.pdf.
198 |
199 | Arguments:
200 |
201 | * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
202 | * `x`: the initial point
203 | * `config`: a table with configuration parameters for the optimizer
204 | * `config.learningRate`: learning rate
205 | * `config.learningRateDecay`: learning rate decay
206 | * `config.weightDecay`: weight decay coefficient for regularization
207 | * `config.beta1`: first moment coefficient
208 | * `config.beta2`: second moment coefficient
209 | * `config.epsilon`: for numerical stability
210 | * `state`: a table describing the state of the optimizer; after each call the state is modified
211 |
212 | Returns:
213 |
214 | * `x*`: the new x vector
215 | * `f(x)`: the function, evaluated before the update
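
A sketch using commonly seen hyper-parameter values (both the values and the quadratic `feval` are illustrative, not prescribed by the package):

```lua
require 'torch'
require 'optim'

local function feval(x)
   local diff = x - torch.Tensor{1, 2, 3}
   return 0.5 * diff:dot(diff), diff
end

local x = torch.zeros(3)
local config = {learningRate = 1e-2, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8}
local state  = {}                             -- holds the moment estimates across calls
for i = 1, 1000 do
   optim.adam(feval, x, config, state)
end
print(x)
```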
216 |
217 |
218 |
219 | ## adamax(opfunc, x[, config][, state])
220 |
221 | An implementation of *AdaMax* from http://arxiv.org/pdf/1412.6980.pdf.
222 |
223 | Arguments:
224 |
225 | * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
226 | * `x`: the initial point
227 | * `config`: a table with configuration parameters for the optimizer
228 | * `config.learningRate`: learning rate
229 | * `config.beta1`: first moment coefficient
230 | * `config.beta2`: second moment coefficient
231 | * `config.epsilon`: for numerical stability
232 | * `state`: a table describing the state of the optimizer; after each call the state is modified
233 |
234 | Returns:
235 |
236 | * `x*`: the new `x` vector
237 | * `f(x)`: the function, evaluated before the update
238 |
239 |
240 |
241 | ## FistaLS(f, g, pl, xinit[, params])
242 |
243 | *Fista* with backtracking *Line Search*:
244 |
245 | * `f`: smooth function
246 | * `g`: non-smooth function
247 | * `pl`: minimizer of intermediate problem Q(x, y)
248 | * `xinit`: initial point
249 | * `params`: table of parameters (**optional**)
250 | * `params.L`: 1/(step size) for ISTA/FISTA iteration (0.1)
251 | * `params.Lstep`: step size multiplier at each iteration (1.5)
252 | * `params.maxiter`: max number of iterations (50)
253 | * `params.maxline`: max number of line search iterations per iteration (20)
254 | * `params.errthres`: Error threshold for convergence check (1e-4)
255 | * `params.doFistaUpdate`: true : use FISTA, false: use ISTA (true)
256 | * `params.verbose`: store each iteration solution and print detailed info (false)
257 |
258 | On output, `params` will contain these additional fields that can be reused:
259 | * `params.L`: the last used `L` value is written back.
260 |
261 | These are temporary storages needed by the algorithm; if the same `params` object is
262 | passed a second time, the same storages are reused without new allocation:
263 | * `params.xkm`: previous iteration point
264 | * `params.y`: fista iteration
265 | * `params.ply`: `ply = pl(y - 1/L grad(f))`
266 |
267 | Returns the solution `x` and a history of `{function evaluations, number of line searches, ...}`.
268 |
269 | Algorithm is published in http://epubs.siam.org/doi/abs/10.1137/080716542
270 |
271 |
272 |
273 | ## nag(opfunc, x[, config][, state])
274 |
275 | An implementation of *SGD* adapted with features of *Nesterov's Accelerated Gradient method*, based on the paper "On the Importance of Initialization and Momentum in Deep Learning" (Sutskever et al., ICML 2013) http://www.cs.toronto.edu/~fritz/absps/momentum.pdf.
276 |
277 | Arguments:
278 |
279 | * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
280 | * `x`: the initial point
281 | * `config`: a table with configuration parameters for the optimizer
282 | * `config.learningRate`: learning rate
283 | * `config.learningRateDecay`: learning rate decay
284 | * `config.weightDecay`: weight decay
285 | * `config.momentum`: momentum
286 | * `config.learningRates`: vector of individual learning rates
287 |
288 | Returns:
289 |
290 | * `x*`: the new `x` vector
291 | * `f(x)`: the function, evaluated before the update
292 |
293 |
294 |
295 | ## rmsprop(opfunc, x[, config][, state])
296 |
297 | An implementation of *RMSprop*.
298 |
299 | Arguments:
300 |
301 | * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
302 | * `x`: the initial point
303 | * `config`: a table with configuration parameters for the optimizer
304 | * `config.learningRate`: learning rate
305 | * `config.alpha`: smoothing constant
306 | * `config.epsilon`: value with which to initialise m
307 | * `state`: a table describing the state of the optimizer; after each call the state is modified
308 | * `state.m`: leaky sum of squares of parameter gradients,
309 | * `state.tmp`: and the square root (with epsilon smoothing)
310 |
311 | Returns:
312 |
313 | * `x*`: the new x vector
314 | * `f(x)`: the function, evaluated before the update
315 |
316 |
317 |
318 | ## rprop(opfunc, x[, config][, state])
319 |
320 | A plain implementation of *Rprop* (Martin Riedmiller, Koray Kavukcuoglu 2013).
321 |
322 | Arguments:
323 |
324 | * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
325 | * `x`: the initial point
326 | * `config`: a table with configuration parameters for the optimizer
327 | * `config.stepsize`: initial step size, common to all components
328 | * `config.etaplus`: multiplicative increase factor, > 1 (default 1.2)
329 | * `config.etaminus`: multiplicative decrease factor, < 1 (default 0.5)
330 | * `config.stepsizemax`: maximum stepsize allowed (default 50)
331 | * `config.stepsizemin`: minimum stepsize allowed (default 1e-6)
332 | * `config.niter`: number of iterations (default 1)
333 |
334 | Returns:
335 |
336 | * `x*`: the new x vector
337 | * `f(x)`: the function, evaluated before the update
338 |
339 |
340 |
341 | ## cmaes(opfunc, x[, config][, state])
342 |
343 | An implementation of *CMAES* (*Covariance Matrix Adaptation Evolution Strategy*), ported from https://www.lri.fr/~hansen/barecmaes2.html.
344 |
345 | *CMAES* is a stochastic, derivative-free method for heuristic global optimization of non-linear or non-convex continuous optimization problems.
346 | Note that this method will, on average, require many more function evaluations to converge than a gradient-based method.
347 |
348 | Arguments:
349 |
350 | If `state` is specified, then `config` is not used at all.
351 | Otherwise `state` is `config`.
352 |
353 | * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`. Note that `df/dX` is not used and can be left 0
354 | * `x`: the initial point
355 | * `state`: a table describing the state of the optimizer; after each call the state is modified
356 | * `state.sigma`: float, initial step-size (standard deviation in each coordinate)
357 | * `state.maxEval`: int, maximal number of function evaluations
358 | * `state.ftarget`: float, target function value; stop if `fitness < ftarget`
359 | * `state.popsize`: population size. If this is left empty, `4 + int(3 * log(|x|))` will be used
361 | * `state.verb_disp`: display info on console every verb_disp iteration, 0 for never
362 |
363 | Returns:
364 | * `x*`: the new `x` vector, at the optimal point
365 | * `f`: a table of all function values:
366 | * `f[1]` is the value of the function before any optimization and
367 | * `f[#f]` is the final fully optimized value, at `x*`
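
A derivative-free usage sketch (the objective below is illustrative): since the gradient is ignored, `opfunc` only needs to return `f(X)`, and `x` must be a `Float`- or `DoubleTensor`.

```lua
require 'torch'
require 'optim'

local function opfunc(x)
   local diff = x - torch.Tensor{0.5, -0.5, 1.0}
   return diff:dot(diff)                      -- df/dX is not used, so it is omitted here
end

local x0 = torch.rand(3)                      -- DoubleTensor, as required
local state = {sigma = 0.5, maxEval = 5000, verb_disp = 0}
local xbest, fhist, nEvals = optim.cmaes(opfunc, x0, nil, state)
print(fhist[#fhist], nEvals)                  -- best value found and evaluations used
```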
368 |
--------------------------------------------------------------------------------