├── .gitignore ├── .dokx ├── mkdocs.yml ├── test ├── test_cg.lua ├── test_adam.lua ├── test_sgd.lua ├── test_lbfgs_w_ls.lua ├── test_adagrad.lua ├── test_rmsprop.lua ├── test_adamax.lua ├── test_adadelta.lua ├── test_cmaes.lua ├── test_logger.lua ├── l2.lua ├── test_lbfgs.lua ├── test_confusion.lua ├── rosenbrock.lua ├── test_fista.lua └── sparsecoding.lua ├── CMakeLists.txt ├── init.lua ├── optim-1.0.5-0.rockspec ├── optim-1.0.4-0.rockspec ├── optim-1.0.3-0.rockspec ├── optim-1.0.3-1.rockspec ├── checkgrad.lua ├── README.md ├── adagrad.lua ├── rmsprop.lua ├── COPYRIGHT.txt ├── adadelta.lua ├── asgd.lua ├── adamax.lua ├── adam.lua ├── nag.lua ├── sgd.lua ├── rprop.lua ├── Logger.lua ├── cg.lua ├── lswolfe.lua ├── fista.lua ├── polyinterp.lua ├── lbfgs.lua ├── cmaes.lua ├── ConfusionMatrix.lua └── doc └── index.md /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /.dokx: -------------------------------------------------------------------------------- 1 | return { 2 | githubURL = "torch/optim", 3 | exclude = {"test", "polyinterp.lua"} 4 | } 5 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: optim 2 | theme : simplex 3 | repo_url : https://github.com/torch/optim 4 | use_directory_urls : false 5 | markdown_extensions: [extra] 6 | docs_dir : doc 7 | pages: 8 | - [index.md, Optim] 9 | -------------------------------------------------------------------------------- /test/test_cg.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'optim' 3 | 4 | require 'rosenbrock' 5 | require 'l2' 6 | 7 | 8 | x = torch.Tensor(2):fill(0) 9 | x,fx,i=optim.cg(rosenbrock,x,{maxIter=50}) 10 | 11 | print() 12 | print('Rosenbrock test: compare with http://www.gatsby.ucl.ac.uk/~edward/code/minimize/example.html') 13 | print() 14 | print('Number of function evals = ',i) 15 | print('x=');print(x) 16 | print('fx=') 17 | for i=1,#fx do print(i,fx[i]); end 18 | -------------------------------------------------------------------------------- /test/test_adam.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'optim' 3 | require 'rosenbrock' 4 | require 'l2' 5 | x = torch.Tensor(2):fill(0) 6 | fx = {} 7 | config = {learningRate=0.002} 8 | for i = 1,10001 do 9 | x,f=optim.adam(rosenbrock,x,config) 10 | if (i-1)%1000 == 0 then 11 | table.insert(fx,f[1]) 12 | end 13 | end 14 | print() 15 | print('Rosenbrock test') 16 | print() 17 | print('x=');print(x) 18 | print('fx=') 19 | for i=1,#fx do print((i-1)*1000+1,fx[i]); end 20 | -------------------------------------------------------------------------------- /test/test_sgd.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'optim' 3 | 4 | require 'rosenbrock' 5 | require 'l2' 6 | 7 | x = torch.Tensor(2):fill(0) 8 | fx = {} 9 | 10 | config = {learningRate=1e-3} 11 | for i = 1,10001 do 12 | x,f=optim.sgd(rosenbrock,x,config) 13 | if (i-1)%1000 == 0 then 14 | table.insert(fx,f[1]) 15 | end 16 | end 17 | 18 | print() 19 | print('Rosenbrock test') 20 | print() 21 | print('x=');print(x) 22 | print('fx=') 23 | for i=1,#fx do print((i-1)*1000+1,fx[i]); end 24 | 
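-- Note: `config` is passed on every call above, and optim.sgd keeps its state
-- (e.g. the evaluation counter used for learning-rate decay) in that same table,
-- so it persists across the 10001 iterations.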
-------------------------------------------------------------------------------- /test/test_lbfgs_w_ls.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'optim' 3 | 4 | require 'rosenbrock' 5 | require 'l2' 6 | 7 | print('--- batch test w/ line search ---') 8 | 9 | x = torch.Tensor(2):fill(0) 10 | x,fx,i=optim.lbfgs(rosenbrock,x,{maxIter=100, lineSearch=optim.lswolfe}) 11 | 12 | print() 13 | print('Rosenbrock test') 14 | print() 15 | print('Number of function evals = ',i) 16 | print('x=');print(x) 17 | print('fx=') 18 | for i=1,#fx do print(i,fx[i]); end 19 | print() 20 | print() 21 | -------------------------------------------------------------------------------- /test/test_adagrad.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'optim' 3 | 4 | require 'rosenbrock' 5 | require 'l2' 6 | 7 | x = torch.Tensor(2):fill(0) 8 | fx = {} 9 | 10 | config = {learningRate=1e-1} 11 | for i = 1,10001 do 12 | x,f=optim.adagrad(rosenbrock,x,config) 13 | if (i-1)%1000 == 0 then 14 | table.insert(fx,f[1]) 15 | end 16 | end 17 | 18 | print() 19 | print('Rosenbrock test') 20 | print() 21 | print('x=');print(x) 22 | print('fx=') 23 | for i=1,#fx do print((i-1)*1000+1,fx[i]); end 24 | -------------------------------------------------------------------------------- /test/test_rmsprop.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'optim' 3 | 4 | require 'rosenbrock' 5 | require 'l2' 6 | 7 | x = torch.Tensor(2):fill(0) 8 | fx = {} 9 | 10 | config = {learningRate=5e-4} 11 | for i = 1,10001 do 12 | x,f=optim.rmsprop(rosenbrock,x,config) 13 | if (i-1)%1000 == 0 then 14 | table.insert(fx,f[1]) 15 | end 16 | end 17 | 18 | print() 19 | print('Rosenbrock test') 20 | print() 21 | print('x=');print(x) 22 | print('fx=') 23 | for i=1,#fx do print((i-1)*1000+1,fx[i]); end 24 | -------------------------------------------------------------------------------- /test/test_adamax.lua: -------------------------------------------------------------------------------- 1 | 2 | require 'torch' 3 | require 'optim' 4 | require 'rosenbrock' 5 | require 'l2' 6 | 7 | x = torch.Tensor(2):fill(0) 8 | fx = {} 9 | state = {} 10 | config = {} 11 | for i = 1,10001 do 12 | x,f=optim.adamax(rosenbrock,x,config,state) 13 | if (i-1)%1000 == 0 then 14 | table.insert(fx,f[1]) 15 | end 16 | end 17 | 18 | print() 19 | print('Rosenbrock test') 20 | print() 21 | print('x=');print(x) 22 | print('fx=') 23 | for i=1,#fx do print((i-1)*1000+1,fx[i]); end 24 | -------------------------------------------------------------------------------- /test/test_adadelta.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'optim' 3 | 4 | require 'rosenbrock' 5 | require 'l2' 6 | 7 | x = torch.Tensor(2):fill(0) 8 | fx = {} 9 | state = {} 10 | config = {eps=1e-10} 11 | for i = 1,10001 do 12 | x,f=optim.adadelta(rosenbrock,x,config,state) 13 | if (i-1)%1000 == 0 then 14 | table.insert(fx,f[1]) 15 | end 16 | end 17 | 18 | print() 19 | print('Rosenbrock test') 20 | print() 21 | print('x=');print(x) 22 | print('fx=') 23 | for i=1,#fx do print((i-1)*1000+1,fx[i]); end 24 | -------------------------------------------------------------------------------- /test/test_cmaes.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'optim' 3 | 4 | require 
'rosenbrock' 5 | require 'l2' 6 | 7 | -- 10-D rosenbrock 8 | x = torch.Tensor(10):fill(0) 9 | config = {maxEval=10000, sigma=0.5, verb_disp=0} 10 | 11 | -- will take some time 12 | x,fx,i=optim.cmaes(rosenbrock,x,config) 13 | 14 | 15 | print('Rosenbrock test') 16 | print() 17 | -- approx 6500 function evals expected 18 | print('Number of function evals = ',i) 19 | print('x=');print(x) 20 | print('fx=') 21 | for i=1,#fx do print(i,fx[i]); end 22 | print() 23 | print() -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR) 3 | CMAKE_POLICY(VERSION 2.6) 4 | IF(LUAROCKS_PREFIX) 5 | MESSAGE(STATUS "Installing Torch through Luarocks") 6 | STRING(REGEX REPLACE "(.*)lib/luarocks/rocks.*" "\\1" CMAKE_INSTALL_PREFIX "${LUAROCKS_PREFIX}") 7 | MESSAGE(STATUS "Prefix inferred from Luarocks: ${CMAKE_INSTALL_PREFIX}") 8 | ENDIF() 9 | FIND_PACKAGE(Torch REQUIRED) 10 | 11 | SET(src) 12 | FILE(GLOB luasrc *.lua) 13 | ADD_TORCH_PACKAGE(optim "${src}" "${luasrc}") 14 | #ADD_TORCH_DOK(dok optim "Machine Learning" "Optimization" 3.2) 15 | -------------------------------------------------------------------------------- /test/test_logger.lua: -------------------------------------------------------------------------------- 1 | require 'optim' 2 | 3 | 4 | logger_former = optim.Logger('accuracy-former.log') 5 | logger_new = optim.Logger('accuracy-new.log') 6 | 7 | logger_new:setNames({'channel 1', 'channel 2', 'channel 3'}) 8 | 9 | for i = 1, 20 do 10 | logger_former:add({['channel 1'] = 1 , ['channel 2'] = 0.1 * i, ['channel 3'] = 1 - 0.2 * i}) 11 | logger_new:add({1 , 0.1 * i, 1 - 0.2 * i}) 12 | end 13 | 14 | logger_former:style({['channel 1'] = '-' , ['channel 2'] = '-', ['channel 3'] = '-'}) 15 | logger_new:style{'-', '-', '-'} 16 | 17 | logger_former:plot() 18 | logger_new:plot() 19 | 20 | 21 | -------------------------------------------------------------------------------- /test/l2.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | -- rosenbrock.m This function returns the function value, partial derivatives 3 | -- and Hessian of the (general dimension) rosenbrock function, given by: 4 | -- 5 | -- f(x) = sum_{i=1:D-1} 100*(x(i+1) - x(i)^2)^2 + (1-x(i))^2 6 | -- 7 | -- where D is the dimension of x. The true minimum is 0 at x = (1 1 ... 1). 8 | -- 9 | -- Carl Edward Rasmussen, 2001-07-21. 
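--
-- (The header above describes the Rosenbrock function; the l2 function below is a
-- simpler test objective: f(x) = sum_i x(i)^2, with gradient df/dx = 2*x and
-- minimum 0 at x = 0.)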
10 | 11 | function l2(x) 12 | 13 | local xx = x:clone() 14 | xx:cmul(xx) 15 | local fout = xx:sum() 16 | 17 | local dx = torch.Tensor():resizeAs(x):copy(x) 18 | dx:mul(2) 19 | --print('l2 eval = ', fout) 20 | return fout,dx 21 | 22 | end -------------------------------------------------------------------------------- /init.lua: -------------------------------------------------------------------------------- 1 | 2 | require 'torch' 3 | 4 | optim = {} 5 | 6 | -- optimizations 7 | require('optim.sgd') 8 | require('optim.cg') 9 | require('optim.asgd') 10 | require('optim.nag') 11 | require('optim.fista') 12 | require('optim.lbfgs') 13 | require('optim.adagrad') 14 | require('optim.rprop') 15 | require('optim.adam') 16 | require('optim.adamax') 17 | require('optim.rmsprop') 18 | require('optim.adadelta') 19 | require('optim.cmaes') 20 | 21 | -- line search functions 22 | require('optim.lswolfe') 23 | 24 | -- helpers 25 | require('optim.polyinterp') 26 | require('optim.checkgrad') 27 | 28 | -- tools 29 | require('optim.ConfusionMatrix') 30 | require('optim.Logger') 31 | 32 | return optim 33 | -------------------------------------------------------------------------------- /optim-1.0.5-0.rockspec: -------------------------------------------------------------------------------- 1 | package = "optim" 2 | version = "1.0.5-0" 3 | 4 | source = { 5 | url = "git://github.com/torch/optim", 6 | } 7 | 8 | description = { 9 | summary = "An optimization library for Torch.", 10 | detailed = [[ 11 | This package contains several optimization routines for Torch. 12 | ]], 13 | homepage = "https://github.com/torch/optim", 14 | license = "BSD" 15 | } 16 | 17 | dependencies = { 18 | "torch >= 7.0", 19 | } 20 | 21 | build = { 22 | type = "command", 23 | build_command = [[ 24 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) 25 | ]], 26 | install_command = "cd build && $(MAKE) install" 27 | } 28 | -------------------------------------------------------------------------------- /optim-1.0.4-0.rockspec: -------------------------------------------------------------------------------- 1 | package = "optim" 2 | version = "1.0.4-0" 3 | 4 | source = { 5 | url = "git://github.com/torch/optim", 6 | tag = "1.0.4-0" 7 | } 8 | 9 | description = { 10 | summary = "An optimization library for Torch.", 11 | detailed = [[ 12 | This package contains several optimization routines for Torch. 13 | ]], 14 | homepage = "https://github.com/torch/optim", 15 | license = "BSD" 16 | } 17 | 18 | dependencies = { 19 | "torch >= 7.0", 20 | } 21 | 22 | build = { 23 | type = "command", 24 | build_command = [[ 25 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) 26 | ]], 27 | install_command = "cd build && $(MAKE) install" 28 | } 29 | -------------------------------------------------------------------------------- /optim-1.0.3-0.rockspec: -------------------------------------------------------------------------------- 1 | package = "optim" 2 | version = "1.0.3-0" 3 | 4 | source = { 5 | url = "git://github.com/torch/optim", 6 | tag = "1.0.3-0" 7 | } 8 | 9 | description = { 10 | summary = "An optimization library for Torch.", 11 | detailed = [[ 12 | This package contains several optimization routines for Torch. 
13 | ]], 14 | homepage = "https://github.com/torch/optim", 15 | license = "BSD" 16 | } 17 | 18 | dependencies = { 19 | "torch >= 7.0", 20 | "sys >= 1.0", 21 | } 22 | 23 | build = { 24 | type = "command", 25 | build_command = [[ 26 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) 27 | ]], 28 | install_command = "cd build && $(MAKE) install" 29 | } 30 | -------------------------------------------------------------------------------- /optim-1.0.3-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "optim" 2 | version = "1.0.3-1" 3 | 4 | source = { 5 | url = "git://github.com/torch/optim", 6 | tag = "1.0.3-1" 7 | } 8 | 9 | description = { 10 | summary = "An optimization library for Torch.", 11 | detailed = [[ 12 | This package contains several optimization routines for Torch. 13 | ]], 14 | homepage = "https://github.com/torch/optim", 15 | license = "BSD" 16 | } 17 | 18 | dependencies = { 19 | "torch >= 7.0", 20 | "sys >= 1.0", 21 | } 22 | 23 | build = { 24 | type = "command", 25 | build_command = [[ 26 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) 27 | ]], 28 | install_command = "cd build && $(MAKE) install" 29 | } 30 | -------------------------------------------------------------------------------- /test/test_lbfgs.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'optim' 3 | 4 | require 'rosenbrock' 5 | require 'l2' 6 | 7 | print('--- regular batch test ---') 8 | 9 | x = torch.Tensor(2):fill(0) 10 | x,fx,i=optim.lbfgs(rosenbrock,x,{maxIter=100, learningRate=1e-1}) 11 | 12 | print() 13 | print('Rosenbrock test') 14 | print() 15 | print('Number of function evals = ',i) 16 | print('x=');print(x) 17 | print('fx=') 18 | for i=1,#fx do print(i,fx[i]); end 19 | print() 20 | print() 21 | 22 | print('--- stochastic test ---') 23 | 24 | x = torch.Tensor(2):fill(0) 25 | fx = {} 26 | config = {learningRate=1e-1, maxIter=1} 27 | for i = 1,100 do 28 | x,f=optim.lbfgs(rosenbrock,x,config) 29 | table.insert(fx,f[1]) 30 | end 31 | 32 | print() 33 | print('Rosenbrock test') 34 | print() 35 | print('Number of function evals = ',i) 36 | print('x=');print(x) 37 | print('fx=') 38 | for i=1,#fx do print(i,fx[i]); end 39 | -------------------------------------------------------------------------------- /test/test_confusion.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'optim' 3 | 4 | n_feature = 3 5 | classes = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9} 6 | 7 | print'ConfusionMatrix:__init() test' 8 | cm = optim.ConfusionMatrix(#classes, classes) 9 | 10 | target = 3 11 | prediction = torch.randn(#classes) 12 | 13 | print'ConfusionMatrix:add() test' 14 | cm:add(prediction, target) 15 | cm:add(prediction, torch.randn(#classes)) 16 | 17 | batch_size = 8 18 | 19 | targets = torch.randperm(batch_size) 20 | predictions = torch.randn(batch_size, #classes) 21 | 22 | print'ConfusionMatrix:batchAdd() test' 23 | cm:batchAdd(predictions, targets) 24 | assert(cm.mat:sum() == batch_size + 2, 'missing examples') 25 | 26 | print'ConfusionMatrix:updateValids() test' 27 | cm:updateValids() 28 | 29 | print'ConfusionMatrix:__tostring__() test' 30 | print(cm) 31 | 32 | target = 0 33 | cm:add(prediction, target) 34 | 
assert(cm.mat:sum() == batch_size + 2, 'too many examples') 35 | 36 | -- FAR/FRR testing on identify matrix. FRR/FAR should be zero for identity. 37 | cm.mat = torch.eye(#classes, #classes) 38 | classFrrs, classFars, frrs, fars = cm:farFrr() 39 | assert(classFrrs:sum() + classFars:sum() == 0, "Incorrect values") 40 | -------------------------------------------------------------------------------- /checkgrad.lua: -------------------------------------------------------------------------------- 1 | --[[ An implementation of a simple numerical gradient checker. 2 | 3 | ARGS: 4 | 5 | - `opfunc` : a function that takes a single input (X), the point of 6 | evaluation, and returns f(X) and df/dX 7 | - `x` : the initial point 8 | - `eps` : the epsilon to use for the numerical check (default is 1e-7) 9 | 10 | RETURN: 11 | 12 | - `diff` : error in the gradient, should be near tol 13 | - `dC` : exact gradient at point 14 | - `dC_est` : numerically estimates gradient at point 15 | 16 | ]]-- 17 | 18 | 19 | -- function that numerically checks gradient of NCA loss: 20 | function optim.checkgrad(opfunc, x, eps) 21 | 22 | -- compute true gradient: 23 | local _,dC = opfunc(x) 24 | dC:resize(x:size()) 25 | 26 | -- compute numeric approximations to gradient: 27 | local eps = eps or 1e-7 28 | local dC_est = torch.Tensor():typeAs(dC):resizeAs(dC) 29 | for i = 1,dC:size(1) do 30 | x[i] = x[i] + eps 31 | local C1 = opfunc(x) 32 | x[i] = x[i] - 2 * eps 33 | local C2 = opfunc(x) 34 | x[i] = x[i] + eps 35 | dC_est[i] = (C1 - C2) / (2 * eps) 36 | end 37 | 38 | -- estimate error of gradient: 39 | local diff = torch.norm(dC - dC_est) / torch.norm(dC + dC_est) 40 | return diff,dC,dC_est 41 | end 42 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Optimization package 2 | 3 | This package contains several optimization routines for [Torch](https://github.com/torch/torch7/blob/master/README.md). 4 | Each optimization algorithm is based on the same interface: 5 | 6 | ```lua 7 | x*, {f}, ... = optim.method(func, x, state) 8 | ``` 9 | 10 | where: 11 | 12 | * `func`: a user-defined closure that respects this API: `f, df/dx = func(x)` 13 | * `x`: the current parameter vector (a 1D `torch.Tensor`) 14 | * `state`: a table of parameters, and state variables, dependent upon the algorithm 15 | * `x*`: the new parameter vector that minimizes `f, x* = argmin_x f(x)` 16 | * `{f}`: a table of all f values, in the order they've been evaluated 17 | (for some simple algorithms, like SGD, `#f == 1`) 18 | 19 | ## Available algorithms 20 | 21 | Please check [this file](doc/index.md) for the full list of 22 | optimization algorithms available and examples. Get also into the 23 | [`test`](test/) directory for straightforward examples using the 24 | [Rosenbrock's](test/rosenbrock.lua) function. 25 | 26 | ## Important Note 27 | 28 | The state table is used to hold the state of the algorithm. 29 | It's usually initialized once, by the user, and then passed to the optim function 30 | as a black box. 
Example: 31 | 32 | ```lua 33 | state = { 34 | learningRate = 1e-3, 35 | momentum = 0.5 36 | } 37 | 38 | for i,sample in ipairs(training_samples) do 39 | local func = function(x) 40 | -- define eval function 41 | return f,df_dx 42 | end 43 | optim.sgd(func,x,state) 44 | end 45 | ``` 46 | -------------------------------------------------------------------------------- /test/rosenbrock.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | -- rosenbrock.m This function returns the function value, partial derivatives 3 | -- and Hessian of the (general dimension) rosenbrock function, given by: 4 | -- 5 | -- f(x) = sum_{i=1:D-1} 100*(x(i+1) - x(i)^2)^2 + (1-x(i))^2 6 | -- 7 | -- where D is the dimension of x. The true minimum is 0 at x = (1 1 ... 1). 8 | -- 9 | -- Carl Edward Rasmussen, 2001-07-21. 10 | 11 | function rosenbrock(x) 12 | 13 | -- (1) compute f(x) 14 | local d = x:size(1) 15 | -- x1 = x(i)^2 16 | local x1 = x.new(d-1):copy(x:narrow(1,1,d-1)) 17 | -- x(i+1) - x(i)^2 18 | x1:cmul(x1):mul(-1):add(x:narrow(1,2,d-1)) 19 | 20 | -- 100*(x(i+1) - x(i)^2)^2 21 | x1:cmul(x1):mul(100) 22 | 23 | -- x(i) 24 | local x0 = x.new(d-1):copy(x:narrow(1,1,d-1)) 25 | -- 1-x(i) 26 | x0:mul(-1):add(1) 27 | -- (1-x(i))^2 28 | x0:cmul(x0) 29 | -- 100*(x(i+1) - x(i)^2)^2 + (1-x(i))^2 30 | x1:add(x0) 31 | local fout = x1:sum() 32 | 33 | -- (2) compute f(x)/dx 34 | local dxout = x.new():resizeAs(x):zero() 35 | -- df(1:D-1) = - 400*x(1:D-1).*(x(2:D)-x(1:D-1).^2) - 2*(1-x(1:D-1)); 36 | 37 | x1:copy(x:narrow(1,1,d-1)) 38 | x1:cmul(x1):mul(-1):add(x:narrow(1,2,d-1)):cmul(x:narrow(1,1,d-1)):mul(-400) 39 | x0:copy(x:narrow(1,1,d-1)):mul(-1):add(1):mul(-2) 40 | x1:add(x0) 41 | dxout:narrow(1,1,d-1):copy(x1) 42 | 43 | -- df(2:D) = df(2:D) + 200*(x(2:D)-x(1:D-1).^2); 44 | x0:copy(x:narrow(1,1,d-1)) 45 | x0:cmul(x0):mul(-1):add(x:narrow(1,2,d-1)):mul(200) 46 | dxout:narrow(1,2,d-1):add(x0) 47 | 48 | return fout,dxout 49 | 50 | end 51 | -------------------------------------------------------------------------------- /adagrad.lua: -------------------------------------------------------------------------------- 1 | --[[ ADAGRAD implementation for SGD 2 | 3 | ARGS: 4 | - `opfunc` : a function that takes a single input (X), the point of 5 | evaluation, and returns f(X) and df/dX 6 | - `x` : the initial point 7 | - `state` : a table describing the state of the optimizer; after each 8 | call the state is modified 9 | - `state.learningRate` : learning rate 10 | - `state.paramVariance` : vector of temporal variances of parameters 11 | - `state.weightDecay` : scalar that controls weight decay 12 | RETURN: 13 | - `x` : the new x vector 14 | - `f(x)` : the function, evaluated before the update 15 | 16 | ]] 17 | function optim.adagrad(opfunc, x, config, state) 18 | -- (0) get/update state 19 | if config == nil and state == nil then 20 | print('no state table, ADAGRAD initializing') 21 | end 22 | local config = config or {} 23 | local state = state or config 24 | local lr = config.learningRate or 1e-3 25 | local lrd = config.learningRateDecay or 0 26 | local wd = config.weightDecay or 0 27 | state.evalCounter = state.evalCounter or 0 28 | local nevals = state.evalCounter 29 | 30 | -- (1) evaluate f(x) and df/dx 31 | local fx,dfdx = opfunc(x) 32 | 33 | -- (2) weight decay with a single parameter 34 | if wd ~= 0 then 35 | dfdx:add(wd, x) 36 | end 37 | 38 | -- (3) learning rate decay (annealing) 39 | local clr = lr / (1 + nevals*lrd) 40 | 41 | -- (4) parameter update with single or 
individual learning rates 42 | if not state.paramVariance then 43 | state.paramVariance = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() 44 | state.paramStd = torch.Tensor():typeAs(x):resizeAs(dfdx) 45 | end 46 | state.paramVariance:addcmul(1,dfdx,dfdx) 47 | state.paramStd:resizeAs(state.paramVariance):copy(state.paramVariance):sqrt() 48 | x:addcdiv(-clr, dfdx,state.paramStd:add(1e-10)) 49 | 50 | -- (5) update evaluation counter 51 | state.evalCounter = state.evalCounter + 1 52 | 53 | -- return x*, f(x) before optimization 54 | return x,{fx} 55 | end 56 | -------------------------------------------------------------------------------- /rmsprop.lua: -------------------------------------------------------------------------------- 1 | --[[ An implementation of RMSprop 2 | 3 | ARGS: 4 | 5 | - 'opfunc' : a function that takes a single input (X), the point 6 | of a evaluation, and returns f(X) and df/dX 7 | - 'x' : the initial point 8 | - 'config` : a table with configuration parameters for the optimizer 9 | - 'config.learningRate' : learning rate 10 | - 'config.alpha' : smoothing constant 11 | - 'config.epsilon' : value with which to initialise m 12 | - 'config.weightDecay' : weight decay 13 | - 'state' : a table describing the state of the optimizer; 14 | after each call the state is modified 15 | - 'state.m' : leaky sum of squares of parameter gradients, 16 | - 'state.tmp' : and the square root (with epsilon smoothing) 17 | 18 | RETURN: 19 | - `x` : the new x vector 20 | - `f(x)` : the function, evaluated before the update 21 | 22 | ]] 23 | 24 | function optim.rmsprop(opfunc, x, config, state) 25 | -- (0) get/update state 26 | local config = config or {} 27 | local state = state or config 28 | local lr = config.learningRate or 1e-2 29 | local alpha = config.alpha or 0.99 30 | local epsilon = config.epsilon or 1e-8 31 | local wd = config.weightDecay or 0 32 | 33 | -- (1) evaluate f(x) and df/dx 34 | local fx, dfdx = opfunc(x) 35 | 36 | -- (2) weight decay 37 | if wd ~= 0 then 38 | dfdx:add(wd, x) 39 | end 40 | 41 | -- (3) initialize mean square values and square gradient storage 42 | if not state.m then 43 | state.m = torch.Tensor():typeAs(x):resizeAs(dfdx):fill(1) 44 | state.tmp = torch.Tensor():typeAs(x):resizeAs(dfdx) 45 | end 46 | 47 | -- (4) calculate new (leaky) mean squared values 48 | state.m:mul(alpha) 49 | state.m:addcmul(1.0-alpha, dfdx, dfdx) 50 | 51 | -- (5) perform update 52 | state.tmp:sqrt(state.m):add(epsilon) 53 | x:addcdiv(-lr, dfdx, state.tmp) 54 | 55 | -- return x*, f(x) before optimization 56 | return x, {fx} 57 | end 58 | -------------------------------------------------------------------------------- /COPYRIGHT.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) 2 | Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) 3 | Copyright (c) 2011-2013 NYU (Clement Farabet) 4 | Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) 5 | Copyright (c) 2006 Idiap Research Institute (Samy Bengio) 6 | Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) 7 | 8 | All rights reserved. 9 | 10 | Redistribution and use in source and binary forms, with or without 11 | modification, are permitted provided that the following conditions are met: 12 | 13 | 1. 
Redistributions of source code must retain the above copyright 14 | notice, this list of conditions and the following disclaimer. 15 | 16 | 2. Redistributions in binary form must reproduce the above copyright 17 | notice, this list of conditions and the following disclaimer in the 18 | documentation and/or other materials provided with the distribution. 19 | 20 | 3. Neither the names of NEC Laboratories American and IDIAP Research 21 | Institute nor the names of its contributors may be used to endorse or 22 | promote products derived from this software without specific prior 23 | written permission. 24 | 25 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 26 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 29 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 30 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 31 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 32 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 33 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 35 | POSSIBILITY OF SUCH DAMAGE. 36 | -------------------------------------------------------------------------------- /adadelta.lua: -------------------------------------------------------------------------------- 1 | --[[ ADADELTA implementation for SGD http://arxiv.org/abs/1212.5701 2 | 3 | ARGS: 4 | - `opfunc` : a function that takes a single input (X), the point of 5 | evaluation, and returns f(X) and df/dX 6 | - `x` : the initial point 7 | - `config` : a table of hyper-parameters 8 | - `config.rho` : interpolation parameter 9 | - `config.eps` : for numerical stability 10 | - `config.weightDecay` : weight decay 11 | - `state` : a table describing the state of the optimizer; after each 12 | call the state is modified 13 | - `state.paramVariance` : vector of temporal variances of parameters 14 | - `state.accDelta` : vector of accummulated delta of gradients 15 | RETURN: 16 | - `x` : the new x vector 17 | - `f(x)` : the function, evaluated before the update 18 | ]] 19 | function optim.adadelta(opfunc, x, config, state) 20 | -- (0) get/update state 21 | if config == nil and state == nil then 22 | print('no state table, ADADELTA initializing') 23 | end 24 | local config = config or {} 25 | local state = state or config 26 | local rho = config.rho or 0.9 27 | local eps = config.eps or 1e-6 28 | local wd = config.weightDecay or 0 29 | state.evalCounter = state.evalCounter or 0 30 | -- (1) evaluate f(x) and df/dx 31 | local fx,dfdx = opfunc(x) 32 | 33 | -- (2) weight decay 34 | if wd ~= 0 then 35 | dfdx:add(wd, x) 36 | end 37 | 38 | -- (3) parameter update 39 | if not state.paramVariance then 40 | state.paramVariance = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() 41 | state.paramStd = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() 42 | state.delta = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() 43 | state.accDelta = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() 44 | end 45 | state.paramVariance:mul(rho):addcmul(1-rho,dfdx,dfdx) 46 | state.paramStd:resizeAs(state.paramVariance):copy(state.paramVariance):add(eps):sqrt() 47 | 
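   -- ADADELTA step: delta = RMS(accDelta) / RMS(grad) .* grad, where both RMS
   -- terms share the same eps for numerical stability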
state.delta:resizeAs(state.paramVariance):copy(state.accDelta):add(eps):sqrt():cdiv(state.paramStd):cmul(dfdx) 48 | x:add(-1, state.delta) 49 | state.accDelta:mul(rho):addcmul(1-rho, state.delta, state.delta) 50 | -- (4) update evaluation counter 51 | state.evalCounter = state.evalCounter + 1 52 | 53 | -- return x*, f(x) before optimization 54 | return x,{fx} 55 | end 56 | -------------------------------------------------------------------------------- /asgd.lua: -------------------------------------------------------------------------------- 1 | --[[ An implementation of ASGD 2 | 3 | ASGD: 4 | 5 | x := (1 - lambda eta_t) x - eta_t df/dx(z,x) 6 | a := a + mu_t [ x - a ] 7 | 8 | eta_t = eta0 / (1 + lambda eta0 t) ^ 0.75 9 | mu_t = 1/max(1,t-t0) 10 | 11 | implements ASGD algoritm as in L.Bottou's sgd-2.0 12 | 13 | ARGS: 14 | 15 | - `opfunc` : a function that takes a single input (X), the point of 16 | evaluation, and returns f(X) and df/dX 17 | - `x` : the initial point 18 | - `state` : a table describing the state of the optimizer; after each 19 | call the state is modified 20 | - `state.eta0` : learning rate 21 | - `state.lambda` : decay term 22 | - `state.alpha` : power for eta update 23 | - `state.t0` : point at which to start averaging 24 | 25 | RETURN: 26 | - `x` : the new x vector 27 | - `f(x)` : the function, evaluated before the update 28 | - `ax` : the averaged x vector 29 | 30 | (Clement Farabet, 2012) 31 | --]] 32 | function optim.asgd(opfunc, x, config, state) 33 | -- (0) get/update state 34 | local config = config or {} 35 | local state = state or config 36 | config.eta0 = config.eta0 or 1e-4 37 | config.lambda = config.lambda or 1e-4 38 | config.alpha = config.alpha or 0.75 39 | config.t0 = config.t0 or 1e6 40 | 41 | -- (hidden state) 42 | state.eta_t = state.eta_t or config.eta0 43 | state.mu_t = state.mu_t or 1 44 | state.t = state.t or 0 45 | 46 | -- (1) evaluate f(x) and df/dx 47 | local fx,dfdx = opfunc(x) 48 | 49 | -- (2) decay term 50 | x:mul(1 - config.lambda*state.eta_t) 51 | 52 | -- (3) update x 53 | x:add(-state.eta_t, dfdx) 54 | 55 | -- (4) averaging 56 | state.ax = state.ax or torch.Tensor():typeAs(x):resizeAs(x):zero() 57 | state.tmp = state.tmp or torch.Tensor():typeAs(state.ax):resizeAs(state.ax) 58 | if state.mu_t ~= 1 then 59 | state.tmp:copy(x) 60 | state.tmp:add(-1,state.ax):mul(state.mu_t) 61 | state.ax:add(state.tmp) 62 | else 63 | state.ax:copy(x) 64 | end 65 | 66 | -- (5) update eta_t and mu_t 67 | state.t = state.t + 1 68 | state.eta_t = config.eta0 / math.pow((1 + config.lambda * config.eta0 * state.t), config.alpha) 69 | state.mu_t = 1 / math.max(1, state.t - config.t0) 70 | 71 | -- return x*, f(x) before optimization, and average(x_t0,x_t1,x_t2,...) 
72 | return x,{fx},state.ax 73 | end 74 | -------------------------------------------------------------------------------- /adamax.lua: -------------------------------------------------------------------------------- 1 | --[[ An implementation of AdaMax http://arxiv.org/pdf/1412.6980.pdf 2 | 3 | ARGS: 4 | 5 | - 'opfunc' : a function that takes a single input (X), the point 6 | of a evaluation, and returns f(X) and df/dX 7 | - 'x' : the initial point 8 | - 'config` : a table with configuration parameters for the optimizer 9 | - 'config.learningRate' : learning rate 10 | - 'config.beta1' : first moment coefficient 11 | - 'config.beta2' : second moment coefficient 12 | - 'config.epsilon' : for numerical stability 13 | - 'state' : a table describing the state of the optimizer; 14 | after each call the state is modified. 15 | 16 | RETURN: 17 | - `x` : the new x vector 18 | - `f(x)` : the function, evaluated before the update 19 | 20 | ]] 21 | 22 | function optim.adamax(opfunc, x, config, state) 23 | -- (0) get/update state 24 | local config = config or {} 25 | local state = state or config 26 | local lr = config.learningRate or 0.002 27 | 28 | local beta1 = config.beta1 or 0.9 29 | local beta2 = config.beta2 or 0.999 30 | local epsilon = config.epsilon or 1e-38 31 | local wd = config.weightDecay or 0 32 | 33 | -- (1) evaluate f(x) and df/dx 34 | local fx, dfdx = opfunc(x) 35 | 36 | -- (2) weight decay 37 | if wd ~= 0 then 38 | dfdx:add(wd, x) 39 | end 40 | 41 | -- Initialization 42 | state.t = state.t or 0 43 | -- Exponential moving average of gradient values 44 | state.m = state.m or x.new(dfdx:size()):zero() 45 | -- Exponential moving average of the infinity norm 46 | state.u = state.u or x.new(dfdx:size()):zero() 47 | -- A tmp tensor to hold the input to max() 48 | state.max = state.max or x.new(2, unpack(dfdx:size():totable())):zero() 49 | 50 | state.t = state.t + 1 51 | 52 | -- Update biased first moment estimate. 53 | state.m:mul(beta1):add(1-beta1, dfdx) 54 | -- Update the exponentially weighted infinity norm. 
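   -- u_t = max(beta2 * u_{t-1}, |g_t| + epsilon): the two candidates are stacked
   -- in state.max and reduced with an element-wise max along the first dimension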
55 | state.max[1]:copy(state.u):mul(beta2) 56 | state.max[2]:copy(dfdx):abs():add(epsilon) 57 | state.u:max(state.max, 1) 58 | 59 | local biasCorrection1 = 1 - beta1^state.t 60 | local stepSize = lr/biasCorrection1 61 | -- (2) update x 62 | x:addcdiv(-stepSize, state.m, state.u) 63 | 64 | -- return x*, f(x) before optimization 65 | return x, {fx} 66 | end 67 | -------------------------------------------------------------------------------- /adam.lua: -------------------------------------------------------------------------------- 1 | --[[ An implementation of Adam http://arxiv.org/pdf/1412.6980.pdf 2 | 3 | ARGS: 4 | 5 | - 'opfunc' : a function that takes a single input (X), the point 6 | of a evaluation, and returns f(X) and df/dX 7 | - 'x' : the initial point 8 | - 'config` : a table with configuration parameters for the optimizer 9 | - 'config.learningRate' : learning rate 10 | - 'config.beta1' : first moment coefficient 11 | - 'config.beta2' : second moment coefficient 12 | - 'config.epsilon' : for numerical stability 13 | - 'config.weightDecay' : weight decay 14 | - 'state' : a table describing the state of the optimizer; after each 15 | call the state is modified 16 | 17 | RETURN: 18 | - `x` : the new x vector 19 | - `f(x)` : the function, evaluated before the update 20 | 21 | ]] 22 | 23 | function optim.adam(opfunc, x, config, state) 24 | -- (0) get/update state 25 | local config = config or {} 26 | local state = state or config 27 | local lr = config.learningRate or 0.001 28 | 29 | local beta1 = config.beta1 or 0.9 30 | local beta2 = config.beta2 or 0.999 31 | local epsilon = config.epsilon or 1e-8 32 | local wd = config.weightDecay or 0 33 | 34 | -- (1) evaluate f(x) and df/dx 35 | local fx, dfdx = opfunc(x) 36 | 37 | -- (2) weight decay 38 | if wd ~= 0 then 39 | dfdx:add(wd, x) 40 | end 41 | 42 | -- Initialization 43 | state.t = state.t or 0 44 | -- Exponential moving average of gradient values 45 | state.m = state.m or x.new(dfdx:size()):zero() 46 | -- Exponential moving average of squared gradient values 47 | state.v = state.v or x.new(dfdx:size()):zero() 48 | -- A tmp tensor to hold the sqrt(v) + epsilon 49 | state.denom = state.denom or x.new(dfdx:size()):zero() 50 | 51 | state.t = state.t + 1 52 | 53 | -- Decay the first and second moment running average coefficient 54 | state.m:mul(beta1):add(1-beta1, dfdx) 55 | state.v:mul(beta2):addcmul(1-beta2, dfdx, dfdx) 56 | 57 | state.denom:copy(state.v):sqrt():add(epsilon) 58 | 59 | local biasCorrection1 = 1 - beta1^state.t 60 | local biasCorrection2 = 1 - beta2^state.t 61 | local stepSize = lr * math.sqrt(biasCorrection2)/biasCorrection1 62 | -- (3) update x 63 | x:addcdiv(-stepSize, state.m, state.denom) 64 | 65 | -- return x*, f(x) before optimization 66 | return x, {fx} 67 | end 68 | -------------------------------------------------------------------------------- /test/test_fista.lua: -------------------------------------------------------------------------------- 1 | 2 | require 'unsup' 3 | require 'torch' 4 | require 'gnuplot' 5 | require 'sparsecoding' 6 | 7 | -- gnuplot.setgnuplotexe('/usr/bin/gnuplot44') 8 | -- gnuplot.setgnuplotterminal('x11') 9 | 10 | function gettableval(tt,v) 11 | local x = torch.Tensor(#tt) 12 | for i=1,#tt do x[i] = tt[i][v] end 13 | return x 14 | end 15 | function doplots(v) 16 | v = v or 'F' 17 | local fistaf = torch.DiskFile('fista2.bin'):binary() 18 | local istaf = torch.DiskFile('ista2.bin'):binary() 19 | 20 | local hfista = fistaf:readObject() 21 | fistaf:close() 22 | local hista = 
istaf:readObject() 23 | istaf:close() 24 | 25 | gnuplot.figure() 26 | gnuplot.plot({'fista ' .. v,gettableval(hfista,v)},{'ista ' .. v, gettableval(hista,v)}) 27 | end 28 | 29 | seed = seed or 123 30 | if dofista == nil then 31 | dofista = true 32 | else 33 | dofista = not dofista 34 | end 35 | 36 | torch.manualSeed(seed) 37 | math.randomseed(seed) 38 | nc = 3 39 | ni = 30 40 | no = 100 41 | x = torch.Tensor(ni):zero() 42 | 43 | --- I am keeping these just to make sure random init stays same 44 | fista = unsup.LinearFistaL1(ni,no,0.1) 45 | fista = nil 46 | 47 | fistaparams = {} 48 | fistaparams.doFistaUpdate = dofista 49 | fistaparams.maxline = 10 50 | fistaparams.maxiter = 200 51 | fistaparams.verbose = true 52 | 53 | D=torch.randn(ni,no) 54 | for i=1,D:size(2) do 55 | D:select(2,i):div(D:select(2,i):std()+1e-12) 56 | end 57 | 58 | mixi = torch.Tensor(nc) 59 | mixj = torch.Tensor(nc) 60 | for i=1,nc do 61 | local ii = math.random(1,no) 62 | local cc = torch.uniform(0,1/nc) 63 | mixi[i] = ii; 64 | mixj[i] = cc; 65 | print(ii,cc) 66 | x:add(cc, D:select(2,ii)) 67 | end 68 | 69 | fista = optim.FistaL1(D,fistaparams) 70 | code,h = fista.run(x,0.1) 71 | 72 | --fista.reconstruction:addmv(0,1,D,code) 73 | rec = fista.reconstruction 74 | --code,rec,h = fista:forward(x); 75 | 76 | gnuplot.figure(1) 77 | gnuplot.plot({'data',mixi,mixj,'+'},{'code',torch.linspace(1,no,no),code,'+'}) 78 | gnuplot.title('Fista = ' .. tostring(fistaparams.doFistaUpdate)) 79 | 80 | gnuplot.figure(2) 81 | gnuplot.plot({'input',torch.linspace(1,ni,ni),x,'+-'},{'reconstruction',torch.linspace(1,ni,ni),rec,'+-'}); 82 | gnuplot.title('Reconstruction Error : ' .. x:dist(rec) .. ' ' .. 'Fista = ' .. tostring(fistaparams.doFistaUpdate)) 83 | --w2:axis(0,ni+1,-1,1) 84 | 85 | if dofista then 86 | print('Running FISTA') 87 | fname = 'fista2.bin' 88 | else 89 | print('Running ISTA') 90 | fname = 'ista2.bin' 91 | end 92 | ff = torch.DiskFile(fname,'w'):binary() 93 | ff:writeObject(h) 94 | ff:close() 95 | 96 | -------------------------------------------------------------------------------- /nag.lua: -------------------------------------------------------------------------------- 1 | ---------------------------------------------------------------------- 2 | -- An implementation of SGD adapted with features of Nesterov's 3 | -- Accelerated Gradient method, based on the paper 4 | -- On the Importance of Initialization and Momentum in Deep Learning 5 | -- Sutsveker et. 
al., ICML 2013 6 | -- 7 | -- ARGS: 8 | -- opfunc : a function that takes a single input (X), the point of 9 | -- evaluation, and returns f(X) and df/dX 10 | -- x : the initial point 11 | -- state : a table describing the state of the optimizer; after each 12 | -- call the state is modified 13 | -- state.learningRate : learning rate 14 | -- state.learningRateDecay : learning rate decay 15 | -- state.weightDecay : weight decay 16 | -- state.momentum : momentum 17 | -- state.learningRates : vector of individual learning rates 18 | -- 19 | -- RETURN: 20 | -- x : the new x vector 21 | -- f(x) : the function, evaluated before the update 22 | -- 23 | -- (Dilip Krishnan, 2013) 24 | -- 25 | 26 | function optim.nag(opfunc, x, config, state) 27 | -- (0) get/update state 28 | local config = config or {} 29 | local state = state or config 30 | local lr = config.learningRate or 1e-3 31 | local lrd = config.learningRateDecay or 0 32 | local wd = config.weightDecay or 0 33 | local mom = config.momentum or 0.9 34 | local damp = config.dampening or mom 35 | local lrs = config.learningRates 36 | state.evalCounter = state.evalCounter or 0 37 | local nevals = state.evalCounter 38 | 39 | if mom <= 0 then 40 | error('Momentum must be positive for Nesterov Accelerated Gradient') 41 | end 42 | 43 | -- (1) evaluate f(x) and df/dx 44 | -- first step in the direction of the momentum vector 45 | 46 | if state.dfdx then 47 | x:add(mom, state.dfdx) 48 | end 49 | -- then compute gradient at that point 50 | -- comment out the above line to get the original SGD 51 | local fx,dfdx = opfunc(x) 52 | 53 | -- (2) weight decay 54 | if wd ~= 0 then 55 | dfdx:add(wd, x) 56 | end 57 | 58 | -- (3) learning rate decay (annealing) 59 | local clr = lr / (1 + nevals*lrd) 60 | 61 | -- (4) apply momentum 62 | if not state.dfdx then 63 | state.dfdx = torch.Tensor():typeAs(dfdx):resizeAs(dfdx):fill(0) 64 | else 65 | state.dfdx:mul(mom) 66 | end 67 | 68 | -- (5) parameter update with single or individual learning rates 69 | if lrs then 70 | if not state.deltaParameters then 71 | state.deltaParameters = torch.Tensor():typeAs(x):resizeAs(dfdx) 72 | end 73 | state.deltaParameters:copy(lrs):cmul(dfdx) 74 | x:add(-clr, state.deltaParameters) 75 | state.dfdx:add(-clr, state.deltaParameters) 76 | else 77 | x:add(-clr, dfdx) 78 | state.dfdx:add(-clr, dfdx) 79 | end 80 | 81 | -- (6) update evaluation counter 82 | state.evalCounter = state.evalCounter + 1 83 | 84 | -- return x, f(x) before optimization 85 | return x,{fx} 86 | end 87 | -------------------------------------------------------------------------------- /sgd.lua: -------------------------------------------------------------------------------- 1 | --[[ A plain implementation of SGD 2 | 3 | ARGS: 4 | 5 | - `opfunc` : a function that takes a single input (X), the point 6 | of a evaluation, and returns f(X) and df/dX 7 | - `x` : the initial point 8 | - `config` : a table with configuration parameters for the optimizer 9 | - `config.learningRate` : learning rate 10 | - `config.learningRateDecay` : learning rate decay 11 | - `config.weightDecay` : weight decay 12 | - `config.weightDecays` : vector of individual weight decays 13 | - `config.momentum` : momentum 14 | - `config.dampening` : dampening for momentum 15 | - `config.nesterov` : enables Nesterov momentum 16 | - `config.learningRates` : vector of individual learning rates 17 | - `state` : a table describing the state of the optimizer; after each 18 | call the state is modified 19 | - `state.evalCounter` : evaluation counter (optional: 0, 
by default) 20 | 21 | RETURN: 22 | - `x` : the new x vector 23 | - `f(x)` : the function, evaluated before the update 24 | 25 | (Clement Farabet, 2012) 26 | ]] 27 | function optim.sgd(opfunc, x, config, state) 28 | -- (0) get/update state 29 | local config = config or {} 30 | local state = state or config 31 | local lr = config.learningRate or 1e-3 32 | local lrd = config.learningRateDecay or 0 33 | local wd = config.weightDecay or 0 34 | local mom = config.momentum or 0 35 | local damp = config.dampening or mom 36 | local nesterov = config.nesterov or false 37 | local lrs = config.learningRates 38 | local wds = config.weightDecays 39 | state.evalCounter = state.evalCounter or 0 40 | local nevals = state.evalCounter 41 | assert(not nesterov or (mom > 0 and damp == 0), "Nesterov momentum requires a momentum and zero dampening") 42 | 43 | -- (1) evaluate f(x) and df/dx 44 | local fx,dfdx = opfunc(x) 45 | 46 | -- (2) weight decay with single or individual parameters 47 | if wd ~= 0 then 48 | dfdx:add(wd, x) 49 | elseif wds then 50 | if not state.decayParameters then 51 | state.decayParameters = torch.Tensor():typeAs(x):resizeAs(dfdx) 52 | end 53 | state.decayParameters:copy(wds):cmul(x) 54 | dfdx:add(state.decayParameters) 55 | end 56 | 57 | -- (3) apply momentum 58 | if mom ~= 0 then 59 | if not state.dfdx then 60 | state.dfdx = torch.Tensor():typeAs(dfdx):resizeAs(dfdx):copy(dfdx) 61 | else 62 | state.dfdx:mul(mom):add(1-damp, dfdx) 63 | end 64 | if nesterov then 65 | dfdx:add(mom, state.dfdx) 66 | else 67 | dfdx = state.dfdx 68 | end 69 | end 70 | 71 | -- (4) learning rate decay (annealing) 72 | local clr = lr / (1 + nevals*lrd) 73 | 74 | -- (5) parameter update with single or individual learning rates 75 | if lrs then 76 | if not state.deltaParameters then 77 | state.deltaParameters = torch.Tensor():typeAs(x):resizeAs(dfdx) 78 | end 79 | state.deltaParameters:copy(lrs):cmul(dfdx) 80 | x:add(-clr, state.deltaParameters) 81 | else 82 | x:add(-clr, dfdx) 83 | end 84 | 85 | -- (6) update evaluation counter 86 | state.evalCounter = state.evalCounter + 1 87 | 88 | -- return x*, f(x) before optimization 89 | return x,{fx} 90 | end 91 | -------------------------------------------------------------------------------- /rprop.lua: -------------------------------------------------------------------------------- 1 | --[[ A plain implementation of RPROP 2 | 3 | ARGS: 4 | - `opfunc` : a function that takes a single input (X), the point of 5 | evaluation, and returns f(X) and df/dX 6 | - `x` : the initial point 7 | - `state` : a table describing the state of the optimizer; after each 8 | call the state is modified 9 | - `state.stepsize` : initial step size, common to all components 10 | - `state.etaplus` : multiplicative increase factor, > 1 (default 1.2) 11 | - `state.etaminus` : multiplicative decrease factor, < 1 (default 0.5) 12 | - `state.stepsizemax` : maximum stepsize allowed (default 50) 13 | - `state.stepsizemin` : minimum stepsize allowed (default 1e-6) 14 | - `state.niter` : number of iterations (default 1) 15 | 16 | RETURN: 17 | - `x` : the new x vector 18 | - `f(x)` : the function, evaluated before the update 19 | 20 | (Martin Riedmiller, Koray Kavukcuoglu 2013) 21 | --]] 22 | function optim.rprop(opfunc, x, config, state) 23 | if config == nil and state == nil then 24 | print('no state table RPROP initializing') 25 | end 26 | -- (0) get/update state 27 | local config = config or {} 28 | local state = state or config 29 | local stepsize = config.stepsize or 0.1 30 | local etaplus = 
config.etaplus or 1.2 31 | local etaminus = config.etaminus or 0.5 32 | local stepsizemax = config.stepsizemax or 50.0 33 | local stepsizemin = config.stepsizemin or 1E-06 34 | local niter = config.niter or 1 35 | 36 | local hfx = {} 37 | 38 | for i=1,niter do 39 | 40 | -- (1) evaluate f(x) and df/dx 41 | local fx,dfdx = opfunc(x) 42 | 43 | -- init temp storage 44 | if not state.delta then 45 | state.delta = dfdx.new(dfdx:size()):zero() 46 | state.stepsize = dfdx.new(dfdx:size()):fill(stepsize) 47 | state.sign = dfdx.new(dfdx:size()) 48 | state.psign = torch.ByteTensor(dfdx:size()) 49 | state.nsign = torch.ByteTensor(dfdx:size()) 50 | state.zsign = torch.ByteTensor(dfdx:size()) 51 | state.dminmax = torch.ByteTensor(dfdx:size()) 52 | if torch.type(x)=='torch.CudaTensor' then 53 | -- Push to GPU 54 | state.psign = state.psign:cuda() 55 | state.nsign = state.nsign:cuda() 56 | state.zsign = state.zsign:cuda() 57 | state.dminmax = state.dminmax:cuda() 58 | end 59 | end 60 | 61 | -- sign of derivative from last step to this one 62 | torch.cmul(state.sign, dfdx, state.delta) 63 | torch.sign(state.sign, state.sign) 64 | 65 | -- get indices of >0, <0 and ==0 entries 66 | state.sign.gt(state.psign, state.sign, 0) 67 | state.sign.lt(state.nsign, state.sign, 0) 68 | state.sign.eq(state.zsign, state.sign, 0) 69 | 70 | -- get step size updates 71 | state.sign[state.psign] = etaplus 72 | state.sign[state.nsign] = etaminus 73 | state.sign[state.zsign] = 1 74 | 75 | -- update stepsizes with step size updates 76 | state.stepsize:cmul(state.sign) 77 | 78 | -- threshold step sizes 79 | -- >50 => 50 80 | state.stepsize.gt(state.dminmax, state.stepsize, stepsizemax) 81 | state.stepsize[state.dminmax] = stepsizemax 82 | -- <1e-6 ==> 1e-6 83 | state.stepsize.lt(state.dminmax, state.stepsize, stepsizemin) 84 | state.stepsize[state.dminmax] = stepsizemin 85 | 86 | -- for dir<0, dfdx=0 87 | -- for dir>=0 dfdx=dfdx 88 | dfdx[state.nsign] = 0 89 | -- state.sign = sign(dfdx) 90 | torch.sign(state.sign,dfdx) 91 | 92 | -- update weights 93 | x:addcmul(-1,state.sign,state.stepsize) 94 | 95 | -- update state.dfdx with current dfdx 96 | state.delta:copy(dfdx) 97 | 98 | table.insert(hfx,fx) 99 | end 100 | 101 | -- return x*, f(x) before optimization 102 | return x,hfx 103 | end 104 | -------------------------------------------------------------------------------- /test/sparsecoding.lua: -------------------------------------------------------------------------------- 1 | require 'kex' 2 | 3 | -- L1 FISTA Solution 4 | -- L1 solution with a linear dictionary ||Ax-b||^2 + \lambda ||x||_1 5 | -- D : dictionary, each column is a dictionary element 6 | -- params: set of params to pass to FISTA and possibly temp allocation (**optional**) 7 | -- check unsup.FistaLS function for details. 8 | -- returns fista : a table with the following entries 9 | -- fista.run(x,lambda) : run L1 sparse coding algorithm with input x and lambda. 10 | -- The following entries will be allocated and reused by each call to fista.run(x,lambda) 11 | -- fista.reconstruction: reconstructed input. 
12 | -- fista.gradf : gradient of L2 part of the problem wrt x 13 | -- fista.code : the solution of L1 problem 14 | -- The following entries just point to data passed to fista.run(x) 15 | -- fista.input : points to the tensor 'x' used in the last fista.run(x,lambda) 16 | -- fista.lambda : the lambda value used in the last fista.run(x,lambda) 17 | function optim.FistaL1(D, params) 18 | 19 | -- this is for keeping parameters related to fista algorithm 20 | local params = params or {} 21 | -- this is for temporary variables and such 22 | local fista = {} 23 | 24 | -- related to FISTA 25 | params.L = params.L or 0.1 26 | params.Lstep = params.Lstep or 1.5 27 | params.maxiter = params.maxiter or 50 28 | params.maxline = params.maxline or 20 29 | params.errthres = params.errthres or 1e-4 30 | 31 | -- temporary stuff that might be good to keep around 32 | fista.reconstruction = torch.Tensor() 33 | fista.gradf = torch.Tensor() 34 | fista.gradg = torch.Tensor() 35 | fista.code = torch.Tensor() 36 | 37 | -- these will be assigned in run(x) 38 | -- fista.input points to the last input that was run 39 | -- fista.lambda is the lambda value from the last run 40 | fista.input = nil 41 | fista.lambda = nil 42 | 43 | -- CREATE FUNCTION CLOSURES 44 | -- smooth function 45 | fista.f = function (x,mode) 46 | 47 | local reconstruction = fista.reconstruction 48 | local input = fista.input 49 | -- ------------------- 50 | -- function evaluation 51 | if x:dim() == 1 then 52 | --print(D:size(),x:size()) 53 | reconstruction:resize(D:size(1)) 54 | reconstruction:addmv(0,1,D,x) 55 | elseif x:dim(2) then 56 | reconstruction:resize(x:size(1),D:size(1)) 57 | reconstruction:addmm(0,1,x,D:t()) 58 | end 59 | local fval = input:dist(reconstruction)^2 60 | 61 | -- ---------------------- 62 | -- derivative calculation 63 | if mode and mode:match('dx') then 64 | local gradf = fista.gradf 65 | reconstruction:add(-1,input):mul(2) 66 | gradf:resizeAs(x) 67 | if input:dim() == 1 then 68 | gradf:addmv(0,1,D:t(),reconstruction) 69 | else 70 | gradf:addmm(0,1,reconstruction, D) 71 | end 72 | --------------------------------------- 73 | -- return function value and derivative 74 | return fval, gradf, reconstruction 75 | end 76 | 77 | ------------------------ 78 | -- return function value 79 | return fval, reconstruction 80 | end 81 | 82 | -- non-smooth function L1 83 | fista.g = function (x) 84 | 85 | local fval = fista.lambda*x:norm(1) 86 | 87 | if mod and mode:match('dx') then 88 | local gradg = fista.gradg 89 | gradg:resizAs(x) 90 | gradg:sign():mul(fista.lambda) 91 | return fval,gradg 92 | end 93 | return fval 94 | end 95 | 96 | -- argmin_x Q(x,y), just shrinkage for L1 97 | fista.pl = function (x,L) 98 | x:shrinkage(fista.lambda/L) 99 | end 100 | 101 | fista.run = function(x, lam, codeinit) 102 | local code = fista.code 103 | fista.input = x 104 | fista.lambda = lam 105 | 106 | -- resize code, maybe a different number of dimensions 107 | -- fill with zeros, initial point 108 | if codeinit then 109 | code:resizeAs(codeinit) 110 | code:copy(codeinit) 111 | else 112 | if x:dim() == 1 then 113 | code:resize(D:size(2)) 114 | elseif x:dim() == 2 then 115 | code:resize(x:size(1),D:size(2)) 116 | else 117 | error(' I do not know how to handle ' .. x:dim() .. ' dimensional input') 118 | end 119 | code:fill(0) 120 | end 121 | -- return the result of unsup.FistaLS call. 
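      -- (the sparse code is written into fista.code; the second return value is the
      -- per-iteration history, used as `code, h = fista.run(x, 0.1)` in test_fista.lua)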
122 | return optim.FistaLS(fista.f, fista.g, fista.pl, fista.code, params) 123 | end 124 | 125 | return fista 126 | end 127 | 128 | -------------------------------------------------------------------------------- /Logger.lua: -------------------------------------------------------------------------------- 1 | --[[ Logger: a simple class to log symbols during training, 2 | and automate plot generation 3 | 4 | Example: 5 | logger = optim.Logger('somefile.log') -- file to save stuff 6 | 7 | for i = 1,N do -- log some symbols during 8 | train_error = ... -- training/testing 9 | test_error = ... 10 | logger:add{['training error'] = train_error, 11 | ['test error'] = test_error} 12 | end 13 | 14 | logger:style{['training error'] = '-', -- define styles for plots 15 | ['test error'] = '-'} 16 | logger:plot() -- and plot 17 | 18 | ---- OR --- 19 | 20 | logger = optim.Logger('somefile.log') -- file to save stuff 21 | logger:setNames{'training error', 'test error'} 22 | 23 | for i = 1,N do -- log some symbols during 24 | train_error = ... -- training/testing 25 | test_error = ... 26 | logger:add{train_error, test_error} 27 | end 28 | 29 | logger:style{'-', '-'} -- define styles for plots 30 | logger:plot() -- and plot 31 | 32 | ----------- 33 | 34 | logger:setlogscale(true) -- enable logscale on Y-axis 35 | logger:plot() -- and plot 36 | ]] 37 | require 'xlua' 38 | local Logger = torch.class('optim.Logger') 39 | 40 | function Logger:__init(filename, timestamp) 41 | if filename then 42 | self.name = filename 43 | os.execute('mkdir ' .. (sys.uname() ~= 'windows' and '-p ' or '') .. ' "' .. paths.dirname(filename) .. '"') 44 | if timestamp then 45 | -- append timestamp to create unique log file 46 | filename = filename .. '-'..os.date("%Y_%m_%d_%X") 47 | end 48 | self.file = io.open(filename,'w') 49 | self.epsfile = self.name .. '.eps' 50 | else 51 | self.file = io.stdout 52 | self.name = 'stdout' 53 | print(' warning: no path provided, logging to std out') 54 | end 55 | self.empty = true 56 | self.symbols = {} 57 | self.styles = {} 58 | self.names = {} 59 | self.idx = {} 60 | self.figure = nil 61 | self.showPlot = true 62 | self.plotRawCmd = nil 63 | self.defaultStyle = '+' 64 | self.logscale = false 65 | end 66 | 67 | function Logger:setNames(names) 68 | self.names = names 69 | self.empty = false 70 | self.nsymbols = #names 71 | for k,key in pairs(names) do 72 | self.file:write(key .. '\t') 73 | self.symbols[k] = {} 74 | self.styles[k] = {self.defaultStyle} 75 | self.idx[key] = k 76 | end 77 | self.file:write('\n') 78 | self.file:flush() 79 | end 80 | 81 | function Logger:add(symbols) 82 | -- (1) first time ? print symbols' names on first row 83 | if self.empty then 84 | self.empty = false 85 | self.nsymbols = #symbols 86 | for k,val in pairs(symbols) do 87 | self.file:write(k .. '\t') 88 | self.symbols[k] = {} 89 | self.styles[k] = {self.defaultStyle} 90 | self.names[k] = k 91 | end 92 | self.idx = self.names 93 | self.file:write('\n') 94 | end 95 | -- (2) print all symbols on one row 96 | for k,val in pairs(symbols) do 97 | if type(val) == 'number' then 98 | self.file:write(string.format('%11.4e',val) .. '\t') 99 | elseif type(val) == 'string' then 100 | self.file:write(val .. 
'\t') 101 | else 102 | xlua.error('can only log numbers and strings', 'Logger') 103 | end 104 | end 105 | self.file:write('\n') 106 | self.file:flush() 107 | -- (3) save symbols in internal table 108 | for k,val in pairs(symbols) do 109 | table.insert(self.symbols[k], val) 110 | end 111 | end 112 | 113 | function Logger:style(symbols) 114 | for name,style in pairs(symbols) do 115 | if type(style) == 'string' then 116 | self.styles[name] = {style} 117 | elseif type(style) == 'table' then 118 | self.styles[name] = style 119 | else 120 | xlua.error('style should be a string or a table of strings','Logger') 121 | end 122 | end 123 | end 124 | 125 | function Logger:setlogscale(value) 126 | self.logscale = value 127 | end 128 | 129 | function Logger:plot(...) 130 | if not xlua.require('gnuplot') then 131 | if not self.warned then 132 | print(' warning: cannot plot with this version of Torch') 133 | self.warned = true 134 | end 135 | return 136 | end 137 | local plotit = false 138 | local plots = {} 139 | local plotsymbol = 140 | function(name,list) 141 | if #list > 1 then 142 | local nelts = #list 143 | local plot_y = torch.Tensor(nelts) 144 | for i = 1,nelts do 145 | plot_y[i] = list[i] 146 | end 147 | for _,style in ipairs(self.styles[name]) do 148 | table.insert(plots, {self.names[name], plot_y, style}) 149 | end 150 | plotit = true 151 | end 152 | end 153 | local args = {...} 154 | if not args[1] then -- plot all symbols 155 | for name,list in pairs(self.symbols) do 156 | plotsymbol(name,list) 157 | end 158 | else -- plot given symbols 159 | for _,name in ipairs(args) do 160 | plotsymbol(self.idx[name], self.symbols[self.idx[name]]) 161 | end 162 | end 163 | if plotit then 164 | if self.showPlot then 165 | self.figure = gnuplot.figure(self.figure) 166 | if self.logscale then gnuplot.logscale('on') end 167 | gnuplot.plot(plots) 168 | if self.plotRawCmd then gnuplot.raw(self.plotRawCmd) end 169 | gnuplot.grid('on') 170 | gnuplot.title('') 171 | end 172 | if self.epsfile then 173 | os.execute('rm -f "' .. self.epsfile .. '"') 174 | local epsfig = gnuplot.epsfigure(self.epsfile) 175 | if self.logscale then gnuplot.logscale('on') end 176 | gnuplot.plot(plots) 177 | if self.plotRawCmd then gnuplot.raw(self.plotRawCmd) end 178 | gnuplot.grid('on') 179 | gnuplot.title('') 180 | gnuplot.plotflush() 181 | gnuplot.close(epsfig) 182 | end 183 | end 184 | end 185 | -------------------------------------------------------------------------------- /cg.lua: -------------------------------------------------------------------------------- 1 | --[[ 2 | 3 | This cg implementation is a rewrite of minimize.m written by Carl 4 | E. Rasmussen. It is supposed to produce exactly same results (give 5 | or take numerical accuracy due to some changed order of 6 | operations). You can compare the result on rosenbrock with minimize.m. 7 | http://www.gatsby.ucl.ac.uk/~edward/code/minimize/example.html 8 | 9 | [x fx c] = minimize([0 0]', 'rosenbrock', -25) 10 | 11 | Note that we limit the number of function evaluations only, it seems much 12 | more important in practical use. 13 | 14 | ARGS: 15 | 16 | - `opfunc` : a function that takes a single input, the point of evaluation. 17 | - `x` : the initial point 18 | - `state` : a table of parameters and temporary allocations. 
19 | - `state.maxEval` : max number of function evaluations 20 | - `state.maxIter` : max number of iterations 21 | - `state.df[0,1,2,3]` : if you pass torch.Tensor they will be used for temp storage 22 | - `state.[s,x0]` : if you pass torch.Tensor they will be used for temp storage 23 | 24 | RETURN: 25 | 26 | - `x*` : the new x vector, at the optimal point 27 | - `f` : a table of all function values where 28 | `f[1]` is the value of the function before any optimization and 29 | `f[#f]` is the final fully optimized value, at x* 30 | 31 | (Koray Kavukcuoglu, 2012) 32 | --]] 33 | function optim.cg(opfunc, x, config, state) 34 | -- parameters 35 | local config = config or {} 36 | local state = state or config 37 | local rho = config.rho or 0.01 38 | local sig = config.sig or 0.5 39 | local int = config.int or 0.1 40 | local ext = config.ext or 3.0 41 | local maxIter = config.maxIter or 20 42 | local ratio = config.ratio or 100 43 | local maxEval = config.maxEval or maxIter*1.25 44 | local red = 1 45 | 46 | local verbose = config.verbose or 0 47 | 48 | local i = 0 49 | local ls_failed = 0 50 | local fx = {} 51 | 52 | -- we need three points for the interpolation/extrapolation stuff 53 | local z1,z2,z3 = 0,0,0 54 | local d1,d2,d3 = 0,0,0 55 | local f1,f2,f3 = 0,0,0 56 | 57 | local df1 = state.df1 or x.new() 58 | local df2 = state.df2 or x.new() 59 | local df3 = state.df3 or x.new() 60 | local tdf 61 | 62 | df1:resizeAs(x) 63 | df2:resizeAs(x) 64 | df3:resizeAs(x) 65 | 66 | -- search direction 67 | local s = state.s or x.new() 68 | s:resizeAs(x) 69 | 70 | -- we need a temp storage for X 71 | local x0 = state.x0 or x.new() 72 | local f0 = 0 73 | local df0 = state.df0 or x.new() 74 | x0:resizeAs(x) 75 | df0:resizeAs(x) 76 | 77 | -- evaluate at initial point 78 | f1,tdf = opfunc(x) 79 | fx[#fx+1] = f1 80 | df1:copy(tdf) 81 | i=i+1 82 | 83 | -- initial search direction 84 | s:copy(df1):mul(-1) 85 | 86 | d1 = -s:dot(s ) -- slope 87 | z1 = red/(1-d1) -- initial step 88 | 89 | while i < math.abs(maxEval) do 90 | 91 | x0:copy(x) 92 | f0 = f1 93 | df0:copy(df1) 94 | 95 | x:add(z1,s) 96 | f2,tdf = opfunc(x) 97 | df2:copy(tdf) 98 | i=i+1 99 | d2 = df2:dot(s) 100 | f3,d3,z3 = f1,d1,-z1 -- init point 3 equal to point 1 101 | local m = math.min(maxIter,maxEval-i) 102 | local success = 0 103 | local limit = -1 104 | 105 | while true do 106 | while (f2 > f1+z1*rho*d1 or d2 > -sig*d1) and m > 0 do 107 | limit = z1 108 | if f2 > f1 then 109 | z2 = z3 - (0.5*d3*z3*z3)/(d3*z3+f2-f3) 110 | else 111 | local A = 6*(f2-f3)/z3+3*(d2+d3) 112 | local B = 3*(f3-f2)-z3*(d3+2*d2) 113 | z2 = (math.sqrt(B*B-A*d2*z3*z3)-B)/A 114 | end 115 | if z2 ~= z2 or z2 == math.huge or z2 == -math.huge then 116 | z2 = z3/2; 117 | end 118 | z2 = math.max(math.min(z2, int*z3),(1-int)*z3); 119 | z1 = z1 + z2; 120 | x:add(z2,s) 121 | f2,tdf = opfunc(x) 122 | df2:copy(tdf) 123 | i=i+1 124 | m = m - 1 125 | d2 = df2:dot(s) 126 | z3 = z3-z2; 127 | end 128 | if f2 > f1+z1*rho*d1 or d2 > -sig*d1 then 129 | break 130 | elseif d2 > sig*d1 then 131 | success = 1; 132 | break; 133 | elseif m == 0 then 134 | break; 135 | end 136 | local A = 6*(f2-f3)/z3+3*(d2+d3); 137 | local B = 3*(f3-f2)-z3*(d3+2*d2); 138 | z2 = -d2*z3*z3/(B+math.sqrt(B*B-A*d2*z3*z3)) 139 | 140 | if z2 ~= z2 or z2 == math.huge or z2 == -math.huge or z2 < 0 then 141 | if limit < -0.5 then 142 | z2 = z1 * (ext -1) 143 | else 144 | z2 = (limit-z1)/2 145 | end 146 | elseif (limit > -0.5) and (z2+z1) > limit then 147 | z2 = (limit-z1)/2 148 | elseif limit < -0.5 and (z2+z1) > z1*ext then 
149 | z2 = z1*(ext-1) 150 | elseif z2 < -z3*int then 151 | z2 = -z3*int 152 | elseif limit > -0.5 and z2 < (limit-z1)*(1-int) then 153 | z2 = (limit-z1)*(1-int) 154 | end 155 | f3=f2; d3=d2; z3=-z2; 156 | z1 = z1+z2; 157 | x:add(z2,s) 158 | 159 | f2,tdf = opfunc(x) 160 | df2:copy(tdf) 161 | i=i+1 162 | m = m - 1 163 | d2 = df2:dot(s) 164 | end 165 | if success == 1 then 166 | f1 = f2 167 | fx[#fx+1] = f1; 168 | local ss = (df2:dot(df2)-df2:dot(df1)) / df1:dot(df1) 169 | s:mul(ss) 170 | s:add(-1,df2) 171 | local tmp = df1:clone() 172 | df1:copy(df2) 173 | df2:copy(tmp) 174 | d2 = df1:dot(s) 175 | if d2> 0 then 176 | s:copy(df1) 177 | s:mul(-1) 178 | d2 = -s:dot(s) 179 | end 180 | 181 | z1 = z1 * math.min(ratio, d1/(d2-1e-320)) 182 | d1 = d2 183 | ls_failed = 0 184 | else 185 | x:copy(x0) 186 | f1 = f0 187 | df1:copy(df0) 188 | if ls_failed or i>maxEval then 189 | break 190 | end 191 | local tmp = df1:clone() 192 | df1:copy(df2) 193 | df2:copy(tmp) 194 | s:copy(df1) 195 | s:mul(-1) 196 | d1 = -s:dot(s) 197 | z1 = 1/(1-d1) 198 | ls_failed = 1 199 | end 200 | end 201 | state.df0 = df0 202 | state.df1 = df1 203 | state.df2 = df2 204 | state.df3 = df3 205 | state.x0 = x0 206 | state.s = s 207 | return x,fx,i 208 | end 209 | -------------------------------------------------------------------------------- /lswolfe.lua: -------------------------------------------------------------------------------- 1 | --[[ A Line Search satisfying the Wolfe conditions 2 | 3 | ARGS: 4 | - `opfunc` : a function (the objective) that takes a single input (X), 5 | the point of evaluation, and returns f(X) and df/dX 6 | - `x` : initial point / starting location 7 | - `t` : initial step size 8 | - `d` : descent direction 9 | - `f` : initial function value 10 | - `g` : gradient at initial location 11 | - `gtd` : directional derivative at starting location 12 | - `options.c1` : sufficient decrease parameter 13 | - `options.c2` : curvature parameter 14 | - `options.tolX` : minimum allowable step length 15 | - `options.maxIter` : maximum nb of iterations 16 | 17 | RETURN: 18 | - `f` : function value at x+t*d 19 | - `g` : gradient value at x+t*d 20 | - `x` : the next x (=x+t*d) 21 | - `t` : the step length 22 | - `lsFuncEval` : the number of function evaluations 23 | ]] 24 | function optim.lswolfe(opfunc,x,t,d,f,g,gtd,options) 25 | -- options 26 | options = options or {} 27 | local c1 = options.c1 or 1e-4 28 | local c2 = options.c2 or 0.9 29 | local tolX = options.tolX or 1e-9 30 | local maxIter = options.maxIter or 20 31 | local isverbose = options.verbose or false 32 | 33 | -- some shortcuts 34 | local abs = torch.abs 35 | local min = math.min 36 | local max = math.max 37 | local Tensor = torch.Tensor 38 | 39 | -- verbose function 40 | local function verbose(...) 41 | if isverbose then print(' ', ...) 
end 42 | end 43 | 44 | -- evaluate objective and gradient using initial step 45 | local x_init = x:clone() 46 | x:add(t,d) 47 | local f_new,g_new = opfunc(x) 48 | local lsFuncEval = 1 49 | local gtd_new = g_new * d 50 | 51 | -- bracket an interval containing a point satisfying the Wolfe 52 | -- criteria 53 | local LSiter,t_prev,done = 0,0,false 54 | local f_prev,g_prev,gtd_prev = f,g:clone(),gtd 55 | local bracket,bracketFval,bracketGval 56 | while LSiter < maxIter do 57 | -- check conditions: 58 | if (f_new > (f + c1*t*gtd)) or (LSiter > 1 and f_new >= f_prev) then 59 | bracket = Tensor{t_prev,t} 60 | bracketFval = Tensor{f_prev,f_new} 61 | bracketGval = Tensor(2,g_new:size(1)) 62 | bracketGval[1] = g_prev 63 | bracketGval[2] = g_new 64 | break 65 | 66 | elseif abs(gtd_new) <= -c2*gtd then 67 | bracket = Tensor{t} 68 | bracketFval = Tensor{f_new} 69 | bracketGval = Tensor(1,g_new:size(1)) 70 | bracketGval[1] = g_new 71 | done = true 72 | break 73 | 74 | elseif gtd_new >= 0 then 75 | bracket = Tensor{t_prev,t} 76 | bracketFval = Tensor{f_prev,f_new} 77 | bracketGval = Tensor(2,g_new:size(1)) 78 | bracketGval[1] = g_prev 79 | bracketGval[2] = g_new 80 | break 81 | 82 | end 83 | 84 | -- interpolate: 85 | local tmp = t_prev 86 | t_prev = t 87 | local minStep = t + 0.01*(t-tmp) 88 | local maxStep = t*10 89 | t = optim.polyinterp(Tensor{{tmp,f_prev,gtd_prev}, 90 | {t,f_new,gtd_new}}, 91 | minStep, maxStep) 92 | 93 | -- next step: 94 | f_prev = f_new 95 | g_prev = g_new:clone() 96 | gtd_prev = gtd_new 97 | x[{}] = x_init 98 | x:add(t,d) 99 | f_new,g_new = opfunc(x) 100 | lsFuncEval = lsFuncEval + 1 101 | gtd_new = g_new * d 102 | LSiter = LSiter + 1 103 | end 104 | 105 | -- reached max nb of iterations? 106 | if LSiter == maxIter then 107 | bracket = Tensor{0,t} 108 | bracketFval = Tensor{f,f_new} 109 | bracketGval = Tensor(2,g_new:size(1)) 110 | bracketGval[1] = g 111 | bracketGval[2] = g_new 112 | end 113 | 114 | -- zoom phase: we now have a point satisfying the criteria, or 115 | -- a bracket around it. 
We refine the bracket until we find the 116 | -- exact point satisfying the criteria 117 | local insufProgress = false 118 | local LOposRemoved = 0 119 | while not done and LSiter < maxIter do 120 | -- find high and low points in bracket 121 | local f_LO,LOpos = bracketFval:min(1) 122 | LOpos = LOpos[1] f_LO = f_LO[1] 123 | local HIpos = -LOpos+3 124 | 125 | -- compute new trial value 126 | t = optim.polyinterp(Tensor{{bracket[1],bracketFval[1],bracketGval[1]*d}, 127 | {bracket[2],bracketFval[2],bracketGval[2]*d}}) 128 | 129 | -- test what we are making sufficient progress 130 | if min(bracket:max()-t,t-bracket:min())/(bracket:max()-bracket:min()) < 0.1 then 131 | if insufProgress or t>=bracket:max() or t <= bracket:min() then 132 | if abs(t-bracket:max()) < abs(t-bracket:min()) then 133 | t = bracket:max()-0.1*(bracket:max()-bracket:min()) 134 | else 135 | t = bracket:min()+0.1*(bracket:max()-bracket:min()) 136 | end 137 | insufProgress = false 138 | else 139 | insufProgress = true 140 | end 141 | else 142 | insufProgress = false 143 | end 144 | 145 | -- Evaluate new point 146 | x[{}] = x_init 147 | x:add(t,d) 148 | f_new,g_new = opfunc(x) 149 | lsFuncEval = lsFuncEval + 1 150 | gtd_new = g_new * d 151 | LSiter = LSiter + 1 152 | if f_new > f + c1*t*gtd or f_new >= f_LO then 153 | -- Armijo condition not satisfied or not lower than lowest point 154 | bracket[HIpos] = t 155 | bracketFval[HIpos] = f_new 156 | bracketGval[HIpos] = g_new 157 | else 158 | if abs(gtd_new) <= - c2*gtd then 159 | -- Wolfe conditions satisfied 160 | done = true 161 | elseif gtd_new*(bracket[HIpos]-bracket[LOpos]) >= 0 then 162 | -- Old HI becomes new LO 163 | bracket[HIpos] = bracket[LOpos] 164 | bracketFval[HIpos] = bracketFval[LOpos] 165 | bracketGval[HIpos] = bracketGval[LOpos] 166 | end 167 | -- New point becomes new LO 168 | bracket[LOpos] = t 169 | bracketFval[LOpos] = f_new 170 | bracketGval[LOpos] = g_new 171 | end 172 | 173 | -- done? 174 | if not done and abs((bracket[1]-bracket[2])*gtd_new) < tolX then 175 | break 176 | end 177 | end 178 | 179 | -- be verbose 180 | if LSiter == maxIter then 181 | verbose('reached max number of iterations') 182 | end 183 | 184 | -- return stuff 185 | local _,LOpos = bracketFval:min(1) 186 | LOpos = LOpos[1] 187 | t = bracket[LOpos] 188 | f_new = bracketFval[LOpos] 189 | g_new = bracketGval[LOpos] 190 | x[{}] = x_init 191 | x:add(t,d) 192 | return f_new,g_new,x,t,lsFuncEval 193 | end 194 | -------------------------------------------------------------------------------- /fista.lua: -------------------------------------------------------------------------------- 1 | --[[ FISTA with backtracking line search 2 | 3 | - `f` : smooth function 4 | - `g` : non-smooth function 5 | - `pl` : minimizer of intermediate problem Q(x,y) 6 | - `xinit` : initial point 7 | - `params` : table of parameters (**optional**) 8 | - `params.L` : 1/(step size) for ISTA/FISTA iteration (0.1) 9 | - `params.Lstep` : step size multiplier at each iteration (1.5) 10 | - `params.maxiter` : max number of iterations (50) 11 | - `params.maxline` : max number of line search iterations per iteration (20) 12 | - `params.errthres`: Error thershold for convergence check (1e-4) 13 | - `params.doFistaUpdate` : true : use FISTA, false: use ISTA (true) 14 | - `params.verbose` : store each iteration solution and print detailed info (false) 15 | 16 | On output, `params` will contain these additional fields that can be reused. 17 | 18 | - `params.L` : last used L value will be written. 
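A minimal usage sketch (illustrative only; the lasso-style setup below, with placeholder `A`, `b`
and `lambda`, is not part of this package):

   local A, b, lambda = torch.randn(16, 8), torch.randn(16), 0.1
   -- smooth part f(x) = ||Ax - b||^2, returning its gradient when asked for 'dx'
   local f = function(x, mode)
      local r = torch.mv(A, x) - b
      local fval = r:dot(r)
      if mode and mode:match('dx') then
         return fval, torch.mv(A:t(), r):mul(2)
      end
      return fval
   end
   -- non-smooth part g(x) = lambda * ||x||_1
   local g = function(x) return lambda * x:norm(1) end
   -- proximal operator: in-place soft-thresholding of x with threshold lambda/L
   local pl = function(x, L)
      local t = lambda / L
      x:apply(function(v) return math.max(v - t, 0) + math.min(v + t, 0) end)
   end
   local x, history = optim.FistaLS(f, g, pl, torch.zeros(8), {maxiter = 100})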
19 |
20 | These are temporary storages needed by the algo and if the same params object is
21 | passed a second time, these same storages will be used without new allocation.
22 |
23 | - `params.xkm` : previous iteration point
24 | - `params.y` : fista iteration
25 | - `params.ply` : ply = pl(y - 1/L grad(f))
26 |
27 | Returns the solution x and history of {function evals, number of line search iterations, ...}
28 |
29 | Algorithm is published in
30 |
31 | @article{beck-fista-09,
32 | Author = {Beck, Amir and Teboulle, Marc},
33 | Journal = {SIAM J. Img. Sci.},
34 | Number = {1},
35 | Pages = {183--202},
36 | Title = {A Fast Iterative Shrinkage-Thresholding Algorithm for Linear Inverse Problems},
37 | Volume = {2},
38 | Year = {2009}}
39 | ]]
40 | function optim.FistaLS(f, g, pl, xinit, params)
41 |
42 | local params = params or {}
43 | local L = params.L or 0.1
44 | local Lstep = params.Lstep or 1.5
45 | local maxiter = params.maxiter or 50
46 | local maxline = params.maxline or 20
47 | local errthres = params.errthres or 1e-4
48 | local doFistaUpdate = params.doFistaUpdate
49 | local verbose = params.verbose
50 |
51 | -- temporary allocations
52 | params.xkm = params.xkm or torch.Tensor()
53 | params.y = params.y or torch.Tensor()
54 | params.ply = params.ply or torch.Tensor()
55 | local xkm = params.xkm -- previous iteration
56 | local y = params.y -- fista iteration
57 | local ply = params.ply -- soft shrinked y
58 |
59 | -- we start from all zeros
60 | local xk = xinit
61 | xkm:resizeAs(xk):zero()
62 | ply:resizeAs(xk):zero()
63 | y:resizeAs(xk):zero()
64 |
65 | local history = {} -- keep track of stuff
66 | local niter = 0 -- number of iterations done
67 | local converged = false -- are we done?
68 | local tk = 1 -- momentum param for FISTA
69 | local tkp = 0
70 |
71 |
72 | local gy = g(y)
73 | local fval = math.huge -- fval = f+g
74 | while not converged and niter < maxiter do
75 |
76 | -- run through smooth function (code is input, input is target)
77 | -- get derivatives from smooth function
78 | local fy,gfy = f(y,'dx')
79 | --local gfy = f(y)
80 |
81 | local fply = 0
82 | local gply = 0
83 | local Q = 0
84 |
85 | ----------------------------------------------
86 | -- do line search to find new current location starting from fista loc
87 | local nline = 0
88 | local linesearchdone = false
89 | while not linesearchdone do
90 | -- take a step in gradient direction of smooth function
91 | ply:copy(y)
92 | ply:add(-1/L,gfy)
93 |
94 | -- and solve for minimum of auxiliary problem
95 | pl(ply,L)
96 | -- this is candidate for new current iteration
97 | xk:copy(ply)
98 |
99 | -- evaluate this point F(ply)
100 | fply = f(ply)
101 |
102 | -- ply - y
103 | ply:add(-1, y)
104 | -- <ply-y , gradf(y)>
105 | local Q2 = gfy:dot(ply)
106 | -- L/2 ||beta-y||^2
107 | local Q3 = L/2 * ply:dot(ply)
108 | -- Q(beta,y) = F(y) + <beta-y , gradf(y)> + L/2||beta-y||^2 + G(beta)
109 | Q = fy + Q2 + Q3
110 |
111 | if verbose then
112 | print(string.format('nline=%d L=%g fply=%g Q=%g fy=%g Q2=%g Q3=%g',nline,L,fply,Q,fy,Q2,Q3))
113 | end
114 | -- check if F(beta) < Q(pl(y),y)
115 | if fply <= Q then --and Fply + Gply <= F then
116 | -- now evaluate G here
117 | linesearchdone = true
118 | elseif nline >= maxline then
119 | linesearchdone = true
120 | xk:copy(xkm) -- if we can't find a better point, current iter = previous iter
121 | --print('oops')
122 | else
123 | L = L * Lstep
124 | end
125 | nline = nline + 1
126 | end
127 | -- end line search
128 | ---------------------------------------------
129 |
130 | ---------------------------------------------
131 | -- FISTA 132 | --------------------------------------------- 133 | if doFistaUpdate then 134 | -- do the FISTA step 135 | tkp = (1 + math.sqrt(1 + 4*tk*tk)) / 2 136 | -- x(k-1) = x(k-1) - x(k) 137 | xkm:add(-1,xk) 138 | -- y(k+1) = x(k) + (1-t(k)/t(k+1))*(x(k-1)-x(k)) 139 | y:copy(xk) 140 | y:add( (1-tk)/tkp , xkm) 141 | -- store for next iterations 142 | -- x(k-1) = x(k) 143 | xkm:copy(xk) 144 | else 145 | y:copy(xk) 146 | end 147 | -- t(k) = t(k+1) 148 | tk = tkp 149 | fply = f(y) 150 | gply = g(y) 151 | if verbose then 152 | print(string.format('iter=%d eold=%g enew=%g',niter,fval,fply+gply)) 153 | end 154 | 155 | niter = niter + 1 156 | 157 | -- bookeeping 158 | fval = fply + gply 159 | history[niter] = {} 160 | history[niter].nline = nline 161 | history[niter].L = L 162 | history[niter].F = fval 163 | history[niter].Fply = fply 164 | history[niter].Gply = gply 165 | history[niter].Q = Q 166 | params.L = L 167 | if verbose then 168 | history[niter].xk = xk:clone() 169 | history[niter].y = y:clone() 170 | end 171 | 172 | -- are we done? 173 | if niter > 1 and math.abs(history[niter].F - history[niter-1].F) <= errthres then 174 | converged = true 175 | xinit:copy(y) 176 | return y,history 177 | end 178 | 179 | if niter >= maxiter then 180 | xinit:copy(y) 181 | return y,history 182 | end 183 | 184 | --if niter > 1 and history[niter].F > history[niter-1].F then 185 | --print(niter, 'This was supposed to be a convex function, we are going up') 186 | --converged = true 187 | --return xk,history 188 | --end 189 | end 190 | error('not supposed to be here') 191 | end 192 | 193 | -------------------------------------------------------------------------------- /polyinterp.lua: -------------------------------------------------------------------------------- 1 | local function isreal(x) 2 | return x == x 3 | end 4 | 5 | local function isnan(x) 6 | return not x == x 7 | end 8 | 9 | local function roots(c) 10 | local tol=1e-12 11 | c[torch.lt(torch.abs(c),tol)]=0 12 | 13 | local nonzero = torch.ne(c,0) 14 | if nonzero:max() == 0 then 15 | return 0 16 | end 17 | 18 | -- first non-zero 19 | local _,pos = torch.max(nonzero,1) 20 | pos = pos[1] 21 | c=c[{ {pos,-1} }] 22 | 23 | local nz = 0 24 | for i=c:size(1),1,-1 do 25 | if c[i] ~= 0 then 26 | break 27 | else 28 | nz = nz + 1 29 | end 30 | end 31 | c=c[{ {1,c:size(1)-nz} }] 32 | 33 | local n = c:size(1)-1 34 | if n == 1 then 35 | local e = torch.Tensor({{-c[2]/c[1], 0}}) 36 | if nz > 0 then 37 | return torch.cat(e, torch.zeros(nz, 2), 1) 38 | else 39 | return e 40 | end 41 | elseif n > 1 then 42 | local A = torch.diag(torch.ones(n-1),-1) 43 | A[1] = -c[{ {2,n+1} }]/c[1]; 44 | local e = torch.eig(A,'N') 45 | if nz > 0 then 46 | return torch.cat(e, torch.zeros(nz,2), 1) 47 | else 48 | return e 49 | end 50 | else 51 | return torch.zeros(nz,2) 52 | end 53 | end 54 | 55 | local function real(x) 56 | if type(x) == number then return x end 57 | return x[{ {} , 1}] 58 | end 59 | 60 | local function imag(x) 61 | if type(x) == 'number' then return 0 end 62 | if x:nDimension() == 1 then 63 | return torch.zeros(x:size(1)) 64 | else 65 | return x[{ {}, 2}] 66 | end 67 | end 68 | 69 | local function polyval(p,x) 70 | local pwr = p:size(1) 71 | if type(x) == 'number' then 72 | local val = 0 73 | p:apply(function(pc) pwr = pwr-1; val = val + pc*x^pwr; return pc end) 74 | return val 75 | else 76 | local val = x.new(x:size(1)) 77 | p:apply(function(pc) pwr = pwr-1; val:add(pc,torch.pow(x,pwr)); return pc end) 78 | return val 79 | end 80 | end 81 | 82 | 
---------------------------------------------------------------------- 83 | -- Minimum of interpolating polynomial based on function and 84 | -- derivative values 85 | -- 86 | -- ARGS: 87 | -- points : N triplets (x,f,g), must be a Tensor 88 | -- xmin : min value that brackets minimum (default: min of points) 89 | -- xmax : max value that brackets maximum (default: max of points) 90 | -- 91 | -- RETURN: 92 | -- minPos : position of minimum 93 | -- 94 | function optim.polyinterp(points,xminBound,xmaxBound) 95 | -- locals 96 | local sqrt = torch.sqrt 97 | local mean = torch.mean 98 | local Tensor = torch.Tensor 99 | local zeros = torch.zeros 100 | local max = math.max 101 | local min = math.min 102 | 103 | -- nb of points / order of polynomial 104 | local nPoints = points:size(1) 105 | local order = nPoints*2-1 106 | 107 | -- returned values 108 | local minPos 109 | 110 | -- Code for most common case: 111 | -- + cubic interpolation of 2 points w/ function and derivative values for both 112 | -- + no xminBound/xmaxBound 113 | if nPoints == 2 and order == 3 and not xminBound and not xmaxBound then 114 | -- Solution in this case (where x2 is the farthest point): 115 | -- d1 = g1 + g2 - 3*(f1-f2)/(x1-x2); 116 | -- d2 = sqrt(d1^2 - g1*g2); 117 | -- minPos = x2 - (x2 - x1)*((g2 + d2 - d1)/(g2 - g1 + 2*d2)); 118 | -- t_new = min(max(minPos,x1),x2); 119 | local minVal,minPos = points[{ {},1 }]:min(1) 120 | minVal = minVal[1] minPos = minPos[1] 121 | local notMinPos = -minPos+3; 122 | 123 | local d1 = points[{minPos,3}] + points[{notMinPos,3}] 124 | - 3*(points[{minPos,2}]-points[{notMinPos,2}]) 125 | / (points[{minPos,1}]-points[{notMinPos,1}]); 126 | local d2 = sqrt(d1^2 - points[{minPos,3}]*points[{notMinPos,3}]); 127 | 128 | if isreal(d2) then -- isreal() 129 | local t = points[{notMinPos,1}] - (points[{notMinPos,1}] 130 | - points[{minPos,1}]) * ((points[{notMinPos,3}] + d2 - d1) 131 | / (points[{notMinPos,3}] - points[{minPos,3}] + 2*d2)) 132 | 133 | minPos = min(max(t,points[{minPos,1}]),points[{notMinPos,1}]) 134 | else 135 | minPos = mean(points[{{},1}]) 136 | end 137 | return minPos 138 | end 139 | 140 | -- TODO: get the code below to work! 141 | --error(' extrapolation not implemented yet...') 142 | 143 | -- Compute Bounds of Interpolation Area 144 | local xmin = points[{{},1}]:min() 145 | local xmax = points[{{},1}]:max() 146 | xminBound = xminBound or xmin 147 | xmaxBound = xmaxBound or xmax 148 | 149 | -- Add constraints on function values 150 | local A = zeros(nPoints*2,order+1) 151 | local b = zeros(nPoints*2,1) 152 | for i = 1,nPoints do 153 | local constraint = zeros(order+1) 154 | for j = order,0,-1 do 155 | constraint[order-j+1] = points[{i,1}]^j 156 | end 157 | A[i] = constraint 158 | b[i] = points[{i,2}] 159 | end 160 | 161 | -- Add constraints based on derivatives 162 | for i = 1,nPoints do 163 | local constraint = zeros(order+1) 164 | for j = 1,order do 165 | constraint[j] = (order-j+1)*points[{i,1}]^(order-j) 166 | end 167 | A[nPoints+i] = constraint 168 | b[nPoints+i] = points[{i,3}] 169 | end 170 | 171 | -- Find interpolating polynomial 172 | local res = torch.gels(b,A) 173 | local params = res[{ {1,nPoints*2} }]:squeeze() 174 | 175 | --print(A) 176 | --print(b) 177 | --print(params) 178 | params[torch.le(torch.abs(params),1e-12)]=0 179 | 180 | -- Compute Critical Points 181 | local dParams = zeros(order); 182 | for i = 1,params:size(1)-1 do 183 | dParams[i] = params[i]*(order-i+1) 184 | end 185 | 186 | -- nan/inf? 
187 | local nans = false 188 | if torch.ne(dParams,dParams):max() > 0 or torch.eq(dParams,math.huge):max() > 0 then 189 | nans = true 190 | end 191 | -- for i = 1,dParams:size(1) do 192 | -- if dParams[i] ~= dParams[i] or dParams[i] == math.huge then 193 | -- nans = true 194 | -- break 195 | -- end 196 | -- end 197 | local cp = torch.cat(Tensor{xminBound,xmaxBound},points[{{},1}]) 198 | if not nans then 199 | local cproots = roots(dParams) 200 | local cpi = zeros(cp:size(1),2) 201 | cpi[{ {1,cp:size(1)} , 1 }] = cp 202 | cp = torch.cat(cpi,cproots,1) 203 | end 204 | 205 | --print(dParams) 206 | --print(cp) 207 | 208 | -- Test Critical Points 209 | local fmin = math.huge 210 | -- Default to Bisection if no critical points valid: 211 | minPos = (xminBound+xmaxBound)/2 212 | --print(minPos,fmin) 213 | --print(xminBound,xmaxBound) 214 | for i = 1,cp:size(1) do 215 | local xCP = cp[{ {i,i} , {} }] 216 | --print('xcp=') 217 | --print(xCP) 218 | local ixCP = imag(xCP)[1] 219 | local rxCP = real(xCP)[1] 220 | if ixCP == 0 and rxCP >= xminBound and rxCP <= xmaxBound then 221 | local fCP = polyval(params,rxCP) 222 | --print('fcp=') 223 | --print(fCP) 224 | --print(fCP < fmin) 225 | if fCP < fmin then 226 | minPos = rxCP 227 | fmin = fCP 228 | --print('u',minPos,fmin) 229 | end 230 | --print('v',minPos,fmin) 231 | end 232 | end 233 | return minPos,fmin 234 | end 235 | -------------------------------------------------------------------------------- /lbfgs.lua: -------------------------------------------------------------------------------- 1 | --[[ An implementation of L-BFGS, heavily inspired by minFunc (Mark Schmidt) 2 | 3 | This implementation of L-BFGS relies on a user-provided line 4 | search function (state.lineSearch). If this function is not 5 | provided, then a simple learningRate is used to produce fixed 6 | size steps. Fixed size steps are much less costly than line 7 | searches, and can be useful for stochastic problems. 8 | 9 | The learning rate is used even when a line search is provided. 10 | This is also useful for large-scale stochastic problems, where 11 | opfunc is a noisy approximation of f(x). In that case, the learning 12 | rate allows a reduction of confidence in the step size. 
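Two typical call patterns, as a hedged sketch (`feval` and `nSteps` are placeholders for a
user-supplied closure and loop count):

   -- deterministic / full-batch: several iterations with a Wolfe line search
   x, fhist = optim.lbfgs(feval, x, {maxIter = 100, lineSearch = optim.lswolfe})

   -- stochastic / mini-batch: one iteration per call with a fixed step size,
   -- reusing the same state table so the L-BFGS memory persists across calls
   local state = {maxIter = 1, learningRate = 1e-1}
   for i = 1, nSteps do
      x = optim.lbfgs(feval, x, state)
   end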
13 | 14 | ARGS: 15 | 16 | - `opfunc` : a function that takes a single input (X), the point of 17 | evaluation, and returns f(X) and df/dX 18 | - `x` : the initial point 19 | - `state` : a table describing the state of the optimizer; after each 20 | call the state is modified 21 | - `state.maxIter` : Maximum number of iterations allowed 22 | - `state.maxEval` : Maximum number of function evaluations 23 | - `state.tolFun` : Termination tolerance on the first-order optimality 24 | - `state.tolX` : Termination tol on progress in terms of func/param changes 25 | - `state.lineSearch` : A line search function 26 | - `state.learningRate` : If no line search provided, then a fixed step size is used 27 | 28 | RETURN: 29 | - `x*` : the new `x` vector, at the optimal point 30 | - `f` : a table of all function values: 31 | `f[1]` is the value of the function before any optimization and 32 | `f[#f]` is the final fully optimized value, at `x*` 33 | 34 | (Clement Farabet, 2012) 35 | ]] 36 | function optim.lbfgs(opfunc, x, config, state) 37 | -- get/update state 38 | local config = config or {} 39 | local state = state or config 40 | local maxIter = tonumber(config.maxIter) or 20 41 | local maxEval = tonumber(config.maxEval) or maxIter*1.25 42 | local tolFun = config.tolFun or 1e-5 43 | local tolX = config.tolX or 1e-9 44 | local nCorrection = config.nCorrection or 100 45 | local lineSearch = config.lineSearch 46 | local lineSearchOpts = config.lineSearchOptions 47 | local learningRate = config.learningRate or 1 48 | local isverbose = config.verbose or false 49 | 50 | state.funcEval = state.funcEval or 0 51 | state.nIter = state.nIter or 0 52 | 53 | -- verbose function 54 | local verbose 55 | if isverbose then 56 | verbose = function(...) print(' ', ...) end 57 | else 58 | verbose = function() end 59 | end 60 | 61 | -- import some functions 62 | local abs = math.abs 63 | local min = math.min 64 | 65 | -- evaluate initial f(x) and df/dx 66 | local f,g = opfunc(x) 67 | local f_hist = {f} 68 | local currentFuncEval = 1 69 | state.funcEval = state.funcEval + 1 70 | local p = g:size(1) 71 | 72 | -- check optimality of initial point 73 | state.tmp1 = state.tmp1 or g.new(g:size()):zero(); local tmp1 = state.tmp1 74 | tmp1:copy(g):abs() 75 | if tmp1:sum() <= tolFun then 76 | -- optimality condition below tolFun 77 | verbose('optimality condition below tolFun') 78 | return x,f_hist 79 | end 80 | 81 | if not state.dir_bufs then 82 | -- reusable buffers for y's and s's, and their histories 83 | verbose('creating recyclable direction/step/history buffers') 84 | state.dir_bufs = state.dir_bufs or g.new(nCorrection+1, p):split(1) 85 | state.stp_bufs = state.stp_bufs or g.new(nCorrection+1, p):split(1) 86 | for i=1,#state.dir_bufs do 87 | state.dir_bufs[i] = state.dir_bufs[i]:squeeze(1) 88 | state.stp_bufs[i] = state.stp_bufs[i]:squeeze(1) 89 | end 90 | end 91 | 92 | -- variables cached in state (for tracing) 93 | local d = state.d 94 | local t = state.t 95 | local old_dirs = state.old_dirs 96 | local old_stps = state.old_stps 97 | local Hdiag = state.Hdiag 98 | local g_old = state.g_old 99 | local f_old = state.f_old 100 | 101 | -- optimize for a max of maxIter iterations 102 | local nIter = 0 103 | while nIter < maxIter do 104 | -- keep track of nb of iterations 105 | nIter = nIter + 1 106 | state.nIter = state.nIter + 1 107 | 108 | ------------------------------------------------------------ 109 | -- compute gradient descent direction 110 | ------------------------------------------------------------ 111 | if 
state.nIter == 1 then 112 | d = g:clone():mul(-1) -- -g 113 | old_dirs = {} 114 | old_stps = {} 115 | Hdiag = 1 116 | else 117 | -- do lbfgs update (update memory) 118 | local y = table.remove(state.dir_bufs) -- pop 119 | local s = table.remove(state.stp_bufs) 120 | y:add(g, -1, g_old) -- g - g_old 121 | s:mul(d, t) -- d*t 122 | local ys = y:dot(s) -- y*s 123 | if ys > 1e-10 then 124 | -- updating memory 125 | if #old_dirs == nCorrection then 126 | -- shift history by one (limited-memory) 127 | local removed1 = table.remove(old_dirs, 1) 128 | local removed2 = table.remove(old_stps, 1) 129 | table.insert(state.dir_bufs, removed1) 130 | table.insert(state.stp_bufs, removed2) 131 | end 132 | 133 | -- store new direction/step 134 | table.insert(old_dirs, s) 135 | table.insert(old_stps, y) 136 | 137 | -- update scale of initial Hessian approximation 138 | Hdiag = ys / y:dot(y) -- (y*y) 139 | else 140 | -- put y and s back into the buffer pool 141 | table.insert(state.dir_bufs, y) 142 | table.insert(state.stp_bufs, s) 143 | end 144 | 145 | -- compute the approximate (L-BFGS) inverse Hessian 146 | -- multiplied by the gradient 147 | local k = #old_dirs 148 | 149 | -- need to be accessed element-by-element, so don't re-type tensor: 150 | state.ro = state.ro or torch.Tensor(nCorrection); local ro = state.ro 151 | for i = 1,k do 152 | ro[i] = 1 / old_stps[i]:dot(old_dirs[i]) 153 | end 154 | 155 | -- iteration in L-BFGS loop collapsed to use just one buffer 156 | local q = tmp1 -- reuse tmp1 for the q buffer 157 | -- need to be accessed element-by-element, so don't re-type tensor: 158 | state.al = state.al or torch.zeros(nCorrection) local al = state.al 159 | 160 | q:mul(g, -1) -- -g 161 | for i = k,1,-1 do 162 | al[i] = old_dirs[i]:dot(q) * ro[i] 163 | q:add(-al[i], old_stps[i]) 164 | end 165 | 166 | -- multiply by initial Hessian 167 | r = d -- share the same buffer, since we don't need the old d 168 | r:mul(q, Hdiag) -- q[1] * Hdiag 169 | for i = 1,k do 170 | local be_i = old_stps[i]:dot(r) * ro[i] 171 | r:add(al[i]-be_i, old_dirs[i]) 172 | end 173 | -- final direction is in r/d (same object) 174 | end 175 | g_old = g_old or g:clone() 176 | g_old:copy(g) 177 | f_old = f 178 | 179 | ------------------------------------------------------------ 180 | -- compute step length 181 | ------------------------------------------------------------ 182 | -- directional derivative 183 | local gtd = g:dot(d) -- g * d 184 | 185 | -- check that progress can be made along that direction 186 | if gtd > -tolX then 187 | break 188 | end 189 | 190 | -- reset initial guess for step size 191 | if state.nIter == 1 then 192 | tmp1:copy(g):abs() 193 | t = min(1,1/tmp1:sum()) * learningRate 194 | else 195 | t = learningRate 196 | end 197 | 198 | -- optional line search: user function 199 | local lsFuncEval = 0 200 | if lineSearch and type(lineSearch) == 'function' then 201 | -- perform line search, using user function 202 | f,g,x,t,lsFuncEval = lineSearch(opfunc,x,t,d,f,g,gtd,lineSearchOpts) 203 | table.insert(f_hist, f) 204 | else 205 | -- no line search, simply move with fixed-step 206 | x:add(t,d) 207 | if nIter ~= maxIter then 208 | -- re-evaluate function only if not in last iteration 209 | -- the reason we do this: in a stochastic setting, 210 | -- no use to re-evaluate that function here 211 | f,g = opfunc(x) 212 | lsFuncEval = 1 213 | table.insert(f_hist, f) 214 | end 215 | end 216 | 217 | -- update func eval 218 | currentFuncEval = currentFuncEval + lsFuncEval 219 | state.funcEval = state.funcEval + lsFuncEval 220 | 
221 | ------------------------------------------------------------ 222 | -- check conditions 223 | ------------------------------------------------------------ 224 | if nIter == maxIter then 225 | -- no use to run tests 226 | verbose('reached max number of iterations') 227 | break 228 | end 229 | 230 | if currentFuncEval >= maxEval then 231 | -- max nb of function evals 232 | verbose('max nb of function evals') 233 | break 234 | end 235 | 236 | tmp1:copy(g):abs() 237 | if tmp1:sum() <= tolFun then 238 | -- check optimality 239 | verbose('optimality condition below tolFun') 240 | break 241 | end 242 | 243 | tmp1:copy(d):mul(t):abs() 244 | if tmp1:sum() <= tolX then 245 | -- step size below tolX 246 | verbose('step size below tolX') 247 | break 248 | end 249 | 250 | if abs(f-f_old) < tolX then 251 | -- function value changing less than tolX 252 | verbose('function value changing less than tolX') 253 | break 254 | end 255 | end 256 | 257 | -- save state 258 | state.old_dirs = old_dirs 259 | state.old_stps = old_stps 260 | state.Hdiag = Hdiag 261 | state.g_old = g_old 262 | state.f_old = f_old 263 | state.t = t 264 | state.d = d 265 | 266 | -- return optimal x, and history of f(x) 267 | return x,f_hist,currentFuncEval 268 | end 269 | -------------------------------------------------------------------------------- /cmaes.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'math' 3 | 4 | local BestSolution = {} 5 | --[[ An implementation of `CMAES` (Covariance Matrix Adaptation Evolution Strategy), 6 | ported from https://www.lri.fr/~hansen/barecmaes2.html. 7 | 8 | Parameters 9 | ---------- 10 | ARGS: 11 | 12 | - `opfunc` : a function that takes a single input (X), the point of 13 | evaluation, and returns f(X) and df/dX. Note that df/dX is not used 14 | - `x` : the initial point 15 | - `state.sigma` 16 | float, initial step-size (standard deviation in each 17 | coordinate) 18 | - `state.maxEval` 19 | int, maximal number of function evaluations 20 | - `state.ftarget` 21 | float, target function value 22 | - `state.popsize` 23 | population size. If this is left empty, 24 | 4 + int(3 * log(|x|)) will be used 25 | - `state.ftarget` 26 | stop if fitness < ftarget 27 | - `state.verb_disp` 28 | int, display on console every verb_disp iteration, 0 for never 29 | 30 | RETURN: 31 | - `x*` : the new `x` vector, at the optimal point 32 | - `f` : a table of all function values: 33 | `f[1]` is the value of the function before any optimization and 34 | `f[#f]` is the final fully optimized value, at `x*` 35 | --]] 36 | function optim.cmaes(opfunc, x, config, state) 37 | if (x.triu == nil or x.diag == nil) then 38 | error('Unsupported Tensor ' .. x:type() .. 
" please use Float- or DoubleTensor for x") 39 | end 40 | -- process input parameters 41 | local config = config or {} 42 | local state = state or config 43 | local xmean = x:clone():view(-1) -- distribution mean, a flattened copy 44 | local N = xmean:size(1) -- number of objective variables/problem dimension 45 | local sigma = state.sigma -- coordinate wise standard deviation (step size) 46 | local ftarget = state.ftarget -- stop if fitness < ftarget 47 | local maxEval = tonumber(state.maxEval) or 1e3*N^2 48 | local objfunc = opfunc 49 | local verb_disp = state.verb_disp -- display step size 50 | local min_iterations = state.min_iterations or 1 51 | 52 | local lambda = state.popsize -- population size, offspring number 53 | -- Strategy parameter setting: Selection 54 | if state.popsize == nil then 55 | lambda = 4 + math.floor(3 * math.log(N)) 56 | end 57 | 58 | local mu = lambda / 2 -- number of parents/points for recombination 59 | local weights = torch.range(0,mu-1):apply(function(i) 60 | return math.log(mu+0.5) - math.log(i+1) end) -- recombination weights 61 | weights:div(weights:sum()) -- normalize recombination weights array 62 | local mueff = weights:sum()^2 / torch.pow(weights,2):sum() -- variance-effectiveness of sum w_i x_i 63 | weights = weights:typeAs(x) 64 | 65 | -- Strategy parameter setting: Adaptation 66 | local cc = (4 + mueff/N) / (N+4 + 2 * mueff/N) -- time constant for cumulation for C 67 | local cs = (mueff + 2) / (N + mueff + 5) -- t-const for cumulation for sigma control 68 | local c1 = 2 / ((N + 1.3)^2 + mueff) -- learning rate for rank-one update of C 69 | local cmu = math.min(1 - c1, 2 * (mueff - 2 + 1/mueff) / ((N + 2)^2 + mueff)) -- and for rank-mu update 70 | local damps = 2 * mueff/lambda + 0.3 + cs -- damping for sigma, usually close to 1 71 | 72 | -- Initialize dynamic (internal) state variables 73 | local pc = torch.Tensor(N):zero():typeAs(x) -- evolution paths for C 74 | local ps = torch.Tensor(N):zero():typeAs(x) -- evolution paths for sigma 75 | local B = torch.eye(N):typeAs(x) -- B defines the coordinate system 76 | local D = torch.Tensor(N):fill(1):typeAs(x) -- diagonal D defines the scaling 77 | local C = torch.eye(N):typeAs(x) -- covariance matrix 78 | if not pcall(function () torch.symeig(C,'V') end) then -- if error occurs trying to use symeig 79 | error('torch.symeig not available for ' .. x:type() .. 
80 | " please use Float- or DoubleTensor for x") 81 | end 82 | local candidates = torch.Tensor(lambda,N):typeAs(x) 83 | local invsqrtC = torch.eye(N):typeAs(x) -- C^-1/2 84 | local eigeneval = 0 -- tracking the update of B and D 85 | local counteval = 0 86 | local f_hist = {[1]=opfunc(x)} -- for bookkeeping output and termination 87 | local fitvals = torch.Tensor(lambda)-- fitness values 88 | local best = BestSolution.new(nil,nil,counteval) 89 | local iteration = 0 -- iteration of the optimize loop 90 | 91 | 92 | local function ask() 93 | --[[return a list of lambda candidate solutions according to 94 | m + sig * Normal(0,C) = m + sig * B * D * Normal(0,I) 95 | --]] 96 | -- Eigendecomposition: first update B, D and invsqrtC from C 97 | -- postpone in case to achieve O(N^2) 98 | if counteval - eigeneval > lambda/(c1+cmu)/C:size(1)/10 then 99 | eigeneval = counteval 100 | C = torch.triu(C) + torch.triu(C,1):t() -- enforce symmetry 101 | D, B = torch.symeig(C,'V') -- eigen decomposition, B==normalized eigenvectors, O(N^3) 102 | D = torch.sqrt(D) -- D contains standard deviations now 103 | invsqrtC = (B * torch.diag(torch.pow(D,-1)) * B:t()) 104 | end 105 | for k=1,lambda do --repeat lambda times 106 | local z = D:clone():normal(0,1):cmul(D) 107 | candidates[{k,{}}] = torch.add(xmean, (B * z) * sigma) 108 | end 109 | 110 | return candidates 111 | end 112 | 113 | 114 | local function tell(arx) 115 | --[[update the evolution paths and the distribution parameters m, 116 | sigma, and C within CMA-ES. 117 | 118 | Parameters 119 | ---------- 120 | `arx` 121 | a list of solutions, presumably from `ask()` 122 | `fitvals` 123 | the corresponding objective function values --]] 124 | -- bookkeeping, preparation 125 | counteval = counteval + lambda -- slightly artificial to do here 126 | local xold = xmean:clone() 127 | 128 | -- Sort by fitness and compute weighted mean into xmean 129 | local arindex = nil --sorted indices 130 | fitvals, arindex = torch.sort(fitvals) 131 | arx = arx:index(1, arindex[{{1, mu}}]) -- sorted candidate solutions 132 | 133 | table.insert(f_hist, fitvals[1]) --append best fitness to history 134 | best:update(arx[1], fitvals[1], counteval) 135 | 136 | xmean:zero() 137 | xmean:addmv(arx:t(), weights) --dot product 138 | 139 | -- Cumulation: update evolution paths 140 | local y = xmean - xold 141 | local z = invsqrtC * y -- == C^(-1/2) * (xnew - xold) 142 | 143 | local c = (cs * (2-cs) * mueff)^0.5 / sigma 144 | ps = ps - ps * cs + z * c -- exponential decay on ps 145 | local hsig = (torch.sum(torch.pow(ps,2)) / 146 | (1-(1-cs)^(2*counteval/lambda)) / N < 2 + 4./(N+1)) 147 | hsig = hsig and 1.0 or 0.0 --use binary numbers 148 | 149 | c = (cc * (2-cc) * mueff)^0.5 / sigma 150 | pc = pc - pc * cc + y * c * hsig -- exponential decay on pc 151 | 152 | -- Adapt covariance matrix C 153 | local c1a = c1 - (1-hsig^2) * c1 * cc * (2-cc) 154 | -- for a minor adjustment to the variance loss by hsig 155 | for i=1,N do 156 | for j=1,N do 157 | local r = torch.range(1,mu) 158 | r:apply(function(k) 159 | return weights[k] * (arx[k][i]-xold[i]) * (arx[k][j]-xold[j]) end) 160 | local Cmuij = torch.sum(r) / sigma^2 -- rank-mu update 161 | C[i][j] = C[i][j] + ((-c1a - cmu) * C[i][j] + 162 | c1 * pc[i]*pc[j] + cmu * Cmuij) 163 | end 164 | end 165 | 166 | -- Adapt step-size sigma with factor <= exp(0.6) \approx 1.82 167 | sigma = sigma * math.exp(math.min(0.6, 168 | (cs / damps) * (torch.sum(torch.pow(ps,2))/N - 1)/2)) 169 | end 170 | 171 | local function stop() 172 | --[[return satisfied termination 
conditions in a table like 173 | {'termination reason':value, ...}, for example {'tolfun':1e-12}, 174 | or the empty table {}--]] 175 | local res = {} 176 | if counteval > 0 then 177 | if counteval >= maxEval then 178 | res['evals'] = maxEval 179 | end 180 | if ftarget ~= nil and fitvals:nElement() > 0 and fitvals[1] <= ftarget then 181 | res['ftarget'] = ftarget 182 | end 183 | if torch.max(D) > 1e7 * torch.min(D) then 184 | res['condition'] = 1e7 185 | end 186 | if fitvals:nElement() > 1 and fitvals[fitvals:nElement()] - fitvals[1] < 1e-12 then 187 | res['tolfun'] = 1e-12 188 | end 189 | if sigma * torch.max(D) < 1e-11 then 190 | -- remark: max(D) >= max(diag(C))^0.5 191 | res['tolx'] = 1e-11 192 | end 193 | end 194 | return res 195 | end 196 | 197 | local function disp(verb_modulo) 198 | --[[display some iteration info--]] 199 | if verb_disp == 0 then 200 | return nil 201 | end 202 | local iteration = counteval / lambda 203 | 204 | if iteration == 1 or iteration % (10*verb_modulo) == 0 then 205 | print('evals:\t ax-ratio max(std) f-value') 206 | end 207 | if iteration <= 2 or iteration % verb_modulo == 0 then 208 | local max_std = math.sqrt(torch.max(torch.diag(C))) 209 | print(tostring(counteval).. ': ' .. 210 | string.format(' %6.1f %8.1e ', torch.max(D) / torch.min(D), sigma * max_std) 211 | .. tostring(fitvals[1])) 212 | end 213 | 214 | return nil 215 | end 216 | 217 | while next(stop()) == nil or iteration < min_iterations do 218 | iteration = iteration + 1 219 | 220 | local X = ask() -- deliver candidate solutions 221 | for i=1, lambda do 222 | -- put candidate tensor back in input shape and evaluate in opfunc 223 | local candidate = X[i]:viewAs(x) 224 | fitvals[i] = objfunc(candidate) 225 | end 226 | 227 | tell(X) 228 | disp(verb_disp) 229 | end 230 | 231 | local bestmu, f, c = best:get() 232 | if verb_disp > 0 then 233 | for k, v in pairs(stop()) do 234 | print('termination by', k, '=', v) 235 | end 236 | print('best f-value =', f) 237 | print('solution = ') 238 | print(bestmu) 239 | print('best found at iteration: ', c/lambda, ' , total iterations: ', iteration) 240 | end 241 | table.insert(f_hist, f) 242 | 243 | return bestmu, f_hist, counteval 244 | end 245 | 246 | 247 | 248 | BestSolution.__index = BestSolution 249 | function BestSolution.new(x, f, evals) 250 | local self = setmetatable({}, BestSolution) 251 | self.x = x 252 | self.f = f 253 | self.evals = evals 254 | return self 255 | end 256 | 257 | function BestSolution.update(self, arx, arf, evals) 258 | --[[initialize the best solution with `x`, `f`, and `evals`. 
259 | Better solutions have smaller `f`-values.--]] 260 | if self.f == nil or arf < self.f then 261 | self.x = arx:clone() 262 | self.f = arf 263 | self.evals = evals 264 | end 265 | return self 266 | end 267 | 268 | function BestSolution.get(self) 269 | return self.x, self.f, self.evals 270 | end 271 | -------------------------------------------------------------------------------- /ConfusionMatrix.lua: -------------------------------------------------------------------------------- 1 | --[[ A Confusion Matrix class 2 | 3 | Example: 4 | 5 | conf = optim.ConfusionMatrix( {'cat','dog','person'} ) -- new matrix 6 | conf:zero() -- reset matrix 7 | for i = 1,N do 8 | conf:add( neuralnet:forward(sample), label ) -- accumulate errors 9 | end 10 | print(conf) -- print matrix 11 | image.display(conf:render()) -- render matrix 12 | ]] 13 | local ConfusionMatrix = torch.class('optim.ConfusionMatrix') 14 | 15 | function ConfusionMatrix:__init(nclasses, classes) 16 | if type(nclasses) == 'table' then 17 | classes = nclasses 18 | nclasses = #classes 19 | end 20 | self.mat = torch.LongTensor(nclasses,nclasses):zero() 21 | self.valids = torch.FloatTensor(nclasses):zero() 22 | self.unionvalids = torch.FloatTensor(nclasses):zero() 23 | self.nclasses = nclasses 24 | self.totalValid = 0 25 | self.averageValid = 0 26 | self.classes = classes or {} 27 | -- buffers 28 | self._mat_flat = self.mat:view(-1) 29 | self._target = torch.FloatTensor() 30 | self._prediction = torch.FloatTensor() 31 | self._max = torch.FloatTensor() 32 | self._pred_idx = torch.LongTensor() 33 | self._targ_idx = torch.LongTensor() 34 | end 35 | 36 | -- takes scalar prediction and target as input 37 | function ConfusionMatrix:_add(p, t) 38 | assert(p and type(p) == 'number') 39 | assert(t and type(t) == 'number') 40 | -- non-positive values are considered missing 41 | -- and therefore ignored 42 | if t > 0 then 43 | self.mat[t][p] = self.mat[t][p] + 1 44 | end 45 | end 46 | 47 | function ConfusionMatrix:add(prediction, target) 48 | if type(prediction) == 'number' then 49 | -- comparing numbers 50 | self:_add(prediction, target) 51 | else 52 | self._prediction:resize(prediction:size()):copy(prediction) 53 | assert(prediction:dim() == 1) 54 | if type(target) == 'number' then 55 | -- prediction is a vector, then target assumed to be an index 56 | self._max:max(self._pred_idx, self._prediction, 1) 57 | self:_add(self._pred_idx[1], target) 58 | else 59 | -- both prediction and target are vectors 60 | assert(target:dim() == 1) 61 | self._target:resize(target:size()):copy(target) 62 | self._max:max(self._targ_idx, self._target, 1) 63 | self._max:max(self._pred_idx, self._prediction, 1) 64 | self:_add(self._pred_idx[1], self._targ_idx[1]) 65 | end 66 | end 67 | end 68 | 69 | function ConfusionMatrix:batchAdd(predictions, targets) 70 | local preds, targs, __ 71 | self._prediction:resize(predictions:size()):copy(predictions) 72 | if predictions:dim() == 1 then 73 | -- predictions is a vector of classes 74 | preds = self._prediction 75 | elseif predictions:dim() == 2 then 76 | -- prediction is a matrix of class likelihoods 77 | if predictions:size(2) == 1 then 78 | -- or prediction just needs flattening 79 | preds = self._prediction:select(2,1) 80 | else 81 | self._max:max(self._pred_idx, self._prediction, 2) 82 | preds = self._pred_idx:select(2,1) 83 | end 84 | else 85 | error("predictions has invalid number of dimensions") 86 | end 87 | 88 | self._target:resize(targets:size()):copy(targets) 89 | if targets:dim() == 1 then 90 | -- targets is a 
vector of classes 91 | targs = self._target 92 | elseif targets:dim() == 2 then 93 | -- targets is a matrix of one-hot rows 94 | if targets:size(2) == 1 then 95 | -- or targets just needs flattening 96 | targs = self._target:select(2,1) 97 | else 98 | self._max:max(self._targ_idx, self._target, 2) 99 | targs = self._targ_idx:select(2,1) 100 | end 101 | else 102 | error("targets has invalid number of dimensions") 103 | end 104 | 105 | -- non-positive values are considered missing and therefore ignored 106 | local mask = targs:ge(1) 107 | targs = targs[mask] 108 | preds = preds[mask] 109 | 110 | self._mat_flat = self._mat_flat or self.mat:view(-1) -- for backward compatibility 111 | 112 | preds = preds:typeAs(targs) 113 | 114 | assert(self.mat:isContiguous() and self.mat:stride(2) == 1) 115 | local indices = ((targs - 1) * self.mat:stride(1) + preds):typeAs(self.mat) 116 | local ones = torch.ones(1):typeAs(self.mat):expand(indices:size(1)) 117 | self._mat_flat:indexAdd(1, indices, ones) 118 | end 119 | 120 | function ConfusionMatrix:zero() 121 | self.mat:zero() 122 | self.valids:zero() 123 | self.unionvalids:zero() 124 | self.totalValid = 0 125 | self.averageValid = 0 126 | end 127 | 128 | local function isNaN(number) 129 | return number ~= number 130 | end 131 | 132 | function ConfusionMatrix:updateValids() 133 | local total = 0 134 | for t = 1,self.nclasses do 135 | self.valids[t] = self.mat[t][t] / self.mat:select(1,t):sum() 136 | self.unionvalids[t] = self.mat[t][t] / (self.mat:select(1,t):sum()+self.mat:select(2,t):sum()-self.mat[t][t]) 137 | total = total + self.mat[t][t] 138 | end 139 | self.totalValid = total / self.mat:sum() 140 | self.averageValid = 0 141 | self.averageUnionValid = 0 142 | local nvalids = 0 143 | local nunionvalids = 0 144 | for t = 1,self.nclasses do 145 | if not isNaN(self.valids[t]) then 146 | self.averageValid = self.averageValid + self.valids[t] 147 | nvalids = nvalids + 1 148 | end 149 | if not isNaN(self.valids[t]) and not isNaN(self.unionvalids[t]) then 150 | self.averageUnionValid = self.averageUnionValid + self.unionvalids[t] 151 | nunionvalids = nunionvalids + 1 152 | end 153 | end 154 | self.averageValid = self.averageValid / nvalids 155 | self.averageUnionValid = self.averageUnionValid / nunionvalids 156 | end 157 | 158 | -- Calculating FAR/FRR associated with the confusion matrix 159 | 160 | function ConfusionMatrix:farFrr() 161 | local cmat = self.mat 162 | local noOfClasses = cmat:size()[1] 163 | self._frrs = self._frrs or torch.zeros(noOfClasses) 164 | self._frrs:zero() 165 | self._classFrrs = self._classFrrs or torch.zeros(noOfClasses) 166 | self._classFrrs:zero() 167 | self._classFrrs:add(-1) 168 | self._fars = self._fars or torch.zeros(noOfClasses) 169 | self._fars:zero() 170 | self._classFars = self._classFars or torch.zeros(noOfClasses) 171 | self._classFars:zero() 172 | self._classFars:add(-1) 173 | local classSamplesCount = cmat:sum(2) 174 | local indx = 1 175 | for i=1,noOfClasses do 176 | if classSamplesCount[i][1] ~= 0 then 177 | self._frrs[indx] = 1 - cmat[i][i]/classSamplesCount[i][1] 178 | self._classFrrs[i] = self._frrs[indx] 179 | -- Calculating FARs 180 | local farNumerator = 0 181 | local farDenominator = 0 182 | for j=1, noOfClasses do 183 | if i ~= j then 184 | if classSamplesCount[j][1] ~= 0 then 185 | farNumerator = farNumerator + cmat[j][i]/classSamplesCount[j][1] 186 | farDenominator = farDenominator + 1 187 | end 188 | end 189 | end 190 | self._fars[indx] = farNumerator/farDenominator 191 | self._classFars[i] = 
self._fars[indx] 192 | indx = indx + 1 193 | end 194 | end 195 | indx = indx - 1 196 | local returnFrrs = self._frrs[{{1, indx}}] 197 | local returnFars = self._fars[{{1, indx}}] 198 | return self._classFrrs, self._classFars, returnFrrs, returnFars 199 | end 200 | 201 | local function log10(n) 202 | if math.log10 then 203 | return math.log10(n) 204 | else 205 | return math.log(n) / math.log(10) 206 | end 207 | end 208 | 209 | function ConfusionMatrix:__tostring__() 210 | self:updateValids() 211 | local str = {'ConfusionMatrix:\n'} 212 | local nclasses = self.nclasses 213 | table.insert(str, '[') 214 | local maxCnt = self.mat:max() 215 | local nDigits = math.max(8, 1 + math.ceil(log10(maxCnt))) 216 | for t = 1,nclasses do 217 | local pclass = self.valids[t] * 100 218 | pclass = string.format('%2.3f', pclass) 219 | if t == 1 then 220 | table.insert(str, '[') 221 | else 222 | table.insert(str, ' [') 223 | end 224 | for p = 1,nclasses do 225 | table.insert(str, string.format('%' .. nDigits .. 'd', self.mat[t][p])) 226 | end 227 | if self.classes and self.classes[1] then 228 | if t == nclasses then 229 | table.insert(str, ']] ' .. pclass .. '% \t[class: ' .. (self.classes[t] or '') .. ']\n') 230 | else 231 | table.insert(str, '] ' .. pclass .. '% \t[class: ' .. (self.classes[t] or '') .. ']\n') 232 | end 233 | else 234 | if t == nclasses then 235 | table.insert(str, ']] ' .. pclass .. '% \n') 236 | else 237 | table.insert(str, '] ' .. pclass .. '% \n') 238 | end 239 | end 240 | end 241 | table.insert(str, ' + average row correct: ' .. (self.averageValid*100) .. '% \n') 242 | table.insert(str, ' + average rowUcol correct (VOC measure): ' .. (self.averageUnionValid*100) .. '% \n') 243 | table.insert(str, ' + global correct: ' .. (self.totalValid*100) .. '%') 244 | return table.concat(str) 245 | end 246 | 247 | function ConfusionMatrix:render(sortmode, display, block, legendwidth) 248 | -- args 249 | local confusion = self.mat:double() 250 | local classes = self.classes 251 | local sortmode = sortmode or 'score' -- 'score' or 'occurrence' 252 | local block = block or 25 253 | local legendwidth = legendwidth or 200 254 | local display = display or false 255 | 256 | -- legends 257 | local legend = { 258 | ['score'] = 'Confusion matrix [sorted by scores, global accuracy = %0.3f%%, per-class accuracy = %0.3f%%]', 259 | ['occurrence'] = 'Confusiong matrix [sorted by occurences, accuracy = %0.3f%%, per-class accuracy = %0.3f%%]' 260 | } 261 | 262 | -- parse matrix / normalize / count scores 263 | local diag = torch.FloatTensor(#classes) 264 | local freqs = torch.FloatTensor(#classes) 265 | local unconf = confusion 266 | local confusion = confusion:clone() 267 | local corrects = 0 268 | local total = 0 269 | for target = 1,#classes do 270 | freqs[target] = confusion[target]:sum() 271 | corrects = corrects + confusion[target][target] 272 | total = total + freqs[target] 273 | confusion[target]:div( math.max(confusion[target]:sum(),1) ) 274 | diag[target] = confusion[target][target] 275 | end 276 | 277 | -- accuracies 278 | local accuracy = corrects / total * 100 279 | local perclass = 0 280 | local total = 0 281 | for target = 1,#classes do 282 | if confusion[target]:sum() > 0 then 283 | perclass = perclass + diag[target] 284 | total = total + 1 285 | end 286 | end 287 | perclass = perclass / total * 100 288 | freqs:div(unconf:sum()) 289 | 290 | -- sort matrix 291 | if sortmode == 'score' then 292 | _,order = torch.sort(diag,1,true) 293 | elseif sortmode == 'occurrence' then 294 | _,order = 
torch.sort(freqs,1,true) 295 | else 296 | error('sort mode must be one of: score | occurrence') 297 | end 298 | 299 | -- render matrix 300 | local render = torch.zeros(#classes*block, #classes*block) 301 | for target = 1,#classes do 302 | for prediction = 1,#classes do 303 | render[{ { (target-1)*block+1,target*block }, { (prediction-1)*block+1,prediction*block } }] = confusion[order[target]][order[prediction]] 304 | end 305 | end 306 | 307 | -- add grid 308 | for target = 1,#classes do 309 | render[{ {target*block},{} }] = 0.1 310 | render[{ {},{target*block} }] = 0.1 311 | end 312 | 313 | -- create rendering 314 | require 'image' 315 | require 'qtwidget' 316 | require 'qttorch' 317 | local win1 = qtwidget.newimage( (#render)[2]+legendwidth, (#render)[1] ) 318 | image.display{image=render, win=win1} 319 | 320 | -- add legend 321 | for i in ipairs(classes) do 322 | -- background cell 323 | win1:setcolor{r=0,g=0,b=0} 324 | win1:rectangle((#render)[2],(i-1)*block,legendwidth,block) 325 | win1:fill() 326 | 327 | -- % 328 | win1:setfont(qt.QFont{serif=false, size=fontsize}) 329 | local gscale = freqs[order[i]]/freqs:max()*0.9+0.1 --3/4 330 | win1:setcolor{r=gscale*0.5+0.2,g=gscale*0.5+0.2,b=gscale*0.8+0.2} 331 | win1:moveto((#render)[2]+10,i*block-block/3) 332 | win1:show(string.format('[%2.2f%% labels]',math.floor(freqs[order[i]]*10000+0.5)/100)) 333 | 334 | -- legend 335 | win1:setfont(qt.QFont{serif=false, size=fontsize}) 336 | local gscale = diag[order[i]]*0.8+0.2 337 | win1:setcolor{r=gscale,g=gscale,b=gscale} 338 | win1:moveto(120+(#render)[2]+10,i*block-block/3) 339 | win1:show(classes[order[i]]) 340 | 341 | for j in ipairs(classes) do 342 | -- scores 343 | local score = confusion[order[j]][order[i]] 344 | local gscale = (1-score)*(score*0.8+0.2) 345 | win1:setcolor{r=gscale,g=gscale,b=gscale} 346 | win1:moveto((i-1)*block+block/5,(j-1)*block+block*2/3) 347 | win1:show(string.format('%02.0f',math.floor(score*100+0.5))) 348 | end 349 | end 350 | 351 | -- generate tensor 352 | local t = win1:image():toTensor() 353 | 354 | -- display 355 | if display then 356 | image.display{image=t, legend=string.format(legend[sortmode],accuracy,perclass)} 357 | end 358 | 359 | -- return rendering 360 | return t 361 | end 362 | -------------------------------------------------------------------------------- /doc/index.md: -------------------------------------------------------------------------------- 1 | 2 | # Optim Package 3 | 4 | This package provides a set of optimization algorithms, which all follow 5 | a unified, closure-based API. 6 | 7 | This package is fully compatible with the [nn](http://nn.readthedocs.org) package, but can also be 8 | used to optimize arbitrary objective functions. 9 | 10 | For now, the following algorithms are provided: 11 | 12 | * [Stochastic Gradient Descent](#optim.sgd) 13 | * [Averaged Stochastic Gradient Descent](#optim.asgd) 14 | * [L-BFGS](#optim.lbfgs) 15 | * [Conjugate Gradients](#optim.cg) 16 | * [AdaDelta](#optim.adadelta) 17 | * [AdaGrad](#optim.adagrad) 18 | * [Adam](#optim.adam) 19 | * [AdaMax](#optim.adamax) 20 | * [FISTA with backtracking line search](#optim.FistaLS) 21 | * [Nesterov's Accelerated Gradient method](#optim.nag) 22 | * [RMSprop](#optim.rmsprop) 23 | * [Rprop](#optim.rprop) 24 | * [CMAES](#optim.cmaes) 25 | 26 | All these algorithms are designed to support batch optimization as 27 | well as stochastic optimization.
It's up to the user to construct an 28 | objective function that represents the batch, mini-batch, or single sample 29 | on which to evaluate the objective. 30 | 31 | Some of these algorithms support a line search, which can be passed as 32 | a function (L-BFGS), whereas others only support a learning rate (SGD). 33 | 34 | 35 | ## Overview 36 | 37 | This package contains several optimization routines for [Torch](https://github.com/torch/torch7/blob/master/README.md). 38 | Most optimization algorithms have the following interface: 39 | 40 | ```lua 41 | x*, {f}, ... = optim.method(opfunc, x, state) 42 | ``` 43 | 44 | where: 45 | 46 | * `opfunc`: a user-defined closure that respects this API: `f, df/dx = func(x)` 47 | * `x`: the current parameter vector (a 1D `torch.Tensor`) 48 | * `state`: a table of parameters, and state variables, dependent upon the algorithm 49 | * `x*`: the new parameter vector that minimizes `f, x* = argmin_x f(x)` 50 | * `{f}`: a table of all f values, in the order they've been evaluated (for some simple algorithms, like SGD, `#f == 1`) 51 | 52 | 53 | ## Example 54 | 55 | The state table is used to hold the state of the algorithm. 56 | It's usually initialized once, by the user, and then passed to the optim function 57 | as a black box. Example: 58 | 59 | ```lua 60 | state = { 61 | learningRate = 1e-3, 62 | momentum = 0.5 63 | } 64 | 65 | for i,sample in ipairs(training_samples) do 66 | local func = function(x) 67 | -- define eval function 68 | return f,df_dx 69 | end 70 | optim.sgd(func,x,state) 71 | end 72 | ``` 73 | 74 | 75 | ## Algorithms 76 | 77 | Most algorithms provided rely on a unified interface: 78 | ```lua 79 | x_new,fs = optim.method(opfunc, x, state) 80 | ``` 81 | where: 82 | x is the trainable/adjustable parameter vector, 83 | state contains both options for the algorithm and the state of the algorithm, 84 | opfunc is a closure that has the following interface: 85 | ```lua 86 | f,df_dx = opfunc(x) 87 | ``` 88 | x_new is the new parameter vector (after optimization), 89 | fs is a table containing all the values of the objective, as evaluated during 90 | the optimization procedure: fs[1] is the value before optimization, and fs[#fs] 91 | is the most optimized one (the lowest). 92 | 93 | 94 | ### [x] sgd(opfunc, x, state) 95 | 96 | An implementation of Stochastic Gradient Descent (SGD).
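Before the argument listing, here is a minimal, self-contained sketch of a call to `optim.sgd`; the quadratic objective and the option values are purely illustrative and are not package defaults:

```lua
require 'torch'
require 'optim'

-- hypothetical objective: f(x) = 0.5 * ||x - b||^2, so df/dx = x - b
local b = torch.Tensor{1, 2, 3}
local function opfunc(x)
   local diff = x - b
   return 0.5 * diff:dot(diff), diff
end

local x = torch.zeros(3)
local state = {learningRate = 1e-1, momentum = 0.9}   -- illustrative values only

local fs
for i = 1, 100 do
   -- sgd updates x in place; fs[1] is f(x) evaluated before the update
   x, fs = optim.sgd(opfunc, x, state)
end
print(x)   -- should approach b
```

Because the same `state` table is passed on every call, running quantities such as the momentum buffer persist across iterations.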
97 | 98 | Arguments: 99 | 100 | * `opfunc` : a function that takes a single input (`X`), the point of evaluation, and returns `f(X)` and `df/dX` 101 | * `x` : the initial point 102 | * `config` : a table with configuration parameters for the optimizer 103 | * `config.learningRate` : learning rate 104 | * `config.learningRateDecay` : learning rate decay 105 | * `config.weightDecay` : weight decay 106 | * `config.weightDecays` : vector of individual weight decays 107 | * `config.momentum` : momentum 108 | * `config.dampening` : dampening for momentum 109 | * `config.nesterov` : enables Nesterov momentum 110 | * `state` : a table describing the state of the optimizer; after each call the state is modified 111 | * `state.learningRates` : vector of individual learning rates 112 | 113 | Returns : 114 | 115 | * `x` : the new x vector 116 | * `f(x)` : the function, evaluated before the update 117 | 118 | 119 | ### [x] asgd(opfunc, x, state) 120 | 121 | An implementation of Averaged Stochastic Gradient Descent (ASGD): 122 | 123 | ``` 124 | x = (1 - lambda eta_t) x - eta_t df/dx(z,x) 125 | a = a + mu_t [ x - a ] 126 | 127 | eta_t = eta0 / (1 + lambda eta0 t) ^ 0.75 128 | mu_t = 1/max(1,t-t0) 129 | ``` 130 | 131 | Arguments: 132 | 133 | * `opfunc` : a function that takes a single input (`X`), the point of evaluation, and returns `f(X)` and `df/dX` 134 | * `x` : the initial point 135 | * `state` : a table describing the state of the optimizer; after each call the state is modified 136 | * `state.eta0` : learning rate 137 | * `state.lambda` : decay term 138 | * `state.alpha` : power for eta update 139 | * `state.t0` : point at which to start averaging 140 | 141 | Returns: 142 | 143 | * `x` : the new x vector 144 | * `f(x)` : the function, evaluated before the update 145 | * `ax` : the averaged x vector 146 | 147 | 148 | 149 | ### [x] lbfgs(opfunc, x, state) 150 | 151 | An implementation of L-BFGS that relies on a user-provided line 152 | search function (`state.lineSearch`). If this function is not 153 | provided, then a simple learningRate is used to produce fixed 154 | size steps. Fixed size steps are much less costly than line 155 | searches, and can be useful for stochastic problems. 156 | 157 | The learning rate is used even when a line search is provided. 158 | This is also useful for large-scale stochastic problems, where 159 | opfunc is a noisy approximation of `f(x)`. In that case, the learning 160 | rate allows a reduction of confidence in the step size.
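In batch mode a single call runs the whole optimization. Below is a minimal sketch on a hypothetical smooth objective; it assumes `optim.lswolfe` (the Wolfe line search shipped with this package) as the line search function:

```lua
require 'torch'
require 'optim'

-- hypothetical smooth objective: f(x) = (x1-1)^2 + 10*(x2-2)^2
local function opfunc(x)
   local f = (x[1] - 1)^2 + 10 * (x[2] - 2)^2
   local df_dx = torch.Tensor{2 * (x[1] - 1), 20 * (x[2] - 2)}
   return f, df_dx
end

local x = torch.zeros(2)
local state = {
   maxIter = 100,               -- cap on L-BFGS iterations
   lineSearch = optim.lswolfe   -- assumed Wolfe line search; omit to use fixed learningRate steps
}

local xstar, fs = optim.lbfgs(opfunc, x, state)
print(xstar)            -- close to (1, 2)
print(fs[1], fs[#fs])   -- objective before and after optimization
```

Dropping the `lineSearch` entry falls back to the fixed `learningRate` steps described above.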
161 | 162 | Arguments : 163 | 164 | * `opfunc` : a function that takes a single input (`X`), the point of evaluation, and returns `f(X)` and `df/dX` 165 | * `x` : the initial point 166 | * `state` : a table describing the state of the optimizer; after each call the state is modified 167 | * `state.maxIter` : Maximum number of iterations allowed 168 | * `state.maxEval` : Maximum number of function evaluations 169 | * `state.tolFun` : Termination tolerance on the first-order optimality 170 | * `state.tolX` : Termination tol on progress in terms of func/param changes 171 | * `state.lineSearch` : A line search function 172 | * `state.learningRate` : If no line search provided, then a fixed step size is used 173 | 174 | Returns : 175 | * `x*` : the new `x` vector, at the optimal point 176 | * `f` : a table of all function values: 177 | * `f[1]` is the value of the function before any optimization and 178 | * `f[#f]` is the final fully optimized value, at `x*` 179 | 180 | 181 | 182 | ### [x] cg(opfunc, x, state) 183 | 184 | An implementation of the Conjugate Gradient method which is a rewrite of 185 | `minimize.m` written by Carl E. Rasmussen. 186 | It is supposed to produce exactly the same results (give 187 | or take numerical accuracy due to some changed order of 188 | operations). You can compare the result on rosenbrock with 189 | [minimize.m](http://www.gatsby.ucl.ac.uk/~edward/code/minimize/example.html). 190 | ``` 191 | [x fx c] = minimize([0 0]', 'rosenbrock', -25) 192 | ``` 193 | 194 | Note that we only limit the number of function evaluations, as that seems much 195 | more important in practical use. 196 | 197 | Arguments : 198 | 199 | * `opfunc` : a function that takes a single input, the point of evaluation. 200 | * `x` : the initial point 201 | * `state` : a table of parameters and temporary allocations. 202 | * `state.maxEval` : max number of function evaluations 203 | * `state.maxIter` : max number of iterations 204 | * `state.df[0,1,2,3]` : if you pass torch.Tensor they will be used for temp storage 205 | * `state.[s,x0]` : if you pass torch.Tensor they will be used for temp storage 206 | 207 | Returns : 208 | 209 | * `x*` : the new x vector, at the optimal point 210 | * `f` : a table of all function values where 211 | * `f[1]` is the value of the function before any optimization and 212 | * `f[#f]` is the final fully optimized value, at x* 213 | 214 |
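The same kind of comparison can be run from Lua. A minimal sketch on a hypothetical quadratic (a `rosenbrock` closure would be called the same way):

```lua
require 'torch'
require 'optim'

-- hypothetical smooth objective: f(x) = x1^2 + 4*x2^2 - 2*x1*x2
local function opfunc(x)
   local f = x[1]^2 + 4 * x[2]^2 - 2 * x[1] * x[2]
   local df_dx = torch.Tensor{2 * x[1] - 2 * x[2], 8 * x[2] - 2 * x[1]}
   return f, df_dx
end

local x = torch.Tensor{1, 1}
-- as with minimize.m, the main budget is the number of function evaluations
local xstar, fs = optim.cg(opfunc, x, {maxEval = 25})
print(xstar)            -- close to (0, 0)
print(fs[1], fs[#fs])   -- objective before and after optimization
```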
215 | ### [x] adadelta(opfunc, x, config, state) 216 | ADADELTA implementation for SGD http://arxiv.org/abs/1212.5701 217 | 218 | Arguments : 219 | 220 | * `opfunc` : a function that takes a single input (X), the point of evaluation, and returns f(X) and df/dX 221 | * `x` : the initial point 222 | * `config` : a table of hyper-parameters 223 | * `config.rho` : interpolation parameter 224 | * `config.eps` : for numerical stability 225 | * `state` : a table describing the state of the optimizer; after each call the state is modified 226 | * `state.paramVariance` : vector of temporal variances of parameters 227 | * `state.accDelta` : vector of accumulated delta of gradients 228 | 229 | Returns : 230 | 231 | * `x` : the new x vector 232 | * `f(x)` : the function, evaluated before the update 233 | 234 | 235 | ### [x] adagrad(opfunc, x, config, state) 236 | AdaGrad implementation for SGD 237 | 238 | Arguments : 239 | 240 | * `opfunc` : a function that takes a single input (X), the point of evaluation, and returns f(X) and df/dX 241 | * `x` : the initial point 242 | * `state` : a table describing the state of the optimizer; after each call the state is modified 243 | * `state.learningRate` : learning rate 244 | * `state.paramVariance` : vector of temporal variances of parameters 245 | 246 | Returns : 247 | 248 | * `x` : the new x vector 249 | * `f(x)` : the function, evaluated before the update 250 | 251 | 252 | ### [x] adam(opfunc, x, config, state) 253 | An implementation of Adam from http://arxiv.org/pdf/1412.6980.pdf 254 | 255 | Arguments : 256 | 257 | * `opfunc` : a function that takes a single input (X), the point of evaluation, and returns f(X) and df/dX 258 | * `x` : the initial point 259 | * `config` : a table with configuration parameters for the optimizer 260 | * `config.learningRate` : learning rate 261 | * `config.beta1` : first moment coefficient 262 | * `config.beta2` : second moment coefficient 263 | * `config.epsilon` : for numerical stability 264 | * `state` : a table describing the state of the optimizer; after each call the state is modified 265 | 266 | Returns : 267 | 268 | * `x` : the new x vector 269 | * `f(x)` : the function, evaluated before the update 270 | 271 | 272 | ### [x] adamax(opfunc, x, config, state) 273 | An implementation of AdaMax http://arxiv.org/pdf/1412.6980.pdf 274 | 275 | Arguments : 276 | 277 | * `opfunc` : a function that takes a single input (X), the point of evaluation, and returns f(X) and df/dX 278 | * `x` : the initial point 279 | * `config` : a table with configuration parameters for the optimizer 280 | * `config.learningRate` : learning rate 281 | * `config.beta1` : first moment coefficient 282 | * `config.beta2` : second moment coefficient 283 | * `config.epsilon` : for numerical stability 284 | * `state` : a table describing the state of the optimizer; after each call the state is modified. 285 | 286 | Returns : 287 | 288 | * `x` : the new x vector 289 | * `f(x)` : the function, evaluated before the update 290 | 291 |
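Adam and AdaMax share the same calling pattern: hyper-parameters go in `config`, while `state` accumulates the moment estimates across calls. A minimal sketch with `optim.adam` on a hypothetical quadratic (the values are illustrative, not defaults):

```lua
require 'torch'
require 'optim'

local b = torch.Tensor{0.5, -1.5}
local function opfunc(x)               -- hypothetical objective: 0.5 * ||x - b||^2
   return 0.5 * (x - b):dot(x - b), x - b
end

local x = torch.zeros(2)
local config = {learningRate = 0.01}   -- beta1/beta2/epsilon left at their defaults
local state  = {}                      -- first/second moment estimates persist here

for i = 1, 2000 do
   x = optim.adam(opfunc, x, config, state)
end
print(x)   -- should approach b
```

Swapping `optim.adam` for `optim.adamax` in this loop requires no other change to the call.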
292 | ### [x] FistaLS(f, g, pl, xinit, params) 293 | FISTA with backtracking line search 294 | * `f` : smooth function 295 | * `g` : non-smooth function 296 | * `pl` : minimizer of intermediate problem Q(x,y) 297 | * `xinit` : initial point 298 | * `params` : table of parameters (**optional**) 299 | * `params.L` : 1/(step size) for ISTA/FISTA iteration (0.1) 300 | * `params.Lstep` : step size multiplier at each iteration (1.5) 301 | * `params.maxiter` : max number of iterations (50) 302 | * `params.maxline` : max number of line search iterations per iteration (20) 303 | * `params.errthres`: Error threshold for convergence check (1e-4) 304 | * `params.doFistaUpdate` : true : use FISTA, false: use ISTA (true) 305 | * `params.verbose` : store each iteration solution and print detailed info (false) 306 | 307 | On output, `params` will contain these additional fields that can be reused. 308 | * `params.L` : last used L value will be written. 309 | 310 | These are temporary storages needed by the algorithm, and if the same params object is 311 | passed a second time, these same storages will be used without new allocation. 312 | * `params.xkm` : previous iteration point 313 | * `params.y` : fista iteration 314 | * `params.ply` : ply = pl(y - 1/L grad(f)) 315 | 316 | Returns the solution x and a history of {function evals, number of line searches, ...} 317 | 318 | Algorithm is published in http://epubs.siam.org/doi/abs/10.1137/080716542 319 | 320 | 321 | ### [x] nag(opfunc, x, config, state) 322 | An implementation of SGD adapted with features of Nesterov's 323 | Accelerated Gradient method, based on the paper "On the Importance of Initialization and Momentum in Deep Learning" (Sutskever et al., ICML 2013).
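A minimal sketch of a call (hypothetical objective and illustrative option values; `momentum` is set explicitly since the method is built around the momentum update):

```lua
require 'torch'
require 'optim'

-- hypothetical objective: f(x) = x1^2 + 3*x2^2
local function opfunc(x)
   return x[1]^2 + 3 * x[2]^2, torch.Tensor{2 * x[1], 6 * x[2]}
end

local x = torch.Tensor{2, -2}
local config = {learningRate = 0.01, momentum = 0.9}   -- illustrative values only

for i = 1, 500 do
   x = optim.nag(opfunc, x, config)
end
print(x)   -- should approach (0, 0)
```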
324 | 325 | Arguments : 326 | 327 | * `opfunc` : a function that takes a single input (X), the point of evaluation, and returns f(X) and df/dX 328 | * `x` : the initial point 329 | * `state` : a table describing the state of the optimizer; after each call the state is modified 330 | * `state.learningRate` : learning rate 331 | * `state.learningRateDecay` : learning rate decay 332 | * `state.weightDecay` : weight decay 333 | * `state.momentum` : momentum 334 | * `state.learningRates` : vector of individual learning rates 335 | 336 | Returns : 337 | 338 | * `x` : the new x vector 339 | * `f(x)` : the function, evaluated before the update 340 | 341 | 342 | ### [x] rmsprop(opfunc, x, config, state) 343 | An implementation of RMSprop 344 | 345 | Arguments : 346 | 347 | * `opfunc` : a function that takes a single input (X), the point of evaluation, and returns f(X) and df/dX 348 | * `x` : the initial point 349 | * `config` : a table with configuration parameters for the optimizer 350 | * `config.learningRate` : learning rate 351 | * `config.alpha` : smoothing constant 352 | * `config.epsilon` : value with which to initialise m 353 | * `state` : a table describing the state of the optimizer; after each call the state is modified 354 | * `state.m` : leaky sum of squares of parameter gradients, 355 | * `state.tmp` : and the square root (with epsilon smoothing) 356 | 357 | Returns : 358 | 359 | * `x` : the new x vector 360 | * `f(x)` : the function, evaluated before the update 361 | 362 | 363 | ### [x] rprop(opfunc, x, config, state) 364 | A plain implementation of Rprop 365 | (Martin Riedmiller, Koray Kavukcuoglu 2013) 366 | 367 | Arguments : 368 | 369 | * `opfunc` : a function that takes a single input (X), the point of evaluation, and returns f(X) and df/dX 370 | * `x` : the initial point 371 | * `state` : a table describing the state of the optimizer; after each call the state is modified 372 | * `state.stepsize` : initial step size, common to all components 373 | * `state.etaplus` : multiplicative increase factor, > 1 (default 1.2) 374 | * `state.etaminus` : multiplicative decrease factor, < 1 (default 0.5) 375 | * `state.stepsizemax` : maximum stepsize allowed (default 50) 376 | * `state.stepsizemin` : minimum stepsize allowed (default 1e-6) 377 | * `state.niter` : number of iterations (default 1) 378 | 379 | Returns : 380 | 381 | * `x` : the new x vector 382 | * `f(x)` : the function, evaluated before the update 383 | 384 | 385 | 386 | 387 | 388 | ### [x] cmaes(opfunc, x, config, state) 389 | An implementation of `CMAES` (Covariance Matrix Adaptation Evolution Strategy), 390 | ported from https://www.lri.fr/~hansen/barecmaes2.html. 391 | 392 | CMAES is a stochastic, derivative-free method for heuristic global optimization of non-linear or non-convex continuous optimization problems. Note that this method will on average take many more function evaluations to converge than a gradient-based method. 393 | 394 | Arguments: 395 | 396 | * `opfunc` : a function that takes a single input (X), the point of evaluation, and returns f(X) and df/dX. Note that df/dX is not used and can be left 0 397 | * `x` : the initial point 398 | * `state.sigma` : float, initial step-size (standard deviation in each coordinate) 399 | * `state.maxEval` : int, maximal number of function evaluations 400 | * `state.ftarget` : float, target function value 401 | * `state.popsize` : population size.
If this is left empty, 4 + int(3 * log(|x|)) will be used 402 | * `state.ftarget` : stop if fitness < ftarget 403 | * `state.verb_disp` : display info on console every verb_disp iteration, 0 for never 404 | 405 | Returns: 406 | * `x*` : the new `x` vector, at the optimal point 407 | * `f` : a table of all function values: 408 | * `f[1]` is the value of the function before any optimization and 409 | * `f[#f]` is the final fully optimized value, at `x*` 410 | --------------------------------------------------------------------------------