├── .gitignore
├── doc
│   ├── logger_plot.png
│   ├── image
│   │   ├── parameterflattening.png
│   │   ├── parameterflattening.svg.png
│   │   └── parameterflattening.svg
│   ├── logger.md
│   ├── intro.md
│   └── algos.md
├── .dokx
├── mkdocs.yml
├── README.md
├── test
│   ├── test_cg.lua
│   ├── test_adam.lua
│   ├── test_sgd.lua
│   ├── test_lbfgs_w_ls.lua
│   ├── test_adagrad.lua
│   ├── test_rmsprop.lua
│   ├── test_adamax.lua
│   ├── test_adadelta.lua
│   ├── test_cmaes.lua
│   ├── test_de.lua
│   ├── test_logger.lua
│   ├── l2.lua
│   ├── test_lbfgs.lua
│   ├── test_confusion.lua
│   ├── rosenbrock.lua
│   ├── test_fista.lua
│   └── sparsecoding.lua
├── init.lua
├── CMakeLists.txt
├── optim-1.0.5-0.rockspec
├── optim-1.0.4-0.rockspec
├── optim-1.0.3-0.rockspec
├── optim-1.0.3-1.rockspec
├── checkgrad.lua
├── adagrad.lua
├── COPYRIGHT.txt
├── rmsprop.lua
├── adadelta.lua
├── asgd.lua
├── adamax.lua
├── adam.lua
├── nag.lua
├── sgd.lua
├── de.lua
├── rprop.lua
├── Logger.lua
├── cg.lua
├── lswolfe.lua
├── polyinterp.lua
├── fista.lua
├── lbfgs.lua
├── cmaes.lua
└── ConfusionMatrix.lua
/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 |
--------------------------------------------------------------------------------
/doc/logger_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/torch/optim/HEAD/doc/logger_plot.png
--------------------------------------------------------------------------------
/.dokx:
--------------------------------------------------------------------------------
1 | return {
2 | githubURL = "torch/optim",
3 | exclude = {"test", "polyinterp.lua"}
4 | }
5 |
--------------------------------------------------------------------------------
/doc/image/parameterflattening.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/torch/optim/HEAD/doc/image/parameterflattening.png
--------------------------------------------------------------------------------
/doc/image/parameterflattening.svg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/torch/optim/HEAD/doc/image/parameterflattening.svg.png
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: optim
2 | theme : simplex
3 | repo_url : https://github.com/torch/optim
4 | use_directory_urls : false
5 | markdown_extensions: [extra]
6 | docs_dir : doc
7 | pages:
8 | - [index.md, Optim]
9 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Optimization package
3 |
4 | This package contains several optimization routines and a logger for [Torch](https://github.com/torch/torch7/blob/master/README.md):
5 |
6 | * [Overview](doc/intro.md);
7 | * [Optimization algorithms](doc/algos.md);
8 | * [Logger](doc/logger.md).
9 |
--------------------------------------------------------------------------------
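
For orientation, the routines documented above all share the same calling convention: an `opfunc` returning `f(X), df/dX`, a parameter tensor, a `config` table of hyper-parameters and an optional `state` table. A minimal sketch follows; the quadratic `feval` and the hyper-parameter values are illustrative only and not part of the package.

```lua
require 'torch'
require 'optim'

-- feval: takes the current parameter vector and returns f(x) and df/dx
local function feval(x)
   local fx = 0.5 * x:dot(x)   -- f(x) = 1/2 ||x||^2
   local dfdx = x:clone()      -- df/dx = x
   return fx, dfdx
end

local x = torch.randn(10)             -- initial point
local config = {learningRate = 1e-2}  -- hyper-parameters
local state = {}                      -- optimizer state, kept across calls

for i = 1, 100 do
   -- one call = one update; returns the new x and {f(x)} evaluated before the update
   x, fx = optim.sgd(feval, x, config, state)
end
print('final f(x) =', fx[1])
```
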
/test/test_cg.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'optim'
3 |
4 | require 'rosenbrock'
5 | require 'l2'
6 |
7 |
8 | x = torch.Tensor(2):fill(0)
9 | x,fx,i=optim.cg(rosenbrock,x,{maxIter=50})
10 |
11 | print()
12 | print('Rosenbrock test: compare with http://www.gatsby.ucl.ac.uk/~edward/code/minimize/example.html')
13 | print()
14 | print('Number of function evals = ',i)
15 | print('x=');print(x)
16 | print('fx=')
17 | for i=1,#fx do print(i,fx[i]); end
18 |
--------------------------------------------------------------------------------
/test/test_adam.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'optim'
3 | require 'rosenbrock'
4 | require 'l2'
5 | x = torch.Tensor(2):fill(0)
6 | fx = {}
7 | config = {learningRate=0.002}
8 | for i = 1,10001 do
9 | x,f=optim.adam(rosenbrock,x,config)
10 | if (i-1)%1000 == 0 then
11 | table.insert(fx,f[1])
12 | end
13 | end
14 | print()
15 | print('Rosenbrock test')
16 | print()
17 | print('x=');print(x)
18 | print('fx=')
19 | for i=1,#fx do print((i-1)*1000+1,fx[i]); end
20 |
--------------------------------------------------------------------------------
/test/test_sgd.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'optim'
3 |
4 | require 'rosenbrock'
5 | require 'l2'
6 |
7 | x = torch.Tensor(2):fill(0)
8 | fx = {}
9 |
10 | config = {learningRate=1e-3}
11 | for i = 1,10001 do
12 | x,f=optim.sgd(rosenbrock,x,config)
13 | if (i-1)%1000 == 0 then
14 | table.insert(fx,f[1])
15 | end
16 | end
17 |
18 | print()
19 | print('Rosenbrock test')
20 | print()
21 | print('x=');print(x)
22 | print('fx=')
23 | for i=1,#fx do print((i-1)*1000+1,fx[i]); end
24 |
--------------------------------------------------------------------------------
/test/test_lbfgs_w_ls.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'optim'
3 |
4 | require 'rosenbrock'
5 | require 'l2'
6 |
7 | print('--- batch test w/ line search ---')
8 |
9 | x = torch.Tensor(2):fill(0)
10 | x,fx,i=optim.lbfgs(rosenbrock,x,{maxIter=100, lineSearch=optim.lswolfe})
11 |
12 | print()
13 | print('Rosenbrock test')
14 | print()
15 | print('Number of function evals = ',i)
16 | print('x=');print(x)
17 | print('fx=')
18 | for i=1,#fx do print(i,fx[i]); end
19 | print()
20 | print()
21 |
--------------------------------------------------------------------------------
/test/test_adagrad.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'optim'
3 |
4 | require 'rosenbrock'
5 | require 'l2'
6 |
7 | x = torch.Tensor(2):fill(0)
8 | fx = {}
9 |
10 | config = {learningRate=1e-1}
11 | for i = 1,10001 do
12 | x,f=optim.adagrad(rosenbrock,x,config)
13 | if (i-1)%1000 == 0 then
14 | table.insert(fx,f[1])
15 | end
16 | end
17 |
18 | print()
19 | print('Rosenbrock test')
20 | print()
21 | print('x=');print(x)
22 | print('fx=')
23 | for i=1,#fx do print((i-1)*1000+1,fx[i]); end
24 |
--------------------------------------------------------------------------------
/test/test_rmsprop.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'optim'
3 |
4 | require 'rosenbrock'
5 | require 'l2'
6 |
7 | x = torch.Tensor(2):fill(0)
8 | fx = {}
9 |
10 | config = {learningRate=5e-4}
11 | for i = 1,10001 do
12 | x,f=optim.rmsprop(rosenbrock,x,config)
13 | if (i-1)%1000 == 0 then
14 | table.insert(fx,f[1])
15 | end
16 | end
17 |
18 | print()
19 | print('Rosenbrock test')
20 | print()
21 | print('x=');print(x)
22 | print('fx=')
23 | for i=1,#fx do print((i-1)*1000+1,fx[i]); end
24 |
--------------------------------------------------------------------------------
/test/test_adamax.lua:
--------------------------------------------------------------------------------
1 |
2 | require 'torch'
3 | require 'optim'
4 | require 'rosenbrock'
5 | require 'l2'
6 |
7 | x = torch.Tensor(2):fill(0)
8 | fx = {}
9 | state = {}
10 | config = {}
11 | for i = 1,10001 do
12 | x,f=optim.adamax(rosenbrock,x,config,state)
13 | if (i-1)%1000 == 0 then
14 | table.insert(fx,f[1])
15 | end
16 | end
17 |
18 | print()
19 | print('Rosenbrock test')
20 | print()
21 | print('x=');print(x)
22 | print('fx=')
23 | for i=1,#fx do print((i-1)*1000+1,fx[i]); end
24 |
--------------------------------------------------------------------------------
/test/test_adadelta.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'optim'
3 |
4 | require 'rosenbrock'
5 | require 'l2'
6 |
7 | x = torch.Tensor(2):fill(0)
8 | fx = {}
9 | state = {}
10 | config = {eps=1e-10}
11 | for i = 1,10001 do
12 | x,f=optim.adadelta(rosenbrock,x,config,state)
13 | if (i-1)%1000 == 0 then
14 | table.insert(fx,f[1])
15 | end
16 | end
17 |
18 | print()
19 | print('Rosenbrock test')
20 | print()
21 | print('x=');print(x)
22 | print('fx=')
23 | for i=1,#fx do print((i-1)*1000+1,fx[i]); end
24 |
--------------------------------------------------------------------------------
/test/test_cmaes.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'optim'
3 |
4 | require 'rosenbrock'
5 | require 'l2'
6 |
7 | -- 10-D rosenbrock
8 | x = torch.Tensor(10):fill(0)
9 | config = {maxEval=10000, sigma=0.5, verb_disp=0}
10 |
11 | -- will take some time
12 | x,fx,i=optim.cmaes(rosenbrock,x,config)
13 |
14 |
15 | print('Rosenbrock test')
16 | print()
17 | -- approx 6500 function evals expected
18 | print('Number of function evals = ',i)
19 | print('x=');print(x)
20 | print('fx=')
21 | for i=1,#fx do print(i,fx[i]); end
22 | print()
23 | print()
--------------------------------------------------------------------------------
/test/test_de.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'optim'
3 |
4 | require 'rosenbrock'
5 | require 'l2'
6 |
7 |
8 | -- 2-D rosenbrock
9 | x = torch.Tensor(2):fill(0)
10 | config = {popsize=50, scaleFactor=0.5, crossoverRate=0.9, maxFEs=3000}
11 |
12 | -- will take some time
13 | x,fx=optim.de(rosenbrock,x,config)
14 |
15 |
16 | print('Rosenbrock test')
17 | print()
18 | -- config.maxFEs function evals are performed
19 | print('Number of function evals = ', config.maxFEs)
20 | print('x=');print(x)
21 | print('fx=')
22 | for i=1,config.maxFEs do print(i,fx[i]); end
23 | print()
24 | print()
25 |
--------------------------------------------------------------------------------
/test/test_logger.lua:
--------------------------------------------------------------------------------
1 | require 'optim'
2 |
3 |
4 | logger_former = optim.Logger('accuracy-former.log')
5 | logger_new = optim.Logger('accuracy-new.log')
6 |
7 | logger_new:setNames({'channel 1', 'channel 2', 'channel 3'})
8 |
9 | for i = 1, 20 do
10 | logger_former:add({['channel 1'] = 1 , ['channel 2'] = 0.1 * i, ['channel 3'] = 1 - 0.2 * i})
11 | logger_new:add({1 , 0.1 * i, 1 - 0.2 * i})
12 | end
13 |
14 | logger_former:style({['channel 1'] = '-' , ['channel 2'] = '-', ['channel 3'] = '-'})
15 | logger_new:style{'-', '-', '-'}
16 |
17 | logger_former:plot()
18 | logger_new:plot()
19 |
20 |
21 |
--------------------------------------------------------------------------------
/test/l2.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | -- l2.lua This function returns the function value and partial derivatives
3 | -- of the simple (general dimension) quadratic function, given by:
4 | --
5 | -- f(x) = sum_{i=1:D} x(i)^2
6 | --
7 | -- where D is the dimension of x. The true minimum is 0 at x = (0 0 ... 0).
8 | --
9 | -- Used as a simple sanity-check objective for the optimizers.
10 |
11 | function l2(x)
12 |
13 | local xx = x:clone()
14 | xx:cmul(xx)
15 | local fout = xx:sum()
16 |
17 | local dx = torch.Tensor():resizeAs(x):copy(x)
18 | dx:mul(2)
19 | --print('l2 eval = ', fout)
20 | return fout,dx
21 |
22 | end
--------------------------------------------------------------------------------
/init.lua:
--------------------------------------------------------------------------------
1 |
2 | require 'torch'
3 |
4 | optim = {}
5 |
6 | -- optimizations
7 | require('optim.sgd')
8 | require('optim.cg')
9 | require('optim.asgd')
10 | require('optim.nag')
11 | require('optim.fista')
12 | require('optim.lbfgs')
13 | require('optim.adagrad')
14 | require('optim.rprop')
15 | require('optim.adam')
16 | require('optim.adamax')
17 | require('optim.rmsprop')
18 | require('optim.adadelta')
19 | require('optim.cmaes')
20 | require('optim.de')
21 |
22 | -- line search functions
23 | require('optim.lswolfe')
24 |
25 | -- helpers
26 | require('optim.polyinterp')
27 | require('optim.checkgrad')
28 |
29 | -- tools
30 | require('optim.ConfusionMatrix')
31 | require('optim.Logger')
32 |
33 | return optim
34 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 |
2 | CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR)
3 | CMAKE_POLICY(VERSION 2.6)
4 | IF(LUAROCKS_PREFIX)
5 | MESSAGE(STATUS "Installing Torch through Luarocks")
6 | STRING(REGEX REPLACE "(.*)lib/luarocks/rocks.*" "\\1" CMAKE_INSTALL_PREFIX "${LUAROCKS_PREFIX}")
7 | MESSAGE(STATUS "Prefix inferred from Luarocks: ${CMAKE_INSTALL_PREFIX}")
8 | ENDIF()
9 | FIND_PACKAGE(Torch REQUIRED)
10 |
11 | SET(src)
12 | FILE(GLOB luasrc *.lua)
13 | ADD_TORCH_PACKAGE(optim "${src}" "${luasrc}")
14 | #ADD_TORCH_DOK(dok optim "Machine Learning" "Optimization" 3.2)
15 |
16 | INSTALL(DIRECTORY "doc" DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/optim")
17 | INSTALL(FILES "README.md" DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/optim")
18 |
--------------------------------------------------------------------------------
/optim-1.0.5-0.rockspec:
--------------------------------------------------------------------------------
1 | package = "optim"
2 | version = "1.0.5-0"
3 |
4 | source = {
5 | url = "git://github.com/torch/optim",
6 | }
7 |
8 | description = {
9 | summary = "An optimization library for Torch.",
10 | detailed = [[
11 | This package contains several optimization routines for Torch.
12 | ]],
13 | homepage = "https://github.com/torch/optim",
14 | license = "BSD"
15 | }
16 |
17 | dependencies = {
18 | "torch >= 7.0",
19 | }
20 |
21 | build = {
22 | type = "command",
23 | build_command = [[
24 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE)
25 | ]],
26 | install_command = "cd build && $(MAKE) install"
27 | }
28 |
--------------------------------------------------------------------------------
/optim-1.0.4-0.rockspec:
--------------------------------------------------------------------------------
1 | package = "optim"
2 | version = "1.0.4-0"
3 |
4 | source = {
5 | url = "git://github.com/torch/optim",
6 | tag = "1.0.4-0"
7 | }
8 |
9 | description = {
10 | summary = "An optimization library for Torch.",
11 | detailed = [[
12 | This package contains several optimization routines for Torch.
13 | ]],
14 | homepage = "https://github.com/torch/optim",
15 | license = "BSD"
16 | }
17 |
18 | dependencies = {
19 | "torch >= 7.0",
20 | }
21 |
22 | build = {
23 | type = "command",
24 | build_command = [[
25 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE)
26 | ]],
27 | install_command = "cd build && $(MAKE) install"
28 | }
29 |
--------------------------------------------------------------------------------
/optim-1.0.3-0.rockspec:
--------------------------------------------------------------------------------
1 | package = "optim"
2 | version = "1.0.3-0"
3 |
4 | source = {
5 | url = "git://github.com/torch/optim",
6 | tag = "1.0.3-0"
7 | }
8 |
9 | description = {
10 | summary = "An optimization library for Torch.",
11 | detailed = [[
12 | This package contains several optimization routines for Torch.
13 | ]],
14 | homepage = "https://github.com/torch/optim",
15 | license = "BSD"
16 | }
17 |
18 | dependencies = {
19 | "torch >= 7.0",
20 | "sys >= 1.0",
21 | }
22 |
23 | build = {
24 | type = "command",
25 | build_command = [[
26 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE)
27 | ]],
28 | install_command = "cd build && $(MAKE) install"
29 | }
30 |
--------------------------------------------------------------------------------
/optim-1.0.3-1.rockspec:
--------------------------------------------------------------------------------
1 | package = "optim"
2 | version = "1.0.3-1"
3 |
4 | source = {
5 | url = "git://github.com/torch/optim",
6 | tag = "1.0.3-1"
7 | }
8 |
9 | description = {
10 | summary = "An optimization library for Torch.",
11 | detailed = [[
12 | This package contains several optimization routines for Torch.
13 | ]],
14 | homepage = "https://github.com/torch/optim",
15 | license = "BSD"
16 | }
17 |
18 | dependencies = {
19 | "torch >= 7.0",
20 | "sys >= 1.0",
21 | }
22 |
23 | build = {
24 | type = "command",
25 | build_command = [[
26 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE)
27 | ]],
28 | install_command = "cd build && $(MAKE) install"
29 | }
30 |
--------------------------------------------------------------------------------
/test/test_lbfgs.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'optim'
3 |
4 | require 'rosenbrock'
5 | require 'l2'
6 |
7 | print('--- regular batch test ---')
8 |
9 | x = torch.Tensor(2):fill(0)
10 | x,fx,i=optim.lbfgs(rosenbrock,x,{maxIter=100, learningRate=1e-1})
11 |
12 | print()
13 | print('Rosenbrock test')
14 | print()
15 | print('Number of function evals = ',i)
16 | print('x=');print(x)
17 | print('fx=')
18 | for i=1,#fx do print(i,fx[i]); end
19 | print()
20 | print()
21 |
22 | print('--- stochastic test ---')
23 |
24 | x = torch.Tensor(2):fill(0)
25 | fx = {}
26 | config = {learningRate=1e-1, maxIter=1}
27 | for i = 1,100 do
28 | x,f=optim.lbfgs(rosenbrock,x,config)
29 | table.insert(fx,f[1])
30 | end
31 |
32 | print()
33 | print('Rosenbrock test')
34 | print()
35 | print('Number of function evals = ',i)
36 | print('x=');print(x)
37 | print('fx=')
38 | for i=1,#fx do print(i,fx[i]); end
39 |
--------------------------------------------------------------------------------
/test/test_confusion.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'optim'
3 |
4 | n_feature = 3
5 | classes = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
6 |
7 | print'ConfusionMatrix:__init() test'
8 | cm = optim.ConfusionMatrix(#classes, classes)
9 |
10 | target = 3
11 | prediction = torch.randn(#classes)
12 |
13 | print'ConfusionMatrix:add() test'
14 | cm:add(prediction, target)
15 | cm:add(prediction, torch.randn(#classes))
16 |
17 | batch_size = 8
18 |
19 | targets = torch.randperm(batch_size)
20 | predictions = torch.randn(batch_size, #classes)
21 |
22 | print'ConfusionMatrix:batchAdd() test'
23 | cm:batchAdd(predictions, targets)
24 | assert(cm.mat:sum() == batch_size + 2, 'missing examples')
25 |
26 | print'ConfusionMatrix:updateValids() test'
27 | cm:updateValids()
28 |
29 | print'ConfusionMatrix:__tostring__() test'
30 | print(cm)
31 |
32 | target = 0
33 | cm:add(prediction, target)
34 | assert(cm.mat:sum() == batch_size + 2, 'too many examples')
35 |
36 | -- FAR/FRR testing on identity matrix. FRR/FAR should be zero for identity.
37 | cm.mat = torch.eye(#classes, #classes)
38 | classFrrs, classFars, frrs, fars = cm:farFrr()
39 | assert(classFrrs:sum() + classFars:sum() == 0, "Incorrect values")
40 |
--------------------------------------------------------------------------------
/checkgrad.lua:
--------------------------------------------------------------------------------
1 | --[[ An implementation of a simple numerical gradient checker.
2 |
3 | ARGS:
4 |
5 | - `opfunc` : a function that takes a single input (X), the point of
6 | evaluation, and returns f(X) and df/dX
7 | - `x` : the initial point
8 | - `eps` : the epsilon to use for the numerical check (default is 1e-7)
9 |
10 | RETURN:
11 |
12 | - `diff` : relative error in the gradient; should be close to zero
13 | - `dC` : exact gradient at point
14 | - `dC_est` : numerically estimated gradient at point
15 |
16 | ]]--
17 |
18 |
19 | -- function that numerically checks the gradient of a loss function:
20 | function optim.checkgrad(opfunc, x, eps)
21 |
22 | -- compute true gradient:
23 | local Corg,dC = opfunc(x)
24 | dC:resize(x:size())
25 |
26 | local Ctmp -- temporary value
27 | local isTensor = torch.isTensor(Corg)
28 | if isTensor then
29 | Ctmp = Corg.new(Corg:size())
30 | end
31 |
32 | -- compute numeric approximations to gradient:
33 | local eps = eps or 1e-7
34 | local dC_est = torch.Tensor():typeAs(dC):resizeAs(dC)
35 | for i = 1,dC:size(1) do
36 | local tmp = x[i]
37 | x[i] = x[i] + eps
38 | local C1 = opfunc(x)
39 | if isTensor then
40 | Ctmp:copy(C1)
41 | C1 = Ctmp
42 | end
43 | x[i] = x[i] - 2 * eps
44 | local C2 = opfunc(x)
45 | x[i] = tmp
46 | dC_est[i] = (C1 - C2) / (2 * eps)
47 | end
48 |
49 | -- estimate error of gradient:
50 | local diff = torch.norm(dC - dC_est) / torch.norm(dC + dC_est)
51 | return diff,dC,dC_est
52 | end
53 |
--------------------------------------------------------------------------------
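
A short sketch of how `optim.checkgrad` can be exercised against the Rosenbrock objective defined in `test/rosenbrock.lua` below (assuming the `test` directory is on the Lua path, as in the test scripts):

```lua
require 'torch'
require 'optim'
require 'rosenbrock'   -- defines rosenbrock(x) -> f(x), df/dx (see test/rosenbrock.lua)

local x = torch.randn(4)
-- diff is the relative error between the analytic and numeric gradients
local diff, dC, dC_est = optim.checkgrad(rosenbrock, x, 1e-7)
print('relative gradient error =', diff)
assert(diff < 1e-4, 'gradient check failed')
```
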
/test/rosenbrock.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | -- rosenbrock.m This function returns the function value, partial derivatives
3 | -- and Hessian of the (general dimension) rosenbrock function, given by:
4 | --
5 | -- f(x) = sum_{i=1:D-1} 100*(x(i+1) - x(i)^2)^2 + (1-x(i))^2
6 | --
7 | -- where D is the dimension of x. The true minimum is 0 at x = (1 1 ... 1).
8 | --
9 | -- Carl Edward Rasmussen, 2001-07-21.
10 |
11 | function rosenbrock(x)
12 |
13 | -- (1) compute f(x)
14 | local d = x:size(1)
15 | -- x1 = x(i)^2
16 | local x1 = x.new(d-1):copy(x:narrow(1,1,d-1))
17 | -- x(i+1) - x(i)^2
18 | x1:cmul(x1):mul(-1):add(x:narrow(1,2,d-1))
19 |
20 | -- 100*(x(i+1) - x(i)^2)^2
21 | x1:cmul(x1):mul(100)
22 |
23 | -- x(i)
24 | local x0 = x.new(d-1):copy(x:narrow(1,1,d-1))
25 | -- 1-x(i)
26 | x0:mul(-1):add(1)
27 | -- (1-x(i))^2
28 | x0:cmul(x0)
29 | -- 100*(x(i+1) - x(i)^2)^2 + (1-x(i))^2
30 | x1:add(x0)
31 | local fout = x1:sum()
32 |
33 | -- (2) compute df(x)/dx
34 | local dxout = x.new():resizeAs(x):zero()
35 | -- df(1:D-1) = - 400*x(1:D-1).*(x(2:D)-x(1:D-1).^2) - 2*(1-x(1:D-1));
36 |
37 | x1:copy(x:narrow(1,1,d-1))
38 | x1:cmul(x1):mul(-1):add(x:narrow(1,2,d-1)):cmul(x:narrow(1,1,d-1)):mul(-400)
39 | x0:copy(x:narrow(1,1,d-1)):mul(-1):add(1):mul(-2)
40 | x1:add(x0)
41 | dxout:narrow(1,1,d-1):copy(x1)
42 |
43 | -- df(2:D) = df(2:D) + 200*(x(2:D)-x(1:D-1).^2);
44 | x0:copy(x:narrow(1,1,d-1))
45 | x0:cmul(x0):mul(-1):add(x:narrow(1,2,d-1)):mul(200)
46 | dxout:narrow(1,2,d-1):add(x0)
47 |
48 | return fout,dxout
49 |
50 | end
51 |
--------------------------------------------------------------------------------
/adagrad.lua:
--------------------------------------------------------------------------------
1 | --[[ ADAGRAD implementation for SGD
2 |
3 | ARGS:
4 | - `opfunc` : a function that takes a single input (X), the point of
5 | evaluation, and returns f(X) and df/dX
6 | - `x` : the initial point
7 | - `config` : a table with hyper-parameters for the optimizer:
8 | `config.learningRate`, `config.learningRateDecay`, `config.weightDecay`
9 | - `state` : a table describing the state of the optimizer; after each
10 | call the state is modified
11 | - `state.paramVariance` : vector of temporal variances of parameters
12 | RETURN:
13 | - `x` : the new x vector
14 | - `f(x)` : the function, evaluated before the update
15 |
16 | ]]
17 | function optim.adagrad(opfunc, x, config, state)
18 | -- (0) get/update state
19 | if config == nil and state == nil then
20 | print('no state table, ADAGRAD initializing')
21 | end
22 | local config = config or {}
23 | local state = state or config
24 | local lr = config.learningRate or 1e-3
25 | local lrd = config.learningRateDecay or 0
26 | local wd = config.weightDecay or 0
27 | state.evalCounter = state.evalCounter or 0
28 | local nevals = state.evalCounter
29 |
30 | -- (1) evaluate f(x) and df/dx
31 | local fx,dfdx = opfunc(x)
32 |
33 | -- (2) weight decay with a single parameter
34 | if wd ~= 0 then
35 | dfdx:add(wd, x)
36 | end
37 |
38 | -- (3) learning rate decay (annealing)
39 | local clr = lr / (1 + nevals*lrd)
40 |
41 | -- (4) parameter update with single or individual learning rates
42 | if not state.paramVariance then
43 | state.paramVariance = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
44 | state.paramStd = torch.Tensor():typeAs(x):resizeAs(dfdx)
45 | end
46 | state.paramVariance:addcmul(1,dfdx,dfdx)
47 | state.paramStd:resizeAs(state.paramVariance):copy(state.paramVariance):sqrt()
48 | x:addcdiv(-clr, dfdx,state.paramStd:add(1e-10))
49 |
50 | -- (5) update evaluation counter
51 | state.evalCounter = state.evalCounter + 1
52 |
53 | -- return x*, f(x) before optimization
54 | return x,{fx}
55 | end
56 |
--------------------------------------------------------------------------------
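
Written out, the update performed by `optim.adagrad` above is, element-wise, with `g_t` the gradient returned by `opfunc`, `v` the running `state.paramVariance`, and `n` the value of `state.evalCounter` before the call:

```latex
\begin{aligned}
\mathrm{clr} &= \frac{lr}{1 + n \cdot lrd} \\
v_t &= v_{t-1} + g_t \odot g_t \\
x_{t+1} &= x_t - \mathrm{clr} \cdot \frac{g_t}{\sqrt{v_t} + 10^{-10}}
\end{aligned}
```
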
/COPYRIGHT.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
2 | Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
3 | Copyright (c) 2011-2013 NYU (Clement Farabet)
4 | Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
5 | Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
6 | Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
7 |
8 | All rights reserved.
9 |
10 | Redistribution and use in source and binary forms, with or without
11 | modification, are permitted provided that the following conditions are met:
12 |
13 | 1. Redistributions of source code must retain the above copyright
14 | notice, this list of conditions and the following disclaimer.
15 |
16 | 2. Redistributions in binary form must reproduce the above copyright
17 | notice, this list of conditions and the following disclaimer in the
18 | documentation and/or other materials provided with the distribution.
19 |
20 | 3. Neither the names of NEC Laboratories American and IDIAP Research
21 | Institute nor the names of its contributors may be used to endorse or
22 | promote products derived from this software without specific prior
23 | written permission.
24 |
25 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
26 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
29 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 | POSSIBILITY OF SUCH DAMAGE.
36 |
--------------------------------------------------------------------------------
/rmsprop.lua:
--------------------------------------------------------------------------------
1 | --[[ An implementation of RMSprop
2 |
3 | ARGS:
4 |
5 | - `opfunc` : a function that takes a single input (X), the point
6 | of evaluation, and returns f(X) and df/dX
7 | - `x` : the initial point
8 | - `config` : a table with configuration parameters for the optimizer
9 | - `config.learningRate` : learning rate
10 | - `config.alpha` : smoothing constant
11 | - `config.epsilon` : value added to the root mean square, for numerical stability
12 | - `config.weightDecay` : weight decay
13 | - `config.initialMean` : value with which to initialise the mean square `m`
14 | - `state` : a table describing the state of the optimizer; after each call the state is modified
15 | - `state.m` : leaky sum of squares of parameter gradients,
16 | - `state.tmp` : and the square root (with epsilon smoothing)
17 |
18 | RETURN:
19 | - `x` : the new x vector
20 | - `f(x)` : the function, evaluated before the update
21 |
22 | ]]
23 |
24 | function optim.rmsprop(opfunc, x, config, state)
25 | -- (0) get/update state
26 | local config = config or {}
27 | local state = state or config
28 | local lr = config.learningRate or 1e-2
29 | local alpha = config.alpha or 0.99
30 | local epsilon = config.epsilon or 1e-8
31 | local wd = config.weightDecay or 0
32 | local mfill = config.initialMean or 0
33 |
34 | -- (1) evaluate f(x) and df/dx
35 | local fx, dfdx = opfunc(x)
36 |
37 | -- (2) weight decay
38 | if wd ~= 0 then
39 | dfdx:add(wd, x)
40 | end
41 |
42 | -- (3) initialize mean square values and square gradient storage
43 | if not state.m then
44 | state.m = torch.Tensor():typeAs(x):resizeAs(dfdx):fill(mfill)
45 | state.tmp = torch.Tensor():typeAs(x):resizeAs(dfdx)
46 | end
47 |
48 | -- (4) calculate new (leaky) mean squared values
49 | state.m:mul(alpha)
50 | state.m:addcmul(1.0-alpha, dfdx, dfdx)
51 |
52 | -- (5) perform update
53 | state.tmp:sqrt(state.m):add(epsilon)
54 | x:addcdiv(-lr, dfdx, state.tmp)
55 |
56 | -- return x*, f(x) before optimization
57 | return x, {fx}
58 | end
59 |
--------------------------------------------------------------------------------
/adadelta.lua:
--------------------------------------------------------------------------------
1 | --[[ ADADELTA implementation for SGD http://arxiv.org/abs/1212.5701
2 |
3 | ARGS:
4 | - `opfunc` : a function that takes a single input (X), the point of
5 | evaluation, and returns f(X) and df/dX
6 | - `x` : the initial point
7 | - `config` : a table of hyper-parameters
8 | - `config.rho` : interpolation parameter
9 | - `config.eps` : for numerical stability
10 | - `config.weightDecay` : weight decay
11 | - `state` : a table describing the state of the optimizer; after each
12 | call the state is modified
13 | - `state.paramVariance` : vector of temporal variances of parameters
14 | - `state.accDelta` : vector of accumulated squared parameter deltas
15 | RETURN:
16 | - `x` : the new x vector
17 | - `f(x)` : the function, evaluated before the update
18 | ]]
19 | function optim.adadelta(opfunc, x, config, state)
20 | -- (0) get/update state
21 | if config == nil and state == nil then
22 | print('no state table, ADADELTA initializing')
23 | end
24 | local config = config or {}
25 | local state = state or config
26 | local rho = config.rho or 0.9
27 | local eps = config.eps or 1e-6
28 | local wd = config.weightDecay or 0
29 | state.evalCounter = state.evalCounter or 0
30 | -- (1) evaluate f(x) and df/dx
31 | local fx,dfdx = opfunc(x)
32 |
33 | -- (2) weight decay
34 | if wd ~= 0 then
35 | dfdx:add(wd, x)
36 | end
37 |
38 | -- (3) parameter update
39 | if not state.paramVariance then
40 | state.paramVariance = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
41 | state.paramStd = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
42 | state.delta = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
43 | state.accDelta = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
44 | end
45 | state.paramVariance:mul(rho):addcmul(1-rho,dfdx,dfdx)
46 | state.paramStd:resizeAs(state.paramVariance):copy(state.paramVariance):add(eps):sqrt()
47 | state.delta:resizeAs(state.paramVariance):copy(state.accDelta):add(eps):sqrt():cdiv(state.paramStd):cmul(dfdx)
48 | x:add(-1, state.delta)
49 | state.accDelta:mul(rho):addcmul(1-rho, state.delta, state.delta)
50 | -- (4) update evaluation counter
51 | state.evalCounter = state.evalCounter + 1
52 |
53 | -- return x*, f(x) before optimization
54 | return x,{fx}
55 | end
56 |
--------------------------------------------------------------------------------
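
In the notation of the ADADELTA paper linked above, `state.paramVariance` plays the role of `E[g^2]` and `state.accDelta` the role of `E[\Delta x^2]`; the code performs, element-wise:

```latex
\begin{aligned}
E[g^2]_t &= \rho\, E[g^2]_{t-1} + (1-\rho)\, g_t^2 \\
\Delta x_t &= \frac{\sqrt{E[\Delta x^2]_{t-1} + \epsilon}}{\sqrt{E[g^2]_t + \epsilon}}\, g_t \\
x_{t+1} &= x_t - \Delta x_t \\
E[\Delta x^2]_t &= \rho\, E[\Delta x^2]_{t-1} + (1-\rho)\, \Delta x_t^2
\end{aligned}
```
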
/asgd.lua:
--------------------------------------------------------------------------------
1 | --[[ An implementation of ASGD
2 |
3 | ASGD:
4 |
5 | x := (1 - lambda eta_t) x - eta_t df/dx(z,x)
6 | a := a + mu_t [ x - a ]
7 |
8 | eta_t = eta0 / (1 + lambda eta0 t) ^ 0.75
9 | mu_t = 1/max(1,t-t0)
10 |
11 | implements the ASGD algorithm as in L. Bottou's sgd-2.0
12 |
13 | ARGS:
14 |
15 | - `opfunc` : a function that takes a single input (X), the point of
16 | evaluation, and returns f(X) and df/dX
17 | - `x` : the initial point
18 | - `state` : a table describing the state of the optimizer; after each
19 | call the state is modified
20 | - `state.eta0` : learning rate
21 | - `state.lambda` : decay term
22 | - `state.alpha` : power for eta update
23 | - `state.t0` : point at which to start averaging
24 |
25 | RETURN:
26 | - `x` : the new x vector
27 | - `f(x)` : the function, evaluated before the update
28 | - `ax` : the averaged x vector
29 |
30 | (Clement Farabet, 2012)
31 | --]]
32 | function optim.asgd(opfunc, x, config, state)
33 | -- (0) get/update state
34 | local config = config or {}
35 | local state = state or config
36 | config.eta0 = config.eta0 or 1e-4
37 | config.lambda = config.lambda or 1e-4
38 | config.alpha = config.alpha or 0.75
39 | config.t0 = config.t0 or 1e6
40 |
41 | -- (hidden state)
42 | state.eta_t = state.eta_t or config.eta0
43 | state.mu_t = state.mu_t or 1
44 | state.t = state.t or 0
45 |
46 | -- (1) evaluate f(x) and df/dx
47 | local fx,dfdx = opfunc(x)
48 |
49 | -- (2) decay term
50 | x:mul(1 - config.lambda*state.eta_t)
51 |
52 | -- (3) update x
53 | x:add(-state.eta_t, dfdx)
54 |
55 | -- (4) averaging
56 | state.ax = state.ax or torch.Tensor():typeAs(x):resizeAs(x):zero()
57 | state.tmp = state.tmp or torch.Tensor():typeAs(state.ax):resizeAs(state.ax)
58 | if state.mu_t ~= 1 then
59 | state.tmp:copy(x)
60 | state.tmp:add(-1,state.ax):mul(state.mu_t)
61 | state.ax:add(state.tmp)
62 | else
63 | state.ax:copy(x)
64 | end
65 |
66 | -- (5) update eta_t and mu_t
67 | state.t = state.t + 1
68 | state.eta_t = config.eta0 / math.pow((1 + config.lambda * config.eta0 * state.t), config.alpha)
69 | state.mu_t = 1 / math.max(1, state.t - config.t0)
70 |
71 | -- return x*, f(x) before optimization, and average(x_t0,x_t1,x_t2,...)
72 | return x,{fx},state.ax
73 | end
74 |
--------------------------------------------------------------------------------
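
A brief sketch of the averaging behaviour of `optim.asgd`: the third return value is the running (Polyak-Ruppert) average of the iterates, which starts to move away from the raw iterate once `state.t` exceeds `t0`. The Rosenbrock objective from the test directory is used purely as an example objective here:

```lua
require 'torch'
require 'optim'
require 'rosenbrock'   -- test objective, see test/rosenbrock.lua

local x = torch.zeros(2)
local state = {eta0 = 1e-3, t0 = 100}   -- averaging kicks in after 100 evaluations
local ax
for i = 1, 1000 do
   x, fx, ax = optim.asgd(rosenbrock, x, state)
end
print('f(last iterate)     =', (rosenbrock(x)))
print('f(averaged iterate) =', (rosenbrock(ax)))
```
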
/adamax.lua:
--------------------------------------------------------------------------------
1 | --[[ An implementation of AdaMax http://arxiv.org/pdf/1412.6980.pdf
2 |
3 | ARGS:
4 |
5 | - `opfunc` : a function that takes a single input (X), the point
6 | of evaluation, and returns f(X) and df/dX
7 | - `x` : the initial point
8 | - `config` : a table with configuration parameters for the optimizer
9 | - `config.learningRate` : learning rate
10 | - `config.beta1` : first moment coefficient
11 | - `config.beta2` : second moment coefficient
12 | - `config.epsilon` : for numerical stability
13 | - `state` : a table describing the state of the optimizer;
14 | after each call the state is modified.
15 |
16 | RETURN:
17 | - `x` : the new x vector
18 | - `f(x)` : the function, evaluated before the update
19 |
20 | ]]
21 |
22 | function optim.adamax(opfunc, x, config, state)
23 | -- (0) get/update state
24 | local config = config or {}
25 | local state = state or config
26 | local lr = config.learningRate or 0.002
27 |
28 | local beta1 = config.beta1 or 0.9
29 | local beta2 = config.beta2 or 0.999
30 | local epsilon = config.epsilon or 1e-38
31 | local wd = config.weightDecay or 0
32 |
33 | -- (1) evaluate f(x) and df/dx
34 | local fx, dfdx = opfunc(x)
35 |
36 | -- (2) weight decay
37 | if wd ~= 0 then
38 | dfdx:add(wd, x)
39 | end
40 |
41 | -- Initialization
42 | state.t = state.t or 0
43 | -- Exponential moving average of gradient values
44 | state.m = state.m or x.new(dfdx:size()):zero()
45 | -- Exponential moving average of the infinity norm
46 | state.u = state.u or x.new(dfdx:size()):zero()
47 | -- A tmp tensor to hold the input to max()
48 | state.max = state.max or x.new(2, unpack(dfdx:size():totable())):zero()
49 |
50 | state.t = state.t + 1
51 |
52 | -- Update biased first moment estimate.
53 | state.m:mul(beta1):add(1-beta1, dfdx)
54 | -- Update the exponentially weighted infinity norm.
55 | state.max[1]:copy(state.u):mul(beta2)
56 | state.max[2]:copy(dfdx):abs():add(epsilon)
57 | state.u:max(state.max, 1)
58 |
59 | local biasCorrection1 = 1 - beta1^state.t
60 | local stepSize = lr/biasCorrection1
61 | -- (2) update x
62 | x:addcdiv(-stepSize, state.m, state.u)
63 |
64 | -- return x*, f(x) before optimization
65 | return x, {fx}
66 | end
67 |
--------------------------------------------------------------------------------
/test/test_fista.lua:
--------------------------------------------------------------------------------
1 |
2 | require 'unsup'
3 | require 'torch'
4 | require 'gnuplot'
5 | require 'sparsecoding'
6 |
7 | -- gnuplot.setgnuplotexe('/usr/bin/gnuplot44')
8 | -- gnuplot.setgnuplotterminal('x11')
9 |
10 | function gettableval(tt,v)
11 | local x = torch.Tensor(#tt)
12 | for i=1,#tt do x[i] = tt[i][v] end
13 | return x
14 | end
15 | function doplots(v)
16 | v = v or 'F'
17 | local fistaf = torch.DiskFile('fista2.bin'):binary()
18 | local istaf = torch.DiskFile('ista2.bin'):binary()
19 |
20 | local hfista = fistaf:readObject()
21 | fistaf:close()
22 | local hista = istaf:readObject()
23 | istaf:close()
24 |
25 | gnuplot.figure()
26 | gnuplot.plot({'fista ' .. v,gettableval(hfista,v)},{'ista ' .. v, gettableval(hista,v)})
27 | end
28 |
29 | seed = seed or 123
30 | if dofista == nil then
31 | dofista = true
32 | else
33 | dofista = not dofista
34 | end
35 |
36 | torch.manualSeed(seed)
37 | math.randomseed(seed)
38 | nc = 3
39 | ni = 30
40 | no = 100
41 | x = torch.Tensor(ni):zero()
42 |
43 | --- I am keeping these just to make sure random init stays same
44 | fista = unsup.LinearFistaL1(ni,no,0.1)
45 | fista = nil
46 |
47 | fistaparams = {}
48 | fistaparams.doFistaUpdate = dofista
49 | fistaparams.maxline = 10
50 | fistaparams.maxiter = 200
51 | fistaparams.verbose = true
52 |
53 | D=torch.randn(ni,no)
54 | for i=1,D:size(2) do
55 | D:select(2,i):div(D:select(2,i):std()+1e-12)
56 | end
57 |
58 | mixi = torch.Tensor(nc)
59 | mixj = torch.Tensor(nc)
60 | for i=1,nc do
61 | local ii = math.random(1,no)
62 | local cc = torch.uniform(0,1/nc)
63 | mixi[i] = ii;
64 | mixj[i] = cc;
65 | print(ii,cc)
66 | x:add(cc, D:select(2,ii))
67 | end
68 |
69 | fista = optim.FistaL1(D,fistaparams)
70 | code,h = fista.run(x,0.1)
71 |
72 | --fista.reconstruction:addmv(0,1,D,code)
73 | rec = fista.reconstruction
74 | --code,rec,h = fista:forward(x);
75 |
76 | gnuplot.figure(1)
77 | gnuplot.plot({'data',mixi,mixj,'+'},{'code',torch.linspace(1,no,no),code,'+'})
78 | gnuplot.title('Fista = ' .. tostring(fistaparams.doFistaUpdate))
79 |
80 | gnuplot.figure(2)
81 | gnuplot.plot({'input',torch.linspace(1,ni,ni),x,'+-'},{'reconstruction',torch.linspace(1,ni,ni),rec,'+-'});
82 | gnuplot.title('Reconstruction Error : ' .. x:dist(rec) .. ' ' .. 'Fista = ' .. tostring(fistaparams.doFistaUpdate))
83 | --w2:axis(0,ni+1,-1,1)
84 |
85 | if dofista then
86 | print('Running FISTA')
87 | fname = 'fista2.bin'
88 | else
89 | print('Running ISTA')
90 | fname = 'ista2.bin'
91 | end
92 | ff = torch.DiskFile(fname,'w'):binary()
93 | ff:writeObject(h)
94 | ff:close()
95 |
96 |
--------------------------------------------------------------------------------
/adam.lua:
--------------------------------------------------------------------------------
1 | --[[ An implementation of Adam https://arxiv.org/abs/1412.6980
2 |
3 | ARGS:
4 |
5 | - `opfunc` : a function that takes a single input (X), the point
6 | of evaluation, and returns f(X) and df/dX
7 | - `x` : the initial point
8 | - `config` : a table with configuration parameters for the optimizer
9 | - `config.learningRate` : learning rate
10 | - `config.learningRateDecay` : learning rate decay
11 | - `config.beta1` : first moment coefficient
12 | - `config.beta2` : second moment coefficient
13 | - `config.epsilon` : for numerical stability
14 | - `config.weightDecay` : weight decay
15 | - `state` : a table describing the state of the optimizer; after each
16 | call the state is modified
17 |
18 | RETURN:
19 | - `x` : the new x vector
20 | - `f(x)` : the function, evaluated before the update
21 |
22 | ]]
23 |
24 | function optim.adam(opfunc, x, config, state)
25 | -- (0) get/update state
26 | local config = config or {}
27 | local state = state or config
28 | local lr = config.learningRate or 0.001
29 | local lrd = config.learningRateDecay or 0
30 |
31 | local beta1 = config.beta1 or 0.9
32 | local beta2 = config.beta2 or 0.999
33 | local epsilon = config.epsilon or 1e-8
34 | local wd = config.weightDecay or 0
35 |
36 | -- (1) evaluate f(x) and df/dx
37 | local fx, dfdx = opfunc(x)
38 |
39 | -- (2) weight decay
40 | if wd ~= 0 then
41 | dfdx:add(wd, x)
42 | end
43 |
44 | -- Initialization
45 | state.t = state.t or 0
46 | -- Exponential moving average of gradient values
47 | state.m = state.m or x.new(dfdx:size()):zero()
48 | -- Exponential moving average of squared gradient values
49 | state.v = state.v or x.new(dfdx:size()):zero()
50 | -- A tmp tensor to hold the sqrt(v) + epsilon
51 | state.denom = state.denom or x.new(dfdx:size()):zero()
52 |
53 | -- (3) learning rate decay (annealing)
54 | local clr = lr / (1 + state.t*lrd)
55 |
56 | state.t = state.t + 1
57 |
58 | -- Decay the first and second moment running average coefficient
59 | state.m:mul(beta1):add(1-beta1, dfdx)
60 | state.v:mul(beta2):addcmul(1-beta2, dfdx, dfdx)
61 |
62 | state.denom:copy(state.v):sqrt():add(epsilon)
63 |
64 | local biasCorrection1 = 1 - beta1^state.t
65 | local biasCorrection2 = 1 - beta2^state.t
66 | local stepSize = clr * math.sqrt(biasCorrection2)/biasCorrection1
67 | -- (4) update x
68 | x:addcdiv(-stepSize, state.m, state.denom)
69 |
70 | -- return x*, f(x) before optimization
71 | return x, {fx}
72 | end
73 |
--------------------------------------------------------------------------------
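
For reference, the update performed by the code above, element-wise, with `g_t` the gradient, `t` the value of `state.t` after the increment and `n = t - 1` used in the learning-rate decay; the bias corrections are folded into the step size exactly as in the code:

```latex
\begin{aligned}
m_t &= \beta_1\, m_{t-1} + (1-\beta_1)\, g_t \\
v_t &= \beta_2\, v_{t-1} + (1-\beta_2)\, g_t^2 \\
x_{t+1} &= x_t - \frac{lr}{1 + n \cdot lrd} \cdot \frac{\sqrt{1-\beta_2^{\,t}}}{1-\beta_1^{\,t}} \cdot \frac{m_t}{\sqrt{v_t} + \epsilon}
\end{aligned}
```
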
/doc/logger.md:
--------------------------------------------------------------------------------
1 |
2 | # Logger
3 |
4 | `optim` also provides logging and live plotting capabilities via the `optim.Logger()` function.
5 |
6 | Live logging is essential to monitor the *network accuracy* and *cost function* during training and testing, for spotting *under-* and *over-fitting*, for *early stopping* or just for monitoring the health of the current optimisation task.
7 |
8 |
9 | ## Logging data
10 |
11 | Let's walk through an example to see how it works.
12 |
13 | We start by initialising our logger, connected to the text file `accuracy.log`.
14 |
15 | ```lua
16 | logger = optim.Logger('accuracy.log')
17 | ```
18 |
19 | We can decide to log on it, for example, *training* and *testing accuracies*.
20 |
21 | ```lua
22 | logger:setNames{'Training acc.', 'Test acc.'}
23 | ```
24 |
25 | And now we can populate our logger randomly.
26 |
27 | ```lua
28 | for i = 1, 10 do
29 | trainAcc = math.random(0, 100)
30 | testAcc = math.random(0, 100)
31 | logger:add{trainAcc, testAcc}
32 | end
33 | ```
34 |
35 | We can `cat` `accuracy.log` and see what's in it.
36 |
37 | ```
38 | Training acc. Test acc.
39 | 7.0000e+01 5.9000e+01
40 | 7.6000e+01 8.0000e+00
41 | 6.6000e+01 3.4000e+01
42 | 7.4000e+01 4.3000e+01
43 | 5.7000e+01 1.1000e+01
44 | 5.0000e+00 9.8000e+01
45 | 7.1000e+01 1.7000e+01
46 | 9.8000e+01 2.7000e+01
47 | 3.5000e+01 4.7000e+01
48 | 6.8000e+01 5.8000e+01
49 | ```
50 |
51 | ## Visualising logs
52 |
53 | OK, cool, but how can we actually see what's going on?
54 |
55 | To have a better grasp of what's happening, we can plot our curves.
56 | We first need to specify the plotting style, choosing from:
57 |
58 | * `.` for dots
59 | * `+` for points
60 | * `-` for lines
61 | * `+-` for points and lines
62 | * `~` for using smoothed lines with cubic interpolation
63 | * `|` for using boxes
64 | * a custom string, to access the full capability of gnuplot.
65 |
66 | ```lua
67 | logger:style{'+-', '+-'}
68 | logger:plot()
69 | ```
70 |
71 | 
72 |
73 | If we'd like an interactive visualisation, we can put the `logger:plot()` instruction within the `for` loop, and the chart will be updated at every iteration.
74 |
75 | In case we'd like to prevent `gnuplot` from displaying the plots, we can set the option `logger:display(false)`.
76 | In this way, plots will be saved but not displayed.
77 | To restore the normal behaviour, use `logger:display(true)`.
78 |
79 | We can set a logarithmic *y* axis with `logger:setlogscale(true)` and reset it with `logger:setlogscale(false)`.
80 |
--------------------------------------------------------------------------------
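
The live-plotting pattern described at the end of the document (style once, then call `logger:plot()` inside the training loop) looks roughly as follows; the accuracy values are random placeholders:

```lua
require 'optim'

local logger = optim.Logger('accuracy.log')
logger:setNames{'Training acc.', 'Test acc.'}
logger:style{'+-', '+-'}
-- logger:setlogscale(true)   -- optional: logarithmic y axis
-- logger:display(false)      -- optional: save plots without showing them

for i = 1, 10 do
   local trainAcc = math.random(0, 100)
   local testAcc = math.random(0, 100)
   logger:add{trainAcc, testAcc}
   logger:plot()               -- chart is refreshed at every iteration
end
```
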
/nag.lua:
--------------------------------------------------------------------------------
1 | ----------------------------------------------------------------------
2 | -- An implementation of SGD adapted with features of Nesterov's
3 | -- Accelerated Gradient method, based on the paper
4 | -- On the Importance of Initialization and Momentum in Deep Learning
5 | -- Sutskever et al., ICML 2013
6 | --
7 | -- ARGS:
8 | -- opfunc : a function that takes a single input (X), the point of
9 | -- evaluation, and returns f(X) and df/dX
10 | -- x : the initial point
11 | -- state : a table describing the state of the optimizer; after each
12 | -- call the state is modified
13 | -- state.learningRate : learning rate
14 | -- state.learningRateDecay : learning rate decay
15 | -- state.weightDecay : weight decay
16 | -- state.momentum : momentum
17 | -- state.learningRates : vector of individual learning rates
18 | --
19 | -- RETURN:
20 | -- x : the new x vector
21 | -- f(x) : the function, evaluated before the update
22 | --
23 | -- (Dilip Krishnan, 2013)
24 | --
25 |
26 | function optim.nag(opfunc, x, config, state)
27 | -- (0) get/update state
28 | local config = config or {}
29 | local state = state or config
30 | local lr = config.learningRate or 1e-3
31 | local lrd = config.learningRateDecay or 0
32 | local wd = config.weightDecay or 0
33 | local mom = config.momentum or 0.9
34 | local damp = config.dampening or mom
35 | local lrs = config.learningRates
36 | state.evalCounter = state.evalCounter or 0
37 | local nevals = state.evalCounter
38 |
39 | if mom <= 0 then
40 | error('Momentum must be positive for Nesterov Accelerated Gradient')
41 | end
42 |
43 | -- (1) evaluate f(x) and df/dx
44 | -- first step in the direction of the momentum vector
45 |
46 | if state.dfdx then
47 | x:add(mom, state.dfdx)
48 | end
49 | -- then compute gradient at that point
50 | -- comment out the above line to get the original SGD
51 | local fx,dfdx = opfunc(x)
52 |
53 | -- (2) weight decay
54 | if wd ~= 0 then
55 | dfdx:add(wd, x)
56 | end
57 |
58 | -- (3) learning rate decay (annealing)
59 | local clr = lr / (1 + nevals*lrd)
60 |
61 | -- (4) apply momentum
62 | if not state.dfdx then
63 | state.dfdx = torch.Tensor():typeAs(dfdx):resizeAs(dfdx):fill(0)
64 | else
65 | state.dfdx:mul(mom)
66 | end
67 |
68 | -- (5) parameter update with single or individual learning rates
69 | if lrs then
70 | if not state.deltaParameters then
71 | state.deltaParameters = torch.Tensor():typeAs(x):resizeAs(dfdx)
72 | end
73 | state.deltaParameters:copy(lrs):cmul(dfdx)
74 | x:add(-clr, state.deltaParameters)
75 | state.dfdx:add(-clr, state.deltaParameters)
76 | else
77 | x:add(-clr, dfdx)
78 | state.dfdx:add(-clr, dfdx)
79 | end
80 |
81 | -- (6) update evaluation counter
82 | state.evalCounter = state.evalCounter + 1
83 |
84 | -- return x, f(x) before optimization
85 | return x,{fx}
86 | end
87 |
--------------------------------------------------------------------------------
/sgd.lua:
--------------------------------------------------------------------------------
1 | --[[ A plain implementation of SGD
2 |
3 | ARGS:
4 |
5 | - `opfunc` : a function that takes a single input (X), the point
6 | of evaluation, and returns f(X) and df/dX
7 | - `x` : the initial point
8 | - `config` : a table with configuration parameters for the optimizer
9 | - `config.learningRate` : learning rate
10 | - `config.learningRateDecay` : learning rate decay
11 | - `config.weightDecay` : weight decay
12 | - `config.weightDecays` : vector of individual weight decays
13 | - `config.momentum` : momentum
14 | - `config.dampening` : dampening for momentum
15 | - `config.nesterov` : enables Nesterov momentum
16 | - `config.learningRates` : vector of individual learning rates
17 | - `state` : a table describing the state of the optimizer; after each
18 | call the state is modified
19 | - `state.evalCounter` : evaluation counter (optional: 0, by default)
20 |
21 | RETURN:
22 | - `x` : the new x vector
23 | - `f(x)` : the function, evaluated before the update
24 |
25 | (Clement Farabet, 2012)
26 | ]]
27 | function optim.sgd(opfunc, x, config, state)
28 | -- (0) get/update state
29 | local config = config or {}
30 | local state = state or config
31 | local lr = config.learningRate or 1e-3
32 | local lrd = config.learningRateDecay or 0
33 | local wd = config.weightDecay or 0
34 | local mom = config.momentum or 0
35 | local damp = config.dampening or mom
36 | local nesterov = config.nesterov or false
37 | local lrs = config.learningRates
38 | local wds = config.weightDecays
39 | state.evalCounter = state.evalCounter or 0
40 | local nevals = state.evalCounter
41 | assert(not nesterov or (mom > 0 and damp == 0), "Nesterov momentum requires a momentum and zero dampening")
42 |
43 | -- (1) evaluate f(x) and df/dx
44 | local fx,dfdx = opfunc(x)
45 |
46 | -- (2) weight decay with single or individual parameters
47 | if wd ~= 0 then
48 | dfdx:add(wd, x)
49 | elseif wds then
50 | if not state.decayParameters then
51 | state.decayParameters = torch.Tensor():typeAs(x):resizeAs(dfdx)
52 | end
53 | state.decayParameters:copy(wds):cmul(x)
54 | dfdx:add(state.decayParameters)
55 | end
56 |
57 | -- (3) apply momentum
58 | if mom ~= 0 then
59 | if not state.dfdx then
60 | state.dfdx = torch.Tensor():typeAs(dfdx):resizeAs(dfdx):copy(dfdx)
61 | else
62 | state.dfdx:mul(mom):add(1-damp, dfdx)
63 | end
64 | if nesterov then
65 | dfdx:add(mom, state.dfdx)
66 | else
67 | dfdx = state.dfdx
68 | end
69 | end
70 |
71 | -- (4) learning rate decay (annealing)
72 | local clr = lr / (1 + nevals*lrd)
73 |
74 | -- (5) parameter update with single or individual learning rates
75 | if lrs then
76 | if not state.deltaParameters then
77 | state.deltaParameters = torch.Tensor():typeAs(x):resizeAs(dfdx)
78 | end
79 | state.deltaParameters:copy(lrs):cmul(dfdx)
80 | x:add(-clr, state.deltaParameters)
81 | else
82 | x:add(-clr, dfdx)
83 | end
84 |
85 | -- (6) update evaluation counter
86 | state.evalCounter = state.evalCounter + 1
87 |
88 | -- return x*, f(x) before optimization
89 | return x,{fx}
90 | end
91 |
--------------------------------------------------------------------------------
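
A small configuration sketch for the momentum options documented above; the assert in the code requires `momentum > 0` and `dampening == 0` whenever `nesterov` is enabled. The Rosenbrock test objective is used purely as an example:

```lua
require 'torch'
require 'optim'
require 'rosenbrock'   -- test objective, see test/rosenbrock.lua

local x = torch.zeros(2)
local config = {
   learningRate = 1e-3,
   momentum     = 0.9,
   dampening    = 0,      -- required to be 0 for Nesterov momentum
   nesterov     = true,
}
for i = 1, 1000 do
   x, fx = optim.sgd(rosenbrock, x, config)
end
print('f(x) after 1000 updates =', fx[1])
```
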
/de.lua:
--------------------------------------------------------------------------------
1 | --[[ An implementation of `DE` (Differential Evolution),
2 |
3 | ARGS:
4 |
5 | -`opfunc` : a function that takes a single input (X), the point of
6 | evaluation, and returns f(X) and df/dX. Note that df/dX is not used
7 | -`x` : the initial point
8 | -`config.popsize`: population size. If this is left empty, 10*D will be used
9 | -`config.scaleFactor`: float, usually between 0.4 and 1
10 | -`config.crossoverRate`: float, usually between 0.1 and 0.9
11 | -`config.maxFEs`: int, maximal number of function evaluations
12 |
13 | RETURN:
14 | - `x*` : the new `x` vector, at the optimal point
15 | - `f` : a table of all function values:
16 | `f[1]` is the value of the function before any optimization and
17 | `f[#f]` is the final fully optimized value, at `x*`
18 | ]]
19 |
20 | require 'torch'
21 |
22 | function optim.de(opfunc, x, config, state)
23 | -- process input parameters
24 | local config = config or {}
25 | local state = state
26 | local popsize = config.popsize -- population size
27 | local scaleFactor = config.scaleFactor -- scale factor
28 | local crossoverRate = config.crossoverRate -- crossover rate
29 | local maxFEs = tonumber(config.maxFEs) -- maximal number of function evaluations
30 | local maxRegion = config.maxRegion -- upper bound of search region
31 | local minRegion = config.minRegion -- lower bound of search region
32 | local xmean = x:clone():view(-1) -- distribution mean, a flattened copy
33 | local D = xmean:size(1) -- number of objective variables/problem dimension
34 |
35 | if config.popsize == nil then
36 | popsize = 10 * D
37 | end
38 | if config.maxRegion == nil then
39 | maxRegion = 30
40 | end
41 | if config.minRegion == nil then
42 | minRegion = -30
43 | end
44 |
45 | -- Initialize population
46 | local fx = x.new(maxFEs)
47 | local pop = x.new(popsize, D)
48 | local children = x.new(popsize, D)
49 | local fitness = x.new(popsize)
50 | local children_fitness = x.new(popsize)
51 | local fes = 1 -- number of function evaluations
52 | local best_fitness
53 | local best_solution = x.new(D)
54 |
55 | -- Initialize the population and evaluate its fitness values
56 | local gen = torch.Generator()
57 | torch.manualSeed(gen, 1)
58 |
59 | pop:uniform(gen, minRegion, maxRegion)
60 | for i = 1, popsize do
61 | fitness[i] = opfunc(pop[i])
62 | fx[fes] = fitness[i]
63 | fes = fes + 1
64 | end
65 |
66 | -- Find the best solution
67 | local index
68 | best_fitness, index = fitness:min(1)
69 | best_fitness = best_fitness[1]
70 | index = index[1]
71 | best_solution:copy(pop[index])
72 |
73 | -- Main loop
74 | while fes < maxFEs do
75 | local r1, r2
76 | for i = 1, popsize do
77 | repeat
78 | r1 = torch.random(gen, 1, popsize)
79 | until(r1 ~= i)
80 | repeat
81 | r2 = torch.random(gen, 1, popsize)
82 | until(r2 ~= r1 and r2 ~= i)
83 |
84 | local jrand = torch.random(gen, 1, D)
85 | for j = 1, D do
86 | if torch.uniform(gen, 0, 1) < crossoverRate or i == jrand then
87 | children[i][j] = best_solution[j] + scaleFactor * (pop[r1][j] - pop[r2][j])
88 | else
89 | children[i][j] = pop[i][j]
90 | end
91 | end
92 | children_fitness[i] = opfunc(children[i])
93 | fx[fes] = children_fitness[i]
94 | fes = fes + 1
95 | end
96 |
97 | for i = 1, popsize do
98 | if children_fitness[i] <= fitness[i] then
99 | pop[i]:copy(children[i])
100 | fitness[i] = children_fitness[i]
101 | if fitness[i] < best_fitness then
102 | best_fitness = fitness[i]
103 | best_solution:copy(children[i])
104 | end
105 | end
106 | end
107 | end
108 | return best_solution, fx
109 | end
110 |
--------------------------------------------------------------------------------
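
The main loop above amounts to the classic DE/best/1/bin scheme: for each member `x_i`, two distinct partners `r1, r2 ~= i` and a random dimension `jrand` are drawn, and the trial vector is

```latex
u_{i,j} =
\begin{cases}
\mathrm{best}_j + F\,\bigl(x_{r_1,j} - x_{r_2,j}\bigr) & \text{if } \mathrm{rand}() < CR \ \text{or}\ j = j_{\mathrm{rand}} \\
x_{i,j} & \text{otherwise}
\end{cases}
```

with `F = scaleFactor` and `CR = crossoverRate`; the trial `u_i` replaces `x_i` whenever `f(u_i) <= f(x_i)`.
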
/rprop.lua:
--------------------------------------------------------------------------------
1 | --[[ A plain implementation of RPROP
2 |
3 | ARGS:
4 | - `opfunc` : a function that takes a single input (X), the point of
5 | evaluation, and returns f(X) and df/dX
6 | - `x` : the initial point
7 | - `state` : a table describing the state of the optimizer; after each
8 | call the state is modified
9 | - `state.stepsize` : initial step size, common to all components
10 | - `state.etaplus` : multiplicative increase factor, > 1 (default 1.2)
11 | - `state.etaminus` : multiplicative decrease factor, < 1 (default 0.5)
12 | - `state.stepsizemax` : maximum stepsize allowed (default 50)
13 | - `state.stepsizemin` : minimum stepsize allowed (default 1e-6)
14 | - `state.niter` : number of iterations (default 1)
15 |
16 | RETURN:
17 | - `x` : the new x vector
18 | - `f(x)` : the function, evaluated before the update
19 |
20 | (Martin Riedmiller, Koray Kavukcuoglu 2013)
21 | --]]
22 | function optim.rprop(opfunc, x, config, state)
23 | if config == nil and state == nil then
24 | print('no state table RPROP initializing')
25 | end
26 | -- (0) get/update state
27 | local config = config or {}
28 | local state = state or config
29 | local stepsize = config.stepsize or 0.1
30 | local etaplus = config.etaplus or 1.2
31 | local etaminus = config.etaminus or 0.5
32 | local stepsizemax = config.stepsizemax or 50.0
33 | local stepsizemin = config.stepsizemin or 1E-06
34 | local niter = config.niter or 1
35 |
36 | local hfx = {}
37 |
38 | for i=1,niter do
39 |
40 | -- (1) evaluate f(x) and df/dx
41 | local fx,dfdx = opfunc(x)
42 |
43 | -- init temp storage
44 | if not state.delta then
45 | state.delta = dfdx.new(dfdx:size()):zero()
46 | state.stepsize = dfdx.new(dfdx:size()):fill(stepsize)
47 | state.sign = dfdx.new(dfdx:size())
48 | state.psign = torch.ByteTensor(dfdx:size())
49 | state.nsign = torch.ByteTensor(dfdx:size())
50 | state.zsign = torch.ByteTensor(dfdx:size())
51 | state.dminmax = torch.ByteTensor(dfdx:size())
52 | if torch.type(x)=='torch.CudaTensor' then
53 | -- Push to GPU
54 | state.psign = state.psign:cuda()
55 | state.nsign = state.nsign:cuda()
56 | state.zsign = state.zsign:cuda()
57 | state.dminmax = state.dminmax:cuda()
58 | end
59 | end
60 |
61 | -- sign of derivative from last step to this one
62 | torch.cmul(state.sign, dfdx, state.delta)
63 | torch.sign(state.sign, state.sign)
64 |
65 | -- get indices of >0, <0 and ==0 entries
66 | state.sign.gt(state.psign, state.sign, 0)
67 | state.sign.lt(state.nsign, state.sign, 0)
68 | state.sign.eq(state.zsign, state.sign, 0)
69 |
70 | -- get step size updates
71 | state.sign[state.psign] = etaplus
72 | state.sign[state.nsign] = etaminus
73 | state.sign[state.zsign] = 1
74 |
75 | -- update stepsizes with step size updates
76 | state.stepsize:cmul(state.sign)
77 |
78 | -- threshold step sizes
79 | -- >50 => 50
80 | state.stepsize.gt(state.dminmax, state.stepsize, stepsizemax)
81 | state.stepsize[state.dminmax] = stepsizemax
82 | -- <1e-6 ==> 1e-6
83 | state.stepsize.lt(state.dminmax, state.stepsize, stepsizemin)
84 | state.stepsize[state.dminmax] = stepsizemin
85 |
86 | -- for dir<0, dfdx=0
87 | -- for dir>=0 dfdx=dfdx
88 | dfdx[state.nsign] = 0
89 | -- state.sign = sign(dfdx)
90 | torch.sign(state.sign,dfdx)
91 |
92 | -- update weights
93 | x:addcmul(-1,state.sign,state.stepsize)
94 |
95 | -- update state.dfdx with current dfdx
96 | state.delta:copy(dfdx)
97 |
98 | table.insert(hfx,fx)
99 | end
100 |
101 | -- return x*, and a table of f(x) values from each iteration
102 | return x,hfx
103 | end
104 |
--------------------------------------------------------------------------------
/test/sparsecoding.lua:
--------------------------------------------------------------------------------
1 | require 'kex'
2 |
3 | -- L1 FISTA Solution
4 | -- L1 solution with a linear dictionary ||Ax-b||^2 + \lambda ||x||_1
5 | -- D : dictionary, each column is a dictionary element
6 | -- params: set of params to pass to FISTA and possibly temp allocation (**optional**)
7 | -- check the optim.FistaLS function for details.
8 | -- returns fista : a table with the following entries
9 | -- fista.run(x,lambda) : run L1 sparse coding algorithm with input x and lambda.
10 | -- The following entries will be allocated and reused by each call to fista.run(x,lambda)
11 | -- fista.reconstruction: reconstructed input.
12 | -- fista.gradf : gradient of L2 part of the problem wrt x
13 | -- fista.code : the solution of L1 problem
14 | -- The following entries just point to data passed to fista.run(x)
15 | -- fista.input : points to the tensor 'x' used in the last fista.run(x,lambda)
16 | -- fista.lambda : the lambda value used in the last fista.run(x,lambda)
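-- Example usage (a minimal sketch; the dictionary size and lambda below are arbitrary):
--
--    local D = torch.randn(64, 128)            -- 64-dimensional inputs, 128 dictionary elements
--    local fista = optim.FistaL1(D, {maxiter = 100})
--    local x = torch.randn(64)                 -- a single input vector
--    local code, history = fista.run(x, 0.5)   -- sparse code minimizing ||D*code - x||^2 + 0.5*||code||_1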
17 | function optim.FistaL1(D, params)
18 |
19 | -- this is for keeping parameters related to fista algorithm
20 | local params = params or {}
21 | -- this is for temporary variables and such
22 | local fista = {}
23 |
24 | -- related to FISTA
25 | params.L = params.L or 0.1
26 | params.Lstep = params.Lstep or 1.5
27 | params.maxiter = params.maxiter or 50
28 | params.maxline = params.maxline or 20
29 | params.errthres = params.errthres or 1e-4
30 |
31 | -- temporary stuff that might be good to keep around
32 | fista.reconstruction = torch.Tensor()
33 | fista.gradf = torch.Tensor()
34 | fista.gradg = torch.Tensor()
35 | fista.code = torch.Tensor()
36 |
37 | -- these will be assigned in run(x)
38 | -- fista.input points to the last input that was run
39 | -- fista.lambda is the lambda value from the last run
40 | fista.input = nil
41 | fista.lambda = nil
42 |
43 | -- CREATE FUNCTION CLOSURES
44 | -- smooth function
45 | fista.f = function (x,mode)
46 |
47 | local reconstruction = fista.reconstruction
48 | local input = fista.input
49 | -- -------------------
50 | -- function evaluation
51 | if x:dim() == 1 then
52 | --print(D:size(),x:size())
53 | reconstruction:resize(D:size(1))
54 | reconstruction:addmv(0,1,D,x)
55 | elseif x:dim() == 2 then
56 | reconstruction:resize(x:size(1),D:size(1))
57 | reconstruction:addmm(0,1,x,D:t())
58 | end
59 | local fval = input:dist(reconstruction)^2
60 |
61 | -- ----------------------
62 | -- derivative calculation
63 | if mode and mode:match('dx') then
64 | local gradf = fista.gradf
65 | reconstruction:add(-1,input):mul(2)
66 | gradf:resizeAs(x)
67 | if input:dim() == 1 then
68 | gradf:addmv(0,1,D:t(),reconstruction)
69 | else
70 | gradf:addmm(0,1,reconstruction, D)
71 | end
72 | ---------------------------------------
73 | -- return function value and derivative
74 | return fval, gradf, reconstruction
75 | end
76 |
77 | ------------------------
78 | -- return function value
79 | return fval, reconstruction
80 | end
81 |
82 | -- non-smooth function L1
83 | fista.g = function (x, mode)
84 |
85 | local fval = fista.lambda*x:norm(1)
86 |
87 | if mode and mode:match('dx') then
88 | local gradg = fista.gradg
89 | gradg:resizeAs(x):copy(x)
90 | gradg:sign():mul(fista.lambda)
91 | return fval,gradg
92 | end
93 | return fval
94 | end
95 |
96 | -- argmin_x Q(x,y), just shrinkage for L1
97 | fista.pl = function (x,L)
98 | x:shrinkage(fista.lambda/L)
99 | end
100 |
101 | fista.run = function(x, lam, codeinit)
102 | local code = fista.code
103 | fista.input = x
104 | fista.lambda = lam
105 |
106 | -- resize code, maybe a different number of dimensions
107 | -- fill with zeros, initial point
108 | if codeinit then
109 | code:resizeAs(codeinit)
110 | code:copy(codeinit)
111 | else
112 | if x:dim() == 1 then
113 | code:resize(D:size(2))
114 | elseif x:dim() == 2 then
115 | code:resize(x:size(1),D:size(2))
116 | else
117 | error(' I do not know how to handle ' .. x:dim() .. ' dimensional input')
118 | end
119 | code:fill(0)
120 | end
121 | -- return the result of the optim.FistaLS call.
122 | return optim.FistaLS(fista.f, fista.g, fista.pl, fista.code, params)
123 | end
124 |
125 | return fista
126 | end
127 |
128 |
--------------------------------------------------------------------------------
/Logger.lua:
--------------------------------------------------------------------------------
1 | --[[ Logger: a simple class to log symbols during training,
2 | and automate plot generation
3 |
4 | Example:
5 | logger = optim.Logger('somefile.log') -- file to save stuff
6 |
7 | for i = 1,N do -- log some symbols during
8 | train_error = ... -- training/testing
9 | test_error = ...
10 | logger:add{['training error'] = train_error,
11 | ['test error'] = test_error}
12 | end
13 |
14 | logger:style{['training error'] = '-', -- define styles for plots
15 | ['test error'] = '-'}
16 | logger:plot() -- and plot
17 |
18 | ---- OR ---
19 |
20 | logger = optim.Logger('somefile.log') -- file to save stuff
21 | logger:setNames{'training error', 'test error'}
22 |
23 | for i = 1,N do -- log some symbols during
24 | train_error = ... -- training/testing
25 | test_error = ...
26 | logger:add{train_error, test_error}
27 | end
28 |
29 | logger:style{'-', '-'} -- define styles for plots
30 | logger:plot() -- and plot
31 |
32 | -----------
33 |
34 | logger:setlogscale(true) -- enable logscale on Y-axis
35 | logger:plot() -- and plot
36 | ]]
37 | require 'xlua'
38 | local Logger = torch.class('optim.Logger')
39 |
40 | function Logger:__init(filename, timestamp)
41 | if filename then
42 | self.name = filename
43 | os.execute('mkdir ' .. (sys.uname() ~= 'windows' and '-p ' or '') .. ' "' .. paths.dirname(filename) .. '"')
44 | if timestamp then
45 | -- append timestamp to create unique log file
46 | filename = filename .. '-'..os.date("%Y_%m_%d_%X")
47 | end
48 | self.file = io.open(filename,'w')
49 | self.epsfile = self.name .. '.eps'
50 | else
51 | self.file = io.stdout
52 | self.name = 'stdout'
53 | print(' warning: no path provided, logging to std out')
54 | end
55 | self.empty = true
56 | self.symbols = {}
57 | self.styles = {}
58 | self.names = {}
59 | self.idx = {}
60 | self.figure = nil
61 | self.showPlot = true
62 | self.plotRawCmd = nil
63 | self.defaultStyle = '+'
64 | self.logscale = false
65 | end
66 |
67 | function Logger:setNames(names)
68 | self.names = names
69 | self.empty = false
70 | self.nsymbols = #names
71 | for k,key in pairs(names) do
72 | self.file:write(key .. '\t')
73 | self.symbols[k] = {}
74 | self.styles[k] = {self.defaultStyle}
75 | self.idx[key] = k
76 | end
77 | self.file:write('\n')
78 | self.file:flush()
79 | return self
80 | end
81 |
82 | function Logger:add(symbols)
83 | -- (1) first time ? print symbols' names on first row
84 | if self.empty then
85 | self.empty = false
86 | self.nsymbols = #symbols
87 | for k,val in pairs(symbols) do
88 | self.file:write(k .. '\t')
89 | self.symbols[k] = {}
90 | self.styles[k] = {self.defaultStyle}
91 | self.names[k] = k
92 | end
93 | self.idx = self.names
94 | self.file:write('\n')
95 | end
96 | -- (2) print all symbols on one row
97 | for k,val in pairs(symbols) do
98 | if type(val) == 'number' then
99 | self.file:write(string.format('%11.4e',val) .. '\t')
100 | elseif type(val) == 'string' then
101 | self.file:write(val .. '\t')
102 | else
103 | xlua.error('can only log numbers and strings', 'Logger')
104 | end
105 | end
106 | self.file:write('\n')
107 | self.file:flush()
108 | -- (3) save symbols in internal table
109 | for k,val in pairs(symbols) do
110 | table.insert(self.symbols[k], val)
111 | end
112 | end
113 |
114 | function Logger:style(symbols)
115 | for name,style in pairs(symbols) do
116 | if type(style) == 'string' then
117 | self.styles[name] = {style}
118 | elseif type(style) == 'table' then
119 | self.styles[name] = style
120 | else
121 | xlua.error('style should be a string or a table of strings','Logger')
122 | end
123 | end
124 | return self
125 | end
126 |
127 | function Logger:setlogscale(state)
128 | self.logscale = state
129 | end
130 |
131 | function Logger:display(state)
132 | self.showPlot = state
133 | end
134 |
135 | function Logger:plot(...)
136 | if not xlua.require('gnuplot') then
137 | if not self.warned then
138 | print(' warning: cannot plot with this version of Torch')
139 | self.warned = true
140 | end
141 | return
142 | end
143 | local plotit = false
144 | local plots = {}
145 | local plotsymbol =
146 | function(name,list)
147 | if #list > 1 then
148 | local nelts = #list
149 | local plot_y = torch.Tensor(nelts)
150 | for i = 1,nelts do
151 | plot_y[i] = list[i]
152 | end
153 | for _,style in ipairs(self.styles[name]) do
154 | table.insert(plots, {self.names[name], plot_y, style})
155 | end
156 | plotit = true
157 | end
158 | end
159 | local args = {...}
160 | if not args[1] then -- plot all symbols
161 | for name,list in pairs(self.symbols) do
162 | plotsymbol(name,list)
163 | end
164 | else -- plot given symbols
165 | for _,name in ipairs(args) do
166 | plotsymbol(self.idx[name], self.symbols[self.idx[name]])
167 | end
168 | end
169 | if plotit then
170 | if self.showPlot then
171 | self.figure = gnuplot.figure(self.figure)
172 | if self.logscale then gnuplot.logscale('on') end
173 | gnuplot.plot(plots)
174 | if self.plotRawCmd then gnuplot.raw(self.plotRawCmd) end
175 | gnuplot.grid('on')
176 | gnuplot.title('')
177 | end
178 | if self.epsfile then
179 | os.execute('rm -f "' .. self.epsfile .. '"')
180 | local epsfig = gnuplot.epsfigure(self.epsfile)
181 | if self.logscale then gnuplot.logscale('on') end
182 | gnuplot.plot(plots)
183 | if self.plotRawCmd then gnuplot.raw(self.plotRawCmd) end
184 | gnuplot.grid('on')
185 | gnuplot.title('')
186 | gnuplot.plotflush()
187 | gnuplot.close(epsfig)
188 | end
189 | end
190 | end
191 |
--------------------------------------------------------------------------------
/cg.lua:
--------------------------------------------------------------------------------
1 | --[[
2 |
3 | This cg implementation is a rewrite of minimize.m written by Carl
4 | E. Rasmussen. It is supposed to produce exactly the same results (give
5 | or take numerical accuracy due to some changed order of
6 | operations). You can compare the result on rosenbrock with minimize.m.
7 | http://www.gatsby.ucl.ac.uk/~edward/code/minimize/example.html
8 |
9 | [x fx c] = minimize([0 0]', 'rosenbrock', -25)
10 |
11 | Note that we limit only the number of function evaluations; this seems much
12 | more important in practical use.
13 |
14 | ARGS:
15 |
16 | - `opfunc` : a function that takes a single input (X), the point of evaluation, and returns f(X) and df/dX.
17 | - `x` : the initial point
18 | - `state` : a table of parameters and temporary allocations.
19 | - `state.maxEval` : max number of function evaluations
20 | - `state.maxIter` : max number of iterations
21 | - `state.df[0,1,2,3]` : if you pass torch.Tensors they will be used for temp storage
22 | - `state.[s,x0]` : if you pass torch.Tensors they will be used for temp storage
23 |
24 | RETURN:
25 |
26 | - `x*` : the new x vector, at the optimal point
27 | - `f` : a table of all function values where
28 | `f[1]` is the value of the function before any optimization and
29 | `f[#f]` is the final fully optimized value, at x*
30 |
31 | (Koray Kavukcuoglu, 2012)
32 | --]]
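-- Example usage (a minimal sketch; `opfunc` stands for any closure returning f(X) and df/dX):
--
--    local x = torch.Tensor(2):zero()
--    x, fs, nEval = optim.cg(opfunc, x, {maxIter = 25})
--    print(fs[1], fs[#fs])   -- f before and after optimization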
33 | function optim.cg(opfunc, x, config, state)
34 | -- parameters
35 | local config = config or {}
36 | local state = state or config
37 | local rho = config.rho or 0.01
38 | local sig = config.sig or 0.5
39 | local int = config.int or 0.1
40 | local ext = config.ext or 3.0
41 | local maxIter = config.maxIter or 20
42 | local ratio = config.ratio or 100
43 | local maxEval = config.maxEval or maxIter*1.25
44 | local red = 1
45 |
46 | local verbose = config.verbose or 0
47 |
48 | local i = 0
49 | local ls_failed = 0
50 | local fx = {}
51 |
52 | -- we need three points for the interpolation/extrapolation stuff
53 | local z1,z2,z3 = 0,0,0
54 | local d1,d2,d3 = 0,0,0
55 | local f1,f2,f3 = 0,0,0
56 |
57 | local df1 = state.df1 or x.new()
58 | local df2 = state.df2 or x.new()
59 | local df3 = state.df3 or x.new()
60 | local tdf
61 |
62 | df1:resizeAs(x)
63 | df2:resizeAs(x)
64 | df3:resizeAs(x)
65 |
66 | -- search direction
67 | local s = state.s or x.new()
68 | s:resizeAs(x)
69 |
70 | -- we need a temp storage for X
71 | local x0 = state.x0 or x.new()
72 | local f0 = 0
73 | local df0 = state.df0 or x.new()
74 | x0:resizeAs(x)
75 | df0:resizeAs(x)
76 |
77 | -- evaluate at initial point
78 | f1,tdf = opfunc(x)
79 | fx[#fx+1] = f1
80 | df1:copy(tdf)
81 | i=i+1
82 |
83 | -- initial search direction
84 | s:copy(df1):mul(-1)
85 |
86 | d1 = -s:dot(s) -- slope
87 | z1 = red/(1-d1) -- initial step
88 |
89 | while i < math.abs(maxEval) do
90 |
91 | x0:copy(x)
92 | f0 = f1
93 | df0:copy(df1)
94 |
95 | x:add(z1,s)
96 | f2,tdf = opfunc(x)
97 | df2:copy(tdf)
98 | i=i+1
99 | d2 = df2:dot(s)
100 | f3,d3,z3 = f1,d1,-z1 -- init point 3 equal to point 1
101 | local m = math.min(maxIter,maxEval-i)
102 | local success = 0
103 | local limit = -1
104 |
105 | while true do
106 | while (f2 > f1+z1*rho*d1 or d2 > -sig*d1) and m > 0 do
107 | limit = z1
108 | if f2 > f1 then
109 | z2 = z3 - (0.5*d3*z3*z3)/(d3*z3+f2-f3)
110 | else
111 | local A = 6*(f2-f3)/z3+3*(d2+d3)
112 | local B = 3*(f3-f2)-z3*(d3+2*d2)
113 | z2 = (math.sqrt(B*B-A*d2*z3*z3)-B)/A
114 | end
115 | if z2 ~= z2 or z2 == math.huge or z2 == -math.huge then
116 | z2 = z3/2;
117 | end
118 | z2 = math.max(math.min(z2, int*z3),(1-int)*z3);
119 | z1 = z1 + z2;
120 | x:add(z2,s)
121 | f2,tdf = opfunc(x)
122 | df2:copy(tdf)
123 | i=i+1
124 | m = m - 1
125 | d2 = df2:dot(s)
126 | z3 = z3-z2;
127 | end
128 | if f2 > f1+z1*rho*d1 or d2 > -sig*d1 then
129 | break
130 | elseif d2 > sig*d1 then
131 | success = 1;
132 | break;
133 | elseif m == 0 then
134 | break;
135 | end
136 | local A = 6*(f2-f3)/z3+3*(d2+d3);
137 | local B = 3*(f3-f2)-z3*(d3+2*d2);
138 | z2 = -d2*z3*z3/(B+math.sqrt(B*B-A*d2*z3*z3))
139 |
140 | if z2 ~= z2 or z2 == math.huge or z2 == -math.huge or z2 < 0 then
141 | if limit < -0.5 then
142 | z2 = z1 * (ext -1)
143 | else
144 | z2 = (limit-z1)/2
145 | end
146 | elseif (limit > -0.5) and (z2+z1) > limit then
147 | z2 = (limit-z1)/2
148 | elseif limit < -0.5 and (z2+z1) > z1*ext then
149 | z2 = z1*(ext-1)
150 | elseif z2 < -z3*int then
151 | z2 = -z3*int
152 | elseif limit > -0.5 and z2 < (limit-z1)*(1-int) then
153 | z2 = (limit-z1)*(1-int)
154 | end
155 | f3=f2; d3=d2; z3=-z2;
156 | z1 = z1+z2;
157 | x:add(z2,s)
158 |
159 | f2,tdf = opfunc(x)
160 | df2:copy(tdf)
161 | i=i+1
162 | m = m - 1
163 | d2 = df2:dot(s)
164 | end
165 | if success == 1 then
166 | f1 = f2
167 | fx[#fx+1] = f1;
168 | local ss = (df2:dot(df2)-df2:dot(df1)) / df1:dot(df1)
169 | s:mul(ss)
170 | s:add(-1,df2)
171 | local tmp = df1:clone()
172 | df1:copy(df2)
173 | df2:copy(tmp)
174 | d2 = df1:dot(s)
175 | if d2> 0 then
176 | s:copy(df1)
177 | s:mul(-1)
178 | d2 = -s:dot(s)
179 | end
180 |
181 | z1 = z1 * math.min(ratio, d1/(d2-1e-320))
182 | d1 = d2
183 | ls_failed = 0
184 | else
185 | x:copy(x0)
186 | f1 = f0
187 | df1:copy(df0)
188 | if ls_failed or i>maxEval then
189 | break
190 | end
191 | local tmp = df1:clone()
192 | df1:copy(df2)
193 | df2:copy(tmp)
194 | s:copy(df1)
195 | s:mul(-1)
196 | d1 = -s:dot(s)
197 | z1 = 1/(1-d1)
198 | ls_failed = 1
199 | end
200 | end
201 | state.df0 = df0
202 | state.df1 = df1
203 | state.df2 = df2
204 | state.df3 = df3
205 | state.x0 = x0
206 | state.s = s
207 | return x,fx,i
208 | end
209 |
--------------------------------------------------------------------------------
/lswolfe.lua:
--------------------------------------------------------------------------------
1 | --[[ A Line Search satisfying the Wolfe conditions
2 |
3 | ARGS:
4 | - `opfunc` : a function (the objective) that takes a single input (X),
5 | the point of evaluation, and returns f(X) and df/dX
6 | - `x` : initial point / starting location
7 | - `t` : initial step size
8 | - `d` : descent direction
9 | - `f` : initial function value
10 | - `g` : gradient at initial location
11 | - `gtd` : directional derivative at starting location
12 | - `options.c1` : sufficient decrease parameter
13 | - `options.c2` : curvature parameter
14 | - `options.tolX` : minimum allowable step length
15 | - `options.maxIter` : maximum nb of iterations
16 |
17 | RETURN:
18 | - `f` : function value at x+t*d
19 | - `g` : gradient value at x+t*d
20 | - `x` : the next x (=x+t*d)
21 | - `t` : the step length
22 | - `lsFuncEval` : the number of function evaluations
23 | ]]
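-- Example usage (a sketch): this line search is usually not called directly but plugged
-- into L-BFGS through its `lineSearch` option, e.g.
--
--    optim.lbfgs(opfunc, x, {maxIter = 100, lineSearch = optim.lswolfe})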
24 | function optim.lswolfe(opfunc,x,t,d,f,g,gtd,options)
25 | -- options
26 | options = options or {}
27 | local c1 = options.c1 or 1e-4
28 | local c2 = options.c2 or 0.9
29 | local tolX = options.tolX or 1e-9
30 | local maxIter = options.maxIter or 20
31 | local isverbose = options.verbose or false
32 |
33 | -- some shortcuts
34 | local abs = torch.abs
35 | local min = math.min
36 | local max = math.max
37 |
38 | -- verbose function
39 | local function verbose(...)
40 | if isverbose then print(' ', ...) end
41 | end
42 |
43 | -- evaluate objective and gradient using initial step
44 | local x_init = x:clone()
45 | x:add(t,d)
46 | local f_new,g_new = opfunc(x)
47 | local lsFuncEval = 1
48 | local gtd_new = g_new * d
49 |
50 | -- bracket an interval containing a point satisfying the Wolfe
51 | -- criteria
52 | local LSiter,t_prev,done = 0,0,false
53 | local f_prev,g_prev,gtd_prev = f,g:clone(),gtd
54 | local bracket,bracketFval,bracketGval
55 | while LSiter < maxIter do
56 | -- check conditions:
57 | if (f_new > (f + c1*t*gtd)) or (LSiter > 1 and f_new >= f_prev) then
58 | bracket = x.new{t_prev,t}
59 | bracketFval = x.new{f_prev,f_new}
60 | bracketGval = x.new(2,g_new:size(1))
61 | bracketGval[1] = g_prev
62 | bracketGval[2] = g_new
63 | break
64 |
65 | elseif abs(gtd_new) <= -c2*gtd then
66 | bracket = x.new{t}
67 | bracketFval = x.new{f_new}
68 | bracketGval = x.new(1,g_new:size(1))
69 | bracketGval[1] = g_new
70 | done = true
71 | break
72 |
73 | elseif gtd_new >= 0 then
74 | bracket = x.new{t_prev,t}
75 | bracketFval = x.new{f_prev,f_new}
76 | bracketGval = x.new(2,g_new:size(1))
77 | bracketGval[1] = g_prev
78 | bracketGval[2] = g_new
79 | break
80 |
81 | end
82 |
83 | -- interpolate:
84 | local tmp = t_prev
85 | t_prev = t
86 | local minStep = t + 0.01*(t-tmp)
87 | local maxStep = t*10
88 | t = optim.polyinterp(x.new{{tmp,f_prev,gtd_prev},
89 | {t,f_new,gtd_new}},
90 | minStep, maxStep)
91 |
92 | -- next step:
93 | f_prev = f_new
94 | g_prev = g_new:clone()
95 | gtd_prev = gtd_new
96 | x[{}] = x_init
97 | x:add(t,d)
98 | f_new,g_new = opfunc(x)
99 | lsFuncEval = lsFuncEval + 1
100 | gtd_new = g_new * d
101 | LSiter = LSiter + 1
102 | end
103 |
104 | -- reached max nb of iterations?
105 | if LSiter == maxIter then
106 | bracket = x.new{0,t}
107 | bracketFval = x.new{f,f_new}
108 | bracketGval = x.new(2,g_new:size(1))
109 | bracketGval[1] = g
110 | bracketGval[2] = g_new
111 | end
112 |
113 | -- zoom phase: we now have a point satisfying the criteria, or
114 | -- a bracket around it. We refine the bracket until we find the
115 | -- exact point satisfying the criteria
116 | local insufProgress = false
117 | local LOposRemoved = 0
118 | while not done and LSiter < maxIter do
119 | -- find high and low points in bracket
120 | local f_LO,LOpos = bracketFval:min(1)
121 | LOpos = LOpos[1] f_LO = f_LO[1]
122 | local HIpos = -LOpos+3
123 |
124 | -- compute new trial value
125 | t = optim.polyinterp(x.new{{bracket[1],bracketFval[1],bracketGval[1]*d},
126 | {bracket[2],bracketFval[2],bracketGval[2]*d}})
127 |
128 | -- test whether we are making sufficient progress
129 | if min(bracket:max()-t,t-bracket:min())/(bracket:max()-bracket:min()) < 0.1 then
130 | if insufProgress or t>=bracket:max() or t <= bracket:min() then
131 | if abs(t-bracket:max()) < abs(t-bracket:min()) then
132 | t = bracket:max()-0.1*(bracket:max()-bracket:min())
133 | else
134 | t = bracket:min()+0.1*(bracket:max()-bracket:min())
135 | end
136 | insufProgress = false
137 | else
138 | insufProgress = true
139 | end
140 | else
141 | insufProgress = false
142 | end
143 |
144 | -- Evaluate new point
145 | x[{}] = x_init
146 | x:add(t,d)
147 | f_new,g_new = opfunc(x)
148 | lsFuncEval = lsFuncEval + 1
149 | gtd_new = g_new * d
150 | LSiter = LSiter + 1
151 | if f_new > f + c1*t*gtd or f_new >= f_LO then
152 | -- Armijo condition not satisfied or not lower than lowest point
153 | bracket[HIpos] = t
154 | bracketFval[HIpos] = f_new
155 | bracketGval[HIpos] = g_new
156 | else
157 | if abs(gtd_new) <= - c2*gtd then
158 | -- Wolfe conditions satisfied
159 | done = true
160 | elseif gtd_new*(bracket[HIpos]-bracket[LOpos]) >= 0 then
161 | -- Old HI becomes new LO
162 | bracket[HIpos] = bracket[LOpos]
163 | bracketFval[HIpos] = bracketFval[LOpos]
164 | bracketGval[HIpos] = bracketGval[LOpos]
165 | end
166 | -- New point becomes new LO
167 | bracket[LOpos] = t
168 | bracketFval[LOpos] = f_new
169 | bracketGval[LOpos] = g_new
170 | end
171 |
172 | -- done?
173 | if not done and abs((bracket[1]-bracket[2])*gtd_new) < tolX then
174 | break
175 | end
176 | end
177 |
178 | -- be verbose
179 | if LSiter == maxIter then
180 | verbose('reached max number of iterations')
181 | end
182 |
183 | -- return stuff
184 | local _,LOpos = bracketFval:min(1)
185 | LOpos = LOpos[1]
186 | t = bracket[LOpos]
187 | f_new = bracketFval[LOpos]
188 | g_new = bracketGval[LOpos]
189 | x[{}] = x_init
190 | x:add(t,d)
191 | return f_new,g_new,x,t,lsFuncEval
192 | end
193 |
--------------------------------------------------------------------------------
/polyinterp.lua:
--------------------------------------------------------------------------------
1 | local function isreal(x)
2 | return x == x
3 | end
4 |
5 | local function isnan(x)
6 | return x ~= x
7 | end
8 |
9 | local function roots(c)
10 | local tol=1e-12
11 | c[torch.lt(torch.abs(c),tol)]=0
12 |
13 | local nonzero = torch.ne(c,0)
14 | if nonzero:max() == 0 then
15 | return 0
16 | end
17 |
18 | -- first non-zero
19 | local _,pos = torch.max(nonzero,1)
20 | pos = pos[1]
21 | c=c[{ {pos,-1} }]
22 |
23 | local nz = 0
24 | for i=c:size(1),1,-1 do
25 | if c[i] ~= 0 then
26 | break
27 | else
28 | nz = nz + 1
29 | end
30 | end
31 | c=c[{ {1,c:size(1)-nz} }]
32 |
33 | local n = c:size(1)-1
34 | if n == 1 then
35 | local e = c.new({{-c[2]/c[1], 0}})
36 | if nz > 0 then
37 | return torch.cat(e, c.new(nz, 2):zero(), 1)
38 | else
39 | return e
40 | end
41 | elseif n > 1 then
42 | local A = torch.diag(c.new(n-1):fill(1),-1)
43 | A[1] = -c[{ {2,n+1} }]/c[1];
44 | local e = torch.eig(A,'N')
45 | if nz > 0 then
46 | return torch.cat(e, c.new(nz,2):zero(), 1)
47 | else
48 | return e
49 | end
50 | else
51 | return c.new(nz,2):zero()
52 | end
53 | end
54 |
55 | local function real(x)
56 | if type(x) == 'number' then return x end
57 | return x[{ {} , 1}]
58 | end
59 |
60 | local function imag(x)
61 | if type(x) == 'number' then return 0 end
62 | if x:nDimension() == 1 then
63 | return x.new(x:size(1)):zero()
64 | else
65 | return x[{ {}, 2}]
66 | end
67 | end
68 |
69 | local function polyval(p,x)
70 | local pwr = p:size(1)
71 | if type(x) == 'number' then
72 | local val = 0
73 | p:apply(function(pc) pwr = pwr-1; val = val + pc*x^pwr; return pc end)
74 | return val
75 | else
76 | local val = x.new(x:size(1))
77 | p:apply(function(pc) pwr = pwr-1; val:add(pc,torch.pow(x,pwr)); return pc end)
78 | return val
79 | end
80 | end
81 |
82 | ----------------------------------------------------------------------
83 | -- Minimum of interpolating polynomial based on function and
84 | -- derivative values
85 | --
86 | -- ARGS:
87 | -- points : N triplets (x,f,g), must be a Tensor
88 | -- xminBound : min value that brackets the minimum (default: min of points)
89 | -- xmaxBound : max value that brackets the minimum (default: max of points)
90 | --
91 | -- RETURN:
92 | -- minPos : position of minimum
93 | --
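-- Example (a minimal sketch): cubic interpolation between two (x, f(x), f'(x)) triplets
--
--    local points = torch.Tensor{{0, 1.0, -1.0},   -- f(0) = 1,   f'(0) = -1
--                                {1, 0.5,  0.2}}   -- f(1) = 0.5, f'(1) = 0.2
--    local minPos = optim.polyinterp(points)       -- estimated minimizer in [0, 1]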
94 | function optim.polyinterp(points,xminBound,xmaxBound)
95 | -- locals
96 | local sqrt = torch.sqrt
97 | local mean = torch.mean
98 | local max = math.max
99 | local min = math.min
100 |
101 | -- nb of points / order of polynomial
102 | local nPoints = points:size(1)
103 | local order = nPoints*2-1
104 |
105 | -- returned values
106 | local minPos
107 |
108 | -- Code for most common case:
109 | -- + cubic interpolation of 2 points w/ function and derivative values for both
110 | -- + no xminBound/xmaxBound
111 | if nPoints == 2 and order == 3 and not xminBound and not xmaxBound then
112 | -- Solution in this case (where x2 is the farthest point):
113 | -- d1 = g1 + g2 - 3*(f1-f2)/(x1-x2);
114 | -- d2 = sqrt(d1^2 - g1*g2);
115 | -- minPos = x2 - (x2 - x1)*((g2 + d2 - d1)/(g2 - g1 + 2*d2));
116 | -- t_new = min(max(minPos,x1),x2);
117 | local minVal,minPos = points[{ {},1 }]:min(1)
118 | minVal = minVal[1] minPos = minPos[1]
119 | local notMinPos = -minPos+3;
120 |
121 | local d1 = points[{minPos,3}] + points[{notMinPos,3}]
122 | - 3*(points[{minPos,2}]-points[{notMinPos,2}])
123 | / (points[{minPos,1}]-points[{notMinPos,1}]);
124 | local d2 = sqrt(d1^2 - points[{minPos,3}]*points[{notMinPos,3}]);
125 |
126 | if isreal(d2) then -- isreal()
127 | local t = points[{notMinPos,1}] - (points[{notMinPos,1}]
128 | - points[{minPos,1}]) * ((points[{notMinPos,3}] + d2 - d1)
129 | / (points[{notMinPos,3}] - points[{minPos,3}] + 2*d2))
130 |
131 | minPos = min(max(t,points[{minPos,1}]),points[{notMinPos,1}])
132 | else
133 | minPos = mean(points[{{},1}])
134 | end
135 | return minPos
136 | end
137 |
138 | -- TODO: get the code below to work!
139 | --error(' extrapolation not implemented yet...')
140 |
141 | -- Compute Bounds of Interpolation Area
142 | local xmin = points[{{},1}]:min()
143 | local xmax = points[{{},1}]:max()
144 | xminBound = xminBound or xmin
145 | xmaxBound = xmaxBound or xmax
146 |
147 | -- Add constraints on function values
148 | local A = points.new(nPoints*2,order+1):zero()
149 | local b = points.new(nPoints*2,1):zero()
150 | for i = 1,nPoints do
151 | local constraint = points.new(order+1):zero()
152 | for j = order,0,-1 do
153 | constraint[order-j+1] = points[{i,1}]^j
154 | end
155 | A[i] = constraint
156 | b[i] = points[{i,2}]
157 | end
158 |
159 | -- Add constraints based on derivatives
160 | for i = 1,nPoints do
161 | local constraint = points.new(order+1):zero()
162 | for j = 1,order do
163 | constraint[j] = (order-j+1)*points[{i,1}]^(order-j)
164 | end
165 | A[nPoints+i] = constraint
166 | b[nPoints+i] = points[{i,3}]
167 | end
168 |
169 | -- Find interpolating polynomial
170 | local res = torch.gels(b,A)
171 | local params = res[{ {1,nPoints*2} }]:squeeze()
172 |
173 | params[torch.le(torch.abs(params),1e-12)]=0
174 |
175 | -- Compute Critical Points
176 | local dParams = points.new(order):zero();
177 | for i = 1,params:size(1)-1 do
178 | dParams[i] = params[i]*(order-i+1)
179 | end
180 |
181 | -- nan/inf?
182 | local nans = false
183 | if torch.ne(dParams,dParams):max() > 0 or torch.eq(dParams,math.huge):max() > 0 then
184 | nans = true
185 | end
186 |
187 | local cp = torch.cat(points.new{xminBound,xmaxBound},points[{{},1}])
188 | if not nans then
189 | local cproots = roots(dParams)
190 | local cpi = points.new(cp:size(1),2):zero()
191 | cpi[{ {1,cp:size(1)} , 1 }] = cp
192 | cp = torch.cat(cpi,cproots,1)
193 | end
194 |
195 | -- Test Critical Points
196 | local fmin = math.huge
197 | -- Default to Bisection if no critical points valid:
198 | minPos = (xminBound+xmaxBound)/2
199 | for i = 1,cp:size(1) do
200 | local xCP = cp[{ {i,i} , {} }]
201 | local ixCP = imag(xCP)[1]
202 | local rxCP = real(xCP)[1]
203 | if ixCP == 0 and rxCP >= xminBound and rxCP <= xmaxBound then
204 | local fCP = polyval(params,rxCP)
205 | if fCP < fmin then
206 | minPos = rxCP
207 | fmin = fCP
208 | end
209 | end
210 | end
211 | return minPos,fmin
212 | end
213 |
--------------------------------------------------------------------------------
/fista.lua:
--------------------------------------------------------------------------------
1 | --[[ FISTA with backtracking line search
2 |
3 | - `f` : smooth function
4 | - `g` : non-smooth function
5 | - `pl` : minimizer of intermediate problem Q(x,y)
6 | - `xinit` : initial point
7 | - `params` : table of parameters (**optional**)
8 | - `params.L` : 1/(step size) for ISTA/FISTA iteration (0.1)
9 | - `params.Lstep` : step size multiplier at each iteration (1.5)
10 | - `params.maxiter` : max number of iterations (50)
11 | - `params.maxline` : max number of line search iterations per iteration (20)
12 | - `params.errthres`: error threshold for convergence check (1e-4)
13 | - `params.doFistaUpdate` : true : use FISTA, false: use ISTA (true)
14 | - `params.verbose` : store each iteration solution and print detailed info (false)
15 |
16 | On output, `params` will contain these additional fields that can be reused.
17 |
18 | - `params.L` : last used L value will be written.
19 |
20 | These are temporary storages needed by the algorithm; if the same params object is
21 | passed a second time, the same storages will be reused without new allocation.
22 |
23 | - `params.xkm` : previous iteration point
24 | - `params.y` : fista iteration
25 | - `params.ply` : ply = pl(y - 1/L grad(f))
26 |
27 | Returns the solution x and a history of {function evals, number of line search steps, ...}
28 |
29 | Algorithm is published in
30 |
31 | @article{beck-fista-09,
32 | Author = {Beck, Amir and Teboulle, Marc},
33 | Journal = {SIAM J. Img. Sci.},
34 | Number = {1},
35 | Pages = {183--202},
36 | Title = {A Fast Iterative Shrinkage-Thresholding Algorithm for Linear Inverse Problems},
37 | Volume = {2},
38 | Year = {2009}}
39 | ]]
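-- Example (a minimal sketch): minimize 0.5*||x - b||^2 + lambda*||x||_1, where `b` and
-- `lambda` below are arbitrary placeholders:
--
--    local b, lambda = torch.randn(10), 0.1
--    local f = function(x, mode)                 -- smooth part and its gradient
--       local fval = 0.5 * x:dist(b)^2
--       if mode and mode:match('dx') then return fval, x - b end
--       return fval
--    end
--    local g = function(x) return lambda * x:norm(1) end
--    local pl = function(x, L)                   -- prox of lambda*||.||_1: soft-thresholding
--       local s = torch.abs(x):add(-lambda/L):clamp(0, math.huge)
--       x:sign():cmul(s)
--    end
--    local x0 = torch.zeros(10)
--    local xstar, history = optim.FistaLS(f, g, pl, x0, {maxiter = 100})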
40 | function optim.FistaLS(f, g, pl, xinit, params)
41 |
42 | local params = params or {}
43 | local L = params.L or 0.1
44 | local Lstep = params.Lstep or 1.5
45 | local maxiter = params.maxiter or 50
46 | local maxline = params.maxline or 20
47 | local errthres = params.errthres or 1e-4
48 | local doFistaUpdate = params.doFistaUpdate
49 | local verbose = params.verbose
50 |
51 | -- temporary allocations
52 | params.xkm = params.xkm or torch.Tensor()
53 | params.y = params.y or torch.Tensor()
54 | params.ply = params.ply or torch.Tensor()
55 | local xkm = params.xkm -- previous iteration
56 | local y = params.y -- fista iteration
57 | local ply = params.ply -- soft shrinked y
58 |
59 | -- we start from all zeros
60 | local xk = xinit
61 | xkm:resizeAs(xk):zero()
62 | ply:resizeAs(xk):zero()
63 | y:resizeAs(xk):zero()
64 |
65 | local history = {} -- keep track of stuff
66 | local niter = 0 -- number of iterations done
67 | local converged = false -- are we done?
68 | local tk = 1 -- momentum param for FISTA
69 | local tkp = 0
70 |
71 |
72 | local gy = g(y)
73 | local fval = math.huge -- fval = f+g
74 | while not converged and niter < maxiter do
75 |
76 | -- run through smooth function (code is input, input is target)
77 | -- get derivatives from smooth function
78 | local fy,gfy = f(y,'dx')
79 | --local gfy = f(y)
80 |
81 | local fply = 0
82 | local gply = 0
83 | local Q = 0
84 |
85 | ----------------------------------------------
86 | -- do line search to find new current location starting from fista loc
87 | local nline = 0
88 | local linesearchdone = false
89 | while not linesearchdone do
90 | -- take a step in gradient direction of smooth function
91 | ply:copy(y)
92 | ply:add(-1/L,gfy)
93 |
94 | -- and solve for minimum of auxiliary problem
95 | pl(ply,L)
96 | -- this is candidate for new current iteration
97 | xk:copy(ply)
98 |
99 | -- evaluate this point F(ply)
100 | fply = f(ply)
101 |
102 | -- ply - y
103 | ply:add(-1, y)
104 | --
105 | local Q2 = gfy:dot(ply)
106 | -- L/2 ||beta-y||^2
107 | local Q3 = L/2 * ply:dot(ply)
108 | -- Q(beta,y) = f(y) + <grad f(y), beta-y> + L/2 ||beta-y||^2
109 | Q = fy + Q2 + Q3
110 |
111 | if verbose then
112 | print(string.format('nline=%d L=%g fply=%g Q=%g fy=%g Q2=%g Q3=%g',nline,L,fply,Q,fy,Q2,Q3))
113 | end
114 | -- check if f(beta) <= Q(beta,y)
115 | if fply <= Q then --and Fply + Gply <= F then
116 | -- now evaluate G here
117 | linesearchdone = true
118 | elseif nline >= maxline then
119 | linesearchdone = true
120 | xk:copy(xkm) -- if we can't find a better point, current iter = previous iter
121 | --print('oops')
122 | else
123 | L = L * Lstep
124 | end
125 | nline = nline + 1
126 | end
127 | -- end line search
128 | ---------------------------------------------
129 |
130 | ---------------------------------------------
131 | -- FISTA
132 | ---------------------------------------------
133 | if doFistaUpdate then
134 | -- do the FISTA step
135 | tkp = (1 + math.sqrt(1 + 4*tk*tk)) / 2
136 | -- x(k-1) = x(k-1) - x(k)
137 | xkm:add(-1,xk)
138 | -- y(k+1) = x(k) + ((1-t(k))/t(k+1))*(x(k-1)-x(k))
139 | y:copy(xk)
140 | y:add( (1-tk)/tkp , xkm)
141 | -- store for next iterations
142 | -- x(k-1) = x(k)
143 | xkm:copy(xk)
144 | else
145 | y:copy(xk)
146 | end
147 | -- t(k) = t(k+1)
148 | tk = tkp
149 | fply = f(y)
150 | gply = g(y)
151 | if verbose then
152 | print(string.format('iter=%d eold=%g enew=%g',niter,fval,fply+gply))
153 | end
154 |
155 | niter = niter + 1
156 |
157 | -- bookkeeping
158 | fval = fply + gply
159 | history[niter] = {}
160 | history[niter].nline = nline
161 | history[niter].L = L
162 | history[niter].F = fval
163 | history[niter].Fply = fply
164 | history[niter].Gply = gply
165 | history[niter].Q = Q
166 | params.L = L
167 | if verbose then
168 | history[niter].xk = xk:clone()
169 | history[niter].y = y:clone()
170 | end
171 |
172 | -- are we done?
173 | if niter > 1 and math.abs(history[niter].F - history[niter-1].F) <= errthres then
174 | converged = true
175 | xinit:copy(y)
176 | return y,history
177 | end
178 |
179 | if niter >= maxiter then
180 | xinit:copy(y)
181 | return y,history
182 | end
183 |
184 | --if niter > 1 and history[niter].F > history[niter-1].F then
185 | --print(niter, 'This was supposed to be a convex function, we are going up')
186 | --converged = true
187 | --return xk,history
188 | --end
189 | end
190 | error('not supposed to be here')
191 | end
192 |
193 |
--------------------------------------------------------------------------------
/doc/intro.md:
--------------------------------------------------------------------------------
1 |
2 | # Overview
3 |
4 | Most optimization algorithms have the following interface:
5 |
6 | ```lua
7 | x*, {f}, ... = optim.method(opfunc, x[, config][, state])
8 | ```
9 |
10 | where:
11 |
12 | * `opfunc`: a user-defined closure that respects this API: `f, df/dx = func(x)`
13 | * `x`: the current parameter vector (a 1D `Tensor`)
14 | * `config`: a table of parameters, dependent upon the algorithm
15 | * `state`: a table of state variables, if `nil`, `config` will contain the state
16 | * `x*`: the new parameter vector that minimizes `f, x* = argmin_x f(x)`
17 | * `{f}`: a table of all `f` values, in the order they've been evaluated (for some simple algorithms, like SGD, `#f == 1`)
18 |
19 |
20 |
21 | ## Example
22 |
23 | The state table is used to hold the state of the algorithm.
24 | It's usually initialized once, by the user, and then passed to the optim function as a black box.
25 | Example:
26 |
27 | ```lua
28 | config = {
29 | learningRate = 1e-3,
30 | momentum = 0.5
31 | }
32 |
33 | for i, sample in ipairs(training_samples) do
34 | local func = function(x)
35 | -- define eval function
36 | return f, df_dx
37 | end
38 | optim.sgd(func, x, config)
39 | end
40 | ```
41 |
42 |
43 |
44 | ## Training using optim ##
45 |
46 | `optim` is a quite general optimizer, for minimizing any function with respect to a set of parameters.
47 | In our case, our function will be the loss of our network, given an input, and a set of weights.
48 | The goal of training a neural net is to optimize the weights to give the lowest loss over our validation set, by using the training set as a proxy.
49 | So, we are going to use optim to minimize the loss with respect to the weights, over our training set.
50 |
51 | To illustrate all the steps required, we will go over a simple example, where we will train a neural network on the classical XOR problem.
52 | We will feed the data to `optim` in minibatches (here, just a single minibatch), breaking the training set into chunks and feeding each minibatch to `optim`, one by one.
53 |
54 | We need to give `optim` a function that, given the current weights as its argument, outputs the loss and the
55 | derivative of the loss with respect to the weights.
56 | The function will have access to our training minibatch, and use it to calculate the loss for this minibatch.
57 | Typically, the function is defined inside our loop over batches, and therefore has access to the current minibatch data.
58 |
59 |
60 | ### Neural Network ###
61 |
62 | We create a simple neural network with one hidden layer.
63 |
64 | ```lua
65 | require 'nn'
66 |
67 | model = nn.Sequential() -- make a multi-layer perceptron
68 | inputs = 2; outputs = 1; HUs = 20 -- parameters
69 | model:add(nn.Linear(inputs, HUs))
70 | model:add(nn.Tanh())
71 | model:add(nn.Linear(HUs, outputs))
72 | ```
73 |
74 | > If we would like to train on the GPU, then we need to ship the model to *device memory* by typing `model:cuda()` after having issued `require 'cunn'`.
75 |
76 |
77 | ### Criterion ###
78 |
79 | We choose the *Mean Squared Error* loss `Criterion`:
80 |
81 | ```lua
82 | criterion = nn.MSECriterion()
83 | ```
84 |
85 | We are using an `nn.MSECriterion` because we are training on a regression task, predicting continuous (real) target values, from `-1` to `+1`.
86 | For a classification task with more than two classes, we would add an `nn.LogSoftMax` layer to the end of our network, and use an `nn.ClassNLLCriterion` loss criterion.
87 | Nevertheless, the XOR problem could be seen as a two-class classification task, associated with the `-1` and `+1` discrete outputs.
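For instance, a two-class variant of this network could look like the following sketch (hypothetical `clsModel`/`clsCriterion` names; not used in the rest of this tutorial):

```lua
clsModel = nn.Sequential()
clsModel:add(nn.Linear(inputs, HUs))
clsModel:add(nn.Tanh())
clsModel:add(nn.Linear(HUs, 2))          -- one output unit per class
clsModel:add(nn.LogSoftMax())
clsCriterion = nn.ClassNLLCriterion()    -- targets are then class indices (1 or 2), not -1/+1
```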
88 |
89 | > If we would like to train on the GPU, then we need to ship the `Criterion` to *device memory* by typing `criterion:cuda()`.
90 |
91 |
92 | ### Data set ###
93 |
94 | We will just create one minibatch of `128` examples.
95 | In your own training, you would want to break your (likely much larger) data set into multiple minibatches, of around `32` to `512` examples each.
96 |
97 | ```lua
98 | batchSize = 128
99 | batchInputs = torch.DoubleTensor(batchSize, inputs) -- or CudaTensor for GPU training
100 | batchLabels = torch.DoubleTensor(batchSize) -- or CudaTensor for GPU training
101 |
102 | for i = 1, batchSize do
103 | local input = torch.randn(2) -- normally distributed example in 2d
104 | local label
105 | if input[1] * input[2] > 0 then -- calculate label for XOR function
106 | label = -1
107 | else
108 | label = 1
109 | end
110 | batchInputs[i]:copy(input)
111 | batchLabels[i] = label
112 | end
113 | ```
114 |
115 |
116 | ### Flatten parameters ###
117 |
118 | `optim` expects the parameters that are to be optimized, and their gradients, to be one-dimensional `Tensor`s.
119 | But our network model probably contains multiple modules, typically multiple convolutional layers, and each of these layers has its own `weight` and `bias` `Tensor`s.
120 | How do we handle this?
121 |
122 | It is simple: we can call the standard method `:getParameters()`, which is defined for any network module.
123 | When we call this method, the following magic will happen:
124 |
125 | - a new `Tensor` will be created, large enough to hold all the `weight`s and `bias`es of the entire network model
126 | - the model `weight` and `bias` `Tensor`s are replaced with views onto the new contiguous parameter `Tensor`
127 | - and the exact same thing will happen for all the gradient `Tensor`s: replaced with views onto one single contiguous gradient `Tensor`
128 |
129 | We can call this method as follows:
130 |
131 | ```lua
132 | params, gradParams = model:getParameters()
133 | ```
134 |
135 | These flattened `Tensor`s have the following characteristics:
136 |
137 | - to `optim`, the parameters it needs to optimize are all contained in one single one-dimensional `Tensor`
138 | - when `optim` optimizes the parameters in this large one-dimensional `Tensor`, it is implicitly optimizing the `weight`s and `bias`es in our network model, since those are now simply views onto this large one-dimensional parameter `Tensor`
139 |
140 | It will look something like this:
141 |
142 | 
143 |
144 | > Note that flattening the parameters redefines the `weight` and `bias` `Tensor`s for all the network modules in our model.
145 | > Therefore, any pre-existing references to the original layer `weight` and `bias` `Tensor`s will no longer point to the model's parameters after flattening.
146 |
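As a quick sanity check (a sketch using the model defined above), you can verify that a layer's `weight` now shares storage with the flattened parameter `Tensor`:

```lua
params, gradParams = model:getParameters()
-- the first Linear layer's weight is now a view onto the same storage as params
print(torch.pointer(model:get(1).weight:storage()) == torch.pointer(params:storage()))
-- true
```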
147 |
148 | ### Training ###
149 |
150 | Now that we have created our model, our training set, and prepared the flattened network parameters, we can train using `optim`.
151 | `optim` provides [various training algorithms](index.md).
152 | We will use the stochastic gradient descent algorithm [SGD](index.md#x-sgdopfunc-x-state).
153 | We need to provide the learning rate, via an optimization state table:
154 |
155 | ```lua
156 | local optimState = {learningRate = 0.01}
157 | ```
158 |
159 | We define an evaluation function, inside our training loop, and use `optim.sgd` to train the system:
160 |
161 | ```lua
162 | require 'optim'
163 |
164 | for epoch = 1, 50 do
165 | -- local function we give to optim
166 | -- it takes current weights as input, and outputs the loss
167 | -- and the gradient of the loss with respect to the weights
168 | -- gradParams is calculated implicitly by calling 'backward',
169 | -- because the model's weight and bias gradient tensors
170 | -- are simply views onto gradParams
171 | function feval(params)
172 | gradParams:zero()
173 |
174 | local outputs = model:forward(batchInputs)
175 | local loss = criterion:forward(outputs, batchLabels)
176 | local dloss_doutputs = criterion:backward(outputs, batchLabels)
177 | model:backward(batchInputs, dloss_doutputs)
178 |
179 | return loss, gradParams
180 | end
181 | optim.sgd(feval, params, optimState)
182 | end
183 | ```
184 |
185 |
186 | ### Test the network ###
187 |
188 | For the prediction task, we will also typically use minibatches, although we can run prediction sample by sample too.
189 | In this example, we will predict sample by sample.
190 | To run prediction on a minibatch, simply pass in a tensor with one additional dimension, which represents the sample index.
191 |
192 | ```lua
193 | x = torch.Tensor(2)
194 | x[1] = 0.5; x[2] = 0.5; print(model:forward(x))
195 | x[1] = 0.5; x[2] = -0.5; print(model:forward(x))
196 | x[1] = -0.5; x[2] = 0.5; print(model:forward(x))
197 | x[1] = -0.5; x[2] = -0.5; print(model:forward(x))
198 | ```
199 |
200 | You should see something like:
201 |
202 | ```lua
203 | > x = torch.Tensor(2)
204 | > x[1] = 0.5; x[2] = 0.5; print(model:forward(x))
205 |
206 | -0.3490
207 | [torch.DoubleTensor of dimension 1]
208 |
209 | > x[1] = 0.5; x[2] = -0.5; print(model:forward(x))
210 |
211 | 1.0561
212 | [torch.DoubleTensor of dimension 1]
213 |
214 | > x[1] = -0.5; x[2] = 0.5; print(model:forward(x))
215 |
216 | 0.8640
217 | [torch.DoubleTensor of dimension 1]
218 |
219 | > x[1] = -0.5; x[2] = -0.5; print(model:forward(x))
220 |
221 | -0.2941
222 | [torch.DoubleTensor of dimension 1]
223 | ```
224 |
225 | If we were running on a GPU, we would probably want to predict using minibatches, because this will hide the latencies involved in transferring data from main memory to the GPU.
226 | To predict on a minibatch, we could do something like:
227 |
228 | ```lua
229 | x = torch.CudaTensor({
230 | { 0.5, 0.5},
231 | { 0.5, -0.5},
232 | {-0.5, 0.5},
233 | {-0.5, -0.5}
234 | })
235 | print(model:forward(x))
236 | ```
237 |
238 | You should see something like:
239 |
240 | ```lua
241 | > print(model:forward(x))
242 | -0.3490
243 | 1.0561
244 | 0.8640
245 | -0.2941
246 | [torch.CudaTensor of size 4]
247 | ```
248 |
249 | That's it!
250 | For minibatched prediction, the output tensor contains one value for each of our input data samples.
251 |
--------------------------------------------------------------------------------
/lbfgs.lua:
--------------------------------------------------------------------------------
1 | --[[ An implementation of L-BFGS, heavily inspired by minFunc (Mark Schmidt)
2 |
3 | This implementation of L-BFGS relies on a user-provided line
4 | search function (state.lineSearch). If this function is not
5 | provided, then a simple learningRate is used to produce fixed
6 | size steps. Fixed size steps are much less costly than line
7 | searches, and can be useful for stochastic problems.
8 |
9 | The learning rate is used even when a line search is provided.
10 | This is also useful for large-scale stochastic problems, where
11 | opfunc is a noisy approximation of f(x). In that case, the learning
12 | rate allows a reduction of confidence in the step size.
13 |
14 | ARGS:
15 |
16 | - `opfunc` : a function that takes a single input (X), the point of
17 | evaluation, and returns f(X) and df/dX
18 | - `x` : the initial point
19 | - `state` : a table describing the state of the optimizer; after each
20 | call the state is modified
21 | - `state.maxIter` : Maximum number of iterations allowed
22 | - `state.maxEval` : Maximum number of function evaluations
23 | - `state.tolFun` : Termination tolerance on the first-order optimality
24 | - `state.tolX` : Termination tol on progress in terms of func/param changes
25 | - `state.lineSearch` : A line search function
26 | - `state.learningRate` : If no line search provided, then a fixed step size is used
27 |
28 | RETURN:
29 | - `x*` : the new `x` vector, at the optimal point
30 | - `f` : a table of all function values:
31 | `f[1]` is the value of the function before any optimization and
32 | `f[#f]` is the final fully optimized value, at `x*`
33 |
34 | (Clement Farabet, 2012)
35 | ]]
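-- Example usage (a minimal sketch; `opfunc` and the problem size `n` are assumed to be defined):
--
--    local state = {maxIter = 100, lineSearch = optim.lswolfe}
--    local x = torch.zeros(n)
--    x, fs = optim.lbfgs(opfunc, x, state)
--    print(fs[1], fs[#fs])   -- f before and after optimization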
36 | function optim.lbfgs(opfunc, x, config, state)
37 | -- get/update state
38 | local config = config or {}
39 | local state = state or config
40 | local maxIter = tonumber(config.maxIter) or 20
41 | local maxEval = tonumber(config.maxEval) or maxIter*1.25
42 | local tolFun = config.tolFun or 1e-5
43 | local tolX = config.tolX or 1e-9
44 | local nCorrection = config.nCorrection or 100
45 | local lineSearch = config.lineSearch
46 | local lineSearchOpts = config.lineSearchOptions
47 | local learningRate = config.learningRate or 1
48 | local isverbose = config.verbose or false
49 |
50 | state.funcEval = state.funcEval or 0
51 | state.nIter = state.nIter or 0
52 |
53 | -- verbose function
54 | local verbose
55 | if isverbose then
56 | verbose = function(...) print(' ', ...) end
57 | else
58 | verbose = function() end
59 | end
60 |
61 | -- import some functions
62 | local abs = math.abs
63 | local min = math.min
64 |
65 | -- evaluate initial f(x) and df/dx
66 | local f,g = opfunc(x)
67 | local f_hist = {f}
68 | local currentFuncEval = 1
69 | state.funcEval = state.funcEval + 1
70 | local p = g:size(1)
71 |
72 | -- check optimality of initial point
73 | state.tmp1 = state.tmp1 or g.new(g:size()):zero(); local tmp1 = state.tmp1
74 | tmp1:copy(g):abs()
75 | if tmp1:sum() <= tolFun then
76 | -- optimality condition below tolFun
77 | verbose('optimality condition below tolFun')
78 | return x,f_hist
79 | end
80 |
81 | if not state.dir_bufs then
82 | -- reusable buffers for y's and s's, and their histories
83 | verbose('creating recyclable direction/step/history buffers')
84 | state.dir_bufs = state.dir_bufs or g.new(nCorrection+1, p):split(1)
85 | state.stp_bufs = state.stp_bufs or g.new(nCorrection+1, p):split(1)
86 | for i=1,#state.dir_bufs do
87 | state.dir_bufs[i] = state.dir_bufs[i]:squeeze(1)
88 | state.stp_bufs[i] = state.stp_bufs[i]:squeeze(1)
89 | end
90 | end
91 |
92 | -- variables cached in state (for tracing)
93 | local d = state.d
94 | local t = state.t
95 | local old_dirs = state.old_dirs
96 | local old_stps = state.old_stps
97 | local Hdiag = state.Hdiag
98 | local g_old = state.g_old
99 | local f_old = state.f_old
100 |
101 | -- optimize for a max of maxIter iterations
102 | local nIter = 0
103 | while nIter < maxIter do
104 | -- keep track of nb of iterations
105 | nIter = nIter + 1
106 | state.nIter = state.nIter + 1
107 |
108 | ------------------------------------------------------------
109 | -- compute gradient descent direction
110 | ------------------------------------------------------------
111 | if state.nIter == 1 then
112 | d = g:clone():mul(-1) -- -g
113 | old_dirs = {}
114 | old_stps = {}
115 | Hdiag = 1
116 | else
117 | -- do lbfgs update (update memory)
118 | local y = table.remove(state.dir_bufs) -- pop
119 | local s = table.remove(state.stp_bufs)
120 | y:add(g, -1, g_old) -- g - g_old
121 | s:mul(d, t) -- d*t
122 | local ys = y:dot(s) -- y*s
123 | if ys > 1e-10 then
124 | -- updating memory
125 | if #old_dirs == nCorrection then
126 | -- shift history by one (limited-memory)
127 | local removed1 = table.remove(old_dirs, 1)
128 | local removed2 = table.remove(old_stps, 1)
129 | table.insert(state.dir_bufs, removed1)
130 | table.insert(state.stp_bufs, removed2)
131 | end
132 |
133 | -- store new direction/step
134 | table.insert(old_dirs, s)
135 | table.insert(old_stps, y)
136 |
137 | -- update scale of initial Hessian approximation
138 | Hdiag = ys / y:dot(y) -- (y*y)
139 | else
140 | -- put y and s back into the buffer pool
141 | table.insert(state.dir_bufs, y)
142 | table.insert(state.stp_bufs, s)
143 | end
144 |
145 | -- compute the approximate (L-BFGS) inverse Hessian
146 | -- multiplied by the gradient
147 | local k = #old_dirs
148 |
149 | -- need to be accessed element-by-element, so don't re-type tensor:
150 | state.ro = state.ro or torch.Tensor(nCorrection); local ro = state.ro
151 | for i = 1,k do
152 | ro[i] = 1 / old_stps[i]:dot(old_dirs[i])
153 | end
154 |
155 | -- iteration in L-BFGS loop collapsed to use just one buffer
156 | local q = tmp1 -- reuse tmp1 for the q buffer
157 | -- need to be accessed element-by-element, so don't re-type tensor:
158 | state.al = state.al or torch.zeros(nCorrection) local al = state.al
159 |
160 | q:mul(g, -1) -- -g
161 | for i = k,1,-1 do
162 | al[i] = old_dirs[i]:dot(q) * ro[i]
163 | q:add(-al[i], old_stps[i])
164 | end
165 |
166 | -- multiply by initial Hessian
167 | r = d -- share the same buffer, since we don't need the old d
168 | r:mul(q, Hdiag) -- q[1] * Hdiag
169 | for i = 1,k do
170 | local be_i = old_stps[i]:dot(r) * ro[i]
171 | r:add(al[i]-be_i, old_dirs[i])
172 | end
173 | -- final direction is in r/d (same object)
174 | end
175 | g_old = g_old or g:clone()
176 | g_old:copy(g)
177 | f_old = f
178 |
179 | ------------------------------------------------------------
180 | -- compute step length
181 | ------------------------------------------------------------
182 | -- directional derivative
183 | local gtd = g:dot(d) -- g * d
184 |
185 | -- check that progress can be made along that direction
186 | if gtd > -tolX then
187 | break
188 | end
189 |
190 | -- reset initial guess for step size
191 | if state.nIter == 1 then
192 | tmp1:copy(g):abs()
193 | t = min(1,1/tmp1:sum()) * learningRate
194 | else
195 | t = learningRate
196 | end
197 |
198 | -- optional line search: user function
199 | local lsFuncEval = 0
200 | if lineSearch and type(lineSearch) == 'function' then
201 | -- perform line search, using user function
202 | f,g,x,t,lsFuncEval = lineSearch(opfunc,x,t,d,f,g,gtd,lineSearchOpts)
203 | table.insert(f_hist, f)
204 | else
205 | -- no line search, simply move with fixed-step
206 | x:add(t,d)
207 | if nIter ~= maxIter then
208 | -- re-evaluate function only if not in last iteration
209 | -- the reason we do this: in a stochastic setting,
210 | -- no use to re-evaluate that function here
211 | f,g = opfunc(x)
212 | lsFuncEval = 1
213 | table.insert(f_hist, f)
214 | end
215 | end
216 |
217 | -- update func eval
218 | currentFuncEval = currentFuncEval + lsFuncEval
219 | state.funcEval = state.funcEval + lsFuncEval
220 |
221 | ------------------------------------------------------------
222 | -- check conditions
223 | ------------------------------------------------------------
224 | if nIter == maxIter then
225 | -- no use to run tests
226 | verbose('reached max number of iterations')
227 | break
228 | end
229 |
230 | if currentFuncEval >= maxEval then
231 | -- max nb of function evals
232 | verbose('max nb of function evals')
233 | break
234 | end
235 |
236 | tmp1:copy(g):abs()
237 | if tmp1:sum() <= tolFun then
238 | -- check optimality
239 | verbose('optimality condition below tolFun')
240 | break
241 | end
242 |
243 | tmp1:copy(d):mul(t):abs()
244 | if tmp1:sum() <= tolX then
245 | -- step size below tolX
246 | verbose('step size below tolX')
247 | break
248 | end
249 |
250 | if abs(f-f_old) < tolX then
251 | -- function value changing less than tolX
252 | verbose('function value changing less than tolX')
253 | break
254 | end
255 | end
256 |
257 | -- save state
258 | state.old_dirs = old_dirs
259 | state.old_stps = old_stps
260 | state.Hdiag = Hdiag
261 | state.g_old = g_old
262 | state.f_old = f_old
263 | state.t = t
264 | state.d = d
265 |
266 | -- return optimal x, and history of f(x)
267 | return x,f_hist,currentFuncEval
268 | end
269 |
--------------------------------------------------------------------------------
/cmaes.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'math'
3 |
4 | local BestSolution = {}
5 | --[[ An implementation of `CMAES` (Covariance Matrix Adaptation Evolution Strategy),
6 | ported from https://www.lri.fr/~hansen/barecmaes2.html.
7 |
8 | Parameters
9 | ----------
10 | ARGS:
11 |
12 | - `opfunc` : a function that takes a single input (X), the point of
13 | evaluation, and returns f(X) and df/dX. Note that df/dX is not used
14 | - `x` : the initial point
15 | - `state.sigma`
16 | float, initial step-size (standard deviation in each
17 | coordinate)
18 | - `state.maxEval`
19 | int, maximal number of function evaluations
20 | - `state.ftarget`
21 | float, target function value; stop if fitness < ftarget
22 | - `state.popsize`
23 | population size. If this is left empty,
24 | 4 + int(3 * log(|x|)) will be used
27 | - `state.verb_disp`
28 | int, display on console every verb_disp iteration, 0 for never
29 |
30 | RETURN:
31 | - `x*` : the new `x` vector, at the optimal point
32 | - `f` : a table of all function values:
33 | `f[1]` is the value of the function before any optimization and
34 | `f[#f]` is the final fully optimized value, at `x*`
35 | --]]
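-- Example usage (a minimal sketch; `opfunc` stands for any closure returning f(X); the
-- gradient part of its return value is ignored):
--
--    local state = {sigma = 0.5, maxEval = 2000, verb_disp = 0}
--    local x = torch.DoubleTensor(10):fill(1)
--    x, fs = optim.cmaes(opfunc, x, state)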
36 | function optim.cmaes(opfunc, x, config, state)
37 | if (x.triu == nil or x.diag == nil) then
38 | error('Unsupported Tensor ' .. x:type() .. " please use Float- or DoubleTensor for x")
39 | end
40 | -- process input parameters
41 | local config = config or {}
42 | local state = state or config
43 | local xmean = x:clone():view(-1) -- distribution mean, a flattened copy
44 | local N = xmean:size(1) -- number of objective variables/problem dimension
45 | local sigma = state.sigma -- coordinate wise standard deviation (step size)
46 | local ftarget = state.ftarget -- stop if fitness < ftarget
47 | local maxEval = tonumber(state.maxEval) or 1e3*N^2
48 | local objfunc = opfunc
49 | local verb_disp = state.verb_disp -- display progress on console every verb_disp iterations
50 | local min_iterations = state.min_iterations or 1
51 |
52 | local lambda = state.popsize -- population size, offspring number
53 | -- Strategy parameter setting: Selection
54 | if state.popsize == nil then
55 | lambda = 4 + math.floor(3 * math.log(N))
56 | end
57 |
58 | local mu = lambda / 2 -- number of parents/points for recombination
59 | local weights = torch.range(0,mu-1):apply(function(i)
60 | return math.log(mu+0.5) - math.log(i+1) end) -- recombination weights
61 | weights:div(weights:sum()) -- normalize recombination weights array
62 | local mueff = weights:sum()^2 / torch.pow(weights,2):sum() -- variance-effectiveness of sum w_i x_i
63 | weights = weights:typeAs(x)
64 |
65 | -- Strategy parameter setting: Adaptation
66 | local cc = (4 + mueff/N) / (N+4 + 2 * mueff/N) -- time constant for cumulation for C
67 | local cs = (mueff + 2) / (N + mueff + 5) -- t-const for cumulation for sigma control
68 | local c1 = 2 / ((N + 1.3)^2 + mueff) -- learning rate for rank-one update of C
69 | local cmu = math.min(1 - c1, 2 * (mueff - 2 + 1/mueff) / ((N + 2)^2 + mueff)) -- and for rank-mu update
70 | local damps = 2 * mueff/lambda + 0.3 + cs -- damping for sigma, usually close to 1
71 |
72 | -- Initialize dynamic (internal) state variables
73 | local pc = torch.Tensor(N):zero():typeAs(x) -- evolution paths for C
74 | local ps = torch.Tensor(N):zero():typeAs(x) -- evolution paths for sigma
75 | local B = torch.eye(N):typeAs(x) -- B defines the coordinate system
76 | local D = torch.Tensor(N):fill(1):typeAs(x) -- diagonal D defines the scaling
77 | local C = torch.eye(N):typeAs(x) -- covariance matrix
78 | if not pcall(function () torch.symeig(C,'V') end) then -- if error occurs trying to use symeig
79 | error('torch.symeig not available for ' .. x:type() ..
80 | " please use Float- or DoubleTensor for x")
81 | end
82 | local candidates = torch.Tensor(lambda,N):typeAs(x)
83 | local invsqrtC = torch.eye(N):typeAs(x) -- C^-1/2
84 | local eigeneval = 0 -- tracking the update of B and D
85 | local counteval = 0
86 | local f_hist = {[1]=opfunc(x)} -- for bookkeeping output and termination
87 | local fitvals = torch.Tensor(lambda)-- fitness values
88 | local best = BestSolution.new(nil,nil,counteval)
89 | local iteration = 0 -- iteration of the optimize loop
90 |
91 |
92 | local function ask()
93 | --[[return a list of lambda candidate solutions according to
94 | m + sig * Normal(0,C) = m + sig * B * D * Normal(0,I)
95 | --]]
96 | -- Eigendecomposition: first update B, D and invsqrtC from C
97 |       -- postpone the eigendecomposition so that its amortized cost stays O(N^2) per sample
98 | if counteval - eigeneval > lambda/(c1+cmu)/C:size(1)/10 then
99 | eigeneval = counteval
100 | C = torch.triu(C) + torch.triu(C,1):t() -- enforce symmetry
101 | D, B = torch.symeig(C,'V') -- eigen decomposition, B==normalized eigenvectors, O(N^3)
102 | D = torch.sqrt(D) -- D contains standard deviations now
103 | invsqrtC = (B * torch.diag(torch.pow(D,-1)) * B:t())
104 | end
105 | for k=1,lambda do --repeat lambda times
106 | local z = D:clone():normal(0,1):cmul(D)
107 | candidates[{k,{}}] = torch.add(xmean, (B * z) * sigma)
108 | end
109 |
110 | return candidates
111 | end
112 |
113 |
114 | local function tell(arx)
115 | --[[update the evolution paths and the distribution parameters m,
116 | sigma, and C within CMA-ES.
117 |
118 | Parameters
119 | ----------
120 | `arx`
121 | a list of solutions, presumably from `ask()`
122 | `fitvals`
123 | the corresponding objective function values --]]
124 | -- bookkeeping, preparation
125 | counteval = counteval + lambda -- slightly artificial to do here
126 | local xold = xmean:clone()
127 |
128 | -- Sort by fitness and compute weighted mean into xmean
129 | local arindex = nil --sorted indices
130 | fitvals, arindex = torch.sort(fitvals)
131 | arx = arx:index(1, arindex[{{1, mu}}]) -- sorted candidate solutions
132 |
133 | table.insert(f_hist, fitvals[1]) --append best fitness to history
134 | best:update(arx[1], fitvals[1], counteval)
135 |
136 | xmean:zero()
137 | xmean:addmv(arx:t(), weights) --dot product
138 |
139 | -- Cumulation: update evolution paths
140 | local y = xmean - xold
141 | local z = invsqrtC * y -- == C^(-1/2) * (xnew - xold)
142 |
143 | local c = (cs * (2-cs) * mueff)^0.5 / sigma
144 | ps = ps - ps * cs + z * c -- exponential decay on ps
145 | local hsig = (torch.sum(torch.pow(ps,2)) /
146 | (1-(1-cs)^(2*counteval/lambda)) / N < 2 + 4./(N+1))
147 | hsig = hsig and 1.0 or 0.0 --use binary numbers
148 |
149 | c = (cc * (2-cc) * mueff)^0.5 / sigma
150 | pc = pc - pc * cc + y * c * hsig -- exponential decay on pc
151 |
152 | -- Adapt covariance matrix C
153 | local c1a = c1 - (1-hsig^2) * c1 * cc * (2-cc)
154 | -- for a minor adjustment to the variance loss by hsig
155 | for i=1,N do
156 | for j=1,N do
157 | local r = torch.range(1,mu)
158 | r:apply(function(k)
159 | return weights[k] * (arx[k][i]-xold[i]) * (arx[k][j]-xold[j]) end)
160 | local Cmuij = torch.sum(r) / sigma^2 -- rank-mu update
161 | C[i][j] = C[i][j] + ((-c1a - cmu) * C[i][j] +
162 | c1 * pc[i]*pc[j] + cmu * Cmuij)
163 | end
164 | end
165 |
166 | -- Adapt step-size sigma with factor <= exp(0.6) \approx 1.82
167 | sigma = sigma * math.exp(math.min(0.6,
168 | (cs / damps) * (torch.sum(torch.pow(ps,2))/N - 1)/2))
169 | end
170 |
171 | local function stop()
172 | --[[return satisfied termination conditions in a table like
173 | {'termination reason':value, ...}, for example {'tolfun':1e-12},
174 | or the empty table {}--]]
175 | local res = {}
176 | if counteval > 0 then
177 | if counteval >= maxEval then
178 | res['evals'] = maxEval
179 | end
180 | if ftarget ~= nil and fitvals:nElement() > 0 and fitvals[1] <= ftarget then
181 | res['ftarget'] = ftarget
182 | end
183 | if torch.max(D) > 1e7 * torch.min(D) then
184 | res['condition'] = 1e7
185 | end
186 | if fitvals:nElement() > 1 and fitvals[fitvals:nElement()] - fitvals[1] < 1e-12 then
187 | res['tolfun'] = 1e-12
188 | end
189 | if sigma * torch.max(D) < 1e-11 then
190 | -- remark: max(D) >= max(diag(C))^0.5
191 | res['tolx'] = 1e-11
192 | end
193 | end
194 | return res
195 | end
196 |
197 | local function disp(verb_modulo)
198 | --[[display some iteration info--]]
199 | if verb_disp == 0 then
200 | return nil
201 | end
202 | local iteration = counteval / lambda
203 |
204 | if iteration == 1 or iteration % (10*verb_modulo) == 0 then
205 | print('evals:\t ax-ratio max(std) f-value')
206 | end
207 | if iteration <= 2 or iteration % verb_modulo == 0 then
208 | local max_std = math.sqrt(torch.max(torch.diag(C)))
209 | print(tostring(counteval).. ': ' ..
210 | string.format(' %6.1f %8.1e ', torch.max(D) / torch.min(D), sigma * max_std)
211 | .. tostring(fitvals[1]))
212 | end
213 |
214 | return nil
215 | end
216 |
217 | while next(stop()) == nil or iteration < min_iterations do
218 | iteration = iteration + 1
219 |
220 | local X = ask() -- deliver candidate solutions
221 | for i=1, lambda do
222 | -- put candidate tensor back in input shape and evaluate in opfunc
223 | local candidate = X[i]:viewAs(x)
224 | fitvals[i] = objfunc(candidate)
225 | end
226 |
227 | tell(X)
228 | disp(verb_disp)
229 | end
230 |
231 | local bestmu, f, c = best:get()
232 | if verb_disp > 0 then
233 | for k, v in pairs(stop()) do
234 | print('termination by', k, '=', v)
235 | end
236 | print('best f-value =', f)
237 | print('solution = ')
238 | print(bestmu)
239 | print('best found at iteration: ', c/lambda, ' , total iterations: ', iteration)
240 | end
241 | table.insert(f_hist, f)
242 |
243 | return bestmu, f_hist, counteval
244 | end
245 |
246 |
247 |
248 | BestSolution.__index = BestSolution
249 | function BestSolution.new(x, f, evals)
250 | local self = setmetatable({}, BestSolution)
251 | self.x = x
252 | self.f = f
253 | self.evals = evals
254 | return self
255 | end
256 |
257 | function BestSolution.update(self, arx, arf, evals)
258 |    --[[update the best solution with `arx`, `arf`, and `evals`,
259 |    keeping it only if `arf` improves on the current best; better solutions have smaller `f`-values.--]]
260 | if self.f == nil or arf < self.f then
261 | self.x = arx:clone()
262 | self.f = arf
263 | self.evals = evals
264 | end
265 | return self
266 | end
267 |
268 | function BestSolution.get(self)
269 | return self.x, self.f, self.evals
270 | end
271 |
--------------------------------------------------------------------------------
/ConfusionMatrix.lua:
--------------------------------------------------------------------------------
1 | --[[ A Confusion Matrix class
2 |
3 | Example:
4 |
5 | conf = optim.ConfusionMatrix( {'cat','dog','person'} ) -- new matrix
6 | conf:zero() -- reset matrix
7 | for i = 1,N do
8 | conf:add( neuralnet:forward(sample), label ) -- accumulate errors
9 | end
10 | print(conf) -- print matrix
11 | image.display(conf:render()) -- render matrix
12 | ]]
13 | local ConfusionMatrix = torch.class('optim.ConfusionMatrix')
14 |
15 | function ConfusionMatrix:__init(nclasses, classes)
16 | if type(nclasses) == 'table' then
17 | classes = nclasses
18 | nclasses = #classes
19 | end
20 | self.mat = torch.LongTensor(nclasses,nclasses):zero()
21 | self.valids = torch.FloatTensor(nclasses):zero()
22 | self.unionvalids = torch.FloatTensor(nclasses):zero()
23 | self.nclasses = nclasses
24 | self.totalValid = 0
25 | self.averageValid = 0
26 | self.classes = classes or {}
27 | -- buffers
28 | self._mat_flat = self.mat:view(-1)
29 | self._target = torch.FloatTensor()
30 | self._prediction = torch.FloatTensor()
31 | self._max = torch.FloatTensor()
32 | self._pred_idx = torch.LongTensor()
33 | self._targ_idx = torch.LongTensor()
34 | end
35 |
36 | -- takes scalar prediction and target as input
37 | function ConfusionMatrix:_add(p, t)
38 | assert(p and type(p) == 'number')
39 | assert(t and type(t) == 'number')
40 | -- non-positive values are considered missing
41 | -- and therefore ignored
42 | if t > 0 then
43 | self.mat[t][p] = self.mat[t][p] + 1
44 | end
45 | end
46 |
47 | function ConfusionMatrix:add(prediction, target)
48 | if type(prediction) == 'number' then
49 | -- comparing numbers
50 | self:_add(prediction, target)
51 | else
52 | self._prediction:resize(prediction:size()):copy(prediction)
53 | assert(prediction:dim() == 1)
54 | if type(target) == 'number' then
55 | -- prediction is a vector, then target assumed to be an index
56 | self._max:max(self._pred_idx, self._prediction, 1)
57 | self:_add(self._pred_idx[1], target)
58 | else
59 | -- both prediction and target are vectors
60 | assert(target:dim() == 1)
61 | self._target:resize(target:size()):copy(target)
62 | self._max:max(self._targ_idx, self._target, 1)
63 | self._max:max(self._pred_idx, self._prediction, 1)
64 | self:_add(self._pred_idx[1], self._targ_idx[1])
65 | end
66 | end
67 | end
68 |
69 | function ConfusionMatrix:batchAdd(predictions, targets)
70 | local preds, targs, __
71 | self._prediction:resize(predictions:size()):copy(predictions)
72 | if predictions:dim() == 1 then
73 | -- predictions is a vector of classes
74 | preds = self._prediction
75 | elseif predictions:dim() == 2 then
76 | -- prediction is a matrix of class likelihoods
77 | if predictions:size(2) == 1 then
78 | -- or prediction just needs flattening
79 | preds = self._prediction:select(2,1)
80 | else
81 | self._max:max(self._pred_idx, self._prediction, 2)
82 | preds = self._pred_idx:select(2,1)
83 | end
84 | else
85 | error("predictions has invalid number of dimensions")
86 | end
87 |
88 | self._target:resize(targets:size()):copy(targets)
89 | if targets:dim() == 1 then
90 | -- targets is a vector of classes
91 | targs = self._target
92 | elseif targets:dim() == 2 then
93 | -- targets is a matrix of one-hot rows
94 | if targets:size(2) == 1 then
95 | -- or targets just needs flattening
96 | targs = self._target:select(2,1)
97 | else
98 | self._max:max(self._targ_idx, self._target, 2)
99 | targs = self._targ_idx:select(2,1)
100 | end
101 | else
102 | error("targets has invalid number of dimensions")
103 | end
104 |
105 | -- non-positive values are considered missing and therefore ignored
106 | local mask = targs:ge(1)
107 | targs = targs[mask]
108 | preds = preds[mask]
109 |
110 | self._mat_flat = self._mat_flat or self.mat:view(-1) -- for backward compatibility
111 |
112 | preds = preds:typeAs(targs)
113 |
114 | assert(self.mat:isContiguous() and self.mat:stride(2) == 1)
115 | local indices = ((targs - 1) * self.mat:stride(1) + preds):typeAs(self.mat)
116 | local ones = torch.ones(1):typeAs(self.mat):expand(indices:size(1))
117 | self._mat_flat:indexAdd(1, indices, ones)
118 | end
119 |
120 | function ConfusionMatrix:zero()
121 | self.mat:zero()
122 | self.valids:zero()
123 | self.unionvalids:zero()
124 | self.totalValid = 0
125 | self.averageValid = 0
126 | end
127 |
128 | local function isNaN(number)
129 | return number ~= number
130 | end
131 |
132 | function ConfusionMatrix:updateValids()
133 | local total = 0
134 | for t = 1,self.nclasses do
135 | self.valids[t] = self.mat[t][t] / self.mat:select(1,t):sum()
136 | self.unionvalids[t] = self.mat[t][t] / (self.mat:select(1,t):sum()+self.mat:select(2,t):sum()-self.mat[t][t])
137 | total = total + self.mat[t][t]
138 | end
139 | self.totalValid = total / self.mat:sum()
140 | self.averageValid = 0
141 | self.averageUnionValid = 0
142 | local nvalids = 0
143 | local nunionvalids = 0
144 | for t = 1,self.nclasses do
145 | if not isNaN(self.valids[t]) then
146 | self.averageValid = self.averageValid + self.valids[t]
147 | nvalids = nvalids + 1
148 | end
149 | if not isNaN(self.valids[t]) and not isNaN(self.unionvalids[t]) then
150 | self.averageUnionValid = self.averageUnionValid + self.unionvalids[t]
151 | nunionvalids = nunionvalids + 1
152 | end
153 | end
154 | self.averageValid = self.averageValid / nvalids
155 | self.averageUnionValid = self.averageUnionValid / nunionvalids
156 | end
157 |
158 | -- Calculating FAR/FRR associated with the confusion matrix
159 |
160 | function ConfusionMatrix:farFrr()
161 | local cmat = self.mat
162 | local noOfClasses = cmat:size()[1]
163 | self._frrs = self._frrs or torch.zeros(noOfClasses)
164 | self._frrs:zero()
165 | self._classFrrs = self._classFrrs or torch.zeros(noOfClasses)
166 | self._classFrrs:zero()
167 | self._classFrrs:add(-1)
168 | self._fars = self._fars or torch.zeros(noOfClasses)
169 | self._fars:zero()
170 | self._classFars = self._classFars or torch.zeros(noOfClasses)
171 | self._classFars:zero()
172 | self._classFars:add(-1)
173 | local classSamplesCount = cmat:sum(2)
174 | local indx = 1
175 | for i=1,noOfClasses do
176 | if classSamplesCount[i][1] ~= 0 then
177 | self._frrs[indx] = 1 - cmat[i][i]/classSamplesCount[i][1]
178 | self._classFrrs[i] = self._frrs[indx]
179 | -- Calculating FARs
180 | local farNumerator = 0
181 | local farDenominator = 0
182 | for j=1, noOfClasses do
183 | if i ~= j then
184 | if classSamplesCount[j][1] ~= 0 then
185 | farNumerator = farNumerator + cmat[j][i]/classSamplesCount[j][1]
186 | farDenominator = farDenominator + 1
187 | end
188 | end
189 | end
190 | self._fars[indx] = farNumerator/farDenominator
191 | self._classFars[i] = self._fars[indx]
192 | indx = indx + 1
193 | end
194 | end
195 | indx = indx - 1
196 | local returnFrrs = self._frrs[{{1, indx}}]
197 | local returnFars = self._fars[{{1, indx}}]
198 | return self._classFrrs, self._classFars, returnFrrs, returnFars
199 | end
200 |
201 | local function log10(n)
202 | if math.log10 then
203 | return math.log10(n)
204 | else
205 | return math.log(n) / math.log(10)
206 | end
207 | end
208 |
209 | function ConfusionMatrix:__tostring__()
210 | self:updateValids()
211 | local str = {'ConfusionMatrix:\n'}
212 | local nclasses = self.nclasses
213 | table.insert(str, '[')
214 | local maxCnt = self.mat:max()
215 | local nDigits = math.max(8, 1 + math.ceil(log10(maxCnt)))
216 | for t = 1,nclasses do
217 | local pclass = self.valids[t] * 100
218 | pclass = string.format('%2.3f', pclass)
219 | if t == 1 then
220 | table.insert(str, '[')
221 | else
222 | table.insert(str, ' [')
223 | end
224 | for p = 1,nclasses do
225 | table.insert(str, string.format('%' .. nDigits .. 'd', self.mat[t][p]))
226 | end
227 | if self.classes and self.classes[1] then
228 | if t == nclasses then
229 | table.insert(str, ']] ' .. pclass .. '% \t[class: ' .. (self.classes[t] or '') .. ']\n')
230 | else
231 | table.insert(str, '] ' .. pclass .. '% \t[class: ' .. (self.classes[t] or '') .. ']\n')
232 | end
233 | else
234 | if t == nclasses then
235 | table.insert(str, ']] ' .. pclass .. '% \n')
236 | else
237 | table.insert(str, '] ' .. pclass .. '% \n')
238 | end
239 | end
240 | end
241 | table.insert(str, ' + average row correct: ' .. (self.averageValid*100) .. '% \n')
242 | table.insert(str, ' + average rowUcol correct (VOC measure): ' .. (self.averageUnionValid*100) .. '% \n')
243 | table.insert(str, ' + global correct: ' .. (self.totalValid*100) .. '%')
244 | return table.concat(str)
245 | end
246 |
247 | function ConfusionMatrix:render(sortmode, display, block, legendwidth)
248 | -- args
249 | local confusion = self.mat:double()
250 | local classes = self.classes
251 | local sortmode = sortmode or 'score' -- 'score' or 'occurrence'
252 | local block = block or 25
253 | local legendwidth = legendwidth or 200
254 | local display = display or false
255 |
256 | -- legends
257 | local legend = {
258 | ['score'] = 'Confusion matrix [sorted by scores, global accuracy = %0.3f%%, per-class accuracy = %0.3f%%]',
259 | ['occurrence'] = 'Confusion matrix [sorted by occurrences, accuracy = %0.3f%%, per-class accuracy = %0.3f%%]'
260 | }
261 |
262 | -- parse matrix / normalize / count scores
263 | local diag = torch.FloatTensor(#classes)
264 | local freqs = torch.FloatTensor(#classes)
265 | local unconf = confusion
266 | local confusion = confusion:clone()
267 | local corrects = 0
268 | local total = 0
269 | for target = 1,#classes do
270 | freqs[target] = confusion[target]:sum()
271 | corrects = corrects + confusion[target][target]
272 | total = total + freqs[target]
273 | confusion[target]:div( math.max(confusion[target]:sum(),1) )
274 | diag[target] = confusion[target][target]
275 | end
276 |
277 | -- accuracies
278 | local accuracy = corrects / total * 100
279 | local perclass = 0
280 | local total = 0
281 | for target = 1,#classes do
282 | if confusion[target]:sum() > 0 then
283 | perclass = perclass + diag[target]
284 | total = total + 1
285 | end
286 | end
287 | perclass = perclass / total * 100
288 | freqs:div(unconf:sum())
289 |
290 | -- sort matrix
291 | if sortmode == 'score' then
292 | _,order = torch.sort(diag,1,true)
293 | elseif sortmode == 'occurrence' then
294 | _,order = torch.sort(freqs,1,true)
295 | else
296 | error('sort mode must be one of: score | occurrence')
297 | end
298 |
299 | -- render matrix
300 | local render = torch.zeros(#classes*block, #classes*block)
301 | for target = 1,#classes do
302 | for prediction = 1,#classes do
303 | render[{ { (target-1)*block+1,target*block }, { (prediction-1)*block+1,prediction*block } }] = confusion[order[target]][order[prediction]]
304 | end
305 | end
306 |
307 | -- add grid
308 | for target = 1,#classes do
309 | render[{ {target*block},{} }] = 0.1
310 | render[{ {},{target*block} }] = 0.1
311 | end
312 |
313 | -- create rendering
314 | require 'image'
315 | require 'qtwidget'
316 | require 'qttorch'
317 | local win1 = qtwidget.newimage( (#render)[2]+legendwidth, (#render)[1] )
318 | image.display{image=render, win=win1}
319 |
320 | -- add legend
321 | for i in ipairs(classes) do
322 | -- background cell
323 | win1:setcolor{r=0,g=0,b=0}
324 | win1:rectangle((#render)[2],(i-1)*block,legendwidth,block)
325 | win1:fill()
326 |
327 | -- %
328 | win1:setfont(qt.QFont{serif=false, size=fontsize})
329 | local gscale = freqs[order[i]]/freqs:max()*0.9+0.1 --3/4
330 | win1:setcolor{r=gscale*0.5+0.2,g=gscale*0.5+0.2,b=gscale*0.8+0.2}
331 | win1:moveto((#render)[2]+10,i*block-block/3)
332 | win1:show(string.format('[%2.2f%% labels]',math.floor(freqs[order[i]]*10000+0.5)/100))
333 |
334 | -- legend
335 | win1:setfont(qt.QFont{serif=false, size=fontsize})
336 | local gscale = diag[order[i]]*0.8+0.2
337 | win1:setcolor{r=gscale,g=gscale,b=gscale}
338 | win1:moveto(120+(#render)[2]+10,i*block-block/3)
339 | win1:show(classes[order[i]])
340 |
341 | for j in ipairs(classes) do
342 | -- scores
343 | local score = confusion[order[j]][order[i]]
344 | local gscale = (1-score)*(score*0.8+0.2)
345 | win1:setcolor{r=gscale,g=gscale,b=gscale}
346 | win1:moveto((i-1)*block+block/5,(j-1)*block+block*2/3)
347 | win1:show(string.format('%02.0f',math.floor(score*100+0.5)))
348 | end
349 | end
350 |
351 | -- generate tensor
352 | local t = win1:image():toTensor()
353 |
354 | -- display
355 | if display then
356 | image.display{image=t, legend=string.format(legend[sortmode],accuracy,perclass)}
357 | end
358 |
359 | -- return rendering
360 | return t
361 | end
362 |
--------------------------------------------------------------------------------
/doc/image/parameterflattening.svg:
--------------------------------------------------------------------------------
[SVG markup omitted: parameter flattening diagram, doc/image/parameterflattening.svg]
--------------------------------------------------------------------------------
/doc/algos.md:
--------------------------------------------------------------------------------
1 |
2 | # Optimization algorithms
3 |
4 | The following algorithms are provided:
5 |
6 | * [*Stochastic Gradient Descent*](#optim.sgd)
7 | * [*Averaged Stochastic Gradient Descent*](#optim.asgd)
8 | * [*L-BFGS*](#optim.lbfgs)
9 | * [*Conjugate Gradients*](#optim.cg)
10 | * [*AdaDelta*](#optim.adadelta)
11 | * [*AdaGrad*](#optim.adagrad)
12 | * [*Adam*](#optim.adam)
13 | * [*AdaMax*](#optim.adamax)
14 | * [*FISTA with backtracking line search*](#optim.FistaLS)
15 | * [*Nesterov's Accelerated Gradient method*](#optim.nag)
16 | * [*RMSprop*](#optim.rmsprop)
17 | * [*Rprop*](#optim.rprop)
18 | * [*CMAES*](#optim.cmaes)
19 |
20 | All these algorithms are designed to support batch optimization as well as stochastic optimization.
21 | It's up to the user to construct an objective function that represents the batch, mini-batch, or single sample on which to evaluate the objective.
22 |
23 | Some of these algorithms support a line search, which can be passed as a function (*L-BFGS*), whereas others only support a learning rate (*SGD*).
24 |
25 | General interface:
26 |
27 | ```lua
28 | x*, {f}, ... = optim.method(opfunc, x[, config][, state])
29 | ```
30 |
31 |
32 |
33 | ## sgd(opfunc, x[, config][, state])
34 |
35 | An implementation of *Stochastic Gradient Descent* (*SGD*).
36 |
37 | Arguments:
38 |
39 | * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
40 | * `x`: the initial point
41 | * `config`: a table with configuration parameters for the optimizer
42 | * `config.learningRate`: learning rate
43 | * `config.learningRateDecay`: learning rate decay
44 | * `config.weightDecay`: weight decay
45 | * `config.weightDecays`: vector of individual weight decays
46 | * `config.momentum`: momentum
47 | * `config.dampening`: dampening for momentum
48 | * `config.nesterov`: enables Nesterov momentum
49 | * `state`: a table describing the state of the optimizer; after each call the state is modified
50 | * `state.learningRates`: vector of individual learning rates
51 |
52 | Returns:
53 |
54 | * `x*`: the new x vector
55 | * `f(x)`: the function, evaluated before the update
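
A usage sketch (the quadratic `feval` closure is illustrative, not part of the package): the typical pattern keeps a single `state` table alive across calls so that momentum buffers persist.

```lua
require 'torch'
require 'optim'

local target = torch.Tensor{1, 2, 3}
local function feval(x)
   local diff = x - target
   return 0.5 * diff:dot(diff), diff         -- f(X), df/dX
end

local x = torch.zeros(3)
local config = {learningRate = 0.1, momentum = 0.9}
local state  = {}                             -- reused across iterations
for i = 1, 100 do
   local _, fs = optim.sgd(feval, x, config, state)
   -- fs[1] is f(x), evaluated before this update
end
print(x)                                      -- x is updated in place
```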
56 |
57 |
58 |
59 | ## asgd(opfunc, x[, config][, state])
60 |
61 | An implementation of *Averaged Stochastic Gradient Descent* (*ASGD*):
62 |
63 | ```lua
64 | x = (1 - lambda eta_t) x - eta_t df / dx(z, x)
65 | a = a + mu_t [ x - a ]
66 |
67 | eta_t = eta0 / (1 + lambda eta0 t) ^ 0.75
68 | mu_t = 1 / max(1, t - t0)
69 | ```
70 |
71 | Arguments:
72 |
73 | * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
74 | * `x`: the initial point
75 | * `config`: a table with configuration parameters for the optimizer
76 | * `config.eta0`: learning rate
77 | * `config.lambda`: decay term
78 | * `config.alpha`: power for eta update
79 | * `config.t0`: point at which to start averaging
80 |
81 | Returns:
82 |
83 | * `x*`: the new x vector
84 | * `f(x)`: the function, evaluated before the update
85 | * `ax`: the averaged x vector
86 |
87 |
88 |
89 | ## lbfgs(opfunc, x[, config][, state])
90 |
91 | An implementation of *L-BFGS* that relies on a user-provided line search function (`state.lineSearch`).
92 | If this function is not provided, then a simple learning rate is used to produce fixed size steps.
93 | Fixed size steps are much less costly than line searches, and can be useful for stochastic problems.
94 |
95 | The learning rate is used even when a line search is provided.
96 | This is also useful for large-scale stochastic problems, where opfunc is a noisy approximation of `f(x)`.
97 | In that case, the learning rate allows a reduction of confidence in the step size.
98 |
99 | Arguments:
100 |
101 | * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
102 | * `x`: the initial point
103 | * `config`: a table with configuration parameters for the optimizer
104 | * `config.maxIter`: Maximum number of iterations allowed
105 | * `config.maxEval`: Maximum number of function evaluations
106 | * `config.tolFun`: Termination tolerance on the first-order optimality
107 | * `config.tolX`: Termination tol on progress in terms of func/param changes
108 | * `config.lineSearch`: A line search function
109 | * `config.learningRate`: If no line search provided, then a fixed step size is used
110 |
111 | Returns:
112 | * `x*`: the new `x` vector, at the optimal point
113 | * `f`: a table of all function values:
114 | * `f[1]` is the value of the function before any optimization and
115 | * `f[#f]` is the final fully optimized value, at `x*`
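
A hedged sketch of full-batch use on an illustrative quadratic (not part of the package), plugging in `optim.lswolfe`, the Wolfe line search shipped with this package, via `config.lineSearch`:

```lua
require 'torch'
require 'optim'

local A = torch.eye(5):mul(2)
local b = torch.ones(5)
local function feval(x)
   -- f(x) = 0.5 x'Ax - b'x,  df/dx = Ax - b
   local Ax = A * x
   return 0.5 * x:dot(Ax) - b:dot(x), Ax - b
end

local x = torch.zeros(5)
local xstar, fvals = optim.lbfgs(feval, x, {maxIter = 100, lineSearch = optim.lswolfe})
print(fvals[1], fvals[#fvals])   -- objective before and after optimization
```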
116 |
117 |
118 |
119 | ## cg(opfunc, x[, config][, state])
120 |
121 | An implementation of the *Conjugate Gradient* method which is a rewrite of `minimize.m` written by Carl E. Rasmussen.
122 | It is supposed to produce exactly the same results (give or take numerical accuracy due to a changed order of operations).
123 | You can compare the result on rosenbrock with [`minimize.m`](http://www.gatsby.ucl.ac.uk/~edward/code/minimize/example.html).
124 |
125 | ```lua
126 | x, fx, c = minimize([0, 0]', 'rosenbrock', -25)
127 | ```
128 |
129 | Note that we limit only the number of function evaluations, as this seems much more important in practical use.
130 |
131 | Arguments:
132 |
133 | * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
134 | * `x`: the initial point
135 | * `config`: a table with configuration parameters for the optimizer
136 | * `config.maxEval`: max number of function evaluations
137 | * `config.maxIter`: max number of iterations
138 | * `state`: a table of parameters and temporary allocations.
139 | * `state.df[0, 1, 2, 3]`: if you pass `Tensor` they will be used for temp storage
140 | * `state.[s, x0]`: if you pass `Tensor` they will be used for temp storage
141 |
142 | Returns:
143 |
144 | * `x*`: the new `x` vector, at the optimal point
145 | * `f`: a table of all function values where
146 | * `f[1]` is the value of the function before any optimization and
147 | * `f[#f]` is the final fully optimized value, at `x*`
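
A minimal call sketch on an illustrative quadratic (not part of the package), limiting only the number of function evaluations as suggested above:

```lua
require 'torch'
require 'optim'

local function feval(x)
   local diff = x - torch.Tensor{3, -1}
   return diff:dot(diff), diff * 2           -- f(X), df/dX
end

local x0 = torch.zeros(2)
local xstar, fvals, nEvals = optim.cg(feval, x0, {maxEval = 100})
print(fvals[1], fvals[#fvals], nEvals)       -- f before, f after, evaluations used
```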
148 |
149 |
150 |
151 | ## adadelta(opfunc, x[, config][, state])
152 |
153 | *AdaDelta* implementation for *SGD* http://arxiv.org/abs/1212.5701.
154 |
155 | Arguments:
156 |
157 | * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
158 | * `x`: the initial point
159 | * `config`: a table of hyper-parameters
160 | * `config.rho`: interpolation parameter
161 | * `config.eps`: for numerical stability
162 | * `state`: a table describing the state of the optimizer; after each call the state is modified
163 | * `state.paramVariance`: vector of temporal variances of parameters
164 | * `state.accDelta`: vector of accumulated delta of gradients
165 |
166 | Returns:
167 |
168 | * `x*`: the new x vector
169 | * `f(x)`: the function, evaluated before the update
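
A brief sketch on an illustrative quadratic; note that, unlike plain *SGD*, *AdaDelta* is normally run without an explicit learning rate, only `rho` and `eps`:

```lua
require 'torch'
require 'optim'

local function feval(x)
   local diff = x - torch.Tensor{1, 2}
   return 0.5 * diff:dot(diff), diff
end

local x = torch.zeros(2)
local state = {rho = 0.9, eps = 1e-6}        -- one table used as both config and state
for i = 1, 500 do
   optim.adadelta(feval, x, state)
end
print(x)                                      -- x has moved toward the minimizer {1, 2}
```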
170 |
171 |
172 |
173 | ## adagrad(opfunc, x[, config][, state])
174 |
175 | *AdaGrad* implementation for *SGD*.
176 |
177 | Arguments:
178 |
179 | * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
180 | * `x`: the initial point
181 | * `config`: a table with configuration parameters for the optimizer
182 | * `config.learningRate`: learning rate
183 | * `config.learningRateDecay`: learning rate decay
184 | * `config.weightDecay`: weight decay coefficient for regularization
185 | * `state`: a table describing the state of the optimizer; after each call the state is modified
186 | * `state.paramVariance`: vector of temporal variances of parameters
187 |
188 | Returns:
189 |
190 | * `x*`: the new `x` vector
191 | * `f(x)`: the function, evaluated before the update
192 |
193 |
194 |
195 | ## adam(opfunc, x[, config][, state])
196 |
197 | An implementation of *Adam* from http://arxiv.org/pdf/1412.6980.pdf.
198 |
199 | Arguments:
200 |
201 | * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
202 | * `x`: the initial point
203 | * `config`: a table with configuration parameters for the optimizer
204 | * `config.learningRate`: learning rate
205 | * `config.learningRateDecay`: learning rate decay
206 | * `config.weightDecay`: weight decay coefficient for regularization
207 | * `config.beta1`: first moment coefficient
208 | * `config.beta2`: second moment coefficient
209 | * `config.epsilon`: for numerical stability
210 | * `state`: a table describing the state of the optimizer; after each call the state is modified
211 |
212 | Returns:
213 |
214 | * `x*`: the new x vector
215 | * `f(x)`: the function, evaluated before the update
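
A sketch using commonly seen hyper-parameter values (both the values and the quadratic `feval` are illustrative, not prescribed by the package):

```lua
require 'torch'
require 'optim'

local function feval(x)
   local diff = x - torch.Tensor{1, 2, 3}
   return 0.5 * diff:dot(diff), diff
end

local x = torch.zeros(3)
local config = {learningRate = 1e-2, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8}
local state  = {}                             -- holds the moment estimates across calls
for i = 1, 1000 do
   optim.adam(feval, x, config, state)
end
print(x)
```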
216 |
217 |
218 |
219 | ## adamax(opfunc, x[, config][, state])
220 |
221 | An implementation of *AdaMax* from http://arxiv.org/pdf/1412.6980.pdf.
222 |
223 | Arguments:
224 |
225 | * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
226 | * `x`: the initial point
227 | * `config`: a table with configuration parameters for the optimizer
228 | * `config.learningRate`: learning rate
229 | * `config.beta1`: first moment coefficient
230 | * `config.beta2`: second moment coefficient
231 | * `config.epsilon`: for numerical stability
232 | * `state`: a table describing the state of the optimizer; after each call the state is modified
233 |
234 | Returns:
235 |
236 | * `x*`: the new `x` vector
237 | * `f(x)`: the function, evaluated before the update
238 |
239 |
240 |
241 | ## FistaLS(f, g, pl, xinit[, params])
242 |
243 | *Fista* with backtracking *Line Search*:
244 |
245 | * `f`: smooth function
246 | * `g`: non-smooth function
247 | * `pl`: minimizer of intermediate problem Q(x, y)
248 | * `xinit`: initial point
249 | * `params`: table of parameters (**optional**)
250 | * `params.L`: 1/(step size) for ISTA/FISTA iteration (0.1)
251 | * `params.Lstep`: step size multiplier at each iteration (1.5)
252 | * `params.maxiter`: max number of iterations (50)
253 | * `params.maxline`: max number of line search iterations per iteration (20)
254 | * `params.errthres`: Error threshold for convergence check (1e-4)
255 | * `params.doFistaUpdate`: true : use FISTA, false: use ISTA (true)
256 | * `params.verbose`: store each iteration solution and print detailed info (false)
257 |
258 | On output, `params` will contain these additional fields that can be reused:
259 | * `params.L`: the last used `L` value is written back.
260 |
261 | These are temporary storages needed by the algorithm; if the same `params` object is
262 | passed a second time, the same storages are reused without new allocation:
263 | * `params.xkm`: previous iteration point
264 | * `params.y`: fista iteration
265 | * `params.ply`: `ply = pl(y - 1/L grad(f))`
266 |
267 | Returns the solution `x` and a history of `{function evaluations, number of line searches, ...}`.
268 |
269 | Algorithm is published in http://epubs.siam.org/doi/abs/10.1137/080716542
270 |
271 |
272 |
273 | ## nag(opfunc, x[, config][, state])
274 |
275 | An implementation of *SGD* adapted with features of *Nesterov's Accelerated Gradient method*, based on the paper "On the Importance of Initialization and Momentum in Deep Learning" (Sutskever et al., ICML 2013) http://www.cs.toronto.edu/~fritz/absps/momentum.pdf.
276 |
277 | Arguments:
278 |
279 | * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
280 | * `x`: the initial point
281 | * `config`: a table with configuration parameters for the optimizer
282 | * `config.learningRate`: learning rate
283 | * `config.learningRateDecay`: learning rate decay
284 | * `config.weightDecay`: weight decay
285 | * `config.momentum`: momentum
286 | * `config.learningRates`: vector of individual learning rates
287 |
288 | Returns:
289 |
290 | * `x*`: the new `x` vector
291 | * `f(x)`: the function, evaluated before the update
292 |
293 |
294 |
295 | ## rmsprop(opfunc, x[, config][, state])
296 |
297 | An implementation of *RMSprop*.
298 |
299 | Arguments:
300 |
301 | * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
302 | * `x`: the initial point
303 | * `config`: a table with configuration parameters for the optimizer
304 | * `config.learningRate`: learning rate
305 | * `config.alpha`: smoothing constant
306 | * `config.epsilon`: value with which to initialise m
307 | * `state`: a table describing the state of the optimizer; after each call the state is modified
308 | * `state.m`: leaky sum of squares of parameter gradients,
309 | * `state.tmp`: and the square root (with epsilon smoothing)
310 |
311 | Returns:
312 |
313 | * `x*`: the new x vector
314 | * `f(x)`: the function, evaluated before the update
315 |
316 |
317 |
318 | ## rprop(opfunc, x[, config][, state])
319 |
320 | A plain implementation of *Rprop* (Martin Riedmiller, Koray Kavukcuoglu 2013).
321 |
322 | Arguments:
323 |
324 | * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
325 | * `x`: the initial point
326 | * `config`: a table with configuration parameters for the optimizer
327 | * `config.stepsize`: initial step size, common to all components
328 | * `config.etaplus`: multiplicative increase factor, > 1 (default 1.2)
329 | * `config.etaminus`: multiplicative decrease factor, < 1 (default 0.5)
330 | * `config.stepsizemax`: maximum stepsize allowed (default 50)
331 | * `config.stepsizemin`: minimum stepsize allowed (default 1e-6)
332 | * `config.niter`: number of iterations (default 1)
333 |
334 | Returns:
335 |
336 | * `x*`: the new x vector
337 | * `f(x)`: the function, evaluated before the update
338 |
339 |
340 |
341 | ## cmaes(opfunc, x[, config][, state])
342 |
343 | An implementation of *CMAES* (*Covariance Matrix Adaptation Evolution Strategy*), ported from https://www.lri.fr/~hansen/barecmaes2.html.
344 |
345 | *CMAES* is a stochastic, derivative-free method for heuristic global optimization of non-linear or non-convex continuous optimization problems.
346 | Note that this method will, on average, require many more function evaluations to converge than a gradient-based method.
347 |
348 | Arguments:
349 |
350 | If `state` is specified, then `config` is not used at all.
351 | Otherwise `state` is `config`.
352 |
353 | * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`. Note that `df/dX` is not used and can be left 0
354 | * `x`: the initial point
355 | * `state`: a table describing the state of the optimizer; after each call the state is modified
356 | * `state.sigma`: float, initial step-size (standard deviation in each coordinate)
357 | * `state.maxEval`: int, maximal number of function evaluations
358 | * `state.ftarget`: float, target function value; stop if `fitness < ftarget`
359 | * `state.popsize`: population size. If this is left empty, `4 + int(3 * log(|x|))` will be used
361 | * `state.verb_disp`: display info on console every verb_disp iteration, 0 for never
362 |
363 | Returns:
364 | * `x*`: the new `x` vector, at the optimal point
365 | * `f`: a table of all function values:
366 | * `f[1]` is the value of the function before any optimization and
367 | * `f[#f]` is the final fully optimized value, at `x*`
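
A derivative-free usage sketch (the objective below is illustrative): since the gradient is ignored, `opfunc` only needs to return `f(X)`, and `x` must be a `Float`- or `DoubleTensor`.

```lua
require 'torch'
require 'optim'

local function opfunc(x)
   local diff = x - torch.Tensor{0.5, -0.5, 1.0}
   return diff:dot(diff)                      -- df/dX is not used, so it is omitted here
end

local x0 = torch.rand(3)                      -- DoubleTensor, as required
local state = {sigma = 0.5, maxEval = 5000, verb_disp = 0}
local xbest, fhist, nEvals = optim.cmaes(opfunc, x0, nil, state)
print(fhist[#fhist], nEvals)                  -- best value found and evaluations used
```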
368 |
--------------------------------------------------------------------------------