├── .gitignore ├── probls ├── __init__.py ├── tensorflow_interface │ ├── __init__.py │ ├── gradient_moment.py │ └── interface_sgd.py ├── utils.py ├── line_search.py └── gaussian_process.py ├── examples ├── models │ ├── __init__.py │ ├── mnist_mlp.py │ ├── mnist_2conv_2dense.py │ └── cifar10_2conv_3dense.py ├── run_probls_cifar10.py ├── run_probls_mnist.py ├── run_probls_mnist_interactive.py └── cifar10.py ├── test ├── demo_gaussian_process.py ├── test_utils.py ├── test_gaussian_process.py ├── demo_interface_sgd.py └── test_gradient_moment.py ├── README.md └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | examples/data 3 | -------------------------------------------------------------------------------- /probls/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Nov 16 15:46:48 2016 4 | 5 | @author: lballes 6 | """ 7 | 8 | -------------------------------------------------------------------------------- /probls/tensorflow_interface/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Nov 16 15:46:48 2016 4 | 5 | @author: lballes 6 | """ 7 | 8 | -------------------------------------------------------------------------------- /examples/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Nov 27 11:59:40 2016 4 | 5 | @author: Lukas Balles [lballes@tuebingen.mpg.de] 6 | """ 7 | -------------------------------------------------------------------------------- /examples/models/mnist_mlp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | TensorFlow MNIST MLP model. 4 | """ 5 | 6 | import tensorflow as tf 7 | 8 | def weight_variable(shape): 9 | initial = tf.truncated_normal(shape, stddev=1e-2) 10 | return tf.Variable(initial) 11 | 12 | def bias_variable(shape): 13 | initial = tf.constant(0.05, shape=shape) 14 | return tf.Variable(initial) 15 | 16 | def set_up_model(): 17 | tf.reset_default_graph() 18 | X = tf.placeholder(tf.float32, shape=[None, 784]) 19 | y = tf.placeholder(tf.float32, shape=[None, 10]) 20 | W_fc1 = weight_variable([784, 800]) 21 | b_fc1 = bias_variable([800]) 22 | h_fc1 = tf.nn.sigmoid(tf.matmul(X, W_fc1) + b_fc1) 23 | W_fc2 = weight_variable([800, 10]) 24 | b_fc2 = bias_variable([10]) 25 | h_fc2 = tf.nn.softmax(tf.matmul(h_fc1, W_fc2) + b_fc2) 26 | losses = -tf.reduce_sum(y*tf.log(h_fc2), reduction_indices=[1]) 27 | return losses, [X, y], [W_fc1, b_fc1, W_fc2, b_fc2] 28 | -------------------------------------------------------------------------------- /test/demo_gaussian_process.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Demo for Gaussian process functionality in probls.gaussian_process. 
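Adds a few noisy (f, df) observations to a ``ProbLSGaussianProcess``, plots the posterior over the function, its derivative and the expected improvement, and marks the candidate points returned by ``gp.find_dmu_equal``.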
4 | 5 | Created on Thu Nov 17 16:58:40 2016 6 | 7 | @author: lballes 8 | """ 9 | 10 | import os 11 | import sys 12 | sys.path.insert(0, os.path.abspath('..')) 13 | 14 | import numpy as np 15 | import matplotlib.pyplot as plt 16 | import time 17 | 18 | from probls import gaussian_process 19 | 20 | 21 | # Specify noise levels and observations 22 | fvar, dfvar = 3e-1, 1e-2 23 | observations = [(0., 0., -1.), (1., -0.5, -0.9), (2., -0.9, 0.7)] 24 | 25 | # Add observations to GP, compute posterior mean and variance 26 | gp = gaussian_process.ProbLSGaussianProcess() 27 | for obs in observations: 28 | gp.add(*obs, fvar=fvar, dfvar=dfvar) 29 | beg = time.time() 30 | gp.update() 31 | print "gp.update() took", (time.time()-beg)*10**6, "microseconds" 32 | 33 | tt = np.arange(-0.1, 4.0, 0.01) 34 | 35 | 36 | fig, (a1, a2, a3) = plt.subplots(3, 1) 37 | gp.visualize_f(a1) 38 | gp.visualize_df(a2) 39 | gp.visualize_ei(a3) 40 | 41 | # Find the minima and add them to the plot 42 | minima = gp.find_dmu_equal(0.2) 43 | a1.plot(minima, [gp.mu(m) for m in minima], 'D') 44 | a2.plot(minima, [gp.dmu(m) for m in minima], 'D') -------------------------------------------------------------------------------- /examples/run_probls_cifar10.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Run probabilistic line search on a CIFAR-10 example. 4 | """ 5 | 6 | import os 7 | import sys 8 | sys.path.insert(0, os.path.abspath('..')) 9 | 10 | import tensorflow as tf 11 | 12 | from probls.tensorflow_interface.interface_sgd import ProbLSOptimizerSGDInterface 13 | from probls.line_search import ProbLSOptimizer 14 | 15 | import cifar10 16 | 17 | #### Specify training specifics here ########################################## 18 | from models import cifar10_2conv_3dense as model 19 | num_steps = 4000 20 | batch_size = 256 21 | ############################################################################### 22 | 23 | 24 | # Set up model 25 | tf.reset_default_graph() 26 | images, labels = cifar10.distorted_inputs(batch_size=batch_size) 27 | losses, variables = model.set_up_model(images, labels) 28 | 29 | # Set up ProbLS optimizer 30 | opt_interface = ProbLSOptimizerSGDInterface() 31 | opt_interface.minimize(losses, variables) 32 | sess = tf.Session() 33 | opt_interface.register_session(sess) 34 | opt_ls = ProbLSOptimizer(opt_interface, alpha0=1e-3, cW=0.3, c1=0.05, 35 | target_df=0.5, df_lo=-0.1, df_hi=1.1, expl_policy="linear", fpush=1.0, 36 | max_change_factor=10., max_steps=10, max_expl=10, max_dmu0=0.0) 37 | 38 | # Initialize variables and start queues 39 | coord = tf.train.Coordinator() 40 | sess.run(tf.global_variables_initializer()) 41 | threads = tf.train.start_queue_runners(sess=sess, coord=coord) 42 | 43 | # Run ProbLS 44 | opt_ls.prepare() 45 | for i in range(num_steps): 46 | print(opt_ls.proceed()) 47 | 48 | # Stop queues 49 | coord.request_stop() 50 | coord.join(threads) -------------------------------------------------------------------------------- /examples/models/mnist_2conv_2dense.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | TensorFlow MNIST CNN model. 
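``set_up_model()`` builds a network with two convolutional and two dense layers and returns the vector of per-example cross-entropy losses, the ``[X, y]`` placeholders and the list of trainable variables, as consumed by the example scripts in ``examples/``.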
4 | """ 5 | 6 | import tensorflow as tf 7 | 8 | def weight_variable(shape): 9 | initial = tf.truncated_normal(shape, stddev=1e-2) 10 | return tf.Variable(initial) 11 | 12 | def bias_variable(shape): 13 | initial = tf.constant(0.05, shape=shape) 14 | return tf.Variable(initial) 15 | 16 | def conv2d(x, W): 17 | return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME') 18 | 19 | def max_pool_2x2(x): 20 | return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], 21 | strides=[1, 2, 2, 1], padding='SAME') 22 | 23 | def set_up_model(): 24 | tf.reset_default_graph() 25 | X = tf.placeholder(tf.float32, shape=[None, 784]) 26 | y = tf.placeholder(tf.float32, shape=[None, 10]) 27 | W_conv1 = weight_variable([5, 5, 1, 32]) 28 | b_conv1 = bias_variable([32]) 29 | X_image = tf.reshape(X, [-1,28,28,1]) 30 | h_conv1 = tf.nn.relu(conv2d(X_image, W_conv1) + b_conv1) 31 | h_pool1 = max_pool_2x2(h_conv1) 32 | W_conv2 = weight_variable([5, 5, 32, 64]) 33 | b_conv2 = bias_variable([64]) 34 | h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2) 35 | h_pool2 = max_pool_2x2(h_conv2) 36 | W_fc1 = weight_variable([7 * 7 * 64, 1024]) 37 | b_fc1 = bias_variable([1024]) 38 | h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64]) 39 | h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1) 40 | W_fc2 = weight_variable([1024, 10]) 41 | b_fc2 = bias_variable([10]) 42 | h_fc2 = tf.nn.softmax(tf.matmul(h_fc1, W_fc2) + b_fc2) 43 | losses = -tf.reduce_sum(y*tf.log(h_fc2), reduction_indices=[1]) 44 | return losses, [X, y], [W_conv1, b_conv1, W_conv2, b_conv2, W_fc1, b_fc1, W_fc2, b_fc2] 45 | -------------------------------------------------------------------------------- /examples/run_probls_mnist.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Run probabilistic line search on a MNIST example. 
4 | """ 5 | 6 | import os 7 | import sys 8 | sys.path.insert(0, os.path.abspath('..')) 9 | 10 | import tensorflow as tf 11 | from tensorflow.examples.tutorials.mnist import input_data 12 | mnist = input_data.read_data_sets('data/mnist', one_hot=True) 13 | 14 | from probls.tensorflow_interface.interface_sgd import ProbLSOptimizerSGDInterface 15 | from probls.line_search import ProbLSOptimizer 16 | 17 | #### Specify training specifics here ########################################## 18 | from models import mnist_2conv_2dense as model # Comment/uncomment to chose 19 | #from models import mnist_mlp as model # the model to run 20 | num_steps = 4000 21 | batch_size = 256 22 | ############################################################################### 23 | 24 | 25 | # Set up model 26 | losses, placeholders, variables = model.set_up_model() 27 | X, y = placeholders 28 | 29 | # Set up ProbLS optimizer 30 | opt_interface = ProbLSOptimizerSGDInterface() 31 | opt_interface.minimize(losses, variables) 32 | sess = tf.Session() 33 | opt_interface.register_session(sess) 34 | opt_ls = ProbLSOptimizer(opt_interface, alpha0=1e-3, cW=0.3, c1=0.05, 35 | target_df=0.5, df_lo=-0.1, df_hi=1.1, expl_policy="linear", fpush=1.0, 36 | max_change_factor=10., max_steps=10, max_expl=10, max_dmu0=0.0) 37 | 38 | # Initialize variables 39 | sess.run(tf.global_variables_initializer()) 40 | 41 | # Run ProbLS 42 | batch = mnist.train.next_batch(batch_size) 43 | opt_ls.prepare({X: batch[0], y: batch[1]}) 44 | for i in range(num_steps): 45 | batch = mnist.train.next_batch(batch_size) 46 | print(opt_ls.proceed({X: batch[0], y: batch[1]})) -------------------------------------------------------------------------------- /examples/models/cifar10_2conv_3dense.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Aug 4 11:05:44 2016 4 | 5 | @author: lballes 6 | """ 7 | 8 | import tensorflow as tf 9 | 10 | def weight_variable(shape, stddev=1e-2): 11 | initial = tf.truncated_normal(shape, stddev=stddev) 12 | return tf.Variable(initial) 13 | 14 | def bias_variable(shape, val=0.05): 15 | initial = tf.constant(val, shape=shape) 16 | return tf.Variable(initial) 17 | 18 | def conv2d(x, W): 19 | return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME') 20 | 21 | def max_pool_3x3(x): 22 | return tf.nn.max_pool(x, ksize=[1, 3, 3, 1], 23 | strides=[1, 2, 2, 1], padding='SAME') 24 | 25 | def set_up_model(images, labels): 26 | W_conv1 = weight_variable([5, 5, 3, 64], 5e-2) 27 | b_conv1 = bias_variable([64], 0.0) 28 | h_conv1 = tf.nn.relu(conv2d(images, W_conv1) + b_conv1) 29 | h_conv1_pool = max_pool_3x3(h_conv1) 30 | 31 | W_conv2 = weight_variable([5, 5, 64, 64], 5e-2) 32 | b_conv2 = bias_variable([64], 0.1) 33 | h_conv2 = tf.nn.relu(conv2d(h_conv1_pool, W_conv2) + b_conv2) 34 | h_conv2_pool = max_pool_3x3(h_conv2) 35 | 36 | batch_size = tf.gather(tf.shape(images), 0) 37 | reshape = tf.reshape(h_conv2_pool, tf.pack([batch_size, -1])) 38 | dim = 2304 39 | W_fc1 = weight_variable([dim, 384], 0.04) 40 | b_fc1 = bias_variable([384], 0.1) 41 | h_fc1 = tf.nn.relu(tf.matmul(reshape, W_fc1) + b_fc1) 42 | 43 | W_fc2 = weight_variable([384, 192], 0.04) 44 | b_fc2 = bias_variable([192], 0.1) 45 | h_fc2 = tf.nn.relu(tf.matmul(h_fc1, W_fc2) + b_fc2) 46 | 47 | W_fc3 = weight_variable([192, 10], 1/192.0) 48 | b_fc3 = bias_variable([10], 0.0) 49 | h_fc3 = tf.matmul(h_fc2, W_fc3) + b_fc3 50 | 51 | labels = tf.cast(labels, tf.int64) 52 | losses = 
tf.nn.sparse_softmax_cross_entropy_with_logits(h_fc3, labels) 53 | return losses, [W_conv1, b_conv1, W_conv2, b_conv2, W_fc1, b_fc1, W_fc2, b_fc2, W_fc3, b_fc3] 54 | -------------------------------------------------------------------------------- /test/test_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Tests for utility functions in probls.utils 4 | 5 | Created on Wed Jul 6 16:12:54 2016 6 | 7 | @author: lballes 8 | """ 9 | 10 | import os 11 | import sys 12 | sys.path.insert(0, os.path.abspath('..')) 13 | 14 | import unittest 15 | import numpy as np 16 | 17 | 18 | from probls import utils 19 | cdf = utils._cdf 20 | bvnu = utils.unbounded_bivariate_normal_integral 21 | bvn = utils.bounded_bivariate_normal_integral 22 | 23 | 24 | class TestCDF(unittest.TestCase): 25 | 26 | def runTest(self): 27 | self.assertEqual(utils._cdf(0.), 0.5) 28 | self.assertAlmostEqual(cdf(1.), 0.8413, places=4) 29 | self.assertAlmostEqual(cdf(3.), 0.9987, places=4) 30 | self.assertAlmostEqual(cdf(-1.), 0.1587, places=4) 31 | self.assertAlmostEqual(cdf(-0.1), 0.4602, places=4) 32 | 33 | 34 | class TestUnboundedIntegral(unittest.TestCase): 35 | 36 | def runTest(self): 37 | self.assertEqual(bvnu(0., 0., 0.), 0.25) 38 | self.assertEqual(bvnu(1., 0., 0.), 0.5) 39 | self.assertAlmostEqual(bvnu(0.43, 2.5, -1.0), 0.0062, places=4) 40 | self.assertAlmostEqual(bvnu(-0.17, 0.5, 1.0), 0.0351, places=4) 41 | self.assertAlmostEqual(bvnu(0., 0.5, 1.0), 0.0490, places=4) 42 | self.assertAlmostEqual(bvnu(-1., 0., -3.), 0.4987, places=4) 43 | self.assertAlmostEqual(bvnu(0., -5., -5.), 1., places=4) 44 | self.assertAlmostEqual(bvnu(0., 5., 5.), 0., places=4) 45 | self.assertAlmostEqual(bvnu(1., 3., 3.), 0.0013, places=4) 46 | self.assertAlmostEqual(bvnu(-1., 0., 0.), 0., places=4) 47 | 48 | class TestBoundedIntegral(unittest.TestCase): 49 | 50 | def runTest(self): 51 | self.assertAlmostEqual(bvn(0.25, 0., 2.5, -1.2, 0.1), 0.1901, places=4) 52 | self.assertAlmostEqual(bvn(0., 0., 1., 0., 1.), 0.1165, places=4) 53 | self.assertAlmostEqual(bvn(0.5, 0., 1., 0., 1.), 0.1411, places=4) 54 | self.assertAlmostEqual(bvn(0.5, 0., np.inf, 0., 1.), 0.2059, places=4) 55 | 56 | 57 | if __name__ == "__main__": 58 | unittest.main() -------------------------------------------------------------------------------- /examples/run_probls_mnist_interactive.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Run probabilistic line search on an MNIST CNN in interactive mode. Displays 4 | visualizations of function value f, derivative df, and the probabilistc wolfe 5 | conditions and expected improvement. Click to proceed one function evaluation. 
6 | """ 7 | 8 | import os 9 | import sys 10 | sys.path.insert(0, os.path.abspath('..')) 11 | 12 | import matplotlib.pyplot as plt 13 | import tensorflow as tf 14 | from tensorflow.examples.tutorials.mnist import input_data 15 | mnist = input_data.read_data_sets('data/mnist', one_hot=True) 16 | 17 | from probls.tensorflow_interface.interface_sgd import ProbLSOptimizerSGDInterface 18 | from probls.line_search import ProbLSOptimizer 19 | 20 | #### Specify training specifics here ########################################## 21 | #from models import mnist_2conv_2dense as model # Comment/uncomment to chose 22 | from models import mnist_mlp as model # the model to run 23 | num_steps = 4000 24 | batch_size = 128 25 | ############################################################################### 26 | 27 | # Set up model 28 | losses, placeholders, variables = model.set_up_model() 29 | X, y = placeholders 30 | 31 | # Set up ProbLS optimizer 32 | opt_interface = ProbLSOptimizerSGDInterface() 33 | opt_interface.minimize(losses, variables) 34 | sess = tf.Session() 35 | opt_interface.register_session(sess) 36 | sess.run(tf.global_variables_initializer()) 37 | opt_ls = ProbLSOptimizer(opt_interface, cW=0.3, c1=0.05, target_df=0.5, 38 | df_lo=-0.1, df_hi=1.1, expl_policy="linear", 39 | fpush=1.0, max_change_factor=10., max_steps=10, 40 | max_expl=10, max_dmu0=100.0) 41 | batch = mnist.train.next_batch(batch_size) 42 | opt_ls.prepare({X: batch[0], y: batch[1]}) 43 | 44 | # Run 45 | plt.figure() 46 | for i in range(num_steps): 47 | plt.clf() 48 | batch = mnist.train.next_batch(batch_size) 49 | print opt_ls.proceed({X: batch[0], y: batch[1]}) 50 | opt_ls.gp.visualize_f(plt.subplot(3, 1, 1)) 51 | plt.subplot(3, 1, 1).set_ylabel("f") 52 | opt_ls.gp.visualize_df(plt.subplot(3, 1, 2)) 53 | plt.subplot(3, 1, 2).set_ylabel("df") 54 | opt_ls.visualize_ei_pw(plt.subplot(3, 1, 3)) 55 | plt.subplot(3, 1, 3).set_ylabel("p_Wolfe / EI") 56 | plt.show() 57 | plt.waitforbuttonpress() -------------------------------------------------------------------------------- /test/test_gaussian_process.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Test for Gaussian process implementation in probls.gaussian_process 4 | 5 | Created on Fri Jul 1 09:51:00 2016 6 | 7 | @author: lballes 8 | """ 9 | 10 | import os 11 | import sys 12 | sys.path.insert(0, os.path.abspath('..')) 13 | 14 | import unittest 15 | import numpy as np 16 | 17 | 18 | from probls import gaussian_process 19 | 20 | 21 | class TestSolveQuadraticPolynomial(unittest.TestCase): 22 | """Test the ``quadratic_polynomial_solve`` function of 23 | ``probls.gaussian_process`` with a few hand-computed polynomials.""" 24 | 25 | def setUp(self): 26 | self.solve = gaussian_process.quadratic_polynomial_solve 27 | 28 | def runTest(self): 29 | self.assertListEqual(self.solve(1., 0., 0., -3.5), []) 30 | self.assertListEqual(self.solve(1., 0., 0., 0.), []) 31 | self.assertListEqual(self.solve(2., -4., 0., 0.), [2.]) 32 | self.assertListEqual(self.solve(1., 3., -2., -4.), [-1.]) 33 | self.assertListEqual(self.solve(2., 0.5, 4., -8.0), []) 34 | 35 | 36 | class TestKernelFunctions(unittest.TestCase): 37 | 38 | def setUp(self): 39 | self.gp = gaussian_process.ProbLSGaussianProcess() 40 | 41 | def runTest(self): 42 | 43 | # Test kernel function with hand-computed values 44 | self.assertEqual(self.gp.k(3.5, 1.), 11.**3/3. 
+ 0.5*2.5*11.**2) 45 | self.assertEqual(self.gp.k(2., 3.), 12.**3/3.+.5*12.**2) 46 | self.assertEqual(self.gp.dkd(1., 2.0), 11.) 47 | self.assertEqual(self.gp.dkd(-2., -1.), 8.) 48 | 49 | # Test if one-to-one computations give the same result as one-to-many 50 | # computations 51 | t, T = np.random.rand(), np.random.rand(10) 52 | for fun in [self.gp.k, self.gp.kd, self.gp.dkd, self.gp.d2k, self.gp.d2kd, self.gp.d3k]: 53 | res = fun(t, T) 54 | for i, tt in enumerate(T): 55 | self.assertEqual(fun(t, tt), res[i]) 56 | 57 | 58 | class TestNoiseFree(unittest.TestCase): 59 | """Test whether posterior mean equals observations in the noise-free case.""" 60 | 61 | def setUp(self): 62 | self.gp = gaussian_process.ProbLSGaussianProcess() 63 | 64 | def runTest(self): 65 | ts, fs, dfs = np.random.randn(10), np.random.randn(10), np.random.randn(10) 66 | for i in range(10): 67 | self.gp.add(ts[i], fs[i], dfs[i]) 68 | self.gp.update() 69 | for i in range(10): 70 | t, f, df = ts[i], fs[i], dfs[i] 71 | self.assertLess(self.gp.V(t), 1e-9) 72 | self.assertAlmostEqual(self.gp.mu(t), f, places=3) 73 | self.assertAlmostEqual(self.gp.dmu(t), df, places=3) 74 | 75 | 76 | if __name__ == "__main__": 77 | unittest.main() -------------------------------------------------------------------------------- /test/demo_interface_sgd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Demo script for the tensorflow SGD interface. Uses the interface to perform 4 | SGD on an MNIST CNN by repeatedly calling ``opt_interface.adv_eval(lr)`` and 5 | ``accept()``. 6 | 7 | Created on Fri Nov 25 16:36:07 2016 8 | 9 | @author: Lukas Balles [lballes@tuebingen.mpg.de] 10 | """ 11 | 12 | import os 13 | import sys 14 | sys.path.insert(0, os.path.abspath('..')) 15 | 16 | import tensorflow as tf 17 | from tensorflow.examples.tutorials.mnist import input_data 18 | import numpy as np 19 | 20 | from probls.tensorflow_interface import interface_sgd 21 | 22 | 23 | def weight_variable(shape): 24 | initial = tf.truncated_normal(shape, stddev=1e-2) 25 | return tf.Variable(initial) 26 | 27 | def bias_variable(shape): 28 | initial = tf.constant(0.05, shape=shape) 29 | return tf.Variable(initial) 30 | 31 | def conv2d(x, W): 32 | return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME') 33 | 34 | def max_pool_2x2(x): 35 | return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], 36 | strides=[1, 2, 2, 1], padding='SAME') 37 | 38 | # Set up model 39 | tf.reset_default_graph() 40 | X = tf.placeholder(tf.float32, shape=[None, 784]) 41 | y = tf.placeholder(tf.float32, shape=[None, 10]) 42 | W_conv1 = weight_variable([5, 5, 1, 32]) 43 | b_conv1 = bias_variable([32]) 44 | X_image = tf.reshape(X, [-1,28,28,1]) 45 | h_conv1 = tf.nn.relu(conv2d(X_image, W_conv1) + b_conv1) 46 | h_pool1 = max_pool_2x2(h_conv1) 47 | W_conv2 = weight_variable([5, 5, 32, 64]) 48 | b_conv2 = bias_variable([64]) 49 | h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2) 50 | h_pool2 = max_pool_2x2(h_conv2) 51 | W_fc1 = weight_variable([7 * 7 * 64, 1024]) 52 | b_fc1 = bias_variable([1024]) 53 | h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64]) 54 | h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1) 55 | W_fc2 = weight_variable([1024, 10]) 56 | b_fc2 = bias_variable([10]) 57 | h_fc2 = tf.nn.softmax(tf.matmul(h_fc1, W_fc2) + b_fc2) 58 | losses = -tf.reduce_sum(y*tf.log(h_fc2), reduction_indices=[1]) 59 | var_list = [W_conv1, b_conv1, W_conv2, b_conv2, W_fc1, b_fc1, W_fc2, b_fc2] 60 | 61 | # Initialize interface 62 
| opt_interface = interface_sgd.ProbLSOptimizerSGDInterface() 63 | opt_interface.minimize(losses, var_list) 64 | 65 | # Create session and initialize variables 66 | sess = tf.Session() 67 | sess.run(tf.initialize_all_variables()) 68 | opt_interface.register_session(sess) 69 | 70 | # Get data ready 71 | mnist = input_data.read_data_sets("MNIST_data/", one_hot=True) 72 | m = 128 73 | 74 | # Call prepare 75 | batch = mnist.train.next_batch(m) 76 | Xb, yb = batch[0], batch[1] 77 | opt_interface.prepare({X: Xb, y: yb}) 78 | 79 | # Run SGD steps 80 | lr = 0.1 81 | for i in range(1000): 82 | print opt_interface.adv_eval(lr, {X: Xb, y: yb}) 83 | print opt_interface.accept() -------------------------------------------------------------------------------- /test/test_gradient_moment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Tests for gradient moment computation in 4 | probls.tensorflow_interface.gradient_moment 5 | 6 | Created on Wed Nov 23 17:09:34 2016 7 | 8 | @author: Lukas Balles [lballes@tuebingen.mpg.de] 9 | """ 10 | 11 | import os 12 | import sys 13 | sys.path.insert(0, os.path.abspath('..')) 14 | 15 | import numpy as np 16 | import unittest 17 | import tensorflow as tf 18 | from tensorflow.examples.tutorials.mnist import input_data 19 | from probls.tensorflow_interface import gradient_moment as gm 20 | 21 | def weight_variable(shape): 22 | initial = tf.truncated_normal(shape, stddev=1e-2) 23 | return tf.Variable(initial) 24 | 25 | def bias_variable(shape): 26 | initial = tf.constant(0.05, shape=shape) 27 | return tf.Variable(initial) 28 | 29 | def conv2d(x, W): 30 | return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME') 31 | 32 | def max_pool_2x2(x): 33 | return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], 34 | strides=[1, 2, 2, 1], padding='SAME') 35 | 36 | class TestGradientMomentFullyConnected(unittest.TestCase): 37 | """Test.""" 38 | 39 | def setUp(self): 40 | # Set up model 41 | tf.reset_default_graph() 42 | X = tf.placeholder(tf.float32, shape=[None, 784]) 43 | y = tf.placeholder(tf.float32, shape=[None, 10]) 44 | W_fc1 = weight_variable([784, 1024]) 45 | b_fc1 = bias_variable([1024]) 46 | h_fc1 = tf.nn.relu(tf.matmul(X, W_fc1) + b_fc1) 47 | W_fc2 = weight_variable([1024, 10]) 48 | b_fc2 = bias_variable([10]) 49 | h_fc2 = tf.nn.softmax(tf.matmul(h_fc1, W_fc2) + b_fc2) 50 | losses = -tf.reduce_sum(y*tf.log(h_fc2), reduction_indices=[1]) 51 | 52 | self.loss = tf.reduce_mean(losses) 53 | self.batch_size = tf.cast(tf.gather(tf.shape(losses), 0), tf.float32) 54 | self.var_list = [W_fc1, b_fc1, W_fc2, b_fc2] 55 | self.X = X 56 | self.y = y 57 | 58 | self.sess = tf.Session() 59 | self.sess.run(tf.initialize_all_variables()) 60 | 61 | self.mnist = input_data.read_data_sets("MNIST_data/", one_hot=True) 62 | 63 | def runTest(self): 64 | grads, grad_moms = gm.grads_and_grad_moms(self.loss, self.batch_size, 65 | self.var_list) 66 | # Check shapes 67 | for v, g, mom in zip(self.var_list, grads, grad_moms): 68 | self.assertEqual(v.get_shape(), g.get_shape()) 69 | self.assertEqual(v.get_shape(), mom.get_shape()) 70 | 71 | # Check against manual computation of moment 72 | m = 10 73 | batch = self.mnist.train.next_batch(m) 74 | Xb, yb = batch[0], batch[1] 75 | indiv_grads = [] 76 | for i in range(m): 77 | gs = self.sess.run(grads, feed_dict={self.X: Xb[[i],:], self.y: yb[[i],:]}) 78 | indiv_grads.append(gs) 79 | indiv_grads_arr = [np.stack([indiv_grads[i][j] for i in range(m)], axis=0) for j in 
range(len(self.var_list))] 80 | grads_manual = [np.mean(gs_var, axis=0) for gs_var in indiv_grads_arr] 81 | grad_moms_manual = [np.mean(gs_var**2, axis=0) for gs_var in indiv_grads_arr] 82 | grads_impl, grad_moms_impl = self.sess.run([grads, grad_moms], feed_dict={self.X: Xb, self.y: yb}) 83 | for grm, gri in zip(grads_manual, grads_impl): 84 | self.assertTrue(np.allclose(grm, gri, rtol=1e-4)) 85 | for gmm, gmi in zip(grad_moms_manual, grad_moms_impl): 86 | self.assertTrue(np.allclose(gmm, gmi, rtol=1e-4)) 87 | 88 | if __name__ == "__main__": 89 | unittest.main() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Probabilistic Line Search 2 | 3 | This is a Python implementation of a _Probabilistic Line Searches for Stochastic 4 | Optimization_ ([NIPS paper][1], [extended version][3]) plus a TensorFlow interface that allows you to use the line 5 | search to train your TensorFlow model. **Please note: this is a development version with multiple experimental changes compared to the original paper!** 6 | 7 | ## The Algorithm in a Nutshell 8 | The probabilistic line search is an algorithm for the optimization of a 9 | stochastic objective function F. Being at point x and having fixed a search 10 | direction d, it maintains a Gaussian process model for the one-dimensional 11 | function f(t) = F(x + td). This function and its derivative are evaluated at 12 | (possibly multiple) step sizes t, updating the GP after each observation. This 13 | is repeated until a _probabilistic belief_ over a quality criterion of the step 14 | size, implied by the GP, exceeds a certain threshold. 15 | 16 | ## Installation 17 | 18 | No installation is required, just clone this git repositiory to your machine. 19 | 20 | Requirements: 21 | - tensorflow (0.12.0 is known to work) 22 | - numpy (1.11.2 is known to work) 23 | - scipy (0.13.3 is known to work) 24 | - Some of the demo scripts require additional packages, like sys, os, matplotlib 25 | et cetera. 26 | 27 | ## Usage 28 | 29 | The built-in TensorFlow optimizers are used roughly like this 30 | 31 | ```python 32 | var_list = ... 33 | losses = ... # A vector of losses, one for each example in the batch 34 | 35 | loss = tf.mean(losses) 36 | opt = tf.train.GradientDescentOptimizer(learning_rate) 37 | sgd_step = opt.minimize(loss) 38 | sess = tf.Session() 39 | sess.run(tf.initialize_all_variables()) 40 | 41 | for i in range(num_steps): 42 | ... 43 | sess.run(sgd_step, feed_dict_if_applicable) 44 | ``` 45 | 46 | Usage is slightly different for the probabilistic line search optimizer, but its only five additional lines of code: 47 | 48 | ```python 49 | from probls.tensorflow_interface.interface_sgd import ProbLSOptimizerSGDInterface 50 | from probls.line_search import ProbLSOptimizer 51 | 52 | var_list = ... 53 | losses = ... # A vector of losses, one for each example in the batch 54 | 55 | opt_interface = ProbLSOptimizerSGDInterface() 56 | opt_interface.minimize(losses, var_list) # Note that we pass losses, not an aggregate mean loss 57 | sess = tf.Session() 58 | sess.run(tf.initialize_all_variables()) 59 | opt_interface.register_session(sess) 60 | opt_ls = ProbLSOptimizer(opt_interface) 61 | opt_ls.prepare(feed_dict_if_applicable) 62 | 63 | for i in range(num_steps): 64 | ... 65 | opt_ls.proceed(feed_dict_if_applicable) 66 | ``` 67 | 68 | The effects of these individual commands will become clear in the next section. 
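For a concrete run with a ``feed_dict``, ``examples/run_probls_mnist.py`` boils down to roughly the following sketch (it assumes, as the bundled models in ``examples/models`` do, that ``set_up_model()`` returns the per-example losses, the input placeholders and the variable list):

```python
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

from probls.tensorflow_interface.interface_sgd import ProbLSOptimizerSGDInterface
from probls.line_search import ProbLSOptimizer
from models import mnist_mlp as model

mnist = input_data.read_data_sets('data/mnist', one_hot=True)
num_steps, batch_size = 4000, 256

# Per-example losses, input placeholders and trainable variables
losses, [X, y], var_list = model.set_up_model()

opt_interface = ProbLSOptimizerSGDInterface()
opt_interface.minimize(losses, var_list)   # vector of losses, not the mean loss
sess = tf.Session()
opt_interface.register_session(sess)
sess.run(tf.global_variables_initializer())

opt_ls = ProbLSOptimizer(opt_interface)    # default design parameters, no learning rate
batch = mnist.train.next_batch(batch_size)
opt_ls.prepare({X: batch[0], y: batch[1]})

for i in range(num_steps):
    batch = mnist.train.next_batch(batch_size)
    opt_ls.proceed({X: batch[0], y: batch[1]})
```
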
69 | See the ``examples/`` folder for working demo scripts. 70 | 71 | 72 | ## Quick Guide to this Implementation 73 | 74 | This implementation consists of two major components: 75 | - A line search class (``ProbLSOptimizer``). It performs the line search, i.e. it gathers observations, updates the GP model, decides where to evaluate next, et cetera. The ``ProbLSOptimizer`` takes as argument a ``func`` object that is the "interface" to the objective function. It assumes that this interface has certain methods for evaluating at new points or accepting the current one; see below. 76 | - The TensorFlow interface ``ProbLSOptimizerSGDInterface``. This can be used as the ``func`` argument for a ``ProbLSOptimizer`` and provides the necessary interface to use the line search to train your TensorFlow model. 77 | 78 | ### Line Search 79 | 80 | The ``ProbLSOptimizer`` class is implemented in ``probls.line_search``. It 81 | excepts a ``func`` argument which acts as the interface to the objective function. 82 | It is assumend that ``func`` has three methods: 83 | - ``f, df, fvar, dfvar = func.adv_eval(dt, *args)`` to proceed along the current search 84 | direction by an increment ``dt``, returning function value ``f``, projected gradient ``df`` 85 | and variance estimates for both (``fvar, dfvar``). 86 | - ``f, df, fvar, dfvar = func.accept()`` to accept the current step size, 87 | returning function value, projected gradients and an estimate of the variance 88 | of these two quantities (``df`` and ``dfvar`` with respect to the new search direction). 89 | - ``f, df, fvar, dfvar = func.prepare(*args)`` to prepare the interface returning an 90 | initial observation. 91 | 92 | ``*args`` are additional positional arguments, e.g.. an optional feed_dict in the case the TensorFlow interface; see below. 93 | The line search algorithm "communicates" with the objective function exclusively via these three methods. 94 | 95 | Other than ``func``, ``ProbLSOptimizer`` has no required arguments, most notably, no learning rate! 96 | The remaining arguments are design parameters of the line search algorithm. See the docstring of ``ProbLSOptimizer`` a description of these parameters. 97 | 98 | ``opt_ls`` has two methods that are of interest for the end-user. 99 | - ``opt_ls.prepare(*pass_to_func_args)`` has to be called once to initialize the line search. 100 | - ``opt_ls.proceed(*pass_to_func_args)`` proceeds one step in the line search (i.e. one 101 | function evaluation). We call this method for however many steps we want to train the model. This is where 102 | the actual line search happens, so check out its code (and that of the subroutines it calls) to get an idea of what is going on! 103 | 104 | The Gaussian process functionality needed in the line search is outsourced to 105 | ``probls.gaussian_process``. It implements one-dimensional Gaussian process regression with an integrated 106 | Wiener process kernel that uses observations of both the function value and the 107 | derivative. For details, see the docstring of the ``ProbLSGaussianProcess`` class. 108 | 109 | ### TensorFlow Interface 110 | 111 | The TensorFlow interface ``ProbLSOptimizerSGDInterface`` is implemented in ``probls.tensorflow_interface.interface_sgd``. 112 | It inherits from ``tf.train.Optimizer`` and implements the necessary functionality to serve as the ``func`` argument of the ``ProbLSOptimizer``, providing the 113 | desired interface to the objective function defined by your TensorFlow model. 
114 | Its ``minimize(losses, var_list)`` method adds to sets of operations to the TensorFlow graph: 115 | - ``adv_eval_op`` 116 | Advance along the current search direction, compute the loss, 117 | the gradients and variances of both. Gradient and its variance are stored 118 | in slot variables. Return the loss ``f``, projected gradient ``df``, 119 | variance of the loss fvar, and variance of the projected gradient dfvar 120 | - ``accept_op``: 121 | Accept the current point. Set its gradient as the new search direction. 122 | Returns f, df fvar and dfvar, where df and dfvar are now with respect to this new search direction. 123 | 124 | In order for the ``ProbLSOptimizerSGDInterface`` object to work as a self-contained 125 | interface that can perform function/gradient evaluations, you have to pass it a 126 | TensorFlow session via its ``register_session(sess)`` method. After that, the interface is 127 | ready to go and provides the three aforementioned methods ``adv_eval(dt, optional_feed_dict)``, ``accept()`` and ``prepare(optional_feed_dict)``. 128 | 129 | A crucial part of the line search are within-batch estimates of the variance of the function 130 | value and the gradient, see equations (17) and (18) in the [paper][1]. The variance 131 | of the objective is easily computed given the individual loss values for the examples 132 | in the batch. That is why we pass the vector of ``losses``, instead of a mean ``loss``. 133 | Computing the gradient variance is a little tricky; a detailed explanation can be found in this [note][2]. 134 | For the implementation, see ``probls.tensorflow_interface.gradient_moment``. 135 | 136 | [1]: https://arxiv.org/abs/1502.02846 137 | [2]: https://drive.google.com/open?id=0B0adgqwcMJK5aDNaQ2Q4ZmhCQzA 138 | [3]: https://arxiv.org/abs/1703.10034 139 | -------------------------------------------------------------------------------- /probls/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Utility functions for probabilistic line search algorithm. 4 | """ 5 | 6 | import numpy as np 7 | from scipy.special import erf 8 | 9 | def bounded_bivariate_normal_integral(rho, xl, xu, yl, yu): 10 | """Computes the bounded bivariate normal integral. 11 | 12 | Computes the probability that ``xu >= X >= xl and yu >= Y >= yl`` where X 13 | and Y are jointly Gaussian random variables, with mean ``[0., 0.]`` and 14 | covariance matrix ``[[1., rho], [rho, 1.]]``. 15 | 16 | Inputs: 17 | :rho: Correlation coefficient of the bivariate normal random variable 18 | :xl, yl: Lower bounds of the integral 19 | :xu, yu: Upper bounds of the integral 20 | 21 | Ported from a Matlab implementation by Alan Genz which, in turn, is based on 22 | the method described by 23 | Drezner, Z and G.O. Wesolowsky, (1989), 24 | On the computation of the bivariate normal inegral, 25 | Journal of Statist. Comput. Simul. 35, pp. 101-107, 26 | 27 | Copyright statement of Alan Genz's version: 28 | *************** 29 | Copyright (C) 2013, Alan Genz, All rights reserved. 30 | 31 | Redistribution and use in source and binary forms, with or without 32 | modification, are permitted provided the following conditions are met: 33 | - Redistributions of source code must retain the above copyright 34 | notice, this list of conditions and the following disclaimer. 
35 | - Redistributions in binary form must reproduce the above copyright 36 | notice, this list of conditions and the following disclaimer in 37 | the documentation and/or other materials provided with the 38 | distribution. 39 | - The contributor name(s) may not be used to endorse or promote 40 | products derived from this software without specific prior 41 | written permission. 42 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 43 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 44 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 45 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 46 | COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 47 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 48 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 49 | OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 50 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 51 | TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF USE 52 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.""" 53 | 54 | bvnu = unbounded_bivariate_normal_integral 55 | p = bvnu(rho, xl, yl) - bvnu(rho, xu, yl) \ 56 | - bvnu(rho, xl, yu) + bvnu(rho, xu, yu) 57 | return max(0., min(p, 1.)) 58 | 59 | def unbounded_bivariate_normal_integral(rho, xl, yl): 60 | """Computes the unbounded bivariate normal integral. 61 | 62 | Computes the probability that ``X>=xl and Y>=yl`` where X and Y are jointly 63 | Gaussian random variables, with mean ``[0., 0.]`` and covariance matrix 64 | ``[[1., rho], [rho, 1.]]``. 65 | 66 | Note: to compute the probability that ``X < xl and Y < yl``, use 67 | ``unbounded_bivariate_normal_integral(rho, -xl, -yl)``. 68 | 69 | Inputs: 70 | :rho: Correlation coefficient of the bivariate normal random variable 71 | :xl, yl: Lower bounds of the integral 72 | 73 | Ported from a Matlab implementation by Alan Genz which, in turn, is based on 74 | the method described by 75 | Drezner, Z and G.O. Wesolowsky, (1989), 76 | On the computation of the bivariate normal inegral, 77 | Journal of Statist. Comput. Simul. 35, pp. 101-107, 78 | 79 | Copyright statement of Alan Genz's version: 80 | *************** 81 | Copyright (C) 2013, Alan Genz, All rights reserved. 82 | 83 | Redistribution and use in source and binary forms, with or without 84 | modification, are permitted provided the following conditions are met: 85 | - Redistributions of source code must retain the above copyright 86 | notice, this list of conditions and the following disclaimer. 87 | - Redistributions in binary form must reproduce the above copyright 88 | notice, this list of conditions and the following disclaimer in 89 | the documentation and/or other materials provided with the 90 | distribution. 91 | - The contributor name(s) may not be used to endorse or promote 92 | products derived from this software without specific prior 93 | written permission. 94 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 95 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 96 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 97 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 98 | COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 99 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 100 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 101 | OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 102 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 103 | TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF USE 104 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.""" 105 | 106 | rho = max(-1., min(1., rho)) 107 | 108 | if np.isposinf(xl) or np.isposinf(yl): 109 | return 0. 110 | elif np.isneginf(xl): 111 | return 1. if np.isneginf(yl) else _cdf(-yl) 112 | elif np.isneginf(yl): 113 | return _cdf(-xl) 114 | elif rho == 0: 115 | return _cdf(-xl)*_cdf(-yl) 116 | 117 | tp = 2.*np.pi 118 | h, k = xl, yl 119 | hk = h*k 120 | bvn = 0. 121 | 122 | if np.abs(rho) < 0.3: 123 | # Gauss Legendre points and weights, n = 6 124 | w = np.array([0.1713244923791705, 0.3607615730481384, 0.4679139345726904]) 125 | x = np.array([0.9324695142031522, 0.6612093864662647, 0.2386191860831970]) 126 | elif np.abs(rho) < 0.75: 127 | # Gauss Legendre points and weights, n = 12 128 | w = np.array([0.04717533638651177, 0.1069393259953183, 0.1600783285433464, 129 | 0.2031674267230659, 0.2334925365383547, 0.2491470458134029]) 130 | x = np.array([0.9815606342467191, 0.9041172563704750, 0.7699026741943050, 131 | 0.5873179542866171, 0.3678314989981802, 0.1252334085114692]) 132 | else: 133 | # Gauss Legendre points and weights, n = 20 134 | w = np.array([.01761400713915212, .04060142980038694, .06267204833410906, 135 | .08327674157670475, 0.1019301198172404, 0.1181945319615184, 136 | 0.1316886384491766, 0.1420961093183821, 0.1491729864726037, 137 | 0.1527533871307259]) 138 | x = np.array([0.9931285991850949, 0.9639719272779138, 0.9122344282513259, 139 | 0.8391169718222188, 0.7463319064601508, 0.6360536807265150, 140 | 0.5108670019508271, 0.3737060887154196, 0.2277858511416451, 141 | 0.07652652113349733]) 142 | 143 | w = np.tile(w, 2) 144 | x = np.concatenate([1.-x, 1.+x]) 145 | 146 | if np.abs(rho) < 0.925: 147 | hs = .5 * (h*h + k*k) 148 | asr = .5*np.arcsin(rho) 149 | sn = np.sin(asr*x) 150 | bvn = np.dot(w, np.exp((sn*hk-hs)/(1.-sn**2))) 151 | bvn = bvn*asr/tp + _cdf(-h)*_cdf(-k) 152 | else: 153 | if rho < 0.: 154 | k = -k 155 | hk = -hk 156 | if np.abs(rho) < 1.: 157 | ass = 1.-rho**2 158 | a = np.sqrt(ass) 159 | bs = (h-k)**2 160 | asr = -.5*(bs/ass + hk) 161 | c = (4.-hk)/8. 162 | d = (12.-hk)/80. 163 | if asr > -100.: 164 | bvn = a*np.exp(asr)*(1.-c*(bs-ass)*(1.-d*bs)/3. + c*d*ass**2) 165 | if hk > -100.: 166 | b = np.sqrt(bs) 167 | sp = np.sqrt(tp)*_cdf(-b/a) 168 | bvn = bvn - np.exp(-.5*hk)*sp*b*(1. - c*bs*(1.-d*bs)/3.) 169 | a = .5*a 170 | xs = (a*x)**2 171 | asr = -.5*(bs/xs + hk) 172 | inds = [i for i, asr_elt in enumerate(asr) if asr_elt>-100.] 173 | xs = xs[inds] 174 | sp = 1. + c*xs*(1.+5.*d*xs) 175 | rs = np.sqrt(1.-xs) 176 | ep = np.exp(-.5*hk*xs / (1.+rs)**2)/rs 177 | bvn = (a*np.dot(np.exp(asr[inds])*(sp-ep), w[inds]) - bvn)/tp 178 | if rho > 0: 179 | bvn += _cdf(-max(h, k)) 180 | elif h >= k: 181 | bvn = -bvn 182 | else: 183 | if h < 0.: 184 | L = _cdf(k)-_cdf(h) 185 | else: 186 | L = _cdf(-h)-_cdf(-k) 187 | bvn = L - bvn 188 | 189 | return max(0., min(1., bvn)) 190 | 191 | def _cdf(z): 192 | """Cumulative density function (CDF) of the standard normal distribution.""" 193 | return .5 * (1. 
+ erf(z/np.sqrt(2.))) -------------------------------------------------------------------------------- /probls/tensorflow_interface/gradient_moment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Computation of *moments* of gradients through tensorflow operations. 4 | 5 | Tensorflow is typically used for empircal risk minimzation with gradient-based 6 | optimization methods. That is, we want to adjust trainable variables ``W``, 7 | such as to minimize an objective quantity, called ``LOSS``, of the form 8 | 9 | LOSS(W) = (1/n) * sum{i=1:n}[ loss(W, d_i) ] 10 | 11 | That is the mean of individual losses induced by ``n`` training data points 12 | ``d_i``. Consquently, the gradient of ``LOSS`` w.r.t. the variables ``W`` is 13 | the mean of individual gradients ``dloss(W, d_i)``. These individual gradients 14 | are not computed separately when we call ``tf.gradients`` on the aggregate 15 | ``LOSS``. Instead, they are implicitly aggregated by the operations in the 16 | backward graph. This batch processing is crucial for the computational 17 | efficiency of the gradient computation. 18 | 19 | This module provides functionality to compute the ``p``-th moment of the 20 | individual gradients, i.e. the quantity 21 | 22 | MOM(W) = (1/n) * sum{i=1:n}[ dloss(w, d_i)**p ] 23 | 24 | without giving up the efficiency of batch processing. For a more detailed 25 | explanation, see the note [1]. Applications of this are the computation of the 26 | gradient variance estimate in [2] and [3]. 27 | 28 | [1] https://drive.google.com/open?id=0B0adgqwcMJK5aDNaQ2Q4ZmhCQzA 29 | 30 | [2] M. Mahsereci and P. Hennig. Probabilistic line searches for stochastic 31 | optimization. In Advances in Neural Information Processing Systems 28, pages 32 | 181-189, 2015. 33 | 34 | [3] L. Balles, J. Romero and P. Hennig. Coupling Adaptive Batch Sizes with 35 | Learning Rates. In arXiv preprint arXiv:1612.05086, 2016. 36 | https://arxiv.org/abs/1612.05086. 37 | """ 38 | 39 | import tensorflow as tf 40 | from tensorflow.python.ops import gen_array_ops 41 | 42 | VALID_TYPES = ["MatMul", "Conv2D", "Add"] 43 | VALID_REGULARIZATION_TYPES = ["L2Loss"] 44 | 45 | def _check_and_sort_ops(op_list): 46 | """Sort a list of ops according to type into valid types for which we can 47 | compute the gradient moment) and regularizers. Raise an exception when 48 | encountering an op of invalid type.""" 49 | 50 | valid, regularizers = [], [] 51 | for op in op_list: 52 | if op.type in VALID_TYPES: 53 | valid.append(op) 54 | elif op.type in VALID_REGULARIZATION_TYPES: 55 | regularizers.append(op) 56 | else: 57 | raise Exception("A variable in var_list is consumed by an operation of " 58 | "type {} for which I don't how to compute the gradient moment. " 59 | "Allowed are types {} and regularization operations " 60 | "of type {}".format(op.type, str(VALID_TYPES), 61 | str(VALID_REGULARIZATION_TYPES))) 62 | return valid, regularizers 63 | 64 | def grads_and_grad_moms(loss, batch_size, var_list, mom=2): 65 | """Compute the gradients and gradient moments of ``loss`` w.r.t. to the 66 | variables in ``var_list`` 67 | 68 | Inputs: 69 | :loss: The tensor containing the scalar loss. The loss has to be the 70 | ``tf.mean`` of ``batch_size`` individual losses induced by 71 | individual training data points. 72 | :batch_size: Self-explanatory. Integer tensor. 73 | :var_list: The list of variables. 74 | :mom: The desired moment. Integer. Defaults to 2. 
75 | 76 | Returns: 77 | :v_grads: The gradients of ``loss`` w.r.t. the variables in ``var_list`` 78 | as computed by ``tf.gradients(loss, var_list)``. 79 | :grad_moms: The gradient moments for each variable in ``var_list``.""" 80 | 81 | assert len(set(var_list)) == len(var_list) 82 | vs = [tf.convert_to_tensor(v) for v in var_list] 83 | num_vars = len(vs) 84 | 85 | consumers = [] 86 | consumer_outs = [] 87 | for v in vs: 88 | valid, regularizers = _check_and_sort_ops(v.consumers()) 89 | if len(valid) > 1: 90 | raise Exception("Variable {} is consumed by more than one operation " 91 | "(ignoring regularization operations)".format(v.name)) 92 | if len(regularizers) > 1: 93 | raise Exception("Variable {} is consumed by more than one " 94 | "regularization operation".format(v.name)) 95 | consumers.extend(valid) 96 | consumer_outs.extend(valid[0].outputs) 97 | 98 | # Use tf.gradients to compute gradients w.r.t. the variables, while also 99 | # retrieving gradients w.r.t. the outputs 100 | all_grads = tf.gradients(loss, vs+consumer_outs) 101 | v_grads = all_grads[0:num_vars] 102 | out_grads = all_grads[num_vars::] 103 | 104 | # Compute the gradient moment for each (v, vp, op, output) 105 | with tf.name_scope("grad_moms"): 106 | grad_moms = [_GradMom(o, v, out_grad, batch_size, mom) 107 | for o, v, out_grad in zip(consumers, vs, out_grads)] 108 | 109 | return (v_grads, grad_moms) 110 | 111 | def _GradMom(op, v, out_grad, batch_size, mom=2): 112 | """Wrapper function for the operation type-specific GradMom functions below. 113 | 114 | Inputs: 115 | :op: A tensorflow operation of type in VALID_TYPES. 116 | :v: The read-tensor of the trainable variable consumed by this operation. 117 | :out_grad: The tensor containing the gradient w.r.t. to the output of 118 | the op (as computed by ``tf.gradients``). 119 | :batch_size: Batch size ``m`` (constant integer or scalar int tf.Tensor) 120 | :mom: Integer moment desired (defaults to 2).""" 121 | 122 | with tf.name_scope(op.name+"_grad_mom"): 123 | if op.type == "MatMul": 124 | return _MatMulGradMom(op, v, out_grad, batch_size, mom) 125 | elif op.type == "Conv2D": 126 | return _Conv2DGradMom(op, v, out_grad, batch_size, mom) 127 | elif op.type == "Add": 128 | return _AddGradMom(op, v, out_grad, batch_size, mom) 129 | else: 130 | raise ValueError("Don't know how to compute gradient moment for " 131 | "variable {}, consumed by operation of type {}".format(v.name, 132 | op.type)) 133 | 134 | def _MatMulGradMom(op, W, out_grad, batch_size, mom=2): 135 | """Computes gradient moment for a weight matrix through a MatMul operation. 136 | 137 | Assumes ``Z=tf.matmul(A, W)``, where ``W`` is a d1xd2 weight matrix, ``A`` 138 | are the nxd1 activations of the previous layer (n being the batch size). 139 | ``out_grad`` is the gradient w.r.t. ``Z``, as computed by ``tf.gradients()``. 140 | No transposes in the MatMul operation allowed. 141 | 142 | Inputs: 143 | :op: The MatMul operation 144 | :W: The weight matrix (the tensor, not the variable) 145 | :out_grad: The tensor of gradient w.r.t. 
to the output of the op 146 | :batch_size: Batch size n (constant integer or scalar int tf.Tensor) 147 | :mom: Integer moment desired (defaults to 2)""" 148 | 149 | assert op.type == "MatMul" 150 | t_a, t_b = op.get_attr("transpose_a"), op.get_attr("transpose_b") 151 | assert W is op.inputs[1] and not t_a and not t_b 152 | 153 | A = op.inputs[0] 154 | out_grad_pow = tf.pow(out_grad, mom) 155 | A_pow = tf.pow(A, mom) 156 | return tf.mul(batch_size, tf.matmul(A_pow, out_grad_pow, transpose_a=True)) 157 | 158 | def _Conv2DGradMom(op, f, out_grad, batch_size, mom=2): 159 | """Computes gradient moment for the filter of a Conv2D operation. 160 | 161 | Assumes ``Z=tf.nn.conv2d(A, f)``, where ``f`` is a ``[h_f, w_f, c_in, c_out]`` 162 | convolution filter and ``A`` are the ``[n, h_in, w_in, c_in]`` activations of 163 | the previous layer (``n`` being the batch size). ``out_grad`` is the gradient 164 | w.r.t. ``Z``, as computed by ``tf.gradients()``. 165 | 166 | Inputs: 167 | :op: The Conv2D operation 168 | :f: The filter (the tensor, not the variable) 169 | :out_grad: The tensor of gradient w.r.t. to the output of the op 170 | :batch_size: Batch size ``n`` (constant integer or scalar int tf.Tensor) 171 | :mom: Integer moment desired (defaults to 2)""" 172 | 173 | assert op.type == "Conv2D" 174 | assert f is op.inputs[1] 175 | 176 | strides = op.get_attr("strides") 177 | padding = op.get_attr("padding") 178 | use_cudnn = op.get_attr("use_cudnn_on_gpu") 179 | data_format = op.get_attr("data_format") 180 | 181 | inp = op.inputs[0] 182 | inp_pow = tf.pow(inp, mom) 183 | 184 | f_shape = tf.shape(f) 185 | out_grad_pow = tf.pow(out_grad, mom) 186 | 187 | raw_moment = tf.nn.conv2d_backprop_filter(inp_pow, f_shape, out_grad_pow, 188 | strides, padding, use_cudnn, data_format) 189 | return tf.mul(batch_size, raw_moment) 190 | 191 | def _AddGradMom(op, b, out_grad, batch_size, mom=2): 192 | """Computes gradient moment for a bias variable through an Add operation. 193 | 194 | Assumes ``Z = tf.add(Zz, b)``, where ``b`` is a bias parameter and ``Zz`` is 195 | a ``[n, ?]`` tensor (``n`` being the batch size). Broadcasting for all kinds 196 | of shapes of ``Zz`` (e.g. ``[n, d_in]`` or ``[n, h_in, w_in, c_in]`` are 197 | supported. ``out_grad`` is the gradient w.r.t. ``Z``, as computed by 198 | ``tf.gradients()``. 199 | 200 | Inputs: 201 | :op: The Add operation 202 | :b: The bias parameter (the tensor, not the variable) 203 | :out_grad: The tensor of gradient w.r.t. 
to the output of the op 204 | :batch_size: Batch size ``n`` (constant integer or scalar int tf.Tensor) 205 | :mom: Integer moment desired (defaults to 2)""" 206 | 207 | assert op.type == "Add" 208 | 209 | out_grad_pow = tf.pow(out_grad, mom) 210 | 211 | if b is op.inputs[0]: 212 | y = op.inputs[1] 213 | sx = tf.shape(b) 214 | sy = tf.shape(y) 215 | rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy) 216 | raw_mom = tf.reshape(tf.reduce_sum(out_grad_pow, rx), sx) 217 | elif b is op.inputs[1]: 218 | x = op.inputs[0] 219 | sx = tf.shape(x) 220 | sy = tf.shape(b) 221 | rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy) 222 | raw_mom = tf.reshape(tf.reduce_sum(out_grad_pow, ry), sy) 223 | return tf.mul(batch_size, raw_mom) 224 | -------------------------------------------------------------------------------- /probls/tensorflow_interface/interface_sgd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | TensorFlow optimizer that acts as an interface for the probabilistic line 4 | search algorithm. 5 | """ 6 | 7 | import tensorflow as tf 8 | import gradient_moment as gm 9 | 10 | class ProbLSOptimizerSGDInterface(tf.train.Optimizer): 11 | """Optimizer that implements gradient descent with and interface for the 12 | probabilistic line search algorithm. 13 | @@__init__ 14 | """ 15 | 16 | def __init__(self, momentum=None, use_locking=False, name="ProbLS"): 17 | """Construct a new probabilistic line search optimizer. 18 | 19 | Args: 20 | 21 | :momentum: None or scalar momentum parameter. 22 | :use_locking: If True use locks for update operations. 23 | :name: Optional name prefix for the operations created when applying 24 | gradients. Defaults to "ProbLS". 25 | """ 26 | super(ProbLSOptimizerSGDInterface, self).__init__(use_locking, name) 27 | 28 | assert momentum is None or (isinstance(momentum, float) and 0<=momentum<=1) 29 | self.momentum = momentum 30 | 31 | self._ops_ready = False 32 | self._prepared = False 33 | self.sess = None 34 | 35 | self.dt = None 36 | self.adv_eval_op = None 37 | self.accept_op = None 38 | 39 | def _create_slots(self, var_list): 40 | for v in var_list: 41 | self._zeros_slot(v, "grad", "grad") # Variables to memorize gradients 42 | self._zeros_slot(v, "dir", "dir") # Search direction 43 | self._zeros_slot(v, "gradvar", "gradvar") # Gradient variance 44 | 45 | def minimize(self, losses, var_list): 46 | """Add operations to perform SGD with probabilistic line search. This 47 | comprises two sets of operations: 48 | 49 | 1) adv_eval_op: 50 | Advance along the current search direction, compute the loss, 51 | the gradients and both variances. Gradient and its variance are stored 52 | in slot variables. Return the loss f, projected gradient df, 53 | variance of the loss fvar, and variance of the projected gradient dfvar 54 | 2) accept_op: 55 | Accept the current point. Set its gradient as the new search direction. 56 | Returns df and dfvar with respect to this new search direction. 57 | 58 | Inputs: 59 | :losses: A Tensor of shape (batch_size,) containing the *individual* 60 | loss for each example in the batch. Do *not* pass a scalar mean loss 61 | as for the built-in tensorflow optimizers. 
62 | :var_list: List of Variable objects to update to minimize loss.""" 63 | 64 | assert isinstance(losses, tf.Tensor) 65 | for var in var_list: assert isinstance(var, tf.Variable) 66 | assert len(var_list) >= 0 67 | assert len(var_list) == len(set(var_list)) # Check for duplicates 68 | 69 | input_dtype = losses.dtype.base_dtype 70 | 71 | # Create and retrieve slot variables 72 | self._create_slots(var_list) 73 | mem_grads = [self.get_slot(v, "grad") for v in var_list] 74 | dirs = [self.get_slot(v, "dir") for v in var_list] 75 | mem_gradvars = [self.get_slot(v, "gradvar") for v in var_list] 76 | mem_f = tf.Variable(0.0, input_dtype, name="mem_f") 77 | mem_fvar = tf.Variable(0.0, input_dtype, name="mem_fvar") 78 | 79 | with tf.name_scope("ProbLS"): 80 | 81 | ###### adv_eval_op ###################################################### 82 | # Extract the batch size, i.e. the length of the losses vector 83 | batch_size = tf.cast(tf.gather(tf.shape(losses), 0), input_dtype, 84 | name="batch_size") 85 | 86 | # Add a scalar placeholder dt and operations that advance t by dt, 87 | # i.e., update v += dt*d (v: variable, d: search direction) 88 | with tf.name_scope("advance_t"): 89 | self.dt = tf.placeholder(dtype=input_dtype, shape=[], name="delta_t") 90 | steps = [tf.mul(self.dt, tf.convert_to_tensor(d)) for d in dirs] 91 | advance_t_updates = [v.assign_add(s) for v, s in zip(var_list, steps)] 92 | 93 | # With a dependency on the advance_t update (making sure that a step is 94 | # taken first), add tensors that compute the loss f, the gradients and 95 | # the gradient moments 96 | with tf.control_dependencies(advance_t_updates): 97 | loss = tf.reduce_mean(losses, name="f") 98 | grads, moms = gm.grads_and_grad_moms(loss, batch_size, var_list) 99 | 100 | # Add variance of the loss 101 | ssl = tf.reduce_mean(tf.square(losses), name="sum_of_squared_losses") 102 | fvar = tf.div(ssl-tf.square(loss), batch_size-1., name="fvar") 103 | 104 | # Add projected gradient df (w.r.t. the current search direction) 105 | with tf.name_scope("df"): 106 | proj_grads = [tf.reduce_sum(tf.mul(g, d), name="proj_grad") 107 | for g, d in zip(grads, dirs)] 108 | df = tf.add_n(proj_grads, name="df") 109 | 110 | # Add gradient variances and the variance of df 111 | gradvars = [tf.div(mom-tf.square(g), batch_size-1.) 112 | for mom, g in zip(moms, grads)] 113 | dfvar = tf.add_n([tf.reduce_sum(gv*tf.square(d)) 114 | for gv, d in zip(gradvars, dirs)]) 115 | 116 | # Add operations to memorize stuff in variables. This is because they 117 | # are needed in the case that this points ends up being accepted (i.e., 118 | # if the accept op is called next). Stored quantities are 119 | # - gradients 120 | # - gradient moment 121 | # - f and fvar 122 | with tf.name_scope("memorize"): 123 | mem_updates = [v.assign(grad) for v, grad in zip(mem_grads, grads)] 124 | mem_updates.extend( 125 | [v.assign(gv) for v, gv in zip(mem_gradvars, gradvars)] 126 | ) 127 | mem_updates.append(mem_f.assign(loss)) 128 | mem_updates.append(mem_fvar.assign(fvar)) 129 | 130 | # With a dependency on the memorization, add the adv_eval_op. It is 131 | # simply the tuple (f, df, fvar, dfvar). All the dependencies make sure 132 | # that it also does the other stuff 133 | with tf.control_dependencies(mem_updates): 134 | self.adv_eval_op = tf.tuple([loss, df, fvar, dfvar], name="results") 135 | 136 | ###### accept_op ######################################################## 137 | # Operation that accepts the current state, i.e. 
138 | # - sets the current gradient as the new search direction 139 | # - returns a new df, computed w.r.t. to that new search direction 140 | with tf.name_scope("accept"): 141 | # Add operations the set the new search direction 142 | if self.momentum is None: 143 | new_dirs = [tf.neg(g) for g in mem_grads] 144 | else: 145 | mu = tf.convert_to_tensor(self.momentum, name="momentum_mu") 146 | new_dirs = [mu*d-g for d, g in zip(dirs, mem_grads)] 147 | dir_updates = [d.assign(d_new) for d, d_new in zip(dirs, new_dirs)] 148 | 149 | # With a dependency on the search direction updates, compute df and 150 | # dfvar w.r.t. the new search direction, using the memorized gradients 151 | # and gradient variances 152 | with tf.control_dependencies(dir_updates): 153 | proj_grads_new = [tf.reduce_sum(g*d) 154 | for g, d in zip(mem_grads, dirs)] 155 | df_new = tf.add_n(proj_grads_new, name="df_new") 156 | dfvar_new = tf.add_n([tf.reduce_sum(gv*tf.square(d)) 157 | for gv, d in zip(mem_gradvars, dirs)]) 158 | self.accept_op = tf.tuple([mem_f, df_new, mem_fvar, dfvar_new], 159 | name="results_after_accept") 160 | 161 | # Set internal flag that the operations are now ready 162 | self._ops_ready = True 163 | 164 | def register_session(self, sess): 165 | """Register the session ``sess`` with this line search interface. 166 | Computations resulting from calls to ``prepare``, ``adv_eval`` or 167 | ``accept`` will be executed in this session. 168 | 169 | Inputs: 170 | :sess: A TensorFlow Session.""" 171 | 172 | if not self._ops_ready: 173 | raise Warning("You have to call minimize first") 174 | assert isinstance(sess, tf.Session) 175 | self.sess = sess 176 | 177 | def prepare(self, feed_dict=None): 178 | """Make a first evaluation to properly initialize all gradients, et cetera. 179 | Call this function before using ``adv_eval`` or ``accept``.""" 180 | 181 | if self.sess is None: 182 | raise Warning("You have to register a session first.") 183 | 184 | if feed_dict is None: 185 | feed_dict = {} 186 | feed_dict[self.dt] = 0.0 187 | 188 | # We need to evaluate and accept once in order to compute initial 189 | # gradients and accept them as search direction. Only then can we 190 | # make the first "real" evaluation and return the results 191 | self.sess.run(self.adv_eval_op, feed_dict) 192 | self.sess.run(self.accept_op) 193 | self.sess.run(self.adv_eval_op, feed_dict) 194 | self._prepared = True 195 | return self.sess.run(self.accept_op) 196 | 197 | def adv_eval(self, dt, feed_dict=None): 198 | """Advance by an increment ``dt`` along the current search direction and 199 | evaluate. 200 | 201 | Inputs: 202 | :dt: Float step size increment. 203 | :feed_dict: Optional feed_dict. 204 | 205 | Returns: 206 | :f: Function value at the new point. 207 | :df: Gradient at the new point, projected onto the search direction. 208 | :fvar: Variance of f. 209 | :dfvar: Variance of df.""" 210 | 211 | if not self._prepared: 212 | raise Warning("You have to call prepare first") 213 | if feed_dict is None: 214 | feed_dict = {} 215 | feed_dict[self.dt] = dt 216 | return self.sess.run(self.adv_eval_op, feed_dict) 217 | 218 | def accept(self): 219 | if not self._prepared: 220 | raise Warning("You have to call prepare first") 221 | return self.sess.run(self.accept_op) -------------------------------------------------------------------------------- /examples/cifar10.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Routine for decoding the CIFAR-10 binary file format.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import sys 23 | import os 24 | import tarfile 25 | 26 | from six.moves import xrange, urllib # pylint: disable=redefined-builtin 27 | import tensorflow as tf 28 | 29 | # Process images of this size. Note that this differs from the original CIFAR 30 | # image size of 32 x 32. If one alters this number, then the entire model 31 | # architecture will change and any model would need to be retrained. 32 | IMAGE_SIZE = 24 33 | 34 | # Global constants describing the CIFAR-10 data set. 35 | NUM_CLASSES = 10 36 | NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000 37 | NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = 10000 38 | 39 | DATA_DIR = "data/cifar-10/cifar-10-batches-bin" 40 | DATA_URL = 'http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz' 41 | 42 | # Check if data is already there, if not download! 43 | dest_directory = "data/cifar-10" 44 | if not os.path.exists(dest_directory): 45 | os.makedirs(dest_directory) 46 | filename = DATA_URL.split('/')[-1] 47 | filepath = os.path.join(dest_directory, filename) 48 | if not os.path.exists(filepath): 49 | def _progress(count, block_size, total_size): 50 | sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename, 51 | float(count * block_size) / float(total_size) * 100.0)) 52 | sys.stdout.flush() 53 | filepath, _ = urllib.request.urlretrieve(DATA_URL, filepath, _progress) 54 | print() 55 | statinfo = os.stat(filepath) 56 | print('Successfully downloaded', filename, statinfo.st_size, 'bytes.') 57 | tarfile.open(filepath, 'r:gz').extractall(dest_directory) 58 | 59 | def read_cifar10(filename_queue): 60 | """Reads and parses examples from CIFAR10 data files. 61 | Recommendation: if you want N-way read parallelism, call this function 62 | N times. This will give you N independent Readers reading different 63 | files & positions within those files, which will give better mixing of 64 | examples. 65 | Args: 66 | filename_queue: A queue of strings with the filenames to read from. 67 | Returns: 68 | An object representing a single example, with the following fields: 69 | height: number of rows in the result (32) 70 | width: number of columns in the result (32) 71 | depth: number of color channels in the result (3) 72 | key: a scalar string Tensor describing the filename & record number 73 | for this example. 74 | label: an int32 Tensor with the label in the range 0..9. 75 | uint8image: a [height, width, depth] uint8 Tensor with the image data 76 | """ 77 | 78 | class CIFAR10Record(object): 79 | pass 80 | result = CIFAR10Record() 81 | 82 | # Dimensions of the images in the CIFAR-10 dataset. 83 | # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the 84 | # input format. 
85 | label_bytes = 1 # 2 for CIFAR-100 86 | result.height = 32 87 | result.width = 32 88 | result.depth = 3 89 | image_bytes = result.height * result.width * result.depth 90 | # Every record consists of a label followed by the image, with a 91 | # fixed number of bytes for each. 92 | record_bytes = label_bytes + image_bytes 93 | 94 | # Read a record, getting filenames from the filename_queue. No 95 | # header or footer in the CIFAR-10 format, so we leave header_bytes 96 | # and footer_bytes at their default of 0. 97 | reader = tf.FixedLengthRecordReader(record_bytes=record_bytes) 98 | result.key, value = reader.read(filename_queue) 99 | 100 | # Convert from a string to a vector of uint8 that is record_bytes long. 101 | record_bytes = tf.decode_raw(value, tf.uint8) 102 | 103 | # The first bytes represent the label, which we convert from uint8->int32. 104 | result.label = tf.cast( 105 | tf.slice(record_bytes, [0], [label_bytes]), tf.int32) 106 | 107 | # The remaining bytes after the label represent the image, which we reshape 108 | # from [depth * height * width] to [depth, height, width]. 109 | depth_major = tf.reshape(tf.slice(record_bytes, [label_bytes], [image_bytes]), 110 | [result.depth, result.height, result.width]) 111 | # Convert from [depth, height, width] to [height, width, depth]. 112 | result.uint8image = tf.transpose(depth_major, [1, 2, 0]) 113 | 114 | return result 115 | 116 | 117 | def _generate_image_and_label_batch(image, label, min_queue_examples, 118 | batch_size, shuffle): 119 | """Construct a queued batch of images and labels. 120 | Args: 121 | image: 3-D Tensor of [height, width, 3] of type.float32. 122 | label: 1-D Tensor of type.int32 123 | min_queue_examples: int32, minimum number of samples to retain 124 | in the queue that provides of batches of examples. 125 | batch_size: Number of images per batch. 126 | shuffle: boolean indicating whether to use a shuffling queue. 127 | Returns: 128 | images: Images. 4D tensor of [batch_size, height, width, 3] size. 129 | labels: Labels. 1D tensor of [batch_size] size. 130 | """ 131 | # Create a queue that shuffles the examples, and then 132 | # read 'batch_size' images + labels from the example queue. 133 | num_preprocess_threads = 16 134 | if shuffle: 135 | images, label_batch = tf.train.shuffle_batch( 136 | [image, label], 137 | batch_size=batch_size, 138 | num_threads=num_preprocess_threads, 139 | capacity=min_queue_examples + 3 * batch_size, 140 | min_after_dequeue=min_queue_examples) 141 | else: 142 | images, label_batch = tf.train.batch( 143 | [image, label], 144 | batch_size=batch_size, 145 | num_threads=num_preprocess_threads, 146 | capacity=min_queue_examples + 3 * batch_size) 147 | 148 | return images, tf.reshape(label_batch, [batch_size]) 149 | 150 | 151 | def distorted_inputs(data_dir=DATA_DIR, batch_size=128): 152 | """Construct distorted input for CIFAR training using the Reader ops. 153 | Args: 154 | data_dir: Path to the CIFAR-10 data directory. 155 | batch_size: Number of images per batch. 156 | Returns: 157 | images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size. 158 | labels: Labels. 1D tensor of [batch_size] size. 159 | """ 160 | filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i) 161 | for i in xrange(1, 6)] 162 | for f in filenames: 163 | if not tf.gfile.Exists(f): 164 | raise ValueError('Failed to find file: ' + f) 165 | 166 | # Create a queue that produces the filenames to read. 
167 | filename_queue = tf.train.string_input_producer(filenames) 168 | 169 | # Read examples from files in the filename queue. 170 | read_input = read_cifar10(filename_queue) 171 | reshaped_image = tf.cast(read_input.uint8image, tf.float32) 172 | 173 | height = IMAGE_SIZE 174 | width = IMAGE_SIZE 175 | 176 | # Image processing for training the network. Note the many random 177 | # distortions applied to the image. 178 | 179 | # Randomly crop a [height, width] section of the image. 180 | distorted_image = tf.random_crop(reshaped_image, [height, width, 3]) 181 | 182 | # Randomly flip the image horizontally. 183 | distorted_image = tf.image.random_flip_left_right(distorted_image) 184 | 185 | # Because these operations are not commutative, consider randomizing 186 | # the order their operation. 187 | distorted_image = tf.image.random_brightness(distorted_image, 188 | max_delta=63) 189 | distorted_image = tf.image.random_contrast(distorted_image, 190 | lower=0.2, upper=1.8) 191 | 192 | # Subtract off the mean and divide by the variance of the pixels. 193 | float_image = tf.image.per_image_standardization(distorted_image) 194 | 195 | # Ensure that the random shuffling has good mixing properties. 196 | min_fraction_of_examples_in_queue = 0.4 197 | min_queue_examples = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN * 198 | min_fraction_of_examples_in_queue) 199 | print ('Filling queue with %d CIFAR images before starting to train. ' 200 | 'This will take a few minutes.' % min_queue_examples) 201 | 202 | # Generate a batch of images and labels by building up a queue of examples. 203 | return _generate_image_and_label_batch(float_image, read_input.label, 204 | min_queue_examples, batch_size, 205 | shuffle=True) 206 | 207 | 208 | def inputs(eval_data, data_dir=DATA_DIR, batch_size=128): 209 | """Construct input for CIFAR evaluation using the Reader ops. 210 | Args: 211 | eval_data: bool, indicating if one should use the train or eval data set. 212 | data_dir: Path to the CIFAR-10 data directory. 213 | batch_size: Number of images per batch. 214 | Returns: 215 | images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size. 216 | labels: Labels. 1D tensor of [batch_size] size. 217 | """ 218 | if not eval_data: 219 | filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i) 220 | for i in xrange(1, 6)] 221 | num_examples_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN 222 | else: 223 | filenames = [os.path.join(data_dir, 'test_batch.bin')] 224 | num_examples_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_EVAL 225 | 226 | for f in filenames: 227 | if not tf.gfile.Exists(f): 228 | raise ValueError('Failed to find file: ' + f) 229 | 230 | # Create a queue that produces the filenames to read. 231 | filename_queue = tf.train.string_input_producer(filenames) 232 | 233 | # Read examples from files in the filename queue. 234 | read_input = read_cifar10(filename_queue) 235 | reshaped_image = tf.cast(read_input.uint8image, tf.float32) 236 | 237 | height = IMAGE_SIZE 238 | width = IMAGE_SIZE 239 | 240 | # Image processing for evaluation. 241 | # Crop the central [height, width] of the image. 242 | resized_image = tf.image.resize_image_with_crop_or_pad(reshaped_image, 243 | width, height) 244 | 245 | # Subtract off the mean and divide by the variance of the pixels. 246 | float_image = tf.image.per_image_standardization(resized_image) 247 | 248 | # Ensure that the random shuffling has good mixing properties. 
249 | min_fraction_of_examples_in_queue = 0.4 250 | min_queue_examples = int(num_examples_per_epoch * 251 | min_fraction_of_examples_in_queue) 252 | 253 | # Generate a batch of images and labels by building up a queue of examples. 254 | return _generate_image_and_label_batch(float_image, read_input.label, 255 | min_queue_examples, batch_size, 256 | shuffle=False) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2016 Max Planck Society. All rights reserved. 2 | 3 | Apache License 4 | Version 2.0, January 2004 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, 12 | and distribution as defined by Sections 1 through 9 of this document. 13 | 14 | "Licensor" shall mean the copyright owner or entity authorized by 15 | the copyright owner that is granting the License. 16 | 17 | "Legal Entity" shall mean the union of the acting entity and all 18 | other entities that control, are controlled by, or are under common 19 | control with that entity. For the purposes of this definition, 20 | "control" means (i) the power, direct or indirect, to cause the 21 | direction or management of such entity, whether by contract or 22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 23 | outstanding shares, or (iii) beneficial ownership of such entity. 24 | 25 | "You" (or "Your") shall mean an individual or Legal Entity 26 | exercising permissions granted by this License. 27 | 28 | "Source" form shall mean the preferred form for making modifications, 29 | including but not limited to software source code, documentation 30 | source, and configuration files. 31 | 32 | "Object" form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but 34 | not limited to compiled object code, generated documentation, 35 | and conversions to other media types. 36 | 37 | "Work" shall mean the work of authorship, whether in Source or 38 | Object form, made available under the License, as indicated by a 39 | copyright notice that is included in or attached to the work 40 | (an example is provided in the Appendix below). 41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. For the purposes 46 | of this License, Derivative Works shall not include works that remain 47 | separable from, or merely link (or bind by name) to the interfaces of, 48 | the Work and Derivative Works thereof. 49 | 50 | "Contribution" shall mean any work of authorship, including 51 | the original version of the Work and any modifications or additions 52 | to that Work or Derivative Works thereof, that is intentionally 53 | submitted to Licensor for inclusion in the Work by the copyright owner 54 | or by an individual or Legal Entity authorized to submit on behalf of 55 | the copyright owner. 
For the purposes of this definition, "submitted" 56 | means any form of electronic, verbal, or written communication sent 57 | to the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control systems, 59 | and issue tracking systems that are managed by, or on behalf of, the 60 | Licensor for the purpose of discussing and improving the Work, but 61 | excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as "Not a Contribution." 63 | 64 | "Contributor" shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | 2. Grant of Copyright License. Subject to the terms and conditions of 69 | this License, each Contributor hereby grants to You a perpetual, 70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 71 | copyright license to reproduce, prepare Derivative Works of, 72 | publicly display, publicly perform, sublicense, and distribute the 73 | Work and such Derivative Works in Source or Object form. 74 | 75 | 3. Grant of Patent License. Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. If You 84 | institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work 86 | or a Contribution incorporated within the Work constitutes direct 87 | or contributory patent infringement, then any patent licenses 88 | granted to You under this License for that Work shall terminate 89 | as of the date such litigation is filed. 90 | 91 | 4. Redistribution. 
You may reproduce and distribute copies of the 92 | Work or Derivative Works thereof in any medium, with or without 93 | modifications, and in Source or Object form, provided that You 94 | meet the following conditions: 95 | 96 | (a) You must give any other recipients of the Work or 97 | Derivative Works a copy of this License; and 98 | 99 | (b) You must cause any modified files to carry prominent notices 100 | stating that You changed the files; and 101 | 102 | (c) You must retain, in the Source form of any Derivative Works 103 | that You distribute, all copyright, patent, trademark, and 104 | attribution notices from the Source form of the Work, 105 | excluding those notices that do not pertain to any part of 106 | the Derivative Works; and 107 | 108 | (d) If the Work includes a "NOTICE" text file as part of its 109 | distribution, then any Derivative Works that You distribute must 110 | include a readable copy of the attribution notices contained 111 | within such NOTICE file, excluding those notices that do not 112 | pertain to any part of the Derivative Works, in at least one 113 | of the following places: within a NOTICE text file distributed 114 | as part of the Derivative Works; within the Source form or 115 | documentation, if provided along with the Derivative Works; or, 116 | within a display generated by the Derivative Works, if and 117 | wherever such third-party notices normally appear. The contents 118 | of the NOTICE file are for informational purposes only and 119 | do not modify the License. You may add Your own attribution 120 | notices within Derivative Works that You distribute, alongside 121 | or as an addendum to the NOTICE text from the Work, provided 122 | that such additional attribution notices cannot be construed 123 | as modifying the License. 124 | 125 | You may add Your own copyright statement to Your modifications and 126 | may provide additional or different license terms and conditions 127 | for use, reproduction, or distribution of Your modifications, or 128 | for any such Derivative Works as a whole, provided Your use, 129 | reproduction, and distribution of the Work otherwise complies with 130 | the conditions stated in this License. 131 | 132 | 5. Submission of Contributions. Unless You explicitly state otherwise, 133 | any Contribution intentionally submitted for inclusion in the Work 134 | by You to the Licensor shall be under the terms and conditions of 135 | this License, without any additional terms or conditions. 136 | Notwithstanding the above, nothing herein shall supersede or modify 137 | the terms of any separate license agreement you may have executed 138 | with Licensor regarding such Contributions. 139 | 140 | 6. Trademarks. This License does not grant permission to use the trade 141 | names, trademarks, service marks, or product names of the Licensor, 142 | except as required for reasonable and customary use in describing the 143 | origin of the Work and reproducing the content of the NOTICE file. 144 | 145 | 7. Disclaimer of Warranty. Unless required by applicable law or 146 | agreed to in writing, Licensor provides the Work (and each 147 | Contributor provides its Contributions) on an "AS IS" BASIS, 148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 149 | implied, including, without limitation, any warranties or conditions 150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 151 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 152 | appropriateness of using or redistributing the Work and assume any 153 | risks associated with Your exercise of permissions under this License. 154 | 155 | 8. Limitation of Liability. In no event and under no legal theory, 156 | whether in tort (including negligence), contract, or otherwise, 157 | unless required by applicable law (such as deliberate and grossly 158 | negligent acts) or agreed to in writing, shall any Contributor be 159 | liable to You for damages, including any direct, indirect, special, 160 | incidental, or consequential damages of any character arising as a 161 | result of this License or out of the use or inability to use the 162 | Work (including but not limited to damages for loss of goodwill, 163 | work stoppage, computer failure or malfunction, or any and all 164 | other commercial damages or losses), even if such Contributor 165 | has been advised of the possibility of such damages. 166 | 167 | 9. Accepting Warranty or Additional Liability. While redistributing 168 | the Work or Derivative Works thereof, You may choose to offer, 169 | and charge a fee for, acceptance of support, warranty, indemnity, 170 | or other liability obligations and/or rights consistent with this 171 | License. However, in accepting such obligations, You may act only 172 | on Your own behalf and on Your sole responsibility, not on behalf 173 | of any other Contributor, and only if You agree to indemnify, 174 | defend, and hold each Contributor harmless for any liability 175 | incurred by, or claims asserted against, such Contributor by reason 176 | of your accepting any such warranty or additional liability. 177 | 178 | END OF TERMS AND CONDITIONS 179 | 180 | APPENDIX: How to apply the Apache License to your work. 181 | 182 | To apply the Apache License to your work, attach the following 183 | boilerplate notice, with the fields enclosed by brackets "[]" 184 | replaced with your own identifying information. (Don't include 185 | the brackets!) The text should be enclosed in the appropriate 186 | comment syntax for the file format. We also recommend that a 187 | file or class name and description of purpose be included on the 188 | same "printed page" as the copyright notice for easier 189 | identification within third-party archives. 190 | 191 | Copyright 2015, Max Planck Society. 192 | 193 | Licensed under the Apache License, Version 2.0 (the "License"); 194 | you may not use this file except in compliance with the License. 195 | You may obtain a copy of the License at 196 | 197 | http://www.apache.org/licenses/LICENSE-2.0 198 | 199 | Unless required by applicable law or agreed to in writing, software 200 | distributed under the License is distributed on an "AS IS" BASIS, 201 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 202 | See the License for the specific language governing permissions and 203 | limitations under the License. 204 | -------------------------------------------------------------------------------- /probls/line_search.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Implementation of Probabilistic Line Search for Stochastic Optimization [1]. 4 | 5 | [1] M. Mahsereci and P. Hennig. Probabilistic line searches for stochastic 6 | optimization. In Advances in Neural Information Processing Systems 28, pages 7 | 181-189, 2015. 
8 | """ 9 | 10 | import numpy as np 11 | import gaussian_process 12 | import utils 13 | 14 | class ProbLSOptimizer(object): 15 | """Probabilistic line search optimizer. 16 | 17 | @@__init__ 18 | """ 19 | 20 | def __init__(self, func, c1=0.05, cW=0.3, fpush=1.0, alpha0=0.01, 21 | target_df=0.5, df_lo=-0.1, df_hi=1.1, max_steps=10, max_expl=6, 22 | max_dmu0=0.0, max_change_factor=10.0, expl_policy="linear"): 23 | """Create a new probabilistic line search object. 24 | 25 | Inputs: 26 | :func: Interface to the objective function. We assume that it has three 27 | methods. 28 | - ``f, df, fvar, dfvar = func.adv_eval(dt)`` to proceed along the 29 | current search direction by an increment ``dt``, returning 30 | function value, projected gradient and variance estimates for 31 | both. 32 | - ``f, df, fvar, dfvar = func.accept()`` to accept the current 33 | step size, returning function value, projected gradients and an 34 | estimate of the variance of these two quantities. 35 | - ``f, df, fvar, dfvar = func.prepare()`` to prepare the interface 36 | returning an initial observation of function value and gradient, 37 | as well as the variances. 38 | If the function interface takes additional arguments (e.g. a feed 39 | dict with a batch of data in tensorflow), those are passed as 40 | positional arguments ``*pass_to_func_args``. 41 | :c1: Scalar parameters for the first Wolfe conditions. Default to 0.05. 42 | :cW: Acceptance threshold for the Wolfe probability. Defaults to 0.3. 43 | :fpush: Push factor that is multiplied with the accepted step size to get 44 | the base step size for the next line search. Defaults to 1.0. 45 | :alpha0: Initial step size. Defaults to 0.01. 46 | :target_df: The target value for the relative projected gradient 47 | df(t)/abs(df(0)). Defaults to 0.5. 48 | :df_lo, df_hi: Lower and higher threshold for the relative projected 49 | gradient df(t)/abs(df(0)). Default to -0.1 and 1.1. 50 | :max_steps: Maximum number of steps (function evaluations) per line 51 | search. Defaults to 10. 52 | :max_epl: Maximum number of exploration steps per line search. Defaults 53 | to 6. 54 | :max_dmu0: If the posterior derivative at t=0 exceeds ``max_dmu0``, the 55 | current line search is aborted as a safeguard against bad search 56 | directions. Defaults to 0.0. 57 | :max_change_factor: The algorithm usually takes the accepted alpha of the 58 | current line search as the base ``alpha0`` of the next one (after 59 | multiplying with ``fpush``). However, if a line search accepts an 60 | alpha that is more than ``max_change_factor`` times smaller or larger 61 | than the current ``alpha0``, we instead set the next ``alpha0`` to a 62 | running average of the accepted alphas (``alpha_stats``). Defaults to 63 | 10.0. 64 | :expl_policy: String indicating the policy used for exploring points *to 65 | the right* in the line search. If ``k`` is the number of exploration 66 | steps already made, then the ``"linear"`` exploration policy chooses 67 | ``2*(k+1)*alpha0`` as the next exploration candidate. The 68 | ``"exponential"`` policy chooses ``2**(k+1)*alpha0``. 
Defaults to 69 | ``"linear"``.""" 70 | 71 | # Make sure the function_interface is valid and store it 72 | assert hasattr(func, "adv_eval") 73 | assert hasattr(func, "accept") 74 | assert hasattr(func, "prepare") 75 | self.func = func 76 | 77 | # Store the line search parameters 78 | self.c1 = c1 79 | self.cW = cW 80 | self.fpush = fpush 81 | self.target_df = target_df 82 | self.df_lo = df_lo 83 | self.df_hi = df_hi 84 | self.max_steps = max_steps 85 | self.max_expl = max_expl 86 | self.max_dmu0 = max_dmu0 87 | self.max_change_factor = max_change_factor 88 | assert expl_policy in ["linear", "exponential"] 89 | self.expl_policy = expl_policy 90 | 91 | # Initialize base step size with given value. 92 | self.alpha0 = alpha0 93 | 94 | # alpha_stats will contain a running average of accepted step sizes 95 | self.alpha_stats = alpha0 96 | 97 | # Raw function values at the origin of the line search 98 | self.f0 = None 99 | self.df0 = None 100 | 101 | # Counting steps in the current line search and, separately, steps that 102 | # explore "to the right" 103 | self.num_steps = 0 104 | self.num_expl = 0 105 | 106 | # Initialize GP object 107 | self.gp = gaussian_process.ProbLSGaussianProcess() 108 | 109 | # Switch to assert that the prepare method will be called first 110 | self.prepare_called = False 111 | 112 | # Internal abort status 113 | self.abort_status = 0 114 | 115 | def scale_obs(self, f_raw, df_raw, fvar_raw, dfvar_raw): 116 | """Scale an observation of function value and gradient. See section 3.4 of 117 | [1] for details.""" 118 | 119 | f = (f_raw-self.f0)/(self.df0*self.alpha0) 120 | df = df_raw/(self.df0) 121 | fvar = fvar_raw/((self.alpha0*self.df0)**2) 122 | dfvar = dfvar_raw/(self.df0**2) 123 | return f, df, fvar, dfvar 124 | 125 | def rescale_t(self, t): 126 | """Rescale a step size used internally by multiplying with the base step 127 | size.""" 128 | 129 | return t*self.alpha0 130 | 131 | def rescale_obs(self, f, df, fvar, dfvar): 132 | """Rescale an observation to real-world scale.""" 133 | 134 | f_raw = f*self.df0*self.alpha0 + self.f0 135 | df_raw = df*self.df0 136 | fvar_raw = fvar*(self.alpha0*self.df0)**2 137 | dfvar_raw = dfvar*self.df0**2 138 | return f_raw, df_raw, fvar_raw, dfvar_raw 139 | 140 | def prepare(self, *pass_to_func_args): 141 | """Preparation. 142 | 143 | *pass_to_func_args are arguments that are passed to the function interface, 144 | e.g. a feed dict.""" 145 | 146 | # Call the prepare op of the function interface, reset the observation 147 | # lists, the sigmas, and f0 and df0 148 | f_raw, df_raw, fvar_raw, dfvar_raw = self.func.prepare(*pass_to_func_args) 149 | self.f0 = f_raw 150 | self.df0 = np.abs(df_raw) 151 | 152 | # Add the first observation to the gp 153 | f, df, fvar, dfvar = self.scale_obs(f_raw, df_raw, fvar_raw, dfvar_raw) 154 | self.gp.add(0.0, f, df, fvar, dfvar) 155 | 156 | # Set flag that the prepare method has been called 157 | self.prepare_called = True 158 | 159 | def accept(self): 160 | """Accept the most recent step size.""" 161 | 162 | assert self.abort_status != 1 163 | assert self.num_steps >= 1 164 | 165 | # Rescale to the "real-world" step size alpha 166 | alpha = self.rescale_t(self.gp.ts[-1]) 167 | 168 | # If this accept was not due to an abort and the step size did not change 169 | # *too much*, we use the accepted alpha as the new base step size alpha0 170 | # (and update a running average alpha_stats). Otherwise, we use said 171 | # running average as the new base step size. 
172 | f = self.max_change_factor 173 | if self.abort_status == 0 and self.alpha0/f < alpha < self.alpha0*f: 174 | self.alpha_stats = 0.95*self.alpha_stats + 0.05*alpha 175 | self.alpha0 = self.fpush*alpha 176 | else: 177 | self.alpha0 = self.alpha_stats 178 | 179 | # Reset abort status and counters 180 | self.abort_status = 0 181 | self.num_steps = 0 182 | self.num_expl = 0 183 | 184 | # Run accept op, reset f0 and df0 185 | f_raw, df_raw, fvar_raw, dfvar_raw = self.func.accept() 186 | self.f0 = f_raw 187 | self.df0 = np.abs(df_raw) 188 | 189 | # Reset the gp and add the first observation to the gp 190 | self.gp.reset() 191 | f, df, fvar, dfvar = self.scale_obs(f_raw, df_raw, fvar_raw, dfvar_raw) 192 | self.gp.add(0.0, f, df, fvar, dfvar) 193 | 194 | def evaluate(self, t, *pass_to_func_args): 195 | """Evaluate at step size ``t``. 196 | 197 | *pass_to_func_args are arguments that are passed to the function interface, 198 | e.g. a feed dict.""" 199 | 200 | assert self.prepare_called 201 | 202 | self.num_steps += 1 203 | 204 | # Call the adv_eval method of the function interface with the increment 205 | # re-scaled to the "real-world" step size 206 | dt = t-self.gp.ts[-1] 207 | dalpha = self.rescale_t(dt) 208 | f_raw, df_raw, fvar_raw, dfvar_raw = self.func.adv_eval(dalpha, 209 | *pass_to_func_args) 210 | 211 | # Safeguard against inf or nan encounters. Trigerring abort. 212 | if np.isnan(f_raw) or np.isinf(f_raw) or np.isnan(df_raw) or np.isinf(df_raw): 213 | f_raw = 100.0 214 | df_raw = 10.0 215 | self.abort_status = 1 216 | 217 | # Scale the observations, add it to the GP and update the GP 218 | # We are currently using the variance estimates from t=0 for all 219 | # observations, but this might change in the future 220 | f, df, fvar, dfvar = self.scale_obs(f_raw, df_raw, fvar_raw, dfvar_raw) 221 | fvar = self.gp.fvars[0] 222 | dfvar = self.gp.dfvars[0] 223 | self.gp.add(t, f, df, fvar, dfvar) 224 | self.gp.update() 225 | 226 | def find_next_t(self): 227 | """Find the step size for the next evaluation.""" 228 | 229 | assert self.num_steps >= 1 230 | 231 | # Generate candidates: the points where the derivative of the posterior 232 | # mean equals the target value plus one exploration point to the right. 
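# Note: all candidate ts below live in the rescaled coordinate in which the
# base step size alpha0 equals 1; they are converted back to a real-world
# step size via rescale_t() only when evaluate() calls the function interface.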
233 | candidates = self.gp.find_dmu_equal(self.target_df) 234 | if self.expl_policy == "linear": 235 | candidates.append(2.*(self.num_expl+1)) 236 | elif self.expl_policy == "exponential": 237 | candidates.append(2.**(self.num_expl+1)) 238 | else: 239 | raise Exception("Unknown exploration policy") 240 | print "\t * Computing utilities for candidates %s", candidates 241 | 242 | # Compute p_Wolfe for candidates 243 | pws = [self.compute_p_wolfe(t) for t in candidates] 244 | print "\t * p_Wolfe:", pws 245 | ind_best = np.argmax(pws) 246 | 247 | # Memorize when we have chosen the exploration point 248 | if ind_best == len(candidates) - 1: 249 | self.num_expl += 1 250 | 251 | # Return the candidate t with maximal utility 252 | print "\t * Best candidate is", candidates[ind_best], "(was candidate", ind_best, "/", len(candidates)-1, ")" 253 | return candidates[ind_best] 254 | 255 | def find_abort_t(self): 256 | """Find the step size to use for an abort.""" 257 | 258 | return 0.01 259 | # We are currently simply aborting with a very small step, but we might do 260 | # something like this: 261 | # ts = self.gp.ts 262 | # pws = [self.compute_p_wolfe(t) for t in ts] 263 | # if max(pws) > 0.5*self.cW: 264 | # t = ts[np.argmax(pws)] 265 | # else: 266 | # t = 0.0 267 | # offset = 0.01 268 | # 269 | # return t + offset 270 | 271 | def compute_p_wolfe(self, t): 272 | # Already changed dCov and Covd here 273 | """Computes the probability that step size ``t`` satisfies the adjusted 274 | Wolfe conditions under the current GP model.""" 275 | 276 | # Compute mean and covariance matrix of the two Wolfe quantities a and b 277 | # (equations (11) to (13) in [1]). 278 | mu0 = self.gp.mu(0.) 279 | dmu0 = self.gp.dmu(0.) 280 | mu = self.gp.mu(t) 281 | dmu = self.gp.dmu(t) 282 | V0 = self.gp.V(0.) 283 | Vd0 = self.gp.Vd(0.) 284 | dVd0 = self.gp.dVd(0.) 285 | dCov0t = self.gp.dCov_0(t) 286 | Covd0t = self.gp.Covd_0(t) 287 | 288 | ma = mu0 - mu + self.c1*t*dmu0 289 | Vaa = V0 + dVd0*(self.c1*t)**2 + self.gp.V(t) \ 290 | + 2.*self.c1*t*(Vd0 - dCov0t) - 2.*self.gp.Cov_0(t) 291 | mb = dmu 292 | Vbb = self.gp.dVd(t) 293 | 294 | # Very small variances can cause numerical problems. Safeguard against 295 | # this with a deterministic evaluation of the Wolfe conditions. 296 | if Vaa < 1e-9 or Vbb < 1e-9: 297 | return 1. if ma>=0. and mb>=0. else 0. 298 | 299 | Vab = Covd0t + self.c1*t*self.gp.dCovd_0(t) - self.gp.Vd(t) 300 | 301 | # Compute correlation factor and integration bounds for adjusted p_Wolfe 302 | # and return the result of the bivariate normal integral. 303 | rho = Vab/np.sqrt(Vaa*Vbb) 304 | al = -ma/np.sqrt(Vaa) 305 | bl = (self.df_lo - mb)/np.sqrt(Vbb) 306 | bu = (self.df_hi - mb)/np.sqrt(Vbb) 307 | return utils.bounded_bivariate_normal_integral(rho, al, np.inf, bl, bu) 308 | 309 | def check_for_acceptance(self): 310 | """Checks whether the most recent point should be accepted.""" 311 | 312 | # Return False when no evaluations t>0 have been made yet 313 | if self.num_steps == 0: 314 | return False 315 | 316 | # If an abort has been triggered, return True 317 | if self.abort_status == 2: 318 | return True 319 | 320 | # Check Wolfe probability 321 | pW = self.compute_p_wolfe(self.gp.ts[-1]) 322 | if pW >= self.cW: 323 | return True 324 | else: 325 | return False 326 | 327 | def proceed(self, *pass_to_func_args): 328 | """Make one step (function evaluation) in the line search. 329 | 330 | *pass_to_func_args are arguments that are passed to the function interface, 331 | e.g. 
a feed dict.""" 332 | 333 | assert self.prepare_called 334 | 335 | # Check for acceptance and accept the previous point as the case may be 336 | if self.check_for_acceptance(): 337 | print "-> ACCEPT" 338 | print "\t * alpha = ", self.rescale_t(self.gp.ts[-1]), "[alpha0 was", self.alpha0, "]" 339 | self.accept() 340 | print "\t * f = ", self.f0 341 | 342 | # In the first call to proceed in a new line search, evaluate at t=1. 343 | if self.num_steps == 0: 344 | print "************************************" 345 | print "NEW LINE SEARCH [alpha0 is", self.alpha0, "]" 346 | print "-> First step, evaluating at t = 1.0" 347 | self.evaluate(1., *pass_to_func_args) 348 | 349 | # Abort with a very small, safe step size if 350 | # - Abort triggered in another method, e.g. evaluate() encountered inf or 351 | # nan. (self.abort_status==1) 352 | # - the maximum number of steps per line search is exceeded 353 | # - the maximum number of exploration steps is exceeded 354 | # - the posterior derivative at t=0. is too large (bad search direction) 355 | elif (self.abort_status == 1 356 | or self.num_steps >= self.max_steps 357 | or self.num_expl >= self.max_expl 358 | or self.gp.dmu(0.) >= self.max_dmu0): 359 | t_new = self.find_abort_t() 360 | print "-> Aborting with t = ", t_new 361 | self.evaluate(t_new, *pass_to_func_args) 362 | self.abort_status = 2 363 | 364 | # This is an "ordinary" evaluation. Find the best candidate for the next 365 | # evaluation and evaluate there. 366 | else: 367 | print "-> Ordinary step", self.num_steps, ", searching for new t" 368 | t_new = self.find_next_t() 369 | print "\t * Evaluating at t =", t_new 370 | self.evaluate(t_new, *pass_to_func_args) 371 | 372 | # Return the real-world function value 373 | f, _, _, _ = self.rescale_obs(self.gp.fs[-1], self.gp.dfs[-1], 374 | self.gp.fvars[-1], self.gp.dfvars[-1]) 375 | return f 376 | 377 | def proceed_constant_step(self, alpha, *pass_to_func_args): 378 | """Make one step (function evaluation) in the line search. 379 | 380 | *pass_to_func_args are arguments that are passed to the function interface, 381 | e.g. a feed dict.""" 382 | 383 | assert self.prepare_called 384 | 385 | if self.num_steps >= 1: 386 | self.accept() 387 | 388 | print "************************************" 389 | print "CONSTANT STEP with alpha =", alpha, "[alpha0 is", self.alpha0, "]" 390 | t = alpha/self.alpha0 391 | print "-> Evaluating at t =", t 392 | self.evaluate(t, *pass_to_func_args) 393 | 394 | f, _, _, _ = self.rescale_obs(self.gp.fs[-1], self.gp.dfs[-1], self.gp.fvars[-1], self.gp.dfvars[-1]) 395 | return f 396 | 397 | # ToDo: Commenting 398 | def visualize_ei_pw(self, ax): 399 | """Visualize the current state of the line search: expected improvement 400 | and p_Wolfe. 401 | 402 | ``ax`` is a matplotlib axis.""" 403 | 404 | a, b = min(self.gp.ts), max(self.gp.ts) 405 | lo = a - .05*(b-a) 406 | up = b + (b-a) 407 | tt = np.linspace(lo, up, num=1000) 408 | ei = [self.gp.expected_improvement(t) for t in tt] 409 | pw = [self.compute_p_wolfe(t) for t in tt] 410 | prod = [e*p for e, p in zip(ei, pw)] 411 | ax.hold(True) 412 | ax.plot(tt, ei, label="EI") 413 | ax.plot(tt, pw, label="pW") 414 | ax.plot(tt, prod, label="EI*pW") 415 | ax.plot([lo, up], [self.cW, self.cW], color="grey") 416 | ax.text(lo, self.cW, "Acceptance threshold", fontsize=8) 417 | ax.set_xlim(lo, up) 418 | ax.legend(fontsize=10) 419 | 420 | ## LEGACY VERSION OF p_Wolfe ################################################# 421 | # Changed dCov and Covd here already!
422 | # def compute_p_wolfe_original(self, t): 423 | # """Computes the probability that step size ``t`` satisfies the Wolfe 424 | # conditions under the current GP model.""" 425 | # 426 | # # Compute mean and covariance matrix of the two Wolfe quantities a and b 427 | # # (equations (11) to (13) in [1]). 428 | # mu0 = self.gp.mu(0.) 429 | # dmu0 = self.gp.dmu(0.) 430 | # mu = self.gp.mu(t) 431 | # dmu = self.gp.dmu(t) 432 | # V0 = self.gp.V(0.) 433 | # Vd0 = self.gp.Vd(0.) 434 | # dVd0 = self.gp.dVd(0.) 435 | # ma = mu0 - mu + self.c1*t*dmu0 436 | # Vaa = V0 + dVd0*(self.c1*t)**2 + self.gp.V(t) \ 437 | # + 2.*self.c1*t*(Vd0 - self.gp.dCov_0(t)) - 2.*self.gp.Cov_0(t) 438 | # mb = dmu - self.c2*dmu0 439 | # Vbb = dVd0*self.c2**2 - 2.*self.c2*self.gp.dCovd_0(t) + self.gp.dVd(t) 440 | # 441 | # # Very small variances can cause numerical problems. Safeguard against 442 | # # this with a deterministic evaluation of the Wolfe conditions. 443 | # if Vaa < 1e-9 or Vbb < 1e-9: 444 | # return 1. if ma>=0. and mb>=0. else 0. 445 | # 446 | # Vab = -self.c2*(Vd0 + self.c1*t*dVd0) + self.c2*self.gp.dCov_0(t) \ 447 | # + self.gp.Covd_0(t) + self.c1*t*self.gp.dCovd_0(t) - self.gp.Vd(t) 448 | # 449 | # # Compute rho and integration bounds for p_Wolfe and return the result of 450 | # # the bivariate normal integral. Upper limit for b is used when strong 451 | # # Wolfe conditions are requested (cf. equations (14) to (16)in [1]). 452 | # rho = Vab/np.sqrt(Vaa*Vbb) 453 | # al = -ma/np.sqrt(Vaa) 454 | # bl = -mb/np.sqrt(Vbb) 455 | # if self.strong_wolfe: 456 | # bbar = 2.*self.c2*(np.abs(dmu0) + 2.*np.sqrt(dVd0)) 457 | # bu = (bbar - mb)/np.sqrt(Vbb) 458 | # return utils.bounded_bivariate_normal_integral(rho, al, np.inf, bl, bu) 459 | # else: 460 | # return utils.unbounded_bivariate_normal_integral(rho, al, bl) 461 | ############################################################################### -------------------------------------------------------------------------------- /probls/gaussian_process.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Implements the Gaussian process functionality needed for the probabilistic 4 | line search algorithm. 5 | """ 6 | 7 | import numpy as np 8 | from scipy import linalg 9 | from utils import erf 10 | 11 | class ProbLSGaussianProcess(object): 12 | """Gaussian process implementation for probabilistic line searches [1]. 13 | Implements 1D GP regression with observations of function value and 14 | derivative. Kernel is a once-integrated Wiener process with theta=1.0. 15 | 16 | Public interface: 17 | - ``gp = ProbLSGaussianProcess()`` 18 | - ``gp.add(t, f, df, sigma2_f, sigma2_df)`` to add a new observation. 19 | - ``gp.reset()`` to remove all observations. 20 | - ``gp.update()`` to set up and invert the Gram matrix and make the GP 21 | ready for inference (i.e. the following methods). 22 | - ``gp.mu(t)`` returns the posterior mean at ``t``. 23 | - ``gp.V(t)`` returns the posterior variance ``t``. 24 | - ``gp.expected_improvement(t)`` returns the expected improvement at 25 | ``t `` 26 | - ``gp.cubic_polynomial_coefficients(t)`` to get the coefficients of the 27 | cubic polynomial ``gp.mu()`` at ``t`` (the posterior mean is 28 | piece-wise cubic). 29 | - ``gp.find_cubic_minima()`` to get the minima (if existent) of the cubic 30 | polynomials in each "cell", i.e. between (sorted) observation at t_i 31 | and t_i+1. 
32 | - ``gp.find_dmu_equal(val)``, like ``find_cubic_minima()``, but for 33 | points where the derivative of the posterior mean equals ``val`` (and 34 | the second derivative is positive). 35 | 36 | [1] M. Mahsereci and P. Hennig. Probabilistic line searches for stochastic 37 | optimization. In Advances in Neural Information Processing Systems 28, pages 38 | 181-189, 2015""" 39 | 40 | def __init__(self, theta=1.0, offset=10.0): 41 | """Create a new GP object.""" 42 | 43 | # Hyperparamters of the GP 44 | self.theta = theta 45 | self.offset = offset 46 | 47 | # Observation counter and arrays to store observations 48 | self.N = 0 49 | self.ts = [] 50 | self.fs = [] 51 | self.dfs = [] 52 | self.fvars = [] 53 | self.dfvars = [] 54 | 55 | # Kernel matrices 56 | self.K = None 57 | self.Kd = None 58 | self.dKd = None 59 | 60 | # Gram matrix and pre-computed "weights" of the GP 61 | self.G = None 62 | self.w = None 63 | 64 | # Switch that remembers whether we are ready for inference (calls to mu, 65 | # V, etc...). It is set to False when the GP is manipulated (points added, 66 | # noise level adjusted, reset). After such manipulations, gp.update() has 67 | # to be called. Remember current best observation of exp. improvement 68 | self.ready = False 69 | self.min_obs = None 70 | 71 | def reset(self): 72 | """Reset the GP, removing all previous observations. 73 | 74 | Automatically adds the observation at t=0 (with f=0 and df=-1).""" 75 | 76 | self.N = 0 77 | self.ts = [] 78 | self.fs = [] 79 | self.dfs = [] 80 | self.fvars = [] 81 | self.dfvars = [] 82 | self.K = None 83 | self.Kd = None 84 | self.dKd = None 85 | self.G = None 86 | self.LU = None 87 | self.LU_piv = None 88 | self.w = None 89 | 90 | self.min_obs = None 91 | self.ready = False 92 | 93 | def add(self, t, f, df, fvar=0.0, dfvar=0.0): 94 | """Add a new observation (t, f, df, simga2_f, sigma2_df) to the GP. 95 | 96 | This stores the observation internally, but does NOT yet set up and invert 97 | the Gram matrix. Add observations with repeated calls to this method, then 98 | call ``gp.update()`` to set up and invert the Gram matrix. Only then you 99 | can perform inference (calls to ``gp.mu(t)``, ``gp.V(t)``, etc...).""" 100 | 101 | assert isinstance(t, (float, np.float32, np.float64)) 102 | assert isinstance(f, (float, np.float32, np.float64)) 103 | assert isinstance(df, (float, np.float32, np.float64)) 104 | assert isinstance(fvar, (float, np.float32, np.float64)) 105 | assert isinstance(dfvar, (float, np.float32, np.float64)) 106 | 107 | self.ready = False 108 | self.min_obs = None 109 | 110 | self.N += 1 111 | self.ts.append(t) 112 | self.fs.append(f) 113 | self.dfs.append(df) 114 | self.fvars.append(fvar) 115 | self.dfvars.append(dfvar) 116 | 117 | def update(self): 118 | """Set up the Gram matrix and compute its LU decomposition to make the GP 119 | ready for inference (calls to ``.gp.mu(t)``, ``gp.V(t)``, etc...). 120 | 121 | Call this method after you have manipulated the GP by 122 | - ``gp.reset()`` ing, 123 | - adding observations with ``gp.add(t, f, df)``, or 124 | - adjusting the sigmas via ``gp.update_sigmas()``. 125 | and want to perform inference next.""" 126 | 127 | if self.ready: 128 | return 129 | 130 | # Set up the kernel matrices. 
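# K collects the prior covariances between the observed function values,
# Kd the cross-covariances between function values and derivatives, and
# dKd the covariances between derivatives, all evaluated at the observed ts.
# Together with the noise matrices S_f and S_df they form the Gram matrix G
# assembled below, whose LU decomposition is reused for all posterior queries.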
131 | self.K = np.matrix(np.zeros([self.N, self.N])) 132 | self.Kd = np.matrix(np.zeros([self.N, self.N])) 133 | self.dKd = np.matrix(np.zeros([self.N, self.N])) 134 | for i in range(self.N): 135 | for j in range(self.N): 136 | self.K[i, j] = self.k(self.ts[i], self.ts[j]) 137 | self.Kd[i, j] = self.kd(self.ts[i], self.ts[j]) 138 | self.dKd[i, j] = self.dkd(self.ts[i], self.ts[j]) 139 | 140 | # Put together the Gram matrix 141 | S_f = np.matrix(np.diag(self.fvars)) 142 | S_df = np.matrix(np.diag(self.dfvars)) 143 | self.G = np.bmat([[self.K + S_f, self.Kd], 144 | [self.Kd.T, self.dKd + S_df]]) 145 | 146 | # Compute the LU decomposition of G and store it 147 | self.LU, self.LU_piv = linalg.lu_factor(self.G, check_finite=True) 148 | 149 | # Set ready switch to True 150 | self.ready = True 151 | 152 | # Pre-compute the regression weights used in mu 153 | self.w = self.solve_G(np.array(self.fs + self.dfs)) 154 | 155 | def solve_G(self, b): 156 | """Solve ``Gx=b`` where ``G`` is the Gram matrix of the GP. 157 | 158 | Uses the internally-stored LU decomposition of ``G`` computed in 159 | ``gp.update()``.""" 160 | 161 | assert self.ready 162 | return linalg.lu_solve((self.LU, self.LU_piv), b, check_finite=True) 163 | 164 | def mu(self, t): 165 | """Evaluate posterior mean of f at ``t``.""" 166 | 167 | assert isinstance(t, (float, np.float32, np.float64)) 168 | assert self.ready 169 | 170 | # Compute kernel vector (k and kd) of the query t and the observations T 171 | # Then perform inner product with the pre-computed GP weights 172 | T = np.array(self.ts) 173 | kvec = np.concatenate([self.k(t, T), self.kd(t, T)]) 174 | 175 | return np.dot(self.w, kvec) 176 | 177 | def dmu(self, t): 178 | """Evaluate first derivative of the posterior mean of df at ``t``.""" 179 | 180 | assert isinstance(t, (float, np.float32, np.float64)) 181 | assert self.ready 182 | 183 | # Same is in mu, with the respective "derivative kernel vectors" 184 | T = np.array(self.ts) 185 | kvec = np.concatenate([self.kd(T, t), self.dkd(t, T)]) 186 | 187 | return np.dot(self.w, kvec) 188 | 189 | def d2mu(self, t): 190 | """Evaluate 2nd derivative of the posterior mean of f at ``t``.""" 191 | 192 | assert isinstance(t, (float, np.float32, np.float64)) 193 | assert self.ready 194 | 195 | # Same is in mu, with the respective "derivative kernel vectors" 196 | T = np.array(self.ts) 197 | kvec = np.concatenate([self.d2k(t, T), self.d2kd(t, T)]) 198 | 199 | return np.dot(self.w, kvec) 200 | 201 | def d3mu(self, t): 202 | """Evaluate 3rd derivative of the posterior mean of f at ``t``.""" 203 | 204 | assert isinstance(t, (float, np.float32, np.float64)) 205 | assert self.ready 206 | 207 | # Same is in mu, with the respective "derivative kernel vectors" 208 | T = np.array(self.ts) 209 | kvec = np.concatenate([self.d3k(t, T), np.zeros(self.N)]) 210 | 211 | return np.dot(self.w, kvec) 212 | 213 | def V(self, t): 214 | """Evaluate posterior variance of f at ``t``.""" 215 | 216 | assert isinstance(t, (float, np.float32, np.float64)) 217 | assert self.ready 218 | 219 | # Compute the needed k vector 220 | T = np.array(self.ts) 221 | kvec = np.concatenate([self.k(t, T), self.kd(t,T)]) 222 | ktt = self.k(t, t) 223 | 224 | return ktt - np.dot(kvec, self.solve_G(kvec)) 225 | 226 | def Vd(self, t): 227 | """Evaluate posterior co-variance of f and df at ``t``.""" 228 | 229 | assert isinstance(t, (float, np.float32, np.float64)) 230 | assert self.ready 231 | 232 | T = np.array(self.ts) 233 | ktT = self.k(t, T) 234 | kdtT = self.kd(t, T) 235 | dktT = 
self.kd(T, t) 236 | dkdtT = self.dkd(t, T) 237 | kdtt = self.kd(t, t) 238 | kvec_a = np.concatenate([ktT, kdtT]) 239 | kvec_b = np.concatenate([dktT, dkdtT]) 240 | 241 | return kdtt - np.dot(kvec_a, self.solve_G(kvec_b)) 242 | 243 | def dVd(self, t): 244 | """Evaluate posterior variance of df at ``t``""" 245 | 246 | assert isinstance(t, (float, np.float32, np.float64)) 247 | assert self.ready 248 | 249 | T = np.array(self.ts) 250 | dkdtt = self.dkd(t, t) 251 | dktT = self.kd(T, t) 252 | dkdtT = self.dkd(t, T) 253 | kvec = np.concatenate([dktT, dkdtT]) 254 | 255 | return dkdtt - np.dot(kvec, self.solve_G(kvec)) 256 | 257 | def Cov_0(self, t): 258 | """Evaluate posterior co-variance of f at 0. and ``t``.""" 259 | 260 | assert isinstance(t, (float, np.float32, np.float64)) 261 | assert self.ready 262 | 263 | T = np.array(self.ts) 264 | k0t = self.k(0., t) 265 | k0T = self.k(0., T) 266 | kd0T = self.kd(0., T) 267 | ktT = self.k(t, T) 268 | kdtT = self.kd(t, T) 269 | kvec_a = np.concatenate([k0T, kd0T]) 270 | kvec_b = np.concatenate([ktT, kdtT]) 271 | 272 | return k0t - np.dot(kvec_a, self.solve_G(kvec_b)) 273 | 274 | def Covd_0(self, t): 275 | """Evaluate posterior co-variance of f at 0. and df at ``t``.""" 276 | # !!! I changed this in line_search new, Covd_0 <-> dCov_0 277 | 278 | assert isinstance(t, (float, np.float32, np.float64)) 279 | assert self.ready 280 | 281 | T = np.array(self.ts) 282 | kd0t = self.kd(0., t) 283 | k0T = self.k(0., T) 284 | kd0T = self.kd(0., T) 285 | dktT = self.kd(T, t) 286 | dkdtT = self.dkd(t, T) 287 | kvec_a = np.concatenate([k0T, kd0T]) 288 | kvec_b = np.concatenate([dktT, dkdtT]) 289 | 290 | return kd0t - np.dot(kvec_a, self.solve_G(kvec_b)) 291 | 292 | def dCov_0(self, t): 293 | """Evaluate posterior co-variance of df at 0. and f at ``t``.""" 294 | # !!! I changed this in line_search new, Covd_0 <-> dCov_0 295 | 296 | assert isinstance(t, (float, np.float32, np.float64)) 297 | assert self.ready 298 | 299 | T = np.array(self.ts) 300 | dk0t = self.kd(t, 0.) 301 | dk0T = self.kd(T, 0.) 302 | dkd0T = self.dkd(0., T) 303 | ktT = self.k(t, T) 304 | kdtT = self.kd(t, T) 305 | kvec_a = np.concatenate([dk0T, dkd0T]) 306 | kvec_b = np.concatenate([ktT, kdtT]) 307 | 308 | return dk0t - np.dot(kvec_a, self.solve_G(kvec_b)) 309 | 310 | def dCovd_0(self, t): 311 | """Evaluate posterior co-variance of df at 0. and ``t``.""" 312 | 313 | assert isinstance(t, (float, np.float32, np.float64)) 314 | assert self.ready 315 | 316 | T = np.array(self.ts) 317 | dkd0t = self.dkd(0., t) 318 | dk0T = self.kd(T, 0.) 319 | dkd0T = self.dkd(0., T) 320 | dktT = self.kd(T, t) 321 | dkdtT = self.dkd(t, T) 322 | kvec_a = np.concatenate([dk0T, dkd0T]) 323 | kvec_b = np.concatenate([dktT, dkdtT]) 324 | 325 | return dkd0t - np.dot(kvec_a, self.solve_G(kvec_b)) 326 | 327 | def cubic_polynomial_coefficients(self, t): 328 | """The posterior mean ``mu`` of this GP is piece-wise cubic. Return the 329 | coefficients of the cubic polynomial that is ``mu`` at ``t``.""" 330 | 331 | assert isinstance(t, (float, np.float32, np.float64)) 332 | assert t not in self.ts # at the observations, polynomial is ambiguous 333 | 334 | d1, d2, d3 = self.dmu(t), self.d2mu(t), self.d3mu(t) 335 | a = d3/6.0 336 | b = 0.5*d2-3*a*t 337 | c = d1-3*a*t**2-2*b*t 338 | d = self.mu(t)-a*t**3-b*t**2-c*t 339 | 340 | return (a, b, c, d) 341 | 342 | def quadratic_polynomial_coefficients(self, t): 343 | """The posterior mean ``mu`` of this GP is piece-wise cubic. 
Return the 344 | coefficients of the **quadratic** polynomial that is the **derivative** of 345 | ``mu`` at ``t``. 346 | 347 | This is used to find the minimum of the cubic polynomial in 348 | ``gp.find_mimima()``.""" 349 | 350 | assert isinstance(t, (float, np.float32, np.float64)) 351 | assert t not in self.ts # at the observations, polynomial is ambiguous 352 | 353 | d1, d2, d3 = self.dmu(t), self.d2mu(t), self.d3mu(t) 354 | a = .5*d3 355 | b = d2 - d3*t 356 | c = d1 - d2*t + 0.5*d3*t**2 357 | 358 | return (a, b, c) 359 | 360 | def find_dmu_equal(self, val): 361 | """Finds points where the derivative of the posterior mean equals ``val`` 362 | and the second derivative is positive. 363 | 364 | The posterior mean is a cubic polynomial in each of the cells" 365 | ``[t_i, t_i+1]`` where the t_i are the sorted observed ts. For each of 366 | these cells, returns points with dmu==val the cubic polynomial if it exists 367 | and happens to lie in that cell.""" 368 | 369 | # We want to go through the observations from smallest to largest t 370 | ts_sorted = list(self.ts) 371 | ts_sorted.sort() 372 | 373 | solutions = [] 374 | 375 | for t1, t2 in zip(ts_sorted, ts_sorted[1:]): 376 | # Compute the coefficients of the quadratic polynomial dmu/dt in this 377 | # cell, then call the function minimize_cubic to find the minimizer. 378 | # If there is one and it falls into the current cell, store it 379 | a, b, c = self.quadratic_polynomial_coefficients(t1+0.5*(t2-t1)) 380 | solutions_cell = quadratic_polynomial_solve(a, b, c, val) 381 | for s in solutions_cell: 382 | if s>t1 and s0. 528 | 529 | Returns the *list* of solutions (containg 1 or 0 solutions).""" 530 | 531 | assert isinstance(a, (float, np.float32, np.float64)) 532 | assert isinstance(b, (float, np.float32, np.float64)) 533 | assert isinstance(c, (float, np.float32, np.float64)) 534 | assert isinstance(val, (float, np.float32, np.float64)) 535 | 536 | # Check if a is almost zero. If so, solve the remaining linear equation. Note 537 | # that we return only soultions with f''(t) = b > 0 538 | if abs(a) < 1e-9: 539 | if b > 1e-9: 540 | return [(val-c)/b] 541 | else: 542 | return [] 543 | 544 | # Compute the term under the square root in pq formula, if it is negative, 545 | # there is no real solution 546 | det = b**2-4.*a*(c-val) 547 | if det < 0: 548 | return [] 549 | 550 | # Otherwise, compute the two roots 551 | s = np.sqrt(det) 552 | r1 = (-b - np.sign(a)*s)/(2.*a) 553 | r2 = (-b + np.sign(a)*s)/(2.*a) 554 | 555 | # Return the one with f''(t) = 2at + b > 0, or [] 556 | if 2*a*r1+b > 0: 557 | return [r1] 558 | elif 2*a*r2+b > 0: 559 | return [r2] 560 | else: 561 | return [] --------------------------------------------------------------------------------