├── .gitignore ├── probls ├── __init__.py ├── tensorflow_interface │ ├── __init__.py │ ├── gradient_moment.py │ └── interface_sgd.py ├── utils.py ├── line_search.py └── gaussian_process.py ├── examples ├── models │ ├── __init__.py │ ├── mnist_mlp.py │ ├── mnist_2conv_2dense.py │ └── cifar10_2conv_3dense.py ├── run_probls_cifar10.py ├── run_probls_mnist.py ├── run_probls_mnist_interactive.py └── cifar10.py ├── test ├── demo_gaussian_process.py ├── test_utils.py ├── test_gaussian_process.py ├── demo_interface_sgd.py └── test_gradient_moment.py ├── README.md └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | examples/data 3 | -------------------------------------------------------------------------------- /probls/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Nov 16 15:46:48 2016 4 | 5 | @author: lballes 6 | """ 7 | 8 | -------------------------------------------------------------------------------- /probls/tensorflow_interface/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Nov 16 15:46:48 2016 4 | 5 | @author: lballes 6 | """ 7 | 8 | -------------------------------------------------------------------------------- /examples/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Nov 27 11:59:40 2016 4 | 5 | @author: Lukas Balles [lballes@tuebingen.mpg.de] 6 | """ 7 | -------------------------------------------------------------------------------- /examples/models/mnist_mlp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | TensorFlow MNIST MLP model. 4 | """ 5 | 6 | import tensorflow as tf 7 | 8 | def weight_variable(shape): 9 | initial = tf.truncated_normal(shape, stddev=1e-2) 10 | return tf.Variable(initial) 11 | 12 | def bias_variable(shape): 13 | initial = tf.constant(0.05, shape=shape) 14 | return tf.Variable(initial) 15 | 16 | def set_up_model(): 17 | tf.reset_default_graph() 18 | X = tf.placeholder(tf.float32, shape=[None, 784]) 19 | y = tf.placeholder(tf.float32, shape=[None, 10]) 20 | W_fc1 = weight_variable([784, 800]) 21 | b_fc1 = bias_variable([800]) 22 | h_fc1 = tf.nn.sigmoid(tf.matmul(X, W_fc1) + b_fc1) 23 | W_fc2 = weight_variable([800, 10]) 24 | b_fc2 = bias_variable([10]) 25 | h_fc2 = tf.nn.softmax(tf.matmul(h_fc1, W_fc2) + b_fc2) 26 | losses = -tf.reduce_sum(y*tf.log(h_fc2), reduction_indices=[1]) 27 | return losses, [X, y], [W_fc1, b_fc1, W_fc2, b_fc2] 28 | -------------------------------------------------------------------------------- /test/demo_gaussian_process.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Demo for Gaussian process functionality in probls.gaussian_process. 
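Adds a few noisy (f, df) observations to a ``ProbLSGaussianProcess``, plots the posterior over the function, its derivative and the expected improvement, and marks the candidate points returned by ``gp.find_dmu_equal``.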
4 | 5 | Created on Thu Nov 17 16:58:40 2016 6 | 7 | @author: lballes 8 | """ 9 | 10 | import os 11 | import sys 12 | sys.path.insert(0, os.path.abspath('..')) 13 | 14 | import numpy as np 15 | import matplotlib.pyplot as plt 16 | import time 17 | 18 | from probls import gaussian_process 19 | 20 | 21 | # Specify noise levels and observations 22 | fvar, dfvar = 3e-1, 1e-2 23 | observations = [(0., 0., -1.), (1., -0.5, -0.9), (2., -0.9, 0.7)] 24 | 25 | # Add observations to GP, compute posterior mean and variance 26 | gp = gaussian_process.ProbLSGaussianProcess() 27 | for obs in observations: 28 | gp.add(*obs, fvar=fvar, dfvar=dfvar) 29 | beg = time.time() 30 | gp.update() 31 | print "gp.update() took", (time.time()-beg)*10**6, "microseconds" 32 | 33 | tt = np.arange(-0.1, 4.0, 0.01) 34 | 35 | 36 | fig, (a1, a2, a3) = plt.subplots(3, 1) 37 | gp.visualize_f(a1) 38 | gp.visualize_df(a2) 39 | gp.visualize_ei(a3) 40 | 41 | # Find the minima and add them to the plot 42 | minima = gp.find_dmu_equal(0.2) 43 | a1.plot(minima, [gp.mu(m) for m in minima], 'D') 44 | a2.plot(minima, [gp.dmu(m) for m in minima], 'D') -------------------------------------------------------------------------------- /examples/run_probls_cifar10.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Run probabilistic line search on a CIFAR-10 example. 4 | """ 5 | 6 | import os 7 | import sys 8 | sys.path.insert(0, os.path.abspath('..')) 9 | 10 | import tensorflow as tf 11 | 12 | from probls.tensorflow_interface.interface_sgd import ProbLSOptimizerSGDInterface 13 | from probls.line_search import ProbLSOptimizer 14 | 15 | import cifar10 16 | 17 | #### Specify training specifics here ########################################## 18 | from models import cifar10_2conv_3dense as model 19 | num_steps = 4000 20 | batch_size = 256 21 | ############################################################################### 22 | 23 | 24 | # Set up model 25 | tf.reset_default_graph() 26 | images, labels = cifar10.distorted_inputs(batch_size=batch_size) 27 | losses, variables = model.set_up_model(images, labels) 28 | 29 | # Set up ProbLS optimizer 30 | opt_interface = ProbLSOptimizerSGDInterface() 31 | opt_interface.minimize(losses, variables) 32 | sess = tf.Session() 33 | opt_interface.register_session(sess) 34 | opt_ls = ProbLSOptimizer(opt_interface, alpha0=1e-3, cW=0.3, c1=0.05, 35 | target_df=0.5, df_lo=-0.1, df_hi=1.1, expl_policy="linear", fpush=1.0, 36 | max_change_factor=10., max_steps=10, max_expl=10, max_dmu0=0.0) 37 | 38 | # Initialize variables and start queues 39 | coord = tf.train.Coordinator() 40 | sess.run(tf.global_variables_initializer()) 41 | threads = tf.train.start_queue_runners(sess=sess, coord=coord) 42 | 43 | # Run ProbLS 44 | opt_ls.prepare() 45 | for i in range(num_steps): 46 | print(opt_ls.proceed()) 47 | 48 | # Stop queues 49 | coord.request_stop() 50 | coord.join(threads) -------------------------------------------------------------------------------- /examples/models/mnist_2conv_2dense.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | TensorFlow MNIST CNN model. 
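``set_up_model()`` builds a network with two convolutional and two dense layers and returns the vector of per-example cross-entropy losses, the ``[X, y]`` placeholders and the list of trainable variables, as consumed by the example scripts in ``examples/``.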
4 | """ 5 | 6 | import tensorflow as tf 7 | 8 | def weight_variable(shape): 9 | initial = tf.truncated_normal(shape, stddev=1e-2) 10 | return tf.Variable(initial) 11 | 12 | def bias_variable(shape): 13 | initial = tf.constant(0.05, shape=shape) 14 | return tf.Variable(initial) 15 | 16 | def conv2d(x, W): 17 | return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME') 18 | 19 | def max_pool_2x2(x): 20 | return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], 21 | strides=[1, 2, 2, 1], padding='SAME') 22 | 23 | def set_up_model(): 24 | tf.reset_default_graph() 25 | X = tf.placeholder(tf.float32, shape=[None, 784]) 26 | y = tf.placeholder(tf.float32, shape=[None, 10]) 27 | W_conv1 = weight_variable([5, 5, 1, 32]) 28 | b_conv1 = bias_variable([32]) 29 | X_image = tf.reshape(X, [-1,28,28,1]) 30 | h_conv1 = tf.nn.relu(conv2d(X_image, W_conv1) + b_conv1) 31 | h_pool1 = max_pool_2x2(h_conv1) 32 | W_conv2 = weight_variable([5, 5, 32, 64]) 33 | b_conv2 = bias_variable([64]) 34 | h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2) 35 | h_pool2 = max_pool_2x2(h_conv2) 36 | W_fc1 = weight_variable([7 * 7 * 64, 1024]) 37 | b_fc1 = bias_variable([1024]) 38 | h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64]) 39 | h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1) 40 | W_fc2 = weight_variable([1024, 10]) 41 | b_fc2 = bias_variable([10]) 42 | h_fc2 = tf.nn.softmax(tf.matmul(h_fc1, W_fc2) + b_fc2) 43 | losses = -tf.reduce_sum(y*tf.log(h_fc2), reduction_indices=[1]) 44 | return losses, [X, y], [W_conv1, b_conv1, W_conv2, b_conv2, W_fc1, b_fc1, W_fc2, b_fc2] 45 | -------------------------------------------------------------------------------- /examples/run_probls_mnist.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Run probabilistic line search on a MNIST example. 
4 | """ 5 | 6 | import os 7 | import sys 8 | sys.path.insert(0, os.path.abspath('..')) 9 | 10 | import tensorflow as tf 11 | from tensorflow.examples.tutorials.mnist import input_data 12 | mnist = input_data.read_data_sets('data/mnist', one_hot=True) 13 | 14 | from probls.tensorflow_interface.interface_sgd import ProbLSOptimizerSGDInterface 15 | from probls.line_search import ProbLSOptimizer 16 | 17 | #### Specify training specifics here ########################################## 18 | from models import mnist_2conv_2dense as model # Comment/uncomment to chose 19 | #from models import mnist_mlp as model # the model to run 20 | num_steps = 4000 21 | batch_size = 256 22 | ############################################################################### 23 | 24 | 25 | # Set up model 26 | losses, placeholders, variables = model.set_up_model() 27 | X, y = placeholders 28 | 29 | # Set up ProbLS optimizer 30 | opt_interface = ProbLSOptimizerSGDInterface() 31 | opt_interface.minimize(losses, variables) 32 | sess = tf.Session() 33 | opt_interface.register_session(sess) 34 | opt_ls = ProbLSOptimizer(opt_interface, alpha0=1e-3, cW=0.3, c1=0.05, 35 | target_df=0.5, df_lo=-0.1, df_hi=1.1, expl_policy="linear", fpush=1.0, 36 | max_change_factor=10., max_steps=10, max_expl=10, max_dmu0=0.0) 37 | 38 | # Initialize variables 39 | sess.run(tf.global_variables_initializer()) 40 | 41 | # Run ProbLS 42 | batch = mnist.train.next_batch(batch_size) 43 | opt_ls.prepare({X: batch[0], y: batch[1]}) 44 | for i in range(num_steps): 45 | batch = mnist.train.next_batch(batch_size) 46 | print(opt_ls.proceed({X: batch[0], y: batch[1]})) -------------------------------------------------------------------------------- /examples/models/cifar10_2conv_3dense.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Aug 4 11:05:44 2016 4 | 5 | @author: lballes 6 | """ 7 | 8 | import tensorflow as tf 9 | 10 | def weight_variable(shape, stddev=1e-2): 11 | initial = tf.truncated_normal(shape, stddev=stddev) 12 | return tf.Variable(initial) 13 | 14 | def bias_variable(shape, val=0.05): 15 | initial = tf.constant(val, shape=shape) 16 | return tf.Variable(initial) 17 | 18 | def conv2d(x, W): 19 | return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME') 20 | 21 | def max_pool_3x3(x): 22 | return tf.nn.max_pool(x, ksize=[1, 3, 3, 1], 23 | strides=[1, 2, 2, 1], padding='SAME') 24 | 25 | def set_up_model(images, labels): 26 | W_conv1 = weight_variable([5, 5, 3, 64], 5e-2) 27 | b_conv1 = bias_variable([64], 0.0) 28 | h_conv1 = tf.nn.relu(conv2d(images, W_conv1) + b_conv1) 29 | h_conv1_pool = max_pool_3x3(h_conv1) 30 | 31 | W_conv2 = weight_variable([5, 5, 64, 64], 5e-2) 32 | b_conv2 = bias_variable([64], 0.1) 33 | h_conv2 = tf.nn.relu(conv2d(h_conv1_pool, W_conv2) + b_conv2) 34 | h_conv2_pool = max_pool_3x3(h_conv2) 35 | 36 | batch_size = tf.gather(tf.shape(images), 0) 37 | reshape = tf.reshape(h_conv2_pool, tf.pack([batch_size, -1])) 38 | dim = 2304 39 | W_fc1 = weight_variable([dim, 384], 0.04) 40 | b_fc1 = bias_variable([384], 0.1) 41 | h_fc1 = tf.nn.relu(tf.matmul(reshape, W_fc1) + b_fc1) 42 | 43 | W_fc2 = weight_variable([384, 192], 0.04) 44 | b_fc2 = bias_variable([192], 0.1) 45 | h_fc2 = tf.nn.relu(tf.matmul(h_fc1, W_fc2) + b_fc2) 46 | 47 | W_fc3 = weight_variable([192, 10], 1/192.0) 48 | b_fc3 = bias_variable([10], 0.0) 49 | h_fc3 = tf.matmul(h_fc2, W_fc3) + b_fc3 50 | 51 | labels = tf.cast(labels, tf.int64) 52 | losses = 
tf.nn.sparse_softmax_cross_entropy_with_logits(h_fc3, labels) 53 | return losses, [W_conv1, b_conv1, W_conv2, b_conv2, W_fc1, b_fc1, W_fc2, b_fc2, W_fc3, b_fc3] 54 | -------------------------------------------------------------------------------- /test/test_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Tests for utility functions in probls.utils 4 | 5 | Created on Wed Jul 6 16:12:54 2016 6 | 7 | @author: lballes 8 | """ 9 | 10 | import os 11 | import sys 12 | sys.path.insert(0, os.path.abspath('..')) 13 | 14 | import unittest 15 | import numpy as np 16 | 17 | 18 | from probls import utils 19 | cdf = utils._cdf 20 | bvnu = utils.unbounded_bivariate_normal_integral 21 | bvn = utils.bounded_bivariate_normal_integral 22 | 23 | 24 | class TestCDF(unittest.TestCase): 25 | 26 | def runTest(self): 27 | self.assertEqual(utils._cdf(0.), 0.5) 28 | self.assertAlmostEqual(cdf(1.), 0.8413, places=4) 29 | self.assertAlmostEqual(cdf(3.), 0.9987, places=4) 30 | self.assertAlmostEqual(cdf(-1.), 0.1587, places=4) 31 | self.assertAlmostEqual(cdf(-0.1), 0.4602, places=4) 32 | 33 | 34 | class TestUnboundedIntegral(unittest.TestCase): 35 | 36 | def runTest(self): 37 | self.assertEqual(bvnu(0., 0., 0.), 0.25) 38 | self.assertEqual(bvnu(1., 0., 0.), 0.5) 39 | self.assertAlmostEqual(bvnu(0.43, 2.5, -1.0), 0.0062, places=4) 40 | self.assertAlmostEqual(bvnu(-0.17, 0.5, 1.0), 0.0351, places=4) 41 | self.assertAlmostEqual(bvnu(0., 0.5, 1.0), 0.0490, places=4) 42 | self.assertAlmostEqual(bvnu(-1., 0., -3.), 0.4987, places=4) 43 | self.assertAlmostEqual(bvnu(0., -5., -5.), 1., places=4) 44 | self.assertAlmostEqual(bvnu(0., 5., 5.), 0., places=4) 45 | self.assertAlmostEqual(bvnu(1., 3., 3.), 0.0013, places=4) 46 | self.assertAlmostEqual(bvnu(-1., 0., 0.), 0., places=4) 47 | 48 | class TestBoundedIntegral(unittest.TestCase): 49 | 50 | def runTest(self): 51 | self.assertAlmostEqual(bvn(0.25, 0., 2.5, -1.2, 0.1), 0.1901, places=4) 52 | self.assertAlmostEqual(bvn(0., 0., 1., 0., 1.), 0.1165, places=4) 53 | self.assertAlmostEqual(bvn(0.5, 0., 1., 0., 1.), 0.1411, places=4) 54 | self.assertAlmostEqual(bvn(0.5, 0., np.inf, 0., 1.), 0.2059, places=4) 55 | 56 | 57 | if __name__ == "__main__": 58 | unittest.main() -------------------------------------------------------------------------------- /examples/run_probls_mnist_interactive.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Run probabilistic line search on an MNIST CNN in interactive mode. Displays 4 | visualizations of function value f, derivative df, and the probabilistc wolfe 5 | conditions and expected improvement. Click to proceed one function evaluation. 
6 | """ 7 | 8 | import os 9 | import sys 10 | sys.path.insert(0, os.path.abspath('..')) 11 | 12 | import matplotlib.pyplot as plt 13 | import tensorflow as tf 14 | from tensorflow.examples.tutorials.mnist import input_data 15 | mnist = input_data.read_data_sets('data/mnist', one_hot=True) 16 | 17 | from probls.tensorflow_interface.interface_sgd import ProbLSOptimizerSGDInterface 18 | from probls.line_search import ProbLSOptimizer 19 | 20 | #### Specify training specifics here ########################################## 21 | #from models import mnist_2conv_2dense as model # Comment/uncomment to chose 22 | from models import mnist_mlp as model # the model to run 23 | num_steps = 4000 24 | batch_size = 128 25 | ############################################################################### 26 | 27 | # Set up model 28 | losses, placeholders, variables = model.set_up_model() 29 | X, y = placeholders 30 | 31 | # Set up ProbLS optimizer 32 | opt_interface = ProbLSOptimizerSGDInterface() 33 | opt_interface.minimize(losses, variables) 34 | sess = tf.Session() 35 | opt_interface.register_session(sess) 36 | sess.run(tf.global_variables_initializer()) 37 | opt_ls = ProbLSOptimizer(opt_interface, cW=0.3, c1=0.05, target_df=0.5, 38 | df_lo=-0.1, df_hi=1.1, expl_policy="linear", 39 | fpush=1.0, max_change_factor=10., max_steps=10, 40 | max_expl=10, max_dmu0=100.0) 41 | batch = mnist.train.next_batch(batch_size) 42 | opt_ls.prepare({X: batch[0], y: batch[1]}) 43 | 44 | # Run 45 | plt.figure() 46 | for i in range(num_steps): 47 | plt.clf() 48 | batch = mnist.train.next_batch(batch_size) 49 | print opt_ls.proceed({X: batch[0], y: batch[1]}) 50 | opt_ls.gp.visualize_f(plt.subplot(3, 1, 1)) 51 | plt.subplot(3, 1, 1).set_ylabel("f") 52 | opt_ls.gp.visualize_df(plt.subplot(3, 1, 2)) 53 | plt.subplot(3, 1, 2).set_ylabel("df") 54 | opt_ls.visualize_ei_pw(plt.subplot(3, 1, 3)) 55 | plt.subplot(3, 1, 3).set_ylabel("p_Wolfe / EI") 56 | plt.show() 57 | plt.waitforbuttonpress() -------------------------------------------------------------------------------- /test/test_gaussian_process.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Test for Gaussian process implementation in probls.gaussian_process 4 | 5 | Created on Fri Jul 1 09:51:00 2016 6 | 7 | @author: lballes 8 | """ 9 | 10 | import os 11 | import sys 12 | sys.path.insert(0, os.path.abspath('..')) 13 | 14 | import unittest 15 | import numpy as np 16 | 17 | 18 | from probls import gaussian_process 19 | 20 | 21 | class TestSolveQuadraticPolynomial(unittest.TestCase): 22 | """Test the ``quadratic_polynomial_solve`` function of 23 | ``probls.gaussian_process`` with a few hand-computed polynomials.""" 24 | 25 | def setUp(self): 26 | self.solve = gaussian_process.quadratic_polynomial_solve 27 | 28 | def runTest(self): 29 | self.assertListEqual(self.solve(1., 0., 0., -3.5), []) 30 | self.assertListEqual(self.solve(1., 0., 0., 0.), []) 31 | self.assertListEqual(self.solve(2., -4., 0., 0.), [2.]) 32 | self.assertListEqual(self.solve(1., 3., -2., -4.), [-1.]) 33 | self.assertListEqual(self.solve(2., 0.5, 4., -8.0), []) 34 | 35 | 36 | class TestKernelFunctions(unittest.TestCase): 37 | 38 | def setUp(self): 39 | self.gp = gaussian_process.ProbLSGaussianProcess() 40 | 41 | def runTest(self): 42 | 43 | # Test kernel function with hand-computed values 44 | self.assertEqual(self.gp.k(3.5, 1.), 11.**3/3. 
+ 0.5*2.5*11.**2) 45 | self.assertEqual(self.gp.k(2., 3.), 12.**3/3.+.5*12.**2) 46 | self.assertEqual(self.gp.dkd(1., 2.0), 11.) 47 | self.assertEqual(self.gp.dkd(-2., -1.), 8.) 48 | 49 | # Test if one-to-one computations give the same result as one-to-many 50 | # computations 51 | t, T = np.random.rand(), np.random.rand(10) 52 | for fun in [self.gp.k, self.gp.kd, self.gp.dkd, self.gp.d2k, self.gp.d2kd, self.gp.d3k]: 53 | res = fun(t, T) 54 | for i, tt in enumerate(T): 55 | self.assertEqual(fun(t, tt), res[i]) 56 | 57 | 58 | class TestNoiseFree(unittest.TestCase): 59 | """Test whether posterior mean equals observations in the noise-free case.""" 60 | 61 | def setUp(self): 62 | self.gp = gaussian_process.ProbLSGaussianProcess() 63 | 64 | def runTest(self): 65 | ts, fs, dfs = np.random.randn(10), np.random.randn(10), np.random.randn(10) 66 | for i in range(10): 67 | self.gp.add(ts[i], fs[i], dfs[i]) 68 | self.gp.update() 69 | for i in range(10): 70 | t, f, df = ts[i], fs[i], dfs[i] 71 | self.assertLess(self.gp.V(t), 1e-9) 72 | self.assertAlmostEqual(self.gp.mu(t), f, places=3) 73 | self.assertAlmostEqual(self.gp.dmu(t), df, places=3) 74 | 75 | 76 | if __name__ == "__main__": 77 | unittest.main() -------------------------------------------------------------------------------- /test/demo_interface_sgd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Demo script for the tensorflow SGD interface. Uses the interface to perform 4 | SGD on an MNIST CNN by repeatedly calling ``opt_interface.adv_eval(lr)`` and 5 | ``accept()``. 6 | 7 | Created on Fri Nov 25 16:36:07 2016 8 | 9 | @author: Lukas Balles [lballes@tuebingen.mpg.de] 10 | """ 11 | 12 | import os 13 | import sys 14 | sys.path.insert(0, os.path.abspath('..')) 15 | 16 | import tensorflow as tf 17 | from tensorflow.examples.tutorials.mnist import input_data 18 | import numpy as np 19 | 20 | from probls.tensorflow_interface import interface_sgd 21 | 22 | 23 | def weight_variable(shape): 24 | initial = tf.truncated_normal(shape, stddev=1e-2) 25 | return tf.Variable(initial) 26 | 27 | def bias_variable(shape): 28 | initial = tf.constant(0.05, shape=shape) 29 | return tf.Variable(initial) 30 | 31 | def conv2d(x, W): 32 | return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME') 33 | 34 | def max_pool_2x2(x): 35 | return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], 36 | strides=[1, 2, 2, 1], padding='SAME') 37 | 38 | # Set up model 39 | tf.reset_default_graph() 40 | X = tf.placeholder(tf.float32, shape=[None, 784]) 41 | y = tf.placeholder(tf.float32, shape=[None, 10]) 42 | W_conv1 = weight_variable([5, 5, 1, 32]) 43 | b_conv1 = bias_variable([32]) 44 | X_image = tf.reshape(X, [-1,28,28,1]) 45 | h_conv1 = tf.nn.relu(conv2d(X_image, W_conv1) + b_conv1) 46 | h_pool1 = max_pool_2x2(h_conv1) 47 | W_conv2 = weight_variable([5, 5, 32, 64]) 48 | b_conv2 = bias_variable([64]) 49 | h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2) 50 | h_pool2 = max_pool_2x2(h_conv2) 51 | W_fc1 = weight_variable([7 * 7 * 64, 1024]) 52 | b_fc1 = bias_variable([1024]) 53 | h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64]) 54 | h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1) 55 | W_fc2 = weight_variable([1024, 10]) 56 | b_fc2 = bias_variable([10]) 57 | h_fc2 = tf.nn.softmax(tf.matmul(h_fc1, W_fc2) + b_fc2) 58 | losses = -tf.reduce_sum(y*tf.log(h_fc2), reduction_indices=[1]) 59 | var_list = [W_conv1, b_conv1, W_conv2, b_conv2, W_fc1, b_fc1, W_fc2, b_fc2] 60 | 61 | # Initialize interface 62 
| opt_interface = interface_sgd.ProbLSOptimizerSGDInterface() 63 | opt_interface.minimize(losses, var_list) 64 | 65 | # Create session and initialize variables 66 | sess = tf.Session() 67 | sess.run(tf.initialize_all_variables()) 68 | opt_interface.register_session(sess) 69 | 70 | # Get data ready 71 | mnist = input_data.read_data_sets("MNIST_data/", one_hot=True) 72 | m = 128 73 | 74 | # Call prepare 75 | batch = mnist.train.next_batch(m) 76 | Xb, yb = batch[0], batch[1] 77 | opt_interface.prepare({X: Xb, y: yb}) 78 | 79 | # Run SGD steps 80 | lr = 0.1 81 | for i in range(1000): 82 | print opt_interface.adv_eval(lr, {X: Xb, y: yb}) 83 | print opt_interface.accept() -------------------------------------------------------------------------------- /test/test_gradient_moment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Tests for gradient moment computation in 4 | probls.tensorflow_interface.gradient_moment 5 | 6 | Created on Wed Nov 23 17:09:34 2016 7 | 8 | @author: Lukas Balles [lballes@tuebingen.mpg.de] 9 | """ 10 | 11 | import os 12 | import sys 13 | sys.path.insert(0, os.path.abspath('..')) 14 | 15 | import numpy as np 16 | import unittest 17 | import tensorflow as tf 18 | from tensorflow.examples.tutorials.mnist import input_data 19 | from probls.tensorflow_interface import gradient_moment as gm 20 | 21 | def weight_variable(shape): 22 | initial = tf.truncated_normal(shape, stddev=1e-2) 23 | return tf.Variable(initial) 24 | 25 | def bias_variable(shape): 26 | initial = tf.constant(0.05, shape=shape) 27 | return tf.Variable(initial) 28 | 29 | def conv2d(x, W): 30 | return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME') 31 | 32 | def max_pool_2x2(x): 33 | return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], 34 | strides=[1, 2, 2, 1], padding='SAME') 35 | 36 | class TestGradientMomentFullyConnected(unittest.TestCase): 37 | """Test.""" 38 | 39 | def setUp(self): 40 | # Set up model 41 | tf.reset_default_graph() 42 | X = tf.placeholder(tf.float32, shape=[None, 784]) 43 | y = tf.placeholder(tf.float32, shape=[None, 10]) 44 | W_fc1 = weight_variable([784, 1024]) 45 | b_fc1 = bias_variable([1024]) 46 | h_fc1 = tf.nn.relu(tf.matmul(X, W_fc1) + b_fc1) 47 | W_fc2 = weight_variable([1024, 10]) 48 | b_fc2 = bias_variable([10]) 49 | h_fc2 = tf.nn.softmax(tf.matmul(h_fc1, W_fc2) + b_fc2) 50 | losses = -tf.reduce_sum(y*tf.log(h_fc2), reduction_indices=[1]) 51 | 52 | self.loss = tf.reduce_mean(losses) 53 | self.batch_size = tf.cast(tf.gather(tf.shape(losses), 0), tf.float32) 54 | self.var_list = [W_fc1, b_fc1, W_fc2, b_fc2] 55 | self.X = X 56 | self.y = y 57 | 58 | self.sess = tf.Session() 59 | self.sess.run(tf.initialize_all_variables()) 60 | 61 | self.mnist = input_data.read_data_sets("MNIST_data/", one_hot=True) 62 | 63 | def runTest(self): 64 | grads, grad_moms = gm.grads_and_grad_moms(self.loss, self.batch_size, 65 | self.var_list) 66 | # Check shapes 67 | for v, g, mom in zip(self.var_list, grads, grad_moms): 68 | self.assertEqual(v.get_shape(), g.get_shape()) 69 | self.assertEqual(v.get_shape(), mom.get_shape()) 70 | 71 | # Check against manual computation of moment 72 | m = 10 73 | batch = self.mnist.train.next_batch(m) 74 | Xb, yb = batch[0], batch[1] 75 | indiv_grads = [] 76 | for i in range(m): 77 | gs = self.sess.run(grads, feed_dict={self.X: Xb[[i],:], self.y: yb[[i],:]}) 78 | indiv_grads.append(gs) 79 | indiv_grads_arr = [np.stack([indiv_grads[i][j] for i in range(m)], axis=0) for j in 
range(len(self.var_list))] 80 | grads_manual = [np.mean(gs_var, axis=0) for gs_var in indiv_grads_arr] 81 | grad_moms_manual = [np.mean(gs_var**2, axis=0) for gs_var in indiv_grads_arr] 82 | grads_impl, grad_moms_impl = self.sess.run([grads, grad_moms], feed_dict={self.X: Xb, self.y: yb}) 83 | for grm, gri in zip(grads_manual, grads_impl): 84 | self.assertTrue(np.allclose(grm, gri, rtol=1e-4)) 85 | for gmm, gmi in zip(grad_moms_manual, grad_moms_impl): 86 | self.assertTrue(np.allclose(gmm, gmi, rtol=1e-4)) 87 | 88 | if __name__ == "__main__": 89 | unittest.main() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Probabilistic Line Search 2 | 3 | This is a Python implementation of a _Probabilistic Line Searches for Stochastic 4 | Optimization_ ([NIPS paper][1], [extended version][3]) plus a TensorFlow interface that allows you to use the line 5 | search to train your TensorFlow model. **Please note: this is a development version with multiple experimental changes compared to the original paper!** 6 | 7 | ## The Algorithm in a Nutshell 8 | The probabilistic line search is an algorithm for the optimization of a 9 | stochastic objective function F. Being at point x and having fixed a search 10 | direction d, it maintains a Gaussian process model for the one-dimensional 11 | function f(t) = F(x + td). This function and its derivative are evaluated at 12 | (possibly multiple) step sizes t, updating the GP after each observation. This 13 | is repeated until a _probabilistic belief_ over a quality criterion of the step 14 | size, implied by the GP, exceeds a certain threshold. 15 | 16 | ## Installation 17 | 18 | No installation is required, just clone this git repositiory to your machine. 19 | 20 | Requirements: 21 | - tensorflow (0.12.0 is known to work) 22 | - numpy (1.11.2 is known to work) 23 | - scipy (0.13.3 is known to work) 24 | - Some of the demo scripts require additional packages, like sys, os, matplotlib 25 | et cetera. 26 | 27 | ## Usage 28 | 29 | The built-in TensorFlow optimizers are used roughly like this 30 | 31 | ```python 32 | var_list = ... 33 | losses = ... # A vector of losses, one for each example in the batch 34 | 35 | loss = tf.mean(losses) 36 | opt = tf.train.GradientDescentOptimizer(learning_rate) 37 | sgd_step = opt.minimize(loss) 38 | sess = tf.Session() 39 | sess.run(tf.initialize_all_variables()) 40 | 41 | for i in range(num_steps): 42 | ... 43 | sess.run(sgd_step, feed_dict_if_applicable) 44 | ``` 45 | 46 | Usage is slightly different for the probabilistic line search optimizer, but its only five additional lines of code: 47 | 48 | ```python 49 | from probls.tensorflow_interface.interface_sgd import ProbLSOptimizerSGDInterface 50 | from probls.line_search import ProbLSOptimizer 51 | 52 | var_list = ... 53 | losses = ... # A vector of losses, one for each example in the batch 54 | 55 | opt_interface = ProbLSOptimizerSGDInterface() 56 | opt_interface.minimize(losses, var_list) # Note that we pass losses, not an aggregate mean loss 57 | sess = tf.Session() 58 | sess.run(tf.initialize_all_variables()) 59 | opt_interface.register_session(sess) 60 | opt_ls = ProbLSOptimizer(opt_interface) 61 | opt_ls.prepare(feed_dict_if_applicable) 62 | 63 | for i in range(num_steps): 64 | ... 65 | opt_ls.proceed(feed_dict_if_applicable) 66 | ``` 67 | 68 | The effects of these individual commands will become clear in the next section. 
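For a concrete run with a ``feed_dict``, ``examples/run_probls_mnist.py`` boils down to roughly the following sketch (it assumes, as the bundled models in ``examples/models`` do, that ``set_up_model()`` returns the per-example losses, the input placeholders and the variable list):

```python
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

from probls.tensorflow_interface.interface_sgd import ProbLSOptimizerSGDInterface
from probls.line_search import ProbLSOptimizer
from models import mnist_mlp as model

mnist = input_data.read_data_sets('data/mnist', one_hot=True)
num_steps, batch_size = 4000, 256

# Per-example losses, input placeholders and trainable variables
losses, [X, y], var_list = model.set_up_model()

opt_interface = ProbLSOptimizerSGDInterface()
opt_interface.minimize(losses, var_list)   # vector of losses, not the mean loss
sess = tf.Session()
opt_interface.register_session(sess)
sess.run(tf.global_variables_initializer())

opt_ls = ProbLSOptimizer(opt_interface)    # default design parameters, no learning rate
batch = mnist.train.next_batch(batch_size)
opt_ls.prepare({X: batch[0], y: batch[1]})

for i in range(num_steps):
    batch = mnist.train.next_batch(batch_size)
    opt_ls.proceed({X: batch[0], y: batch[1]})
```
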
69 | See the ``examples/`` folder for working demo scripts. 70 | 71 | 72 | ## Quick Guide to this Implementation 73 | 74 | This implementation consists of two major components: 75 | - A line search class (``ProbLSOptimizer``). It performs the line search, i.e. it gathers observations, updates the GP model, decides where to evaluate next, et cetera. The ``ProbLSOptimizer`` takes as argument a ``func`` object that is the "interface" to the objective function. It assumes that this interface has certain methods for evaluating at new points or accepting the current one; see below. 76 | - The TensorFlow interface ``ProbLSOptimizerSGDInterface``. This can be used as the ``func`` argument for a ``ProbLSOptimizer`` and provides the necessary interface to use the line search to train your TensorFlow model. 77 | 78 | ### Line Search 79 | 80 | The ``ProbLSOptimizer`` class is implemented in ``probls.line_search``. It 81 | excepts a ``func`` argument which acts as the interface to the objective function. 82 | It is assumend that ``func`` has three methods: 83 | - ``f, df, fvar, dfvar = func.adv_eval(dt, *args)`` to proceed along the current search 84 | direction by an increment ``dt``, returning function value ``f``, projected gradient ``df`` 85 | and variance estimates for both (``fvar, dfvar``). 86 | - ``f, df, fvar, dfvar = func.accept()`` to accept the current step size, 87 | returning function value, projected gradients and an estimate of the variance 88 | of these two quantities (``df`` and ``dfvar`` with respect to the new search direction). 89 | - ``f, df, fvar, dfvar = func.prepare(*args)`` to prepare the interface returning an 90 | initial observation. 91 | 92 | ``*args`` are additional positional arguments, e.g.. an optional feed_dict in the case the TensorFlow interface; see below. 93 | The line search algorithm "communicates" with the objective function exclusively via these three methods. 94 | 95 | Other than ``func``, ``ProbLSOptimizer`` has no required arguments, most notably, no learning rate! 96 | The remaining arguments are design parameters of the line search algorithm. See the docstring of ``ProbLSOptimizer`` a description of these parameters. 97 | 98 | ``opt_ls`` has two methods that are of interest for the end-user. 99 | - ``opt_ls.prepare(*pass_to_func_args)`` has to be called once to initialize the line search. 100 | - ``opt_ls.proceed(*pass_to_func_args)`` proceeds one step in the line search (i.e. one 101 | function evaluation). We call this method for however many steps we want to train the model. This is where 102 | the actual line search happens, so check out its code (and that of the subroutines it calls) to get an idea of what is going on! 103 | 104 | The Gaussian process functionality needed in the line search is outsourced to 105 | ``probls.gaussian_process``. It implements one-dimensional Gaussian process regression with an integrated 106 | Wiener process kernel that uses observations of both the function value and the 107 | derivative. For details, see the docstring of the ``ProbLSGaussianProcess`` class. 108 | 109 | ### TensorFlow Interface 110 | 111 | The TensorFlow interface ``ProbLSOptimizerSGDInterface`` is implemented in ``probls.tensorflow_interface.interface_sgd``. 112 | It inherits from ``tf.train.Optimizer`` and implements the necessary functionality to serve as the ``func`` argument of the ``ProbLSOptimizer``, providing the 113 | desired interface to the objective function defined by your TensorFlow model. 
114 | Its ``minimize(losses, var_list)`` method adds to sets of operations to the TensorFlow graph: 115 | - ``adv_eval_op`` 116 | Advance along the current search direction, compute the loss, 117 | the gradients and variances of both. Gradient and its variance are stored 118 | in slot variables. Return the loss ``f``, projected gradient ``df``, 119 | variance of the loss fvar, and variance of the projected gradient dfvar 120 | - ``accept_op``: 121 | Accept the current point. Set its gradient as the new search direction. 122 | Returns f, df fvar and dfvar, where df and dfvar are now with respect to this new search direction. 123 | 124 | In order for the ``ProbLSOptimizerSGDInterface`` object to work as a self-contained 125 | interface that can perform function/gradient evaluations, you have to pass it a 126 | TensorFlow session via its ``register_session(sess)`` method. After that, the interface is 127 | ready to go and provides the three aforementioned methods ``adv_eval(dt, optional_feed_dict)``, ``accept()`` and ``prepare(optional_feed_dict)``. 128 | 129 | A crucial part of the line search are within-batch estimates of the variance of the function 130 | value and the gradient, see equations (17) and (18) in the [paper][1]. The variance 131 | of the objective is easily computed given the individual loss values for the examples 132 | in the batch. That is why we pass the vector of ``losses``, instead of a mean ``loss``. 133 | Computing the gradient variance is a little tricky; a detailed explanation can be found in this [note][2]. 134 | For the implementation, see ``probls.tensorflow_interface.gradient_moment``. 135 | 136 | [1]: https://arxiv.org/abs/1502.02846 137 | [2]: https://drive.google.com/open?id=0B0adgqwcMJK5aDNaQ2Q4ZmhCQzA 138 | [3]: https://arxiv.org/abs/1703.10034 139 | -------------------------------------------------------------------------------- /probls/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Utility functions for probabilistic line search algorithm. 4 | """ 5 | 6 | import numpy as np 7 | from scipy.special import erf 8 | 9 | def bounded_bivariate_normal_integral(rho, xl, xu, yl, yu): 10 | """Computes the bounded bivariate normal integral. 11 | 12 | Computes the probability that ``xu >= X >= xl and yu >= Y >= yl`` where X 13 | and Y are jointly Gaussian random variables, with mean ``[0., 0.]`` and 14 | covariance matrix ``[[1., rho], [rho, 1.]]``. 15 | 16 | Inputs: 17 | :rho: Correlation coefficient of the bivariate normal random variable 18 | :xl, yl: Lower bounds of the integral 19 | :xu, yu: Upper bounds of the integral 20 | 21 | Ported from a Matlab implementation by Alan Genz which, in turn, is based on 22 | the method described by 23 | Drezner, Z and G.O. Wesolowsky, (1989), 24 | On the computation of the bivariate normal inegral, 25 | Journal of Statist. Comput. Simul. 35, pp. 101-107, 26 | 27 | Copyright statement of Alan Genz's version: 28 | *************** 29 | Copyright (C) 2013, Alan Genz, All rights reserved. 30 | 31 | Redistribution and use in source and binary forms, with or without 32 | modification, are permitted provided the following conditions are met: 33 | - Redistributions of source code must retain the above copyright 34 | notice, this list of conditions and the following disclaimer. 
35 | - Redistributions in binary form must reproduce the above copyright 36 | notice, this list of conditions and the following disclaimer in 37 | the documentation and/or other materials provided with the 38 | distribution. 39 | - The contributor name(s) may not be used to endorse or promote 40 | products derived from this software without specific prior 41 | written permission. 42 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 43 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 44 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 45 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 46 | COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 47 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 48 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 49 | OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 50 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 51 | TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF USE 52 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.""" 53 | 54 | bvnu = unbounded_bivariate_normal_integral 55 | p = bvnu(rho, xl, yl) - bvnu(rho, xu, yl) \ 56 | - bvnu(rho, xl, yu) + bvnu(rho, xu, yu) 57 | return max(0., min(p, 1.)) 58 | 59 | def unbounded_bivariate_normal_integral(rho, xl, yl): 60 | """Computes the unbounded bivariate normal integral. 61 | 62 | Computes the probability that ``X>=xl and Y>=yl`` where X and Y are jointly 63 | Gaussian random variables, with mean ``[0., 0.]`` and covariance matrix 64 | ``[[1., rho], [rho, 1.]]``. 65 | 66 | Note: to compute the probability that ``X < xl and Y < yl``, use 67 | ``unbounded_bivariate_normal_integral(rho, -xl, -yl)``. 68 | 69 | Inputs: 70 | :rho: Correlation coefficient of the bivariate normal random variable 71 | :xl, yl: Lower bounds of the integral 72 | 73 | Ported from a Matlab implementation by Alan Genz which, in turn, is based on 74 | the method described by 75 | Drezner, Z and G.O. Wesolowsky, (1989), 76 | On the computation of the bivariate normal inegral, 77 | Journal of Statist. Comput. Simul. 35, pp. 101-107, 78 | 79 | Copyright statement of Alan Genz's version: 80 | *************** 81 | Copyright (C) 2013, Alan Genz, All rights reserved. 82 | 83 | Redistribution and use in source and binary forms, with or without 84 | modification, are permitted provided the following conditions are met: 85 | - Redistributions of source code must retain the above copyright 86 | notice, this list of conditions and the following disclaimer. 87 | - Redistributions in binary form must reproduce the above copyright 88 | notice, this list of conditions and the following disclaimer in 89 | the documentation and/or other materials provided with the 90 | distribution. 91 | - The contributor name(s) may not be used to endorse or promote 92 | products derived from this software without specific prior 93 | written permission. 94 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 95 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 96 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 97 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 98 | COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 99 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 100 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 101 | OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 102 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 103 | TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF USE 104 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.""" 105 | 106 | rho = max(-1., min(1., rho)) 107 | 108 | if np.isposinf(xl) or np.isposinf(yl): 109 | return 0. 110 | elif np.isneginf(xl): 111 | return 1. if np.isneginf(yl) else _cdf(-yl) 112 | elif np.isneginf(yl): 113 | return _cdf(-xl) 114 | elif rho == 0: 115 | return _cdf(-xl)*_cdf(-yl) 116 | 117 | tp = 2.*np.pi 118 | h, k = xl, yl 119 | hk = h*k 120 | bvn = 0. 121 | 122 | if np.abs(rho) < 0.3: 123 | # Gauss Legendre points and weights, n = 6 124 | w = np.array([0.1713244923791705, 0.3607615730481384, 0.4679139345726904]) 125 | x = np.array([0.9324695142031522, 0.6612093864662647, 0.2386191860831970]) 126 | elif np.abs(rho) < 0.75: 127 | # Gauss Legendre points and weights, n = 12 128 | w = np.array([0.04717533638651177, 0.1069393259953183, 0.1600783285433464, 129 | 0.2031674267230659, 0.2334925365383547, 0.2491470458134029]) 130 | x = np.array([0.9815606342467191, 0.9041172563704750, 0.7699026741943050, 131 | 0.5873179542866171, 0.3678314989981802, 0.1252334085114692]) 132 | else: 133 | # Gauss Legendre points and weights, n = 20 134 | w = np.array([.01761400713915212, .04060142980038694, .06267204833410906, 135 | .08327674157670475, 0.1019301198172404, 0.1181945319615184, 136 | 0.1316886384491766, 0.1420961093183821, 0.1491729864726037, 137 | 0.1527533871307259]) 138 | x = np.array([0.9931285991850949, 0.9639719272779138, 0.9122344282513259, 139 | 0.8391169718222188, 0.7463319064601508, 0.6360536807265150, 140 | 0.5108670019508271, 0.3737060887154196, 0.2277858511416451, 141 | 0.07652652113349733]) 142 | 143 | w = np.tile(w, 2) 144 | x = np.concatenate([1.-x, 1.+x]) 145 | 146 | if np.abs(rho) < 0.925: 147 | hs = .5 * (h*h + k*k) 148 | asr = .5*np.arcsin(rho) 149 | sn = np.sin(asr*x) 150 | bvn = np.dot(w, np.exp((sn*hk-hs)/(1.-sn**2))) 151 | bvn = bvn*asr/tp + _cdf(-h)*_cdf(-k) 152 | else: 153 | if rho < 0.: 154 | k = -k 155 | hk = -hk 156 | if np.abs(rho) < 1.: 157 | ass = 1.-rho**2 158 | a = np.sqrt(ass) 159 | bs = (h-k)**2 160 | asr = -.5*(bs/ass + hk) 161 | c = (4.-hk)/8. 162 | d = (12.-hk)/80. 163 | if asr > -100.: 164 | bvn = a*np.exp(asr)*(1.-c*(bs-ass)*(1.-d*bs)/3. + c*d*ass**2) 165 | if hk > -100.: 166 | b = np.sqrt(bs) 167 | sp = np.sqrt(tp)*_cdf(-b/a) 168 | bvn = bvn - np.exp(-.5*hk)*sp*b*(1. - c*bs*(1.-d*bs)/3.) 169 | a = .5*a 170 | xs = (a*x)**2 171 | asr = -.5*(bs/xs + hk) 172 | inds = [i for i, asr_elt in enumerate(asr) if asr_elt>-100.] 173 | xs = xs[inds] 174 | sp = 1. + c*xs*(1.+5.*d*xs) 175 | rs = np.sqrt(1.-xs) 176 | ep = np.exp(-.5*hk*xs / (1.+rs)**2)/rs 177 | bvn = (a*np.dot(np.exp(asr[inds])*(sp-ep), w[inds]) - bvn)/tp 178 | if rho > 0: 179 | bvn += _cdf(-max(h, k)) 180 | elif h >= k: 181 | bvn = -bvn 182 | else: 183 | if h < 0.: 184 | L = _cdf(k)-_cdf(h) 185 | else: 186 | L = _cdf(-h)-_cdf(-k) 187 | bvn = L - bvn 188 | 189 | return max(0., min(1., bvn)) 190 | 191 | def _cdf(z): 192 | """Cumulative density function (CDF) of the standard normal distribution.""" 193 | return .5 * (1. 
+ erf(z/np.sqrt(2.))) -------------------------------------------------------------------------------- /probls/tensorflow_interface/gradient_moment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Computation of *moments* of gradients through tensorflow operations. 4 | 5 | Tensorflow is typically used for empircal risk minimzation with gradient-based 6 | optimization methods. That is, we want to adjust trainable variables ``W``, 7 | such as to minimize an objective quantity, called ``LOSS``, of the form 8 | 9 | LOSS(W) = (1/n) * sum{i=1:n}[ loss(W, d_i) ] 10 | 11 | That is the mean of individual losses induced by ``n`` training data points 12 | ``d_i``. Consquently, the gradient of ``LOSS`` w.r.t. the variables ``W`` is 13 | the mean of individual gradients ``dloss(W, d_i)``. These individual gradients 14 | are not computed separately when we call ``tf.gradients`` on the aggregate 15 | ``LOSS``. Instead, they are implicitly aggregated by the operations in the 16 | backward graph. This batch processing is crucial for the computational 17 | efficiency of the gradient computation. 18 | 19 | This module provides functionality to compute the ``p``-th moment of the 20 | individual gradients, i.e. the quantity 21 | 22 | MOM(W) = (1/n) * sum{i=1:n}[ dloss(w, d_i)**p ] 23 | 24 | without giving up the efficiency of batch processing. For a more detailed 25 | explanation, see the note [1]. Applications of this are the computation of the 26 | gradient variance estimate in [2] and [3]. 27 | 28 | [1] https://drive.google.com/open?id=0B0adgqwcMJK5aDNaQ2Q4ZmhCQzA 29 | 30 | [2] M. Mahsereci and P. Hennig. Probabilistic line searches for stochastic 31 | optimization. In Advances in Neural Information Processing Systems 28, pages 32 | 181-189, 2015. 33 | 34 | [3] L. Balles, J. Romero and P. Hennig. Coupling Adaptive Batch Sizes with 35 | Learning Rates. In arXiv preprint arXiv:1612.05086, 2016. 36 | https://arxiv.org/abs/1612.05086. 37 | """ 38 | 39 | import tensorflow as tf 40 | from tensorflow.python.ops import gen_array_ops 41 | 42 | VALID_TYPES = ["MatMul", "Conv2D", "Add"] 43 | VALID_REGULARIZATION_TYPES = ["L2Loss"] 44 | 45 | def _check_and_sort_ops(op_list): 46 | """Sort a list of ops according to type into valid types for which we can 47 | compute the gradient moment) and regularizers. Raise an exception when 48 | encountering an op of invalid type.""" 49 | 50 | valid, regularizers = [], [] 51 | for op in op_list: 52 | if op.type in VALID_TYPES: 53 | valid.append(op) 54 | elif op.type in VALID_REGULARIZATION_TYPES: 55 | regularizers.append(op) 56 | else: 57 | raise Exception("A variable in var_list is consumed by an operation of " 58 | "type {} for which I don't how to compute the gradient moment. " 59 | "Allowed are types {} and regularization operations " 60 | "of type {}".format(op.type, str(VALID_TYPES), 61 | str(VALID_REGULARIZATION_TYPES))) 62 | return valid, regularizers 63 | 64 | def grads_and_grad_moms(loss, batch_size, var_list, mom=2): 65 | """Compute the gradients and gradient moments of ``loss`` w.r.t. to the 66 | variables in ``var_list`` 67 | 68 | Inputs: 69 | :loss: The tensor containing the scalar loss. The loss has to be the 70 | ``tf.mean`` of ``batch_size`` individual losses induced by 71 | individual training data points. 72 | :batch_size: Self-explanatory. Integer tensor. 73 | :var_list: The list of variables. 74 | :mom: The desired moment. Integer. Defaults to 2. 
75 | 76 | Returns: 77 | :v_grads: The gradients of ``loss`` w.r.t. the variables in ``var_list`` 78 | as computed by ``tf.gradients(loss, var_list)``. 79 | :grad_moms: The gradient moments for each variable in ``var_list``.""" 80 | 81 | assert len(set(var_list)) == len(var_list) 82 | vs = [tf.convert_to_tensor(v) for v in var_list] 83 | num_vars = len(vs) 84 | 85 | consumers = [] 86 | consumer_outs = [] 87 | for v in vs: 88 | valid, regularizers = _check_and_sort_ops(v.consumers()) 89 | if len(valid) > 1: 90 | raise Exception("Variable {} is consumed by more than one operation " 91 | "(ignoring regularization operations)".format(v.name)) 92 | if len(regularizers) > 1: 93 | raise Exception("Variable {} is consumed by more than one " 94 | "regularization operation".format(v.name)) 95 | consumers.extend(valid) 96 | consumer_outs.extend(valid[0].outputs) 97 | 98 | # Use tf.gradients to compute gradients w.r.t. the variables, while also 99 | # retrieving gradients w.r.t. the outputs 100 | all_grads = tf.gradients(loss, vs+consumer_outs) 101 | v_grads = all_grads[0:num_vars] 102 | out_grads = all_grads[num_vars::] 103 | 104 | # Compute the gradient moment for each (v, vp, op, output) 105 | with tf.name_scope("grad_moms"): 106 | grad_moms = [_GradMom(o, v, out_grad, batch_size, mom) 107 | for o, v, out_grad in zip(consumers, vs, out_grads)] 108 | 109 | return (v_grads, grad_moms) 110 | 111 | def _GradMom(op, v, out_grad, batch_size, mom=2): 112 | """Wrapper function for the operation type-specific GradMom functions below. 113 | 114 | Inputs: 115 | :op: A tensorflow operation of type in VALID_TYPES. 116 | :v: The read-tensor of the trainable variable consumed by this operation. 117 | :out_grad: The tensor containing the gradient w.r.t. to the output of 118 | the op (as computed by ``tf.gradients``). 119 | :batch_size: Batch size ``m`` (constant integer or scalar int tf.Tensor) 120 | :mom: Integer moment desired (defaults to 2).""" 121 | 122 | with tf.name_scope(op.name+"_grad_mom"): 123 | if op.type == "MatMul": 124 | return _MatMulGradMom(op, v, out_grad, batch_size, mom) 125 | elif op.type == "Conv2D": 126 | return _Conv2DGradMom(op, v, out_grad, batch_size, mom) 127 | elif op.type == "Add": 128 | return _AddGradMom(op, v, out_grad, batch_size, mom) 129 | else: 130 | raise ValueError("Don't know how to compute gradient moment for " 131 | "variable {}, consumed by operation of type {}".format(v.name, 132 | op.type)) 133 | 134 | def _MatMulGradMom(op, W, out_grad, batch_size, mom=2): 135 | """Computes gradient moment for a weight matrix through a MatMul operation. 136 | 137 | Assumes ``Z=tf.matmul(A, W)``, where ``W`` is a d1xd2 weight matrix, ``A`` 138 | are the nxd1 activations of the previous layer (n being the batch size). 139 | ``out_grad`` is the gradient w.r.t. ``Z``, as computed by ``tf.gradients()``. 140 | No transposes in the MatMul operation allowed. 141 | 142 | Inputs: 143 | :op: The MatMul operation 144 | :W: The weight matrix (the tensor, not the variable) 145 | :out_grad: The tensor of gradient w.r.t. 
to the output of the op 146 | :batch_size: Batch size n (constant integer or scalar int tf.Tensor) 147 | :mom: Integer moment desired (defaults to 2)""" 148 | 149 | assert op.type == "MatMul" 150 | t_a, t_b = op.get_attr("transpose_a"), op.get_attr("transpose_b") 151 | assert W is op.inputs[1] and not t_a and not t_b 152 | 153 | A = op.inputs[0] 154 | out_grad_pow = tf.pow(out_grad, mom) 155 | A_pow = tf.pow(A, mom) 156 | return tf.mul(batch_size, tf.matmul(A_pow, out_grad_pow, transpose_a=True)) 157 | 158 | def _Conv2DGradMom(op, f, out_grad, batch_size, mom=2): 159 | """Computes gradient moment for the filter of a Conv2D operation. 160 | 161 | Assumes ``Z=tf.nn.conv2d(A, f)``, where ``f`` is a ``[h_f, w_f, c_in, c_out]`` 162 | convolution filter and ``A`` are the ``[n, h_in, w_in, c_in]`` activations of 163 | the previous layer (``n`` being the batch size). ``out_grad`` is the gradient 164 | w.r.t. ``Z``, as computed by ``tf.gradients()``. 165 | 166 | Inputs: 167 | :op: The Conv2D operation 168 | :f: The filter (the tensor, not the variable) 169 | :out_grad: The tensor of gradient w.r.t. to the output of the op 170 | :batch_size: Batch size ``n`` (constant integer or scalar int tf.Tensor) 171 | :mom: Integer moment desired (defaults to 2)""" 172 | 173 | assert op.type == "Conv2D" 174 | assert f is op.inputs[1] 175 | 176 | strides = op.get_attr("strides") 177 | padding = op.get_attr("padding") 178 | use_cudnn = op.get_attr("use_cudnn_on_gpu") 179 | data_format = op.get_attr("data_format") 180 | 181 | inp = op.inputs[0] 182 | inp_pow = tf.pow(inp, mom) 183 | 184 | f_shape = tf.shape(f) 185 | out_grad_pow = tf.pow(out_grad, mom) 186 | 187 | raw_moment = tf.nn.conv2d_backprop_filter(inp_pow, f_shape, out_grad_pow, 188 | strides, padding, use_cudnn, data_format) 189 | return tf.mul(batch_size, raw_moment) 190 | 191 | def _AddGradMom(op, b, out_grad, batch_size, mom=2): 192 | """Computes gradient moment for a bias variable through an Add operation. 193 | 194 | Assumes ``Z = tf.add(Zz, b)``, where ``b`` is a bias parameter and ``Zz`` is 195 | a ``[n, ?]`` tensor (``n`` being the batch size). Broadcasting for all kinds 196 | of shapes of ``Zz`` (e.g. ``[n, d_in]`` or ``[n, h_in, w_in, c_in]`` are 197 | supported. ``out_grad`` is the gradient w.r.t. ``Z``, as computed by 198 | ``tf.gradients()``. 199 | 200 | Inputs: 201 | :op: The Add operation 202 | :b: The bias parameter (the tensor, not the variable) 203 | :out_grad: The tensor of gradient w.r.t. 
to the output of the op 204 | :batch_size: Batch size ``n`` (constant integer or scalar int tf.Tensor) 205 | :mom: Integer moment desired (defaults to 2)""" 206 | 207 | assert op.type == "Add" 208 | 209 | out_grad_pow = tf.pow(out_grad, mom) 210 | 211 | if b is op.inputs[0]: 212 | y = op.inputs[1] 213 | sx = tf.shape(b) 214 | sy = tf.shape(y) 215 | rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy) 216 | raw_mom = tf.reshape(tf.reduce_sum(out_grad_pow, rx), sx) 217 | elif b is op.inputs[1]: 218 | x = op.inputs[0] 219 | sx = tf.shape(x) 220 | sy = tf.shape(b) 221 | rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy) 222 | raw_mom = tf.reshape(tf.reduce_sum(out_grad_pow, ry), sy) 223 | return tf.mul(batch_size, raw_mom) 224 | -------------------------------------------------------------------------------- /probls/tensorflow_interface/interface_sgd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | TensorFlow optimizer that acts as an interface for the probabilistic line 4 | search algorithm. 5 | """ 6 | 7 | import tensorflow as tf 8 | import gradient_moment as gm 9 | 10 | class ProbLSOptimizerSGDInterface(tf.train.Optimizer): 11 | """Optimizer that implements gradient descent with and interface for the 12 | probabilistic line search algorithm. 13 | @@__init__ 14 | """ 15 | 16 | def __init__(self, momentum=None, use_locking=False, name="ProbLS"): 17 | """Construct a new probabilistic line search optimizer. 18 | 19 | Args: 20 | 21 | :momentum: None or scalar momentum parameter. 22 | :use_locking: If True use locks for update operations. 23 | :name: Optional name prefix for the operations created when applying 24 | gradients. Defaults to "ProbLS". 25 | """ 26 | super(ProbLSOptimizerSGDInterface, self).__init__(use_locking, name) 27 | 28 | assert momentum is None or (isinstance(momentum, float) and 0<=momentum<=1) 29 | self.momentum = momentum 30 | 31 | self._ops_ready = False 32 | self._prepared = False 33 | self.sess = None 34 | 35 | self.dt = None 36 | self.adv_eval_op = None 37 | self.accept_op = None 38 | 39 | def _create_slots(self, var_list): 40 | for v in var_list: 41 | self._zeros_slot(v, "grad", "grad") # Variables to memorize gradients 42 | self._zeros_slot(v, "dir", "dir") # Search direction 43 | self._zeros_slot(v, "gradvar", "gradvar") # Gradient variance 44 | 45 | def minimize(self, losses, var_list): 46 | """Add operations to perform SGD with probabilistic line search. This 47 | comprises two sets of operations: 48 | 49 | 1) adv_eval_op: 50 | Advance along the current search direction, compute the loss, 51 | the gradients and both variances. Gradient and its variance are stored 52 | in slot variables. Return the loss f, projected gradient df, 53 | variance of the loss fvar, and variance of the projected gradient dfvar 54 | 2) accept_op: 55 | Accept the current point. Set its gradient as the new search direction. 56 | Returns df and dfvar with respect to this new search direction. 57 | 58 | Inputs: 59 | :losses: A Tensor of shape (batch_size,) containing the *individual* 60 | loss for each example in the batch. Do *not* pass a scalar mean loss 61 | as for the built-in tensorflow optimizers. 
62 | :var_list: List of Variable objects to update to minimize loss.""" 63 | 64 | assert isinstance(losses, tf.Tensor) 65 | for var in var_list: assert isinstance(var, tf.Variable) 66 | assert len(var_list) >= 0 67 | assert len(var_list) == len(set(var_list)) # Check for duplicates 68 | 69 | input_dtype = losses.dtype.base_dtype 70 | 71 | # Create and retrieve slot variables 72 | self._create_slots(var_list) 73 | mem_grads = [self.get_slot(v, "grad") for v in var_list] 74 | dirs = [self.get_slot(v, "dir") for v in var_list] 75 | mem_gradvars = [self.get_slot(v, "gradvar") for v in var_list] 76 | mem_f = tf.Variable(0.0, input_dtype, name="mem_f") 77 | mem_fvar = tf.Variable(0.0, input_dtype, name="mem_fvar") 78 | 79 | with tf.name_scope("ProbLS"): 80 | 81 | ###### adv_eval_op ###################################################### 82 | # Extract the batch size, i.e. the length of the losses vector 83 | batch_size = tf.cast(tf.gather(tf.shape(losses), 0), input_dtype, 84 | name="batch_size") 85 | 86 | # Add a scalar placeholder dt and operations that advance t by dt, 87 | # i.e., update v += dt*d (v: variable, d: search direction) 88 | with tf.name_scope("advance_t"): 89 | self.dt = tf.placeholder(dtype=input_dtype, shape=[], name="delta_t") 90 | steps = [tf.mul(self.dt, tf.convert_to_tensor(d)) for d in dirs] 91 | advance_t_updates = [v.assign_add(s) for v, s in zip(var_list, steps)] 92 | 93 | # With a dependency on the advance_t update (making sure that a step is 94 | # taken first), add tensors that compute the loss f, the gradients and 95 | # the gradient moments 96 | with tf.control_dependencies(advance_t_updates): 97 | loss = tf.reduce_mean(losses, name="f") 98 | grads, moms = gm.grads_and_grad_moms(loss, batch_size, var_list) 99 | 100 | # Add variance of the loss 101 | ssl = tf.reduce_mean(tf.square(losses), name="sum_of_squared_losses") 102 | fvar = tf.div(ssl-tf.square(loss), batch_size-1., name="fvar") 103 | 104 | # Add projected gradient df (w.r.t. the current search direction) 105 | with tf.name_scope("df"): 106 | proj_grads = [tf.reduce_sum(tf.mul(g, d), name="proj_grad") 107 | for g, d in zip(grads, dirs)] 108 | df = tf.add_n(proj_grads, name="df") 109 | 110 | # Add gradient variances and the variance of df 111 | gradvars = [tf.div(mom-tf.square(g), batch_size-1.) 112 | for mom, g in zip(moms, grads)] 113 | dfvar = tf.add_n([tf.reduce_sum(gv*tf.square(d)) 114 | for gv, d in zip(gradvars, dirs)]) 115 | 116 | # Add operations to memorize stuff in variables. This is because they 117 | # are needed in the case that this points ends up being accepted (i.e., 118 | # if the accept op is called next). Stored quantities are 119 | # - gradients 120 | # - gradient moment 121 | # - f and fvar 122 | with tf.name_scope("memorize"): 123 | mem_updates = [v.assign(grad) for v, grad in zip(mem_grads, grads)] 124 | mem_updates.extend( 125 | [v.assign(gv) for v, gv in zip(mem_gradvars, gradvars)] 126 | ) 127 | mem_updates.append(mem_f.assign(loss)) 128 | mem_updates.append(mem_fvar.assign(fvar)) 129 | 130 | # With a dependency on the memorization, add the adv_eval_op. It is 131 | # simply the tuple (f, df, fvar, dfvar). All the dependencies make sure 132 | # that it also does the other stuff 133 | with tf.control_dependencies(mem_updates): 134 | self.adv_eval_op = tf.tuple([loss, df, fvar, dfvar], name="results") 135 | 136 | ###### accept_op ######################################################## 137 | # Operation that accepts the current state, i.e. 
138 | # - sets the current gradient as the new search direction 139 | # - returns a new df, computed w.r.t. to that new search direction 140 | with tf.name_scope("accept"): 141 | # Add operations the set the new search direction 142 | if self.momentum is None: 143 | new_dirs = [tf.neg(g) for g in mem_grads] 144 | else: 145 | mu = tf.convert_to_tensor(self.momentum, name="momentum_mu") 146 | new_dirs = [mu*d-g for d, g in zip(dirs, mem_grads)] 147 | dir_updates = [d.assign(d_new) for d, d_new in zip(dirs, new_dirs)] 148 | 149 | # With a dependency on the search direction updates, compute df and 150 | # dfvar w.r.t. the new search direction, using the memorized gradients 151 | # and gradient variances 152 | with tf.control_dependencies(dir_updates): 153 | proj_grads_new = [tf.reduce_sum(g*d) 154 | for g, d in zip(mem_grads, dirs)] 155 | df_new = tf.add_n(proj_grads_new, name="df_new") 156 | dfvar_new = tf.add_n([tf.reduce_sum(gv*tf.square(d)) 157 | for gv, d in zip(mem_gradvars, dirs)]) 158 | self.accept_op = tf.tuple([mem_f, df_new, mem_fvar, dfvar_new], 159 | name="results_after_accept") 160 | 161 | # Set internal flag that the operations are now ready 162 | self._ops_ready = True 163 | 164 | def register_session(self, sess): 165 | """Register the session ``sess`` with this line search interface. 166 | Computations resulting from calls to ``prepare``, ``adv_eval`` or 167 | ``accept`` will be executed in this session. 168 | 169 | Inputs: 170 | :sess: A TensorFlow Session.""" 171 | 172 | if not self._ops_ready: 173 | raise Warning("You have to call minimize first") 174 | assert isinstance(sess, tf.Session) 175 | self.sess = sess 176 | 177 | def prepare(self, feed_dict=None): 178 | """Make a first evaluation to properly initialize all gradients, et cetera. 179 | Call this function before using ``adv_eval`` or ``accept``.""" 180 | 181 | if self.sess is None: 182 | raise Warning("You have to register a session first.") 183 | 184 | if feed_dict is None: 185 | feed_dict = {} 186 | feed_dict[self.dt] = 0.0 187 | 188 | # We need to evaluate and accept once in order to compute initial 189 | # gradients and accept them as search direction. Only then can we 190 | # make the first "real" evaluation and return the results 191 | self.sess.run(self.adv_eval_op, feed_dict) 192 | self.sess.run(self.accept_op) 193 | self.sess.run(self.adv_eval_op, feed_dict) 194 | self._prepared = True 195 | return self.sess.run(self.accept_op) 196 | 197 | def adv_eval(self, dt, feed_dict=None): 198 | """Advance by an increment ``dt`` along the current search direction and 199 | evaluate. 200 | 201 | Inputs: 202 | :dt: Float step size increment. 203 | :feed_dict: Optional feed_dict. 204 | 205 | Returns: 206 | :f: Function value at the new point. 207 | :df: Gradient at the new point, projected onto the search direction. 208 | :fvar: Variance of f. 209 | :dfvar: Variance of df.""" 210 | 211 | if not self._prepared: 212 | raise Warning("You have to call prepare first") 213 | if feed_dict is None: 214 | feed_dict = {} 215 | feed_dict[self.dt] = dt 216 | return self.sess.run(self.adv_eval_op, feed_dict) 217 | 218 | def accept(self): 219 | if not self._prepared: 220 | raise Warning("You have to call prepare first") 221 | return self.sess.run(self.accept_op) -------------------------------------------------------------------------------- /examples/cifar10.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Routine for decoding the CIFAR-10 binary file format.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import sys 23 | import os 24 | import tarfile 25 | 26 | from six.moves import xrange, urllib # pylint: disable=redefined-builtin 27 | import tensorflow as tf 28 | 29 | # Process images of this size. Note that this differs from the original CIFAR 30 | # image size of 32 x 32. If one alters this number, then the entire model 31 | # architecture will change and any model would need to be retrained. 32 | IMAGE_SIZE = 24 33 | 34 | # Global constants describing the CIFAR-10 data set. 35 | NUM_CLASSES = 10 36 | NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000 37 | NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = 10000 38 | 39 | DATA_DIR = "data/cifar-10/cifar-10-batches-bin" 40 | DATA_URL = 'http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz' 41 | 42 | # Check if data is already there, if not download! 43 | dest_directory = "data/cifar-10" 44 | if not os.path.exists(dest_directory): 45 | os.makedirs(dest_directory) 46 | filename = DATA_URL.split('/')[-1] 47 | filepath = os.path.join(dest_directory, filename) 48 | if not os.path.exists(filepath): 49 | def _progress(count, block_size, total_size): 50 | sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename, 51 | float(count * block_size) / float(total_size) * 100.0)) 52 | sys.stdout.flush() 53 | filepath, _ = urllib.request.urlretrieve(DATA_URL, filepath, _progress) 54 | print() 55 | statinfo = os.stat(filepath) 56 | print('Successfully downloaded', filename, statinfo.st_size, 'bytes.') 57 | tarfile.open(filepath, 'r:gz').extractall(dest_directory) 58 | 59 | def read_cifar10(filename_queue): 60 | """Reads and parses examples from CIFAR10 data files. 61 | Recommendation: if you want N-way read parallelism, call this function 62 | N times. This will give you N independent Readers reading different 63 | files & positions within those files, which will give better mixing of 64 | examples. 65 | Args: 66 | filename_queue: A queue of strings with the filenames to read from. 67 | Returns: 68 | An object representing a single example, with the following fields: 69 | height: number of rows in the result (32) 70 | width: number of columns in the result (32) 71 | depth: number of color channels in the result (3) 72 | key: a scalar string Tensor describing the filename & record number 73 | for this example. 74 | label: an int32 Tensor with the label in the range 0..9. 75 | uint8image: a [height, width, depth] uint8 Tensor with the image data 76 | """ 77 | 78 | class CIFAR10Record(object): 79 | pass 80 | result = CIFAR10Record() 81 | 82 | # Dimensions of the images in the CIFAR-10 dataset. 83 | # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the 84 | # input format. 
85 | label_bytes = 1 # 2 for CIFAR-100 86 | result.height = 32 87 | result.width = 32 88 | result.depth = 3 89 | image_bytes = result.height * result.width * result.depth 90 | # Every record consists of a label followed by the image, with a 91 | # fixed number of bytes for each. 92 | record_bytes = label_bytes + image_bytes 93 | 94 | # Read a record, getting filenames from the filename_queue. No 95 | # header or footer in the CIFAR-10 format, so we leave header_bytes 96 | # and footer_bytes at their default of 0. 97 | reader = tf.FixedLengthRecordReader(record_bytes=record_bytes) 98 | result.key, value = reader.read(filename_queue) 99 | 100 | # Convert from a string to a vector of uint8 that is record_bytes long. 101 | record_bytes = tf.decode_raw(value, tf.uint8) 102 | 103 | # The first bytes represent the label, which we convert from uint8->int32. 104 | result.label = tf.cast( 105 | tf.slice(record_bytes, [0], [label_bytes]), tf.int32) 106 | 107 | # The remaining bytes after the label represent the image, which we reshape 108 | # from [depth * height * width] to [depth, height, width]. 109 | depth_major = tf.reshape(tf.slice(record_bytes, [label_bytes], [image_bytes]), 110 | [result.depth, result.height, result.width]) 111 | # Convert from [depth, height, width] to [height, width, depth]. 112 | result.uint8image = tf.transpose(depth_major, [1, 2, 0]) 113 | 114 | return result 115 | 116 | 117 | def _generate_image_and_label_batch(image, label, min_queue_examples, 118 | batch_size, shuffle): 119 | """Construct a queued batch of images and labels. 120 | Args: 121 | image: 3-D Tensor of [height, width, 3] of type.float32. 122 | label: 1-D Tensor of type.int32 123 | min_queue_examples: int32, minimum number of samples to retain 124 | in the queue that provides of batches of examples. 125 | batch_size: Number of images per batch. 126 | shuffle: boolean indicating whether to use a shuffling queue. 127 | Returns: 128 | images: Images. 4D tensor of [batch_size, height, width, 3] size. 129 | labels: Labels. 1D tensor of [batch_size] size. 130 | """ 131 | # Create a queue that shuffles the examples, and then 132 | # read 'batch_size' images + labels from the example queue. 133 | num_preprocess_threads = 16 134 | if shuffle: 135 | images, label_batch = tf.train.shuffle_batch( 136 | [image, label], 137 | batch_size=batch_size, 138 | num_threads=num_preprocess_threads, 139 | capacity=min_queue_examples + 3 * batch_size, 140 | min_after_dequeue=min_queue_examples) 141 | else: 142 | images, label_batch = tf.train.batch( 143 | [image, label], 144 | batch_size=batch_size, 145 | num_threads=num_preprocess_threads, 146 | capacity=min_queue_examples + 3 * batch_size) 147 | 148 | return images, tf.reshape(label_batch, [batch_size]) 149 | 150 | 151 | def distorted_inputs(data_dir=DATA_DIR, batch_size=128): 152 | """Construct distorted input for CIFAR training using the Reader ops. 153 | Args: 154 | data_dir: Path to the CIFAR-10 data directory. 155 | batch_size: Number of images per batch. 156 | Returns: 157 | images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size. 158 | labels: Labels. 1D tensor of [batch_size] size. 159 | """ 160 | filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i) 161 | for i in xrange(1, 6)] 162 | for f in filenames: 163 | if not tf.gfile.Exists(f): 164 | raise ValueError('Failed to find file: ' + f) 165 | 166 | # Create a queue that produces the filenames to read. 
167 | filename_queue = tf.train.string_input_producer(filenames) 168 | 169 | # Read examples from files in the filename queue. 170 | read_input = read_cifar10(filename_queue) 171 | reshaped_image = tf.cast(read_input.uint8image, tf.float32) 172 | 173 | height = IMAGE_SIZE 174 | width = IMAGE_SIZE 175 | 176 | # Image processing for training the network. Note the many random 177 | # distortions applied to the image. 178 | 179 | # Randomly crop a [height, width] section of the image. 180 | distorted_image = tf.random_crop(reshaped_image, [height, width, 3]) 181 | 182 | # Randomly flip the image horizontally. 183 | distorted_image = tf.image.random_flip_left_right(distorted_image) 184 | 185 | # Because these operations are not commutative, consider randomizing 186 | # the order their operation. 187 | distorted_image = tf.image.random_brightness(distorted_image, 188 | max_delta=63) 189 | distorted_image = tf.image.random_contrast(distorted_image, 190 | lower=0.2, upper=1.8) 191 | 192 | # Subtract off the mean and divide by the variance of the pixels. 193 | float_image = tf.image.per_image_standardization(distorted_image) 194 | 195 | # Ensure that the random shuffling has good mixing properties. 196 | min_fraction_of_examples_in_queue = 0.4 197 | min_queue_examples = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN * 198 | min_fraction_of_examples_in_queue) 199 | print ('Filling queue with %d CIFAR images before starting to train. ' 200 | 'This will take a few minutes.' % min_queue_examples) 201 | 202 | # Generate a batch of images and labels by building up a queue of examples. 203 | return _generate_image_and_label_batch(float_image, read_input.label, 204 | min_queue_examples, batch_size, 205 | shuffle=True) 206 | 207 | 208 | def inputs(eval_data, data_dir=DATA_DIR, batch_size=128): 209 | """Construct input for CIFAR evaluation using the Reader ops. 210 | Args: 211 | eval_data: bool, indicating if one should use the train or eval data set. 212 | data_dir: Path to the CIFAR-10 data directory. 213 | batch_size: Number of images per batch. 214 | Returns: 215 | images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size. 216 | labels: Labels. 1D tensor of [batch_size] size. 217 | """ 218 | if not eval_data: 219 | filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i) 220 | for i in xrange(1, 6)] 221 | num_examples_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN 222 | else: 223 | filenames = [os.path.join(data_dir, 'test_batch.bin')] 224 | num_examples_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_EVAL 225 | 226 | for f in filenames: 227 | if not tf.gfile.Exists(f): 228 | raise ValueError('Failed to find file: ' + f) 229 | 230 | # Create a queue that produces the filenames to read. 231 | filename_queue = tf.train.string_input_producer(filenames) 232 | 233 | # Read examples from files in the filename queue. 234 | read_input = read_cifar10(filename_queue) 235 | reshaped_image = tf.cast(read_input.uint8image, tf.float32) 236 | 237 | height = IMAGE_SIZE 238 | width = IMAGE_SIZE 239 | 240 | # Image processing for evaluation. 241 | # Crop the central [height, width] of the image. 242 | resized_image = tf.image.resize_image_with_crop_or_pad(reshaped_image, 243 | width, height) 244 | 245 | # Subtract off the mean and divide by the variance of the pixels. 246 | float_image = tf.image.per_image_standardization(resized_image) 247 | 248 | # Ensure that the random shuffling has good mixing properties. 
249 | min_fraction_of_examples_in_queue = 0.4 250 | min_queue_examples = int(num_examples_per_epoch * 251 | min_fraction_of_examples_in_queue) 252 | 253 | # Generate a batch of images and labels by building up a queue of examples. 254 | return _generate_image_and_label_batch(float_image, read_input.label, 255 | min_queue_examples, batch_size, 256 | shuffle=False) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2016 Max Planck Society. All rights reserved. 2 | 3 | Apache License 4 | Version 2.0, January 2004 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, 12 | and distribution as defined by Sections 1 through 9 of this document. 13 | 14 | "Licensor" shall mean the copyright owner or entity authorized by 15 | the copyright owner that is granting the License. 16 | 17 | "Legal Entity" shall mean the union of the acting entity and all 18 | other entities that control, are controlled by, or are under common 19 | control with that entity. For the purposes of this definition, 20 | "control" means (i) the power, direct or indirect, to cause the 21 | direction or management of such entity, whether by contract or 22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 23 | outstanding shares, or (iii) beneficial ownership of such entity. 24 | 25 | "You" (or "Your") shall mean an individual or Legal Entity 26 | exercising permissions granted by this License. 27 | 28 | "Source" form shall mean the preferred form for making modifications, 29 | including but not limited to software source code, documentation 30 | source, and configuration files. 31 | 32 | "Object" form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but 34 | not limited to compiled object code, generated documentation, 35 | and conversions to other media types. 36 | 37 | "Work" shall mean the work of authorship, whether in Source or 38 | Object form, made available under the License, as indicated by a 39 | copyright notice that is included in or attached to the work 40 | (an example is provided in the Appendix below). 41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. For the purposes 46 | of this License, Derivative Works shall not include works that remain 47 | separable from, or merely link (or bind by name) to the interfaces of, 48 | the Work and Derivative Works thereof. 49 | 50 | "Contribution" shall mean any work of authorship, including 51 | the original version of the Work and any modifications or additions 52 | to that Work or Derivative Works thereof, that is intentionally 53 | submitted to Licensor for inclusion in the Work by the copyright owner 54 | or by an individual or Legal Entity authorized to submit on behalf of 55 | the copyright owner. 
For the purposes of this definition, "submitted" 56 | means any form of electronic, verbal, or written communication sent 57 | to the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control systems, 59 | and issue tracking systems that are managed by, or on behalf of, the 60 | Licensor for the purpose of discussing and improving the Work, but 61 | excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as "Not a Contribution." 63 | 64 | "Contributor" shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | 2. Grant of Copyright License. Subject to the terms and conditions of 69 | this License, each Contributor hereby grants to You a perpetual, 70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 71 | copyright license to reproduce, prepare Derivative Works of, 72 | publicly display, publicly perform, sublicense, and distribute the 73 | Work and such Derivative Works in Source or Object form. 74 | 75 | 3. Grant of Patent License. Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. If You 84 | institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work 86 | or a Contribution incorporated within the Work constitutes direct 87 | or contributory patent infringement, then any patent licenses 88 | granted to You under this License for that Work shall terminate 89 | as of the date such litigation is filed. 90 | 91 | 4. Redistribution. 
You may reproduce and distribute copies of the 92 | Work or Derivative Works thereof in any medium, with or without 93 | modifications, and in Source or Object form, provided that You 94 | meet the following conditions: 95 | 96 | (a) You must give any other recipients of the Work or 97 | Derivative Works a copy of this License; and 98 | 99 | (b) You must cause any modified files to carry prominent notices 100 | stating that You changed the files; and 101 | 102 | (c) You must retain, in the Source form of any Derivative Works 103 | that You distribute, all copyright, patent, trademark, and 104 | attribution notices from the Source form of the Work, 105 | excluding those notices that do not pertain to any part of 106 | the Derivative Works; and 107 | 108 | (d) If the Work includes a "NOTICE" text file as part of its 109 | distribution, then any Derivative Works that You distribute must 110 | include a readable copy of the attribution notices contained 111 | within such NOTICE file, excluding those notices that do not 112 | pertain to any part of the Derivative Works, in at least one 113 | of the following places: within a NOTICE text file distributed 114 | as part of the Derivative Works; within the Source form or 115 | documentation, if provided along with the Derivative Works; or, 116 | within a display generated by the Derivative Works, if and 117 | wherever such third-party notices normally appear. The contents 118 | of the NOTICE file are for informational purposes only and 119 | do not modify the License. You may add Your own attribution 120 | notices within Derivative Works that You distribute, alongside 121 | or as an addendum to the NOTICE text from the Work, provided 122 | that such additional attribution notices cannot be construed 123 | as modifying the License. 124 | 125 | You may add Your own copyright statement to Your modifications and 126 | may provide additional or different license terms and conditions 127 | for use, reproduction, or distribution of Your modifications, or 128 | for any such Derivative Works as a whole, provided Your use, 129 | reproduction, and distribution of the Work otherwise complies with 130 | the conditions stated in this License. 131 | 132 | 5. Submission of Contributions. Unless You explicitly state otherwise, 133 | any Contribution intentionally submitted for inclusion in the Work 134 | by You to the Licensor shall be under the terms and conditions of 135 | this License, without any additional terms or conditions. 136 | Notwithstanding the above, nothing herein shall supersede or modify 137 | the terms of any separate license agreement you may have executed 138 | with Licensor regarding such Contributions. 139 | 140 | 6. Trademarks. This License does not grant permission to use the trade 141 | names, trademarks, service marks, or product names of the Licensor, 142 | except as required for reasonable and customary use in describing the 143 | origin of the Work and reproducing the content of the NOTICE file. 144 | 145 | 7. Disclaimer of Warranty. Unless required by applicable law or 146 | agreed to in writing, Licensor provides the Work (and each 147 | Contributor provides its Contributions) on an "AS IS" BASIS, 148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 149 | implied, including, without limitation, any warranties or conditions 150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 151 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 152 | appropriateness of using or redistributing the Work and assume any 153 | risks associated with Your exercise of permissions under this License. 154 | 155 | 8. Limitation of Liability. In no event and under no legal theory, 156 | whether in tort (including negligence), contract, or otherwise, 157 | unless required by applicable law (such as deliberate and grossly 158 | negligent acts) or agreed to in writing, shall any Contributor be 159 | liable to You for damages, including any direct, indirect, special, 160 | incidental, or consequential damages of any character arising as a 161 | result of this License or out of the use or inability to use the 162 | Work (including but not limited to damages for loss of goodwill, 163 | work stoppage, computer failure or malfunction, or any and all 164 | other commercial damages or losses), even if such Contributor 165 | has been advised of the possibility of such damages. 166 | 167 | 9. Accepting Warranty or Additional Liability. While redistributing 168 | the Work or Derivative Works thereof, You may choose to offer, 169 | and charge a fee for, acceptance of support, warranty, indemnity, 170 | or other liability obligations and/or rights consistent with this 171 | License. However, in accepting such obligations, You may act only 172 | on Your own behalf and on Your sole responsibility, not on behalf 173 | of any other Contributor, and only if You agree to indemnify, 174 | defend, and hold each Contributor harmless for any liability 175 | incurred by, or claims asserted against, such Contributor by reason 176 | of your accepting any such warranty or additional liability. 177 | 178 | END OF TERMS AND CONDITIONS 179 | 180 | APPENDIX: How to apply the Apache License to your work. 181 | 182 | To apply the Apache License to your work, attach the following 183 | boilerplate notice, with the fields enclosed by brackets "[]" 184 | replaced with your own identifying information. (Don't include 185 | the brackets!) The text should be enclosed in the appropriate 186 | comment syntax for the file format. We also recommend that a 187 | file or class name and description of purpose be included on the 188 | same "printed page" as the copyright notice for easier 189 | identification within third-party archives. 190 | 191 | Copyright 2015, Max Planck Society. 192 | 193 | Licensed under the Apache License, Version 2.0 (the "License"); 194 | you may not use this file except in compliance with the License. 195 | You may obtain a copy of the License at 196 | 197 | http://www.apache.org/licenses/LICENSE-2.0 198 | 199 | Unless required by applicable law or agreed to in writing, software 200 | distributed under the License is distributed on an "AS IS" BASIS, 201 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 202 | See the License for the specific language governing permissions and 203 | limitations under the License. 204 | -------------------------------------------------------------------------------- /probls/line_search.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Implementation of Probabilistic Line Search for Stochastic Optimization [1]. 4 | 5 | [1] M. Mahsereci and P. Hennig. Probabilistic line searches for stochastic 6 | optimization. In Advances in Neural Information Processing Systems 28, pages 7 | 181-189, 2015. 
8 | """ 9 | 10 | import numpy as np 11 | import gaussian_process 12 | import utils 13 | 14 | class ProbLSOptimizer(object): 15 | """Probabilistic line search optimizer. 16 | 17 | @@__init__ 18 | """ 19 | 20 | def __init__(self, func, c1=0.05, cW=0.3, fpush=1.0, alpha0=0.01, 21 | target_df=0.5, df_lo=-0.1, df_hi=1.1, max_steps=10, max_expl=6, 22 | max_dmu0=0.0, max_change_factor=10.0, expl_policy="linear"): 23 | """Create a new probabilistic line search object. 24 | 25 | Inputs: 26 | :func: Interface to the objective function. We assume that it has three 27 | methods. 28 | - ``f, df, fvar, dfvar = func.adv_eval(dt)`` to proceed along the 29 | current search direction by an increment ``dt``, returning 30 | function value, projected gradient and variance estimates for 31 | both. 32 | - ``f, df, fvar, dfvar = func.accept()`` to accept the current 33 | step size, returning function value, projected gradients and an 34 | estimate of the variance of these two quantities. 35 | - ``f, df, fvar, dfvar = func.prepare()`` to prepare the interface 36 | returning an initial observation of function value and gradient, 37 | as well as the variances. 38 | If the function interface takes additional arguments (e.g. a feed 39 | dict with a batch of data in tensorflow), those are passed as 40 | positional arguments ``*pass_to_func_args``. 41 | :c1: Scalar parameters for the first Wolfe conditions. Default to 0.05. 42 | :cW: Acceptance threshold for the Wolfe probability. Defaults to 0.3. 43 | :fpush: Push factor that is multiplied with the accepted step size to get 44 | the base step size for the next line search. Defaults to 1.0. 45 | :alpha0: Initial step size. Defaults to 0.01. 46 | :target_df: The target value for the relative projected gradient 47 | df(t)/abs(df(0)). Defaults to 0.5. 48 | :df_lo, df_hi: Lower and higher threshold for the relative projected 49 | gradient df(t)/abs(df(0)). Default to -0.1 and 1.1. 50 | :max_steps: Maximum number of steps (function evaluations) per line 51 | search. Defaults to 10. 52 | :max_epl: Maximum number of exploration steps per line search. Defaults 53 | to 6. 54 | :max_dmu0: If the posterior derivative at t=0 exceeds ``max_dmu0``, the 55 | current line search is aborted as a safeguard against bad search 56 | directions. Defaults to 0.0. 57 | :max_change_factor: The algorithm usually takes the accepted alpha of the 58 | current line search as the base ``alpha0`` of the next one (after 59 | multiplying with ``fpush``). However, if a line search accepts an 60 | alpha that is more than ``max_change_factor`` times smaller or larger 61 | than the current ``alpha0``, we instead set the next ``alpha0`` to a 62 | running average of the accepted alphas (``alpha_stats``). Defaults to 63 | 10.0. 64 | :expl_policy: String indicating the policy used for exploring points *to 65 | the right* in the line search. If ``k`` is the number of exploration 66 | steps already made, then the ``"linear"`` exploration policy chooses 67 | ``2*(k+1)*alpha0`` as the next exploration candidate. The 68 | ``"exponential"`` policy chooses ``2**(k+1)*alpha0``. 
Defaults to 69 | ``"linear"``.""" 70 | 71 | # Make sure the function_interface is valid and store it 72 | assert hasattr(func, "adv_eval") 73 | assert hasattr(func, "accept") 74 | assert hasattr(func, "prepare") 75 | self.func = func 76 | 77 | # Store the line search parameters 78 | self.c1 = c1 79 | self.cW = cW 80 | self.fpush = fpush 81 | self.target_df = target_df 82 | self.df_lo = df_lo 83 | self.df_hi = df_hi 84 | self.max_steps = max_steps 85 | self.max_expl = max_expl 86 | self.max_dmu0 = max_dmu0 87 | self.max_change_factor = max_change_factor 88 | assert expl_policy in ["linear", "exponential"] 89 | self.expl_policy = expl_policy 90 | 91 | # Initialize base step size with given value. 92 | self.alpha0 = alpha0 93 | 94 | # alpha_stats will contain a running average of accepted step sizes 95 | self.alpha_stats = alpha0 96 | 97 | # Raw function values at the origin of the line search 98 | self.f0 = None 99 | self.df0 = None 100 | 101 | # Counting steps in the current line search and, separately, steps that 102 | # explore "to the right" 103 | self.num_steps = 0 104 | self.num_expl = 0 105 | 106 | # Initialize GP object 107 | self.gp = gaussian_process.ProbLSGaussianProcess() 108 | 109 | # Switch to assert that the prepare method will be called first 110 | self.prepare_called = False 111 | 112 | # Internal abort status 113 | self.abort_status = 0 114 | 115 | def scale_obs(self, f_raw, df_raw, fvar_raw, dfvar_raw): 116 | """Scale an observation of function value and gradient. See section 3.4 of 117 | [1] for details.""" 118 | 119 | f = (f_raw-self.f0)/(self.df0*self.alpha0) 120 | df = df_raw/(self.df0) 121 | fvar = fvar_raw/((self.alpha0*self.df0)**2) 122 | dfvar = dfvar_raw/(self.df0**2) 123 | return f, df, fvar, dfvar 124 | 125 | def rescale_t(self, t): 126 | """Rescale a step size used internally by multiplying with the base step 127 | size.""" 128 | 129 | return t*self.alpha0 130 | 131 | def rescale_obs(self, f, df, fvar, dfvar): 132 | """Rescale an observation to real-world scale.""" 133 | 134 | f_raw = f*self.df0*self.alpha0 + self.f0 135 | df_raw = df*self.df0 136 | fvar_raw = fvar*(self.alpha0*self.df0)**2 137 | dfvar_raw = dfvar*self.df0**2 138 | return f_raw, df_raw, fvar_raw, dfvar_raw 139 | 140 | def prepare(self, *pass_to_func_args): 141 | """Preparation. 142 | 143 | *pass_to_func_args are arguments that are passed to the function interface, 144 | e.g. a feed dict.""" 145 | 146 | # Call the prepare op of the function interface, reset the observation 147 | # lists, the sigmas, and f0 and df0 148 | f_raw, df_raw, fvar_raw, dfvar_raw = self.func.prepare(*pass_to_func_args) 149 | self.f0 = f_raw 150 | self.df0 = np.abs(df_raw) 151 | 152 | # Add the first observation to the gp 153 | f, df, fvar, dfvar = self.scale_obs(f_raw, df_raw, fvar_raw, dfvar_raw) 154 | self.gp.add(0.0, f, df, fvar, dfvar) 155 | 156 | # Set flag that the prepare method has been called 157 | self.prepare_called = True 158 | 159 | def accept(self): 160 | """Accept the most recent step size.""" 161 | 162 | assert self.abort_status != 1 163 | assert self.num_steps >= 1 164 | 165 | # Rescale to the "real-world" step size alpha 166 | alpha = self.rescale_t(self.gp.ts[-1]) 167 | 168 | # If this accept was not due to an abort and the step size did not change 169 | # *too much*, we use the accepted alpha as the new base step size alpha0 170 | # (and update a running average alpha_stats). Otherwise, we use said 171 | # running average as the new base step size. 
172 | f = self.max_change_factor 173 | if self.abort_status == 0 and self.alpha0/f < alpha < self.alpha0*f: 174 | self.alpha_stats = 0.95*self.alpha_stats + 0.05*alpha 175 | self.alpha0 = self.fpush*alpha 176 | else: 177 | self.alpha0 = self.alpha_stats 178 | 179 | # Reset abort status and counters 180 | self.abort_status = 0 181 | self.num_steps = 0 182 | self.num_expl = 0 183 | 184 | # Run accept op, reset f0 and df0 185 | f_raw, df_raw, fvar_raw, dfvar_raw = self.func.accept() 186 | self.f0 = f_raw 187 | self.df0 = np.abs(df_raw) 188 | 189 | # Reset the gp and add the first observation to the gp 190 | self.gp.reset() 191 | f, df, fvar, dfvar = self.scale_obs(f_raw, df_raw, fvar_raw, dfvar_raw) 192 | self.gp.add(0.0, f, df, fvar, dfvar) 193 | 194 | def evaluate(self, t, *pass_to_func_args): 195 | """Evaluate at step size ``t``. 196 | 197 | *pass_to_func_args are arguments that are passed to the function interface, 198 | e.g. a feed dict.""" 199 | 200 | assert self.prepare_called 201 | 202 | self.num_steps += 1 203 | 204 | # Call the adv_eval method of the function interface with the increment 205 | # re-scaled to the "real-world" step size 206 | dt = t-self.gp.ts[-1] 207 | dalpha = self.rescale_t(dt) 208 | f_raw, df_raw, fvar_raw, dfvar_raw = self.func.adv_eval(dalpha, 209 | *pass_to_func_args) 210 | 211 | # Safeguard against inf or nan encounters. Trigerring abort. 212 | if np.isnan(f_raw) or np.isinf(f_raw) or np.isnan(df_raw) or np.isinf(df_raw): 213 | f_raw = 100.0 214 | df_raw = 10.0 215 | self.abort_status = 1 216 | 217 | # Scale the observations, add it to the GP and update the GP 218 | # We are currently using the variance estimates from t=0 for all 219 | # observations, but this might change in the future 220 | f, df, fvar, dfvar = self.scale_obs(f_raw, df_raw, fvar_raw, dfvar_raw) 221 | fvar = self.gp.fvars[0] 222 | dfvar = self.gp.dfvars[0] 223 | self.gp.add(t, f, df, fvar, dfvar) 224 | self.gp.update() 225 | 226 | def find_next_t(self): 227 | """Find the step size for the next evaluation.""" 228 | 229 | assert self.num_steps >= 1 230 | 231 | # Generate candidates: the points where the derivative of the posterior 232 | # mean equals the target value plus one exploration point to the right. 
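# Note: all candidate ts below live in the rescaled coordinate in which the
# base step size alpha0 equals 1; they are converted back to a real-world
# step size via rescale_t() only when evaluate() calls the function interface.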
233 | candidates = self.gp.find_dmu_equal(self.target_df) 234 | if self.expl_policy == "linear": 235 | candidates.append(2.*(self.num_expl+1)) 236 | elif self.expl_policy == "exponential": 237 | candidates.append(2.**(self.num_expl+1)) 238 | else: 239 | raise Exception("Unknown exploration policy") 240 | print "\t * Computing utilities for candidates %s", candidates 241 | 242 | # Compute p_Wolfe for candidates 243 | pws = [self.compute_p_wolfe(t) for t in candidates] 244 | print "\t * p_Wolfe:", pws 245 | ind_best = np.argmax(pws) 246 | 247 | # Memorize when we have chosen the exploration point 248 | if ind_best == len(candidates) - 1: 249 | self.num_expl += 1 250 | 251 | # Return the candidate t with maximal utility 252 | print "\t * Best candidate is", candidates[ind_best], "(was candidate", ind_best, "/", len(candidates)-1, ")" 253 | return candidates[ind_best] 254 | 255 | def find_abort_t(self): 256 | """Find the step size to use for an abort.""" 257 | 258 | return 0.01 259 | # We are currently simply aborting with a very small step, but we might do 260 | # something like this: 261 | # ts = self.gp.ts 262 | # pws = [self.compute_p_wolfe(t) for t in ts] 263 | # if max(pws) > 0.5*self.cW: 264 | # t = ts[np.argmax(pws)] 265 | # else: 266 | # t = 0.0 267 | # offset = 0.01 268 | # 269 | # return t + offset 270 | 271 | def compute_p_wolfe(self, t): 272 | # Already changed dCov and Covd here 273 | """Computes the probability that step size ``t`` satisfies the adjusted 274 | Wolfe conditions under the current GP model.""" 275 | 276 | # Compute mean and covariance matrix of the two Wolfe quantities a and b 277 | # (equations (11) to (13) in [1]). 278 | mu0 = self.gp.mu(0.) 279 | dmu0 = self.gp.dmu(0.) 280 | mu = self.gp.mu(t) 281 | dmu = self.gp.dmu(t) 282 | V0 = self.gp.V(0.) 283 | Vd0 = self.gp.Vd(0.) 284 | dVd0 = self.gp.dVd(0.) 285 | dCov0t = self.gp.dCov_0(t) 286 | Covd0t = self.gp.Covd_0(t) 287 | 288 | ma = mu0 - mu + self.c1*t*dmu0 289 | Vaa = V0 + dVd0*(self.c1*t)**2 + self.gp.V(t) \ 290 | + 2.*self.c1*t*(Vd0 - dCov0t) - 2.*self.gp.Cov_0(t) 291 | mb = dmu 292 | Vbb = self.gp.dVd(t) 293 | 294 | # Very small variances can cause numerical problems. Safeguard against 295 | # this with a deterministic evaluation of the Wolfe conditions. 296 | if Vaa < 1e-9 or Vbb < 1e-9: 297 | return 1. if ma>=0. and mb>=0. else 0. 298 | 299 | Vab = Covd0t + self.c1*t*self.gp.dCovd_0(t) - self.gp.Vd(t) 300 | 301 | # Compute correlation factor and integration bounds for adjusted p_Wolfe 302 | # and return the result of the bivariate normal integral. 303 | rho = Vab/np.sqrt(Vaa*Vbb) 304 | al = -ma/np.sqrt(Vaa) 305 | bl = (self.df_lo - mb)/np.sqrt(Vbb) 306 | bu = (self.df_hi - mb)/np.sqrt(Vbb) 307 | return utils.bounded_bivariate_normal_integral(rho, al, np.inf, bl, bu) 308 | 309 | def check_for_acceptance(self): 310 | """Checks whether the most recent point should be accepted.""" 311 | 312 | # Return False when no evaluations t>0 have been made yet 313 | if self.num_steps == 0: 314 | return False 315 | 316 | # If an abort has been triggered, return True 317 | if self.abort_status == 2: 318 | return True 319 | 320 | # Check Wolfe probability 321 | pW = self.compute_p_wolfe(self.gp.ts[-1]) 322 | if pW >= self.cW: 323 | return True 324 | else: 325 | return False 326 | 327 | def proceed(self, *pass_to_func_args): 328 | """Make one step (function evaluation) in the line search. 329 | 330 | *pass_to_func_args are arguments that are passed to the function interface, 331 | e.g. 
a feed dict.""" 332 | 333 | assert self.prepare_called 334 | 335 | # Check for acceptance and accept the previous point as the case may be 336 | if self.check_for_acceptance(): 337 | print "-> ACCEPT" 338 | print "\t * alpha = ", self.rescale_t(self.gp.ts[-1]), "[alpha0 was", self.alpha0, "]" 339 | self.accept() 340 | print "\t * f = ", self.f0 341 | 342 | # In the first call to proceed in a new line search, evaluate at t=1. 343 | if self.num_steps == 0: 344 | print "************************************" 345 | print "NEW LINE SEARCH [alpha0 is", self.alpha0, "]" 346 | print "-> First step, evaluating at t = 1.0" 347 | self.evaluate(1., *pass_to_func_args) 348 | 349 | # Abort with a very small, safe step size if 350 | # - Abort triggered in another method, e.g. evaluate() encountered inf or 351 | # nan. (self.abort_status==1) 352 | # - the maximum number of steps per line search is exceeded 353 | # - the maximum number of exploration steps is exceeded 354 | # - the posterior derivative at t=0. is too large (bad search direction) 355 | elif (self.abort_status == 1 356 | or self.num_steps >= self.max_steps 357 | or self.num_expl >= self.max_expl 358 | or self.gp.dmu(0.) >= self.max_dmu0): 359 | t_new = self.find_abort_t() 360 | print "-> Aborting with t = ", t_new 361 | self.evaluate(t_new, *pass_to_func_args) 362 | self.abort_status = 2 363 | 364 | # This is an "ordinary" evaluation. Find the best candidate for the next 365 | # evaluation and evaluate there. 366 | else: 367 | print "-> Ordinary step", self.num_steps, ", searching for new t" 368 | t_new = self.find_next_t() 369 | print "\t * Evaluating at t =", t_new 370 | self.evaluate(t_new, *pass_to_func_args) 371 | 372 | # Return the real-world function value 373 | f, _, _, _ = self.rescale_obs(self.gp.fs[-1], self.gp.dfs[-1], 374 | self.gp.fvars[-1], self.gp.dfvars[-1]) 375 | return f 376 | 377 | def proceed_constant_step(self, alpha, *pass_to_func_args): 378 | """Make one step (function evaluation) in the line search. 379 | 380 | *pass_to_func_args are arguments that are passed to the function interface, 381 | e.g. a feed dict.""" 382 | 383 | assert self.prepare_called 384 | 385 | if self.num_steps >= 1: 386 | self.accept() 387 | 388 | print "************************************" 389 | print "CONSTANT STEP with alpha =", alpha, "[alpha0 is", self.alpha0, "]" 390 | t = alpha/self.alpha0 391 | print "-> Evaluating at t =", t 392 | self.evaluate(t, *pass_to_func_args) 393 | 394 | f, _, _, _ = self.rescale_obs(self.gp.fs[-1], self.gp.dfs[-1], self.gp.fvars[-1], self.gp.dfvars[-1]) 395 | return f 396 | 397 | # ToDo: Commenting 398 | def visualize_ei_pw(self, ax): 399 | """Visualize the current state of the line search: expected improvement 400 | and p_Wolfe. 401 | 402 | ``ax`` is a matplotlib axis.""" 403 | 404 | a, b = min(self.gp.ts), max(self.gp.ts) 405 | lo = a - .05*(b-a) 406 | up = b + (b-a) 407 | tt = np.linspace(lo, up, num=1000) 408 | ei = [self.gp.expected_improvement(t) for t in tt] 409 | pw = [self.compute_p_wolfe(t) for t in tt] 410 | prod = [e*p for e, p in zip(ei, pw)] 411 | ax.hold(True) 412 | ax.plot(tt, ei, label="EI") 413 | ax.plot(tt, pw, label="pW") 414 | ax.plot(tt, prod, label="EI*pW") 415 | ax.plot([lo, up], [self.cW, self.cW], color="grey") 416 | ax.text(lo, self.cW, "Acceptance threshold", fontsize=8) 417 | ax.set_xlim(lo, up) 418 | ax.legend(fontsize=10) 419 | 420 | ## LEGACY VERSION OF p_Wolfe ################################################# 421 | # Changed dCov and Covd here already!
422 | # def compute_p_wolfe_original(self, t): 423 | # """Computes the probability that step size ``t`` satisfies the Wolfe 424 | # conditions under the current GP model.""" 425 | # 426 | # # Compute mean and covariance matrix of the two Wolfe quantities a and b 427 | # # (equations (11) to (13) in [1]). 428 | # mu0 = self.gp.mu(0.) 429 | # dmu0 = self.gp.dmu(0.) 430 | # mu = self.gp.mu(t) 431 | # dmu = self.gp.dmu(t) 432 | # V0 = self.gp.V(0.) 433 | # Vd0 = self.gp.Vd(0.) 434 | # dVd0 = self.gp.dVd(0.) 435 | # ma = mu0 - mu + self.c1*t*dmu0 436 | # Vaa = V0 + dVd0*(self.c1*t)**2 + self.gp.V(t) \ 437 | # + 2.*self.c1*t*(Vd0 - self.gp.dCov_0(t)) - 2.*self.gp.Cov_0(t) 438 | # mb = dmu - self.c2*dmu0 439 | # Vbb = dVd0*self.c2**2 - 2.*self.c2*self.gp.dCovd_0(t) + self.gp.dVd(t) 440 | # 441 | # # Very small variances can cause numerical problems. Safeguard against 442 | # # this with a deterministic evaluation of the Wolfe conditions. 443 | # if Vaa < 1e-9 or Vbb < 1e-9: 444 | # return 1. if ma>=0. and mb>=0. else 0. 445 | # 446 | # Vab = -self.c2*(Vd0 + self.c1*t*dVd0) + self.c2*self.gp.dCov_0(t) \ 447 | # + self.gp.Covd_0(t) + self.c1*t*self.gp.dCovd_0(t) - self.gp.Vd(t) 448 | # 449 | # # Compute rho and integration bounds for p_Wolfe and return the result of 450 | # # the bivariate normal integral. Upper limit for b is used when strong 451 | # # Wolfe conditions are requested (cf. equations (14) to (16)in [1]). 452 | # rho = Vab/np.sqrt(Vaa*Vbb) 453 | # al = -ma/np.sqrt(Vaa) 454 | # bl = -mb/np.sqrt(Vbb) 455 | # if self.strong_wolfe: 456 | # bbar = 2.*self.c2*(np.abs(dmu0) + 2.*np.sqrt(dVd0)) 457 | # bu = (bbar - mb)/np.sqrt(Vbb) 458 | # return utils.bounded_bivariate_normal_integral(rho, al, np.inf, bl, bu) 459 | # else: 460 | # return utils.unbounded_bivariate_normal_integral(rho, al, bl) 461 | ############################################################################### -------------------------------------------------------------------------------- /probls/gaussian_process.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Implements the Gaussian process functionality needed for the probabilistic 4 | line search algorithm. 5 | """ 6 | 7 | import numpy as np 8 | from scipy import linalg 9 | from utils import erf 10 | 11 | class ProbLSGaussianProcess(object): 12 | """Gaussian process implementation for probabilistic line searches [1]. 13 | Implements 1D GP regression with observations of function value and 14 | derivative. Kernel is a once-integrated Wiener process with theta=1.0. 15 | 16 | Public interface: 17 | - ``gp = ProbLSGaussianProcess()`` 18 | - ``gp.add(t, f, df, sigma2_f, sigma2_df)`` to add a new observation. 19 | - ``gp.reset()`` to remove all observations. 20 | - ``gp.update()`` to set up and invert the Gram matrix and make the GP 21 | ready for inference (i.e. the following methods). 22 | - ``gp.mu(t)`` returns the posterior mean at ``t``. 23 | - ``gp.V(t)`` returns the posterior variance ``t``. 24 | - ``gp.expected_improvement(t)`` returns the expected improvement at 25 | ``t `` 26 | - ``gp.cubic_polynomial_coefficients(t)`` to get the coefficients of the 27 | cubic polynomial ``gp.mu()`` at ``t`` (the posterior mean is 28 | piece-wise cubic). 29 | - ``gp.find_cubic_minima()`` to get the minima (if existent) of the cubic 30 | polynomials in each "cell", i.e. between (sorted) observation at t_i 31 | and t_i+1. 
32 | - ``gp.find_dmu_equal(val)``, like ``find_cubic_minima()``, but for 33 | points where the derivative of the posterior mean equals ``val`` (and 34 | the second derivative is positive). 35 | 36 | [1] M. Mahsereci and P. Hennig. Probabilistic line searches for stochastic 37 | optimization. In Advances in Neural Information Processing Systems 28, pages 38 | 181-189, 2015""" 39 | 40 | def __init__(self, theta=1.0, offset=10.0): 41 | """Create a new GP object.""" 42 | 43 | # Hyperparamters of the GP 44 | self.theta = theta 45 | self.offset = offset 46 | 47 | # Observation counter and arrays to store observations 48 | self.N = 0 49 | self.ts = [] 50 | self.fs = [] 51 | self.dfs = [] 52 | self.fvars = [] 53 | self.dfvars = [] 54 | 55 | # Kernel matrices 56 | self.K = None 57 | self.Kd = None 58 | self.dKd = None 59 | 60 | # Gram matrix and pre-computed "weights" of the GP 61 | self.G = None 62 | self.w = None 63 | 64 | # Switch that remembers whether we are ready for inference (calls to mu, 65 | # V, etc...). It is set to False when the GP is manipulated (points added, 66 | # noise level adjusted, reset). After such manipulations, gp.update() has 67 | # to be called. Remember current best observation of exp. improvement 68 | self.ready = False 69 | self.min_obs = None 70 | 71 | def reset(self): 72 | """Reset the GP, removing all previous observations. 73 | 74 | Automatically adds the observation at t=0 (with f=0 and df=-1).""" 75 | 76 | self.N = 0 77 | self.ts = [] 78 | self.fs = [] 79 | self.dfs = [] 80 | self.fvars = [] 81 | self.dfvars = [] 82 | self.K = None 83 | self.Kd = None 84 | self.dKd = None 85 | self.G = None 86 | self.LU = None 87 | self.LU_piv = None 88 | self.w = None 89 | 90 | self.min_obs = None 91 | self.ready = False 92 | 93 | def add(self, t, f, df, fvar=0.0, dfvar=0.0): 94 | """Add a new observation (t, f, df, simga2_f, sigma2_df) to the GP. 95 | 96 | This stores the observation internally, but does NOT yet set up and invert 97 | the Gram matrix. Add observations with repeated calls to this method, then 98 | call ``gp.update()`` to set up and invert the Gram matrix. Only then you 99 | can perform inference (calls to ``gp.mu(t)``, ``gp.V(t)``, etc...).""" 100 | 101 | assert isinstance(t, (float, np.float32, np.float64)) 102 | assert isinstance(f, (float, np.float32, np.float64)) 103 | assert isinstance(df, (float, np.float32, np.float64)) 104 | assert isinstance(fvar, (float, np.float32, np.float64)) 105 | assert isinstance(dfvar, (float, np.float32, np.float64)) 106 | 107 | self.ready = False 108 | self.min_obs = None 109 | 110 | self.N += 1 111 | self.ts.append(t) 112 | self.fs.append(f) 113 | self.dfs.append(df) 114 | self.fvars.append(fvar) 115 | self.dfvars.append(dfvar) 116 | 117 | def update(self): 118 | """Set up the Gram matrix and compute its LU decomposition to make the GP 119 | ready for inference (calls to ``.gp.mu(t)``, ``gp.V(t)``, etc...). 120 | 121 | Call this method after you have manipulated the GP by 122 | - ``gp.reset()`` ing, 123 | - adding observations with ``gp.add(t, f, df)``, or 124 | - adjusting the sigmas via ``gp.update_sigmas()``. 125 | and want to perform inference next.""" 126 | 127 | if self.ready: 128 | return 129 | 130 | # Set up the kernel matrices. 
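# K collects the prior covariances between the observed function values,
# Kd the cross-covariances between function values and derivatives, and
# dKd the covariances between derivatives, all evaluated at the observed ts.
# Together with the noise matrices S_f and S_df they form the Gram matrix G
# assembled below, whose LU decomposition is reused for all posterior queries.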
131 | self.K = np.matrix(np.zeros([self.N, self.N])) 132 | self.Kd = np.matrix(np.zeros([self.N, self.N])) 133 | self.dKd = np.matrix(np.zeros([self.N, self.N])) 134 | for i in range(self.N): 135 | for j in range(self.N): 136 | self.K[i, j] = self.k(self.ts[i], self.ts[j]) 137 | self.Kd[i, j] = self.kd(self.ts[i], self.ts[j]) 138 | self.dKd[i, j] = self.dkd(self.ts[i], self.ts[j]) 139 | 140 | # Put together the Gram matrix 141 | S_f = np.matrix(np.diag(self.fvars)) 142 | S_df = np.matrix(np.diag(self.dfvars)) 143 | self.G = np.bmat([[self.K + S_f, self.Kd], 144 | [self.Kd.T, self.dKd + S_df]]) 145 | 146 | # Compute the LU decomposition of G and store it 147 | self.LU, self.LU_piv = linalg.lu_factor(self.G, check_finite=True) 148 | 149 | # Set ready switch to True 150 | self.ready = True 151 | 152 | # Pre-compute the regression weights used in mu 153 | self.w = self.solve_G(np.array(self.fs + self.dfs)) 154 | 155 | def solve_G(self, b): 156 | """Solve ``Gx=b`` where ``G`` is the Gram matrix of the GP. 157 | 158 | Uses the internally-stored LU decomposition of ``G`` computed in 159 | ``gp.update()``.""" 160 | 161 | assert self.ready 162 | return linalg.lu_solve((self.LU, self.LU_piv), b, check_finite=True) 163 | 164 | def mu(self, t): 165 | """Evaluate posterior mean of f at ``t``.""" 166 | 167 | assert isinstance(t, (float, np.float32, np.float64)) 168 | assert self.ready 169 | 170 | # Compute kernel vector (k and kd) of the query t and the observations T 171 | # Then perform inner product with the pre-computed GP weights 172 | T = np.array(self.ts) 173 | kvec = np.concatenate([self.k(t, T), self.kd(t, T)]) 174 | 175 | return np.dot(self.w, kvec) 176 | 177 | def dmu(self, t): 178 | """Evaluate first derivative of the posterior mean of df at ``t``.""" 179 | 180 | assert isinstance(t, (float, np.float32, np.float64)) 181 | assert self.ready 182 | 183 | # Same is in mu, with the respective "derivative kernel vectors" 184 | T = np.array(self.ts) 185 | kvec = np.concatenate([self.kd(T, t), self.dkd(t, T)]) 186 | 187 | return np.dot(self.w, kvec) 188 | 189 | def d2mu(self, t): 190 | """Evaluate 2nd derivative of the posterior mean of f at ``t``.""" 191 | 192 | assert isinstance(t, (float, np.float32, np.float64)) 193 | assert self.ready 194 | 195 | # Same is in mu, with the respective "derivative kernel vectors" 196 | T = np.array(self.ts) 197 | kvec = np.concatenate([self.d2k(t, T), self.d2kd(t, T)]) 198 | 199 | return np.dot(self.w, kvec) 200 | 201 | def d3mu(self, t): 202 | """Evaluate 3rd derivative of the posterior mean of f at ``t``.""" 203 | 204 | assert isinstance(t, (float, np.float32, np.float64)) 205 | assert self.ready 206 | 207 | # Same is in mu, with the respective "derivative kernel vectors" 208 | T = np.array(self.ts) 209 | kvec = np.concatenate([self.d3k(t, T), np.zeros(self.N)]) 210 | 211 | return np.dot(self.w, kvec) 212 | 213 | def V(self, t): 214 | """Evaluate posterior variance of f at ``t``.""" 215 | 216 | assert isinstance(t, (float, np.float32, np.float64)) 217 | assert self.ready 218 | 219 | # Compute the needed k vector 220 | T = np.array(self.ts) 221 | kvec = np.concatenate([self.k(t, T), self.kd(t,T)]) 222 | ktt = self.k(t, t) 223 | 224 | return ktt - np.dot(kvec, self.solve_G(kvec)) 225 | 226 | def Vd(self, t): 227 | """Evaluate posterior co-variance of f and df at ``t``.""" 228 | 229 | assert isinstance(t, (float, np.float32, np.float64)) 230 | assert self.ready 231 | 232 | T = np.array(self.ts) 233 | ktT = self.k(t, T) 234 | kdtT = self.kd(t, T) 235 | dktT = 
self.kd(T, t) 236 | dkdtT = self.dkd(t, T) 237 | kdtt = self.kd(t, t) 238 | kvec_a = np.concatenate([ktT, kdtT]) 239 | kvec_b = np.concatenate([dktT, dkdtT]) 240 | 241 | return kdtt - np.dot(kvec_a, self.solve_G(kvec_b)) 242 | 243 | def dVd(self, t): 244 | """Evaluate posterior variance of df at ``t``""" 245 | 246 | assert isinstance(t, (float, np.float32, np.float64)) 247 | assert self.ready 248 | 249 | T = np.array(self.ts) 250 | dkdtt = self.dkd(t, t) 251 | dktT = self.kd(T, t) 252 | dkdtT = self.dkd(t, T) 253 | kvec = np.concatenate([dktT, dkdtT]) 254 | 255 | return dkdtt - np.dot(kvec, self.solve_G(kvec)) 256 | 257 | def Cov_0(self, t): 258 | """Evaluate posterior co-variance of f at 0. and ``t``.""" 259 | 260 | assert isinstance(t, (float, np.float32, np.float64)) 261 | assert self.ready 262 | 263 | T = np.array(self.ts) 264 | k0t = self.k(0., t) 265 | k0T = self.k(0., T) 266 | kd0T = self.kd(0., T) 267 | ktT = self.k(t, T) 268 | kdtT = self.kd(t, T) 269 | kvec_a = np.concatenate([k0T, kd0T]) 270 | kvec_b = np.concatenate([ktT, kdtT]) 271 | 272 | return k0t - np.dot(kvec_a, self.solve_G(kvec_b)) 273 | 274 | def Covd_0(self, t): 275 | """Evaluate posterior co-variance of f at 0. and df at ``t``.""" 276 | # !!! I changed this in line_search new, Covd_0 <-> dCov_0 277 | 278 | assert isinstance(t, (float, np.float32, np.float64)) 279 | assert self.ready 280 | 281 | T = np.array(self.ts) 282 | kd0t = self.kd(0., t) 283 | k0T = self.k(0., T) 284 | kd0T = self.kd(0., T) 285 | dktT = self.kd(T, t) 286 | dkdtT = self.dkd(t, T) 287 | kvec_a = np.concatenate([k0T, kd0T]) 288 | kvec_b = np.concatenate([dktT, dkdtT]) 289 | 290 | return kd0t - np.dot(kvec_a, self.solve_G(kvec_b)) 291 | 292 | def dCov_0(self, t): 293 | """Evaluate posterior co-variance of df at 0. and f at ``t``.""" 294 | # !!! I changed this in line_search new, Covd_0 <-> dCov_0 295 | 296 | assert isinstance(t, (float, np.float32, np.float64)) 297 | assert self.ready 298 | 299 | T = np.array(self.ts) 300 | dk0t = self.kd(t, 0.) 301 | dk0T = self.kd(T, 0.) 302 | dkd0T = self.dkd(0., T) 303 | ktT = self.k(t, T) 304 | kdtT = self.kd(t, T) 305 | kvec_a = np.concatenate([dk0T, dkd0T]) 306 | kvec_b = np.concatenate([ktT, kdtT]) 307 | 308 | return dk0t - np.dot(kvec_a, self.solve_G(kvec_b)) 309 | 310 | def dCovd_0(self, t): 311 | """Evaluate posterior co-variance of df at 0. and ``t``.""" 312 | 313 | assert isinstance(t, (float, np.float32, np.float64)) 314 | assert self.ready 315 | 316 | T = np.array(self.ts) 317 | dkd0t = self.dkd(0., t) 318 | dk0T = self.kd(T, 0.) 319 | dkd0T = self.dkd(0., T) 320 | dktT = self.kd(T, t) 321 | dkdtT = self.dkd(t, T) 322 | kvec_a = np.concatenate([dk0T, dkd0T]) 323 | kvec_b = np.concatenate([dktT, dkdtT]) 324 | 325 | return dkd0t - np.dot(kvec_a, self.solve_G(kvec_b)) 326 | 327 | def cubic_polynomial_coefficients(self, t): 328 | """The posterior mean ``mu`` of this GP is piece-wise cubic. Return the 329 | coefficients of the cubic polynomial that is ``mu`` at ``t``.""" 330 | 331 | assert isinstance(t, (float, np.float32, np.float64)) 332 | assert t not in self.ts # at the observations, polynomial is ambiguous 333 | 334 | d1, d2, d3 = self.dmu(t), self.d2mu(t), self.d3mu(t) 335 | a = d3/6.0 336 | b = 0.5*d2-3*a*t 337 | c = d1-3*a*t**2-2*b*t 338 | d = self.mu(t)-a*t**3-b*t**2-c*t 339 | 340 | return (a, b, c, d) 341 | 342 | def quadratic_polynomial_coefficients(self, t): 343 | """The posterior mean ``mu`` of this GP is piece-wise cubic. 
Return the 344 | coefficients of the **quadratic** polynomial that is the **derivative** of 345 | ``mu`` at ``t``. 346 | 347 | This is used to find the minimum of the cubic polynomial in 348 | ``gp.find_mimima()``.""" 349 | 350 | assert isinstance(t, (float, np.float32, np.float64)) 351 | assert t not in self.ts # at the observations, polynomial is ambiguous 352 | 353 | d1, d2, d3 = self.dmu(t), self.d2mu(t), self.d3mu(t) 354 | a = .5*d3 355 | b = d2 - d3*t 356 | c = d1 - d2*t + 0.5*d3*t**2 357 | 358 | return (a, b, c) 359 | 360 | def find_dmu_equal(self, val): 361 | """Finds points where the derivative of the posterior mean equals ``val`` 362 | and the second derivative is positive. 363 | 364 | The posterior mean is a cubic polynomial in each of the cells" 365 | ``[t_i, t_i+1]`` where the t_i are the sorted observed ts. For each of 366 | these cells, returns points with dmu==val the cubic polynomial if it exists 367 | and happens to lie in that cell.""" 368 | 369 | # We want to go through the observations from smallest to largest t 370 | ts_sorted = list(self.ts) 371 | ts_sorted.sort() 372 | 373 | solutions = [] 374 | 375 | for t1, t2 in zip(ts_sorted, ts_sorted[1:]): 376 | # Compute the coefficients of the quadratic polynomial dmu/dt in this 377 | # cell, then call the function minimize_cubic to find the minimizer. 378 | # If there is one and it falls into the current cell, store it 379 | a, b, c = self.quadratic_polynomial_coefficients(t1+0.5*(t2-t1)) 380 | solutions_cell = quadratic_polynomial_solve(a, b, c, val) 381 | for s in solutions_cell: 382 | if s>t1 and s0. 528 | 529 | Returns the *list* of solutions (containg 1 or 0 solutions).""" 530 | 531 | assert isinstance(a, (float, np.float32, np.float64)) 532 | assert isinstance(b, (float, np.float32, np.float64)) 533 | assert isinstance(c, (float, np.float32, np.float64)) 534 | assert isinstance(val, (float, np.float32, np.float64)) 535 | 536 | # Check if a is almost zero. If so, solve the remaining linear equation. Note 537 | # that we return only soultions with f''(t) = b > 0 538 | if abs(a) < 1e-9: 539 | if b > 1e-9: 540 | return [(val-c)/b] 541 | else: 542 | return [] 543 | 544 | # Compute the term under the square root in pq formula, if it is negative, 545 | # there is no real solution 546 | det = b**2-4.*a*(c-val) 547 | if det < 0: 548 | return [] 549 | 550 | # Otherwise, compute the two roots 551 | s = np.sqrt(det) 552 | r1 = (-b - np.sign(a)*s)/(2.*a) 553 | r2 = (-b + np.sign(a)*s)/(2.*a) 554 | 555 | # Return the one with f''(t) = 2at + b > 0, or [] 556 | if 2*a*r1+b > 0: 557 | return [r1] 558 | elif 2*a*r2+b > 0: 559 | return [r2] 560 | else: 561 | return [] --------------------------------------------------------------------------------