├── LICENSE.txt ├── README.txt ├── hp_gpsmbo ├── __init__.py ├── __init__.pyc ├── gby.py ├── gpr.py ├── gpr_math.py ├── hmc.py ├── hpsuggest.py ├── hpsuggest_ei.py ├── hpsuggest_lucb.py ├── hpsuggest_ucb.py ├── kernels.py ├── kernels.pyc ├── kernels_base.py ├── op_Kcond.py ├── prodkernels.py ├── scrap.py ├── suggest_algos.py └── test │ ├── __init__.py │ ├── test_branin.py │ ├── test_gpr.py │ ├── test_gpr_math.py │ ├── test_har6.py │ ├── test_hpsuggest.py │ ├── test_kernels.py │ └── test_normal_log_EI.py └── setup.py /LICENSE.txt: -------------------------------------------------------------------------------- 1 | This software (theano-gpr) is may be used by anyone under the terms of the 2 | "Lesser GPL v. 3.0" 3 | 4 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | README 2 | 3 | -------------------------------------------------------------------------------- /hp_gpsmbo/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from kernels import SqExp, Product 3 | from gpr import GPR_ML2, GPR_HMC 4 | -------------------------------------------------------------------------------- /hp_gpsmbo/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyperopt/hyperopt-gpsmbo/8009f82a18620b33faecca2382f973bc214bd88c/hp_gpsmbo/__init__.pyc -------------------------------------------------------------------------------- /hp_gpsmbo/gby.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | def groupby(seq, key): 4 | tmp = OrderedDict() 5 | for ss in seq: 6 | tmp.setdefault(key(ss), []).append(ss) 7 | return tmp 8 | 9 | -------------------------------------------------------------------------------- /hp_gpsmbo/gpr.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import scipy.optimize 4 | import theano 5 | import theano.tensor as TT 6 | import theano.sandbox.rng_mrg 7 | from .gpr_math import s_normal_logEI 8 | from .hmc import HMC_sampler 9 | 10 | 11 | def raises(exc, fn, args): 12 | try: 13 | fn(*args) 14 | return False 15 | except exc: 16 | return True 17 | return False 18 | 19 | 20 | class GPR_Base(object): 21 | def __init__(self, kernel, 22 | maxiter=None, 23 | prior_var=None, 24 | prior_mean=None, 25 | warn_floatX=True, 26 | ): 27 | self.kernel = kernel 28 | self.maxiter = maxiter 29 | self.prior_var = prior_var 30 | self.prior_mean = prior_mean 31 | self.s_var_min = TT.as_tensor_variable(1e-8, name='s_var_min') 32 | self.s_emp_mean = theano.shared(0.0, name='s_emp_mean') 33 | self.s_emp_var = theano.shared(1.0, name='s_emp_var') 34 | self.s_X = theano.shared(np.zeros((2, 2)), name='s_X') 35 | self.s_y = theano.shared(np.zeros((2,)), name='s_y') 36 | self.s_var_y_raw = theano.shared(np.zeros(2,), name='s_var_y_raw') 37 | self.s_params = theano.tensor.dvector('params') 38 | self._logEI_cache = {} 39 | if theano.config.floatX != 'float64': 40 | raise TypeError('GPR requires floatX==float64') 41 | 42 | self.s_var_y = TT.maximum(self.s_var_y_raw, self.s_var_min) 43 | 44 | def set_emp_mean(self, y): 45 | if self.prior_mean is None: 46 | self.s_emp_mean.set_value(np.mean(y)) 47 | else: 48 | self.s_emp_mean.set_value(self.prior_mean) 49 | 50 | def set_emp_var(self, y, var_y): 51 | 
self.s_var_y_raw.set_value(np.zeros(len(y)) + var_y) 52 | if self.prior_var is None: 53 | self.s_emp_var.set_value(max(np.var(y), 54 | np.min(var_y), 55 | 1e-6)) 56 | else: 57 | self.s_emp_var.set_value(self.prior_var) 58 | 59 | def set_Xy(self, X, y): 60 | X_ = np.atleast_2d(X) 61 | self.s_X.set_value(X_) 62 | self.s_y.set_value(np.atleast_1d(y) - self.s_emp_mean.get_value()) 63 | return self.s_X, self.s_y 64 | 65 | def fit(self, X, y, var_y=0.0): 66 | self.set_emp_mean(y) 67 | self.set_emp_var(y, var_y) 68 | s_X, s_y = self.set_Xy(X, y) 69 | 70 | _, params, params0, _ = self.kernel.s_nll_params( 71 | X, y, 72 | var_y=var_y, 73 | prior_var=self.s_emp_var) 74 | 75 | self._params_list = [params0.copy()] 76 | self._params_weights = [1.0] 77 | 78 | 79 | def predict(self, x, eval_MSE=False): 80 | if eval_MSE: 81 | return self.mean_variance(x) 82 | else: 83 | return self.mean(x) 84 | 85 | def mean(self, x): 86 | """ 87 | Compute mean at points in x_new 88 | """ 89 | try: 90 | self._mean 91 | except AttributeError: 92 | s_mean_x, s_var_x, s_x = self.kernel.s_mean_var( 93 | self.s_X, 94 | self.s_y, 95 | self.s_var_y, 96 | self.s_emp_var, 97 | self.s_params, 98 | self.s_var_min) 99 | self._mean = theano.function( 100 | [s_x, self.s_params], 101 | s_mean_x + self.s_emp_mean, 102 | allow_input_downcast=True,) 103 | means = [self._mean(x, p) for p in self._params_list] 104 | weights = self._params_weights 105 | return np.dot(weights, means) 106 | 107 | def mean_variance(self, x): 108 | """ 109 | Compute mean and variance at points in x_new 110 | """ 111 | try: 112 | self._mean_variance 113 | except AttributeError: 114 | s_mean_x, s_var_x, s_x = self.kernel.s_mean_var( 115 | self.s_X, 116 | self.s_y, 117 | self.s_var_y, 118 | self.s_emp_var, 119 | self.s_params, 120 | self.s_var_min) 121 | self._mean_variance = theano.function( 122 | [s_x, self.s_params], 123 | [s_mean_x + self.s_emp_mean, s_var_x], 124 | allow_input_downcast=True,) 125 | means, variances = zip(*[ 126 | self._mean_variance(x, p) for p in self._params_list]) 127 | weights = self._params_weights 128 | mean = np.dot(weights, means) 129 | variance = np.dot(weights, variances) 130 | return mean, variance 131 | 132 | def logEI_fn(self, direction, quad_approx): 133 | direction = float(direction) 134 | quad_approx = bool(quad_approx) 135 | try: 136 | self._logEI_cache[(direction, quad_approx)] 137 | except KeyError: 138 | s_thresh = TT.dscalar('thresh') 139 | s_mean_x, s_var_x, s_x = self.kernel.s_mean_var( 140 | self.s_X, 141 | self.s_y, 142 | self.s_var_y, 143 | self.s_emp_var, 144 | self.s_params, 145 | self.s_var_min) 146 | s_logEI = s_normal_logEI( 147 | direction * s_thresh, 148 | direction * (s_mean_x + self.s_emp_mean), 149 | s_var_x, 150 | quad_approx=quad_approx) 151 | self._logEI_cache[(direction, quad_approx)] = theano.function( 152 | [s_x, s_thresh, self.s_params], 153 | s_logEI, 154 | allow_input_downcast=True) 155 | return self._logEI_cache[(direction, quad_approx)] 156 | 157 | def logEI(self, x, thresh, direction=1, quad_approx=False): 158 | logEI_fn = self.logEI_fn(direction, quad_approx) 159 | logEIs = [logEI_fn(x, thresh, p) for p in self._params_list] 160 | weights = self._params_weights 161 | rval = np.dot(weights, logEIs) 162 | return np.atleast_1d(rval) 163 | 164 | 165 | class GPR_ML2(GPR_Base): 166 | """ 167 | Fit by maximum marginal likelihood of kernel hyperparameters 168 | 169 | """ 170 | 171 | def __init__(self, *args, **kwargs): 172 | GPR_Base.__init__(self, *args, **kwargs) 173 | 174 | nll, params, params0, 
bounds, K = self.kernel.s_nll_params( 175 | self.s_X, self.s_y, 176 | params=self.s_params, 177 | var_y=self.s_var_y, 178 | prior_var=self.s_emp_var, ret_K=True) 179 | 180 | cost = nll - self.kernel.s_logprior(params) 181 | assert nll.ndim == 0, nll.type 182 | 183 | self._K = theano.function([params], K) 184 | self._fit_f_df = theano.function([params], 185 | [cost, TT.grad(cost, params)]) 186 | self._params0 = params0 187 | self._bounds = bounds 188 | 189 | def _fit_params0(self): 190 | new_x0 = self._params0 191 | nll_pp = [] 192 | for ii in range(12): 193 | try: 194 | f, df = self._fit_f_df(new_x0) 195 | # -- don't start where the function is too steep 196 | if np.sqrt(np.dot(df, df)) > 10000: 197 | f = np.inf 198 | except np.linalg.LinAlgError: 199 | f = np.inf 200 | # -- ii is in list to break ties, which 201 | # happens if there are multiple infs 202 | nll_pp.append((f, ii, np.array(new_x0))) 203 | new_x0 = self.kernel.reduce_lenscale(new_x0) 204 | 205 | x0 = sorted(nll_pp)[0][2] 206 | if np.isinf(sorted(nll_pp)[0][0]): 207 | raise Exception('fit impossible') 208 | return x0 209 | 210 | def _fit_ml2(self): 211 | x0 = self._fit_params0() 212 | 213 | # -- for some reason, the result object returned by minimize 214 | # seems occasionally to include a parameter vector (pp) 215 | # for which f_df returned np.inf, when there were other non-inf 216 | # evaluations (!?) 217 | # Therefore, this best_f and best_pp mechanism is used. 218 | best_f_pp = [np.inf, None] 219 | 220 | def f_df(pp): 221 | if not np.all(np.isfinite(pp)): 222 | return np.inf, pp 223 | try: 224 | ff, df = self._fit_f_df(pp) 225 | if ff < best_f_pp[0]: 226 | best_f_pp[:] = [ff, pp.copy()] 227 | return ff, df 228 | except np.linalg.LinAlgError: 229 | return np.inf, pp 230 | except ValueError, exc: 231 | if 'NaN' in str(exc): 232 | return np.inf, pp 233 | else: 234 | raise 235 | try: 236 | scipy.optimize.minimize( 237 | fun=f_df, #self._fit_f_df, 238 | x0=x0, 239 | jac=True, # -- means f returns cost and jacobian 240 | method='SLSQP', 241 | #method='L-BFGS-B', 242 | options={} if self.maxiter is None else ( 243 | {'maxiter': self.maxiter,}), 244 | bounds=self._bounds, 245 | ) 246 | except ValueError, e: 247 | if 'NaN' in str(e): 248 | print 'WARNING: GPR.fit caught error', e 249 | print 'WARNING: hopeless fit fail, falling back on params0' 250 | self._params_list = [self._params0] 251 | else: 252 | raise 253 | return best_f_pp 254 | 255 | def fit_ml2(self, X, y, var_y=0, debug=False, ion=False): 256 | """ 257 | Fit GPR kernel parameters by minimizing magininal nll. 258 | 259 | Returns: None 260 | 261 | Side effect: chooses optimal kernel parameters. 262 | """ 263 | self.set_emp_mean(y) 264 | self.set_emp_var(y, var_y) 265 | s_X, s_y = self.set_Xy(X, y) 266 | best_f, best_params = self._fit_ml2() 267 | self._params_list = [best_params] 268 | self._params_weights = [1.0] 269 | return self 270 | 271 | def fit(self, X, y, var_y=0, debug=False, ion=False): 272 | return self.fit_ml2(X, y, var_y, debug, ion) 273 | 274 | 275 | class GPR_HMC(GPR_ML2): 276 | """ 277 | Fit by collecting kernel hyperparameter samples (by HMC). 
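Rather than committing to a single maximum-likelihood setting of the kernel
hyperparameters, this class draws hmc_draws samples by Hamiltonian Monte Carlo,
keeps every hmc_keep_step-th one, and averages the predictive mean, variance
and logEI over the retained samples with equal weights
(via _params_list / _params_weights).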
278 | 279 | """ 280 | def __init__(self, kernel, 281 | maxiter=None, 282 | prior_var=None, 283 | prior_mean=None, 284 | hmc_burn_in=0, # -- keep ML first point 285 | hmc_draws=200, 286 | hmc_keep_step=25): 287 | GPR_ML2.__init__(self, kernel, 288 | maxiter=maxiter, 289 | prior_var=prior_var, 290 | prior_mean=prior_mean) 291 | self.positions = theano.shared(np.zeros((1, self.kernel.n_params)), 292 | name='positions') 293 | 294 | nll, s_params, params0, bounds = self.kernel.s_nll_params( 295 | self.s_X, self.s_y, var_y=self.s_var_y, 296 | params=self.s_params, 297 | prior_var=self.s_emp_var,) 298 | cost = nll - self.kernel.s_logprior(s_params) 299 | self.nll_cost_fn = theano.function([s_params], [nll, cost]) 300 | self._params0 = params0 301 | 302 | def energy_fn(params_matrix): 303 | # PRECONDITOIN: params_matrix has SINGLE ROW 304 | nll, params, params0, bounds = self.kernel.s_nll_params( 305 | self.s_X, self.s_y, var_y=self.s_var_y, 306 | prior_var=self.s_emp_var, 307 | params=params_matrix[0]) 308 | logprior = self.kernel.s_logprior(params_matrix[0]) 309 | energy = nll - logprior 310 | #energy = theano.printing.Print('energy')(energy) 311 | return energy.dimshuffle('x') 312 | 313 | print 'creating HMC sampler' 314 | self.sampler = HMC_sampler.new_from_shared_positions( 315 | self.positions, energy_fn, 316 | s_rng=theano.sandbox.rng_mrg.MRG_RandomStreams(1234), 317 | stepsize_dec=0.95, 318 | stepsize_inc=1.02, 319 | stepsize_min=1.0e-8, 320 | stepsize_max=2.5e-1, 321 | ) 322 | self._stepsize0 = .001 323 | self.hmc_burn_in = hmc_burn_in 324 | self.hmc_draws = hmc_draws 325 | self.hmc_keep_step = hmc_keep_step 326 | 327 | def fit_hmc(self, X, y, var_y=1e-16, debug=False, ion=False, 328 | init_params_method='cycle'): 329 | 330 | self.set_emp_mean(y) 331 | self.set_emp_var(y, var_y) 332 | self.set_Xy(X, y) 333 | 334 | if init_params_method == 'cycle': 335 | init_params_method = ['ml2', 'prior'][len(y) % 2] 336 | if init_params_method == 'ml2': 337 | _, ml_params = self._fit_ml2() 338 | elif init_params_method == 'prior': 339 | ml_params = self._fit_params0() 340 | else: 341 | raise NotImplementedError(init_params_method) 342 | 343 | self.sampler.positions.set_value(np.asarray([ml_params])) 344 | self.sampler.stepsize.set_value(self._stepsize0) 345 | 346 | def get_state(sampler): 347 | return { 348 | 'positions': sampler.positions.get_value(), 349 | 'stepsize': sampler.stepsize.get_value(), 350 | 'avg_acceptance_rate': sampler.avg_acceptance_rate.get_value(), 351 | } 352 | def set_state(sampler, state): 353 | for k, v in state.items(): 354 | getattr(sampler, k).set_value(v) 355 | 356 | def draw(): 357 | state = get_state(self.sampler) 358 | while state['stepsize'] > 1e-12: 359 | try: 360 | set_state(self.sampler, state) 361 | pos = self.sampler.draw() 362 | return pos 363 | except (ValueError, np.linalg.LinAlgError): 364 | print 'shrinking stepsize %f to stabilize sampler' % ( 365 | self.sampler.stepsize.get_value(), 366 | ) 367 | state['positions'][0] = self.kernel.reduce_lenscale( 368 | state['positions'][0]) 369 | state['stepsize'] /= 2.0 370 | raise ValueError('hopeless: Nan or inf in K') 371 | 372 | samples = [] 373 | nlls = [] 374 | costs = [] 375 | t0 = time.time() 376 | for ii in range(self.hmc_burn_in): 377 | pos = draw() 378 | for ii in range(self.hmc_draws): 379 | pos = draw() 380 | samples.append(pos.ravel().copy()) 381 | if 0: 382 | nll_ii, cost_ii = self.nll_cost_fn(pos.flatten()) 383 | print 'current position', pos.flatten(), 384 | print 'accept rate', 
self.sampler.avg_acceptance_rate.get_value(), 385 | print 'nll', nll_ii, 'cost', cost_ii 386 | nlls.append(nll_ii) 387 | costs.append(cost_ii) 388 | print 'HMC took', (time.time() - t0) 389 | samples = np.asarray(samples) 390 | keep = samples[::self.hmc_keep_step] 391 | if keep.size == 0: 392 | raise NotImplementedError() 393 | 394 | if debug: 395 | import matplotlib.pyplot as plt 396 | if ion: 397 | plt.figure(2) 398 | if self.kernel.n_params == 1: 399 | plt.subplot(211) 400 | plt.cla() 401 | plt.hist(np.asarray(samples).flatten()) 402 | plt.title('nlls observed during sampling') 403 | plt.subplot(212) 404 | plt.cla() 405 | plt.scatter(samples, nlls, label='nll', c='b') 406 | plt.scatter(samples, costs, label='cost', c='g') 407 | plt.title('nlls vs. alpha') 408 | plt.legend() 409 | if self.kernel.n_params == 2: 410 | plt.cla() 411 | plt.scatter(samples[:, 0], samples[:, 1]) 412 | plt.scatter(keep[:, 0], keep[:, 1], s=60) 413 | if ion: 414 | plt.draw() 415 | else: 416 | plt.show() 417 | 418 | self._params_list = keep 419 | self._params_weights = np.ones(len(keep)) / len(keep) 420 | 421 | 422 | def fit(self, X, y, var_y=0, debug=False, ion=False): 423 | return self.fit_hmc(X, y, var_y, debug, ion) 424 | -------------------------------------------------------------------------------- /hp_gpsmbo/gpr_math.py: -------------------------------------------------------------------------------- 1 | """ 2 | Formulae for Gaussian Process Regression 3 | 4 | """ 5 | 6 | import numpy as np 7 | import theano.tensor as TT 8 | from theano.sandbox.linalg import cholesky, matrix_inverse, det, psd 9 | from .op_Kcond import normal_logEI_diff_sigma_elemwise 10 | 11 | 12 | def dots(*args): 13 | rval = args[0] 14 | for a in args[1:]: 15 | rval = TT.dot(rval, a) 16 | return rval 17 | 18 | 19 | def s_nll(K, y, var_y, prior_var): 20 | """ 21 | Marginal negative log likelihood of model 22 | 23 | K - gram matrix (matrix-like) 24 | y - the training targets (vector-like) 25 | var_y - the variance of uncertainty about y (vector-like) 26 | 27 | :note: See RW.pdf page 37, Eq. 2.30. 
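In the notation of the code below, with rK = prior_var * K + var_y * I,
the value returned is

    nll = 0.5 * y' inv(rK) y + 0.5 * log(det(rK)) + (n / 2) * log(2 * pi)

(the 'fit', 'complexity' and 'normalization' terms, respectively).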
28 | 29 | """ 30 | 31 | n = y.shape[0] 32 | rK = psd(prior_var * K + var_y * TT.eye(n)) 33 | 34 | fit = .5 * dots(y, matrix_inverse(rK), y) 35 | complexity = 0.5 * TT.log(det(rK)) 36 | normalization = n / 2.0 * TT.log(2 * np.pi) 37 | nll = fit + complexity + normalization 38 | return nll 39 | 40 | 41 | def s_mean(K, y, var_y, prior_var, K_new): 42 | rK = psd(prior_var * K + var_y * TT.eye(y.shape[0])) 43 | alpha = TT.dot(matrix_inverse(rK), y) 44 | y_x = TT.dot(alpha, prior_var * K_new) 45 | return y_x 46 | 47 | 48 | def s_variance(K, y, var_y, prior_var, K_new, var_min): 49 | rK = psd(prior_var * K + var_y * TT.eye(y.shape[0])) 50 | L = cholesky(rK) 51 | v = dots(matrix_inverse(L), prior_var * K_new) 52 | var_x = TT.maximum(prior_var - (v ** 2).sum(axis=0), var_min) 53 | return var_x 54 | 55 | 56 | def s_normal_pdf(x, mean, var): 57 | energy = 0.5 * ((x - mean) ** 2) / var 58 | return TT.exp(-energy) / TT.sqrt(2 * np.pi * var) 59 | 60 | 61 | def s_normal_logpdf(x, mean, var): 62 | energy = 0.5 * ((x - mean) ** 2) / var 63 | return -energy - 0.5 * TT.log(2 * np.pi * var) 64 | 65 | 66 | def s_normal_cdf(x, mean, var): 67 | z = (x - mean) / TT.sqrt(var) 68 | return .5 * TT.erfc(-z / np.sqrt(2)) 69 | 70 | 71 | def s_normal_logcdf(x, mean, var): 72 | z = (x - mean) / TT.sqrt(var) 73 | return TT.log(.5) + TT.log(TT.erfc(-z / np.sqrt(2))) 74 | 75 | 76 | def s_normal_EI(thresh, mean, var): 77 | """analytic expected improvement over (above) threshold 78 | 79 | int_{thresh}^{\infty} (y - thresh) P(y; mean, var) dy 80 | 81 | """ 82 | s_thresh = TT.as_tensor_variable(thresh) 83 | sigma = TT.sqrt(var) 84 | z = (mean - s_thresh) / sigma 85 | # -- the following formula is cuter, but 86 | # Theano doesn't produce as stable a gradient I think? 87 | #return sigma * (z * s_normal_cdf(z, 0, 1) + s_normal_pdf(z, 0, 1)) 88 | a = (mean - s_thresh) * s_normal_cdf(z, 0, 1) 89 | b = sigma * s_normal_pdf(z, 0, 1) 90 | return a + b 91 | 92 | 93 | def s_normal_logEI(thresh, mean, var, quad_approx=False): 94 | """analytic log-expected improvement over (above) threshold 95 | 96 | log(int_{thresh}^{\infty} (y - thresh) P(y; mean, var) dy) 97 | 98 | quad_approx uses a 2nd-order polynomial approximation to the true function 99 | when the threshold is way above the mean (34 standard deviations), where 100 | there's almost no density to integrate. 
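For reference, the quantity whose log is returned is the same integral
computed by s_normal_EI above,

    EI = (mean - thresh) * Phi(z) + sigma * phi(z),    z = (mean - thresh) / sigma

with sigma = sqrt(var); the actual evaluation is delegated to
normal_logEI_diff_sigma_elemwise (imported from .op_Kcond).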
101 | """ 102 | return normal_logEI_diff_sigma_elemwise(thresh - mean, TT.sqrt(var)) 103 | 104 | 105 | def s_normal_EBI(lbound, ubound, mean, var): 106 | """ int_l^u (y - l) P(y; mean, var) 107 | """ 108 | s_l = TT.as_tensor_variable(lbound) 109 | s_u = TT.as_tensor_variable(ubound) 110 | 111 | EI_l = s_normal_EI(s_l, mean, var) 112 | EI_u = s_normal_EI(s_u, mean, var) 113 | 114 | #sigma = TT.maximum(TT.sqrt(var), 1e-15) 115 | sigma = TT.sqrt(var) 116 | term = (s_l - s_u) * s_normal_cdf((mean - s_u) / sigma, 0, 1) 117 | 118 | return EI_l - EI_u + term 119 | 120 | 121 | def s_normal_logEBI(lbound, ubound, mean, var): 122 | return TT.log(s_normal_EBI(lbound, ubound, mean, var)) 123 | 124 | 125 | # -- eof flake8 126 | -------------------------------------------------------------------------------- /hp_gpsmbo/hmc.py: -------------------------------------------------------------------------------- 1 | """ 2 | TODO 3 | """ 4 | 5 | import numpy 6 | 7 | from theano import function, shared 8 | from theano import tensor as TT 9 | import theano 10 | 11 | sharedX = lambda X, name: \ 12 | shared(numpy.asarray(X, dtype=theano.config.floatX), name=name) 13 | 14 | 15 | def kinetic_energy(vel): 16 | """Returns the kinetic energy associated with the given velocity 17 | and mass of 1. 18 | 19 | Parameters 20 | ---------- 21 | vel: theano matrix 22 | Symbolic matrix whose rows are velocity vectors. 23 | 24 | Returns 25 | ------- 26 | return: theano vector 27 | Vector whose i-th entry is the kinetic entry associated with vel[i]. 28 | 29 | """ 30 | return 0.5 * (vel ** 2).sum(axis=1) 31 | 32 | 33 | def hamiltonian(pos, vel, energy_fn): 34 | """ 35 | Returns the Hamiltonian (sum of potential and kinetic energy) for the given 36 | velocity and position. 37 | 38 | Parameters 39 | ---------- 40 | pos: theano matrix 41 | Symbolic matrix whose rows are position vectors. 42 | vel: theano matrix 43 | Symbolic matrix whose rows are velocity vectors. 44 | energy_fn: python function 45 | Python function, operating on symbolic theano variables, used tox 46 | compute the potential energy at a given position. 47 | 48 | Returns 49 | ------- 50 | return: theano vector 51 | Vector whose i-th entry is the Hamiltonian at position pos[i] and 52 | velocity vel[i]. 53 | """ 54 | # assuming mass is 1 55 | return energy_fn(pos) + kinetic_energy(vel) 56 | 57 | 58 | def metropolis_hastings_accept(energy_prev, energy_next, s_rng): 59 | """ 60 | Performs a Metropolis-Hastings accept-reject move. 61 | 62 | Parameters 63 | ---------- 64 | energy_prev: theano vector 65 | Symbolic theano tensor which contains the energy associated with the 66 | configuration at time-step t. 67 | energy_next: theano vector 68 | Symbolic theano tensor which contains the energy associated with the 69 | proposed configuration at time-step t+1. 70 | s_rng: theano.tensor.shared_randomstreams.RandomStreams 71 | Theano shared random stream object used to generate the random number 72 | used in proposal. 73 | 74 | Returns 75 | ------- 76 | return: boolean 77 | True if move is accepted, False otherwise 78 | """ 79 | ediff = energy_prev - energy_next 80 | #ediff = theano.printing.Print('ediff')(ediff) 81 | return (TT.exp(ediff) - s_rng.uniform(size=energy_prev.shape)) >= 0 82 | 83 | 84 | def simulate_dynamics(initial_pos, initial_vel, stepsize, n_steps, energy_fn): 85 | """ 86 | Return final (position, velocity) obtained after an `n_steps` leapfrog 87 | updates, using Hamiltonian dynamics. 
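Each leapfrog step performs the standard update (see the inner leapfrog
function below):

    vel(t + eps/2) = vel(t - eps/2) - eps * dE/dpos(pos(t))
    pos(t + eps)   = pos(t) + eps * vel(t + eps/2)

with an extra half-step for the velocity at the start and at the end, so that
the returned positions and velocities are aligned in time.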
88 | 89 | Parameters 90 | ---------- 91 | initial_pos: shared theano matrix 92 | Initial position at which to start the simulation 93 | initial_vel: shared theano matrix 94 | Initial velocity of particles 95 | stepsize: shared theano scalar 96 | Scalar value controlling amount by which to move 97 | energy_fn: python function 98 | Python function, operating on symbolic theano variables, used to 99 | compute the potential energy at a given position. 100 | 101 | Returns 102 | ------- 103 | rval1: theano matrix 104 | Final positions obtained after simulation 105 | rval2: theano matrix 106 | Final velocity obtained after simulation 107 | """ 108 | 109 | def leapfrog(pos, vel, step): 110 | """ 111 | Inside loop of Scan. Performs one step of leapfrog update, using 112 | Hamiltonian dynamics. 113 | 114 | Parameters 115 | ---------- 116 | pos: theano matrix 117 | in leapfrog update equations, represents pos(t), position at time t 118 | vel: theano matrix 119 | in leapfrog update equations, represents vel(t - stepsize/2), 120 | velocity at time (t - stepsize/2) 121 | step: theano scalar 122 | scalar value controlling amount by which to move 123 | 124 | Returns 125 | ------- 126 | rval1: [theano matrix, theano matrix] 127 | Symbolic theano matrices for new position pos(t + stepsize), and 128 | velocity vel(t + stepsize/2) 129 | rval2: dictionary 130 | Dictionary of updates for the Scan Op 131 | """ 132 | # from pos(t) and vel(t-stepsize/2), compute vel(t+stepsize/2) 133 | dE_dpos = TT.grad(energy_fn(pos).sum(), pos) 134 | new_vel = vel - step * dE_dpos 135 | # from vel(t+stepsize/2) compute pos(t+stepsize) 136 | new_pos = pos + step * new_vel 137 | return [new_pos, new_vel], {} 138 | 139 | # compute velocity at time-step: t + stepsize/2 140 | initial_energy = energy_fn(initial_pos) 141 | dE_dpos = TT.grad(initial_energy.sum(), initial_pos) 142 | 143 | vel_half_step = initial_vel - 0.5 * stepsize * dE_dpos 144 | 145 | # compute position at time-step: t + stepsize 146 | pos_full_step = initial_pos + stepsize * vel_half_step 147 | 148 | # perform leapfrog updates: the scan op is used to repeatedly compute 149 | # vel(t + (m-1/2)*stepsize) and pos(t + m*stepsize) for m in [2,n_steps]. 150 | (all_pos, all_vel), scan_updates = theano.scan(leapfrog, 151 | outputs_info=[ 152 | dict(initial=pos_full_step), 153 | dict(initial=vel_half_step), 154 | ], 155 | non_sequences=[stepsize], 156 | n_steps=n_steps - 1) 157 | final_pos = all_pos[-1] 158 | final_vel = all_vel[-1] 159 | # NOTE: Scan always returns an updates dictionary, in case the 160 | # scanned function draws samples from a RandomStream. These 161 | # updates must then be used when compiling the Theano function, to 162 | # avoid drawing the same random numbers each time the function is 163 | # called. In this case however, we consciously ignore 164 | # "scan_updates" because we know it is empty. 165 | assert not scan_updates 166 | 167 | # The last velocity returned by scan is vel(t + 168 | # (n_steps - 1 / 2) * stepsize) We therefore perform one more half-step 169 | # to return vel(t + n_steps * stepsize) 170 | energy = energy_fn(final_pos) 171 | final_vel = final_vel - 0.5 * stepsize * TT.grad( 172 | energy.sum(), final_pos, 173 | consider_constant=[final_pos, final_vel]) 174 | 175 | # return new proposal state 176 | return final_pos, final_vel 177 | 178 | 179 | def hmc_move(s_rng, positions, energy_fn, stepsize, n_steps): 180 | """ 181 | This function performs one-step of Hybrid Monte-Carlo sampling. 
We start by 182 | sampling a random velocity from a univariate Gaussian distribution, perform 183 | `n_steps` leap-frog updates using Hamiltonian dynamics and accept-reject 184 | using Metropolis-Hastings. 185 | 186 | Parameters 187 | ---------- 188 | s_rng: theano shared random stream 189 | Symbolic random number generator used to draw random velocity and 190 | perform accept-reject move. 191 | positions: shared theano matrix 192 | Symbolic matrix whose rows are position vectors. 193 | energy_fn: python function 194 | Python function, operating on symbolic theano variables, used to 195 | compute the potential energy at a given position. 196 | stepsize: shared theano scalar 197 | Shared variable containing the stepsize to use for `n_steps` of HMC 198 | simulation steps. 199 | n_steps: integer 200 | Number of HMC steps to perform before proposing a new position. 201 | 202 | Returns 203 | ------- 204 | rval1: boolean 205 | True if move is accepted, False otherwise 206 | rval2: theano matrix 207 | Matrix whose rows contain the proposed "new position" 208 | """ 209 | 210 | # sample random velocity 211 | initial_vel = s_rng.normal(size=positions.shape) 212 | 213 | # perform simulation of particles subject to Hamiltonian dynamics 214 | final_pos, final_vel = simulate_dynamics( 215 | initial_pos=positions, 216 | initial_vel=initial_vel, 217 | stepsize=stepsize, 218 | n_steps=n_steps, 219 | energy_fn=energy_fn) 220 | 221 | # accept/reject the proposed move based on the joint distribution 222 | accept = metropolis_hastings_accept( 223 | energy_prev=hamiltonian(positions, initial_vel, energy_fn), 224 | energy_next=hamiltonian(final_pos, final_vel, energy_fn), 225 | s_rng=s_rng) 226 | 227 | return accept, final_pos 228 | 229 | 230 | def hmc_updates(positions, stepsize, avg_acceptance_rate, final_pos, accept, 231 | target_acceptance_rate, stepsize_inc, stepsize_dec, 232 | stepsize_min, stepsize_max, avg_acceptance_slowness): 233 | """This function is executed after `n_steps` of HMC sampling 234 | (`hmc_move` function). It creates the updates dictionary used by 235 | the `simulate` function. It takes care of updating: the position 236 | (if the move is accepted), the stepsize (to track a given target 237 | acceptance rate) and the average acceptance rate (computed as a 238 | moving average). 239 | 240 | Parameters 241 | ---------- 242 | positions: shared variable, theano matrix 243 | Shared theano matrix whose rows contain the old position 244 | stepsize: shared variable, theano scalar 245 | Shared theano scalar containing current step size 246 | avg_acceptance_rate: shared variable, theano scalar 247 | Shared theano scalar containing the current average acceptance rate 248 | final_pos: shared variable, theano matrix 249 | Shared theano matrix whose rows contain the new position 250 | accept: theano scalar 251 | Boolean-type variable representing whether or not the proposed HMC move 252 | should be accepted or not. 253 | target_acceptance_rate: float 254 | The stepsize is modified in order to track this target acceptance rate. 255 | stepsize_inc: float 256 | Amount by which to increment stepsize when acceptance rate is too high. 257 | stepsize_dec: float 258 | Amount by which to decrement stepsize when acceptance rate is too low. 259 | stepsize_min: float 260 | Lower-bound on `stepsize`. 261 | stepsize_min: float 262 | Upper-bound on `stepsize`. 263 | avg_acceptance_slowness: float 264 | Average acceptance rate is computed as an exponential moving average. 
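A sketch of that update, matching the code below:

    new_rate = avg_acceptance_slowness * avg_acceptance_rate
               + (1 - avg_acceptance_slowness) * accept.mean()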
265 | (1-avg_acceptance_slowness) is the weight given to the newest 266 | observation. 267 | 268 | Returns 269 | ------- 270 | rval1: dictionary-like 271 | A dictionary of updates to be used by the `HMC_Sampler.simulate` 272 | function. The updates target the position, stepsize and average 273 | acceptance rate. 274 | 275 | """ 276 | 277 | ## POSITION UPDATES ## 278 | # broadcast `accept` scalar to tensor with the same dimensions as 279 | # final_pos. 280 | accept_matrix = accept.dimshuffle(0, *(('x',) * (final_pos.ndim - 1))) 281 | # if accept is True, update to `final_pos` else stay put 282 | new_positions = TT.switch(accept_matrix, final_pos, positions) 283 | 284 | ## STEPSIZE UPDATES ## 285 | # if acceptance rate is too low, our sampler is too "noisy" and we reduce 286 | # the stepsize. If it is too high, our sampler is too conservative, we can 287 | # get away with a larger stepsize (resulting in better mixing). 288 | _new_stepsize = TT.switch(avg_acceptance_rate > target_acceptance_rate, 289 | stepsize * stepsize_inc, stepsize * stepsize_dec) 290 | # maintain stepsize in [stepsize_min, stepsize_max] 291 | new_stepsize = TT.clip(_new_stepsize, stepsize_min, stepsize_max) 292 | 293 | ## ACCEPT RATE UPDATES ## 294 | # perform exponential moving average 295 | mean_dtype = theano.scalar.upcast(accept.dtype, avg_acceptance_rate.dtype) 296 | new_acceptance_rate = TT.add( 297 | avg_acceptance_slowness * avg_acceptance_rate, 298 | (1.0 - avg_acceptance_slowness) * accept.mean(dtype=mean_dtype)) 299 | 300 | return [(positions, new_positions), 301 | (stepsize, new_stepsize), 302 | (avg_acceptance_rate, new_acceptance_rate)] 303 | 304 | 305 | class HMC_sampler(object): 306 | """ 307 | Convenience wrapper for performing Hybrid Monte Carlo (HMC). It creates the 308 | symbolic graph for performing an HMC simulation (using `hmc_move` and 309 | `hmc_updates`). The graph is then compiled into the `simulate` function, a 310 | theano function which runs the simulation and updates the required shared 311 | variables. 312 | 313 | Users should interface with the sampler thorugh the `draw` function which 314 | advances the markov chain and returns the current sample by calling 315 | `simulate` and `get_position` in sequence. 316 | 317 | The hyper-parameters are the same as those used by Marc'Aurelio's 318 | 'train_mcRBM.py' file (available on his personal home page). 319 | """ 320 | 321 | def __init__(self, **kwargs): 322 | self.__dict__.update(kwargs) 323 | 324 | @classmethod 325 | def new_from_shared_positions(cls, shared_positions, energy_fn, 326 | initial_stepsize=0.01, target_acceptance_rate=.9, n_steps=20, 327 | stepsize_dec=0.98, 328 | stepsize_min=0.001, 329 | stepsize_max=0.25, 330 | stepsize_inc=1.02, 331 | # used in geometric avg. 1.0 would be not moving at all 332 | avg_acceptance_slowness=0.9, 333 | s_rng=None): 334 | """ 335 | :param shared_positions: theano ndarray shared var with 336 | many particle [initial] positions 337 | 338 | :param energy_fn: 339 | callable such that energy_fn(positions) 340 | returns theano vector of energies. 341 | The len of this vector is the batch size. 342 | 343 | The sum of this energy vector must be differentiable (with 344 | theano.tensor.grad) with respect to the positions for HMC 345 | sampling to work. 
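A minimal usage sketch (n_dim and energy_fn here are placeholders):

    positions = sharedX(numpy.zeros((1, n_dim)), 'positions')
    sampler = HMC_sampler.new_from_shared_positions(positions, energy_fn)
    new_pos = sampler.draw()  # numpy array, one row per particle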
346 | 347 | """ 348 | 349 | # allocate shared variables 350 | stepsize = sharedX(initial_stepsize, 'hmc_stepsize') 351 | avg_acceptance_rate = sharedX(target_acceptance_rate, 352 | 'avg_acceptance_rate') 353 | if s_rng is None: 354 | s_rng = TT.shared_randomstreams.RandomStreams(12345) 355 | 356 | # define graph for an `n_steps` HMC simulation 357 | accept, final_pos = hmc_move( 358 | s_rng, 359 | shared_positions, 360 | energy_fn, 361 | stepsize, 362 | n_steps) 363 | 364 | # define the dictionary of updates, to apply on every `simulate` call 365 | simulate_updates = hmc_updates( 366 | shared_positions, 367 | stepsize, 368 | avg_acceptance_rate, 369 | final_pos=final_pos, 370 | accept=accept, 371 | stepsize_min=stepsize_min, 372 | stepsize_max=stepsize_max, 373 | stepsize_inc=stepsize_inc, 374 | stepsize_dec=stepsize_dec, 375 | target_acceptance_rate=target_acceptance_rate, 376 | avg_acceptance_slowness=avg_acceptance_slowness) 377 | 378 | # compile theano function 379 | simulate = function([], [], updates=simulate_updates) 380 | 381 | # create HMC_sampler object with the following attributes ... 382 | return cls( 383 | positions=shared_positions, 384 | stepsize=stepsize, 385 | stepsize_min=stepsize_min, 386 | stepsize_max=stepsize_max, 387 | avg_acceptance_rate=avg_acceptance_rate, 388 | target_acceptance_rate=target_acceptance_rate, 389 | s_rng=s_rng, 390 | _updates=simulate_updates, 391 | simulate=simulate) 392 | 393 | def draw(self, **kwargs): 394 | """ 395 | Returns a new position obtained after `n_steps` of HMC simulation. 396 | 397 | Parameters 398 | ---------- 399 | kwargs: dictionary 400 | The `kwargs` dictionary is passed to the shared variable 401 | (self.positions) `get_value()` function. For example, to avoid 402 | copying the shared variable value, consider passing `borrow=True`. 403 | 404 | Returns 405 | ------- 406 | rval: numpy matrix 407 | Numpy matrix whose of dimensions similar to `initial_position`. 408 | """ 409 | self.simulate() 410 | if self.stepsize.get_value() < 1.5 * self.stepsize_min: 411 | print 'WARN: HMC stepsize %f close to minimum of %f (acceptance %f)' %( 412 | self.stepsize.get_value(), self.stepsize_min, self.avg_acceptance_rate.get_value()) 413 | if self.stepsize.get_value() > .9 * self.stepsize_max: 414 | print 'WARN: HMC stepsize %f close to maximum of %f (acceptance %f)' %( 415 | self.stepsize.get_value(), self.stepsize_max, self.avg_acceptance_rate.get_value()) 416 | return self.positions.get_value(borrow=False) 417 | -------------------------------------------------------------------------------- /hp_gpsmbo/hpsuggest.py: -------------------------------------------------------------------------------- 1 | from itertools import groupby 2 | import numpy as np 3 | import scipy.optimize 4 | 5 | from hyperopt.pyll_utils import expr_to_config 6 | from hyperopt import pyll, STATUS_OK 7 | from hyperopt.algobase import SuggestAlgo 8 | 9 | from . 
import kernels 10 | 11 | 12 | def loss_variances(trials): 13 | return [r.get('loss_variance', 0) 14 | for r in trials.results if r['status'] == STATUS_OK] 15 | 16 | 17 | class SuggestBest(SuggestAlgo): 18 | def __init__(self, domain, trials, seed, best_pt): 19 | SuggestAlgo.__init__(self, domain, trials, seed) 20 | self.best_pt = best_pt 21 | 22 | def on_node_hyperparameter(self, memo, node, label): 23 | if label in self.best_pt: 24 | rval = [self.best_pt[label]] 25 | else: 26 | rval = [] 27 | return rval 28 | 29 | 30 | class ParamHelper(object): 31 | def __init__(self, config): 32 | self.config = config 33 | 34 | def __call__(self, name): 35 | node = self.config[name]['node'] 36 | conditional = self.config[name]['conditions'] != set([()]) 37 | rval = getattr(self, node.name)(node, conditional) 38 | return rval 39 | 40 | def randint(self, node, conditional): 41 | upper = int(node.arg['upper'].obj) 42 | def val_fn(feat): 43 | rval = np.asarray(feat).astype('int') 44 | if not np.allclose(rval, feat): 45 | print 'WARNING: optimizer gave randint val_fn a float' 46 | return rval 47 | 48 | if upper == 2: 49 | return { 50 | 'feature_bounds': (0, 1), 51 | 'kernel': kernels.Choice2(0.7, 1e-2, 2.0, conditional), 52 | 'ndim': 1, 53 | 'continuous': False, 54 | 'ordinal': False, 55 | 'feature_fn': np.asarray, 56 | 'val_fn': val_fn, 57 | } 58 | else: 59 | return { 60 | 'feature_bounds': (0, upper), 61 | 'kernel': kernels.ChoiceN(upper, conditional), 62 | 'ndim': 1, 63 | 'continuous': False, 64 | 'ordinal': False, 65 | 'feature_fn': np.asarray, 66 | 'val_fn': val_fn, 67 | } 68 | 69 | def categorical(self, node, conditional): 70 | # TODO: bias the choice somehow? 71 | return self.randint(node, conditional) 72 | 73 | def uniform(self, node, conditional, continuous=True, q=None): 74 | low = float(node.arg['low'].obj) 75 | high = float(node.arg['high'].obj) 76 | def val_fn(feat): 77 | rval = feat * (high - low) + low 78 | if q is not None: 79 | rval = np.round(rval / q) * q 80 | return rval 81 | return { 82 | 'feature_bounds': (0, 1), 83 | 'kernel': kernels.SqExp(0.7, 1e-6, 1.5, conditional), 84 | 'ndim': 1, 85 | 'continuous': continuous, 86 | 'ordinal': q is not None, 87 | 'feature_fn': (lambda val: (np.asarray(val) - low) / (high - low)), 88 | 'val_fn': val_fn, 89 | } 90 | 91 | def quniform(self, node, conditional): 92 | q = float(node.arg['q'].obj) 93 | return self.uniform(node, conditional, continuous=False, q=q) 94 | 95 | def loguniform(self, node, conditional, continuous=True, q=None): 96 | # -- log-scaling has been handled by feature code 97 | #val = np.exp(featureval) - self.logquantized_feature_epsilon 98 | low = float(node.arg['low'].obj) 99 | high = float(node.arg['high'].obj) 100 | def val_fn(feat): 101 | rval = np.exp(feat * (high - low) + low) 102 | if q is not None: 103 | rval = np.round(rval / q) * q 104 | return rval 105 | return { 106 | 'feature_bounds': (0, 1), 107 | 'kernel': kernels.SqExp(0.7, 1e-6, 1.5, conditional), 108 | 'ndim': 1, 109 | 'continuous': continuous, 110 | 'ordinal': q is not None, 111 | 'feature_fn': (lambda val: (np.log(val) - low) / (high - low)), 112 | 'val_fn': val_fn, 113 | } 114 | 115 | def qloguniform(self, node, conditional): 116 | q = float(node.arg['q'].obj) 117 | return self.loguniform(node, conditional, continuous=False, q=q) 118 | 119 | def normal(self, node, conditional, continuous=True, q=None): 120 | sigma = float(node.arg['sigma'].obj) 121 | mu = float(node.arg['mu'].obj) 122 | def val_fn(feat): 123 | rval = feat * sigma + mu 124 | if q is not None: 125 
| rval = np.round(rval / q) * q 126 | return rval 127 | return { 128 | 'feature_bounds': (-10, 10), 129 | 'kernel': kernels.SqExp(0.7, 1e-6, 1.5, conditional), 130 | 'ndim': 1, 131 | 'continuous': continuous, 132 | 'ordinal': q is not None, 133 | 'feature_fn': (lambda val: (np.asarray(val) - mu) / sigma), 134 | 'val_fn': val_fn, 135 | } 136 | 137 | def qnormal(self, node, conditional): 138 | q = float(node.arg['q'].obj) 139 | return self.normal(node, conditional, continuous=False, q=q) 140 | 141 | def lognormal(self, node, conditional, continuous=True, q=None): 142 | sigma = float(node.arg['sigma'].obj) 143 | mu = float(node.arg['mu'].obj) 144 | def val_fn(feat): 145 | rval = np.exp(feat * sigma + mu) 146 | if q is not None: 147 | rval = np.round(rval / q) * q 148 | return rval 149 | return { 150 | 'feature_bounds': (-10, 10), 151 | 'kernel': kernels.SqExp(0.7, 1e-6, 1.5, conditional), 152 | 'ndim': 1, 153 | 'continuous': continuous, 154 | 'ordinal': q is not None, 155 | 'feature_fn': (lambda val: (np.log(val) - mu) / sigma), 156 | 'val_fn': val_fn, 157 | } 158 | 159 | def qlognormal(self, node, conditional): 160 | q = float(node.arg['q'].obj) 161 | return self.normal(node, conditional, continuous=False, q=q) 162 | 163 | 164 | class DomainGP(object): 165 | logquantized_feature_epsilon = 1e-3 166 | 167 | def __init__(self, domain, GPR=None): 168 | self.domain = domain 169 | 170 | # -- hps: list of hyperparameter names 171 | self.hps = list(sorted(domain.params.keys())) 172 | 173 | # -- config: type and dependency information keyed by hp name 174 | self.config = {} 175 | expr_to_config(domain.expr, None, self.config) 176 | 177 | if GPR is None: 178 | GPR = self.GPR # -- class variable 179 | 180 | kerns, self.hp_slices, self.x_bounds = self.init_param_helpers() 181 | self.gpr = GPR(kernels.product(kerns, self.hp_slices)) 182 | #kern = self.compress_product(kerns, slices) 183 | #self.gpr = GPR(kern) 184 | 185 | def init_param_helpers(self): 186 | # -- called early in constructor before most attributes have been set 187 | kerns = [] 188 | slices = [] 189 | x_bounds = [] 190 | param_helper = ParamHelper(self.config) 191 | self.param_helpers = {} 192 | ndim_offset = 0 193 | for hpname in self.hps: 194 | ph = self.param_helpers[hpname] = param_helper(hpname) 195 | 196 | kerns.append(ph['kernel']) 197 | 198 | # slices are for index into featurevec 199 | ph['feature_slice'] = slice(ndim_offset, ndim_offset + ph['ndim']) 200 | slices.append(ph['feature_slice']) 201 | ndim_offset += ph['ndim'] 202 | 203 | x_bounds.append(ph['feature_bounds']) 204 | 205 | return kerns, slices, np.asarray(x_bounds) 206 | 207 | def draw_n_feature_vecs(self, N, rng): 208 | fake_ids = range(N) 209 | idxs, vals = pyll.rec_eval( 210 | self.domain.s_idxs_vals, 211 | memo={ 212 | self.domain.s_new_ids: fake_ids, 213 | self.domain.s_rng: rng, 214 | }) 215 | return self.features_from_idxs_vals(fake_ids, idxs, vals) 216 | 217 | def features_from_idxs_vals(self, ids, idxs, vals): 218 | columns = [] 219 | if not np.allclose(ids, np.arange(len(ids))): 220 | # -- indexing below is a little more complicated, due 221 | # to another step of indirection 222 | raise NotImplementedError('non-contiguous target ids') 223 | for hpname in self.hps: 224 | cX = self.param_helpers[hpname]['feature_fn'](vals[hpname]) 225 | if cX.ndim < 2: 226 | cX.shape = (len(cX), 1) 227 | assert cX.ndim == 2 228 | assert cX.shape[1] == self.param_helpers[hpname]['ndim'] 229 | cc = np.empty((len(ids), cX.shape[1])) + np.nan 230 | cc[idxs[hpname]] = cX 231 | 
columns.append(cc) 232 | return np.hstack(columns) 233 | 234 | def best_pt_from_featurevec(self, featurevec): 235 | best_pt = {} 236 | for hpname in self.hps: 237 | ph = self.param_helpers[hpname] 238 | feat = featurevec[ph['feature_slice']] 239 | if not np.isnan(np.sum(feat)): 240 | assert len(feat) == 1 241 | best_pt[hpname] = ph['val_fn'](feat[0]) 242 | return best_pt 243 | 244 | def _X_y_var_y(self, trials, failure_loss=None): 245 | all_tids = trials.tids 246 | all_idxs, all_vals = trials.idxs_vals 247 | X = self.features_from_idxs_vals(all_tids, all_idxs, all_vals) 248 | def loss(tr): 249 | if tr['result']['status'] == 'ok': 250 | return ( 251 | float(tr['result']['loss']), 252 | float(tr['result'].get('loss_variance', 0))) 253 | else: # TODO in-fill prediction for in-prog jobs? 254 | return float(failure_loss), 0 255 | y, var_y = zip(*map(loss, trials.trials)) 256 | #y = trials.losses() 257 | #var_y = loss_variances(trials) 258 | assert len(y) == len(X) == len(var_y) 259 | return X, y, var_y 260 | 261 | def fit_gpr(self, X, y, var_y, method='ml2'): 262 | assert X.shape[1] == len(self.hps) 263 | if method == 'ml2': 264 | self.gpr.fit_ml2(X, y, var_y=var_y) 265 | elif method == 'hmc': 266 | self.gpr.fit_hmc(X, y, var_y=var_y) 267 | else: 268 | raise NotImplementedError(method) 269 | 270 | def optimize_over_X_finetune(self, vec): 271 | vec_is_nan = np.isnan(vec) 272 | 273 | vec0 = vec.copy() 274 | vec0[vec_is_nan] = 0 275 | 276 | to_opt = np.ones_like(vec) 277 | to_opt[vec_is_nan] = 0 278 | for kslice, hpname in zip(self.hp_slices, self.hps): 279 | ph = self.param_helpers[hpname] 280 | if not (ph['continuous'] or ph['ordinal']): 281 | to_opt[kslice] = 0 282 | q_filter = np.ones_like(vec) 283 | 284 | def f_df(_x): 285 | x = np.clip(_x, self.x_bounds[:, 0], self.x_bounds[:, 1]) 286 | if not np.allclose(x, _x): 287 | print 'x clipped', abs(x - _x) 288 | x[vec_is_nan] = np.nan 289 | f, df = self.crit_deriv(np.atleast_2d(x)) 290 | assert len(f) == len(df) == 1 291 | f = f[0] 292 | df = df[0] 293 | assert len(self.hps) == len(self.hp_slices) 294 | #print 'OPTIMIZE_IN_X: f_df', f, df 295 | 296 | # -- don't fine-tune the discrete variables 297 | # TODO: don't even compute the gradient in the first place 298 | #for ii, (kslice, hpname) in enumerate(zip(self.hp_slices, self.hps)): 299 | # ph = self.param_helpers[hpname] 300 | # print ' %40s\t%.3f\t%20s\t%.3f\t%8s\t%8s' % ( 301 | # hpname, _x[ii], kslice, df[ii], ph['continuous'], ph['q']) 302 | 303 | assert np.all(np.isfinite(df)) 304 | mask = to_opt * q_filter 305 | df[mask == 0] = 0 306 | assert np.all(np.isfinite(df)) 307 | assert np.all(np.isfinite(f)) 308 | return f, df 309 | 310 | #print 'OPTIMIZE_IN_X start', vec0 311 | print 'Info: optimizing', (to_opt * q_filter).sum(), 'vars' 312 | res = scipy.optimize.minimize( 313 | fun=f_df, 314 | x0=vec0, 315 | jac=True, # -- means f returns cost and jacobian 316 | method='L-BFGS-B', 317 | #method='SLSQP', 318 | tol=1e-10, # XXX delete this after validating file 319 | #options={} if self.maxiter is None else ( 320 | #{'maxiter': self.maxiter,}), 321 | bounds=self.x_bounds, 322 | ) 323 | #print 'OPTIMIZE_IN_X done', res 324 | res.x = np.clip(res.x, self.x_bounds[:, 0], self.x_bounds[:, 1]) 325 | assert np.all(np.isfinite(res.x)) 326 | 327 | for kslice, hpname in zip(self.hp_slices, self.hps): 328 | ph = self.param_helpers[hpname] 329 | if ph['ordinal']: 330 | # -- round quantized variables to nearest valid value 331 | res.x[kslice] = ph['feature_fn'](ph['val_fn'](res.x[kslice])) 332 | # -- mask out 
derivatives from here on 333 | q_filter[kslice] = 0 334 | 335 | # -- maybe reoptimize with quantized variables frozen 336 | if (to_opt * q_filter).sum(): 337 | print 'Info: reoptimizing', (to_opt * q_filter).sum(), 'vars' 338 | res2 = scipy.optimize.minimize( 339 | fun=f_df, 340 | x0=res.x, 341 | jac=True, # -- means f returns cost and jacobian 342 | method='L-BFGS-B', 343 | #method='SLSQP', 344 | tol=1e-10, # XXX delete this after validating file 345 | #options={} if self.maxiter is None else ( 346 | #{'maxiter': self.maxiter,}), 347 | bounds=self.x_bounds, 348 | ) 349 | else: 350 | print 'Info: skipping reoptimization step' 351 | res2 = res 352 | assert np.all(np.isfinite(res2.x)) 353 | #print 'OPTIMIZE_IN_X done', res 354 | res2.x = np.clip(res2.x, self.x_bounds[:, 0], self.x_bounds[:, 1]) 355 | res2.x[vec_is_nan] = np.nan 356 | return res2 357 | 358 | def optimize_over_X(self, n_buckshots, n_finetunes, rng, ret_raw=False, 359 | ret_results=False): 360 | # -- sample a bunch of points 361 | buckshot = self.draw_n_feature_vecs(n_buckshots, rng) 362 | buckshot_crit = self.crit(buckshot) 363 | best_first = np.argsort(buckshot_crit) 364 | #print 'buckshot stats', buckshot_crit.min(), buckshot_crit.max() 365 | 366 | # -- finetune a few of the best by gradient descent 367 | results = [ 368 | (buckshot_crit[best_first[0]], 369 | -1, 370 | buckshot[best_first[0]].copy(), 371 | buckshot_crit[best_first[0]], 372 | )] 373 | if self._cost_deriv is not None: 374 | misc_step = int(n_buckshots / (.5 * n_finetunes)) 375 | misc = best_first[n_finetunes::misc_step] 376 | top_best = best_first[:n_finetunes - len(misc)] 377 | to_finetune = list(misc) + list(top_best) 378 | assert len(to_finetune) <= n_finetunes 379 | for ii in range(n_finetunes): 380 | vec = buckshot[to_finetune[ii]] 381 | res = self.optimize_over_X_finetune(vec) 382 | results.append((res.fun, ii, res.x.copy(), 383 | buckshot_crit[to_finetune[ii]])) 384 | results.sort() 385 | if results[0][1] == -1: 386 | print 'Warning: finetuning did no good' 387 | print 'optimize_X', results[0] 388 | if ret_results: 389 | return results 390 | if ret_raw: 391 | return results[0][2] 392 | else: 393 | # -- return the best one 394 | best_pt = self.best_pt_from_featurevec(results[0][2]) 395 | return best_pt 396 | 397 | # -- flake-8 abhors blank line EOF 398 | -------------------------------------------------------------------------------- /hp_gpsmbo/hpsuggest_ei.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import theano.tensor 4 | 5 | from hyperopt import rand 6 | 7 | from .hpsuggest import SuggestBest, DomainGP 8 | from . import gpr_math 9 | from . 
import op_Kcond 10 | from .gpr import GPR_HMC 11 | 12 | class DomainGP_EI(DomainGP): 13 | _EI_thresh_increment = 0.1 14 | _min_thresh_inc = 0 15 | GPR = GPR_HMC 16 | 17 | def init_fns(self): 18 | try: 19 | self._cost_deriv 20 | except AttributeError: 21 | s_thresh = theano.tensor.dscalar('thresh') 22 | s_reuse_cholesky = theano.tensor.iscalar('reuse_cholesky') 23 | s_reuse_cholesky_idx = theano.tensor.iscalar('reuse_cholesky_idx') 24 | 25 | s_mean_x, s_var_x, s_x, K_new = self.gpr.kernel.s_mean_var( 26 | self.gpr.s_X, 27 | self.gpr.s_y, 28 | self.gpr.s_var_y, 29 | self.gpr.s_emp_var, 30 | self.gpr.s_params, 31 | self.gpr.s_var_min, 32 | return_K_new=True) 33 | s_logEI = gpr_math.s_normal_logEI( 34 | -s_thresh, 35 | -(s_mean_x + self.gpr.s_emp_mean), 36 | s_var_x, 37 | quad_approx=True) 38 | cost = -s_logEI 39 | 40 | assert cost.ndim == 1 41 | s_gx = theano.tensor.grad(cost.sum(), s_x) 42 | 43 | # -- this hack makes it so that the s_reuse_cholesky 44 | # variable is patched in to the graph during optimization 45 | # and allows to disable the computation of training 46 | # K matrix and it's cholesky factorization 47 | op_Kcond.use_lazy_cholesky = s_reuse_cholesky 48 | op_Kcond.use_lazy_cholesky_idx = s_reuse_cholesky_idx 49 | self._cost_deriv = theano.function( 50 | [s_x, s_thresh, self.gpr.s_params, 51 | s_reuse_cholesky, s_reuse_cholesky_idx], 52 | [cost, s_gx], 53 | on_unused_input='ignore', 54 | allow_input_downcast=True, 55 | profile=0) 56 | op_Kcond.use_lazy_cholesky = None 57 | op_Kcond.use_lazy_cholesky_idx = None 58 | 59 | if 1: 60 | # /begin hack sanity checking 61 | #import pdb; pdb.set_trace() 62 | n_cholesky = 0 63 | n_lazy_cholesky = 0 64 | for node in self._cost_deriv.maker.fgraph.toposort(): 65 | #print node 66 | if isinstance(node.op, 67 | theano.sandbox.linalg.ops.Solve): 68 | assert node.op.A_structure != 'general' 69 | if isinstance(node.op, 70 | theano.sandbox.linalg.ops.Cholesky): 71 | n_cholesky += 1 72 | if isinstance(node.op, op_Kcond.LazyCholesky): 73 | n_lazy_cholesky += 1 74 | assert n_cholesky == 0 75 | assert n_lazy_cholesky == 1 76 | # /end hack sanity checking 77 | 78 | self._cost = theano.function( 79 | [s_x, s_thresh, self.gpr.s_params], 80 | cost, 81 | allow_input_downcast=True) 82 | self._K_new = theano.function( 83 | [s_x, self.gpr.s_params], K_new) 84 | return self._cost_deriv 85 | 86 | def set_thresholds(self, y, var_y, z=1.0, max_ei_thresh=None): 87 | yy = y - z * np.sqrt(np.maximum(var_y, 88 | max( 89 | self.gpr.s_var_min.eval(), 90 | self._min_thresh_inc ** 2))) 91 | if max_ei_thresh is not None: 92 | self._EI_thresh = min(np.min(yy), max_ei_thresh) 93 | else: 94 | self._EI_thresh = np.min(yy) 95 | 96 | def crit(self, X): 97 | self.init_fns() 98 | #return -self.gpr.logEI(X, 99 | #self._EI_thresh, 100 | #direction=-1, # below thresh 101 | #quad_approx=True) 102 | gpr = self.gpr 103 | fs = [] 104 | for pp in gpr._params_list: 105 | f = self._cost(np.atleast_2d(X), 106 | self._EI_thresh, 107 | pp) 108 | fs.append(f) 109 | mean_f = np.dot(gpr._params_weights, fs) 110 | return mean_f 111 | 112 | def crit_deriv(self, X): 113 | self.init_fns() 114 | gpr = self.gpr 115 | fs = [] 116 | dfs = [] 117 | for ii, pp in enumerate(gpr._params_list): 118 | #print 'pp', pp, 'x', X 119 | f, df = self._cost_deriv(np.atleast_2d(X), 120 | self._EI_thresh, 121 | pp, 122 | self._cost_deriv_reuse_cholesky, 123 | ii) 124 | assert f.shape == (1,), (f.shape, X.shape) 125 | fs.append(f[0]) 126 | dfs.append(df.flatten()) 127 | self._cost_deriv_reuse_cholesky = 1 128 | mean_f = 
np.dot(gpr._params_weights, fs) 129 | #import pdb; pdb.set_trace() 130 | mean_df = np.dot(gpr._params_weights, np.asarray(dfs)) 131 | return [mean_f], [mean_df] 132 | 133 | def optimize_over_X(self, n_buckshots, n_finetunes, rng): 134 | while True: 135 | rval_raw = DomainGP.optimize_over_X(self, 136 | n_buckshots, 137 | n_finetunes, 138 | rng, 139 | ret_raw=True) 140 | if len(self.gpr._params_list) == 1: 141 | Ks = self._K_new(np.atleast_2d(rval_raw), 142 | self.gpr._params_list[0]) 143 | if (Ks.max() > (1 - 1e-6)): 144 | # -- promote exploration with a more aggressive threshold 145 | self._EI_thresh -= self._EI_thresh_increment 146 | print 'lowering EI thresh to', self._EI_thresh 147 | else: 148 | break 149 | else: 150 | break 151 | best_pt = self.best_pt_from_featurevec(rval_raw) 152 | return best_pt 153 | 154 | 155 | _suggest_domain_cache = {} 156 | def suggest(new_ids, domain, trials, seed, 157 | warmup_cutoff=15, # -- enough for mean & var stats 158 | n_buckshots=10000, 159 | n_finetunes=50, 160 | stop_at=None, 161 | plot_contours=None, 162 | gp_fit_method='ml2', 163 | failure_loss=None, 164 | max_ei_thresh=None, 165 | ): 166 | """ 167 | Parameters 168 | ---------- 169 | 170 | """ 171 | # XXX would like to cache on domain, but 172 | # fmin(fn, space) always rebuilds a new domain for given fn and space 173 | key = domain.expr 174 | try: 175 | dgp = _suggest_domain_cache[key] 176 | except KeyError: 177 | print 'CREATING GP_EI for', domain 178 | dgp = _suggest_domain_cache[key] = DomainGP_EI(domain) 179 | if len(trials.trials): 180 | X, y, var_y = dgp._X_y_var_y(trials, failure_loss=failure_loss) 181 | 182 | if len(trials.trials) <= warmup_cutoff: 183 | if len(trials.trials): 184 | dgp.gpr.prior_mean = np.mean(y) 185 | dgp.gpr.prior_var = np.var(y) 186 | return rand.suggest(new_ids, domain, trials, seed) 187 | 188 | if stop_at is not None and min(trials.losses()) < stop_at: 189 | return [] 190 | 191 | dgp.fit_gpr(X, y, var_y, method=gp_fit_method) 192 | dgp.set_thresholds(y, var_y, max_ei_thresh=max_ei_thresh) 193 | dgp._cost_deriv_reuse_cholesky = 0 194 | 195 | print 'EI: Best after %i trials: %f' % ( len(y), np.min(y)) 196 | #dgp.gpr._params_list[0][:] = 0 197 | rng = np.random.RandomState(seed) 198 | t0 = time.time() 199 | best_pt = dgp.optimize_over_X( 200 | n_buckshots=n_buckshots, 201 | n_finetunes=n_finetunes, 202 | rng=rng, 203 | ) 204 | t1 = time.time() 205 | print 'optimizing surrogate took', (t1 - t0) 206 | if plot_contours: 207 | plot_contours(dgp, 2, dgp._lbound, best_pt) 208 | new_id, = new_ids 209 | #print 'REI: Best pt', best_pt 210 | return SuggestBest(domain, trials, seed, best_pt)(new_id) 211 | 212 | # --eof 213 | -------------------------------------------------------------------------------- /hp_gpsmbo/hpsuggest_lucb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyperopt/hyperopt-gpsmbo/8009f82a18620b33faecca2382f973bc214bd88c/hp_gpsmbo/hpsuggest_lucb.py -------------------------------------------------------------------------------- /hp_gpsmbo/hpsuggest_ucb.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano.tensor 3 | from hyperopt import rand 4 | 5 | from .hpsuggest import SuggestBest, DomainGP 6 | from .gpr import GPR_ML2 7 | 8 | 9 | class DomainGP_UCB(DomainGP): 10 | GPR = GPR_ML2 11 | 12 | def init_cost_fns(self): 13 | try: 14 | self._cost_fn 15 | except AttributeError: 16 | s_ucb_z = theano.tensor.dscalar('ucb_z') 
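            # -- a sketch of the acquisition criterion built below:
            #        cost(x) = mean(x) - ucb_z * sqrt(var(x))
            #    minimizing it trades off a low predicted mean against a
            #    large predictive variance; raising ucb_z (as optimize_over_X
            #    does when no acceptable point is found) favours exploration.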
17 | 18 | s_mean_x, s_var_x, s_x, K_new = self.gpr.kernel.s_mean_var( 19 | self.gpr.s_X, 20 | self.gpr.s_y, 21 | self.gpr.s_var_y, 22 | self.gpr.s_emp_var, 23 | self.gpr.s_params, 24 | self.gpr.s_var_min, 25 | return_K_new=True) 26 | s_cost = s_mean_x - theano.tensor.sqrt(s_var_x) * s_ucb_z 27 | 28 | s_gx = theano.tensor.grad(s_cost.sum(), s_x) 29 | self._cost_fn = theano.function( 30 | [s_x, s_ucb_z, self.gpr.s_params], s_cost) 31 | self._cost_deriv = theano.function( 32 | [s_x, s_ucb_z, self.gpr.s_params], [s_cost, s_gx]) 33 | self._K_new = theano.function( 34 | [s_x, self.gpr.s_params], K_new) 35 | 36 | 37 | def crit(self, X): 38 | self.init_cost_fns() 39 | if len(self.gpr._params_list) > 1: 40 | raise NotImplementedError() 41 | pp, = self.gpr._params_list 42 | return self._cost_fn(X, self._ucb_z, pp) 43 | 44 | def crit_deriv(self, X): 45 | self.init_cost_fns() 46 | if len(self.gpr._params_list) > 1: 47 | raise NotImplementedError() 48 | pp, = self.gpr._params_list 49 | return self._cost_deriv(X, self._ucb_z, pp) 50 | 51 | def optimize_over_X(self, n_buckshots, n_finetunes, rng): 52 | best_pt = None 53 | while True: 54 | results = DomainGP.optimize_over_X(self, n_buckshots, 55 | n_finetunes, rng, ret_results=True) 56 | Ks = self._K_new(np.asarray([rr[2] for rr in results]), 57 | self.gpr._params_list[0]).T 58 | #order = rng.permutation(len(results)) 59 | order = range(len(results)) 60 | assert len(Ks) == len(results) 61 | for ii in order: 62 | #for Ki, rr in zip(Ks, results): 63 | Ki = Ks[ii] 64 | rr = results[ii] 65 | if Ki.max() > self._K_thresh: 66 | #print 'UCB: skipping pt wit h K', Ki.max() 67 | continue 68 | else: 69 | #print 'UCB: picking pt wit h K', Ki.max() 70 | best_pt = self.best_pt_from_featurevec(rr[2]) 71 | break 72 | if best_pt is None: 73 | self._ucb_z *= 2 + .1 74 | print 'UCB: raising ucb_z to', self._ucb_z 75 | else: 76 | break 77 | #best_pt = self.best_pt_from_featurevec(rval_raw) 78 | return best_pt 79 | 80 | 81 | _suggest_domain_cache = {} 82 | def suggest(new_ids, domain, trials, seed, 83 | warmup_cutoff=15, 84 | n_buckshots=10000, 85 | n_finetunes=50, 86 | stop_at=None, 87 | plot_contours=None, 88 | ): 89 | """ 90 | Parameters 91 | ---------- 92 | 93 | """ 94 | if len(trials.trials) <= warmup_cutoff: 95 | return rand.suggest(new_ids, domain, trials, seed) 96 | 97 | # XXX would like to cache on domain, but 98 | # fmin(fn, space) always rebuilds a new domain for given fn and space 99 | key = domain.expr 100 | try: 101 | dgp = _suggest_domain_cache[key] 102 | except KeyError: 103 | dgp = _suggest_domain_cache[key] = DomainGP_UCB(domain) 104 | 105 | if stop_at is not None and min(trials.losses()) < stop_at: 106 | return [] 107 | 108 | X, y, var_y = dgp._X_y_var_y(trials) 109 | dgp.fit_gpr(X, y, var_y) 110 | print 'Fit ->', dgp.gpr._params_list[0] 111 | dgp._ucb_z = 0.2 112 | # XXX: radius should depend on dimensionality? 
113 | # 1e-8 worked for branin in case current one doesn't 114 | dgp._K_thresh = (1 - 1e-5) # / (1000 + len(y) ** 2)) 115 | 116 | print 'UCB: Best after %i trials: %f' % ( len(y), np.min(y)) 117 | #dgp.gpr._params_list[0][:] = 0 118 | rng = np.random.RandomState(seed) 119 | best_pt = dgp.optimize_over_X( 120 | n_buckshots=n_buckshots, 121 | n_finetunes=n_finetunes, 122 | rng=rng, 123 | ) 124 | if plot_contours: 125 | plot_contours(dgp, 2, dgp._lbound, best_pt) 126 | new_id, = new_ids 127 | #print 'REI: Best pt', best_pt 128 | return SuggestBest(domain, trials, seed, best_pt)(new_id) 129 | # --eof 130 | -------------------------------------------------------------------------------- /hp_gpsmbo/kernels.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as TT 4 | 5 | from .op_Kcond import zero_diag, isnan as s_isnan 6 | from .kernels_base import Kernel 7 | from .kernels_base import euclidean_sq_distances 8 | 9 | from .prodkernels import SqExpProd 10 | 11 | 12 | def check_K(K, tag=None): 13 | return K 14 | import scipy.linalg 15 | def check_pd(op, xin): 16 | try: 17 | scipy.linalg.cholesky(xin + 1e-12 * np.eye(xin.shape[0])) 18 | except: 19 | print 'tag', tag 20 | theano.printing.debugprint(K) 21 | raise 22 | return theano.printing.Print('check_K', global_fn=check_pd)(K) 23 | 24 | 25 | def check_finite(K, tag=None): 26 | return K 27 | def check(op, xin): 28 | try: 29 | assert np.all(np.isfinite(xin)) 30 | except: 31 | print 'tag', tag 32 | theano.printing.debugprint(K) 33 | raise 34 | return theano.printing.Print('check_finite', global_fn=check)(K) 35 | 36 | 37 | class ChoiceN(Kernel): 38 | def __init__(self, upper, conditional, seed=1): 39 | # N.B. seed should not need to be changed 40 | 41 | # -- XXX only need upper-triangle worth of values 42 | # but need Theano triangle-packing op (already exists??) 
43 | #self.n_params = upper * (upper - 1) / 2 44 | self.n_idxs = (upper + 1) if conditional else upper 45 | self.n_params = self.n_idxs ** 2 46 | 47 | self.seed = seed 48 | self.conditional = conditional 49 | 50 | def prodkey(self): 51 | return id(self) # -- choices are not mergeable 52 | 53 | def reduce_lenscale(self, params): 54 | # No-op 55 | return params 56 | 57 | def s_logprior(self, params, strength=10.0): 58 | P_shaped = params.reshape((self.n_idxs, self.n_idxs)) 59 | P_norms = TT.sqrt((P_shaped ** 2).sum(axis=1)) 60 | return strength * ((P_norms - 1) ** 2).sum() 61 | 62 | def unit(self, params): 63 | P_shaped = params.reshape((self.n_idxs, self.n_idxs)) 64 | P_norms = TT.sqrt((P_shaped ** 2).sum(axis=1)) 65 | P_unit = P_shaped / P_norms[:, None] 66 | return P_unit 67 | 68 | def opt_logK(self, x, params): 69 | if self.conditional: 70 | s_x = TT.switch(TT.isnan(x), self.n_idxs - 1, x) 71 | else: 72 | s_x = x 73 | #s_x = theano.printing.Print('x')(s_x) 74 | lbound = 1e-5 75 | ubound = 1.0 76 | params0 = np.random.RandomState(self.seed).uniform( 77 | low=lbound, 78 | high=ubound, 79 | size=(self.n_idxs, self.n_idxs)) 80 | P_unit = self.unit(params) 81 | idxs = s_x.flatten().astype('int32') 82 | #def wtf(node, val): 83 | # print 'IDXS', val 84 | # print 'SELF', self.n_idxs, self.conditional 85 | # return val 86 | 87 | #idxs = theano.printing.Print('idxs', global_fn=wtf)(idxs) 88 | K = TT.dot(P_unit[idxs], P_unit[idxs].T) 89 | #K = K + 1e-12 * TT.eye(x.shape[0]) 90 | bounds = [(lbound, ubound)] * self.n_params 91 | return TT.log(K), list(params0.flatten()), bounds 92 | 93 | def predict_logK(self, x, z, params): 94 | if self.conditional: 95 | s_x = TT.switch(TT.isnan(x), self.n_idxs - 1, x) 96 | s_z = TT.switch(TT.isnan(z), self.n_idxs - 1, z) 97 | else: 98 | s_x = x 99 | s_z = z 100 | P_unit = self.unit(params) 101 | K = TT.dot(P_unit[s_x.flatten().astype('int32')], 102 | P_unit[s_x.flatten().astype('int32')].T) 103 | #K_reg = K + 1e-12 * TT.eye(x.shape[0]) 104 | K_new = TT.dot(P_unit[s_x.flatten().astype('int32')], 105 | P_unit[s_z.flatten().astype('int32')].T) 106 | return TT.log(K), TT.log(K_new) 107 | 108 | 109 | class StationaryBase(Kernel): 110 | """ 111 | 112 | K(x,y) = exp(- ||x-y||^2 / (2 l^2)) 113 | 114 | N.B. the kernel is parameterized by quantity 115 | 116 | alpha = log( 2 * l^2) 117 | 118 | So that 119 | 120 | K(x, y) = exp(- ||x - y|| ** 2 / exp(alpha)) 121 | l = sqrt(exp(alpha) / 2) 122 | 123 | 124 | """ 125 | 126 | @staticmethod 127 | def _alpha_from_l(l): 128 | return np.log(2.0 * l ** 2) 129 | 130 | @staticmethod 131 | def _l_from_alpha(alpha): 132 | return np.sqrt(np.exp(alpha) / 2.) 
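    # -- illustrative round-trip of the parameterization documented above
    #    (values are examples only):
    #        l = 0.7
    #        alpha = StationaryBase._alpha_from_l(l)   # log(2 * 0.49) ~= -0.020
    #        StationaryBase._l_from_alpha(alpha)       # ~= 0.7 again
    #    and the kernel value at distance d is
    #        exp(-d**2 / exp(alpha)) = exp(-d**2 / (2 * l**2))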
133 | 134 | def __init__(self, lenscale, lenscale_min, lenscale_max, conditional): 135 | self._lenscale0 = lenscale 136 | self._lenscale_min = lenscale_min 137 | self._lenscale_max = lenscale_max 138 | self._conditional = conditional 139 | self._n_warp_segments = 0 140 | if conditional: 141 | self.n_params = 3 + self._n_warp_segments 142 | else: 143 | self.n_params = 1 + self._n_warp_segments 144 | 145 | def prodkey(self): 146 | # -- unique identifier of mergeable product sets 147 | return (type(self), 148 | self._conditional, 149 | self._n_warp_segments) 150 | 151 | def props(self): 152 | return ( 153 | self._lenscale0, 154 | self._lenscale_min, 155 | self._lenscale_max, 156 | self._conditional, 157 | self._n_warp_segments, 158 | ) 159 | 160 | def __eq__(self, other): 161 | return type(self) == type(other) and self.props() == other.props() 162 | 163 | def __hash__(self): 164 | return hash((type(self), self.props())) 165 | 166 | def reduce_lenscale(self, params): 167 | new_alpha = params[0] - 1 168 | new_l = max(self._lenscale_min, self._l_from_alpha(new_alpha)) 169 | rval = list(params) 170 | rval[0] = self._alpha_from_l(new_l) 171 | return rval 172 | 173 | def s_logprior(self, params, strength=10.0): 174 | # -- I don't know what distribution this would be 175 | # but I think it makes a nice shape 176 | alpha = params[0] 177 | alpha_min = self._alpha_from_l(self._lenscale_min) 178 | alpha_max = self._alpha_from_l(self._lenscale_max) 179 | #return strength * (alpha - alpha_min) ** 2 180 | log0 = -10000 181 | width = alpha_max - alpha_min 182 | #alpha_mean = 0.5 * (alpha_max + alpha_min) 183 | energy = strength * 0.5 * (alpha - alpha_max) ** 2 / width ** 2 184 | lenscale_logprior = TT.switch(alpha < alpha_min, 185 | log0, 186 | TT.switch(alpha < alpha_max, 187 | -energy, 188 | log0)) 189 | if self._conditional: 190 | diff = params[1:3] - np.asarray([0, 1]) 191 | return lenscale_logprior + TT.dot(diff, diff) 192 | else: 193 | return lenscale_logprior 194 | 195 | def cond_x(self, x, params): 196 | # x is a full matrix, but will only have one column 197 | 198 | x = TT.addbroadcast(x, 1) 199 | if self._conditional: 200 | missing_x = params[1:3] 201 | log_scale_x = params[3:3 + self._n_warp_segments] 202 | else: 203 | log_scale_x = params[1:1 + self._n_warp_segments] 204 | 205 | if self._n_warp_segments: 206 | # XXX 207 | warp_lbound = 0. 208 | warp_ubound = 1. 209 | warp_segments = np.linspace(warp_lbound, 210 | warp_ubound, 211 | self._n_warp_segments) 212 | scale_x = TT.exp(log_scale_x) 213 | z = TT.sum( 214 | TT.tanh(scale_x * (x - warp_segments)), 215 | axis=1)[:, None] 216 | z_min = TT.sum( 217 | TT.tanh(scale_x * (np.zeros((1, 1)) - warp_segments)), 218 | axis=1)[:, None] 219 | z_max = TT.sum( 220 | TT.tanh(scale_x * (np.ones((1, 1)) - warp_segments)), 221 | axis=1)[:, None] 222 | z = (z - z_min) / (z_max - z_min) 223 | else: 224 | z = x 225 | if self._conditional: 226 | x2_base = TT.switch(s_isnan(x), missing_x, 0) 227 | x2 = TT.inc_subtensor(x2_base[:, 0:1], TT.switch(s_isnan(x), 0, z)) 228 | return x2 229 | else: 230 | return z 231 | 232 | def opt_logK(self, x, params): 233 | x2 = self.cond_x(x, params) 234 | logK = self._logK_of_dist(euclidean_sq_distances(x2, x2), params, True) 235 | params0 = [self._alpha_from_l(self._lenscale0)] 236 | if self._conditional: 237 | params0.extend([0., 1.]) 238 | params0.extend([0.] 
* self._n_warp_segments) 239 | amin = None if self._lenscale_min is None else ( 240 | self._alpha_from_l(self._lenscale_min)) 241 | amax = None if self._lenscale_max is None else ( 242 | self._alpha_from_l(self._lenscale_max)) 243 | bounds = [[amin, amax]] 244 | if self._conditional: 245 | bounds.extend([(-5., 5.), (1e-5, 5.)]) 246 | bounds.extend([(-.2, 2.)] * self._n_warp_segments) 247 | return logK, params0, bounds 248 | 249 | def predict_logK(self, x, z, params): 250 | x2 = self.cond_x(x, params) 251 | z2 = self.cond_x(z, params) 252 | logK = self._logK_of_dist(euclidean_sq_distances(x2, x2), params, True) 253 | logK_new = self._logK_of_dist(euclidean_sq_distances(x2, z2), params, False) 254 | return logK, logK_new 255 | 256 | 257 | class SqExp(StationaryBase): 258 | Product = SqExpProd 259 | def _logK_of_dist(self, sq_dists, params, self_sim): 260 | _alpha = params[0] 261 | ll2 = TT.exp(_alpha) # aka 2 * l ** 2 262 | return -sq_dists / ll2 263 | 264 | 265 | class Matern12(SqExp): 266 | def _K_of_dist(self, sq_dists, params, self_sim): 267 | _alpha = params[0] 268 | ll = TT.sqrt(.5 * TT.exp(_alpha)) 269 | return TT.exp(-TT.sqrt(sq_dists) / ll) 270 | 271 | 272 | class Matern32(StationaryBase): 273 | def _K_of_dist(self, sq_dists, params, self_sim): 274 | _alpha = params[0] 275 | ll2 = .5 * TT.exp(_alpha) # aka l ** 2 276 | nrmsq = sq_dists / ll2 277 | if self_sim: 278 | # -- help grad by suppressing 0/0 -> NaN 279 | nrmsq = zero_diag(nrmsq) 280 | nrm_root_3 = TT.sqrt(3 * nrmsq) 281 | return ((1 + nrm_root_3) * TT.exp(-nrm_root_3)) 282 | 283 | 284 | class Matern52(StationaryBase): 285 | def _K_of_dist(self, sq_dists, params, self_sim): 286 | _alpha = params[0] 287 | ll2 = .5 * TT.exp(_alpha) # aka l ** 2 288 | nrmsq = sq_dists / ll2 289 | if self_sim: 290 | # -- help grad by suppressing 0/0 -> NaN 291 | nrmsq = zero_diag(nrmsq) 292 | nrm_root_5 = TT.sqrt(5 * nrmsq) 293 | coef = 1 + nrm_root_5 + 5. / 3. * nrmsq 294 | return coef * TT.exp(-nrm_root_5) 295 | 296 | 297 | Choice2 = SqExp 298 | #class Choice2(StationaryBase): 299 | #def _logK_of_dist(self, sq_dists, params, self_sim): 300 | #_alpha = params[0] 301 | #ll2 = TT.exp(_alpha) # aka 2 * l ** 2 302 | #return -sq_dists / ll2 303 | 304 | 305 | def product(kernels, slices): 306 | from gby import groupby 307 | # -- there are some kernels whose product can be handled 308 | # by the same sort of Theano graph as it takes to handle 309 | # just one term of the product. Pre-consolidating such 310 | # sub-products saves a huge amount of compilation time 311 | # and it runs faster too. 312 | prod_mergeable = groupby(zip(kernels, slices), 313 | lambda ks: ks[0].prodkey()) 314 | kernels_ = [] 315 | slices_ = [] 316 | for key, mergeable in prod_mergeable.items(): 317 | print key, mergeable 318 | if len(mergeable) > 1: 319 | kern = mergeable[0][0].Product(mergeable) 320 | slc = kern.column_idxs 321 | else: 322 | (kern, slc), = mergeable 323 | kernels_.append(kern) 324 | slices_.append(slc) 325 | if len(kernels_) == 1: 326 | # -- XXX ignores slc ... is ok? 
327 | return kernels_[0] 328 | return Product(kernels_, slices_) 329 | 330 | 331 | class Product(Kernel): 332 | def __init__(self, kernels, slices): 333 | self.kernels = kernels 334 | self.slices = slices 335 | self.n_params = sum(k.n_params for k in kernels) 336 | 337 | def reduce_lenscale(self, params): 338 | rval = np.zeros_like(params) 339 | offset = 0 340 | for k in self.kernels: 341 | rval[offset: offset + k.n_params] = ( 342 | k.reduce_lenscale(params[offset: offset + k.n_params])) 343 | offset += k.n_params 344 | return rval 345 | 346 | def s_logprior(self, params): 347 | offset = 0 348 | lps = [] 349 | for k in self.kernels: 350 | lps.append(k.s_logprior(params[offset: offset + k.n_params])) 351 | offset += k.n_params 352 | return reduce(lambda a, b: a + b, lps) 353 | 354 | def opt_logK(self, x, params): 355 | # return a cost, and parameter vector suitable for fitting 356 | # the GP, and bounds on that parameter vector 357 | 358 | params0 = [] 359 | bounds = [] 360 | offset = 0 361 | logKs = [] 362 | for kern, slice_k in zip(self.kernels, self.slices): 363 | params_k = params[offset: offset + kern.n_params] 364 | #if slice_k is None: 365 | #logK_k, params0_k, bounds_k = kern.opt_logK(x, params_k) 366 | #else: 367 | logK_k, params0_k, bounds_k = kern.opt_logK(x[:, slice_k], 368 | params_k) 369 | logKs.append(check_K(logK_k)) 370 | params0.extend(params0_k) 371 | bounds.extend(bounds_k) 372 | offset += kern.n_params 373 | 374 | if len(self.kernels) == 1: 375 | return logKs[0], params0, bounds 376 | else: 377 | Kstack = TT.stack(*logKs) 378 | logK = TT.sum(Kstack, axis=0) 379 | return logK, params0, bounds 380 | 381 | def predict_logK(self, x, z, params): 382 | # s_mean, s_x for computing mean from s_x 383 | logKs = [] 384 | logKs_new = [] 385 | offset = 0 386 | for kern, slice_k in zip(self.kernels, self.slices): 387 | params_k = params[offset: offset + kern.n_params] 388 | #if slice_k is None: 389 | #logK_k, logK_new_k = kern.predict_logK(x, z, params_k) 390 | #else: 391 | logK_k, logK_new_k = kern.predict_logK( 392 | x[:, slice_k], z[:, slice_k], params_k) 393 | logKs.append(logK_k) 394 | logKs_new.append(logK_new_k) 395 | offset += kern.n_params 396 | 397 | if len(self.kernels) == 1: 398 | return logKs[0], logKs_new[0] 399 | else: 400 | logK = TT.sum(TT.stack(*logKs), axis=0) 401 | logK_new = TT.sum(TT.stack(*logKs_new), axis=0) 402 | return logK, logK_new 403 | 404 | 405 | def prod_of(Kcls, slices): 406 | kernels = [Kcls() for ii in range(len(slices))] 407 | return Product(kernels, slices) 408 | 409 | 410 | class Mixture(Kernel): 411 | def __init__(self, kernels, slices): 412 | self.kernels = kernels 413 | self.slices = slices 414 | self.n_my_params = len(kernels) - 1 415 | self.n_params = sum(k.n_params for k in kernels) + self.n_my_params 416 | self.prior_strength = 2.0 417 | 418 | def reduce_lenscale(self, params): 419 | rval = np.zeros_like(params) 420 | offset = 0 421 | for k in self.kernels: 422 | rval[offset: offset + k.n_params] = ( 423 | k.reduce_lenscale(params[offset: offset + k.n_params])) 424 | offset += k.n_params 425 | # shrink weights back to even weighting 426 | rval[offset: offset + len(self.kernels) - 1] *= 0.75 427 | return rval 428 | 429 | def s_logprior(self, params): 430 | offset = 0 431 | lps = [] 432 | for k in self.kernels: 433 | lps.append(k.s_logprior(params[offset: offset + k.n_params])) 434 | offset += k.n_params 435 | # -- multiplicative because they are independent 436 | lp = reduce(lambda a, b: a + b, lps) 437 | log_weights = params[offset: 
offset + self.n_my_params] 438 | return lp - self.prior_strength * TT.dot(log_weights, log_weights) 439 | 440 | def opt_K(self, x, params): 441 | # return a cost, and parameter vector suitable for fitting 442 | # the GP, and bounds on that parameter vector 443 | 444 | params0 = [] 445 | bounds = [] 446 | offset = 0 447 | Ks = [] 448 | for kern, slice_k in zip(self.kernels, self.slices): 449 | params_k = params[offset: offset + kern.n_params] 450 | K_k, params0_k, bounds_k = kern.opt_K(x[:, slice_k], params_k) 451 | Ks.append(K_k) 452 | params0.extend(params0_k) 453 | bounds.extend(bounds_k) 454 | offset += kern.n_params 455 | 456 | params0.extend([0.0] * self.n_my_params) 457 | bounds.extend([(-4, 4)] * self.n_my_params) 458 | 459 | log_weights = TT.concatenate((np.asarray([0.0]), 460 | params[offset:offset + self.n_my_params])) 461 | weights = TT.exp(log_weights) / TT.exp(log_weights).sum() 462 | 463 | if len(self.kernels) == 1: 464 | return Ks[0], params0, bounds 465 | else: 466 | Kstack = TT.stack(*Ks) 467 | weighted_Kstack = weights[:, None, None] * Kstack 468 | K = TT.sum(weighted_Kstack, axis=0) 469 | # XXX: log_K, should be logadd here (#11) 470 | return K, params0, bounds 471 | 472 | def predict_K(self, x, z, params): 473 | # s_mean, s_x for computing mean from s_x 474 | Ks = [] 475 | Ks_new = [] 476 | offset = 0 477 | for kern, slice_k in zip(self.kernels, self.slices): 478 | params_k = params[offset: offset + kern.n_params] 479 | K_k, K_new_k = kern.predict_K( 480 | x[:, slice_k], z[:, slice_k], params_k) 481 | Ks.append(K_k) 482 | Ks_new.append(K_new_k) 483 | offset += kern.n_params 484 | 485 | log_weights = TT.concatenate((np.asarray([0]), 486 | params[offset:offset + self.n_my_params])) 487 | weights = TT.exp(log_weights) / TT.exp(log_weights).sum() 488 | 489 | if len(self.kernels) == 1: 490 | return Ks[0], Ks_new[0] 491 | else: 492 | # XXX: log_K, should be logadd here (#11) 493 | wK = TT.sum( 494 | weights[:, None, None] * TT.stack(*Ks), axis=0) 495 | wK_new = TT.sum( 496 | weights[:, None, None] * TT.stack(*Ks_new), axis=0) 497 | return wK, wK_new 498 | 499 | def mix_of(Kcls, slices): 500 | kernels = [Kcls() for ii in range(len(slices))] 501 | return Mixture(kernels, slices) 502 | -------------------------------------------------------------------------------- /hp_gpsmbo/kernels.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyperopt/hyperopt-gpsmbo/8009f82a18620b33faecca2382f973bc214bd88c/hp_gpsmbo/kernels.pyc -------------------------------------------------------------------------------- /hp_gpsmbo/kernels_base.py: -------------------------------------------------------------------------------- 1 | 2 | import theano 3 | import theano.tensor as TT 4 | 5 | from .gpr_math import s_nll, s_mean, s_variance 6 | 7 | #TODO: Match name to scikits.learn 8 | def euclidean_sq_distances(x, z): 9 | """Matrix of distances for each row in x to each row in z 10 | """ 11 | 12 | # -- TODO: better numerical accuracy 13 | d = ((x ** 2).sum(axis=1).dimshuffle(0, 'x') 14 | + (z ** 2).sum(axis=1) 15 | - 2 * TT.dot(x, z.T)) 16 | return TT.maximum(d, 0) 17 | 18 | 19 | class Kernel(object): 20 | 21 | def s_nll_params(self, x, y, var_y, prior_var, params=None, ret_K=False): 22 | # return a cost, and parameter vector suitable for fitting 23 | # the GP, and bounds on that parameter vector 24 | 25 | # -- turn these to constants 26 | x = TT.as_tensor_variable(x) 27 | y = TT.as_tensor_variable(y) 28 | if params is None: 29 | params 
= theano.tensor.dvector() 30 | else: 31 | params = theano.tensor.as_tensor_variable(params) 32 | assert params.ndim == 1 33 | K, params0, bounds = self.opt_K(x, params) 34 | nll = s_nll(K, y, var_y=var_y, prior_var=prior_var) 35 | if ret_K: 36 | return nll, params, params0, bounds, K 37 | return nll, params, params0, bounds 38 | 39 | def s_mean_var(self, x, y, var_y, prior_var, best_params, var_min, 40 | x_new=None, 41 | return_K_new=False): 42 | # s_mean, s_x for computing mean from s_x 43 | 44 | # -- turn these to constants 45 | x = TT.as_tensor_variable(x) 46 | y = TT.as_tensor_variable(y) 47 | if x_new is None: 48 | x_new = TT.matrix() 49 | else: 50 | assert x_new.ndim == 2 51 | params = TT.as_tensor_variable(best_params) 52 | K, K_new = self.predict_K(x, x_new, params) 53 | K.name = 'K' 54 | K_new.name = 'K_new' 55 | mean = s_mean(K, y, var_y, prior_var, K_new) 56 | var = s_variance(K, y, var_y, prior_var, K_new, var_min) 57 | mean.name = 'mean_new' 58 | var.name = 'var_new' 59 | rval = [mean, var, x_new] 60 | if return_K_new: 61 | rval.append(K_new) 62 | return rval 63 | 64 | def predict_K(self, *args, **kwargs): 65 | logK, logK_new = self.predict_logK(*args, **kwargs) 66 | return TT.exp(logK), TT.exp(logK_new) 67 | 68 | def opt_K(self, *args, **kwargs): 69 | logK, params, bounds = self.opt_logK(*args, **kwargs) 70 | return TT.exp(logK), params, bounds 71 | 72 | -------------------------------------------------------------------------------- /hp_gpsmbo/op_Kcond.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from theano import Op, Apply, gradient 3 | from theano import tensor as TT 4 | 5 | class KCond(Op): 6 | """ 7 | Return a vector of indexes of K to keep 8 | """ 9 | def __init__(self): 10 | self.destructive = False 11 | 12 | self.props = (self.destructive,) 13 | 14 | def __hash__(self): 15 | return hash((type(self), self.props)) 16 | 17 | def __eq__(self, other): 18 | return (type(self) == type(other) and self.props == other.props) 19 | 20 | #def infer_shape(self, node, shapes): 21 | #return [shapes[0]] 22 | 23 | def __str__(self): 24 | return 'KCond' 25 | 26 | def make_node(self, K, y, eps): 27 | K = TT.as_tensor_variable(K) 28 | y = TT.as_tensor_variable(y) 29 | eps = TT.as_tensor_variable(eps) 30 | return Apply(self, [K, y, eps], [TT.ivector()]) 31 | 32 | def perform(self, node, inputs, outputs): 33 | K, y, eps = inputs 34 | M = K.shape[0] 35 | assert (M, M) == K.shape 36 | assert (M,) == y.shape 37 | order = np.argsort(y) # best to worst 38 | keep = np.ones_like(y).astype(np.int32) # order matches K, y 39 | assert np.allclose(np.diag(K), 1.0) 40 | max_similarity = (K - np.eye(M)).max() 41 | if max_similarity + eps > 1.0: 42 | print 'max_similarity', max_similarity 43 | 44 | for ii in xrange(M - 1): 45 | this = order[ii] 46 | if not keep[this]: 47 | continue 48 | # -- we have committed to using row `this` 49 | # -- Now, delete all worse points within epsilon of row `this` 50 | # (all pts remaining in `order` are worse by definition) 51 | K_this = K[this] 52 | for jj in xrange(ii + 1, M): 53 | other = order[jj] 54 | if not keep[other]: # -- other's already gone 55 | continue 56 | if (1 - K_this[other]) < eps: 57 | keep[other] = 0 58 | keep_idxs = np.where(keep)[0].astype(np.int32) 59 | if len(keep_idxs) < M: 60 | print 'Dropping %i rows to condition K' % ( 61 | M - len(keep_idxs)) 62 | outputs[0][0] = keep_idxs 63 | 64 | def grad(self, inputs, gradients): 65 | return [inp.zeros_like() for inp in inputs] 66 | 67 | 68 
| def K_cond(K, y, eps): 69 | keep_idxs = KCond()(K, y, eps) 70 | keep_y = y[keep_idxs] 71 | # -- we want to keep the given rows and cols, hence: 72 | keep_K = K[keep_idxs].T[keep_idxs].T 73 | assert keep_K.type == K.type 74 | assert keep_y.type == y.type 75 | return keep_K, keep_y, keep_idxs 76 | 77 | class ZeroDiag(Op): 78 | """ Return a square matrix with the diagonal zero-d out. 79 | 80 | The advantage of this Op over masking techniques based on arithmetic 81 | is that this Op can remove NaNs from the diagonal. 82 | """ 83 | def __init__(self): 84 | self.destructive = False 85 | self.props = (self.destructive,) 86 | 87 | def __hash__(self): 88 | return hash((type(self), self.props)) 89 | 90 | def __eq__(self, other): 91 | return (type(self) == type(other) and self.props == other.props) 92 | 93 | def infer_shape(self, node, shapes): 94 | return shapes 95 | 96 | def __str__(self): 97 | return 'ZeroDiag' 98 | 99 | def make_node(self, K): 100 | K = TT.as_tensor_variable(K) 101 | return Apply(self, [K], [K.type()]) 102 | 103 | def perform(self, node, inputs, outputs): 104 | K, = inputs 105 | rval = K.copy() 106 | idxs = np.arange(K.shape[0]) 107 | rval[idxs, idxs] = 0 108 | outputs[0][0] = rval 109 | 110 | def connection_pattern(self, node): 111 | return [[True]] 112 | 113 | def grad(self, inputs, gradients): 114 | gY = gradients[0] 115 | return [zero_diag(gY)] 116 | 117 | zero_diag = ZeroDiag() 118 | 119 | 120 | class ZeroForNan(Op): 121 | """ Return a square matrix with the diagonal zero-d out. 122 | 123 | The advantage of this Op over masking techniques based on arithmetic 124 | is that this Op can remove NaNs from the diagonal. 125 | """ 126 | def __init__(self): 127 | self.destructive = False 128 | self.props = (self.destructive,) 129 | 130 | def __hash__(self): 131 | return hash((type(self), self.props)) 132 | 133 | def __eq__(self, other): 134 | return (type(self) == type(other) and self.props == other.props) 135 | 136 | def infer_shape(self, node, shapes): 137 | return shapes 138 | 139 | def __str__(self): 140 | return 'ZeroForNan' 141 | 142 | def make_node(self, K): 143 | K = TT.as_tensor_variable(K) 144 | return Apply(self, [K], [K.type()]) 145 | 146 | def perform(self, node, inputs, outputs): 147 | K, = inputs 148 | rval = K.copy() 149 | rval[np.isnan(rval)] = 0 150 | outputs[0][0] = rval 151 | 152 | def connection_pattern(self, node): 153 | return [[True]] 154 | 155 | def grad(self, inputs, gradients): 156 | #K, = inputs 157 | gY, = gradients 158 | return [gY] 159 | 160 | zero_for_nan = ZeroForNan() 161 | 162 | 163 | class IsNan(Op): 164 | """ Return a square matrix with the diagonal zero-d out. 165 | 166 | The advantage of this Op over masking techniques based on arithmetic 167 | is that this Op can remove NaNs from the diagonal. 
168 | """ 169 | def __init__(self): 170 | self.destructive = False 171 | self.props = (self.destructive,) 172 | 173 | def __hash__(self): 174 | return hash((type(self), self.props)) 175 | 176 | def __eq__(self, other): 177 | return (type(self) == type(other) and self.props == other.props) 178 | 179 | def infer_shape(self, node, shapes): 180 | return shapes 181 | 182 | def __str__(self): 183 | return 'IsNan' 184 | 185 | def make_node(self, K): 186 | K = TT.as_tensor_variable(K) 187 | otype = TT.TensorType(dtype='int8', 188 | broadcastable=K.broadcastable) 189 | return Apply(self, [K], [otype()]) 190 | 191 | def perform(self, node, inputs, outputs): 192 | outputs[0][0] = np.isnan(inputs[0]).astype('int8') 193 | 194 | #def connection_pattern(self, node): 195 | #return [[False]] 196 | 197 | def grad(self, inputs, gradients): 198 | return [gradient.DisconnectedType()()] 199 | 200 | isnan = IsNan() 201 | 202 | import scipy.linalg 203 | import theano 204 | from theano.gof import local_optimizer, PureOp 205 | from theano.tensor.opt import (register_stabilize, 206 | register_specialize, register_canonicalize) 207 | from theano.sandbox.linalg.ops import Cholesky 208 | 209 | class LazyCholesky(PureOp): 210 | def __init__(self, lower): 211 | self.lower = lower 212 | self.props = (lower,) 213 | 214 | def __hash__(self): 215 | return hash((type(self), self.props)) 216 | 217 | def __eq__(self, other): 218 | return (type(self) == type(other) and self.props == other.props) 219 | 220 | def make_node(self, X, use_buf, buf_idx): 221 | return Apply(self, 222 | [X, use_buf, buf_idx], 223 | [X.type(), theano.gof.type.generic()]) 224 | 225 | def infer_shape(self, node, shapes): 226 | return [shapes[0], None] 227 | 228 | def make_thunk(self, node, storage_map, compute_map, no_recycling): 229 | s_X, s_use_buf, s_buf_idx = node.inputs 230 | s_chol, s_buf = node.outputs 231 | comp_X = compute_map[s_X] 232 | comp_use_buf = compute_map[s_use_buf] 233 | comp_buf_idx = compute_map[s_buf_idx] 234 | comp_chol = compute_map[s_chol] 235 | #comp_buf = compute_map[s_buf] 236 | 237 | stor_X = storage_map[s_X] 238 | stor_use_buf = storage_map[s_use_buf] 239 | stor_buf_idx = storage_map[s_buf_idx] 240 | stor_chol = storage_map[s_chol] 241 | stor_buf = storage_map[s_buf] 242 | def thunk(): 243 | # -- compute the use_buf flag 244 | if not comp_use_buf[0]: 245 | return [1] 246 | if not comp_buf_idx[0]: 247 | return [2] 248 | buf_idx = int(stor_buf_idx[0]) 249 | use_buf = stor_use_buf[0] 250 | if use_buf: 251 | buf_dict = stor_buf[0] 252 | assert buf_dict is not None, 'buf output is empty' 253 | chol = buf_dict[buf_idx] 254 | else: 255 | # -- compute a cholesky and store to buffer 256 | if not comp_X[0]: 257 | return [0] 258 | X = stor_X[0] 259 | chol = scipy.linalg.cholesky(X, lower=self.lower) 260 | print 'computing cholesky', buf_idx 261 | if stor_buf[0] is None: 262 | stor_buf[0] = {} 263 | chol = chol.astype(X.dtype) 264 | buf_dict = stor_buf[0] 265 | buf_dict[buf_idx] = chol 266 | 267 | stor_chol[0] = chol.copy() 268 | comp_chol[0] = 1 269 | return [] 270 | 271 | thunk.lazy = True 272 | thunk.inputs = [storage_map[v] for v in node.inputs] 273 | thunk.outputs = [storage_map[v] for v in node.outputs] 274 | return thunk 275 | 276 | use_lazy_cholesky = False 277 | use_lazy_cholesky_idx = None 278 | 279 | @register_specialize 280 | @local_optimizer(None) 281 | def lazy_cholesky(node): 282 | """ 283 | If a general solve() is applied to the output of a cholesky op, then 284 | replace it with a triangular solve. 
285 | """ 286 | if not use_lazy_cholesky: 287 | return 288 | 289 | if isinstance(node.op, Cholesky): 290 | assert use_lazy_cholesky.name 291 | for var in node.fgraph.variables: 292 | if var.name == use_lazy_cholesky.name: 293 | break 294 | else: 295 | raise Exception('var not found in graph', use_lazy_cholesky) 296 | buf_flag = var 297 | 298 | for var in node.fgraph.variables: 299 | if var.name == use_lazy_cholesky_idx.name: 300 | break 301 | else: 302 | raise Exception('var not found in graph', use_lazy_cholesky_idx) 303 | buf_idx = var 304 | assert buf_idx is not buf_flag 305 | X, = node.inputs 306 | chol, buf = LazyCholesky(node.op.lower)(X, buf_flag, buf_idx) 307 | assert chol.type == node.outputs[0].type 308 | return [chol] 309 | 310 | 311 | from scipy.stats import norm 312 | 313 | class NormalLogEIDiffSigmaScalar(theano.scalar.basic.ScalarOp): 314 | nin = 2 315 | def __eq__(self, other): 316 | return type(self) == type(other) 317 | 318 | def __hash__(self): 319 | return hash(type(self)) 320 | 321 | def impl(self, diff, sigma): 322 | z = diff / sigma 323 | if z < 34: 324 | a = -diff * norm.cdf(-z) 325 | b = sigma * norm.pdf(-z) 326 | rval = np.log(a + b) 327 | else: 328 | rval = (-4.86466981 329 | -0.12442506 * z 330 | -0.49903031 * z ** 2) 331 | return rval 332 | 333 | def c_code(self, node, name, inp, out, sub): 334 | diff, sigma = inp 335 | y, = out 336 | z = y + '_z' 337 | a = y + '_a' 338 | b = y + '_b' 339 | cdf = y + '_cdf' 340 | pdf = y + '_pdf' 341 | #root_2pi = '%' % np.sqrt(2 * np.pi) 342 | if node.inputs[0].type in theano.scalar.basic.float_types: 343 | return """ 344 | double %(z)s = %(diff)s / %(sigma)s; 345 | if (%(z)s < 34) 346 | { 347 | double %(cdf)s = .5 * erfc(%(z)s / sqrt(2.)); 348 | double %(pdf)s = exp(-.5 * %(z)s * %(z)s) / sqrt(2 * M_PI); 349 | double %(a)s = -%(diff)s * %(cdf)s; 350 | double %(b)s = %(sigma)s * %(pdf)s; 351 | %(y)s = log(%(a)s + %(b)s); 352 | } 353 | else 354 | { 355 | %(y)s = -4.86466981 356 | -0.12442506 * %(z)s 357 | -0.49903031 * %(z)s * %(z)s; 358 | } 359 | """ % locals() 360 | raise NotImplementedError('only floating point is implemented') 361 | 362 | def c_code_cache_version(self): 363 | return (1,) 364 | 365 | def grad(self, inp, grads): 366 | y = self(*inp) 367 | gy, = grads 368 | float_out = theano.scalar.basic.float_out 369 | gd = NormalLogEIDiffSigmaScalarGrad0(float_out)(y, gy, *inp) 370 | gs = NormalLogEIDiffSigmaScalarGrad1(float_out)(y, gy, *inp) 371 | return gd, gs 372 | 373 | class NormalLogEIDiffSigmaScalarGrad0(theano.scalar.basic.ScalarOp): 374 | nin = 4 375 | def __eq__(self, other): 376 | return type(self) == type(other) 377 | 378 | def __hash__(self): 379 | return hash(type(self)) 380 | 381 | def impl(self, logEI, glogEI, diff, sigma): 382 | z = diff / sigma 383 | if z < 34: 384 | logcdf = norm.logcdf(-z, 0, 1) 385 | ddiff = -np.exp(logcdf - logEI) # aka: -cdf / EI 386 | else: 387 | foo = 2 * .49903031 388 | dz = (-0.12442506 - foo * z) 389 | ddiff = dz / sigma 390 | return ddiff * glogEI 391 | 392 | def c_code(self, node, name, inp, out, sub): 393 | logEI, glogEI, diff, sigma = inp 394 | y, = out 395 | z = y + '_z' 396 | logcdf = y + '_logcdf' 397 | #root_2pi = '%' % np.sqrt(2 * np.pi) 398 | if node.inputs[0].type in theano.scalar.basic.float_types: 399 | return """ 400 | double %(z)s = %(diff)s / %(sigma)s; 401 | if (%(z)s < 34) 402 | { 403 | double %(logcdf)s = log(.5) + log(erfc(%(z)s / sqrt(2.))); 404 | %(y)s = -exp(%(logcdf)s - %(logEI)s) * %(glogEI)s; 405 | } 406 | else 407 | { 408 | %(y)s = (-0.12442506 - 2 * 
.49903031 * %(z)s) 409 | / %(sigma)s 410 | * %(glogEI)s; 411 | } 412 | """ % locals() 413 | raise NotImplementedError('only floating point is implemented') 414 | 415 | def c_code_cache_version(self): 416 | return (1,) 417 | 418 | class NormalLogEIDiffSigmaScalarGrad1(theano.scalar.basic.ScalarOp): 419 | nin = 4 420 | def __eq__(self, other): 421 | return type(self) == type(other) 422 | 423 | def __hash__(self): 424 | return hash(type(self)) 425 | 426 | def impl(self, logEI, glogEI, diff, sigma): 427 | z = diff / sigma 428 | if z < 34: 429 | logpdf = norm.logpdf(-z, 0, 1) 430 | dsigma = np.exp(logpdf - logEI) # aka: pdf / EI 431 | else: 432 | foo = 2 * .49903031 433 | dz = (-0.12442506 - foo * z) 434 | dsigma = dz * (-z / sigma) 435 | #(foo * z) ** 2 / sigma 436 | return dsigma * glogEI 437 | 438 | def c_code(self, node, name, inp, out, sub): 439 | logEI, glogEI, diff, sigma = inp 440 | y, = out 441 | z = y + '_z' 442 | logpdf = y + '_logpdf' 443 | #root_2pi = '%' % np.sqrt(2 * np.pi) 444 | if node.inputs[0].type in theano.scalar.basic.float_types: 445 | return """ 446 | double %(z)s = %(diff)s / %(sigma)s; 447 | if (%(z)s < 34) 448 | { 449 | double %(logpdf)s = -.5 * (log(2 * M_PI) + %(z)s * %(z)s); 450 | %(y)s = exp(%(logpdf)s - %(logEI)s) * %(glogEI)s; 451 | } 452 | else 453 | { 454 | %(y)s = (-0.12442506 - 2 * .49903031 * %(z)s) 455 | * (-%(z)s / %(sigma)s) 456 | * %(glogEI)s; 457 | } 458 | """ % locals() 459 | raise NotImplementedError('only floating point is implemented') 460 | 461 | def c_code_cache_version(self): 462 | return (1,) 463 | 464 | normal_logEI_diff_sigma_scalar = NormalLogEIDiffSigmaScalar( 465 | theano.scalar.upgrade_to_float_no_complex, 466 | name='normal_logEI_diff_sigma_elemwise') 467 | 468 | normal_logEI_diff_sigma_elemwise = theano.tensor.Elemwise( 469 | normal_logEI_diff_sigma_scalar) 470 | 471 | class NormalLogEIDiffSigma(theano.Op): 472 | def __eq__(self, other): 473 | return type(self) == type(other) 474 | 475 | def __hash__(self): 476 | return hash((type(self),)) 477 | 478 | def make_node(self, diff, sigma): 479 | diff = theano.tensor.as_tensor_variable(diff) 480 | sigma = theano.tensor.as_tensor_variable(sigma) 481 | foo = diff + sigma 482 | return theano.Apply(self, [diff, sigma], [foo.type()]) 483 | 484 | def perform(self, node, inputs, output_storage): 485 | diff, sigma = inputs 486 | z = diff / sigma 487 | # -- the following formula is cuter, but 488 | # Theano doesn't produce as stable a gradient I think? 
489 | #return sigma * (z * s_normal_cdf(z, 0, 1) + s_normal_pdf(z, 0, 1)) 490 | a = -diff * norm.cdf(-z, 0, 1) 491 | b = sigma * norm.pdf(-z, 0, 1) 492 | rval_naive = np.log(a + b) 493 | zz = z[z > 34] 494 | interp = (-4.86466981 495 | -0.12442506 * zz 496 | -0.49903031 * zz ** 2) 497 | rval_naive[z > 34] = interp 498 | output_storage[0][0] = rval_naive 499 | 500 | def grad(self, inputs, output_gradients): 501 | y = NormalLogEIDiffSigma()(*inputs) 502 | gy, = output_gradients 503 | return NormalLogEIGrad()(y, gy, *inputs) 504 | 505 | normal_logEI_diff_sigma = NormalLogEIDiffSigma() 506 | 507 | 508 | class NormalLogEIGrad(theano.Op): 509 | def __eq__(self, other): 510 | return type(self) == type(other) 511 | 512 | def __hash__(self): 513 | return hash((type(self),)) 514 | 515 | def make_node(self, logEI, gEI, diff, sigma): 516 | return theano.Apply(self, 517 | [logEI, gEI, diff, sigma], 518 | [diff.type(), sigma.type()]) 519 | 520 | def perform(self, node, inputs, output_storage): 521 | logEI, gEI, diff, sigma = inputs 522 | z = diff / sigma 523 | logcdf = norm.logcdf(-z, 0, 1) 524 | logpdf = norm.logpdf(-z, 0, 1) 525 | #for zi, a, b, c in zip(z, logcdf, logpdf, logEI): 526 | #print zi, 'cdf', a, 'pdf', b, 'EI', c, 'logdz', a - c, 'logsig', b - c 527 | dz = -np.exp(logcdf - logEI) # aka: -cdf / EI 528 | dsigma = np.exp(logpdf - logEI) # aka: pdf / EI 529 | 530 | #if np.any(z > 20): 531 | # print 'NormalLogEIGrad: bigz', z[z > 20] 532 | 533 | foo = 2 * .49903031 534 | dz[z > 34] = -0.12442506 - foo * z[z > 34] 535 | dsigma[z > 34] = dz[z > 34] * (-z[z > 34] / sigma[z > 34]) 536 | dz[z > 34] /= sigma[z > 34] 537 | 538 | output_storage[0][0] = dz * gEI 539 | output_storage[1][0] = dsigma * gEI 540 | #if np.any(np.isnan(dz)): 541 | # import pdb; pdb.set_trace() 542 | #print ('logEI grad: gEI=%s dz=%s dsigma=%s' % (gEI, dz, dsigma)) 543 | 544 | 545 | # -- eof 546 | -------------------------------------------------------------------------------- /hp_gpsmbo/prodkernels.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .kernels_base import Kernel 3 | 4 | import theano.tensor as TT 5 | from .op_Kcond import zero_diag, isnan as s_isnan 6 | from .kernels_base import euclidean_sq_distances 7 | 8 | class SqExpProd(Kernel): 9 | """ 10 | 11 | K(x,y) = exp(- ||x-y||^2 / (2 l^2)) 12 | 13 | N.B. the kernel is parameterized by quantity 14 | 15 | alpha = log( 2 * l^2) 16 | 17 | So that 18 | 19 | K(x, y) = exp(- ||x - y|| ** 2 / exp(alpha)) 20 | l = sqrt(exp(alpha) / 2) 21 | 22 | 23 | """ 24 | 25 | @staticmethod 26 | def _alpha_from_l(l): 27 | return np.log(2.0 * l ** 2) 28 | 29 | @staticmethod 30 | def _l_from_alpha(alpha): 31 | return np.sqrt(np.exp(alpha) / 2.) 
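    # -- why merging helps: a product of one-dimensional SqExp kernels,
    #    one per column, collapses into a single expression,
    #        prod_d exp(-(x_d - z_d)**2 / (2 * l_d**2))
    #            = exp(-0.5 * sum_d ((x_d - z_d) / l_d)**2),
    #    so opt_logK / predict_logK below can use one scaled
    #    squared-distance matrix instead of one Theano sub-graph per
    #    column.  This is what kernels.product() relies on when it
    #    merges kernels that share the same prodkey().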
32 | 33 | def __init__(self, 34 | seq_kern_slice): 35 | #lenscales_0, 36 | #lenscales_min, 37 | #lenscales_max, 38 | #conditional): 39 | kerns, slices = zip(*seq_kern_slice) 40 | self._conditional = kerns[0]._conditional 41 | assert all(self._conditional == kern._conditional 42 | for kern, slc in seq_kern_slice) 43 | self._lenscales_0 = np.asarray([kern._lenscale0 for kern in kerns]) 44 | self._lenscales_min = np.asarray([kern._lenscale_min for kern in kerns]) 45 | self._lenscales_max = np.asarray([kern._lenscale_max for kern in kerns]) 46 | 47 | self._n_warp_segments_per_X = 0 48 | if self._conditional: 49 | self.n_params = 3 + self._n_warp_segments_per_X 50 | else: 51 | self.n_params = 1 + self._n_warp_segments_per_X 52 | self.n_params *= len(kerns) 53 | self.N = len(kerns) 54 | def getidx(slc): 55 | assert slc.start + 1 == slc.stop and slc.step == None 56 | return slc.start 57 | self.column_idxs = np.asarray(map(getidx, slices)) 58 | self.s_idxs = TT.as_tensor_variable(self.column_idxs) 59 | 60 | def prodkey(self): 61 | # -- unique identifier of mergeable product sets 62 | return (type(self), 63 | self._conditional, 64 | self._n_warp_segments_per_X) 65 | 66 | def reduce_lenscale(self, params): 67 | new_l = np.maximum(self._lenscales_min, 68 | self._l_from_alpha(np.asarray(params[0:self.N]) - 1)) 69 | rval = list(params) 70 | rval[0:self.N] = self._alpha_from_l(new_l) 71 | return rval 72 | 73 | def unpack(self, params): 74 | alpha = params[0:self.N] 75 | cond_x = params[self.N:2 * self.N] 76 | cond_y = params[2 * self.N: 3 * self.N] 77 | return alpha, cond_x, cond_y 78 | 79 | def s_logprior(self, s_params, strength=10.0): 80 | # -- I don't know what distribution this would be 81 | # but I think it makes a nice shape 82 | s_alpha, s_cond_x, s_cond_y = self.unpack(s_params) 83 | n_alpha_min = self._alpha_from_l(self._lenscales_min) 84 | n_alpha_max = self._alpha_from_l(self._lenscales_max) 85 | #return strength * (alpha - alpha_min) ** 2 86 | log0 = -10000 87 | width = n_alpha_max - n_alpha_min 88 | #alpha_mean = 0.5 * (alpha_max + alpha_min) 89 | energy = strength * 0.5 * (s_alpha - n_alpha_max) ** 2 / width ** 2 90 | lenscale_logprior = TT.switch(s_alpha < n_alpha_min, 91 | log0, 92 | TT.switch(s_alpha < n_alpha_max, 93 | -energy, 94 | log0)).sum() 95 | if self._conditional: 96 | diff_x = s_cond_x 97 | diff_y = s_cond_y - 1 98 | rval = (lenscale_logprior 99 | + TT.dot(diff_x, diff_x) 100 | + TT.dot(diff_y, diff_y)) 101 | else: 102 | rval = lenscale_logprior 103 | assert rval.ndim == 0 104 | return rval 105 | 106 | def cond_x(self, s_x, s_params): 107 | #import theano 108 | #s_x_all = theano.printing.Print('x_all')(s_x_all) 109 | #s_x = s_x_all.T[self.s_idxs].T 110 | s_alpha, s_missing_x, s_missing_y = self.unpack(s_params) 111 | assert s_x.ndim == 2 112 | #s_x = TT.addbroadcast(s_x, 1) 113 | if self._conditional: 114 | filled_x = TT.switch(s_isnan(s_x), s_missing_x, s_x) 115 | filled_y = TT.switch(s_isnan(s_x), s_missing_y, 0) 116 | else: 117 | filled_x = s_x 118 | filled_y = None 119 | assert filled_x.ndim == 2 120 | return filled_x, filled_y 121 | 122 | 123 | def opt_logK(self, s_x, s_params): 124 | s_alpha, s_missing_x, s_missing_y = self.unpack(s_params) 125 | filled_x, filled_y = self.cond_x(s_x, s_params) 126 | 127 | lenscales = TT.sqrt(.5 * TT.exp(s_alpha)) 128 | 129 | dist_sq = euclidean_sq_distances(filled_x / lenscales, 130 | filled_x / lenscales) 131 | if filled_y is not None: 132 | dist_sq += euclidean_sq_distances(filled_y / lenscales, 133 | filled_y / lenscales) 134 | # 
Geometric 135 | logK = -0.5 * dist_sq 136 | 137 | params0 = list(self._alpha_from_l(self._lenscales_0)) 138 | if self._conditional: 139 | params0.extend([0.] * self.N) 140 | params0.extend([1.] * self.N) 141 | params0.extend([0.] * self._n_warp_segments_per_X) 142 | amin = self._alpha_from_l(self._lenscales_min) 143 | amax = self._alpha_from_l(self._lenscales_max) 144 | bounds = zip(amin, amax) 145 | if self._conditional: 146 | bounds.extend([(-5., 5.)] * self.N) 147 | bounds.extend([(1e-5, 5.)] * self.N) 148 | if self._n_warp_segments_per_X: 149 | #bounds.extend([(-.2, 2.)] * self._n_warp_segments) 150 | raise NotImplementedError() 151 | return logK, params0, bounds 152 | 153 | def predict_logK(self, s_x, s_z, s_params): 154 | filled_x_x, filled_x_y = self.cond_x(s_x, s_params) 155 | filled_z_x, filled_z_y = self.cond_x(s_z, s_params) 156 | 157 | s_alpha, s_missing_x, s_missing_y = self.unpack(s_params) 158 | lenscales = TT.sqrt(.5 * TT.exp(s_alpha)) 159 | 160 | dist_xx_sq = euclidean_sq_distances(filled_x_x / lenscales, 161 | filled_x_x / lenscales) 162 | dist_xz_sq = euclidean_sq_distances(filled_x_x / lenscales, 163 | filled_z_x / lenscales) 164 | if filled_x_y is not None: 165 | dist_xx_sq += euclidean_sq_distances(filled_x_y / lenscales, 166 | filled_x_y / lenscales) 167 | dist_xz_sq += euclidean_sq_distances(filled_x_y / lenscales, 168 | filled_z_y / lenscales) 169 | logK = -0.5 * dist_xx_sq 170 | logK_new = -0.5 * dist_xz_sq 171 | 172 | #x2 = self.cond_x(s_x, s_params) 173 | #z2 = self.cond_x(s_z, s_params) 174 | #logK = self._logK_of_dist( 175 | #euclidean_sq_distances(x2, x2), s_params, True) 176 | #logK_new = self._logK_of_dist( 177 | #euclidean_sq_distances(x2, z2), s_params, False) 178 | return logK, logK_new 179 | -------------------------------------------------------------------------------- /hp_gpsmbo/scrap.py: -------------------------------------------------------------------------------- 1 | if 1: 2 | keyfunc = lambda nc: nc[1]['node'].name 3 | hps_by_type = dict() 4 | idxs_by_type = dict() 5 | kerns = [] 6 | for distname, labels_hps in groupby(sorted(self.config.items(), 7 | key=keyfunc), 8 | keyfunc): 9 | label_list, hp_list = zip(*list(labels_hps)) 10 | hps_by_type[distname] = hp_list 11 | idxs_by_type[distname] = map(self.hps.index, label_list) 12 | foo = hps_by_type[distname] 13 | print distname, len(foo), idxs_by_type[distname] 14 | kerns.append(ph['kernel']) 15 | 16 | param_helper = ParamHelper(self.config) 17 | 18 | x_bounds = [(None, None)] * len(self.hps) 19 | ndim_offset = 0 20 | for hpname in self.hps: 21 | ph = self.param_helpers[hpname] = param_helper(hpname) 22 | 23 | import sys 24 | sys.exit() 25 | else: 26 | 27 | 28 | 29 | 30 | class ConvexMixtureKernel(object): 31 | """ 32 | 33 | Attributes: 34 | 35 | kernels - 36 | element_ranges - each kernel looks at these elements (default ALL) 37 | feature_names - 38 | raw_coefs - 39 | coefs - 40 | 41 | """ 42 | def __init__(self, **kwargs): 43 | self.__dict__.update(kwargs) 44 | 45 | def __str__(self): 46 | coefs = self.coefs_f() 47 | ks = [str(k) for k in self.kernels] 48 | return 'ConvexMixtureKernel{%s}'%(','.join(['%s*%s'%(str(c),s) for c,s in zip(coefs, ks)])) 49 | 50 | def summary(self): 51 | import StringIO 52 | ss = StringIO.StringIO() 53 | coefs = self.coefs_f() 54 | print >> ss, "ConvexMixtureKernel:" 55 | for c, k,fname in zip(coefs,self.kernels, self.feature_names): 56 | print >> ss, " %f * %s '%s'" %(c, str(k), fname) 57 | return ss.getvalue() 58 | 59 | @classmethod 60 | def alloc(cls, kernels, 
coefs=None, element_ranges=None, feature_names=None): 61 | if coefs is None: 62 | raw_coefs = theano.shared(np.zeros(len(kernels))) 63 | print "HAAACK" 64 | raw_coefs.get_value(borrow=True)[0] += 1 65 | else: 66 | raise NotImplementedError() 67 | coefs=TT.nnet.softmax(raw_coefs.dimshuffle('x',0))[0] 68 | coefs_f = theano.function([], coefs) 69 | return cls( 70 | kernels=kernels, 71 | coefs=coefs, 72 | coefs_f = coefs_f, #DEBUG 73 | raw_coefs = raw_coefs, 74 | element_ranges=element_ranges, 75 | feature_names = feature_names, 76 | ) 77 | 78 | def params(self): 79 | rval = [self.raw_coefs] 80 | for k in self.kernels: 81 | rval.extend(k.params()) 82 | return rval 83 | def param_bounds(self): 84 | rval = [(self.raw_coefs_min, self.raw_coefs_max)] 85 | for k in self.kernels: 86 | rval.extend(k.param_bounds()) 87 | return rval 88 | 89 | def K(self, x, y): 90 | # get the kernel matrix from each sub-kernel 91 | if self.element_ranges is None: 92 | Ks = [kernel.K(x,y) for kernel in self.kernels] 93 | else: 94 | assert len(self.element_ranges) == len(self.kernels) 95 | Ks = [kernel.K(x[:,er[0]:er[1]],y[:,er[0]:er[1]]) 96 | for (kernel,er) in zip(self.kernels, self.element_ranges)] 97 | # stack them up 98 | Kstack = TT.stack(*Ks) 99 | # multiply by coefs 100 | # and sum down to one kernel 101 | K = TT.sum(self.coefs.dimshuffle(0,'x','x') * Kstack, 102 | axis=0) 103 | return K 104 | 105 | 106 | 107 | class Exp(SqExp): 108 | """ 109 | K(x,y) = exp(- ||x-y|| / l) 110 | 111 | """ 112 | 113 | def __init__(self, **kwargs): 114 | self.__dict__.update(kwargs) 115 | if self.log_lenscale.ndim!=0: 116 | raise TypeError('log_lenscale must be scalar', self.log_lenscale) 117 | 118 | def __str__(self): 119 | l = np.exp(self.log_lenscale.value) 120 | return "ExponentialKernel{l=%s}"%str(l) 121 | 122 | @classmethod 123 | def alloc(cls, l=1, l_min=1e-4, l_max=1000): 124 | log_l = np.log(l) 125 | log_lenscale = theano.shared(log_l) 126 | if l_min is None: 127 | log_lenscale_min = None 128 | else: 129 | log_lenscale_min = np.log(2*(l_min**2)) 130 | if l_max is None: 131 | log_lenscale_max = None 132 | else: 133 | log_lenscale_max = np.log(2*(l_max**2)) 134 | return cls(log_lenscale=log_lenscale, 135 | log_lenscale_min=log_lenscale_min, 136 | log_lenscale_max=log_lenscale_max) 137 | 138 | def params(self): 139 | return [self.log_lenscale] 140 | 141 | def param_bounds(self): 142 | return [(self.log_lenscale_min, self.log_lenscale_max)] 143 | 144 | def K(self, x, y): 145 | l = TT.exp(self.log_lenscale) 146 | d = ((x**2).sum(axis=1).dimshuffle(0,'x') 147 | + (y**2).sum(axis=1) 148 | - 2 * TT.dot(x, y.T)) 149 | K = TT.exp(-TT.sqrt(d)/l) 150 | return K 151 | 152 | 153 | class CategoryKernel(object): 154 | """ 155 | K(x,y) is 1 if x==y else exp(-1/l) 156 | 157 | The idea is that it's like a SquaredExponentialKernel 158 | where every point is a distance of 1 from every other one, 159 | except itself. 
160 | 161 | Attributes: 162 | 163 | l - 164 | 165 | """ 166 | def __init__(self, **kwargs): 167 | self.__dict__.update(kwargs) 168 | if self.l.ndim!=0: 169 | raise TypeError('log_denom must be scalar', self.l) 170 | def lenscale(self, thing=None): 171 | if thing is None: 172 | thing = self.l 173 | return value(thing) 174 | def __str__(self): 175 | l = self.lenscale() 176 | (a,b), = self.param_bounds() 177 | return "CategoryKernel{l=%s,bounds=(%s,%s)}"%( 178 | str(l), str(a), str(b)) 179 | 180 | @classmethod 181 | def alloc(cls, l=1.0, l_min=1e-5, l_max=100.): 182 | l = theano.shared(l) 183 | return cls(l=l, 184 | l_min=l_min, 185 | l_max=l_max, 186 | ) 187 | 188 | def params(self): 189 | return [self.l] 190 | def param_bounds(self): 191 | return [(self.l_min, self.l_max)] 192 | 193 | def K(self, x, y): 194 | xx = x.reshape((x.shape[0],)) 195 | yy = y.reshape((y.shape[0],)) 196 | xx = xx.dimshuffle(0,'x') # drop cols because there should only be 1 197 | yy = yy.dimshuffle(0) # drop cols because there should only be 1 198 | K = TT.exp(-TT.neq(xx,yy)/self.l) 199 | return K 200 | 201 | 202 | 203 | class GPR_HMC_for_SGD_EI_OPT(object): 204 | def __init__(self): 205 | # ... 206 | 207 | self.s_EI_pts = theano.shared(np.zeros((2, 2))) 208 | self.s_EI_vals = theano.shared(np.zeros(2)) 209 | self.s_EI_step = theano.tensor.dscalar('EI_step') 210 | self.s_EI_thresh = theano.shared(0.0) 211 | 212 | s_mean_x, s_var_x, s_x = self.kernel.s_mean_var( 213 | self.s_X, 214 | self.s_y, 215 | self.s_var_y, 216 | self.s_emp_var, 217 | self.positions[0], 218 | self.s_var_min, 219 | x_new=self.s_EI_pts) 220 | s_logEI = s_normal_logEI( 221 | - self.s_EI_thresh, 222 | - (s_mean_x + self.s_emp_mean), 223 | s_var_x, 224 | quad_approx=True) 225 | print 'compiling update_EI_pts fn' 226 | self.update_EI_pts = theano.function( 227 | [self.s_EI_step], 228 | [], 229 | updates=[ 230 | (self.s_EI_pts, TT.clip( 231 | self.s_EI_pts + self.s_EI_step * TT.grad(s_logEI.sum(), 232 | self.s_EI_pts), 233 | np.asarray(bounds)[:, 0], 234 | np.asarray(bounds)[:, 1])), 235 | (self.s_EI_vals, 0.95 * self.s_EI_vals + .05 * s_logEI), 236 | ], 237 | allow_input_downcast=True) 238 | 239 | def fit_and_optimize_EI(self, X, y, var_y, debug, ion, 240 | EI_pts): 241 | print 'setting up' 242 | self.s_emp_mean.set_value(np.mean(y)) 243 | self.s_emp_var.set_value(max(np.var(y), np.min(var_y))) 244 | self.s_X.set_value(X) 245 | self.s_y.set_value(y - self.s_emp_mean.get_value()) 246 | self.s_var_y.set_value(var_y + np.zeros(len(y))) 247 | self.s_EI_pts.set_value(EI_pts) 248 | self.s_EI_vals.set_value(np.zeros(len(EI_pts))) 249 | self.s_EI_thresh.set_value(np.min(y)) 250 | 251 | samples = [] 252 | nlls = [] 253 | costs = [] 254 | t0 = time.time() 255 | hmc_duration = 10.0 # seconds 256 | print 'running the sampler' 257 | while time.time() < (t0 + hmc_duration): 258 | try: 259 | tt = time.time() - t0 260 | pos = self.sampler.draw() 261 | self.update_EI_pts(.003 * min(1, 1. 
/ (.1 + tt))) 262 | samples.append(pos.flatten()) 263 | if debug: 264 | nll_ii, cost_ii = self.nll_fn(pos.flatten()) 265 | #print s_EI_vals.get_value() 266 | print 'best_EI', self.s_EI_vals.get_value().min() 267 | print 'current position', pos.flatten(), 268 | print 'accept rate', self.sampler.avg_acceptance_rate.get_value(), 269 | print 'nll', nll_ii, 'cost', cost_ii 270 | nlls.append(nll_ii) 271 | costs.append(cost_ii) 272 | except ValueError, e: 273 | # -- XXX should not happen 274 | print 'ERROR: HMC crashed after %i draws' % len(samples) 275 | raise 276 | break 277 | 278 | except np.linalg.LinAlgError, e: 279 | print 'ERROR: HMC singular matrix after %i draws' % len(samples) 280 | break 281 | samples = np.asarray(samples) 282 | print 'hmc drew', len(samples) 283 | step = max(1, len(samples) // 10) 284 | keep = samples[::step] 285 | if keep.size == 0: 286 | raise NotImplementedError() 287 | 288 | if debug: 289 | import matplotlib.pyplot as plt 290 | if ion: 291 | plt.figure(2) 292 | if self.kernel.n_params == 1: 293 | plt.subplot(211) 294 | plt.cla() 295 | plt.hist(np.asarray(samples).flatten()) 296 | plt.title('nlls observed during sampling') 297 | plt.subplot(212) 298 | plt.cla() 299 | plt.scatter(samples, nlls, label='nll', c='b') 300 | plt.scatter(samples, costs, label='cost', c='g') 301 | plt.title('nlls vs. alpha') 302 | plt.legend() 303 | if self.kernel.n_params == 2: 304 | plt.cla() 305 | plt.scatter(samples[:, 0], samples[:, 1]) 306 | if ion: 307 | plt.draw() 308 | else: 309 | plt.show() 310 | 311 | class Res(object): 312 | pass 313 | 314 | rval = Res() 315 | best_idx = np.argmax(self.s_EI_vals.get_value()) 316 | rval.x = self.s_EI_pts.get_value()[best_idx] 317 | rval.fun = self.s_EI_vals.get_value()[best_idx] 318 | 319 | self._params_list = keep 320 | self._params_weights = np.ones(len(keep)) / len(keep) 321 | return rval 322 | 323 | class LengthscaleBounds(object): 324 | def __init__(self, config): 325 | self.config = config 326 | 327 | def LU0(self, name): 328 | node = self.config[name]['node'] 329 | return getattr(self, node.name)(node) 330 | 331 | def randint(self, node): 332 | return 0.001, 2.0, 1.0 333 | 334 | def categorical(self, node): 335 | return 0.001, 2.0, 1.0 336 | 337 | def uniform(self, node): 338 | low = float(node.arg['low'].obj) 339 | high = float(node.arg['high'].obj) 340 | thetaL = (high - low) / 20.0 341 | thetaU = (high - low) * 2. 342 | return thetaL, thetaU, (high - low) / 2 343 | 344 | def quniform(self, node): 345 | # -- quantization is irrelevant 346 | return self.uniform(node) 347 | 348 | def loguniform(self, node): 349 | # -- log-scaling has been handled by feature code 350 | return self.uniform(node) 351 | 352 | def qloguniform(self, node): 353 | # -- log-scaling has been handled by feature code 354 | # quantization is irrelevant 355 | return self.uniform(node) 356 | 357 | def normal(self, node): 358 | sigma = float(node.arg['sigma'].obj) 359 | thetaL = sigma / 20.0 360 | thetaU = sigma * 2. 
361 | return thetaL, thetaU, sigma 362 | 363 | def qnormal(self, node): 364 | # -- quantization is irrelevant 365 | return self.normal(node) 366 | 367 | def lognormal(self, node): 368 | # -- log-scaling has been handled by feature code 369 | return self.normal(node) 370 | 371 | def qlognormal(self, node): 372 | # -- log-scaling has been handled by feature code 373 | # quantization is irrelevant 374 | return self.normal(node) 375 | 376 | 377 | import numpy as np 378 | 379 | import theano 380 | import theano.tensor as TT 381 | from hyperopt import rand 382 | 383 | from . import gpr_math 384 | from .hpsuggest import SuggestBest, DomainGP 385 | 386 | 387 | class DomainGP_LUCB(DomainGP): 388 | _optimism = 1.0 389 | _sigmoid_bias = -0.0 390 | 391 | def init_cost_fns(self): 392 | try: 393 | self._cost_fn 394 | except AttributeError: 395 | s_optimism = TT.dscalar('optimism') 396 | s_ubound = TT.dscalar('ubound') 397 | s_lbound = TT.dscalar('lbound') 398 | 399 | # s_mean_x means "symbolic mean of x" 400 | s_mean_x, s_var_x, s_x, K_new = self.gpr.kernel.s_mean_var( 401 | self.gpr.s_X, 402 | self.gpr.s_y, 403 | self.gpr.s_var_y, 404 | self.gpr.s_emp_var, 405 | self.gpr.s_params, 406 | self.gpr.s_var_min, 407 | return_K_new=True) 408 | 409 | corrected_mean = s_mean_x + self.gpr.s_emp_mean 410 | # -- good vars are for maximizing, 411 | # in keeping with EI being about improving *over* thresh 412 | good_max = -s_lbound 413 | good_best_seen = -s_ubound 414 | good_mean = -corrected_mean 415 | good_var = s_var_x 416 | 417 | #scalar = 1.0 + s_optimism 418 | 419 | #z = (corrected_mean - s_lbound) / TT.sqrt(s_var_x) 420 | #acq = gpr_math.s_normal_EBI( 421 | # 0, 422 | # -(s_lbound - corrected_mean), 423 | # 0, 424 | # s_var_x) 425 | #s_cost = -(tradeoff * acq) - (1 - tradeoff) * corrected_mean * TT.erf(-z) 426 | 427 | 428 | if 1: # -- use LUCB 429 | #good_var = good_var * s_optimism ** 2 430 | lost_mass = gpr_math.s_normal_cdf(-good_max, 431 | -good_mean, 432 | good_var) 433 | gap = good_max - good_best_seen 434 | drop = s_optimism * gap 435 | #coef = 1. 
/ s_optimism 436 | EBI_ceil = TT.minimum( 437 | good_mean, 438 | good_max - drop) 439 | #coef * good_min + (1 - coef) * good_max) 440 | #max_ceil = good_max - (s_optimism - 1) * gap 441 | acq = ( 442 | EBI_ceil 443 | + ( 444 | gpr_math.s_normal_EBI( 445 | EBI_ceil, good_max, 446 | EBI_ceil, good_var) / (1 - lost_mass))) 447 | #+ (good_max - good_mean) * )) 448 | 449 | elif 1: # -- use bounded EI type thing 450 | ebi_term = gpr_math.s_normal_EBI( 451 | good_min, good_max, 452 | good_mean, good_var) 453 | mass_above_good_max = gpr_math.s_normal_cdf( 454 | -good_max, -good_mean, good_var) 455 | acq = ebi_term + (good_max - good_min) * mass_above_good_max 456 | 457 | s_cost = -acq 458 | try: 459 | s_gx = TT.grad(s_cost.sum(), s_x) 460 | self._cost_deriv = theano.function( 461 | [s_x, self.gpr.s_params, s_optimism, s_ubound, s_lbound], 462 | [s_cost, s_gx], 463 | on_unused_input='warn') 464 | except theano.gradient.DisconnectedInputError: 465 | self._cost_deriv = None 466 | self._cost_fn = theano.function( 467 | [s_x, self.gpr.s_params, s_optimism, s_ubound, s_lbound], 468 | s_cost, 469 | on_unused_input='warn') 470 | self._K_new = theano.function( 471 | [s_x, self.gpr.s_params], K_new) 472 | 473 | def crit(self, X): 474 | self.init_cost_fns() 475 | if len(self.gpr._params_list) > 1: 476 | raise NotImplementedError() 477 | pp, = self.gpr._params_list 478 | return self._cost_fn(X, pp, self._optimism, self._ubound, self._lbound) 479 | 480 | def crit_deriv(self, X): 481 | if self._cost_deriv is None: 482 | raise NotImplementedError() 483 | self.init_cost_fns() 484 | if len(self.gpr._params_list) > 1: 485 | raise NotImplementedError() 486 | pp, = self.gpr._params_list 487 | return self._cost_deriv(X, pp, self._optimism, self._ubound, self._lbound) 488 | 489 | def optimize_over_X(self, n_buckshots, n_finetunes, rng): 490 | while True: 491 | rval_raw = DomainGP.optimize_over_X(self, 492 | n_buckshots, 493 | n_finetunes, 494 | rng, 495 | ret_raw=True) 496 | Ks = self._K_new(np.atleast_2d(rval_raw), self.gpr._params_list[0]) 497 | # XXX: todo, if other non-redundant local optima were discoverd by 498 | # the fine-tuning process then it might better to take them, 499 | # before distorting the utility landscape with this "optimism" 500 | # multiplier. I wonder if one is more "right" to do than the other 501 | if (Ks.max() > (1 - 1e-5)): 502 | if self._optimism < 1e8: 503 | self._optimism *= 2 504 | print 'LUCB raising optimism to', self._optimism 505 | else: 506 | print "LUCB error finding new point!" 
507 | else: 508 | break 509 | best_pt = self.best_pt_from_featurevec(rval_raw) 510 | return best_pt 511 | 512 | 513 | _suggest_domain_cache = {} 514 | def suggest(new_ids, domain, trials, seed, 515 | warmup_cutoff=1, 516 | n_buckshots=10000, 517 | n_finetunes=50, 518 | best_possible=-np.inf, 519 | #best_headroom=1.0, 520 | stop_at=None, 521 | plot_contours=None, 522 | ): 523 | """ 524 | Parameters 525 | ---------- 526 | 527 | """ 528 | if stop_at is not None and stop_at < best_possible: 529 | raise ValueError( 530 | ('If stop_at is specified' 531 | ', it (%f) must be greater than best_possible (%f)') % ( 532 | stop_at, best_possible)) 533 | 534 | if len(trials.trials) <= warmup_cutoff: 535 | return rand.suggest(new_ids, domain, trials, seed) 536 | 537 | try: 538 | dgp = _suggest_domain_cache[domain] 539 | except KeyError: 540 | dgp = _suggest_domain_cache[domain] = DomainGP_LUCB(domain) 541 | 542 | if stop_at is not None and min(trials.losses()) < stop_at: 543 | return [] 544 | 545 | X, y, var_y = dgp._X_y_var_y(trials) 546 | dgp.fit_gpr(X, y, var_y) 547 | dgp._optimism = 1.0 #0.5 * dgp._optimism 548 | dgp._ubound = np.min(y) 549 | dgp._lbound = best_possible 550 | 551 | #yy = y + np.sqrt(np.maximum(var_y, dgp.gpr.s_var_min.eval())) 552 | #dgp._ubound = np.min(yy) 553 | #max(opt_lbound, - best_headroom) 554 | #print 'LUCB interval:', dgp._lbound, dgp._ubound 555 | 556 | print 'LUCB: Best after %i trials: %f' % ( len(y), np.min(y)) 557 | #dgp.gpr._params_list[0][:] = 0 558 | rng = np.random.RandomState(seed) 559 | best_pt = dgp.optimize_over_X( 560 | n_buckshots=n_buckshots, 561 | n_finetunes=n_finetunes, 562 | rng=rng, 563 | ) 564 | if plot_contours: 565 | plot_contours(dgp, 2, dgp._lbound, best_pt) 566 | new_id, = new_ids 567 | #print 'LUCB: Best pt', best_pt 568 | return SuggestBest(domain, trials, seed, best_pt)(new_id) 569 | 570 | -------------------------------------------------------------------------------- /hp_gpsmbo/suggest_algos.py: -------------------------------------------------------------------------------- 1 | 2 | from .hpsuggest_ei import suggest as ei 3 | from .hpsuggest_ucb import suggest as ucb 4 | #from .hpsuggest_lucb import suggest as lucb 5 | -------------------------------------------------------------------------------- /hp_gpsmbo/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyperopt/hyperopt-gpsmbo/8009f82a18620b33faecca2382f973bc214bd88c/hp_gpsmbo/test/__init__.py -------------------------------------------------------------------------------- /hp_gpsmbo/test/test_branin.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import numpy as np 3 | import hyperopt 4 | from hyperopt.tests.test_domains import branin 5 | import hp_gpsmbo.hpsuggest 6 | 7 | def test_branin(suggest=hp_gpsmbo.hpsuggest.suggest, seed=1, iters=10): 8 | import matplotlib.pyplot as plt 9 | plt.ion() 10 | mins = [] 11 | all_ys = [] 12 | for ii in range(int(seed), int(seed) + int(iters)): 13 | print 'SEED', ii 14 | space = branin() 15 | trials = hyperopt.Trials() 16 | hyperopt.fmin( 17 | fn=lambda x: x, 18 | space=space.expr, 19 | trials=trials, 20 | algo=partial(suggest, stop_at=0.398), 21 | rstate=np.random.RandomState(ii), 22 | max_evals=50) 23 | plt.subplot(2, 1, 1) 24 | plt.cla() 25 | ys = trials.losses() 26 | all_ys.append(ys) 27 | for ys_jj in all_ys: 28 | plt.plot(ys_jj) 29 | plt.plot(trials.losses()) 30 | plt.subplot(2, 1, 2) 31 | 
plt.cla() 32 | for ys_jj in all_ys: 33 | plt.plot(ys_jj) 34 | plt.ylim(0, 1) 35 | plt.axhline(np.min(ys)) 36 | plt.annotate('min=%f' % np.min(ys), xy=(1, np.min(ys))) 37 | plt.draw() 38 | mins.append(min(ys)) 39 | print 'MINS', mins 40 | assert np.max(mins) < 0.398 41 | -------------------------------------------------------------------------------- /hp_gpsmbo/test/test_gpr.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from hp_gpsmbo import GPR_ML2, kernels 3 | 4 | #class Test1(unittest.TestCase): 5 | 6 | def test_prior_mean(GPR=GPR_ML2): 7 | # Test that the prior mean and prior variance are respected 8 | # in a simple case where there is just a single data point at 0. 9 | for prior_mean in (-5, 0, 5): 10 | for prior_var in (.1, 1): 11 | gpr = GPR(kernels.SqExp(1.0, 1e-4, 10, conditional=False), 12 | maxiter=1, 13 | prior_var=prior_var, 14 | prior_mean=prior_mean) 15 | gpr.fit([[0]], [1]) 16 | m, v = gpr.predict([[-10], [0], [10]], eval_MSE=True) 17 | assert np.allclose(m[0], prior_mean) 18 | assert np.allclose(m[1], 1) 19 | assert np.allclose(m[2], prior_mean) 20 | assert np.allclose(v[0], prior_var) 21 | assert np.allclose(v[1], 0) 22 | assert np.allclose(v[2], prior_var) 23 | 24 | 25 | def test_data_pts_respected(GPR=GPR_ML2): 26 | X = np.asarray([[-1], [0], [1.5]]) 27 | y = np.asarray([-4, 1, 0.5]) 28 | for prior_mean in (-5, 0, 5): 29 | for prior_var in (.1, 1): 30 | gpr = GPR(kernels.SqExp(1.0, 1e-4, 10, conditional=False), 31 | maxiter=1, 32 | prior_var=prior_var, 33 | prior_mean=prior_mean) 34 | gpr.fit(X, y) 35 | m, v = gpr.predict(X, eval_MSE=True) 36 | assert np.all(v < 1e-7) 37 | assert np.allclose(m, y) 38 | 39 | 40 | # -- flake8 eof 41 | -------------------------------------------------------------------------------- /hp_gpsmbo/test/test_gpr_math.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from hp_gpsmbo import gpr_math 3 | import theano 4 | import scipy.stats 5 | 6 | 7 | def test_normal_pdf(): 8 | rng = np.random.RandomState(123) 9 | norm = scipy.stats.norm 10 | 11 | N = 50 12 | x = rng.randn(N) 13 | mean = rng.randn(N) 14 | var = rng.randn(N) ** 2 15 | 16 | s_x, s_m, s_v = theano.tensor.dvectors('xmv') 17 | 18 | fn = theano.function([s_x, s_m, s_v], 19 | gpr_math.s_normal_pdf(s_x, s_m, s_v)) 20 | 21 | 22 | assert np.allclose(norm.pdf(x, mean, np.sqrt(var)), 23 | fn(x, mean, var)) 24 | 25 | 26 | def test_normal_logpdf(): 27 | rng = np.random.RandomState(123) 28 | norm = scipy.stats.norm 29 | 30 | N = 50 31 | x = rng.randn(N) * 10 - 50 32 | mean = rng.randn(N) 33 | var = rng.randn(N) ** 2 34 | 35 | s_x, s_m, s_v = theano.tensor.dvectors('xmv') 36 | 37 | logfn = theano.function([s_x, s_m, s_v], 38 | gpr_math.s_normal_logpdf(s_x, s_m, s_v)) 39 | 40 | assert np.allclose(norm.logpdf(x, mean, np.sqrt(var)), 41 | logfn(x, mean, var)) 42 | 43 | 44 | def test_normal_cdf(): 45 | rng = np.random.RandomState(123) 46 | norm = scipy.stats.norm 47 | 48 | N = 50 49 | x = rng.randn(N) 50 | mean = rng.randn(N) 51 | var = rng.randn(N) ** 2 52 | 53 | #x = np.sort(x) 54 | #mean = np.zeros(N) 55 | #var = np.ones(N) 56 | 57 | s_x, s_m, s_v = theano.tensor.dvectors('xmv') 58 | 59 | fn = theano.function([s_x, s_m, s_v], 60 | gpr_math.s_normal_cdf(s_x, s_m, s_v)) 61 | myval = fn(x, mean, var) 62 | spval = norm.cdf(x, mean, np.sqrt(var)) 63 | for xi, myv, spv in zip(x, myval, spval): 64 | print xi, 'my', myv, 'sp', spv, 'diff', (myv - spv) 65 | 66 | assert 
np.allclose(norm.cdf(x, mean, np.sqrt(var)), 67 | fn(x, mean, var)) 68 | 69 | def test_normal_logcdf(): 70 | rng = np.random.RandomState(123) 71 | norm = scipy.stats.norm 72 | 73 | N = 50 74 | x = rng.randn(N) * 200 75 | mean = rng.randn(N) 76 | var = rng.randn(N) ** 2 77 | 78 | #mean = np.zeros(N) 79 | #var = np.ones(N) 80 | #x = np.sort(x) 81 | 82 | s_x, s_m, s_v = theano.tensor.dvectors('xmv') 83 | 84 | lcdf = gpr_math.s_normal_logcdf(s_x, s_m, s_v) 85 | 86 | fn = theano.function([s_x, s_m, s_v], lcdf) 87 | 88 | myval= fn(x, mean, var) 89 | spval = norm.logcdf(x, mean, np.sqrt(var)) 90 | for xi, myv, spv in zip(x, myval, spval): 91 | print xi, 'my', myv, 'sp', spv, 'diff', (myv - spv) 92 | assert np.allclose(norm.logcdf(x, mean, np.sqrt(var)), 93 | myval) 94 | 95 | 96 | def test_normal_logEI(): 97 | #rng = np.random.RandomState(123) 98 | 99 | N = 2000 100 | thresh = np.linspace(-10, 50, N) 101 | #N = 100 102 | #thresh = np.linspace(37, 38, N) 103 | mean = thresh * 0 104 | var = thresh * 0 + 1 105 | 106 | s_t, s_m, s_v = theano.tensor.dvectors('tmv') 107 | 108 | fn = theano.function([s_t, s_m, s_v], 109 | gpr_math.s_normal_logEI(s_t, s_m, s_v)) 110 | 111 | if 0: 112 | #print zip(thresh, fn(thresh, mean, var)) 113 | #print 114 | a = theano.tensor.dvector() 115 | y = s_t ** 2 * a[2] + s_t * a[1] + a[0] 116 | cost = ((y - gpr_math.s_normal_logEI(s_t, s_m, s_v)) ** 2).sum() 117 | da = theano.grad(cost, a) 118 | foo = theano.function([a, s_t, s_m, s_v], [cost, da]) 119 | res = scipy.optimize.minimize(foo, [0, -1, -1], jac=True, 120 | args=(thresh, mean, var), 121 | method='L-BFGS-B') 122 | print res.x 123 | 124 | from hyperopt.criteria import logEI_gaussian 125 | if 0: 126 | import matplotlib.pyplot as plt 127 | y = fn(thresh, mean, var) 128 | z = logEI_gaussian(mean, var, thresh) 129 | plt.plot(thresh, y) 130 | plt.plot(thresh, z) 131 | plt.show() 132 | 133 | # -- the gpr_math logEI uses a quadratic approximation for very 134 | # hopeless points, which gives the right derivative, but the 135 | # slightly wrong value 136 | assert np.allclose(logEI_gaussian(mean, var, thresh), 137 | fn(thresh, mean, var), 138 | atol=1e-3, rtol=1e-4) 139 | 140 | if 0: 141 | d_t = theano.grad(gpr_math.s_normal_logEI(s_t, s_m, s_v).sum(), s_t) 142 | d_fn = theano.function([s_t, s_m, s_v], d_t) 143 | 144 | import matplotlib.pyplot as plt 145 | plt.plot(thresh, d_fn(thresh, mean, var)) 146 | plt.show() 147 | 148 | 149 | def test_logEBI(): 150 | 151 | def EBI_from_sample(sample, l, u): 152 | sample = sample - l 153 | sample[sample < 0] = 0 154 | sample[sample > (u - l)] = 0 155 | return sample.mean() 156 | 157 | def normal_EBI_numeric(l, u, m, sigma, N, rng): 158 | return EBI_from_sample(rng.randn(N) * sigma + m, l, u) 159 | 160 | def normal_EBI_analytic(l, u, m, sigma): 161 | from scipy.stats import norm 162 | from hyperopt.criteria import EI_gaussian 163 | EI_l = EI_gaussian(m, sigma ** 2, l) 164 | EI_u = EI_gaussian(m, sigma ** 2, u) 165 | term = (l - u) * norm.cdf((m - u) / sigma) 166 | return EI_l - EI_u + term 167 | 168 | s_l, s_u, s_m, s_sigma = theano.tensor.dscalars('lums') 169 | s_EBI = gpr_math.s_normal_EBI(s_l, s_u, s_m, s_sigma ** 2) 170 | normal_EBI_theano = theano.function([s_l, s_u, s_m, s_sigma], s_EBI) 171 | 172 | 173 | def assert_match(l, u, m, sigma, N=100000, seed=123): 174 | l, u, m, sigma = map(float, (l, u, m, sigma)) 175 | num = normal_EBI_numeric(l, u, m, sigma, N, np.random.RandomState(seed)) 176 | ana = normal_EBI_analytic(l, u, m, sigma) 177 | thn = normal_EBI_theano(l, u, m, sigma) 178 
| if not np.allclose(num, ana, atol=0.01, rtol=.01): 179 | print 'test_EBI mismatch', l, u, m, sigma, '->', num, ana 180 | assert 0 181 | if not np.allclose(thn, ana, atol=0.0001, rtol=.0001): 182 | print 'test_EBI theano mismatch', l, u, m, sigma, '->', thn, ana 183 | assert 0 184 | 185 | assert_match(0, 100, 0, 1) 186 | assert_match(0, 0.2, 0, 1) 187 | assert_match(0, 1.2, 0, 1) 188 | assert_match(0, 100, 0.5, 1.5) 189 | assert_match(0, 0.2, 0.5, 1.5) 190 | assert_match(0, 1.2, 0.5, 1.5) 191 | 192 | 193 | # -- eof flake8 194 | -------------------------------------------------------------------------------- /hp_gpsmbo/test/test_har6.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import numpy as np 3 | 4 | import hyperopt 5 | from hyperopt import hp 6 | from hypertree import har6 7 | 8 | import hp_gpsmbo.hpsuggest 9 | 10 | def test_har6(suggest=hp_gpsmbo.hpsuggest.suggest, seed=1, iters=10): 11 | # -- see shovel/hps.py for this test with debugging scaffolding 12 | # run it by typing e.g. 13 | # 14 | # shovel hps.run_har6 --seed=9 15 | # 16 | # That should do a run that fails by only getting to -3.2 17 | mins = [] 18 | for ii in range(int(seed), int(seed) + int(iters)): 19 | print 'SEED', ii 20 | space = { 21 | 'a': hp.uniform('a', 0, 1), 22 | 'b': hp.uniform('b', 0, 1), 23 | 'c': hp.uniform('c', 0, 1), 24 | 'x': hp.uniform('x', 0, 1), 25 | 'y': hp.uniform('y', 0, 1), 26 | 'z': hp.uniform('z', 0, 1), 27 | } 28 | trials = hyperopt.Trials() 29 | hyperopt.fmin( 30 | fn=har6.har6, 31 | space=space, 32 | trials=trials, 33 | algo=partial(suggest, stop_at=-3.32), 34 | rstate=np.random.RandomState(ii), 35 | max_evals=100) 36 | mins.append(min(trials.losses())) 37 | 38 | assert np.sum(np.asarray(mins) > -3.32) < 3 39 | 40 | # XXX ideally this sum should be 0, but our optimizer 41 | # isn't that good :( 42 | 43 | -------------------------------------------------------------------------------- /hp_gpsmbo/test/test_hpsuggest.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import unittest 3 | import numpy as np 4 | from hyperopt import rand 5 | from hyperopt import Trials, fmin 6 | 7 | from hyperopt.tests.test_domains import CasePerDomain 8 | from hp_gpsmbo import suggest_algos 9 | 10 | def passthrough(x): 11 | return x 12 | 13 | class TestSmoke(unittest.TestCase, CasePerDomain): 14 | def work(self): 15 | fmin( 16 | fn=passthrough, 17 | space=self.bandit.expr, 18 | algo=partial(suggest_algos.ei, 19 | warmup_cutoff=3), 20 | max_evals=10) 21 | 22 | 23 | class TestOpt(unittest.TestCase, CasePerDomain): 24 | # -- these thresholds are pretty low 25 | # but they are set so that random does not pass them 26 | # (at least, probably not) 27 | thresholds = dict( 28 | quadratic1=1e-5, 29 | q1_lognormal=0.0002, 30 | distractor=-2.0, 31 | gauss_wave=-2.8, 32 | gauss_wave2=-2.20, 33 | n_arms=-3.0, 34 | many_dists=-1., 35 | branin=0.5, 36 | ) 37 | 38 | LEN = dict( 39 | # -- running a long way out tests overflow/underflow 40 | # to some extent 41 | twoarms=15, 42 | gausswave=50, 43 | quadratic1=1000, 44 | many_dists=200, 45 | distractor=35, 46 | #q1_lognormal=100, 47 | branin=200, 48 | ) 49 | 50 | def setUp(self): 51 | self.olderr = np.seterr('raise') 52 | np.seterr(under='ignore') 53 | 54 | def tearDown(self, *args): 55 | np.seterr(**self.olderr) 56 | 57 | def work(self): 58 | np.random.seed(1234) 59 | bandit = self.bandit 60 | LEN = self.LEN.get(bandit.name, 100) 61 | thresh = 
self.thresholds[bandit.name] 62 | 63 | print 'STARTING TEST', bandit.name 64 | rtrials = Trials() 65 | fmin(fn=passthrough, 66 | space=self.bandit.expr, 67 | trials=rtrials, 68 | algo=rand.suggest, 69 | max_evals=LEN, 70 | rstate=np.random) 71 | print 'RANDOM BEST 6:', list(sorted(rtrials.losses()))[:6] 72 | 73 | if bandit.name != 'n_arms': 74 | # -- assert that our threshold is meaningful 75 | assert min(rtrials.losses()) > thresh 76 | 77 | assert bandit.name is not None 78 | algo = partial( 79 | suggest_algos.ei, 80 | stop_at=self.thresholds[bandit.name]) 81 | 82 | trials = Trials() 83 | fmin(fn=passthrough, 84 | space=self.bandit.expr, 85 | trials=trials, 86 | algo=algo, 87 | max_evals=LEN, 88 | rstate=np.random) 89 | assert len(trials) <= LEN 90 | 91 | 92 | if 0: 93 | plt.subplot(2, 2, 1) 94 | plt.scatter(range(LEN), trials.losses()) 95 | plt.title('TPE losses') 96 | plt.subplot(2, 2, 2) 97 | plt.scatter(range(LEN), ([s['x'] for s in trials.specs])) 98 | plt.title('TPE x') 99 | plt.subplot(2, 2, 3) 100 | plt.title('RND losses') 101 | plt.scatter(range(LEN), rtrials.losses()) 102 | plt.subplot(2, 2, 4) 103 | plt.title('RND x') 104 | plt.scatter(range(LEN), ([s['x'] for s in rtrials.specs])) 105 | plt.show() 106 | if 0: 107 | plt.hist( 108 | [t['x'] for t in self.experiment.trials], 109 | bins=20) 110 | 111 | 112 | #print trials.losses() 113 | print 'SUGGEST BEST 6:', list(sorted(trials.losses()))[:6] 114 | #logx = np.log([s['x'] for s in trials.specs]) 115 | #print 'TPE MEAN', np.mean(logx) 116 | #print 'TPE STD ', np.std(logx) 117 | print 'Thresh', thresh 118 | assert min(trials.losses()) < thresh 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /hp_gpsmbo/test/test_kernels.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from hp_gpsmbo import GPR, SqExp, Product 3 | 4 | 5 | def test_lenscale_wider(): 6 | # Smoke test that changing lenscale changes fit 7 | pass 8 | 9 | 10 | def test_product_smoke(): 11 | X = np.random.randn(10, 2) 12 | y = np.random.randn(10) 13 | model = GPR( 14 | Product( 15 | [SqExp(), SqExp()], 16 | [slice(0, 1), slice(1, 2)]), 17 | ) 18 | model.fit(X, y) 19 | 20 | 21 | -------------------------------------------------------------------------------- /hp_gpsmbo/test/test_normal_log_EI.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from theano.gradient import verify_grad 3 | import theano.tensor 4 | from hp_gpsmbo.op_Kcond import normal_logEI_diff_sigma 5 | from hp_gpsmbo.op_Kcond import normal_logEI_diff_sigma_elemwise 6 | from hyperopt.criteria import logEI_gaussian 7 | 8 | def test_normal_logEI(): 9 | rng = np.random.RandomState(123) 10 | 11 | N = 2000 12 | thresh = np.linspace(-50, 500, N) 13 | #N = 100 14 | #thresh = np.linspace(37, 38, N) 15 | mean = thresh * 0 16 | var = 1e-1 + rng.rand(N) 17 | sigma = np.sqrt(var) 18 | 19 | s_t, s_m, s_v = theano.tensor.dvectors('tmv') 20 | 21 | fn = theano.function([s_t, s_m, s_v], 22 | normal_logEI_diff_sigma(s_t - s_m, 23 | theano.tensor.sqrt(s_v))) 24 | 25 | my = fn(thresh, mean, var) 26 | ref = logEI_gaussian(mean, var, thresh) 27 | for xi, myv, spv in zip(thresh, my, ref): 28 | print xi, 'my', myv, 'sp', spv, 'diff', (myv - spv) 29 | 30 | assert np.any(thresh / sigma > 34) 31 | assert np.all(np.isfinite(my)) 32 | assert np.allclose(my[thresh/sigma < 34], ref[thresh/sigma < 34]) 33 | assert np.allclose(my, ref, rtol=.1) 34 | 35 | 36 | def 
explore_grad(): 37 | N = 15 38 | ubound = 6e2 39 | diff = np.ones(N) * 100 40 | rng = np.random.RandomState(123) 41 | #diff = np.linspace(0, ubound, N).astype('float64') 42 | #var = np.random.rand(N) * .1 + 1 #1e-8 + 43 | #var = np.ones(N) * .01 44 | var = np.exp(rng.randn(N) * 10) ** 2 45 | var = np.sort(var) 46 | 47 | s_d, s_v = theano.tensor.dvectors('dv') 48 | s_y = normal_logEI_diff_sigma(s_d, theano.tensor.sqrt(s_v)) 49 | s_gd, s_gv = theano.tensor.grad(s_y.sum(), [s_d, s_v]) 50 | 51 | fn = theano.function([s_d, s_v], [s_y, s_gd, s_gv]) 52 | 53 | eps = ubound / 1e8 # 1e1 # 1e-4 54 | y, gd, gv = fn(diff, var) 55 | y_eps, _, _ = fn(diff + eps, var) 56 | y_eps2, _, _ = fn(diff, var + eps) 57 | for di, yi, yi_eps, yi2, gdi, gvi in zip(diff, y, y_eps, y_eps2, gd, gv): 58 | print 'di %.6f\tyi:%.6f\tgi:%.6f\tref:%.6f\tgv:%s\tref:%s' % ( 59 | di, yi, gdi, (yi_eps - yi) / eps, gvi, (yi2 - yi) / eps 60 | ) 61 | 62 | def test_grad_arg0(): 63 | N = 50 64 | def f_arg01(x): 65 | return normal_logEI_diff_sigma(x, np.ones(1)) 66 | def f_arg0(x): 67 | return normal_logEI_diff_sigma(x, np.ones(N)) 68 | 69 | rng = np.random.RandomState(123) 70 | diffvec = (rng.rand(N) - .5) * 200 71 | 72 | verify_grad(f_arg01, [np.asarray([-50.])], rng=rng) 73 | verify_grad(f_arg01, [np.asarray([50.])], rng=rng) 74 | verify_grad(f_arg0, [diffvec], rng=rng, rel_tol=1e-3) 75 | 76 | def test_grad_arg1(): 77 | N = 50 78 | def f_arg1(x): 79 | return normal_logEI_diff_sigma(np.ones(N) * 100, 80 | x) 81 | rng = np.random.RandomState(123) 82 | #sigmavec = np.exp(np.linspace(N) * 10) 83 | sigmavec = np.linspace(.1, 10, N) 84 | 85 | verify_grad( f_arg1, [sigmavec], rng=rng, rel_tol=1e-3) 86 | 87 | def test_normal_logEI_elemwise(): 88 | rng = np.random.RandomState(123) 89 | 90 | N = 2000 91 | thresh = np.linspace(-50, 500, N) 92 | #N = 100 93 | #thresh = np.linspace(37, 38, N) 94 | mean = thresh * 0 95 | var = 1e-1 + rng.rand(N) 96 | sigma = np.sqrt(var) 97 | 98 | s_t, s_m, s_v = theano.tensor.dvectors('tmv') 99 | 100 | fn = theano.function([s_t, s_m, s_v], 101 | normal_logEI_diff_sigma_elemwise( 102 | s_t - s_m, 103 | theano.tensor.sqrt(s_v))) 104 | 105 | my = fn(thresh, mean, var) 106 | ref = logEI_gaussian(mean, var, thresh) 107 | for xi, myv, spv in zip(thresh, my, ref): 108 | print xi, 'my', myv, 'sp', spv, 'diff', (myv - spv) 109 | 110 | assert np.any(thresh / sigma > 34) 111 | assert np.all(np.isfinite(my)) 112 | assert np.allclose(my[thresh/sigma < 34], ref[thresh/sigma < 34]) 113 | assert np.allclose(my, ref, rtol=.1) 114 | 115 | def test_grad_arg0_elemwise(): 116 | N = 50 117 | def f_arg01(x): 118 | return normal_logEI_diff_sigma_elemwise(x, np.ones(1)) 119 | def f_arg0(x): 120 | return normal_logEI_diff_sigma_elemwise(x, np.ones(N)) 121 | 122 | rng = np.random.RandomState(123) 123 | diffvec = (rng.rand(N) - .5) * 200 124 | 125 | verify_grad(f_arg01, [np.asarray([-50.])], rng=rng) 126 | verify_grad(f_arg01, [np.asarray([50.])], rng=rng) 127 | verify_grad(f_arg0, [diffvec], rng=rng, rel_tol=1e-3) 128 | 129 | def test_grad_arg1_elemwise(): 130 | N = 50 131 | def f_arg1(x): 132 | return normal_logEI_diff_sigma_elemwise( 133 | np.ones(N) * 100, 134 | x) 135 | rng = np.random.RandomState(123) 136 | #sigmavec = np.exp(np.linspace(N) * 10) 137 | sigmavec = np.linspace(.1, 10, N) 138 | 139 | verify_grad( f_arg1, [sigmavec], rng=rng, rel_tol=1e-3) 140 | 141 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 
1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ distribute- and pip-enabled setup.py """ 5 | 6 | import logging 7 | import os 8 | import re 9 | 10 | # ----- overrides ----- 11 | 12 | # set these to anything but None to override the automatic defaults 13 | packages = None 14 | package_name = None 15 | package_data = None 16 | scripts = None 17 | # --------------------- 18 | 19 | 20 | # ----- control flags ----- 21 | 22 | # fallback to setuptools if distribute isn't found 23 | setup_tools_fallback = True 24 | 25 | # don't include subdir named 'tests' in package_data 26 | skip_tests = False 27 | 28 | # print some extra debugging info 29 | debug = True 30 | 31 | # ------------------------- 32 | 33 | if debug: logging.basicConfig(level=logging.DEBUG) 34 | # distribute import and testing 35 | try: 36 | import distribute_setup 37 | distribute_setup.use_setuptools() 38 | logging.debug("distribute_setup.py imported and used") 39 | except ImportError: 40 | # fallback to setuptools? 41 | # distribute_setup.py was not in this directory 42 | if not (setup_tools_fallback): 43 | import setuptools 44 | if not (hasattr(setuptools,'_distribute') and \ 45 | setuptools._distribute): 46 | raise ImportError("distribute was not found and fallback to setuptools was not allowed") 47 | else: 48 | logging.debug("distribute_setup.py not found, defaulted to system distribute") 49 | else: 50 | logging.debug("distribute_setup.py not found, defaulting to system setuptools") 51 | 52 | import setuptools 53 | 54 | def find_scripts(): 55 | return [s for s in setuptools.findall('scripts/') if os.path.splitext(s)[1] != '.pyc'] 56 | 57 | def package_to_path(package): 58 | """ 59 | Convert a package (as found by setuptools.find_packages) 60 | e.g. "foo.bar" to usable path 61 | e.g. 
"foo/bar" 62 | 63 | No idea if this works on windows 64 | """ 65 | return package.replace('.','/') 66 | 67 | def find_subdirectories(package): 68 | """ 69 | Get the subdirectories within a package 70 | This will include resources (non-submodules) and submodules 71 | """ 72 | try: 73 | subdirectories = os.walk(package_to_path(package)).next()[1] 74 | except StopIteration: 75 | subdirectories = [] 76 | return subdirectories 77 | 78 | def subdir_findall(dir, subdir): 79 | """ 80 | Find all files in a subdirectory and return paths relative to dir 81 | 82 | This is similar to (and uses) setuptools.findall 83 | However, the paths returned are in the form needed for package_data 84 | """ 85 | strip_n = len(dir.split('/')) 86 | path = '/'.join((dir, subdir)) 87 | return ['/'.join(s.split('/')[strip_n:]) for s in setuptools.findall(path)] 88 | 89 | def find_package_data(packages): 90 | """ 91 | For a list of packages, find the package_data 92 | 93 | This function scans the subdirectories of a package and considers all 94 | non-submodule subdirectories as resources, including them in 95 | the package_data 96 | 97 | Returns a dictionary suitable for setup(package_data=) 98 | """ 99 | package_data = {} 100 | for package in packages: 101 | package_data[package] = [] 102 | for subdir in find_subdirectories(package): 103 | if '.'.join((package, subdir)) in packages: # skip submodules 104 | logging.debug("skipping submodule %s/%s" % (package, subdir)) 105 | continue 106 | if skip_tests and (subdir == 'tests'): # skip tests 107 | logging.debug("skipping tests %s/%s" % (package, subdir)) 108 | continue 109 | package_data[package] += subdir_findall(package_to_path(package), subdir) 110 | return package_data 111 | 112 | # ----------- Override defaults here ---------------- 113 | if packages is None: packages = setuptools.find_packages() 114 | 115 | if len(packages) == 0: raise Exception("No valid packages found") 116 | 117 | if package_name is None: package_name = packages[0] 118 | 119 | if package_data is None: package_data = find_package_data(packages) 120 | 121 | if scripts is None: scripts = find_scripts() 122 | 123 | setuptools.setup( 124 | name = package_name, 125 | version = '0.0.1.dev', 126 | packages = packages, 127 | scripts = scripts, 128 | url = 'http://github.com:hyperopt/hp_gpsmbo/', 129 | author = 'James Bergstra', 130 | author_email = '', 131 | description = 'Gaussian Process Regression with Theano', 132 | long_description = '', 133 | classifiers = [ 134 | 'Development Status :: 3 - Alpha', 135 | 'Intended Audience :: Education', 136 | 'Intended Audience :: Science/Research', 137 | 'Intended Audience :: Developers', 138 | 'Environment :: Console', 139 | 'License :: OSI Approved :: LGPL3 License', 140 | 'Operating System :: MacOS :: MacOS X', 141 | 'Operating System :: Microsoft :: Windows', 142 | 'Operating System :: POSIX', 143 | 'Operating System :: Unix', 144 | 'Programming Language :: Python', 145 | 'Topic :: Scientific/Engineering', 146 | 'Topic :: Software Development', 147 | ], 148 | platforms = ['Linux', 'OS-X', 'Windows'], 149 | license = 'LGPL3', 150 | keywords = 'Theano machine learning', 151 | package_data = package_data, 152 | include_package_data = True, 153 | #install_requires = ['theano', 'scipy', 'numpy'], 154 | ) 155 | --------------------------------------------------------------------------------