├── LICENSE.txt ├── README.txt ├── hp_gpsmbo ├── __init__.py ├── __init__.pyc ├── gby.py ├── gpr.py ├── gpr_math.py ├── hmc.py ├── hpsuggest.py ├── hpsuggest_ei.py ├── hpsuggest_lucb.py ├── hpsuggest_ucb.py ├── kernels.py ├── kernels.pyc ├── kernels_base.py ├── op_Kcond.py ├── prodkernels.py ├── scrap.py ├── suggest_algos.py └── test │ ├── __init__.py │ ├── test_branin.py │ ├── test_gpr.py │ ├── test_gpr_math.py │ ├── test_har6.py │ ├── test_hpsuggest.py │ ├── test_kernels.py │ └── test_normal_log_EI.py └── setup.py /LICENSE.txt: -------------------------------------------------------------------------------- 1 | This software (theano-gpr) is may be used by anyone under the terms of the 2 | "Lesser GPL v. 3.0" 3 | 4 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | README 2 | 3 | -------------------------------------------------------------------------------- /hp_gpsmbo/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from kernels import SqExp, Product 3 | from gpr import GPR_ML2, GPR_HMC 4 | -------------------------------------------------------------------------------- /hp_gpsmbo/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyperopt/hyperopt-gpsmbo/8009f82a18620b33faecca2382f973bc214bd88c/hp_gpsmbo/__init__.pyc -------------------------------------------------------------------------------- /hp_gpsmbo/gby.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | def groupby(seq, key): 4 | tmp = OrderedDict() 5 | for ss in seq: 6 | tmp.setdefault(key(ss), []).append(ss) 7 | return tmp 8 | 9 | -------------------------------------------------------------------------------- /hp_gpsmbo/gpr.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import scipy.optimize 4 | import theano 5 | import theano.tensor as TT 6 | import theano.sandbox.rng_mrg 7 | from .gpr_math import s_normal_logEI 8 | from .hmc import HMC_sampler 9 | 10 | 11 | def raises(exc, fn, args): 12 | try: 13 | fn(*args) 14 | return False 15 | except exc: 16 | return True 17 | return False 18 | 19 | 20 | class GPR_Base(object): 21 | def __init__(self, kernel, 22 | maxiter=None, 23 | prior_var=None, 24 | prior_mean=None, 25 | warn_floatX=True, 26 | ): 27 | self.kernel = kernel 28 | self.maxiter = maxiter 29 | self.prior_var = prior_var 30 | self.prior_mean = prior_mean 31 | self.s_var_min = TT.as_tensor_variable(1e-8, name='s_var_min') 32 | self.s_emp_mean = theano.shared(0.0, name='s_emp_mean') 33 | self.s_emp_var = theano.shared(1.0, name='s_emp_var') 34 | self.s_X = theano.shared(np.zeros((2, 2)), name='s_X') 35 | self.s_y = theano.shared(np.zeros((2,)), name='s_y') 36 | self.s_var_y_raw = theano.shared(np.zeros(2,), name='s_var_y_raw') 37 | self.s_params = theano.tensor.dvector('params') 38 | self._logEI_cache = {} 39 | if theano.config.floatX != 'float64': 40 | raise TypeError('GPR requires floatX==float64') 41 | 42 | self.s_var_y = TT.maximum(self.s_var_y_raw, self.s_var_min) 43 | 44 | def set_emp_mean(self, y): 45 | if self.prior_mean is None: 46 | self.s_emp_mean.set_value(np.mean(y)) 47 | else: 48 | self.s_emp_mean.set_value(self.prior_mean) 49 | 50 | def set_emp_var(self, y, var_y): 51 | 
self.s_var_y_raw.set_value(np.zeros(len(y)) + var_y) 52 | if self.prior_var is None: 53 | self.s_emp_var.set_value(max(np.var(y), 54 | np.min(var_y), 55 | 1e-6)) 56 | else: 57 | self.s_emp_var.set_value(self.prior_var) 58 | 59 | def set_Xy(self, X, y): 60 | X_ = np.atleast_2d(X) 61 | self.s_X.set_value(X_) 62 | self.s_y.set_value(np.atleast_1d(y) - self.s_emp_mean.get_value()) 63 | return self.s_X, self.s_y 64 | 65 | def fit(self, X, y, var_y=0.0): 66 | self.set_emp_mean(y) 67 | self.set_emp_var(y, var_y) 68 | s_X, s_y = self.set_Xy(X, y) 69 | 70 | _, params, params0, _ = self.kernel.s_nll_params( 71 | X, y, 72 | var_y=var_y, 73 | prior_var=self.s_emp_var) 74 | 75 | self._params_list = [params0.copy()] 76 | self._params_weights = [1.0] 77 | 78 | 79 | def predict(self, x, eval_MSE=False): 80 | if eval_MSE: 81 | return self.mean_variance(x) 82 | else: 83 | return self.mean(x) 84 | 85 | def mean(self, x): 86 | """ 87 | Compute mean at points in x_new 88 | """ 89 | try: 90 | self._mean 91 | except AttributeError: 92 | s_mean_x, s_var_x, s_x = self.kernel.s_mean_var( 93 | self.s_X, 94 | self.s_y, 95 | self.s_var_y, 96 | self.s_emp_var, 97 | self.s_params, 98 | self.s_var_min) 99 | self._mean = theano.function( 100 | [s_x, self.s_params], 101 | s_mean_x + self.s_emp_mean, 102 | allow_input_downcast=True,) 103 | means = [self._mean(x, p) for p in self._params_list] 104 | weights = self._params_weights 105 | return np.dot(weights, means) 106 | 107 | def mean_variance(self, x): 108 | """ 109 | Compute mean and variance at points in x_new 110 | """ 111 | try: 112 | self._mean_variance 113 | except AttributeError: 114 | s_mean_x, s_var_x, s_x = self.kernel.s_mean_var( 115 | self.s_X, 116 | self.s_y, 117 | self.s_var_y, 118 | self.s_emp_var, 119 | self.s_params, 120 | self.s_var_min) 121 | self._mean_variance = theano.function( 122 | [s_x, self.s_params], 123 | [s_mean_x + self.s_emp_mean, s_var_x], 124 | allow_input_downcast=True,) 125 | means, variances = zip(*[ 126 | self._mean_variance(x, p) for p in self._params_list]) 127 | weights = self._params_weights 128 | mean = np.dot(weights, means) 129 | variance = np.dot(weights, variances) 130 | return mean, variance 131 | 132 | def logEI_fn(self, direction, quad_approx): 133 | direction = float(direction) 134 | quad_approx = bool(quad_approx) 135 | try: 136 | self._logEI_cache[(direction, quad_approx)] 137 | except KeyError: 138 | s_thresh = TT.dscalar('thresh') 139 | s_mean_x, s_var_x, s_x = self.kernel.s_mean_var( 140 | self.s_X, 141 | self.s_y, 142 | self.s_var_y, 143 | self.s_emp_var, 144 | self.s_params, 145 | self.s_var_min) 146 | s_logEI = s_normal_logEI( 147 | direction * s_thresh, 148 | direction * (s_mean_x + self.s_emp_mean), 149 | s_var_x, 150 | quad_approx=quad_approx) 151 | self._logEI_cache[(direction, quad_approx)] = theano.function( 152 | [s_x, s_thresh, self.s_params], 153 | s_logEI, 154 | allow_input_downcast=True) 155 | return self._logEI_cache[(direction, quad_approx)] 156 | 157 | def logEI(self, x, thresh, direction=1, quad_approx=False): 158 | logEI_fn = self.logEI_fn(direction, quad_approx) 159 | logEIs = [logEI_fn(x, thresh, p) for p in self._params_list] 160 | weights = self._params_weights 161 | rval = np.dot(weights, logEIs) 162 | return np.atleast_1d(rval) 163 | 164 | 165 | class GPR_ML2(GPR_Base): 166 | """ 167 | Fit by maximum marginal likelihood of kernel hyperparameters 168 | 169 | """ 170 | 171 | def __init__(self, *args, **kwargs): 172 | GPR_Base.__init__(self, *args, **kwargs) 173 | 174 | nll, params, params0, 
bounds, K = self.kernel.s_nll_params( 175 | self.s_X, self.s_y, 176 | params=self.s_params, 177 | var_y=self.s_var_y, 178 | prior_var=self.s_emp_var, ret_K=True) 179 | 180 | cost = nll - self.kernel.s_logprior(params) 181 | assert nll.ndim == 0, nll.type 182 | 183 | self._K = theano.function([params], K) 184 | self._fit_f_df = theano.function([params], 185 | [cost, TT.grad(cost, params)]) 186 | self._params0 = params0 187 | self._bounds = bounds 188 | 189 | def _fit_params0(self): 190 | new_x0 = self._params0 191 | nll_pp = [] 192 | for ii in range(12): 193 | try: 194 | f, df = self._fit_f_df(new_x0) 195 | # -- don't start where the function is too steep 196 | if np.sqrt(np.dot(df, df)) > 10000: 197 | f = np.inf 198 | except np.linalg.LinAlgError: 199 | f = np.inf 200 | # -- ii is in list to break ties, which 201 | # happens if there are multiple infs 202 | nll_pp.append((f, ii, np.array(new_x0))) 203 | new_x0 = self.kernel.reduce_lenscale(new_x0) 204 | 205 | x0 = sorted(nll_pp)[0][2] 206 | if np.isinf(sorted(nll_pp)[0][0]): 207 | raise Exception('fit impossible') 208 | return x0 209 | 210 | def _fit_ml2(self): 211 | x0 = self._fit_params0() 212 | 213 | # -- for some reason, the result object returned by minimize 214 | # seems occasionally to include a parameter vector (pp) 215 | # for which f_df returned np.inf, when there were other non-inf 216 | # evaluations (!?) 217 | # Therefore, this best_f and best_pp mechanism is used. 218 | best_f_pp = [np.inf, None] 219 | 220 | def f_df(pp): 221 | if not np.all(np.isfinite(pp)): 222 | return np.inf, pp 223 | try: 224 | ff, df = self._fit_f_df(pp) 225 | if ff < best_f_pp[0]: 226 | best_f_pp[:] = [ff, pp.copy()] 227 | return ff, df 228 | except np.linalg.LinAlgError: 229 | return np.inf, pp 230 | except ValueError, exc: 231 | if 'NaN' in str(exc): 232 | return np.inf, pp 233 | else: 234 | raise 235 | try: 236 | scipy.optimize.minimize( 237 | fun=f_df, #self._fit_f_df, 238 | x0=x0, 239 | jac=True, # -- means f returns cost and jacobian 240 | method='SLSQP', 241 | #method='L-BFGS-B', 242 | options={} if self.maxiter is None else ( 243 | {'maxiter': self.maxiter,}), 244 | bounds=self._bounds, 245 | ) 246 | except ValueError, e: 247 | if 'NaN' in str(e): 248 | print 'WARNING: GPR.fit caught error', e 249 | print 'WARNING: hopeless fit fail, falling back on params0' 250 | self._params_list = [self._params0] 251 | else: 252 | raise 253 | return best_f_pp 254 | 255 | def fit_ml2(self, X, y, var_y=0, debug=False, ion=False): 256 | """ 257 | Fit GPR kernel parameters by minimizing magininal nll. 258 | 259 | Returns: None 260 | 261 | Side effect: chooses optimal kernel parameters. 262 | """ 263 | self.set_emp_mean(y) 264 | self.set_emp_var(y, var_y) 265 | s_X, s_y = self.set_Xy(X, y) 266 | best_f, best_params = self._fit_ml2() 267 | self._params_list = [best_params] 268 | self._params_weights = [1.0] 269 | return self 270 | 271 | def fit(self, X, y, var_y=0, debug=False, ion=False): 272 | return self.fit_ml2(X, y, var_y, debug, ion) 273 | 274 | 275 | class GPR_HMC(GPR_ML2): 276 | """ 277 | Fit by collecting kernel hyperparameter samples (by HMC). 
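Rather than committing to a single maximum-likelihood setting of the kernel
hyperparameters, this class draws hmc_draws samples by Hamiltonian Monte Carlo,
keeps every hmc_keep_step-th one, and averages the predictive mean, variance
and logEI over the retained samples with equal weights
(via _params_list / _params_weights).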
278 | 279 | """ 280 | def __init__(self, kernel, 281 | maxiter=None, 282 | prior_var=None, 283 | prior_mean=None, 284 | hmc_burn_in=0, # -- keep ML first point 285 | hmc_draws=200, 286 | hmc_keep_step=25): 287 | GPR_ML2.__init__(self, kernel, 288 | maxiter=maxiter, 289 | prior_var=prior_var, 290 | prior_mean=prior_mean) 291 | self.positions = theano.shared(np.zeros((1, self.kernel.n_params)), 292 | name='positions') 293 | 294 | nll, s_params, params0, bounds = self.kernel.s_nll_params( 295 | self.s_X, self.s_y, var_y=self.s_var_y, 296 | params=self.s_params, 297 | prior_var=self.s_emp_var,) 298 | cost = nll - self.kernel.s_logprior(s_params) 299 | self.nll_cost_fn = theano.function([s_params], [nll, cost]) 300 | self._params0 = params0 301 | 302 | def energy_fn(params_matrix): 303 | # PRECONDITOIN: params_matrix has SINGLE ROW 304 | nll, params, params0, bounds = self.kernel.s_nll_params( 305 | self.s_X, self.s_y, var_y=self.s_var_y, 306 | prior_var=self.s_emp_var, 307 | params=params_matrix[0]) 308 | logprior = self.kernel.s_logprior(params_matrix[0]) 309 | energy = nll - logprior 310 | #energy = theano.printing.Print('energy')(energy) 311 | return energy.dimshuffle('x') 312 | 313 | print 'creating HMC sampler' 314 | self.sampler = HMC_sampler.new_from_shared_positions( 315 | self.positions, energy_fn, 316 | s_rng=theano.sandbox.rng_mrg.MRG_RandomStreams(1234), 317 | stepsize_dec=0.95, 318 | stepsize_inc=1.02, 319 | stepsize_min=1.0e-8, 320 | stepsize_max=2.5e-1, 321 | ) 322 | self._stepsize0 = .001 323 | self.hmc_burn_in = hmc_burn_in 324 | self.hmc_draws = hmc_draws 325 | self.hmc_keep_step = hmc_keep_step 326 | 327 | def fit_hmc(self, X, y, var_y=1e-16, debug=False, ion=False, 328 | init_params_method='cycle'): 329 | 330 | self.set_emp_mean(y) 331 | self.set_emp_var(y, var_y) 332 | self.set_Xy(X, y) 333 | 334 | if init_params_method == 'cycle': 335 | init_params_method = ['ml2', 'prior'][len(y) % 2] 336 | if init_params_method == 'ml2': 337 | _, ml_params = self._fit_ml2() 338 | elif init_params_method == 'prior': 339 | ml_params = self._fit_params0() 340 | else: 341 | raise NotImplementedError(init_params_method) 342 | 343 | self.sampler.positions.set_value(np.asarray([ml_params])) 344 | self.sampler.stepsize.set_value(self._stepsize0) 345 | 346 | def get_state(sampler): 347 | return { 348 | 'positions': sampler.positions.get_value(), 349 | 'stepsize': sampler.stepsize.get_value(), 350 | 'avg_acceptance_rate': sampler.avg_acceptance_rate.get_value(), 351 | } 352 | def set_state(sampler, state): 353 | for k, v in state.items(): 354 | getattr(sampler, k).set_value(v) 355 | 356 | def draw(): 357 | state = get_state(self.sampler) 358 | while state['stepsize'] > 1e-12: 359 | try: 360 | set_state(self.sampler, state) 361 | pos = self.sampler.draw() 362 | return pos 363 | except (ValueError, np.linalg.LinAlgError): 364 | print 'shrinking stepsize %f to stabilize sampler' % ( 365 | self.sampler.stepsize.get_value(), 366 | ) 367 | state['positions'][0] = self.kernel.reduce_lenscale( 368 | state['positions'][0]) 369 | state['stepsize'] /= 2.0 370 | raise ValueError('hopeless: Nan or inf in K') 371 | 372 | samples = [] 373 | nlls = [] 374 | costs = [] 375 | t0 = time.time() 376 | for ii in range(self.hmc_burn_in): 377 | pos = draw() 378 | for ii in range(self.hmc_draws): 379 | pos = draw() 380 | samples.append(pos.ravel().copy()) 381 | if 0: 382 | nll_ii, cost_ii = self.nll_cost_fn(pos.flatten()) 383 | print 'current position', pos.flatten(), 384 | print 'accept rate', 
self.sampler.avg_acceptance_rate.get_value(), 385 | print 'nll', nll_ii, 'cost', cost_ii 386 | nlls.append(nll_ii) 387 | costs.append(cost_ii) 388 | print 'HMC took', (time.time() - t0) 389 | samples = np.asarray(samples) 390 | keep = samples[::self.hmc_keep_step] 391 | if keep.size == 0: 392 | raise NotImplementedError() 393 | 394 | if debug: 395 | import matplotlib.pyplot as plt 396 | if ion: 397 | plt.figure(2) 398 | if self.kernel.n_params == 1: 399 | plt.subplot(211) 400 | plt.cla() 401 | plt.hist(np.asarray(samples).flatten()) 402 | plt.title('nlls observed during sampling') 403 | plt.subplot(212) 404 | plt.cla() 405 | plt.scatter(samples, nlls, label='nll', c='b') 406 | plt.scatter(samples, costs, label='cost', c='g') 407 | plt.title('nlls vs. alpha') 408 | plt.legend() 409 | if self.kernel.n_params == 2: 410 | plt.cla() 411 | plt.scatter(samples[:, 0], samples[:, 1]) 412 | plt.scatter(keep[:, 0], keep[:, 1], s=60) 413 | if ion: 414 | plt.draw() 415 | else: 416 | plt.show() 417 | 418 | self._params_list = keep 419 | self._params_weights = np.ones(len(keep)) / len(keep) 420 | 421 | 422 | def fit(self, X, y, var_y=0, debug=False, ion=False): 423 | return self.fit_hmc(X, y, var_y, debug, ion) 424 | -------------------------------------------------------------------------------- /hp_gpsmbo/gpr_math.py: -------------------------------------------------------------------------------- 1 | """ 2 | Formulae for Gaussian Process Regression 3 | 4 | """ 5 | 6 | import numpy as np 7 | import theano.tensor as TT 8 | from theano.sandbox.linalg import cholesky, matrix_inverse, det, psd 9 | from .op_Kcond import normal_logEI_diff_sigma_elemwise 10 | 11 | 12 | def dots(*args): 13 | rval = args[0] 14 | for a in args[1:]: 15 | rval = TT.dot(rval, a) 16 | return rval 17 | 18 | 19 | def s_nll(K, y, var_y, prior_var): 20 | """ 21 | Marginal negative log likelihood of model 22 | 23 | K - gram matrix (matrix-like) 24 | y - the training targets (vector-like) 25 | var_y - the variance of uncertainty about y (vector-like) 26 | 27 | :note: See RW.pdf page 37, Eq. 2.30. 
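In the notation of the code below, with rK = prior_var * K + var_y * I,
the value returned is

    nll = 0.5 * y' inv(rK) y + 0.5 * log(det(rK)) + (n / 2) * log(2 * pi)

(the 'fit', 'complexity' and 'normalization' terms, respectively).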
28 | 29 | """ 30 | 31 | n = y.shape[0] 32 | rK = psd(prior_var * K + var_y * TT.eye(n)) 33 | 34 | fit = .5 * dots(y, matrix_inverse(rK), y) 35 | complexity = 0.5 * TT.log(det(rK)) 36 | normalization = n / 2.0 * TT.log(2 * np.pi) 37 | nll = fit + complexity + normalization 38 | return nll 39 | 40 | 41 | def s_mean(K, y, var_y, prior_var, K_new): 42 | rK = psd(prior_var * K + var_y * TT.eye(y.shape[0])) 43 | alpha = TT.dot(matrix_inverse(rK), y) 44 | y_x = TT.dot(alpha, prior_var * K_new) 45 | return y_x 46 | 47 | 48 | def s_variance(K, y, var_y, prior_var, K_new, var_min): 49 | rK = psd(prior_var * K + var_y * TT.eye(y.shape[0])) 50 | L = cholesky(rK) 51 | v = dots(matrix_inverse(L), prior_var * K_new) 52 | var_x = TT.maximum(prior_var - (v ** 2).sum(axis=0), var_min) 53 | return var_x 54 | 55 | 56 | def s_normal_pdf(x, mean, var): 57 | energy = 0.5 * ((x - mean) ** 2) / var 58 | return TT.exp(-energy) / TT.sqrt(2 * np.pi * var) 59 | 60 | 61 | def s_normal_logpdf(x, mean, var): 62 | energy = 0.5 * ((x - mean) ** 2) / var 63 | return -energy - 0.5 * TT.log(2 * np.pi * var) 64 | 65 | 66 | def s_normal_cdf(x, mean, var): 67 | z = (x - mean) / TT.sqrt(var) 68 | return .5 * TT.erfc(-z / np.sqrt(2)) 69 | 70 | 71 | def s_normal_logcdf(x, mean, var): 72 | z = (x - mean) / TT.sqrt(var) 73 | return TT.log(.5) + TT.log(TT.erfc(-z / np.sqrt(2))) 74 | 75 | 76 | def s_normal_EI(thresh, mean, var): 77 | """analytic expected improvement over (above) threshold 78 | 79 | int_{thresh}^{\infty} (y - thresh) P(y; mean, var) dy 80 | 81 | """ 82 | s_thresh = TT.as_tensor_variable(thresh) 83 | sigma = TT.sqrt(var) 84 | z = (mean - s_thresh) / sigma 85 | # -- the following formula is cuter, but 86 | # Theano doesn't produce as stable a gradient I think? 87 | #return sigma * (z * s_normal_cdf(z, 0, 1) + s_normal_pdf(z, 0, 1)) 88 | a = (mean - s_thresh) * s_normal_cdf(z, 0, 1) 89 | b = sigma * s_normal_pdf(z, 0, 1) 90 | return a + b 91 | 92 | 93 | def s_normal_logEI(thresh, mean, var, quad_approx=False): 94 | """analytic log-expected improvement over (above) threshold 95 | 96 | log(int_{thresh}^{\infty} (y - thresh) P(y; mean, var) dy) 97 | 98 | quad_approx uses a 2nd-order polynomial approximation to the true function 99 | when the threshold is way above the mean (34 standard deviations), where 100 | there's almost no density to integrate. 
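For reference, the quantity whose log is returned is the same integral
computed by s_normal_EI above,

    EI = (mean - thresh) * Phi(z) + sigma * phi(z),    z = (mean - thresh) / sigma

with sigma = sqrt(var); the actual evaluation is delegated to
normal_logEI_diff_sigma_elemwise (imported from .op_Kcond).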
101 | """ 102 | return normal_logEI_diff_sigma_elemwise(thresh - mean, TT.sqrt(var)) 103 | 104 | 105 | def s_normal_EBI(lbound, ubound, mean, var): 106 | """ int_l^u (y - l) P(y; mean, var) 107 | """ 108 | s_l = TT.as_tensor_variable(lbound) 109 | s_u = TT.as_tensor_variable(ubound) 110 | 111 | EI_l = s_normal_EI(s_l, mean, var) 112 | EI_u = s_normal_EI(s_u, mean, var) 113 | 114 | #sigma = TT.maximum(TT.sqrt(var), 1e-15) 115 | sigma = TT.sqrt(var) 116 | term = (s_l - s_u) * s_normal_cdf((mean - s_u) / sigma, 0, 1) 117 | 118 | return EI_l - EI_u + term 119 | 120 | 121 | def s_normal_logEBI(lbound, ubound, mean, var): 122 | return TT.log(s_normal_EBI(lbound, ubound, mean, var)) 123 | 124 | 125 | # -- eof flake8 126 | -------------------------------------------------------------------------------- /hp_gpsmbo/hmc.py: -------------------------------------------------------------------------------- 1 | """ 2 | TODO 3 | """ 4 | 5 | import numpy 6 | 7 | from theano import function, shared 8 | from theano import tensor as TT 9 | import theano 10 | 11 | sharedX = lambda X, name: \ 12 | shared(numpy.asarray(X, dtype=theano.config.floatX), name=name) 13 | 14 | 15 | def kinetic_energy(vel): 16 | """Returns the kinetic energy associated with the given velocity 17 | and mass of 1. 18 | 19 | Parameters 20 | ---------- 21 | vel: theano matrix 22 | Symbolic matrix whose rows are velocity vectors. 23 | 24 | Returns 25 | ------- 26 | return: theano vector 27 | Vector whose i-th entry is the kinetic entry associated with vel[i]. 28 | 29 | """ 30 | return 0.5 * (vel ** 2).sum(axis=1) 31 | 32 | 33 | def hamiltonian(pos, vel, energy_fn): 34 | """ 35 | Returns the Hamiltonian (sum of potential and kinetic energy) for the given 36 | velocity and position. 37 | 38 | Parameters 39 | ---------- 40 | pos: theano matrix 41 | Symbolic matrix whose rows are position vectors. 42 | vel: theano matrix 43 | Symbolic matrix whose rows are velocity vectors. 44 | energy_fn: python function 45 | Python function, operating on symbolic theano variables, used tox 46 | compute the potential energy at a given position. 47 | 48 | Returns 49 | ------- 50 | return: theano vector 51 | Vector whose i-th entry is the Hamiltonian at position pos[i] and 52 | velocity vel[i]. 53 | """ 54 | # assuming mass is 1 55 | return energy_fn(pos) + kinetic_energy(vel) 56 | 57 | 58 | def metropolis_hastings_accept(energy_prev, energy_next, s_rng): 59 | """ 60 | Performs a Metropolis-Hastings accept-reject move. 61 | 62 | Parameters 63 | ---------- 64 | energy_prev: theano vector 65 | Symbolic theano tensor which contains the energy associated with the 66 | configuration at time-step t. 67 | energy_next: theano vector 68 | Symbolic theano tensor which contains the energy associated with the 69 | proposed configuration at time-step t+1. 70 | s_rng: theano.tensor.shared_randomstreams.RandomStreams 71 | Theano shared random stream object used to generate the random number 72 | used in proposal. 73 | 74 | Returns 75 | ------- 76 | return: boolean 77 | True if move is accepted, False otherwise 78 | """ 79 | ediff = energy_prev - energy_next 80 | #ediff = theano.printing.Print('ediff')(ediff) 81 | return (TT.exp(ediff) - s_rng.uniform(size=energy_prev.shape)) >= 0 82 | 83 | 84 | def simulate_dynamics(initial_pos, initial_vel, stepsize, n_steps, energy_fn): 85 | """ 86 | Return final (position, velocity) obtained after an `n_steps` leapfrog 87 | updates, using Hamiltonian dynamics. 
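Each leapfrog step performs the standard update (see the inner leapfrog
function below):

    vel(t + eps/2) = vel(t - eps/2) - eps * dE/dpos(pos(t))
    pos(t + eps)   = pos(t) + eps * vel(t + eps/2)

with an extra half-step for the velocity at the start and at the end, so that
the returned positions and velocities are aligned in time.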
88 | 89 | Parameters 90 | ---------- 91 | initial_pos: shared theano matrix 92 | Initial position at which to start the simulation 93 | initial_vel: shared theano matrix 94 | Initial velocity of particles 95 | stepsize: shared theano scalar 96 | Scalar value controlling amount by which to move 97 | energy_fn: python function 98 | Python function, operating on symbolic theano variables, used to 99 | compute the potential energy at a given position. 100 | 101 | Returns 102 | ------- 103 | rval1: theano matrix 104 | Final positions obtained after simulation 105 | rval2: theano matrix 106 | Final velocity obtained after simulation 107 | """ 108 | 109 | def leapfrog(pos, vel, step): 110 | """ 111 | Inside loop of Scan. Performs one step of leapfrog update, using 112 | Hamiltonian dynamics. 113 | 114 | Parameters 115 | ---------- 116 | pos: theano matrix 117 | in leapfrog update equations, represents pos(t), position at time t 118 | vel: theano matrix 119 | in leapfrog update equations, represents vel(t - stepsize/2), 120 | velocity at time (t - stepsize/2) 121 | step: theano scalar 122 | scalar value controlling amount by which to move 123 | 124 | Returns 125 | ------- 126 | rval1: [theano matrix, theano matrix] 127 | Symbolic theano matrices for new position pos(t + stepsize), and 128 | velocity vel(t + stepsize/2) 129 | rval2: dictionary 130 | Dictionary of updates for the Scan Op 131 | """ 132 | # from pos(t) and vel(t-stepsize/2), compute vel(t+stepsize/2) 133 | dE_dpos = TT.grad(energy_fn(pos).sum(), pos) 134 | new_vel = vel - step * dE_dpos 135 | # from vel(t+stepsize/2) compute pos(t+stepsize) 136 | new_pos = pos + step * new_vel 137 | return [new_pos, new_vel], {} 138 | 139 | # compute velocity at time-step: t + stepsize/2 140 | initial_energy = energy_fn(initial_pos) 141 | dE_dpos = TT.grad(initial_energy.sum(), initial_pos) 142 | 143 | vel_half_step = initial_vel - 0.5 * stepsize * dE_dpos 144 | 145 | # compute position at time-step: t + stepsize 146 | pos_full_step = initial_pos + stepsize * vel_half_step 147 | 148 | # perform leapfrog updates: the scan op is used to repeatedly compute 149 | # vel(t + (m-1/2)*stepsize) and pos(t + m*stepsize) for m in [2,n_steps]. 150 | (all_pos, all_vel), scan_updates = theano.scan(leapfrog, 151 | outputs_info=[ 152 | dict(initial=pos_full_step), 153 | dict(initial=vel_half_step), 154 | ], 155 | non_sequences=[stepsize], 156 | n_steps=n_steps - 1) 157 | final_pos = all_pos[-1] 158 | final_vel = all_vel[-1] 159 | # NOTE: Scan always returns an updates dictionary, in case the 160 | # scanned function draws samples from a RandomStream. These 161 | # updates must then be used when compiling the Theano function, to 162 | # avoid drawing the same random numbers each time the function is 163 | # called. In this case however, we consciously ignore 164 | # "scan_updates" because we know it is empty. 165 | assert not scan_updates 166 | 167 | # The last velocity returned by scan is vel(t + 168 | # (n_steps - 1 / 2) * stepsize) We therefore perform one more half-step 169 | # to return vel(t + n_steps * stepsize) 170 | energy = energy_fn(final_pos) 171 | final_vel = final_vel - 0.5 * stepsize * TT.grad( 172 | energy.sum(), final_pos, 173 | consider_constant=[final_pos, final_vel]) 174 | 175 | # return new proposal state 176 | return final_pos, final_vel 177 | 178 | 179 | def hmc_move(s_rng, positions, energy_fn, stepsize, n_steps): 180 | """ 181 | This function performs one-step of Hybrid Monte-Carlo sampling. 
We start by 182 | sampling a random velocity from a univariate Gaussian distribution, perform 183 | `n_steps` leap-frog updates using Hamiltonian dynamics and accept-reject 184 | using Metropolis-Hastings. 185 | 186 | Parameters 187 | ---------- 188 | s_rng: theano shared random stream 189 | Symbolic random number generator used to draw random velocity and 190 | perform accept-reject move. 191 | positions: shared theano matrix 192 | Symbolic matrix whose rows are position vectors. 193 | energy_fn: python function 194 | Python function, operating on symbolic theano variables, used to 195 | compute the potential energy at a given position. 196 | stepsize: shared theano scalar 197 | Shared variable containing the stepsize to use for `n_steps` of HMC 198 | simulation steps. 199 | n_steps: integer 200 | Number of HMC steps to perform before proposing a new position. 201 | 202 | Returns 203 | ------- 204 | rval1: boolean 205 | True if move is accepted, False otherwise 206 | rval2: theano matrix 207 | Matrix whose rows contain the proposed "new position" 208 | """ 209 | 210 | # sample random velocity 211 | initial_vel = s_rng.normal(size=positions.shape) 212 | 213 | # perform simulation of particles subject to Hamiltonian dynamics 214 | final_pos, final_vel = simulate_dynamics( 215 | initial_pos=positions, 216 | initial_vel=initial_vel, 217 | stepsize=stepsize, 218 | n_steps=n_steps, 219 | energy_fn=energy_fn) 220 | 221 | # accept/reject the proposed move based on the joint distribution 222 | accept = metropolis_hastings_accept( 223 | energy_prev=hamiltonian(positions, initial_vel, energy_fn), 224 | energy_next=hamiltonian(final_pos, final_vel, energy_fn), 225 | s_rng=s_rng) 226 | 227 | return accept, final_pos 228 | 229 | 230 | def hmc_updates(positions, stepsize, avg_acceptance_rate, final_pos, accept, 231 | target_acceptance_rate, stepsize_inc, stepsize_dec, 232 | stepsize_min, stepsize_max, avg_acceptance_slowness): 233 | """This function is executed after `n_steps` of HMC sampling 234 | (`hmc_move` function). It creates the updates dictionary used by 235 | the `simulate` function. It takes care of updating: the position 236 | (if the move is accepted), the stepsize (to track a given target 237 | acceptance rate) and the average acceptance rate (computed as a 238 | moving average). 239 | 240 | Parameters 241 | ---------- 242 | positions: shared variable, theano matrix 243 | Shared theano matrix whose rows contain the old position 244 | stepsize: shared variable, theano scalar 245 | Shared theano scalar containing current step size 246 | avg_acceptance_rate: shared variable, theano scalar 247 | Shared theano scalar containing the current average acceptance rate 248 | final_pos: shared variable, theano matrix 249 | Shared theano matrix whose rows contain the new position 250 | accept: theano scalar 251 | Boolean-type variable representing whether or not the proposed HMC move 252 | should be accepted or not. 253 | target_acceptance_rate: float 254 | The stepsize is modified in order to track this target acceptance rate. 255 | stepsize_inc: float 256 | Amount by which to increment stepsize when acceptance rate is too high. 257 | stepsize_dec: float 258 | Amount by which to decrement stepsize when acceptance rate is too low. 259 | stepsize_min: float 260 | Lower-bound on `stepsize`. 261 | stepsize_min: float 262 | Upper-bound on `stepsize`. 263 | avg_acceptance_slowness: float 264 | Average acceptance rate is computed as an exponential moving average. 
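A sketch of that update, matching the code below:

    new_rate = avg_acceptance_slowness * avg_acceptance_rate
               + (1 - avg_acceptance_slowness) * accept.mean()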
265 | (1-avg_acceptance_slowness) is the weight given to the newest 266 | observation. 267 | 268 | Returns 269 | ------- 270 | rval1: dictionary-like 271 | A dictionary of updates to be used by the `HMC_Sampler.simulate` 272 | function. The updates target the position, stepsize and average 273 | acceptance rate. 274 | 275 | """ 276 | 277 | ## POSITION UPDATES ## 278 | # broadcast `accept` scalar to tensor with the same dimensions as 279 | # final_pos. 280 | accept_matrix = accept.dimshuffle(0, *(('x',) * (final_pos.ndim - 1))) 281 | # if accept is True, update to `final_pos` else stay put 282 | new_positions = TT.switch(accept_matrix, final_pos, positions) 283 | 284 | ## STEPSIZE UPDATES ## 285 | # if acceptance rate is too low, our sampler is too "noisy" and we reduce 286 | # the stepsize. If it is too high, our sampler is too conservative, we can 287 | # get away with a larger stepsize (resulting in better mixing). 288 | _new_stepsize = TT.switch(avg_acceptance_rate > target_acceptance_rate, 289 | stepsize * stepsize_inc, stepsize * stepsize_dec) 290 | # maintain stepsize in [stepsize_min, stepsize_max] 291 | new_stepsize = TT.clip(_new_stepsize, stepsize_min, stepsize_max) 292 | 293 | ## ACCEPT RATE UPDATES ## 294 | # perform exponential moving average 295 | mean_dtype = theano.scalar.upcast(accept.dtype, avg_acceptance_rate.dtype) 296 | new_acceptance_rate = TT.add( 297 | avg_acceptance_slowness * avg_acceptance_rate, 298 | (1.0 - avg_acceptance_slowness) * accept.mean(dtype=mean_dtype)) 299 | 300 | return [(positions, new_positions), 301 | (stepsize, new_stepsize), 302 | (avg_acceptance_rate, new_acceptance_rate)] 303 | 304 | 305 | class HMC_sampler(object): 306 | """ 307 | Convenience wrapper for performing Hybrid Monte Carlo (HMC). It creates the 308 | symbolic graph for performing an HMC simulation (using `hmc_move` and 309 | `hmc_updates`). The graph is then compiled into the `simulate` function, a 310 | theano function which runs the simulation and updates the required shared 311 | variables. 312 | 313 | Users should interface with the sampler thorugh the `draw` function which 314 | advances the markov chain and returns the current sample by calling 315 | `simulate` and `get_position` in sequence. 316 | 317 | The hyper-parameters are the same as those used by Marc'Aurelio's 318 | 'train_mcRBM.py' file (available on his personal home page). 319 | """ 320 | 321 | def __init__(self, **kwargs): 322 | self.__dict__.update(kwargs) 323 | 324 | @classmethod 325 | def new_from_shared_positions(cls, shared_positions, energy_fn, 326 | initial_stepsize=0.01, target_acceptance_rate=.9, n_steps=20, 327 | stepsize_dec=0.98, 328 | stepsize_min=0.001, 329 | stepsize_max=0.25, 330 | stepsize_inc=1.02, 331 | # used in geometric avg. 1.0 would be not moving at all 332 | avg_acceptance_slowness=0.9, 333 | s_rng=None): 334 | """ 335 | :param shared_positions: theano ndarray shared var with 336 | many particle [initial] positions 337 | 338 | :param energy_fn: 339 | callable such that energy_fn(positions) 340 | returns theano vector of energies. 341 | The len of this vector is the batch size. 342 | 343 | The sum of this energy vector must be differentiable (with 344 | theano.tensor.grad) with respect to the positions for HMC 345 | sampling to work. 
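A minimal usage sketch (n_dim and energy_fn here are placeholders):

    positions = sharedX(numpy.zeros((1, n_dim)), 'positions')
    sampler = HMC_sampler.new_from_shared_positions(positions, energy_fn)
    new_pos = sampler.draw()  # numpy array, one row per particle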
346 | 347 | """ 348 | 349 | # allocate shared variables 350 | stepsize = sharedX(initial_stepsize, 'hmc_stepsize') 351 | avg_acceptance_rate = sharedX(target_acceptance_rate, 352 | 'avg_acceptance_rate') 353 | if s_rng is None: 354 | s_rng = TT.shared_randomstreams.RandomStreams(12345) 355 | 356 | # define graph for an `n_steps` HMC simulation 357 | accept, final_pos = hmc_move( 358 | s_rng, 359 | shared_positions, 360 | energy_fn, 361 | stepsize, 362 | n_steps) 363 | 364 | # define the dictionary of updates, to apply on every `simulate` call 365 | simulate_updates = hmc_updates( 366 | shared_positions, 367 | stepsize, 368 | avg_acceptance_rate, 369 | final_pos=final_pos, 370 | accept=accept, 371 | stepsize_min=stepsize_min, 372 | stepsize_max=stepsize_max, 373 | stepsize_inc=stepsize_inc, 374 | stepsize_dec=stepsize_dec, 375 | target_acceptance_rate=target_acceptance_rate, 376 | avg_acceptance_slowness=avg_acceptance_slowness) 377 | 378 | # compile theano function 379 | simulate = function([], [], updates=simulate_updates) 380 | 381 | # create HMC_sampler object with the following attributes ... 382 | return cls( 383 | positions=shared_positions, 384 | stepsize=stepsize, 385 | stepsize_min=stepsize_min, 386 | stepsize_max=stepsize_max, 387 | avg_acceptance_rate=avg_acceptance_rate, 388 | target_acceptance_rate=target_acceptance_rate, 389 | s_rng=s_rng, 390 | _updates=simulate_updates, 391 | simulate=simulate) 392 | 393 | def draw(self, **kwargs): 394 | """ 395 | Returns a new position obtained after `n_steps` of HMC simulation. 396 | 397 | Parameters 398 | ---------- 399 | kwargs: dictionary 400 | The `kwargs` dictionary is passed to the shared variable 401 | (self.positions) `get_value()` function. For example, to avoid 402 | copying the shared variable value, consider passing `borrow=True`. 403 | 404 | Returns 405 | ------- 406 | rval: numpy matrix 407 | Numpy matrix whose of dimensions similar to `initial_position`. 408 | """ 409 | self.simulate() 410 | if self.stepsize.get_value() < 1.5 * self.stepsize_min: 411 | print 'WARN: HMC stepsize %f close to minimum of %f (acceptance %f)' %( 412 | self.stepsize.get_value(), self.stepsize_min, self.avg_acceptance_rate.get_value()) 413 | if self.stepsize.get_value() > .9 * self.stepsize_max: 414 | print 'WARN: HMC stepsize %f close to maximum of %f (acceptance %f)' %( 415 | self.stepsize.get_value(), self.stepsize_max, self.avg_acceptance_rate.get_value()) 416 | return self.positions.get_value(borrow=False) 417 | -------------------------------------------------------------------------------- /hp_gpsmbo/hpsuggest.py: -------------------------------------------------------------------------------- 1 | from itertools import groupby 2 | import numpy as np 3 | import scipy.optimize 4 | 5 | from hyperopt.pyll_utils import expr_to_config 6 | from hyperopt import pyll, STATUS_OK 7 | from hyperopt.algobase import SuggestAlgo 8 | 9 | from . 
import kernels 10 | 11 | 12 | def loss_variances(trials): 13 | return [r.get('loss_variance', 0) 14 | for r in trials.results if r['status'] == STATUS_OK] 15 | 16 | 17 | class SuggestBest(SuggestAlgo): 18 | def __init__(self, domain, trials, seed, best_pt): 19 | SuggestAlgo.__init__(self, domain, trials, seed) 20 | self.best_pt = best_pt 21 | 22 | def on_node_hyperparameter(self, memo, node, label): 23 | if label in self.best_pt: 24 | rval = [self.best_pt[label]] 25 | else: 26 | rval = [] 27 | return rval 28 | 29 | 30 | class ParamHelper(object): 31 | def __init__(self, config): 32 | self.config = config 33 | 34 | def __call__(self, name): 35 | node = self.config[name]['node'] 36 | conditional = self.config[name]['conditions'] != set([()]) 37 | rval = getattr(self, node.name)(node, conditional) 38 | return rval 39 | 40 | def randint(self, node, conditional): 41 | upper = int(node.arg['upper'].obj) 42 | def val_fn(feat): 43 | rval = np.asarray(feat).astype('int') 44 | if not np.allclose(rval, feat): 45 | print 'WARNING: optimizer gave randint val_fn a float' 46 | return rval 47 | 48 | if upper == 2: 49 | return { 50 | 'feature_bounds': (0, 1), 51 | 'kernel': kernels.Choice2(0.7, 1e-2, 2.0, conditional), 52 | 'ndim': 1, 53 | 'continuous': False, 54 | 'ordinal': False, 55 | 'feature_fn': np.asarray, 56 | 'val_fn': val_fn, 57 | } 58 | else: 59 | return { 60 | 'feature_bounds': (0, upper), 61 | 'kernel': kernels.ChoiceN(upper, conditional), 62 | 'ndim': 1, 63 | 'continuous': False, 64 | 'ordinal': False, 65 | 'feature_fn': np.asarray, 66 | 'val_fn': val_fn, 67 | } 68 | 69 | def categorical(self, node, conditional): 70 | # TODO: bias the choice somehow? 71 | return self.randint(node, conditional) 72 | 73 | def uniform(self, node, conditional, continuous=True, q=None): 74 | low = float(node.arg['low'].obj) 75 | high = float(node.arg['high'].obj) 76 | def val_fn(feat): 77 | rval = feat * (high - low) + low 78 | if q is not None: 79 | rval = np.round(rval / q) * q 80 | return rval 81 | return { 82 | 'feature_bounds': (0, 1), 83 | 'kernel': kernels.SqExp(0.7, 1e-6, 1.5, conditional), 84 | 'ndim': 1, 85 | 'continuous': continuous, 86 | 'ordinal': q is not None, 87 | 'feature_fn': (lambda val: (np.asarray(val) - low) / (high - low)), 88 | 'val_fn': val_fn, 89 | } 90 | 91 | def quniform(self, node, conditional): 92 | q = float(node.arg['q'].obj) 93 | return self.uniform(node, conditional, continuous=False, q=q) 94 | 95 | def loguniform(self, node, conditional, continuous=True, q=None): 96 | # -- log-scaling has been handled by feature code 97 | #val = np.exp(featureval) - self.logquantized_feature_epsilon 98 | low = float(node.arg['low'].obj) 99 | high = float(node.arg['high'].obj) 100 | def val_fn(feat): 101 | rval = np.exp(feat * (high - low) + low) 102 | if q is not None: 103 | rval = np.round(rval / q) * q 104 | return rval 105 | return { 106 | 'feature_bounds': (0, 1), 107 | 'kernel': kernels.SqExp(0.7, 1e-6, 1.5, conditional), 108 | 'ndim': 1, 109 | 'continuous': continuous, 110 | 'ordinal': q is not None, 111 | 'feature_fn': (lambda val: (np.log(val) - low) / (high - low)), 112 | 'val_fn': val_fn, 113 | } 114 | 115 | def qloguniform(self, node, conditional): 116 | q = float(node.arg['q'].obj) 117 | return self.loguniform(node, conditional, continuous=False, q=q) 118 | 119 | def normal(self, node, conditional, continuous=True, q=None): 120 | sigma = float(node.arg['sigma'].obj) 121 | mu = float(node.arg['mu'].obj) 122 | def val_fn(feat): 123 | rval = feat * sigma + mu 124 | if q is not None: 125 
| rval = np.round(rval / q) * q 126 | return rval 127 | return { 128 | 'feature_bounds': (-10, 10), 129 | 'kernel': kernels.SqExp(0.7, 1e-6, 1.5, conditional), 130 | 'ndim': 1, 131 | 'continuous': continuous, 132 | 'ordinal': q is not None, 133 | 'feature_fn': (lambda val: (np.asarray(val) - mu) / sigma), 134 | 'val_fn': val_fn, 135 | } 136 | 137 | def qnormal(self, node, conditional): 138 | q = float(node.arg['q'].obj) 139 | return self.normal(node, conditional, continuous=False, q=q) 140 | 141 | def lognormal(self, node, conditional, continuous=True, q=None): 142 | sigma = float(node.arg['sigma'].obj) 143 | mu = float(node.arg['mu'].obj) 144 | def val_fn(feat): 145 | rval = np.exp(feat * sigma + mu) 146 | if q is not None: 147 | rval = np.round(rval / q) * q 148 | return rval 149 | return { 150 | 'feature_bounds': (-10, 10), 151 | 'kernel': kernels.SqExp(0.7, 1e-6, 1.5, conditional), 152 | 'ndim': 1, 153 | 'continuous': continuous, 154 | 'ordinal': q is not None, 155 | 'feature_fn': (lambda val: (np.log(val) - mu) / sigma), 156 | 'val_fn': val_fn, 157 | } 158 | 159 | def qlognormal(self, node, conditional): 160 | q = float(node.arg['q'].obj) 161 | return self.normal(node, conditional, continuous=False, q=q) 162 | 163 | 164 | class DomainGP(object): 165 | logquantized_feature_epsilon = 1e-3 166 | 167 | def __init__(self, domain, GPR=None): 168 | self.domain = domain 169 | 170 | # -- hps: list of hyperparameter names 171 | self.hps = list(sorted(domain.params.keys())) 172 | 173 | # -- config: type and dependency information keyed by hp name 174 | self.config = {} 175 | expr_to_config(domain.expr, None, self.config) 176 | 177 | if GPR is None: 178 | GPR = self.GPR # -- class variable 179 | 180 | kerns, self.hp_slices, self.x_bounds = self.init_param_helpers() 181 | self.gpr = GPR(kernels.product(kerns, self.hp_slices)) 182 | #kern = self.compress_product(kerns, slices) 183 | #self.gpr = GPR(kern) 184 | 185 | def init_param_helpers(self): 186 | # -- called early in constructor before most attributes have been set 187 | kerns = [] 188 | slices = [] 189 | x_bounds = [] 190 | param_helper = ParamHelper(self.config) 191 | self.param_helpers = {} 192 | ndim_offset = 0 193 | for hpname in self.hps: 194 | ph = self.param_helpers[hpname] = param_helper(hpname) 195 | 196 | kerns.append(ph['kernel']) 197 | 198 | # slices are for index into featurevec 199 | ph['feature_slice'] = slice(ndim_offset, ndim_offset + ph['ndim']) 200 | slices.append(ph['feature_slice']) 201 | ndim_offset += ph['ndim'] 202 | 203 | x_bounds.append(ph['feature_bounds']) 204 | 205 | return kerns, slices, np.asarray(x_bounds) 206 | 207 | def draw_n_feature_vecs(self, N, rng): 208 | fake_ids = range(N) 209 | idxs, vals = pyll.rec_eval( 210 | self.domain.s_idxs_vals, 211 | memo={ 212 | self.domain.s_new_ids: fake_ids, 213 | self.domain.s_rng: rng, 214 | }) 215 | return self.features_from_idxs_vals(fake_ids, idxs, vals) 216 | 217 | def features_from_idxs_vals(self, ids, idxs, vals): 218 | columns = [] 219 | if not np.allclose(ids, np.arange(len(ids))): 220 | # -- indexing below is a little more complicated, due 221 | # to another step of indirection 222 | raise NotImplementedError('non-contiguous target ids') 223 | for hpname in self.hps: 224 | cX = self.param_helpers[hpname]['feature_fn'](vals[hpname]) 225 | if cX.ndim < 2: 226 | cX.shape = (len(cX), 1) 227 | assert cX.ndim == 2 228 | assert cX.shape[1] == self.param_helpers[hpname]['ndim'] 229 | cc = np.empty((len(ids), cX.shape[1])) + np.nan 230 | cc[idxs[hpname]] = cX 231 | 
columns.append(cc) 232 | return np.hstack(columns) 233 | 234 | def best_pt_from_featurevec(self, featurevec): 235 | best_pt = {} 236 | for hpname in self.hps: 237 | ph = self.param_helpers[hpname] 238 | feat = featurevec[ph['feature_slice']] 239 | if not np.isnan(np.sum(feat)): 240 | assert len(feat) == 1 241 | best_pt[hpname] = ph['val_fn'](feat[0]) 242 | return best_pt 243 | 244 | def _X_y_var_y(self, trials, failure_loss=None): 245 | all_tids = trials.tids 246 | all_idxs, all_vals = trials.idxs_vals 247 | X = self.features_from_idxs_vals(all_tids, all_idxs, all_vals) 248 | def loss(tr): 249 | if tr['result']['status'] == 'ok': 250 | return ( 251 | float(tr['result']['loss']), 252 | float(tr['result'].get('loss_variance', 0))) 253 | else: # TODO in-fill prediction for in-prog jobs? 254 | return float(failure_loss), 0 255 | y, var_y = zip(*map(loss, trials.trials)) 256 | #y = trials.losses() 257 | #var_y = loss_variances(trials) 258 | assert len(y) == len(X) == len(var_y) 259 | return X, y, var_y 260 | 261 | def fit_gpr(self, X, y, var_y, method='ml2'): 262 | assert X.shape[1] == len(self.hps) 263 | if method == 'ml2': 264 | self.gpr.fit_ml2(X, y, var_y=var_y) 265 | elif method == 'hmc': 266 | self.gpr.fit_hmc(X, y, var_y=var_y) 267 | else: 268 | raise NotImplementedError(method) 269 | 270 | def optimize_over_X_finetune(self, vec): 271 | vec_is_nan = np.isnan(vec) 272 | 273 | vec0 = vec.copy() 274 | vec0[vec_is_nan] = 0 275 | 276 | to_opt = np.ones_like(vec) 277 | to_opt[vec_is_nan] = 0 278 | for kslice, hpname in zip(self.hp_slices, self.hps): 279 | ph = self.param_helpers[hpname] 280 | if not (ph['continuous'] or ph['ordinal']): 281 | to_opt[kslice] = 0 282 | q_filter = np.ones_like(vec) 283 | 284 | def f_df(_x): 285 | x = np.clip(_x, self.x_bounds[:, 0], self.x_bounds[:, 1]) 286 | if not np.allclose(x, _x): 287 | print 'x clipped', abs(x - _x) 288 | x[vec_is_nan] = np.nan 289 | f, df = self.crit_deriv(np.atleast_2d(x)) 290 | assert len(f) == len(df) == 1 291 | f = f[0] 292 | df = df[0] 293 | assert len(self.hps) == len(self.hp_slices) 294 | #print 'OPTIMIZE_IN_X: f_df', f, df 295 | 296 | # -- don't fine-tune the discrete variables 297 | # TODO: don't even compute the gradient in the first place 298 | #for ii, (kslice, hpname) in enumerate(zip(self.hp_slices, self.hps)): 299 | # ph = self.param_helpers[hpname] 300 | # print ' %40s\t%.3f\t%20s\t%.3f\t%8s\t%8s' % ( 301 | # hpname, _x[ii], kslice, df[ii], ph['continuous'], ph['q']) 302 | 303 | assert np.all(np.isfinite(df)) 304 | mask = to_opt * q_filter 305 | df[mask == 0] = 0 306 | assert np.all(np.isfinite(df)) 307 | assert np.all(np.isfinite(f)) 308 | return f, df 309 | 310 | #print 'OPTIMIZE_IN_X start', vec0 311 | print 'Info: optimizing', (to_opt * q_filter).sum(), 'vars' 312 | res = scipy.optimize.minimize( 313 | fun=f_df, 314 | x0=vec0, 315 | jac=True, # -- means f returns cost and jacobian 316 | method='L-BFGS-B', 317 | #method='SLSQP', 318 | tol=1e-10, # XXX delete this after validating file 319 | #options={} if self.maxiter is None else ( 320 | #{'maxiter': self.maxiter,}), 321 | bounds=self.x_bounds, 322 | ) 323 | #print 'OPTIMIZE_IN_X done', res 324 | res.x = np.clip(res.x, self.x_bounds[:, 0], self.x_bounds[:, 1]) 325 | assert np.all(np.isfinite(res.x)) 326 | 327 | for kslice, hpname in zip(self.hp_slices, self.hps): 328 | ph = self.param_helpers[hpname] 329 | if ph['ordinal']: 330 | # -- round quantized variables to nearest valid value 331 | res.x[kslice] = ph['feature_fn'](ph['val_fn'](res.x[kslice])) 332 | # -- mask out 
derivatives from here on 333 | q_filter[kslice] = 0 334 | 335 | # -- maybe reoptimize with quantized variables frozen 336 | if (to_opt * q_filter).sum(): 337 | print 'Info: reoptimizing', (to_opt * q_filter).sum(), 'vars' 338 | res2 = scipy.optimize.minimize( 339 | fun=f_df, 340 | x0=res.x, 341 | jac=True, # -- means f returns cost and jacobian 342 | method='L-BFGS-B', 343 | #method='SLSQP', 344 | tol=1e-10, # XXX delete this after validating file 345 | #options={} if self.maxiter is None else ( 346 | #{'maxiter': self.maxiter,}), 347 | bounds=self.x_bounds, 348 | ) 349 | else: 350 | print 'Info: skipping reoptimization step' 351 | res2 = res 352 | assert np.all(np.isfinite(res2.x)) 353 | #print 'OPTIMIZE_IN_X done', res 354 | res2.x = np.clip(res2.x, self.x_bounds[:, 0], self.x_bounds[:, 1]) 355 | res2.x[vec_is_nan] = np.nan 356 | return res2 357 | 358 | def optimize_over_X(self, n_buckshots, n_finetunes, rng, ret_raw=False, 359 | ret_results=False): 360 | # -- sample a bunch of points 361 | buckshot = self.draw_n_feature_vecs(n_buckshots, rng) 362 | buckshot_crit = self.crit(buckshot) 363 | best_first = np.argsort(buckshot_crit) 364 | #print 'buckshot stats', buckshot_crit.min(), buckshot_crit.max() 365 | 366 | # -- finetune a few of the best by gradient descent 367 | results = [ 368 | (buckshot_crit[best_first[0]], 369 | -1, 370 | buckshot[best_first[0]].copy(), 371 | buckshot_crit[best_first[0]], 372 | )] 373 | if self._cost_deriv is not None: 374 | misc_step = int(n_buckshots / (.5 * n_finetunes)) 375 | misc = best_first[n_finetunes::misc_step] 376 | top_best = best_first[:n_finetunes - len(misc)] 377 | to_finetune = list(misc) + list(top_best) 378 | assert len(to_finetune) <= n_finetunes 379 | for ii in range(n_finetunes): 380 | vec = buckshot[to_finetune[ii]] 381 | res = self.optimize_over_X_finetune(vec) 382 | results.append((res.fun, ii, res.x.copy(), 383 | buckshot_crit[to_finetune[ii]])) 384 | results.sort() 385 | if results[0][1] == -1: 386 | print 'Warning: finetuning did no good' 387 | print 'optimize_X', results[0] 388 | if ret_results: 389 | return results 390 | if ret_raw: 391 | return results[0][2] 392 | else: 393 | # -- return the best one 394 | best_pt = self.best_pt_from_featurevec(results[0][2]) 395 | return best_pt 396 | 397 | # -- flake-8 abhors blank line EOF 398 | -------------------------------------------------------------------------------- /hp_gpsmbo/hpsuggest_ei.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import theano.tensor 4 | 5 | from hyperopt import rand 6 | 7 | from .hpsuggest import SuggestBest, DomainGP 8 | from . import gpr_math 9 | from . 
import op_Kcond 10 | from .gpr import GPR_HMC 11 | 12 | class DomainGP_EI(DomainGP): 13 | _EI_thresh_increment = 0.1 14 | _min_thresh_inc = 0 15 | GPR = GPR_HMC 16 | 17 | def init_fns(self): 18 | try: 19 | self._cost_deriv 20 | except AttributeError: 21 | s_thresh = theano.tensor.dscalar('thresh') 22 | s_reuse_cholesky = theano.tensor.iscalar('reuse_cholesky') 23 | s_reuse_cholesky_idx = theano.tensor.iscalar('reuse_cholesky_idx') 24 | 25 | s_mean_x, s_var_x, s_x, K_new = self.gpr.kernel.s_mean_var( 26 | self.gpr.s_X, 27 | self.gpr.s_y, 28 | self.gpr.s_var_y, 29 | self.gpr.s_emp_var, 30 | self.gpr.s_params, 31 | self.gpr.s_var_min, 32 | return_K_new=True) 33 | s_logEI = gpr_math.s_normal_logEI( 34 | -s_thresh, 35 | -(s_mean_x + self.gpr.s_emp_mean), 36 | s_var_x, 37 | quad_approx=True) 38 | cost = -s_logEI 39 | 40 | assert cost.ndim == 1 41 | s_gx = theano.tensor.grad(cost.sum(), s_x) 42 | 43 | # -- this hack makes it so that the s_reuse_cholesky 44 | # variable is patched in to the graph during optimization 45 | # and allows to disable the computation of training 46 | # K matrix and it's cholesky factorization 47 | op_Kcond.use_lazy_cholesky = s_reuse_cholesky 48 | op_Kcond.use_lazy_cholesky_idx = s_reuse_cholesky_idx 49 | self._cost_deriv = theano.function( 50 | [s_x, s_thresh, self.gpr.s_params, 51 | s_reuse_cholesky, s_reuse_cholesky_idx], 52 | [cost, s_gx], 53 | on_unused_input='ignore', 54 | allow_input_downcast=True, 55 | profile=0) 56 | op_Kcond.use_lazy_cholesky = None 57 | op_Kcond.use_lazy_cholesky_idx = None 58 | 59 | if 1: 60 | # /begin hack sanity checking 61 | #import pdb; pdb.set_trace() 62 | n_cholesky = 0 63 | n_lazy_cholesky = 0 64 | for node in self._cost_deriv.maker.fgraph.toposort(): 65 | #print node 66 | if isinstance(node.op, 67 | theano.sandbox.linalg.ops.Solve): 68 | assert node.op.A_structure != 'general' 69 | if isinstance(node.op, 70 | theano.sandbox.linalg.ops.Cholesky): 71 | n_cholesky += 1 72 | if isinstance(node.op, op_Kcond.LazyCholesky): 73 | n_lazy_cholesky += 1 74 | assert n_cholesky == 0 75 | assert n_lazy_cholesky == 1 76 | # /end hack sanity checking 77 | 78 | self._cost = theano.function( 79 | [s_x, s_thresh, self.gpr.s_params], 80 | cost, 81 | allow_input_downcast=True) 82 | self._K_new = theano.function( 83 | [s_x, self.gpr.s_params], K_new) 84 | return self._cost_deriv 85 | 86 | def set_thresholds(self, y, var_y, z=1.0, max_ei_thresh=None): 87 | yy = y - z * np.sqrt(np.maximum(var_y, 88 | max( 89 | self.gpr.s_var_min.eval(), 90 | self._min_thresh_inc ** 2))) 91 | if max_ei_thresh is not None: 92 | self._EI_thresh = min(np.min(yy), max_ei_thresh) 93 | else: 94 | self._EI_thresh = np.min(yy) 95 | 96 | def crit(self, X): 97 | self.init_fns() 98 | #return -self.gpr.logEI(X, 99 | #self._EI_thresh, 100 | #direction=-1, # below thresh 101 | #quad_approx=True) 102 | gpr = self.gpr 103 | fs = [] 104 | for pp in gpr._params_list: 105 | f = self._cost(np.atleast_2d(X), 106 | self._EI_thresh, 107 | pp) 108 | fs.append(f) 109 | mean_f = np.dot(gpr._params_weights, fs) 110 | return mean_f 111 | 112 | def crit_deriv(self, X): 113 | self.init_fns() 114 | gpr = self.gpr 115 | fs = [] 116 | dfs = [] 117 | for ii, pp in enumerate(gpr._params_list): 118 | #print 'pp', pp, 'x', X 119 | f, df = self._cost_deriv(np.atleast_2d(X), 120 | self._EI_thresh, 121 | pp, 122 | self._cost_deriv_reuse_cholesky, 123 | ii) 124 | assert f.shape == (1,), (f.shape, X.shape) 125 | fs.append(f[0]) 126 | dfs.append(df.flatten()) 127 | self._cost_deriv_reuse_cholesky = 1 128 | mean_f = 
np.dot(gpr._params_weights, fs) 129 | #import pdb; pdb.set_trace() 130 | mean_df = np.dot(gpr._params_weights, np.asarray(dfs)) 131 | return [mean_f], [mean_df] 132 | 133 | def optimize_over_X(self, n_buckshots, n_finetunes, rng): 134 | while True: 135 | rval_raw = DomainGP.optimize_over_X(self, 136 | n_buckshots, 137 | n_finetunes, 138 | rng, 139 | ret_raw=True) 140 | if len(self.gpr._params_list) == 1: 141 | Ks = self._K_new(np.atleast_2d(rval_raw), 142 | self.gpr._params_list[0]) 143 | if (Ks.max() > (1 - 1e-6)): 144 | # -- promote exploration with a more aggressive threshold 145 | self._EI_thresh -= self._EI_thresh_increment 146 | print 'lowering EI thresh to', self._EI_thresh 147 | else: 148 | break 149 | else: 150 | break 151 | best_pt = self.best_pt_from_featurevec(rval_raw) 152 | return best_pt 153 | 154 | 155 | _suggest_domain_cache = {} 156 | def suggest(new_ids, domain, trials, seed, 157 | warmup_cutoff=15, # -- enough for mean & var stats 158 | n_buckshots=10000, 159 | n_finetunes=50, 160 | stop_at=None, 161 | plot_contours=None, 162 | gp_fit_method='ml2', 163 | failure_loss=None, 164 | max_ei_thresh=None, 165 | ): 166 | """ 167 | Parameters 168 | ---------- 169 | 170 | """ 171 | # XXX would like to cache on domain, but 172 | # fmin(fn, space) always rebuilds a new domain for given fn and space 173 | key = domain.expr 174 | try: 175 | dgp = _suggest_domain_cache[key] 176 | except KeyError: 177 | print 'CREATING GP_EI for', domain 178 | dgp = _suggest_domain_cache[key] = DomainGP_EI(domain) 179 | if len(trials.trials): 180 | X, y, var_y = dgp._X_y_var_y(trials, failure_loss=failure_loss) 181 | 182 | if len(trials.trials) <= warmup_cutoff: 183 | if len(trials.trials): 184 | dgp.gpr.prior_mean = np.mean(y) 185 | dgp.gpr.prior_var = np.var(y) 186 | return rand.suggest(new_ids, domain, trials, seed) 187 | 188 | if stop_at is not None and min(trials.losses()) < stop_at: 189 | return [] 190 | 191 | dgp.fit_gpr(X, y, var_y, method=gp_fit_method) 192 | dgp.set_thresholds(y, var_y, max_ei_thresh=max_ei_thresh) 193 | dgp._cost_deriv_reuse_cholesky = 0 194 | 195 | print 'EI: Best after %i trials: %f' % ( len(y), np.min(y)) 196 | #dgp.gpr._params_list[0][:] = 0 197 | rng = np.random.RandomState(seed) 198 | t0 = time.time() 199 | best_pt = dgp.optimize_over_X( 200 | n_buckshots=n_buckshots, 201 | n_finetunes=n_finetunes, 202 | rng=rng, 203 | ) 204 | t1 = time.time() 205 | print 'optimizing surrogate took', (t1 - t0) 206 | if plot_contours: 207 | plot_contours(dgp, 2, dgp._lbound, best_pt) 208 | new_id, = new_ids 209 | #print 'REI: Best pt', best_pt 210 | return SuggestBest(domain, trials, seed, best_pt)(new_id) 211 | 212 | # --eof 213 | -------------------------------------------------------------------------------- /hp_gpsmbo/hpsuggest_lucb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyperopt/hyperopt-gpsmbo/8009f82a18620b33faecca2382f973bc214bd88c/hp_gpsmbo/hpsuggest_lucb.py -------------------------------------------------------------------------------- /hp_gpsmbo/hpsuggest_ucb.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano.tensor 3 | from hyperopt import rand 4 | 5 | from .hpsuggest import SuggestBest, DomainGP 6 | from .gpr import GPR_ML2 7 | 8 | 9 | class DomainGP_UCB(DomainGP): 10 | GPR = GPR_ML2 11 | 12 | def init_cost_fns(self): 13 | try: 14 | self._cost_fn 15 | except AttributeError: 16 | s_ucb_z = theano.tensor.dscalar('ucb_z') 
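            # -- a sketch of the acquisition criterion built below:
            #        cost(x) = mean(x) - ucb_z * sqrt(var(x))
            #    minimizing it trades off a low predicted mean against a
            #    large predictive variance; raising ucb_z (as optimize_over_X
            #    does when no acceptable point is found) favours exploration.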
17 | 18 | s_mean_x, s_var_x, s_x, K_new = self.gpr.kernel.s_mean_var( 19 | self.gpr.s_X, 20 | self.gpr.s_y, 21 | self.gpr.s_var_y, 22 | self.gpr.s_emp_var, 23 | self.gpr.s_params, 24 | self.gpr.s_var_min, 25 | return_K_new=True) 26 | s_cost = s_mean_x - theano.tensor.sqrt(s_var_x) * s_ucb_z 27 | 28 | s_gx = theano.tensor.grad(s_cost.sum(), s_x) 29 | self._cost_fn = theano.function( 30 | [s_x, s_ucb_z, self.gpr.s_params], s_cost) 31 | self._cost_deriv = theano.function( 32 | [s_x, s_ucb_z, self.gpr.s_params], [s_cost, s_gx]) 33 | self._K_new = theano.function( 34 | [s_x, self.gpr.s_params], K_new) 35 | 36 | 37 | def crit(self, X): 38 | self.init_cost_fns() 39 | if len(self.gpr._params_list) > 1: 40 | raise NotImplementedError() 41 | pp, = self.gpr._params_list 42 | return self._cost_fn(X, self._ucb_z, pp) 43 | 44 | def crit_deriv(self, X): 45 | self.init_cost_fns() 46 | if len(self.gpr._params_list) > 1: 47 | raise NotImplementedError() 48 | pp, = self.gpr._params_list 49 | return self._cost_deriv(X, self._ucb_z, pp) 50 | 51 | def optimize_over_X(self, n_buckshots, n_finetunes, rng): 52 | best_pt = None 53 | while True: 54 | results = DomainGP.optimize_over_X(self, n_buckshots, 55 | n_finetunes, rng, ret_results=True) 56 | Ks = self._K_new(np.asarray([rr[2] for rr in results]), 57 | self.gpr._params_list[0]).T 58 | #order = rng.permutation(len(results)) 59 | order = range(len(results)) 60 | assert len(Ks) == len(results) 61 | for ii in order: 62 | #for Ki, rr in zip(Ks, results): 63 | Ki = Ks[ii] 64 | rr = results[ii] 65 | if Ki.max() > self._K_thresh: 66 | #print 'UCB: skipping pt wit h K', Ki.max() 67 | continue 68 | else: 69 | #print 'UCB: picking pt wit h K', Ki.max() 70 | best_pt = self.best_pt_from_featurevec(rr[2]) 71 | break 72 | if best_pt is None: 73 | self._ucb_z *= 2 + .1 74 | print 'UCB: raising ucb_z to', self._ucb_z 75 | else: 76 | break 77 | #best_pt = self.best_pt_from_featurevec(rval_raw) 78 | return best_pt 79 | 80 | 81 | _suggest_domain_cache = {} 82 | def suggest(new_ids, domain, trials, seed, 83 | warmup_cutoff=15, 84 | n_buckshots=10000, 85 | n_finetunes=50, 86 | stop_at=None, 87 | plot_contours=None, 88 | ): 89 | """ 90 | Parameters 91 | ---------- 92 | 93 | """ 94 | if len(trials.trials) <= warmup_cutoff: 95 | return rand.suggest(new_ids, domain, trials, seed) 96 | 97 | # XXX would like to cache on domain, but 98 | # fmin(fn, space) always rebuilds a new domain for given fn and space 99 | key = domain.expr 100 | try: 101 | dgp = _suggest_domain_cache[key] 102 | except KeyError: 103 | dgp = _suggest_domain_cache[key] = DomainGP_UCB(domain) 104 | 105 | if stop_at is not None and min(trials.losses()) < stop_at: 106 | return [] 107 | 108 | X, y, var_y = dgp._X_y_var_y(trials) 109 | dgp.fit_gpr(X, y, var_y) 110 | print 'Fit ->', dgp.gpr._params_list[0] 111 | dgp._ucb_z = 0.2 112 | # XXX: radius should depend on dimensionality? 
113 | # 1e-8 worked for branin in case current one doesn't 114 | dgp._K_thresh = (1 - 1e-5) # / (1000 + len(y) ** 2)) 115 | 116 | print 'UCB: Best after %i trials: %f' % ( len(y), np.min(y)) 117 | #dgp.gpr._params_list[0][:] = 0 118 | rng = np.random.RandomState(seed) 119 | best_pt = dgp.optimize_over_X( 120 | n_buckshots=n_buckshots, 121 | n_finetunes=n_finetunes, 122 | rng=rng, 123 | ) 124 | if plot_contours: 125 | plot_contours(dgp, 2, dgp._lbound, best_pt) 126 | new_id, = new_ids 127 | #print 'REI: Best pt', best_pt 128 | return SuggestBest(domain, trials, seed, best_pt)(new_id) 129 | # --eof 130 | -------------------------------------------------------------------------------- /hp_gpsmbo/kernels.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as TT 4 | 5 | from .op_Kcond import zero_diag, isnan as s_isnan 6 | from .kernels_base import Kernel 7 | from .kernels_base import euclidean_sq_distances 8 | 9 | from .prodkernels import SqExpProd 10 | 11 | 12 | def check_K(K, tag=None): 13 | return K 14 | import scipy.linalg 15 | def check_pd(op, xin): 16 | try: 17 | scipy.linalg.cholesky(xin + 1e-12 * np.eye(xin.shape[0])) 18 | except: 19 | print 'tag', tag 20 | theano.printing.debugprint(K) 21 | raise 22 | return theano.printing.Print('check_K', global_fn=check_pd)(K) 23 | 24 | 25 | def check_finite(K, tag=None): 26 | return K 27 | def check(op, xin): 28 | try: 29 | assert np.all(np.isfinite(xin)) 30 | except: 31 | print 'tag', tag 32 | theano.printing.debugprint(K) 33 | raise 34 | return theano.printing.Print('check_finite', global_fn=check)(K) 35 | 36 | 37 | class ChoiceN(Kernel): 38 | def __init__(self, upper, conditional, seed=1): 39 | # N.B. seed should not need to be changed 40 | 41 | # -- XXX only need upper-triangle worth of values 42 | # but need Theano triangle-packing op (already exists??) 
43 | #self.n_params = upper * (upper - 1) / 2 44 | self.n_idxs = (upper + 1) if conditional else upper 45 | self.n_params = self.n_idxs ** 2 46 | 47 | self.seed = seed 48 | self.conditional = conditional 49 | 50 | def prodkey(self): 51 | return id(self) # -- choices are not mergeable 52 | 53 | def reduce_lenscale(self, params): 54 | # No-op 55 | return params 56 | 57 | def s_logprior(self, params, strength=10.0): 58 | P_shaped = params.reshape((self.n_idxs, self.n_idxs)) 59 | P_norms = TT.sqrt((P_shaped ** 2).sum(axis=1)) 60 | return strength * ((P_norms - 1) ** 2).sum() 61 | 62 | def unit(self, params): 63 | P_shaped = params.reshape((self.n_idxs, self.n_idxs)) 64 | P_norms = TT.sqrt((P_shaped ** 2).sum(axis=1)) 65 | P_unit = P_shaped / P_norms[:, None] 66 | return P_unit 67 | 68 | def opt_logK(self, x, params): 69 | if self.conditional: 70 | s_x = TT.switch(TT.isnan(x), self.n_idxs - 1, x) 71 | else: 72 | s_x = x 73 | #s_x = theano.printing.Print('x')(s_x) 74 | lbound = 1e-5 75 | ubound = 1.0 76 | params0 = np.random.RandomState(self.seed).uniform( 77 | low=lbound, 78 | high=ubound, 79 | size=(self.n_idxs, self.n_idxs)) 80 | P_unit = self.unit(params) 81 | idxs = s_x.flatten().astype('int32') 82 | #def wtf(node, val): 83 | # print 'IDXS', val 84 | # print 'SELF', self.n_idxs, self.conditional 85 | # return val 86 | 87 | #idxs = theano.printing.Print('idxs', global_fn=wtf)(idxs) 88 | K = TT.dot(P_unit[idxs], P_unit[idxs].T) 89 | #K = K + 1e-12 * TT.eye(x.shape[0]) 90 | bounds = [(lbound, ubound)] * self.n_params 91 | return TT.log(K), list(params0.flatten()), bounds 92 | 93 | def predict_logK(self, x, z, params): 94 | if self.conditional: 95 | s_x = TT.switch(TT.isnan(x), self.n_idxs - 1, x) 96 | s_z = TT.switch(TT.isnan(z), self.n_idxs - 1, z) 97 | else: 98 | s_x = x 99 | s_z = z 100 | P_unit = self.unit(params) 101 | K = TT.dot(P_unit[s_x.flatten().astype('int32')], 102 | P_unit[s_x.flatten().astype('int32')].T) 103 | #K_reg = K + 1e-12 * TT.eye(x.shape[0]) 104 | K_new = TT.dot(P_unit[s_x.flatten().astype('int32')], 105 | P_unit[s_z.flatten().astype('int32')].T) 106 | return TT.log(K), TT.log(K_new) 107 | 108 | 109 | class StationaryBase(Kernel): 110 | """ 111 | 112 | K(x,y) = exp(- ||x-y||^2 / (2 l^2)) 113 | 114 | N.B. the kernel is parameterized by quantity 115 | 116 | alpha = log( 2 * l^2) 117 | 118 | So that 119 | 120 | K(x, y) = exp(- ||x - y|| ** 2 / exp(alpha)) 121 | l = sqrt(exp(alpha) / 2) 122 | 123 | 124 | """ 125 | 126 | @staticmethod 127 | def _alpha_from_l(l): 128 | return np.log(2.0 * l ** 2) 129 | 130 | @staticmethod 131 | def _l_from_alpha(alpha): 132 | return np.sqrt(np.exp(alpha) / 2.) 
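    # -- illustrative round-trip of the parameterization documented above
    #    (values are examples only):
    #        l = 0.7
    #        alpha = StationaryBase._alpha_from_l(l)   # log(2 * 0.49) ~= -0.020
    #        StationaryBase._l_from_alpha(alpha)       # ~= 0.7 again
    #    and the kernel value at distance d is
    #        exp(-d**2 / exp(alpha)) = exp(-d**2 / (2 * l**2))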
133 | 134 | def __init__(self, lenscale, lenscale_min, lenscale_max, conditional): 135 | self._lenscale0 = lenscale 136 | self._lenscale_min = lenscale_min 137 | self._lenscale_max = lenscale_max 138 | self._conditional = conditional 139 | self._n_warp_segments = 0 140 | if conditional: 141 | self.n_params = 3 + self._n_warp_segments 142 | else: 143 | self.n_params = 1 + self._n_warp_segments 144 | 145 | def prodkey(self): 146 | # -- unique identifier of mergeable product sets 147 | return (type(self), 148 | self._conditional, 149 | self._n_warp_segments) 150 | 151 | def props(self): 152 | return ( 153 | self._lenscale0, 154 | self._lenscale_min, 155 | self._lenscale_max, 156 | self._conditional, 157 | self._n_warp_segments, 158 | ) 159 | 160 | def __eq__(self, other): 161 | return type(self) == type(other) and self.props() == other.props() 162 | 163 | def __hash__(self): 164 | return hash((type(self), self.props())) 165 | 166 | def reduce_lenscale(self, params): 167 | new_alpha = params[0] - 1 168 | new_l = max(self._lenscale_min, self._l_from_alpha(new_alpha)) 169 | rval = list(params) 170 | rval[0] = self._alpha_from_l(new_l) 171 | return rval 172 | 173 | def s_logprior(self, params, strength=10.0): 174 | # -- I don't know what distribution this would be 175 | # but I think it makes a nice shape 176 | alpha = params[0] 177 | alpha_min = self._alpha_from_l(self._lenscale_min) 178 | alpha_max = self._alpha_from_l(self._lenscale_max) 179 | #return strength * (alpha - alpha_min) ** 2 180 | log0 = -10000 181 | width = alpha_max - alpha_min 182 | #alpha_mean = 0.5 * (alpha_max + alpha_min) 183 | energy = strength * 0.5 * (alpha - alpha_max) ** 2 / width ** 2 184 | lenscale_logprior = TT.switch(alpha < alpha_min, 185 | log0, 186 | TT.switch(alpha < alpha_max, 187 | -energy, 188 | log0)) 189 | if self._conditional: 190 | diff = params[1:3] - np.asarray([0, 1]) 191 | return lenscale_logprior + TT.dot(diff, diff) 192 | else: 193 | return lenscale_logprior 194 | 195 | def cond_x(self, x, params): 196 | # x is a full matrix, but will only have one column 197 | 198 | x = TT.addbroadcast(x, 1) 199 | if self._conditional: 200 | missing_x = params[1:3] 201 | log_scale_x = params[3:3 + self._n_warp_segments] 202 | else: 203 | log_scale_x = params[1:1 + self._n_warp_segments] 204 | 205 | if self._n_warp_segments: 206 | # XXX 207 | warp_lbound = 0. 208 | warp_ubound = 1. 209 | warp_segments = np.linspace(warp_lbound, 210 | warp_ubound, 211 | self._n_warp_segments) 212 | scale_x = TT.exp(log_scale_x) 213 | z = TT.sum( 214 | TT.tanh(scale_x * (x - warp_segments)), 215 | axis=1)[:, None] 216 | z_min = TT.sum( 217 | TT.tanh(scale_x * (np.zeros((1, 1)) - warp_segments)), 218 | axis=1)[:, None] 219 | z_max = TT.sum( 220 | TT.tanh(scale_x * (np.ones((1, 1)) - warp_segments)), 221 | axis=1)[:, None] 222 | z = (z - z_min) / (z_max - z_min) 223 | else: 224 | z = x 225 | if self._conditional: 226 | x2_base = TT.switch(s_isnan(x), missing_x, 0) 227 | x2 = TT.inc_subtensor(x2_base[:, 0:1], TT.switch(s_isnan(x), 0, z)) 228 | return x2 229 | else: 230 | return z 231 | 232 | def opt_logK(self, x, params): 233 | x2 = self.cond_x(x, params) 234 | logK = self._logK_of_dist(euclidean_sq_distances(x2, x2), params, True) 235 | params0 = [self._alpha_from_l(self._lenscale0)] 236 | if self._conditional: 237 | params0.extend([0., 1.]) 238 | params0.extend([0.] 
* self._n_warp_segments) 239 | amin = None if self._lenscale_min is None else ( 240 | self._alpha_from_l(self._lenscale_min)) 241 | amax = None if self._lenscale_max is None else ( 242 | self._alpha_from_l(self._lenscale_max)) 243 | bounds = [[amin, amax]] 244 | if self._conditional: 245 | bounds.extend([(-5., 5.), (1e-5, 5.)]) 246 | bounds.extend([(-.2, 2.)] * self._n_warp_segments) 247 | return logK, params0, bounds 248 | 249 | def predict_logK(self, x, z, params): 250 | x2 = self.cond_x(x, params) 251 | z2 = self.cond_x(z, params) 252 | logK = self._logK_of_dist(euclidean_sq_distances(x2, x2), params, True) 253 | logK_new = self._logK_of_dist(euclidean_sq_distances(x2, z2), params, False) 254 | return logK, logK_new 255 | 256 | 257 | class SqExp(StationaryBase): 258 | Product = SqExpProd 259 | def _logK_of_dist(self, sq_dists, params, self_sim): 260 | _alpha = params[0] 261 | ll2 = TT.exp(_alpha) # aka 2 * l ** 2 262 | return -sq_dists / ll2 263 | 264 | 265 | class Matern12(SqExp): 266 | def _K_of_dist(self, sq_dists, params, self_sim): 267 | _alpha = params[0] 268 | ll = TT.sqrt(.5 * TT.exp(_alpha)) 269 | return TT.exp(-TT.sqrt(sq_dists) / ll) 270 | 271 | 272 | class Matern32(StationaryBase): 273 | def _K_of_dist(self, sq_dists, params, self_sim): 274 | _alpha = params[0] 275 | ll2 = .5 * TT.exp(_alpha) # aka l ** 2 276 | nrmsq = sq_dists / ll2 277 | if self_sim: 278 | # -- help grad by suppressing 0/0 -> NaN 279 | nrmsq = zero_diag(nrmsq) 280 | nrm_root_3 = TT.sqrt(3 * nrmsq) 281 | return ((1 + nrm_root_3) * TT.exp(-nrm_root_3)) 282 | 283 | 284 | class Matern52(StationaryBase): 285 | def _K_of_dist(self, sq_dists, params, self_sim): 286 | _alpha = params[0] 287 | ll2 = .5 * TT.exp(_alpha) # aka l ** 2 288 | nrmsq = sq_dists / ll2 289 | if self_sim: 290 | # -- help grad by suppressing 0/0 -> NaN 291 | nrmsq = zero_diag(nrmsq) 292 | nrm_root_5 = TT.sqrt(5 * nrmsq) 293 | coef = 1 + nrm_root_5 + 5. / 3. * nrmsq 294 | return coef * TT.exp(-nrm_root_5) 295 | 296 | 297 | Choice2 = SqExp 298 | #class Choice2(StationaryBase): 299 | #def _logK_of_dist(self, sq_dists, params, self_sim): 300 | #_alpha = params[0] 301 | #ll2 = TT.exp(_alpha) # aka 2 * l ** 2 302 | #return -sq_dists / ll2 303 | 304 | 305 | def product(kernels, slices): 306 | from gby import groupby 307 | # -- there are some kernels whose product can be handled 308 | # by the same sort of Theano graph as it takes to handle 309 | # just one term of the product. Pre-consolidating such 310 | # sub-products saves a huge amount of compilation time 311 | # and it runs faster too. 312 | prod_mergeable = groupby(zip(kernels, slices), 313 | lambda ks: ks[0].prodkey()) 314 | kernels_ = [] 315 | slices_ = [] 316 | for key, mergeable in prod_mergeable.items(): 317 | print key, mergeable 318 | if len(mergeable) > 1: 319 | kern = mergeable[0][0].Product(mergeable) 320 | slc = kern.column_idxs 321 | else: 322 | (kern, slc), = mergeable 323 | kernels_.append(kern) 324 | slices_.append(slc) 325 | if len(kernels_) == 1: 326 | # -- XXX ignores slc ... is ok? 
327 | return kernels_[0] 328 | return Product(kernels_, slices_) 329 | 330 | 331 | class Product(Kernel): 332 | def __init__(self, kernels, slices): 333 | self.kernels = kernels 334 | self.slices = slices 335 | self.n_params = sum(k.n_params for k in kernels) 336 | 337 | def reduce_lenscale(self, params): 338 | rval = np.zeros_like(params) 339 | offset = 0 340 | for k in self.kernels: 341 | rval[offset: offset + k.n_params] = ( 342 | k.reduce_lenscale(params[offset: offset + k.n_params])) 343 | offset += k.n_params 344 | return rval 345 | 346 | def s_logprior(self, params): 347 | offset = 0 348 | lps = [] 349 | for k in self.kernels: 350 | lps.append(k.s_logprior(params[offset: offset + k.n_params])) 351 | offset += k.n_params 352 | return reduce(lambda a, b: a + b, lps) 353 | 354 | def opt_logK(self, x, params): 355 | # return a cost, and parameter vector suitable for fitting 356 | # the GP, and bounds on that parameter vector 357 | 358 | params0 = [] 359 | bounds = [] 360 | offset = 0 361 | logKs = [] 362 | for kern, slice_k in zip(self.kernels, self.slices): 363 | params_k = params[offset: offset + kern.n_params] 364 | #if slice_k is None: 365 | #logK_k, params0_k, bounds_k = kern.opt_logK(x, params_k) 366 | #else: 367 | logK_k, params0_k, bounds_k = kern.opt_logK(x[:, slice_k], 368 | params_k) 369 | logKs.append(check_K(logK_k)) 370 | params0.extend(params0_k) 371 | bounds.extend(bounds_k) 372 | offset += kern.n_params 373 | 374 | if len(self.kernels) == 1: 375 | return logKs[0], params0, bounds 376 | else: 377 | Kstack = TT.stack(*logKs) 378 | logK = TT.sum(Kstack, axis=0) 379 | return logK, params0, bounds 380 | 381 | def predict_logK(self, x, z, params): 382 | # s_mean, s_x for computing mean from s_x 383 | logKs = [] 384 | logKs_new = [] 385 | offset = 0 386 | for kern, slice_k in zip(self.kernels, self.slices): 387 | params_k = params[offset: offset + kern.n_params] 388 | #if slice_k is None: 389 | #logK_k, logK_new_k = kern.predict_logK(x, z, params_k) 390 | #else: 391 | logK_k, logK_new_k = kern.predict_logK( 392 | x[:, slice_k], z[:, slice_k], params_k) 393 | logKs.append(logK_k) 394 | logKs_new.append(logK_new_k) 395 | offset += kern.n_params 396 | 397 | if len(self.kernels) == 1: 398 | return logKs[0], logKs_new[0] 399 | else: 400 | logK = TT.sum(TT.stack(*logKs), axis=0) 401 | logK_new = TT.sum(TT.stack(*logKs_new), axis=0) 402 | return logK, logK_new 403 | 404 | 405 | def prod_of(Kcls, slices): 406 | kernels = [Kcls() for ii in range(len(slices))] 407 | return Product(kernels, slices) 408 | 409 | 410 | class Mixture(Kernel): 411 | def __init__(self, kernels, slices): 412 | self.kernels = kernels 413 | self.slices = slices 414 | self.n_my_params = len(kernels) - 1 415 | self.n_params = sum(k.n_params for k in kernels) + self.n_my_params 416 | self.prior_strength = 2.0 417 | 418 | def reduce_lenscale(self, params): 419 | rval = np.zeros_like(params) 420 | offset = 0 421 | for k in self.kernels: 422 | rval[offset: offset + k.n_params] = ( 423 | k.reduce_lenscale(params[offset: offset + k.n_params])) 424 | offset += k.n_params 425 | # shrink weights back to even weighting 426 | rval[offset: offset + len(self.kernels) - 1] *= 0.75 427 | return rval 428 | 429 | def s_logprior(self, params): 430 | offset = 0 431 | lps = [] 432 | for k in self.kernels: 433 | lps.append(k.s_logprior(params[offset: offset + k.n_params])) 434 | offset += k.n_params 435 | # -- multiplicative because they are independent 436 | lp = reduce(lambda a, b: a + b, lps) 437 | log_weights = params[offset: 
offset + self.n_my_params] 438 | return lp - self.prior_strength * TT.dot(log_weights, log_weights) 439 | 440 | def opt_K(self, x, params): 441 | # return a cost, and parameter vector suitable for fitting 442 | # the GP, and bounds on that parameter vector 443 | 444 | params0 = [] 445 | bounds = [] 446 | offset = 0 447 | Ks = [] 448 | for kern, slice_k in zip(self.kernels, self.slices): 449 | params_k = params[offset: offset + kern.n_params] 450 | K_k, params0_k, bounds_k = kern.opt_K(x[:, slice_k], params_k) 451 | Ks.append(K_k) 452 | params0.extend(params0_k) 453 | bounds.extend(bounds_k) 454 | offset += kern.n_params 455 | 456 | params0.extend([0.0] * self.n_my_params) 457 | bounds.extend([(-4, 4)] * self.n_my_params) 458 | 459 | log_weights = TT.concatenate((np.asarray([0.0]), 460 | params[offset:offset + self.n_my_params])) 461 | weights = TT.exp(log_weights) / TT.exp(log_weights).sum() 462 | 463 | if len(self.kernels) == 1: 464 | return Ks[0], params0, bounds 465 | else: 466 | Kstack = TT.stack(*Ks) 467 | weighted_Kstack = weights[:, None, None] * Kstack 468 | K = TT.sum(weighted_Kstack, axis=0) 469 | # XXX: log_K, should be logadd here (#11) 470 | return K, params0, bounds 471 | 472 | def predict_K(self, x, z, params): 473 | # s_mean, s_x for computing mean from s_x 474 | Ks = [] 475 | Ks_new = [] 476 | offset = 0 477 | for kern, slice_k in zip(self.kernels, self.slices): 478 | params_k = params[offset: offset + kern.n_params] 479 | K_k, K_new_k = kern.predict_K( 480 | x[:, slice_k], z[:, slice_k], params_k) 481 | Ks.append(K_k) 482 | Ks_new.append(K_new_k) 483 | offset += kern.n_params 484 | 485 | log_weights = TT.concatenate((np.asarray([0]), 486 | params[offset:offset + self.n_my_params])) 487 | weights = TT.exp(log_weights) / TT.exp(log_weights).sum() 488 | 489 | if len(self.kernels) == 1: 490 | return Ks[0], Ks_new[0] 491 | else: 492 | # XXX: log_K, should be logadd here (#11) 493 | wK = TT.sum( 494 | weights[:, None, None] * TT.stack(*Ks), axis=0) 495 | wK_new = TT.sum( 496 | weights[:, None, None] * TT.stack(*Ks_new), axis=0) 497 | return wK, wK_new 498 | 499 | def mix_of(Kcls, slices): 500 | kernels = [Kcls() for ii in range(len(slices))] 501 | return Mixture(kernels, slices) 502 | -------------------------------------------------------------------------------- /hp_gpsmbo/kernels.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyperopt/hyperopt-gpsmbo/8009f82a18620b33faecca2382f973bc214bd88c/hp_gpsmbo/kernels.pyc -------------------------------------------------------------------------------- /hp_gpsmbo/kernels_base.py: -------------------------------------------------------------------------------- 1 | 2 | import theano 3 | import theano.tensor as TT 4 | 5 | from .gpr_math import s_nll, s_mean, s_variance 6 | 7 | #TODO: Match name to scikits.learn 8 | def euclidean_sq_distances(x, z): 9 | """Matrix of distances for each row in x to each row in z 10 | """ 11 | 12 | # -- TODO: better numerical accuracy 13 | d = ((x ** 2).sum(axis=1).dimshuffle(0, 'x') 14 | + (z ** 2).sum(axis=1) 15 | - 2 * TT.dot(x, z.T)) 16 | return TT.maximum(d, 0) 17 | 18 | 19 | class Kernel(object): 20 | 21 | def s_nll_params(self, x, y, var_y, prior_var, params=None, ret_K=False): 22 | # return a cost, and parameter vector suitable for fitting 23 | # the GP, and bounds on that parameter vector 24 | 25 | # -- turn these to constants 26 | x = TT.as_tensor_variable(x) 27 | y = TT.as_tensor_variable(y) 28 | if params is None: 29 | params 
= theano.tensor.dvector() 30 | else: 31 | params = theano.tensor.as_tensor_variable(params) 32 | assert params.ndim == 1 33 | K, params0, bounds = self.opt_K(x, params) 34 | nll = s_nll(K, y, var_y=var_y, prior_var=prior_var) 35 | if ret_K: 36 | return nll, params, params0, bounds, K 37 | return nll, params, params0, bounds 38 | 39 | def s_mean_var(self, x, y, var_y, prior_var, best_params, var_min, 40 | x_new=None, 41 | return_K_new=False): 42 | # s_mean, s_x for computing mean from s_x 43 | 44 | # -- turn these to constants 45 | x = TT.as_tensor_variable(x) 46 | y = TT.as_tensor_variable(y) 47 | if x_new is None: 48 | x_new = TT.matrix() 49 | else: 50 | assert x_new.ndim == 2 51 | params = TT.as_tensor_variable(best_params) 52 | K, K_new = self.predict_K(x, x_new, params) 53 | K.name = 'K' 54 | K_new.name = 'K_new' 55 | mean = s_mean(K, y, var_y, prior_var, K_new) 56 | var = s_variance(K, y, var_y, prior_var, K_new, var_min) 57 | mean.name = 'mean_new' 58 | var.name = 'var_new' 59 | rval = [mean, var, x_new] 60 | if return_K_new: 61 | rval.append(K_new) 62 | return rval 63 | 64 | def predict_K(self, *args, **kwargs): 65 | logK, logK_new = self.predict_logK(*args, **kwargs) 66 | return TT.exp(logK), TT.exp(logK_new) 67 | 68 | def opt_K(self, *args, **kwargs): 69 | logK, params, bounds = self.opt_logK(*args, **kwargs) 70 | return TT.exp(logK), params, bounds 71 | 72 | -------------------------------------------------------------------------------- /hp_gpsmbo/op_Kcond.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from theano import Op, Apply, gradient 3 | from theano import tensor as TT 4 | 5 | class KCond(Op): 6 | """ 7 | Return a vector of indexes of K to keep 8 | """ 9 | def __init__(self): 10 | self.destructive = False 11 | 12 | self.props = (self.destructive,) 13 | 14 | def __hash__(self): 15 | return hash((type(self), self.props)) 16 | 17 | def __eq__(self, other): 18 | return (type(self) == type(other) and self.props == other.props) 19 | 20 | #def infer_shape(self, node, shapes): 21 | #return [shapes[0]] 22 | 23 | def __str__(self): 24 | return 'KCond' 25 | 26 | def make_node(self, K, y, eps): 27 | K = TT.as_tensor_variable(K) 28 | y = TT.as_tensor_variable(y) 29 | eps = TT.as_tensor_variable(eps) 30 | return Apply(self, [K, y, eps], [TT.ivector()]) 31 | 32 | def perform(self, node, inputs, outputs): 33 | K, y, eps = inputs 34 | M = K.shape[0] 35 | assert (M, M) == K.shape 36 | assert (M,) == y.shape 37 | order = np.argsort(y) # best to worst 38 | keep = np.ones_like(y).astype(np.int32) # order matches K, y 39 | assert np.allclose(np.diag(K), 1.0) 40 | max_similarity = (K - np.eye(M)).max() 41 | if max_similarity + eps > 1.0: 42 | print 'max_similarity', max_similarity 43 | 44 | for ii in xrange(M - 1): 45 | this = order[ii] 46 | if not keep[this]: 47 | continue 48 | # -- we have committed to using row `this` 49 | # -- Now, delete all worse points within epsilon of row `this` 50 | # (all pts remaining in `order` are worse by definition) 51 | K_this = K[this] 52 | for jj in xrange(ii + 1, M): 53 | other = order[jj] 54 | if not keep[other]: # -- other's already gone 55 | continue 56 | if (1 - K_this[other]) < eps: 57 | keep[other] = 0 58 | keep_idxs = np.where(keep)[0].astype(np.int32) 59 | if len(keep_idxs) < M: 60 | print 'Dropping %i rows to condition K' % ( 61 | M - len(keep_idxs)) 62 | outputs[0][0] = keep_idxs 63 | 64 | def grad(self, inputs, gradients): 65 | return [inp.zeros_like() for inp in inputs] 66 | 67 | 68 
| def K_cond(K, y, eps): 69 | keep_idxs = KCond()(K, y, eps) 70 | keep_y = y[keep_idxs] 71 | # -- we want to keep the given rows and cols, hence: 72 | keep_K = K[keep_idxs].T[keep_idxs].T 73 | assert keep_K.type == K.type 74 | assert keep_y.type == y.type 75 | return keep_K, keep_y, keep_idxs 76 | 77 | class ZeroDiag(Op): 78 | """ Return a square matrix with the diagonal zero-d out. 79 | 80 | The advantage of this Op over masking techniques based on arithmetic 81 | is that this Op can remove NaNs from the diagonal. 82 | """ 83 | def __init__(self): 84 | self.destructive = False 85 | self.props = (self.destructive,) 86 | 87 | def __hash__(self): 88 | return hash((type(self), self.props)) 89 | 90 | def __eq__(self, other): 91 | return (type(self) == type(other) and self.props == other.props) 92 | 93 | def infer_shape(self, node, shapes): 94 | return shapes 95 | 96 | def __str__(self): 97 | return 'ZeroDiag' 98 | 99 | def make_node(self, K): 100 | K = TT.as_tensor_variable(K) 101 | return Apply(self, [K], [K.type()]) 102 | 103 | def perform(self, node, inputs, outputs): 104 | K, = inputs 105 | rval = K.copy() 106 | idxs = np.arange(K.shape[0]) 107 | rval[idxs, idxs] = 0 108 | outputs[0][0] = rval 109 | 110 | def connection_pattern(self, node): 111 | return [[True]] 112 | 113 | def grad(self, inputs, gradients): 114 | gY = gradients[0] 115 | return [zero_diag(gY)] 116 | 117 | zero_diag = ZeroDiag() 118 | 119 | 120 | class ZeroForNan(Op): 121 | """ Return a square matrix with the diagonal zero-d out. 122 | 123 | The advantage of this Op over masking techniques based on arithmetic 124 | is that this Op can remove NaNs from the diagonal. 125 | """ 126 | def __init__(self): 127 | self.destructive = False 128 | self.props = (self.destructive,) 129 | 130 | def __hash__(self): 131 | return hash((type(self), self.props)) 132 | 133 | def __eq__(self, other): 134 | return (type(self) == type(other) and self.props == other.props) 135 | 136 | def infer_shape(self, node, shapes): 137 | return shapes 138 | 139 | def __str__(self): 140 | return 'ZeroForNan' 141 | 142 | def make_node(self, K): 143 | K = TT.as_tensor_variable(K) 144 | return Apply(self, [K], [K.type()]) 145 | 146 | def perform(self, node, inputs, outputs): 147 | K, = inputs 148 | rval = K.copy() 149 | rval[np.isnan(rval)] = 0 150 | outputs[0][0] = rval 151 | 152 | def connection_pattern(self, node): 153 | return [[True]] 154 | 155 | def grad(self, inputs, gradients): 156 | #K, = inputs 157 | gY, = gradients 158 | return [gY] 159 | 160 | zero_for_nan = ZeroForNan() 161 | 162 | 163 | class IsNan(Op): 164 | """ Return a square matrix with the diagonal zero-d out. 165 | 166 | The advantage of this Op over masking techniques based on arithmetic 167 | is that this Op can remove NaNs from the diagonal. 
168 | """ 169 | def __init__(self): 170 | self.destructive = False 171 | self.props = (self.destructive,) 172 | 173 | def __hash__(self): 174 | return hash((type(self), self.props)) 175 | 176 | def __eq__(self, other): 177 | return (type(self) == type(other) and self.props == other.props) 178 | 179 | def infer_shape(self, node, shapes): 180 | return shapes 181 | 182 | def __str__(self): 183 | return 'IsNan' 184 | 185 | def make_node(self, K): 186 | K = TT.as_tensor_variable(K) 187 | otype = TT.TensorType(dtype='int8', 188 | broadcastable=K.broadcastable) 189 | return Apply(self, [K], [otype()]) 190 | 191 | def perform(self, node, inputs, outputs): 192 | outputs[0][0] = np.isnan(inputs[0]).astype('int8') 193 | 194 | #def connection_pattern(self, node): 195 | #return [[False]] 196 | 197 | def grad(self, inputs, gradients): 198 | return [gradient.DisconnectedType()()] 199 | 200 | isnan = IsNan() 201 | 202 | import scipy.linalg 203 | import theano 204 | from theano.gof import local_optimizer, PureOp 205 | from theano.tensor.opt import (register_stabilize, 206 | register_specialize, register_canonicalize) 207 | from theano.sandbox.linalg.ops import Cholesky 208 | 209 | class LazyCholesky(PureOp): 210 | def __init__(self, lower): 211 | self.lower = lower 212 | self.props = (lower,) 213 | 214 | def __hash__(self): 215 | return hash((type(self), self.props)) 216 | 217 | def __eq__(self, other): 218 | return (type(self) == type(other) and self.props == other.props) 219 | 220 | def make_node(self, X, use_buf, buf_idx): 221 | return Apply(self, 222 | [X, use_buf, buf_idx], 223 | [X.type(), theano.gof.type.generic()]) 224 | 225 | def infer_shape(self, node, shapes): 226 | return [shapes[0], None] 227 | 228 | def make_thunk(self, node, storage_map, compute_map, no_recycling): 229 | s_X, s_use_buf, s_buf_idx = node.inputs 230 | s_chol, s_buf = node.outputs 231 | comp_X = compute_map[s_X] 232 | comp_use_buf = compute_map[s_use_buf] 233 | comp_buf_idx = compute_map[s_buf_idx] 234 | comp_chol = compute_map[s_chol] 235 | #comp_buf = compute_map[s_buf] 236 | 237 | stor_X = storage_map[s_X] 238 | stor_use_buf = storage_map[s_use_buf] 239 | stor_buf_idx = storage_map[s_buf_idx] 240 | stor_chol = storage_map[s_chol] 241 | stor_buf = storage_map[s_buf] 242 | def thunk(): 243 | # -- compute the use_buf flag 244 | if not comp_use_buf[0]: 245 | return [1] 246 | if not comp_buf_idx[0]: 247 | return [2] 248 | buf_idx = int(stor_buf_idx[0]) 249 | use_buf = stor_use_buf[0] 250 | if use_buf: 251 | buf_dict = stor_buf[0] 252 | assert buf_dict is not None, 'buf output is empty' 253 | chol = buf_dict[buf_idx] 254 | else: 255 | # -- compute a cholesky and store to buffer 256 | if not comp_X[0]: 257 | return [0] 258 | X = stor_X[0] 259 | chol = scipy.linalg.cholesky(X, lower=self.lower) 260 | print 'computing cholesky', buf_idx 261 | if stor_buf[0] is None: 262 | stor_buf[0] = {} 263 | chol = chol.astype(X.dtype) 264 | buf_dict = stor_buf[0] 265 | buf_dict[buf_idx] = chol 266 | 267 | stor_chol[0] = chol.copy() 268 | comp_chol[0] = 1 269 | return [] 270 | 271 | thunk.lazy = True 272 | thunk.inputs = [storage_map[v] for v in node.inputs] 273 | thunk.outputs = [storage_map[v] for v in node.outputs] 274 | return thunk 275 | 276 | use_lazy_cholesky = False 277 | use_lazy_cholesky_idx = None 278 | 279 | @register_specialize 280 | @local_optimizer(None) 281 | def lazy_cholesky(node): 282 | """ 283 | If a general solve() is applied to the output of a cholesky op, then 284 | replace it with a triangular solve. 
285 | """ 286 | if not use_lazy_cholesky: 287 | return 288 | 289 | if isinstance(node.op, Cholesky): 290 | assert use_lazy_cholesky.name 291 | for var in node.fgraph.variables: 292 | if var.name == use_lazy_cholesky.name: 293 | break 294 | else: 295 | raise Exception('var not found in graph', use_lazy_cholesky) 296 | buf_flag = var 297 | 298 | for var in node.fgraph.variables: 299 | if var.name == use_lazy_cholesky_idx.name: 300 | break 301 | else: 302 | raise Exception('var not found in graph', use_lazy_cholesky_idx) 303 | buf_idx = var 304 | assert buf_idx is not buf_flag 305 | X, = node.inputs 306 | chol, buf = LazyCholesky(node.op.lower)(X, buf_flag, buf_idx) 307 | assert chol.type == node.outputs[0].type 308 | return [chol] 309 | 310 | 311 | from scipy.stats import norm 312 | 313 | class NormalLogEIDiffSigmaScalar(theano.scalar.basic.ScalarOp): 314 | nin = 2 315 | def __eq__(self, other): 316 | return type(self) == type(other) 317 | 318 | def __hash__(self): 319 | return hash(type(self)) 320 | 321 | def impl(self, diff, sigma): 322 | z = diff / sigma 323 | if z < 34: 324 | a = -diff * norm.cdf(-z) 325 | b = sigma * norm.pdf(-z) 326 | rval = np.log(a + b) 327 | else: 328 | rval = (-4.86466981 329 | -0.12442506 * z 330 | -0.49903031 * z ** 2) 331 | return rval 332 | 333 | def c_code(self, node, name, inp, out, sub): 334 | diff, sigma = inp 335 | y, = out 336 | z = y + '_z' 337 | a = y + '_a' 338 | b = y + '_b' 339 | cdf = y + '_cdf' 340 | pdf = y + '_pdf' 341 | #root_2pi = '%' % np.sqrt(2 * np.pi) 342 | if node.inputs[0].type in theano.scalar.basic.float_types: 343 | return """ 344 | double %(z)s = %(diff)s / %(sigma)s; 345 | if (%(z)s < 34) 346 | { 347 | double %(cdf)s = .5 * erfc(%(z)s / sqrt(2.)); 348 | double %(pdf)s = exp(-.5 * %(z)s * %(z)s) / sqrt(2 * M_PI); 349 | double %(a)s = -%(diff)s * %(cdf)s; 350 | double %(b)s = %(sigma)s * %(pdf)s; 351 | %(y)s = log(%(a)s + %(b)s); 352 | } 353 | else 354 | { 355 | %(y)s = -4.86466981 356 | -0.12442506 * %(z)s 357 | -0.49903031 * %(z)s * %(z)s; 358 | } 359 | """ % locals() 360 | raise NotImplementedError('only floating point is implemented') 361 | 362 | def c_code_cache_version(self): 363 | return (1,) 364 | 365 | def grad(self, inp, grads): 366 | y = self(*inp) 367 | gy, = grads 368 | float_out = theano.scalar.basic.float_out 369 | gd = NormalLogEIDiffSigmaScalarGrad0(float_out)(y, gy, *inp) 370 | gs = NormalLogEIDiffSigmaScalarGrad1(float_out)(y, gy, *inp) 371 | return gd, gs 372 | 373 | class NormalLogEIDiffSigmaScalarGrad0(theano.scalar.basic.ScalarOp): 374 | nin = 4 375 | def __eq__(self, other): 376 | return type(self) == type(other) 377 | 378 | def __hash__(self): 379 | return hash(type(self)) 380 | 381 | def impl(self, logEI, glogEI, diff, sigma): 382 | z = diff / sigma 383 | if z < 34: 384 | logcdf = norm.logcdf(-z, 0, 1) 385 | ddiff = -np.exp(logcdf - logEI) # aka: -cdf / EI 386 | else: 387 | foo = 2 * .49903031 388 | dz = (-0.12442506 - foo * z) 389 | ddiff = dz / sigma 390 | return ddiff * glogEI 391 | 392 | def c_code(self, node, name, inp, out, sub): 393 | logEI, glogEI, diff, sigma = inp 394 | y, = out 395 | z = y + '_z' 396 | logcdf = y + '_logcdf' 397 | #root_2pi = '%' % np.sqrt(2 * np.pi) 398 | if node.inputs[0].type in theano.scalar.basic.float_types: 399 | return """ 400 | double %(z)s = %(diff)s / %(sigma)s; 401 | if (%(z)s < 34) 402 | { 403 | double %(logcdf)s = log(.5) + log(erfc(%(z)s / sqrt(2.))); 404 | %(y)s = -exp(%(logcdf)s - %(logEI)s) * %(glogEI)s; 405 | } 406 | else 407 | { 408 | %(y)s = (-0.12442506 - 2 * 
.49903031 * %(z)s) 409 | / %(sigma)s 410 | * %(glogEI)s; 411 | } 412 | """ % locals() 413 | raise NotImplementedError('only floating point is implemented') 414 | 415 | def c_code_cache_version(self): 416 | return (1,) 417 | 418 | class NormalLogEIDiffSigmaScalarGrad1(theano.scalar.basic.ScalarOp): 419 | nin = 4 420 | def __eq__(self, other): 421 | return type(self) == type(other) 422 | 423 | def __hash__(self): 424 | return hash(type(self)) 425 | 426 | def impl(self, logEI, glogEI, diff, sigma): 427 | z = diff / sigma 428 | if z < 34: 429 | logpdf = norm.logpdf(-z, 0, 1) 430 | dsigma = np.exp(logpdf - logEI) # aka: pdf / EI 431 | else: 432 | foo = 2 * .49903031 433 | dz = (-0.12442506 - foo * z) 434 | dsigma = dz * (-z / sigma) 435 | #(foo * z) ** 2 / sigma 436 | return dsigma * glogEI 437 | 438 | def c_code(self, node, name, inp, out, sub): 439 | logEI, glogEI, diff, sigma = inp 440 | y, = out 441 | z = y + '_z' 442 | logpdf = y + '_logpdf' 443 | #root_2pi = '%' % np.sqrt(2 * np.pi) 444 | if node.inputs[0].type in theano.scalar.basic.float_types: 445 | return """ 446 | double %(z)s = %(diff)s / %(sigma)s; 447 | if (%(z)s < 34) 448 | { 449 | double %(logpdf)s = -.5 * (log(2 * M_PI) + %(z)s * %(z)s); 450 | %(y)s = exp(%(logpdf)s - %(logEI)s) * %(glogEI)s; 451 | } 452 | else 453 | { 454 | %(y)s = (-0.12442506 - 2 * .49903031 * %(z)s) 455 | * (-%(z)s / %(sigma)s) 456 | * %(glogEI)s; 457 | } 458 | """ % locals() 459 | raise NotImplementedError('only floating point is implemented') 460 | 461 | def c_code_cache_version(self): 462 | return (1,) 463 | 464 | normal_logEI_diff_sigma_scalar = NormalLogEIDiffSigmaScalar( 465 | theano.scalar.upgrade_to_float_no_complex, 466 | name='normal_logEI_diff_sigma_elemwise') 467 | 468 | normal_logEI_diff_sigma_elemwise = theano.tensor.Elemwise( 469 | normal_logEI_diff_sigma_scalar) 470 | 471 | class NormalLogEIDiffSigma(theano.Op): 472 | def __eq__(self, other): 473 | return type(self) == type(other) 474 | 475 | def __hash__(self): 476 | return hash((type(self),)) 477 | 478 | def make_node(self, diff, sigma): 479 | diff = theano.tensor.as_tensor_variable(diff) 480 | sigma = theano.tensor.as_tensor_variable(sigma) 481 | foo = diff + sigma 482 | return theano.Apply(self, [diff, sigma], [foo.type()]) 483 | 484 | def perform(self, node, inputs, output_storage): 485 | diff, sigma = inputs 486 | z = diff / sigma 487 | # -- the following formula is cuter, but 488 | # Theano doesn't produce as stable a gradient I think? 
489 | #return sigma * (z * s_normal_cdf(z, 0, 1) + s_normal_pdf(z, 0, 1)) 490 | a = -diff * norm.cdf(-z, 0, 1) 491 | b = sigma * norm.pdf(-z, 0, 1) 492 | rval_naive = np.log(a + b) 493 | zz = z[z > 34] 494 | interp = (-4.86466981 495 | -0.12442506 * zz 496 | -0.49903031 * zz ** 2) 497 | rval_naive[z > 34] = interp 498 | output_storage[0][0] = rval_naive 499 | 500 | def grad(self, inputs, output_gradients): 501 | y = NormalLogEIDiffSigma()(*inputs) 502 | gy, = output_gradients 503 | return NormalLogEIGrad()(y, gy, *inputs) 504 | 505 | normal_logEI_diff_sigma = NormalLogEIDiffSigma() 506 | 507 | 508 | class NormalLogEIGrad(theano.Op): 509 | def __eq__(self, other): 510 | return type(self) == type(other) 511 | 512 | def __hash__(self): 513 | return hash((type(self),)) 514 | 515 | def make_node(self, logEI, gEI, diff, sigma): 516 | return theano.Apply(self, 517 | [logEI, gEI, diff, sigma], 518 | [diff.type(), sigma.type()]) 519 | 520 | def perform(self, node, inputs, output_storage): 521 | logEI, gEI, diff, sigma = inputs 522 | z = diff / sigma 523 | logcdf = norm.logcdf(-z, 0, 1) 524 | logpdf = norm.logpdf(-z, 0, 1) 525 | #for zi, a, b, c in zip(z, logcdf, logpdf, logEI): 526 | #print zi, 'cdf', a, 'pdf', b, 'EI', c, 'logdz', a - c, 'logsig', b - c 527 | dz = -np.exp(logcdf - logEI) # aka: -cdf / EI 528 | dsigma = np.exp(logpdf - logEI) # aka: pdf / EI 529 | 530 | #if np.any(z > 20): 531 | # print 'NormalLogEIGrad: bigz', z[z > 20] 532 | 533 | foo = 2 * .49903031 534 | dz[z > 34] = -0.12442506 - foo * z[z > 34] 535 | dsigma[z > 34] = dz[z > 34] * (-z[z > 34] / sigma[z > 34]) 536 | dz[z > 34] /= sigma[z > 34] 537 | 538 | output_storage[0][0] = dz * gEI 539 | output_storage[1][0] = dsigma * gEI 540 | #if np.any(np.isnan(dz)): 541 | # import pdb; pdb.set_trace() 542 | #print ('logEI grad: gEI=%s dz=%s dsigma=%s' % (gEI, dz, dsigma)) 543 | 544 | 545 | # -- eof 546 | -------------------------------------------------------------------------------- /hp_gpsmbo/prodkernels.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .kernels_base import Kernel 3 | 4 | import theano.tensor as TT 5 | from .op_Kcond import zero_diag, isnan as s_isnan 6 | from .kernels_base import euclidean_sq_distances 7 | 8 | class SqExpProd(Kernel): 9 | """ 10 | 11 | K(x,y) = exp(- ||x-y||^2 / (2 l^2)) 12 | 13 | N.B. the kernel is parameterized by quantity 14 | 15 | alpha = log( 2 * l^2) 16 | 17 | So that 18 | 19 | K(x, y) = exp(- ||x - y|| ** 2 / exp(alpha)) 20 | l = sqrt(exp(alpha) / 2) 21 | 22 | 23 | """ 24 | 25 | @staticmethod 26 | def _alpha_from_l(l): 27 | return np.log(2.0 * l ** 2) 28 | 29 | @staticmethod 30 | def _l_from_alpha(alpha): 31 | return np.sqrt(np.exp(alpha) / 2.) 
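    # -- why merging helps: a product of one-dimensional SqExp kernels,
    #    one per column, collapses into a single expression,
    #        prod_d exp(-(x_d - z_d)**2 / (2 * l_d**2))
    #            = exp(-0.5 * sum_d ((x_d - z_d) / l_d)**2),
    #    so opt_logK / predict_logK below can use one scaled
    #    squared-distance matrix instead of one Theano sub-graph per
    #    column.  This is what kernels.product() relies on when it
    #    merges kernels that share the same prodkey().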
32 | 33 | def __init__(self, 34 | seq_kern_slice): 35 | #lenscales_0, 36 | #lenscales_min, 37 | #lenscales_max, 38 | #conditional): 39 | kerns, slices = zip(*seq_kern_slice) 40 | self._conditional = kerns[0]._conditional 41 | assert all(self._conditional == kern._conditional 42 | for kern, slc in seq_kern_slice) 43 | self._lenscales_0 = np.asarray([kern._lenscale0 for kern in kerns]) 44 | self._lenscales_min = np.asarray([kern._lenscale_min for kern in kerns]) 45 | self._lenscales_max = np.asarray([kern._lenscale_max for kern in kerns]) 46 | 47 | self._n_warp_segments_per_X = 0 48 | if self._conditional: 49 | self.n_params = 3 + self._n_warp_segments_per_X 50 | else: 51 | self.n_params = 1 + self._n_warp_segments_per_X 52 | self.n_params *= len(kerns) 53 | self.N = len(kerns) 54 | def getidx(slc): 55 | assert slc.start + 1 == slc.stop and slc.step == None 56 | return slc.start 57 | self.column_idxs = np.asarray(map(getidx, slices)) 58 | self.s_idxs = TT.as_tensor_variable(self.column_idxs) 59 | 60 | def prodkey(self): 61 | # -- unique identifier of mergeable product sets 62 | return (type(self), 63 | self._conditional, 64 | self._n_warp_segments_per_X) 65 | 66 | def reduce_lenscale(self, params): 67 | new_l = np.maximum(self._lenscales_min, 68 | self._l_from_alpha(np.asarray(params[0:self.N]) - 1)) 69 | rval = list(params) 70 | rval[0:self.N] = self._alpha_from_l(new_l) 71 | return rval 72 | 73 | def unpack(self, params): 74 | alpha = params[0:self.N] 75 | cond_x = params[self.N:2 * self.N] 76 | cond_y = params[2 * self.N: 3 * self.N] 77 | return alpha, cond_x, cond_y 78 | 79 | def s_logprior(self, s_params, strength=10.0): 80 | # -- I don't know what distribution this would be 81 | # but I think it makes a nice shape 82 | s_alpha, s_cond_x, s_cond_y = self.unpack(s_params) 83 | n_alpha_min = self._alpha_from_l(self._lenscales_min) 84 | n_alpha_max = self._alpha_from_l(self._lenscales_max) 85 | #return strength * (alpha - alpha_min) ** 2 86 | log0 = -10000 87 | width = n_alpha_max - n_alpha_min 88 | #alpha_mean = 0.5 * (alpha_max + alpha_min) 89 | energy = strength * 0.5 * (s_alpha - n_alpha_max) ** 2 / width ** 2 90 | lenscale_logprior = TT.switch(s_alpha < n_alpha_min, 91 | log0, 92 | TT.switch(s_alpha < n_alpha_max, 93 | -energy, 94 | log0)).sum() 95 | if self._conditional: 96 | diff_x = s_cond_x 97 | diff_y = s_cond_y - 1 98 | rval = (lenscale_logprior 99 | + TT.dot(diff_x, diff_x) 100 | + TT.dot(diff_y, diff_y)) 101 | else: 102 | rval = lenscale_logprior 103 | assert rval.ndim == 0 104 | return rval 105 | 106 | def cond_x(self, s_x, s_params): 107 | #import theano 108 | #s_x_all = theano.printing.Print('x_all')(s_x_all) 109 | #s_x = s_x_all.T[self.s_idxs].T 110 | s_alpha, s_missing_x, s_missing_y = self.unpack(s_params) 111 | assert s_x.ndim == 2 112 | #s_x = TT.addbroadcast(s_x, 1) 113 | if self._conditional: 114 | filled_x = TT.switch(s_isnan(s_x), s_missing_x, s_x) 115 | filled_y = TT.switch(s_isnan(s_x), s_missing_y, 0) 116 | else: 117 | filled_x = s_x 118 | filled_y = None 119 | assert filled_x.ndim == 2 120 | return filled_x, filled_y 121 | 122 | 123 | def opt_logK(self, s_x, s_params): 124 | s_alpha, s_missing_x, s_missing_y = self.unpack(s_params) 125 | filled_x, filled_y = self.cond_x(s_x, s_params) 126 | 127 | lenscales = TT.sqrt(.5 * TT.exp(s_alpha)) 128 | 129 | dist_sq = euclidean_sq_distances(filled_x / lenscales, 130 | filled_x / lenscales) 131 | if filled_y is not None: 132 | dist_sq += euclidean_sq_distances(filled_y / lenscales, 133 | filled_y / lenscales) 134 | # 
Geometric 135 | logK = -0.5 * dist_sq 136 | 137 | params0 = list(self._alpha_from_l(self._lenscales_0)) 138 | if self._conditional: 139 | params0.extend([0.] * self.N) 140 | params0.extend([1.] * self.N) 141 | params0.extend([0.] * self._n_warp_segments_per_X) 142 | amin = self._alpha_from_l(self._lenscales_min) 143 | amax = self._alpha_from_l(self._lenscales_max) 144 | bounds = zip(amin, amax) 145 | if self._conditional: 146 | bounds.extend([(-5., 5.)] * self.N) 147 | bounds.extend([(1e-5, 5.)] * self.N) 148 | if self._n_warp_segments_per_X: 149 | #bounds.extend([(-.2, 2.)] * self._n_warp_segments) 150 | raise NotImplementedError() 151 | return logK, params0, bounds 152 | 153 | def predict_logK(self, s_x, s_z, s_params): 154 | filled_x_x, filled_x_y = self.cond_x(s_x, s_params) 155 | filled_z_x, filled_z_y = self.cond_x(s_z, s_params) 156 | 157 | s_alpha, s_missing_x, s_missing_y = self.unpack(s_params) 158 | lenscales = TT.sqrt(.5 * TT.exp(s_alpha)) 159 | 160 | dist_xx_sq = euclidean_sq_distances(filled_x_x / lenscales, 161 | filled_x_x / lenscales) 162 | dist_xz_sq = euclidean_sq_distances(filled_x_x / lenscales, 163 | filled_z_x / lenscales) 164 | if filled_x_y is not None: 165 | dist_xx_sq += euclidean_sq_distances(filled_x_y / lenscales, 166 | filled_x_y / lenscales) 167 | dist_xz_sq += euclidean_sq_distances(filled_x_y / lenscales, 168 | filled_z_y / lenscales) 169 | logK = -0.5 * dist_xx_sq 170 | logK_new = -0.5 * dist_xz_sq 171 | 172 | #x2 = self.cond_x(s_x, s_params) 173 | #z2 = self.cond_x(s_z, s_params) 174 | #logK = self._logK_of_dist( 175 | #euclidean_sq_distances(x2, x2), s_params, True) 176 | #logK_new = self._logK_of_dist( 177 | #euclidean_sq_distances(x2, z2), s_params, False) 178 | return logK, logK_new 179 | -------------------------------------------------------------------------------- /hp_gpsmbo/scrap.py: -------------------------------------------------------------------------------- 1 | if 1: 2 | keyfunc = lambda nc: nc[1]['node'].name 3 | hps_by_type = dict() 4 | idxs_by_type = dict() 5 | kerns = [] 6 | for distname, labels_hps in groupby(sorted(self.config.items(), 7 | key=keyfunc), 8 | keyfunc): 9 | label_list, hp_list = zip(*list(labels_hps)) 10 | hps_by_type[distname] = hp_list 11 | idxs_by_type[distname] = map(self.hps.index, label_list) 12 | foo = hps_by_type[distname] 13 | print distname, len(foo), idxs_by_type[distname] 14 | kerns.append(ph['kernel']) 15 | 16 | param_helper = ParamHelper(self.config) 17 | 18 | x_bounds = [(None, None)] * len(self.hps) 19 | ndim_offset = 0 20 | for hpname in self.hps: 21 | ph = self.param_helpers[hpname] = param_helper(hpname) 22 | 23 | import sys 24 | sys.exit() 25 | else: 26 | 27 | 28 | 29 | 30 | class ConvexMixtureKernel(object): 31 | """ 32 | 33 | Attributes: 34 | 35 | kernels - 36 | element_ranges - each kernel looks at these elements (default ALL) 37 | feature_names - 38 | raw_coefs - 39 | coefs - 40 | 41 | """ 42 | def __init__(self, **kwargs): 43 | self.__dict__.update(kwargs) 44 | 45 | def __str__(self): 46 | coefs = self.coefs_f() 47 | ks = [str(k) for k in self.kernels] 48 | return 'ConvexMixtureKernel{%s}'%(','.join(['%s*%s'%(str(c),s) for c,s in zip(coefs, ks)])) 49 | 50 | def summary(self): 51 | import StringIO 52 | ss = StringIO.StringIO() 53 | coefs = self.coefs_f() 54 | print >> ss, "ConvexMixtureKernel:" 55 | for c, k,fname in zip(coefs,self.kernels, self.feature_names): 56 | print >> ss, " %f * %s '%s'" %(c, str(k), fname) 57 | return ss.getvalue() 58 | 59 | @classmethod 60 | def alloc(cls, kernels, 
coefs=None, element_ranges=None, feature_names=None): 61 | if coefs is None: 62 | raw_coefs = theano.shared(np.zeros(len(kernels))) 63 | print "HAAACK" 64 | raw_coefs.get_value(borrow=True)[0] += 1 65 | else: 66 | raise NotImplementedError() 67 | coefs=TT.nnet.softmax(raw_coefs.dimshuffle('x',0))[0] 68 | coefs_f = theano.function([], coefs) 69 | return cls( 70 | kernels=kernels, 71 | coefs=coefs, 72 | coefs_f = coefs_f, #DEBUG 73 | raw_coefs = raw_coefs, 74 | element_ranges=element_ranges, 75 | feature_names = feature_names, 76 | ) 77 | 78 | def params(self): 79 | rval = [self.raw_coefs] 80 | for k in self.kernels: 81 | rval.extend(k.params()) 82 | return rval 83 | def param_bounds(self): 84 | rval = [(self.raw_coefs_min, self.raw_coefs_max)] 85 | for k in self.kernels: 86 | rval.extend(k.param_bounds()) 87 | return rval 88 | 89 | def K(self, x, y): 90 | # get the kernel matrix from each sub-kernel 91 | if self.element_ranges is None: 92 | Ks = [kernel.K(x,y) for kernel in self.kernels] 93 | else: 94 | assert len(self.element_ranges) == len(self.kernels) 95 | Ks = [kernel.K(x[:,er[0]:er[1]],y[:,er[0]:er[1]]) 96 | for (kernel,er) in zip(self.kernels, self.element_ranges)] 97 | # stack them up 98 | Kstack = TT.stack(*Ks) 99 | # multiply by coefs 100 | # and sum down to one kernel 101 | K = TT.sum(self.coefs.dimshuffle(0,'x','x') * Kstack, 102 | axis=0) 103 | return K 104 | 105 | 106 | 107 | class Exp(SqExp): 108 | """ 109 | K(x,y) = exp(- ||x-y|| / l) 110 | 111 | """ 112 | 113 | def __init__(self, **kwargs): 114 | self.__dict__.update(kwargs) 115 | if self.log_lenscale.ndim!=0: 116 | raise TypeError('log_lenscale must be scalar', self.log_lenscale) 117 | 118 | def __str__(self): 119 | l = np.exp(self.log_lenscale.value) 120 | return "ExponentialKernel{l=%s}"%str(l) 121 | 122 | @classmethod 123 | def alloc(cls, l=1, l_min=1e-4, l_max=1000): 124 | log_l = np.log(l) 125 | log_lenscale = theano.shared(log_l) 126 | if l_min is None: 127 | log_lenscale_min = None 128 | else: 129 | log_lenscale_min = np.log(2*(l_min**2)) 130 | if l_max is None: 131 | log_lenscale_max = None 132 | else: 133 | log_lenscale_max = np.log(2*(l_max**2)) 134 | return cls(log_lenscale=log_lenscale, 135 | log_lenscale_min=log_lenscale_min, 136 | log_lenscale_max=log_lenscale_max) 137 | 138 | def params(self): 139 | return [self.log_lenscale] 140 | 141 | def param_bounds(self): 142 | return [(self.log_lenscale_min, self.log_lenscale_max)] 143 | 144 | def K(self, x, y): 145 | l = TT.exp(self.log_lenscale) 146 | d = ((x**2).sum(axis=1).dimshuffle(0,'x') 147 | + (y**2).sum(axis=1) 148 | - 2 * TT.dot(x, y.T)) 149 | K = TT.exp(-TT.sqrt(d)/l) 150 | return K 151 | 152 | 153 | class CategoryKernel(object): 154 | """ 155 | K(x,y) is 1 if x==y else exp(-1/l) 156 | 157 | The idea is that it's like a SquaredExponentialKernel 158 | where every point is a distance of 1 from every other one, 159 | except itself. 
160 | 161 | Attributes: 162 | 163 | l - 164 | 165 | """ 166 | def __init__(self, **kwargs): 167 | self.__dict__.update(kwargs) 168 | if self.l.ndim!=0: 169 | raise TypeError('log_denom must be scalar', self.l) 170 | def lenscale(self, thing=None): 171 | if thing is None: 172 | thing = self.l 173 | return value(thing) 174 | def __str__(self): 175 | l = self.lenscale() 176 | (a,b), = self.param_bounds() 177 | return "CategoryKernel{l=%s,bounds=(%s,%s)}"%( 178 | str(l), str(a), str(b)) 179 | 180 | @classmethod 181 | def alloc(cls, l=1.0, l_min=1e-5, l_max=100.): 182 | l = theano.shared(l) 183 | return cls(l=l, 184 | l_min=l_min, 185 | l_max=l_max, 186 | ) 187 | 188 | def params(self): 189 | return [self.l] 190 | def param_bounds(self): 191 | return [(self.l_min, self.l_max)] 192 | 193 | def K(self, x, y): 194 | xx = x.reshape((x.shape[0],)) 195 | yy = y.reshape((y.shape[0],)) 196 | xx = xx.dimshuffle(0,'x') # drop cols because there should only be 1 197 | yy = yy.dimshuffle(0) # drop cols because there should only be 1 198 | K = TT.exp(-TT.neq(xx,yy)/self.l) 199 | return K 200 | 201 | 202 | 203 | class GPR_HMC_for_SGD_EI_OPT(object): 204 | def __init__(self): 205 | # ... 206 | 207 | self.s_EI_pts = theano.shared(np.zeros((2, 2))) 208 | self.s_EI_vals = theano.shared(np.zeros(2)) 209 | self.s_EI_step = theano.tensor.dscalar('EI_step') 210 | self.s_EI_thresh = theano.shared(0.0) 211 | 212 | s_mean_x, s_var_x, s_x = self.kernel.s_mean_var( 213 | self.s_X, 214 | self.s_y, 215 | self.s_var_y, 216 | self.s_emp_var, 217 | self.positions[0], 218 | self.s_var_min, 219 | x_new=self.s_EI_pts) 220 | s_logEI = s_normal_logEI( 221 | - self.s_EI_thresh, 222 | - (s_mean_x + self.s_emp_mean), 223 | s_var_x, 224 | quad_approx=True) 225 | print 'compiling update_EI_pts fn' 226 | self.update_EI_pts = theano.function( 227 | [self.s_EI_step], 228 | [], 229 | updates=[ 230 | (self.s_EI_pts, TT.clip( 231 | self.s_EI_pts + self.s_EI_step * TT.grad(s_logEI.sum(), 232 | self.s_EI_pts), 233 | np.asarray(bounds)[:, 0], 234 | np.asarray(bounds)[:, 1])), 235 | (self.s_EI_vals, 0.95 * self.s_EI_vals + .05 * s_logEI), 236 | ], 237 | allow_input_downcast=True) 238 | 239 | def fit_and_optimize_EI(self, X, y, var_y, debug, ion, 240 | EI_pts): 241 | print 'setting up' 242 | self.s_emp_mean.set_value(np.mean(y)) 243 | self.s_emp_var.set_value(max(np.var(y), np.min(var_y))) 244 | self.s_X.set_value(X) 245 | self.s_y.set_value(y - self.s_emp_mean.get_value()) 246 | self.s_var_y.set_value(var_y + np.zeros(len(y))) 247 | self.s_EI_pts.set_value(EI_pts) 248 | self.s_EI_vals.set_value(np.zeros(len(EI_pts))) 249 | self.s_EI_thresh.set_value(np.min(y)) 250 | 251 | samples = [] 252 | nlls = [] 253 | costs = [] 254 | t0 = time.time() 255 | hmc_duration = 10.0 # seconds 256 | print 'running the sampler' 257 | while time.time() < (t0 + hmc_duration): 258 | try: 259 | tt = time.time() - t0 260 | pos = self.sampler.draw() 261 | self.update_EI_pts(.003 * min(1, 1. 
/ (.1 + tt))) 262 | samples.append(pos.flatten()) 263 | if debug: 264 | nll_ii, cost_ii = self.nll_fn(pos.flatten()) 265 | #print s_EI_vals.get_value() 266 | print 'best_EI', self.s_EI_vals.get_value().min() 267 | print 'current position', pos.flatten(), 268 | print 'accept rate', self.sampler.avg_acceptance_rate.get_value(), 269 | print 'nll', nll_ii, 'cost', cost_ii 270 | nlls.append(nll_ii) 271 | costs.append(cost_ii) 272 | except ValueError, e: 273 | # -- XXX should not happen 274 | print 'ERROR: HMC crashed after %i draws' % len(samples) 275 | raise 276 | break 277 | 278 | except np.linalg.LinAlgError, e: 279 | print 'ERROR: HMC singular matrix after %i draws' % len(samples) 280 | break 281 | samples = np.asarray(samples) 282 | print 'hmc drew', len(samples) 283 | step = max(1, len(samples) // 10) 284 | keep = samples[::step] 285 | if keep.size == 0: 286 | raise NotImplementedError() 287 | 288 | if debug: 289 | import matplotlib.pyplot as plt 290 | if ion: 291 | plt.figure(2) 292 | if self.kernel.n_params == 1: 293 | plt.subplot(211) 294 | plt.cla() 295 | plt.hist(np.asarray(samples).flatten()) 296 | plt.title('nlls observed during sampling') 297 | plt.subplot(212) 298 | plt.cla() 299 | plt.scatter(samples, nlls, label='nll', c='b') 300 | plt.scatter(samples, costs, label='cost', c='g') 301 | plt.title('nlls vs. alpha') 302 | plt.legend() 303 | if self.kernel.n_params == 2: 304 | plt.cla() 305 | plt.scatter(samples[:, 0], samples[:, 1]) 306 | if ion: 307 | plt.draw() 308 | else: 309 | plt.show() 310 | 311 | class Res(object): 312 | pass 313 | 314 | rval = Res() 315 | best_idx = np.argmax(self.s_EI_vals.get_value()) 316 | rval.x = self.s_EI_pts.get_value()[best_idx] 317 | rval.fun = self.s_EI_vals.get_value()[best_idx] 318 | 319 | self._params_list = keep 320 | self._params_weights = np.ones(len(keep)) / len(keep) 321 | return rval 322 | 323 | class LengthscaleBounds(object): 324 | def __init__(self, config): 325 | self.config = config 326 | 327 | def LU0(self, name): 328 | node = self.config[name]['node'] 329 | return getattr(self, node.name)(node) 330 | 331 | def randint(self, node): 332 | return 0.001, 2.0, 1.0 333 | 334 | def categorical(self, node): 335 | return 0.001, 2.0, 1.0 336 | 337 | def uniform(self, node): 338 | low = float(node.arg['low'].obj) 339 | high = float(node.arg['high'].obj) 340 | thetaL = (high - low) / 20.0 341 | thetaU = (high - low) * 2. 342 | return thetaL, thetaU, (high - low) / 2 343 | 344 | def quniform(self, node): 345 | # -- quantization is irrelevant 346 | return self.uniform(node) 347 | 348 | def loguniform(self, node): 349 | # -- log-scaling has been handled by feature code 350 | return self.uniform(node) 351 | 352 | def qloguniform(self, node): 353 | # -- log-scaling has been handled by feature code 354 | # quantization is irrelevant 355 | return self.uniform(node) 356 | 357 | def normal(self, node): 358 | sigma = float(node.arg['sigma'].obj) 359 | thetaL = sigma / 20.0 360 | thetaU = sigma * 2. 
361 | return thetaL, thetaU, sigma 362 | 363 | def qnormal(self, node): 364 | # -- quantization is irrelevant 365 | return self.normal(node) 366 | 367 | def lognormal(self, node): 368 | # -- log-scaling has been handled by feature code 369 | return self.normal(node) 370 | 371 | def qlognormal(self, node): 372 | # -- log-scaling has been handled by feature code 373 | # quantization is irrelevant 374 | return self.normal(node) 375 | 376 | 377 | import numpy as np 378 | 379 | import theano 380 | import theano.tensor as TT 381 | from hyperopt import rand 382 | 383 | from . import gpr_math 384 | from .hpsuggest import SuggestBest, DomainGP 385 | 386 | 387 | class DomainGP_LUCB(DomainGP): 388 | _optimism = 1.0 389 | _sigmoid_bias = -0.0 390 | 391 | def init_cost_fns(self): 392 | try: 393 | self._cost_fn 394 | except AttributeError: 395 | s_optimism = TT.dscalar('optimism') 396 | s_ubound = TT.dscalar('ubound') 397 | s_lbound = TT.dscalar('lbound') 398 | 399 | # s_mean_x means "symbolic mean of x" 400 | s_mean_x, s_var_x, s_x, K_new = self.gpr.kernel.s_mean_var( 401 | self.gpr.s_X, 402 | self.gpr.s_y, 403 | self.gpr.s_var_y, 404 | self.gpr.s_emp_var, 405 | self.gpr.s_params, 406 | self.gpr.s_var_min, 407 | return_K_new=True) 408 | 409 | corrected_mean = s_mean_x + self.gpr.s_emp_mean 410 | # -- good vars are for maximizing, 411 | # in keeping with EI being about improving *over* thresh 412 | good_max = -s_lbound 413 | good_best_seen = -s_ubound 414 | good_mean = -corrected_mean 415 | good_var = s_var_x 416 | 417 | #scalar = 1.0 + s_optimism 418 | 419 | #z = (corrected_mean - s_lbound) / TT.sqrt(s_var_x) 420 | #acq = gpr_math.s_normal_EBI( 421 | # 0, 422 | # -(s_lbound - corrected_mean), 423 | # 0, 424 | # s_var_x) 425 | #s_cost = -(tradeoff * acq) - (1 - tradeoff) * corrected_mean * TT.erf(-z) 426 | 427 | 428 | if 1: # -- use LUCB 429 | #good_var = good_var * s_optimism ** 2 430 | lost_mass = gpr_math.s_normal_cdf(-good_max, 431 | -good_mean, 432 | good_var) 433 | gap = good_max - good_best_seen 434 | drop = s_optimism * gap 435 | #coef = 1. 
/ s_optimism 436 | EBI_ceil = TT.minimum( 437 | good_mean, 438 | good_max - drop) 439 | #coef * good_min + (1 - coef) * good_max) 440 | #max_ceil = good_max - (s_optimism - 1) * gap 441 | acq = ( 442 | EBI_ceil 443 | + ( 444 | gpr_math.s_normal_EBI( 445 | EBI_ceil, good_max, 446 | EBI_ceil, good_var) / (1 - lost_mass))) 447 | #+ (good_max - good_mean) * )) 448 | 449 | elif 1: # -- use bounded EI type thing 450 | ebi_term = gpr_math.s_normal_EBI( 451 | good_min, good_max, 452 | good_mean, good_var) 453 | mass_above_good_max = gpr_math.s_normal_cdf( 454 | -good_max, -good_mean, good_var) 455 | acq = ebi_term + (good_max - good_min) * mass_above_good_max 456 | 457 | s_cost = -acq 458 | try: 459 | s_gx = TT.grad(s_cost.sum(), s_x) 460 | self._cost_deriv = theano.function( 461 | [s_x, self.gpr.s_params, s_optimism, s_ubound, s_lbound], 462 | [s_cost, s_gx], 463 | on_unused_input='warn') 464 | except theano.gradient.DisconnectedInputError: 465 | self._cost_deriv = None 466 | self._cost_fn = theano.function( 467 | [s_x, self.gpr.s_params, s_optimism, s_ubound, s_lbound], 468 | s_cost, 469 | on_unused_input='warn') 470 | self._K_new = theano.function( 471 | [s_x, self.gpr.s_params], K_new) 472 | 473 | def crit(self, X): 474 | self.init_cost_fns() 475 | if len(self.gpr._params_list) > 1: 476 | raise NotImplementedError() 477 | pp, = self.gpr._params_list 478 | return self._cost_fn(X, pp, self._optimism, self._ubound, self._lbound) 479 | 480 | def crit_deriv(self, X): 481 | if self._cost_deriv is None: 482 | raise NotImplementedError() 483 | self.init_cost_fns() 484 | if len(self.gpr._params_list) > 1: 485 | raise NotImplementedError() 486 | pp, = self.gpr._params_list 487 | return self._cost_deriv(X, pp, self._optimism, self._ubound, self._lbound) 488 | 489 | def optimize_over_X(self, n_buckshots, n_finetunes, rng): 490 | while True: 491 | rval_raw = DomainGP.optimize_over_X(self, 492 | n_buckshots, 493 | n_finetunes, 494 | rng, 495 | ret_raw=True) 496 | Ks = self._K_new(np.atleast_2d(rval_raw), self.gpr._params_list[0]) 497 | # XXX: todo, if other non-redundant local optima were discoverd by 498 | # the fine-tuning process then it might better to take them, 499 | # before distorting the utility landscape with this "optimism" 500 | # multiplier. I wonder if one is more "right" to do than the other 501 | if (Ks.max() > (1 - 1e-5)): 502 | if self._optimism < 1e8: 503 | self._optimism *= 2 504 | print 'LUCB raising optimism to', self._optimism 505 | else: 506 | print "LUCB error finding new point!" 
507 | else: 508 | break 509 | best_pt = self.best_pt_from_featurevec(rval_raw) 510 | return best_pt 511 | 512 | 513 | _suggest_domain_cache = {} 514 | def suggest(new_ids, domain, trials, seed, 515 | warmup_cutoff=1, 516 | n_buckshots=10000, 517 | n_finetunes=50, 518 | best_possible=-np.inf, 519 | #best_headroom=1.0, 520 | stop_at=None, 521 | plot_contours=None, 522 | ): 523 | """ 524 | Parameters 525 | ---------- 526 | 527 | """ 528 | if stop_at is not None and stop_at < best_possible: 529 | raise ValueError( 530 | ('If stop_at is specified' 531 | ', it (%f) must be greater than best_possible (%f)') % ( 532 | stop_at, best_possible)) 533 | 534 | if len(trials.trials) <= warmup_cutoff: 535 | return rand.suggest(new_ids, domain, trials, seed) 536 | 537 | try: 538 | dgp = _suggest_domain_cache[domain] 539 | except KeyError: 540 | dgp = _suggest_domain_cache[domain] = DomainGP_LUCB(domain) 541 | 542 | if stop_at is not None and min(trials.losses()) < stop_at: 543 | return [] 544 | 545 | X, y, var_y = dgp._X_y_var_y(trials) 546 | dgp.fit_gpr(X, y, var_y) 547 | dgp._optimism = 1.0 #0.5 * dgp._optimism 548 | dgp._ubound = np.min(y) 549 | dgp._lbound = best_possible 550 | 551 | #yy = y + np.sqrt(np.maximum(var_y, dgp.gpr.s_var_min.eval())) 552 | #dgp._ubound = np.min(yy) 553 | #max(opt_lbound, - best_headroom) 554 | #print 'LUCB interval:', dgp._lbound, dgp._ubound 555 | 556 | print 'LUCB: Best after %i trials: %f' % ( len(y), np.min(y)) 557 | #dgp.gpr._params_list[0][:] = 0 558 | rng = np.random.RandomState(seed) 559 | best_pt = dgp.optimize_over_X( 560 | n_buckshots=n_buckshots, 561 | n_finetunes=n_finetunes, 562 | rng=rng, 563 | ) 564 | if plot_contours: 565 | plot_contours(dgp, 2, dgp._lbound, best_pt) 566 | new_id, = new_ids 567 | #print 'LUCB: Best pt', best_pt 568 | return SuggestBest(domain, trials, seed, best_pt)(new_id) 569 | 570 | -------------------------------------------------------------------------------- /hp_gpsmbo/suggest_algos.py: -------------------------------------------------------------------------------- 1 | 2 | from .hpsuggest_ei import suggest as ei 3 | from .hpsuggest_ucb import suggest as ucb 4 | #from .hpsuggest_lucb import suggest as lucb 5 | -------------------------------------------------------------------------------- /hp_gpsmbo/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyperopt/hyperopt-gpsmbo/8009f82a18620b33faecca2382f973bc214bd88c/hp_gpsmbo/test/__init__.py -------------------------------------------------------------------------------- /hp_gpsmbo/test/test_branin.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import numpy as np 3 | import hyperopt 4 | from hyperopt.tests.test_domains import branin 5 | import hp_gpsmbo.hpsuggest 6 | 7 | def test_branin(suggest=hp_gpsmbo.hpsuggest.suggest, seed=1, iters=10): 8 | import matplotlib.pyplot as plt 9 | plt.ion() 10 | mins = [] 11 | all_ys = [] 12 | for ii in range(int(seed), int(seed) + int(iters)): 13 | print 'SEED', ii 14 | space = branin() 15 | trials = hyperopt.Trials() 16 | hyperopt.fmin( 17 | fn=lambda x: x, 18 | space=space.expr, 19 | trials=trials, 20 | algo=partial(suggest, stop_at=0.398), 21 | rstate=np.random.RandomState(ii), 22 | max_evals=50) 23 | plt.subplot(2, 1, 1) 24 | plt.cla() 25 | ys = trials.losses() 26 | all_ys.append(ys) 27 | for ys_jj in all_ys: 28 | plt.plot(ys_jj) 29 | plt.plot(trials.losses()) 30 | plt.subplot(2, 1, 2) 31 | 
plt.cla() 32 | for ys_jj in all_ys: 33 | plt.plot(ys_jj) 34 | plt.ylim(0, 1) 35 | plt.axhline(np.min(ys)) 36 | plt.annotate('min=%f' % np.min(ys), xy=(1, np.min(ys))) 37 | plt.draw() 38 | mins.append(min(ys)) 39 | print 'MINS', mins 40 | assert np.max(mins) < 0.398 41 | -------------------------------------------------------------------------------- /hp_gpsmbo/test/test_gpr.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from hp_gpsmbo import GPR_ML2, kernels 3 | 4 | #class Test1(unittest.TestCase): 5 | 6 | def test_prior_mean(GPR=GPR_ML2): 7 | # Test that the prior mean and prior variance are respected 8 | # in a simple case where there is just a single data point at 0. 9 | for prior_mean in (-5, 0, 5): 10 | for prior_var in (.1, 1): 11 | gpr = GPR(kernels.SqExp(1.0, 1e-4, 10, conditional=False), 12 | maxiter=1, 13 | prior_var=prior_var, 14 | prior_mean=prior_mean) 15 | gpr.fit([[0]], [1]) 16 | m, v = gpr.predict([[-10], [0], [10]], eval_MSE=True) 17 | assert np.allclose(m[0], prior_mean) 18 | assert np.allclose(m[1], 1) 19 | assert np.allclose(m[2], prior_mean) 20 | assert np.allclose(v[0], prior_var) 21 | assert np.allclose(v[1], 0) 22 | assert np.allclose(v[2], prior_var) 23 | 24 | 25 | def test_data_pts_respected(GPR=GPR_ML2): 26 | X = np.asarray([[-1], [0], [1.5]]) 27 | y = np.asarray([-4, 1, 0.5]) 28 | for prior_mean in (-5, 0, 5): 29 | for prior_var in (.1, 1): 30 | gpr = GPR(kernels.SqExp(1.0, 1e-4, 10, conditional=False), 31 | maxiter=1, 32 | prior_var=prior_var, 33 | prior_mean=prior_mean) 34 | gpr.fit(X, y) 35 | m, v = gpr.predict(X, eval_MSE=True) 36 | assert np.all(v < 1e-7) 37 | assert np.allclose(m, y) 38 | 39 | 40 | # -- flake8 eof 41 | -------------------------------------------------------------------------------- /hp_gpsmbo/test/test_gpr_math.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from hp_gpsmbo import gpr_math 3 | import theano 4 | import scipy.stats 5 | 6 | 7 | def test_normal_pdf(): 8 | rng = np.random.RandomState(123) 9 | norm = scipy.stats.norm 10 | 11 | N = 50 12 | x = rng.randn(N) 13 | mean = rng.randn(N) 14 | var = rng.randn(N) ** 2 15 | 16 | s_x, s_m, s_v = theano.tensor.dvectors('xmv') 17 | 18 | fn = theano.function([s_x, s_m, s_v], 19 | gpr_math.s_normal_pdf(s_x, s_m, s_v)) 20 | 21 | 22 | assert np.allclose(norm.pdf(x, mean, np.sqrt(var)), 23 | fn(x, mean, var)) 24 | 25 | 26 | def test_normal_logpdf(): 27 | rng = np.random.RandomState(123) 28 | norm = scipy.stats.norm 29 | 30 | N = 50 31 | x = rng.randn(N) * 10 - 50 32 | mean = rng.randn(N) 33 | var = rng.randn(N) ** 2 34 | 35 | s_x, s_m, s_v = theano.tensor.dvectors('xmv') 36 | 37 | logfn = theano.function([s_x, s_m, s_v], 38 | gpr_math.s_normal_logpdf(s_x, s_m, s_v)) 39 | 40 | assert np.allclose(norm.logpdf(x, mean, np.sqrt(var)), 41 | logfn(x, mean, var)) 42 | 43 | 44 | def test_normal_cdf(): 45 | rng = np.random.RandomState(123) 46 | norm = scipy.stats.norm 47 | 48 | N = 50 49 | x = rng.randn(N) 50 | mean = rng.randn(N) 51 | var = rng.randn(N) ** 2 52 | 53 | #x = np.sort(x) 54 | #mean = np.zeros(N) 55 | #var = np.ones(N) 56 | 57 | s_x, s_m, s_v = theano.tensor.dvectors('xmv') 58 | 59 | fn = theano.function([s_x, s_m, s_v], 60 | gpr_math.s_normal_cdf(s_x, s_m, s_v)) 61 | myval = fn(x, mean, var) 62 | spval = norm.cdf(x, mean, np.sqrt(var)) 63 | for xi, myv, spv in zip(x, myval, spval): 64 | print xi, 'my', myv, 'sp', spv, 'diff', (myv - spv) 65 | 66 | assert 
np.allclose(norm.cdf(x, mean, np.sqrt(var)), 67 | fn(x, mean, var)) 68 | 69 | def test_normal_logcdf(): 70 | rng = np.random.RandomState(123) 71 | norm = scipy.stats.norm 72 | 73 | N = 50 74 | x = rng.randn(N) * 200 75 | mean = rng.randn(N) 76 | var = rng.randn(N) ** 2 77 | 78 | #mean = np.zeros(N) 79 | #var = np.ones(N) 80 | #x = np.sort(x) 81 | 82 | s_x, s_m, s_v = theano.tensor.dvectors('xmv') 83 | 84 | lcdf = gpr_math.s_normal_logcdf(s_x, s_m, s_v) 85 | 86 | fn = theano.function([s_x, s_m, s_v], lcdf) 87 | 88 | myval= fn(x, mean, var) 89 | spval = norm.logcdf(x, mean, np.sqrt(var)) 90 | for xi, myv, spv in zip(x, myval, spval): 91 | print xi, 'my', myv, 'sp', spv, 'diff', (myv - spv) 92 | assert np.allclose(norm.logcdf(x, mean, np.sqrt(var)), 93 | myval) 94 | 95 | 96 | def test_normal_logEI(): 97 | #rng = np.random.RandomState(123) 98 | 99 | N = 2000 100 | thresh = np.linspace(-10, 50, N) 101 | #N = 100 102 | #thresh = np.linspace(37, 38, N) 103 | mean = thresh * 0 104 | var = thresh * 0 + 1 105 | 106 | s_t, s_m, s_v = theano.tensor.dvectors('tmv') 107 | 108 | fn = theano.function([s_t, s_m, s_v], 109 | gpr_math.s_normal_logEI(s_t, s_m, s_v)) 110 | 111 | if 0: 112 | #print zip(thresh, fn(thresh, mean, var)) 113 | #print 114 | a = theano.tensor.dvector() 115 | y = s_t ** 2 * a[2] + s_t * a[1] + a[0] 116 | cost = ((y - gpr_math.s_normal_logEI(s_t, s_m, s_v)) ** 2).sum() 117 | da = theano.grad(cost, a) 118 | foo = theano.function([a, s_t, s_m, s_v], [cost, da]) 119 | res = scipy.optimize.minimize(foo, [0, -1, -1], jac=True, 120 | args=(thresh, mean, var), 121 | method='L-BFGS-B') 122 | print res.x 123 | 124 | from hyperopt.criteria import logEI_gaussian 125 | if 0: 126 | import matplotlib.pyplot as plt 127 | y = fn(thresh, mean, var) 128 | z = logEI_gaussian(mean, var, thresh) 129 | plt.plot(thresh, y) 130 | plt.plot(thresh, z) 131 | plt.show() 132 | 133 | # -- the gpr_math logEI uses a quadratic approximation for very 134 | # hopeless points, which gives the right derivative, but the 135 | # slightly wrong value 136 | assert np.allclose(logEI_gaussian(mean, var, thresh), 137 | fn(thresh, mean, var), 138 | atol=1e-3, rtol=1e-4) 139 | 140 | if 0: 141 | d_t = theano.grad(gpr_math.s_normal_logEI(s_t, s_m, s_v).sum(), s_t) 142 | d_fn = theano.function([s_t, s_m, s_v], d_t) 143 | 144 | import matplotlib.pyplot as plt 145 | plt.plot(thresh, d_fn(thresh, mean, var)) 146 | plt.show() 147 | 148 | 149 | def test_logEBI(): 150 | 151 | def EBI_from_sample(sample, l, u): 152 | sample = sample - l 153 | sample[sample < 0] = 0 154 | sample[sample > (u - l)] = 0 155 | return sample.mean() 156 | 157 | def normal_EBI_numeric(l, u, m, sigma, N, rng): 158 | return EBI_from_sample(rng.randn(N) * sigma + m, l, u) 159 | 160 | def normal_EBI_analytic(l, u, m, sigma): 161 | from scipy.stats import norm 162 | from hyperopt.criteria import EI_gaussian 163 | EI_l = EI_gaussian(m, sigma ** 2, l) 164 | EI_u = EI_gaussian(m, sigma ** 2, u) 165 | term = (l - u) * norm.cdf((m - u) / sigma) 166 | return EI_l - EI_u + term 167 | 168 | s_l, s_u, s_m, s_sigma = theano.tensor.dscalars('lums') 169 | s_EBI = gpr_math.s_normal_EBI(s_l, s_u, s_m, s_sigma ** 2) 170 | normal_EBI_theano = theano.function([s_l, s_u, s_m, s_sigma], s_EBI) 171 | 172 | 173 | def assert_match(l, u, m, sigma, N=100000, seed=123): 174 | l, u, m, sigma = map(float, (l, u, m, sigma)) 175 | num = normal_EBI_numeric(l, u, m, sigma, N, np.random.RandomState(seed)) 176 | ana = normal_EBI_analytic(l, u, m, sigma) 177 | thn = normal_EBI_theano(l, u, m, sigma) 178 
| if not np.allclose(num, ana, atol=0.01, rtol=.01): 179 | print 'test_EBI mismatch', l, u, m, sigma, '->', num, ana 180 | assert 0 181 | if not np.allclose(thn, ana, atol=0.0001, rtol=.0001): 182 | print 'test_EBI theano mismatch', l, u, m, sigma, '->', thn, ana 183 | assert 0 184 | 185 | assert_match(0, 100, 0, 1) 186 | assert_match(0, 0.2, 0, 1) 187 | assert_match(0, 1.2, 0, 1) 188 | assert_match(0, 100, 0.5, 1.5) 189 | assert_match(0, 0.2, 0.5, 1.5) 190 | assert_match(0, 1.2, 0.5, 1.5) 191 | 192 | 193 | # -- eof flake8 194 | -------------------------------------------------------------------------------- /hp_gpsmbo/test/test_har6.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import numpy as np 3 | 4 | import hyperopt 5 | from hyperopt import hp 6 | from hypertree import har6 7 | 8 | import hp_gpsmbo.hpsuggest 9 | 10 | def test_har6(suggest=hp_gpsmbo.hpsuggest.suggest, seed=1, iters=10): 11 | # -- see shovel/hps.py for this test with debugging scaffolding 12 | # run it by typing e.g. 13 | # 14 | # shovel hps.run_har6 --seed=9 15 | # 16 | # That should do a run that fails by only getting to -3.2 17 | mins = [] 18 | for ii in range(int(seed), int(seed) + int(iters)): 19 | print 'SEED', ii 20 | space = { 21 | 'a': hp.uniform('a', 0, 1), 22 | 'b': hp.uniform('b', 0, 1), 23 | 'c': hp.uniform('c', 0, 1), 24 | 'x': hp.uniform('x', 0, 1), 25 | 'y': hp.uniform('y', 0, 1), 26 | 'z': hp.uniform('z', 0, 1), 27 | } 28 | trials = hyperopt.Trials() 29 | hyperopt.fmin( 30 | fn=har6.har6, 31 | space=space, 32 | trials=trials, 33 | algo=partial(suggest, stop_at=-3.32), 34 | rstate=np.random.RandomState(ii), 35 | max_evals=100) 36 | mins.append(min(trials.losses())) 37 | 38 | assert np.sum(np.asarray(mins) > -3.32) < 3 39 | 40 | # XXX ideally this sum should be 0, but our optimizer 41 | # isn't that good :( 42 | 43 | -------------------------------------------------------------------------------- /hp_gpsmbo/test/test_hpsuggest.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import unittest 3 | import numpy as np 4 | from hyperopt import rand 5 | from hyperopt import Trials, fmin 6 | 7 | from hyperopt.tests.test_domains import CasePerDomain 8 | from hp_gpsmbo import suggest_algos 9 | 10 | def passthrough(x): 11 | return x 12 | 13 | class TestSmoke(unittest.TestCase, CasePerDomain): 14 | def work(self): 15 | fmin( 16 | fn=passthrough, 17 | space=self.bandit.expr, 18 | algo=partial(suggest_algos.ei, 19 | warmup_cutoff=3), 20 | max_evals=10) 21 | 22 | 23 | class TestOpt(unittest.TestCase, CasePerDomain): 24 | # -- these thresholds are pretty low 25 | # but they are set so that random does not pass them 26 | # (at least, probably not) 27 | thresholds = dict( 28 | quadratic1=1e-5, 29 | q1_lognormal=0.0002, 30 | distractor=-2.0, 31 | gauss_wave=-2.8, 32 | gauss_wave2=-2.20, 33 | n_arms=-3.0, 34 | many_dists=-1., 35 | branin=0.5, 36 | ) 37 | 38 | LEN = dict( 39 | # -- running a long way out tests overflow/underflow 40 | # to some extent 41 | twoarms=15, 42 | gausswave=50, 43 | quadratic1=1000, 44 | many_dists=200, 45 | distractor=35, 46 | #q1_lognormal=100, 47 | branin=200, 48 | ) 49 | 50 | def setUp(self): 51 | self.olderr = np.seterr('raise') 52 | np.seterr(under='ignore') 53 | 54 | def tearDown(self, *args): 55 | np.seterr(**self.olderr) 56 | 57 | def work(self): 58 | np.random.seed(1234) 59 | bandit = self.bandit 60 | LEN = self.LEN.get(bandit.name, 100) 61 | thresh = 
self.thresholds[bandit.name] 62 | 63 | print 'STARTING TEST', bandit.name 64 | rtrials = Trials() 65 | fmin(fn=passthrough, 66 | space=self.bandit.expr, 67 | trials=rtrials, 68 | algo=rand.suggest, 69 | max_evals=LEN, 70 | rstate=np.random) 71 | print 'RANDOM BEST 6:', list(sorted(rtrials.losses()))[:6] 72 | 73 | if bandit.name != 'n_arms': 74 | # -- assert that our threshold is meaningful 75 | assert min(rtrials.losses()) > thresh 76 | 77 | assert bandit.name is not None 78 | algo = partial( 79 | suggest_algos.ei, 80 | stop_at=self.thresholds[bandit.name]) 81 | 82 | trials = Trials() 83 | fmin(fn=passthrough, 84 | space=self.bandit.expr, 85 | trials=trials, 86 | algo=algo, 87 | max_evals=LEN, 88 | rstate=np.random) 89 | assert len(trials) <= LEN 90 | 91 | 92 | if 0: 93 | plt.subplot(2, 2, 1) 94 | plt.scatter(range(LEN), trials.losses()) 95 | plt.title('TPE losses') 96 | plt.subplot(2, 2, 2) 97 | plt.scatter(range(LEN), ([s['x'] for s in trials.specs])) 98 | plt.title('TPE x') 99 | plt.subplot(2, 2, 3) 100 | plt.title('RND losses') 101 | plt.scatter(range(LEN), rtrials.losses()) 102 | plt.subplot(2, 2, 4) 103 | plt.title('RND x') 104 | plt.scatter(range(LEN), ([s['x'] for s in rtrials.specs])) 105 | plt.show() 106 | if 0: 107 | plt.hist( 108 | [t['x'] for t in self.experiment.trials], 109 | bins=20) 110 | 111 | 112 | #print trials.losses() 113 | print 'SUGGEST BEST 6:', list(sorted(trials.losses()))[:6] 114 | #logx = np.log([s['x'] for s in trials.specs]) 115 | #print 'TPE MEAN', np.mean(logx) 116 | #print 'TPE STD ', np.std(logx) 117 | print 'Thresh', thresh 118 | assert min(trials.losses()) < thresh 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /hp_gpsmbo/test/test_kernels.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from hp_gpsmbo import GPR, SqExp, Product 3 | 4 | 5 | def test_lenscale_wider(): 6 | # Smoke test that changing lenscale changes fit 7 | pass 8 | 9 | 10 | def test_product_smoke(): 11 | X = np.random.randn(10, 2) 12 | y = np.random.randn(10) 13 | model = GPR( 14 | Product( 15 | [SqExp(), SqExp()], 16 | [slice(0, 1), slice(1, 2)]), 17 | ) 18 | model.fit(X, y) 19 | 20 | 21 | -------------------------------------------------------------------------------- /hp_gpsmbo/test/test_normal_log_EI.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from theano.gradient import verify_grad 3 | import theano.tensor 4 | from hp_gpsmbo.op_Kcond import normal_logEI_diff_sigma 5 | from hp_gpsmbo.op_Kcond import normal_logEI_diff_sigma_elemwise 6 | from hyperopt.criteria import logEI_gaussian 7 | 8 | def test_normal_logEI(): 9 | rng = np.random.RandomState(123) 10 | 11 | N = 2000 12 | thresh = np.linspace(-50, 500, N) 13 | #N = 100 14 | #thresh = np.linspace(37, 38, N) 15 | mean = thresh * 0 16 | var = 1e-1 + rng.rand(N) 17 | sigma = np.sqrt(var) 18 | 19 | s_t, s_m, s_v = theano.tensor.dvectors('tmv') 20 | 21 | fn = theano.function([s_t, s_m, s_v], 22 | normal_logEI_diff_sigma(s_t - s_m, 23 | theano.tensor.sqrt(s_v))) 24 | 25 | my = fn(thresh, mean, var) 26 | ref = logEI_gaussian(mean, var, thresh) 27 | for xi, myv, spv in zip(thresh, my, ref): 28 | print xi, 'my', myv, 'sp', spv, 'diff', (myv - spv) 29 | 30 | assert np.any(thresh / sigma > 34) 31 | assert np.all(np.isfinite(my)) 32 | assert np.allclose(my[thresh/sigma < 34], ref[thresh/sigma < 34]) 33 | assert np.allclose(my, ref, rtol=.1) 34 | 35 | 36 | def 
explore_grad(): 37 | N = 15 38 | ubound = 6e2 39 | diff = np.ones(N) * 100 40 | rng = np.random.RandomState(123) 41 | #diff = np.linspace(0, ubound, N).astype('float64') 42 | #var = np.random.rand(N) * .1 + 1 #1e-8 + 43 | #var = np.ones(N) * .01 44 | var = np.exp(rng.randn(N) * 10) ** 2 45 | var = np.sort(var) 46 | 47 | s_d, s_v = theano.tensor.dvectors('dv') 48 | s_y = normal_logEI_diff_sigma(s_d, theano.tensor.sqrt(s_v)) 49 | s_gd, s_gv = theano.tensor.grad(s_y.sum(), [s_d, s_v]) 50 | 51 | fn = theano.function([s_d, s_v], [s_y, s_gd, s_gv]) 52 | 53 | eps = ubound / 1e8 # 1e1 # 1e-4 54 | y, gd, gv = fn(diff, var) 55 | y_eps, _, _ = fn(diff + eps, var) 56 | y_eps2, _, _ = fn(diff, var + eps) 57 | for di, yi, yi_eps, yi2, gdi, gvi in zip(diff, y, y_eps, y_eps2, gd, gv): 58 | print 'di %.6f\tyi:%.6f\tgi:%.6f\tref:%.6f\tgv:%s\tref:%s' % ( 59 | di, yi, gdi, (yi_eps - yi) / eps, gvi, (yi2 - yi) / eps 60 | ) 61 | 62 | def test_grad_arg0(): 63 | N = 50 64 | def f_arg01(x): 65 | return normal_logEI_diff_sigma(x, np.ones(1)) 66 | def f_arg0(x): 67 | return normal_logEI_diff_sigma(x, np.ones(N)) 68 | 69 | rng = np.random.RandomState(123) 70 | diffvec = (rng.rand(N) - .5) * 200 71 | 72 | verify_grad(f_arg01, [np.asarray([-50.])], rng=rng) 73 | verify_grad(f_arg01, [np.asarray([50.])], rng=rng) 74 | verify_grad(f_arg0, [diffvec], rng=rng, rel_tol=1e-3) 75 | 76 | def test_grad_arg1(): 77 | N = 50 78 | def f_arg1(x): 79 | return normal_logEI_diff_sigma(np.ones(N) * 100, 80 | x) 81 | rng = np.random.RandomState(123) 82 | #sigmavec = np.exp(np.linspace(N) * 10) 83 | sigmavec = np.linspace(.1, 10, N) 84 | 85 | verify_grad( f_arg1, [sigmavec], rng=rng, rel_tol=1e-3) 86 | 87 | def test_normal_logEI_elemwise(): 88 | rng = np.random.RandomState(123) 89 | 90 | N = 2000 91 | thresh = np.linspace(-50, 500, N) 92 | #N = 100 93 | #thresh = np.linspace(37, 38, N) 94 | mean = thresh * 0 95 | var = 1e-1 + rng.rand(N) 96 | sigma = np.sqrt(var) 97 | 98 | s_t, s_m, s_v = theano.tensor.dvectors('tmv') 99 | 100 | fn = theano.function([s_t, s_m, s_v], 101 | normal_logEI_diff_sigma_elemwise( 102 | s_t - s_m, 103 | theano.tensor.sqrt(s_v))) 104 | 105 | my = fn(thresh, mean, var) 106 | ref = logEI_gaussian(mean, var, thresh) 107 | for xi, myv, spv in zip(thresh, my, ref): 108 | print xi, 'my', myv, 'sp', spv, 'diff', (myv - spv) 109 | 110 | assert np.any(thresh / sigma > 34) 111 | assert np.all(np.isfinite(my)) 112 | assert np.allclose(my[thresh/sigma < 34], ref[thresh/sigma < 34]) 113 | assert np.allclose(my, ref, rtol=.1) 114 | 115 | def test_grad_arg0_elemwise(): 116 | N = 50 117 | def f_arg01(x): 118 | return normal_logEI_diff_sigma_elemwise(x, np.ones(1)) 119 | def f_arg0(x): 120 | return normal_logEI_diff_sigma_elemwise(x, np.ones(N)) 121 | 122 | rng = np.random.RandomState(123) 123 | diffvec = (rng.rand(N) - .5) * 200 124 | 125 | verify_grad(f_arg01, [np.asarray([-50.])], rng=rng) 126 | verify_grad(f_arg01, [np.asarray([50.])], rng=rng) 127 | verify_grad(f_arg0, [diffvec], rng=rng, rel_tol=1e-3) 128 | 129 | def test_grad_arg1_elemwise(): 130 | N = 50 131 | def f_arg1(x): 132 | return normal_logEI_diff_sigma_elemwise( 133 | np.ones(N) * 100, 134 | x) 135 | rng = np.random.RandomState(123) 136 | #sigmavec = np.exp(np.linspace(N) * 10) 137 | sigmavec = np.linspace(.1, 10, N) 138 | 139 | verify_grad( f_arg1, [sigmavec], rng=rng, rel_tol=1e-3) 140 | 141 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 
1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ distribute- and pip-enabled setup.py """ 5 | 6 | import logging 7 | import os 8 | import re 9 | 10 | # ----- overrides ----- 11 | 12 | # set these to anything but None to override the automatic defaults 13 | packages = None 14 | package_name = None 15 | package_data = None 16 | scripts = None 17 | # --------------------- 18 | 19 | 20 | # ----- control flags ----- 21 | 22 | # fallback to setuptools if distribute isn't found 23 | setup_tools_fallback = True 24 | 25 | # don't include subdir named 'tests' in package_data 26 | skip_tests = False 27 | 28 | # print some extra debugging info 29 | debug = True 30 | 31 | # ------------------------- 32 | 33 | if debug: logging.basicConfig(level=logging.DEBUG) 34 | # distribute import and testing 35 | try: 36 | import distribute_setup 37 | distribute_setup.use_setuptools() 38 | logging.debug("distribute_setup.py imported and used") 39 | except ImportError: 40 | # fallback to setuptools? 41 | # distribute_setup.py was not in this directory 42 | if not (setup_tools_fallback): 43 | import setuptools 44 | if not (hasattr(setuptools,'_distribute') and \ 45 | setuptools._distribute): 46 | raise ImportError("distribute was not found and fallback to setuptools was not allowed") 47 | else: 48 | logging.debug("distribute_setup.py not found, defaulted to system distribute") 49 | else: 50 | logging.debug("distribute_setup.py not found, defaulting to system setuptools") 51 | 52 | import setuptools 53 | 54 | def find_scripts(): 55 | return [s for s in setuptools.findall('scripts/') if os.path.splitext(s)[1] != '.pyc'] 56 | 57 | def package_to_path(package): 58 | """ 59 | Convert a package (as found by setuptools.find_packages) 60 | e.g. "foo.bar" to usable path 61 | e.g. 
"foo/bar" 62 | 63 | No idea if this works on windows 64 | """ 65 | return package.replace('.','/') 66 | 67 | def find_subdirectories(package): 68 | """ 69 | Get the subdirectories within a package 70 | This will include resources (non-submodules) and submodules 71 | """ 72 | try: 73 | subdirectories = os.walk(package_to_path(package)).next()[1] 74 | except StopIteration: 75 | subdirectories = [] 76 | return subdirectories 77 | 78 | def subdir_findall(dir, subdir): 79 | """ 80 | Find all files in a subdirectory and return paths relative to dir 81 | 82 | This is similar to (and uses) setuptools.findall 83 | However, the paths returned are in the form needed for package_data 84 | """ 85 | strip_n = len(dir.split('/')) 86 | path = '/'.join((dir, subdir)) 87 | return ['/'.join(s.split('/')[strip_n:]) for s in setuptools.findall(path)] 88 | 89 | def find_package_data(packages): 90 | """ 91 | For a list of packages, find the package_data 92 | 93 | This function scans the subdirectories of a package and considers all 94 | non-submodule subdirectories as resources, including them in 95 | the package_data 96 | 97 | Returns a dictionary suitable for setup(package_data=) 98 | """ 99 | package_data = {} 100 | for package in packages: 101 | package_data[package] = [] 102 | for subdir in find_subdirectories(package): 103 | if '.'.join((package, subdir)) in packages: # skip submodules 104 | logging.debug("skipping submodule %s/%s" % (package, subdir)) 105 | continue 106 | if skip_tests and (subdir == 'tests'): # skip tests 107 | logging.debug("skipping tests %s/%s" % (package, subdir)) 108 | continue 109 | package_data[package] += subdir_findall(package_to_path(package), subdir) 110 | return package_data 111 | 112 | # ----------- Override defaults here ---------------- 113 | if packages is None: packages = setuptools.find_packages() 114 | 115 | if len(packages) == 0: raise Exception("No valid packages found") 116 | 117 | if package_name is None: package_name = packages[0] 118 | 119 | if package_data is None: package_data = find_package_data(packages) 120 | 121 | if scripts is None: scripts = find_scripts() 122 | 123 | setuptools.setup( 124 | name = package_name, 125 | version = '0.0.1.dev', 126 | packages = packages, 127 | scripts = scripts, 128 | url = 'http://github.com:hyperopt/hp_gpsmbo/', 129 | author = 'James Bergstra', 130 | author_email = '', 131 | description = 'Gaussian Process Regression with Theano', 132 | long_description = '', 133 | classifiers = [ 134 | 'Development Status :: 3 - Alpha', 135 | 'Intended Audience :: Education', 136 | 'Intended Audience :: Science/Research', 137 | 'Intended Audience :: Developers', 138 | 'Environment :: Console', 139 | 'License :: OSI Approved :: LGPL3 License', 140 | 'Operating System :: MacOS :: MacOS X', 141 | 'Operating System :: Microsoft :: Windows', 142 | 'Operating System :: POSIX', 143 | 'Operating System :: Unix', 144 | 'Programming Language :: Python', 145 | 'Topic :: Scientific/Engineering', 146 | 'Topic :: Software Development', 147 | ], 148 | platforms = ['Linux', 'OS-X', 'Windows'], 149 | license = 'LGPL3', 150 | keywords = 'Theano machine learning', 151 | package_data = package_data, 152 | include_package_data = True, 153 | #install_requires = ['theano', 'scipy', 'numpy'], 154 | ) 155 | --------------------------------------------------------------------------------