├── .gitattributes ├── .gitignore ├── DeepSparseKernel.py ├── README.md ├── conf.toml ├── morun.py ├── paper.pdf ├── paper ├── .gitignore ├── abstract.tex ├── background.tex ├── conclusion.tex ├── experiments.tex ├── img │ ├── NN-GP.png │ ├── NN-MOGP.png │ ├── nn1.pdf │ └── nn2.pdf ├── introduction.tex ├── makefile ├── mogp.tex ├── nips_2018.sty ├── paper.tex └── ref.bib └── run.py /.gitattributes: -------------------------------------------------------------------------------- 1 | * linguist-vendored 2 | *.py linguist-vendored=false 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | *.swp 3 | ref/ 4 | __pycache__/ 5 | test_x 6 | test_y 7 | train_x 8 | train_y 9 | pred_y 10 | pred_y_init 11 | .try/ 12 | *.ipynb* 13 | *.pyc 14 | -------------------------------------------------------------------------------- /DeepSparseKernel.py: -------------------------------------------------------------------------------- 1 | import autograd.numpy as np 2 | from autograd import grad 3 | import autograd.numpy.random as npr 4 | import matplotlib.pyplot as plt 5 | import math 6 | import sys 7 | from scipy.optimize import fmin_cg, fmin_l_bfgs_b, fmin_ncg 8 | import traceback 9 | 10 | def tanh(x): 11 | return np.tanh(x) 12 | 13 | def relu(x): 14 | return np.maximum(x, 0.0) 15 | 16 | def sigmoid(x): 17 | return np.exp(x) / (1 + np.exp(x)) 18 | 19 | def erf(x): 20 | # save the sign of x 21 | # sign = 1 if x >= 0 else -1 22 | sign = np.sign(x); 23 | x = np.abs(x) 24 | 25 | # constants 26 | a1 = 0.254829592 27 | a2 = -0.284496736 28 | a3 = 1.421413741 29 | a4 = -1.453152027 30 | a5 = 1.061405429 31 | p = 0.3275911 32 | 33 | # A&S formula 7.1.26 34 | t = 1.0/(1.0 + p*x) 35 | y = 1.0 - (((((a5*t + a4)*t) + a3)*t + a2)*t + a1)*t*np.exp(-x*x) 36 | return sign*y 37 | 38 | class NN: 39 | def __init__(self, layer_sizes, activations): 40 | self.num_layers = np.copy(len(layer_sizes)) 41 | self.layer_sizes = np.copy(layer_sizes) 42 | self.activation = activations 43 | def num_param(self, xdim): 44 | xs = [xdim]; 45 | np = 0; 46 | for ls in self.layer_sizes: 47 | xs.append(ls) 48 | for i in range(self.num_layers): 49 | np += (1+xs[i]) * xs[i+1] 50 | return np 51 | 52 | def w_nobias(self, w, dim): 53 | """ 54 | return weights without bias, it can be used for the L1/L2 regularizaton 55 | """ 56 | prev_size = dim 57 | start_idx = 0; 58 | wnb = np.array([]) 59 | for i in range(self.num_layers): 60 | layer_size = self.layer_sizes[i] 61 | num_w_layer = (prev_size+1) * layer_size; 62 | w_layer = np.reshape(w[start_idx:start_idx+num_w_layer], (prev_size+1, layer_size))[:prev_size, :]; 63 | wnb = np.concatenate((wnb, w_layer.reshape(w_layer.size))); 64 | prev_size = layer_size 65 | start_idx += num_w_layer 66 | return wnb 67 | 68 | def predict(self, w, x): 69 | dim, num_data = x.shape 70 | out = x; 71 | bias = np.ones((1, num_data)); 72 | prev_size = dim 73 | start_idx = 0; 74 | for i in range(self.num_layers): 75 | layer_size = self.layer_sizes[i] 76 | num_w_layer = (prev_size+1) * layer_size; 77 | w_layer = np.reshape(w[start_idx:start_idx+num_w_layer], (prev_size+1, layer_size)) 78 | out = self.activation[i](np.dot(w_layer.T, np.concatenate((out, bias)))) 79 | prev_size = layer_size 80 | start_idx += num_w_layer 81 | return out 82 | 83 | def chol_solve(L, y): 84 | """ 85 | K = L.dot(L.T) 86 | return inv(K) * y 87 | """ 88 | v = np.linalg.solve(L, y) 89 | return np.linalg.solve(L.T, v) 90 | 91 | 
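# A quick sanity check for chol_solve (hypothetical example, not used elsewhere
# in this file): for an SPD matrix K with Cholesky factor L (K = L.dot(L.T)),
# chol_solve(L, y) should agree with np.linalg.solve(K, y):
#   K = np.array([[4.0, 1.0], [1.0, 3.0]])
#   L = np.linalg.cholesky(K)
#   y = np.ones((2, 1))
#   assert np.allclose(chol_solve(L, y), np.linalg.solve(K, y))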
92 | def chol_inv(L): 93 | return chol_solve(L, np.eye(L.shape[0])) 94 | 95 | 96 | def scale_x(xs, log_lscales): 97 | lscales = np.exp(log_lscales).repeat(xs.shape[1], axis=0).reshape(xs.shape); 98 | return xs / lscales 99 | 100 | class DSK_GP: 101 | def __init__(self, train_x, train_y, layer_sizes, activations, bfgs_iter=500, l1=0, l2=0, debug = False): 102 | self.train_x = np.copy(train_x) 103 | self.train_y = np.copy(train_y) 104 | self.mean = np.mean(train_y) 105 | self.dim = self.train_x.shape[0] 106 | self.num_train = self.train_x.shape[1] 107 | self.nn = NN(layer_sizes, activations) 108 | self.num_param = 2 + self.dim + self.nn.num_param(self.dim) # noise + variance + lengthscales + NN weights 109 | self.m = layer_sizes[-1]; 110 | self.loss = np.inf 111 | self.bfgs_iter = bfgs_iter; 112 | self.debug = debug 113 | self.l1 = l1; # TODO: only regularize weight, do not regularize bias 114 | self.l2 = l2; 115 | self.train_y.reshape(1, train_y.size) 116 | self.train_y_zero = self.train_y - self.mean; 117 | 118 | def rand_theta(self, scale=0.1): 119 | """ 120 | Generate an initial theta, the weights of NN are randomly initialized 121 | """ 122 | theta = scale * np.random.randn(self.num_param) 123 | theta[0] = np.log(np.std(self.train_y) / 2) 124 | theta[1] = np.log(np.std(self.train_y)) 125 | for i in range(self.dim): 126 | theta[2 * + i] = np.maximum(-100, np.log(0.5 * (self.train_x[i, :].max() - self.train_x[i, :].min()))) 127 | return theta 128 | 129 | def calc_Phi(self, w, x): 130 | Phi = self.nn.predict(w, x); 131 | return Phi 132 | 133 | def log_likelihood(self, theta): 134 | # TODO: verification of this log_likelihood 135 | log_sn = theta[0] 136 | log_sp = theta[1] 137 | log_lscales = theta[2:2+self.dim]; 138 | w = theta[2+self.dim:] 139 | scaled_x = scale_x(self.train_x, log_lscales) 140 | sn2 = np.exp(2 * log_sn) 141 | sp = np.exp(1 * log_sp); 142 | sp2 = np.exp(2 * log_sp); 143 | 144 | neg_likelihood = np.inf 145 | Phi = self.calc_Phi(w, scaled_x); 146 | m, num_train = Phi.shape 147 | A = np.dot(Phi, Phi.T) + (sn2 * m / sp2) * np.eye(m); 148 | LA = np.linalg.cholesky(A) 149 | 150 | Phi_y = np.dot(Phi, self.train_y_zero.T) 151 | data_fit = (np.dot(self.train_y_zero, self.train_y_zero.T) - np.dot(Phi_y.T, chol_solve(LA, Phi_y))) / sn2 152 | logDetA = 0 153 | for i in range(m): 154 | logDetA += 2 * np.log(LA[i][i]) 155 | neg_likelihood = 0.5 * (data_fit + logDetA - m * np.log(m * sn2 / sp2) + num_train * np.log(2 * np.pi * sn2)) 156 | if(np.isnan(neg_likelihood)): 157 | neg_likelihood = np.inf 158 | 159 | w_nobias = self.nn.w_nobias(w, self.dim); 160 | l1_reg = self.l1 * np.abs(w_nobias).sum() 161 | l2_reg = self.l2 * np.dot(w_nobias, w_nobias) 162 | neg_likelihood = neg_likelihood + l1_reg + l2_reg 163 | 164 | # refresh current best 165 | if neg_likelihood < self.loss: 166 | self.loss = neg_likelihood 167 | self.theta = np.copy(theta) 168 | self.LA = LA.copy() 169 | self.A = A.copy() 170 | 171 | return neg_likelihood 172 | 173 | def fit(self, theta): 174 | theta0 = theta.copy() 175 | self.loss = np.inf 176 | self.theta = theta0; 177 | def loss(w): 178 | nlz = self.log_likelihood(w); 179 | return nlz 180 | gloss = grad(loss) 181 | try: 182 | fmin_l_bfgs_b(loss, theta0, gloss, maxiter = self.bfgs_iter, m = 100, iprint=1) 183 | except np.linalg.LinAlgError: 184 | print("Increase noise term and re-optimization") 185 | theta0 = np.copy(self.theta); 186 | theta0[0] += np.log(10); 187 | try: 188 | fmin_l_bfgs_b(loss, theta0, gloss, maxiter = self.bfgs_iter, m = 10, iprint=1) 189 | except: 
190 | print("Exception caught, L-BFGS early stopping...") 191 | if self.debug: 192 | print(traceback.format_exc()) 193 | except: 194 | print("Exception caught, L-BFGS early stopping...") 195 | if self.debug: 196 | print(traceback.format_exc()) 197 | 198 | print("Optimized loss is %g" % self.loss) 199 | if(np.isinf(self.loss) or np.isnan(self.loss)): 200 | print("Fail to build GP model") 201 | sys.exit(1) 202 | 203 | # pre-computation 204 | log_sn = self.theta[0] 205 | log_sp = self.theta[1] 206 | log_lscales = self.theta[2:2+self.dim] 207 | w = self.theta[2+self.dim:] 208 | sn2 = np.exp(2 * log_sn) 209 | sp = np.exp(log_sp); 210 | sp2 = np.exp(2*log_sp); 211 | Phi = self.calc_Phi(w, scale_x(self.train_x, log_lscales)) 212 | m = self.m 213 | self.alpha = chol_solve(self.LA, np.dot(Phi, self.train_y_zero.T)) 214 | 215 | def predict(self, test_x): 216 | log_sn = self.theta[0] 217 | log_sp = self.theta[1] 218 | log_lscales = self.theta[2:2+self.dim] 219 | w = self.theta[2+self.dim:] 220 | sn = np.exp(log_sn) 221 | sn2 = np.exp(2*log_sn) 222 | sp = np.exp(log_sp) 223 | sp2 = np.exp(2*log_sp) 224 | Phi_test = self.calc_Phi(w, scale_x(test_x, log_lscales)) 225 | py = self.mean + Phi_test.T.dot(self.alpha) 226 | ps2 = sn2 + sn2 * np.diagonal(Phi_test.T.dot(chol_solve(self.LA, Phi_test))); 227 | return py, ps2 228 | 229 | 230 | class MODSK: 231 | def __init__(self, train_x, train_y, shared_nn, non_shared_nns, max_iter = 100, l1 = 0, l2 = 0, debug=False): 232 | self.train_x = np.copy(train_x) 233 | self.train_y = np.copy(train_y) 234 | self.dim = self.train_x.shape[0] 235 | self.num_train = self.train_x.shape[1] 236 | self.num_obj = self.train_y.shape[1] 237 | self.means = np.mean(self.train_y, axis=0) 238 | self.stds = np.std(self.train_y, axis=0) 239 | self.train_y = (self.train_y - self.means) / self.stds # standardize output 240 | self.debug = debug 241 | self.max_iter = max_iter # max iter for the L-BFGS optimization 242 | self.l1 = l1 243 | self.l2 = l2 244 | self.shared_nn = shared_nn 245 | self.non_shared_nns = non_shared_nns 246 | self.num_param = self.calc_num_params() 247 | if(train_x.ndim != 2 or train_y.ndim != 2): 248 | print("train_x.ndim != 2 or train_y.ndim != 2") 249 | sys.exit(1) 250 | if(train_x.shape[1] != train_y.shape[0]): 251 | print("train_x.shape[1] != train_y.shape[0]") 252 | sys.exit(1) 253 | if(len(non_shared_nns) != self.num_obj): 254 | print("len(non_shared_nns) != self.num_obj") 255 | sys.exit(1) 256 | 257 | def calc_num_params(self): 258 | """ 259 | parameters: 260 | 1. length scales: dim 261 | 2. noise: num_obj 262 | 3. 
self covariance: num_obj 263 | """ 264 | num_param = self.dim + 2 * self.num_obj + self.shared_nn.num_param(self.dim) 265 | size_last_layer_shared = self.shared_nn.layer_sizes[-1] 266 | for i in range(self.num_obj): 267 | num_param += self.non_shared_nns[i].num_param(size_last_layer_shared) 268 | return num_param 269 | 270 | def w_nobias(self, ws): 271 | w_shared = ws[:self.shared_nn.num_param(self.dim)] 272 | w_non_shared = ws[self.shared_nn.num_param(self.dim):] 273 | m_shared = self.shared_nn.layer_sizes[-1] 274 | wnb = self.shared_nn.w_nobias(w_shared, self.dim) 275 | start_idx = 0 276 | for nn in self.non_shared_nns: 277 | w_tmp = w_non_shared[start_idx: start_idx + nn.num_param(m_shared)] 278 | wnb = np.concatenate((wnb, nn.w_nobias(w_tmp, m_shared))) 279 | start_idx = start_idx + w_tmp.size 280 | if self.debug: 281 | assert(w_tmp.size == nn.num_param(m_shared)) 282 | return wnb 283 | def calc_Phi(self, ws, x): 284 | w_shared = ws[:self.shared_nn.num_param(self.dim)] 285 | w_non_shared = ws[self.shared_nn.num_param(self.dim):] 286 | Phi_shared = self.shared_nn.predict(w_shared, x) 287 | m_shared = Phi_shared.shape[0] 288 | Phis = [] 289 | start_idx = 0 290 | for nn in self.non_shared_nns: 291 | w_tmp = w_non_shared[start_idx: start_idx + nn.num_param(m_shared)] 292 | Phi_tmp = nn.predict(w_tmp, Phi_shared) 293 | start_idx = start_idx + w_tmp.size 294 | Phis += [Phi_tmp] 295 | if self.debug: 296 | assert(w_tmp.size == nn.num_param(Phi_shared.shape[0])) 297 | return Phis 298 | 299 | def split_theta(self, theta): 300 | if self.debug: 301 | assert(theta.size == self.num_param) 302 | num_obj = self.num_obj 303 | log_sns = theta[:num_obj]; 304 | log_sps = theta[num_obj:2*num_obj]; 305 | log_lscales = theta[2*num_obj:2*num_obj+self.dim] 306 | ws = theta[2*num_obj+self.dim:] 307 | return (log_sns, log_sps, log_lscales, ws) 308 | 309 | def rand_theta(self, scale=1): 310 | """ 311 | Generate an initial theta, the weights of NN are randomly initialized 312 | """ 313 | theta = scale * np.random.randn(self.num_param) 314 | # noises and self covariances 315 | for i in range(self.num_obj): 316 | theta[i] = np.log(np.std(self.train_y[:, i]) / 2) 317 | theta[self.num_obj + i] = np.log(np.std(self.train_y[:, i])) 318 | # lengthscales 319 | for i in range(self.dim): 320 | theta[2 * self.num_obj + i] = np.maximum(-100, np.log(0.5 * (self.train_x[i, :].max() - self.train_x[i, :].min()))) 321 | return theta 322 | 323 | def loss(self, theta): 324 | """ 325 | return a list of losses 326 | """ 327 | if self.debug: 328 | assert(theta.size == self.num_param) 329 | log_sns, log_sps, log_lscales, ws = self.split_theta(theta) 330 | Phis = self.calc_Phi(ws, scale_x(self.train_x, log_lscales)) 331 | losses = [] 332 | for i in range(self.num_obj): 333 | losses += [self.log_likelihood(log_sns[i], log_sps[i], Phis[i], self.train_y[:, i].reshape(self.num_train, 1))] 334 | return losses 335 | 336 | def log_likelihood(self, log_sn, log_sp, Phi, train_y): 337 | sn2 = np.exp(2 * log_sn) 338 | sp = np.exp(1 * log_sp); 339 | sp2 = np.exp(2 * log_sp); 340 | 341 | neg_likelihood = np.inf 342 | m, num_train = Phi.shape 343 | A = np.dot(Phi, Phi.T) + (sn2 * m / sp2) * np.eye(m); 344 | LA = np.linalg.cholesky(A) 345 | 346 | Phi_y = np.dot(Phi, train_y) 347 | data_fit = (np.dot(train_y.T, train_y) - np.dot(Phi_y.T, chol_solve(LA, Phi_y))) / sn2 348 | logDetA = 0 349 | for i in range(m): 350 | logDetA += 2 * np.log(LA[i][i]) 351 | neg_likelihood = 0.5 * (data_fit + logDetA - m * np.log(m * sn2 / sp2) + num_train * np.log(2 * np.pi 
* sn2)) 352 | if(np.isnan(neg_likelihood)): 353 | neg_likelihood = np.inf 354 | 355 | return neg_likelihood 356 | 357 | def fit(self, theta): 358 | theta0 = theta.copy() 359 | self.best_loss = np.inf 360 | def lossfit(theta): 361 | loss = sum(self.loss(theta)) 362 | w = theta[2*self.num_obj+self.dim:] 363 | wnb = self.w_nobias(w) 364 | l1_reg = self.l1 * np.abs(wnb).sum(); 365 | l2_reg = self.l2 * np.dot(wnb, wnb) 366 | loss = loss + l1_reg + l2_reg 367 | if loss < self.best_loss: 368 | self.best_loss = loss 369 | self.theta = theta.copy() 370 | return loss 371 | gloss = grad(lossfit) 372 | try: 373 | fmin_l_bfgs_b(lossfit, theta0, gloss, maxiter = self.max_iter, m = 100, iprint=1) 374 | except np.linalg.LinAlgError: 375 | print("Increase noise term and re-optimization") 376 | theta0 = np.copy(self.theta) 377 | for i in range(self.num_obj): 378 | theta0[i] = theta0[i] + np.log(10) 379 | try: 380 | fmin_l_bfgs_b(lossfit, theta0, gloss, maxiter = self.max_iter, m = 10, iprint=1) 381 | except: 382 | print("Exception caught, L-BFGS early stopping...") 383 | if self.debug: 384 | print(traceback.format_exc()) 385 | except: 386 | print("Exception caught, L-BFGS early stopping...") 387 | if self.debug: 388 | print(traceback.format_exc()) 389 | 390 | print("Optimized") 391 | log_sns, log_sps, log_lscales, ws = self.split_theta(self.theta) 392 | scaled_x = scale_x(self.train_x, log_lscales) 393 | Phis = self.calc_Phi(ws, scaled_x) 394 | self.Phis = Phis 395 | self.LAs = [] 396 | self.alphas = [] 397 | for i in range(self.num_obj): 398 | Phi = Phis[i] 399 | sn2 = np.exp(2 * log_sns[i]) 400 | sp2 = np.exp(2 * log_sps[i]) 401 | m = Phi.shape[0] 402 | A = np.dot(Phi, Phi.T) + (sn2 * m / sp2) * np.eye(m); 403 | LA = np.linalg.cholesky(A) 404 | self.LAs += [LA] 405 | self.alphas += [chol_solve(LA, np.dot(Phi, self.train_y[:, i]))] 406 | 407 | def predict(self, x): 408 | num_test = x.shape[1]; 409 | py = np.zeros((num_test, self.num_obj)) 410 | ps2 = np.zeros((num_test, self.num_obj)) 411 | 412 | log_sns, log_sps, log_lscales, ws = self.split_theta(self.theta) 413 | scaled_x = scale_x(x, log_lscales) 414 | Phis_test = self.calc_Phi(ws, scaled_x) 415 | 416 | for i in range(self.num_obj): 417 | Phi_test = Phis_test[i] 418 | sn2 = np.exp(2 * log_sns[i]) 419 | py[:, i] = np.dot(Phi_test.T, self.alphas[i]); 420 | for j in range(num_test): 421 | ps2[j, i] = sn2 + sn2 * np.dot(Phi_test[:, j].T, chol_solve(self.LAs[i], Phi_test[:, j])) 422 | py = (py * self.stds) + self.means; 423 | ps2 = ps2 * (self.stds**2) 424 | return py, ps2 425 | 426 | def mix_predict(self, K, x, scale): 427 | # TODO: this version can not be paralleled, as different threads would share the same self.theta 428 | pys = [] 429 | ps2s = [] 430 | losses = [] 431 | for i in range(K): 432 | theta = self.rand_theta(scale=scale) 433 | self.fit(theta) 434 | py_i, ps2_i = self.predict(x) 435 | pys += [py_i] 436 | ps2s += [ps2_i] 437 | losses += [self.best_loss[0][0]] 438 | py = np.zeros((x.shape[1], self.num_obj)) 439 | ps2 = np.zeros((x.shape[1], self.num_obj)) 440 | for i in range(K): 441 | py += pys[i] / K; 442 | ps2 += (ps2s[i] + pys[i]**2) / K 443 | ps2 -= py**2 444 | print("losses:") 445 | print(losses) 446 | return py, ps2 447 | 448 | # TODO: # https://towardsdatascience.com/random-initialization-for-neural-networks-a-thing-of-the-past-bfcdd806bf9e 449 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | 
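## Usage

`morun.py` loads a TOML configuration given as its first command-line argument (see `conf.toml`) plus the whitespace-separated data files `train_x`, `train_y`, `test_x` and `test_y` from the working directory, trains the multi-output model, and writes `pred_y` and `pred_s2`. Below is a minimal programmatic sketch of the same workflow; it assumes multi-dimensional inputs and more than one output (so the reshaping done by `morun.py` is omitted), and the layer sizes, activation and `K` simply mirror the values in `conf.toml`:

```python
import DeepSparseKernel as dsk
from DeepSparseKernel import np

train_x = np.loadtxt('train_x').T   # shape (dim, num_train)
train_y = np.loadtxt('train_y')     # shape (num_train, num_obj)
test_x  = np.loadtxt('test_x').T    # shape (dim, num_test)

num_obj        = train_y.shape[1]
shared_nn      = dsk.NN([50, 50], [dsk.relu, dsk.relu])
non_shared_nns = [dsk.NN([50, 50], [dsk.relu, dsk.relu]) for _ in range(num_obj)]

model   = dsk.MODSK(train_x, train_y, shared_nn, non_shared_nns, max_iter=100)
py, ps2 = model.mix_predict(4, test_x, scale=1)  # predictive means and variances
```
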
## About 4 | 5 | 6 | 7 | The last hidden layer of a neural network can be viewed as a finite feature 8 | map, from which a degenerate Gaussian process model can be built; on the other 9 | hand, multiple correlated outputs can be represented by a neural network with 10 | shared hidden layers. In this paper, we build upon these two ideas and propose 11 | a simple multi-output Gaussian process regression model in which the kernels of 12 | the multiple outputs are constructed from a multi-task neural network with shared 13 | hidden layers and task-specific layers. We compare our multi-task neural 14 | network enhanced Gaussian process (MTNN-GP) model with several multi-output 15 | Gaussian process models using two public datasets and one example of a 16 | real-world analog integrated circuit; the results show that our model is 17 | competitive compared with these models. 18 | 19 | ## Future work 20 | 21 | - Learn the covariance between tasks and handle missing data 22 | - Other architectures: cross-stitch networks 23 | - Advanced NN training: batch normalization, dropout 24 | - Hyperparameters and architectures of the NN: use BO to optimize them 25 | -------------------------------------------------------------------------------- /conf.toml: -------------------------------------------------------------------------------- 1 | num_shared_layer = 2 2 | num_non_shared_layer = 2 3 | hidden_shared = 50 4 | hidden_non_shared = 50 5 | l1 = 0.0 6 | l2 = 0.0 7 | scale = 1 8 | max_iter = 100 9 | K = 4 10 | activation = "relu" 11 | -------------------------------------------------------------------------------- /morun.py: -------------------------------------------------------------------------------- 1 | from DeepSparseKernel import np 2 | import matplotlib.pyplot as plt 3 | import sys 4 | import DeepSparseKernel as dsk 5 | import toml 6 | 7 | def trans(data): 8 |     if data.ndim == 1: 9 |         return data.reshape(data.size, 1) 10 |     else: 11 |         return data 12 | 13 | argv = sys.argv[1:] 14 | conf = toml.load(argv[0]) 15 | 16 | # configurations 17 | num_shared_layer = conf["num_shared_layer"] 18 | num_non_shared_layer = conf["num_non_shared_layer"] 19 | hidden_shared = conf["hidden_shared"] 20 | hidden_non_shared = conf["hidden_non_shared"] 21 | l1 = conf["l1"] 22 | l2 = conf["l2"] 23 | scale = conf["scale"] 24 | max_iter = conf["max_iter"] 25 | K = conf["K"] 26 | activation = conf["activation"]; 27 | 28 | act_f = dsk.tanh 29 | if activation == "relu": 30 |     act_f = dsk.relu 31 | elif activation == "erf": 32 |     act_f = dsk.erf 33 | elif activation == "sigmoid": 34 |     act_f = dsk.sigmoid 35 | else: 36 |     act_f = dsk.tanh 37 | 38 | train_x = trans(np.loadtxt('train_x')).T 39 | train_y = trans(np.loadtxt('train_y')) 40 | test_x = trans(np.loadtxt('test_x')).T 41 | test_y = trans(np.loadtxt('test_y')) 42 | dim, num_train = train_x.shape 43 | num_obj = train_y.shape[1] 44 | num_test = test_x.shape[1] 45 | 46 | shared_layers_sizes = [hidden_shared] * num_shared_layer 47 | shared_activations = [act_f] * num_shared_layer 48 | non_shared_layers_sizes = [hidden_non_shared] * num_non_shared_layer 49 | non_shared_activations = [act_f] * num_non_shared_layer 50 | 51 | shared_nn = dsk.NN(shared_layers_sizes, shared_activations) 52 | non_shared_nns = [] 53 | 54 | for i in range(num_obj): 55 |     non_shared_nns += [dsk.NN(non_shared_layers_sizes, non_shared_activations)] 56 | 57 | modsk = dsk.MODSK(train_x, train_y, shared_nn, non_shared_nns, debug=True, max_iter=max_iter, l1=l1, l2=l2) 58 | 59 | py, ps2 = modsk.mix_predict(K, test_x, scale=scale); 60 | 
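# mix_predict trains K independently initialized models and averages their
# predictions: py holds the predictive means and ps2 the predictive variances,
# both with shape (num_test, num_obj) and already transformed back to the
# original output scale.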
np.savetxt('pred_y', py); 61 | np.savetxt('pred_s2', ps2); 62 | print("Finished") 63 | -------------------------------------------------------------------------------- /paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Alaya-in-Matrix/NeuralLinear/bc0209e1e8c24756a3777a505b8655c93009df4a/paper.pdf -------------------------------------------------------------------------------- /paper/.gitignore: -------------------------------------------------------------------------------- 1 | # ignore file for the ICML manuscript 2 | 3 | *.aux 4 | *.bbl 5 | *.blg 6 | *.log 7 | *.out 8 | *.pdf 9 | ref/ 10 | -------------------------------------------------------------------------------- /paper/abstract.tex: -------------------------------------------------------------------------------- 1 | \begin{abstract} 2 | A neural network can be viewed as a finite feature map, from which a reduced-rank Gaussian process model can be built. On the other hand, multiple correlated outputs can be represented by a neural network with shared hidden layers. In this paper, we propose a simple multi-output Gaussian process regression model based on the aforementioned two ideas. The kernel functions of multiple outputs are constructed from a multi-task neural network with shared hidden layers and task-specific layers. The correlations of the outputs are thus captured by the shared hidden layers. We compare our multi-task neural network enhanced Gaussian process (MTNN-GP) model with several existing multi-output Gaussian process models using two public datasets and one example of real-world analog integrated circuits. The results show that our model is competitive compared with these models. 3 | \end{abstract} 4 | -------------------------------------------------------------------------------- /paper/background.tex: -------------------------------------------------------------------------------- 1 | \section{Background}\label{sec:Background} 2 | 3 | \subsection{Gaussian Process Regression}\label{sec:SOGP} 4 | 5 | Given a training set $D_T = \{X, \bm{y}\}$ where $X = \{\bm{x}_1,~\dots~\bm{x}_N\}$, and $\bm{y} = \{y_1,~\dots~y_N\}$, we assume the target value $y$ is generated 6 | by a latent function $f(\bm{x})$ with additive noise $\epsilon \sim N(0, \sigma_n^2)$ such that 7 | \begin{equation} 8 | \label{eq:yf} 9 | y_i \sim N(f(\bm{x}_i), \sigma_n^2) 10 | % y_i = f(\bm{x}_i) + \epsilon_i. 11 | \end{equation} 12 | Here $N(\cdot, \cdot)$ denotes a Gaussian distribution. We use Gaussian process (GP)~\cite{GPML} to learn the latent function $f(\bm{x})$. A GP model defines a prior over $f(\bm{x})$. GP is fully characterized by a mean function $m(\bm{x})$ and a covariance function $k(\bm{x}, \bm{y})$. For the training set $D_T$, the latent function values $\bm{f} = (f(\bm{x}_1),~\dots~f(\bm{x}_N))^T$ follow a joint Gaussian distribution $\bm{f} \sim N(\bm{m}, K)$, where $\bm{m} = (m(\bm{x}_1),~\dots~,m(\bm{x}_N))^T$ is the mean vector, and $K_{ij} = k(\bm{x}_i, \bm{x}_j)$ is the covariance matrix. The mean function $m(\bm{x})$ can be any function, while the kernel function $k(\bm{x}, \bm{y})$ has to make sure that the covariance matrix is a symmetric positive definite (SPD) matrix. In this paper, we fix $m(\bm{x}) = 0$. 
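For the training set, combining the GP prior over $\bm{f}$ with the noise model in \eqref{eq:yf} (and $m(\bm{x}) = 0$) gives the marginal distribution of the observed targets,
\begin{equation}
\bm{y} \sim N(\bm{0}, K + \sigma_n^2 I),
\end{equation}
from which the predictive distribution and the likelihood used below are derived.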
13 | 14 | Given a new input $\bm{x}_*$, GP model predicts the distribution of the output, i.e., $y \sim N(\mu(\bm{x}_*), \sigma^2(\bm{x}_*))$, where $\mu(\bm{x}_*)$ and $\sigma^2(\bm{x}_*)$ can be expressed as 15 | 16 | \begin{equation} 17 | \left\{ 18 | \begin{array}{lll} 19 | \mu(\bm{x}_*) &=& k(\bm{x}_*, X) (K + \sigma_n^2 I)^{-1} \bm{y} \\ 20 | \sigma^2(\bm{x}_*) &=& \sigma_n^2 + k(\bm{x}_*, \bm{x}_*) - k(\bm{x}_*, X) (K + \sigma_n^2 I)^{-1} k(X, \bm{x}_*), 21 | \end{array} 22 | \right. 23 | \label{eq:GPRPred} 24 | \end{equation} 25 | 26 | where $k(\bm{x}_*, X) = (k(\bm{x}_*, \bm{x}_1),~\dots~,k(\bm{x}_*, \bm{x}_N))$ and $k(X, \bm{x}_*) = k(\bm{x}_*, X)^T$. In \eqref{eq:GPRPred}, $\mu(\bm{x}_*)$ and $\sigma^2(\bm{x}_*)$ can be viewed as the prediction and the uncertainty measure. 27 | 28 | There are usually some hyperparameters for a GP model, including the noise level $\sigma_n$ and the hyperparameters for the kernel functions. For example, the squared exponential kernel is a commonly used kernel in GP regression. The kernel function is defined as 29 | 30 | \begin{equation} 31 | \label{eq:GaussianCovarianceFunction} 32 | k(\bm{x}_i, \bm{x}_j) = \sigma_f^2 \exp\Big(-\frac{1}{2}(\bm{x}_i - \bm{x}_j)^T\Lambda^{-1}(\bm{x}_i - \bm{x}_j)\Big), 33 | \end{equation} 34 | where $\Lambda = \mathrm{diag}(l_1, \dots, l_d)$ is a diagonal matrix and $l_i$ denotes the length scale of the $i$-th dimension. $\sigma_f$ and $\Lambda$ are the hyperparameters for the kernel. Denote $\bm{\theta}$ as the vector of hyperparameters, the hyperparameters can be learned via maximum likelihood estimation (MLE) by maximizing the following likelihood function 35 | 36 | \begin{equation} 37 | \label{eq:GPloglikelihood} 38 | \log p(\bm{y} | X, \bm{\theta}) = -\frac{1}{2}(\bm{y}^T K_{\bm{\theta}}^{-1} \bm{y} + \log |K_{\theta}| + N \log(2 \pi)), 39 | \end{equation} 40 | 41 | where $K_{\bm{\theta}}$ is the covariance matrix of the training input calculated by the kernel function. 42 | 43 | \subsection{Reduced-Rank Gaussian Process Model from Finite-Dimensional Feature Map}\label{sec:NNGP} 44 | 45 | \begin{figure}[!htb] 46 | \centering 47 | \includegraphics[width=\columnwidth]{./img/nn1.pdf} 48 | \caption{Architecture of the Gaussian process model with kernel characterized by neural network.} 49 | \label{fig:NNGP} 50 | \end{figure} 51 | 52 | A Gaussian process model can also be derived from a weight space view. Let $\phi(\bm{x}): R^D \rightarrow R^M$ be a feature map from $D$-dimensional input space to the $M$-dimensional feature space. We assume that the latent function $f(\bm{x})$ is a linear combination of the nonlinear features, and the observed target values are generated from $f(\bm{x})$ with additive noise 53 | 54 | \begin{equation} 55 | \label{eq:weightspace} 56 | \left\{ 57 | \begin{array}{lll} 58 | f(\bm{x}) &=& \bm{w}^T \phi(\bm{x}) \\ 59 | y_i &\sim& N(f(\bm{x}_i), \sigma_n^2) 60 | \end{array}. 61 | \right. 62 | \end{equation} 63 | 64 | If a zeros mean Gaussian prior with covariance matrix $\Sigma_p$ is imposed on the weights $\bm{w}$, i.e., $\bm{w} \sim N(0, \Sigma_p)$, it can be proved that $f$ follows a Gaussian process $f \sim \mathcal{GP}(0, k)$ \cite{GPML}, with the kernel function $k$ defined as 65 | \begin{equation} 66 | \label{eq:kernel_from_weight} 67 | k(\bm{x}, \bm{y}) = \phi(\bm{x})^T \Sigma_p \phi(\bm{y}). 
68 | \end{equation} 69 | The GP model defined from a finite feature map is called a \emph{degenerate} Gaussian process, as the covariance matrix calculated from \eqref{eq:kernel_from_weight} has rank at most $M$, which is lower than $N$. 70 | 71 | %XXX: mention the matrix inversion lemma 72 | 73 | If we set $\Sigma_p$ to a diagonal matrix $\Sigma_p = \frac{\sigma_p^2}{M} I$, the predictive distribution of \eqref{eq:GPRPred} can be reformulated as \cite{GPML,lazaro2010marginalized} 74 | \begin{equation} 75 | \left\{ 76 | \begin{array}{lll} 77 | \mu(\bm{x}) &= & \phi(\bm{x})^T A^{-1} \Phi \bm{y} \\ 78 | \sigma^2(\bm{x}) &= & \sigma_n^2 + \sigma_n^2 \phi(\bm{x})^T A^{-1} \phi(\bm{x}) \\ 79 | \Phi &= & (\phi(\bm{x}_1),~\dots~,\phi(\bm{x}_N)) \\ 80 | A &= & \Phi \Phi^T + \frac{M \sigma_n^2}{\sigma_p^2} I 81 | \end{array}. 82 | \right. 83 | \label{eq:DegeneratePred} 84 | \end{equation} 85 | 86 | Note that when calculating $\mu(\bm{x})$ and $\sigma^2(\bm{x})$ directly using \eqref{eq:GPRPred}, the time complexities are $O(N)$ and $O(N^2)$, respectively. However, if \eqref{eq:DegeneratePred} is used, the time complexities for calculating $\mu(\bm{x})$ and $\sigma^2(\bm{x})$ become $O(M)$ and $O(M^2)$, provided that the inverse of $A$ is pre-computed. 87 | 88 | The log likelihood of the training data defined in \eqref{eq:GPloglikelihood} can also be reformulated as \cite{lazaro2010marginalized} 89 | \begin{equation} 90 | \label{eq:DegenerateGPloglikelihood} 91 | \log p(\bm{y} | X, \bm{\theta}) = -\frac{1}{2\sigma_n^2}(\bm{y}^T\bm{y} - \bm{y}^T \Phi^T A^{-1} \Phi \bm{y}) - \frac{1}{2}\log |A| + \frac{M}{2} \log \frac{M \sigma_n^2}{\sigma_p^2} - \frac{N}{2} \log(2 \pi \sigma_n^2), 92 | \end{equation} 93 | where $\bm{\theta}$ is the vector containing $\sigma_p$, $\sigma_n$ and the parameters of $\phi$. In \eqref{eq:GPloglikelihood}, the covariance matrix $K$ has to be inverted, which takes $O(N^3)$ operations. For \eqref{eq:DegenerateGPloglikelihood}, the matrix $A$ is of size $M \times M$, so the time complexity for calculating \eqref{eq:DegenerateGPloglikelihood} is only $O(NM^2 + M^3)$. 94 | 95 | % XXX: the loss function for the 96 | It can be seen that a GP model can be constructed from a finite feature map. Since neural networks (NNs) can provide effective feature representations, it is natural to use a neural network as the feature map $\phi$. In \cite{lazaro2010marginalized}, a neural network with one hidden layer is proposed as the feature map $\phi(\bm{x})$. The weights of the neural network are obtained by maximizing the likelihood function in \eqref{eq:DegenerateGPloglikelihood} with gradient back-propagation. In \cite{huang2015scalable}, the single hidden layer is extended to multiple layers. The Gaussian process with a kernel characterized by a neural network is illustrated in \Fref{fig:NNGP}. A similar work is \cite{snoek2015scalable}, where a neural network is first pre-trained and Bayesian linear regression is then applied to the last layer; the resulting model is used for Bayesian optimization. 97 | 98 | \subsection{Model Averaging to Improve the Quality of Uncertainty Prediction}\label{sec:deepensemble} 99 | 100 | When probabilistic models are built from neural networks, a simple model averaging technique \cite{lazaro2010marginalized, huang2015scalable, lakshminarayanan2017simple} can significantly improve the quality of the estimated uncertainty. 101 | 102 | Firstly, $K$ independent probabilistic neural network models are trained with random initializations. 
Each model would give predictive distribution $p(y | \bm{x}, \bm{\theta}_k) = N(\mu_k(\bm{x}), \sigma_k^2(\bm{x}))$ where $\bm{\theta}_k$ is the neural network parameters for the $k$-th model, $\mu_k(\bm{x})$ and $\sigma_k^2(\bm{x})$ are the mean and variance of the corresponding predictive Gaussian distribution. The final predictive distribution can be expressed as $p(y | \bm{x}) = N(\mu(\bm{x}), \sigma^2(\bm{x}))$, where 103 | \begin{equation} 104 | \left\{ 105 | \begin{array}{lll} 106 | \mu(\bm{x}) &=& \frac{1}{K} \sum_k \mu_k(\bm{x}) \\ 107 | \sigma^2(\bm{x}) &=& \frac{1}{K} \sum_k (\mu_k^2(\bm{x}) + \sigma_k^2(\bm{x})) - \mu^2(\bm{x}) 108 | \end{array}. 109 | \right. 110 | \label{eq:deepensemble} 111 | \end{equation} 112 | 113 | In \cite{lazaro2010marginalized, huang2015scalable}, the uncertainty is obtained according to \eqref{eq:DegeneratePred}, while in \cite{lakshminarayanan2017simple}, the uncertainty is generated by adversarial training. It is shown that the ensemble technique defined in \eqref{eq:deepensemble} can greatly improve the quality of the uncertainty measure. In \cite{lakshminarayanan2017simple}, with $K = 5$, the model-averaging significantly improves the quality of the uncertainty estimation and outperforms Bayesian-based models like probabilistic backpropagation \cite{hernandez2015probabilistic} and MC-dropout \cite{gal2016dropout}. 114 | -------------------------------------------------------------------------------- /paper/conclusion.tex: -------------------------------------------------------------------------------- 1 | \section{Conclusion}\label{sec:conclusion} 2 | 3 | In this paper, a multi-output Gaussian process regression model is proposed. The reduced-rank Gaussian processes are constructed from neural network feature maps. The correlations between tasks are represented by the shared layers of the neural network. The experimental results show that our proposed model outperforms independent GP model and other state-of-the-art multi-output GP models. 4 | 5 | -------------------------------------------------------------------------------- /paper/experiments.tex: -------------------------------------------------------------------------------- 1 | \section{Experimental Results}\label{sec:experiments} 2 | 3 | % TODO: report training time 4 | 5 | We implemented the multi-output Gaussian process model using Python 3.5.2. The gradient of the loss function in \eqref{eq:mo_likelihood} is calculated by \texttt{autograd} package \cite{maclaurin2015autograd}. We compared our model with independent GP and several state-of-the-art multi-output GP models on three datasets. The datasets include two publicly available datasets and one dataset sampled from a real-world analog integrated circuit, as summarized in \Fref{tab:datasets}. All the datasets and the test code are provided in the supplementary materials and will be made public upon publication. 6 | 7 | \begin{table}[!htb] 8 | \centering 9 | \caption{Summary of the used datasets} 10 | \label{tab:datasets} 11 | \begin{tabular}{lllll} 12 | \toprule 13 | Dataset & \# inputs & \# outputs & \# training & \# testing \\ \midrule 14 | ENB & 8 & 2 & 700 & 68 \\ 15 | SARCOS & 21 & 2 & 44484 & 4449 \\ 16 | OpAmp & 10 & 15 & 2000 & 8000 \\ 17 | \bottomrule 18 | \end{tabular} 19 | \end{table} 20 | We use standardized mean squared error (SMSE) and negative log likelihood (NLL) as the evaluation criteria. For all the test cases and the compared models, 10 independent runs were performed to average the random fluctuations. 
We report both the means and standard deviations of the SMSE and NLL. 21 | 22 | \subsection{The Energy Building (ENB) Dataset}\label{sec:enb} 23 | 24 | The ENB dataset is a small dataset with 768 samples; each sample has 8 inputs and 2 outputs. We use 700 samples as the training data, and the remaining 68 samples are used for testing. The dataset comes from simulations of 768 buildings \cite{spyromitros2016multi, tsanas2012accurate}. The 8 inputs are building parameters like surface area and orientation, while the 2 outputs are the heating load and the cooling load\footnote{The dataset is available at http://mulan.sourceforge.net/datasets-mtr.html}. 25 | 26 | For our MTNN-GP model, we use a neural network with 2 shared layers and 1 task-specific layer per output, and $K$ is set to 5. Each layer has 100 hidden units with the tanh activation function. For this architecture, we need to learn more than 30 thousand parameters using only 700 samples. 27 | 28 | The MTNN-GP model is compared with independent GP modeling (IGP) using the \texttt{GPML} package \cite{rasmussen2010gaussian} and 4 multi-output Gaussian process models, including the collaborative multi-output Gaussian processes (COGP)\footnote{downloaded from https://github.com/trungngv/cogp} proposed in \cite{nguyen2014collaborative}, the sparse convolved Gaussian processes (SCGP)\footnote{downloaded from https://github.com/SheffieldML/multigp} method proposed in \cite{alvarez2009sparse}, and the Gaussian process regression network with nonparametric variational inference and mean-field inference methods\footnote{downloaded from https://github.com/trungngv/gprn} (GPRN-NPV and GPRN-MF) \cite{nguyen2013efficient}. For the independent GP (IGP), the ARD squared-exponential kernel function is used. For the other methods, we use the default configurations of the corresponding open-source packages, except that we used 200 inducing points for the COGP model instead of the default 500 inducing points. 29 | 30 | The SMSE and NLL statistics are given in \Fref{tab:result_enb}. The GPRN-NPV and GPRN-MF models give no predictive variances, so only the SMSE statistics are reported. We can see that although more than 30 thousand parameters are learnt from only 700 samples, the learnt model gives very good predictions on the test set. Our MTNN-GP is better than the independent GP models, while the other multi-output GP models all gave results worse than the IGP models. 
31 | 32 | \begin{table}[!htb] 33 | \centering 34 | \caption{The SMSE and NLL statistics of the ENB dataset} 35 | \label{tab:result_enb} 36 | \begin{tabular}{lllll} 37 | \toprule 38 | Algo & Output1(SMSE) & Output2(SMSE) & Output1(NLL) & Output2(NLL) \\ \midrule 39 | MTNN-GP & \textbf{0.00155 $\pm$ 0.000159} & \textbf{0.00753 $\pm$ 0.00135} & \textbf{0.332 $\pm$ 0.0634} & \textbf{0.972 $\pm$ 0.107} \\ 40 | IGP & 0.00188 $\pm$ 0 & 0.00911 $\pm$ 0 & 0.538 $\pm$ 0 & 1.01 $\pm$ 0 \\ 41 | GOGP & 0.00597 $\pm$ 0.00088 & 0.0144 $\pm$ 0.000831 & 1.34 $\pm$ 0.159 & 2.08 $\pm$ 0.212 \\ 42 | SCGP & 0.708 $\pm$ 3.78e-05 & 1.21 $\pm$ 4.1e-05 & 1.56 $\pm$ 0.00363 & 1.66 $\pm$ 0.00063 \\ 43 | GPRN-NPV & 6.87 $\pm$ 9.36e-16 & 8.63 $\pm$ 1.87e-15 & NA & NA \\ 44 | GPRN-MF & 0.359 $\pm$ 0.225 & 0.614 $\pm$ 0.339 & NA & NA \\ 45 | \bottomrule 46 | \end{tabular} 47 | \end{table} 48 | 49 | \subsection{The SARCOS dataset}\label{sec:sarcos} 50 | 51 | \begin{table}[!htb] 52 | \centering 53 | \caption{The SMSE and NLL statistics of the SARCOS dataset} 54 | \label{tab:result_sarcos} 55 | \begin{tabular}{lllll} 56 | \toprule 57 | Algo & Output1(SMSE) & Output2(SMSE) & Output1(NLL) & Output2(NLL) \\ \midrule 58 | MTNN-GP & \textbf{0.00156 \(\pm\) 3.46e-05} & \textbf{0.00307 \(\pm\) 5.64e-05} & \textbf{0.804 \(\pm\) 0.0111} & \textbf{-0.509 \(\pm\) 0.00813} \\ 59 | IGP & 0.0045 \(\pm\) 0.000153 & 0.00787 \(\pm\) 0.000257 & 1.06 \(\pm\) 0.00941 & -0.236 \(\pm\) 0.0124 \\ 60 | GOGP & 0.00852 \(\pm\) 0.000241 & 0.0149 \(\pm\) 0.000433 & 2.48 \(\pm\) 0.0577 & 2.4 \(\pm\) 0.097 \\ 61 | SCGP & 4.7 \(\pm\) 0.0245 & 3.39 \(\pm\) 0.0192 & 4.63 \(\pm\) 0.0128 & 2.87 \(\pm\) 0.011 \\ 62 | GPRN-NPV & 4.99 \(\pm\) 0 & 3.58 \(\pm\) 0 & NA & NA \\ 63 | GPRN-MF & 2.21 \(\pm\) 2.06 & 1.65 \(\pm\) 1.51 & NA & NA \\ \midrule 64 | COGP \cite{nguyen2014collaborative} & 0.2631 & 0.0127 & 3.6 & 0.8302 \\ 65 | GPRN-AVI \cite{NIPS2015_5665} & $\approx$ 0.009 & > 0.009 & NA & NA \\ 66 | \bottomrule 67 | \end{tabular} 68 | \end{table} 69 | 70 | We use the SARCOS dataset\footnote{available at http://www.gaussianprocess.org/gpml/data/} to test the scalability of our model for large and high dimensional dataset. The dataset comes from a robot inverse dynamic model. The whole dataset contains 48933 samples. Each sample has 21 inputs (the joint positions, joint velocities, and joint accelerations) and 7 targets (7 joint torques). 44484 samples are used as the training set, and the remaining 4449 samples are used as the testing data. We select the 4-th and 7-th torques as the two outputs, which is the same setting as \cite{nguyen2014collaborative} and \cite{NIPS2015_5665}. We also compared our experimental results with the results reported by \cite{nguyen2014collaborative,NIPS2015_5665}. 71 | 72 | % For our MTNN-GP model, we used two layers with 100 hidden units as the shared layers, two layers with 100 hidden units are use 73 | For our MTNN-GP model, we use the same setting as we used for the ENB dataset. For the independent GP model, we used the FITC approximation with 200 inducing points to speedup the training. As the SCGP, GPRN-NPV and GPRN-MF cannot scale to such large dataset, in each run, we randomly select 1000 points from the training set to train the two GPRN models and 2000 points to train the SCGP model. 74 | % For this dataset, the typical training time is about two hours for independent GP, four hours for the MTNN-GP, COGP models and the SCGP model with 2000 training set. 
However, the two GPRN models with only 1000 training points need more than one day to finish the training. 75 | 76 | The SMSE and NLL results are listed in \Fref{tab:result_sarcos}. It can be clearly seen that our MTNN-GP gave better predictions than the independent GP models, and that the IGP model outperformed all the other multi-output Gaussian process models. 77 | 78 | Note that the data of the last two rows in \Fref{tab:result_sarcos} are not from experiments we performed, but from the publicly reported results of two multi-output Gaussian process models \cite{nguyen2014collaborative, NIPS2015_5665}. We show that simply using an independent GP model with the FITC approximation can give considerably better performance than the reported results. 79 | 80 | The reported results of the COGP model differ from our experimental results due to the different settings. In \cite{nguyen2014collaborative}, only 2000 training points were used for the first output, while we used all 44484 training samples for both outputs. Also, we used 200 inducing points (the same as for the independent GP), while in \cite{nguyen2014collaborative}, 500 inducing points were used. The COGP results from our experiments gave better predictions for the first output but worse predictions for the second output. 81 | 82 | 83 | \subsection{Behaviour Modeling of Operational Amplifier}\label{sec:dac14} 84 | 85 | % \begin{figure}[!htb] 86 | % \centering 87 | % \includegraphics[width=200pt]{./img/sopam.pdf} 88 | % \caption{The schematic of the operational amplifier} 89 | % \label{fig:sopamp} 90 | % \end{figure} 91 | 92 | The last dataset we used is the simulation data of a real-world analog integrated circuit. The circuit is an operational amplifier (OpAmp) with 10 design variables. We randomly sampled the 10 design variables and used the commercial circuit simulator HSPICE to obtain the performances of the circuit. We considered the gain, the unity gain frequency (UGF) and the phase margin (PM) of the OpAmp over 5 design corners, so up to 15 performances are considered. We gathered 10000 samples, used 2000 points as the training set, and used the remaining 8000 data points for testing. The dataset is provided in the supplementary materials. 93 | 94 | 95 | The IGP, COGP, SCGP, GPRN-NPV and GPRN-MF models are compared. The algorithm settings for our MTNN-GP and the compared models are the same as those described in \Fref{sec:sarcos}. 96 | 97 | 98 | The SMSE statistics are listed in \Fref{tab:smse_DAC} and the NLL statistics are given in \Fref{tab:nll_DAC}. Like the previous two datasets, the MTNN-GP model gives far better predictions than the IGP and the other multi-output GP models. The SMSE of the COGP model is slightly better than that of the IGP model, but the NLL results are worse. The SCGP and GPRN-NPV models completely underfit the dataset; they predict everything to zero. Although the SMSE values of COGP are better than those of IGP, and SCGP only gives constant predictions, we found in \Fref{tab:nll_DAC} that the NLL of COGP is much worse than that of SCGP. 
99 | 100 | \begin{table}[!htb] 101 | \centering 102 | \caption{The SMSE statistics of the OpAmp dataset} 103 | \label{tab:smse_DAC} 104 | \begin{tabular}{lllllll} 105 | \toprule 106 | Output & MTNN-GP & IGP & COGP & SCGP & GPRN-NPV & GPRN-MF \\ \midrule 107 | O1 & \textbf{6.5e-4 $\pm$ 3.5e-5} & 0.20 $\pm$ 0.0019 & 0.19 $\pm$ 0.0044 & 1 $\pm$ 0 & 1 $\pm$ 0 & 0.79 $\pm$ 0.039 \\ 108 | O2 & \textbf{6.2e-4 $\pm$ 3.9e-5} & 0.18 $\pm$ 0.0088 & 0.15 $\pm$ 0.0033 & 1 $\pm$ 0 & 1 $\pm$ 0 & 0.73 $\pm$ 0.049 \\ 109 | O3 & \textbf{5.4e-4 $\pm$ 5.4e-5} & 0.08 $\pm$ 0.003 & 0.08 $\pm$ 0.002 & 1 $\pm$ 0 & 1 $\pm$ 0 & 0.70 $\pm$ 0.045 \\ 110 | O4 & \textbf{2.4e-4 $\pm$ 2.0e-5} & 0.22 $\pm$ 0.0015 & 0.05 $\pm$ 0.00099 & 1 $\pm$ 0 & 1 $\pm$ 0 & 0.59 $\pm$ 0.065 \\ 111 | O5 & \textbf{2.9e-4 $\pm$ 2.2e-5} & 0.40 $\pm$ 0.0074 & 0.09 $\pm$ 0.00037 & 1 $\pm$ 0 & 1 $\pm$ 0 & 0.64 $\pm$ 0.068 \\ 112 | O6 & \textbf{2.2e-4 $\pm$ 1.8e-5} & 0.23 $\pm$ 0.0025 & 0.05 $\pm$ 0.00073 & 1 $\pm$ 0 & 1 $\pm$ 0 & 0.61 $\pm$ 0.066 \\ 113 | O7 & \textbf{1.3e-3 $\pm$ 1.0e-4} & 0.62 $\pm$ 0.019 & 0.22 $\pm$ 0.0033 & 1 $\pm$ 0 & 1 $\pm$ 0 & 0.75 $\pm$ 0.043 \\ 114 | O8 & \textbf{1.1e-3 $\pm$ 1.7e-4} & 0.44 $\pm$ 0.0059 & 0.15 $\pm$ 0.0021 & 1 $\pm$ 0 & 1 $\pm$ 0 & 0.71 $\pm$ 0.042 \\ 115 | O9 & \textbf{1.1e-3 $\pm$ 9.6e-5} & 0.12 $\pm$ 0.0021 & 0.07 $\pm$ 0.0019 & 1 $\pm$ 0 & 1 $\pm$ 0 & 0.68 $\pm$ 0.037 \\ 116 | O10 & \textbf{1.2e-3 $\pm$ 1.1e-4} & 0.40 $\pm$ 0.11 & 0.25 $\pm$ 0.0045 & 1 $\pm$ 0 & 1 $\pm$ 0 & 0.82 $\pm$ 0.044 \\ 117 | O11 & \textbf{1.1e-3 $\pm$ 9.5e-5} & 0.49 $\pm$ 0.01 & 0.18 $\pm$ 0.0027 & 1 $\pm$ 0 & 1 $\pm$ 0 & 0.75 $\pm$ 0.046 \\ 118 | O12 & \textbf{1.0e-3 $\pm$ 6.0e-5} & 0.40 $\pm$ 0.071 & 0.10 $\pm$ 0.0015 & 1 $\pm$ 0 & 1 $\pm$ 0 & 0.70 $\pm$ 0.043 \\ 119 | O13 & \textbf{3.3e-3 $\pm$ 2.7e-5} & 0.09 $\pm$ 0.0029 & 0.09 $\pm$ 0.00093 & 1 $\pm$ 0 & 1 $\pm$ 0 & 0.71 $\pm$ 0.061 \\ 120 | O14 & \textbf{3.7e-3 $\pm$ 3.1e-5} & 0.13 $\pm$ 0.0022 & 0.11 $\pm$ 0.0024 & 1 $\pm$ 0 & 1 $\pm$ 0 & 0.74 $\pm$ 0.053 \\ 121 | O15 & \textbf{2.8e-3 $\pm$ 1.4e-5} & 0.06 $\pm$ 0.00051 & 0.06 $\pm$ 0.00099 & 1 $\pm$ 0 & 1 $\pm$ 0 & 0.68 $\pm$ 0.059 \\ 122 | \bottomrule 123 | \end{tabular} 124 | \end{table} 125 | 126 | \begin{table}[!htb] 127 | \centering 128 | \caption{The NLL statistics of the OpAmp dataset} 129 | \label{tab:nll_DAC} 130 | \begin{tabular}{lllllll} 131 | \toprule 132 | Output & MTNN-GP & IGP & COGP & SCGP & GPRN-NPV & GPRN-MF \\ \midrule 133 | O1 & \textbf{-2.3 $\pm$ 0.036} & 0.43 $\pm$ 0.0058 & 4.7 $\pm$ 0.22 & 1.8 $\pm$ 0.17 & NA & NA \\ 134 | O2 & \textbf{-2.5 $\pm$ 0.024} & 0.18 $\pm$ 0.011 & 3.2 $\pm$ 0.13 & 1.9 $\pm$ 0.15 & NA & NA \\ 135 | O3 & \textbf{-2.4 $\pm$ 0.032} & -0.1 $\pm$ 0.0035 & 3.2 $\pm$ 0.14 & 1.6 $\pm$ 0.11 & NA & NA \\ 136 | O4 & \textbf{-3 $\pm$ 0.04 } & -1.1 $\pm$ 0.0084 & 2 $\pm$ 0.068 & 1.5 $\pm$ 0.16 & NA & NA \\ 137 | O5 & \textbf{-3 $\pm$ 0.043} & -1.1 $\pm$ 0.029 & 2.1 $\pm$ 0.17 & 1.5 $\pm$ 0.11 & NA & NA \\ 138 | O6 & \textbf{-3 $\pm$ 0.035} & -0.95 $\pm$ 0.014 & 1.8 $\pm$ 0.11 & 1.7 $\pm$ 0.17 & NA & NA \\ 139 | O7 & \textbf{-2.1 $\pm$ 0.051} & -0.73 $\pm$ 0.01 & 5.7 $\pm$ 0.35 & 1.5 $\pm$ 0.049 & NA & NA \\ 140 | O8 & \textbf{-2.3 $\pm$ 0.057} & -1 $\pm$ 0.056 & 4.9 $\pm$ 0.27 & 1.5 $\pm$ 0.038 & NA & NA \\ 141 | O9 & \textbf{-2.2 $\pm$ 0.031} & -1 $\pm$ 0.016 & 2.5 $\pm$ 0.19 & 1.6 $\pm$ 0.1 & NA & NA \\ 142 | O10 & \textbf{-2 $\pm$ 0.059} & 0.33 $\pm$ 0.062 & 5.1 $\pm$ 0.53 & 1.8 $\pm$ 0.11 & NA & NA \\ 143 | O11 & \textbf{-2.2 $\pm$ 0.064} & -0.046 $\pm$ 0.015 & 4 $\pm$ 0.095 
& 1.8 $\pm$ 0.099 & NA & NA \\ 144 | O12 & \textbf{-2.1 $\pm$ 0.032} & -0.27 $\pm$ 0.016 & 3.6 $\pm$ 0.28 & 1.7 $\pm$ 0.069 & NA & NA \\ 145 | O13 & \textbf{-2.7 $\pm$ 0.032} & 0.051 $\pm$ 0.0075 & 2.7 $\pm$ 0.14 & 1.6 $\pm$ 0.17 & NA & NA \\ 146 | O14 & \textbf{-2.7 $\pm$ 0.034} & -0.046 $\pm$ 0.0025 & 2.5 $\pm$ 0.25 & 1.7 $\pm$ 0.22 & NA & NA \\ 147 | O15 & \textbf{-2.7 $\pm$ 0.03 } & -0.2 $\pm$ 0.0036 & 2.5 $\pm$ 0.2 & 1.5 $\pm$ 0.06 & NA & NA \\ 148 | \bottomrule 149 | \end{tabular} 150 | \end{table} 151 | 152 | % XXX: GPRN-NPV might be a good model, but it is poorly implemented, we set max_iter = 100, but it often pre-converge after tens of iterations 153 | -------------------------------------------------------------------------------- /paper/img/NN-GP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Alaya-in-Matrix/NeuralLinear/bc0209e1e8c24756a3777a505b8655c93009df4a/paper/img/NN-GP.png -------------------------------------------------------------------------------- /paper/img/NN-MOGP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Alaya-in-Matrix/NeuralLinear/bc0209e1e8c24756a3777a505b8655c93009df4a/paper/img/NN-MOGP.png -------------------------------------------------------------------------------- /paper/img/nn1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Alaya-in-Matrix/NeuralLinear/bc0209e1e8c24756a3777a505b8655c93009df4a/paper/img/nn1.pdf -------------------------------------------------------------------------------- /paper/img/nn2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Alaya-in-Matrix/NeuralLinear/bc0209e1e8c24756a3777a505b8655c93009df4a/paper/img/nn2.pdf -------------------------------------------------------------------------------- /paper/introduction.tex: -------------------------------------------------------------------------------- 1 | \section{Introduction} 2 | 3 | The Gaussian process (GP) regression is popular as the model provides both predictions and well-calibrated uncertainties. The uncertainty estimation makes the model more robust to unseen events as the model \emph{knows what it knows}. The conventional GP models are usually designed to learn a scalar-valued function. However, in some scenarios, we need to model a vector-valued function, and the multiple outputs are possibly correlated. Instead of treating the vector-valued function as multiple separate scalar-valued functions, the multi-output learning \cite{zhang2017survey} tries to build a unified model and simultaneously learn all the outputs. The overall performance could be enhanced by exploiting the correlations of the tasks. 4 | 5 | % Existing methods: convolution based, krok based, GPRN. 6 | Multi-task Gaussian process \cite{vectorvaluedkernel} tries to combine multi-task learning and Gaussian process. The linear model of coregionalization (LMC) methods \cite{journel1978mining} assume the $Q$ outputs $f_i(\bm{x}), i \in \{1\dots Q\}$ are linear combinations of several latent functions as $f_i(\bm{x}) = \sum_{j=1}^U a_{ij} u_j(\bm{x})$. In \cite{bonilla2008multi}, the covariance matrix is expressed as the Kronecker product of the task covariance and the task-irrelevant input covariance matrix. 
In \cite{nguyen2014collaborative}, the output function $f_i(\bm{x})$ is expressed as a linear combination of $U$ latent functions $u_j(\bm{x}), j \in \{1\dots U\}$ plus a task-specific function $h_i(\bm{x})$. Efficient training and inference methods are also developed. In the Gaussian process regression network (GPRN) model~\cite{wilson2012gaussian}, the $i$-th output function $f_i(\bm{x})$ is also expressed as a weighted combination of $U$ latent functions; however, the weights are themselves nonlinear functions characterized by GPs, so that $f_i(\bm{x}) = \sum_{j=1}^U w_{ij}(\bm{x}) (u_j(\bm{x}) + \epsilon_u) + \epsilon_f$ where $\epsilon_u$ and $\epsilon_f$ are the noise terms. Another methodology for building multi-task kernels is \emph{process convolution} \cite{boyle2005dependent,alvarez2009sparse,alvarez2011computationally}, where the output function $f_i(\bm{x})$ is assumed to be the convolution of an output-dependent smoothing kernel $g_i(\bm{x})$ and a latent function $u(\bm{x})$. 7 | 8 | In this paper, we propose a multi-output Gaussian process model based on a multi-task neural network. Since a degenerate GP can be built from a finite feature map and neural networks can generate representative features, a degenerate GP can be derived from a neural network with finitely many hidden units \cite{lazaro2010marginalized, huang2015scalable}. In our proposed model, the multi-output GP is built from a neural network with shared layers and task-specific layers. The input data is first transformed into \emph{shared features} through the shared layers and further mapped to \emph{task-specific features} by each task's task-specific layers. GP kernels for the outputs are then built from the inner products of the task-specific features. The weights of the multi-task neural network are obtained by maximizing the likelihood with gradient back-propagation. 9 | 10 | We compared our model with the independent GP model and several multi-output GP models. Three datasets were used, including two public datasets and one dataset from the simulation results of a real-world analog integrated circuit. It is demonstrated that our model can provide better predictions and uncertainty estimations than the compared models. 11 | 12 | The rest of the paper is organized as follows. In \Fref{sec:Background}, we present the background of Gaussian process regression and of GP models built from neural networks with finitely many hidden units. In \Fref{sec:mogp}, we present our proposed multi-output GP model via a multi-task neural network. The experimental results are given in \Fref{sec:experiments}. We conclude the paper in \Fref{sec:conclusion}. 
13 | % XXX: 14 | % advantage: 15 | % the correlation modeling is more flexibility 16 | % efficient 17 | 18 | % XXX: 19 | % Kronecker product: the covaiance matrix is decomposed as the Kronecker product of covaiance of task and covaiance of input 20 | % GPRN: $\bm{f}(x) = W(\bm{x})^T \bm{lf}(\bm{x})$, where $\bm{x}$ and $W(\bm{x})$ are GP 21 | % convolution: 22 | -------------------------------------------------------------------------------- /paper/makefile: -------------------------------------------------------------------------------- 1 | mainfile = paper 2 | PDF = ${mainfile}.pdf 3 | 4 | all : ${PDF} 5 | 6 | ${mainfile}.pdf: *.tex ref.bib 7 | 	pdflatex ${mainfile}.tex 8 | 	- bibtex ${mainfile} 9 | 	pdflatex ${mainfile}.tex 10 | 	pdflatex ${mainfile}.tex 11 | 12 | tidy: 13 | 	rm -vf *.log *.aux \ 14 | 	*.cfg *.glo *.idx *.toc \ 15 | 	*.ilg *.ind *.out *.lof \ 16 | 	*.lot *.bbl *.blg *.gls *.cut *.hd \ 17 | 	*.dvi *.ps *.thm *.tgz *.zip *.rpi 18 | 19 | 20 | clean: tidy 21 | 	rm -f ${PDF} 22 | -------------------------------------------------------------------------------- /paper/mogp.tex: -------------------------------------------------------------------------------- 1 | \section{Multi-output Gaussian Process Regression via Neural Network}\label{sec:mogp} 2 | 3 | \begin{figure}[!htb] 4 | \centering 5 | \includegraphics[width=\columnwidth]{./img/nn2.pdf} 6 | \caption{Architecture of the multi-output Gaussian process model.} 7 | \label{fig:MONNGP} 8 | \end{figure} 9 | 10 | We have shown that a GP can be constructed from a neural network with finitely many hidden units in the last layer. Now, we show how to model the correlations between tasks and how to construct a multi-output Gaussian process model based on a neural network. 11 | 12 | Suppose we have $N$ observations $D_Q = \{X, Y | X \in R^{D \times N}, Y \in R^{N \times Q}\}$. We assume they are generated by $Q$ latent functions and corrupted by noise with different noise levels. Instead of building $Q$ independent models, we build a multi-output model that makes use of the correlations between the $Q$ tasks. Firstly, a neural network is used to define a \emph{shared feature map} $\phi_s : R^D \rightarrow R^{M_s}$. Then, for each task $i \in \{1,~\dots~,Q\}$, the shared feature $\phi_s(\bm{x})$ is followed by a \emph{task-specific} neural network that defines a feature map $\phi_i : R^{M_s} \rightarrow R^{M_i}$. We assume the $i$-th latent function $f_i(\bm{x})$ follows a GP distribution $f_i \sim \mathcal{GP}(0, k_i)$ defined as follows: 13 | 14 | \begin{equation} 15 | \label{eq:mo_kernel} 16 | \left\{ 17 | \begin{array}{lll} 18 | k_i(\bm{x}, \bm{y}) &=& \phi_i(\phi_s(\bm{x}))^T~\frac{\sigma_{p, i}^2}{M_i}~\phi_i(\phi_s(\bm{y})) \\ 19 | f_i &\sim& \mathcal{GP}(0, k_i) \\ 20 | y_i &\sim& N(f_i(\bm{x}), \sigma_{n, i}^2) 21 | \end{array}. 22 | \right. 23 | \end{equation} 24 | 25 | As illustrated in \Fref{fig:MONNGP}, by defining a GP with the kernel function of \eqref{eq:mo_kernel}, we effectively define a neural network architecture with shared layers and task-specific layers, which is a common architecture used in multi-task deep learning \cite{ruder2017overview}. The correlations between tasks are naturally encoded by the shared layers, while the task-specific features are further learnt from the shared features. 26 | 27 | With the model defined in \eqref{eq:mo_kernel}, different tasks are \emph{conditionally independent} given the shared features. Each specific task still sees a neural network with the same architecture as the one plotted in \Fref{fig:NNGP}. 
The inferences of $\mu(\bm{x})$ and $\sigma^2(\bm{x})$ are exactly the same as \eqref{eq:DegeneratePred}, and no additional overhead will be introduced. As different tasks are conditionally independent, the log likelihood of the training data can be expressed as the sum of the log likelihood of each specific task 28 | \begin{equation} 29 | \label{eq:mo_likelihood} 30 | \log p(Y | X, \bm{\Theta}) = \sum_{i=1}^Q \log p(\bm{y}_i | X, \bm{\theta}_i, \bm{\theta}_s), 31 | \end{equation} 32 | where $\log p(\bm{y}_i | X, \bm{\theta}_i, \bm{\theta}_s)$ is the log likelihood of the $i$-th task as defined in \eqref{eq:DegenerateGPloglikelihood}, $\bm{y}_i$ is the $i$-the column of $Y$, $\bm{\Theta}$ is the vector of all the parameters, including the shared parameters and the task-specific parameters. $\bm{\theta_i}$ is the vector of parameters for specific task $i$, including the weights of the $i$-th task-specific neural network, the weight prior factor $\sigma_{p, i}$ and the noise level $\sigma_{n, i}$ for the $i$-th task. $\bm{\theta}_s$ is a vector of the weights of the shared layers. The parameters $\bm{\Theta}$ are obtained by maximizing the likelihood function in \eqref{eq:mo_likelihood} with gradient back-propagation. The model averaging technique as described in subsection \ref{sec:deepensemble} is also employed in our model to improve the quality of uncertainty estimation. $K$ independent neural network models are trained with random initializations in our model. According to \eqref{eq:DegenerateGPloglikelihood} and \eqref{eq:mo_likelihood}, the time complexity of training for our model is $O(KN\sum_i^Q M_i^2)$. 33 | -------------------------------------------------------------------------------- /paper/nips_2018.sty: -------------------------------------------------------------------------------- 1 | % partial rewrite of the LaTeX2e package for submissions to the 2 | % Conference on Neural Information Processing Systems (NIPS): 3 | % 4 | % - uses more LaTeX conventions 5 | % - line numbers at submission time replaced with aligned numbers from 6 | % lineno package 7 | % - \nipsfinalcopy replaced with [final] package option 8 | % - automatically loads times package for authors 9 | % - loads natbib automatically; this can be suppressed with the 10 | % [nonatbib] package option 11 | % - adds foot line to first page identifying the conference 12 | % - adds preprint option for submission to e.g. 
arXiv 13 | % 14 | % Roman Garnett (garnett@wustl.edu) and the many authors of 15 | % nips15submit_e.sty, including MK and drstrip@sandia 16 | % 17 | % last revision: March 2018 18 | 19 | \NeedsTeXFormat{LaTeX2e} 20 | \ProvidesPackage{nips_2018}[2018/03/03 NIPS 2018 submission/camera-ready style file] 21 | 22 | % declare final option, which creates camera-ready copy 23 | \newif\if@nipsfinal\@nipsfinalfalse 24 | \DeclareOption{final}{ 25 | \@nipsfinaltrue 26 | } 27 | 28 | % declare nonatbib option, which does not load natbib in case of 29 | % package clash (users can pass options to natbib via 30 | % \PassOptionsToPackage) 31 | \newif\if@natbib\@natbibtrue 32 | \DeclareOption{nonatbib}{ 33 | \@natbibfalse 34 | } 35 | 36 | % declare preprint option, which creates a preprint version ready for 37 | % upload to, e.g., arXiv 38 | \newif\if@preprint\@preprintfalse 39 | \DeclareOption{preprint}{ 40 | \@preprinttrue 41 | } 42 | 43 | \ProcessOptions\relax 44 | 45 | % determine whether this is an anonymized submission 46 | \newif\if@submission\@submissiontrue 47 | \if@nipsfinal\@submissionfalse\fi 48 | \if@preprint\@submissionfalse\fi 49 | 50 | % fonts 51 | \renewcommand{\rmdefault}{ptm} 52 | \renewcommand{\sfdefault}{phv} 53 | 54 | % change this every year for notice string at bottom 55 | \newcommand{\@nipsordinal}{32nd} 56 | \newcommand{\@nipsyear}{2018} 57 | \newcommand{\@nipslocation}{Montr\'{e}al, Canada} 58 | 59 | % handle tweaks for camera-ready copy vs. submission copy 60 | \if@preprint 61 | \newcommand{\@noticestring}{% 62 | Preprint. Work in progress.% 63 | } 64 | \else 65 | \if@nipsfinal 66 | \newcommand{\@noticestring}{% 67 | \@nipsordinal\/ Conference on Neural Information Processing Systems 68 | (NIPS \@nipsyear), \@nipslocation.% 69 | } 70 | \else 71 | \newcommand{\@noticestring}{% 72 | Submitted to \@nipsordinal\/ Conference on Neural Information 73 | Processing Systems (NIPS \@nipsyear). 
Do not distribute.% 74 | } 75 | 76 | % line numbers for submission 77 | \RequirePackage{lineno} 78 | \linenumbers 79 | 80 | % fix incompatibilities between lineno and amsmath, if required, by 81 | % transparently wrapping linenomath environments around amsmath 82 | % environments 83 | \AtBeginDocument{% 84 | \@ifpackageloaded{amsmath}{% 85 | \newcommand*\patchAmsMathEnvironmentForLineno[1]{% 86 | \expandafter\let\csname old#1\expandafter\endcsname\csname #1\endcsname 87 | \expandafter\let\csname oldend#1\expandafter\endcsname\csname end#1\endcsname 88 | \renewenvironment{#1}% 89 | {\linenomath\csname old#1\endcsname}% 90 | {\csname oldend#1\endcsname\endlinenomath}% 91 | }% 92 | \newcommand*\patchBothAmsMathEnvironmentsForLineno[1]{% 93 | \patchAmsMathEnvironmentForLineno{#1}% 94 | \patchAmsMathEnvironmentForLineno{#1*}% 95 | }% 96 | \patchBothAmsMathEnvironmentsForLineno{equation}% 97 | \patchBothAmsMathEnvironmentsForLineno{align}% 98 | \patchBothAmsMathEnvironmentsForLineno{flalign}% 99 | \patchBothAmsMathEnvironmentsForLineno{alignat}% 100 | \patchBothAmsMathEnvironmentsForLineno{gather}% 101 | \patchBothAmsMathEnvironmentsForLineno{multline}% 102 | }{} 103 | } 104 | \fi 105 | \fi 106 | 107 | % load natbib unless told otherwise 108 | \if@natbib 109 | \RequirePackage{natbib} 110 | \fi 111 | 112 | % set page geometry 113 | \usepackage[verbose=true,letterpaper]{geometry} 114 | \AtBeginDocument{ 115 | \newgeometry{ 116 | textheight=9in, 117 | textwidth=5.5in, 118 | top=1in, 119 | headheight=12pt, 120 | headsep=25pt, 121 | footskip=30pt 122 | } 123 | \@ifpackageloaded{fullpage} 124 | {\PackageWarning{nips_2018}{fullpage package not allowed! Overwriting formatting.}} 125 | {} 126 | } 127 | 128 | \widowpenalty=10000 129 | \clubpenalty=10000 130 | \flushbottom 131 | \sloppy 132 | 133 | % font sizes with reduced leading 134 | \renewcommand{\normalsize}{% 135 | \@setfontsize\normalsize\@xpt\@xipt 136 | \abovedisplayskip 7\p@ \@plus 2\p@ \@minus 5\p@ 137 | \abovedisplayshortskip \z@ \@plus 3\p@ 138 | \belowdisplayskip \abovedisplayskip 139 | \belowdisplayshortskip 4\p@ \@plus 3\p@ \@minus 3\p@ 140 | } 141 | \normalsize 142 | \renewcommand{\small}{% 143 | \@setfontsize\small\@ixpt\@xpt 144 | \abovedisplayskip 6\p@ \@plus 1.5\p@ \@minus 4\p@ 145 | \abovedisplayshortskip \z@ \@plus 2\p@ 146 | \belowdisplayskip \abovedisplayskip 147 | \belowdisplayshortskip 3\p@ \@plus 2\p@ \@minus 2\p@ 148 | } 149 | \renewcommand{\footnotesize}{\@setfontsize\footnotesize\@ixpt\@xpt} 150 | \renewcommand{\scriptsize}{\@setfontsize\scriptsize\@viipt\@viiipt} 151 | \renewcommand{\tiny}{\@setfontsize\tiny\@vipt\@viipt} 152 | \renewcommand{\large}{\@setfontsize\large\@xiipt{14}} 153 | \renewcommand{\Large}{\@setfontsize\Large\@xivpt{16}} 154 | \renewcommand{\LARGE}{\@setfontsize\LARGE\@xviipt{20}} 155 | \renewcommand{\huge}{\@setfontsize\huge\@xxpt{23}} 156 | \renewcommand{\Huge}{\@setfontsize\Huge\@xxvpt{28}} 157 | 158 | % sections with less space 159 | \providecommand{\section}{} 160 | \renewcommand{\section}{% 161 | \@startsection{section}{1}{\z@}% 162 | {-2.0ex \@plus -0.5ex \@minus -0.2ex}% 163 | { 1.5ex \@plus 0.3ex \@minus 0.2ex}% 164 | {\large\bf\raggedright}% 165 | } 166 | \providecommand{\subsection}{} 167 | \renewcommand{\subsection}{% 168 | \@startsection{subsection}{2}{\z@}% 169 | {-1.8ex \@plus -0.5ex \@minus -0.2ex}% 170 | { 0.8ex \@plus 0.2ex}% 171 | {\normalsize\bf\raggedright}% 172 | } 173 | \providecommand{\subsubsection}{} 174 | \renewcommand{\subsubsection}{% 175 | 
\@startsection{subsubsection}{3}{\z@}% 176 | {-1.5ex \@plus -0.5ex \@minus -0.2ex}% 177 | { 0.5ex \@plus 0.2ex}% 178 | {\normalsize\bf\raggedright}% 179 | } 180 | \providecommand{\paragraph}{} 181 | \renewcommand{\paragraph}{% 182 | \@startsection{paragraph}{4}{\z@}% 183 | {1.5ex \@plus 0.5ex \@minus 0.2ex}% 184 | {-1em}% 185 | {\normalsize\bf}% 186 | } 187 | \providecommand{\subparagraph}{} 188 | \renewcommand{\subparagraph}{% 189 | \@startsection{subparagraph}{5}{\z@}% 190 | {1.5ex \@plus 0.5ex \@minus 0.2ex}% 191 | {-1em}% 192 | {\normalsize\bf}% 193 | } 194 | \providecommand{\subsubsubsection}{} 195 | \renewcommand{\subsubsubsection}{% 196 | \vskip5pt{\noindent\normalsize\rm\raggedright}% 197 | } 198 | 199 | % float placement 200 | \renewcommand{\topfraction }{0.85} 201 | \renewcommand{\bottomfraction }{0.4} 202 | \renewcommand{\textfraction }{0.1} 203 | \renewcommand{\floatpagefraction}{0.7} 204 | 205 | \newlength{\@nipsabovecaptionskip}\setlength{\@nipsabovecaptionskip}{7\p@} 206 | \newlength{\@nipsbelowcaptionskip}\setlength{\@nipsbelowcaptionskip}{\z@} 207 | 208 | \setlength{\abovecaptionskip}{\@nipsabovecaptionskip} 209 | \setlength{\belowcaptionskip}{\@nipsbelowcaptionskip} 210 | 211 | % swap above/belowcaptionskip lengths for tables 212 | \renewenvironment{table} 213 | {\setlength{\abovecaptionskip}{\@nipsbelowcaptionskip}% 214 | \setlength{\belowcaptionskip}{\@nipsabovecaptionskip}% 215 | \@float{table}} 216 | {\end@float} 217 | 218 | % footnote formatting 219 | \setlength{\footnotesep }{6.65\p@} 220 | \setlength{\skip\footins}{9\p@ \@plus 4\p@ \@minus 2\p@} 221 | \renewcommand{\footnoterule}{\kern-3\p@ \hrule width 12pc \kern 2.6\p@} 222 | \setcounter{footnote}{0} 223 | 224 | % paragraph formatting 225 | \setlength{\parindent}{\z@} 226 | \setlength{\parskip }{5.5\p@} 227 | 228 | % list formatting 229 | \setlength{\topsep }{4\p@ \@plus 1\p@ \@minus 2\p@} 230 | \setlength{\partopsep }{1\p@ \@plus 0.5\p@ \@minus 0.5\p@} 231 | \setlength{\itemsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@} 232 | \setlength{\parsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@} 233 | \setlength{\leftmargin }{3pc} 234 | \setlength{\leftmargini }{\leftmargin} 235 | \setlength{\leftmarginii }{2em} 236 | \setlength{\leftmarginiii}{1.5em} 237 | \setlength{\leftmarginiv }{1.0em} 238 | \setlength{\leftmarginv }{0.5em} 239 | \def\@listi {\leftmargin\leftmargini} 240 | \def\@listii {\leftmargin\leftmarginii 241 | \labelwidth\leftmarginii 242 | \advance\labelwidth-\labelsep 243 | \topsep 2\p@ \@plus 1\p@ \@minus 0.5\p@ 244 | \parsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@ 245 | \itemsep \parsep} 246 | \def\@listiii{\leftmargin\leftmarginiii 247 | \labelwidth\leftmarginiii 248 | \advance\labelwidth-\labelsep 249 | \topsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@ 250 | \parsep \z@ 251 | \partopsep 0.5\p@ \@plus 0\p@ \@minus 0.5\p@ 252 | \itemsep \topsep} 253 | \def\@listiv {\leftmargin\leftmarginiv 254 | \labelwidth\leftmarginiv 255 | \advance\labelwidth-\labelsep} 256 | \def\@listv {\leftmargin\leftmarginv 257 | \labelwidth\leftmarginv 258 | \advance\labelwidth-\labelsep} 259 | \def\@listvi {\leftmargin\leftmarginvi 260 | \labelwidth\leftmarginvi 261 | \advance\labelwidth-\labelsep} 262 | 263 | % create title 264 | \providecommand{\maketitle}{} 265 | \renewcommand{\maketitle}{% 266 | \par 267 | \begingroup 268 | \renewcommand{\thefootnote}{\fnsymbol{footnote}} 269 | % for perfect author name centering 270 | \renewcommand{\@makefnmark}{\hbox to \z@{$^{\@thefnmark}$\hss}} 271 | % The footnote-mark was overlapping the footnote-text, 272 | % 
added the following to fix this problem (MK) 273 | \long\def\@makefntext##1{% 274 | \parindent 1em\noindent 275 | \hbox to 1.8em{\hss $\m@th ^{\@thefnmark}$}##1 276 | } 277 | \thispagestyle{empty} 278 | \@maketitle 279 | \@thanks 280 | \@notice 281 | \endgroup 282 | \let\maketitle\relax 283 | \let\thanks\relax 284 | } 285 | 286 | % rules for title box at top of first page 287 | \newcommand{\@toptitlebar}{ 288 | \hrule height 4\p@ 289 | \vskip 0.25in 290 | \vskip -\parskip% 291 | } 292 | \newcommand{\@bottomtitlebar}{ 293 | \vskip 0.29in 294 | \vskip -\parskip 295 | \hrule height 1\p@ 296 | \vskip 0.09in% 297 | } 298 | 299 | % create title (includes both anonymized and non-anonymized versions) 300 | \providecommand{\@maketitle}{} 301 | \renewcommand{\@maketitle}{% 302 | \vbox{% 303 | \hsize\textwidth 304 | \linewidth\hsize 305 | \vskip 0.1in 306 | \@toptitlebar 307 | \centering 308 | {\LARGE\bf \@title\par} 309 | \@bottomtitlebar 310 | \if@submission 311 | \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@} 312 | Anonymous Author(s) \\ 313 | Affiliation \\ 314 | Address \\ 315 | \texttt{email} \\ 316 | \end{tabular}% 317 | \else 318 | \def\And{% 319 | \end{tabular}\hfil\linebreak[0]\hfil% 320 | \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces% 321 | } 322 | \def\AND{% 323 | \end{tabular}\hfil\linebreak[4]\hfil% 324 | \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces% 325 | } 326 | \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\@author\end{tabular}% 327 | \fi 328 | \vskip 0.3in \@minus 0.1in 329 | } 330 | } 331 | 332 | % add conference notice to bottom of first page 333 | \newcommand{\ftype@noticebox}{8} 334 | \newcommand{\@notice}{% 335 | % give a bit of extra room back to authors on first page 336 | \enlargethispage{2\baselineskip}% 337 | \@float{noticebox}[b]% 338 | \footnotesize\@noticestring% 339 | \end@float% 340 | } 341 | 342 | % abstract styling 343 | \renewenvironment{abstract}% 344 | {% 345 | \vskip 0.075in% 346 | \centerline% 347 | {\large\bf Abstract}% 348 | \vspace{0.5ex}% 349 | \begin{quote}% 350 | } 351 | { 352 | \par% 353 | \end{quote}% 354 | \vskip 1ex% 355 | } 356 | 357 | \endinput 358 | -------------------------------------------------------------------------------- /paper/paper.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | 3 | % if you need to pass options to natbib, use, e.g.: 4 | % \PassOptionsToPackage{numbers, compress}{natbib} 5 | % before loading nips_2018 6 | 7 | % ready for submission 8 | \usepackage{nips_2018} 9 | 10 | % to compile a preprint version, e.g., for submission to arXiv, add 11 | % add the [preprint] option: 12 | % \usepackage[preprint]{nips_2018} 13 | 14 | % to compile a camera-ready version, add the [final] option, e.g.: 15 | % \usepackage[final]{nips_2018} 16 | 17 | % to avoid loading the natbib package, add option nonatbib: 18 | % \usepackage[nonatbib]{nips_2018} 19 | 20 | \usepackage[utf8]{inputenc} % allow utf-8 input 21 | \usepackage[T1]{fontenc} % use 8-bit T1 fonts 22 | \usepackage{hyperref} % hyperlinks 23 | \usepackage{url} % simple URL typesetting 24 | \usepackage{booktabs} % professional-quality tables 25 | \usepackage{amsfonts} % blackboard math symbols 26 | \usepackage{nicefrac} % compact symbols for 1/2, etc. 
27 | \usepackage{microtype} % microtypography 28 | 29 | 30 | % Added by Wenlong Lyu 31 | \usepackage{amsmath} 32 | \usepackage[plain]{fancyref} 33 | \usepackage{bm} 34 | \usepackage{todonotes} 35 | \usepackage{placeins} 36 | \usepackage{multirow} 37 | \usepackage{longtable} 38 | 39 | \title{Multi-output Gaussian Process Regression via Multi-task Neural Network} 40 | 41 | % The \author macro works with any number of authors. There are two 42 | % commands used to separate the names and addresses of multiple 43 | % authors: \And and \AND. 44 | % 45 | % Using \And between authors leaves it to LaTeX to determine where to 46 | % break the lines. Using \AND forces a line break at that point. So, 47 | % if LaTeX puts 3 of 4 authors names on the first line, and the last 48 | % on the second line, try using \AND instead of \And before the third 49 | % author name. 50 | 51 | \author{ 52 | Wenlong Lyu \\ 53 | School of Microelectronics, Fudan University\\ 54 | \texttt{wllv16@fudan.edu.cn} \\ 55 | %% examples of more authors 56 | %% \And 57 | %% Coauthor \\ 58 | %% Affiliation \\ 59 | %% Address \\ 60 | %% \texttt{email} \\ 61 | %% \AND 62 | %% Coauthor \\ 63 | %% Affiliation \\ 64 | %% Address \\ 65 | %% \texttt{email} \\ 66 | %% \And 67 | %% Coauthor \\ 68 | %% Affiliation \\ 69 | %% Address \\ 70 | %% \texttt{email} \\ 71 | %% \And 72 | %% Coauthor \\ 73 | %% Affiliation \\ 74 | %% Address \\ 75 | %% \texttt{email} \\ 76 | } 77 | 78 | \begin{document} 79 | % \nipsfinalcopy is no longer used 80 | 81 | \maketitle 82 | 83 | 84 | \input{abstract} 85 | \input{introduction} 86 | \input{background} 87 | \input{mogp} 88 | \input{experiments} 89 | \input{conclusion} 90 | 91 | 92 | % XXX: 93 | % DNGO 94 | % Deep-Kernel-Learning 95 | % DNN-GP 96 | 97 | \FloatBarrier 98 | 99 | \bibliographystyle{unsrt} 100 | \bibliography{ref} 101 | 102 | \end{document} 103 | -------------------------------------------------------------------------------- /paper/ref.bib: -------------------------------------------------------------------------------- 1 | @article{rutenbar2007hierarchical, 2 | author={R. A. Rutenbar and G. G. E. Gielen and J. Roychowdhury}, 3 | journal={Proceedings of the IEEE}, 4 | title={{Hierarchical Modeling, Optimization, and Synthesis for System-Level Analog and RF Designs}}, 5 | year={2007}, 6 | volume={95}, 7 | number={3}, 8 | pages={640-669}, 9 | keywords={analogue integrated circuits;circuit CAD;integrated circuit design;radiofrequency integrated circuits;RF designs;algorithmic techniques;analog design automation;automated device sizing;automatic extraction;component-level tools;computer-aided design;computer-aided model generation;device-level circuit;hierarchical analog synthesis;hierarchical modeling;hierarchical synthesis;integrated circuits;mixed-signal designs;model optimization;nonlinear behaviors;nonlinear macromodel;numerical techniques;performance tradeoffs;statistical centering;statistical manufacturing variations;system-level analog design tools;Analog circuits;Analog computers;Assembly systems;Circuit synthesis;Design automation;Design optimization;Digital systems;Phase locked loops;Radio frequency;Space exploration;Computer-aided design;integrated circuits;modeling;simulation}, 10 | doi={10.1109/JPROC.2006.889371}, 11 | ISSN={0018-9219}, 12 | month={March}, 13 | } 14 | @article{rutenbar2007hierarchical, 15 | author={R. A. Rutenbar and G. G. E. Gielen and J. 
Roychowdhury}, 16 | journal={Proceedings of the IEEE}, 17 | title={{Hierarchical Modeling, Optimization, and Synthesis for System-Level Analog and RF Designs}}, 18 | year={2007}, 19 | volume={95}, 20 | number={3}, 21 | pages={640-669}, 22 | keywords={analogue integrated circuits;circuit CAD;integrated circuit design;radiofrequency integrated circuits;RF designs;algorithmic techniques;analog design automation;automated device sizing;automatic extraction;component-level tools;computer-aided design;computer-aided model generation;device-level circuit;hierarchical analog synthesis;hierarchical modeling;hierarchical synthesis;integrated circuits;mixed-signal designs;model optimization;nonlinear behaviors;nonlinear macromodel;numerical techniques;performance tradeoffs;statistical centering;statistical manufacturing variations;system-level analog design tools;Analog circuits;Analog computers;Assembly systems;Circuit synthesis;Design automation;Design optimization;Digital systems;Phase locked loops;Radio frequency;Space exploration;Computer-aided design;integrated circuits;modeling;simulation}, 23 | doi={10.1109/JPROC.2006.889371}, 24 | ISSN={0018-9219}, 25 | month={March}, 26 | } 27 | @article{PRML, 28 | title={{Pattern Recognition and Machine Learning}}, 29 | author={Bishop, C}, 30 | journal={Springer, New York}, 31 | year={2007} 32 | } 33 | @article{GPML, 34 | title={{Gaussian processes for machine learning}}, 35 | author={Rasmussen, Carl Edward}, 36 | year={2006}, 37 | publisher={Citeseer} 38 | } 39 | @ARTICLE{shahriari2016taking, 40 | author={B. Shahriari and K. Swersky and Z. Wang and R. P. Adams and N. de Freitas}, 41 | journal={Proceedings of the IEEE}, 42 | title={{Taking the Human Out of the Loop: A Review of Bayesian Optimization}}, 43 | year={2016}, 44 | volume={104}, 45 | number={1}, 46 | pages={148-175}, 47 | keywords={Bayes methods;Big Data;optimisation;storage allocation;Bayesian optimization;Big data application;human productivity;large-scale heterogeneous computing;massive complex software system;product quality;storage architecture;Bayes methods;Big data;Decision making;Design of experiments;Genomes;Linear programming;Optimization;Statistical analysis;Decision making;decision making;design of experiments;genomic medicine;optimization;response surface methodology;statistical learning}, 48 | doi={10.1109/JPROC.2015.2494218}, 49 | ISSN={0018-9219}, 50 | month={Jan}, 51 | } 52 | @book{nocedal2006numerical, 53 | title={{Numerical optimization}}, 54 | author={Nocedal, Jorge and Wright, Stephen}, 55 | year={2006}, 56 | publisher={Springer Science \& Business Media} 57 | } 58 | @article{daems2003simulation, 59 | author={W. Daems and G. Gielen and W. 
Sansen}, 60 | journal={IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems}, 61 | title={{Simulation-based generation of posynomial performance models for the sizing of analog integrated circuits}}, 62 | year={2003}, 63 | volume={22}, 64 | number={5}, 65 | pages={517-534}, 66 | keywords={analogue integrated circuits;circuit CAD;circuit simulation;design of experiments;geometric programming;integrated circuit design;integrated circuit modelling;response surface methodology;DOE;SPICE-level accuracy;accurate sizing model generation;analog IC sizing;analog circuit sizing problem;analog integrated circuits;design of experiments;geometric programming;linear circuits;nonlinear circuits;numerical simulation data;performance characteristics;posynomial performance models;posynomial response surface models;response surface modeling;simulation-based generation;Analog circuits;Analog integrated circuits;Character generation;Circuit simulation;Design automation;Differential equations;Integrated circuit modeling;Numerical simulation;Response surface methodology;SPICE}, 67 | doi={10.1109/TCAD.2003.810742}, 68 | ISSN={0278-0070}, 69 | month={May}, 70 | } 71 | @article{phelps2000anaconda, 72 | author={R. Phelps and M. Krasnicki and R. A. Rutenbar and L. R. Carley and J. R. Hellums}, 73 | journal={IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems}, 74 | title={{Anaconda: simulation-based synthesis of analog circuits via stochastic pattern search}}, 75 | year={2000}, 76 | volume={19}, 77 | number={6}, 78 | pages={703-717}, 79 | keywords={circuit CAD;circuit simulation;evolutionary computation;integrated circuit design;mixed analogue-digital integrated circuits;Anaconda;analog circuits;evolutionary algorithms;full circuit simulation;industrial-strength simulation environments;manual industrial designs;mixed-signal ICs;numerical search algorithm;pattern search;population-of-solutions ideas;simulation environment;simulation-based synthesis;stochastic pattern search;synthesized cell-level circuits;transparent network parallelism;Acceleration;Analog circuits;Circuit simulation;Circuit synthesis;Engines;Instruments;Integrated circuit synthesis;Network synthesis;Robustness;Stochastic processes}, 80 | doi={10.1109/43.848091}, 81 | ISSN={0278-0070}, 82 | month={Jun}, 83 | } 84 | @Article{boyd2007tutorial, 85 | author="Boyd, Stephen 86 | and Kim, Seung-Jean 87 | and Vandenberghe, Lieven 88 | and Hassibi, Arash", 89 | title="A tutorial on geometric programming", 90 | journal="Optimization and Engineering", 91 | year="2007", 92 | volume="8", 93 | number="1", 94 | pages="67", 95 | abstract="A geometric program (GP) is a type of mathematical optimization problem characterized by objective and constraint functions that have a special form. Recently developed solution methods can solve even large-scale GPs extremely efficiently and reliably; at the same time a number of practical problems, particularly in circuit design, have been found to be equivalent to (or well approximated by) GPs. Putting these two together, we get effective solutions for the practical problems. The basic approach in GP modeling is to attempt to express a practical problem, such as an engineering analysis or design problem, in GP format. In the best case, this formulation is exact; when this is not possible, we settle for an approximate formulation. This tutorial paper collects together in one place the basic background material needed to do GP modeling. 
We start with the basic definitions and facts, and some methods used to transform problems into GP format. We show how to recognize functions and problems compatible with GP, and how to approximate functions or data in a form compatible with GP (when this is possible). We give some simple and representative examples, and also describe some common extensions of GP, along with methods for solving (or approximately solving) them.", 96 | issn="1573-2924", 97 | doi="10.1007/s11081-007-9001-7", 98 | url="http://dx.doi.org/10.1007/s11081-007-9001-7" 99 | } 100 | @inproceedings{mcconaghy2005analysis, 101 | author={T. McConaghy and G. Gielen}, 102 | booktitle={2005 IEEE International Symposium on Circuits and Systems}, 103 | title={{Analysis of simulation-driven numerical performance modeling techniques for application to analog circuit optimization}}, 104 | year={2005}, 105 | pages={1298-1301 Vol. 2}, 106 | keywords={analogue circuits;circuit optimisation;circuit simulation;feedforward neural nets;genetic algorithms;polynomials;regression analysis;splines (mathematics);support vector machines;analog circuit optimization;boosted feedforward neural networks;genetic programming;kriging;multivariate adaptive regression splines;numerical performance modeling;polynomials;posynomials;simulation-driven modeling;simulator-in-the-loop;support vector machines;Analog circuits;Analytical models;Circuit simulation;Design optimization;Feedforward neural networks;Neural networks;Numerical models;Numerical simulation;Performance analysis;Performance gain}, 107 | doi={10.1109/ISCAS.2005.1464833}, 108 | ISSN={0271-4302}, 109 | month={May}, 110 | } 111 | @inproceedings{adc_gp_2002, 112 | author={M. del Mar Hershenson}, 113 | booktitle={IEEE/ACM International Conference on Computer Aided Design, 2002. ICCAD 2002.}, 114 | title={{Design of pipeline analog-to-digital converters via geometric programming}}, 115 | year={2002}, 116 | pages={317-324}, 117 | keywords={analogue-digital conversion;circuit CAD;circuit optimisation;geometric programming;integrated circuit design;integrated circuit noise;network topology;pipeline processing;polynomials;signal sampling;ADC component sizes;ADC design constraints;ADC polynomial model;ADC topology;area specifications;capacitors;component sizes;design specifications;geometric programming;geometric programming framework;globally optimal design;hierarchical problem formulation;infeasible specifications;intermediate design variables;modular formulation;monomial equality constraints;numerical algorithms;pipeline analog-to-digital converter design;polynomial inequality constraints;power specifications;process technology;sampling frequency;signal-to-noise ratio;transistors;Analog-digital conversion;Capacitors;Design methodology;Frequency;Pipelines;Polynomials;Signal design;Signal sampling;Signal to noise ratio;Topology}, 118 | doi={10.1109/ICCAD.2002.1167553}, 119 | ISSN={1092-3152}, 120 | month={Nov}, 121 | } 122 | @inproceedings{colleran2003optimization, 123 | author={D. M. Colleran and C. Portmann and A. Hassibi and C. Crusius and S. S. Mohan and S. Boyd and T. H. Lee and M. 
del Mar Hershenson}, 124 | booktitle={Proceedings of the IEEE 2003 Custom Integrated Circuits Conference, 2003.}, 125 | title={Optimization of phase-locked loop circuits via geometric programming}, 126 | year={2003}, 127 | pages={377-380}, 128 | keywords={CMOS analogue integrated circuits;UHF integrated circuits;circuit optimisation;geometric programming;integrated circuit design;jitter;phase locked loops;0.18 micron;1.8 V;1.9 GHz;10.8 mW;CMOS;PLL array;PLL frequency range;PLL global optimization;accumulated jitter;automated analog circuit design;geometric programming;period jitter;phase-locked loop circuits;CMOS process;Circuits;Design optimization;Equations;Frequency;Jitter;Phase locked loops;Semiconductor device modeling;Silicon;Voltage-controlled oscillators}, 129 | doi={10.1109/CICC.2003.1249422}, 130 | month={Sept}, 131 | } 132 | @INPROCEEDINGS{del2004cmos, 133 | author={M. del Mar Hershenson}, 134 | booktitle={Proceedings of the 2004 American Control Conference}, 135 | title={{CMOS} analog circuit design via geometric programming}, 136 | year={2004}, 137 | volume={4}, 138 | pages={3266-3271 vol.4}, 139 | keywords={CMOS analogue integrated circuits;circuit optimisation;convex programming;geometric programming;integrated circuit design;CMOS analog circuit design;circuit design problem;circuit performance measures;circuit synthesis method;convex optimization problem;geometric programming;polynomial functions}, 140 | ISSN={0743-1619}, 141 | month={June}, 142 | } 143 | @INPROCEEDINGS{wang2014enabling, 144 | author={Y. Wang and M. Orshansky and C. Caramanis}, 145 | booktitle={2014 51st ACM/EDAC/IEEE Design Automation Conference (DAC)}, 146 | title={Enabling efficient analog synthesis by coupling sparse regression and polynomial optimization}, 147 | year={2014}, 148 | pages={1-6}, 149 | keywords={analogue integrated circuits;concave programming;convex programming;integrated circuit design;least squares approximations;polynomials;regression analysis;relaxation theory;SDP relaxations;SPICE-generated data;convex relaxations;efficient analog synthesis;equation-based analog synthesis;least-square fits;non-convex optimization;polynomial optimization;semidefinite programming relaxations;sparse regression;Accuracy;Couplings;Integrated circuit modeling;Mathematical model;Optimization;Polynomials;SPICE}, 150 | ISSN={0738-100X}, 151 | month={June}, 152 | } 153 | @inproceedings{PolyGP2016DATE, 154 | author={Y. Wang and C. Caramanis and M. 
Orshansky}, 155 | booktitle={2016 Design, Automation Test in Europe Conference Exhibition (DATE)}, 156 | title={{PolyGP: Improving GP-based analog optimization through accurate high-order monomials and semidefinite relaxation}}, 157 | year={2016}, 158 | pages={1423-1428}, 159 | keywords={analogue integrated circuits;geometric programming;higher order statistics;least squares approximations;polynomial approximation;GP-based analog optimization;PolyGP;SDP-relaxations;accurate high-order monomials;analog circuits;analog performance functions;common device-circuit functions;computational tractability;fitting device;fitting error;leverage recent tools;logarithmic variables;moment optimization;monomial inaccuracy;polynomial exponential functions;polynomial geometric programming;polynomial optimization;semidefinite optimization;semidefinite relaxation;sums-of-squares;symbolic performance functions;Analytical models;Convex functions;Mathematical model;Optimization;Performance evaluation;Programming;Transistors}, 160 | month={March}, 161 | } 162 | @ARTICLE{6155115, 163 | author={M. B. Yelten and T. Zhu and S. Koziel and P. D. Franzon and M. B. Steer}, 164 | journal={IEEE Circuits and Systems Magazine}, 165 | title={Demystifying Surrogate Modeling for Circuits and Systems}, 166 | year={2012}, 167 | volume={12}, 168 | number={1}, 169 | pages={45-63}, 170 | keywords={circuit simulation;black-box surrogate modeling;circuit surrogate modeling;engineering models;grey-box surrogate modeling;system surrogate modeling;Couplings;Electromagnetic modeling;Mathematical model;Modeling;Solid modeling}, 171 | doi={10.1109/MCAS.2011.2181095}, 172 | ISSN={1531-636X}, 173 | month={Firstquarter}, 174 | } 175 | @inproceedings{okobiah2012ordinary, 176 | author={O. Okobiah and S. P. Mohanty and E. Kougianos}, 177 | booktitle={Thirteenth International Symposium on Quality Electronic Design (ISQED)}, 178 | title={Ordinary Kriging metamodel-assisted Ant Colony algorithm for fast analog design optimization}, 179 | year={2012}, 180 | pages={458-463}, 181 | keywords={amplifiers;analogue integrated circuits;integrated circuit modelling;interpolation;optimisation;SPICE netlist;a design constraint;analog design optimization;correlation effects;integrated circuit modelling;interpolation;metamodel generation phase;ordinary Kriging metamodel-assisted ant colony algorithm;regression;sense amplifier precharge time;Accuracy;Algorithm design and analysis;Computational modeling;Correlation;Metamodeling;Optimization;Response surface methodology;Kriging Methods;Metamodeling;Nano-CMOS;Robust Design;Sense Amplifier}, 182 | doi={10.1109/ISQED.2012.6187533}, 183 | ISSN={1948-3287}, 184 | month={March}, 185 | } 186 | @inproceedings{Okobiah_kriging_2014, 187 | author={O. Okobiah and S. P. Mohanty and E. 
Kougianos}, 188 | booktitle={2014 IEEE Computer Society Annual Symposium on VLSI}, 189 | title={Exploring Kriging for Fast and Accurate Design Optimization of Nanoscale Analog Circuits}, 190 | year={2014}, 191 | pages={244-247}, 192 | keywords={analogue integrated circuits;nanoelectronics;optimisation;statistical analysis;Kriging;design optimization;manual iterative circuit sizing;metamodeling techniques;nanoscale analog circuits;optimal designs;process variation;Algorithm design and analysis;Artificial neural networks;Computational modeling;Metamodeling;Optimization;Phase locked loops;Analog Mixed-Signal (AMS);Geostatistics;Kriging;Nano-CMOS;Neural Network;Optimization;Process Variation}, 193 | doi={10.1109/ISVLSI.2014.12}, 194 | ISSN={2159-3469}, 195 | month={July}, 196 | } 197 | @inproceedings{yu2008yield, 198 | title={Yield-aware hierarchical optimization of large analog integrated circuits}, 199 | author={Yu, Guo and Li, Peng}, 200 | booktitle={Proceedings of the 2008 IEEE/ACM International Conference on Computer-Aided Design}, 201 | pages={79--84}, 202 | year={2008}, 203 | organization={IEEE Press} 204 | } 205 | @article{okobiah2014fast, 206 | author={O. Okobiah and S. Mohanty and E. Kougianos}, 207 | journal={IEEE Transactions on Very Large Scale Integration (VLSI) Systems}, 208 | title={Fast Design Optimization Through Simple Kriging Metamodeling: A Sense Amplifier Case Study}, 209 | year={2014}, 210 | volume={22}, 211 | number={4}, 212 | pages={932-937}, 213 | keywords={CMOS integrated circuits;amplifiers;circuit optimisation;integrated circuit design;integrated circuit modelling;simulated annealing;clamped bitline amplifier circuit;correlation effect modelling;fast design optimization;nanoscale CMOS circuits;sense amplifier;simple Kriging metamodeling technique;simulated annealing algorithm;Fast design optimization;Kriging methods;metamodeling;nano CMOS analog circuits}, 214 | doi={10.1109/TVLSI.2013.2256436}, 215 | ISSN={1063-8210}, 216 | month={April}, 217 | } 218 | @article{vural2012analog, 219 | title={Analog circuit sizing via swarm intelligence}, 220 | author={Vural, RA and Yildirim, T}, 221 | journal={AEU-International Journal of Electronics and Communications}, 222 | volume={66}, 223 | number={9}, 224 | pages={732--740}, 225 | year={2012}, 226 | publisher={Elsevier} 227 | } 228 | @article{6420988, 229 | author={G. Huang and L. Qian and S. Saibua and D. Zhou and X. Zeng}, 230 | journal={IEEE Transactions on Circuits and Systems I: Regular Papers}, 231 | title={An Efficient Optimization Based Method to Evaluate the {DRV} of {SRAM} Cells}, 232 | year={2013}, 233 | volume={60}, 234 | number={6}, 235 | pages={1511-1520}, 236 | keywords={Monte Carlo methods;SRAM chips;circuit simulation;leakage currents;optimisation;time-domain analysis;0data retention voltage;DRV evaluation tool;Monte Carlo method;SRAM cells;circuit simulator;device parameter variations;importance sampling;multistart point optimization strategy;size 45 nm to 130 nm;substantial leakage current;time domain worst performance bound problem;Integrated circuit modeling;Leakage current;Optimization;SPICE;SRAM cells;Time domain analysis;Transient analysis;DRV;global optimization;multi-start point;process and device parameter variation}, 237 | doi={10.1109/TCSI.2012.2226504}, 238 | ISSN={1549-8328}, 239 | month={June}, 240 | } 241 | @inproceedings{1510411, 242 | author={A. Nieuwoudt and Y. Massoud}, 243 | booktitle={Proceedings. 
42nd Design Automation Conference, 2005.}, 244 | title={Multi-level approach for integrated spiral inductor optimization}, 245 | year={2005}, 246 | pages={648-651}, 247 | keywords={circuit optimisation;electronic design automation;inductors;integrated circuit design;search problems;analog design automation;analog synthesis;constrained global optimization;integrated spiral inductor;local nonlinear convex optimization;mesh-adaptive direct search algorithms;mixed-signal design automation;optimal spiral inductor geometry;scalable multi-level optimization methodology;Algorithm design and analysis;Constraint optimization;Design automation;Design optimization;Inductors;Integrated circuit synthesis;Permission;Q factor;Signal synthesis;Spirals}, 248 | doi={10.1145/1065579.1065749}, 249 | ISSN={0738-100X}, 250 | month={June}, 251 | } 252 | @inproceedings{6811900, 253 | author={Dian Zhou and Guanming Huang}, 254 | booktitle={ASIC (ASICON), 2013 IEEE 10th International Conference on}, 255 | title={Design automation of analog circuit considering the process variations}, 256 | year={2013}, 257 | pages={1-4}, 258 | keywords={analogue integrated circuits;circuit optimisation;integrated circuit design;analog circuit;automatic design method;embedded spice-level simulator;multistart point optimization strategy;Accuracy;Analog circuits;Design automation;Linear programming;Load modeling;Numerical models;Optimization}, 259 | doi={10.1109/ASICON.2013.6811900}, 260 | ISSN={2162-7541}, 261 | month={Oct}, 262 | } 263 | @article{liu2011synthesis, 264 | author={B. Liu and D. Zhao and P. Reynaert and G. G. E. Gielen}, 265 | journal={IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems}, 266 | title={{Synthesis of Integrated Passive Components for High-Frequency RF ICs Based on Evolutionary Computation and Machine Learning Techniques}}, 267 | year={2011}, 268 | volume={30}, 269 | number={10}, 270 | pages={1458-1468}, 271 | keywords={Gaussian processes;electronic engineering computing;evolutionary computation;learning (artificial intelligence);millimetre wave integrated circuits;neural nets;search problems;Gaussian process;artificial neural network;differential evolution algorithm;electromagnetic simulation;equivalent circuit;evolutionary computation;frequency 60 GHz;high frequency RF IC;integrated passive components;machine learning techniques;memetic machine learning based differential evolution;microwave passive components;search mechanism;surrogate model based memetic evolutionary optimization mechanism;Computational modeling;Integrated circuit modeling;Microwave circuits;Microwave integrated circuits;Optimization;Radio frequency;Artificial neural network;differential evolution;gaussian process;inductor synthesis;microwave components;surrogate model;transformer synthesis}, 272 | doi={10.1109/TCAD.2011.2162067}, 273 | ISSN={0278-0070}, 274 | month={Oct}, 275 | } 276 | @inproceedings{5763181, 277 | author={B. Liu and Y. He and P. Reynaert and G. 
Gielen}, 278 | booktitle={2011 Design, Automation Test in Europe}, 279 | title={Global optimization of integrated transformers for high frequency microwave circuits using a {Gaussian} process based surrogate model}, 280 | year={2011}, 281 | pages={1-6}, 282 | keywords={Gaussian processes;equivalent circuits;microwave integrated circuits;optimisation;passive networks;transformers;GPDECO;Gaussian process;RF passive components;RFIC designers;constrained optimization;differential evolution;electromagnetic simulations;equivalent circuit models;global optimization;high frequency microwave circuits;integrated transformers;microwave passive components;surrogate model;Accuracy;Data models;Integrated circuit modeling;Microwave circuits;Optimization;Predictive models;Differential evolution;Gaussian process;Microwave components;Microwave design;Surrogate model;Transformer synthesis}, 283 | doi={10.1109/DATE.2011.5763181}, 284 | ISSN={1530-1591}, 285 | month={March}, 286 | } 287 | @article{6218230, 288 | author={B. Liu and N. Deferm and D. Zhao and P. Reynaert and G. G. E. Gielen}, 289 | journal={IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems}, 290 | title={{An Efficient High-Frequency Linear RF Amplifier Synthesis Method Based on Evolutionary Computation and Machine Learning Techniques}}, 291 | year={2012}, 292 | volume={31}, 293 | number={7}, 294 | pages={981-993}, 295 | keywords={CMOS integrated circuits;Gaussian processes;HF amplifiers;differential amplifiers;equivalent circuits;evolutionary computation;field effect MIMIC;learning (artificial intelligence);millimetre wave amplifiers;CMOS technology;EM simulator;Gaussian process;adaptive population generation;core algorithm;evolutionary computation;frequency 60 GHz to 100 GHz;gain 10 dB;global optimization algorithm;high frequency linear RF amplifier synthesis;machine learning-based differential evolution;millimeter wave linear RF amplifier synthesis;naive Bayes classification;parasitic-included equivalent circuit models;predefined layout templates;radiofrequency integrated circuit design automation;size 90 nm;three-stage differential amplifier;time 25 h;Computational modeling;Inductors;Integrated circuit modeling;Optimization;Power transmission lines;Radio frequency;Transistors;Differential evolution;Gaussian process;efficient global optimization;expensive black-box optimization;mm-wave frequency;radio frequency (RF) circuit synthesis}, 296 | doi={10.1109/TCAD.2012.2187207}, 297 | ISSN={0278-0070}, 298 | month={July}, 299 | } 300 | @article{liu2014gaspad, 301 | author={B. Liu and D. Zhao and P. Reynaert and G. G. E. Gielen}, 302 | journal={IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems}, 303 | title={{GASPAD}: A General and Efficient mm-Wave Integrated Circuit Synthesis Method Based on Surrogate Model Assisted Evolutionary Algorithm}, 304 | year={2014}, 305 | volume={33}, 306 | number={2}, 307 | pages={169-182}, 308 | doi={10.1109/TCAD.2013.2284109}, 309 | ISSN={0278-0070}, 310 | month={Feb}, 311 | } 312 | @inproceedings{liu2016efficient, 313 | author={B. Liu and A. 
Nikolaeva}, 314 | booktitle={2016 Design, Automation Test in Europe Conference Exhibition (DATE)}, 315 | title={{Efficient global optimization of MEMS based on surrogate model assisted evolutionary algorithm}}, 316 | year={2016}, 317 | pages={555-558}, 318 | keywords={Gaussian processes;evolutionary computation;micromechanical 319 | devices;numerical analysis;MEMS design optimization;ad-hoc analytical 320 | model;adaptive Gaussian process;assisted differential 321 | evolution;assisted optimization framework;behavioural 322 | model;differential evolution algorithm;on-line adaptive 323 | surrogate;surrogate model assisted evolutionary algorithm;time 324 | consuming numerical simulations;Computational modeling;Micromechanical 325 | devices;Numerical models;Optimization;Sociology;Statistics;Training 326 | data}, 327 | month={March}, 328 | } 329 | @article{liu2009analog, 330 | title={Analog circuit optimization system based on hybrid evolutionary algorithms}, 331 | author={Liu, Bo and Wang, Yan and Yu, Zhiping and Liu, Leibo and Li, Miao and Wang, Zheng and Lu, Jing and Fern{\'a}ndez, Francisco V}, 332 | journal={Integration, the VLSI journal}, 333 | volume={42}, 334 | number={2}, 335 | pages={137--148}, 336 | year={2009}, 337 | publisher={Elsevier} 338 | } 339 | 340 | @article{deb2000efficient, 341 | title={An efficient constraint handling method for genetic algorithms}, 342 | author={Deb, Kalyanmoy}, 343 | journal={Computer methods in applied mechanics and engineering}, 344 | volume={186}, 345 | number={2}, 346 | pages={311--338}, 347 | year={2000}, 348 | publisher={Elsevier} 349 | } 350 | @inproceedings{pengboDate, 351 | author={B. Peng and F. Yang and C. Yan and X. Zeng and D. Zhou}, 352 | booktitle={2016 Design, Automation Test in Europe Conference Exhibition (DATE)}, 353 | title={Efficient multiple starting point optimization for automated analog circuit optimization via recycling simulation data}, 354 | year={2016}, 355 | pages={1417-1422}, 356 | keywords={analogue circuits;circuit optimisation;gradient methods;quadratic programming;automated analog circuit optimization;gradient calculation;multiple starting point optimization;recycling simulation data;sequential quadratic programming;sparse gradient recovery;Algorithm design and analysis;Analog circuits;Data models;Integrated circuit modeling;Mathematical model;Optimization;Recycling}, 357 | month={March}, 358 | } 359 | @phdthesis{tugui2013design, 360 | TITLE = {{Design Methodology for High-performance Circuits Based on Automatic Optimization Methods.}}, 361 | AUTHOR = {Tugui, Catalin Adrian}, 362 | URL = {https://tel.archives-ouvertes.fr/tel-00789352}, 363 | NUMBER = {2013SUPL0002}, 364 | SCHOOL = {{Sup{\'e}lec}}, 365 | YEAR = {2013}, 366 | MONTH = Jan, 367 | KEYWORDS = {Analog design methodology ; Continuous-time ; Bayesian optimization ; Optimisation bay{\'e}sienne ; Temps-continu ; Sigma-Delta ; M{\'e}thodologie de conception analogique temps-continu}, 368 | TYPE = {Theses}, 369 | PDF = {https://tel.archives-ouvertes.fr/tel-00789352/file/TUGUI_Catalin_Thesis_2012_01_VF.pdf}, 370 | HAL_ID = {tel-00789352}, 371 | HAL_VERSION = {v1}, 372 | } 373 | @article{bull2011convergence, 374 | title={Convergence rates of efficient global optimization algorithms}, 375 | author={Bull, Adam D}, 376 | journal={Journal of Machine Learning Research}, 377 | volume={12}, 378 | number={Oct}, 379 | pages={2879--2904}, 380 | year={2011} 381 | } 382 | @phdthesis{duvenaud2014automatic, 383 | title={{Automatic model construction with Gaussian processes}}}, 384 
| author={Duvenaud, David}, 385 | year={2014}, 386 | school={University of Cambridge} 387 | } 388 | @article{gonzalez2017automated, 389 | title={{An automated design methodology of RF circuits by using Pareto-optimal fronts of EM-simulated inductors}}, 390 | author={Gonz{\'a}lez-Echevarr{\'\i}a, Reinier and Roca, Elisenda and Castro-L{\'o}pez, Rafael and Fern{\'a}ndez, Francisco V and Sieiro, Javier and L{\'o}pez-Villegas, Jos{\'e} Mar{\'\i}a and Vidal, Neus}, 391 | journal={IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems}, 392 | volume={36}, 393 | number={1}, 394 | pages={15--26}, 395 | year={2017}, 396 | publisher={IEEE} 397 | } 398 | @article{minasny2005matern, 399 | title={The Mat{\'e}rn function as a general model for soil variograms}, 400 | author={Minasny, Budiman and McBratney, Alex B}, 401 | journal={Geoderma}, 402 | volume={128}, 403 | number={3}, 404 | pages={192--207}, 405 | year={2005}, 406 | publisher={Elsevier} 407 | } 408 | @article{jones1998efficient, 409 | title={Efficient global optimization of expensive black-box functions}, 410 | author={Jones, Donald R and Schonlau, Matthias and Welch, William J}, 411 | journal={Journal of Global optimization}, 412 | volume={13}, 413 | number={4}, 414 | pages={455--492}, 415 | year={1998}, 416 | publisher={Springer} 417 | } 418 | @article{shen2006fast, 419 | title={{Fast Gaussian process regression using KD-trees}}, 420 | author={Shen, Yirong and Ng, Andrew and Seeger, Matthias}, 421 | journal={Advances in neural information processing systems}, 422 | volume={18}, 423 | pages={1225}, 424 | year={2006}, 425 | publisher={MIT; 1998} 426 | } 427 | @inproceedings{hoffman2011portfolio, 428 | title={{Portfolio Allocation for Bayesian Optimization.}}, 429 | author={Hoffman, Matthew D and Brochu, Eric and de Freitas, Nando}, 430 | booktitle={UAI}, 431 | pages={327--336}, 432 | year={2011} 433 | } 434 | @article{hutter2010sequential, 435 | title={Sequential model-based optimization for general algorithm configuration (extended version)}, 436 | author={Hutter, Frank and Hoos, Holger H and Leyton-Brown, Kevin}, 437 | journal={Technical Report TR-2010--10, University of British Columbia, Computer Science, Tech. 
Rep.}, 438 | year={2010} 439 | } 440 | @article{schonlau1998global, 441 | title={Global versus local search in constrained optimization of computer models}, 442 | author={Schonlau, Matthias and Welch, William J and Jones, Donald R}, 443 | journal={Lecture Notes-Monograph Series}, 444 | pages={11--25}, 445 | year={1998}, 446 | publisher={JSTOR} 447 | } 448 | @inproceedings{gardner2014bayesian, 449 | title={{Bayesian Optimization with Inequality Constraints}}, 450 | author={Gardner, Jacob and Kusner, Matt and Weinberger, Kilian and Cunningham, John and others}, 451 | booktitle={Proceedings of The 31st International Conference on Machine Learning}, 452 | pages={937--945}, 453 | year={2014} 454 | } 455 | @inproceedings{gelbart2014bayesian, 456 | title={Bayesian optimization with unknown constraints}, 457 | author={Gelbart, Michael A and Snoek, Jasper and Adams, Ryan P}, 458 | booktitle={Proceedings of the Thirtieth Conference on Uncertainty in Artificial Intelligence}, 459 | pages={250--259}, 460 | year={2014}, 461 | organization={AUAI Press} 462 | } 463 | @article{hernandez2016general, 464 | title={A general framework for constrained {Bayesian} optimization using information-based search}, 465 | author={Hern{\'a}ndez-Lobato, Jos{\'e} Miguel and Gelbart, Michael A and Adams, Ryan P and Hoffman, Matthew W and Ghahramani, Zoubin}, 466 | journal={Journal of Machine Learning Research}, 467 | volume={17}, 468 | number={160}, 469 | pages={1--53}, 470 | year={2016}, 471 | publisher={Journal of Machine Learning Research} 472 | } 473 | @article{lizotte2012experimental, 474 | title={An experimental methodology for response surface optimization methods}, 475 | author={Lizotte, Daniel J and Greiner, Russell and Schuurmans, Dale}, 476 | journal={Journal of Global Optimization}, 477 | volume={53}, 478 | number={4}, 479 | pages={699--736}, 480 | year={2012}, 481 | publisher={Springer} 482 | } 483 | @inproceedings{snoek2012practical, 484 | title={{Practical Bayesian optimization of machine learning algorithms}}, 485 | author={Snoek, Jasper and Larochelle, Hugo and Adams, Ryan P}, 486 | booktitle={Advances in neural information processing systems}, 487 | pages={2951--2959}, 488 | year={2012} 489 | } 490 | @phdthesis{gelbart2015constrained, 491 | title={{Constrained Bayesian Optimization and Applications}}, 492 | author={Gelbart, Michael Adam}, 493 | year={2015} 494 | } 495 | @article{storn1997differential, 496 | title={Differential evolution--a simple and efficient heuristic for global optimization over continuous spaces}, 497 | author={Storn, Rainer and Price, Kenneth}, 498 | journal={Journal of global optimization}, 499 | volume={11}, 500 | number={4}, 501 | pages={341--359}, 502 | year={1997}, 503 | publisher={Springer} 504 | } 505 | @book{sutton1998reinforcement, 506 | title={Reinforcement learning: An introduction}, 507 | author={Sutton, Richard S and Barto, Andrew G}, 508 | volume={1}, 509 | number={1}, 510 | year={1998}, 511 | publisher={MIT press Cambridge} 512 | } 513 | @article{Yan2013, 514 | author={Z. Yan and P. I. Mak and M. K. Law and R. P. 
Martins}, 515 | journal={IEEE Journal of Solid-State Circuits}, 516 | title={A 0.016-mm$^{2}$ 144-$\mu$W Three-Stage Amplifier Capable of Driving 1-to-15 nF Capacitive Load With $> $0.95-MHz GBW}, 517 | year={2013}, 518 | volume={48}, 519 | number={2}, 520 | pages={527-540}, 521 | keywords={UHF amplifiers;circuit feedback;compensation;GBW;capacitance 1 nF to 15 nF;capacitive load;control-centric method;current-buffer Miller compensation;figure-of-merit;frequency 0.95 MHz;local feedback loop analysis;minimum gain-bandwidth product;parasitic-pole cancellation;pole-zero placements;power 144 muW;size 0.35 mum;slew rate;small-signal FOM;three-stage amplifier;Circuit analysis;Feedback loop;Limiting;Poles and zeros;Stability analysis;Standards;Active LHP zero;CMOS;Miller compensation;current buffer;current buffer Miller compensation;frequency compensation;pole-zero cancellation;three-stage amplifier}, 522 | doi={10.1109/JSSC.2012.2229070}, 523 | ISSN={0018-9200}, 524 | month={Feb}, 525 | } 526 | @ARTICLE{TVLSI-3DIC, 527 | author={S. J. Park and B. Bae and J. Kim and M. Swaminathan}, 528 | journal={IEEE Transactions on Very Large Scale Integration (VLSI) Systems}, 529 | title={Application of Machine Learning for Optimization of {3-D} Integrated Circuits and Systems}, 530 | year={2017}, 531 | volume={PP}, 532 | number={99}, 533 | pages={1-10}, 534 | keywords={Clocks;Computational modeling;Integrated circuit modeling;Optimization;Semiconductor device measurement;Temperature distribution;Temperature measurement;3-D IC;Bayesian optimization (BO);electrical-thermal simulation;machine learning;temperature gradient;thermal-induced skew.}, 535 | doi={10.1109/TVLSI.2017.2656843}, 536 | ISSN={1063-8210}, 537 | month={}, 538 | } 539 | @software{johnson2014nlopt, 540 | author = {Johnson, Steven G.}, 541 | title = {The NLopt nonlinear-optimization package}, 542 | url = {http://ab-initio.mit.edu/nlopt}, 543 | year = 2014 544 | } 545 | @INPROCEEDINGS{ye2013digitally, 546 | author={L. Ye and J. Chen and L. Kong and P. Cathelin and E. Alon and A. Niknejad}, 547 | booktitle={2013 IEEE International Solid-State Circuits Conference Digest of Technical Papers}, 548 | title={{A digitally modulated 2.4GHz WLAN transmitter with integrated phase path and dynamic load modulation in 65nm CMOS}}, 549 | year={2013}, 550 | pages={330-331}, 551 | keywords={CMOS integrated circuits;DC-DC power convertors;OFDM modulation;radio transmitters;wireless LAN;DC-DC converter;OFDM modulation;PA supply;PAPR;Wi-Fi transmitter;battery life;close-in spectral performance;digitally modulated WLAN transmitter;dynamic load modulation;frequency 2 GHz to 5 GHz;integrated CMOS PA;integrated phase path;mobile device;on-chip matching network;peak-to-average-power-ratio;size 65 nm}, 552 | doi={10.1109/ISSCC.2013.6487756}, 553 | ISSN={0193-6530}, 554 | month={Feb}, 555 | } 556 | @INPROCEEDINGS{lu201324, 557 | author={C. Lu and H. Wang and C. Peng and A. Goel and S. Son and P. Liang and A. Niknejad and H. Hwang and G. 
Chien}, 558 | booktitle={2013 IEEE International Solid-State Circuits Conference Digest of Technical Papers}, 559 | title={{A 24.7dBm all-digital RF transmitter for multimode broadband applications in 40nm CMOS}}, 560 | year={2013}, 561 | pages={332-333}, 562 | keywords={CMOS integrated circuits;radio transceivers;radiofrequency integrated circuits;802.11n;CMOS;Cartesian-to-polar transformation;RF circuits;all-digital RF transmitter;broadband wireless connectivity;digital RF transmitters;digital-intensive RF transceivers;digitally-assisted RF transceivers;direct quadrature architecture;frequency 40 MHz;high speed transistors;high-speed wireless standards;multimode broadband applications;open-loop phase interpolation topology;polar architecture;size 40 nm;Bandwidth;Baseband;CMOS integrated circuits;Power amplifiers;Power generation;Radio frequency;Table lookup}, 563 | doi={10.1109/ISSCC.2013.6487757}, 564 | ISSN={0193-6530}, 565 | month={Feb}, 566 | } 567 | @inproceedings{chapelle2011empirical, 568 | title={An empirical evaluation of thompson sampling}, 569 | author={Chapelle, Olivier and Li, Lihong}, 570 | booktitle={Advances in neural information processing systems}, 571 | pages={2249--2257}, 572 | year={2011} 573 | } 574 | @inproceedings{hernandez2014predictive, 575 | title={Predictive entropy search for efficient global optimization of black-box functions}, 576 | author={Hern{\'a}ndez-Lobato, Jos{\'e} Miguel and Hoffman, Matthew W and Ghahramani, Zoubin}, 577 | booktitle={Advances in neural information processing systems}, 578 | pages={918--926}, 579 | year={2014} 580 | } 581 | @article{hennig2012entropy, 582 | title={Entropy search for information-efficient global optimization}, 583 | author={Hennig, Philipp and Schuler, Christian J}, 584 | journal={Journal of Machine Learning Research}, 585 | volume={13}, 586 | number={Jun}, 587 | pages={1809--1837}, 588 | year={2012} 589 | } 590 | 591 | @article{nsgaii, 592 | title={{A fast and elitist multiobjective genetic algorithm: NSGA-II}}, 593 | author={Deb, Kalyanmoy and Pratap, Amrit and Agarwal, Sameer and Meyarivan, TAMT}, 594 | journal={IEEE transactions on evolutionary computation}, 595 | volume={6}, 596 | number={2}, 597 | pages={182--197}, 598 | year={2002}, 599 | publisher={IEEE} 600 | } 601 | @article{moead, 602 | title={{MOEA/D: A multiobjective evolutionary algorithm based on decomposition}}, 603 | author={Zhang, Qingfu and Li, Hui}, 604 | journal={IEEE Transactions on evolutionary computation}, 605 | volume={11}, 606 | number={6}, 607 | pages={712--731}, 608 | year={2007}, 609 | publisher={IEEE} 610 | } 611 | @article{MO_overview, 612 | title={{Survey of multi-objective optimization methods for engineering}}, 613 | author={Marler, R Timothy and Arora, Jasbir S}, 614 | journal={Structural and multidisciplinary optimization}, 615 | volume={26}, 616 | number={6}, 617 | pages={369--395}, 618 | year={2004}, 619 | publisher={Springer} 620 | } 621 | @inproceedings{AIDA-C, 622 | author={R. Martins and N. Lourenço and S. Rodrigues and J. Guilherme and N. 
Horta}, 623 | booktitle={2012 International Conference on Synthesis, Modeling, Analysis and Simulation Methods and Applications to Circuit Design (SMACD)}, 624 | title={{AIDA: Automated analog IC design flow from circuit level to layout}}, 625 | year={2012}, 626 | pages={29-32}, 627 | keywords={analogue integrated circuits;evolutionary computation;integrated circuit layout;AIDA environment;CALIBRE;Corners analysis;DRC;GENOM-POF;HSPICE;LAYGEN II;analysis tools;automated analog IC design flow;automated circuit-level synthesis;automatic layout generation;circuit-level specification;circuit-level topology;electrical simulator;evolutionary optimization kernels;expert knowledge;high level layout guidelines;in-house tools;industrial simulators;multiobjective multiconstraint optimization approach;physical layout description;sized circuit-level description;technology independent abstract layout template;Bioinformatics;Design automation;Genomics;Integrated circuits;Layout;Optimization;Robustness;Analog Integrated Circuits;Circuit Level;Design Autonation;Physical Level;Robust Design}, 628 | doi={10.1109/SMACD.2012.6339409}, 629 | month={Sept}, 630 | } 631 | @inproceedings{GENOM-POF, 632 | title={{GENOM-POF: multi-objective evolutionary synthesis of analog ICs with corners validation}}, 633 | author={Louren{\c{c}}o, Nuno and Horta, Nuno}, 634 | booktitle={Proceedings of the 14th annual conference on Genetic and evolutionary computation}, 635 | pages={1119--1126}, 636 | year={2012}, 637 | organization={ACM} 638 | } 639 | @inproceedings{7927171, 640 | author={A. Canelas and R. Martins and R. Póvoa and N. Lourenço and N. Horta}, 641 | booktitle={{Design, Automation Test in Europe Conference Exhibition (DATE), 2017}}, 642 | title={Efficient yield optimization method using a variable K-Means algorithm for analog IC sizing}, 643 | year={2017}, 644 | pages={1201-1206}, 645 | keywords={Clustering algorithms;Integrated circuit modeling;Monte Carlo methods;Optical fibers;Optimization;Yield estimation;Analog Integrated Circuits;Clustering;Electronic Design Automation;K-Means;Monte Carlo Simulations;Robust Design;Yield Optimization}, 646 | doi={10.23919/DATE.2017.7927171}, 647 | month={March},} 648 | 649 | @inproceedings{LiuBo_MOEAD, 650 | author={B. Liu and F. V. Fernández and Q. Zhang and M. Pak and S. Sipahi and G. 
Gielen}, 651 | booktitle={IEEE Congress on Evolutionary Computation}, 652 | title={{An enhanced MOEA/D-DE and its application to multiobjective analog cell sizing}}, 653 | year={2010}, 654 | pages={1-7}, 655 | keywords={Pareto optimisation;differential equations;evolutionary computation;search problems;Pareto optimal set;decomposition;differential evolution;multiobjective analog cell sizing;multiobjective evolutionary algorithm;multiobjective problem;replacement mechanism;scalar optimization sub-problem;search engine;Approximation algorithms;Approximation methods;Benchmark testing;Evolutionary computation;Maintenance engineering;Optimization;Search engines}, 656 | doi={10.1109/CEC.2010.5585957}, 657 | ISSN={1089-778X}, 658 | month={July},} 659 | @article{fakhfakh2010analog, 660 | title={Analog circuit design optimization through the particle swarm optimization technique}, 661 | author={Fakhfakh, Mourad and Cooren, Yann and Sallem, Amin and Loulou, Mourad and Siarry, Patrick}, 662 | journal={Analog Integrated Circuits and Signal Processing}, 663 | volume={63}, 664 | number={1}, 665 | pages={71--82}, 666 | year={2010}, 667 | publisher={Springer} 668 | } 669 | @inproceedings{mojito_nsgaii_palmers2009massively, 670 | title={Massively multi-topology sizing of analog integrated circuits}, 671 | author={Palmers, Pieter and McConaghy, Trent and Steyaert, Michiel and Gielen, Georges}, 672 | booktitle={Design, Automation \& Test in Europe Conference \& Exhibition, 2009. DATE'09.}, 673 | pages={706--711}, 674 | year={2009}, 675 | organization={IEEE} 676 | } 677 | @inproceedings{mojito_nsgaii_mcconaghy2007simultaneous, 678 | title={Simultaneous multi-topology multi-objective sizing across thousands of analog circuit topologies}, 679 | author={McConaghy, Trent and Palmers, Pieter and Gielen, Georges and Steyaert, Michiel}, 680 | booktitle={Proceedings of the 44th annual Design Automation Conference}, 681 | pages={944--947}, 682 | year={2007}, 683 | organization={ACM} 684 | } 685 | @inproceedings{nsgaii_moead_guerra2009optimizing, 686 | title={Optimizing current conveyors by evolutionary algorithms including differential evolution}, 687 | author={Guerra-G{\'o}mez, Ivick and Tlelo-Cuautle, Esteban and McConaghy, Trent and Gielen, G}, 688 | booktitle={Electronics, Circuits, and Systems, 2009. ICECS 2009. 
16th IEEE International Conference on}, 689 | pages={259--262}, 690 | year={2009}, 691 | organization={IEEE} 692 | } 693 | @article{cheng2012performance, 694 | title={On the performance metrics of multiobjective optimization}, 695 | author={Cheng, Shi and Shi, Yuhui and Qin, Quande}, 696 | journal={Advances in Swarm Intelligence}, 697 | pages={504--512}, 698 | year={2012}, 699 | publisher={Springer} 700 | } 701 | @inproceedings{auger2009theory, 702 | title={Theory of the hypervolume indicator: optimal $\mu$-distributions and the choice of the reference point}, 703 | author={Auger, Anne and Bader, Johannes and Brockhoff, Dimo and Zitzler, Eckart}, 704 | booktitle={Proceedings of the tenth ACM SIGEVO workshop on Foundations of genetic algorithms}, 705 | pages={87--102}, 706 | year={2009}, 707 | organization={ACM} 708 | } 709 | @article{lyu2017efficient, 710 | title={An Efficient Bayesian Optimization Approach for Automated Optimization of Analog Circuits}, 711 | author={Lyu, Wenlong and Xue, Pan and Yang, Fan and Yan, Changhao and Hong, Zhiliang and Zeng, Xuan and Zhou, Dian}, 712 | journal={IEEE Transactions on Circuits and Systems I: Regular Papers}, 713 | year={2017}, 714 | publisher={IEEE} 715 | } 716 | @article{wang2017efficient, 717 | title={Efficient Yield Optimization for Analog and SRAM Circuits via Gaussian Process Regression and Adaptive Yield Estimation}, 718 | author={Wang, Mengshuo and Lv, Wenlong and Yang, Fan and Yan, Changhao and Cai, Wei and Zhou, Dian and Zeng, Xuan}, 719 | journal={IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems}, 720 | year={2017}, 721 | publisher={IEEE} 722 | } 723 | @inproceedings{azimi2010batch, 724 | title={Batch bayesian optimization via simulation matching}, 725 | author={Azimi, Javad and Fern, Alan and Fern, Xiaoli Z}, 726 | booktitle={Advances in Neural Information Processing Systems}, 727 | pages={109--117}, 728 | year={2010} 729 | } 730 | @article{desautels2014parallelizing, 731 | title={Parallelizing exploration-exploitation tradeoffs in gaussian process bandit optimization}, 732 | author={Desautels, Thomas and Krause, Andreas and Burdick, Joel W}, 733 | journal={The Journal of Machine Learning Research}, 734 | volume={15}, 735 | number={1}, 736 | pages={3873--3923}, 737 | year={2014}, 738 | publisher={JMLR. 
org} 739 | } 740 | @inproceedings{contal2013parallel, 741 | title={Parallel gaussian process optimization with upper confidence bound and pure exploration}, 742 | author={Contal, Emile and Buffoni, David and Robicquet, Alexandre and Vayatis, Nicolas}, 743 | booktitle={Joint European Conference on Machine Learning and Knowledge Discovery in Databases}, 744 | pages={225--240}, 745 | year={2013}, 746 | organization={Springer} 747 | } 748 | @inproceedings{gonzalez2016batch, 749 | title={Batch bayesian optimization via local penalization}, 750 | author={Gonz{\'a}lez, Javier and Dai, Zhenwen and Hennig, Philipp and Lawrence, Neil}, 751 | booktitle={Artificial Intelligence and Statistics}, 752 | pages={648--657}, 753 | year={2016} 754 | } 755 | @inproceedings{shah2015parallel, 756 | title={Parallel predictive entropy search for batch global optimization of expensive objective functions}, 757 | author={Shah, Amar and Ghahramani, Zoubin}, 758 | booktitle={Advances in Neural Information Processing Systems}, 759 | pages={3330--3338}, 760 | year={2015} 761 | } 762 | @inproceedings{wu2016parallel, 763 | title={The parallel knowledge gradient method for batch bayesian optimization}, 764 | author={Wu, Jian and Frazier, Peter}, 765 | booktitle={Advances in Neural Information Processing Systems}, 766 | pages={3126--3134}, 767 | year={2016} 768 | } 769 | @article{brochu2010tutorial, 770 | title={A tutorial on Bayesian optimization of expensive cost functions, with application to active user modeling and hierarchical reinforcement learning}, 771 | author={Brochu, Eric and Cora, Vlad M and De Freitas, Nando}, 772 | journal={arXiv preprint arXiv:1012.2599}, 773 | year={2010} 774 | } 775 | @InProceedings{demo, 776 | author="Robi{\v{c}}, Tea 777 | and Filipi{\v{c}}, Bogdan", 778 | editor="Coello Coello, Carlos A. 779 | and Hern{\'a}ndez Aguirre, Arturo 780 | and Zitzler, Eckart", 781 | title="DEMO: Differential Evolution for Multiobjective Optimization", 782 | booktitle="Evolutionary Multi-Criterion Optimization", 783 | year="2005", 784 | publisher="Springer Berlin Heidelberg", 785 | address="Berlin, Heidelberg", 786 | pages="520--533", 787 | abstract="Differential Evolution (DE) is a simple but powerful evolutionary optimization algorithm with many successful applications. In this paper we propose Differential Evolution for Multiobjective Optimization (DEMO) -- a new approach to multiobjective optimization based on DE. DEMO combines the advantages of DE with the mechanisms of Pareto-based ranking and crowding distance sorting, used by state-of-the-art evolutionary algorithms for multiobjective optimization. DEMO is implemented in three variants that achieve competitive results on five ZDT test problems.", 788 | isbn="978-3-540-31880-4" 789 | } 790 | @article{scott2011correlated, 791 | title={The correlated knowledge gradient for simulation optimization of continuous parameters using gaussian process regression}, 792 | author={Scott, Warren and Frazier, Peter and Powell, Warren}, 793 | journal={SIAM Journal on Optimization}, 794 | volume={21}, 795 | number={3}, 796 | pages={996--1026}, 797 | year={2011}, 798 | publisher={SIAM} 799 | } 800 | @article{dixon1978global, 801 | title={The Global Optimization Problem. 
An Introduction}, 802 | author={Dixon, LCW}, 803 | journal={Toward global optimization}, 804 | volume={2}, 805 | pages={1--15}, 806 | year={1978}, 807 | publisher={North-Holland} 808 | } 809 | @inproceedings{qEI, 810 | title={Fast computation of the multi-points expected improvement with applications in batch selection}, 811 | author={Chevalier, Cl{\'e}ment and Ginsbourger, David}, 812 | booktitle={International Conference on Learning and Intelligent Optimization}, 813 | pages={59--69}, 814 | year={2013}, 815 | organization={Springer} 816 | } 817 | @article{wang2017max, 818 | title={Max-value entropy search for efficient Bayesian optimization}, 819 | author={Wang, Zi and Jegelka, Stefanie}, 820 | journal={arXiv preprint arXiv:1703.01968}, 821 | year={2017} 822 | } 823 | @article{einstein, 824 | author = "Albert Einstein", 825 | title = "{Zur Elektrodynamik bewegter K{\"o}rper}. ({German}) 826 | [{On} the electrodynamics of moving bodies]", 827 | journal = "Annalen der Physik", 828 | volume = "322", 829 | number = "10", 830 | pages = "891--921", 831 | year = "1905", 832 | DOI = "http://dx.doi.org/10.1002/andp.19053221004" 833 | } 834 | @inproceedings{lakshminarayanan2017simple, 835 | title={Simple and scalable predictive uncertainty estimation using deep ensembles}, 836 | author={Lakshminarayanan, Balaji and Pritzel, Alexander and Blundell, Charles}, 837 | booktitle={Advances in Neural Information Processing Systems}, 838 | pages={6405--6416}, 839 | year={2017} 840 | } 841 | @article{lazaro2010marginalized, 842 | title={Marginalized neural network mixtures for large-scale regression}, 843 | author={L{\'a}zaro-Gredilla, Miguel and Figueiras-Vidal, An{\'\i}bal R}, 844 | journal={IEEE transactions on neural networks}, 845 | volume={21}, 846 | number={8}, 847 | pages={1345--1351}, 848 | year={2010}, 849 | publisher={IEEE} 850 | } 851 | @inproceedings{huang2015scalable, 852 | title={Scalable Gaussian Process Regression Using Deep Neural Networks.}, 853 | author={Huang, Wen-bing and Zhao, Deli and Sun, Fuchun and Liu, Huaping and Chang, Edward Y}, 854 | booktitle={IJCAI}, 855 | pages={3576--3582}, 856 | year={2015} 857 | } 858 | @inproceedings{gal2016dropout, 859 | title={Dropout as a Bayesian approximation: Representing model uncertainty in deep learning}, 860 | author={Gal, Yarin and Ghahramani, Zoubin}, 861 | booktitle={international conference on machine learning}, 862 | pages={1050--1059}, 863 | year={2016} 864 | } 865 | @inproceedings{hernandez2015probabilistic, 866 | title={Probabilistic backpropagation for scalable learning of bayesian neural networks}, 867 | author={Hern{\'a}ndez-Lobato, Jos{\'e} Miguel and Adams, Ryan}, 868 | booktitle={International Conference on Machine Learning}, 869 | pages={1861--1869}, 870 | year={2015} 871 | } 872 | @article{ruder2017overview, 873 | title={An overview of multi-task learning in deep neural networks}, 874 | author={Ruder, Sebastian}, 875 | journal={arXiv preprint arXiv:1706.05098}, 876 | year={2017} 877 | } 878 | @inproceedings{maclaurin2015autograd, 879 | title={Autograd: Effortless gradients in numpy}, 880 | author={Maclaurin, Dougal and Duvenaud, David and Adams, Ryan P}, 881 | booktitle={ICML 2015 AutoML Workshop}, 882 | year={2015} 883 | } 884 | @article{spyromitros2016multi, 885 | title={Multi-target regression via input space expansion: treating targets as inputs}, 886 | author={Spyromitros-Xioufis, Eleftherios and Tsoumakas, Grigorios and Groves, William and Vlahavas, Ioannis}, 887 | journal={Machine Learning}, 888 | volume={104}, 889 | 
number={1}, 890 | pages={55--98}, 891 | year={2016}, 892 | publisher={Springer} 893 | } 894 | @article{tsanas2012accurate, 895 | title={Accurate quantitative estimation of energy performance of residential buildings using statistical machine learning tools}, 896 | author={Tsanas, Athanasios and Xifara, Angeliki}, 897 | journal={Energy and Buildings}, 898 | volume={49}, 899 | pages={560--567}, 900 | year={2012}, 901 | publisher={Elsevier} 902 | } 903 | @article{rasmussen2010gaussian, 904 | title={Gaussian processes for machine learning ({GPML}) toolbox}, 905 | author={Rasmussen, Carl Edward and Nickisch, Hannes}, 906 | journal={Journal of Machine Learning Research}, 907 | volume={11}, 908 | number={Nov}, 909 | pages={3011--3015}, 910 | year={2010} 911 | } 912 | @inproceedings{alvarez2009sparse, 913 | title={Sparse convolved Gaussian processes for multi-output regression}, 914 | author={Alvarez, Mauricio and Lawrence, Neil D}, 915 | booktitle={Advances in neural information processing systems}, 916 | pages={57--64}, 917 | year={2009} 918 | } 919 | @article{alvarez2011computationally, 920 | title={Computationally efficient convolved multiple output gaussian processes}, 921 | author={{\'A}lvarez, Mauricio A and Lawrence, Neil D}, 922 | journal={Journal of Machine Learning Research}, 923 | volume={12}, 924 | number={May}, 925 | pages={1459--1500}, 926 | year={2011} 927 | } 928 | @inproceedings{nguyen2014collaborative, 929 | title={Collaborative Multi-output Gaussian Processes.}, 930 | author={Nguyen, Trung V and Bonilla, Edwin V and others}, 931 | booktitle={UAI}, 932 | pages={643--652}, 933 | year={2014} 934 | } 935 | @inproceedings{nguyen2013efficient, 936 | title={Efficient variational inference for Gaussian process regression networks}, 937 | author={Nguyen, Trung and Bonilla, Edwin}, 938 | booktitle={Artificial Intelligence and Statistics}, 939 | pages={472--480}, 940 | year={2013} 941 | } 942 | @incollection{NIPS2015_5665, 943 | title = {Scalable Inference for Gaussian Process Models with Black-Box Likelihoods}, 944 | author = {Dezfouli, Amir and Bonilla, Edwin V}, 945 | booktitle = {Advances in Neural Information Processing Systems 28}, 946 | editor = {C. Cortes and N. D. Lawrence and D. D. Lee and M. Sugiyama and R. 
Garnett}, 947 | pages = {1414--1422}, 948 | year = {2015}, 949 | publisher = {Curran Associates, Inc.}, 950 | url = {http://papers.nips.cc/paper/5665-scalable-inference-for-gaussian-process-models-with-black-box-likelihoods.pdf} 951 | } 952 | @inproceedings{bonilla2008multi, 953 | title={Multi-task Gaussian process prediction}, 954 | author={Bonilla, Edwin V and Chai, Kian M and Williams, Christopher}, 955 | booktitle={Advances in neural information processing systems}, 956 | pages={153--160}, 957 | year={2008} 958 | } 959 | @article{vectorvaluedkernel, 960 | title={Kernels for vector-valued functions: A review}, 961 | author={Alvarez, Mauricio A and Rosasco, Lorenzo and Lawrence, Neil D and others}, 962 | journal={Foundations and Trends{\textregistered} in Machine Learning}, 963 | volume={4}, 964 | number={3}, 965 | pages={195--266}, 966 | year={2012}, 967 | publisher={Now Publishers, Inc.} 968 | } 969 | @article{liu2018remarks, 970 | title={Remarks on Multi-Output Gaussian Process Regression}, 971 | author={Liu, Haitao and Cai, Jianfei and Ong, Yew-Soon}, 972 | journal={Knowledge-Based Systems}, 973 | year={2018}, 974 | publisher={Elsevier} 975 | } 976 | @inproceedings{yu2007yield, 977 | title={Yield-aware analog integrated circuit optimization using geostatistics motivated performance modeling}, 978 | author={Yu, Guo and Li, Peng}, 979 | booktitle={Proceedings of the 2007 IEEE/ACM international conference on Computer-aided design}, 980 | pages={464--469}, 981 | year={2007}, 982 | organization={IEEE Press} 983 | } 984 | @article{zhang2017survey, 985 | title={A survey on multi-task learning}, 986 | author={Zhang, Yu and Yang, Qiang}, 987 | journal={arXiv preprint arXiv:1707.08114}, 988 | year={2017} 989 | } 990 | @inproceedings{snoek2015scalable, 991 | title={Scalable bayesian optimization using deep neural networks}, 992 | author={Snoek, Jasper and Rippel, Oren and Swersky, Kevin and Kiros, Ryan and Satish, Nadathur and Sundaram, Narayanan and Patwary, Mostofa and Prabhat, Mr and Adams, Ryan}, 993 | booktitle={International conference on machine learning}, 994 | pages={2171--2180}, 995 | year={2015} 996 | } 997 | @book{journel1978mining, 998 | title={Mining geostatistics}, 999 | author={Journel, Andre G and Huijbregts, Ch J}, 1000 | year={1978}, 1001 | publisher={Academic press} 1002 | } 1003 | @inproceedings{boyle2005dependent, 1004 | title={Dependent gaussian processes}, 1005 | author={Boyle, Phillip and Frean, Marcus}, 1006 | booktitle={Advances in neural information processing systems}, 1007 | pages={217--224}, 1008 | year={2005} 1009 | } 1010 | 1011 | @inproceedings{wilson2012gaussian, 1012 | title={Gaussian process regression networks}, 1013 | author={Wilson, Andrew Gordon and Knowles, David A and Ghahramani, Zoubin}, 1014 | booktitle={Proceedings of the 29th International Conference on Machine Learning}, 1015 | pages={1139--1146}, 1016 | year={2012}, 1017 | organization={Omnipress} 1018 | } 1019 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import DeepSparseKernel as dsk 2 | from DeepSparseKernel import np  # autograd.numpy, re-exported by DeepSparseKernel 3 | import matplotlib.pyplot as plt 4 | 5 | train_x = np.loadtxt('train_x') 6 | train_y = np.loadtxt('train_y') 7 | train_y = 
train_y.reshape(1, train_y.size) 8 | 9 | test_x = np.loadtxt('test_x') 10 | test_y = np.loadtxt('test_y') 11 | test_y = test_y.reshape(1, test_y.size) 12 | 13 | num_train = train_x.shape[0] 14 | num_test = test_x.shape[0] 15 | dim = train_x.size // num_train  # input dimensionality 16 | train_x = train_x.reshape(num_train, dim).T  # store inputs as (dim, num_data), the layout the network expects 17 | test_x = test_x.reshape(num_test, dim).T 18 | 19 | print(dim) 20 | print(train_x.shape) 21 | print(test_x.shape) 22 | 23 | layer_sizes = [50, 50, 50, 50]  # four hidden layers, 50 units each 24 | activations = [dsk.relu, dsk.tanh, dsk.relu, dsk.tanh] 25 | scale = 0.1  # scale of the random initial hyper-parameters 26 | 27 | dim = train_x.shape[0]  # unchanged by the transpose; kept for clarity 28 | 29 | gp = dsk.DSK_GP(train_x, train_y, layer_sizes, activations, bfgs_iter=200, l1=0.0, l2=0.0, debug=True) 30 | theta = gp.rand_theta(scale=scale) 31 | gp.fit(theta) 32 | py, ps2 = gp.predict(test_x)  # predictive mean and variance on the test set 33 | py_train, ps2_train = gp.predict(train_x) 34 | 35 | 36 | log_lscales = gp.theta[2:2+dim]  # per-dimension log length-scales 37 | Phi_train = gp.calc_Phi(gp.theta[2+dim:], dsk.scale_x(train_x, log_lscales))  # feature map Phi on the scaled training inputs 38 | Phi_test = gp.calc_Phi(gp.theta[2+dim:], dsk.scale_x(test_x, log_lscales)) 39 | 40 | np.savetxt('pred_y', py) 41 | np.savetxt('pred_s2', ps2) 42 | np.savetxt('theta', gp.theta) 43 | np.savetxt('Phi_train', Phi_train) 44 | np.savetxt('Phi_test', Phi_test) 45 | 46 | # plt.plot(test_y.reshape(test_y.size), py.reshape(py.size), 'r.', train_y.reshape(train_y.size), py_train.reshape(py_train.size), 'b.') 47 | # plt.show() 48 | 49 | gp.debug = True 50 | print(gp.log_likelihood(gp.theta)) 51 | np.savetxt('loss', np.atleast_1d(gp.log_likelihood(gp.theta)))  # wrap the scalar so savetxt receives a 1-D array 52 | --------------------------------------------------------------------------------
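run.py above writes its predictions to plain-text files (pred_y, pred_s2, theta, Phi_train, Phi_test). A minimal sketch of how one might sanity-check those outputs is given below; it is not a file in this repository, and it assumes pred_y and pred_s2 hold the predictive mean and variance for test_x in the same order as the rows of test_y.

# Illustrative check only -- not part of the repository.
# Assumes pred_y / pred_s2 are the predictive mean / variance aligned with test_y.
import numpy as np

test_y  = np.loadtxt('test_y').ravel()
pred_y  = np.loadtxt('pred_y').ravel()
pred_s2 = np.loadtxt('pred_s2').ravel()

# root mean squared error of the predictive mean
rmse = np.sqrt(np.mean((pred_y - test_y) ** 2))
# average Gaussian negative log likelihood under N(pred_y, pred_s2)
nll = 0.5 * np.mean(np.log(2.0 * np.pi * pred_s2) + (test_y - pred_y) ** 2 / pred_s2)

print('test RMSE: %g' % rmse)
print('test NLL : %g' % nll)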