├── .gitignore
├── LICENSE
├── Nested Deep GPs.ipynb
├── README.md
├── choleskies.py
├── coldeep.py
├── layers.py
├── plotting.py
├── special_einsum.py
└── step_fn_demo.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
#
*#
*~
.#*
.ipynb_checkpoints

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2015, James Hensman
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of deepGPy nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# deepGPy
Deep GPs with GPy

http://arxiv.org/abs/1412.1370

### Requires
- GPy (development branch)
- openMP

### How to run on OS X (10.9+) ###
deepGPy requires openMP, which OS X does not support by default.
To get openMP support on OS X, you can use clang-omp (an implementation of the OpenMP C/C++ language extensions in the Clang/LLVM compiler). There are two ways:

a) Build from source. To download and build your own clang-omp, see: http://clang-omp.github.io/

b) Use the Homebrew version:
1. brew update
2. brew install clang-omp

After installing clang-omp, copy omp.h from the clang-omp installation path into your python environment, e.g.:

    cp /usr/local/Cellar/libiomp/20150701/include/libiomp/omp.h /Users/<username>/anaconda2/include/.

Finally, leave 'extra_link_args' and 'libraries' empty in special_einsum.py, as follows:

    'extra_link_args': [],
    'libraries': []
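For reference, the relevant block in special_einsum.py might then look something like this. This is a hypothetical sketch: only 'extra_link_args' and 'libraries' are quoted from the instructions above; the other keys and values are standard scipy.weave compile options, assumed rather than taken from the file itself:

    # hypothetical sketch of the weave compile settings in special_einsum.py;
    # 'extra_compile_args' is an assumption, not quoted from the repo
    weave_kwargs = {
        'extra_compile_args': ['-fopenmp', '-O3'],  # assumed flags for an openMP build
        'extra_link_args': [],                      # left empty on OS X, as above
        'libraries': [],                            # left empty on OS X, as above
    }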
--------------------------------------------------------------------------------
/choleskies.py:
--------------------------------------------------------------------------------
# Copyright James Hensman and Max Zwiessele 2014
# Licensed under the GNU GPL version 3.0

import numpy as np
from scipy import weave
try:
    from scipy.linalg.lapack import dpotri
except ImportError:
    from scipy import linalg
    dpotri = linalg.lapack.clapack.dpotri

import GPy

def safe_root(N):
    i = np.sqrt(N)
    j = int(i)
    if i != j:
        raise ValueError("N is not square!")
    return j

def flat_to_triang(flat):
    """take a matrix N x D and return an M x M x D array, where

    N = M(M+1)/2

    the lower triangular portion of the d'th slice of the result is filled by the d'th column of flat.
    """
    N, D = flat.shape
    M = (-1 + safe_root(8*N+1))/2
    ret = np.zeros((M, M, D))
    flat = np.ascontiguousarray(flat)

    code = """
    int count = 0;
    for(int m=0; milk', Ki, LL)
        self._loglik = np.sum([np.sum(np.log(np.abs(np.diag(LL[:,:,i])))) for i in range(self.L.shape[-1])])

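The listing of choleskies.py is truncated above: the weave kernel of flat_to_triang and everything up to the file's final lines are missing from this dump. As an illustration of the packing that the flat_to_triang docstring describes, here is a minimal pure-NumPy sketch of the round trip. It is a hypothetical stand-in, not the original weave code, and it assumes row-major ordering of the triangular entries:

    import numpy as np

    def flat_to_triang_np(flat):
        # (N, D) -> (M, M, D) with N = M(M+1)/2: the d'th column of `flat`
        # fills the lower triangle of the d'th slice of the result.
        N, D = flat.shape
        M = (int(round(np.sqrt(8 * N + 1))) - 1) // 2
        rows, cols = np.tril_indices(M)
        ret = np.zeros((M, M, D))
        ret[rows, cols, :] = flat
        return ret

    def triang_to_flat_np(L):
        # Inverse: gather the lower-triangular entries of each slice into (N, D).
        rows, cols = np.tril_indices(L.shape[0])
        return L[rows, cols, :]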
--------------------------------------------------------------------------------
/coldeep.py:
--------------------------------------------------------------------------------
import GPy
import numpy as np
from layers import ObservedLayer, InputLayer, HiddenLayer, InputLayerFixed
try:
    from mpi4py import MPI
except ImportError:
    print("mpi not found")
import sys
import plotting
from GPy.util.choleskies import indexes_to_fix_for_low_rank

class ColDeep(GPy.core.Model):
    def __init__(self, layers, name='deepgp'):
        super(ColDeep, self).__init__(name)
        self.layers = layers
        self.link_parameters(*layers)

    def parameters_changed(self):
        self.layers[0].feed_forward()

    def log_likelihood(self):
        return sum([l._log_marginal_contribution for l in self.layers])

    def predict_sampling(self, Xtest, Ns):
        """
        Push the Xtest points through the model, sampling at each layer.

        For every point in the matrix Xtest, we produce Ns samples.

        Returns a 3D array where the first dimension indexes the samples, the
        next indexes the points in Xtest and the last dimension indexes the
        output dimensions of the model.
        """
        XX = np.repeat(Xtest, Ns, axis=0)
        return self.layers[0].predict_forward_sampling(XX).reshape(Ns, Xtest.shape[0], -1, order='F')

    def posterior_sample(self, X):
        """
        Like predict_sampling, but draw a correlated sample at every layer.
        """
        return self.layers[0].predict_forward_sampling(X, correlated=True, noise_off=True)

    def log_density_sampling(self, X, Y, Ns):
        XX = np.repeat(X, Ns, axis=0)
        return self.layers[0].log_density_sampling(XX, Y)

    def predict_means(self, X):
        # predict the output layer from the input, throwing away the variance at each level
        return self.layers[0].predict_means(X)

    def get_natgrad(self):
        return np.hstack([np.hstack([l.dL_dEu.flatten(), l.dL_duuT.flatten()]) for l in self.layers])

    def set_vb_param(self, p):
        count = 0
        for l in self.layers:
            size = l.q_of_U_mean.size
            l.q_of_U_mean.param_array = p[count:count+size]
            count += size

            size = l.q_of_U_precision.size
            l.q_of_U_precision.param_array = p[count:count+size]
            count += size

    def plot(self, xlim=None, Nsamples=0):
        plotting.plot_deep(self, xlim, Nsamples)


class ColDeepStochastic(ColDeep):
    """
    A Deep GP that can be optimized by stochastic methods. Only the
    supervised case is considered, i.e. the top layer is an InputLayerFixed
    instance.
    """
    def __init__(self, layers, X, Y, name='deepgp'):
        ColDeep.__init__(self, layers, name)
        self.X, self.Y = X, Y  # keep a copy of X and Y here to select batches from
        self.set_batchsize(10)
        import climin.util
        self.slicer = climin.util.draw_mini_slices(self.X.shape[0], self._batchsize)

    def set_batchsize(self, n):
        """
        Set the batch size to n.
        """
        self._batchsize = int(n)
        N = float(self.X.shape[0])
        for l in self.layers:
            l.KL_scaling = self._batchsize/N

    def set_batch(self):
        """
        Select a random subset of the data, so that the next objective and
        gradient evaluations work on that minibatch.
        """
        index = next(self.slicer)
        Xbatch, Ybatch = self.X[index], self.Y[index]
        self.layers[0].X = Xbatch
        self.layers[-1].Y = Ybatch

    def stochastic_fprime(self, w):
        self.set_batch()
        return self._grads(w)
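# ----------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original file): stochastic_fprime
# above redraws a minibatch before every gradient evaluation, which is the
# hook a stochastic optimizer needs. One plausible way to drive it with climin
# -- the optimizer name and keyword arguments follow climin's documented API
# and are assumptions, not code from this repo:
#
#     import climin
#     m.set_batchsize(100)  # m: an already-constructed ColDeepStochastic
#     opt = climin.Adadelta(m.optimizer_array, m.stochastic_fprime,
#                           step_rate=0.2, momentum=0.9)
#     for info in opt:
#         if info['n_iter'] >= 5000:
#             break
# ----------------------------------------------------------------------------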

class ColDeepMPI(ColDeep):
    def __init__(self, layers, name, comm):
        self.mpi_comm = comm
        super(ColDeepMPI, self).__init__(layers, name)

    def synch(self, x):
        xx = np.ascontiguousarray(x)
        self.mpi_comm.Bcast([xx, MPI.DOUBLE], root=0)
        x[:] = xx

    def _grads(self, x):
        if self.mpi_comm.rank == 0:
            self.mpi_comm.Bcast([np.int32(3), MPI.INT])
        # synchronize across all mpi nodes
        self.synch(x)
        g = super(ColDeepMPI, self)._grads(x)
        g_all = g.copy()
        self.mpi_comm.Reduce([g, MPI.DOUBLE], [g_all, MPI.DOUBLE], root=0)
        return g_all

    def _objective(self, x):
        if self.mpi_comm.rank == 0:
            self.mpi_comm.Bcast([np.int32(2), MPI.INT])
        self.synch(x)
        o = super(ColDeepMPI, self)._objective(x)
        o_all = np.zeros(1)
        self.mpi_comm.Reduce([np.float64(o), MPI.DOUBLE], [o_all, MPI.DOUBLE], root=0)
        return o_all

    def _objective_grads(self, x):
        if self.mpi_comm.rank == 0:
            self.mpi_comm.Bcast([np.int32(1), MPI.INT])
        self.synch(x)
        o, g = super(ColDeepMPI, self)._objective_grads(x)
        g_all = g.copy()
        o_all = np.zeros(1)
        self.mpi_comm.Reduce([g, MPI.DOUBLE], [g_all, MPI.DOUBLE], root=0)
        self.mpi_comm.Reduce([np.float64(o), MPI.DOUBLE], [o_all, MPI.DOUBLE], root=0)
        return o_all, g_all

    def optimize(self, *a, **kw):
        if self.mpi_comm.rank == 0:
            super(ColDeepMPI, self).optimize(*a, **kw)
            self.mpi_comm.Bcast([np.int32(-1), MPI.INT])  # after optimization, tell all the mpi processes to exit
        else:
            x = self.optimizer_array.copy()
            while True:
                flag = np.zeros(1, dtype=np.int32)
                self.mpi_comm.Bcast(flag, root=0)
                if flag == 1:
                    self._objective_grads(x)
                elif flag == 2:
                    self._objective(x)
                elif flag == 3:
                    self._grads(x)
                elif flag == -1:
                    break
                else:
                    raise ValueError("bad integer broadcast")

# TODO: move this helper to its own utility file
def divide_data(datanum, comm):
    residue = datanum % comm.size
    datanum_list = np.empty((comm.size), dtype=np.int32)
    for i in xrange(comm.size):
        if i
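The listing of divide_data is cut off above. Judging from the residue computation, it splits datanum data points across comm.size MPI ranks as evenly as possible, with the first residue ranks taking one extra point. A hypothetical, MPI-free sketch of that split (a guess at the intent, not the missing body):

    import numpy as np

    def divide_data_sketch(datanum, size):
        # Split `datanum` points over `size` workers; the first
        # (datanum % size) workers each take one extra point.
        residue = datanum % size
        counts = np.full(size, datanum // size, dtype=np.int32)
        counts[:residue] += 1
        offsets = np.concatenate(([0], np.cumsum(counts)[:-1]))
        return counts, offsets

    # e.g. divide_data_sketch(10, 3) -> (array([4, 3, 3]), array([0, 4, 7]))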