├── README.md └── Hierarchical Mixture of Experts ├── helpers.py ├── label_binariser.py ├── weighted_gda.py ├── weighted_lin_reg.py ├── softmax_reg.py ├── general_hme.py └── nodes_hme.py /README.md: -------------------------------------------------------------------------------- 1 | # Mixture-of-Experts-Models 2 | 3 | Future work: 4 | - Add Mixture Density Neural Network using tensorflow 5 | - Rewrite HME with tensorflow 6 | 7 | 8 | ## Hierarchical Mixture of Experts 9 | 10 | Hierarchical mixture of experts can be used to solve standard [regression](https://github.com/AmazaspShumik/Mixture-of-Experts-Models/blob/master/Hierarchical%20Mixture%20of%20Experts/hme_standard_regression_examples.ipynb) and [classification](https://github.com/AmazaspShumik/Mixture-of-Experts-Models/blob/master/Hierarchical%20Mixture%20of%20Experts/hme_classification_examples.ipynb) problems; however, one of the main applications of HME is problems with [multimodal output](https://github.com/AmazaspShumik/Mixture-of-Experts-Models/blob/master/Hierarchical%20Mixture%20of%20Experts/hme_multimodal_output_examples.ipynb). 11 | 12 | 13 | 14 | 15 | 16 | 17 | [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/AmazaspShumik/mixture-models/trend.png)](https://bitdeli.com/free "Bitdeli Badge") 18 | 19 | -------------------------------------------------------------------------------- /Hierarchical Mixture of Experts/helpers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import random 5 | 6 | 7 | 8 | def train_test_split(x,y,test_p = 0.25): 9 | ''' 10 | Divides data set into train and test data sets 11 | 12 | Parameters: 13 | ---------- 14 | 15 | x: numpy array of size 'n x k' (k can be 1 and above) 16 | Exogenous variables 17 | 18 | y: numpy array of size 'n x m' (m can be 1 and above) 19 | Endogenous variables 20 | 21 | test_p: float 22 | Proportion of data that should go to testing set 23 | 24 | Returns: 25 | -------- 26 | 27 | [x_train,x_test,y_train,y_test]: list of size 4 28 | Training and testing parts of x, followed by training and testing parts of y 29 | 30 | ''' 31 | n = x.shape[0] 32 | sample_index = random.sample(range(n), int(n*test_p)) 33 | train_index = [e for e in range(n) if e not in set(sample_index)] 34 | 35 | if len(x.shape) > 1: 36 | x_test = x[sample_index,:] 37 | x_train = x[train_index,:] 38 | 39 | if len(y.shape) > 1: 40 | y_test = y[sample_index,:] 41 | y_train = y[train_index,:] 42 | 43 | if len(x.shape) == 1: 44 | x_test = x[sample_index] 45 | x_train = x[train_index] 46 | 47 | if len(y.shape) == 1: 48 | y_test = y[sample_index] 49 | y_train = y[train_index] 50 | 51 | return [x_train,x_test,y_train,y_test] 52 | 53 | 54 | 55 | 56 | def bounded_variable(x,lo,hi=None): 57 | ''' 58 | Bounds variable from below and above, prevents underflow and overflow 59 | 60 | Parameters: 61 | ----------- 62 | 63 | x: numpy array of size 'n x k' (k can be 1) 64 | input vector 65 | 66 | lo: float 67 | Lower bound 68 | 69 | hi: float, optional 70 | Upper bound 71 | 72 | Returns: 73 | -------- 74 | : numpy array of size 'n x k' 75 | 76 | ''' 77 | def _bounded_vector(z,lo,hi): 78 | if hi is not None: 79 | z[ z > hi] = hi 80 | z[ z < lo] = lo 81 | return z 82 | if len(np.shape(x)) > 1: 83 | for i in range(np.shape(x)[1]): 84 | 85 | x[:,i] = _bounded_vector(x[:,i],lo,hi) 86 | return x 87 | return _bounded_vector(x,lo,hi) 88 | 89 | 90 | 91 | class NodeNotFoundError(LookupError): 92 | ''' 93 | Error raised in case node is not found 94 | ''' 95 | 96 | def __init__(self,n_pos,n_type, message):
97 | m = "Node with index {0} of type {1} {2}" 98 | self.message = m.format(n_pos,n_type,message) 99 | 100 | def __str__(self): 101 | return self.message 102 | 103 | class NodeModelNotImplemented(NotImplementedError): 104 | ''' 105 | Error raised in case model is not implemented for node 106 | ''' 107 | 108 | def __init__(self,model_name,n_type): 109 | m = "Model {0} is not implemented for node type {1}" 110 | self.message = m.format(model_name, n_type) 111 | 112 | def __str__(self): 113 | return self.message 114 | -------------------------------------------------------------------------------- /Hierarchical Mixture of Experts/label_binariser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import numpy as np 5 | from scipy.sparse import csr_matrix 6 | 7 | class ClassificationTargetError(Exception): 8 | ''' 9 | Exception raised in case of mismatch between number of expected and 10 | observed classes in target vector (for classification problem) 11 | ''' 12 | 13 | def __init__(self, expected, observed): 14 | self.e = expected 15 | self.o = observed 16 | 17 | def __str__(self): 18 | s = 'Mismatch in number of classes, expected - {0} , observed - {1}'.format(self.e,self.o) 19 | return s 20 | 21 | 22 | 23 | class LabelBinariser(object): 24 | 25 | ''' 26 | Binarize labels in a one-vs-all fashion. 27 | 28 | Allows easy transformation of a vector of classification targets into a ground 29 | truth matrix and back (inverse transformation). 30 | 31 | n = n_samples , k = n_classes 32 | 33 | Parameters: 34 | ------------ 35 | 36 | Y: numpy array of size 'n_samples x 1' 37 | Target variables, vector of classes in classification problem 38 | 39 | k: int 40 | Number of classes 41 | 42 | ''' 43 | 44 | def __init__(self,Y,k): 45 | 46 | self.Y = Y 47 | self.n = np.shape(Y)[0] 48 | self.k = k 49 | # mapping between set of integers to set of classes 50 | classes = set(Y) 51 | if len(classes) != k: 52 | raise ClassificationTargetError(k,len(classes)) 53 | self.direct_mapping = {} 54 | self.inverse_mapping = {} 55 | for i,el in enumerate(sorted(list(classes))): 56 | self.direct_mapping[el] = i 57 | self.inverse_mapping[i] = el 58 | 59 | 60 | def convert_vec_to_binary_matrix(self,Y_raw = None, compress = False): 61 | ''' 62 | Converts vector to ground truth matrix 63 | 64 | Parameters: 65 | ------------ 66 | Y_raw: numpy array of size 'n x 1', optional (uses Y given at initialisation if None) 67 | compress: bool 68 | If True will use csr_matrix to output compressed matrix 69 | 70 | Returns: 71 | -------- 72 | 73 | Y: numpy array of size 'n x k' 74 | Ground truth matrix , column number represents class index, 75 | each row has all zeros and only one 1. 76 | 77 | ''' 78 | if Y_raw is None: 79 | Y_raw = self.Y 80 | Y = np.zeros([np.shape(Y_raw)[0],self.k]) 81 | for el,idx in self.direct_mapping.items(): 82 | Y[Y_raw==el,idx] = 1 83 | if compress is True: 84 | return csr_matrix(Y) 85 | return Y 86 | 87 | 88 | def logistic_reg_direct_mapping(self, Y_raw = None): 89 | ''' 90 | Converts vector with two possible classes to vector of zeros and ones. 91 | 92 | Returns: 93 | -------- 94 | 95 | Y: numpy array of size 'n x 1' 96 | Vector of zeros and ones. (Mainly intended for logistic regression)
97 | 98 | ''' 99 | Y = np.zeros(self.n) 100 | el_one = self.inverse_mapping[1] 101 | if Y_raw is None: 102 | Y_raw = self.Y 103 | Y[Y_raw == el_one] = 1 104 | return Y 105 | 106 | 107 | def logistic_reg_inverse_mapping(self,Y): 108 | ''' 109 | Converts probabilities to original format 110 | 111 | Parameters: 112 | ----------- 113 | Y: numpy array of size [n_samples,1] 114 | Vector of zeros and ones (for example output of logistic regression) 115 | 116 | Returns: 117 | -------- 118 | 119 | Y: numpy array of size 'n x 1' 120 | Target estimates in original format. 121 | 122 | ''' 123 | Y[Y > 0.5] = 1 124 | Y[Y <= 0.5] = 0 125 | Y_out = np.zeros(self.n, dtype = self.Y.dtype) 126 | Y_out[Y==1] = self.inverse_mapping[1] 127 | Y_out[Y==0] = self.inverse_mapping[0] 128 | return Y_out 129 | 130 | 131 | def convert_binary_matrix_to_vec(self,B, compressed = False): 132 | ''' 133 | Converts ground truth matrix to vector of classification targets 134 | 135 | Parameters: 136 | ----------- 137 | compressed: bool 138 | If True input is csr_matrix, otherwise B is numpy array 139 | 140 | Returns: 141 | --------- 142 | 143 | Y: numpy array of size 'n x 1' 144 | Vector of targets, classes 145 | ''' 146 | if compressed is True: 147 | B = B.dot(np.eye(np.shape(B)[1])) 148 | Y = np.zeros(self.n, dtype = self.Y.dtype) 149 | for i in range(np.shape(B)[1]): 150 | Y[B[:,i]==1] = self.inverse_mapping[i] 151 | return Y 152 | 153 | 154 | def convert_prob_matrix_to_vec(self,Y): 155 | ''' 156 | Converts matrix of probabilities to vector of classification targets 157 | 158 | Parameters: 159 | ----------- 160 | Y: numpy array of size [n_samples,n_classes] 161 | Matrix of class probabilities, element at cell [i,j] shows probability 162 | that observation i belongs to class j 163 | 164 | Returns: 165 | -------- 166 | 167 | Y: numpy array of size 'n x 1' 168 | Vector of predicted classes in original format (for each observation, 169 | the class with highest probability). 170 | 171 | ''' 172 | Y_max = np.argmax(Y, axis = 1) 173 | Y = np.array([self.inverse_mapping[e] for e in Y_max]) 174 | return Y 175 | 176 | -------------------------------------------------------------------------------- /Hierarchical Mixture of Experts/weighted_gda.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | from scipy.stats import multivariate_normal as mvn 5 | from scipy.special import logsumexp 6 | 7 | class WeightedGaussianDiscriminantAnalysis(object): 8 | ''' 9 | Weighted Gaussian Discriminant Analysis 10 | 11 | A classifier with a linear decision boundary, obtained by fitting class 12 | conditional densities with Gaussians (sharing one pooled covariance matrix) 13 | and using Bayes rule to obtain the posterior distribution.
14 | ''' 15 | 16 | def __init__(self,stop_learning = 1e-3, bias_term = True): 17 | self.stop_learning = stop_learning 18 | self.bias_term = bias_term 19 | self.delta_param_norm = 0 20 | self.delta_log_like = 0 21 | self.means = None 22 | 23 | 24 | def init_params(self,m,k): 25 | ''' 26 | Initialises parameters 27 | 28 | Parameters: 29 | ----------- 30 | 31 | m: int 32 | Dimensionality of data 33 | 34 | k: int 35 | Number of classes 36 | 37 | ''' 38 | self.k = k 39 | self.cov = np.eye(m) 40 | self.means = np.random.random([m,k]) 41 | self.log_priors = -1*np.log(np.ones(k)*k) 42 | 43 | 44 | def _bias_term_pre_processing_X(self,X,bias_term): 45 | ''' 46 | Preprocesses X and adjusts for bias term 47 | 48 | Returns: 49 | -------- 50 | X: numpy array of size 'n x (m-1)' 51 | Design matrix without column of bias_term, which is expected to be 52 | last column 53 | ''' 54 | if bias_term is None: 55 | bias_term = self.bias_term 56 | if bias_term is True: 57 | return X[:,:-1] 58 | return X 59 | 60 | 61 | def fit(self,Y,X,weights = None, bias_term = None): 62 | ''' 63 | Finds parameters of weighted Gaussian discriminant analysis that maximise 64 | the weighted log-likelihood. 65 | 66 | Parameters: 67 | ----------- 68 | 69 | X: numpy array of size 'n x m' 70 | Explanatory variables 71 | 72 | Y: numpy array of size 'n x k' 73 | Ground truth matrix of one-hot encoded class labels 74 | 75 | weights: numpy array of size 'n x 1' 76 | Weighting for each observation 77 | 78 | bias_term: bool 79 | If True, matrix of explanatory variables already contains bias term, 80 | which should be discarded in estimation (expected that bias term is in last 81 | column of X matrix) 82 | 83 | ''' 84 | 85 | # preprocess X if it contains bias term 86 | X = self._bias_term_pre_processing_X(X,bias_term) 87 | 88 | n,m = np.shape(X) 89 | k = self.k 90 | 91 | if weights is None: 92 | weights = np.ones(n) 93 | weights_total = np.sum(weights) 94 | 95 | # Interestingly loop was faster than using outer product 96 | Y_w = (Y.T*weights).T 97 | 98 | # recovery in case of decrease in log-likelihood (NUMERICAL UNDERFLOW ISSUE IN DEEP 99 | # HIERARCHICAL MIXTURE OF EXPERTS) 100 | mean_recovery = self.means 101 | cov_recovery = self.cov 102 | prior_recovery = self.log_priors 103 | log_like_before = self.log_likelihood(X,Y_w,weights, weighted_Y = True, bias_term = False) 104 | 105 | # calculate log priors 106 | weighted_norm = np.sum(Y_w, axis = 0) 107 | self.log_priors = np.log(weighted_norm) - np.log(weights_total) 108 | 109 | # calculate weighted means of Gaussians for each class 110 | weighted_sum = np.dot(X.T*weights,Y) 111 | self.means = weighted_sum / weighted_norm 112 | 113 | # calculate pooled covariance matrix 114 | self.cov = np.zeros([m,m]) 115 | cov = np.zeros([m,m]) 116 | M = np.zeros([m,n]) 117 | for i in range(k): 118 | np.outer(self.means[:,i],np.ones(n), out = M) 119 | X_cent = (X - M.T) 120 | np.dot(X_cent.T*Y_w[:,i],X_cent, out = cov) 121 | self.cov += cov 122 | self.cov /= weights_total 123 | 124 | # check that log-likelihood did not drop (UNDERFLOW IN DEEP HMEs) 125 | # or increased by very little (prevents overfitting and long iteration 126 | # cycles) 127 | log_like_after = self.log_likelihood(X,Y_w,weights,bias_term = False, weighted_Y = True) 128 | delta_log_like = (log_like_after - log_like_before )/n 129 | if delta_log_like < self.stop_learning: 130 | self.means = mean_recovery 131 | self.cov = cov_recovery 132 | self.log_priors = prior_recovery 133 | delta_log_like = 0 134 | 135 | # saves changes in likelihood and
parameters in instance variables 136 | delta = self.means - mean_recovery 137 | self.delta_param_norm = np.sum(np.dot(delta.T,delta)) 138 | self.delta_log_like = delta_log_like 139 | 140 | 141 | def predict_probs(self,X, bias_term = None): 142 | ''' 143 | Calculates posterior probability of x belonging to any particular class 144 | 145 | Parameters: 146 | ----------- 147 | 148 | X: numpy array of size 'unknown x m' 149 | Expalanatory variables 150 | 151 | bias_term: bool 152 | If True , explanatory variables matrix contains bias_term (bias term should be 153 | in last column of design matrix) 154 | 155 | Returns: 156 | -------- 157 | 158 | prior_prob: numpy array of size 'unknown x k' 159 | Posterior probability that class belongs to particular probability 160 | 161 | ''' 162 | prior_prob = np.exp(self.predict_log_probs(X,bias_term)) 163 | return prior_prob 164 | 165 | 166 | def predict_log_probs(self,X,bias_term = None): 167 | ''' 168 | Calculates log of probabilities 169 | 170 | Parameters: 171 | ----------- 172 | 173 | X: numpy array of size 'unknown x m' 174 | Expalanatory variables 175 | 176 | bias_term: bool 177 | If True , explanatory variables matrix contains bias_term (bias term should be 178 | in last column of design matrix) 179 | 180 | Returns: 181 | -------- 182 | 183 | prior_prob: numpy array of size 'unknown x k' 184 | Posterior probability that class belongs to particular probability 185 | 186 | ''' 187 | X = self._bias_term_pre_processing_X(X,bias_term) 188 | n,m = np.shape(X) 189 | log_posterior = np.zeros([n,self.k]) 190 | for i in range(self.k): 191 | log_posterior[:,i] = mvn.logpdf(X,self.means[:,i], cov = self.cov) 192 | log_posterior[:,i] += self.log_priors[i] 193 | normaliser = logsumexp(log_posterior, axis = 1) 194 | posterior_log_prob = (log_posterior.T - normaliser).T 195 | return posterior_log_prob 196 | 197 | 198 | def log_likelihood(self, X, Y, weights = None, bias_term = None, weighted_Y = False): 199 | ''' 200 | Calculates log likelihood for weighted gaussian discriminant analysis 201 | 202 | Parameters: 203 | ----------- 204 | 205 | X: numpy array of size 'n x m' 206 | Explanatory variables 207 | 208 | Y: numpy array of size 'n x 1' 209 | Target variable can take only values 0 or 1 210 | 211 | weights: numpy array of size 'n x 1' 212 | Weights for observations 213 | 214 | k: int 215 | Number of classes 216 | 217 | bias_term: bool 218 | If True excludes bias term (which is expected to be in last column of X) 219 | 220 | weighted_Y: 221 | If True Y is already weighted (optimisation so that not recalculate Y*w) 222 | 223 | Returns: 224 | -------- 225 | 226 | log_like: float 227 | Log likelihood 228 | ''' 229 | X = self._bias_term_pre_processing_X(X,bias_term) 230 | n,m = np.shape(X) 231 | 232 | # default weights 233 | if weights is None: 234 | weights = np.ones(n) 235 | 236 | # log-likelihood 237 | log_posterior = np.zeros([n,self.k]) 238 | for i in range(self.k): 239 | log_posterior[:,i] = mvn.logpdf(X,self.means[:,i], cov = self.cov) 240 | log_posterior[:,i] += self.log_priors[i] 241 | if weighted_Y is False: 242 | Y = (Y.T*weights).T 243 | log_like = np.sum(Y*log_posterior) 244 | return log_like 245 | 246 | 247 | def posterior_log_probs(self,X,Y,bias_term = None): 248 | ''' 249 | Probability of observing Y given X and parameters 250 | ''' 251 | X = self._bias_term_pre_processing_X(X,bias_term) 252 | log_P = np.sum(Y*self.predict_log_probs(X,bias_term = False), axis = 1) 253 | return log_P 254 | 
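For reference, below is a minimal usage sketch of WeightedGaussianDiscriminantAnalysis together with LabelBinariser from label_binariser.py. The synthetic data, the module import paths and the variable names are illustrative assumptions, not part of the repository:

# usage sketch (illustrative only): weighted GDA on synthetic two-class data
import numpy as np
from label_binariser import LabelBinariser
from weighted_gda import WeightedGaussianDiscriminantAnalysis

n, m, k = 200, 2, 2
# two hypothetical well-separated Gaussian clusters
X = np.vstack([np.random.randn(n // 2, m) + 2.0, np.random.randn(n // 2, m) - 2.0])
y = np.array(['a'] * (n // 2) + ['b'] * (n // 2))

# one-hot ground truth matrix expected by the weighted GDA fit
lb = LabelBinariser(y, k)
Y = lb.convert_vec_to_binary_matrix()

# bias_term=False because X carries no bias column here
gda = WeightedGaussianDiscriminantAnalysis(bias_term=False)
gda.init_params(m, k)
gda.fit(Y, X, weights=np.ones(n), bias_term=False)

probs = gda.predict_probs(X, bias_term=False)    # posterior class probabilities
labels = lb.convert_prob_matrix_to_vec(probs)    # back to original class labels

Uniform weights are passed explicitly here; inside the HME the gate nodes supply the per-observation responsibilities instead.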
-------------------------------------------------------------------------------- /Hierarchical Mixture of Experts/weighted_lin_reg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Weighted Linear Regression , Expert in HME model 4 | 5 | m - dimensionality of input (i.e. length of row in matrix X) 6 | n - number of observations 7 | """ 8 | 9 | import numpy as np 10 | from scipy.stats import norm 11 | from scipy.linalg import solve_triangular 12 | 13 | #------------------------------------ Least Squares Solvers-------------------------------# 14 | 15 | def cholesky_solver_least_squares(part_one, part_two): 16 | ''' 17 | Solves least squares problem using cholesky decomposition 18 | 19 | Parameters: 20 | ----------- 21 | 22 | part_one: numpy array of size 'm x m', 23 | Equals X.T * X 24 | part_two: numpy array of size 'm x 1' 25 | Equals X.T * Y 26 | 27 | Returns: 28 | -------- 29 | Theta: numpy array of size 'm x 1' 30 | Vector of coefficients 31 | 32 | ''' 33 | # R*R.T*Theta = part_two 34 | R = np.linalg.cholesky(part_one) 35 | # R*Z = part_two 36 | Z = solve_triangular(R,part_two, check_finite = False, lower = True) 37 | # R.T*Theta = Z 38 | Theta = solve_triangular(R.T,Z, check_finite = False, lower = False) 39 | return Theta 40 | 41 | 42 | def qr_solver(Q,R,Y): 43 | ''' 44 | Solves least squares problem using qr decomposition. 45 | 46 | Parameters: 47 | ----------- 48 | 49 | Q: numpy array of size 'n x m' 50 | Matrix Q in QR decomposition (Matrix of orthonormal vectors) 51 | 52 | R: numpy array of size 'm x m' 53 | Matrix R in QR decomposition (Matrix of projection coefficients on 54 | orthonormal vectors) 55 | 56 | Y: numpy array of size ' n x 1' 57 | Vector of dependent variables 58 | 59 | Returns: 60 | ------- 61 | Theta: numpy array of size 'm x 1' 62 | Vector of parameters 63 | ''' 64 | qy = np.dot(Q.T,Y) 65 | Theta = solve_triangular(R,qy, check_finite = False, lower = False) 66 | return Theta 67 | 68 | 69 | def lstsq_wrapper(y,X): 70 | ''' 71 | Uses C++ Linear Algebra Package to calculate coefficients and residuals 72 | of regression. Is much faster than other methods, since it calls C++ functions. 
73 | 74 | Parameters: 75 | ----------- 76 | 77 | Y: numpy array of size 'n x 1' 78 | Vector of dependent variables 79 | 80 | X: numpy array of size 'n x m' 81 | Explanatory variables 82 | ''' 83 | theta,r,rank,s = np.linalg.lstsq(X,y) 84 | return theta 85 | 86 | 87 | #---------------------------------- 88 | 89 | def norm_pdf_log_pdf(theta,y,x,sigma_2): 90 | ''' 91 | Calculates probability of observing Y given Theta and sigma and explanatory 92 | variables 93 | 94 | Parameters: 95 | ---------- 96 | 97 | theta: numpy array of size 'm x 1', 98 | Vector of parameters 99 | y: numpy array of size 'n x 1' 100 | Vector of dependent variables 101 | x: numpy array of size 'n x m' 102 | Matrix of inputs 103 | sigma_2: float 104 | Variance of Gaussian noise 105 | 106 | Returns: 107 | ------- 108 | [log_pdf,prob]: list of two numpy arrays of size 'n x 1' 109 | Log-probability and probability of observing y given theta and X 110 | 111 | ''' 112 | u = y - np.dot(x,theta) 113 | log_normaliser = -1* np.log(np.sqrt(2*np.pi*sigma_2)) 114 | log_main = -u*u/(2*sigma_2) 115 | log_pdf = log_normaliser + log_main 116 | prob = np.exp(log_pdf) 117 | return [log_pdf,prob] 118 | 119 | 120 | 121 | #------------------------------------- Weighted Linear Regression-------------------------# 122 | 123 | class WeightedLinearRegression(object): 124 | ''' 125 | Weighted Linear Regression 126 | 127 | Parameters: 128 | ----------- 129 | 130 | solver: string (default = "lapack_solver") 131 | Numerical method to find weighted linear regression solution 132 | ("cholesky", "qr" or "lapack_solver") 133 | 134 | stop_learning: float (default = 1e-3) 135 | Threshold on likelihood improvement below which parameters are not updated 136 | ''' 137 | 138 | def __init__(self, solver = "lapack_solver", stop_learning = 1e-3): 139 | self.solver = solver 140 | self.theta = None 141 | self.var = 0 142 | self.stop_learning = stop_learning 143 | self.delta_param_norm = 0 144 | self.delta_log_like = 0 145 | 146 | 147 | def init_params(self,m): 148 | ''' 149 | Initialises weights and preallocates memory 150 | 151 | Parameters: 152 | ---------- 153 | m: int 154 | Number of parameters, should equal to dimensionality of data 155 | 156 | ''' 157 | self.theta = np.random.normal(0,1,m) 158 | self.var = 1 159 | 160 | 161 | def fit(self,Y,X,weights = None): 162 | ''' 163 | Fits weighted regression, updates coefficients and variance 164 | 165 | Parameters: 166 | ----------- 167 | 168 | X: numpy array of size 'n x m' 169 | Explanatory variables 170 | 171 | Y: numpy array of size 'n x 1' 172 | Vector of target variables 173 | 174 | weights: numpy array of size 'n x 1' 175 | Weights for observations 176 | 177 | ''' 178 | n,m = np.shape(X) 179 | if weights is not None: 180 | w = np.sqrt(weights) 181 | else: 182 | w = np.ones(n) 183 | weights = w 184 | X_w = (X.T*w).T 185 | Y_w = Y*w 186 | 187 | if self.theta is None: 188 | self.init_params(m) 189 | 190 | # save parameters in case log-likelihood drops ( PRECISION ISSUE IN 191 | # DEEP HIERARCHICAL MIXTURE OF EXPERTS) 192 | theta_recovery = self.theta 193 | var_recovery = self.var 194 | log_like_before = self.log_likelihood(X,Y,weights) 195 | 196 | # use cholesky decomposition for least squares 197 | if self.solver == "cholesky": 198 | part_one = np.dot(X_w.T,X_w) 199 | part_two = np.dot(X_w.T,Y_w) 200 | self.theta = cholesky_solver_least_squares(part_one, part_two) 201 | 202 | # use qr decomposition for least squares 203 | elif self.solver == "qr": 204 | Q,R = np.linalg.qr(X_w) 205 | self.theta = qr_solver(Q,R,Y_w) 206 | 207 | # lapack least squares solver 208 | elif self.solver ==
"lapack_solver": 209 | self.theta = lstsq_wrapper(Y_w,X_w) 210 | 211 | # calculate variances 212 | vec_1 = (Y_w - np.dot(X_w,self.theta)) 213 | self.var = np.dot(vec_1,vec_1)/np.sum(weights) 214 | 215 | # if likelihood dropped ( PRECISION ISSUE) use recovery parameters 216 | # used in DEEP HIERARCHICAL MIXTURE OF EXPERTS 217 | log_like_after = self.log_likelihood(X,Y,weights) 218 | delta_log_like = ( log_like_after - log_like_before)/n 219 | if delta_log_like < self.stop_learning: 220 | self.theta = theta_recovery 221 | self.var = var_recovery 222 | delta_log_like = 0 223 | 224 | # save change in parameters and likelihood 225 | delta = self.theta - theta_recovery 226 | self.delta_param_norm = np.sum(np.dot(delta.T,delta)) 227 | self.delta_log_like = delta_log_like 228 | 229 | 230 | 231 | def predict(self,X): 232 | ''' 233 | Calculates point estimator based on learned parameters 234 | 235 | Parameters: 236 | ----------- 237 | X: numpy array of size 'n x m' 238 | Explanatory variables 239 | 240 | Returns: 241 | -------- 242 | X: numpy array of size 'unknown x 1' 243 | Explanatory variables from test set 244 | 245 | ''' 246 | return np.dot(X,self.theta) 247 | 248 | 249 | def posterior_log_probs(self,X,Y): 250 | ''' 251 | Wrapper for norm_pdf (primarily used in HME) 252 | ''' 253 | log_pdf,pdf = norm_pdf_log_pdf(self.theta,Y,X,self.var) 254 | return log_pdf 255 | 256 | 257 | def log_likelihood(self,X,Y,weights = None): 258 | ''' 259 | Returns log likelihood for linear regression with noise distributed 260 | as Gaussian 261 | 262 | Parameters: 263 | ----------- 264 | X: numpy array of size 'n x m' 265 | Explanatory variables 266 | 267 | Y: numpy array of size 'n x 1' 268 | Target variable can take only values 0 or 1 269 | 270 | weights: numpy array of size 'n x 1' 271 | Weights for observations 272 | 273 | Returns: 274 | -------- 275 | weighted_log_likelihood: float 276 | Log likelihood 277 | 278 | ''' 279 | if weights is None: 280 | weights = np.ones(X.shape[0]) 281 | log_pdf, pdf = norm_pdf_log_pdf(self.theta,Y,X,self.var) 282 | log_likelihood = np.sum(weights*log_pdf) 283 | return log_likelihood 284 | 285 | 286 | def posterior_cdf(self,X,y_lo = None,y_hi = None): 287 | ''' 288 | Calculate probability of observing target variable in range [y_lo, y_hi] 289 | given explanatory variable and parameters 290 | 291 | Parameters: 292 | ----------- 293 | X: numpy array of size 'unknown x n' 294 | Explanatory variables 295 | 296 | y_lo: numpy array of size 'unknown x 1' 297 | Lower bound 298 | 299 | y_hi: numpy array of size 'unknown x 1' 300 | Upper bound 301 | 302 | Returns: 303 | -------- 304 | delta_prob: numpy array of size 'unknown x 1' 305 | Probability of observing Y in range [y_lo, y_hi] 306 | ''' 307 | # check that upper bound is higher than lower bound 308 | assert np.sum(y_hi= len(nodes): 129 | raise NodeNotFoundError(self.node_position,self.node_type,"does not have children") 130 | children_nodes.append(nodes[child_position]) 131 | return children_nodes 132 | 133 | 134 | def get_parent_and_birth_order(self,nodes): 135 | ''' 136 | Gets parent of current node and finds number of children to the left. 
137 | 138 | Parameters: 139 | ----------- 140 | 141 | nodes: list of size equal number of nodes in HME 142 | List with all nodes of HME 143 | 144 | Returns: 145 | -------- 146 | 147 | [parent,birth_order]: list 148 | First element of list is parent of node, second identifies child position 149 | ''' 150 | parent_index = (self.node_position - 1) // self.k 151 | if parent_index < 0: 152 | raise NodeNotFoundError(self.node_position,self.node_type,"does not have parent") 153 | birth_order = (self.node_position - 1) % self.k 154 | parent = nodes[parent_index] 155 | return [parent, birth_order] 156 | 157 | 158 | def has_parent(self): 159 | ''' 160 | Returns True if node has parent, False if otherwise 161 | ''' 162 | if self.node_position == 0: 163 | return False 164 | return True 165 | 166 | 167 | def get_delta_param_norm(self): 168 | ''' L2 norm of change in parameters of gate model''' 169 | return self.model.delta_param_norm 170 | 171 | 172 | def get_delta_log_like(self): 173 | ''' Returns change in likelihood on m-step''' 174 | return self.model.delta_log_like 175 | 176 | 177 | 178 | ############################################### Gate Node ################################################################ 179 | 180 | 181 | #----------------------------------------- Abstract Gater Class ---------------------------------------------------------# 182 | 183 | 184 | 185 | class AbstractGaterNode(Node): 186 | ''' 187 | Abstract gate node class 188 | ''' 189 | 190 | def __init__(self,*args,**kwargs): 191 | super(AbstractGaterNode,self).__init__(*args,**kwargs) 192 | self.responsibilities = np.zeros([self.n,self.k]) 193 | self.normaliser = np.zeros(self.n) 194 | self.node_type = "gate" 195 | 196 | 197 | def down_tree_pass(self,X,nodes): 198 | ''' 199 | Calculates responsibilities and performs weighted maximum 200 | likelihood estimation 201 | 202 | Parameters: 203 | ----------- 204 | 205 | X: numpy array of size 'n x m' 206 | Explanatory variables 207 | 208 | nodes: list of size equal number of nodes in HME 209 | List with all nodes of HME 210 | 211 | ''' 212 | # E-step of EM algorithm 213 | if self.has_parent() is True: 214 | parent,birth_order = self.get_parent_and_birth_order(nodes) 215 | self.weights = parent.responsibilities[:,birth_order] - parent.normaliser 216 | self.weights += parent.weights 217 | log_H = self.responsibilities - np.outer(self.normaliser, np.ones(self.k)) 218 | H = np.exp(log_H) 219 | 220 | # bound weights to prevent underflow in weighted regression 221 | self.bound_weights = bounded_variable(np.exp(self.weights),self.underflow_tol) 222 | 223 | # M-step of EM algorithm 224 | self._m_step_update(H,X) 225 | 226 | 227 | def up_tree_pass(self,X,nodes): 228 | ''' 229 | Calculates prior probability of latent variables and combines 230 | prior probability of children to calculate posterior for the 231 | latent variable corresponding to node 232 | 233 | Parameters: 234 | ----------- 235 | 236 | X: numpy array of size 'n x m' 237 | Explanatory variables 238 | 239 | nodes: list of size equal number of nodes in HME 240 | List with all nodes of HME 241 | 242 | ''' 243 | self._prior(X) 244 | children = self.get_childrens(nodes) 245 | 246 | # check that all children are of the same type 247 | if len(set([e.node_type for e in children])) != 1: 248 | raise ValueError("Children nodes should have the same node type") 249 | 250 | # prior probabilities calculation 251 | for i,child_node in enumerate(children): 252 | if child_node.node_type == "expert": 253 | self.responsibilities[:,i] +=
child_node.weights 254 | elif child_node.node_type == "gate": 255 | self.responsibilities[:,i] += logsumexp(child_node.responsibilities, axis = 1) 256 | else: 257 | raise TypeError("Unidentified node type") 258 | 259 | #prevent underflow 260 | self.normaliser = logsumexp(self.responsibilities, axis = 1) 261 | 262 | 263 | def propagate_prediction(self,X,nodes,predict_type = "predict_response", y_lo=None, y_hi=None): 264 | ''' 265 | Returns weighted mean of predictions in experts which are in subtree 266 | 267 | Parameters: 268 | ----------- 269 | 270 | X: numpy array of size 'unkonwn x m' 271 | Explanatory variables for test set 272 | 273 | nodes: list of size equal number of nodes in HME 274 | List with all nodes of HME 275 | 276 | predict_type: str 277 | Can be "predict_response", "predict_prob", "predict_cdf" 278 | "predict_resposne" - works for all type of experts 279 | "predict_prob" - works for classification experts ('wgda','softmax') 280 | "predict_cdf" - works only for 'gaussian' expert 281 | 282 | Returns: 283 | -------- 284 | 285 | mean_prediction: numpy array of size 'unknown x m' 286 | Weighted prediction 287 | ''' 288 | self._prior(X) 289 | children = self.get_childrens(nodes) 290 | n,m = np.shape(X) 291 | mean_prediction = None 292 | for i,child in enumerate(children): 293 | w = np.exp(self.responsibilities[:,i]) 294 | children_average = child.propagate_prediction(X,nodes,predict_type,y_lo,y_hi) 295 | if len(children_average.shape) > 1: 296 | k = children_average.shape[1] 297 | w = np.outer(w,np.ones(k)) 298 | if mean_prediction is None: 299 | mean_prediction = (w * children_average) 300 | else: 301 | mean_prediction += (w * children_average) 302 | return mean_prediction 303 | 304 | 305 | def _m_step_update(self,H,X): 306 | ''' Updates parameters running weighted softmax regression ''' 307 | self.model.fit(H,X,self.bound_weights) 308 | 309 | 310 | def _prior(self,X): 311 | '''Calculates prior probabilities for latent variables''' 312 | probs = self.model.predict_log_probs(X) 313 | self.responsibilities = probs 314 | 315 | 316 | #----------------------------------------- implementations of Gaters ---------------------------------------------# 317 | 318 | 319 | class GaterNodeSoftmax(AbstractGaterNode): 320 | ''' 321 | Gate node of Hierarchical Mixture of Experts with softmax transfer function. 322 | Calculates responsibilities and updates parmameters using weighted softmax regression. 323 | ''' 324 | 325 | def __init__(self,*args,**kwargs): 326 | ''' Initialises gate node ''' 327 | super(GaterNodeSoftmax,self).__init__(*args,**kwargs) 328 | self.model = sr.SoftmaxRegression(self.conv_threshold, self.max_iter,self.stop_learning_sr) 329 | self.model.init_params(self.m,self.k) 330 | 331 | 332 | class GaterNodeWGDA(AbstractGaterNode): 333 | ''' 334 | Gate node of Hierarchical Mixture of Experts with weighted gaussian discriminant 335 | analysis as gating model. Calculates responsibilities and updates parameters 336 | of gating model. 
337 | ''' 338 | 339 | def __init__(self,*args,**kwargs): 340 | ''' Initialises gate node ''' 341 | super(GaterNodeWGDA,self).__init__(*args,**kwargs) 342 | self.model = wgda.WeightedGaussianDiscriminantAnalysis(bias_term = self.bias, 343 | stop_learning = self.stop_learning_wgda) 344 | if self.bias is True: 345 | self.model.init_params(self.m-1,self.k) 346 | else: 347 | self.model.init_params(self.m,self.k) 348 | 349 | 350 | 351 | ################################################## Expert Nodes ########################################################## 352 | 353 | 354 | #----------------------------------------- Abstarct Expert Class ---------------------------------------------------------# 355 | 356 | 357 | 358 | class ExpertNodeAbstract(Node): 359 | ''' 360 | Abstract Base Class for experts (linear, logistic etc. regressions) 361 | ''' 362 | 363 | def down_tree_pass(self,X,Y,nodes): 364 | ''' 365 | Calculates responsibilities and performs weighted maximum likelihood 366 | estimation. 367 | 368 | Parameters: 369 | ----------- 370 | 371 | X: numpy array of size 'n x m' 372 | Explanatory variables 373 | 374 | Y: numpy array of size 'n x m' 375 | Target variables that should be approximated 376 | 377 | nodes: list of size equal number of nodes in HME 378 | List with all nodes of HME 379 | 380 | ''' 381 | # E-step of EM algorithm 382 | parent, birth_order = self.get_parent_and_birth_order(nodes) 383 | 384 | self.weights = parent.responsibilities[:,birth_order] - parent.normaliser 385 | self.weights += parent.weights 386 | 387 | # prevent underflow in weighted regressions 388 | self.bound_weights = bounded_variable(np.exp(self.weights),self.underflow_tol) 389 | 390 | # M-step of EM algorithm 391 | self._m_step_update(X,Y) 392 | 393 | 394 | def up_tree_pass(self,X,Y): 395 | ''' 396 | Calculates prior probability of latent variables corresponding to 397 | expert at node and likelihood. 
398 | 399 | Parameters: 400 | ----------- 401 | 402 | X: numpy array of size 'n x m' 403 | Explanatory variables 404 | 405 | Y: numpy array of size 'n x 1' 406 | Target variable that should be approximated 407 | 408 | ''' 409 | self._prior(X,Y) 410 | 411 | 412 | def propagate_prediction(self,X,nodes, predict_type = "predict_response",y_lo=None,y_hi=None): 413 | ''' 414 | Returns prediction of expert for test input X 415 | 416 | Parameters: 417 | ----------- 418 | 419 | X: numpy array of size 'unknown x m' 420 | Explanatory variables for test set 421 | 422 | nodes: list of size equal number of nodes in HME 423 | List with all nodes of HME 424 | 425 | predict_type: str 426 | Can be "predict_response", "predict_probs", "predict_cdf" 427 | "predict_response" - works for all types of experts 428 | "predict_probs" - works for classification experts ('wgda','softmax') 429 | "predict_cdf" - works only for 'gaussian' expert 430 | 431 | Returns: 432 | -------- 433 | : numpy array of size 'unknown x m' 434 | Weighted prediction 435 | 436 | ''' 437 | if predict_type == "predict_probs": 438 | return self.model.predict_probs(X) 439 | elif predict_type == "predict_response": 440 | return self.model.predict(X) 441 | elif predict_type == "predict_cdf": 442 | return self.model.posterior_cdf(X,y_lo,y_hi) 443 | else: 444 | raise NotImplementedError("Not implemented prediction type") 445 | 446 | 447 | def propagate_log_probs(self,X,Y): 448 | ''' Returns log-probability of observing Y given X and parameters''' 449 | return self.model.posterior_log_probs(X,Y) 450 | 451 | 452 | def _prior(self,X,Y): 453 | ''' Calculates log-probability of observing Y given X and parameters of expert ''' 454 | self.weights = self.model.posterior_log_probs(X,Y) 455 | 456 | 457 | def _m_step_update(self,X,Y): 458 | ''' Updates parameters of expert model (e.g. regression coefficients and variance estimates) ''' 459 | # parameters are updated and saved in expert 460 | self.model.fit(Y,X,self.bound_weights) 461 | 462 | 463 | 464 | #-------------------------------------- Implementation of Expert Nodes --------------------------------------------------# 465 | 466 | 467 | class ExpertNodeLinReg(ExpertNodeAbstract): 468 | ''' 469 | Expert node in Hierarchical Mixture of Experts, with expert being 470 | standard weighted linear regression.
471 | ''' 472 | 473 | def __init__(self,*args,**kwargs): 474 | ''' Initialise linear regression expert node ''' 475 | super(ExpertNodeLinReg,self).__init__(*args,**kwargs) 476 | self.model = wlr.WeightedLinearRegression(stop_learning = self.stop_learning_wlr) 477 | self.model.init_params(self.m) 478 | self.node_type = "expert" 479 | 480 | 481 | class ExpertNodeSoftmaxReg(ExpertNodeAbstract): 482 | ''' 483 | Expert Node with Softmax model as an expert 484 | ''' 485 | 486 | def __init__(self,*args, **kwargs): 487 | super(ExpertNodeSoftmaxReg,self).__init__(*args,**kwargs) 488 | self.model = sr.SoftmaxRegression( tolerance = self.conv_threshold, 489 | max_iter = self.max_iter, 490 | stop_learning = self.stop_learning_sr) 491 | self.model.init_params(self.m, self.classes) 492 | self.node_type = "expert" 493 | 494 | 495 | 496 | class ExpertNodeWGDA(ExpertNodeAbstract): 497 | ''' 498 | Expert Node with Gaussian Discriminant Analysis as an expert 499 | ''' 500 | 501 | def __init__(self,*args,**kwargs): 502 | super(ExpertNodeWGDA,self).__init__(*args,**kwargs) 503 | self.model = wgda.WeightedGaussianDiscriminantAnalysis(stop_learning = self.stop_learning_wgda, 504 | bias_term = self.bias) 505 | if self.bias is True: 506 | self.model.init_params(self.m-1,self.classes) 507 | else: 508 | self.model.init_params(self.m,self.classes) 509 | self.node_type ="expert" 510 | 511 | 512 | 513 | --------------------------------------------------------------------------------
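Below is a minimal usage sketch of the WeightedLinearRegression expert on its own (outside the HME tree), using train_test_split from helpers.py. The synthetic data, the explicit bias column and the uniform weights are illustrative assumptions, not part of the repository:

# usage sketch (illustrative only): weighted linear regression with uniform weights
import numpy as np
from helpers import train_test_split
from weighted_lin_reg import WeightedLinearRegression

n = 500
x = np.random.randn(n)
X = np.column_stack([x, np.ones(n)])            # last column acts as a bias term
y = 2.0 * x + 1.0 + 0.1 * np.random.randn(n)    # hypothetical linear relationship

X_train, X_test, y_train, y_test = train_test_split(X, y, test_p=0.25)

# uniform weights reduce weighted least squares to ordinary least squares
wlr = WeightedLinearRegression(solver="qr")
wlr.init_params(X_train.shape[1])
wlr.fit(y_train, X_train, weights=np.ones(X_train.shape[0]))

y_hat = wlr.predict(X_test)
print("test MSE:", np.mean((y_test - y_hat) ** 2))

Within the HME, the same fit method is called by ExpertNodeLinReg during the M-step, with the bounded responsibilities of the node passed as the weights argument.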