├── README.md └── Hierarchical Mixture of Experts ├── helpers.py ├── label_binariser.py ├── weighted_gda.py ├── weighted_lin_reg.py ├── softmax_reg.py ├── general_hme.py └── nodes_hme.py /README.md: -------------------------------------------------------------------------------- 1 | # Mixture-of-Experts-Models 2 | 3 | Future work: 4 | - Add Mixture Density Neural Network using tensorflow 5 | - Rewrite HME with tensorflow 6 | 7 | 8 | ## Hierarchical Mixture of Experts 9 | 10 | Hierarchical mixture of experts can be used to solve standard [regression](https://github.com/AmazaspShumik/Mixture-of-Experts-Models/blob/master/Hierarchical%20Mixture%20of%20Experts/hme_standard_regression_examples.ipynb) and [classification](https://github.com/AmazaspShumik/Mixture-of-Experts-Models/blob/master/Hierarchical%20Mixture%20of%20Experts/hme_classification_examples.ipynb) problems; however, one of the main applications of HME is problems with [multimodal output](https://github.com/AmazaspShumik/Mixture-of-Experts-Models/blob/master/Hierarchical%20Mixture%20of%20Experts/hme_multimodal_output_examples.ipynb). 11 | 12 | 13 | 14 | 15 | 16 | 17 | [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/AmazaspShumik/mixture-models/trend.png)](https://bitdeli.com/free "Bitdeli Badge") 18 | 19 | -------------------------------------------------------------------------------- /Hierarchical Mixture of Experts/helpers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import random 5 | 6 | 7 | 8 | def train_test_split(x,y,test_p = 0.25): 9 | ''' 10 | Divides data set into train and test data sets 11 | 12 | Parameters: 13 | ---------- 14 | 15 | x: numpy array of size 'n x k' (k can be 1 and above) 16 | Exogenous variables 17 | 18 | y: numpy array of size 'n x m' (m can be 1 and above) 19 | Endogenous variables 20 | 21 | test_p: float 22 | Proportion of data that should go to testing set 23 | 24 | Returns: 25 | -------- 26 | 27 | [x_train,x_test,y_train,y_test]: list of size 4 28 | Training and testing parts of x, followed by training and testing parts of y 29 | 30 | ''' 31 | n = x.shape[0] 32 | sample_index = random.sample(range(n), int(n*test_p)) 33 | train_index = [e for e in range(n) if e not in set(sample_index)] 34 | 35 | if len(x.shape) > 1: 36 | x_test = x[sample_index,:] 37 | x_train = x[train_index,:] 38 | 39 | if len(y.shape) > 1: 40 | y_test = y[sample_index,:] 41 | y_train = y[train_index,:] 42 | 43 | if len(x.shape) == 1: 44 | x_test = x[sample_index] 45 | x_train = x[train_index] 46 | 47 | if len(y.shape) == 1: 48 | y_test = y[sample_index] 49 | y_train = y[train_index] 50 | 51 | return [x_train,x_test,y_train,y_test] 52 | 53 | 54 | 55 | 56 | def bounded_variable(x,lo,hi=None): 57 | ''' 58 | Bounds variable from below and above, prevents underflow and overflow 59 | 60 | Parameters: 61 | ----------- 62 | 63 | x: numpy array of size 'n x k' (k can be 1) 64 | input vector 65 | 66 | lo: float 67 | Lower bound 68 | 69 | hi: float, optional 70 | Upper bound 71 | 72 | Returns: 73 | -------- 74 | : numpy array of size 'n x k' 75 | 76 | ''' 77 | def _bounded_vector(z,lo,hi): 78 | if hi is not None: 79 | z[ z > hi] = hi 80 | z[ z < lo] = lo 81 | return z 82 | if len(np.shape(x)) > 1: 83 | for i in range(np.shape(x)[1]): 84 | 85 | x[:,i] = _bounded_vector(x[:,i],lo,hi) 86 | return x 87 | return _bounded_vector(x,lo,hi) 88 | 89 | 90 | 91 | class NodeNotFoundError(LookupError): 92 | ''' 93 | Error raised in case node is not found 94 | ''' 95 | 96 | def __init__(self,n_pos,n_type, message):
97 | m = "Node with index {0} of type {1} {2}" 98 | self.message = m.format(n_pos,n_type,message) 99 | 100 | def __str__(self): 101 | return self.message 102 | 103 | class NodeModelNotImplemented(NotImplementedError): 104 | ''' 105 | Error raised in case model is not implemented for node 106 | ''' 107 | 108 | def __init__(self,model_name,n_type): 109 | m = "Model {0} is not implemented for node type {1}" 110 | self.message = m.format(model_name, n_type) 111 | 112 | def __str__(self): 113 | return self.message 114 | -------------------------------------------------------------------------------- /Hierarchical Mixture of Experts/label_binariser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import numpy as np 5 | from scipy.sparse import csr_matrix 6 | 7 | class ClassificationTargetError(Exception): 8 | ''' 9 | Exception raised in case of mismatch between number of expected and 10 | observed classes in target vector (for classification problem) 11 | ''' 12 | 13 | def __init__(self, expected, observed): 14 | self.e = expected 15 | self.o = observed 16 | 17 | def __str__(self): 18 | s = 'Mismatch in number of classes, expected - {0} , observed - {1}'.format(self.e,self.o) 19 | return s 20 | 21 | 22 | 23 | class LabelBinariser(object): 24 | 25 | ''' 26 | Binarize labels in a one-vs-all fashion. 27 | 28 | Allows easy transformation of a vector of classification targets into a ground 29 | truth matrix and back (inverse transformation). 30 | 31 | n = n_samples , k = n_classes 32 | 33 | Parameters: 34 | ------------ 35 | 36 | Y: numpy array of size 'n_samples x 1' 37 | Target variables, vector of classes in classification problem 38 | 39 | k: int 40 | Number of classes 41 | 42 | ''' 43 | 44 | def __init__(self,Y,k): 45 | 46 | self.Y = Y 47 | self.n = np.shape(Y)[0] 48 | self.k = k 49 | # mapping between set of integers to set of classes 50 | classes = set(Y) 51 | if len(classes) != k: 52 | raise ClassificationTargetError(k,len(classes)) 53 | self.direct_mapping = {} 54 | self.inverse_mapping = {} 55 | for i,el in enumerate(sorted(list(classes))): 56 | self.direct_mapping[el] = i 57 | self.inverse_mapping[i] = el 58 | 59 | 60 | def convert_vec_to_binary_matrix(self,Y_raw = None, compress = False): 61 | ''' 62 | Converts vector to ground truth matrix 63 | 64 | Parameters: 65 | ------------ 66 | Y_raw: numpy array of size 'n x 1', optional (uses Y given at initialisation if None) 67 | compress: bool 68 | If True will use csr_matrix to output compressed matrix 69 | 70 | Returns: 71 | -------- 72 | 73 | Y: numpy array of size 'n x k' 74 | Ground truth matrix , column number represents class index, 75 | each row has all zeros and only one 1. 76 | 77 | ''' 78 | if Y_raw is None: 79 | Y_raw = self.Y 80 | Y = np.zeros([np.shape(Y_raw)[0],self.k]) 81 | for el,idx in self.direct_mapping.items(): 82 | Y[Y_raw==el,idx] = 1 83 | if compress is True: 84 | return csr_matrix(Y) 85 | return Y 86 | 87 | 88 | def logistic_reg_direct_mapping(self, Y_raw = None): 89 | ''' 90 | Converts vector with two possible classes to vector of zeros and ones. 91 | 92 | Returns: 93 | -------- 94 | 95 | Y: numpy array of size 'n x 1' 96 | Vector of zeros and ones. (Mainly intended for logistic regression)
97 | 98 | ''' 99 | Y = np.zeros(self.n) 100 | el_one = self.inverse_mapping[1] 101 | if Y_raw is None: 102 | Y_raw = self.Y 103 | Y[Y_raw == el_one] = 1 104 | return Y 105 | 106 | 107 | def logistic_reg_inverse_mapping(self,Y): 108 | ''' 109 | Converts probabilities to original format 110 | 111 | Parameters: 112 | ----------- 113 | Y: numpy array of size [n_samples,1] 114 | Vector of zeros and ones (for example output of logistic regression) 115 | 116 | Returns: 117 | -------- 118 | 119 | Y: numpy array of size 'n x 1' 120 | Target estimates in original format. 121 | 122 | ''' 123 | Y[Y > 0.5] = 1 124 | Y[Y <= 0.5] = 0 125 | Y_out = np.zeros(self.n, dtype = self.Y.dtype) 126 | Y_out[Y==1] = self.inverse_mapping[1] 127 | Y_out[Y==0] = self.inverse_mapping[0] 128 | return Y_out 129 | 130 | 131 | def convert_binary_matrix_to_vec(self,B, compressed = False): 132 | ''' 133 | Converts ground truth matrix to vector of classification targets 134 | 135 | Parameters: 136 | ----------- 137 | compressed: bool 138 | If True input is csr_matrix, otherwise B is numpy array 139 | 140 | Returns: 141 | --------- 142 | 143 | Y: numpy array of size 'n x 1' 144 | Vector of targets, classes 145 | ''' 146 | if compressed is True: 147 | B = B.dot(np.eye(np.shape(B)[1])) 148 | Y = np.zeros(self.n, dtype = self.Y.dtype) 149 | for i in range(np.shape(B)[1]): 150 | Y[B[:,i]==1] = self.inverse_mapping[i] 151 | return Y 152 | 153 | 154 | def convert_prob_matrix_to_vec(self,Y): 155 | ''' 156 | Converts matrix of probabilities to vector of classification targets 157 | 158 | Parameters: 159 | ----------- 160 | Y: numpy array of size [n_samples,n_classes] 161 | Matrix of class probabilities, element at cell [i,j] shows probability 162 | that observation i belongs to class j 163 | 164 | Returns: 165 | -------- 166 | 167 | Y: numpy array of size 'n x 1' 168 | Vector of predicted classes in original format (for each observation, 169 | the class with highest probability). 170 | 171 | ''' 172 | Y_max = np.argmax(Y, axis = 1) 173 | Y = np.array([self.inverse_mapping[e] for e in Y_max]) 174 | return Y 175 | 176 | -------------------------------------------------------------------------------- /Hierarchical Mixture of Experts/weighted_gda.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | from scipy.stats import multivariate_normal as mvn 5 | from scipy.special import logsumexp 6 | 7 | class WeightedGaussianDiscriminantAnalysis(object): 8 | ''' 9 | Weighted Gaussian Discriminant Analysis 10 | 11 | A classifier with a linear decision boundary, obtained by fitting class 12 | conditional densities with Gaussians (sharing one pooled covariance matrix) 13 | and using Bayes rule to obtain the posterior distribution.
14 | ''' 15 | 16 | def __init__(self,stop_learning = 1e-3, bias_term = True): 17 | self.stop_learning = stop_learning 18 | self.bias_term = bias_term 19 | self.delta_param_norm = 0 20 | self.delta_log_like = 0 21 | self.means = None 22 | 23 | 24 | def init_params(self,m,k): 25 | ''' 26 | Initialises parameters 27 | 28 | Parameters: 29 | ----------- 30 | 31 | m: int 32 | Dimensionality of data 33 | 34 | k: int 35 | Number of classes 36 | 37 | ''' 38 | self.k = k 39 | self.cov = np.eye(m) 40 | self.means = np.random.random([m,k]) 41 | self.log_priors = -1*np.log(np.ones(k)*k) 42 | 43 | 44 | def _bias_term_pre_processing_X(self,X,bias_term): 45 | ''' 46 | Preprocesses X and adjusts for bias term 47 | 48 | Returns: 49 | -------- 50 | X: numpy array of size 'n x (m-1)' 51 | Design matrix without column of bias_term, which is expected to be 52 | last column 53 | ''' 54 | if bias_term is None: 55 | bias_term = self.bias_term 56 | if bias_term is True: 57 | return X[:,:-1] 58 | return X 59 | 60 | 61 | def fit(self,Y,X,weights = None, bias_term = None): 62 | ''' 63 | Finds parameters of weighted Gaussian discriminant analysis that maximise 64 | the weighted log-likelihood. 65 | 66 | Parameters: 67 | ----------- 68 | 69 | X: numpy array of size 'n x m' 70 | Explanatory variables 71 | 72 | Y: numpy array of size 'n x k' 73 | Ground truth matrix of one-hot encoded class labels 74 | 75 | weights: numpy array of size 'n x 1' 76 | Weighting for each observation 77 | 78 | bias_term: bool 79 | If True, matrix of explanatory variables already contains bias term, 80 | which should be discarded in estimation (expected that bias term is in last 81 | column of X matrix) 82 | 83 | ''' 84 | 85 | # preprocess X if it contains bias term 86 | X = self._bias_term_pre_processing_X(X,bias_term) 87 | 88 | n,m = np.shape(X) 89 | k = self.k 90 | 91 | if weights is None: 92 | weights = np.ones(n) 93 | weights_total = np.sum(weights) 94 | 95 | # Interestingly loop was faster than using outer product 96 | Y_w = (Y.T*weights).T 97 | 98 | # recovery in case of decrease in log-likelihood (NUMERICAL UNDERFLOW ISSUE IN DEEP 99 | # HIERARCHICAL MIXTURE OF EXPERTS) 100 | mean_recovery = self.means 101 | cov_recovery = self.cov 102 | prior_recovery = self.log_priors 103 | log_like_before = self.log_likelihood(X,Y_w,weights, weighted_Y = True, bias_term = False) 104 | 105 | # calculate log priors 106 | weighted_norm = np.sum(Y_w, axis = 0) 107 | self.log_priors = np.log(weighted_norm) - np.log(weights_total) 108 | 109 | # calculate weighted means of Gaussians for each class 110 | weighted_sum = np.dot(X.T*weights,Y) 111 | self.means = weighted_sum / weighted_norm 112 | 113 | # calculate pooled covariance matrix 114 | self.cov = np.zeros([m,m]) 115 | cov = np.zeros([m,m]) 116 | M = np.zeros([m,n]) 117 | for i in range(k): 118 | np.outer(self.means[:,i],np.ones(n), out = M) 119 | X_cent = (X - M.T) 120 | np.dot(X_cent.T*Y_w[:,i],X_cent, out = cov) 121 | self.cov += cov 122 | self.cov /= weights_total 123 | 124 | # check that log-likelihood did not drop (UNDERFLOW IN DEEP HMEs) 125 | # or increased by very little (prevents overfitting and long iteration 126 | # cycles) 127 | log_like_after = self.log_likelihood(X,Y_w,weights,bias_term = False, weighted_Y = True) 128 | delta_log_like = (log_like_after - log_like_before )/n 129 | if delta_log_like < self.stop_learning: 130 | self.means = mean_recovery 131 | self.cov = cov_recovery 132 | self.log_priors = prior_recovery 133 | delta_log_like = 0 134 | 135 | # saves changes in likelihood and
parameters in instance variables 136 | delta = self.means - mean_recovery 137 | self.delta_param_norm = np.sum(np.dot(delta.T,delta)) 138 | self.delta_log_like = delta_log_like 139 | 140 | 141 | def predict_probs(self,X, bias_term = None): 142 | ''' 143 | Calculates posterior probability of x belonging to any particular class 144 | 145 | Parameters: 146 | ----------- 147 | 148 | X: numpy array of size 'unknown x m' 149 | Expalanatory variables 150 | 151 | bias_term: bool 152 | If True , explanatory variables matrix contains bias_term (bias term should be 153 | in last column of design matrix) 154 | 155 | Returns: 156 | -------- 157 | 158 | prior_prob: numpy array of size 'unknown x k' 159 | Posterior probability that class belongs to particular probability 160 | 161 | ''' 162 | prior_prob = np.exp(self.predict_log_probs(X,bias_term)) 163 | return prior_prob 164 | 165 | 166 | def predict_log_probs(self,X,bias_term = None): 167 | ''' 168 | Calculates log of probabilities 169 | 170 | Parameters: 171 | ----------- 172 | 173 | X: numpy array of size 'unknown x m' 174 | Expalanatory variables 175 | 176 | bias_term: bool 177 | If True , explanatory variables matrix contains bias_term (bias term should be 178 | in last column of design matrix) 179 | 180 | Returns: 181 | -------- 182 | 183 | prior_prob: numpy array of size 'unknown x k' 184 | Posterior probability that class belongs to particular probability 185 | 186 | ''' 187 | X = self._bias_term_pre_processing_X(X,bias_term) 188 | n,m = np.shape(X) 189 | log_posterior = np.zeros([n,self.k]) 190 | for i in range(self.k): 191 | log_posterior[:,i] = mvn.logpdf(X,self.means[:,i], cov = self.cov) 192 | log_posterior[:,i] += self.log_priors[i] 193 | normaliser = logsumexp(log_posterior, axis = 1) 194 | posterior_log_prob = (log_posterior.T - normaliser).T 195 | return posterior_log_prob 196 | 197 | 198 | def log_likelihood(self, X, Y, weights = None, bias_term = None, weighted_Y = False): 199 | ''' 200 | Calculates log likelihood for weighted gaussian discriminant analysis 201 | 202 | Parameters: 203 | ----------- 204 | 205 | X: numpy array of size 'n x m' 206 | Explanatory variables 207 | 208 | Y: numpy array of size 'n x 1' 209 | Target variable can take only values 0 or 1 210 | 211 | weights: numpy array of size 'n x 1' 212 | Weights for observations 213 | 214 | k: int 215 | Number of classes 216 | 217 | bias_term: bool 218 | If True excludes bias term (which is expected to be in last column of X) 219 | 220 | weighted_Y: 221 | If True Y is already weighted (optimisation so that not recalculate Y*w) 222 | 223 | Returns: 224 | -------- 225 | 226 | log_like: float 227 | Log likelihood 228 | ''' 229 | X = self._bias_term_pre_processing_X(X,bias_term) 230 | n,m = np.shape(X) 231 | 232 | # default weights 233 | if weights is None: 234 | weights = np.ones(n) 235 | 236 | # log-likelihood 237 | log_posterior = np.zeros([n,self.k]) 238 | for i in range(self.k): 239 | log_posterior[:,i] = mvn.logpdf(X,self.means[:,i], cov = self.cov) 240 | log_posterior[:,i] += self.log_priors[i] 241 | if weighted_Y is False: 242 | Y = (Y.T*weights).T 243 | log_like = np.sum(Y*log_posterior) 244 | return log_like 245 | 246 | 247 | def posterior_log_probs(self,X,Y,bias_term = None): 248 | ''' 249 | Probability of observing Y given X and parameters 250 | ''' 251 | X = self._bias_term_pre_processing_X(X,bias_term) 252 | log_P = np.sum(Y*self.predict_log_probs(X,bias_term = False), axis = 1) 253 | return log_P 254 | 
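For reference, below is a minimal usage sketch of WeightedGaussianDiscriminantAnalysis together with LabelBinariser from label_binariser.py. The synthetic data, the module import paths and the variable names are illustrative assumptions, not part of the repository:

# usage sketch (illustrative only): weighted GDA on synthetic two-class data
import numpy as np
from label_binariser import LabelBinariser
from weighted_gda import WeightedGaussianDiscriminantAnalysis

n, m, k = 200, 2, 2
# two hypothetical well-separated Gaussian clusters
X = np.vstack([np.random.randn(n // 2, m) + 2.0, np.random.randn(n // 2, m) - 2.0])
y = np.array(['a'] * (n // 2) + ['b'] * (n // 2))

# one-hot ground truth matrix expected by the weighted GDA fit
lb = LabelBinariser(y, k)
Y = lb.convert_vec_to_binary_matrix()

# bias_term=False because X carries no bias column here
gda = WeightedGaussianDiscriminantAnalysis(bias_term=False)
gda.init_params(m, k)
gda.fit(Y, X, weights=np.ones(n), bias_term=False)

probs = gda.predict_probs(X, bias_term=False)    # posterior class probabilities
labels = lb.convert_prob_matrix_to_vec(probs)    # back to original class labels

Uniform weights are passed explicitly here; inside the HME the gate nodes supply the per-observation responsibilities instead.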
-------------------------------------------------------------------------------- /Hierarchical Mixture of Experts/weighted_lin_reg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Weighted Linear Regression , Expert in HME model 4 | 5 | m - dimensionality of input (i.e. length of row in matrix X) 6 | n - number of observations 7 | """ 8 | 9 | import numpy as np 10 | from scipy.stats import norm 11 | from scipy.linalg import solve_triangular 12 | 13 | #------------------------------------ Least Squares Solvers-------------------------------# 14 | 15 | def cholesky_solver_least_squares(part_one, part_two): 16 | ''' 17 | Solves least squares problem using cholesky decomposition 18 | 19 | Parameters: 20 | ----------- 21 | 22 | part_one: numpy array of size 'm x m', 23 | Equals X.T * X 24 | part_two: numpy array of size 'm x 1' 25 | Equals X.T * Y 26 | 27 | Returns: 28 | -------- 29 | Theta: numpy array of size 'm x 1' 30 | Vector of coefficients 31 | 32 | ''' 33 | # R*R.T*Theta = part_two 34 | R = np.linalg.cholesky(part_one) 35 | # R*Z = part_two 36 | Z = solve_triangular(R,part_two, check_finite = False, lower = True) 37 | # R.T*Theta = Z 38 | Theta = solve_triangular(R.T,Z, check_finite = False, lower = False) 39 | return Theta 40 | 41 | 42 | def qr_solver(Q,R,Y): 43 | ''' 44 | Solves least squares problem using qr decomposition. 45 | 46 | Parameters: 47 | ----------- 48 | 49 | Q: numpy array of size 'n x m' 50 | Matrix Q in QR decomposition (Matrix of orthonormal vectors) 51 | 52 | R: numpy array of size 'm x m' 53 | Matrix R in QR decomposition (Matrix of projection coefficients on 54 | orthonormal vectors) 55 | 56 | Y: numpy array of size ' n x 1' 57 | Vector of dependent variables 58 | 59 | Returns: 60 | ------- 61 | Theta: numpy array of size 'm x 1' 62 | Vector of parameters 63 | ''' 64 | qy = np.dot(Q.T,Y) 65 | Theta = solve_triangular(R,qy, check_finite = False, lower = False) 66 | return Theta 67 | 68 | 69 | def lstsq_wrapper(y,X): 70 | ''' 71 | Uses C++ Linear Algebra Package to calculate coefficients and residuals 72 | of regression. Is much faster than other methods, since it calls C++ functions. 
73 | 74 | Parameters: 75 | ----------- 76 | 77 | Y: numpy array of size 'n x 1' 78 | Vector of dependent variables 79 | 80 | X: numpy array of size 'n x m' 81 | Explanatory variables 82 | ''' 83 | theta,r,rank,s = np.linalg.lstsq(X,y) 84 | return theta 85 | 86 | 87 | #---------------------------------- 88 | 89 | def norm_pdf_log_pdf(theta,y,x,sigma_2): 90 | ''' 91 | Calculates probability of observing Y given Theta and sigma and explanatory 92 | variables 93 | 94 | Parameters: 95 | ---------- 96 | 97 | theta: numpy array of size 'm x 1', 98 | Vector of parameters 99 | y: numpy array of size 'n x 1' 100 | Vector of dependent variables 101 | x: numpy array of size 'n x m' 102 | Matrix of inputs 103 | sigma_2: float 104 | Variance of Gaussian noise 105 | 106 | Returns: 107 | ------- 108 | [log_pdf,prob]: list of two numpy arrays of size 'n x 1' 109 | Log-probability and probability of observing y given theta and X 110 | 111 | ''' 112 | u = y - np.dot(x,theta) 113 | log_normaliser = -1* np.log(np.sqrt(2*np.pi*sigma_2)) 114 | log_main = -u*u/(2*sigma_2) 115 | log_pdf = log_normaliser + log_main 116 | prob = np.exp(log_pdf) 117 | return [log_pdf,prob] 118 | 119 | 120 | 121 | #------------------------------------- Weighted Linear Regression-------------------------# 122 | 123 | class WeightedLinearRegression(object): 124 | ''' 125 | Weighted Linear Regression 126 | 127 | Parameters: 128 | ----------- 129 | 130 | solver: string (default = "lapack_solver") 131 | Numerical method to find weighted linear regression solution 132 | ("cholesky", "qr" or "lapack_solver") 133 | 134 | stop_learning: float (default = 1e-3) 135 | Threshold on likelihood improvement below which parameters are not updated 136 | ''' 137 | 138 | def __init__(self, solver = "lapack_solver", stop_learning = 1e-3): 139 | self.solver = solver 140 | self.theta = None 141 | self.var = 0 142 | self.stop_learning = stop_learning 143 | self.delta_param_norm = 0 144 | self.delta_log_like = 0 145 | 146 | 147 | def init_params(self,m): 148 | ''' 149 | Initialises weights and preallocates memory 150 | 151 | Parameters: 152 | ---------- 153 | m: int 154 | Number of parameters, should equal to dimensionality of data 155 | 156 | ''' 157 | self.theta = np.random.normal(0,1,m) 158 | self.var = 1 159 | 160 | 161 | def fit(self,Y,X,weights = None): 162 | ''' 163 | Fits weighted regression, updates coefficients and variance 164 | 165 | Parameters: 166 | ----------- 167 | 168 | X: numpy array of size 'n x m' 169 | Explanatory variables 170 | 171 | Y: numpy array of size 'n x 1' 172 | Vector of target variables 173 | 174 | weights: numpy array of size 'n x 1' 175 | Weights for observations 176 | 177 | ''' 178 | n,m = np.shape(X) 179 | if weights is not None: 180 | w = np.sqrt(weights) 181 | else: 182 | w = np.ones(n) 183 | weights = w 184 | X_w = (X.T*w).T 185 | Y_w = Y*w 186 | 187 | if self.theta is None: 188 | self.init_params(m) 189 | 190 | # save parameters in case log-likelihood drops ( PRECISION ISSUE IN 191 | # DEEP HIERARCHICAL MIXTURE OF EXPERTS) 192 | theta_recovery = self.theta 193 | var_recovery = self.var 194 | log_like_before = self.log_likelihood(X,Y,weights) 195 | 196 | # use cholesky decomposition for least squares 197 | if self.solver == "cholesky": 198 | part_one = np.dot(X_w.T,X_w) 199 | part_two = np.dot(X_w.T,Y_w) 200 | self.theta = cholesky_solver_least_squares(part_one, part_two) 201 | 202 | # use qr decomposition for least squares 203 | elif self.solver == "qr": 204 | Q,R = np.linalg.qr(X_w) 205 | self.theta = qr_solver(Q,R,Y_w) 206 | 207 | # lapack least squares solver 208 | elif self.solver ==
"lapack_solver": 209 | self.theta = lstsq_wrapper(Y_w,X_w) 210 | 211 | # calculate variances 212 | vec_1 = (Y_w - np.dot(X_w,self.theta)) 213 | self.var = np.dot(vec_1,vec_1)/np.sum(weights) 214 | 215 | # if likelihood dropped ( PRECISION ISSUE) use recovery parameters 216 | # used in DEEP HIERARCHICAL MIXTURE OF EXPERTS 217 | log_like_after = self.log_likelihood(X,Y,weights) 218 | delta_log_like = ( log_like_after - log_like_before)/n 219 | if delta_log_like < self.stop_learning: 220 | self.theta = theta_recovery 221 | self.var = var_recovery 222 | delta_log_like = 0 223 | 224 | # save change in parameters and likelihood 225 | delta = self.theta - theta_recovery 226 | self.delta_param_norm = np.sum(np.dot(delta.T,delta)) 227 | self.delta_log_like = delta_log_like 228 | 229 | 230 | 231 | def predict(self,X): 232 | ''' 233 | Calculates point estimator based on learned parameters 234 | 235 | Parameters: 236 | ----------- 237 | X: numpy array of size 'n x m' 238 | Explanatory variables 239 | 240 | Returns: 241 | -------- 242 | X: numpy array of size 'unknown x 1' 243 | Explanatory variables from test set 244 | 245 | ''' 246 | return np.dot(X,self.theta) 247 | 248 | 249 | def posterior_log_probs(self,X,Y): 250 | ''' 251 | Wrapper for norm_pdf (primarily used in HME) 252 | ''' 253 | log_pdf,pdf = norm_pdf_log_pdf(self.theta,Y,X,self.var) 254 | return log_pdf 255 | 256 | 257 | def log_likelihood(self,X,Y,weights = None): 258 | ''' 259 | Returns log likelihood for linear regression with noise distributed 260 | as Gaussian 261 | 262 | Parameters: 263 | ----------- 264 | X: numpy array of size 'n x m' 265 | Explanatory variables 266 | 267 | Y: numpy array of size 'n x 1' 268 | Target variable can take only values 0 or 1 269 | 270 | weights: numpy array of size 'n x 1' 271 | Weights for observations 272 | 273 | Returns: 274 | -------- 275 | weighted_log_likelihood: float 276 | Log likelihood 277 | 278 | ''' 279 | if weights is None: 280 | weights = np.ones(X.shape[0]) 281 | log_pdf, pdf = norm_pdf_log_pdf(self.theta,Y,X,self.var) 282 | log_likelihood = np.sum(weights*log_pdf) 283 | return log_likelihood 284 | 285 | 286 | def posterior_cdf(self,X,y_lo = None,y_hi = None): 287 | ''' 288 | Calculate probability of observing target variable in range [y_lo, y_hi] 289 | given explanatory variable and parameters 290 | 291 | Parameters: 292 | ----------- 293 | X: numpy array of size 'unknown x n' 294 | Explanatory variables 295 | 296 | y_lo: numpy array of size 'unknown x 1' 297 | Lower bound 298 | 299 | y_hi: numpy array of size 'unknown x 1' 300 | Upper bound 301 | 302 | Returns: 303 | -------- 304 | delta_prob: numpy array of size 'unknown x 1' 305 | Probability of observing Y in range [y_lo, y_hi] 306 | ''' 307 | # check that upper bound is higher than lower bound 308 | assert np.sum(y_hi= len(nodes): 129 | raise NodeNotFoundError(self.node_position,self.node_type,"does not have children") 130 | children_nodes.append(nodes[child_position]) 131 | return children_nodes 132 | 133 | 134 | def get_parent_and_birth_order(self,nodes): 135 | ''' 136 | Gets parent of current node and finds number of children to the left. 
137 | 138 | Parameters: 139 | ----------- 140 | 141 | nodes: list of size equal number of nodes in HME 142 | List with all nodes of HME 143 | 144 | Returns: 145 | -------- 146 | 147 | [parent,birth_order]: list 148 | First element of list is parent of node, second identifies child position 149 | ''' 150 | parent_index = (self.node_position - 1) // self.k 151 | if parent_index < 0: 152 | raise NodeNotFoundError(self.node_position,self.node_type,"does not have parent") 153 | birth_order = (self.node_position - 1) % self.k 154 | parent = nodes[parent_index] 155 | return [parent, birth_order] 156 | 157 | 158 | def has_parent(self): 159 | ''' 160 | Returns True if node has parent, False if otherwise 161 | ''' 162 | if self.node_position == 0: 163 | return False 164 | return True 165 | 166 | 167 | def get_delta_param_norm(self): 168 | ''' L2 norm of change in parameters of gate model''' 169 | return self.model.delta_param_norm 170 | 171 | 172 | def get_delta_log_like(self): 173 | ''' Returns change in likelihood on m-step''' 174 | return self.model.delta_log_like 175 | 176 | 177 | 178 | ############################################### Gate Node ################################################################ 179 | 180 | 181 | #----------------------------------------- Abstract Gater Class ---------------------------------------------------------# 182 | 183 | 184 | 185 | class AbstractGaterNode(Node): 186 | ''' 187 | Abstract gate node class 188 | ''' 189 | 190 | def __init__(self,*args,**kwargs): 191 | super(AbstractGaterNode,self).__init__(*args,**kwargs) 192 | self.responsibilities = np.zeros([self.n,self.k]) 193 | self.normaliser = np.zeros(self.n) 194 | self.node_type = "gate" 195 | 196 | 197 | def down_tree_pass(self,X,nodes): 198 | ''' 199 | Calculates responsibilities and performs weighted maximum 200 | likelihood estimation 201 | 202 | Parameters: 203 | ----------- 204 | 205 | X: numpy array of size 'n x m' 206 | Explanatory variables 207 | 208 | nodes: list of size equal number of nodes in HME 209 | List with all nodes of HME 210 | 211 | ''' 212 | # E-step of EM algorithm 213 | if self.has_parent() is True: 214 | parent,birth_order = self.get_parent_and_birth_order(nodes) 215 | self.weights = parent.responsibilities[:,birth_order] - parent.normaliser 216 | self.weights += parent.weights 217 | log_H = self.responsibilities - np.outer(self.normaliser, np.ones(self.k)) 218 | H = np.exp(log_H) 219 | 220 | # bound weights to prevent underflow in weighted regression 221 | self.bound_weights = bounded_variable(np.exp(self.weights),self.underflow_tol) 222 | 223 | # M-step of EM algorithm 224 | self._m_step_update(H,X) 225 | 226 | 227 | def up_tree_pass(self,X,nodes): 228 | ''' 229 | Calculates prior probability of latent variables and combines 230 | prior probability of children to calculate posterior for the 231 | latent variable corresponding to node 232 | 233 | Parameters: 234 | ----------- 235 | 236 | X: numpy array of size 'n x m' 237 | Explanatory variables 238 | 239 | nodes: list of size equal number of nodes in HME 240 | List with all nodes of HME 241 | 242 | ''' 243 | self._prior(X) 244 | children = self.get_childrens(nodes) 245 | 246 | # check that all children are of the same type 247 | if len(set([e.node_type for e in children])) != 1: 248 | raise ValueError("Children nodes should have the same node type") 249 | 250 | # prior probabilities calculation 251 | for i,child_node in enumerate(children): 252 | if child_node.node_type == "expert": 253 | self.responsibilities[:,i] +=
child_node.weights 254 | elif child_node.node_type == "gate": 255 | self.responsibilities[:,i] += logsumexp(child_node.responsibilities, axis = 1) 256 | else: 257 | raise TypeError("Unidentified node type") 258 | 259 | #prevent underflow 260 | self.normaliser = logsumexp(self.responsibilities, axis = 1) 261 | 262 | 263 | def propagate_prediction(self,X,nodes,predict_type = "predict_response", y_lo=None, y_hi=None): 264 | ''' 265 | Returns weighted mean of predictions in experts which are in subtree 266 | 267 | Parameters: 268 | ----------- 269 | 270 | X: numpy array of size 'unkonwn x m' 271 | Explanatory variables for test set 272 | 273 | nodes: list of size equal number of nodes in HME 274 | List with all nodes of HME 275 | 276 | predict_type: str 277 | Can be "predict_response", "predict_prob", "predict_cdf" 278 | "predict_resposne" - works for all type of experts 279 | "predict_prob" - works for classification experts ('wgda','softmax') 280 | "predict_cdf" - works only for 'gaussian' expert 281 | 282 | Returns: 283 | -------- 284 | 285 | mean_prediction: numpy array of size 'unknown x m' 286 | Weighted prediction 287 | ''' 288 | self._prior(X) 289 | children = self.get_childrens(nodes) 290 | n,m = np.shape(X) 291 | mean_prediction = None 292 | for i,child in enumerate(children): 293 | w = np.exp(self.responsibilities[:,i]) 294 | children_average = child.propagate_prediction(X,nodes,predict_type,y_lo,y_hi) 295 | if len(children_average.shape) > 1: 296 | k = children_average.shape[1] 297 | w = np.outer(w,np.ones(k)) 298 | if mean_prediction is None: 299 | mean_prediction = (w * children_average) 300 | else: 301 | mean_prediction += (w * children_average) 302 | return mean_prediction 303 | 304 | 305 | def _m_step_update(self,H,X): 306 | ''' Updates parameters running weighted softmax regression ''' 307 | self.model.fit(H,X,self.bound_weights) 308 | 309 | 310 | def _prior(self,X): 311 | '''Calculates prior probabilities for latent variables''' 312 | probs = self.model.predict_log_probs(X) 313 | self.responsibilities = probs 314 | 315 | 316 | #----------------------------------------- implementations of Gaters ---------------------------------------------# 317 | 318 | 319 | class GaterNodeSoftmax(AbstractGaterNode): 320 | ''' 321 | Gate node of Hierarchical Mixture of Experts with softmax transfer function. 322 | Calculates responsibilities and updates parmameters using weighted softmax regression. 323 | ''' 324 | 325 | def __init__(self,*args,**kwargs): 326 | ''' Initialises gate node ''' 327 | super(GaterNodeSoftmax,self).__init__(*args,**kwargs) 328 | self.model = sr.SoftmaxRegression(self.conv_threshold, self.max_iter,self.stop_learning_sr) 329 | self.model.init_params(self.m,self.k) 330 | 331 | 332 | class GaterNodeWGDA(AbstractGaterNode): 333 | ''' 334 | Gate node of Hierarchical Mixture of Experts with weighted gaussian discriminant 335 | analysis as gating model. Calculates responsibilities and updates parameters 336 | of gating model. 
337 | ''' 338 | 339 | def __init__(self,*args,**kwargs): 340 | ''' Initialises gate node ''' 341 | super(GaterNodeWGDA,self).__init__(*args,**kwargs) 342 | self.model = wgda.WeightedGaussianDiscriminantAnalysis(bias_term = self.bias, 343 | stop_learning = self.stop_learning_wgda) 344 | if self.bias is True: 345 | self.model.init_params(self.m-1,self.k) 346 | else: 347 | self.model.init_params(self.m,self.k) 348 | 349 | 350 | 351 | ################################################## Expert Nodes ########################################################## 352 | 353 | 354 | #----------------------------------------- Abstarct Expert Class ---------------------------------------------------------# 355 | 356 | 357 | 358 | class ExpertNodeAbstract(Node): 359 | ''' 360 | Abstract Base Class for experts (linear, logistic etc. regressions) 361 | ''' 362 | 363 | def down_tree_pass(self,X,Y,nodes): 364 | ''' 365 | Calculates responsibilities and performs weighted maximum likelihood 366 | estimation. 367 | 368 | Parameters: 369 | ----------- 370 | 371 | X: numpy array of size 'n x m' 372 | Explanatory variables 373 | 374 | Y: numpy array of size 'n x m' 375 | Target variables that should be approximated 376 | 377 | nodes: list of size equal number of nodes in HME 378 | List with all nodes of HME 379 | 380 | ''' 381 | # E-step of EM algorithm 382 | parent, birth_order = self.get_parent_and_birth_order(nodes) 383 | 384 | self.weights = parent.responsibilities[:,birth_order] - parent.normaliser 385 | self.weights += parent.weights 386 | 387 | # prevent underflow in weighted regressions 388 | self.bound_weights = bounded_variable(np.exp(self.weights),self.underflow_tol) 389 | 390 | # M-step of EM algorithm 391 | self._m_step_update(X,Y) 392 | 393 | 394 | def up_tree_pass(self,X,Y): 395 | ''' 396 | Calculates prior probability of latent variables corresponding to 397 | expert at node and likelihood. 
398 | 399 | Parameters: 400 | ----------- 401 | 402 | X: numpy array of size 'n x m' 403 | Explanatory variables 404 | 405 | Y: numpy array of size 'n x 1' 406 | Target variable that should be approximated 407 | 408 | ''' 409 | self._prior(X,Y) 410 | 411 | 412 | def propagate_prediction(self,X,nodes, predict_type = "predict_response",y_lo=None,y_hi=None): 413 | ''' 414 | Returns prediction of expert for test input X 415 | 416 | Parameters: 417 | ----------- 418 | 419 | X: numpy array of size 'unknown x m' 420 | Explanatory variables for test set 421 | 422 | nodes: list of size equal number of nodes in HME 423 | List with all nodes of HME 424 | 425 | predict_type: str 426 | Can be "predict_response", "predict_probs", "predict_cdf" 427 | "predict_response" - works for all types of experts 428 | "predict_probs" - works for classification experts ('wgda','softmax') 429 | "predict_cdf" - works only for 'gaussian' expert 430 | 431 | Returns: 432 | -------- 433 | : numpy array of size 'unknown x m' 434 | Weighted prediction 435 | 436 | ''' 437 | if predict_type == "predict_probs": 438 | return self.model.predict_probs(X) 439 | elif predict_type == "predict_response": 440 | return self.model.predict(X) 441 | elif predict_type == "predict_cdf": 442 | return self.model.posterior_cdf(X,y_lo,y_hi) 443 | else: 444 | raise NotImplementedError("Not implemented prediction type") 445 | 446 | 447 | def propagate_log_probs(self,X,Y): 448 | ''' Returns log-probability of observing Y given X and parameters''' 449 | return self.model.posterior_log_probs(X,Y) 450 | 451 | 452 | def _prior(self,X,Y): 453 | ''' Calculates log-probability of observing Y given X and parameters of expert ''' 454 | self.weights = self.model.posterior_log_probs(X,Y) 455 | 456 | 457 | def _m_step_update(self,X,Y): 458 | ''' Updates parameters of expert model (e.g. regression coefficients and variance estimates) ''' 459 | # parameters are updated and saved in expert 460 | self.model.fit(Y,X,self.bound_weights) 461 | 462 | 463 | 464 | #-------------------------------------- Implementation of Expert Nodes --------------------------------------------------# 465 | 466 | 467 | class ExpertNodeLinReg(ExpertNodeAbstract): 468 | ''' 469 | Expert node in Hierarchical Mixture of Experts, with expert being 470 | standard weighted linear regression.
471 | ''' 472 | 473 | def __init__(self,*args,**kwargs): 474 | ''' Initialise linear regression expert node ''' 475 | super(ExpertNodeLinReg,self).__init__(*args,**kwargs) 476 | self.model = wlr.WeightedLinearRegression(stop_learning = self.stop_learning_wlr) 477 | self.model.init_params(self.m) 478 | self.node_type = "expert" 479 | 480 | 481 | class ExpertNodeSoftmaxReg(ExpertNodeAbstract): 482 | ''' 483 | Expert Node with Softmax model as an expert 484 | ''' 485 | 486 | def __init__(self,*args, **kwargs): 487 | super(ExpertNodeSoftmaxReg,self).__init__(*args,**kwargs) 488 | self.model = sr.SoftmaxRegression( tolerance = self.conv_threshold, 489 | max_iter = self.max_iter, 490 | stop_learning = self.stop_learning_sr) 491 | self.model.init_params(self.m, self.classes) 492 | self.node_type = "expert" 493 | 494 | 495 | 496 | class ExpertNodeWGDA(ExpertNodeAbstract): 497 | ''' 498 | Expert Node with Gaussian Discriminant Analysis as an expert 499 | ''' 500 | 501 | def __init__(self,*args,**kwargs): 502 | super(ExpertNodeWGDA,self).__init__(*args,**kwargs) 503 | self.model = wgda.WeightedGaussianDiscriminantAnalysis(stop_learning = self.stop_learning_wgda, 504 | bias_term = self.bias) 505 | if self.bias is True: 506 | self.model.init_params(self.m-1,self.classes) 507 | else: 508 | self.model.init_params(self.m,self.classes) 509 | self.node_type ="expert" 510 | 511 | 512 | 513 | --------------------------------------------------------------------------------
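Below is a minimal usage sketch of the WeightedLinearRegression expert on its own (outside the HME tree), using train_test_split from helpers.py. The synthetic data, the explicit bias column and the uniform weights are illustrative assumptions, not part of the repository:

# usage sketch (illustrative only): weighted linear regression with uniform weights
import numpy as np
from helpers import train_test_split
from weighted_lin_reg import WeightedLinearRegression

n = 500
x = np.random.randn(n)
X = np.column_stack([x, np.ones(n)])            # last column acts as a bias term
y = 2.0 * x + 1.0 + 0.1 * np.random.randn(n)    # hypothetical linear relationship

X_train, X_test, y_train, y_test = train_test_split(X, y, test_p=0.25)

# uniform weights reduce weighted least squares to ordinary least squares
wlr = WeightedLinearRegression(solver="qr")
wlr.init_params(X_train.shape[1])
wlr.fit(y_train, X_train, weights=np.ones(X_train.shape[0]))

y_hat = wlr.predict(X_test)
print("test MSE:", np.mean((y_test - y_hat) ** 2))

Within the HME, the same fit method is called by ExpertNodeLinReg during the M-step, with the bounded responsibilities of the node passed as the weights argument.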