├── LICENSE
├── README.md
├── free_energy_clustering
│   ├── FE_landscape_clustering.py
│   ├── GMM.py
│   ├── GMM_free_energy.py
│   ├── __init__.py
│   ├── cluster_density.py
│   ├── cross_validation.py
│   ├── free_energy_pathways.py
│   └── stack_landscapes.py
├── toy_models
│   ├── Kmeans_cluster.py
│   ├── __init__.py
│   ├── agglomerative_ward_cluster.py
│   ├── evaluate_toy_models.py
│   ├── spectral_cluster.py
│   ├── toy_model_GMM_2D.py
│   ├── toy_model_blobs.py
│   ├── toy_model_moons.py
│   ├── toy_model_multiple_GMMs.py
│   └── toy_model_nonlinear_GMM_2D.py
└── tutorial_free_energy_clustering.ipynb
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 delemottelab
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Free energy estimation and clustering with InfleCS
2 | This repository contains a Jupyter notebook tutorial, together with the code needed to estimate free energy landscapes with Gaussian mixture models and to extract core states from density maxima with InfleCS clustering.
3 |
4 | Dependencies:
5 | * Python 3.6
6 | * Scikit-learn 0.19 or later
7 | * Matplotlib
8 |
9 | -----------------------------------------------------------
10 | # References
11 | **Free energy estimation with Gaussian mixture models**
12 | *Inference of Calmodulin’s Ca2+-Dependent Free Energy Landscapes via Gaussian Mixture Model Validation*
13 | Annie M. Westerlund, Tyler J. Harpole, Christian Blau, and Lucie Delemotte
14 | Journal of Chemical Theory and Computation, 2018
15 | DOI: 10.1021/acs.jctc.7b00346
16 |
17 |
18 | **Clustering with InfleCS**
19 | *InfleCS: Clustering Free Energy Landscapes with Gaussian Mixtures*
20 | Annie M. Westerlund, Lucie Delemotte
21 | Journal of Chemical Theory and Computation, 2019
22 | DOI: 10.1021/acs.jctc.9b00454
23 |
24 | ----------------------------------------------------------
25 | Annie Westerlund, KTH Royal Institute of Technology, 2019
26 |
--------------------------------------------------------------------------------
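
A minimal end-to-end usage sketch, assembled from the class and method signatures in free_energy_clustering/GMM_free_energy.py below. The synthetic data and all argument values are illustrative assumptions; the tutorial notebook is the authoritative walkthrough.

```python
import numpy as np
import free_energy_clustering as FEC

# Synthetic 2D data with two basins (a stand-in for projected MD frames).
np.random.seed(0)
data = np.vstack([np.random.randn(500, 2) * 0.3,
                  np.random.randn(500, 2) * 0.3 + [2.0, 2.0]])

# Select the number of GMM components (2..6) with 3-fold cross-validation.
fec = FEC.FreeEnergyClustering(data, min_n_components=2, max_n_components=6,
                               n_splits=3, n_grids=40, temperature=300.0)
coords, FE_landscape, FE_points = fec.landscape()

# InfleCS clustering: grid points define the core states, data points get labels.
labels, centers = fec.cluster(coords, FE_points, eval_points=data)
fec.visualize(show_data=True, savefig=False)
```
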
/free_energy_clustering/FE_landscape_clustering.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import numpy as np
3 | from scipy.optimize import fmin_cg
4 | import free_energy_clustering.cluster_density as cluster
5 | from scipy.spatial.distance import cdist
6 | from scipy.stats import multivariate_normal
7 |
8 | class LandscapeClustering():
9 |
10 | def __init__(self, ensemble_of_GMMs=False, verbose=True):
11 | self.cluster_centers_ = None
12 | self.labels_ = None
13 | self.ensemble_of_GMMs = ensemble_of_GMMs
14 | self.clusterer_ = None
15 | self.verbose_ = verbose
16 | return
17 |
18 | def get_cluster_representative(self, x, labels, free_energies):
19 | """
20 | 		Get the point with minimum free energy (FE) in each cluster.
21 | """
22 | n_clusters = int(np.max(labels) + 1)
23 | n_points = x.shape[0]
24 |
25 | print('Cluster labels: '+str(np.unique(labels)))
26 |
27 | min_FE_inds = np.zeros(n_clusters-1)
28 | all_inds = np.arange(n_points)
29 | mask = np.ones(n_clusters-1,dtype=bool)
30 | for i_cluster in range(1,n_clusters):
31 | cluster_inds = all_inds[labels == i_cluster]
32 | if cluster_inds.shape[0] > 0:
33 | min_FE_inds[i_cluster-1] = cluster_inds[np.argmin(free_energies[cluster_inds])]
34 | else:
35 | min_FE_inds[i_cluster-1] = np.nan
36 | mask[i_cluster-1]=False
37 | print('No point in cluster '+str(i_cluster))
38 |
39 | self.cluster_centers_ = min_FE_inds[mask].astype(int)
40 | return self.cluster_centers_
41 |
42 | def assign_transition_points(self, cluster_indices, points, density_model):
43 | """
44 | 		Assign cluster indices to transition (non-core) points: in order of decreasing
45 | 		density, each point inherits the cluster index of its nearest already-assigned point.
46 | 		:return: cluster indices of all points
47 | """
48 | print("Assigning cluster indices to non-core cluster points.")
49 | if np.sum(cluster_indices) == 0: # If all points are marked as transition points
50 | return cluster_indices+1
51 |
52 | cl_inds_final = np.copy(cluster_indices)
53 | transition_point_inds = np.where(cluster_indices==0)[0]
54 | n_assigned = np.sum(cluster_indices>0)
55 |
56 | # Sort points from higher to lower density
57 | density_all = density_model.density(points)
58 | densities_trans_points = density_all[transition_point_inds]
59 |
60 | 		# Sort transition points in descending density order (assign cluster index to highest-density points first)
61 | sort_inds = np.argsort(-densities_trans_points)
62 | transition_point_inds = transition_point_inds[sort_inds]
63 |
64 | counter = 0
65 | for ind in transition_point_inds:
66 |
67 | point = points[ind]
68 | # Extract assigned points
69 | assigned_inds = np.where(cl_inds_final>0)[0]
70 | assigned_points = points[assigned_inds,:]
71 | distances = cdist(point[np.newaxis,:],assigned_points)
72 |
73 | # Find closest assigned point. Use its cluster index on the current unassigned point.
74 | closest_point = np.argmin(distances[0,:])
75 | cl_inds_final[ind] = cl_inds_final[assigned_inds[closest_point]]
76 |
77 | n_assigned += 1
78 | counter += 1
79 | return cl_inds_final
80 |
81 | def _compute_gradients(self, density_model, points, inv_covs=None):
82 | n_points = points.shape[0]
83 | n_dims = points.shape[1]
84 | n_components = density_model.n_components_
85 |
86 | means = density_model.means_
87 | covs = density_model.covariances_
88 | weights = density_model.weights_
89 |
90 | gradients = np.zeros((n_points, n_dims))
91 |
92 | compute_inv_covs = False
93 | if inv_covs is None:
94 | inv_covs = [np.zeros((n_dims, n_dims))] * n_components
95 | compute_inv_covs = True
96 |
97 | for i_component in range(n_components):
98 | if compute_inv_covs:
99 | inv_covs[i_component] = np.linalg.inv(covs[i_component])
100 |
101 | devs = points - means[i_component]
102 | exp_deriv = -devs.dot(inv_covs[i_component])
103 | for i_point in range(n_points):
104 | gradients[i_point, :] += weights[i_component] * exp_deriv[i_point, :] * multivariate_normal.pdf(
105 | points[i_point, :], mean=means[i_component], cov=covs[i_component])
106 | if compute_inv_covs:
107 | return gradients, inv_covs
108 | return gradients
109 |
110 | def _compute_GMM_Hessian(self, density_model, x, inv_covs):
111 | n_dims = x.shape[0]
112 | n_components = density_model.n_components_
113 |
114 | means = density_model.means_
115 | covs = density_model.covariances_
116 | weights = density_model.weights_
117 |
118 | hessian = np.zeros((n_dims, n_dims))
119 |
120 | for i_component in range(n_components):
121 | devs = x - means[i_component]
122 | exp_deriv = -devs.dot(inv_covs[i_component])
123 |
124 | # Compute Hessian at current point
125 | for i_dim in range(n_dims):
126 | for j_dim in range(n_dims):
127 | post_weight = weights[i_component] * multivariate_normal.pdf(x, mean=means[i_component],
128 | cov=covs[i_component])
129 | hessian[i_dim, j_dim] += post_weight * (
130 | -inv_covs[i_component][i_dim, j_dim] + exp_deriv[i_dim] * exp_deriv[j_dim])
131 |
132 | return hessian
133 |
134 | def _compute_GMM_FE_Hessian(self, density_model, x, inv_covs):
135 | n_dims = x.shape[0]
136 | n_components = density_model.n_components_
137 |
138 | means = density_model.means_
139 | covs = density_model.covariances_
140 | weights = density_model.weights_
141 |
142 | hessian = np.zeros((n_dims, n_dims))
143 |
144 | point = x[np.newaxis,:]
145 | gradient = self._compute_gradients(density_model, point, inv_covs=inv_covs)
146 | density = density_model.density(point)
147 | density[density<1e-15] = 1e-15
148 |
149 | for i_component in range(n_components):
150 | devs = x - means[i_component]
151 | exp_deriv = -devs.dot(inv_covs[i_component])
152 |
153 | # Compute Hessian at current point
154 | for i_dim in range(n_dims):
155 | for j_dim in range(n_dims):
156 | post_weight = weights[i_component] * multivariate_normal.pdf(x, mean=means[i_component],
157 | cov=covs[i_component])
158 | hessian[i_dim, j_dim] += post_weight * (
159 | -inv_covs[i_component][i_dim, j_dim] + exp_deriv[i_dim] * exp_deriv[j_dim])
160 |
161 | for i_dim in range(n_dims):
162 | for j_dim in range(n_dims):
163 | FE_hess = 1.0/density**2 * gradient[0,i_dim]*gradient[0,j_dim]-hessian[i_dim, j_dim]/density
164 | hessian[i_dim, j_dim] = FE_hess
165 |
166 | return hessian
167 |
168 | def _Hessian_def(self, density_model, points, use_FE_landscape=False):
169 | """
170 | 		Compute the Hessian at every point to check whether the point
171 | 		belongs to a free energy minimum (equivalently, a density maximum).
172 | """
173 | n_points = points.shape[0]
174 | n_dims = points.shape[1]
175 |
176 | if self.ensemble_of_GMMs:
177 | n_models = density_model.n_models_
178 |
179 | is_FE_min = [False] * n_points
180 |
181 | # Compute all inverse covariances
182 | if self.ensemble_of_GMMs:
183 | all_inv_covs = [0]*n_models
184 | 			# n_components is read per model inside the loop below
185 | for i_model in range(n_models):
186 |
187 | n_components = density_model.GMM_list_[i_model].n_components_
188 |
189 | inv_covs = [np.zeros((n_dims, n_dims))] * n_components
190 | for i_component in range(n_components):
191 | inv_covs[i_component] = np.linalg.inv(density_model.GMM_list_[i_model].covariances_[i_component])
192 | all_inv_covs[i_model] = inv_covs
193 | else:
194 | n_components = density_model.n_components_
195 | inv_covs = [np.zeros((n_dims, n_dims))] * n_components
196 | for i_component in range(n_components):
197 | inv_covs[i_component] = np.linalg.inv(density_model.covariances_[i_component])
198 |
199 | # Computing Hessian to determine whether point belongs to FE min or not
200 | if use_FE_landscape:
201 | print('Computing Hessians of free energy landscape.')
202 | else:
203 | print('Computing Hessians of density landscape.')
204 |
205 | for i_point, x in enumerate(points):
206 | if self.verbose_:
207 | sys.stdout.write("\r"+'Point: '+str(i_point+1)+'/'+str(points.shape[0]))
208 | sys.stdout.flush()
209 | if self.ensemble_of_GMMs:
210 | hessian = np.zeros((n_dims,n_dims))
211 | for i_model in range(n_models):
212 | if density_model.model_weights_[i_model] > 0:
213 | if use_FE_landscape:
214 | hessian += density_model.model_weights_[i_model] * self._compute_GMM_FE_Hessian(
215 | density_model.GMM_list_[i_model], x, all_inv_covs[i_model])
216 | else:
217 | hessian += density_model.model_weights_[i_model]*self._compute_GMM_Hessian(density_model.GMM_list_[i_model],
218 | x, all_inv_covs[i_model])
219 | else:
220 | if use_FE_landscape:
221 | hessian = self._compute_GMM_FE_Hessian(density_model, x, inv_covs)
222 | else:
223 | hessian = self._compute_GMM_Hessian(density_model, x, inv_covs)
224 |
225 | # Compute Hessian eigenvalues
226 | eigvals = np.linalg.eigvals(hessian)
227 |
228 | if use_FE_landscape:
229 | # Check: if Hessian is positive definite => the point is at a free energy minimum
230 | if eigvals.min() > 0.0:
231 | is_FE_min[i_point] = True
232 | else:
233 | # Check: if Hessian is negative definite => the point is at a density maximum
234 | if eigvals.max() < 0.0:
235 | is_FE_min[i_point] = True
236 | if self.verbose_:
237 | print()
238 | return is_FE_min
239 |
240 | def cluster(self, density_models, points, eval_points=None, use_FE_landscape=False, transition_matrix=None):
241 | # Indicate whether points are at free energy minimum or not
242 | is_FE_min = self._Hessian_def(density_models, points, use_FE_landscape=use_FE_landscape)
243 | self.grid_points_=points
244 | # Cluster free energy landscape
245 | self.clusterer_ = cluster.ClusterDensity(points, eval_points)
246 | self.labels_ = self.clusterer_.cluster_data(is_FE_min, transition_matrix=transition_matrix)
247 | return self.labels_, is_FE_min
248 |
249 |
--------------------------------------------------------------------------------
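
The decision rule in _Hessian_def above marks a point as lying in a density maximum when the density Hessian is negative definite. For a single Gaussian the Hessian is p(x) * (g g^T - S^{-1}) with g = -S^{-1}(x - m), the same per-component expression accumulated in _compute_GMM_Hessian. A self-contained sketch of that criterion with made-up toy values:

```python
import numpy as np
from scipy.stats import multivariate_normal

mean = np.zeros(2)
cov = np.array([[0.5, 0.1], [0.1, 0.3]])
inv_cov = np.linalg.inv(cov)

def density_hessian(x):
    # Hessian of a single Gaussian density: p(x) * (g g^T - S^{-1}), g = -S^{-1}(x - m)
    g = -inv_cov @ (x - mean)
    p = multivariate_normal.pdf(x, mean=mean, cov=cov)
    return p * (np.outer(g, g) - inv_cov)

print(np.linalg.eigvals(density_hessian(mean)).max() < 0.0)                  # True: density maximum
print(np.linalg.eigvals(density_hessian(np.array([2.0, 2.0]))).max() < 0.0)  # False: tail point
```
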
/free_energy_clustering/GMM.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import math
3 | import numpy as np
4 | from scipy.stats import multivariate_normal
5 |
6 | class GaussianMixture():
7 |
8 | def __init__(self,n_components=2, convergence_tol=1e-6, verbose=False):
9 | self.n_components_ = n_components
10 | self.weights_ = np.ones(n_components)/float(n_components)
11 | self.means_ = np.zeros(n_components)
12 | self.covariances_ = [np.zeros((n_components,n_components))]*n_components
13 | self.tol_ = convergence_tol
14 | self.data_weights_ = None
15 | self.verbose_ = verbose
16 | return
17 |
18 | def fit(self, x, data_weights=None):
19 | """
20 | Fit GMM to points in x with EM.
21 | :param data_weights: Weights of each data point.
22 | """
23 |
24 | if data_weights is not None:
25 | x = x[data_weights>0]
26 | data_weights = data_weights[data_weights>0]
27 | data_weights = data_weights/np.sum(data_weights)
28 | data_weights = data_weights * data_weights.shape[0]
29 |
30 | 		self.data_weights_ = data_weights
31 |
32 | 		# Run EM until the change in log-likelihood falls below the tolerance.
33 | 		prev_loglikelihood = np.inf
34 | 		loglikelihood = 0
35 | 		self._initialize_parameters(x)
36 |
37 | 		while np.abs(prev_loglikelihood - loglikelihood) > self.tol_:
38 | 			gamma = self._expectation(x, self.data_weights_)
39 | 			self._maximization(x, gamma)
40 |
41 | 			prev_loglikelihood = loglikelihood
42 | 			loglikelihood = self.loglikelihood(x, self.data_weights_)
43 |
44 |
45 | 		return self
46 |
47 | def predict(self,x):
48 | gamma = self._expectation(x)
49 | labels = np.argmax(gamma,axis=0)
50 | return labels
51 |
52 | def _initialize_parameters(self,x):
53 | """
54 | Initialize component means and covariances
55 | """
56 | n_points = x.shape[0]
57 | inds = np.random.randint(n_points,size=self.n_components_)
58 |
59 | # Initialize means
60 | self.means_ = x[inds,:]
61 | # Initialize covariances
62 | tmp_cov = np.cov(x.T)
63 | for i_component in range(self.n_components_):
64 | self.covariances_[i_component] = tmp_cov
65 | return
66 |
67 | def _expectation(self,x, data_weights=None):
68 | """
69 | 		Perform expectation step.
70 | """
71 | n_points = x.shape[0]
72 | gamma = np.zeros((self.n_components_,n_points))
73 |
74 | for i_component in range(self.n_components_):
75 |
76 | normal_density = multivariate_normal.pdf(x, mean=self.means_[i_component], cov=self.covariances_[i_component])
77 | gamma[i_component, :] = self.weights_[i_component]*normal_density
78 |
79 | gamma /= np.sum(gamma,axis=0)
80 |
81 | if data_weights is not None:
82 | gamma = np.multiply(gamma, data_weights)
83 |
84 | return gamma
85 |
86 | def _maximization(self,x, gamma):
87 | """
88 | Update parameters with maximization step
89 | """
90 | self._update_weights(x, gamma)
91 | self._update_means(x, gamma)
92 | self._update_covariances(x, gamma)
93 | return
94 |
95 | def _update_weights(self,x, gamma):
96 | """
97 | Update each component amplitude.
98 | """
99 |
100 | self.weights_ = np.sum(gamma,axis=1)
101 |
102 | 		# Normalize the categorical distribution
103 | self.weights_ /= np.sum(self.weights_)
104 | return
105 |
106 |
107 | def _update_means(self,x, gamma):
108 | """
109 | Update each component mean.
110 | """
111 | Nk = np.sum(gamma,axis=1)
112 | for i_component in range(self.n_components_):
113 | self.means_[i_component, :] = np.dot(x.T,gamma[i_component])/Nk[i_component]
114 |
115 | return
116 |
117 | def _update_covariances(self, x, gamma):
118 | """
119 | Update each component covariance
120 | """
121 | n_dims = x.shape[1]
122 |
123 | Nk = np.sum(gamma, axis=1)
124 | for i_component in range(self.n_components_):
125 | y = x - self.means_[i_component]
126 | y2 = np.multiply(gamma[i_component,:,np.newaxis],y).T
127 | self.covariances_[i_component] = y2.dot(y)/Nk[i_component] + 1e-9*np.eye(n_dims)
128 |
129 | return
130 |
131 | def density(self, x):
132 | """
133 | Compute GMM density at given points, x.
134 | """
135 | n_points = x.shape[0]
136 | n_dims = x.shape[1]
137 |
138 | density = np.zeros(n_points)
139 | for i_component in range(self.n_components_):
140 | normal_density = multivariate_normal.pdf(x, mean=self.means_[i_component], cov=self.covariances_[i_component])
141 | density += self.weights_[i_component]*normal_density
142 |
143 | return density
144 |
145 | def loglikelihood(self, x, data_weights=None):
146 | """
147 | 		Compute log-likelihood. Supports data weights.
148 | """
149 | density = self.density(x)
150 | density[density<1e-15] = 1e-15
151 | if data_weights is None:
152 | log_density = np.log(density)
153 | else:
154 | log_density = np.multiply(np.log(density), data_weights)
155 | return np.mean(log_density)
156 |
157 | def bic(self, x, data_weights=None):
158 | """
159 | 		Compute BIC score. Supports data weights.
160 | """
161 | n_points, n_dims = x.shape
162 | n_params = (1 + n_dims + n_dims * (n_dims + 1) / 2.0) * self.n_components_
163 | loglikelihood = n_points * self.loglikelihood(x, data_weights=data_weights)
164 | return -2.0 * loglikelihood + n_params * math.log(n_points)
165 |
166 | def aic(self, x, data_weights=None):
167 | """
168 | 		Compute AIC score. Supports data weights.
169 | """
170 | n_points, n_dims = x.shape
171 | n_params = (1 + n_dims + n_dims * (n_dims + 1) / 2.0) * self.n_components_
172 | loglikelihood = n_points * self.loglikelihood(x, data_weights=data_weights)
173 | return -2.0 * loglikelihood + 2.0 * n_params
174 |
175 | def sample(self, n_points):
176 | """
177 | Sample points from the density model.
178 | :param n_points:
179 | :return:
180 | """
181 | n_dims = self.means_.shape[1]
182 | sampled_points = np.zeros((n_points, n_dims))
183 | prob_component = np.cumsum(self.weights_)
184 | r = np.random.uniform(size=n_points)
185 |
186 | is_point_sampled = np.zeros((n_points), dtype=int)
187 |
188 | for i_point in range(n_points):
189 | for i_component in range(self.n_components_):
190 | if r[i_point] <= prob_component[i_component]:
191 | sampled_points[i_point, :] = np.random.multivariate_normal(self.means_[i_component],
192 | self.covariances_[i_component], 1)
193 | is_point_sampled[i_point] = 1
194 | break
195 | if is_point_sampled[i_point] ==0:
196 | print('Warning: Did not sample point: '+str(r[i_point])+' '+str(prob_component))
197 | return sampled_points
198 |
--------------------------------------------------------------------------------
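
A short sketch of fitting the weighted-EM mixture above to synthetic data; the data and the uniform frame weights are assumptions made for illustration. Passing data_weights exercises the weighted code path in fit:

```python
import numpy as np
import free_energy_clustering.GMM as GMM

np.random.seed(1)
# Two well-separated 2D clusters.
x = np.vstack([np.random.randn(300, 2) * 0.2,
               np.random.randn(300, 2) * 0.2 + [3.0, 0.0]])
frame_weights = np.ones(x.shape[0])  # e.g. reweighting factors from biased sampling

gmm = GMM.GaussianMixture(n_components=2, convergence_tol=1e-6)
gmm.fit(x, data_weights=frame_weights)

labels = gmm.predict(x)    # hard assignments from the responsibilities
print(gmm.weights_)        # component amplitudes, roughly [0.5, 0.5]
print(gmm.bic(x))          # criterion used for model selection
samples = gmm.sample(100)  # draw new points from the fitted density
```
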
/free_energy_clustering/GMM_free_energy.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import numpy as np
3 | from scipy.spatial.distance import cdist
4 |
5 | import free_energy_clustering.GMM as GMM
6 | from sklearn.mixture import GaussianMixture
7 | import free_energy_clustering.cross_validation as CV
8 | import free_energy_clustering as FEC
9 |
10 | import matplotlib
11 | import matplotlib.pyplot as plt
12 | from mpl_toolkits.mplot3d import Axes3D
13 |
14 | class FreeEnergyClustering(object):
15 |
16 | def __init__(self, data, min_n_components=8, max_n_components=None, n_components_step=1, x_lims=None, temperature=300.0,
17 | n_grids=50, n_splits=1, shuffle_data=False, n_iterations=1, convergence_tol=1e-4, stack_landscapes=False,
18 | verbose=True, test_set_perc=0.0, data_weights=None):
19 | """
20 | 		Class for computing the free energy landscape in [kcal/mol].
21 | 		- data has dimensionality [N x d].
22 | """
23 | self.data_ = data
24 | self.shuffle_data = shuffle_data
25 | self.n_splits_ = n_splits
26 | self.n_iterations_ = n_iterations
27 | self.convergence_tol_ = convergence_tol
28 | self.stack_landscapes_ = stack_landscapes
29 |
30 | self.min_n_components = min_n_components
31 | self.max_n_components = max_n_components
32 | self.n_components_step = n_components_step
33 |
34 | self.FE_points_ = None
35 | self.FE_landscape_ = None
36 | self.coords_ = None
37 | self.min_FE_ = None
38 |
39 | self.cl_ = None # Clustering object
40 | self.labels_ = None
41 | self.core_labels_ = None
42 | self.cluster_centers_ = None
43 | self.pathways_ = None
44 | self.state_populations_ = None
45 |
46 | if x_lims is not None:
47 | self.x_lims_ = x_lims
48 | self.n_dims_ = len(self.x_lims_)
49 | else:
50 | if len(data.shape) > 1:
51 | self.x_lims_ = []
52 | for i in range(data.shape[1]):
53 | self.x_lims_.append([data[:,i].min(),data[:,i].max()])
54 | self.n_dims_ = len(self.x_lims_)
55 | else:
56 | self.x_lims_ = [[data.min(),data.max()]]
57 | self.n_dims_ = 1
58 |
59 | self.temperature_ = temperature # [K]
60 | self.boltzmann_constant_ = 0.0019872041 # [kcal/(mol K)]
61 | self.density_est_ = None
62 | self.standard_error_FE_ = None
63 | self.nx_ = n_grids
64 | self.n_grids_ = [self.nx_]*self.n_dims_
65 | self.test_set_perc_ = test_set_perc
66 | self.verbose_ = verbose
67 | self.data_weights_ = data_weights
68 |
69 | self.BICs_ = []
70 |
71 | if data_weights is not None:
72 | use_data_weights = True
73 | # Convert data weights to the right format
74 | self.data_weights_ /= self.data_weights_.sum()
75 | self.data_weights_ *= self.data_weights_.shape[0]
76 | else:
77 | use_data_weights = False
78 |
79 | self.test_set_loglikelihood = None
80 | if verbose:
81 | print('*----------------Gaussian mixture model free energy estimator----------------*')
82 | print(' n_splits = '+str(n_splits))
83 | print(' shuffle_data = ' + str(shuffle_data))
84 | print(' n_iterations = ' + str(n_iterations))
85 | print(' n_grids = ' + str(n_grids))
86 | 			print('    convergence_tol = ' + str(convergence_tol))
87 | print(' stack_landscapes = ' + str(stack_landscapes))
88 | print(' x_lims (axes limits) = ' + str(self.x_lims_))
89 | print(' temperature = ' + str(temperature))
90 | print(' min_n_components = ' + str(min_n_components))
91 | print(' max_n_components = ' + str(max_n_components))
92 | print(' n_components_step = ' + str(n_components_step))
93 | print(' Using weighted data: ' + str(use_data_weights))
94 | print('*----------------------------------------------------------------------------*')
95 | return
96 |
97 | def _get_grid_coords(self):
98 | if self.n_dims_ < 4:
99 | x = []
100 | self.n_grids_ = []
101 | for i_dim in range(self.n_dims_):
102 | self.n_grids_.append(self.nx_)
103 | x.append(np.linspace(self.x_lims_[i_dim][0], self.x_lims_[i_dim][1], self.nx_))
104 |
105 | if self.n_dims_ == 1:
106 | return x
107 | coords = np.meshgrid(*x)
108 | else:
109 | # Do not discretize
110 | print('Note: # features > 3 => density not evaluated on grid.')
111 | coords = None
112 |
113 | return coords
114 |
115 | def _density_landscape(self, density_est):
116 | """
117 | Evaluate density model at the grid points.
118 | """
119 | if self.coords_ is None:
120 | coords = self._get_grid_coords()
121 | else:
122 | coords = self.coords_
123 |
124 | if self.n_dims_ == 1:
125 | densities = density_est.density(coords[0][:,np.newaxis])
126 | return coords, densities
127 |
128 | if coords is not None:
129 | print('Density grid shape: '+str(self.n_grids_))
130 | grid_points_flatten = []
131 | for x in coords:
132 | grid_points_flatten.append(np.ravel(x))
133 | points = np.asarray(grid_points_flatten).T
134 | densities = density_est.density(points)
135 | densities = np.reshape(densities, self.n_grids_)
136 | else:
137 | densities = density_est.density(self.data_)
138 |
139 | return coords, densities
140 |
141 | def _free_energy(self,density):
142 | density[density < 1e-8] = 1e-8
143 | FE = -self.temperature_ * self.boltzmann_constant_ * np.log(density)
144 | return FE
145 |
146 | def standard_error(self, n_data_blocks=3):
147 | """
148 | Estimating standard error.
149 | """
150 | print('Estimating standard error.')
151 | n_points = self.data_.shape[0]
152 | n_data_points = int(n_points/n_data_blocks)
153 |
154 | free_energies = []
155 |
156 | for i in range(n_data_blocks):
157 |
158 | if i != n_data_blocks-1:
159 | data = np.copy(self.data_[i*n_data_points:(i+1)*n_data_points])
160 | else:
161 | data = np.copy(self.data_[i*n_data_points::])
162 |
163 | if self.n_dims_ == 1:
164 | data = data[:,np.newaxis]
165 |
166 | _, density_model = self._fit_FE(data, set_density_model=False)
167 | _, density = self._density_landscape(density_model)
168 | free_energies.append(self._free_energy(density))
169 |
170 | free_energies = np.asarray(free_energies)
171 | self.standard_error_FE_ = np.std(free_energies,axis=0)/np.sqrt(n_data_blocks-1)
172 | print('Standard error estimation done.')
173 | return self.standard_error_FE_
174 |
175 | def _train_GMM(self, data, n_components, train_inds=None, val_inds=None, loglikelihood=0):
176 | """
177 | Perform one training of GMM.
178 | :param data:
179 | :param n_components:
180 | :return:
181 | """
182 |
183 | if train_inds is not None and val_inds is not None:
184 | training_data, validation_data = CV.get_train_validation_set(data, train_inds, val_inds)
185 | else:
186 | training_data = np.copy(data)
187 | validation_data = np.copy(data)
188 |
189 | if self.data_weights_ is None:
190 | gmm = GaussianMixture(n_components=n_components, tol=self.convergence_tol_)
191 |
192 | # Train model on the current training data
193 | gmm.fit(training_data)
194 |
195 | # Check log-likelihood of validation data
196 | loglikelihood += gmm.score(validation_data)
197 | else:
198 | gmm = GMM.GaussianMixture(n_components=n_components, convergence_tol=self.convergence_tol_,verbose=self.verbose_)
199 |
200 | training_data_weights = self.data_weights_
201 | validation_data_weights = self.data_weights_
202 |
203 | if train_inds is not None and val_inds is not None:
204 | if self.data_weights_ is not None:
205 | training_data_weights, validation_data_weights = CV.get_train_validation_set(self.data_weights_,
206 | train_inds, val_inds)
207 |
208 | # Train model on the current training data
209 | gmm.fit(training_data, data_weights=training_data_weights)
210 |
211 | # Check log-likelihood of validation data
212 | loglikelihood += gmm.loglikelihood(validation_data, data_weights=validation_data_weights)
213 |
214 | return gmm, loglikelihood
215 |
216 | def _fit_FE(self, data, set_density_model=True):
217 | """
218 | Fit density to data points.
219 | :param data: [n_samples x n_dims]
220 | :return: free energy of points
221 | """
222 |
223 | best_n_components = self.min_n_components
224 |
225 | # Extract test set from the dataset
226 | n_points_test = int(self.test_set_perc_*data.shape[0])
227 | data_orig = np.copy(data)
228 | data_weights_orig = np.copy(self.data_weights_)
229 |
230 | if n_points_test > 0:
231 | test_data = data[-n_points_test::,:]
232 | data = np.copy(data[0:-n_points_test, :])
233 | if self.data_weights_ is not None:
234 | 				self.data_weights_ = np.copy(self.data_weights_[0:-n_points_test])
235 | else:
236 | test_data = np.zeros((0,self.n_dims_))
237 |
238 | if self.stack_landscapes_:
239 | print('Estimating density with stacked GMMs.')
240 | else:
241 | print('Estimating density with GMM.')
242 |
243 | if self.data_weights_ is not None:
244 | print('Using weighted data to estimate GMM.')
245 |
246 | best_loglikelihood = -np.inf
247 | list_of_GMMs = []
248 | list_of_validation_data = []
249 | ICs = []
250 |
251 | # Get indices of training and validation datasets
252 | if self.n_splits_ > 1:
253 | train_inds, val_inds = CV.split_train_validation(data, self.n_splits_, self.shuffle_data)
254 |
255 | # Determine number of components with k-fold cross-validation,
256 | # or store all estimated densities and then weight together.
257 | if self.max_n_components is not None:
258 | for n_components in range(self.min_n_components,self.max_n_components+1,self.n_components_step):
259 | if self.verbose_:
260 | print('# Components = '+str(n_components))
261 |
262 | if self.n_splits_ > 1 and not(self.stack_landscapes_):
263 | loglikelihood = 0
264 | for i_split in range(self.n_splits_):
265 | gmm, loglikelihood = self._train_GMM(data, n_components, train_inds[i_split], val_inds[i_split], loglikelihood)
266 |
267 | # Keep best model
268 | if loglikelihood > best_loglikelihood:
269 | best_loglikelihood = loglikelihood
270 | best_n_components = n_components
271 | else:
272 | best_loglikelihood = -np.inf
273 | for i_iter in range(self.n_iterations_):
274 | # Train GMM
275 | gmm, loglikelihood = self._train_GMM(data, n_components)
276 |
277 | # Compute average AIC/BIC over iterations
278 | if i_iter == 0:
279 | if self.stack_landscapes_:
280 | if self.data_weights_ is None:
281 | ICs.append(gmm.aic(data))
282 | else:
283 | ICs.append(gmm.aic(data, self.data_weights_))
284 | else:
285 | if self.data_weights_ is None:
286 | ICs.append(gmm.bic(data))
287 | else:
288 | ICs.append(gmm.bic(data, self.data_weights_))
289 |
290 | # Keep best model
291 | if loglikelihood > best_loglikelihood:
292 | best_loglikelihood = loglikelihood
293 | if i_iter == 0:
294 | list_of_GMMs.append(GMM.GaussianMixture(n_components=n_components))
295 |
296 | if self.stack_landscapes_:
297 | ICs[-1] = gmm.aic(data)
298 | else:
299 | ICs[-1] = gmm.bic(data)
300 |
301 | list_of_GMMs[-1].weights_ = gmm.weights_
302 | list_of_GMMs[-1].means_ = gmm.means_
303 | list_of_GMMs[-1].covariances_ = gmm.covariances_
304 |
305 | if self.stack_landscapes_:
306 | if self.max_n_components is None:
307 | gmm, _ = self._train_GMM(data, self.min_n_components)
308 | list_of_GMMs.append(gmm)
309 |
310 | ICs = np.asarray(ICs)
311 | model_weights = np.exp(-0.5 *(ICs-ICs.min()))
312 | model_weights /= model_weights.sum()
313 |
314 | # Fit mixture of density estimators using the validation data
315 | density_est = FEC.LandscapeStacker(data, list_of_validation_data, list_of_GMMs, n_splits=1,
316 | convergence_tol=self.convergence_tol_, n_iterations=self.n_iterations_,
317 | model_weights=model_weights)
318 |
319 | density = density_est.density(data_orig)
320 | if set_density_model:
321 | self.density_est_ = density_est
322 | else:
323 | # Estimate FE with best number of components (deduced from cross-validation)
324 | if self.n_splits_ > 1:
325 | print('Training final model with ' + str(best_n_components) + ' components.')
326 | best_loglikelihood = -np.inf
327 | density_est = GMM.GaussianMixture(n_components=best_n_components)
328 | 			# Fit multiple times and keep the parameters with the best log-likelihood
329 | for i_iter in range(self.n_iterations_):
330 | gmm, loglikelihood = self._train_GMM(data, best_n_components)
331 |
332 | if loglikelihood > best_loglikelihood:
333 | best_loglikelihood = loglikelihood
334 | density_est.weights_ = gmm.weights_
335 | density_est.means_ = gmm.means_
336 | density_est.covariances_ = gmm.covariances_
337 | else:
338 | ICs = np.asarray(ICs)
339 | self.BICs_ = np.copy(ICs)
340 | model_ind = ICs.argmin()
341 | gmm = list_of_GMMs[model_ind]
342 | best_n_components = gmm.weights_.shape[0]
343 | density_est = GMM.GaussianMixture(n_components=best_n_components)
344 |
345 | print('Identifying final model with ' + str(density_est.n_components_) + ' components.')
346 |
347 | density_est.weights_ = gmm.weights_
348 | density_est.means_ = gmm.means_
349 | density_est.covariances_ = gmm.covariances_
350 |
351 | density = density_est.density(data_orig)
352 |
353 | if set_density_model:
354 | self.density_est_ = density_est
355 |
356 | if set_density_model:
357 | # Compute test set loglikelihood on the test set if test set exists
358 | if n_points_test > 0:
359 | self.test_set_loglikelihood = self.density_est_.loglikelihood(test_data)
360 | return self._free_energy(density)
361 | else:
362 | return self._free_energy(density), density_est
363 |
364 | def landscape(self):
365 | """
366 | Computing free energy landscape with
367 | G(x) = -kT*log(p(x|T))
368 | Returns the X,Y coordinate matrices (meshgrid) and
369 | their corresponding free energy.
370 | """
371 |
372 | if len(self.data_.shape) == 1:
373 | FE_points = self._fit_FE(self.data_[:,np.newaxis])
374 | else:
375 | FE_points = self._fit_FE(self.data_)
376 |
377 | print('Evaluating density in landscape')
378 | coords, density = self._density_landscape(self.density_est_)
379 |
380 | FE_landscape = self._free_energy(density)
381 |
382 | # Shift to zero
383 | self.min_FE_ = np.min(FE_landscape)
384 | FE_landscape = FE_landscape-self.min_FE_
385 | FE_points = FE_points-self.min_FE_
386 |
387 | self.FE_points_ = FE_points
388 | self.FE_landscape_ = FE_landscape
389 | self.coords_ = coords
390 |
391 | return coords, FE_landscape, FE_points
392 |
393 | def evaluate_free_energy(self,data):
394 | """
395 | Evaluate the free energy of given data in the current free energy model.
396 | """
397 | density = self.density_est_.density(data)
398 | free_energy = self._free_energy(density)
399 | if self.min_FE_ is not None:
400 | free_energy -= self.min_FE_
401 |
402 | return free_energy
403 |
404 | def population_states(self, n_sampled_points=10000):
405 | """
406 | 		Estimate the population of states (probability of being in a state) based on Monte Carlo integration of
407 | the estimated density and state definitions.
408 | :param n_sampled_points:
409 | :return:
410 | """
411 |
412 | if self.stack_landscapes_:
413 | state_populations = None
414 | print('TODO: Estimating population of states is not possible with stacked landscapes yet.')
415 | else:
416 |
417 | print('Sampling points from density.')
418 | # Sample points from estimated density
419 | points = self.density_est_.sample(n_sampled_points)
420 |
421 | # Assign cluster labels of sampled points
422 | cluster_labels = self.evaluate_clustering(points)
423 |
424 | print('Computing state populations.')
425 | # Monte-Carlo integration (histogramming)
426 | self.state_populations_, _ = np.histogram(cluster_labels, bins=int(self.labels_.max()+1), range=(self.labels_.min(),self.labels_.max()),density=False)
427 |
428 |
429 | self.state_populations_ = self.state_populations_/self.state_populations_.sum()
430 |
431 | return self.state_populations_
432 |
433 | def evaluate_clustering(self, points, assign_transition_points=False):
434 | """
435 | Assign cluster indices to points based on precomputed density model clustering.
436 | """
437 | print('Assigning cluster labels based on precomputed density model clustering.')
438 | if self.cl_ is not None and self.cl_.clusterer_ is not None:
439 | labels = self.cl_.clusterer_.data_cluster_indices(cdist(points, self.cl_.clusterer_.grid_points_), self.cl_.clusterer_.grid_cluster_inds_)
440 |
441 | if assign_transition_points:
442 | labels = self.cl_.assign_transition_points(labels, points, self.density_est_)
443 |
444 | return labels
445 |
446 | def cluster(self, points, free_energies, eval_points=None, return_center_coords=False, assign_transition_points=False,use_FE_landscape=False, unravel_grid=True, transition_matrix=None):
447 | """
448 | Cluster points according to estimated density.
449 | """
450 |
451 | self.transition_matrix_ = transition_matrix
452 |
453 | print('Clustering free energy landscape...')
454 | self.cl_ = FEC.LandscapeClustering(self.stack_landscapes_,verbose=self.verbose_)
455 |
456 | if eval_points is not None and unravel_grid:
457 | tmp_points = []
458 | for x in points:
459 | tmp_points.append(np.ravel(x))
460 | points = np.asarray(tmp_points).T
461 |
462 | if len(points.shape) == 1:
463 | points = points[:,np.newaxis]
464 |
465 |
466 | if eval_points is not None:
467 | if len(eval_points.shape) == 1:
468 | eval_points = eval_points[:,np.newaxis]
469 |
470 | self.labels_, self.is_FE_min = self.cl_.cluster(self.density_est_, points, eval_points=eval_points, use_FE_landscape=use_FE_landscape, transition_matrix=self.transition_matrix_)
471 |
472 | self.core_labels_ = np.copy(self.labels_)
473 |
474 | if eval_points is not None:
475 | self.cluster_centers_ = self.cl_.get_cluster_representative(eval_points, self.labels_, free_energies)
476 | else:
477 | self.cluster_centers_ = self.cl_.get_cluster_representative(points, self.labels_, free_energies)
478 |
479 | if assign_transition_points:
480 | if eval_points is not None:
481 | self.labels_ = self.cl_.assign_transition_points(self.labels_, eval_points, self.density_est_)
482 | else:
483 | self.labels_ = self.cl_.assign_transition_points(self.labels_, points, self.density_est_)
484 |
485 | print('Done clustering.')
486 | if return_center_coords:
487 | return self.labels_, eval_points[self.cluster_centers_,:]
488 | else:
489 | return self.labels_, self.cluster_centers_
490 |
491 | def pathways(self, states_from, states_to,n_points=10, convergence_tol=1e-1, step_size=1e-3, max_iter=100):
492 | """
493 | Calculate minimum pathways between points (indices) in states_from and states_to.
494 | :param states_from:
495 | :param states_to:
496 | :param n_points:
497 | :param convergence_tol:
498 | :param step_size:
499 | :return:
500 | """
501 | pathway_estimator = FEC.FreeEnergyPathways(self.density_est_, self.data_, self.temperature_,
502 | n_points=n_points, convergence_tol=convergence_tol,
503 | step_size=step_size, ensemble_of_GMMs=self.stack_landscapes_,
504 | max_iter=max_iter)
505 | self.pathways_ = []
506 | for from_ind, to_ind in zip(states_from,states_to):
507 | self.pathways_.append(pathway_estimator.minimum_pathway(from_ind, to_ind))
508 |
509 | return
510 |
511 | def visualize(self,title="Free energy landscape", fontsize=30, savefig=True, xlabel='x', ylabel='y', zlabel='z', vmax=7.5,
512 | n_contour_levels=15, show_data=False, figsize= [12, 10], filename='free_energy_landscape', dx=1, ax=None, data_cmap='jet'):
513 |
514 | if self.n_dims_ > 3:
515 | print('Plotting does not support > 3 dimensions')
516 | return
517 |
518 | # Set custom colormaps
519 | my_cmap = copy.copy(matplotlib.cm.get_cmap('jet'))
520 | my_cmap.set_over('white')
521 | my_cmap_cont = matplotlib.colors.ListedColormap(['black'])
522 | my_cmap_cont.set_over('white')
523 |
524 | data_cmap = copy.copy(matplotlib.cm.get_cmap(data_cmap))
525 |
526 | plt.rcParams['figure.figsize'] = figsize
527 |
528 | if ax is None:
529 | fig = plt.figure()
530 | if self.n_dims_ < 3:
531 | ax = fig.add_subplot(1, 1, 1)
532 | else:
533 | ax = fig.add_subplot(111, projection='3d')
534 | ax.tick_params(labelsize=fontsize - 2)
535 |
536 | plt.tick_params(axis='both', which='major', labelsize=fontsize-4)
537 |
538 | for tick in ax.get_xticklabels():
539 | tick.set_fontname("Serif")
540 | tick.set_fontweight('light')
541 |
542 | for tick in ax.get_yticklabels():
543 | tick.set_fontname("Serif")
544 | tick.set_fontweight('light')
545 |
546 | # Plot free energy landscape
547 | FE_landscape = np.copy(self.FE_landscape_)
548 | FE_landscape[self.FE_landscape_ > vmax+0.5] = vmax+0.5
549 |
550 | if self.n_dims_ == 2:
551 | ctf = ax.contourf(self.coords_[0], self.coords_[1], FE_landscape, n_contour_levels, cmap=my_cmap, vmin=0, vmax=vmax)
552 | cb=plt.colorbar(ctf, label='[kcal/mol]')
553 | text = cb.ax.yaxis.label
554 | font = matplotlib.font_manager.FontProperties(size=fontsize-3,family='serif',weight='light')
555 | text.set_font_properties(font)
556 | cb.ax.tick_params(labelsize=fontsize-2)
557 |
558 | for tick in cb.ax.get_yticklabels():
559 | tick.set_fontname("Serif")
560 | tick.set_fontweight('light')
561 |
562 | ax.set_ylim([self.coords_[1].min(), self.coords_[1].max()])
563 | ax.set_ylabel(ylabel, fontsize=fontsize - 2,fontname='serif',fontweight='light')
564 | elif self.n_dims_ == 1:
565 | if self.standard_error_FE_ is not None:
566 | ax.fill_between(self.coords_[0], FE_landscape - self.standard_error_FE_, FE_landscape + self.standard_error_FE_, color='k', alpha=0.2,zorder=2)
567 | ax.plot(self.coords_[0], FE_landscape, linewidth=3,color='k',zorder=1)
568 | ax.set_ylabel('Free energy [kcal/mol]',fontsize=fontsize-2,fontname='serif',fontweight='light')
569 | else:
570 | sc = ax.scatter(self.data_[::dx,0], self.data_[::dx,1], self.data_[::dx,2], s=30, c=self.FE_points_[::dx], alpha=0.8, cmap=my_cmap, vmin=0, vmax=vmax, edgecolor='k')
571 |
572 | ax.set_ylim([self.coords_[1].min(), self.coords_[1].max()])
573 | ax.set_zlim([self.coords_[2].min(), self.coords_[2].max()])
574 |
575 | cb=plt.colorbar(sc,label='[kcal/mol]')
576 | text = cb.ax.yaxis.label
577 | font = matplotlib.font_manager.FontProperties(size=fontsize-3,family='serif',weight='light')
578 | text.set_font_properties(font)
579 | cb.ax.tick_params(labelsize=fontsize-2)
580 |
581 | ax.set_ylabel(ylabel, fontsize=fontsize - 2,fontname='serif',fontweight='light')
582 | ax.set_zlabel(zlabel, fontsize=fontsize - 2,fontname='serif',fontweight='light')
583 |
584 | ax.set_xlim([self.coords_[0].min(), self.coords_[0].max()])
585 |
586 | # Plot projected data points
587 | if show_data and self.n_dims_ < 3:
588 |
589 | # Plot projected data points
590 | if self.labels_ is not None:
591 | if self.n_dims_ > 1:
592 | transition_points=self.data_[self.labels_==0]
593 | core_points = self.data_[self.labels_ > 0]
594 | core_labels = self.labels_[self.labels_>0]
595 | ax.scatter(transition_points[::dx, 0], transition_points[::dx, 1], s=30, color=[0.67, 0.67, 0.67],alpha=0.5)
596 | ax.scatter(core_points[::dx, 0], core_points[::dx, 1], s=80, c=core_labels[::dx],
597 | edgecolor='k', cmap=data_cmap, label='Intermediate state',alpha=0.8)
598 | else:
599 | ax.scatter(self.data_[self.labels_==0], self.FE_points_[self.labels_==0], s=30, color=[0.67, 0.67, 0.65],alpha=0.6,zorder=3)
600 | ax.scatter(self.data_[self.labels_>0], self.FE_points_[self.labels_>0], s=50, c=self.labels_[self.labels_>0],
601 | edgecolor='k', cmap=data_cmap, label='Intermediate state',alpha=0.8,zorder=4)
602 | if fontsize > 18:
603 | plt.legend(fontsize=fontsize-10,facecolor=[0.9,0.9,0.92])
604 | else:
605 | plt.legend(fontsize=fontsize-4,facecolor=[0.9,0.9,0.92])
606 | else:
607 | if self.n_dims_ > 1:
608 | ax.scatter(self.data_[:, 0], self.data_[:, 1], s=30, color=[0.67, 0.67, 0.65],alpha=0.5)
609 | else:
610 | 					ax.scatter(self.data_, self.FE_points_, s=30, color=[0.67, 0.67, 0.65], alpha=0.5)
611 |
612 | # Plot minimum pathways between states
613 | if self.pathways_ is not None and self.n_dims_ > 1:
614 | set_pathway_label = True
615 | for p in self.pathways_:
616 | if set_pathway_label:
617 | ax.plot(p[:, 0], p[:, 1], color=[43.0/256.0,46.0/256.0,60.0/256.0], linewidth=5, marker='', label='Pathway')
618 | set_pathway_label = False
619 | else:
620 | ax.plot(p[:, 0], p[:, 1], color=[43.0/256.0,46.0/256.0,60.0/256.0], linewidth=5, marker='')
621 |
622 | if fontsize > 18:
623 | plt.legend(fontsize=fontsize-10,facecolor=[0.9,0.9,0.92])
624 | else:
625 | plt.legend(fontsize=fontsize-4,facecolor=[0.9,0.9,0.92])
626 |
627 | # Plot cluster centers in landscape
628 | if self.cluster_centers_ is not None:
629 | if self.n_dims_ > 1:
630 | ax.scatter(self.data_[self.cluster_centers_,0], self.data_[self.cluster_centers_,1], marker='s', s=120,
631 | linewidth=4, facecolor='',edgecolor='w', label='Cluster center')
632 | else:
633 | ax.scatter(self.data_[self.cluster_centers_], self.FE_points_[self.cluster_centers_], marker='s', s=120,
634 | linewidth=4, facecolor='',edgecolor='w', label='Cluster center',zorder=5)
635 | if fontsize > 18:
636 | plt.legend(fontsize=fontsize-10,facecolor=[0.9,0.9,0.92])
637 | else:
638 | plt.legend(fontsize=fontsize-4,facecolor=[0.9,0.9,0.92])
639 | ax.set_title(title, fontsize=fontsize,fontname='serif',fontweight='light')
640 | ax.set_xlabel(xlabel, fontsize=fontsize - 2,fontname='serif',fontweight='light')
641 | plt.rc('xtick', labelsize=fontsize-2)
642 | plt.rc('ytick', labelsize=fontsize-2)
643 | matplotlib.rc('font',family='Serif')
644 |
645 | if savefig:
646 | plt.savefig(filename + '.svg')
647 | plt.savefig(filename + '.eps')
648 | plt.savefig(filename + '.png')
649 |
650 | return
651 |
--------------------------------------------------------------------------------
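
The conversion applied in _free_energy above is G(x) = -kB*T*log(p(x)), with kB in kcal/(mol K). A quick numeric check of what a density ratio translates to (values illustrative):

```python
import numpy as np

kB = 0.0019872041  # kcal/(mol K), as in FreeEnergyClustering
T = 300.0          # K

def free_energy(density):
    density = np.maximum(density, 1e-8)  # same floor as _free_energy
    return -kB * T * np.log(density)

# A region with 10x lower density lies ~1.37 kcal/mol higher at 300 K.
print(free_energy(np.array([0.1])) - free_energy(np.array([1.0])))  # ~1.37
```
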
/free_energy_clustering/__init__.py:
--------------------------------------------------------------------------------
1 | from .GMM_free_energy import FreeEnergyClustering
2 | from .FE_landscape_clustering import LandscapeClustering
3 | from .stack_landscapes import LandscapeStacker
4 | from .free_energy_pathways import FreeEnergyPathways
5 |
--------------------------------------------------------------------------------
/free_energy_clustering/cluster_density.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.spatial.distance import cdist
3 |
4 |
5 | class ClusterDensity(object):
6 |
7 | def __init__(self, points, eval_points=None):
8 | self.grid_points_ = points
9 | self.points_ = eval_points
10 | self.grid_cluster_inds_ = None
11 | return
12 |
13 | def _construct_components(self,distance_matrix, is_FE_min, use_transition_matrix=False):
14 | # Build subgraphs with connected components of the isolated FE minima
15 | print('Constructing connected components.')
16 | n_points = distance_matrix.shape[0]
17 |
18 | graph = np.zeros((n_points,n_points))
19 |
20 | if use_transition_matrix:
21 | # Set distance matrix to information distance of transition matrix:
22 | # Kinetically close states have high transition probability but should have low distance
23 | distance_matrix = -np.log(distance_matrix+1e-9)
24 |
25 | # Sort distances in ascending order
26 | sort_inds = np.argsort(distance_matrix,axis=1)
27 |
28 | for i in range(n_points):
29 | if is_FE_min[i]:
30 | check_points = []
31 | neighbors = sort_inds[i,:]
32 | k_neighbors=1
33 |
34 | # Add neighbors until another potential component is reached
35 | for j in range(k_neighbors,n_points):
36 | current_neighbor = neighbors[j]
37 | if is_FE_min[current_neighbor]:
38 |
39 | neighbor_distance = distance_matrix[i,current_neighbor]
40 |
41 | if len(check_points) > 2:
42 | check_point_distances = distance_matrix[current_neighbor,np.asarray(check_points)]
43 | is_smaller_dist = check_point_distances < neighbor_distance
44 | if np.sum(is_smaller_dist) > 0:
45 | # A non-component point is closer to both the current point and
46 | # the other component point => the two component points are not neighbors
47 | 							break
48 |
49 | # Add connection between neighbors
50 | graph[i,current_neighbor] = 1
51 | # Enforce symmetry
52 | graph[current_neighbor,i] = 1
53 | else:
54 | check_points.append(current_neighbor)
55 |
56 | # Sparsify graph to contain only the connected components
57 | graph = graph[is_FE_min,:]
58 | graph = graph[:,is_FE_min]
59 |
60 | return graph
61 |
62 | def _find_connected_components(self,graph):
63 | # Assign points to connected components
64 | print('Clustering data points.')
65 |
66 | n_points = graph.shape[0]
67 | component_indices = np.zeros(n_points)
68 | is_visited = np.zeros(n_points)
69 | all_inds = np.arange(n_points)
70 |
71 | i_component = 0
72 | while np.sum(is_visited) < is_visited.shape[0]:
73 | i_component += 1
74 | queue = []
75 | # get next unvisited point
76 | unvisited_points = all_inds[is_visited==0]
77 | queue.append(unvisited_points[0])
78 |
79 | while len(queue) > 0:
80 | current_point = queue.pop(0)
81 | if is_visited[current_point] == 0:
82 | is_visited[current_point] = 1
83 | component_indices[current_point] = i_component
84 |
85 | # get unvisited neighbors
86 | neighbors = all_inds[graph[current_point,:] > 0]
87 | for neighbor in neighbors:
88 | if is_visited[neighbor] == 0:
89 | queue.append(neighbor)
90 |
91 | return component_indices
92 |
93 | def data_cluster_indices(self, point_distances, cluster_indices_eval_points):
94 | """
95 | Set cluster indices according to the closest data point.
96 | """
97 | n_points = point_distances.shape[0]
98 | cluster_inds = np.zeros(n_points)
99 |
100 | min_inds = np.argmin(point_distances,axis=1)
101 |
102 | # Set cluster index of point to the same as the cluster index of evaluated (grid) point
103 | cluster_inds = cluster_indices_eval_points[min_inds]
104 | return cluster_inds
105 |
106 | def cluster_data(self, is_FE_min, transition_matrix=None):
107 |
108 | # Construct and detect connected components
109 | if transition_matrix is None:
110 | graph = self._construct_components(cdist(self.grid_points_,self.grid_points_), is_FE_min)
111 | else:
112 | print('Using transition probabilities to define distances.')
113 | graph = self._construct_components(transition_matrix, is_FE_min, use_transition_matrix=True)
114 |
115 | print('# Graph connections: '+str(np.sum(graph)))
116 | cluster_indices_grid_points = self._find_connected_components(graph)
117 |
118 | self.grid_cluster_inds_ = np.zeros(self.grid_points_.shape[0])
119 | self.grid_cluster_inds_[is_FE_min] = cluster_indices_grid_points
120 | if self.points_ is not None:
121 | cluster_indices = self.data_cluster_indices(cdist(self.points_,self.grid_points_),self.grid_cluster_inds_)
122 | else:
123 | cluster_indices = self.grid_cluster_inds_
124 |
125 | return cluster_indices
126 |
--------------------------------------------------------------------------------
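
The labeling step in _find_connected_components is a plain breadth-first search over the adjacency matrix built by _construct_components. A tiny standalone check; the 5-point graph is made up:

```python
import numpy as np
import free_energy_clustering.cluster_density as cd

# Adjacency for 5 grid points: {0, 1, 2} form one component, {3, 4} another.
graph = np.array([[0, 1, 0, 0, 0],
                  [1, 0, 1, 0, 0],
                  [0, 1, 0, 0, 0],
                  [0, 0, 0, 0, 1],
                  [0, 0, 0, 1, 0]])

clusterer = cd.ClusterDensity(points=np.zeros((5, 2)))
print(clusterer._find_connected_components(graph))  # [1. 1. 1. 2. 2.] -- components numbered from 1
```
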
/free_energy_clustering/cross_validation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.model_selection import KFold
3 |
4 |
5 | def split_train_validation(data, n_splits, shuffle=False):
6 | """
7 |     Split the data into n_splits training and validation sets.
8 | """
9 | kf = KFold(n_splits=n_splits, shuffle=shuffle)
10 |
11 | train_inds = []
12 | val_inds = []
13 |
14 | for train_ind, val_ind in kf.split(data):
15 | train_inds.append(train_ind)
16 | val_inds.append(val_ind)
17 |
18 | train_inds, val_inds = make_homogenous_validation_sets(train_inds, val_inds)
19 |
20 | return train_inds, val_inds
21 |
22 | def make_homogenous_validation_sets(train_inds, val_inds):
23 | """
24 |     Ensure that all validation sets have the same number of points.
25 | """
26 | min_val_inds = val_inds[0].shape[0]
27 | for i in range(len(val_inds)):
28 | if val_inds[i].shape[0] < min_val_inds:
29 | min_val_inds = val_inds[i].shape[0]
30 |
31 | for i in range(len(val_inds)):
32 | if val_inds[i].shape[0] > min_val_inds:
33 | n_inds_to_move = int(val_inds[i].shape[0]-min_val_inds)
34 | train_inds[i] = np.concatenate((train_inds[i], val_inds[i][0:n_inds_to_move]))
35 | val_inds[i] = val_inds[i][n_inds_to_move::]
36 |
37 | return train_inds, val_inds
38 |
39 | def get_train_validation_set(data, train_ind, val_inds):
40 | """
41 |     Get the training and validation sets given their indices.
42 | """
43 | training_data = data[train_ind]
44 | validation_data = data[val_inds]
45 |
46 | return training_data, validation_data
47 |
--------------------------------------------------------------------------------
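
A small usage sketch of the helpers above (the data is arbitrary). Note how make_homogenous_validation_sets moves surplus validation indices into the corresponding training fold so every fold has the same validation size:

```python
import numpy as np
from free_energy_clustering.cross_validation import split_train_validation

data = np.random.randn(100, 2)
train_inds, val_inds = split_train_validation(data, n_splits=3)

# KFold alone would give validation folds of 34/33/33 points; after trimming,
# every validation fold holds 33 points, with the surplus index moved to training.
print([v.shape[0] for v in val_inds])    # [33, 33, 33]
print([t.shape[0] for t in train_inds])  # [67, 67, 67]
```
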
/free_energy_clustering/free_energy_pathways.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import free_energy_clustering as FEC
3 | import sys
4 |
5 | class FreeEnergyPathways(FEC.LandscapeClustering):
6 |
7 | def __init__(self, density_model, data, temperature=300, n_points=100, convergence_tol=1e-1, step_size=1e-3,
8 | ensemble_of_GMMs=False, max_iter=1000):
9 |
10 | FEC.LandscapeClustering.__init__(self,ensemble_of_GMMs=ensemble_of_GMMs)
11 |
12 | self.density_model_ = density_model
13 | self.convergence_tol_ = convergence_tol
14 | self.n_points_ = n_points
15 | self.data_ = data
16 | self.n_dims_ = data.shape[1]
17 | self.temperature_ = temperature # [K]
18 | self.boltzmann_constant_ = 0.0019872041 # [kcal/(mol K)]
19 | self.step_size_ = step_size
20 | self.max_iterations_ = max_iter
21 | return
22 |
23 | def _initialize_path(self, state_from, state_to):
24 | """
25 | Set initial path guess as straight path between the two states.
26 | :param state_from:
27 | :param state_to:
28 | :return:
29 | """
30 | path = np.zeros((self.n_points_, self.n_dims_))
31 | for i_dim in range(self.n_dims_):
32 | path[:,i_dim] = np.linspace(state_from[i_dim],state_to[i_dim],num=self.n_points_)
33 | return path
34 |
35 | def _length_of_subpaths(self, path):
36 | partial_path_lengths = np.zeros(self.n_points_)
37 |
38 | for i in range(1,self.n_points_):
39 | partial_path_lengths[i] = partial_path_lengths[i-1]+np.linalg.norm(path[i]-path[i-1])
40 |
41 | subpath_points = np.arange(self.n_points_)*partial_path_lengths[-1]/(self.n_points_-1)
42 |
43 | return partial_path_lengths, subpath_points
44 |
45 | def _equilibrate_path_points(self, path):
46 | """
47 | Spread points equidistantly along path.
48 | :param path:
49 | :return:
50 | """
51 | partial_path_lengths, subpath_points = self._length_of_subpaths(path)
52 |
53 | new_path = path
54 | for i in range(1,self.n_points_-1):
55 | s = subpath_points[i]
56 | for j in range(1,self.n_points_):
57 | if s > partial_path_lengths[j-1] and s < partial_path_lengths[j]:
58 | new_path[i] = path[j-1] + (s-partial_path_lengths[j-1])*(path[j]-path[j-1])/np.linalg.norm(path[j]-path[j-1])
59 | break
60 |
61 | return new_path
62 |
63 | def _update_path(self, path):
64 | """
65 | Update path with one minimization and equilibration of path points.
66 | :param path:
67 | :return:
68 | """
69 | density = self.density_model_.density(path)
70 | density[density<1e-15]=1e-15
71 |
72 | inner_derivative, _ = self._compute_gradients(self.density_model_, path)
73 | outer_derivative = -self.temperature_*self.boltzmann_constant_/density
74 | step = self.step_size_*(np.multiply(outer_derivative[:,np.newaxis],inner_derivative))
75 | new_path = path - step
76 | new_path = self._equilibrate_path_points(new_path)
77 | return new_path
78 |
79 | def minimum_pathway(self, state_from, state_to):
80 | """
81 | Compute minimum pathway between two states using the estimated free energy landscape based on GMM.
82 | :param state_from:
83 | :param state_to:
84 | :return:
85 | """
86 |
87 | # Set linear path between end points
88 | path = self._initialize_path(self.data_[state_from], self.data_[state_to])
89 |
90 | prev_path = np.inf * path
91 | counter = 1
92 | while np.linalg.norm(path-prev_path) > self.convergence_tol_:
93 | sys.stdout.write("\r" + 'Iteration: ' + str(counter) + '/' + str(self.max_iterations_))
94 | sys.stdout.flush()
95 | prev_path = np.copy(path)
96 | path = self._update_path(prev_path)
97 | if counter >= self.max_iterations_:
98 | break
99 | counter+=1
100 | print()
101 | return path
102 |
103 |
--------------------------------------------------------------------------------
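
A sketch of relaxing a pathway on a toy two-basin density. The fitted model, end-point indices, and parameter values are all illustrative; minimum_pathway takes indices into the data array handed to the constructor:

```python
import numpy as np
import free_energy_clustering as FEC
import free_energy_clustering.GMM as GMM

np.random.seed(0)
data = np.vstack([np.random.randn(400, 2) * 0.3,
                  np.random.randn(400, 2) * 0.3 + [2.0, 1.0]])

# A simple two-component density defines the landscape.
density_model = GMM.GaussianMixture(n_components=2).fit(data)

fep = FEC.FreeEnergyPathways(density_model, data, temperature=300.0,
                             n_points=20, convergence_tol=1e-1,
                             step_size=1e-3, max_iter=200)
path = fep.minimum_pathway(state_from=0, state_to=400)  # [20 x 2] waypoints
```
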
/free_energy_clustering/stack_landscapes.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import free_energy_clustering.GMM as GMM
3 | import scipy.optimize as opt
4 |
5 | class LandscapeStacker(object):
6 | def __init__(self, data, list_of_validation_data, list_of_models, n_splits=1, convergence_tol=5e-3, n_iterations=1,
7 | model_weights=None):
8 | """
9 | Class for weighting density estimators with EM, based on how well they describe the validation dataset.
10 | :param data: [n_samples x n_dimensions]
11 | 		:param list_of_models: list of fitted density models (GMMs)
12 | :param n_splits: Number of folds in K-fold cross-validation
13 | :param convergence_tol:
14 | """
15 | self.GMM_list_ = list_of_models
16 | self.val_data_list_ = list_of_validation_data
17 | self.data_ = data
18 | self.convergence_tol_ = convergence_tol
19 | self.n_models_ = int(len(list_of_models)/n_splits)
20 | self.n_splits_ = n_splits
21 | self.n_iterations_ = n_iterations
22 | self.n_components_list_ = []
23 |
24 | 		# Initialize model weights
25 | if model_weights is None:
26 | if self.n_models_ > 0:
27 | self.model_weights_ = 1.0 / self.n_models_ * np.ones(self.n_models_)
28 | else:
29 | self.model_weights_ = model_weights
30 | self._sparisify_model()
31 | print('Model weights: ' + str(self.model_weights_))
32 | print('GMM list: '+str(self.GMM_list_))
33 |
34 | self._set_n_component_list()
35 | print('# Components in models: '+str(self.n_components_list_))
36 | return
37 |
38 | def objective_function(self,W):
39 | # -log(likelihood)
40 | W /= W.sum()
41 | return -self.loglikelihood(self.val_data_list_, list_of_validation_data=True, weights=W)
42 |
43 | def fit(self):
44 | do_EM = True
45 |
46 | print('Training density model weights.')
47 |
48 | if do_EM:
49 | loglikelihood = -np.inf
50 | prev_loglikelihood = 0
51 | while (np.abs(prev_loglikelihood - loglikelihood) > self.convergence_tol_):
52 | beta = self._expectation()
53 | self._maximization(beta)
54 | prev_loglikelihood = loglikelihood
55 | loglikelihood = self.loglikelihood(self.val_data_list_, list_of_validation_data=True)
56 | else:
57 | self.model_weights_ = opt.fmin_cg(self.objective_function, self.model_weights_)
58 |
59 | # Keep only models with nonzero weight
60 | self._sparsify_model()
61 | self._set_n_component_list()
62 |
63 | # Train each density model on the full dataset.
64 | print('Training each model on the full dataset.')
65 | for i_model in range(self.n_models_):
66 | n_components = self.GMM_list_[i_model].n_components_
67 | print(' - Training model with '+str(n_components)+' components')
68 | best_loglikelihood = -np.inf
69 | for i_iter in range(self.n_iterations_):
70 | density_model = GMM.GaussianMixture(n_components=n_components,
71 | convergence_tol=self.convergence_tol_)
72 | density_model.fit(self.data_)
73 | loglikelihood = density_model.loglikelihood(self.data_)
74 | if loglikelihood > best_loglikelihood:
75 | best_loglikelihood = loglikelihood
76 | self.GMM_list_[i_model] = density_model
77 |
78 | self.n_components_list_ = np.asarray(self.n_components_list_)
79 | return
80 |
81 | def _set_n_component_list(self):
82 | """
83 | Set the list with number of components.
84 | :return:
85 | """
86 | self.n_components_list_ = []
87 | for i_model in range(self.n_models_):
88 | n_components = self.GMM_list_[i_model*self.n_splits_].weights_.shape[0]
89 | self.n_components_list_.append(n_components)
90 | return
91 |
92 | def _expectation(self):
93 | n_points = self.val_data_list_[0].shape[0]
94 |
95 | beta = np.zeros((self.n_splits_, self.n_models_, n_points)) # beta[split, model, point] holds model responsibilities
96 |
97 | for i_split in range(self.n_splits_):
98 | for i_model in range(self.n_models_):
99 | ind = i_model*self.n_splits_+i_split
100 | beta[i_split, i_model, :] = self.model_weights_[i_model]*self.GMM_list_[ind].density(self.val_data_list_[ind])
101 |
102 | beta[i_split] /= np.sum(beta[i_split],axis=0) # normalize over models
103 |
104 | return beta
105 |
106 | def _maximization(self, beta):
107 | """
108 | Update density estimator weights.
109 | """
110 | self.model_weights_ = beta.sum(axis=(0,2))
111 |
112 | # Normalize the categorical distribution over models
113 | self.model_weights_ /= self.model_weights_.sum()
114 | return
115 |
116 | def _sparsify_model(self):
117 | """
118 | Remove all zero-weight models (done after the optimization has converged).
119 | :return:
120 | """
121 | print('Removing zero-weighted models.')
122 | threshold = 1e-3
123 | n_models = np.sum(self.model_weights_>threshold)
124 | new_weights = []
125 | new_models = []
126 |
127 | for i_model in range(self.n_models_):
128 | if self.model_weights_[i_model] > threshold:
129 | new_weights.append(self.model_weights_[i_model])
130 | for i_split in range(self.n_splits_):
131 | new_models.append(self.GMM_list_[i_model*self.n_splits_+i_split])
132 |
133 | self.n_models_ = n_models
134 | self.GMM_list_ = new_models
135 | print('Remaining models: ' + str(self.GMM_list_))
136 | self.model_weights_ = np.asarray(new_weights)
137 | self.model_weights_ /= self.model_weights_.sum()
138 | return
139 |
140 | def density(self, x, list_of_validation_data=False, weights=None):
141 | """
142 | Compute mixture of landscape density at the given points, x.
143 | x is either a numpy-array of size [n_samples x n_dims] or a list of
144 | validation datasets with length [self.n_models_].
145 | """
146 | if list_of_validation_data:
147 | n_points = x[0].shape[0]
148 | density = np.zeros(n_points*self.n_splits_)
149 | for i_model in range(self.n_models_):
150 | for i_split in range(self.n_splits_):
151 | ind = i_model*self.n_splits_ + i_split
152 | w = self.model_weights_[i_model] if weights is None else weights[i_model]
153 | density_ind = self.GMM_list_[ind].density(x[ind])
154 | density[n_points*i_split:n_points*(i_split+1)] += w*density_ind
155 | else:
156 | density = np.zeros(x.shape[0])
157 | for i_model in range(self.n_models_):
158 | density += self.model_weights_[i_model]*self.GMM_list_[i_model].density(x)
159 | return density
160 |
161 | def loglikelihood(self, x, list_of_validation_data=False,weights=None):
162 | """
163 | Compute log-likelihood.
164 | """
165 | density = self.density(x, list_of_validation_data=list_of_validation_data,weights=weights)
166 | density[density<1e-8]=1e-8 # floor the density to avoid log(0)
167 | return np.mean(np.log(density))
168 |
--------------------------------------------------------------------------------
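
The E- and M-steps above amount to a small fixed-point iteration on the categorical model weights. A self-contained sketch with made-up stand-in densities (the real class instead evaluates its fitted GMMs on held-out validation data):

```python
import numpy as np

rng = np.random.default_rng(0)
# densities[m, i]: density of validation point i under model m (made up here).
densities = rng.uniform(0.1, 1.0, size=(3, 100))
weights = np.ones(3) / 3

for _ in range(100):
    beta = weights[:, np.newaxis] * densities  # E-step: responsibilities
    beta /= beta.sum(axis=0)                   # normalize over models
    weights = beta.sum(axis=1)                 # M-step: expected counts
    weights /= weights.sum()                   # renormalize categorical weights
print(weights)  # models that explain the validation data better get more weight
```
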
/toy_models/Kmeans_cluster.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.cluster import KMeans
3 | from sklearn.metrics import silhouette_score
4 | from sklearn.neighbors import KNeighborsClassifier
5 |
6 | class KMeansCluster():
7 |
8 | def __init__(self,n_min_clusters,n_max_clusters):
9 | self.n_min_clusters_ = n_min_clusters
10 | self.n_max_clusters_ = n_max_clusters
11 | self.labels_ = None
12 | self.classifier = KNeighborsClassifier(n_neighbors=3)
13 | self.name='kmeans'
14 | return
15 |
16 | def cluster(self, x):
17 | print('Cluster data with K-means')
18 | all_cluster_labels = []
19 | silhouette_scores = np.zeros(self.n_max_clusters_-self.n_min_clusters_+1)
20 |
21 | for n_clusters in range(self.n_min_clusters_,self.n_max_clusters_+1):
22 | km = KMeans(n_clusters=n_clusters).fit(x)
23 | all_cluster_labels.append(km.labels_)
24 | silhouette_scores[n_clusters-self.n_min_clusters_] = silhouette_score(x, all_cluster_labels[-1])
25 |
26 | ind = np.argmax(silhouette_scores) # pick the number of clusters with the best silhouette score
27 | self.labels_ = all_cluster_labels[ind]+1 # shift so cluster labels start at 1
28 |
29 | # Train kNN classifier
30 | self.classifier.fit(x, self.labels_)
31 | print('Cluster labels: '+str(np.unique(self.labels_)))
32 | return self.labels_
33 |
34 | def assign_cluster_labels(self, x):
35 | return self.classifier.predict(x)
--------------------------------------------------------------------------------
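
A usage sketch for `KMeansCluster`, assuming the repository root is on the Python path and substituting synthetic data:

```python
from sklearn.datasets import make_blobs
from toy_models.Kmeans_cluster import KMeansCluster

x, _ = make_blobs(n_samples=300, centers=4, random_state=0)
km = KMeansCluster(n_min_clusters=2, n_max_clusters=8)
labels = km.cluster(x)                         # picks n_clusters by silhouette score
new_labels = km.assign_cluster_labels(x[:10])  # kNN assignment of new points
```
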
/toy_models/__init__.py:
--------------------------------------------------------------------------------
1 | from toy_models.toy_model_GMM_2D import GMM2D
2 | from toy_models.toy_model_nonlinear_GMM_2D import GMM2dNonlinear
3 | from toy_models.evaluate_toy_models import MethodEvaluator
4 | from toy_models.toy_model_multiple_GMMs import MultipleGMMs
5 | from toy_models.toy_model_moons import Moons
6 | from toy_models.toy_model_blobs import Blobs
--------------------------------------------------------------------------------
/toy_models/agglomerative_ward_cluster.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.metrics import silhouette_score
3 | from sklearn.neighbors import KNeighborsClassifier
4 | from sklearn.cluster import AgglomerativeClustering
5 |
6 | class AWCluster():
7 |
8 | def __init__(self,n_min_clusters,n_max_clusters):
9 | self.n_min_clusters_ = n_min_clusters
10 | self.n_max_clusters_ = n_max_clusters
11 | self.labels_ = None
12 | self.classifier = KNeighborsClassifier(n_neighbors=3)
13 | self.name='AW'
14 | return
15 |
16 | def cluster(self, x):
17 | print('Cluster data with agglomerative-Ward')
18 | all_cluster_labels = []
19 | silhouette_scores = np.zeros(self.n_max_clusters_-self.n_min_clusters_+1)
20 |
21 | for n_clusters in range(self.n_min_clusters_,self.n_max_clusters_+1):
22 | aw = AgglomerativeClustering(n_clusters=n_clusters,linkage='ward').fit(x)
23 | all_cluster_labels.append(aw.labels_)
24 | silhouette_scores[n_clusters-self.n_min_clusters_] = silhouette_score(x, all_cluster_labels[-1])
25 |
26 | ind = np.argmax(silhouette_scores)
27 | self.labels_ = all_cluster_labels[ind]+1
28 |
29 | # Train kNN classifier
30 | self.classifier.fit(x, self.labels_)
31 | print('Cluster labels: '+str(np.unique(self.labels_)))
32 | return self.labels_
33 |
34 | def assign_cluster_labels(self, x):
35 | return self.classifier.predict(x)
--------------------------------------------------------------------------------
/toy_models/evaluate_toy_models.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import numpy as np
3 | import toy_models as tm
4 | import free_energy_clustering as GMM_FE
5 | from toy_models import Kmeans_cluster as kmc
6 | from toy_models import spectral_cluster as sc
7 | from toy_models import agglomerative_ward_cluster as awc
8 |
9 | from sklearn.metrics import v_measure_score
10 | from sklearn.metrics import adjusted_mutual_info_score
11 | from sklearn.metrics.cluster import fowlkes_mallows_score
12 | from sklearn.cluster import KMeans
13 |
14 | class MethodEvaluator(object):
15 |
16 | def __init__(self, toy_model='GMM_2D', x_lims=None, n_grids=30, convergence_tol=1e-4,verbose=False, presampled_data=None, n_features=None, noise=0, n_components=3):
17 |
18 | if toy_model == 'GMM_2D':
19 | self.toy_model_ = tm.GMM2D()
20 | elif toy_model == 'mGMMs':
21 | self.toy_model_ = tm.MultipleGMMs()
22 | elif toy_model == 'blobs':
23 | self.toy_model_ = tm.Blobs(n_components=n_components, n_dims=n_features, noise=noise)
24 | elif toy_model == 'digits':
25 | self.toy_model_ = tm.Digits() # NOTE: requires a Digits toy model, which is not bundled in toy_models
26 | elif toy_model == 'nonlinear_GMM_2D':
27 | self.toy_model_ = tm.GMM2dNonlinear()
28 | else:
29 | print('Toy model: '+str(toy_model)+' does not exist')
30 | sys.exit(0)
31 |
32 | self.cluster_score_ami_kmeans_ = None
33 | self.cluster_score_ami_AW_ = None
34 | self.cluster_score_ami_spectral_ = None
35 | self.cluster_score_ami_density_peaks_ = None
36 | self.cluster_score_ami_GMM_ = None
37 | self.cluster_score_ami_GMM_FE_min_ = None
38 |
39 | self.cluster_score_fm_kmeans_ = None
40 | self.cluster_score_fm_AW_ = None
41 | self.cluster_score_fm_spectral_ = None
42 | self.cluster_score_fm_density_peaks_ = None
43 | self.cluster_score_fm_GMM_ = None
44 | self.cluster_score_fm_GMM_FE_min_ = None
45 |
46 | self.cluster_score_vm_kmeans_ = None
47 | self.cluster_score_vm_AW_ = None
48 | self.cluster_score_vm_spectral_ = None
49 | self.cluster_score_vm_density_peaks_ = None
50 | self.cluster_score_vm_DPA_ = None
51 | self.cluster_score_vm_HDBSCAN_ = None
52 | self.cluster_score_vm_SDC_ = None
53 | self.cluster_score_vm_GMM_ = None
54 | self.cluster_score_vm_GMM_FE_min_ = None
55 |
56 | self.convergence_tol_ = convergence_tol
57 |
58 | self.x_lims_ = x_lims
59 | self.n_grids_ = n_grids
60 |
61 | self.presampled_data = presampled_data
62 |
63 | self.true_FE_ = None
64 | self.true_density_ = None
65 | self.true_labels_ = None
66 | self.test_set_ = None
67 | self.min_FE_ = None
68 | self.verbose_ = verbose
69 |
70 | self.set_true_free_energy()
71 | return
72 |
73 | def set_true_free_energy(self):
74 | """
75 | Create a free energy object that contains the true free energy and density on the given grid.
76 | :return:
77 | """
78 | # Create grid and evaluate density on it
79 | print('Setting true model.')
80 | self.test_set_ = self.toy_model_.sample(2000)
81 | self.true_FE_ = GMM_FE.FreeEnergyClustering(self.test_set_, x_lims=self.x_lims_, n_grids=self.n_grids_,verbose=False,
82 | convergence_tol=self.convergence_tol_)
83 | self.true_FE_.density_est_ = self.toy_model_
84 |
85 | coords, self.true_density_ = self.true_FE_._density_landscape(self.toy_model_)
86 |
87 | # Compute true free energy
88 | FE_landscape = self.true_FE_._free_energy(self.true_density_)
89 | self.min_FE_= np.min(FE_landscape)
90 | FE_landscape = FE_landscape - self.min_FE_
91 |
92 | # Set true free energy
93 | self.true_FE_.coords_ = coords
94 | self.true_FE_.FE_landscape_ = FE_landscape
95 |
96 | if hasattr(self.toy_model_,"assign_cluster_labels"):
97 | self.true_labels_ = self.toy_model_.assign_cluster_labels(self.test_set_)
98 | else:
99 | self.true_labels_, _ = self.true_FE_.cluster(coords, np.zeros(self.test_set_.shape[0]), self.test_set_)
100 | return
101 |
102 | def run_evaluation(self, n_runs=1, n_points=1000, n_iterations=1, min_n_components=2, max_n_components=25,
103 | n_splits=3, save_data=False, file_label='',n_microstates=None, all_methods=True,
104 | assign_transition_points=True):
105 | """
106 | Run multiple free energy estimations and evaluate performance.
107 | :param n_runs:
108 | :return:
109 | """
110 |
111 | if self.presampled_data is not None:
112 | sampled_data = self.presampled_data[0]
113 | true_clustering = self.presampled_data[1]
114 | n_runs = sampled_data.shape[0]
115 |
116 | self.cluster_score_ami_kmeans_ = np.zeros(n_runs)
117 | self.cluster_score_ami_AW_ = np.zeros(n_runs)
118 | self.cluster_score_ami_spectral_ = np.zeros(n_runs)
119 | self.cluster_score_ami_density_peaks_ = np.zeros(n_runs)
120 | self.cluster_score_ami_GMM_ = np.zeros(n_runs)
121 | self.cluster_score_ami_GMM_FE_min_ = np.zeros(n_runs)
122 |
123 | self.cluster_score_fm_kmeans_ = np.zeros(n_runs)
124 | self.cluster_score_fm_AW_ = np.zeros(n_runs)
125 | self.cluster_score_fm_spectral_ = np.zeros(n_runs)
126 | self.cluster_score_fm_density_peaks_ = np.zeros(n_runs)
127 | self.cluster_score_fm_GMM_ = np.zeros(n_runs)
128 | self.cluster_score_fm_GMM_FE_min_ = np.zeros(n_runs)
129 |
130 | self.cluster_score_vm_kmeans_ = np.zeros(n_runs)
131 | self.cluster_score_vm_AW_ = np.zeros(n_runs)
132 | self.cluster_score_vm_spectral_ = np.zeros(n_runs)
133 | self.cluster_score_vm_density_peaks_ = np.zeros(n_runs)
134 | self.cluster_score_vm_GMM_ = np.zeros(n_runs)
135 | self.cluster_score_vm_GMM_FE_min_ = np.zeros(n_runs)
136 |
137 | data = self.toy_model_.sample(3) # small placeholder sample; the real data is set for each run below
138 |
139 | # Create free energy estimators
140 | gmm_FE = GMM_FE.FreeEnergyClustering(data, min_n_components=min_n_components, max_n_components=max_n_components,
141 | x_lims=self.x_lims_, n_grids=self.n_grids_, stack_landscapes=False,
142 | n_splits=n_splits, n_iterations=n_iterations,convergence_tol=self.convergence_tol_,
143 | verbose=self.verbose_)
144 |
145 | km = kmc.KMeansCluster(min_n_components, max_n_components)
146 | aw = awc.AWCluster(min_n_components, max_n_components)
147 | spectral = sc.SpectralCluster(min_n_components, max_n_components)
148 |
149 | all_data = []
150 | for i_run in range(n_runs):
151 | print("Run: "+str(i_run+1)+'/'+str(n_runs))
152 |
153 | if self.presampled_data is None:
154 | # Sample data
155 | data = self.toy_model_.sample(n_points)
156 | else:
157 | data = sampled_data[i_run]
158 |
159 | all_data.append(data)
160 |
161 | print('Shape data: ' + str(data.shape))
162 |
163 | # Set data in model and estimate GMM density
164 | gmm_FE.data_ = data
165 | coords, est_FE_landsc, FE_points = gmm_FE.landscape()
166 |
167 | # Get true cluster labels
168 | if self.presampled_data is None:
169 | if hasattr(self.toy_model_, "assign_cluster_labels"):
170 | self.true_labels_ = self.toy_model_.assign_cluster_labels(data)
171 | else:
172 | print('Setting true labels.')
173 | self.true_labels_, _ = self.true_FE_.cluster(data, np.zeros(data.shape[0]))
174 | else:
175 | self.true_labels_ = true_clustering[i_run]
176 |
177 | # Cluster data with different methods
178 | if n_microstates is None:
179 | self.FE_min_labels, _ = gmm_FE.cluster(data, FE_points, assign_transition_points=assign_transition_points)
180 | else:
181 | kmea = KMeans(n_clusters=n_microstates).fit(data[::2])
182 | microstate_centers = kmea.cluster_centers_
183 | self.FE_min_labels, _ = gmm_FE.cluster(microstate_centers, FE_points, data, assign_transition_points=assign_transition_points, unravel_grid=False)
184 |
185 | if all_methods:
186 | self.km_labels = km.cluster(data)
187 | self.aw_labels = aw.cluster(data)
188 | self.spectral_labels = spectral.cluster(data)
189 |
190 | # Score clustering using different scoring metrics
191 | # V-measure score
192 | self.cluster_score_vm_GMM_FE_min_[i_run] = self._score_clustering(self.FE_min_labels,'vm')
193 | print(self.cluster_score_vm_GMM_FE_min_[i_run])
194 | if all_methods:
195 | self.cluster_score_vm_GMM_[i_run] = self._score_clustering(gmm_FE.density_est_.predict(data),'vm')
196 | self.cluster_score_vm_kmeans_[i_run] = self._score_clustering(self.km_labels,'vm')
197 | self.cluster_score_vm_AW_[i_run] = self._score_clustering(self.aw_labels,'vm')
198 | self.cluster_score_vm_spectral_[i_run] = self._score_clustering(self.spectral_labels,'vm')
199 |
200 | # Adjusted MI
201 | self.cluster_score_ami_GMM_FE_min_[i_run] = self._score_clustering(self.FE_min_labels,'ami')
202 | self.cluster_score_ami_GMM_[i_run] = self._score_clustering(gmm_FE.density_est_.predict(data),'ami')
203 | self.cluster_score_ami_kmeans_[i_run] = self._score_clustering(self.km_labels,'ami')
204 | self.cluster_score_ami_AW_[i_run] = self._score_clustering(self.aw_labels,'ami')
205 | self.cluster_score_ami_spectral_[i_run] = self._score_clustering(self.spectral_labels,'ami')
206 |
207 | # Fowlkes Mallows
208 | self.cluster_score_fm_GMM_FE_min_[i_run] = self._score_clustering(self.FE_min_labels,'fm')
209 | self.cluster_score_fm_GMM_[i_run] = self._score_clustering(gmm_FE.density_est_.predict(data),'fm')
210 | self.cluster_score_fm_kmeans_[i_run] = self._score_clustering(self.km_labels,'fm')
211 | self.cluster_score_fm_AW_[i_run] = self._score_clustering(self.aw_labels,'fm')
212 | self.cluster_score_fm_spectral_[i_run] = self._score_clustering(self.spectral_labels,'fm')
213 |
214 | if save_data:
215 | if self.presampled_data is None:
216 | np.save('data_out/sampled_data_'+self.toy_model_.name+file_label+'.npy',all_data)
217 |
218 | if False: # disabled: flip to True to also save the FM and AMI scores
219 | np.save('data_out/cluster_score_fm_FE_min_'+self.toy_model_.name+file_label+'.npy',self.cluster_score_fm_GMM_FE_min_)
220 | np.save('data_out/cluster_score_fm_GMM_' + self.toy_model_.name + file_label+'.npy', self.cluster_score_fm_GMM_)
221 | np.save('data_out/cluster_score_fm_kmeans_' + self.toy_model_.name +file_label +'.npy', self.cluster_score_fm_kmeans_)
222 | np.save('data_out/cluster_score_fm_AW_' + self.toy_model_.name + file_label+'.npy', self.cluster_score_fm_AW_)
223 | np.save('data_out/cluster_score_fm_spectral_' + self.toy_model_.name + file_label+'.npy', self.cluster_score_fm_spectral_)
224 |
225 | np.save('data_out/cluster_score_ami_FE_min_'+self.toy_model_.name+file_label+'.npy',self.cluster_score_ami_GMM_FE_min_)
226 | np.save('data_out/cluster_score_ami_GMM_' + self.toy_model_.name + file_label+'.npy', self.cluster_score_ami_GMM_)
227 | np.save('data_out/cluster_score_ami_kmeans_' + self.toy_model_.name + file_label+'.npy', self.cluster_score_ami_kmeans_)
228 | np.save('data_out/cluster_score_ami_AW_' + self.toy_model_.name + file_label+'.npy', self.cluster_score_ami_AW_)
229 | np.save('data_out/cluster_score_ami_spectral_' + self.toy_model_.name + file_label+'.npy', self.cluster_score_ami_spectral_)
230 |
231 | np.save('data_out/cluster_score_vm_FE_min_'+self.toy_model_.name+file_label+'.npy',self.cluster_score_vm_GMM_FE_min_)
232 | if all_methods:
233 | np.save('data_out/cluster_score_vm_GMM_' + self.toy_model_.name + file_label+'.npy', self.cluster_score_vm_GMM_)
234 | np.save('data_out/cluster_score_vm_kmeans_' + self.toy_model_.name + file_label+'.npy', self.cluster_score_vm_kmeans_)
235 | np.save('data_out/cluster_score_vm_AW_' + self.toy_model_.name + file_label+'.npy', self.cluster_score_vm_AW_)
236 | np.save('data_out/cluster_score_vm_spectral_' + self.toy_model_.name + file_label+'.npy', self.cluster_score_vm_spectral_)
237 | return
238 |
239 | def _score_clustering(self, labels,metric='vm'):
240 | # Score clustering compared to true model
241 | if metric=='fm':
242 | score = fowlkes_mallows_score(self.true_labels_, labels)
243 | elif metric=='ami':
244 | score = adjusted_mutual_info_score(self.true_labels_, labels)
245 | else:
246 | score = v_measure_score(self.true_labels_[labels>0], labels[labels>0])
247 | return score
248 |
249 | def _FE_error(self, estimated_FE_landscape):
250 | error = np.mean(np.abs(estimated_FE_landscape-self.true_FE_.FE_landscape_))
251 | return error
252 |
253 | def _density_error(self, estimated_density):
254 | error = np.mean(np.abs(estimated_density - self.true_density_))
255 | return error
256 |
257 | def visualize(self):
258 | """
259 | Visualizing the quantities from estimations.
260 | :return:
261 | """
262 | import matplotlib.pyplot as plt
263 | ax1 = plt.figure(1).add_subplot(1,2,1)
264 | # Plot free energy error (assumes the FE error arrays have been computed and stored beforehand)
265 | ax1.plot(self.FE_errors_GMM_CV_, linewidth=4, label='GMM with cross-validation')
266 | ax1.plot(self.FE_errors_GMM_mix_models_, linewidth=4, label='GMM with mixture of models')
267 | plt.legend()
268 |
269 | # Plot density error
270 |
271 | # Plot log-likelihood of test set
272 |
273 | # Plot clustering score
274 |
275 | plt.show()
276 |
277 | return
278 |
--------------------------------------------------------------------------------
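
A usage sketch for `MethodEvaluator`; the `x_lims` format (one `[min, max]` pair per dimension) is an assumption, not confirmed by this file:

```python
from toy_models import MethodEvaluator

evaluator = MethodEvaluator(toy_model='GMM_2D',
                            x_lims=[[0.0, 1.0], [0.0, 1.0]],  # assumed format
                            n_grids=30)
evaluator.run_evaluation(n_runs=3, n_points=1000,
                         min_n_components=2, max_n_components=12, n_splits=3)
print(evaluator.cluster_score_vm_GMM_FE_min_)  # V-measure of InfleCS clustering, one entry per run
```
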
/toy_models/spectral_cluster.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.cluster import KMeans
3 | from scipy.sparse.linalg import eigsh
4 | from scipy.spatial.distance import cdist
5 | from sklearn.neighbors import KNeighborsClassifier
6 |
7 | class SpectralCluster():
8 |
9 | def __init__(self,n_min_clusters,n_max_clusters):
10 | self.n_min_clusters_ = n_min_clusters
11 | self.n_max_clusters_ = n_max_clusters
12 | self.labels_ = None
13 | self.classifier = KNeighborsClassifier(n_neighbors=3)
14 | self.name = 'spectral'
15 | return
16 |
17 | def transition_matrix(self, A):
18 | for i in range(A.shape[0]):
19 | A[i,i] = 0
20 | D = np.sum(A, axis=1)
21 | D_inv = np.diag(1 / np.sqrt(D))
22 | T = np.dot(D_inv, np.dot(A, D_inv))
23 | return T
24 |
25 | def get_n_clusters(self, transition_mat):
26 | print('Spectral embedding')
27 | eigenvalues, eigenvectors = np.linalg.eigh(transition_mat) # symmetric matrix: eigh returns real eigenvalues/vectors
28 |
29 | # Sort in descending order
30 | ind_sort = np.argsort(-eigenvalues)
31 | eigenvalues = eigenvalues[ind_sort]
32 |
33 | # Get largest eigengap
34 | eigengaps = -np.diff(eigenvalues)
35 | ind = np.argmax(eigengaps[self.n_min_clusters_:self.n_max_clusters_+1])
36 | n_clusters = ind+self.n_min_clusters_
37 | embedding = eigenvectors[:, ind_sort[0:n_clusters+1]]
38 | for i in range(embedding.shape[0]):
39 | embedding[i] /= np.linalg.norm(embedding[i])
40 |
41 | return n_clusters, embedding
42 |
43 | def cluster(self, x):
44 |
45 | # Set affinity matrix
46 | distances = cdist(x,x)
47 | distSort = np.sort(distances, axis=1)
48 | gamma = np.max(distSort[:,1])**2 # kernel bandwidth from the largest nearest-neighbor distance
49 |
50 | dist_squared = np.multiply(distances, distances)
51 |
52 | A = np.exp(-dist_squared/(2*gamma))
53 |
54 | print('Cluster data with spectral clustering')
55 | # Get transition matrix, select number of dimensions/clusters and project data
56 | transition_mat = self.transition_matrix(A)
57 | n_clusters, embedding = self.get_n_clusters(transition_mat)
58 |
59 | print('Cluster data with '+str(n_clusters)+' clusters.')
60 | km = KMeans(n_clusters=n_clusters).fit(embedding)
61 | self.labels_ = km.labels_+1
62 |
63 | # Train kNN classifier
64 | self.classifier.fit(x, self.labels_)
65 | print('Cluster labels: '+str(np.unique(self.labels_)))
66 | return self.labels_
67 |
68 | def assign_cluster_labels(self, x):
69 | return self.classifier.predict(x)
--------------------------------------------------------------------------------
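
A self-contained sketch of the eigengap heuristic that `get_n_clusters` relies on: with well-separated clusters, the symmetrically normalized affinity matrix has one eigenvalue near 1 per cluster, so the largest gap in the sorted spectrum suggests the cluster count.

```python
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.datasets import make_blobs

x, _ = make_blobs(n_samples=150, centers=3, cluster_std=0.3, random_state=0)
distances = cdist(x, x)
gamma = np.max(np.sort(distances, axis=1)[:, 1])**2  # bandwidth from nearest neighbors
A = np.exp(-distances**2 / (2*gamma))
np.fill_diagonal(A, 0)
D_inv = np.diag(1 / np.sqrt(A.sum(axis=1)))
T = D_inv @ A @ D_inv                                # symmetric normalized affinity
eigenvalues = np.sort(np.linalg.eigvalsh(T))[::-1]   # descending, real
print(np.argmax(-np.diff(eigenvalues)) + 1)          # largest eigengap; typically 3 here
```
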
/toy_models/toy_model_GMM_2D.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from free_energy_clustering.GMM import GaussianMixture
3 | from free_energy_clustering.GMM_free_energy import FreeEnergyClustering
4 |
5 | class GMM2D(GaussianMixture):
6 |
7 | def __init__(self):
8 | GaussianMixture.__init__(self, n_components=9)
9 |
10 | self.n_dims_ = 2
11 | self._set_parameters()
12 | self.name = 'GMM_2D'
13 | return
14 |
15 | def _set_cov(self, x11,x12,x22):
16 | tmp_cov = np.zeros((self.n_dims_, self.n_dims_))
17 |
18 | tmp_cov[0, 0] = x11
19 | tmp_cov[0, 1] = x12
20 | tmp_cov[1, 0] = x12
21 | tmp_cov[1, 1] = x22
22 | return tmp_cov
23 |
24 | def _set_parameters(self):
25 |
26 | self.means_ = np.asarray([ np.asarray([0.8,0.35]), np.asarray([0.45,0.52]), np.asarray([0.2,0.6]),
27 | np.asarray([0.05,0.8]), np.asarray([0.5,0.25]), np.asarray([0.5,0.25]),
28 | np.asarray([0.5, 0.25]), np.asarray([0.4, 0.34]), np.asarray([0.8,0.5])])
29 |
30 | covs = [np.zeros((2,2))]*self.n_components_
31 |
32 | covs[0] = self._set_cov(0.0021, 0.0005, 0.002)
33 | covs[1] = self._set_cov(0.001, 0.0009, 0.001)
34 | covs[2] = self._set_cov(0.002, 0.001, 0.002)
35 | covs[3] = self._set_cov(0.003, 0.0008, 0.003)
36 | covs[4] = self._set_cov(0.0012, 0.0005, 0.0012)
37 | covs[5] = self._set_cov(0.005, 0.0, 0.0015)
38 | covs[6] = self._set_cov(0.002, 0.0, 0.002)
39 | covs[7] = self._set_cov(0.0012, 0.00, 0.002)
40 | covs[8] = self._set_cov(0.001, 0.0009, 0.001)
41 |
42 | self.covariances_ = covs
43 |
44 | self.weights_ = np.asarray([0.15,0.1,0.3,0.25,0.1,0.05,0.1,0.05,0.4])
45 | self.weights_ /= np.sum(self.weights_)
46 |
47 | return
48 |
--------------------------------------------------------------------------------
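
A usage sketch that feeds samples from `GMM2D` into the free energy clustering pipeline (constructor arguments mirror how `FreeEnergyClustering` is called in `evaluate_toy_models.py`):

```python
from toy_models import GMM2D
from free_energy_clustering.GMM_free_energy import FreeEnergyClustering

toy = GMM2D()
data = toy.sample(2000)                   # [2000 x 2] draws from the 9-component mixture
fec = FreeEnergyClustering(data, min_n_components=2, max_n_components=12)
coords, FE_landscape, FE_points = fec.landscape()
labels, _ = fec.cluster(data, FE_points)  # InfleCS core-state labels
```
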
/toy_models/toy_model_blobs.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn import datasets
3 | from free_energy_clustering.GMM import GaussianMixture
4 |
5 | class Blobs(GaussianMixture):
6 | def __init__(self,n_components=3,n_dims=2,noise=0):
7 | GaussianMixture.__init__(self, n_components=n_components)
8 | self.labels_ = None
9 | self.data_ = None
10 | self.n_features_ = n_dims
11 | self.noise_level_ = noise
12 | self.name = 'blobs'
13 | return
14 |
15 | def sample(self, n_points):
16 | self.data_, self.labels_ = datasets.make_blobs(n_samples=n_points, n_features=self.n_features_)
17 | print('Sampled blobs data: ' + str(self.data_.shape))
18 |
19 | self.set_density()
20 |
21 | # Sample noise uniformly over space
22 | n_noise_points = int(self.noise_level_ * self.data_.shape[0])
23 | data_noise = np.random.uniform(self.data_.min(axis=0), self.data_.max(axis=0),
24 | size=(n_noise_points, self.data_.shape[1]))
25 |
26 | self.data_[0:n_noise_points] = data_noise # make_blobs shuffles samples, so the overwritten rows span all clusters
27 | return self.data_
28 |
29 | def set_density(self):
30 |
31 | unique_labels = np.unique(self.labels_)
32 |
33 | self.weights_ = np.zeros(self.n_components_)
34 | self.means_ = np.zeros((self.n_components_,self.n_features_))
35 | self.covariances_ = [np.zeros((self.n_features_,self.n_features_))]*self.n_components_
36 | for label in unique_labels:
37 | self.weights_[label] = np.mean(self.labels_==label)
38 | self.means_[label] = np.mean(self.data_[self.labels_==label],axis=0)
39 | self.covariances_[label] = np.cov(self.data_[self.labels_==label].T)
40 | return
41 |
42 | def assign_cluster_labels(self, x):
43 | return self.predict(x)
44 |
45 |
--------------------------------------------------------------------------------
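
A usage sketch for `Blobs`; note that `sample` draws new random blob centers on every call, so densities and labels are only consistent within a single sample:

```python
from toy_models import Blobs

blobs = Blobs(n_components=3, n_dims=2, noise=0.1)
data = blobs.sample(1000)                   # the first 100 rows are overwritten with uniform noise
labels = blobs.assign_cluster_labels(data)  # GMM posterior assignment
```
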
/toy_models/toy_model_moons.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn import datasets
3 | from scipy.spatial.distance import cdist
4 | from sklearn.neighbors import KNeighborsClassifier
5 |
6 | class Moons():
7 | def __init__(self):
8 | self.labels_ = None
9 | self.data_ = None
10 | self.classifier = KNeighborsClassifier(n_neighbors=3)
11 | self.name = 'moons'
12 | return
13 |
14 | def sample(self, n_points):
15 | self.data_, self.labels_ = datasets.make_moons(n_samples=n_points, noise=.05)
16 | self.classifier.fit(self.data_,self.labels_+1)
17 | return self.data_
18 |
19 | def density(self,x):
20 | min_dist = cdist(x,self.data_[::10]).min(axis=1)
21 | density = np.zeros(x.shape[0])
22 | density[min_dist < 5e-2] = 0.5
23 | density /= density.sum()
24 | return density
25 |
26 | def assign_cluster_labels(self, x):
27 | return self.classifier.predict(x)
28 |
29 |
--------------------------------------------------------------------------------
/toy_models/toy_model_multiple_GMMs.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from free_energy_clustering.GMM import GaussianMixture
3 |
4 | class MultipleGMMs(GaussianMixture):
5 | def __init__(self):
6 |
7 | GaussianMixture.__init__(self, n_components=10)
8 |
9 | self.name = 'mGMMs'
10 | self.n_dims_ = 2
11 | self._set_parameters()
12 | return
13 |
14 | def sample_multi_GMM(self, n_points):
15 | """
16 | Sample from stacked GMMs.
17 | """
18 | sampled_points = np.zeros((n_points, self.n_dims_))
19 | prob_model = np.cumsum(self.model_weights_)
20 |
21 | r = np.random.uniform(size=n_points)
22 | is_point_sampled = np.zeros((n_points), dtype=int)
23 |
24 | for i_point in range(n_points):
25 | for i_model in range(self.n_models_):
26 | if r[i_point] <= prob_model[i_model]:
27 | is_point_sampled[i_point] = 1 # mark this point as sampled
28 | sampled_points[i_point,:] = self.GMM_list_[i_model].sample(1)
29 | break
30 | print('Sampled: '+str(is_point_sampled.sum())+'/'+str(n_points))
31 | return sampled_points
32 |
33 | def _set_cov(self, x11,x12,x22):
34 | tmp_cov = np.zeros((self.n_dims_, self.n_dims_))
35 |
36 | tmp_cov[0, 0] = x11
37 | tmp_cov[0, 1] = x12
38 | tmp_cov[1, 0] = x12
39 | tmp_cov[1, 1] = x22
40 | return tmp_cov
41 |
42 | def _set_GMM1(self):
43 | n_components = 4
44 | means = np.asarray([np.asarray([0.5,0.27]), np.asarray([0.5, 0.27]), np.asarray([0.5, 0.27]),
45 | np.asarray([0.5, 0.27])])
46 |
47 | covs = [np.zeros((2, 2))] * n_components
48 |
49 | covs[0] = self._set_cov(0.0021, 0.0005, 0.002)
50 | covs[1] = self._set_cov(0.001, 0.0009, 0.001)
51 | covs[2] = self._set_cov(0.002, 0.001, 0.002)
52 | covs[3] = self._set_cov(0.003, 0.0008, 0.003)
53 |
54 | weights = np.asarray([0.5, 0.3,0.3,0.3])
55 | weights /= weights.sum()
56 | return means, covs, weights
57 |
58 | def _set_GMM2(self):
59 |
60 | n_components = 3
61 | means = np.asarray([np.asarray([0.45, 0.5]), np.asarray([0.45, 0.5]), np.asarray([0.45, 0.5])])
62 |
63 | covs = [np.zeros((2, 2))] * n_components
64 |
65 | covs[0] = self._set_cov(0.01, 0.0, 0.0001)
66 | covs[1] = self._set_cov(0.0003, 0.0003, 0.015)
67 | covs[2] = self._set_cov(0.0012, 0.00, 0.002)
68 |
69 | weights = np.asarray([0.5, 0.2, 0.3])
70 | weights /= weights.sum()
71 | return means, covs, weights
72 |
73 | def _set_GMM3(self):
74 | n_components = 3
75 | means = np.asarray(
76 | [np.asarray([0.05, 0.8]), np.asarray([0.05, 0.8]), np.asarray([0.05, 0.8])])
77 |
78 | covs = [np.zeros((2, 2))] * n_components
79 |
80 | covs[0] = self._set_cov(0.0021, 0.0005, 0.002)
81 | covs[1] = self._set_cov(0.001, 0.0009, 0.001)
82 | covs[2] = self._set_cov(0.002, 0.001, 0.002)
83 |
84 | weights = np.ones(n_components)
85 | weights /= weights.sum()
86 | return means, covs, weights
87 |
88 | def _set_GMM12(self):
89 | n_components = 9
90 | means = np.asarray([ np.asarray([0.8,0.35]), np.asarray([0.45,0.5]), np.asarray([0.2,0.6]),
91 | np.asarray([0.05,0.8]), np.asarray([0.5,0.27]), np.asarray([0.5,0.27]),
92 | np.asarray([0.5, 0.27]), np.asarray([0.5, 0.4]), np.asarray([0.8,0.5])])
93 |
94 | covs = [np.zeros((2,2))]*n_components
95 |
96 | covs[0] = self._set_cov(0.0021, 0.0005, 0.002)
97 | covs[1] = self._set_cov(0.001, 0.0009, 0.001)
98 | covs[2] = self._set_cov(0.002, 0.001, 0.002)
99 | covs[3] = self._set_cov(0.003, 0.0008, 0.003)
100 | covs[4] = self._set_cov(0.0012, 0.0005, 0.0012)
101 | covs[5] = self._set_cov(0.01, 0.0, 0.0015)
102 | covs[6] = self._set_cov(0.005, 0.001, 0.02)
103 | covs[7] = self._set_cov(0.002, -0.0001, 0.002)
104 | covs[8] = self._set_cov(0.001, 0.0009, 0.001)
105 |
106 | weights = np.asarray([0.15,0.1,0.5,0.25,0.1,0.05,0.1,0.05,0.4])
107 | weights /= weights.sum()
108 | return means, covs, weights
109 |
110 | def _set_GMM22(self):
111 |
112 | n_components = 3
113 | means = np.asarray([ np.asarray([0.6,0.35]), np.asarray([0.45,0.5]), np.asarray([0.19,0.62])])
114 |
115 | covs = [np.zeros((2,2))]*n_components
116 |
117 | covs[0] = self._set_cov(0.003, 0.0008, 0.003)
118 | covs[1] = self._set_cov(0.005, 0.0, 0.0015)
119 | covs[2] = self._set_cov(0.0012, 0.00, 0.002)
120 |
121 | weights = np.asarray([0.5,0.2,0.3])
122 | weights /= weights.sum()
123 | return means, covs, weights
124 |
125 |
126 | def _set_GMM32(self):
127 | n_components = 5
128 | means = np.asarray([ np.asarray([0.05,0.8]),np.asarray([0.05,0.8]), np.asarray([0.52,0.25]), np.asarray([0.52,0.27]), np.asarray([0.45, 0.5])])
129 |
130 | covs = [np.zeros((2,2))]*n_components
131 |
132 | covs[0] = self._set_cov(0.0021, 0.0005, 0.002)
133 | covs[1] = self._set_cov(0.001, 0.0009, 0.001)
134 | covs[2] = self._set_cov(0.002, 0.001, 0.002)
135 | covs[3] = self._set_cov(0.003, 0.0008, 0.003)
136 | covs[4] = self._set_cov(0.0012, 0.0005, 0.0012)
137 |
138 | weights = np.asarray([0.15,0.25,0.10,0.3,0.2])
139 | weights /= weights.sum()
140 | return means, covs, weights
141 |
142 | def assign_cluster_labels(self,x):
143 | gamma = self._expectation(x)
144 | labels = np.argmax(gamma, axis=0)+1
145 | return labels
146 |
147 | def _expectation(self, x):
148 | n_points = x.shape[0]
149 | gamma = np.zeros((self.n_models_, n_points))
150 |
151 | for i_model in range(self.n_models_):
152 | gamma[i_model, :] = self.model_weights_[i_model] *self.GMM_list_[i_model].density(x)
153 | gamma /= np.sum(gamma, axis=0)
154 | return gamma
155 |
156 | def _set_parameters(self):
157 | n_components = 10
158 | means = np.asarray([np.asarray([0.5, 0.27]), np.asarray([0.5, 0.27]), np.asarray([0.5, 0.27]),
159 | np.asarray([0.5, 0.27]), np.asarray([0.40, 0.5]), np.asarray([0.4, 0.5]),
160 | np.asarray([0.4, 0.5]),np.asarray([0.05, 0.8]), np.asarray([0.05, 0.8]),
161 | np.asarray([0.05, 0.8])])
162 |
163 | covs = [np.zeros((2, 2))] * n_components
164 |
165 | covs[0] = self._set_cov(0.0021, 0.0005, 0.002)
166 | covs[1] = self._set_cov(0.001, 0.0009, 0.001)
167 | covs[2] = self._set_cov(0.002, 0.001, 0.002)
168 | covs[3] = self._set_cov(0.003, 0.0008, 0.003)
169 |
170 | weights1 = np.asarray([0.5, 0.3, 0.3, 0.3])
171 | weights1 /= weights1.sum()
172 |
173 | covs[4] = self._set_cov(0.01, 0.0, 0.0001)
174 | covs[5] = self._set_cov(0.0003, 0.0000, 0.0003)
175 | covs[6] = self._set_cov(0.0012, 0.00, 0.002)
176 |
177 | weights2 = np.asarray([0.5, 0.2, 0.3])
178 | weights2 /= weights2.sum()
179 |
180 | covs[7] = self._set_cov(0.0021, 0.0005, 0.002)
181 | covs[8] = self._set_cov(0.001, 0.0009, 0.001)
182 | covs[9] = self._set_cov(0.002, 0.001, 0.002)
183 |
184 | weights3 = np.ones(3)
185 | weights3 /= weights3.sum()
186 |
187 | weights = np.ravel(np.concatenate((0.25*weights1, 0.5*weights2, 0.25*weights3)))
188 |
189 | self.means_ = means
190 | self.covariances_ = covs
191 | self.weights_ = weights
192 |
193 | self.GMM_list_ = []
194 | self.GMM_list_.append(GaussianMixture(n_components=4))
195 | self.GMM_list_[-1].means_ = means[0:4,:]
196 | self.GMM_list_[-1].covariances_ = covs[0:4]
197 | self.GMM_list_[-1].weights_ = weights1
198 |
199 | self.GMM_list_.append(GaussianMixture(n_components=3))
200 | self.GMM_list_[-1].means_ = means[4:7,:]
201 | self.GMM_list_[-1].covariances_ = covs[4:7]
202 | self.GMM_list_[-1].weights_ = weights2
203 |
204 | self.GMM_list_.append(GaussianMixture(n_components=3))
205 | self.GMM_list_[-1].means_ = means[7::, :]
206 | self.GMM_list_[-1].covariances_ = covs[7::]
207 | self.GMM_list_[-1].weights_ = weights3
208 |
209 | self.model_weights_ = np.asarray([0.25,0.5,0.25])
210 | self.n_models_ = 3
211 | return
212 |
--------------------------------------------------------------------------------
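
The loop in `sample_multi_GMM` draws, for each point, a model index from the categorical distribution over `model_weights_` and then one sample from that model. A vectorized sketch of the same idea with stand-in one-dimensional Gaussians:

```python
import numpy as np

rng = np.random.default_rng(0)
model_weights = np.array([0.25, 0.5, 0.25])  # mirrors model_weights_
means = np.array([-2.0, 0.0, 2.0])           # stand-in for GMM_list_
choice = rng.choice(len(means), size=1000, p=model_weights)  # categorical model draw
samples = rng.normal(loc=means[choice], scale=0.3)           # then sample the chosen model
```
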
/toy_models/toy_model_nonlinear_GMM_2D.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from free_energy_clustering.GMM import GaussianMixture
3 | from free_energy_clustering.GMM_free_energy import FreeEnergyClustering
4 |
5 | class GMM2dNonlinear():
6 |
7 | def __init__(self, transform_data=True):
8 |
9 | self.transform_data=transform_data
10 | self.n_dims_ = 2
11 | self.name = 'nonlinear_GMM_2D'
12 | n_components = 3
13 | self.GMM = GaussianMixture(n_components=n_components)
14 | self.n_components_ = n_components
15 | self._set_parameters()
16 | return
17 |
18 | def transform(self,x):
19 | x = np.asarray([x[:,0], x[:,1]+(2.0*x[:,0]**3)]).T
20 | return x
21 |
22 | def detransform(self,x):
23 | x = np.asarray([x[:,0], x[:,1]-(2.0*x[:,0]**3)]).T
24 | return x
25 |
26 | def sample(self, n_points):
27 | x = self.GMM.sample(n_points)
28 | if self.transform_data:
29 | x = self.transform(x)
30 | return x
31 |
32 | def assign_cluster_labels(self,x):
33 | if self.transform_data:
34 | x = self.detransform(x)
35 | labels = self.GMM.predict(x)+1
36 | #labels[labels==3] = 2
37 | return labels
38 |
39 | def density(self, x):
40 | if self.transform_data:
41 | x = self.detransform(x)
42 | return self.GMM.density(x)
43 |
44 | def _set_cov(self, x11,x12,x22):
45 | tmp_cov = np.zeros((self.n_dims_, self.n_dims_))
46 |
47 | tmp_cov[0, 0] = x11
48 | tmp_cov[0, 1] = x12
49 | tmp_cov[1, 0] = x12
50 | tmp_cov[1, 1] = x22
51 | return tmp_cov
52 |
53 | def _set_parameters(self):
54 |
55 | #self.GMM.means_ = np.asarray([np.asarray([0.0,0.6]), np.asarray([0.3,0.25]), np.asarray([0.3,0.25])])
56 | self.GMM.means_ = np.asarray([np.asarray([-0.8, 0.6]), np.asarray([-0.5, 0.25]), np.asarray([-0.6, 0.25])])
57 | covs = [np.zeros((2,2))]* self.GMM.n_components_
58 |
59 | covs[0] = self._set_cov(0.01, 0.005, 0.05)
60 | covs[1] = self._set_cov(0.05, -0.01, 0.015)
61 | covs[2] = self._set_cov(0.001, 0.000, 0.01)
62 | #covs[1] = self._set_cov(0.05, -0.01, 0.015)
63 | #covs[2] = self._set_cov(0.001, 0.000, 0.01)
64 |
65 | self.GMM.covariances_ = covs
66 |
67 | self.GMM.weights_ = np.asarray([0.8,0.08,0.08])
68 | self.GMM.weights_ /= np.sum(self.GMM.weights_)
69 |
70 | return
71 |
--------------------------------------------------------------------------------
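
A quick round-trip check that `transform` and `detransform` invert each other, which is what lets `density` and `assign_cluster_labels` evaluate the underlying GMM in the untransformed space:

```python
import numpy as np
from toy_models import GMM2dNonlinear

toy = GMM2dNonlinear()
x = toy.GMM.sample(500)                              # points in the untransformed space
assert np.allclose(toy.detransform(toy.transform(x)), x)
labels = toy.assign_cluster_labels(toy.sample(500))  # labels for transformed samples
```
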