├── LICENSE
├── README.md
├── free_energy_clustering
│   ├── FE_landscape_clustering.py
│   ├── GMM.py
│   ├── GMM_free_energy.py
│   ├── __init__.py
│   ├── cluster_density.py
│   ├── cross_validation.py
│   ├── free_energy_pathways.py
│   └── stack_landscapes.py
├── toy_models
│   ├── Kmeans_cluster.py
│   ├── __init__.py
│   ├── agglomerative_ward_cluster.py
│   ├── evaluate_toy_models.py
│   ├── spectral_cluster.py
│   ├── toy_model_GMM_2D.py
│   ├── toy_model_blobs.py
│   ├── toy_model_moons.py
│   ├── toy_model_multiple_GMMs.py
│   └── toy_model_nonlinear_GMM_2D.py
└── tutorial_free_energy_clustering.ipynb
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 delemottelab
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Free energy estimation and clustering with InfleCS
2 | This repository contains a Jupyter notebook tutorial, together with the code needed to estimate free energy landscapes with Gaussian mixture models and to extract core states from density maxima with InfleCS clustering.
3 |
4 | Dependencies:
5 | * Python 3.6
6 | * Scikit-learn 0.19 or later
7 | * Matplotlib
8 |
9 | -----------------------------------------------------------
10 | # References
11 | **Free energy estimation with Gaussian mixture models**
12 | *Inference of Calmodulin’s Ca2+-Dependent Free Energy Landscapes via Gaussian Mixture Model Validation*
13 | Annie M. Westerlund, Tyler J. Harpole, Christian Blau, and Lucie Delemotte
14 | Journal of Chemical Theory and Computation, 2018
15 | DOI: 10.1021/acs.jctc.7b00346
16 |
17 |
18 | **Clustering with InfleCS**
19 | *InfleCS: Clustering Free Energy Landscapes with Gaussian Mixtures*
20 | Annie M. Westerlund, Lucie Delemotte
21 | Journal of Chemical Theory and Computation, 2019
22 | DOI: 10.1021/acs.jctc.9b00454
23 |
24 | ----------------------------------------------------------
25 | Annie Westerlund, KTH Royal Institute of Technology, 2019
26 |
--------------------------------------------------------------------------------
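
A minimal end-to-end usage sketch, assembled from the class and method signatures in free_energy_clustering/GMM_free_energy.py below. The synthetic data and all argument values are illustrative assumptions; the tutorial notebook is the authoritative walkthrough.

```python
import numpy as np
import free_energy_clustering as FEC

# Synthetic 2D data with two basins (a stand-in for projected MD frames).
np.random.seed(0)
data = np.vstack([np.random.randn(500, 2) * 0.3,
                  np.random.randn(500, 2) * 0.3 + [2.0, 2.0]])

# Select the number of GMM components (2..6) with 3-fold cross-validation.
fec = FEC.FreeEnergyClustering(data, min_n_components=2, max_n_components=6,
                               n_splits=3, n_grids=40, temperature=300.0)
coords, FE_landscape, FE_points = fec.landscape()

# InfleCS clustering: grid points define the core states, data points get labels.
labels, centers = fec.cluster(coords, FE_points, eval_points=data)
fec.visualize(show_data=True, savefig=False)
```
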
/free_energy_clustering/FE_landscape_clustering.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import numpy as np
3 | from scipy.optimize import fmin_cg
4 | import free_energy_clustering.cluster_density as cluster
5 | from scipy.spatial.distance import cdist
6 | from scipy.stats import multivariate_normal
7 |
8 | class LandscapeClustering():
9 |
10 | def __init__(self, ensemble_of_GMMs=False, verbose=True):
11 | self.cluster_centers_ = None
12 | self.labels_ = None
13 | self.ensemble_of_GMMs = ensemble_of_GMMs
14 | self.clusterer_ = None
15 | self.verbose_ = verbose
16 | return
17 |
18 | def get_cluster_representative(self, x, labels, free_energies):
19 | """
20 | 		Get the point with minimum free energy (FE) in each cluster.
21 | """
22 | n_clusters = int(np.max(labels) + 1)
23 | n_points = x.shape[0]
24 |
25 | print('Cluster labels: '+str(np.unique(labels)))
26 |
27 | min_FE_inds = np.zeros(n_clusters-1)
28 | all_inds = np.arange(n_points)
29 | mask = np.ones(n_clusters-1,dtype=bool)
30 | for i_cluster in range(1,n_clusters):
31 | cluster_inds = all_inds[labels == i_cluster]
32 | if cluster_inds.shape[0] > 0:
33 | min_FE_inds[i_cluster-1] = cluster_inds[np.argmin(free_energies[cluster_inds])]
34 | else:
35 | min_FE_inds[i_cluster-1] = np.nan
36 | mask[i_cluster-1]=False
37 | print('No point in cluster '+str(i_cluster))
38 |
39 | self.cluster_centers_ = min_FE_inds[mask].astype(int)
40 | return self.cluster_centers_
41 |
42 | def assign_transition_points(self, cluster_indices, points, density_model):
43 | """
44 | 		Assign cluster indices to transition (non-core) points: in order of decreasing
45 | 		density, each point inherits the cluster index of its nearest already-assigned point.
46 | 		:return: cluster indices of all points
47 | """
48 | print("Assigning cluster indices to non-core cluster points.")
49 | if np.sum(cluster_indices) == 0: # If all points are marked as transition points
50 | return cluster_indices+1
51 |
52 | cl_inds_final = np.copy(cluster_indices)
53 | transition_point_inds = np.where(cluster_indices==0)[0]
54 | n_assigned = np.sum(cluster_indices>0)
55 |
56 | # Sort points from higher to lower density
57 | density_all = density_model.density(points)
58 | densities_trans_points = density_all[transition_point_inds]
59 |
60 | 		# Sort transition points in descending density order (assign cluster index to highest-density points first)
61 | sort_inds = np.argsort(-densities_trans_points)
62 | transition_point_inds = transition_point_inds[sort_inds]
63 |
64 | counter = 0
65 | for ind in transition_point_inds:
66 |
67 | point = points[ind]
68 | # Extract assigned points
69 | assigned_inds = np.where(cl_inds_final>0)[0]
70 | assigned_points = points[assigned_inds,:]
71 | distances = cdist(point[np.newaxis,:],assigned_points)
72 |
73 | # Find closest assigned point. Use its cluster index on the current unassigned point.
74 | closest_point = np.argmin(distances[0,:])
75 | cl_inds_final[ind] = cl_inds_final[assigned_inds[closest_point]]
76 |
77 | n_assigned += 1
78 | counter += 1
79 | return cl_inds_final
80 |
81 | def _compute_gradients(self, density_model, points, inv_covs=None):
82 | n_points = points.shape[0]
83 | n_dims = points.shape[1]
84 | n_components = density_model.n_components_
85 |
86 | means = density_model.means_
87 | covs = density_model.covariances_
88 | weights = density_model.weights_
89 |
90 | gradients = np.zeros((n_points, n_dims))
91 |
92 | compute_inv_covs = False
93 | if inv_covs is None:
94 | inv_covs = [np.zeros((n_dims, n_dims))] * n_components
95 | compute_inv_covs = True
96 |
97 | for i_component in range(n_components):
98 | if compute_inv_covs:
99 | inv_covs[i_component] = np.linalg.inv(covs[i_component])
100 |
101 | devs = points - means[i_component]
102 | exp_deriv = -devs.dot(inv_covs[i_component])
103 | for i_point in range(n_points):
104 | gradients[i_point, :] += weights[i_component] * exp_deriv[i_point, :] * multivariate_normal.pdf(
105 | points[i_point, :], mean=means[i_component], cov=covs[i_component])
106 | if compute_inv_covs:
107 | return gradients, inv_covs
108 | return gradients
109 |
110 | def _compute_GMM_Hessian(self, density_model, x, inv_covs):
111 | n_dims = x.shape[0]
112 | n_components = density_model.n_components_
113 |
114 | means = density_model.means_
115 | covs = density_model.covariances_
116 | weights = density_model.weights_
117 |
118 | hessian = np.zeros((n_dims, n_dims))
119 |
120 | for i_component in range(n_components):
121 | devs = x - means[i_component]
122 | exp_deriv = -devs.dot(inv_covs[i_component])
123 |
124 | # Compute Hessian at current point
125 | for i_dim in range(n_dims):
126 | for j_dim in range(n_dims):
127 | post_weight = weights[i_component] * multivariate_normal.pdf(x, mean=means[i_component],
128 | cov=covs[i_component])
129 | hessian[i_dim, j_dim] += post_weight * (
130 | -inv_covs[i_component][i_dim, j_dim] + exp_deriv[i_dim] * exp_deriv[j_dim])
131 |
132 | return hessian
133 |
134 | def _compute_GMM_FE_Hessian(self, density_model, x, inv_covs):
135 | n_dims = x.shape[0]
136 | n_components = density_model.n_components_
137 |
138 | means = density_model.means_
139 | covs = density_model.covariances_
140 | weights = density_model.weights_
141 |
142 | hessian = np.zeros((n_dims, n_dims))
143 |
144 | point = x[np.newaxis,:]
145 | gradient = self._compute_gradients(density_model, point, inv_covs=inv_covs)
146 | density = density_model.density(point)
147 | density[density<1e-15] = 1e-15
148 |
149 | for i_component in range(n_components):
150 | devs = x - means[i_component]
151 | exp_deriv = -devs.dot(inv_covs[i_component])
152 |
153 | # Compute Hessian at current point
154 | for i_dim in range(n_dims):
155 | for j_dim in range(n_dims):
156 | post_weight = weights[i_component] * multivariate_normal.pdf(x, mean=means[i_component],
157 | cov=covs[i_component])
158 | hessian[i_dim, j_dim] += post_weight * (
159 | -inv_covs[i_component][i_dim, j_dim] + exp_deriv[i_dim] * exp_deriv[j_dim])
160 |
161 | for i_dim in range(n_dims):
162 | for j_dim in range(n_dims):
163 | FE_hess = 1.0/density**2 * gradient[0,i_dim]*gradient[0,j_dim]-hessian[i_dim, j_dim]/density
164 | hessian[i_dim, j_dim] = FE_hess
165 |
166 | return hessian
167 |
168 | def _Hessian_def(self, density_model, points, use_FE_landscape=False):
169 | """
170 | 		Compute the Hessian at every point to check whether the point
171 | 		belongs to a free energy minimum (equivalently, a density maximum).
172 | """
173 | n_points = points.shape[0]
174 | n_dims = points.shape[1]
175 |
176 | if self.ensemble_of_GMMs:
177 | n_models = density_model.n_models_
178 |
179 | is_FE_min = [False] * n_points
180 |
181 | # Compute all inverse covariances
182 | if self.ensemble_of_GMMs:
183 | all_inv_covs = [0]*n_models
184 | 			# n_components is read per model inside the loop below
185 | for i_model in range(n_models):
186 |
187 | n_components = density_model.GMM_list_[i_model].n_components_
188 |
189 | inv_covs = [np.zeros((n_dims, n_dims))] * n_components
190 | for i_component in range(n_components):
191 | inv_covs[i_component] = np.linalg.inv(density_model.GMM_list_[i_model].covariances_[i_component])
192 | all_inv_covs[i_model] = inv_covs
193 | else:
194 | n_components = density_model.n_components_
195 | inv_covs = [np.zeros((n_dims, n_dims))] * n_components
196 | for i_component in range(n_components):
197 | inv_covs[i_component] = np.linalg.inv(density_model.covariances_[i_component])
198 |
199 | # Computing Hessian to determine whether point belongs to FE min or not
200 | if use_FE_landscape:
201 | print('Computing Hessians of free energy landscape.')
202 | else:
203 | print('Computing Hessians of density landscape.')
204 |
205 | for i_point, x in enumerate(points):
206 | if self.verbose_:
207 | sys.stdout.write("\r"+'Point: '+str(i_point+1)+'/'+str(points.shape[0]))
208 | sys.stdout.flush()
209 | if self.ensemble_of_GMMs:
210 | hessian = np.zeros((n_dims,n_dims))
211 | for i_model in range(n_models):
212 | if density_model.model_weights_[i_model] > 0:
213 | if use_FE_landscape:
214 | hessian += density_model.model_weights_[i_model] * self._compute_GMM_FE_Hessian(
215 | density_model.GMM_list_[i_model], x, all_inv_covs[i_model])
216 | else:
217 | hessian += density_model.model_weights_[i_model]*self._compute_GMM_Hessian(density_model.GMM_list_[i_model],
218 | x, all_inv_covs[i_model])
219 | else:
220 | if use_FE_landscape:
221 | hessian = self._compute_GMM_FE_Hessian(density_model, x, inv_covs)
222 | else:
223 | hessian = self._compute_GMM_Hessian(density_model, x, inv_covs)
224 |
225 | # Compute Hessian eigenvalues
226 | eigvals = np.linalg.eigvals(hessian)
227 |
228 | if use_FE_landscape:
229 | # Check: if Hessian is positive definite => the point is at a free energy minimum
230 | if eigvals.min() > 0.0:
231 | is_FE_min[i_point] = True
232 | else:
233 | # Check: if Hessian is negative definite => the point is at a density maximum
234 | if eigvals.max() < 0.0:
235 | is_FE_min[i_point] = True
236 | if self.verbose_:
237 | print()
238 | return is_FE_min
239 |
240 | def cluster(self, density_models, points, eval_points=None, use_FE_landscape=False, transition_matrix=None):
241 | # Indicate whether points are at free energy minimum or not
242 | is_FE_min = self._Hessian_def(density_models, points, use_FE_landscape=use_FE_landscape)
243 | self.grid_points_=points
244 | # Cluster free energy landscape
245 | self.clusterer_ = cluster.ClusterDensity(points, eval_points)
246 | self.labels_ = self.clusterer_.cluster_data(is_FE_min, transition_matrix=transition_matrix)
247 | return self.labels_, is_FE_min
248 |
249 |
--------------------------------------------------------------------------------
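
The decision rule in _Hessian_def above marks a point as lying in a density maximum when the density Hessian is negative definite. For a single Gaussian the Hessian is p(x) * (g g^T - S^{-1}) with g = -S^{-1}(x - m), the same per-component expression accumulated in _compute_GMM_Hessian. A self-contained sketch of that criterion with made-up toy values:

```python
import numpy as np
from scipy.stats import multivariate_normal

mean = np.zeros(2)
cov = np.array([[0.5, 0.1], [0.1, 0.3]])
inv_cov = np.linalg.inv(cov)

def density_hessian(x):
    # Hessian of a single Gaussian density: p(x) * (g g^T - S^{-1}), g = -S^{-1}(x - m)
    g = -inv_cov @ (x - mean)
    p = multivariate_normal.pdf(x, mean=mean, cov=cov)
    return p * (np.outer(g, g) - inv_cov)

print(np.linalg.eigvals(density_hessian(mean)).max() < 0.0)                  # True: density maximum
print(np.linalg.eigvals(density_hessian(np.array([2.0, 2.0]))).max() < 0.0)  # False: tail point
```
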
/free_energy_clustering/GMM.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import math
3 | import numpy as np
4 | from scipy.stats import multivariate_normal
5 |
6 | class GaussianMixture():
7 |
8 | def __init__(self,n_components=2, convergence_tol=1e-6, verbose=False):
9 | self.n_components_ = n_components
10 | self.weights_ = np.ones(n_components)/float(n_components)
11 | self.means_ = np.zeros(n_components)
12 | self.covariances_ = [np.zeros((n_components,n_components))]*n_components
13 | self.tol_ = convergence_tol
14 | self.data_weights_ = None
15 | self.verbose_ = verbose
16 | return
17 |
18 | def fit(self, x, data_weights=None):
19 | """
20 | Fit GMM to points in x with EM.
21 | :param data_weights: Weights of each data point.
22 | """
23 |
24 | if data_weights is not None:
25 | x = x[data_weights>0]
26 | data_weights = data_weights[data_weights>0]
27 | data_weights = data_weights/np.sum(data_weights)
28 | data_weights = data_weights * data_weights.shape[0]
29 |
30 | 		self.data_weights_ = data_weights
31 |
32 | 		# Run EM until the change in log-likelihood falls below the tolerance.
33 | 		prev_loglikelihood = np.inf
34 | 		loglikelihood = 0
35 | 		self._initialize_parameters(x)
36 |
37 | 		while np.abs(prev_loglikelihood - loglikelihood) > self.tol_:
38 | 			gamma = self._expectation(x, self.data_weights_)
39 | 			self._maximization(x, gamma)
40 |
41 | 			prev_loglikelihood = loglikelihood
42 | 			loglikelihood = self.loglikelihood(x, self.data_weights_)
43 |
44 |
45 | 		return self
46 |
47 | def predict(self,x):
48 | gamma = self._expectation(x)
49 | labels = np.argmax(gamma,axis=0)
50 | return labels
51 |
52 | def _initialize_parameters(self,x):
53 | """
54 | Initialize component means and covariances
55 | """
56 | n_points = x.shape[0]
57 | inds = np.random.randint(n_points,size=self.n_components_)
58 |
59 | # Initialize means
60 | self.means_ = x[inds,:]
61 | # Initialize covariances
62 | tmp_cov = np.cov(x.T)
63 | for i_component in range(self.n_components_):
64 | self.covariances_[i_component] = tmp_cov
65 | return
66 |
67 | def _expectation(self,x, data_weights=None):
68 | """
69 | 		Perform expectation step.
70 | """
71 | n_points = x.shape[0]
72 | gamma = np.zeros((self.n_components_,n_points))
73 |
74 | for i_component in range(self.n_components_):
75 |
76 | normal_density = multivariate_normal.pdf(x, mean=self.means_[i_component], cov=self.covariances_[i_component])
77 | gamma[i_component, :] = self.weights_[i_component]*normal_density
78 |
79 | gamma /= np.sum(gamma,axis=0)
80 |
81 | if data_weights is not None:
82 | gamma = np.multiply(gamma, data_weights)
83 |
84 | return gamma
85 |
86 | def _maximization(self,x, gamma):
87 | """
88 | Update parameters with maximization step
89 | """
90 | self._update_weights(x, gamma)
91 | self._update_means(x, gamma)
92 | self._update_covariances(x, gamma)
93 | return
94 |
95 | def _update_weights(self,x, gamma):
96 | """
97 | Update each component amplitude.
98 | """
99 |
100 | self.weights_ = np.sum(gamma,axis=1)
101 |
102 | 		# Normalize the categorical distribution
103 | self.weights_ /= np.sum(self.weights_)
104 | return
105 |
106 |
107 | def _update_means(self,x, gamma):
108 | """
109 | Update each component mean.
110 | """
111 | Nk = np.sum(gamma,axis=1)
112 | for i_component in range(self.n_components_):
113 | self.means_[i_component, :] = np.dot(x.T,gamma[i_component])/Nk[i_component]
114 |
115 | return
116 |
117 | def _update_covariances(self, x, gamma):
118 | """
119 | Update each component covariance
120 | """
121 | n_dims = x.shape[1]
122 |
123 | Nk = np.sum(gamma, axis=1)
124 | for i_component in range(self.n_components_):
125 | y = x - self.means_[i_component]
126 | y2 = np.multiply(gamma[i_component,:,np.newaxis],y).T
127 | self.covariances_[i_component] = y2.dot(y)/Nk[i_component] + 1e-9*np.eye(n_dims)
128 |
129 | return
130 |
131 | def density(self, x):
132 | """
133 | Compute GMM density at given points, x.
134 | """
135 | n_points = x.shape[0]
136 | n_dims = x.shape[1]
137 |
138 | density = np.zeros(n_points)
139 | for i_component in range(self.n_components_):
140 | normal_density = multivariate_normal.pdf(x, mean=self.means_[i_component], cov=self.covariances_[i_component])
141 | density += self.weights_[i_component]*normal_density
142 |
143 | return density
144 |
145 | def loglikelihood(self, x, data_weights=None):
146 | """
147 | 		Compute log-likelihood. Supports data weights.
148 | """
149 | density = self.density(x)
150 | density[density<1e-15] = 1e-15
151 | if data_weights is None:
152 | log_density = np.log(density)
153 | else:
154 | log_density = np.multiply(np.log(density), data_weights)
155 | return np.mean(log_density)
156 |
157 | def bic(self, x, data_weights=None):
158 | """
159 | 		Compute BIC score. Supports data weights.
160 | """
161 | n_points, n_dims = x.shape
162 | n_params = (1 + n_dims + n_dims * (n_dims + 1) / 2.0) * self.n_components_
163 | loglikelihood = n_points * self.loglikelihood(x, data_weights=data_weights)
164 | return -2.0 * loglikelihood + n_params * math.log(n_points)
165 |
166 | def aic(self, x, data_weights=None):
167 | """
168 | 		Compute AIC score. Supports data weights.
169 | """
170 | n_points, n_dims = x.shape
171 | n_params = (1 + n_dims + n_dims * (n_dims + 1) / 2.0) * self.n_components_
172 | loglikelihood = n_points * self.loglikelihood(x, data_weights=data_weights)
173 | return -2.0 * loglikelihood + 2.0 * n_params
174 |
175 | def sample(self, n_points):
176 | """
177 | Sample points from the density model.
178 | :param n_points:
179 | :return:
180 | """
181 | n_dims = self.means_.shape[1]
182 | sampled_points = np.zeros((n_points, n_dims))
183 | prob_component = np.cumsum(self.weights_)
184 | r = np.random.uniform(size=n_points)
185 |
186 | is_point_sampled = np.zeros((n_points), dtype=int)
187 |
188 | for i_point in range(n_points):
189 | for i_component in range(self.n_components_):
190 | if r[i_point] <= prob_component[i_component]:
191 | sampled_points[i_point, :] = np.random.multivariate_normal(self.means_[i_component],
192 | self.covariances_[i_component], 1)
193 | is_point_sampled[i_point] = 1
194 | break
195 | if is_point_sampled[i_point] ==0:
196 | print('Warning: Did not sample point: '+str(r[i_point])+' '+str(prob_component))
197 | return sampled_points
198 |
--------------------------------------------------------------------------------
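
A short sketch of fitting the weighted-EM mixture above to synthetic data; the data and the uniform frame weights are assumptions made for illustration. Passing data_weights exercises the weighted code path in fit:

```python
import numpy as np
import free_energy_clustering.GMM as GMM

np.random.seed(1)
# Two well-separated 2D clusters.
x = np.vstack([np.random.randn(300, 2) * 0.2,
               np.random.randn(300, 2) * 0.2 + [3.0, 0.0]])
frame_weights = np.ones(x.shape[0])  # e.g. reweighting factors from biased sampling

gmm = GMM.GaussianMixture(n_components=2, convergence_tol=1e-6)
gmm.fit(x, data_weights=frame_weights)

labels = gmm.predict(x)    # hard assignments from the responsibilities
print(gmm.weights_)        # component amplitudes, roughly [0.5, 0.5]
print(gmm.bic(x))          # criterion used for model selection
samples = gmm.sample(100)  # draw new points from the fitted density
```
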
/free_energy_clustering/GMM_free_energy.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import numpy as np
3 | from scipy.spatial.distance import cdist
4 |
5 | import free_energy_clustering.GMM as GMM
6 | from sklearn.mixture import GaussianMixture
7 | import free_energy_clustering.cross_validation as CV
8 | import free_energy_clustering as FEC
9 |
10 | import matplotlib
11 | import matplotlib.pyplot as plt
12 | from mpl_toolkits.mplot3d import Axes3D
13 |
14 | class FreeEnergyClustering(object):
15 |
16 | def __init__(self, data, min_n_components=8, max_n_components=None, n_components_step=1, x_lims=None, temperature=300.0,
17 | n_grids=50, n_splits=1, shuffle_data=False, n_iterations=1, convergence_tol=1e-4, stack_landscapes=False,
18 | verbose=True, test_set_perc=0.0, data_weights=None):
19 | """
20 | 		Class for computing the free energy landscape in [kcal/mol].
21 | 		- data has dimensionality [N x d].
22 | """
23 | self.data_ = data
24 | self.shuffle_data = shuffle_data
25 | self.n_splits_ = n_splits
26 | self.n_iterations_ = n_iterations
27 | self.convergence_tol_ = convergence_tol
28 | self.stack_landscapes_ = stack_landscapes
29 |
30 | self.min_n_components = min_n_components
31 | self.max_n_components = max_n_components
32 | self.n_components_step = n_components_step
33 |
34 | self.FE_points_ = None
35 | self.FE_landscape_ = None
36 | self.coords_ = None
37 | self.min_FE_ = None
38 |
39 | self.cl_ = None # Clustering object
40 | self.labels_ = None
41 | self.core_labels_ = None
42 | self.cluster_centers_ = None
43 | self.pathways_ = None
44 | self.state_populations_ = None
45 |
46 | if x_lims is not None:
47 | self.x_lims_ = x_lims
48 | self.n_dims_ = len(self.x_lims_)
49 | else:
50 | if len(data.shape) > 1:
51 | self.x_lims_ = []
52 | for i in range(data.shape[1]):
53 | self.x_lims_.append([data[:,i].min(),data[:,i].max()])
54 | self.n_dims_ = len(self.x_lims_)
55 | else:
56 | self.x_lims_ = [[data.min(),data.max()]]
57 | self.n_dims_ = 1
58 |
59 | self.temperature_ = temperature # [K]
60 | self.boltzmann_constant_ = 0.0019872041 # [kcal/(mol K)]
61 | self.density_est_ = None
62 | self.standard_error_FE_ = None
63 | self.nx_ = n_grids
64 | self.n_grids_ = [self.nx_]*self.n_dims_
65 | self.test_set_perc_ = test_set_perc
66 | self.verbose_ = verbose
67 | self.data_weights_ = data_weights
68 |
69 | self.BICs_ = []
70 |
71 | if data_weights is not None:
72 | use_data_weights = True
73 | # Convert data weights to the right format
74 | self.data_weights_ /= self.data_weights_.sum()
75 | self.data_weights_ *= self.data_weights_.shape[0]
76 | else:
77 | use_data_weights = False
78 |
79 | self.test_set_loglikelihood = None
80 | if verbose:
81 | print('*----------------Gaussian mixture model free energy estimator----------------*')
82 | print(' n_splits = '+str(n_splits))
83 | print(' shuffle_data = ' + str(shuffle_data))
84 | print(' n_iterations = ' + str(n_iterations))
85 | print(' n_grids = ' + str(n_grids))
86 | 			print('    convergence_tol = ' + str(convergence_tol))
87 | print(' stack_landscapes = ' + str(stack_landscapes))
88 | print(' x_lims (axes limits) = ' + str(self.x_lims_))
89 | print(' temperature = ' + str(temperature))
90 | print(' min_n_components = ' + str(min_n_components))
91 | print(' max_n_components = ' + str(max_n_components))
92 | print(' n_components_step = ' + str(n_components_step))
93 | print(' Using weighted data: ' + str(use_data_weights))
94 | print('*----------------------------------------------------------------------------*')
95 | return
96 |
97 | def _get_grid_coords(self):
98 | if self.n_dims_ < 4:
99 | x = []
100 | self.n_grids_ = []
101 | for i_dim in range(self.n_dims_):
102 | self.n_grids_.append(self.nx_)
103 | x.append(np.linspace(self.x_lims_[i_dim][0], self.x_lims_[i_dim][1], self.nx_))
104 |
105 | if self.n_dims_ == 1:
106 | return x
107 | coords = np.meshgrid(*x)
108 | else:
109 | # Do not discretize
110 | print('Note: # features > 3 => density not evaluated on grid.')
111 | coords = None
112 |
113 | return coords
114 |
115 | def _density_landscape(self, density_est):
116 | """
117 | Evaluate density model at the grid points.
118 | """
119 | if self.coords_ is None:
120 | coords = self._get_grid_coords()
121 | else:
122 | coords = self.coords_
123 |
124 | if self.n_dims_ == 1:
125 | densities = density_est.density(coords[0][:,np.newaxis])
126 | return coords, densities
127 |
128 | if coords is not None:
129 | print('Density grid shape: '+str(self.n_grids_))
130 | grid_points_flatten = []
131 | for x in coords:
132 | grid_points_flatten.append(np.ravel(x))
133 | points = np.asarray(grid_points_flatten).T
134 | densities = density_est.density(points)
135 | densities = np.reshape(densities, self.n_grids_)
136 | else:
137 | densities = density_est.density(self.data_)
138 |
139 | return coords, densities
140 |
141 | def _free_energy(self,density):
142 | density[density < 1e-8] = 1e-8
143 | FE = -self.temperature_ * self.boltzmann_constant_ * np.log(density)
144 | return FE
145 |
146 | def standard_error(self, n_data_blocks=3):
147 | """
148 | Estimating standard error.
149 | """
150 | print('Estimating standard error.')
151 | n_points = self.data_.shape[0]
152 | n_data_points = int(n_points/n_data_blocks)
153 |
154 | free_energies = []
155 |
156 | for i in range(n_data_blocks):
157 |
158 | if i != n_data_blocks-1:
159 | data = np.copy(self.data_[i*n_data_points:(i+1)*n_data_points])
160 | else:
161 | data = np.copy(self.data_[i*n_data_points::])
162 |
163 | if self.n_dims_ == 1:
164 | data = data[:,np.newaxis]
165 |
166 | _, density_model = self._fit_FE(data, set_density_model=False)
167 | _, density = self._density_landscape(density_model)
168 | free_energies.append(self._free_energy(density))
169 |
170 | free_energies = np.asarray(free_energies)
171 | self.standard_error_FE_ = np.std(free_energies,axis=0)/np.sqrt(n_data_blocks-1)
172 | print('Standard error estimation done.')
173 | return self.standard_error_FE_
174 |
175 | def _train_GMM(self, data, n_components, train_inds=None, val_inds=None, loglikelihood=0):
176 | """
177 | Perform one training of GMM.
178 | :param data:
179 | :param n_components:
180 | :return:
181 | """
182 |
183 | if train_inds is not None and val_inds is not None:
184 | training_data, validation_data = CV.get_train_validation_set(data, train_inds, val_inds)
185 | else:
186 | training_data = np.copy(data)
187 | validation_data = np.copy(data)
188 |
189 | if self.data_weights_ is None:
190 | gmm = GaussianMixture(n_components=n_components, tol=self.convergence_tol_)
191 |
192 | # Train model on the current training data
193 | gmm.fit(training_data)
194 |
195 | # Check log-likelihood of validation data
196 | loglikelihood += gmm.score(validation_data)
197 | else:
198 | gmm = GMM.GaussianMixture(n_components=n_components, convergence_tol=self.convergence_tol_,verbose=self.verbose_)
199 |
200 | training_data_weights = self.data_weights_
201 | validation_data_weights = self.data_weights_
202 |
203 | if train_inds is not None and val_inds is not None:
204 | if self.data_weights_ is not None:
205 | training_data_weights, validation_data_weights = CV.get_train_validation_set(self.data_weights_,
206 | train_inds, val_inds)
207 |
208 | # Train model on the current training data
209 | gmm.fit(training_data, data_weights=training_data_weights)
210 |
211 | # Check log-likelihood of validation data
212 | loglikelihood += gmm.loglikelihood(validation_data, data_weights=validation_data_weights)
213 |
214 | return gmm, loglikelihood
215 |
216 | def _fit_FE(self, data, set_density_model=True):
217 | """
218 | Fit density to data points.
219 | :param data: [n_samples x n_dims]
220 | :return: free energy of points
221 | """
222 |
223 | best_n_components = self.min_n_components
224 |
225 | # Extract test set from the dataset
226 | n_points_test = int(self.test_set_perc_*data.shape[0])
227 | data_orig = np.copy(data)
228 | data_weights_orig = np.copy(self.data_weights_)
229 |
230 | if n_points_test > 0:
231 | test_data = data[-n_points_test::,:]
232 | data = np.copy(data[0:-n_points_test, :])
233 | if self.data_weights_ is not None:
234 | 				self.data_weights_ = np.copy(self.data_weights_[0:-n_points_test])
235 | else:
236 | test_data = np.zeros((0,self.n_dims_))
237 |
238 | if self.stack_landscapes_:
239 | print('Estimating density with stacked GMMs.')
240 | else:
241 | print('Estimating density with GMM.')
242 |
243 | if self.data_weights_ is not None:
244 | print('Using weighted data to estimate GMM.')
245 |
246 | best_loglikelihood = -np.inf
247 | list_of_GMMs = []
248 | list_of_validation_data = []
249 | ICs = []
250 |
251 | # Get indices of training and validation datasets
252 | if self.n_splits_ > 1:
253 | train_inds, val_inds = CV.split_train_validation(data, self.n_splits_, self.shuffle_data)
254 |
255 | # Determine number of components with k-fold cross-validation,
256 | # or store all estimated densities and then weight together.
257 | if self.max_n_components is not None:
258 | for n_components in range(self.min_n_components,self.max_n_components+1,self.n_components_step):
259 | if self.verbose_:
260 | print('# Components = '+str(n_components))
261 |
262 | if self.n_splits_ > 1 and not(self.stack_landscapes_):
263 | loglikelihood = 0
264 | for i_split in range(self.n_splits_):
265 | gmm, loglikelihood = self._train_GMM(data, n_components, train_inds[i_split], val_inds[i_split], loglikelihood)
266 |
267 | # Keep best model
268 | if loglikelihood > best_loglikelihood:
269 | best_loglikelihood = loglikelihood
270 | best_n_components = n_components
271 | else:
272 | best_loglikelihood = -np.inf
273 | for i_iter in range(self.n_iterations_):
274 | # Train GMM
275 | gmm, loglikelihood = self._train_GMM(data, n_components)
276 |
277 | # Compute average AIC/BIC over iterations
278 | if i_iter == 0:
279 | if self.stack_landscapes_:
280 | if self.data_weights_ is None:
281 | ICs.append(gmm.aic(data))
282 | else:
283 | ICs.append(gmm.aic(data, self.data_weights_))
284 | else:
285 | if self.data_weights_ is None:
286 | ICs.append(gmm.bic(data))
287 | else:
288 | ICs.append(gmm.bic(data, self.data_weights_))
289 |
290 | # Keep best model
291 | if loglikelihood > best_loglikelihood:
292 | best_loglikelihood = loglikelihood
293 | if i_iter == 0:
294 | list_of_GMMs.append(GMM.GaussianMixture(n_components=n_components))
295 |
296 | if self.stack_landscapes_:
297 | ICs[-1] = gmm.aic(data)
298 | else:
299 | ICs[-1] = gmm.bic(data)
300 |
301 | list_of_GMMs[-1].weights_ = gmm.weights_
302 | list_of_GMMs[-1].means_ = gmm.means_
303 | list_of_GMMs[-1].covariances_ = gmm.covariances_
304 |
305 | if self.stack_landscapes_:
306 | if self.max_n_components is None:
307 | gmm, _ = self._train_GMM(data, self.min_n_components)
308 | list_of_GMMs.append(gmm)
309 |
310 | ICs = np.asarray(ICs)
311 | model_weights = np.exp(-0.5 *(ICs-ICs.min()))
312 | model_weights /= model_weights.sum()
313 |
314 | # Fit mixture of density estimators using the validation data
315 | density_est = FEC.LandscapeStacker(data, list_of_validation_data, list_of_GMMs, n_splits=1,
316 | convergence_tol=self.convergence_tol_, n_iterations=self.n_iterations_,
317 | model_weights=model_weights)
318 |
319 | density = density_est.density(data_orig)
320 | if set_density_model:
321 | self.density_est_ = density_est
322 | else:
323 | # Estimate FE with best number of components (deduced from cross-validation)
324 | if self.n_splits_ > 1:
325 | print('Training final model with ' + str(best_n_components) + ' components.')
326 | best_loglikelihood = -np.inf
327 | density_est = GMM.GaussianMixture(n_components=best_n_components)
328 | 			# Fit multiple times and keep the parameters with the best log-likelihood
329 | for i_iter in range(self.n_iterations_):
330 | gmm, loglikelihood = self._train_GMM(data, best_n_components)
331 |
332 | if loglikelihood > best_loglikelihood:
333 | best_loglikelihood = loglikelihood
334 | density_est.weights_ = gmm.weights_
335 | density_est.means_ = gmm.means_
336 | density_est.covariances_ = gmm.covariances_
337 | else:
338 | ICs = np.asarray(ICs)
339 | self.BICs_ = np.copy(ICs)
340 | model_ind = ICs.argmin()
341 | gmm = list_of_GMMs[model_ind]
342 | best_n_components = gmm.weights_.shape[0]
343 | density_est = GMM.GaussianMixture(n_components=best_n_components)
344 |
345 | print('Identifying final model with ' + str(density_est.n_components_) + ' components.')
346 |
347 | density_est.weights_ = gmm.weights_
348 | density_est.means_ = gmm.means_
349 | density_est.covariances_ = gmm.covariances_
350 |
351 | density = density_est.density(data_orig)
352 |
353 | if set_density_model:
354 | self.density_est_ = density_est
355 |
356 | if set_density_model:
357 | # Compute test set loglikelihood on the test set if test set exists
358 | if n_points_test > 0:
359 | self.test_set_loglikelihood = self.density_est_.loglikelihood(test_data)
360 | return self._free_energy(density)
361 | else:
362 | return self._free_energy(density), density_est
363 |
364 | def landscape(self):
365 | """
366 | Computing free energy landscape with
367 | G(x) = -kT*log(p(x|T))
368 | Returns the X,Y coordinate matrices (meshgrid) and
369 | their corresponding free energy.
370 | """
371 |
372 | if len(self.data_.shape) == 1:
373 | FE_points = self._fit_FE(self.data_[:,np.newaxis])
374 | else:
375 | FE_points = self._fit_FE(self.data_)
376 |
377 | print('Evaluating density in landscape')
378 | coords, density = self._density_landscape(self.density_est_)
379 |
380 | FE_landscape = self._free_energy(density)
381 |
382 | # Shift to zero
383 | self.min_FE_ = np.min(FE_landscape)
384 | FE_landscape = FE_landscape-self.min_FE_
385 | FE_points = FE_points-self.min_FE_
386 |
387 | self.FE_points_ = FE_points
388 | self.FE_landscape_ = FE_landscape
389 | self.coords_ = coords
390 |
391 | return coords, FE_landscape, FE_points
392 |
393 | def evaluate_free_energy(self,data):
394 | """
395 | Evaluate the free energy of given data in the current free energy model.
396 | """
397 | density = self.density_est_.density(data)
398 | free_energy = self._free_energy(density)
399 | if self.min_FE_ is not None:
400 | free_energy -= self.min_FE_
401 |
402 | return free_energy
403 |
404 | def population_states(self, n_sampled_points=10000):
405 | """
406 | 		Estimate the population of states (probability of being in a state) based on Monte Carlo integration of
407 | the estimated density and state definitions.
408 | :param n_sampled_points:
409 | :return:
410 | """
411 |
412 | if self.stack_landscapes_:
413 | state_populations = None
414 | print('TODO: Estimating population of states is not possible with stacked landscapes yet.')
415 | else:
416 |
417 | print('Sampling points from density.')
418 | # Sample points from estimated density
419 | points = self.density_est_.sample(n_sampled_points)
420 |
421 | # Assign cluster labels of sampled points
422 | cluster_labels = self.evaluate_clustering(points)
423 |
424 | print('Computing state populations.')
425 | # Monte-Carlo integration (histogramming)
426 | self.state_populations_, _ = np.histogram(cluster_labels, bins=int(self.labels_.max()+1), range=(self.labels_.min(),self.labels_.max()),density=False)
427 |
428 |
429 | self.state_populations_ = self.state_populations_/self.state_populations_.sum()
430 |
431 | return self.state_populations_
432 |
433 | def evaluate_clustering(self, points, assign_transition_points=False):
434 | """
435 | Assign cluster indices to points based on precomputed density model clustering.
436 | """
437 | print('Assigning cluster labels based on precomputed density model clustering.')
438 | if self.cl_ is not None and self.cl_.clusterer_ is not None:
439 | labels = self.cl_.clusterer_.data_cluster_indices(cdist(points, self.cl_.clusterer_.grid_points_), self.cl_.clusterer_.grid_cluster_inds_)
440 |
441 | if assign_transition_points:
442 | labels = self.cl_.assign_transition_points(labels, points, self.density_est_)
443 |
444 | return labels
445 |
446 | def cluster(self, points, free_energies, eval_points=None, return_center_coords=False, assign_transition_points=False,use_FE_landscape=False, unravel_grid=True, transition_matrix=None):
447 | """
448 | Cluster points according to estimated density.
449 | """
450 |
451 | self.transition_matrix_ = transition_matrix
452 |
453 | print('Clustering free energy landscape...')
454 | self.cl_ = FEC.LandscapeClustering(self.stack_landscapes_,verbose=self.verbose_)
455 |
456 | if eval_points is not None and unravel_grid:
457 | tmp_points = []
458 | for x in points:
459 | tmp_points.append(np.ravel(x))
460 | points = np.asarray(tmp_points).T
461 |
462 | if len(points.shape) == 1:
463 | points = points[:,np.newaxis]
464 |
465 |
466 | if eval_points is not None:
467 | if len(eval_points.shape) == 1:
468 | eval_points = eval_points[:,np.newaxis]
469 |
470 | self.labels_, self.is_FE_min = self.cl_.cluster(self.density_est_, points, eval_points=eval_points, use_FE_landscape=use_FE_landscape, transition_matrix=self.transition_matrix_)
471 |
472 | self.core_labels_ = np.copy(self.labels_)
473 |
474 | if eval_points is not None:
475 | self.cluster_centers_ = self.cl_.get_cluster_representative(eval_points, self.labels_, free_energies)
476 | else:
477 | self.cluster_centers_ = self.cl_.get_cluster_representative(points, self.labels_, free_energies)
478 |
479 | if assign_transition_points:
480 | if eval_points is not None:
481 | self.labels_ = self.cl_.assign_transition_points(self.labels_, eval_points, self.density_est_)
482 | else:
483 | self.labels_ = self.cl_.assign_transition_points(self.labels_, points, self.density_est_)
484 |
485 | print('Done clustering.')
486 | if return_center_coords:
487 | return self.labels_, eval_points[self.cluster_centers_,:]
488 | else:
489 | return self.labels_, self.cluster_centers_
490 |
491 | def pathways(self, states_from, states_to,n_points=10, convergence_tol=1e-1, step_size=1e-3, max_iter=100):
492 | """
493 | Calculate minimum pathways between points (indices) in states_from and states_to.
494 | :param states_from:
495 | :param states_to:
496 | :param n_points:
497 | :param convergence_tol:
498 | :param step_size:
499 | :return:
500 | """
501 | pathway_estimator = FEC.FreeEnergyPathways(self.density_est_, self.data_, self.temperature_,
502 | n_points=n_points, convergence_tol=convergence_tol,
503 | step_size=step_size, ensemble_of_GMMs=self.stack_landscapes_,
504 | max_iter=max_iter)
505 | self.pathways_ = []
506 | for from_ind, to_ind in zip(states_from,states_to):
507 | self.pathways_.append(pathway_estimator.minimum_pathway(from_ind, to_ind))
508 |
509 | return
510 |
511 | def visualize(self,title="Free energy landscape", fontsize=30, savefig=True, xlabel='x', ylabel='y', zlabel='z', vmax=7.5,
512 | n_contour_levels=15, show_data=False, figsize= [12, 10], filename='free_energy_landscape', dx=1, ax=None, data_cmap='jet'):
513 |
514 | if self.n_dims_ > 3:
515 | print('Plotting does not support > 3 dimensions')
516 | return
517 |
518 | # Set custom colormaps
519 | my_cmap = copy.copy(matplotlib.cm.get_cmap('jet'))
520 | my_cmap.set_over('white')
521 | my_cmap_cont = matplotlib.colors.ListedColormap(['black'])
522 | my_cmap_cont.set_over('white')
523 |
524 | data_cmap = copy.copy(matplotlib.cm.get_cmap(data_cmap))
525 |
526 | plt.rcParams['figure.figsize'] = figsize
527 |
528 | if ax is None:
529 | fig = plt.figure()
530 | if self.n_dims_ < 3:
531 | ax = fig.add_subplot(1, 1, 1)
532 | else:
533 | ax = fig.add_subplot(111, projection='3d')
534 | ax.tick_params(labelsize=fontsize - 2)
535 |
536 | plt.tick_params(axis='both', which='major', labelsize=fontsize-4)
537 |
538 | for tick in ax.get_xticklabels():
539 | tick.set_fontname("Serif")
540 | tick.set_fontweight('light')
541 |
542 | for tick in ax.get_yticklabels():
543 | tick.set_fontname("Serif")
544 | tick.set_fontweight('light')
545 |
546 | # Plot free energy landscape
547 | FE_landscape = np.copy(self.FE_landscape_)
548 | FE_landscape[self.FE_landscape_ > vmax+0.5] = vmax+0.5
549 |
550 | if self.n_dims_ == 2:
551 | ctf = ax.contourf(self.coords_[0], self.coords_[1], FE_landscape, n_contour_levels, cmap=my_cmap, vmin=0, vmax=vmax)
552 | cb=plt.colorbar(ctf, label='[kcal/mol]')
553 | text = cb.ax.yaxis.label
554 | font = matplotlib.font_manager.FontProperties(size=fontsize-3,family='serif',weight='light')
555 | text.set_font_properties(font)
556 | cb.ax.tick_params(labelsize=fontsize-2)
557 |
558 | for tick in cb.ax.get_yticklabels():
559 | tick.set_fontname("Serif")
560 | tick.set_fontweight('light')
561 |
562 | ax.set_ylim([self.coords_[1].min(), self.coords_[1].max()])
563 | ax.set_ylabel(ylabel, fontsize=fontsize - 2,fontname='serif',fontweight='light')
564 | elif self.n_dims_ == 1:
565 | if self.standard_error_FE_ is not None:
566 | ax.fill_between(self.coords_[0], FE_landscape - self.standard_error_FE_, FE_landscape + self.standard_error_FE_, color='k', alpha=0.2,zorder=2)
567 | ax.plot(self.coords_[0], FE_landscape, linewidth=3,color='k',zorder=1)
568 | ax.set_ylabel('Free energy [kcal/mol]',fontsize=fontsize-2,fontname='serif',fontweight='light')
569 | else:
570 | sc = ax.scatter(self.data_[::dx,0], self.data_[::dx,1], self.data_[::dx,2], s=30, c=self.FE_points_[::dx], alpha=0.8, cmap=my_cmap, vmin=0, vmax=vmax, edgecolor='k')
571 |
572 | ax.set_ylim([self.coords_[1].min(), self.coords_[1].max()])
573 | ax.set_zlim([self.coords_[2].min(), self.coords_[2].max()])
574 |
575 | cb=plt.colorbar(sc,label='[kcal/mol]')
576 | text = cb.ax.yaxis.label
577 | font = matplotlib.font_manager.FontProperties(size=fontsize-3,family='serif',weight='light')
578 | text.set_font_properties(font)
579 | cb.ax.tick_params(labelsize=fontsize-2)
580 |
581 | ax.set_ylabel(ylabel, fontsize=fontsize - 2,fontname='serif',fontweight='light')
582 | ax.set_zlabel(zlabel, fontsize=fontsize - 2,fontname='serif',fontweight='light')
583 |
584 | ax.set_xlim([self.coords_[0].min(), self.coords_[0].max()])
585 |
586 | # Plot projected data points
587 | if show_data and self.n_dims_ < 3:
588 |
589 | # Plot projected data points
590 | if self.labels_ is not None:
591 | if self.n_dims_ > 1:
592 | transition_points=self.data_[self.labels_==0]
593 | core_points = self.data_[self.labels_ > 0]
594 | core_labels = self.labels_[self.labels_>0]
595 | ax.scatter(transition_points[::dx, 0], transition_points[::dx, 1], s=30, color=[0.67, 0.67, 0.67],alpha=0.5)
596 | ax.scatter(core_points[::dx, 0], core_points[::dx, 1], s=80, c=core_labels[::dx],
597 | edgecolor='k', cmap=data_cmap, label='Intermediate state',alpha=0.8)
598 | else:
599 | ax.scatter(self.data_[self.labels_==0], self.FE_points_[self.labels_==0], s=30, color=[0.67, 0.67, 0.65],alpha=0.6,zorder=3)
600 | ax.scatter(self.data_[self.labels_>0], self.FE_points_[self.labels_>0], s=50, c=self.labels_[self.labels_>0],
601 | edgecolor='k', cmap=data_cmap, label='Intermediate state',alpha=0.8,zorder=4)
602 | if fontsize > 18:
603 | plt.legend(fontsize=fontsize-10,facecolor=[0.9,0.9,0.92])
604 | else:
605 | plt.legend(fontsize=fontsize-4,facecolor=[0.9,0.9,0.92])
606 | else:
607 | if self.n_dims_ > 1:
608 | ax.scatter(self.data_[:, 0], self.data_[:, 1], s=30, color=[0.67, 0.67, 0.65],alpha=0.5)
609 | else:
610 | 					ax.scatter(self.data_, self.FE_points_, s=30, color=[0.67, 0.67, 0.65], alpha=0.5)
611 |
612 | # Plot minimum pathways between states
613 | if self.pathways_ is not None and self.n_dims_ > 1:
614 | set_pathway_label = True
615 | for p in self.pathways_:
616 | if set_pathway_label:
617 | ax.plot(p[:, 0], p[:, 1], color=[43.0/256.0,46.0/256.0,60.0/256.0], linewidth=5, marker='', label='Pathway')
618 | set_pathway_label = False
619 | else:
620 | ax.plot(p[:, 0], p[:, 1], color=[43.0/256.0,46.0/256.0,60.0/256.0], linewidth=5, marker='')
621 |
622 | if fontsize > 18:
623 | plt.legend(fontsize=fontsize-10,facecolor=[0.9,0.9,0.92])
624 | else:
625 | plt.legend(fontsize=fontsize-4,facecolor=[0.9,0.9,0.92])
626 |
627 | # Plot cluster centers in landscape
628 | if self.cluster_centers_ is not None:
629 | if self.n_dims_ > 1:
630 | ax.scatter(self.data_[self.cluster_centers_,0], self.data_[self.cluster_centers_,1], marker='s', s=120,
631 | linewidth=4, facecolor='',edgecolor='w', label='Cluster center')
632 | else:
633 | ax.scatter(self.data_[self.cluster_centers_], self.FE_points_[self.cluster_centers_], marker='s', s=120,
634 | linewidth=4, facecolor='',edgecolor='w', label='Cluster center',zorder=5)
635 | if fontsize > 18:
636 | plt.legend(fontsize=fontsize-10,facecolor=[0.9,0.9,0.92])
637 | else:
638 | plt.legend(fontsize=fontsize-4,facecolor=[0.9,0.9,0.92])
639 | ax.set_title(title, fontsize=fontsize,fontname='serif',fontweight='light')
640 | ax.set_xlabel(xlabel, fontsize=fontsize - 2,fontname='serif',fontweight='light')
641 | plt.rc('xtick', labelsize=fontsize-2)
642 | plt.rc('ytick', labelsize=fontsize-2)
643 | matplotlib.rc('font',family='Serif')
644 |
645 | if savefig:
646 | plt.savefig(filename + '.svg')
647 | plt.savefig(filename + '.eps')
648 | plt.savefig(filename + '.png')
649 |
650 | return
651 |
--------------------------------------------------------------------------------
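
The conversion applied in _free_energy above is G(x) = -kB*T*log(p(x)), with kB in kcal/(mol K). A quick numeric check of what a density ratio translates to (values illustrative):

```python
import numpy as np

kB = 0.0019872041  # kcal/(mol K), as in FreeEnergyClustering
T = 300.0          # K

def free_energy(density):
    density = np.maximum(density, 1e-8)  # same floor as _free_energy
    return -kB * T * np.log(density)

# A region with 10x lower density lies ~1.37 kcal/mol higher at 300 K.
print(free_energy(np.array([0.1])) - free_energy(np.array([1.0])))  # ~1.37
```
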
/free_energy_clustering/__init__.py:
--------------------------------------------------------------------------------
1 | from .GMM_free_energy import FreeEnergyClustering
2 | from .FE_landscape_clustering import LandscapeClustering
3 | from .stack_landscapes import LandscapeStacker
4 | from .free_energy_pathways import FreeEnergyPathways
5 |
--------------------------------------------------------------------------------
/free_energy_clustering/cluster_density.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.spatial.distance import cdist
3 |
4 |
5 | class ClusterDensity(object):
6 |
7 | def __init__(self, points, eval_points=None):
8 | self.grid_points_ = points
9 | self.points_ = eval_points
10 | self.grid_cluster_inds_ = None
11 | return
12 |
13 | def _construct_components(self,distance_matrix, is_FE_min, use_transition_matrix=False):
14 | # Build subgraphs with connected components of the isolated FE minima
15 | print('Constructing connected components.')
16 | n_points = distance_matrix.shape[0]
17 |
18 | graph = np.zeros((n_points,n_points))
19 |
20 | if use_transition_matrix:
21 | # Set distance matrix to information distance of transition matrix:
22 | # Kinetically close states have high transition probability but should have low distance
23 | distance_matrix = -np.log(distance_matrix+1e-9)
24 |
25 | # Sort distances in ascending order
26 | sort_inds = np.argsort(distance_matrix,axis=1)
27 |
28 | for i in range(n_points):
29 | if is_FE_min[i]:
30 | check_points = []
31 | neighbors = sort_inds[i,:]
32 | k_neighbors=1
33 |
34 | # Add neighbors until another potential component is reached
35 | for j in range(k_neighbors,n_points):
36 | current_neighbor = neighbors[j]
37 | if is_FE_min[current_neighbor]:
38 |
39 | neighbor_distance = distance_matrix[i,current_neighbor]
40 |
41 | if len(check_points) > 2:
42 | check_point_distances = distance_matrix[current_neighbor,np.asarray(check_points)]
43 | is_smaller_dist = check_point_distances < neighbor_distance
44 | if np.sum(is_smaller_dist) > 0:
45 | # A non-component point is closer to both the current point and
46 | # the other component point => the two component points are not neighbors
47 | 							break
48 |
49 | # Add connection between neighbors
50 | graph[i,current_neighbor] = 1
51 | # Enforce symmetry
52 | graph[current_neighbor,i] = 1
53 | else:
54 | check_points.append(current_neighbor)
55 |
56 | # Sparsify graph to contain only the connected components
57 | graph = graph[is_FE_min,:]
58 | graph = graph[:,is_FE_min]
59 |
60 | return graph
61 |
62 | def _find_connected_components(self,graph):
63 | # Assign points to connected components
64 | print('Clustering data points.')
65 |
66 | n_points = graph.shape[0]
67 | component_indices = np.zeros(n_points)
68 | is_visited = np.zeros(n_points)
69 | all_inds = np.arange(n_points)
70 |
71 | i_component = 0
72 | while np.sum(is_visited) < is_visited.shape[0]:
73 | i_component += 1
74 | queue = []
75 | # get next unvisited point
76 | unvisited_points = all_inds[is_visited==0]
77 | queue.append(unvisited_points[0])
78 |
79 | while len(queue) > 0:
80 | current_point = queue.pop(0)
81 | if is_visited[current_point] == 0:
82 | is_visited[current_point] = 1
83 | component_indices[current_point] = i_component
84 |
85 | # get unvisited neighbors
86 | neighbors = all_inds[graph[current_point,:] > 0]
87 | for neighbor in neighbors:
88 | if is_visited[neighbor] == 0:
89 | queue.append(neighbor)
90 |
91 | return component_indices
92 |
93 | def data_cluster_indices(self, point_distances, cluster_indices_eval_points):
94 | """
95 | Set cluster indices according to the closest data point.
96 | """
97 | n_points = point_distances.shape[0]
98 | cluster_inds = np.zeros(n_points)
99 |
100 | min_inds = np.argmin(point_distances,axis=1)
101 |
102 | # Set cluster index of point to the same as the cluster index of evaluated (grid) point
103 | cluster_inds = cluster_indices_eval_points[min_inds]
104 | return cluster_inds
105 |
106 | def cluster_data(self, is_FE_min, transition_matrix=None):
107 |
108 | # Construct and detect connected components
109 | if transition_matrix is None:
110 | graph = self._construct_components(cdist(self.grid_points_,self.grid_points_), is_FE_min)
111 | else:
112 | print('Using transition probabilities to define distances.')
113 | graph = self._construct_components(transition_matrix, is_FE_min, use_transition_matrix=True)
114 |
115 | print('# Graph connections: '+str(np.sum(graph)))
116 | cluster_indices_grid_points = self._find_connected_components(graph)
117 |
118 | self.grid_cluster_inds_ = np.zeros(self.grid_points_.shape[0])
119 | self.grid_cluster_inds_[is_FE_min] = cluster_indices_grid_points
120 | if self.points_ is not None:
121 | cluster_indices = self.data_cluster_indices(cdist(self.points_,self.grid_points_),self.grid_cluster_inds_)
122 | else:
123 | cluster_indices = self.grid_cluster_inds_
124 |
125 | return cluster_indices
126 |
--------------------------------------------------------------------------------
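
The labeling step in _find_connected_components is a plain breadth-first search over the adjacency matrix built by _construct_components. A tiny standalone check; the 5-point graph is made up:

```python
import numpy as np
import free_energy_clustering.cluster_density as cd

# Adjacency for 5 grid points: {0, 1, 2} form one component, {3, 4} another.
graph = np.array([[0, 1, 0, 0, 0],
                  [1, 0, 1, 0, 0],
                  [0, 1, 0, 0, 0],
                  [0, 0, 0, 0, 1],
                  [0, 0, 0, 1, 0]])

clusterer = cd.ClusterDensity(points=np.zeros((5, 2)))
print(clusterer._find_connected_components(graph))  # [1. 1. 1. 2. 2.] -- components numbered from 1
```
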
/free_energy_clustering/cross_validation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.model_selection import KFold
3 |
4 |
5 | def split_train_validation(data, n_splits, shuffle=False):
6 | """
7 |     Split the data into n_splits training and validation sets.
8 | """
9 | kf = KFold(n_splits=n_splits, shuffle=shuffle)
10 |
11 | train_inds = []
12 | val_inds = []
13 |
14 | for train_ind, val_ind in kf.split(data):
15 | train_inds.append(train_ind)
16 | val_inds.append(val_ind)
17 |
18 | train_inds, val_inds = make_homogenous_validation_sets(train_inds, val_inds)
19 |
20 | return train_inds, val_inds
21 |
22 | def make_homogenous_validation_sets(train_inds, val_inds):
23 | """
24 |     Ensure that all validation sets have the same number of points.
25 | """
26 | min_val_inds = val_inds[0].shape[0]
27 | for i in range(len(val_inds)):
28 | if val_inds[i].shape[0] < min_val_inds:
29 | min_val_inds = val_inds[i].shape[0]
30 |
31 | for i in range(len(val_inds)):
32 | if val_inds[i].shape[0] > min_val_inds:
33 | n_inds_to_move = int(val_inds[i].shape[0]-min_val_inds)
34 | train_inds[i] = np.concatenate((train_inds[i], val_inds[i][0:n_inds_to_move]))
35 | val_inds[i] = val_inds[i][n_inds_to_move::]
36 |
37 | return train_inds, val_inds
38 |
39 | def get_train_validation_set(data, train_ind, val_inds):
40 | """
41 |     Get the training and validation sets given their indices.
42 | """
43 | training_data = data[train_ind]
44 | validation_data = data[val_inds]
45 |
46 | return training_data, validation_data
47 |
--------------------------------------------------------------------------------
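
A small usage sketch of the helpers above (the data is arbitrary). Note how make_homogenous_validation_sets moves surplus validation indices into the corresponding training fold so every fold has the same validation size:

```python
import numpy as np
from free_energy_clustering.cross_validation import split_train_validation

data = np.random.randn(100, 2)
train_inds, val_inds = split_train_validation(data, n_splits=3)

# KFold alone would give validation folds of 34/33/33 points; after trimming,
# every validation fold holds 33 points, with the surplus index moved to training.
print([v.shape[0] for v in val_inds])    # [33, 33, 33]
print([t.shape[0] for t in train_inds])  # [67, 67, 67]
```
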
/free_energy_clustering/free_energy_pathways.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import free_energy_clustering as FEC
3 | import sys
4 |
5 | class FreeEnergyPathways(FEC.LandscapeClustering):
6 |
7 | def __init__(self, density_model, data, temperature=300, n_points=100, convergence_tol=1e-1, step_size=1e-3,
8 | ensemble_of_GMMs=False, max_iter=1000):
9 |
10 | FEC.LandscapeClustering.__init__(self,ensemble_of_GMMs=ensemble_of_GMMs)
11 |
12 | self.density_model_ = density_model
13 | self.convergence_tol_ = convergence_tol
14 | self.n_points_ = n_points
15 | self.data_ = data
16 | self.n_dims_ = data.shape[1]
17 | self.temperature_ = temperature # [K]
18 | self.boltzmann_constant_ = 0.0019872041 # [kcal/(mol K)]
19 | self.step_size_ = step_size
20 | self.max_iterations_ = max_iter
21 | return
22 |
23 | def _initialize_path(self, state_from, state_to):
24 | """
25 | Set initial path guess as straight path between the two states.
26 | :param state_from:
27 | :param state_to:
28 | :return:
29 | """
30 | path = np.zeros((self.n_points_, self.n_dims_))
31 | for i_dim in range(self.n_dims_):
32 | path[:,i_dim] = np.linspace(state_from[i_dim],state_to[i_dim],num=self.n_points_)
33 | return path
34 |
35 | def _length_of_subpaths(self, path):
36 | partial_path_lengths = np.zeros(self.n_points_)
37 |
38 | for i in range(1,self.n_points_):
39 | partial_path_lengths[i] = partial_path_lengths[i-1]+np.linalg.norm(path[i]-path[i-1])
40 |
41 | subpath_points = np.arange(self.n_points_)*partial_path_lengths[-1]/(self.n_points_-1)
42 |
43 | return partial_path_lengths, subpath_points
44 |
45 | def _equilibrate_path_points(self, path):
46 | """
47 | Spread points equidistantly along path.
48 | :param path:
49 | :return:
50 | """
51 | partial_path_lengths, subpath_points = self._length_of_subpaths(path)
52 |
53 | new_path = path
54 | for i in range(1,self.n_points_-1):
55 | s = subpath_points[i]
56 | for j in range(1,self.n_points_):
57 | if s > partial_path_lengths[j-1] and s < partial_path_lengths[j]:
58 | new_path[i] = path[j-1] + (s-partial_path_lengths[j-1])*(path[j]-path[j-1])/np.linalg.norm(path[j]-path[j-1])
59 | break
60 |
61 | return new_path
62 |
63 | def _update_path(self, path):
64 | """
65 | Update path with one minimization and equilibration of path points.
66 | :param path:
67 | :return:
68 | """
69 | density = self.density_model_.density(path)
70 | density[density<1e-15]=1e-15
71 |
72 | inner_derivative, _ = self._compute_gradients(self.density_model_, path)
73 | outer_derivative = -self.temperature_*self.boltzmann_constant_/density
74 | step = self.step_size_*(np.multiply(outer_derivative[:,np.newaxis],inner_derivative))
75 | new_path = path - step
76 | new_path = self._equilibrate_path_points(new_path)
77 | return new_path
78 |
79 | def minimum_pathway(self, state_from, state_to):
80 | """
81 | Compute minimum pathway between two states using the estimated free energy landscape based on GMM.
82 | :param state_from:
83 | :param state_to:
84 | :return:
85 | """
86 |
87 | # Set linear path between end points
88 | path = self._initialize_path(self.data_[state_from], self.data_[state_to])
89 |
90 | prev_path = np.inf * path
91 | counter = 1
92 | while np.linalg.norm(path-prev_path) > self.convergence_tol_:
93 | sys.stdout.write("\r" + 'Iteration: ' + str(counter) + '/' + str(self.max_iterations_))
94 | sys.stdout.flush()
95 | prev_path = np.copy(path)
96 | path = self._update_path(prev_path)
97 | if counter >= self.max_iterations_:
98 | break
99 | counter+=1
100 | print()
101 | return path
102 |
103 |
--------------------------------------------------------------------------------
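
A sketch of relaxing a pathway on a toy two-basin density. The fitted model, end-point indices, and parameter values are all illustrative; minimum_pathway takes indices into the data array handed to the constructor:

```python
import numpy as np
import free_energy_clustering as FEC
import free_energy_clustering.GMM as GMM

np.random.seed(0)
data = np.vstack([np.random.randn(400, 2) * 0.3,
                  np.random.randn(400, 2) * 0.3 + [2.0, 1.0]])

# A simple two-component density defines the landscape.
density_model = GMM.GaussianMixture(n_components=2).fit(data)

fep = FEC.FreeEnergyPathways(density_model, data, temperature=300.0,
                             n_points=20, convergence_tol=1e-1,
                             step_size=1e-3, max_iter=200)
path = fep.minimum_pathway(state_from=0, state_to=400)  # [20 x 2] waypoints
```
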
/free_energy_clustering/stack_landscapes.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import free_energy_clustering.GMM as GMM
3 | import scipy.optimize as opt
4 |
5 | class LandscapeStacker(object):
6 | def __init__(self, data, list_of_validation_data, list_of_models, n_splits=1, convergence_tol=5e-3, n_iterations=1,
7 | model_weights=None):
8 | """
9 | Class for weighting density estimators with EM, based on how well they describe the validation dataset.
10 | :param data: [n_samples x n_dimensions]
11 | 		:param list_of_models: list of fitted density models (GMMs)
12 | :param n_splits: Number of folds in K-fold cross-validation
13 | :param convergence_tol:
14 | """
15 | self.GMM_list_ = list_of_models
16 | self.val_data_list_ = list_of_validation_data
17 | self.data_ = data
18 | self.convergence_tol_ = convergence_tol
19 | self.n_models_ = int(len(list_of_models)/n_splits)
20 | self.n_splits_ = n_splits
21 | self.n_iterations_ = n_iterations
22 | self.n_components_list_ = []
23 |
24 | 		# Initialize model weights
25 | if model_weights is None:
26 | if self.n_models_ > 0:
27 | self.model_weights_ = 1.0 / self.n_models_ * np.ones(self.n_models_)
28 | else:
29 | self.model_weights_ = model_weights
30 | self._sparisify_model()
31 | print('Model weights: ' + str(self.model_weights_))
32 | print('GMM list: '+str(self.GMM_list_))
33 |
34 | self._set_n_component_list()
35 | print('# Components in models: '+str(self.n_components_list_))
36 | return
37 |
38 | def objective_function(self,W):
39 | # -log(likelihood)
40 | W /= W.sum()
41 | return -self.loglikelihood(self.val_data_list_, list_of_validation_data=True, weights=W)
42 |
43 | def fit(self):
44 | do_EM = True
45 |
46 | print('Training density model weights.')
47 |
48 | if do_EM:
49 | loglikelihood = -np.inf
50 | prev_loglikelihood = 0
51 | while (np.abs(prev_loglikelihood - loglikelihood) > self.convergence_tol_):
52 | beta = self._expectation()
53 | self._maximization(beta)
54 | prev_loglikelihood = loglikelihood
55 | loglikelihood = self.loglikelihood(self.val_data_list_, list_of_validation_data=True)
56 | else:
57 | self.model_weights_ = opt.fmin_cg(self.objective_function, self.model_weights_)
58 |
59 | # Keep only models with nonzero weight
60 | self._sparsify_model()
61 | self._set_n_component_list()
62 |
63 | # Train each density model on the full dataset.
64 | print('Training each model on the full dataset.')
65 | for i_model in range(self.n_models_):
66 | n_components = self.GMM_list_[i_model].n_components_
67 | print(' - Training model with '+str(n_components)+' components')
68 | best_loglikelihood = -np.inf
69 | for i_iter in range(self.n_iterations_):
70 | density_model = GMM.GaussianMixture(n_components=n_components,
71 | convergence_tol=self.convergence_tol_)
72 | density_model.fit(self.data_)
73 | loglikelihood = density_model.loglikelihood(self.data_)
74 | if loglikelihood > best_loglikelihood:
75 | best_loglikelihood = loglikelihood
76 | self.GMM_list_[i_model] = density_model
77 |
78 | self.n_components_list_ = np.asarray(self.n_components_list_)
79 | return
80 |
81 | def _set_n_component_list(self):
82 | """
83 | Set the list with number of components.
84 | :return:
85 | """
86 | self.n_components_list_ = []
87 | for i_model in range(self.n_models_):
88 | n_components = self.GMM_list_[i_model*self.n_splits_].weights_.shape[0]
89 | self.n_components_list_.append(n_components)
90 | return
91 |
92 | def _expectation(self):
93 | n_points = self.val_data_list_[0].shape[0]
94 |
95 | beta = np.zeros((self.n_splits_, self.n_models_, n_points)) # beta[split, model, point] holds model responsibilities
96 |
97 | for i_split in range(self.n_splits_):
98 | for i_model in range(self.n_models_):
99 | ind = i_model*self.n_splits_+i_split
100 | beta[i_split, i_model, :] = self.model_weights_[i_model]*self.GMM_list_[ind].density(self.val_data_list_[ind])
101 |
102 | beta[i_split] /= np.sum(beta[i_split],axis=0) # normalize over models
103 |
104 | return beta
105 |
106 | def _maximization(self, beta):
107 | """
108 | Update density estimator weights.
109 | """
110 | self.model_weights_ = beta.sum(axis=(0,2))
111 |
112 | # Normalize the categorical distribution over models
113 | self.model_weights_ /= self.model_weights_.sum()
114 | return
115 |
116 | def _sparsify_model(self):
117 | """
118 | Remove all zero-weight models (done after the optimization has converged).
119 | :return:
120 | """
121 | print('Removing zero-weighted models.')
122 | threshold = 1e-3
123 | n_models = np.sum(self.model_weights_>threshold)
124 | new_weights = []
125 | new_models = []
126 |
127 | for i_model in range(self.n_models_):
128 | if self.model_weights_[i_model] > threshold:
129 | new_weights.append(self.model_weights_[i_model])
130 | for i_split in range(self.n_splits_):
131 | new_models.append(self.GMM_list_[i_model*self.n_splits_+i_split])
132 |
133 | self.n_models_ = n_models
134 | self.GMM_list_ = new_models
135 | print('Remaining models: ' + str(self.GMM_list_))
136 | self.model_weights_ = np.asarray(new_weights)
137 | self.model_weights_ /= self.model_weights_.sum()
138 | return
139 |
140 | def density(self, x, list_of_validation_data=False, weights=None):
141 | """
142 | Compute mixture of landscape density at the given points, x.
143 | x is either a numpy-array of size [n_samples x n_dims] or a list of
144 | validation datasets with length [self.n_models_].
145 | """
146 | if list_of_validation_data:
147 | n_points = x[0].shape[0]
148 | density = np.zeros(n_points*self.n_splits_)
149 | for i_model in range(self.n_models_):
150 | for i_split in range(self.n_splits_):
151 | ind = i_model*self.n_splits_ + i_split
152 | w = self.model_weights_[i_model] if weights is None else weights[i_model]
153 | density_ind = self.GMM_list_[ind].density(x[ind])
154 | density[n_points*i_split:n_points*(i_split+1)] += w*density_ind
155 | else:
156 | density = np.zeros(x.shape[0])
157 | for i_model in range(self.n_models_):
158 | density += self.model_weights_[i_model]*self.GMM_list_[i_model].density(x)
159 | return density
160 |
161 | def loglikelihood(self, x, list_of_validation_data=False,weights=None):
162 | """
163 | Compute log-likelihood.
164 | """
165 | density = self.density(x, list_of_validation_data=list_of_validation_data,weights=weights)
166 | density[density<1e-8]=1e-8 # floor the density to avoid log(0)
167 | return np.mean(np.log(density))
168 |
--------------------------------------------------------------------------------
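
The E- and M-steps above amount to a small fixed-point iteration on the categorical model weights. A self-contained sketch with made-up stand-in densities (the real class instead evaluates its fitted GMMs on held-out validation data):

```python
import numpy as np

rng = np.random.default_rng(0)
# densities[m, i]: density of validation point i under model m (made up here).
densities = rng.uniform(0.1, 1.0, size=(3, 100))
weights = np.ones(3) / 3

for _ in range(100):
    beta = weights[:, np.newaxis] * densities  # E-step: responsibilities
    beta /= beta.sum(axis=0)                   # normalize over models
    weights = beta.sum(axis=1)                 # M-step: expected counts
    weights /= weights.sum()                   # renormalize categorical weights
print(weights)  # models that explain the validation data better get more weight
```
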
/toy_models/Kmeans_cluster.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.cluster import KMeans
3 | from sklearn.metrics import silhouette_score
4 | from sklearn.neighbors import KNeighborsClassifier
5 |
6 | class KMeansCluster():
7 |
8 | def __init__(self,n_min_clusters,n_max_clusters):
9 | self.n_min_clusters_ = n_min_clusters
10 | self.n_max_clusters_ = n_max_clusters
11 | self.labels_ = None
12 | self.classifier = KNeighborsClassifier(n_neighbors=3)
13 | self.name='kmeans'
14 | return
15 |
16 | def cluster(self, x):
17 | print('Cluster data with K-means')
18 | all_cluster_labels = []
19 | silhouette_scores = np.zeros(self.n_max_clusters_-self.n_min_clusters_+1)
20 |
21 | for n_clusters in range(self.n_min_clusters_,self.n_max_clusters_+1):
22 | km = KMeans(n_clusters=n_clusters).fit(x)
23 | all_cluster_labels.append(km.labels_)
24 | silhouette_scores[n_clusters-self.n_min_clusters_] = silhouette_score(x, all_cluster_labels[-1])
25 |
26 | ind = np.argmax(silhouette_scores) # pick the number of clusters with the best silhouette score
27 | self.labels_ = all_cluster_labels[ind]+1 # shift so cluster labels start at 1
28 |
29 | # Train kNN classifier
30 | self.classifier.fit(x, self.labels_)
31 | print('Cluster labels: '+str(np.unique(self.labels_)))
32 | return self.labels_
33 |
34 | def assign_cluster_labels(self, x):
35 | return self.classifier.predict(x)
--------------------------------------------------------------------------------
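
A usage sketch for `KMeansCluster`, assuming the repository root is on the Python path and substituting synthetic data:

```python
from sklearn.datasets import make_blobs
from toy_models.Kmeans_cluster import KMeansCluster

x, _ = make_blobs(n_samples=300, centers=4, random_state=0)
km = KMeansCluster(n_min_clusters=2, n_max_clusters=8)
labels = km.cluster(x)                         # picks n_clusters by silhouette score
new_labels = km.assign_cluster_labels(x[:10])  # kNN assignment of new points
```
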
/toy_models/__init__.py:
--------------------------------------------------------------------------------
1 | from toy_models.toy_model_GMM_2D import GMM2D
2 | from toy_models.toy_model_nonlinear_GMM_2D import GMM2dNonlinear
3 | from toy_models.evaluate_toy_models import MethodEvaluator
4 | from toy_models.toy_model_multiple_GMMs import MultipleGMMs
5 | from toy_models.toy_model_moons import Moons
6 | from toy_models.toy_model_blobs import Blobs
--------------------------------------------------------------------------------
/toy_models/agglomerative_ward_cluster.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.metrics import silhouette_score
3 | from sklearn.neighbors import KNeighborsClassifier
4 | from sklearn.cluster import AgglomerativeClustering
5 |
6 | class AWCluster():
7 |
8 | def __init__(self,n_min_clusters,n_max_clusters):
9 | self.n_min_clusters_ = n_min_clusters
10 | self.n_max_clusters_ = n_max_clusters
11 | self.labels_ = None
12 | self.classifier = KNeighborsClassifier(n_neighbors=3)
13 | self.name='AW'
14 | return
15 |
16 | def cluster(self, x):
17 | print('Cluster data with agglomerative-Ward')
18 | all_cluster_labels = []
19 | silhouette_scores = np.zeros(self.n_max_clusters_-self.n_min_clusters_+1)
20 |
21 | for n_clusters in range(self.n_min_clusters_,self.n_max_clusters_+1):
22 | aw = AgglomerativeClustering(n_clusters=n_clusters,linkage='ward').fit(x)
23 | all_cluster_labels.append(aw.labels_)
24 | silhouette_scores[n_clusters-self.n_min_clusters_] = silhouette_score(x, all_cluster_labels[-1])
25 |
26 | ind = np.argmax(silhouette_scores)
27 | self.labels_ = all_cluster_labels[ind]+1
28 |
29 | # Train kNN classifier
30 | self.classifier.fit(x, self.labels_)
31 | print('Cluster labels: '+str(np.unique(self.labels_)))
32 | return self.labels_
33 |
34 | def assign_cluster_labels(self, x):
35 | return self.classifier.predict(x)
--------------------------------------------------------------------------------
/toy_models/evaluate_toy_models.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import numpy as np
3 | import toy_models as tm
4 | import free_energy_clustering as GMM_FE
5 | from toy_models import Kmeans_cluster as kmc
6 | from toy_models import spectral_cluster as sc
7 | from toy_models import agglomerative_ward_cluster as awc
8 |
9 | from sklearn.metrics import v_measure_score
10 | from sklearn.metrics import adjusted_mutual_info_score
11 | from sklearn.metrics.cluster import fowlkes_mallows_score
12 | from sklearn.cluster import KMeans
13 |
14 | class MethodEvaluator(object):
15 |
16 | def __init__(self, toy_model='GMM_2D', x_lims=None, n_grids=30, convergence_tol=1e-4,verbose=False, presampled_data=None, n_features=None, noise=0, n_components=3):
17 |
18 | if toy_model == 'GMM_2D':
19 | self.toy_model_ = tm.GMM2D()
20 | elif toy_model == 'mGMMs':
21 | self.toy_model_ = tm.MultipleGMMs()
22 | elif toy_model == 'blobs':
23 | self.toy_model_ = tm.Blobs(n_components=n_components, n_dims=n_features, noise=noise)
24 | elif toy_model == 'digits':
25 | self.toy_model_ = tm.Digits() # NOTE: requires a Digits toy model, which is not bundled in toy_models
26 | elif toy_model == 'nonlinear_GMM_2D':
27 | self.toy_model_ = tm.GMM2dNonlinear()
28 | else:
29 | print('Toy model: '+str(toy_model)+' does not exist')
30 | sys.exit(0)
31 |
32 | self.cluster_score_ami_kmeans_ = None
33 | self.cluster_score_ami_AW_ = None
34 | self.cluster_score_ami_spectral_ = None
35 | self.cluster_score_ami_density_peaks_ = None
36 | self.cluster_score_ami_GMM_ = None
37 | self.cluster_score_ami_GMM_FE_min_ = None
38 |
39 | self.cluster_score_fm_kmeans_ = None
40 | self.cluster_score_fm_AW_ = None
41 | self.cluster_score_fm_spectral_ = None
42 | self.cluster_score_fm_density_peaks_ = None
43 | self.cluster_score_fm_GMM_ = None
44 | self.cluster_score_fm_GMM_FE_min_ = None
45 |
46 | self.cluster_score_vm_kmeans_ = None
47 | self.cluster_score_vm_AW_ = None
48 | self.cluster_score_vm_spectral_ = None
49 | self.cluster_score_vm_density_peaks_ = None
50 | self.cluster_score_vm_DPA_ = None
51 | self.cluster_score_vm_HDBSCAN_ = None
52 | self.cluster_score_vm_SDC_ = None
53 | self.cluster_score_vm_GMM_ = None
54 | self.cluster_score_vm_GMM_FE_min_ = None
55 |
56 | self.convergence_tol_ = convergence_tol
57 |
58 | self.x_lims_ = x_lims
59 | self.n_grids_ = n_grids
60 |
61 | self.presampled_data = presampled_data
62 |
63 | self.true_FE_ = None
64 | self.true_density_ = None
65 | self.true_labels_ = None
66 | self.test_set_ = None
67 | self.min_FE_ = None
68 | self.verbose_ = verbose
69 |
70 | self.set_true_free_energy()
71 | return
72 |
73 | def set_true_free_energy(self):
74 | """
75 | Create a free energy object that contains the true free energy and density on the given grid.
76 | :return:
77 | """
78 | # Create grid and evaluate density on it
79 | print('Setting true model.')
80 | self.test_set_ = self.toy_model_.sample(2000)
81 | self.true_FE_ = GMM_FE.FreeEnergyClustering(self.test_set_, x_lims=self.x_lims_, n_grids=self.n_grids_,verbose=False,
82 | convergence_tol=self.convergence_tol_)
83 | self.true_FE_.density_est_ = self.toy_model_
84 |
85 | coords, self.true_density_ = self.true_FE_._density_landscape(self.toy_model_)
86 |
87 | # Compute true free energy
88 | FE_landscape = self.true_FE_._free_energy(self.true_density_)
89 | self.min_FE_= np.min(FE_landscape)
90 | FE_landscape = FE_landscape - self.min_FE_
91 |
92 | # Set true free energy
93 | self.true_FE_.coords_ = coords
94 | self.true_FE_.FE_landscape_ = FE_landscape
95 |
96 | if hasattr(self.toy_model_,"assign_cluster_labels"):
97 | self.true_labels_ = self.toy_model_.assign_cluster_labels(self.test_set_)
98 | else:
99 | self.true_labels_, _ = self.true_FE_.cluster(coords, np.zeros(self.test_set_.shape[0]), self.test_set_)
100 | return
101 |
102 | def run_evaluation(self, n_runs=1, n_points=1000, n_iterations=1, min_n_components=2, max_n_components=25,
103 | n_splits=3, save_data=False, file_label='',n_microstates=None, all_methods=True,
104 | assign_transition_points=True):
105 | """
106 | Run multiple free energy estimations and evaluate performance.
107 | :param n_runs:
108 | :return:
109 | """
110 |
111 | if self.presampled_data is not None:
112 | sampled_data = self.presampled_data[0]
113 | true_clustering = self.presampled_data[1]
114 | n_runs = sampled_data.shape[0]
115 |
116 | self.cluster_score_ami_kmeans_ = np.zeros(n_runs)
117 | self.cluster_score_ami_AW_ = np.zeros(n_runs)
118 | self.cluster_score_ami_spectral_ = np.zeros(n_runs)
119 | self.cluster_score_ami_density_peaks_ = np.zeros(n_runs)
120 | self.cluster_score_ami_GMM_ = np.zeros(n_runs)
121 | self.cluster_score_ami_GMM_FE_min_ = np.zeros(n_runs)
122 |
123 | self.cluster_score_fm_kmeans_ = np.zeros(n_runs)
124 | self.cluster_score_fm_AW_ = np.zeros(n_runs)
125 | self.cluster_score_fm_spectral_ = np.zeros(n_runs)
126 | self.cluster_score_fm_density_peaks_ = np.zeros(n_runs)
127 | self.cluster_score_fm_GMM_ = np.zeros(n_runs)
128 | self.cluster_score_fm_GMM_FE_min_ = np.zeros(n_runs)
129 |
130 | self.cluster_score_vm_kmeans_ = np.zeros(n_runs)
131 | self.cluster_score_vm_AW_ = np.zeros(n_runs)
132 | self.cluster_score_vm_spectral_ = np.zeros(n_runs)
133 | self.cluster_score_vm_density_peaks_ = np.zeros(n_runs)
134 | self.cluster_score_vm_GMM_ = np.zeros(n_runs)
135 | self.cluster_score_vm_GMM_FE_min_ = np.zeros(n_runs)
136 |
137 | data = self.toy_model_.sample(3) # small placeholder sample; the real data is set for each run below
138 |
139 | # Create free energy estimators
140 | gmm_FE = GMM_FE.FreeEnergyClustering(data, min_n_components=min_n_components, max_n_components=max_n_components,
141 | x_lims=self.x_lims_, n_grids=self.n_grids_, stack_landscapes=False,
142 | n_splits=n_splits, n_iterations=n_iterations,convergence_tol=self.convergence_tol_,
143 | verbose=self.verbose_)
144 |
145 | km = kmc.KMeansCluster(min_n_components, max_n_components)
146 | aw = awc.AWCluster(min_n_components, max_n_components)
147 | spectral = sc.SpectralCluster(min_n_components, max_n_components)
148 |
149 | all_data = []
150 | for i_run in range(n_runs):
151 | print("Run: "+str(i_run+1)+'/'+str(n_runs))
152 |
153 | if self.presampled_data is None:
154 | # Sample data
155 | data = self.toy_model_.sample(n_points)
156 | else:
157 | data = sampled_data[i_run]
158 |
159 | all_data.append(data)
160 |
161 | print('Shape data: ' + str(data.shape))
162 |
163 | # Set data in model and estimate GMM density
164 | gmm_FE.data_ = data
165 | coords, est_FE_landsc, FE_points = gmm_FE.landscape()
166 |
167 | # Get true cluster labels
168 | if self.presampled_data is None:
169 | if hasattr(self.toy_model_, "assign_cluster_labels"):
170 | self.true_labels_ = self.toy_model_.assign_cluster_labels(data)
171 | else:
172 | print('Setting true labels.')
173 | self.true_labels_, _ = self.true_FE_.cluster(data, np.zeros(data.shape[0]))
174 | else:
175 | self.true_labels_ = true_clustering[i_run]
176 |
177 | # Cluster data with different methods
178 | if n_microstates is None:
179 | self.FE_min_labels, _ = gmm_FE.cluster(data, FE_points, assign_transition_points=assign_transition_points)
180 | else:
181 | kmea = KMeans(n_clusters=n_microstates).fit(data[::2])
182 | microstate_centers = kmea.cluster_centers_
183 | self.FE_min_labels, _ = gmm_FE.cluster(microstate_centers, FE_points, data, assign_transition_points=assign_transition_points, unravel_grid=False)
184 |
185 | if all_methods:
186 | self.km_labels = km.cluster(data)
187 | self.aw_labels = aw.cluster(data)
188 | self.spectral_labels = spectral.cluster(data)
189 |
190 | # Score clustering using different scoring metrics
191 | # V-measure score
192 | self.cluster_score_vm_GMM_FE_min_[i_run] = self._score_clustering(self.FE_min_labels,'vm')
193 | print(self.cluster_score_vm_GMM_FE_min_[i_run])
194 | if all_methods:
195 | self.cluster_score_vm_GMM_[i_run] = self._score_clustering(gmm_FE.density_est_.predict(data),'vm')
196 | self.cluster_score_vm_kmeans_[i_run] = self._score_clustering(self.km_labels,'vm')
197 | self.cluster_score_vm_AW_[i_run] = self._score_clustering(self.aw_labels,'vm')
198 | self.cluster_score_vm_spectral_[i_run] = self._score_clustering(self.spectral_labels,'vm')
199 |
200 | # Adjusted MI
201 | self.cluster_score_ami_GMM_FE_min_[i_run] = self._score_clustering(self.FE_min_labels,'ami')
202 | self.cluster_score_ami_GMM_[i_run] = self._score_clustering(gmm_FE.density_est_.predict(data),'ami')
203 | self.cluster_score_ami_kmeans_[i_run] = self._score_clustering(self.km_labels,'ami')
204 | self.cluster_score_ami_AW_[i_run] = self._score_clustering(self.aw_labels,'ami')
205 | self.cluster_score_ami_spectral_[i_run] = self._score_clustering(self.spectral_labels,'ami')
206 |
207 | # Fowlkes Mallows
208 | self.cluster_score_fm_GMM_FE_min_[i_run] = self._score_clustering(self.FE_min_labels,'fm')
209 | self.cluster_score_fm_GMM_[i_run] = self._score_clustering(gmm_FE.density_est_.predict(data),'fm')
210 | self.cluster_score_fm_kmeans_[i_run] = self._score_clustering(self.km_labels,'fm')
211 | self.cluster_score_fm_AW_[i_run] = self._score_clustering(self.aw_labels,'fm')
212 | self.cluster_score_fm_spectral_[i_run] = self._score_clustering(self.spectral_labels,'fm')
213 |
214 | if save_data:
215 | if self.presampled_data is None:
216 | np.save('data_out/sampled_data_'+self.toy_model_.name+file_label+'.npy',all_data)
217 |
218 | if False: # disabled: flip to True to also save the FM and AMI scores
219 | np.save('data_out/cluster_score_fm_FE_min_'+self.toy_model_.name+file_label+'.npy',self.cluster_score_fm_GMM_FE_min_)
220 | np.save('data_out/cluster_score_fm_GMM_' + self.toy_model_.name + file_label+'.npy', self.cluster_score_fm_GMM_)
221 | np.save('data_out/cluster_score_fm_kmeans_' + self.toy_model_.name +file_label +'.npy', self.cluster_score_fm_kmeans_)
222 | np.save('data_out/cluster_score_fm_AW_' + self.toy_model_.name + file_label+'.npy', self.cluster_score_fm_AW_)
223 | np.save('data_out/cluster_score_fm_spectral_' + self.toy_model_.name + file_label+'.npy', self.cluster_score_fm_spectral_)
224 |
225 | np.save('data_out/cluster_score_ami_FE_min_'+self.toy_model_.name+file_label+'.npy',self.cluster_score_ami_GMM_FE_min_)
226 | np.save('data_out/cluster_score_ami_GMM_' + self.toy_model_.name + file_label+'.npy', self.cluster_score_ami_GMM_)
227 | np.save('data_out/cluster_score_ami_kmeans_' + self.toy_model_.name + file_label+'.npy', self.cluster_score_ami_kmeans_)
228 | np.save('data_out/cluster_score_ami_AW_' + self.toy_model_.name + file_label+'.npy', self.cluster_score_ami_AW_)
229 | np.save('data_out/cluster_score_ami_spectral_' + self.toy_model_.name + file_label+'.npy', self.cluster_score_ami_spectral_)
230 |
231 | np.save('data_out/cluster_score_vm_FE_min_'+self.toy_model_.name+file_label+'.npy',self.cluster_score_vm_GMM_FE_min_)
232 | if all_methods:
233 | np.save('data_out/cluster_score_vm_GMM_' + self.toy_model_.name + file_label+'.npy', self.cluster_score_vm_GMM_)
234 | np.save('data_out/cluster_score_vm_kmeans_' + self.toy_model_.name + file_label+'.npy', self.cluster_score_vm_kmeans_)
235 | np.save('data_out/cluster_score_vm_AW_' + self.toy_model_.name + file_label+'.npy', self.cluster_score_vm_AW_)
236 | np.save('data_out/cluster_score_vm_spectral_' + self.toy_model_.name + file_label+'.npy', self.cluster_score_vm_spectral_)
237 | return
238 |
239 | def _score_clustering(self, labels,metric='vm'):
240 | # Score clustering compared to true model
241 | if metric=='fm':
242 | score = fowlkes_mallows_score(self.true_labels_, labels)
243 | elif metric=='ami':
244 | score = adjusted_mutual_info_score(self.true_labels_, labels)
245 | else:
246 | score = v_measure_score(self.true_labels_[labels>0], labels[labels>0])
247 | return score
248 |
249 | def _FE_error(self, estimated_FE_landscape):
250 | error = np.mean(np.abs(estimated_FE_landscape-self.true_FE_.FE_landscape_))
251 | return error
252 |
253 | def _density_error(self, estimated_density):
254 | error = np.mean(np.abs(estimated_density - self.true_density_))
255 | return error
256 |
257 | def visualize(self):
258 | """
259 | Visualizing the quantities from estimations.
260 | :return:
261 | """
262 | import matplotlib.pyplot as plt
263 | ax1 = plt.figure(1).add_subplot(1,2,1)
264 | # Plot free energy error (assumes the FE error arrays have been computed and stored beforehand)
265 | ax1.plot(self.FE_errors_GMM_CV_, linewidth=4, label='GMM with cross-validation')
266 | ax1.plot(self.FE_errors_GMM_mix_models_, linewidth=4, label='GMM with mixture of models')
267 | plt.legend()
268 |
269 | # Plot density error
270 |
271 | # Plot log-likelihood of test set
272 |
273 | # Plot clustering score
274 |
275 | plt.show()
276 |
277 | return
278 |
--------------------------------------------------------------------------------
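
A usage sketch for `MethodEvaluator`; the `x_lims` format (one `[min, max]` pair per dimension) is an assumption, not confirmed by this file:

```python
from toy_models import MethodEvaluator

evaluator = MethodEvaluator(toy_model='GMM_2D',
                            x_lims=[[0.0, 1.0], [0.0, 1.0]],  # assumed format
                            n_grids=30)
evaluator.run_evaluation(n_runs=3, n_points=1000,
                         min_n_components=2, max_n_components=12, n_splits=3)
print(evaluator.cluster_score_vm_GMM_FE_min_)  # V-measure of InfleCS clustering, one entry per run
```
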
/toy_models/spectral_cluster.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.cluster import KMeans
3 | from scipy.sparse.linalg import eigsh
4 | from scipy.spatial.distance import cdist
5 | from sklearn.neighbors import KNeighborsClassifier
6 |
7 | class SpectralCluster():
8 |
9 | def __init__(self,n_min_clusters,n_max_clusters):
10 | self.n_min_clusters_ = n_min_clusters
11 | self.n_max_clusters_ = n_max_clusters
12 | self.labels_ = None
13 | self.classifier = KNeighborsClassifier(n_neighbors=3)
14 | self.name = 'spectral'
15 | return
16 |
17 | def transition_matrix(self, A):
18 | for i in range(A.shape[0]):
19 | A[i,i] = 0
20 | D = np.sum(A, axis=1)
21 | D_inv = np.diag(1 / np.sqrt(D))
22 | T = np.dot(D_inv, np.dot(A, D_inv))
23 | return T
24 |
25 | def get_n_clusters(self, transition_mat):
26 | print('Spectral embedding')
27 | eigenvalues, eigenvectors = np.linalg.eigh(transition_mat) # symmetric matrix: eigh returns real eigenvalues/vectors
28 |
29 | # Sort in descending order
30 | ind_sort = np.argsort(-eigenvalues)
31 | eigenvalues = eigenvalues[ind_sort]
32 |
33 | # Get largest eigengap
34 | eigengaps = -np.diff(eigenvalues)
35 | ind = np.argmax(eigengaps[self.n_min_clusters_:self.n_max_clusters_+1])
36 | n_clusters = ind+self.n_min_clusters_
37 | embedding = eigenvectors[:, ind_sort[0:n_clusters+1]]
38 | for i in range(embedding.shape[0]):
39 | embedding[i] /= np.linalg.norm(embedding[i])
40 |
41 | return n_clusters, embedding
42 |
43 | def cluster(self, x):
44 |
45 | # Set affinity matrix
46 | distances = cdist(x,x)
47 | distSort = np.sort(distances, axis=1)
48 | gamma = np.max(distSort[:,1])**2 # kernel bandwidth from the largest nearest-neighbor distance
49 |
50 | dist_squared = np.multiply(distances, distances)
51 |
52 | A = np.exp(-dist_squared/(2*gamma))
53 |
54 | print('Cluster data with spectral clustering')
55 | # Get transition matrix, select number of dimensions/clusters and project data
56 | transition_mat = self.transition_matrix(A)
57 | n_clusters, embedding = self.get_n_clusters(transition_mat)
58 |
59 | print('Cluster data with '+str(n_clusters)+' clusters.')
60 | km = KMeans(n_clusters=n_clusters).fit(embedding)
61 | self.labels_ = km.labels_+1
62 |
63 | # Train kNN classifier
64 | self.classifier.fit(x, self.labels_)
65 | print('Cluster labels: '+str(np.unique(self.labels_)))
66 | return self.labels_
67 |
68 | def assign_cluster_labels(self, x):
69 | return self.classifier.predict(x)
--------------------------------------------------------------------------------
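
A self-contained sketch of the eigengap heuristic that `get_n_clusters` relies on: with well-separated clusters, the symmetrically normalized affinity matrix has one eigenvalue near 1 per cluster, so the largest gap in the sorted spectrum suggests the cluster count.

```python
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.datasets import make_blobs

x, _ = make_blobs(n_samples=150, centers=3, cluster_std=0.3, random_state=0)
distances = cdist(x, x)
gamma = np.max(np.sort(distances, axis=1)[:, 1])**2  # bandwidth from nearest neighbors
A = np.exp(-distances**2 / (2*gamma))
np.fill_diagonal(A, 0)
D_inv = np.diag(1 / np.sqrt(A.sum(axis=1)))
T = D_inv @ A @ D_inv                                # symmetric normalized affinity
eigenvalues = np.sort(np.linalg.eigvalsh(T))[::-1]   # descending, real
print(np.argmax(-np.diff(eigenvalues)) + 1)          # largest eigengap; typically 3 here
```
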
/toy_models/toy_model_GMM_2D.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from free_energy_clustering.GMM import GaussianMixture
3 | from free_energy_clustering.GMM_free_energy import FreeEnergyClustering
4 |
5 | class GMM2D(GaussianMixture):
6 |
7 | def __init__(self):
8 | GaussianMixture.__init__(self, n_components=9)
9 |
10 | self.n_dims_ = 2
11 | self._set_parameters()
12 | self.name = 'GMM_2D'
13 | return
14 |
15 | def _set_cov(self, x11,x12,x22):
16 | tmp_cov = np.zeros((self.n_dims_, self.n_dims_))
17 |
18 | tmp_cov[0, 0] = x11
19 | tmp_cov[0, 1] = x12
20 | tmp_cov[1, 0] = x12
21 | tmp_cov[1, 1] = x22
22 | return tmp_cov
23 |
24 | def _set_parameters(self):
25 |
26 | self.means_ = np.asarray([ np.asarray([0.8,0.35]), np.asarray([0.45,0.52]), np.asarray([0.2,0.6]),
27 | np.asarray([0.05,0.8]), np.asarray([0.5,0.25]), np.asarray([0.5,0.25]),
28 | np.asarray([0.5, 0.25]), np.asarray([0.4, 0.34]), np.asarray([0.8,0.5])])
29 |
30 | covs = [np.zeros((2,2))]*self.n_components_
31 |
32 | covs[0] = self._set_cov(0.0021, 0.0005, 0.002)
33 | covs[1] = self._set_cov(0.001, 0.0009, 0.001)
34 | covs[2] = self._set_cov(0.002, 0.001, 0.002)
35 | covs[3] = self._set_cov(0.003, 0.0008, 0.003)
36 | covs[4] = self._set_cov(0.0012, 0.0005, 0.0012)
37 | covs[5] = self._set_cov(0.005, 0.0, 0.0015)
38 | covs[6] = self._set_cov(0.002, 0.0, 0.002)
39 | covs[7] = self._set_cov(0.0012, 0.00, 0.002)
40 | covs[8] = self._set_cov(0.001, 0.0009, 0.001)
41 |
42 | self.covariances_ = covs
43 |
44 | self.weights_ = np.asarray([0.15,0.1,0.3,0.25,0.1,0.05,0.1,0.05,0.4])
45 | self.weights_ /= np.sum(self.weights_)
46 |
47 | return
48 |
--------------------------------------------------------------------------------
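
A usage sketch that feeds samples from `GMM2D` into the free energy clustering pipeline (constructor arguments mirror how `FreeEnergyClustering` is called in `evaluate_toy_models.py`):

```python
from toy_models import GMM2D
from free_energy_clustering.GMM_free_energy import FreeEnergyClustering

toy = GMM2D()
data = toy.sample(2000)                   # [2000 x 2] draws from the 9-component mixture
fec = FreeEnergyClustering(data, min_n_components=2, max_n_components=12)
coords, FE_landscape, FE_points = fec.landscape()
labels, _ = fec.cluster(data, FE_points)  # InfleCS core-state labels
```
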
/toy_models/toy_model_blobs.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn import datasets
3 | from free_energy_clustering.GMM import GaussianMixture
4 |
5 | class Blobs(GaussianMixture):
6 | def __init__(self,n_components=3,n_dims=2,noise=0):
7 | GaussianMixture.__init__(self, n_components=n_components)
8 | self.labels_ = None
9 | self.data_ = None
10 | self.n_features_ = n_dims
11 | self.noise_level_ = noise
12 | self.name = 'blobs'
13 | return
14 |
15 | def sample(self, n_points):
16 | self.data_, self.labels_ = datasets.make_blobs(n_samples=n_points, n_features=self.n_features_)
17 | print('Sampled blobs data: ' + str(self.data_.shape))
18 |
19 | self.set_density()
20 |
21 | # Sample noise uniformly over space
22 | n_noise_points = int(self.noise_level_ * self.data_.shape[0])
23 | data_noise = np.random.uniform(self.data_.min(axis=0), self.data_.max(axis=0),
24 | size=(n_noise_points, self.data_.shape[1]))
25 |
26 | self.data_[0:n_noise_points] = data_noise # make_blobs shuffles samples, so the overwritten rows span all clusters
27 | return self.data_
28 |
29 | def set_density(self):
30 |
31 | unique_labels = np.unique(self.labels_)
32 |
33 | self.weights_ = np.zeros(self.n_components_)
34 | self.means_ = np.zeros((self.n_components_,self.n_features_))
35 | self.covariances_ = [np.zeros((self.n_features_,self.n_features_))]*self.n_components_
36 | for label in unique_labels:
37 | self.weights_[label] = np.mean(self.labels_==label)
38 | self.means_[label] = np.mean(self.data_[self.labels_==label],axis=0)
39 | self.covariances_[label] = np.cov(self.data_[self.labels_==label].T)
40 | return
41 |
42 | def assign_cluster_labels(self, x):
43 | return self.predict(x)
44 |
45 |
--------------------------------------------------------------------------------
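
A usage sketch for `Blobs`; note that `sample` draws new random blob centers on every call, so densities and labels are only consistent within a single sample:

```python
from toy_models import Blobs

blobs = Blobs(n_components=3, n_dims=2, noise=0.1)
data = blobs.sample(1000)                   # the first 100 rows are overwritten with uniform noise
labels = blobs.assign_cluster_labels(data)  # GMM posterior assignment
```
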
/toy_models/toy_model_moons.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn import datasets
3 | from scipy.spatial.distance import cdist
4 | from sklearn.neighbors import KNeighborsClassifier
5 |
6 | class Moons():
7 | def __init__(self):
8 | self.labels_ = None
9 | self.data_ = None
10 | self.classifier = KNeighborsClassifier(n_neighbors=3)
11 | self.name = 'moons'
12 | return
13 |
14 | def sample(self, n_points):
15 | self.data_, self.labels_ = datasets.make_moons(n_samples=n_points, noise=.05)
16 | self.classifier.fit(self.data_,self.labels_+1)
17 | return self.data_
18 |
19 | def density(self,x):
20 | min_dist = cdist(x,self.data_[::10]).min(axis=1)
21 | density = np.zeros(x.shape[0])
22 | density[min_dist < 5e-2] = 0.5
23 | density /= density.sum()
24 | return density
25 |
26 | def assign_cluster_labels(self, x):
27 | return self.classifier.predict(x)
28 |
29 |
--------------------------------------------------------------------------------
/toy_models/toy_model_multiple_GMMs.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from free_energy_clustering.GMM import GaussianMixture
3 |
4 | class MultipleGMMs(GaussianMixture):
5 | def __init__(self):
6 |
7 | GaussianMixture.__init__(self, n_components=10)
8 |
9 | self.name = 'mGMMs'
10 | self.n_dims_ = 2
11 | self._set_parameters()
12 | return
13 |
14 | def sample_multi_GMM(self, n_points):
15 | """
16 | Sample from stacked GMMs.
17 | """
18 | sampled_points = np.zeros((n_points, self.n_dims_))
19 | prob_model = np.cumsum(self.model_weights_)
20 |
21 | r = np.random.uniform(size=n_points)
22 | is_point_sampled = np.zeros((n_points), dtype=int)
23 |
24 | for i_point in range(n_points):
25 | for i_model in range(self.n_models_):
26 | if r[i_point] <= prob_model[i_model]:
27 | is_point_sampled[i_point] = 1 # mark this point as sampled
28 | sampled_points[i_point,:] = self.GMM_list_[i_model].sample(1)
29 | break
30 | print('Sampled: '+str(is_point_sampled.sum())+'/'+str(n_points))
31 | return sampled_points
32 |
33 | def _set_cov(self, x11,x12,x22):
34 | tmp_cov = np.zeros((self.n_dims_, self.n_dims_))
35 |
36 | tmp_cov[0, 0] = x11
37 | tmp_cov[0, 1] = x12
38 | tmp_cov[1, 0] = x12
39 | tmp_cov[1, 1] = x22
40 | return tmp_cov
41 |
42 | def _set_GMM1(self):
43 | n_components = 4
44 | means = np.asarray([np.asarray([0.5,0.27]), np.asarray([0.5, 0.27]), np.asarray([0.5, 0.27]),
45 | np.asarray([0.5, 0.27])])
46 |
47 | covs = [np.zeros((2, 2))] * n_components
48 |
49 | covs[0] = self._set_cov(0.0021, 0.0005, 0.002)
50 | covs[1] = self._set_cov(0.001, 0.0009, 0.001)
51 | covs[2] = self._set_cov(0.002, 0.001, 0.002)
52 | covs[3] = self._set_cov(0.003, 0.0008, 0.003)
53 |
54 | weights = np.asarray([0.5, 0.3,0.3,0.3])
55 | weights /= weights.sum()
56 | return means, covs, weights
57 |
58 | def _set_GMM2(self):
59 |
60 | n_components = 3
61 | means = np.asarray([np.asarray([0.45, 0.5]), np.asarray([0.45, 0.5]), np.asarray([0.45, 0.5])])
62 |
63 | covs = [np.zeros((2, 2))] * n_components
64 |
65 | covs[0] = self._set_cov(0.01, 0.0, 0.0001)
66 | covs[1] = self._set_cov(0.0003, 0.0003, 0.015)
67 | covs[2] = self._set_cov(0.0012, 0.00, 0.002)
68 |
69 | weights = np.asarray([0.5, 0.2, 0.3])
70 | weights /= weights.sum()
71 | return means, covs, weights
72 |
73 | def _set_GMM3(self):
74 | n_components = 3
75 | means = np.asarray(
76 | [np.asarray([0.05, 0.8]), np.asarray([0.05, 0.8]), np.asarray([0.05, 0.8])])
77 |
78 | covs = [np.zeros((2, 2))] * n_components
79 |
80 | covs[0] = self._set_cov(0.0021, 0.0005, 0.002)
81 | covs[1] = self._set_cov(0.001, 0.0009, 0.001)
82 | covs[2] = self._set_cov(0.002, 0.001, 0.002)
83 |
84 | weights = np.ones(n_components)
85 | weights /= weights.sum()
86 | return means, covs, weights
87 |
88 | def _set_GMM12(self):
89 | n_components = 9
90 | means = np.asarray([ np.asarray([0.8,0.35]), np.asarray([0.45,0.5]), np.asarray([0.2,0.6]),
91 | np.asarray([0.05,0.8]), np.asarray([0.5,0.27]), np.asarray([0.5,0.27]),
92 | np.asarray([0.5, 0.27]), np.asarray([0.5, 0.4]), np.asarray([0.8,0.5])])
93 |
94 | covs = [np.zeros((2,2))]*n_components
95 |
96 | covs[0] = self._set_cov(0.0021, 0.0005, 0.002)
97 | covs[1] = self._set_cov(0.001, 0.0009, 0.001)
98 | covs[2] = self._set_cov(0.002, 0.001, 0.002)
99 | covs[3] = self._set_cov(0.003, 0.0008, 0.003)
100 | covs[4] = self._set_cov(0.0012, 0.0005, 0.0012)
101 | covs[5] = self._set_cov(0.01, 0.0, 0.0015)
102 | covs[6] = self._set_cov(0.005, 0.001, 0.02)
103 | covs[7] = self._set_cov(0.002, -0.0001, 0.002)
104 | covs[8] = self._set_cov(0.001, 0.0009, 0.001)
105 |
106 | weights = np.asarray([0.15,0.1,0.5,0.25,0.1,0.05,0.1,0.05,0.4])
107 | weights /= weights.sum()
108 | return means, covs, weights
109 |
110 | def _set_GMM22(self):
111 |
112 | n_components = 3
113 | means = np.asarray([ np.asarray([0.6,0.35]), np.asarray([0.45,0.5]), np.asarray([0.19,0.62])])
114 |
115 | covs = [np.zeros((2,2))]*n_components
116 |
117 | covs[0] = self._set_cov(0.003, 0.0008, 0.003)
118 | covs[1] = self._set_cov(0.005, 0.0, 0.0015)
119 | covs[2] = self._set_cov(0.0012, 0.00, 0.002)
120 |
121 | weights = np.asarray([0.5,0.2,0.3])
122 | weights /= weights.sum()
123 | return means, covs, weights
124 |
125 |
126 | def _set_GMM32(self):
127 | n_components = 5
128 | means = np.asarray([ np.asarray([0.05,0.8]),np.asarray([0.05,0.8]), np.asarray([0.52,0.25]), np.asarray([0.52,0.27]), np.asarray([0.45, 0.5])])
129 |
130 | covs = [np.zeros((2,2))]*n_components
131 |
132 | covs[0] = self._set_cov(0.0021, 0.0005, 0.002)
133 | covs[1] = self._set_cov(0.001, 0.0009, 0.001)
134 | covs[2] = self._set_cov(0.002, 0.001, 0.002)
135 | covs[3] = self._set_cov(0.003, 0.0008, 0.003)
136 | covs[4] = self._set_cov(0.0012, 0.0005, 0.0012)
137 |
138 | weights = np.asarray([0.15,0.25,0.10,0.3,0.2])
139 | weights /= weights.sum()
140 | return means, covs, weights
141 |
142 | def assign_cluster_labels(self,x):
143 | gamma = self._expectation(x)
144 | labels = np.argmax(gamma, axis=0)+1
145 | return labels
146 |
147 | def _expectation(self, x):
148 | n_points = x.shape[0]
149 | gamma = np.zeros((self.n_models_, n_points))
150 |
151 | for i_model in range(self.n_models_):
152 | gamma[i_model, :] = self.model_weights_[i_model] *self.GMM_list_[i_model].density(x)
153 | gamma /= np.sum(gamma, axis=0)
154 | return gamma
155 |
156 | def _set_parameters(self):
157 | n_components = 10
158 | means = np.asarray([np.asarray([0.5, 0.27]), np.asarray([0.5, 0.27]), np.asarray([0.5, 0.27]),
159 | np.asarray([0.5, 0.27]), np.asarray([0.40, 0.5]), np.asarray([0.4, 0.5]),
160 | np.asarray([0.4, 0.5]),np.asarray([0.05, 0.8]), np.asarray([0.05, 0.8]),
161 | np.asarray([0.05, 0.8])])
162 |
163 | covs = [np.zeros((2, 2))] * n_components
164 |
165 | covs[0] = self._set_cov(0.0021, 0.0005, 0.002)
166 | covs[1] = self._set_cov(0.001, 0.0009, 0.001)
167 | covs[2] = self._set_cov(0.002, 0.001, 0.002)
168 | covs[3] = self._set_cov(0.003, 0.0008, 0.003)
169 |
170 | weights1 = np.asarray([0.5, 0.3, 0.3, 0.3])
171 | weights1 /= weights1.sum()
172 |
173 | covs[4] = self._set_cov(0.01, 0.0, 0.0001)
174 | covs[5] = self._set_cov(0.0003, 0.0000, 0.0003)
175 | covs[6] = self._set_cov(0.0012, 0.00, 0.002)
176 |
177 | weights2 = np.asarray([0.5, 0.2, 0.3])
178 | weights2 /= weights2.sum()
179 |
180 | covs[7] = self._set_cov(0.0021, 0.0005, 0.002)
181 | covs[8] = self._set_cov(0.001, 0.0009, 0.001)
182 | covs[9] = self._set_cov(0.002, 0.001, 0.002)
183 |
184 | weights3 = np.ones(3)
185 | weights3 /= weights3.sum()
186 |
187 | weights = np.ravel(np.concatenate((0.25*weights1, 0.5*weights2, 0.25*weights3)))
188 |
189 | self.means_ = means
190 | self.covariances_ = covs
191 | self.weights_ = weights
192 |
193 | self.GMM_list_ = []
194 | self.GMM_list_.append(GaussianMixture(n_components=4))
195 | self.GMM_list_[-1].means_ = means[0:4,:]
196 | self.GMM_list_[-1].covariances_ = covs[0:4]
197 | self.GMM_list_[-1].weights_ = weights1
198 |
199 | self.GMM_list_.append(GaussianMixture(n_components=3))
200 | self.GMM_list_[-1].means_ = means[4:7,:]
201 | self.GMM_list_[-1].covariances_ = covs[4:7]
202 | self.GMM_list_[-1].weights_ = weights2
203 |
204 | self.GMM_list_.append(GaussianMixture(n_components=3))
205 | self.GMM_list_[-1].means_ = means[7::, :]
206 | self.GMM_list_[-1].covariances_ = covs[7::]
207 | self.GMM_list_[-1].weights_ = weights3
208 |
209 | self.model_weights_ = np.asarray([0.25,0.5,0.25])
210 | self.n_models_ = 3
211 | return
212 |
--------------------------------------------------------------------------------
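
The loop in `sample_multi_GMM` draws, for each point, a model index from the categorical distribution over `model_weights_` and then one sample from that model. A vectorized sketch of the same idea with stand-in one-dimensional Gaussians:

```python
import numpy as np

rng = np.random.default_rng(0)
model_weights = np.array([0.25, 0.5, 0.25])  # mirrors model_weights_
means = np.array([-2.0, 0.0, 2.0])           # stand-in for GMM_list_
choice = rng.choice(len(means), size=1000, p=model_weights)  # categorical model draw
samples = rng.normal(loc=means[choice], scale=0.3)           # then sample the chosen model
```
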
/toy_models/toy_model_nonlinear_GMM_2D.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from free_energy_clustering.GMM import GaussianMixture
3 | from free_energy_clustering.GMM_free_energy import FreeEnergyClustering
4 |
5 | class GMM2dNonlinear():
6 |
7 | def __init__(self, transform_data=True):
8 |
9 | self.transform_data=transform_data
10 | self.n_dims_ = 2
11 | self.name = 'nonlinear_GMM_2D'
12 | n_components = 3
13 | self.GMM = GaussianMixture(n_components=n_components)
14 | self.n_components_ = n_components
15 | self._set_parameters()
16 | return
17 |
18 | def transform(self,x):
19 | x = np.asarray([x[:,0], x[:,1]+(2.0*x[:,0]**3)]).T
20 | return x
21 |
22 | def detransform(self,x):
23 | x = np.asarray([x[:,0], x[:,1]-(2.0*x[:,0]**3)]).T
24 | return x
25 |
26 | def sample(self, n_points):
27 | x = self.GMM.sample(n_points)
28 | if self.transform_data:
29 | x = self.transform(x)
30 | return x
31 |
32 | def assign_cluster_labels(self,x):
33 | if self.transform_data:
34 | x = self.detransform(x)
35 | labels = self.GMM.predict(x)+1
36 | #labels[labels==3] = 2
37 | return labels
38 |
39 | def density(self, x):
40 | if self.transform_data:
41 | x = self.detransform(x)
42 | return self.GMM.density(x)
43 |
44 | def _set_cov(self, x11,x12,x22):
45 | tmp_cov = np.zeros((self.n_dims_, self.n_dims_))
46 |
47 | tmp_cov[0, 0] = x11
48 | tmp_cov[0, 1] = x12
49 | tmp_cov[1, 0] = x12
50 | tmp_cov[1, 1] = x22
51 | return tmp_cov
52 |
53 | def _set_parameters(self):
54 |
55 | #self.GMM.means_ = np.asarray([np.asarray([0.0,0.6]), np.asarray([0.3,0.25]), np.asarray([0.3,0.25])])
56 | self.GMM.means_ = np.asarray([np.asarray([-0.8, 0.6]), np.asarray([-0.5, 0.25]), np.asarray([-0.6, 0.25])])
57 | covs = [np.zeros((2,2))]* self.GMM.n_components_
58 |
59 | covs[0] = self._set_cov(0.01, 0.005, 0.05)
60 | covs[1] = self._set_cov(0.05, -0.01, 0.015)
61 | covs[2] = self._set_cov(0.001, 0.000, 0.01)
62 | #covs[1] = self._set_cov(0.05, -0.01, 0.015)
63 | #covs[2] = self._set_cov(0.001, 0.000, 0.01)
64 |
65 | self.GMM.covariances_ = covs
66 |
67 | self.GMM.weights_ = np.asarray([0.8,0.08,0.08])
68 | self.GMM.weights_ /= np.sum(self.GMM.weights_)
69 |
70 | return
71 |
--------------------------------------------------------------------------------
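
A quick round-trip check that `transform` and `detransform` invert each other, which is what lets `density` and `assign_cluster_labels` evaluate the underlying GMM in the untransformed space:

```python
import numpy as np
from toy_models import GMM2dNonlinear

toy = GMM2dNonlinear()
x = toy.GMM.sample(500)                              # points in the untransformed space
assert np.allclose(toy.detransform(toy.transform(x)), x)
labels = toy.assign_cluster_labels(toy.sample(500))  # labels for transformed samples
```
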