├── gmm.gif
├── ll.gif
├── combined.gif
├── .gitattributes
├── __pycache__
│   └── GMM.cpython-37.pyc
├── CITATION.cff
├── README.md
└── GMM.py

/gmm.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mr-easy/GMM-EM-Python/HEAD/gmm.gif
--------------------------------------------------------------------------------
/ll.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mr-easy/GMM-EM-Python/HEAD/ll.gif
--------------------------------------------------------------------------------
/combined.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mr-easy/GMM-EM-Python/HEAD/combined.gif
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/__pycache__/GMM.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mr-easy/GMM-EM-Python/HEAD/__pycache__/GMM.cpython-37.pyc
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
cff-version: 1.2.0
message: If you use this software, please cite it using these metadata.
title: GMM-EM-Python
abstract: Python implementation of the EM algorithm for GMMs, with visualization for the 2D case.
authors:
- name: Rishabh Gupta
  email: rishabhg1997@gmail.com
version: 1.0.0
date-released: 2021-06-17
repository-code: "https://github.com/mr-easy/GMM-EM-Python"
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# GMM-EM-Python
Python implementation of the Expectation-Maximization (EM) algorithm for Gaussian Mixture Models (GMMs).

The code for the GMM is in [GMM.py](GMM.py) and is documented with notes on how to use it on your own data. For an example with visualization on a 2D set of points, see the notebook [EM_for_2D_GMM.ipynb](EM_for_2D_GMM.ipynb).

![](combined.gif)

## Requirements:
- NumPy
- SciPy
- Matplotlib
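
## Usage:
A minimal sketch of a typical fit loop (the data, cluster count, and iteration count below are hypothetical, chosen only for illustration; see the notebook above for a full walkthrough):

```python
import numpy as np
from GMM import GMM

X = np.random.randn(500, 2)   # hypothetical 2D data

gmm = GMM(k=3, dim=2)         # model with 3 clusters in 2 dimensions
gmm.init_em(X)                # bind the data to the model
for it in range(30):          # alternate E- and M-steps
    gmm.e_step()
    gmm.m_step()
    print("Iteration {}: log-likelihood = {:.2f}".format(it + 1, gmm.log_likelihood(X)))
```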
## Documentation:
**_class_ `GMM`:**
- **Parameters:**
    - `k`: Number of clusters
    - `dim`: Dimension of the data
    - `mu`: Means of all clusters
    - `sigma`: Covariance matrices of all clusters
    - `pi`: Mixing proportion of each cluster in P(x)
    - `colors`: Color of each cluster for plotting
    - `z`: Latent variable Z (defined during EM init)
    - `num_points`: Number of data points (defined during EM init)
    - `data`: Dataset (defined during EM init)
- **Functions:**
    - `__init__(self, k, dim, init_mu=None, init_sigma=None, init_pi=None, colors=None)`

      Defines a model with a known number of clusters and dimension.

      input:
        - `k`: Number of Gaussian clusters
        - `dim`: Dimension of the data
        - `init_mu`: initial means of the clusters `(k, dim)`;
          (default) random from uniform[-10, 10]
        - `init_sigma`: initial covariance matrices of the clusters `(k, dim, dim)`;
          (default) identity matrix for each cluster
        - `init_pi`: initial cluster weights `(k,)`;
          (default) equal weight for every cluster, i.e. 1/k
        - `colors`: Color value for plotting each cluster `(k, 3)`;
          (default) random from uniform[0, 1]

    - `init_em(self, X)`

      Initialization for the EM algorithm.

      input:
        - `X`: data `(batch_size, dim)`

    - `e_step(self)`

      E-step of the EM algorithm.

    - `m_step(self)`

      M-step of the EM algorithm.

    - `log_likelihood(self, X)`

      Compute the log-likelihood of X under the current parameters.

      input:
        - `X`: Data `(batch_size, dim)`

      output:
        - log-likelihood of X: `Sum_n log( Sum_k pi_k * N( X_n | mu_k, sigma_k ) )`

    - `plot_gaussian(self, mean, cov, ax, n_std=3.0, facecolor='none', **kwargs)`

      Utility function to plot one Gaussian from its mean and covariance.

    - `draw(self, ax, n_std=2.0, facecolor='none', **kwargs)`

      Function to draw the Gaussians (only for two-dimensional datasets).

## TODO:
- Handle the singular covariance matrix problem.
- Better initialization methods.
--------------------------------------------------------------------------------
/GMM.py:
--------------------------------------------------------------------------------
import numpy as np
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms
from scipy.stats import multivariate_normal

class GMM():
    def __init__(self, k, dim, init_mu=None, init_sigma=None, init_pi=None, colors=None):
        '''
        Define a model with a known number of clusters and dimension.
        input:
            - k: Number of Gaussian clusters
            - dim: Dimension of the data
            - init_mu: initial means of the clusters (k, dim)
              (default) random from uniform[-10, 10]
            - init_sigma: initial covariance matrices of the clusters (k, dim, dim)
              (default) identity matrix for each cluster
            - init_pi: initial cluster weights (k,)
              (default) equal weight for every cluster, i.e. 1/k
            - colors: Color value for plotting each cluster (k, 3)
              (default) random from uniform[0, 1]
        '''
        self.k = k
        self.dim = dim
        if init_mu is None:
            init_mu = np.random.rand(k, dim)*20 - 10
        self.mu = init_mu
        if init_sigma is None:
            init_sigma = np.zeros((k, dim, dim))
            for i in range(k):
                init_sigma[i] = np.eye(dim)
        self.sigma = init_sigma
        if init_pi is None:
            init_pi = np.ones(self.k)/self.k
        self.pi = init_pi
        if colors is None:
            colors = np.random.rand(k, 3)
        self.colors = colors

    def init_em(self, X):
        '''
        Initialization for the EM algorithm.
        input:
            - X: data (batch_size, dim)
        '''
        self.data = X
        self.num_points = X.shape[0]
        self.z = np.zeros((self.num_points, self.k))  # responsibilities z[n, k]

    def e_step(self):
        '''
        E-step of the EM algorithm: compute the responsibilities
        z[n, k] proportional to pi_k * N(x_n | mu_k, sigma_k), normalized over k.
        '''
        for i in range(self.k):
            self.z[:, i] = self.pi[i] * multivariate_normal.pdf(self.data, mean=self.mu[i], cov=self.sigma[i])
        self.z /= self.z.sum(axis=1, keepdims=True)
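
    # The M-step below applies the standard closed-form updates, with
    # N_k = sum_n z[n, k]:
    #   pi_k    = N_k / N
    #   mu_k    = (1 / N_k) * sum_n z[n, k] * x_n
    #   sigma_k = (1 / N_k) * sum_n z[n, k] * (x_n - mu_k)(x_n - mu_k)^T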
    def m_step(self):
        '''
        M-step of the EM algorithm.
        '''
        sum_z = self.z.sum(axis=0)  # N_k for each cluster
        self.pi = sum_z / self.num_points
        self.mu = np.matmul(self.z.T, self.data)
        self.mu /= sum_z[:, None]
        for i in range(self.k):
            j = np.expand_dims(self.data, axis=1) - self.mu[i]  # (N, 1, dim) deviations
            s = np.matmul(j.transpose([0, 2, 1]), j)            # (N, dim, dim) outer products
            self.sigma[i] = np.matmul(s.transpose(1, 2, 0), self.z[:, i])
            self.sigma[i] /= sum_z[i]

    def log_likelihood(self, X):
        '''
        Compute the log-likelihood of X under the current parameters.
        input:
            - X: Data (batch_size, dim)
        output:
            - log-likelihood of X: Sum_n log( Sum_k pi_k * N( X_n | mu_k, sigma_k ) )
        '''
        ll = []
        for d in X:
            tot = 0
            for i in range(self.k):
                tot += self.pi[i] * multivariate_normal.pdf(d, mean=self.mu[i], cov=self.sigma[i])
            ll.append(np.log(tot))
        return np.sum(ll)

    def plot_gaussian(self, mean, cov, ax, n_std=3.0, facecolor='none', **kwargs):
        '''
        Utility function to plot one Gaussian from its mean and covariance.
        '''
        # The Pearson correlation sets the unit ellipse's shape before scaling.
        pearson = cov[0, 1]/np.sqrt(cov[0, 0] * cov[1, 1])
        ell_radius_x = np.sqrt(1 + pearson)
        ell_radius_y = np.sqrt(1 - pearson)
        ellipse = Ellipse((0, 0),
                          width=ell_radius_x * 2,
                          height=ell_radius_y * 2,
                          facecolor=facecolor,
                          **kwargs)
        scale_x = np.sqrt(cov[0, 0]) * n_std
        mean_x = mean[0]
        scale_y = np.sqrt(cov[1, 1]) * n_std
        mean_y = mean[1]
        transf = transforms.Affine2D() \
            .rotate_deg(45) \
            .scale(scale_x, scale_y) \
            .translate(mean_x, mean_y)
        ellipse.set_transform(transf + ax.transData)
        return ax.add_patch(ellipse)

    def draw(self, ax, n_std=2.0, facecolor='none', **kwargs):
        '''
        Function to draw the Gaussians.
        Note: Only for two-dimensional datasets.
        '''
        if self.dim != 2:
            print("Drawing available only for 2D case.")
            return
        for i in range(self.k):
            self.plot_gaussian(self.mu[i], self.sigma[i], ax, n_std=n_std, edgecolor=self.colors[i], **kwargs)
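
# Illustrative usage sketch (an assumption, not part of the original module):
# fit the model to hypothetical synthetic 2D data for a fixed number of EM
# iterations, then report the log-likelihood and draw the fitted Gaussians.
if __name__ == "__main__":
    import matplotlib.pyplot as plt
    X = np.vstack([np.random.randn(100, 2) + [4, 4],    # synthetic cluster 1
                   np.random.randn(100, 2) - [4, 4]])   # synthetic cluster 2
    gmm = GMM(k=2, dim=2)
    gmm.init_em(X)
    for _ in range(30):
        gmm.e_step()
        gmm.m_step()
    print("Final log-likelihood:", gmm.log_likelihood(X))
    fig, ax = plt.subplots()
    ax.scatter(X[:, 0], X[:, 1], s=5)
    gmm.draw(ax)
    plt.show()
--------------------------------------------------------------------------------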