├── wNMF
│   ├── __init__.py
│   └── wNMF.py
├── MANIFEST.in
├── .gitignore
├── LICENSE
├── setup.py
└── README.md

--------------------------------------------------------------------------------
/wNMF/__init__.py:
--------------------------------------------------------------------------------
1 | from .wNMF import wNMF

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include LICENSE

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # Data
7 | data/
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 | 
28 | # Installer logs
29 | pip-log.txt
30 | pip-delete-this-directory.txt
31 | 
32 | # Jupyter Notebook
33 | .ipynb_checkpoints
34 | 
35 | # pyenv
36 | .python-version
37 | 
38 | # mypy
39 | .mypy_cache/
40 | .vscode
41 | .DS_Store
42 | __pycache__
43 | **/reporting
44 | 

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2019-present, Scott Nanda
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf8
3 | import setuptools
4 | from os import path
5 | 
6 | 
7 | 
8 | ## Open README
9 | here = path.abspath(path.dirname(__file__))
10 | readme_path = path.join(here, 'README.md')
11 | with open(readme_path, 'r') as f:
12 |     readme = f.read()
13 | 
14 | setuptools.setup(
15 |     name='wNMF',
16 |     version='0.0.42',
17 |     long_description=readme,
18 |     description='wNMF: Weighted Non-Negative Matrix Factorization',
19 |     long_description_content_type='text/markdown',
20 |     author='SN',
21 |     author_email='scottnanda@gmail.com',
22 |     url='https://github.com/asn32/weighted-nmf',
23 |     license='MIT License',
24 |     packages=['wNMF'],
25 |     python_requires='>=3.6',
26 |     install_requires=['numpy>=1.13'],
27 |     include_package_data=False,
28 |     classifiers=[
29 |         "Programming Language :: Python",
30 |         "Programming Language :: Python :: 3",
31 |         "Programming Language :: Python :: 3.6",
32 |         "Topic :: Scientific/Engineering :: Bio-Informatics",
33 |         "Topic :: Scientific/Engineering :: Mathematics"
34 |     ]
35 | )

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # wNMF: Weighted Non-Negative Matrix Factorization
2 | 
3 | ## About
4 | `wNMF` implements a simple version of Non-Negative Matrix Factorization (NMF) that utilizes a weight matrix to weight the importance of each feature in each sample of the data matrix to be factorized.
5 | 
6 | `wNMF` is easy to use because it behaves like an `sklearn.decomposition` model, but it also allows for multiple fitting attempts.
7 | 
8 | More information about the modified multiplicative update algorithm used can be found here:
9 | [Blondel, Vincent & Ho, Ngoc-Diep & Van Dooren, Paul. (2007). Weighted Nonnegative Matrix Factorization and Face Feature Extraction](https://pdfs.semanticscholar.org/e20e/98642009f13686a540c193fdbce2d509c3b8.pdf)
10 | 
11 | `wNMF` specifically implements solutions for determining the decomposed matrices U and V when minimizing either the Frobenius norm, `0.5 * Σ W ⊗ (X - UV)²`, or the Kullback-Leibler divergence, `Σ W ⊗ (X log(X / UV) - X + UV)`, where `⊗` denotes element-wise multiplication.
12 | 
13 | **Useful Links**
14 | - [Source on Github](https://github.com/asn32/weighted-nmf)
15 | - [Package on PyPI](https://pypi.org/project/wNMF/)
16 | 
17 | ## Installation
18 | This package is available on PyPI and can be installed with `pip`:
19 | ```bash
20 | $ pip install wNMF
21 | ```
22 | 
23 | Alternatively, download the source from [github](https://github.com/asn32/weighted-nmf) and install:
24 | ```bash
25 | $ git clone https://github.com/asn32/weighted-nmf.git
26 | $ cd weighted-nmf
27 | $ python3 setup.py install --user
28 | ```
29 | 
30 | ## Usage
31 | `wNMF` is a Python library that can be imported.
32 | ```python
33 | from wNMF import wNMF
34 | ```
35 | And it can be used like an `sklearn.decomposition` model.
36 | 
37 | First create an instance of the `wNMF` model by setting the number of components.
38 | 
39 | Other parameters can be set too, such as the loss function, the maximum number of iterations, and whether to track the error at every iteration of each run.
40 | ```python
41 | import numpy as np
42 | 
43 | ## Mock data: a 100x100 data matrix, reduced to 25 dimensions
44 | n = 100
45 | features = 100
46 | components = 25
47 | X = 100*np.random.uniform(size=n*features).reshape(features,n)
48 | W = np.ones_like(X)
49 | 
50 | ## Define the model / fit
51 | model = wNMF(n_components=components,
52 |             beta_loss='kullback-leibler',
53 |             max_iter=1000,
54 |             track_error=True)
55 | ```
56 | 
57 | Then, fit the model to the data using the `fit` or `fit_transform` methods.
58 | ```python
59 | fit = model.fit(X=X,W=W,n_run=5)
60 | ```
61 | 
62 | After the fit is complete, explore the fit quality by examining the decomposed matrices and / or the overall error.
63 | ```python
64 | ## Get the best solution
65 | lowest_error = fit.err
66 | best_V = fit.V
67 | best_U = fit.U
68 | 
69 | ## Or look at all the solutions from the 5 runs in this example
70 | all_Vs = fit.V_all
71 | ```
72 | 
73 | ## License
74 | wNMF is MIT-licensed
75 | 
76 | ## Disclaimer
77 | `wNMF` is provided with no guarantees
78 | 

--------------------------------------------------------------------------------
/wNMF/wNMF.py:
--------------------------------------------------------------------------------
1 | '''
2 | Non-negative matrix factorization (NMF) using the "Weighted-NMF" algorithm (wNMF).
3 | See "Weighted Nonnegative Matrix Factorization and Face Feature Extraction", Blondel, Ho and Van Dooren, 2007
4 | 
5 | NMF decomposes a matrix X into two matrices U,V with a shared internal dimension, representing a reduced-dimension
6 | latent space.
7 | 
8 | X = UV
9 | 
10 | Columns of U are the basis vectors for this latent space, and columns of V contain the set of coefficients required
11 | to represent each sample in X as a linear combination of the basis vectors in U.
12 | 
13 | Weighted NMF:
14 | Blondel, Ho and Van Dooren introduce a weight matrix W that weights the importance of each feature (row) in
15 | each sample (column) of the data matrix X; U and V are then chosen to minimize the entrywise-weighted error W ⊗ (X - UV), where ⊗ is the Hadamard product.
16 | To determine U and V, given W and X, the authors develop a variation of the Multiplicative Update algorithm
17 | proposed by Lee and Seung (1999, 2001) to minimize the Kullback-Leibler divergence or,
18 | alternatively, the Frobenius norm. Variants of the algorithm that solve the weighted-NMF problem by minimizing
19 | either KL-divergence or the Frobenius norm are provided. See the reference below.
20 | 
21 | '''
22 | import numpy as np
23 | 
24 | class wNMF:
25 |     '''
26 |     Params
27 |     ----------
28 |     X : numpy.ndarray or coercible array-like object, float64
29 |         A data matrix to be factorized, with dimensions (n_features, n_samples).
30 |         NOTE this is different from the SKLearn NMF function, which expects X to be (n_samples,n_features)
31 | 
32 |     W : numpy.ndarray or coercible array-like object, float64
33 |         A weight matrix of same dimension as X, which weights each entry in X. Generally expected
34 |         to be values ranging from 0 to 1, but can contain any non-negative entries.
35 | 
36 |     n_components : int
37 |         The rank of the decomposition of X, alternatively the reduced dimension of the factorization.
38 | 
39 |     init : str, default "random"
40 |         The initialization strategy for matrices U and V. Only "random" initialization is currently supported.
41 | 
42 |     beta_loss : str --> ("frobenius", "kullback-leibler") default "frobenius"
43 |         The error to be minimized between X and UV, weighted by W, using the appropriate multiplicative update variant.
44 | 
45 |     max_iter : int
46 |         The maximum number of minimization iterations to perform before stopping.
47 | 
48 |     tol : float, default 1e-4
49 |         If the relative error changes by less than this amount over 20 iterations, or alternatively increases,
50 |         then minimization is considered complete.
51 | 
52 |     random_state : int, default 12345
53 |         Specifies a seed value to initialize the numpy random number generator. Defaults to 12345
54 | 
55 |     rescale : bool, default False
56 |         Controls whether to normalize the resulting U matrix columns such that each basis vector can be interpreted
57 |         as a categorical probability distribution over the features in X. Useful for Signature Extraction, but invalidates
58 |         the coefficients in V.
59 | 
60 |     track_error : bool, default False
61 |         Controls whether to track the error of each wNMF fitting run, and store the result as a vector of length max_iter.
62 |         One vector is generated per run and tracks the performance of that fitting run over time. By default this is False,
63 |         as it can slow down the overall fitting, and is primarily useful for diagnostics
64 | 
65 |     verbose : int --> (0, 1) default 1
66 |         The level of verbosity. If 1 is provided, progress messages are printed during each fitting run
67 |         (initialization, factorization, completion).
68 | 
69 | 
70 |     Returns
71 |     -------
72 | 
73 |     U : numpy.ndarray, shape (n_features, n_components)
74 |         The basis matrix for the reduced dimension latent space. Columns of U are basis vectors that can be
75 |         combined with different weights to approximate a sample (column) of X.
76 | 
77 |     V : numpy.ndarray, shape (n_components, n_samples)
78 |         The coefficient matrix for the reduced dimension latent space. Columns of V are the reduced representation of
79 |         each sample in X, decomposed into a linear combination of the basis vectors in U. Samples in X can be 'reconstructed'
80 |         by multiplying U with the corresponding column of V.
81 | 
82 |     reconstruction_error : float
83 |         The reconstruction error between X and UV, weighted by W, using the error function specified in beta_loss
84 | 
85 |     n_iter : int
86 |         The number of iterations at which the minimization terminated, maximal value is max_iter.
87 | 
88 | 
89 |     This information can be accessed from the following variables, to mimic the SKlearn API
90 | 
91 |     U : self.components_
92 |         |    The matrix U from the best run, with dimensions (n_features, n_components)
93 |         |
94 |         | : self.components_all_
95 |         |    A tuple of length n_runs, with each entry containing a matrix U from a single run.
96 | 
97 | 
98 |     V : self.coefficients_
99 |         |    The matrix V from the best run, with dimensions (n_components, n_samples)
100 |         |
101 |         | : self.coefficients_all_
102 |         |    A tuple of length n_runs, with each entry containing a matrix V from a single run.
103 | 
104 |     reconstruction_error : self.reconstruction_err_
105 |         |    The reconstruction error from the best run, a float.
106 |         |
107 |         | : self.reconstruction_err_all_
108 |         |    A tuple of length n_runs, with each entry containing the reconstruction error from a single run
109 | 
110 |     n_iter : self.n_iter_
111 |         |    The number of iterations at which the minimization terminated for the best fitting run
112 |         |
113 |         | : self.n_iter_all_
114 |         |    A tuple of length n_runs, with each entry containing the number of iterations at which minimization terminated for a single run
115 | 
116 | 
117 |     The same information can also be accessed more directly, using the variable names you would expect
118 | 
119 |     U : self.U
120 |         |    The matrix U from the best run, with dimensions (n_features, n_components)
121 |         |
122 |         | : self.U_all
123 |         |    A tuple of length n_runs, with each entry containing a matrix U from a single run.
124 | 
125 | 
126 |     V : self.V
127 |         |    The matrix V from the best run, with dimensions (n_components, n_samples)
128 |         |
129 |         | : self.V_all
130 |         |    A tuple of length n_runs, with each entry containing a matrix V from a single run.
131 | 
132 |     reconstruction_error : self.err
133 |         |    The reconstruction error from the best run, a float.
134 |         |
135 |         | : self.err_all
136 |         |    A tuple of length n_runs, with each entry containing the reconstruction error from a single run
137 | 
138 |     n_iter : self.n_iter
139 |         |    The number of iterations at which the minimization terminated for the best fitting run
140 |         |
141 |         | : self.n_iter_all
142 |         |    A tuple of length n_runs, with each entry containing the number of iterations at which minimization terminated for a single run
143 | 
144 | 
145 |     Methods
146 |     -------
147 |     A set of methods that reflect the SKlearn model API (fit, fit_transform) are implemented.
148 | 
149 |     fit(X,W,n_run,...):
150 |         description: Fits an NMF model to the data X and the weight matrix W
151 |         requires: X,W,n_run
152 |         returns: self - the wNMF object with access to all the return variables listed above
153 | 
154 |     fit_transform(X,W,n_run,...):
155 |         description: Fits an NMF model for the data X, weight matrix W, and returns the coefficient matrix V.
156 |         requires: X,W,n_run
157 |         returns: self.coefficients_ - specifically the best version of V (lowest self.err) identified across n_run runs
158 | 
159 |     The other two methods, transform and inverse_transform, do not make sense in the context of wNMF, as the
160 |     NMF model is fit with a specific weight matrix, and transforming the data with another weight matrix would not
161 |     be applicable. Hence, these methods are not implemented at the moment
162 | 
163 |     Examples
164 |     --------
165 |     >>> import numpy as np
166 |     >>> X = np.array([[1,1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])
167 |     >>> W = np.array([[0.8,0.4],[0.1,0.1],[1,1],[0.7,0.3],[0.9,1],[0.01,0.04]])
168 |     >>> from wNMF import wNMF
169 |     >>> model = wNMF(n_components=2).fit(X,W,n_run=1)
170 |     >>> V = model.V
171 |     >>> V = model.coefficients_
172 |     >>> U = model.U
173 |     >>> U = model.components_
174 |     >>> iterations_to_convergence = model.n_iter_
175 |     >>> final_error = model.reconstruction_err_
176 |     >>> ## Accessing all matrices in n_run runs
177 |     >>> V_all = model.V_all
178 |     >>> V_all = model.coefficients_all_
179 |     >>> U_all = model.U_all
180 |     >>> U_all = model.components_all_
181 | 
182 |     References
183 |     ----------
184 |     Blondel, Vincent & Ho, Ngoc-Diep & Van Dooren, Paul. (2007).
185 |     Weighted Nonnegative Matrix Factorization and Face Feature Extraction.
186 |     Image and Vision Computing - IVC.
187 | 
188 |     '''
189 | 
190 |     def __init__(self,n_components: int,
191 |                 init: str='random',beta_loss: str='frobenius',
192 |                 max_iter: int=1000,tol: float=1e-4,random_state: int=12345,
193 |                 rescale: bool=False,track_error: bool=False,
194 |                 verbose: int=1):
195 | 
196 |         ## init variables
197 |         self.n_components=n_components
198 |         self.init=init
199 |         self.beta_loss=beta_loss
200 |         self.max_iter=max_iter
201 |         self.tol=tol
202 |         self.random_state=random_state
203 |         self.rescale=rescale
204 |         self.track_error = track_error
205 |         self.verbose=verbose
206 | 
207 |         ## Return Variables
208 |         self.X= None
209 | 
210 |         ## Components / U
211 |         self.components_=None
212 |         self.U = None
213 |         self.components_all_=tuple()
214 |         self.U_all = tuple()
215 | 
216 |         ## coefficients / V
217 |         self.coefficients_=None
218 |         self.V = None
219 |         self.coefficients_all_=tuple()
220 |         self.V_all = tuple()
221 | 
222 |         ## Reconstruction error / reconstruction_err_
223 |         self.reconstruction_err_=None
224 |         self.err=None
225 |         self.reconstruction_err_all_=tuple()
226 |         self.err_all = tuple()
227 | 
228 |         self.err_stored=list()
229 | 
230 |         ## n_iters
231 |         self.n_iter_=None
232 |         self.n_iter_all_=tuple()
233 |         self.n_iter=None
234 |         self.n_iter_all=tuple()
235 | 
236 |         ## run check
237 |         self._check_init()
238 | 
239 |     def __repr__(self):
240 |         return f"wNMF Model with {self.n_components} Components"
241 | 
242 |     def _check_init(self):
243 |         '''
244 |         Function to check the values supplied during wNMF initialization for various run parameters.
245 | 
246 |         Parameters checked : expected values
247 |             n_components : int, greater than 0
248 |             init : string, 'random' | no other initialization strategies allowed at present
249 |             beta_loss : string, ('kullback-leibler','frobenius')
250 |             max_iter : int, greater than 0
251 |             tol : float, greater than 0
252 |             random_state : int, greater than or equal to zero
253 |             rescale : boolean
254 |             track_error : boolean
255 |             verbose : int, (0, 1)
256 | 
257 |         '''
258 |         ## check n_components is int > 0
259 |         if not isinstance(self.n_components,int) or self.n_components <=0:
260 |             raise ValueError(f"Number of components must be a positive integer; got '{self.n_components}', of type {type(self.n_components)}")
261 | 
262 |         ## check init is random
263 |         if self.init != 'random':
264 |             raise ValueError(f"Only random initialization is supported; got '{self.init}' of type {type(self.init)}")
265 | 
266 |         ## check beta_loss is frobenius or kullback-leibler
267 |         if self.beta_loss not in ['kullback-leibler','frobenius']:
268 |             raise ValueError(f"Selected loss must be either 'frobenius' or 'kullback-leibler'; got '{self.beta_loss}'")
269 | 
270 |         ## check max_iter is int > 0
271 |         if not isinstance(self.max_iter,int) or self.max_iter <=0:
272 |             raise ValueError(f"Number of iterations must be a positive integer; got '{self.max_iter}', of type {type(self.max_iter)}")
273 | 
274 |         ## check tol is numeric > 0
275 |         if not isinstance(self.tol,float) or self.tol <=0:
276 |             raise ValueError(f"Error convergence criteria must be a positive float; got '{self.tol}', of type {type(self.tol)}")
277 | 
278 |         ## check random_state is int >= 0
279 |         if not isinstance(self.random_state,int) or self.random_state <0:
280 |             raise ValueError(f"Random state seed must be a non-negative integer; got '{self.random_state}', of type {type(self.random_state)}")
281 | 
282 |         ## check rescale is boolean
283 |         if not isinstance(self.rescale,bool):
284 |             raise ValueError(f"rescale must be a boolean; got '{self.rescale}', of type {type(self.rescale)}")
285 | 
286 |         ## check track_error is boolean
287 |         if not isinstance(self.track_error,bool):
288 |             raise ValueError(f"track_error must be a boolean; got '{self.track_error}', of type {type(self.track_error)}")
289 | 
290 |         ## check verbose is int
291 |         if self.verbose !=0 and self.verbose !=1:
292 |             raise ValueError(f"Verbosity is specified with an int, 0 or 1; got '{self.verbose}', of type {type(self.verbose)}")
293 | 
294 |     def fit(self,X: np.ndarray,W: np.ndarray,n_run: int = 1):
295 |         '''
296 |         Function to fit a wNMF model to X, given weight matrix W. The fitting procedure utilizes a modified
297 |         multiplicative update algorithm (see reference), and is repeated n_run times. It is recommended to repeat
298 |         the fitting procedure multiple times (at least 100) and take the best solution (with the lowest error), or
299 |         alternatively to cluster multiple runs together.
300 | 
301 |         The algorithm is roughly as follows:
302 |             1) Initialize matrices U (n_features,n_components) and V (n_components,n_samples) with random entries
303 |                scaled approximately to the mean of X divided by n_components
304 |             2) For each iteration, successively update V, then U, using the multiplicative update steps
305 |             3) Terminate the iterations if their number exceeds max_iter, or if the error does not change within tol
306 |             4) Repeat 1-3 n_run times and select the best run, but store all runs.
307 | 
308 |         Params
309 |         -------
310 |         X : numpy.ndarray or coercible array-like object
311 |             A data matrix to be factorized, with dimensions (n_features, n_samples).
312 |             NOTE this is different from the SKLearn API, which expects X to be (n_samples,n_features)
313 | 
314 |         W : numpy.ndarray or coercible array-like object
315 |             A weight matrix of same dimension as X, which weights each entry in X. Generally expected
316 |             to be values ranging from 0 to 1, but can contain any non-negative entries.
317 | 
318 |         n_run : int
319 |             The number of times to repeat the wNMF fitting process on the data matrix X and weight matrix W,
320 |             where each attempt utilizes a unique random initialization. The best solution is then selected and
321 |             returned.
322 | 
323 |         Returns:
324 |         -------
325 |         self, with added variables ------
326 | 
327 |         SKLearn response API variables:
328 |             self.components_,
329 |             self.coefficients_,
330 |             self.n_iter_,
331 |             self.reconstruction_err_
332 | 
333 |         Normal variables:
334 |             self.U
335 |             self.V
336 |             self.n_iter
337 |             self.err
338 | 
339 |         And lists containing all values for all runs
340 |             self.components_all_ / self.U_all
341 |             self.coefficients_all_ / self.V_all
342 |             self.n_iter_all_ / self.n_iter_all
343 |             self.reconstruction_err_all_ / self.err_all
344 | 
345 |         And the error tracker, if enabled
346 |             self.error_tracker
347 | 
348 |         '''
349 | 
350 |         ## Set the minimal value (that masks 0's) to be the smallest
351 |         ## step size for the data-type in matrix X (assumed to be a float type)
352 |         self.epsmin = np.finfo(np.asarray(X).dtype).eps
353 | 
354 | 
355 |         ## Try to coerce X and W to numpy arrays
356 |         X = self.coerce(X)
357 |         W = self.coerce(W)
358 | 
359 |         ## Check X and W are suitable for NMF
360 |         self._check_x_w(X,W)
361 | 
362 |         ## If passes, initialize random number generator using random_state
363 |         rng = self.init_random_generator()
364 | 
365 |         ## Extract relevant information from X
366 |         n_features, n_samples = X.shape
367 |         mean = np.mean(X)
368 | 
369 |         ## Initialize result storage
370 |         result = list()
371 | 
372 |         ## Begin Runs...
373 |         for r in range(0,n_run):
374 | 
375 |             if self.verbose==1:
376 |                 print(f"Beginning Run {r+1}...")
377 | 
378 |             ## Generate random initializations of U,V using random number generator
379 |             if self.verbose==1:
380 |                 print("|--- Initializing U,V")
381 |             U,V = self.initialize_u_v(rng,n_features,n_samples,mean)
382 | 
383 |             ## Factorize X into U,V given W
384 |             if self.verbose==1:
385 |                 print("|--- Running wNMF")
386 | 
387 |             if self.beta_loss == 'frobenius':
388 |                 factorized = self.weighted_euclidean(X,U,V,W)
389 | 
390 |             elif self.beta_loss == 'kullback-leibler':
391 |                 factorized = self.weighted_kullback_leibler(X,U,V,W)
392 | 
393 |             ## Rescale the columns of U (basis vectors) if needed
394 |             if self.rescale:
395 |                 if self.verbose==1:
396 |                     print("|--- Rescaling U basis vectors")
397 |                 ## factorized is a tuple, so rebuild it with U rescaled column-wise
398 |                 factorized = (factorized[0]/np.sum(factorized[0],0),) + factorized[1:]
399 | 
400 |             ## append the result and store it
401 |             result.append(factorized)
402 | 
403 |             if self.verbose==1:
404 |                 print("|--- Completed")
405 | 
406 |         ## transform the result from a list of per-run tuples into tuples grouped by field (all U's, all V's, ...)
407 |         result = list(zip(*result))
408 | 
409 |         ## Implementing the SKLearn model response API
410 |         self.U_all=result[0]
411 |         self.V_all=result[1]
412 |         self.n_iter_all=result[2]
413 |         self.err_all=result[3]
414 | 
415 |         ## if tracking errors, set variable to store tracked errors
416 |         if self.track_error:
417 |             self.error_tracker = result[4]
418 | 
419 |         ## setting up lists
420 |         self.components_all_= self.U_all
421 |         self.coefficients_all_ = self.V_all
422 |         self.n_iter_all_ = self.n_iter_all
423 |         self.reconstruction_err_all_ = self.err_all
424 | 
425 | 
426 |         ## finding best result
427 |         best_result = np.argmin(self.err_all)
428 | 
429 |         ## Index out the best result, and set variables
430 |         self.U = self.U_all[best_result]
431 |         self.components_=self.U
432 | 
433 |         self.V = self.V_all[best_result]
434 |         self.coefficients_=self.V
435 | 
436 |         self.n_iter=self.n_iter_all[best_result]
437 |         self.n_iter_=self.n_iter
438 | 
439 |         self.err = self.err_all[best_result]
440 |         self.reconstruction_err_ = self.err
441 | 
442 |         ## return entire wNMF object
443 |         return self
444 | 
445 |     def fit_transform(self,X: np.ndarray,W: np.ndarray,n_run: int = 1):
446 |         '''
447 |         Implements the fit_transform functionality from the SKlearn model API. Fits an NMF model to the
448 |         data matrix X and weight matrix W. Determines the best solution U,V over n_run runs. The data matrix
449 |         is then "transformed" into its latent space coefficients, given by the matrix V, or coefficients_.
450 | 
451 |         Params:
452 |         ------
453 |         X : numpy.ndarray or coercible array-like object
454 |             A data matrix to be factorized, with dimensions (n_features, n_samples).
455 |             NOTE this is different from the SKLearn API, which expects X to be (n_samples,n_features)
456 | 
457 |         W : numpy.ndarray or coercible array-like object
458 |             A weight matrix of same dimension as X, which weights each entry in X. Generally expected
459 |             to be values ranging from 0 to 1, but can contain any non-negative entries.
460 | 
461 |         n_run : int
462 |             The number of times to repeat the wNMF fitting process on the data matrix X and weight matrix W,
463 |             where each attempt utilizes a unique random initialization. The best solution is then selected and
464 |             returned.
465 | 
466 | 
467 |         Returns:
468 |         ------
469 |         self.coefficients_ : numpy.ndarray
470 |             The best-fit matrix V, or coefficients_ in SKlearn API language
471 | 
472 |         '''
473 |         f = self.fit(X,W,n_run=n_run)
474 | 
475 |         return f.coefficients_
476 | 
477 |     def weighted_euclidean(self,A: np.ndarray,U: np.ndarray,V: np.ndarray,W: np.ndarray):
478 |         '''
479 |         Function to perform minimization of the Frobenius / Euclidean norm in the
480 |         weighted-NMF case.
481 | 
482 |         Params:
483 |         -------
484 |         A : numpy.ndarray, values > 0, (n_features, n_samples)
485 |             Data matrix to be factorized, referred to as X in the main code body; it is called A here to make the
486 |             update steps easier to read, because the authors Blondel, Ho and Van Dooren use A.
487 | 
488 |         U : numpy.ndarray, values > 0, (n_features,n_components)
489 |             U matrix, randomly initialized entries.
490 | 
491 |         V : numpy.ndarray, values > 0 (n_components, n_samples)
492 |             V matrix, randomly initialized entries.
493 | 
494 |         W : numpy.ndarray, values > 0 (n_features, n_samples)
495 |             Weight matrix, weighting importance of each feature in each sample, for all samples in X
496 | 
497 | 
498 |         Returns:
499 |         ------
500 |         U : numpy.ndarray, values > 0, (n_features,n_components)
501 |             Optimized version of the U-matrix
502 | 
503 |         V : numpy.ndarray, values > 0 (n_components, n_samples)
504 |             Optimized version of the V-matrix
505 | 
506 |         i : int
507 |             The iteration at which the minimization procedure terminated
508 | 
509 |         err : float
510 |             The final W-weighted error between the reconstruction UV and X
511 | 
512 |         err_stored : numpy.ndarray
513 |             A numpy vector containing the estimated reconstruction error at each minimization step
514 |             if self.track_error is True, otherwise an empty array of zeroes.
515 | 
516 |         '''
517 |         epsmin = self.epsmin
518 |         err_stored = np.zeros(self.max_iter)
519 |         ## Begin iterations until max_iter
520 |         for i in range(0,self.max_iter):
521 |             ## Every 10 iterations, convert zeroes to epsmin to prevent divide-by-zero errors
522 |             if i % 10 == 0:
523 |                 V[V==0]=epsmin
524 |                 U[U==0]=epsmin
525 | 
526 |             ## If enabled, track errors using the Euclidean norm loss function
527 |             if self.track_error:
528 |                 err_stored[i] = self.calculate_reconstruction_error(A,U,V,W)
529 | 
530 |             ## update V
531 |             V = V*( (U.T@(W*A))/((U.T@(W*(U@V)))) )
532 |             ## update U
533 |             U = U*( ((W*A)@V.T)/(((W*(U@V))@V.T)) )
534 | 
535 |         ## Calculate final reconstruction error
536 |         err = self.calculate_reconstruction_error(A,U,V,W)
537 |         return U,V,i,err,err_stored
538 | 
539 |     def weighted_kullback_leibler(self,A: np.ndarray,U: np.ndarray,V: np.ndarray,W: np.ndarray):
540 |         '''
541 |         Function to perform minimization of the Kullback-Leibler divergence in the
542 |         weighted-NMF case.
543 | 
544 |         Params:
545 |         -------
546 |         A : numpy.ndarray, values > 0, (n_features, n_samples)
547 |             Data matrix to be factorized, referred to as X in the main code body; it is called A here to make the
548 |             update steps easier to read, because the authors Blondel, Ho and Van Dooren use A.
549 | 
550 |         U : numpy.ndarray, values > 0, (n_features,n_components)
551 |             U matrix, randomly initialized entries.
552 | 
553 |         V : numpy.ndarray, values > 0 (n_components, n_samples)
554 |             V matrix, randomly initialized entries.
555 | 
556 |         W : numpy.ndarray, values > 0 (n_features, n_samples)
557 |             Weight matrix, weighting importance of each feature in each sample, for all samples in X
558 | 
559 | 
560 |         Returns:
561 |         ------
562 |         U : numpy.ndarray, values > 0, (n_features,n_components)
563 |             Optimized version of the U-matrix
564 | 
565 |         V : numpy.ndarray, values > 0 (n_components, n_samples)
566 |             Optimized version of the V-matrix
567 | 
568 |         i : int
569 |             The iteration at which the minimization procedure terminated
570 | 
571 |         err : float
572 |             The final W-weighted error between the reconstruction UV and X
573 | 
574 |         err_stored : numpy.ndarray
575 |             A numpy vector containing the estimated reconstruction error at each minimization step
576 |             if self.track_error is True, otherwise an empty array of zeroes.
577 | 
578 |         '''
579 |         epsmin = self.epsmin
580 |         err_stored = np.zeros(self.max_iter)
581 |         ## Begin iterations until max_iter
582 |         for i in range(0,self.max_iter):
583 |             ## Every 10 iterations, convert zeroes to epsmin to prevent divide-by-zero errors
584 |             if i % 10 == 0:
585 |                 V[V==0]=epsmin
586 |                 U[U==0]=epsmin
587 | 
588 |             ## If enabled, track errors using the KL-divergence loss function
589 |             if self.track_error:
590 |                 err_stored[i] = self.calculate_reconstruction_error(A,U,V,W)
591 | 
592 |             ## Update V
593 |             V = ((V)/(U.T@W))*(U.T@((W*A)/(U@V)))
594 |             ## Update U
595 |             U = ((U)/(W@V.T))*(((W*A)/(U@V))@V.T)
596 | 
597 |         ## Calculate final reconstruction error
598 |         err = self.calculate_reconstruction_error(A,U,V,W)
599 |         return U,V,i,err,err_stored
600 | 
601 |     def coerce(self,matrix: np.ndarray):
602 |         '''
603 |         Function to coerce a matrix-like object to a numpy.ndarray, or return the array
604 |         if it is already a numpy array. Used for converting X, W to suitable matrices.
605 |         Throws an error from numpy if the object provided is not coercible. No guarantees
606 |         are provided on what the coerced result looks like. Zeroes are also replaced with
607 |         epsmin to prevent potential underflow.
608 | 
609 |         Params:
610 |         -------
611 |         matrix : a numpy.ndarray or any object that can be coerced to an array by numpy
612 |             An object that is or can be coerced to a numpy.ndarray
613 | 
614 |         Returns:
615 |         -------
616 |         matrix : numpy.ndarray
617 |             A coerced version of the provided matrix
618 | 
619 |         '''
620 | 
621 |         ## test if object is a numpy.ndarray / ndarray
622 |         if not isinstance(matrix,np.ndarray):
623 |             matrix = np.array(matrix)
624 | 
625 |         ## Convert 0 entries to epsmin to prevent underflow
626 |         matrix[matrix==0]=self.epsmin
627 |         return matrix
628 | 
629 |     def _check_x_w(self,X: np.ndarray,W: np.ndarray):
630 |         '''
631 |         Function to check whether the supplied X and W are suitable for NMF
632 | 
633 |         Conditions checked : expected values
634 |             X.shape, W.shape : shapes / dimensions should be equal
635 |             entries in X : greater than or equal to 0, no NaNs
636 |             entries in W : greater than or equal to 0, no NaNs
637 |             X.shape, n_components : n_components <= n_samples in X
638 | 
639 |         '''
640 |         ## check X and W are the same shape
641 |         if X.shape != W.shape:
642 |             raise ValueError("Dimensions of X and weight matrix W must be the same")
643 | 
644 |         ## check if entries of X and W are greater than or equal to 0
645 |         if not np.all(X>=0):
646 |             raise ValueError("Entries of X must be positive or zero")
647 | 
648 |         if not np.all(W>=0):
649 |             raise ValueError("Entries of W must be positive or zero")
650 | 
651 |         ## Check for NaNs and halt if there are any
652 |         if np.any(np.isnan(X)):
653 |             raise ValueError("Entries of X must not contain NaN / NA, or missing entries")
654 | 
655 |         if np.any(np.isnan(W)):
656 |             raise ValueError("Entries of W must not contain NaN / NA, or missing entries")
657 | 
658 |         ## check to ensure n_components <= n_samples
659 |         if X.shape[1] < self.n_components:
660 |             raise ValueError("Number of components cannot be greater than the number of samples (columns) in X")
661 | 
662 |     def init_random_generator(self):
663 |         '''
664 |         Function to initialize a numpy random number generator
665 | 
666 |         Params:
667 |         -------
668 |         None. The seed is taken from self.random_state (int, >= 0),
669 |             which defaults to 12345.
670 | 
671 |         Returns:
672 |         -------
673 |         rng : numpy.random.RandomState
674 |             A numpy random number generator
675 |         '''
676 |         ## initialize the numpy random generator with random seed
677 |         rng = np.random.RandomState(self.random_state)
678 |         return rng
679 | 
680 |     def initialize_u_v(self,random_number_generator: np.random.mtrand.RandomState,n_features: int,n_samples: int,mean: float):
681 |         '''
682 |         Function to randomly initialize U and V. Entries are drawn randomly and scaled by
683 |         sqrt(mean(X) / n_components), so that the product UV has approximately the same scale as X.
684 | 
685 |         Params:
686 |         -------
687 |         random_number_generator : numpy.random.RandomState
688 |             An initialized numpy random number generator with a set seed.
689 | 
690 |         n_features : int
691 |             The number of features in X, or rows of X
692 | 
693 |         n_samples : int
694 |             The number of samples in X, or columns of X
695 | 
696 |         mean : float
697 |             Estimated mean over the entire data-set X, used to scale the initialization to an
698 |             approximately similar range
699 | 
700 |         Returns:
701 |         -------
702 |         U : numpy.ndarray
703 |             The matrix U, with randomly initialized entries
704 | 
705 |         V : numpy.ndarray
706 |             The matrix V, with randomly initialized entries
707 | 
708 |         '''
709 |         ## estimate density by partitioning mean across components
710 |         est = np.sqrt(mean/self.n_components)
711 | 
712 |         ## generate entries of U/V using randn, scale by est
713 |         U = est*random_number_generator.randn(n_features,self.n_components)
714 |         np.abs(U,U) ## mutate in-place absolute value
715 | 
716 |         V = est*random_number_generator.randn(self.n_components,n_samples)
717 |         np.abs(V,V) ## mutate in-place absolute value
718 | 
719 |         ## set all zeroes (if there are any) to epsmin
720 |         V[V==0]=self.epsmin
721 |         U[U==0]=self.epsmin
722 | 
723 |         return U,V
724 | 
725 |     def calculate_reconstruction_error(self,X: np.ndarray,U: np.ndarray,V: np.ndarray,W: np.ndarray):
726 |         '''
727 |         Function to calculate the reconstruction error of U,V to X, given W. The function to estimate the
728 |         error is based on the selected loss function, beta_loss
729 | 
730 |         Params:
731 |         ------
732 |         X : numpy.ndarray, values > 0, (n_features, n_samples)
733 |             Data matrix to be factorized / compared against
734 | 
735 |         U : numpy.ndarray, values > 0, (n_features,n_components)
736 |             U matrix
737 | 
738 |         V : numpy.ndarray, values > 0 (n_components, n_samples)
739 |             V matrix
740 | 
741 |         W : numpy.ndarray, values > 0 (n_features, n_samples)
742 |             Weight matrix, weighting importance of each feature in each sample, for all samples in X
743 | 
744 |         Returns:
745 |         ------
746 |         err : float, the estimated error using the selected loss function
747 | 
748 |         '''
749 | 
750 |         ## Replace zeroes with epsmin to prevent divide-by-zero / log(0) errors
751 |         V[V==0]=self.epsmin
752 |         U[U==0]=self.epsmin
753 | 
754 |         ## select loss function and calculate error
755 |         if self.beta_loss=='frobenius':
756 |             rec = X-U@V
757 |             err = 0.5*np.sum(W*rec*rec)
758 | 
759 |         elif self.beta_loss=='kullback-leibler':
760 |             rec = U@V
761 |             err = np.sum(W*(X*np.log(X/rec)-X+rec))
762 | 
763 |         return err
764 | 
--------------------------------------------------------------------------------
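
**A note on checking results.** The reconstruction error reported by `fit` can be recomputed by hand from the returned factors, and a weight matrix of all ones reduces wNMF to ordinary unweighted NMF. The snippet below is a minimal sanity-check sketch (not part of the package; the data is made up for illustration) under those assumptions.

```python
import numpy as np
from wNMF import wNMF

## Mock data: 50 features x 20 samples, strictly positive entries
rng = np.random.RandomState(0)
X = rng.uniform(1, 10, size=(50, 20))
W = np.ones_like(X)  ## uniform weights: wNMF reduces to plain NMF

model = wNMF(n_components=5, beta_loss='frobenius',
             max_iter=500, verbose=0).fit(X, W, n_run=3)

## Recompute the weighted Frobenius loss, 0.5 * sum(W * (X - UV)^2),
## from the best-run factors; it should match model.err closely
rec = X - model.U @ model.V
manual_err = 0.5 * np.sum(W * rec * rec)
print(model.err, manual_err)
```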