├── setup.cfg ├── setup.py ├── .gitignore ├── pymf ├── __init__.py ├── vol.py ├── nmfnnls.py ├── greedycur.py ├── cmd.py ├── laesa.py ├── kmeans.py ├── cursl.py ├── snmf.py ├── cmeans.py ├── sivm_cur.py ├── nmfals.py ├── rnmf.py ├── nndsvd.py ├── dist.py ├── pca.py ├── bnmf.py ├── cur.py ├── aa.py ├── sivm_sgreedy.py ├── greedy.py ├── sivm_search.py ├── cnmf.py ├── sivm_gsat.py ├── nmf.py ├── sub.py ├── gmap.py ├── chnmf.py ├── svd.py └── sivm.py ├── README.txt └── tests └── test_pymf.py /setup.cfg: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = --cov-report term-missing --cov pymf 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | setuptools.setup( 4 | name='PyMF', 5 | version='0.1.9', 6 | description='Python Matrix Factorization Module', 7 | author='Christian Thurau', 8 | author_email='cthurau@googlemail.com', 9 | url='http://code.google.com/p/pymf/', 10 | packages=setuptools.find_packages(), 11 | license='OSI Approved :: GNU General Public License (GPL)', 12 | install_requires=[ 13 | 'cvxopt', 14 | 'numpy', 15 | 'scipy', 16 | ], 17 | extras_require={ 18 | 'tests': [ 19 | 'pytest', 20 | 'pytest-cov', 21 | ], 22 | }, 23 | tests_require=[ 24 | 'pytest', 25 | 'pytest-cov', 26 | ], 27 | long_description=open('README.txt').read(), 28 | ) 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | -------------------------------------------------------------------------------- /pymf/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | 7 | '''pymf is a package for several Matrix Factorization variants.- 8 | Detailed documentation is available at http://pymf.googlecode.com 9 | Copyright (C) Christian Thurau, 2010. 
GNU General Public License (GPL) 10 | ''' 11 | 12 | 13 | import numpy as np 14 | from scipy.sparse import issparse 15 | 16 | from .nmf import * 17 | from .nmfals import * 18 | from .nmfnnls import * 19 | from .cnmf import * 20 | from .chnmf import * 21 | from .snmf import * 22 | from .aa import * 23 | 24 | from .laesa import * 25 | from .bnmf import * 26 | 27 | from .sub import * 28 | 29 | from .svd import * 30 | from .pca import * 31 | from .cur import * 32 | from .sivm_cur import * 33 | from .cmd import * 34 | 35 | from .kmeans import * 36 | from .cmeans import * 37 | 38 | from .sivm import * 39 | from .sivm_sgreedy import * 40 | from .sivm_search import * 41 | from .sivm_gsat import * 42 | 43 | from .gmap import * 44 | -------------------------------------------------------------------------------- /pymf/vol.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python2.6 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF functions for computing matrix/simplex volumes 8 | 9 | cmdet(): Cayley-Menger Determinant 10 | simplex_volume(): Ordinary simplex volume 11 | 12 | """ 13 | 14 | 15 | import numpy as np 16 | try: 17 | from scipy.misc.common import factorial 18 | except: 19 | from scipy.misc import factorial 20 | 21 | __all__ = ["cmdet", "simplex"] 22 | 23 | 24 | def cmdet(d): 25 | # compute the CMD determinant of the euclidean distance matrix d 26 | # -> d should not be squared! 27 | D = np.ones((d.shape[0]+1,d.shape[0]+1)) 28 | D[0,0] = 0.0 29 | D[1:,1:] = d**2 30 | j = np.float32(D.shape[0]-2) 31 | f1 = (-1.0)**(j+1) / ( (2**j) * ((factorial(j))**2)) 32 | cmd = f1 * np.linalg.det(D) 33 | # sometimes, for very small values "cmd" might be negative ... 34 | return np.sqrt(np.abs(cmd)) 35 | 36 | 37 | def simplex(d): 38 | # compute the simplex volume using coordinates 39 | D = np.ones((d.shape[0]+1, d.shape[1])) 40 | D[1:,:] = d 41 | vol = np.abs(np.linalg.det(D)) / factorial(d.shape[1] - 1) 42 | return vol 43 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | Matrix Factorization Methods for Python (pymf) 2 | ============================================== 3 | 4 | What is PyMF? 5 | ------------- 6 | 7 | Python Matrix Factorization (PyMF) is a module for several constrained/unconstrained 8 | matrix factorization (and related) methods. The module is early alpha and not very well 9 | tested. 
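
All factorization classes share the same small interface; the following is a
minimal sketch using NMF (most of the methods listed below follow the same
pattern, while the CUR-type decompositions take rrank/crank instead of
num_bases - see the class docstrings for details):

    >>> import numpy as np
    >>> import pymf
    >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
    >>> nmf_mdl = pymf.NMF(data, num_bases=2)
    >>> nmf_mdl.factorize(niter=10)

The basis vectors are then stored in nmf_mdl.W, the coefficients in nmf_mdl.H,
and the Frobenius norm of each iteration in nmf_mdl.ferr.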
10 | 
11 | PyMF currently includes the following methods: 
12 | 
13 | * Non-negative matrix factorization (NMF) 
14 | * Convex non-negative matrix factorization (CNMF) 
15 | * Semi non-negative matrix factorization (SNMF) 
16 | * Archetypal analysis (AA) 
17 | * Simplex volume maximization (SiVM) 
18 | * Convex-hull non-negative matrix factorization (CHNMF) 
19 | * Binary matrix factorization (BNMF) 
20 | * Singular value decomposition (SVD) 
21 | * Principal component analysis (PCA) 
22 | * K-means clustering (Kmeans) 
23 | * CUR decomposition (CUR) 
24 | * Compact matrix decomposition (CMD) 
25 | 
26 | Where to get it 
27 | --------------- 
28 | 
29 | * Main website, documentation: http://pymf.googlecode.com 
30 | * Contact email: cthurau at googlemail.com 
31 | 
32 | 
33 | Requires 
34 | -------- 
35 | 
36 | * Linux, Mac OS-X or Windows 
37 | * Python 2.5 or 2.6 
38 | * NumPy, Cvxopt, Scipy 
--------------------------------------------------------------------------------
/pymf/nmfnnls.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | #
 3 | # Copyright (C) Christian Thurau, 2010.
 4 | # Licensed under the GNU General Public License (GPL).
 5 | # http://www.gnu.org/licenses/gpl.txt
 6 | """
 7 | PyMF Non-negative Matrix Factorization.
 8 | 
 9 | NMFNNLS: Class for Non-negative Matrix Factorization using non-negative
10 | least squares optimization (requires scipy.optimize)
11 | 
12 | [1] Lee, D. D. and Seung, H. S. (1999), Learning the Parts of Objects by Non-negative
13 | Matrix Factorization, Nature 401(6755), 788-799.
14 | """
15 | 
16 | 
17 | 
18 | import scipy.optimize
19 | from .nmf import NMF
20 | 
21 | __all__ = ["NMFNNLS"]
22 | 
23 | class NMFNNLS(NMF):
24 |     """
25 |     NMFNNLS(data, num_bases=4)
26 | 
27 | 
28 |     Non-negative Matrix Factorization. Factorize a data matrix into two matrices
29 |     s.t. F = | data - W*H | is minimal. H and W are restricted to non-negative
30 |     values. Uses Lawson and Hanson's algorithm for non-negative constrained
31 |     least squares (-> also see scipy.optimize.nnls)
32 | 
33 |     Parameters
34 |     ----------
35 |     data : array_like, shape (_data_dimension, _num_samples)
36 |         the input data
37 |     num_bases: int, optional
38 |         Number of bases to compute (column rank of W and row rank of H).
39 |         4 (default)
40 | 
41 |     Attributes
42 |     ----------
43 |     W : "data_dimension x num_bases" matrix of basis vectors
44 |     H : "num bases x num_samples" matrix of coefficients
45 |     ferr : frobenius norm (after calling .factorize())
46 | 
47 |     Example
48 |     -------
49 |     Applying NMF to some rather stupid data set:
50 | 
51 |     >>> import numpy as np
52 |     >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
53 |     >>> nmf_mdl = NMFNNLS(data, num_bases=2)
54 |     >>> nmf_mdl.factorize(niter=10)
55 | 
56 |     The basis vectors are now stored in nmf_mdl.W, the coefficients in nmf_mdl.H.
57 |     To compute coefficients for an existing set of basis vectors simply copy W
58 |     to nmf_mdl.W, and set compute_w to False:
59 | 
60 |     >>> data = np.array([[1.5], [1.2]])
61 |     >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
62 |     >>> nmf_mdl = NMFNNLS(data, num_bases=2)
63 |     >>> nmf_mdl.W = W
64 |     >>> nmf_mdl.factorize(niter=1, compute_w=False)
65 | 
66 |     The result is a set of coefficients nmf_mdl.H, s.t. data = W * nmf_mdl.H.
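
    A quick way to check the quality of the approximation is to reconstruct
    the data from both factors (rec and err below are purely illustrative
    names, not attributes of the model):

    >>> rec = np.dot(nmf_mdl.W, nmf_mdl.H)
    >>> err = np.sqrt(np.sum((data[:,:] - rec)**2))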
67 | """ 68 | 69 | def update_h(self): 70 | def updatesingleH(i): 71 | self.H[:,i] = scipy.optimize.nnls(self.W, self.data[:,i])[0] 72 | 73 | map(updatesingleH, xrange(self._num_samples)) 74 | 75 | 76 | def update_w(self): 77 | def updatesingleW(i): 78 | self.W[i,:] = scipy.optimize.nnls(self.H.T, self.data[i,:].T)[0] 79 | 80 | map(updatesingleW, xrange(self._data_dimension)) 81 | -------------------------------------------------------------------------------- /pymf/greedycur.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python2.6 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | #$Id$ 7 | """ 8 | PyMF CUR-like Sparse Column Based Matrix Reconstruction via Greedy Approximation[1] 9 | 10 | GREEDYCUR: class for CUR-like decompositions using the GREEDY[2] algorithm. 11 | 12 | [1] Drineas, P., Kannan, R. and Mahoney, M. (2006), 'Fast Monte Carlo Algorithms III: 13 | Computing a Compressed Approixmate Matrix Decomposition', SIAM J. Computing 36(1), 184-206. 14 | [2] Ali Civril, Malik Magdon-Ismail. Deterministic Sparse Column Based Matrix 15 | Reconstruction via Greedy Approximation of SVD. ISAAC'2008. 16 | """ 17 | 18 | 19 | import numpy as np 20 | from .greedy import GREEDY 21 | from .cur import CUR 22 | 23 | __all__ = ["GREEDYCUR"] 24 | 25 | class GREEDYCUR(CUR): 26 | ''' 27 | GREEDYCUR(data, data, k=-1, rrank=0, crank=0) 28 | 29 | GREEDY-CUR Decomposition. Factorize a data matrix into three matrices s.t. 30 | F = | data - USV| is minimal. Unlike CUR, GREEDYCUR selects the rows 31 | and columns using GREEDY, i.e. it tries to find rows/columns that are close 32 | to SVD-based solutions. 33 | 34 | Parameters 35 | ---------- 36 | data : array_like [data_dimension x num_samples] 37 | the input data 38 | rrank: int, optional 39 | Number of rows to sample from data. 40 | 4 (default) 41 | crank: int, optional 42 | Number of columns to sample from data. 43 | 4 (default) 44 | show_progress: bool, optional 45 | Print some extra information 46 | False (default) 47 | 48 | Attributes 49 | ---------- 50 | U,S,V : submatrices s.t. data = USV 51 | 52 | Example 53 | ------- 54 | >>> import numpy as np 55 | >>> from greedycur import GREEDYCUR 56 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 57 | >>> cur_mdl = GREEDYCUR(data, show_progress=False, rrank=1, crank=2) 58 | >>> cur_mdl.factorize() 59 | """ 60 | ''' 61 | 62 | def sample(self, A, c): 63 | # set k to a value lower than the number of bases, usually 64 | # gives better results. 65 | k = np.round(c - c/5.0) 66 | greedy_mdl = GREEDY(A, k=k, num_bases=c) 67 | greedy_mdl.factorize(compute_h=False, compute_err=False, niter=1) 68 | return greedy_mdl.select 69 | 70 | 71 | def factorize(self): 72 | # sample row and column indices that maximize the volume of the submatrix 73 | self._rid = self.sample(self.data.transpose(), self._rrank) 74 | self._cid = self.sample(self.data, self._crank) 75 | self._rcnt = np.ones(len(self._rid)) 76 | self._ccnt = np.ones(len(self._cid)) 77 | 78 | self.computeUCR() 79 | 80 | 81 | if __name__ == "__main__": 82 | import doctest 83 | doctest.testmod() 84 | -------------------------------------------------------------------------------- /pymf/cmd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 
5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF Compact Matrix Decomposition [1] 8 | 9 | CMD(CUR): Class for Compact Matrix Decomposition 10 | 11 | [1] Sun, J., Xie, Y., Zhang, H. and Faloutsos, C. (2007), Less is More: Compact Matrix Decomposition for Large 12 | Sparse Graphs, in Proc. SIAM Int. Conf. on Data Mining. 13 | """ 14 | 15 | 16 | import numpy as np 17 | from .cur import CUR 18 | 19 | __all__ = ["CMD"] 20 | 21 | class CMD(CUR): 22 | """ 23 | CMD(data, rrank=0, crank=0) 24 | 25 | 26 | Compact Matrix Decomposition. Factorize a data matrix into three matrices s.t. 27 | F = | data - USV| is minimal. CMD randomly selects rows and columns from 28 | data for building U and V, respectively. 29 | 30 | Parameters 31 | ---------- 32 | data : array_like [data_dimension x num_samples] 33 | the input data 34 | rrank: int, optional 35 | Number of rows to sample from data. Double entries are eliminiated s.t. 36 | the resulting rank might be lower. 37 | 4 (default) 38 | crank: int, optional 39 | Number of columns to sample from data. Double entries are eliminiated s.t. 40 | the resulting rank might be lower. 41 | 4 (default) 42 | 43 | Attributes 44 | ---------- 45 | U,S,V : submatrices s.t. data = USV 46 | 47 | Example 48 | ------- 49 | >>> import numpy as np 50 | >>> from cmd import CMD 51 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 52 | >>> cmd_mdl = CMD(data, show_progress=False, rrank=1, crank=2) 53 | >>> cmd_mdl.factorize() 54 | """ 55 | 56 | def _cmdinit(self): 57 | nrids = np.unique(self._rid) 58 | ncids = np.unique(self._cid) 59 | 60 | self._rcnt = np.zeros(len(nrids)) 61 | self._ccnt = np.zeros(len(ncids)) 62 | 63 | for i,idx in enumerate(nrids): 64 | self._rcnt[i] = len(np.where(self._rid == idx)[0]) 65 | 66 | for i,idx in enumerate(ncids): 67 | self._ccnt[i] = len(np.where(self._cid == idx)[0]) 68 | 69 | self._rid = np.int32(list(nrids)) 70 | self._cid = np.int32(list(ncids)) 71 | 72 | def factorize(self): 73 | """ Factorize s.t. CUR = data 74 | 75 | Updated Values 76 | -------------- 77 | .C : updated values for C. 78 | .U : updated values for U. 79 | .R : updated values for R. 80 | """ 81 | 82 | [prow, pcol] = self.sample_probability() 83 | 84 | self._rid = self.sample(self._rrank, prow) 85 | self._cid = self.sample(self._crank, pcol) 86 | 87 | self._cmdinit() 88 | 89 | self.computeUCR() 90 | 91 | 92 | if __name__ == "__main__": 93 | import doctest 94 | doctest.testmod() 95 | -------------------------------------------------------------------------------- /pymf/laesa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF LAESA 8 | """ 9 | 10 | 11 | import scipy.sparse 12 | import numpy as np 13 | 14 | from .dist import * 15 | from .sivm import SIVM 16 | 17 | __all__ = ["LAESA"] 18 | 19 | class LAESA(SIVM): 20 | """ 21 | LAESA(data, num_bases=4) 22 | 23 | 24 | Simplex Volume Maximization. Factorize a data matrix into two matrices s.t. 25 | F = | data - W*H | is minimal. H is restricted to convexity. W is iteratively 26 | found by maximizing the volume of the resulting simplex (see [1]). 27 | 28 | Parameters 29 | ---------- 30 | data : array_like, shape (_data_dimension, _num_samples) 31 | the input data 32 | num_bases: int, optional 33 | Number of bases to compute (column rank of W and row rank of H). 
34 | 4 (default) 35 | 36 | Attributes 37 | ---------- 38 | W : "data_dimension x num_bases" matrix of basis vectors 39 | H : "num bases x num_samples" matrix of coefficients 40 | ferr : frobenius norm (after calling .factorize()) 41 | 42 | Example 43 | ------- 44 | Applying LAESA to some rather stupid data set: 45 | 46 | >>> import numpy as np 47 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 48 | >>> laesa_mdl = LAESA(data, num_bases=2) 49 | >>> laesa_mdl.factorize() 50 | 51 | The basis vectors are now stored in laesa_mdl.W, the coefficients in laesa_mdl.H. 52 | To compute coefficients for an existing set of basis vectors simply copy W 53 | to laesa_mdl.W, and set compute_w to False: 54 | 55 | >>> data = np.array([[1.5, 1.3], [1.2, 0.3]]) 56 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) 57 | >>> laesa_mdl = LAESA(data, num_bases=2) 58 | >>> laesa_mdl.W = W 59 | >>> laesa_mdl.factorize(niter=1, compute_w=False) 60 | 61 | The result is a set of coefficients laesa_mdl.H, s.t. data = W * laesa_mdl.H. 62 | """ 63 | def update_w(self): 64 | # initialize some of the recursively updated distance measures 65 | self.init_sivm() 66 | distiter = self._distance(self.select[-1]) 67 | 68 | for l in range(self._num_bases-1): 69 | d = self._distance(self.select[-1]) 70 | 71 | # replace distances in distiter 72 | distiter = np.where(d>> import numpy as np 46 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 47 | >>> kmeans_mdl = Kmeans(data, num_bases=2) 48 | >>> kmeans_mdl.factorize(niter=10) 49 | 50 | The basis vectors are now stored in kmeans_mdl.W, the coefficients in kmeans_mdl.H. 51 | To compute coefficients for an existing set of basis vectors simply copy W 52 | to kmeans_mdl.W, and set compute_w to False: 53 | 54 | >>> data = np.array([[1.5], [1.2]]) 55 | >>> W = [[1.0, 0.0], [0.0, 1.0]] 56 | >>> kmeans_mdl = Kmeans(data, num_bases=2) 57 | >>> kmeans_mdl.W = W 58 | >>> kmeans_mdl.factorize(niter=1, compute_w=False) 59 | 60 | The result is a set of coefficients kmeans_mdl.H, s.t. data = W * kmeans_mdl.H. 61 | """ 62 | def init_h(self): 63 | # W has to be present for H to be initialized 64 | self.H = np.zeros((self._num_bases, self._num_samples)) 65 | self.update_h() 66 | 67 | def init_w(self): 68 | # set W to some random data samples 69 | sel = random.sample(xrange(self._num_samples), self._num_bases) 70 | 71 | # sort indices, otherwise h5py won't work 72 | self.W = self.data[:, np.sort(sel)] 73 | 74 | 75 | def update_h(self): 76 | # and assign samples to the best matching centers 77 | self.assigned = dist.vq(self.W, self.data) 78 | self.H = np.zeros(self.H.shape) 79 | self.H[self.assigned, range(self._num_samples)] = 1.0 80 | 81 | 82 | def update_w(self): 83 | for i in range(self._num_bases): 84 | idx = np.where(self.assigned==i)[0] 85 | n = len(idx) 86 | if n > 1: 87 | self.W[:,i] = np.sum(self.data[:,idx], axis=1)/n 88 | -------------------------------------------------------------------------------- /pymf/cursl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | #$Id$ 7 | """ 8 | PyMF CUR Decomposition [1] 9 | 10 | CURSL(SVD) : Class for CUR Decomposition (uses statistical leverage based sampling) 11 | 12 | [1] Drineas, P., Kannan, R. and Mahoney, M. (2006), 'Fast Monte Carlo Algorithms III: Computing 13 | a Compressed Approixmate Matrix Decomposition', SIAM J. 
Computing 36(1), 184-206.
14 | """
15 | 
16 | 
17 | import numpy as np
18 | import scipy.sparse
19 | 
20 | from .svd import pinv, SVD
21 | from .cmd import CMD
22 | 
23 | __all__ = ["CURSL"]
24 | 
25 | class CURSL(CMD):
26 |     """
27 |     CURSL(data, k=-1, rrank=0, crank=0)
28 | 
29 |     CUR/CMD Decomposition. Factorize a data matrix into three matrices s.t.
30 |     F = | data - USV| is minimal. CURSL randomly selects rows and columns from
31 |     data for building U and V, respectively. The importance sampling is based
32 |     on a statistical leverage score from the top-k singular vectors (k is
33 |     currently set to 4/5*rrank and 4/5*crank).
34 | 
35 |     Parameters
36 |     ----------
37 |     data : array_like [data_dimension x num_samples]
38 |         the input data
39 |     rrank: int, optional
40 |         Number of rows to sample from data.
41 |         4 (default)
42 |     crank: int, optional
43 |         Number of columns to sample from data.
44 |         4 (default)
45 |     show_progress: bool, optional
46 |         Print some extra information
47 |         False (default)
48 | 
49 |     Attributes
50 |     ----------
51 |     U,S,V : submatrices s.t. data = USV (or _C _U _R)
52 | 
53 |     Example
54 |     -------
55 |     >>> import numpy as np
56 |     >>> from cursl import CURSL
57 |     >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
58 |     >>> cur_mdl = CURSL(data, show_progress=False, rrank=1, crank=2)
59 |     >>> cur_mdl.factorize()
60 |     """
61 | 
62 |     def __init__(self, data, k=-1, rrank=0, crank=0):
63 |         SVD.__init__(self, data, k=k, rrank=rrank, crank=crank)
64 | 
65 |     def sample_probability(self):
66 |         def comp_prob(d, k):
67 |             # compute statistical leverage score
68 |             c = int(np.round(k - k/5.0))
69 | 
70 |             svd_mdl = SVD(d, k=c)
71 |             svd_mdl.factorize()
72 | 
73 |             if scipy.sparse.issparse(self.data):
74 |                 A = svd_mdl.V.multiply(svd_mdl.V)
75 |                 ## Rule 1
76 |                 pcol = np.array(A.sum(axis=0)/k)
77 |             else:
78 |                 A = svd_mdl.V[:k,:]**2.0
79 |                 ## Rule 1
80 |                 pcol = A.sum(axis=0)/k
81 | 
82 |             #c = k * np.log(k/ (self._eps**2.0))
83 |             #pcol = c * pcol.reshape((-1,1))
84 |             pcol /= np.sum(pcol)
85 |             return pcol
86 | 
87 |         pcol = comp_prob(self.data, self._rrank)
88 |         prow = comp_prob(self.data.transpose(), self._crank)
89 | 
90 | 
91 |         return (prow.reshape(-1,1), pcol.reshape(-1,1))
--------------------------------------------------------------------------------
/pymf/snmf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | #
 3 | # Copyright (C) Christian Thurau, 2010.
 4 | # Licensed under the GNU General Public License (GPL).
 5 | # http://www.gnu.org/licenses/gpl.txt
 6 | """
 7 | PyMF Semi Non-negative Matrix Factorization.
 8 | 
 9 | SNMF(NMF) : Class for semi non-negative matrix factorization
10 | 
11 | [1] Ding, C., Li, T. and Jordan, M. Convex and Semi-Nonnegative Matrix Factorizations.
12 | IEEE Trans. on Pattern Analysis and Machine Intelligence 32(1), 45-55.
13 | """
14 | 
15 | 
16 | 
17 | import numpy as np
18 | 
19 | from .nmf import NMF
20 | 
21 | __all__ = ["SNMF"]
22 | 
23 | class SNMF(NMF):
24 |     """
25 |     SNMF(data, num_bases=4)
26 | 
27 |     Semi Non-negative Matrix Factorization. Factorize a data matrix into two
28 |     matrices s.t. F = | data - W*H | is minimal.
29 | 
30 |     Parameters
31 |     ----------
32 |     data : array_like, shape (_data_dimension, _num_samples)
33 |         the input data
34 |     num_bases: int, optional
35 |         Number of bases to compute (column rank of W and row rank of H).
36 | 4 (default) 37 | 38 | Attributes 39 | ---------- 40 | W : "data_dimension x num_bases" matrix of basis vectors 41 | H : "num bases x num_samples" matrix of coefficients 42 | ferr : frobenius norm (after calling .factorize()) 43 | 44 | Example 45 | ------- 46 | Applying Semi-NMF to some rather stupid data set: 47 | 48 | >>> import numpy as np 49 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 50 | >>> snmf_mdl = SNMF(data, num_bases=2) 51 | >>> snmf_mdl.factorize(niter=10) 52 | 53 | The basis vectors are now stored in snmf_mdl.W, the coefficients in snmf_mdl.H. 54 | To compute coefficients for an existing set of basis vectors simply copy W 55 | to snmf_mdl.W, and set compute_w to False: 56 | 57 | >>> data = np.array([[1.5], [1.2]]) 58 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) 59 | >>> snmf_mdl = SNMF(data, num_bases=2) 60 | >>> snmf_mdl.W = W 61 | >>> snmf_mdl.factorize(niter=1, compute_w=False) 62 | 63 | The result is a set of coefficients snmf_mdl.H, s.t. data = W * snmf_mdl.H. 64 | """ 65 | 66 | 67 | def update_w(self): 68 | W1 = np.dot(self.data[:,:], self.H.T) 69 | W2 = np.dot(self.H, self.H.T) 70 | self.W = np.dot(W1, np.linalg.inv(W2)) 71 | 72 | def update_h(self): 73 | def separate_positive(m): 74 | return (np.abs(m) + m)/2.0 75 | 76 | def separate_negative(m): 77 | return (np.abs(m) - m)/2.0 78 | 79 | XW = np.dot(self.data[:,:].T, self.W) 80 | 81 | WW = np.dot(self.W.T, self.W) 82 | WW_pos = separate_positive(WW) 83 | WW_neg = separate_negative(WW) 84 | 85 | XW_pos = separate_positive(XW) 86 | H1 = (XW_pos + np.dot(self.H.T, WW_neg)).T 87 | 88 | XW_neg = separate_negative(XW) 89 | H2 = (XW_neg + np.dot(self.H.T,WW_pos)).T + 10**-9 90 | 91 | self.H *= np.sqrt(H1/H2) 92 | 93 | if __name__ == "__main__": 94 | import doctest 95 | doctest.testmod() 96 | -------------------------------------------------------------------------------- /pymf/cmeans.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF K-means clustering (unary-convex matrix factorization). 8 | Copyright (C) Christian Thurau, 2010. GNU General Public License (GPL). 9 | """ 10 | 11 | 12 | 13 | import numpy as np 14 | 15 | from . import dist 16 | from .nmf import NMF 17 | 18 | __all__ = ["Cmeans"] 19 | 20 | class Cmeans(NMF): 21 | """ 22 | cmeans(data, num_bases=4) 23 | 24 | 25 | Fuzzy c-means soft clustering. Factorize a data matrix into two matrices s.t. 26 | F = | data - W*H | is minimal. H is restricted to convexity (columns 27 | sum to 1) W is simply the weighted mean over the corresponding samples in 28 | data. Note that the objective function is based on distances (?), hence the 29 | Frobenius norm is probably not a good quality measure. 30 | 31 | Parameters 32 | ---------- 33 | data : array_like, shape (_data_dimension, _num_samples) 34 | the input data 35 | num_bases: int, optional 36 | Number of bases to compute (column rank of W and row rank of H). 
37 | 4 (default) 38 | 39 | 40 | Attributes 41 | ---------- 42 | W : "data_dimension x num_bases" matrix of basis vectors 43 | H : "num bases x num_samples" matrix of coefficients 44 | ferr : frobenius norm (after calling .factorize()) 45 | 46 | Example 47 | ------- 48 | Applying C-means to some rather stupid data set: 49 | 50 | >>> import numpy as np 51 | >>> from cmeans import Cmeans 52 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 53 | >>> cmeans_mdl = Cmeans(data, num_bases=2, niter=10) 54 | >>> cmeans_mdl.initialization() 55 | >>> cmeans_mdl.factorize() 56 | 57 | The basis vectors are now stored in cmeans_mdl.W, the coefficients in cmeans_mdl.H. 58 | To compute coefficients for an existing set of basis vectors simply copy W 59 | to cmeans_mdl.W, and set compute_w to False: 60 | 61 | >>> data = np.array([[1.5], [1.2]]) 62 | >>> W = [[1.0, 0.0], [0.0, 1.0]] 63 | >>> cmeans_mdl = Cmeans(data, num_bases=2) 64 | >>> cmeans_mdl.initialization() 65 | >>> cmeans_mdl.W = W 66 | >>> cmeans_mdl.factorize(compute_w=False, niter=50) 67 | 68 | The result is a set of coefficients kmeans_mdl.H, s.t. data = W * kmeans_mdl.H. 69 | """ 70 | 71 | def update_h(self): 72 | # assign samples to best matching centres ... 73 | m = 1.75 74 | tmp_dist = dist.pdist(self.W, self.data, metric='l2') + self._EPS 75 | self.H[:,:] = 0.0 76 | 77 | for i in range(self._num_bases): 78 | for k in range(self._num_bases): 79 | self.H[i,:] += (tmp_dist[i,:]/tmp_dist[k,:])**(2.0/(m-1)) 80 | 81 | self.H = np.where(self.H>0, 1.0/self.H, 0) 82 | 83 | def update_w(self): 84 | for i in range(self._num_bases): 85 | tmp = (self.H[i:i+1,:] * self.data).sum(axis=1) 86 | self.W[:,i] = tmp/(self.H[i,:].sum() + self._EPS) 87 | -------------------------------------------------------------------------------- /pymf/sivm_cur.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF Simplex Volume Maximization for CUR [1] 8 | 9 | SIVMCUR: class for SiVM-CUR 10 | 11 | [1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume 12 | Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int. 13 | Conf. on Information and Knowledge Management. ACM. 2010. 14 | """ 15 | 16 | 17 | import numpy as np 18 | import scipy 19 | from .sivm import SIVM 20 | from .cur import CUR 21 | 22 | __all__ = ["SIVM_CUR"] 23 | 24 | class SIVM_CUR(CUR): 25 | ''' 26 | SIVM_CUR(data, num_bases=4, dist_measure='l2') 27 | 28 | Simplex Volume based CUR Decomposition. Factorize a data matrix into three 29 | matrices s.t. F = | data - USV| is minimal. Unlike CUR, SIVMCUR selects the 30 | rows and columns using SIVM, i.e. it tries to maximize the volume of the 31 | enclosed simplex. 32 | 33 | Parameters 34 | ---------- 35 | data : array_like [data_dimension x num_samples] 36 | the input data 37 | rrank: int, optional 38 | Number of rows to sample from data. 39 | 4 (default)crank 40 | crank: int, optional 41 | Number of columns to sample from data. 42 | 4 (default) 43 | dist_measure: string, optional 44 | The distance measure for finding the next best candidate that 45 | maximizes the simplex volume ['l2','l1','cosine','sparse_graph_l2'] 46 | 'l2' (default) 47 | 48 | Attributes 49 | ---------- 50 | U,S,V : submatrices s.t. 
data = USV
51 | 
52 |     Example
53 |     -------
54 |     >>> import numpy as np
55 |     >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
56 |     >>> sivmcur_mdl = SIVM_CUR(data, show_progress=False, rrank=1, crank=2)
57 |     >>> sivmcur_mdl.factorize()
58 |     '''
59 | 
60 |     def __init__(self, data, k=-1, rrank=0, crank=0, dist_measure='l2', init='origin'):
61 |         CUR.__init__(self, data, k=k, rrank=rrank, crank=crank)
62 |         self._dist_measure = dist_measure
63 |         self.init = init
64 | 
65 |     def sample(self, A, c):
66 |         # for optimizing the volume of the submatrix, set init to 'origin' (otherwise the volume of
67 |         # the ordinary simplex would be optimized)
68 |         sivm_mdl = SIVM(A, num_bases=c, dist_measure=self._dist_measure,
69 |                         init=self.init)
70 |         sivm_mdl.factorize(show_progress=False, compute_w=True, niter=1,
71 |                            compute_h=False, compute_err=False)
72 | 
73 |         return sivm_mdl.select
74 | 
75 | 
76 |     def factorize(self):
77 |         """ Factorize s.t. CUR = data
78 | 
79 |         Updated Values
80 |         --------------
81 |         .C : updated values for C.
82 |         .U : updated values for U.
83 |         .R : updated values for R.
84 |         """
85 |         # sample row and column indices that maximize the volume of the submatrix
86 |         self._rid = self.sample(self.data.transpose(), self._rrank)
87 |         self._cid = self.sample(self.data, self._crank)
88 | 
89 |         self._rcnt = np.ones(len(self._rid))
90 |         self._ccnt = np.ones(len(self._cid))
91 | 
92 |         self.computeUCR()
93 | 
94 | 
95 | if __name__ == "__main__":
96 |     import doctest
97 |     doctest.testmod()
--------------------------------------------------------------------------------
/pymf/nmfals.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | #
 3 | # Copyright (C) Christian Thurau, 2010.
 4 | # Licensed under the GNU General Public License (GPL).
 5 | # http://www.gnu.org/licenses/gpl.txt
 6 | """
 7 | PyMF Non-negative Matrix Factorization.
 8 | 
 9 | NMFALS: Class for Non-negative Matrix Factorization using alternating least
10 | squares optimization (requires cvxopt)
11 | 
12 | [1] Lee, D. D. and Seung, H. S. (1999), Learning the Parts of Objects by Non-negative
13 | Matrix Factorization, Nature 401(6755), 788-799.
14 | """
15 | 
16 | 
17 | 
18 | import numpy as np
19 | from cvxopt import solvers, base
20 | from .nmf import NMF
21 | 
22 | __all__ = ["NMFALS"]
23 | 
24 | class NMFALS(NMF):
25 |     """
26 |     NMFALS(data, num_bases=4)
27 | 
28 | 
29 |     Non-negative Matrix Factorization. Factorize a data matrix into two matrices
30 |     s.t. F = | data - W*H | is minimal. H and W are restricted to non-negative
31 |     values. Uses an alternating least squares procedure (quite slow for larger
32 |     data sets)
33 | 
34 |     Parameters
35 |     ----------
36 |     data : array_like, shape (_data_dimension, _num_samples)
37 |         the input data
38 |     num_bases: int, optional
39 |         Number of bases to compute (column rank of W and row rank of H).
40 |         4 (default)
41 | 
42 |     Attributes
43 |     ----------
44 |     W : "data_dimension x num_bases" matrix of basis vectors
45 |     H : "num bases x num_samples" matrix of coefficients
46 |     ferr : frobenius norm (after calling .factorize())
47 | 
48 |     Example
49 |     -------
50 |     Applying NMF to some rather stupid data set:
51 | 
52 |     >>> import numpy as np
53 |     >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
54 |     >>> nmf_mdl = NMFALS(data, num_bases=2)
55 |     >>> nmf_mdl.factorize(niter=10)
56 | 
57 |     The basis vectors are now stored in nmf_mdl.W, the coefficients in nmf_mdl.H.
58 | To compute coefficients for an existing set of basis vectors simply copy W 59 | to nmf_mdl.W, and set compute_w to False: 60 | 61 | >>> data = np.array([[1.5], [1.2]]) 62 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) 63 | >>> nmf_mdl = NMFALS(data, num_bases=2) 64 | >>> nmf_mdl.W = W 65 | >>> nmf_mdl.factorize(niter=1, compute_w=False) 66 | 67 | The result is a set of coefficients nmf_mdl.H, s.t. data = W * nmf_mdl.H. 68 | """ 69 | 70 | def update_h(self): 71 | def updatesingleH(i): 72 | # optimize alpha using qp solver from cvxopt 73 | FA = base.matrix(np.float64(np.dot(-self.W.T, self.data[:,i]))) 74 | al = solvers.qp(HA, FA, INQa, INQb) 75 | self.H[:,i] = np.array(al['x']).reshape((1,-1)) 76 | 77 | # float64 required for cvxopt 78 | HA = base.matrix(np.float64(np.dot(self.W.T, self.W))) 79 | INQa = base.matrix(-np.eye(self._num_bases)) 80 | INQb = base.matrix(0.0, (self._num_bases,1)) 81 | 82 | map(updatesingleH, xrange(self._num_samples)) 83 | 84 | 85 | def update_w(self): 86 | def updatesingleW(i): 87 | # optimize alpha using qp solver from cvxopt 88 | FA = base.matrix(np.float64(np.dot(-self.H, self.data[i,:].T))) 89 | al = solvers.qp(HA, FA, INQa, INQb) 90 | self.W[i,:] = np.array(al['x']).reshape((1,-1)) 91 | 92 | # float64 required for cvxopt 93 | HA = base.matrix(np.float64(np.dot(self.H, self.H.T))) 94 | INQa = base.matrix(-np.eye(self._num_bases)) 95 | INQb = base.matrix(0.0, (self._num_bases,1)) 96 | 97 | map(updatesingleW, xrange(self._data_dimension)) 98 | -------------------------------------------------------------------------------- /pymf/rnmf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF Non-negative Matrix Factorization. 8 | 9 | NMF: Class for Non-negative Matrix Factorization 10 | 11 | [1] Lee, D. D. and Seung, H. S. (1999), Learning the Parts of Objects by Non-negative 12 | Matrix Factorization, Nature 401(6755), 788-799. 13 | """ 14 | 15 | 16 | import numpy as np 17 | import logging 18 | import logging.config 19 | import scipy.sparse 20 | 21 | from .nmf import NMF 22 | 23 | __all__ = ["RNMF"] 24 | 25 | class RNMF(NMF): 26 | """ 27 | RNMF(data, num_bases=4) 28 | 29 | 30 | Non-negative Matrix Factorization. Factorize a data matrix into two matrices 31 | s.t. F = | data - W*H | = | is minimal. H, and W are restricted to non-negative 32 | data. Uses the classicial multiplicative update rule. 33 | 34 | Parameters 35 | ---------- 36 | data : array_like, shape (_data_dimension, _num_samples) 37 | the input data 38 | num_bases: int, optional 39 | Number of bases to compute (column rank of W and row rank of H). 40 | 4 (default) 41 | 42 | Attributes 43 | ---------- 44 | W : "data_dimension x num_bases" matrix of basis vectors 45 | H : "num bases x num_samples" matrix of coefficients 46 | ferr : frobenius norm (after calling .factorize()) 47 | 48 | Example 49 | ------- 50 | Applying NMF to some rather stupid data set: 51 | 52 | >>> import numpy as np 53 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 54 | >>> nmf_mdl = NMF(data, num_bases=2, niter=10) 55 | >>> nmf_mdl.factorize() 56 | 57 | The basis vectors are now stored in nmf_mdl.W, the coefficients in nmf_mdl.H. 
58 | To compute coefficients for an existing set of basis vectors simply copy W 59 | to nmf_mdl.W, and set compute_w to False: 60 | 61 | >>> data = np.array([[1.5], [1.2]]) 62 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) 63 | >>> nmf_mdl = NMF(data, num_bases=2) 64 | >>> nmf_mdl.W = W 65 | >>> nmf_mdl.factorize(niter=20, compute_w=False) 66 | 67 | The result is a set of coefficients nmf_mdl.H, s.t. data = W * nmf_mdl.H. 68 | """ 69 | 70 | def __init__(self, data, num_bases=4, lamb=2.0): 71 | # call inherited method 72 | NMF.__init__(self, data, num_bases=num_bases) 73 | self._lamb = lamb 74 | 75 | def soft_thresholding(self, X, lamb): 76 | X = np.where(np.abs(X) <= lamb, 0.0, X) 77 | X = np.where(X > lamb, X - lamb, X) 78 | X = np.where(X < -1.0*lamb, X + lamb, X) 79 | return X 80 | 81 | def init_w(self): 82 | self.W = np.random.random((self._data_dimension, self._num_bases)) 83 | 84 | def init_h(self): 85 | self.H = np.random.random((self._num_bases, self._num_samples)) 86 | self.H[:,:] = 1.0 87 | # normalized bases 88 | Wnorm = np.sqrt(np.sum(self.W**2.0, axis=0)) 89 | self.W /= Wnorm 90 | 91 | for i in range(self.H.shape[0]): 92 | self.H[i,:] *= Wnorm[i] 93 | 94 | self.update_s() 95 | 96 | def update_s(self): 97 | self.S = self.data - np.dot(self.W, self.H) 98 | self.S = self.soft_thresholding(self.S, self._lamb) 99 | 100 | def update_h(self): 101 | # pre init H1, and H2 (necessary for storing matrices on disk) 102 | H1 = np.dot(self.W.T, self.S - self.data) 103 | H1 = np.abs(H1) - H1 104 | H1 /= (2.0* np.dot(self.W.T, np.dot(self.W, self.H))) 105 | self.H *= H1 106 | 107 | # adapt S 108 | self.update_s() 109 | 110 | def update_w(self): 111 | # pre init W1, and W2 (necessary for storing matrices on disk) 112 | W1 = np.dot(self.S - self.data, self.H.T) 113 | #W1 = np.dot(self.data - self.S, self.H.T) 114 | W1 = np.abs(W1) - W1 115 | W1 /= (2.0 * (np.dot(self.W, np.dot(self.H, self.H.T)))) 116 | self.W *= W1 117 | 118 | if __name__ == "__main__": 119 | import doctest 120 | doctest.testmod() 121 | -------------------------------------------------------------------------------- /pymf/nndsvd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | #$Id$ 7 | """ 8 | PyMF Non-negative Double Singular Value Decompositions. 9 | 10 | NNDSVD: Class for Non-negative Double Singular Value Decompositions [1] 11 | 12 | [1] C. Boutsidis and E. Gallopoulos (2008), SVD based initialization: A head 13 | start for nonnegative matrix factorization, Pattern Recognition, 41, 1350-1362 14 | """ 15 | 16 | 17 | import numpy as np 18 | 19 | from .nmf import NMF 20 | from .svd import SVD 21 | 22 | __all__ = ["NNDSVD"] 23 | 24 | class NNDSVD(NMF): 25 | """ 26 | NNDSVD(data, num_bases=4) 27 | 28 | 29 | Non-negative Double Singular Value Decompositions. Factorize a data 30 | matrix into two matrices s.t. F = | data - W*H | = | is minimal. H, and 31 | W are restricted to non-negative data. NNDSVD is primarily used for 32 | initializing NMF. 33 | 34 | Parameters 35 | ---------- 36 | data : array_like, shape (_data_dimension, _num_samples) 37 | the input data 38 | num_bases: int, optional 39 | Number of bases to compute (column rank of W and row rank of H). 
40 | 4 (default) 41 | 42 | Attributes 43 | ---------- 44 | W : "data_dimension x num_bases" matrix of basis vectors 45 | H : "num bases x num_samples" matrix of coefficients 46 | ferr : frobenius norm (after calling .factorize()) 47 | 48 | Example 49 | ------- 50 | Applying NNDSVD to some rather stupid data set: 51 | 52 | >>> import numpy as np 53 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 54 | >>> nndsvd_mdl = NNDSVD(data, num_bases=2) 55 | >>> nndsvd_mdl.factorize() 56 | 57 | The basis vectors are now stored in nndsvd_mdl.W, the coefficients in 58 | nndsvd_mdl.H. To initialize NMF with nndsvd_mdl.W, nndsvd_mdl.H 59 | simply copy W to nmf_mdl.W and H to nmf_mdl.H: 60 | 61 | >>> data = np.array([[1.5], [1.2]]) 62 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) 63 | >>> nmf_mdl = NMF(data, num_bases=2) 64 | >>> nmf_mdl.W = nndsvd_mdl.W 65 | >>> nmf_mdl.H = nndsvd_mdl.H 66 | >>> nmf_mdl.factorize(niter=20) 67 | 68 | The result is a set of (more optimal) coefficients nmf_mdl.H, nmf_mdl.W. 69 | """ 70 | def init_w(self): 71 | self.W = np.zeros((self._data_dimension, self._num_bases)) 72 | 73 | def init_h(self): 74 | self.H = np.zeros((self._num_bases, self._num_samples)) 75 | 76 | def update_h(self): 77 | pass 78 | 79 | def update_w(self): 80 | svd_mdl = SVD(self.data) 81 | svd_mdl.factorize() 82 | 83 | U, S, V = svd_mdl.U, svd_mdl.S, svd_mdl.V 84 | 85 | # The first left singular vector is nonnegative 86 | # (abs is only used as values could be all negative) 87 | self.W[:,0] = np.sqrt(S[0,0]) * np.abs(U[:,0]) 88 | 89 | #The first right singular vector is nonnegative 90 | self.H[0,:] = np.sqrt(S[0,0]) * np.abs(V[0,:].T) 91 | 92 | for i in range(1,self._num_bases): 93 | # Form the rank one factor 94 | Tmp = np.dot(U[:,i:i+1]*S[i,i], V[i:i+1,:]) 95 | 96 | # zero out the negative elements 97 | Tmp = np.where(Tmp < 0, 0.0, Tmp) 98 | 99 | # Apply 2nd SVD 100 | svd_mdl_2 = SVD(Tmp) 101 | svd_mdl_2.factorize() 102 | u, s, v = svd_mdl_2.U, svd_mdl_2.S, svd_mdl_2.V 103 | 104 | # The first left singular vector is nonnegative 105 | self.W[:,i] = np.sqrt(s[0,0]) * np.abs(u[:,0]) 106 | 107 | #The first right singular vector is nonnegative 108 | self.H[i,:] = np.sqrt(s[0,0]) * np.abs(v[0,:].T) 109 | 110 | def factorize(self, niter=1, show_progress=False, 111 | compute_w=True, compute_h=True, compute_err=True): 112 | 113 | # enforce certain default values, otherwise it won't work 114 | NMF.factorize(self, niter=1, show_progress=show_progress, 115 | compute_w=True, compute_h=True, compute_err=compute_err) 116 | 117 | if __name__ == "__main__": 118 | import doctest 119 | doctest.testmod() 120 | -------------------------------------------------------------------------------- /tests/test_pymf.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | ## pymf - Python Matrix Factorization library 3 | ## Copyright (C) 2010 Christian Thurau 4 | ## 5 | ## This library is free software; you can redistribute it and/or 6 | ## modify it under the terms of the GNU Library General Public 7 | ## License as published by the Free Software Foundation; either 8 | ## version 2 of the License, or (at your option) any later version. 9 | ## 10 | ## This library is distributed in the hope that it will be useful, 11 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | ## Library General Public License for more details. 
14 | ##
15 | ## You should have received a copy of the GNU Library General Public
16 | ## License along with this library; if not, write to the Free Software
17 | ## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 | ##
19 | ## Christian Thurau
20 | ## cthurau@gmail.com
21 | """
22 | 
23 | """
24 | 
25 | import pytest
26 | import pymf
27 | import time
28 | import numpy as np
29 | import scipy.sparse
30 | 
31 | 
32 | np.random.seed(400401)
33 | A = np.random.random((3, 50)) + 2.0
34 | B = scipy.sparse.csc_matrix(A)
35 | 
36 | 
37 | @pytest.mark.parametrize("A", [A, B])
38 | def test_pinv(A):
39 |     pymf.pinv(A)
40 | 
41 | 
42 | @pytest.mark.parametrize("A,func", [
43 |     (A, pymf.SVD),       # 'Singular Value Decomposition (SVD)', 'c<'
44 |     (A.T, pymf.SVD),     # 'Singular Value Decomposition (SVD)', 'c<'
45 |     (B, pymf.SVD),       # 'svd sparse', 'c<'
46 |     (A, pymf.CUR),       # 'CUR Matrix Decomposition', 'b<'
47 |     (B, pymf.CUR),       # 'CUR Matrix Decomposition (sparse data)', 'b<'
48 |     (A, pymf.CMD),       # 'Compact Matrix Decomposition (CMD)', 'm<'
49 |     (B, pymf.CMD),       # 'Compact Matrix Decomposition (CMD - sparse data)', 'm<'
50 |     (A, pymf.SIVM_CUR),  # 'Simplex Volume Maximization f. CUR (SIVMCUR)', 'm<'
51 |     (A, pymf.SIVM_CUR),  # 'Simplex Volume Maximization f. CUR (SIVMCUR)', 'm<'
52 | ])
53 | def test_svd(A, func):
54 |     stime = time.time()
55 |     m = func(A, rrank=2, crank=2)
56 |     m.factorize()
57 |     fro_norm = m.frobenius_norm()/(A.shape[0] + A.shape[1])
58 | 
59 |     assert fro_norm < 0.1
60 |     print 'Fro.: %f, elapsed %f' % (fro_norm, time.time() - stime)
61 | 
62 | 
63 | @pytest.mark.parametrize("A,func,niter,num_bases", [
64 |     (A, pymf.SIVM_SEARCH, 20, 2),   # 'SIVM_SEARCH', 'c<', num_bases=2
65 |     (A, pymf.SIVM_GSAT, 20, 4),     # 'SIVM_GSAT ', 'c<'
66 |     (A, pymf.SIVM_SGREEDY, 20, 4),  # 'SIVM Greedy ', 'c<'
67 |     (A, pymf.GMAP, 20, 4),          # 'GMAP ', 'c<'
68 |     (A, pymf.PCA, 20, 4),           # 'Principal Component Analysis (PCA)', 'c<'
69 |     (A, pymf.NMF, 20, 4),           # 'Non-negative Matrix Factorization (NMF)', 'rs'
70 |     (A, pymf.NMFALS, 10, 4),        # 'NMF u. alternating least squares (NMFALS)', 'rs', niter=10
71 |     (A, pymf.NMFNNLS, 10, 4),       # 'NMF u. non-neg.
least squares (NMFNNLS)', 'rs', niter=10 72 | (A, pymf.LAESA, 20, 4), # 'Linear Approximating Eliminating Search Algorithm (LAESA)', 'rs' 73 | (A, pymf.SIVM, 20, 4), # 'Simplex Volume Maximization (SIVM)', 'bs' 74 | (A, pymf.Kmeans, 20, 4), # 'K-means clustering (Kmeans)', 'b*' 75 | (A, pymf.Cmeans, 20, 4), # 'C-means clustering (Cmeans)', 'b*' 76 | (A, pymf.AA, 20, 4), # 'Archetypal Analysis (AA)', 'bs' 77 | (A, pymf.SNMF, 20, 4), # 'Semi Non-negative Matrix Factorization (SNMF)', 'bo' 78 | (A, pymf.CNMF, 20, 4), # 'Convex non-negative Matrix Factorization (CNMF)', 'c<' 79 | (A, pymf.CHNMF, 20, 4), # 'Convex-hull non-negative Matrix Factorization (CHNMF)', 'm*' 80 | (np.round(A-2.0), pymf.BNMF, 20, 4), # 'Binary Matrix Factorization (BNMF)', 'b>' 81 | ]) 82 | def test(A, func, niter, num_bases): 83 | stime = time.time() 84 | m = func(A, num_bases=num_bases) 85 | m.factorize(show_progress=True, niter=niter) 86 | fro_norm = m.ferr[-1]/(A.shape[0] + A.shape[1]) 87 | 88 | assert fro_norm < 0.1 89 | print 'Fro.: %d, elapsed %d' % (fro_norm, time.time() - stime) 90 | 91 | stime = time.time() 92 | m.factorize(show_progress=False, compute_h=False, niter=niter) 93 | m.factorize(show_progress=False, compute_w=False, niter=niter) 94 | m.factorize(show_progress=False, compute_err=False, niter=niter) 95 | m.factorize(show_progress=True, niter=20) 96 | 97 | print ' additional tests - elapsed:', time.time() - stime 98 | -------------------------------------------------------------------------------- /pymf/dist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF several distance functions 8 | 9 | kl_divergence(): KL Divergence 10 | l1_distance(): L1 distance 11 | l2_distance(): L2 distance 12 | cosine_distance(): Cosine distance 13 | pdist(): Pairwise distance computation 14 | vq(): Vector quantization 15 | 16 | """ 17 | 18 | 19 | import numpy as np 20 | import scipy.sparse 21 | 22 | __all__ = ["abs_cosine_distance", "kl_divergence", "l1_distance", "l2_distance", 23 | "weighted_abs_cosine_distance","cosine_distance","vq", "pdist"] 24 | 25 | def kl_divergence(d, vec): 26 | b = vec*(1/d) 27 | b = np.where(b>0, np.log(b),0) 28 | b = vec * b 29 | b = np.sum(b - vec + d, axis=0).reshape((-1)) 30 | return b 31 | 32 | def l1_distance(d, vec): 33 | ret_val = np.sum(np.abs(d - vec), axis=0) 34 | ret_val = ret_val.reshape((-1)) 35 | return ret_val 36 | 37 | def sparse_l2_distance(d, vec): 38 | # compute the norm of d 39 | nd = (d.multiply(d)).sum(axis=0) 40 | nv = (vec.multiply(vec)).sum(axis=0) 41 | ret_val = nd + nv - 2.0*(d.T * vec).T 42 | return np.sqrt(ret_val) 43 | 44 | def approx_l2_distance(d, vec): 45 | # Use random projections to approximate the conventional l2 distance 46 | k = np.round(np.log(d.shape[0])) 47 | #k = d.shape[0] 48 | R = np.random.randn(k, d.shape[0]) 49 | R = R / np.sqrt((R**2).sum(axis=0)) 50 | A = np.dot(R,d) 51 | B = np.dot(R, vec) 52 | ret_val = np.sum( (A - B)**2, axis=0) 53 | ret_val = np.sqrt(R.shape[1]/R.shape[0]) * np.sqrt(ret_val) 54 | ret_val = ret_val.reshape((-1)) 55 | return ret_val 56 | 57 | def l2_distance(d, vec): 58 | if scipy.sparse.issparse(d): 59 | ret_val = sparse_l2_distance(d, vec) 60 | else: 61 | ret_val = np.sqrt(((d[:,:] - vec)**2).sum(axis=0)) 62 | 63 | return ret_val.reshape((-1)) 64 | 65 | def l2_distance_new(d,vec): 66 | # compute the 
norm of d 67 | nd = (d**2).sum(axis=0) 68 | nv = (vec**2).sum(axis=0) 69 | ret_val = nd + nv - 2.0*np.dot(d.T,vec.reshape((-1,1))).T 70 | 71 | return np.sqrt(ret_val) 72 | 73 | def cosine_distance(d, vec): 74 | tmp = np.dot(np.transpose(d), vec) 75 | a = np.sqrt(np.sum(d**2, axis=0)) 76 | b = np.sqrt(np.sum(vec**2)) 77 | k = (a*b).reshape(-1) + (10**-9) 78 | 79 | # compute distance 80 | ret_val = 1.0 - tmp/k 81 | 82 | return ret_val.reshape((-1)) 83 | 84 | def abs_cosine_distance(d, vec, weighted=False): 85 | if scipy.sparse.issparse(d): 86 | tmp = np.array((d.T * vec).todense(), dtype=np.float32).reshape(-1) 87 | a = np.sqrt(np.array(d.multiply(d).sum(axis=0), dtype=np.float32).reshape(-1)) 88 | b = np.sqrt(np.array(vec.multiply(vec).sum(axis=0), dtype=np.float32).reshape(-1)) 89 | else: 90 | tmp = np.dot(np.transpose(d), vec).reshape(-1) 91 | a = np.sqrt(np.sum(d**2, axis=0)).reshape(-1) 92 | b = np.sqrt(np.sum(vec**2)).reshape(-1) 93 | 94 | k = (a*b).reshape(-1) + 10**-9 95 | 96 | # compute distance 97 | ret_val = 1.0 - np.abs(tmp/k) 98 | 99 | if weighted: 100 | ret_val = ret_val * a 101 | return ret_val.reshape((-1)) 102 | 103 | def weighted_abs_cosine_distance(d, vec): 104 | ret_val = abs_cosine_distance(d, vec, weighted=True) 105 | return ret_val 106 | 107 | def pdist(A, B, metric='l2' ): 108 | # compute pairwise distance between a data matrix A (d x n) and B (d x m). 109 | # Returns a distance matrix d (n x m). 110 | d = np.zeros((A.shape[1], B.shape[1])) 111 | if A.shape[1] <= B.shape[1]: 112 | for aidx in xrange(A.shape[1]): 113 | if metric == 'l2': 114 | d[aidx:aidx+1,:] = l2_distance(B[:,:], A[:,aidx:aidx+1]).reshape((1,-1)) 115 | if metric == 'l1': 116 | d[aidx:aidx+1,:] = l1_distance(B[:,:], A[:,aidx:aidx+1]).reshape((1,-1)) 117 | else: 118 | for bidx in xrange(B.shape[1]): 119 | if metric == 'l2': 120 | d[:, bidx:bidx+1] = l2_distance(A[:,:], B[:,bidx:bidx+1]).reshape((-1,1)) 121 | if metric == 'l1': 122 | d[:, bidx:bidx+1] = l1_distance(A[:,:], B[:,bidx:bidx+1]).reshape((-1,1)) 123 | 124 | return d 125 | 126 | def vq(A, B, metric='l2'): 127 | # assigns data samples in B to cluster centers A and 128 | # returns an index list [assume n column vectors, d x n] 129 | assigned = np.argmin(pdist(A,B, metric=metric), axis=0) 130 | return assigned 131 | -------------------------------------------------------------------------------- /pymf/pca.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF Principal Component Analysis. 8 | 9 | PCA: Class for Principal Component Analysis 10 | """ 11 | 12 | 13 | 14 | import numpy as np 15 | 16 | from .nmf import NMF 17 | from .svd import SVD 18 | 19 | 20 | __all__ = ["PCA"] 21 | 22 | class PCA(NMF): 23 | """ 24 | PCA(data, num_bases=4, center_mean=True) 25 | 26 | 27 | Archetypal Analysis. Factorize a data matrix into two matrices s.t. 28 | F = | data - W*H | is minimal. W is set to the eigenvectors of the 29 | data covariance. 30 | 31 | Parameters 32 | ---------- 33 | data : array_like, shape (_data_dimension, _num_samples) 34 | the input data 35 | num_bases: int, optional 36 | Number of bases to compute (column rank of W and row rank of H). 37 | 4 (default) 38 | center_mean: bool, True 39 | Make sure that the data is centred around the mean. 
40 | 41 | Attributes 42 | ---------- 43 | W : "data_dimension x num_bases" matrix of basis vectors 44 | H : "num bases x num_samples" matrix of coefficients 45 | ferr : frobenius norm (after calling .factorize()) 46 | 47 | Example 48 | ------- 49 | Applying PCA to some rather stupid data set: 50 | 51 | >>> import numpy as np 52 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 53 | >>> pca_mdl = PCA(data, num_bases=2) 54 | >>> pca_mdl.factorize() 55 | 56 | The basis vectors are now stored in pca_mdl.W, the coefficients in pca_mdl.H. 57 | To compute coefficients for an existing set of basis vectors simply copy W 58 | to pca_mdl.W, and set compute_w to False: 59 | 60 | >>> data = np.array([[1.5], [1.2]]) 61 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) 62 | >>> pca_mdl = PCA(data, num_bases=2) 63 | >>> pca_mdl.W = W 64 | >>> pca_mdl.factorize(compute_w=False) 65 | 66 | The result is a set of coefficients pca_mdl.H, s.t. data = W * pca_mdl.H. 67 | """ 68 | 69 | def __init__(self, data, num_bases=0, center_mean=True): 70 | 71 | NMF.__init__(self, data, num_bases=num_bases) 72 | 73 | # center the data around the mean first 74 | self._center_mean = center_mean 75 | 76 | if self._center_mean: 77 | # copy the data before centering it 78 | self._data_orig = data 79 | self._meanv = self._data_orig[:,:].mean(axis=1).reshape(data.shape[0],-1) 80 | self.data = self._data_orig - self._meanv 81 | else: 82 | self.data = data 83 | 84 | def init_h(self): 85 | pass 86 | 87 | def init_w(self): 88 | pass 89 | 90 | def update_h(self): 91 | self.H = np.dot(self.W.T, self.data[:,:]) 92 | 93 | def update_w(self): 94 | # compute eigenvectors and eigenvalues using SVD 95 | svd_mdl = SVD(self.data) 96 | svd_mdl.factorize() 97 | 98 | # argsort sorts in ascending order -> do reverese indexing 99 | # for accesing values in descending order 100 | S = np.diag(svd_mdl.S) 101 | order = np.argsort(S)[::-1] 102 | 103 | # select only a few eigenvectors ... 104 | if self._num_bases >0: 105 | order = order[:self._num_bases] 106 | 107 | self.W = svd_mdl.U[:,order] 108 | self.eigenvalues = S[order] 109 | 110 | def factorize(self, show_progress=False, compute_w=True, compute_h=True, 111 | compute_err=True, niter=1): 112 | """ Factorize s.t. WH = data 113 | 114 | Parameters 115 | ---------- 116 | show_progress : bool 117 | print some extra information to stdout. 118 | compute_h : bool 119 | iteratively update values for H. 120 | compute_w : bool 121 | iteratively update values for W. 122 | compute_err : bool 123 | compute Frobenius norm |data-WH| after each update and store 124 | it to .ferr[k]. 125 | 126 | Updated Values 127 | -------------- 128 | .W : updated values for W. 129 | .H : updated values for H. 130 | .ferr : Frobenius norm |data-WH|. 131 | """ 132 | 133 | NMF.factorize(self, niter=1, show_progress=show_progress, 134 | compute_w=compute_w, compute_h=compute_h, 135 | compute_err=compute_err) 136 | 137 | 138 | if __name__ == "__main__": 139 | import doctest 140 | doctest.testmod() 141 | -------------------------------------------------------------------------------- /pymf/bnmf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF Binary Matrix Factorization [1] 8 | 9 | BNMF(NMF) : Class for binary matrix factorization 10 | 11 | [1]Z. Zhang, T. Li, C. H. Q. Ding, X. 
Zhang: Binary Matrix Factorization with 12 | Applications. ICDM 2007 13 | """ 14 | 15 | 16 | import numpy as np 17 | from .nmf import NMF 18 | 19 | __all__ = ["BNMF"] 20 | 21 | class BNMF(NMF): 22 | """ 23 | BNMF(data, data, num_bases=4) 24 | Binary Matrix Factorization. Factorize a data matrix into two matrices s.t. 25 | F = | data - W*H | is minimal. H and W are restricted to binary values. 26 | 27 | Parameters 28 | ---------- 29 | data : array_like, shape (_data_dimension, _num_samples) 30 | the input data 31 | num_bases: int, optional 32 | Number of bases to compute (column rank of W and row rank of H). 33 | 4 (default) 34 | 35 | Attributes 36 | ---------- 37 | W : "data_dimension x num_bases" matrix of basis vectors 38 | H : "num bases x num_samples" matrix of coefficients 39 | ferr : frobenius norm (after calling .factorize()) 40 | 41 | Example 42 | ------- 43 | Applying BNMF to some rather stupid data set: 44 | 45 | >>> import numpy as np 46 | >>> from bnmf import BNMF 47 | >>> data = np.array([[1.0, 0.0, 1.0], [0.0, 1.0, 1.0]]) 48 | 49 | Use 2 basis vectors -> W shape(data_dimension, 2). 50 | 51 | >>> bnmf_mdl = BNMF(data, num_bases=2) 52 | 53 | Set number of iterations to 5 and start computing the factorization. 54 | 55 | >>> bnmf_mdl.factorize(niter=5) 56 | 57 | The basis vectors are now stored in bnmf_mdl.W, the coefficients in bnmf_mdl.H. 58 | To compute coefficients for an existing set of basis vectors simply copy W 59 | to bnmf_mdl.W, and set compute_w to False: 60 | 61 | >>> data = np.array([[0.0], [1.0]]) 62 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) 63 | >>> bnmf_mdl = BNMF(data, num_bases=2) 64 | >>> bnmf_mdl.W = W 65 | >>> bnmf_mdl.factorize(niter=10, compute_w=False) 66 | 67 | The result is a set of coefficients bnmf_mdl.H, s.t. data = W * bnmf_mdl.H. 68 | """ 69 | 70 | # controls how fast lambda should increase: 71 | # this influence convergence to binary values during the update. A value 72 | # <1 will result in non-binary decompositions as the update rule effectively 73 | # is a conventional nmf update rule. Values >1 give more weight to making the 74 | # factorization binary with increasing iterations. 75 | # setting either W or H to 0 results make the resulting matrix non binary. 76 | _LAMB_INCREASE_W = 1.1 77 | _LAMB_INCREASE_H = 1.1 78 | 79 | def update_h(self): 80 | H1 = np.dot(self.W.T, self.data[:,:]) + 3.0*self._lamb_H*(self.H**2) 81 | H2 = np.dot(np.dot(self.W.T,self.W), self.H) + 2*self._lamb_H*(self.H**3) + self._lamb_H*self.H + 10**-9 82 | self.H *= H1/H2 83 | 84 | self._lamb_W = self._LAMB_INCREASE_W * self._lamb_W 85 | self._lamb_H = self._LAMB_INCREASE_H * self._lamb_H 86 | 87 | def update_w(self): 88 | W1 = np.dot(self.data[:,:], self.H.T) + 3.0*self._lamb_W*(self.W**2) 89 | W2 = np.dot(self.W, np.dot(self.H, self.H.T)) + 2.0*self._lamb_W*(self.W**3) + self._lamb_W*self.W + 10**-9 90 | self.W *= W1/W2 91 | 92 | def factorize(self, niter=10, compute_w=True, compute_h=True, 93 | show_progress=False, compute_err=True): 94 | """ Factorize s.t. WH = data 95 | 96 | Parameters 97 | ---------- 98 | niter : int 99 | number of iterations. 100 | show_progress : bool 101 | print some extra information to stdout. 102 | compute_h : bool 103 | iteratively update values for H. 104 | compute_w : bool 105 | iteratively update values for W. 106 | compute_err : bool 107 | compute Frobenius norm |data-WH| after each update and store 108 | it to .ferr[k]. 109 | 110 | Updated Values 111 | -------------- 112 | .W : updated values for W. 113 | .H : updated values for H. 
114 | .ferr : Frobenius norm |data-WH| for each iteration.
115 | """
116 | 
117 | # init some learning parameters
118 | self._lamb_W = 1.0/niter
119 | self._lamb_H = 1.0/niter
120 | 
121 | NMF.factorize(self, niter=niter, compute_w=compute_w,
122 | compute_h=compute_h, show_progress=show_progress,
123 | compute_err=compute_err)
124 | 
125 | if __name__ == "__main__":
126 | import doctest
127 | doctest.testmod()
128 | 
-------------------------------------------------------------------------------- /pymf/cur.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright (C) Christian Thurau, 2010.
4 | # Licensed under the GNU General Public License (GPL).
5 | # http://www.gnu.org/licenses/gpl.txt
6 | """
7 | PyMF CUR Decomposition [1]
8 | 
9 | CUR(SVD) : Class for CUR Decomposition
10 | 
11 | [1] Drineas, P., Kannan, R. and Mahoney, M. (2006), 'Fast Monte Carlo Algorithms for Matrices III: Computing
12 | a Compressed Approximate Matrix Decomposition', SIAM J. Computing 36(1), 184-206.
13 | """
14 | 
15 | 
16 | import numpy as np
17 | import scipy.sparse
18 | 
19 | from .svd import pinv, SVD
20 | 
21 | 
22 | __all__ = ["CUR"]
23 | 
24 | class CUR(SVD):
25 | """
26 | CUR(data, k=-1, rrank=0, crank=0)
27 | 
28 | CUR Decomposition. Factorize a data matrix into three matrices s.t.
29 | F = | data - USV| is minimal. CUR randomly selects columns and rows from
30 | data for building U and V, respectively.
31 | 
32 | Parameters
33 | ----------
34 | data : array_like [data_dimension x num_samples]
35 | the input data
36 | rrank: int, optional
37 | Number of rows to sample from data.
38 | 0 (default)
39 | crank: int, optional
40 | Number of columns to sample from data.
41 | 0 (default)
42 | k: int, optional
43 | Number of singular values used for the pseudo-inverse computations.
44 | -1 (default)
45 | 
46 | Attributes
47 | ----------
48 | U,S,V : submatrices s.t. data = USV
49 | 
50 | Example
51 | -------
52 | >>> import numpy as np
53 | >>> from cur import CUR
54 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
55 | >>> cur_mdl = CUR(data, rrank=1, crank=2)
56 | >>> cur_mdl.factorize()
57 | """
58 | 
59 | def __init__(self, data, k=-1, rrank=0, crank=0):
60 | SVD.__init__(self, data, k=k, rrank=rrank, crank=crank)
61 | 
62 | # select all data samples for computing the error:
63 | # note that this might take very long, adjust self._rset and self._cset
64 | # for faster computations.
65 | self._rset = range(self._rows)
66 | self._cset = range(self._cols)
67 | 
68 | 
69 | def sample(self, s, probs):
70 | prob_rows = np.cumsum(probs.flatten())
71 | temp_ind = np.zeros(s, np.int32)
72 | 
73 | for i in range(s):
74 | v = np.random.rand()
75 | 
76 | try:
77 | tempI = np.where(prob_rows >= v)[0]
78 | temp_ind[i] = tempI[0]
79 | except IndexError:
80 | temp_ind[i] = len(prob_rows) - 1
81 | 
82 | return np.sort(temp_ind)
83 | 
84 | def sample_probability(self):
85 | 
86 | if scipy.sparse.issparse(self.data):
87 | dsquare = self.data.multiply(self.data)
88 | else:
89 | dsquare = self.data[:,:]**2
90 | 
91 | prow = np.array(dsquare.sum(axis=1), np.float64)
92 | pcol = np.array(dsquare.sum(axis=0), np.float64)
93 | 
94 | prow /= prow.sum()
95 | pcol /= pcol.sum()
96 | 
97 | return (prow.reshape(-1,1), pcol.reshape(-1,1))
98 | 
99 | def computeUCR(self):
100 | # the next lines do NOT work with h5py if CUR is used -> double indices in self._cid or self._rid
101 | # can occur and are not supported by h5py. When using h5py data, always use CMD which ignores
102 | # recurring row/column selections.
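# The sampled columns/rows are rescaled by the square root of their
# selection counts (all ones here, see .factorize()) and the core matrix
# is computed as U = pinv(C) * data * pinv(R).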
103 | 104 | if scipy.sparse.issparse(self.data): 105 | self._C = self.data[:, self._cid] * scipy.sparse.csc_matrix(np.diag(self._ccnt**(1/2))) 106 | self._R = scipy.sparse.csc_matrix(np.diag(self._rcnt**(1/2))) * self.data[self._rid,:] 107 | 108 | self._U = pinv(self._C, self._k) * self.data[:,:] * pinv(self._R, self._k) 109 | 110 | else: 111 | self._C = np.dot(self.data[:, self._cid].reshape((self._rows, len(self._cid))), np.diag(self._ccnt**(1/2))) 112 | self._R = np.dot(np.diag(self._rcnt**(1/2)), self.data[self._rid,:].reshape((len(self._rid), self._cols))) 113 | 114 | self._U = np.dot(np.dot(pinv(self._C, self._k), self.data[:,:]), 115 | pinv(self._R, self._k)) 116 | 117 | # set some standard (with respect to SVD) variable names 118 | self.U = self._C 119 | self.S = self._U 120 | self.V = self._R 121 | 122 | def factorize(self): 123 | """ Factorize s.t. CUR = data 124 | 125 | Updated Values 126 | -------------- 127 | .C : updated values for C. 128 | .U : updated values for U. 129 | .R : updated values for R. 130 | """ 131 | [prow, pcol] = self.sample_probability() 132 | self._rid = self.sample(self._rrank, prow) 133 | self._cid = self.sample(self._crank, pcol) 134 | 135 | self._rcnt = np.ones(len(self._rid)) 136 | self._ccnt = np.ones(len(self._cid)) 137 | 138 | self.computeUCR() 139 | 140 | 141 | if __name__ == "__main__": 142 | import doctest 143 | doctest.testmod() 144 | -------------------------------------------------------------------------------- /pymf/aa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF Archetypal Analysis [1] 8 | 9 | AA: class for Archetypal Analysis 10 | 11 | [1] Cutler, A. Breiman, L. (1994), "Archetypal Analysis", Technometrics 36(4), 12 | 338-347. 13 | """ 14 | 15 | 16 | import numpy as np 17 | from .dist import vq 18 | from cvxopt import solvers, base 19 | 20 | from .svd import pinv 21 | from .nmf import NMF 22 | 23 | __all__ = ["AA"] 24 | 25 | class AA(NMF): 26 | """ 27 | AA(data, num_bases=4) 28 | 29 | Archetypal Analysis. Factorize a data matrix into two matrices s.t. 30 | F = | data - W*H | = | data - data*beta*H| is minimal. H and beta 31 | are restricted to convexity (beta >=0, sum(beta, axis=1) = [1 .. 1]). 32 | Factorization is solved via an alternating least squares optimization 33 | using the quadratic programming solver from cvxopt. 34 | 35 | Parameters 36 | ---------- 37 | data : array_like, shape (_data_dimension, _num_samples) 38 | the input data 39 | num_bases: int, optional 40 | Number of bases to compute (column rank of W and row rank of H). 41 | 4 (default) 42 | 43 | 44 | Attributes 45 | ---------- 46 | W : "data_dimension x num_bases" matrix of basis vectors 47 | H : "num bases x num_samples" matrix of coefficients 48 | beta : "num_bases x num_samples" matrix of basis vector coefficients 49 | (for constructing W s.t. W = beta * data.T ) 50 | ferr : frobenius norm (after calling .factorize()) 51 | 52 | Example 53 | ------- 54 | Applying AA to some rather stupid data set: 55 | 56 | >>> import numpy as np 57 | >>> from aa import AA 58 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 59 | 60 | Use 2 basis vectors -> W shape(data_dimension, 2). 61 | 62 | >>> aa_mdl = AA(data, num_bases=2) 63 | 64 | Set number of iterations to 5 and start computing the factorization. 
65 | 
66 | >>> aa_mdl.factorize(niter=5)
67 | 
68 | The basis vectors are now stored in aa_mdl.W, the coefficients in aa_mdl.H.
69 | To compute coefficients for an existing set of basis vectors simply copy W
70 | to aa_mdl.W, and set compute_w to False:
71 | 
72 | >>> data = np.array([[1.5], [1.2]])
73 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
74 | >>> aa_mdl = AA(data, num_bases=2)
75 | >>> aa_mdl.W = W
76 | >>> aa_mdl.factorize(niter=5, compute_w=False)
77 | 
78 | The result is a set of coefficients aa_mdl.H, s.t. data = W * aa_mdl.H.
79 | """
80 | # set cvxopt options
81 | solvers.options['show_progress'] = False
82 | 
83 | def init_h(self):
84 | self.H = np.random.random((self._num_bases, self._num_samples))
85 | self.H /= self.H.sum(axis=0)
86 | 
87 | def init_w(self):
88 | self.beta = np.random.random((self._num_bases, self._num_samples))
89 | self.beta /= self.beta.sum(axis=0)
90 | self.W = np.random.random((self._data_dimension, self._num_bases))
91 | 
92 | def update_h(self):
93 | """ alternating least squares step, update H under the convexity
94 | constraint """
95 | def update_single_h(i):
96 | """ compute single H[:,i] """
97 | # optimize alpha using qp solver from cvxopt
98 | FA = base.matrix(np.float64(np.dot(-self.W.T, self.data[:,i])))
99 | al = solvers.qp(HA, FA, INQa, INQb, EQa, EQb)
100 | self.H[:,i] = np.array(al['x']).reshape((1, self._num_bases))
101 | 
102 | EQb = base.matrix(1.0, (1,1))
103 | # float64 required for cvxopt
104 | HA = base.matrix(np.float64(np.dot(self.W.T, self.W)))
105 | INQa = base.matrix(-np.eye(self._num_bases))
106 | INQb = base.matrix(0.0, (self._num_bases,1))
107 | EQa = base.matrix(1.0, (1, self._num_bases))
108 | 
109 | for i in range(self._num_samples):
110 | update_single_h(i)
111 | 
112 | def update_w(self):
113 | """ alternating least squares step, update W under the convexity
114 | constraint """
115 | def update_single_w(i):
116 | """ compute single W[:,i] """
117 | # optimize beta using qp solver from cvxopt
118 | FB = base.matrix(np.float64(np.dot(-self.data.T, W_hat[:,i])))
119 | be = solvers.qp(HB, FB, INQa, INQb, EQa, EQb)
120 | self.beta[i,:] = np.array(be['x']).reshape((1, self._num_samples))
121 | 
122 | # float64 required for cvxopt
123 | HB = base.matrix(np.float64(np.dot(self.data[:,:].T, self.data[:,:])))
124 | EQb = base.matrix(1.0, (1, 1))
125 | W_hat = np.dot(self.data, pinv(self.H))
126 | INQa = base.matrix(-np.eye(self._num_samples))
127 | INQb = base.matrix(0.0, (self._num_samples, 1))
128 | EQa = base.matrix(1.0, (1, self._num_samples))
129 | 
130 | for i in range(self._num_bases):
131 | update_single_w(i)
132 | 
133 | self.W = np.dot(self.beta, self.data.T).T
134 | 
135 | if __name__ == "__main__":
136 | import doctest
137 | doctest.testmod()
138 | 
-------------------------------------------------------------------------------- /pymf/sivm_sgreedy.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python2.6
2 | #
3 | # Copyright (C) Christian Thurau, 2010.
4 | # Licensed under the GNU General Public License (GPL).
5 | # http://www.gnu.org/licenses/gpl.txt
6 | """
7 | PyMF Simplex Volume Maximization [1]
8 | 
9 | SIVM_SGREEDY: class for greedy-search SiVM
10 | 
11 | [1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume
12 | Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int.
13 | Conf. on Information and Knowledge Management. ACM. 2010.
14 | """ 15 | 16 | 17 | import numpy as np 18 | import time 19 | 20 | from .dist import * 21 | from .vol import * 22 | from .sivm_search import SIVM_SEARCH 23 | 24 | __all__ = ["SIVM_SGREEDY"] 25 | 26 | class SIVM_SGREEDY(SIVM_SEARCH): 27 | """ 28 | SIVM(data, num_bases=4, niter=100, show_progress=True, compW=True) 29 | 30 | 31 | Simplex Volume Maximization. Factorize a data matrix into two matrices s.t. 32 | F = | data - W*H | is minimal. H is restricted to convexity. W is iteratively 33 | found by maximizing the volume of the resulting simplex (see [1]). A solution 34 | is found by employing a simple greedy max-vol strategy. 35 | 36 | Parameters 37 | ---------- 38 | data : array_like 39 | the input data 40 | num_bases: int, optional 41 | Number of bases to compute (column rank of W and row rank of H). 42 | 4 (default) 43 | niter: int, optional 44 | Number of iterations of the alternating optimization. 45 | 100 (default) 46 | show_progress: bool, optional 47 | Print some extra information 48 | False (default) 49 | compW: bool, optional 50 | Compute W (True) or only H (False). Useful for using basis vectors 51 | from another convexity constrained matrix factorization function 52 | (e.g. svmnmf) (if set to "True" niter can be set to "1") 53 | compH: bool, optional 54 | Compute H (True) or only H (False). Useful for using precomputed 55 | basis vectors. 56 | dist_measure: string, optional 57 | The distance measure for finding the next best candidate that 58 | maximizes the simplex volume ['l2','l1','cosine','sparse_graph_l2'] 59 | 'l2' (default) 60 | optimize_lower_bound: bool, optional 61 | Use the alternative selection criterion that optimizes the lower 62 | bound (see [1]) 63 | False (default) 64 | 65 | Attributes 66 | ---------- 67 | W : "data_dimension x num_bases" matrix of basis vectors 68 | H : "num bases x num_samples" matrix of coefficients 69 | 70 | ferr : frobenius norm (after applying .factoriz()) 71 | 72 | Example 73 | ------- 74 | Applying SIVM to some rather stupid data set: 75 | 76 | >>> import numpy as np 77 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 78 | >>> sivm_mdl = SIVM_SGREEDY(data, num_bases=2, niter=10) 79 | >>> sivm_mdl.initialization() 80 | >>> sivm_mdl.factorize() 81 | 82 | The basis vectors are now stored in sivm_mdl.W, the coefficients in sivm_mdl.H. 83 | To compute coefficients for an existing set of basis vectors simply copy W 84 | to sivm_mdl.W, and set compW to False: 85 | 86 | >>> data = np.array([[1.5, 1.3], [1.2, 0.3]]) 87 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) 88 | >>> sivm_mdl = SIVM_SGREEDY(data, num_bases=2, niter=1, compW=False) 89 | >>> sivm_mdl.initialization() 90 | >>> sivm_mdl.W = W 91 | >>> sivm_mdl.factorize() 92 | 93 | The result is a set of coefficients sivm_mdl.H, s.t. data = W * sivm_mdl.H. 
94 | """ 95 | 96 | def update_w(self): 97 | # compute distance matrix -> requiresd for the volume 98 | self.init_sivm() 99 | next_sel = list([self.select[0]]) 100 | self.select = [] 101 | 102 | self._v = [] 103 | self._t = [] 104 | stime = time.time() 105 | 106 | for iter in range(self._num_bases-1): 107 | # add new selections to openset 108 | next_sel = list(np.sort(next_sel)) 109 | D = pdist(self.data[:, next_sel], self.data[:, next_sel]) 110 | V = np.zeros(self.data.shape[1]) 111 | d = np.zeros((D.shape[0]+1,D.shape[1]+1)) 112 | d[:D.shape[0], :D.shape[1]] = D[:,:] 113 | 114 | for i in range(self.data.shape[1]): 115 | # create a temp selection 116 | dtmp = l2_distance(self.data[:,next_sel], self.data[:,i:i+1]) 117 | d[:-1,-1] = dtmp 118 | d[-1,:-1] = dtmp 119 | # compute volume for temp selection 120 | V[i] = cmdet(d) 121 | 122 | next_index = np.argmax(V) 123 | next_sel.append(next_index) 124 | self._v.append(np.max(V)) 125 | 126 | self._logger.info('Iter:' + str(iter)) 127 | self._logger.info('Current selection:' + str(next_sel)) 128 | self._logger.info('Current volume:' + str(self._v[-1])) 129 | self._t.append(time.time() - stime) 130 | 131 | # update some values ... 132 | self.select = list(next_sel) 133 | self.W = self.data[:, self.select] 134 | 135 | 136 | 137 | if __name__ == "__main__": 138 | import doctest 139 | doctest.testmod() 140 | -------------------------------------------------------------------------------- /pymf/greedy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python2.6 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | #$Id$ 7 | """ 8 | PyMF GREEDY[1] 9 | 10 | GREEDY: class for a deterministic SVD based greedy matrix reconstruction [1]. 11 | 12 | 13 | [1] Ali Civril, Malik Magdon-Ismail. Deterministic Sparse Column Based Matrix 14 | Reconstruction via Greedy Approximation of SVD. ISAAC'2008. 15 | """ 16 | 17 | 18 | import time 19 | import scipy.sparse 20 | import numpy as np 21 | from .svd import * 22 | from .nmf import NMF 23 | 24 | __all__ = ["GREEDY"] 25 | 26 | class GREEDY(NMF): 27 | """ 28 | GREEDYVOL(data, num_bases=4, niter=100, show_progress=True, compW=True) 29 | 30 | 31 | Deterministic Sparse Column Based Matrix Reconstruction via Greedy 32 | Approximation of SVD. Factorize a data matrix into two matrices s.t. 33 | F = | data - W*H | is minimal. W is iteratively selected as columns 34 | of data. 35 | 36 | Parameters 37 | ---------- 38 | data : array_like, shape (_data_dimension, _num_samples) 39 | the input data 40 | num_bases: int, optional 41 | Number of bases to compute (column rank of W and row rank of H). 42 | 4 (default) 43 | k : number of singular vectors for the SVD step of the algorithm 44 | num_bases (default) 45 | 46 | Attributes 47 | ---------- 48 | W : "data_dimension x num_bases" matrix of basis vectors 49 | H : "num bases x num_samples" matrix of coefficients 50 | ferr : frobenius norm (after calling .factorize()) 51 | 52 | Example 53 | ------- 54 | Applying GREEDY to some rather stupid data set: 55 | 56 | >>> import numpy as np 57 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 58 | >>> greedy_mdl = GREEDY(data, num_bases=2, niter=10) 59 | >>> greedy_mdl.factorize() 60 | 61 | The basis vectors are now stored in greedy_mdl.W, the coefficients in 62 | greedy_mdl.H. 
To compute coefficients for an existing set of basis
63 | vectors simply copy W to greedy_mdl.W, and set compute_w to False:
64 | 
65 | >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
66 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
67 | >>> greedy_mdl = GREEDY(data, num_bases=2)
68 | >>> greedy_mdl.W = W
69 | >>> greedy_mdl.factorize(compute_w=False)
70 | 
71 | The result is a set of coefficients greedy_mdl.H, s.t. data = W * greedy_mdl.H.
72 | """
73 | 
74 | 
75 | def __init__(self, data, k=-1, num_bases=4):
76 | # call inherited method
77 | NMF.__init__(self, data, num_bases=num_bases)
78 | self._k = k
79 | if self._k == -1:
80 | self._k = num_bases
81 | 
82 | def update_h(self):
83 | if scipy.sparse.issparse(self.data):
84 | self.H = pinv(self.W) * self.data
85 | else:
86 | self.H = np.dot(pinv(self.W), self.data)
87 | 
88 | def update_w(self):
89 | def normalize_matrix(K):
90 | """ Normalize a matrix K s.t. columns have Euclidean-norm |1|
91 | """
92 | if scipy.sparse.issparse(K):
93 | L = np.sqrt(np.array(K.multiply(K).sum(axis=0)))[0,:]
94 | s = np.where(L > 0.0)[0]
95 | L[s] = L[s]**-1
96 | KN = scipy.sparse.spdiags(L,0,len(L),len(L),format='csc')
97 | K = K*KN
98 | else:
99 | L = np.sqrt((K**2).sum(axis=0))
100 | s = np.where(L > 0.0)[0]
101 | L[s] = L[s]**-1
102 | K = K*L
103 | return K
104 | 
105 | self._t = np.zeros((self._num_bases))
106 | t0 = time.time()
107 | self.select = []
108 | 
109 | # work on a copy of the data (it is normalized in the loop below)
110 | A = self.data.copy()
111 | 
112 | svd_mdl = SVD(A, k=self._k)
113 | svd_mdl.factorize()
114 | 
115 | if scipy.sparse.issparse(self.data):
116 | B = svd_mdl.U * svd_mdl.S
117 | B = B.tocsc()
118 | else:
119 | B = np.dot(svd_mdl.U, svd_mdl.S)
120 | B = B[:, :self._num_bases]
121 | 
122 | for i in range(self._num_bases):
123 | A = normalize_matrix(A)
124 | 
125 | if scipy.sparse.issparse(self.data):
126 | T = B.transpose() * A
127 | T = np.array(T.multiply(T).sum(axis=0))[0,:]
128 | 
129 | # next selected column index
130 | T[self.select] = 0.0
131 | idx = np.argmax(T)
132 | Aidx = A[:, idx].copy()
133 | self.select.append(idx)
134 | 
135 | # update B
136 | BC = Aidx.transpose() * B
137 | B = B - (Aidx*BC)
138 | 
139 | # update A
140 | AC = Aidx.transpose() * A
141 | A = A - (Aidx*AC)
142 | 
143 | else:
144 | T = np.dot(B.transpose(), A)
145 | T = np.sum(T**2.0, axis=0)
146 | 
147 | # next selected column index
148 | T[self.select] = 0.0
149 | idx = np.argmax(T)
150 | self.select.append(idx)
151 | 
152 | # update B
153 | BC = np.dot(B.transpose(),A[:,idx])
154 | B -= np.dot(A[:,idx].reshape(-1,1), BC.reshape(1,-1))
155 | 
156 | # and A
157 | AC = np.dot(A.transpose(),A[:,idx])
158 | A -= np.dot(A[:,idx].reshape(-1,1), AC.reshape(1,-1))
159 | 
160 | 
161 | # detect the next best data point
162 | self._logger.info('searching for next best column ...')
163 | self._logger.info('cur_columns: ' + str(self.select))
164 | self._t[i] = time.time() - t0
165 | 
166 | # sort indices, otherwise h5py won't work
167 | self.W = self.data[:, np.sort(self.select)]
168 | 
169 | # "unsort" it again to keep the correct order
170 | self.W = self.W[:, np.argsort(np.argsort(self.select))]
171 | 
172 | if __name__ == "__main__":
173 | import doctest
174 | doctest.testmod()
175 | 
-------------------------------------------------------------------------------- /pymf/sivm_search.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python2.6
2 | #
3 | # Copyright (C) Christian Thurau, 2010.
4 | # Licensed under the GNU General Public License (GPL).
5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF Simplex Volume Maximization [1] 8 | 9 | SIVM_SEARCH: class for search-SiVM 10 | 11 | [1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume 12 | Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int. 13 | Conf. on Information and Knowledge Management. ACM. 2010. 14 | """ 15 | 16 | 17 | import scipy.sparse 18 | import numpy as np 19 | from scipy import inf 20 | try: 21 | from scipy.misc.common import factorial 22 | except: 23 | from scipy.misc import factorial 24 | 25 | from .dist import * 26 | from .vol import * 27 | from .sivm import SIVM 28 | 29 | __all__ = ["SIVM_SEARCH"] 30 | 31 | class SIVM_SEARCH(SIVM): 32 | """ 33 | SIVM_SEARCH(data, num_bases=4, dist_measure='l2') 34 | 35 | 36 | Simplex Volume Maximization. Factorize a data matrix into two matrices s.t. 37 | F = | data - W*H | is minimal. H is restricted to convexity. W is iteratively 38 | found by maximizing the volume of the resulting simplex (see [1]). A solution 39 | is found by employing a simple A-star like search strategy. 40 | 41 | Parameters 42 | ---------- 43 | data : array_like, shape (_data_dimension, _num_samples) 44 | the input data 45 | num_bases: int, optional 46 | Number of bases to compute (column rank of W and row rank of H). 47 | 4 (default) 48 | dist_measure : one of 'l2' ,'cosine', 'l1', 'kl' 49 | Standard is 'l2' which maximizes the volume of the simplex. In contrast, 50 | 'cosine' maximizes the volume of a cone (see [1] for details). 51 | init : string (default: 'fastmap') 52 | 'fastmap' or 'origin'. Sets the method used for finding the very first 53 | basis vector. 'Origin' assumes the zero vector, 'Fastmap' picks one of 54 | the two vectors that have the largest pairwise distance. 55 | Attributes 56 | ---------- 57 | W : "data_dimension x num_bases" matrix of basis vectors 58 | H : "num bases x num_samples" matrix of coefficients 59 | ferr : frobenius norm (after calling .factorize()) 60 | 61 | Example 62 | ------- 63 | Applying SIVM to some rather stupid data set: 64 | 65 | >>> import numpy as np 66 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 67 | >>> sivm_mdl = SIVM_SEARCH(data, num_bases=2) 68 | >>> sivm_mdl.factorize() 69 | 70 | The basis vectors are now stored in sivm_mdl.W, the coefficients in sivm_mdl.H. 71 | To compute coefficients for an existing set of basis vectors simply copy W 72 | to sivm_mdl.W, and set compute_w to False: 73 | 74 | >>> data = np.array([[1.5, 1.3], [1.2, 0.3]]) 75 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) 76 | >>> sivm_mdl = SIVM_SEARCH(data, num_bases=2) 77 | >>> sivm_mdl.W = W 78 | >>> sivm_mdl.factorize(compute_w=False) 79 | 80 | The result is a set of coefficients sivm_mdl.H, s.t. data = W * sivm_mdl.H. 
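The volume of the best selection found at each search step is stored in
sivm_mdl._v.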
81 | """ 82 | 83 | def update_w(self): 84 | def h(sel,D,k): 85 | # compute the volume for a selection of sel columns 86 | # and a k-1 simplex (-> k columns have to be selected) 87 | mv = np.max(D) 88 | 89 | # fill the remaining distance by the maximal overall found distance 90 | d = np.zeros((k,k)) + mv 91 | for i in range(k): 92 | d[i,i] = 0.0 93 | 94 | for idx_i,i in enumerate(sel): 95 | for idx_j,j in enumerate(sel): 96 | d[idx_i,idx_j] = D[i,j] 97 | 98 | return d 99 | 100 | # compute distance matrix -> required for the volume 101 | D = pdist(self.data, self.data) 102 | Openset = {} 103 | 104 | for i in range(self._num_samples): 105 | # compute volume for temp selection 106 | d = h([i],D,self._num_bases) 107 | Vtmp = cmdet(d) 108 | Openset[tuple([i])] = Vtmp 109 | 110 | Closedset = {} 111 | finished = False 112 | self._v = [] 113 | self.init_sivm() 114 | next_sel = np.array([self.select[0]]) 115 | iter = 0 116 | 117 | while not finished: 118 | # add the current selection to closedset 119 | Closedset[(tuple(next_sel))] = [] 120 | 121 | for i in range(D.shape[0]): 122 | # create a temp selection 123 | tmp_sel = np.array(next_sel).flatten() 124 | tmp_sel = np.concatenate((tmp_sel, [i]),axis=0) 125 | tmp_sel = np.unique(tmp_sel) 126 | tmp_sel = list(tmp_sel) 127 | hkey = tuple(tmp_sel) 128 | 129 | if len(tmp_sel) > len(next_sel) and ( 130 | not Closedset.has_key(hkey)) and ( 131 | not Openset.has_key(hkey)): 132 | 133 | # compute volume for temp selection 134 | d = h(tmp_sel, D, self._num_bases) 135 | Vtmp = cmdet(d) 136 | 137 | # add to openset 138 | Openset[hkey] = Vtmp 139 | 140 | # get next best tuple 141 | vmax = 0.0 142 | for (k,v) in Openset.iteritems(): 143 | if v > vmax: 144 | next_sel = k 145 | vmax = v 146 | 147 | self._logger.info('Iter:' + str(iter)) 148 | self._logger.info('Current selection:' + str(next_sel)) 149 | self._logger.info('Current volume:' + str(vmax)) 150 | self._v.append(vmax) 151 | 152 | # remove next_sel from openset 153 | Openset.pop(next_sel) 154 | 155 | if len(list(next_sel)) == self._num_bases: 156 | finished = True 157 | iter += 1 158 | 159 | # update some values ... 160 | self.select = list(next_sel) 161 | self.W = self.data[:, self.select] 162 | 163 | if __name__ == "__main__": 164 | import doctest 165 | doctest.testmod() 166 | -------------------------------------------------------------------------------- /pymf/cnmf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF Convex Matrix Factorization [1] 8 | 9 | CNMF(NMF) : Class for convex matrix factorization 10 | 11 | [1] Ding, C., Li, T. and Jordan, M.. Convex and Semi-Nonnegative Matrix Factorizations. 12 | IEEE Trans. on Pattern Analysis and Machine Intelligence 32(1), 45-55. 13 | """ 14 | 15 | 16 | import numpy as np 17 | import logging 18 | from .nmf import NMF 19 | from .kmeans import Kmeans 20 | 21 | 22 | __all__ = ["CNMF"] 23 | 24 | class CNMF(NMF): 25 | """ 26 | CNMF(data, num_bases=4) 27 | 28 | 29 | Convex NMF. Factorize a data matrix into two matrices s.t. 30 | F = | data - W*H | = | data - data*beta*H| is minimal. H and beta 31 | are restricted to convexity (beta >=0, sum(beta, axis=1) = [1 .. 1]). 
32 | 
33 | Parameters
34 | ----------
35 | data : array_like, shape (_data_dimension, _num_samples)
36 | the input data
37 | num_bases: int, optional
38 | Number of bases to compute (column rank of W and row rank of H).
39 | 4 (default)
40 | 
41 | Attributes
42 | ----------
43 | W : "data_dimension x num_bases" matrix of basis vectors
44 | H : "num bases x num_samples" matrix of coefficients
45 | ferr : frobenius norm (after calling .factorize())
46 | 
47 | Example
48 | -------
49 | Applying CNMF to some rather stupid data set:
50 | 
51 | >>> import numpy as np
52 | >>> from cnmf import CNMF
53 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
54 | >>> cnmf_mdl = CNMF(data, num_bases=2)
55 | >>> cnmf_mdl.factorize(niter=10)
56 | 
57 | The basis vectors are now stored in cnmf_mdl.W, the coefficients in cnmf_mdl.H.
58 | To compute coefficients for an existing set of basis vectors simply copy W
59 | to cnmf_mdl.W, and set compute_w to False:
60 | 
61 | >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
62 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
63 | >>> cnmf_mdl = CNMF(data, num_bases=2)
64 | >>> cnmf_mdl.W = W
65 | >>> cnmf_mdl.factorize(compute_w=False, niter=1)
66 | 
67 | The result is a set of coefficients cnmf_mdl.H, s.t. data = W * cnmf_mdl.H.
68 | """
69 | 
70 | # see .factorize() for the update of W and H
71 | # -> proper decoupling of W/H not possible ...
72 | def update_w(self):
73 | pass
74 | 
75 | def update_h(self):
76 | pass
77 | 
78 | def init_h(self):
79 | if not hasattr(self, 'H'):
80 | # init basic matrices
81 | self.H = np.zeros((self._num_bases, self._num_samples))
82 | 
83 | # initialize using k-means
84 | km = Kmeans(self.data[:,:], num_bases=self._num_bases)
85 | km.factorize(niter=10)
86 | assign = km.assigned
87 | 
88 | num_i = np.zeros(self._num_bases)
89 | for i in range(self._num_bases):
90 | num_i[i] = len(np.where(assign == i)[0])
91 | 
92 | self.H.T[range(len(assign)), assign] = 1.0
93 | self.H += 0.2*np.ones((self._num_bases, self._num_samples))
94 | 
95 | if not hasattr(self, 'G'):
96 | self.G = np.zeros((self._num_samples, self._num_bases))
97 | 
98 | self.G[range(len(assign)), assign] = 1.0
99 | self.G += 0.01
100 | self.G /= np.tile(np.reshape(num_i[assign],(-1,1)), self.G.shape[1])
101 | 
102 | if not hasattr(self,'W'):
103 | self.W = np.dot(self.data[:,:], self.G)
104 | 
105 | def init_w(self):
106 | pass
107 | 
108 | def factorize(self, niter=10, compute_w=True, compute_h=True,
109 | compute_err=True, show_progress=False):
110 | """ Factorize s.t. WH = data
111 | 
112 | Parameters
113 | ----------
114 | niter : int
115 | number of iterations.
116 | show_progress : bool
117 | print some extra information to stdout.
118 | compute_h : bool
119 | iteratively update values for H.
120 | compute_w : bool
121 | iteratively update values for W.
122 | compute_err : bool
123 | compute Frobenius norm |data-WH| after each update and store
124 | it to .ferr[k].
125 | 
126 | Updated Values
127 | --------------
128 | .W : updated values for W.
129 | .H : updated values for H.
130 | .ferr : Frobenius norm |data-WH| for each iteration.
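.G : updated values for the convex combination matrix G (W = data*G).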
131 | """ 132 | 133 | if not hasattr(self,'W'): 134 | self.init_w() 135 | 136 | if not hasattr(self,'H'): 137 | self.init_h() 138 | 139 | def separate_positive(m): 140 | return (np.abs(m) + m)/2.0 141 | 142 | def separate_negative(m): 143 | return (np.abs(m) - m)/2.0 144 | 145 | if show_progress: 146 | self._logger.setLevel(logging.INFO) 147 | else: 148 | self._logger.setLevel(logging.ERROR) 149 | 150 | XtX = np.dot(self.data[:,:].T, self.data[:,:]) 151 | XtX_pos = separate_positive(XtX) 152 | XtX_neg = separate_negative(XtX) 153 | 154 | self.ferr = np.zeros(niter) 155 | # iterate over W and H 156 | 157 | for i in xrange(niter): 158 | # update H 159 | XtX_neg_x_W = np.dot(XtX_neg, self.G) 160 | XtX_pos_x_W = np.dot(XtX_pos, self.G) 161 | 162 | if compute_h: 163 | H_x_WT = np.dot(self.H.T, self.G.T) 164 | ha = XtX_pos_x_W + np.dot(H_x_WT, XtX_neg_x_W) 165 | hb = XtX_neg_x_W + np.dot(H_x_WT, XtX_pos_x_W) + 10**-9 166 | self.H = (self.H.T*np.sqrt(ha/hb)).T 167 | 168 | # update W 169 | if compute_w: 170 | HT_x_H = np.dot(self.H, self.H.T) 171 | wa = np.dot(XtX_pos, self.H.T) + np.dot(XtX_neg_x_W, HT_x_H) 172 | wb = np.dot(XtX_neg, self.H.T) + np.dot(XtX_pos_x_W, HT_x_H) + 10**-9 173 | 174 | self.G *= np.sqrt(wa/wb) 175 | self.W = np.dot(self.data[:,:], self.G) 176 | 177 | if compute_err: 178 | self.ferr[i] = self.frobenius_norm() 179 | self._logger.info('Iteration ' + str(i+1) + '/' + str(niter) + 180 | ' FN:' + str(self.ferr[i])) 181 | else: 182 | self._logger.info('Iteration ' + str(i+1) + '/' + str(niter)) 183 | 184 | if i > 1 and compute_err: 185 | if self.converged(i): 186 | self.ferr = self.ferr[:i] 187 | break 188 | 189 | if __name__ == "__main__": 190 | import doctest 191 | doctest.testmod() 192 | -------------------------------------------------------------------------------- /pymf/sivm_gsat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python2.6 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF Simplex Volume Maximization [1] 8 | 9 | SIVM_GSAT: class for gsat-SiVM 10 | 11 | [1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume 12 | Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int. 13 | Conf. on Information and Knowledge Management. ACM. 2010. 14 | """ 15 | 16 | 17 | import logging 18 | import numpy as np 19 | from .dist import * 20 | from .vol import cmdet 21 | from .sivm import SIVM 22 | 23 | __all__ = ["SIVM_GSAT"] 24 | 25 | class SIVM_GSAT(SIVM): 26 | """ 27 | SIVM(data, num_bases=4, dist_measure='l2') 28 | 29 | 30 | Simplex Volume Maximization. Factorize a data matrix into two matrices s.t. 31 | F = | data - W*H | is minimal. H is restricted to convexity. W is iteratively 32 | found by maximizing the volume of the resulting simplex (see [1]). Can be 33 | applied to data streams using the .online_update_w(vec) function which decides 34 | on adding data sample "vec" to the already selected basis vectors. 35 | 36 | Parameters 37 | ---------- 38 | data : array_like, shape (_data_dimension, _num_samples) 39 | the input data 40 | num_bases: int, optional 41 | Number of bases to compute (column rank of W and row rank of H). 42 | 4 (default) 43 | dist_measure : one of 'l2' ,'cosine', 'l1', 'kl' 44 | Standard is 'l2' which maximizes the volume of the simplex. In contrast, 45 | 'cosine' maximizes the volume of a cone (see [1] for details). 
46 | init : string (default: 'fastmap')
47 | 'fastmap' or 'origin'. Sets the method used for finding the very first
48 | basis vector. 'Origin' assumes the zero vector, 'Fastmap' picks one of
49 | the two vectors that have the largest pairwise distance.
50 | Attributes
51 | ----------
52 | W : "data_dimension x num_bases" matrix of basis vectors
53 | H : "num bases x num_samples" matrix of coefficients
54 | ferr : frobenius norm (after calling .factorize())
55 | 
56 | Example
57 | -------
58 | Applying SIVM_GSAT to some rather stupid data set:
59 | 
60 | >>> import numpy as np
61 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
62 | >>> sivm_mdl = SIVM_GSAT(data, num_bases=2)
63 | >>> sivm_mdl.factorize()
64 | 
65 | The basis vectors are now stored in sivm_mdl.W, the coefficients in sivm_mdl.H.
66 | To compute coefficients for an existing set of basis vectors simply copy W
67 | to sivm_mdl.W, and set compute_w to False:
68 | 
69 | >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
70 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
71 | >>> sivm_mdl = SIVM_GSAT(data, num_bases=2)
72 | >>> sivm_mdl.W = W
73 | >>> sivm_mdl.factorize(compute_w=False)
74 | 
75 | The result is a set of coefficients sivm_mdl.H, s.t. data = W * sivm_mdl.H.
76 | """
77 | 
78 | def init_w(self):
79 | self.select = list(range(self._num_bases))
80 | self.W = self.data[:, self.select]
81 | 
82 | def online_update_w(self, vec):
83 | # update D if it does not exist
84 | k = self._num_bases
85 | if not hasattr(self, 'D'):
86 | self.D = np.zeros((k + 1, k + 1))
87 | self.D[:k, :k] = pdist(self.W, self.W)
88 | self.V = cmdet(self.D[:k, :k])
89 | 
90 | tmp_d = self._distfunc(self.W, vec.reshape((-1,1)))
91 | self.D[k, :-1] = tmp_d
92 | self.D[:-1, k] = tmp_d
93 | 
94 | v = np.zeros((self._num_bases + 1))
95 | 
96 | for i in range(self._num_bases):
97 | # compute volume for each combination...
98 | s = np.setdiff1d(range(self._num_bases + 1), [i])
99 | v[i] = cmdet((self.D[s,:])[:,s])
100 | 
101 | # select index that maximizes the volume
102 | v[-1] = self.V
103 | s = np.argmax(v)
104 | 
105 | if s < self._num_bases:
106 | self.W[:,s] = vec
107 | self.D[:self._num_bases, :self._num_bases] = pdist(self.W, self.W)
108 | 
109 | if not hasattr(self, '_v'):
110 | self._v = [self.V]
111 | self.V = v[s]
112 | self._v.append(v[s])
113 | 
114 | self._logger.info('Volume increased:' + str(self.V))
115 | return True, s
116 | 
117 | return False,-1
118 | 
119 | def update_w(self):
120 | n = int(np.floor(np.random.random() * self._num_samples))
121 | if n not in self.select:
122 | updated, s = self.online_update_w(self.data[:,n])
123 | if updated:
124 | self.select[s] = n
125 | self._logger.info('Current selection:' + str(self.select))
126 | 
127 | 
128 | def factorize(self, show_progress=False, compute_w=True, compute_h=True,
129 | compute_err=True, niter=1):
130 | """ Factorize s.t. WH = data
131 | 
132 | Parameters
133 | ----------
134 | show_progress : bool
135 | print some extra information to stdout.
136 | niter : int
137 | number of iterations.
138 | compute_h : bool
139 | iteratively update values for H.
140 | compute_w : bool
141 | iteratively update values for W.
142 | compute_err : bool
143 | compute Frobenius norm |data-WH| after each update and store
144 | it to .ferr[k].
145 | 
146 | Updated Values
147 | --------------
148 | .W : updated values for W.
149 | .H : updated values for H.
150 | .ferr : Frobenius norm |data-WH|.
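Each iteration draws a single random data sample and swaps it into W
whenever the swap increases the simplex volume (see update_w).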
151 | """ 152 | if show_progress: 153 | self._logger.setLevel(logging.INFO) 154 | else: 155 | self._logger.setLevel(logging.ERROR) 156 | 157 | # create W and H if they don't already exist 158 | # -> any custom initialization to W,H should be done before 159 | if not hasattr(self,'W'): 160 | self.init_w() 161 | 162 | if not hasattr(self,'H'): 163 | self.init_h() 164 | 165 | if compute_err: 166 | self.ferr = np.zeros(niter) 167 | 168 | for i in xrange(niter): 169 | if compute_w: 170 | self.update_w() 171 | 172 | if compute_h: 173 | self.update_h() 174 | 175 | if compute_err: 176 | self.ferr[i] = self.frobenius_norm() 177 | self._logger.info('Iteration ' + str(i+1) + '/' + str(niter) + 178 | ' FN:' + str(self.ferr[i])) 179 | else: 180 | self._logger.info('Iteration ' + str(i+1) + '/' + str(niter)) 181 | 182 | 183 | if __name__ == "__main__": 184 | import doctest 185 | doctest.testmod() 186 | -------------------------------------------------------------------------------- /pymf/nmf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF Non-negative Matrix Factorization. 8 | 9 | NMF: Class for Non-negative Matrix Factorization 10 | 11 | [1] Lee, D. D. and Seung, H. S. (1999), Learning the Parts of Objects by Non-negative 12 | Matrix Factorization, Nature 401(6755), 788-799. 13 | """ 14 | 15 | 16 | import numpy as np 17 | import logging 18 | import logging.config 19 | import scipy.sparse 20 | 21 | __all__ = ["NMF"] 22 | 23 | class NMF(): 24 | """ 25 | NMF(data, num_bases=4) 26 | 27 | 28 | Non-negative Matrix Factorization. Factorize a data matrix into two matrices 29 | s.t. F = | data - W*H | = | is minimal. H, and W are restricted to non-negative 30 | data. Uses the classicial multiplicative update rule. 31 | 32 | Parameters 33 | ---------- 34 | data : array_like, shape (_data_dimension, _num_samples) 35 | the input data 36 | num_bases: int, optional 37 | Number of bases to compute (column rank of W and row rank of H). 38 | 4 (default) 39 | 40 | Attributes 41 | ---------- 42 | W : "data_dimension x num_bases" matrix of basis vectors 43 | H : "num bases x num_samples" matrix of coefficients 44 | ferr : frobenius norm (after calling .factorize()) 45 | 46 | Example 47 | ------- 48 | Applying NMF to some rather stupid data set: 49 | 50 | >>> import numpy as np 51 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 52 | >>> nmf_mdl = NMF(data, num_bases=2, niter=10) 53 | >>> nmf_mdl.factorize() 54 | 55 | The basis vectors are now stored in nmf_mdl.W, the coefficients in nmf_mdl.H. 56 | To compute coefficients for an existing set of basis vectors simply copy W 57 | to nmf_mdl.W, and set compute_w to False: 58 | 59 | >>> data = np.array([[1.5], [1.2]]) 60 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) 61 | >>> nmf_mdl = NMF(data, num_bases=2) 62 | >>> nmf_mdl.W = W 63 | >>> nmf_mdl.factorize(niter=20, compute_w=False) 64 | 65 | The result is a set of coefficients nmf_mdl.H, s.t. data = W * nmf_mdl.H. 
66 | """ 67 | 68 | # some small value 69 | _EPS = 10**-8 70 | 71 | def __init__(self, data, num_bases=4): 72 | 73 | def setup_logging(): 74 | # create logger 75 | self._logger = logging.getLogger("pymf") 76 | 77 | # add ch to logger 78 | if len(self._logger.handlers) < 1: 79 | # create console handler and set level to debug 80 | ch = logging.StreamHandler() 81 | ch.setLevel(logging.DEBUG) 82 | # create formatter 83 | formatter = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s") 84 | 85 | # add formatter to ch 86 | ch.setFormatter(formatter) 87 | 88 | self._logger.addHandler(ch) 89 | 90 | setup_logging() 91 | 92 | # set variables 93 | self.data = data 94 | self._num_bases = num_bases 95 | 96 | # initialize H and W to random values 97 | (self._data_dimension, self._num_samples) = self.data.shape 98 | 99 | 100 | def frobenius_norm(self): 101 | """ Frobenius norm (||data - WH||) of a data matrix and a low rank 102 | approximation given by WH 103 | 104 | Returns: 105 | frobenius norm: F = ||data - WH|| 106 | """ 107 | 108 | # check if W and H exist 109 | if hasattr(self,'H') and hasattr(self,'W') and not scipy.sparse.issparse(self.data): 110 | err = np.sqrt( np.sum((self.data[:,:] - np.dot(self.W, self.H))**2 )) 111 | else: 112 | err = -123456 113 | 114 | return err 115 | 116 | def init_w(self): 117 | self.W = np.random.random((self._data_dimension, self._num_bases)) 118 | 119 | def init_h(self): 120 | self.H = np.random.random((self._num_bases, self._num_samples)) 121 | 122 | def update_h(self): 123 | # pre init H1, and H2 (necessary for storing matrices on disk) 124 | H2 = np.dot(np.dot(self.W.T, self.W), self.H) + 10**-9 125 | self.H *= np.dot(self.W.T, self.data[:,:]) 126 | self.H /= H2 127 | 128 | def update_w(self): 129 | # pre init W1, and W2 (necessary for storing matrices on disk) 130 | W2 = np.dot(np.dot(self.W, self.H), self.H.T) + 10**-9 131 | self.W *= np.dot(self.data[:,:], self.H.T) 132 | self.W /= W2 133 | 134 | def converged(self, i): 135 | derr = np.abs(self.ferr[i] - self.ferr[i-1])/self._num_samples 136 | if derr < self._EPS: 137 | return True 138 | else: 139 | return False 140 | 141 | def factorize(self, niter=1, show_progress=False, 142 | compute_w=True, compute_h=True, compute_err=True): 143 | """ Factorize s.t. WH = data 144 | 145 | Parameters 146 | ---------- 147 | niter : int 148 | number of iterations. 149 | show_progress : bool 150 | print some extra information to stdout. 151 | compute_h : bool 152 | iteratively update values for H. 153 | compute_w : bool 154 | iteratively update values for W. 155 | compute_err : bool 156 | compute Frobenius norm |data-WH| after each update and store 157 | it to .ferr[k]. 158 | 159 | Updated Values 160 | -------------- 161 | .W : updated values for W. 162 | .H : updated values for H. 163 | .ferr : Frobenius norm |data-WH| for each iteration. 
164 | """ 165 | 166 | if show_progress: 167 | self._logger.setLevel(logging.INFO) 168 | else: 169 | self._logger.setLevel(logging.ERROR) 170 | 171 | # create W and H if they don't already exist 172 | # -> any custom initialization to W,H should be done before 173 | if not hasattr(self,'W'): 174 | self.init_w() 175 | 176 | if not hasattr(self,'H'): 177 | self.init_h() 178 | 179 | if compute_err: 180 | self.ferr = np.zeros(niter) 181 | 182 | for i in xrange(niter): 183 | if compute_w: 184 | self.update_w() 185 | 186 | if compute_h: 187 | self.update_h() 188 | 189 | if compute_err: 190 | self.ferr[i] = self.frobenius_norm() 191 | self._logger.info('Iteration ' + str(i+1) + '/' + str(niter) + 192 | ' FN:' + str(self.ferr[i])) 193 | else: 194 | self._logger.info('Iteration ' + str(i+1) + '/' + str(niter)) 195 | 196 | 197 | # check if the err is not changing anymore 198 | if i > 1 and compute_err: 199 | if self.converged(i): 200 | # adjust the error measure 201 | self.ferr = self.ferr[:i] 202 | break 203 | 204 | if __name__ == "__main__": 205 | import doctest 206 | doctest.testmod() 207 | -------------------------------------------------------------------------------- /pymf/sub.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF Matrix sampling methods 8 | 9 | SUB: apply one of the matrix factorization methods of PyMF 10 | on sampled data for computing W, then compute H. 11 | 12 | Copyright (C) Christian Thurau, 2010. GNU General Public License (GPL). 13 | """ 14 | 15 | 16 | 17 | import numpy as np 18 | import random 19 | #from itertools import combinations 20 | from .chnmf import combinations 21 | 22 | from . import dist 23 | from .chnmf import quickhull 24 | from .nmf import NMF 25 | from .pca import PCA 26 | from .kmeans import Kmeans 27 | from .laesa import LAESA 28 | from .sivm import SIVM 29 | 30 | __all__ = ["SUB"] 31 | 32 | class SUB(NMF): 33 | """ 34 | SUB(data, mfmethod, sstrategy='rand', nsub=20, show_progress=True, mapW=False, 35 | base_sel=2, num_bases=3 , niterH=1, niter=100, compute_h=True, compute_w=True, ) 36 | 37 | Evaluate a matrix factorization method "mfmethod" for a certain sampling 38 | strategy "sstrategy". This is particular useful for very large datasets. 39 | 40 | Parameters 41 | ---------- 42 | todo ... 43 | 44 | Attributes 45 | ---------- 46 | todo .... 
47 | """ 48 | 49 | def __init__(self, data, mfmethod, nsub=20, show_progress=True, mapW=False, base_sel=2, 50 | num_bases=3 , niterH=1, compute_h=True, compute_w=True, sstrategy='rand'): 51 | NMF.__init__(self, data, num_bases=num_bases, compute_h=compute_h, show_progress=show_progress, compute_w=compute_w) 52 | 53 | self._niterH = niterH 54 | self._nsub = nsub 55 | self.data = data 56 | self._mfmethod = mfmethod 57 | self._mapW = mapW 58 | self._sstrategy = sstrategy 59 | self._base_sel = base_sel 60 | 61 | # assign the correct distance function 62 | if self._sstrategy == 'cur': 63 | self._subfunc = self.curselect 64 | 65 | elif self._sstrategy == 'kmeans': 66 | self._subfunc = self.kmeansselect 67 | 68 | elif self._sstrategy == 'hull': 69 | self._subfunc = self.hullselect 70 | 71 | elif self._sstrategy == 'laesa': 72 | self._subfunc = self.laesaselect 73 | 74 | elif self._sstrategy == 'sivm': 75 | self._subfunc = self.sivmselect 76 | 77 | else: 78 | self._subfunc = self.randselect 79 | 80 | def hullselect(self): 81 | 82 | def selectHullPoints(data, n=20): 83 | """ select data points for pairwise projections of the first n 84 | dimensions """ 85 | 86 | # iterate over all projections and select data points 87 | idx = np.array([]) 88 | 89 | # iterate over some pairwise combinations of dimensions 90 | for i in combinations(range(n), 2): 91 | 92 | # sample convex hull points in 2D projection 93 | convex_hull_d = quickhull(data[i, :].T) 94 | 95 | # get indices for convex hull data points 96 | idx = np.append(idx, dist.vq(data[i, :], convex_hull_d.T)) 97 | idx = np.unique(idx) 98 | 99 | return np.int32(idx) 100 | 101 | 102 | # determine convex hull data points only if the total 103 | # amount of available data is >50 104 | #if self.data.shape[1] > 50: 105 | pcamodel = PCA(self.data, show_progress=self._show_progress) 106 | pcamodel.factorize() 107 | 108 | idx = selectHullPoints(pcamodel.H, n=self._base_sel) 109 | 110 | # set the number of subsampled data 111 | self.nsub = len(idx) 112 | 113 | return idx 114 | 115 | def kmeansselect(self): 116 | kmeans_mdl = Kmeans(self.data, num_bases=self._nsub) 117 | kmeans_mdl.initialization() 118 | kmeans_mdl.factorize() 119 | 120 | # pick data samples closest to the centres 121 | idx = dist.vq(kmeans_mdl.data, kmeans_mdl.W) 122 | return idx 123 | 124 | def curselect(self): 125 | def sample_probability(): 126 | dsquare = self.data[:,:]**2 127 | 128 | pcol = np.array(dsquare.sum(axis=0)) 129 | pcol /= pcol.sum() 130 | 131 | return (pcol.reshape(-1,1)) 132 | 133 | probs = sample_probability() 134 | prob_cols = np.cumsum(probs.flatten()) #.flatten() 135 | temp_ind = np.zeros(self._nsub, np.int32) 136 | 137 | for i in range(self._nsub): 138 | tempI = np.where(prob_cols >= np.random.rand())[0] 139 | temp_ind[i] = tempI[0] 140 | 141 | return np.sort(temp_ind) 142 | 143 | def sivmselect(self): 144 | sivmmdl = SIVM(self.data, num_bases=self._nsub, compute_w=True, compute_h=False, dist_measure='cosine') 145 | 146 | sivmmdl.initialization() 147 | sivmmdl.factorize() 148 | idx = sivmmdl.select 149 | return idx 150 | 151 | def laesaselect(self): 152 | laesamdl = LAESA(self.data, num_bases=self._nsub, compute_w=True, compute_h=False, dist_measure='cosine') 153 | laesamdl.initialization() 154 | laesamdl.factorize() 155 | idx = laesamdl.select 156 | return idx 157 | 158 | 159 | def randselect(self): 160 | idx = random.sample(xrange(self._num_samples), self._nsub) 161 | return np.sort(np.int32(idx)) 162 | 163 | def update_w(self): 164 | 165 | idx = self._subfunc() 166 | 
169 | idx = np.sort(np.int32(idx))
170 | 
171 | # factorize the sampled columns to obtain the basis vectors W
172 | mdl_small = self._mfmethod(self.data[:, idx],
173 | num_bases=self._num_bases)
174 | mdl_small.factorize(show_progress=self._show_progress)
175 | 
176 | # the full model only needs to compute H for the given W
177 | self.mdl = self._mfmethod(self.data[:, :],
178 | num_bases=self._num_bases)
179 | 
180 | if self._mapW:
181 | # compute pairwise distances
182 | #distance = vq(self.data, self.W)
183 | _Wmapped_index = dist.vq(self.mdl.data, mdl_small.W)
184 | 
185 | # do not directly assign, i.e. Wdist = self.data[:,sel]
186 | # as self might be unsorted (in non ascending order)
187 | # -> sorting sel would screw the matching to W if
188 | # self.data is stored as a hdf5 table (see h5py)
189 | for i,s in enumerate(_Wmapped_index):
190 | self.mdl.W[:,i] = self.mdl.data[:,s]
191 | else:
192 | self.mdl.W = np.copy(mdl_small.W)
193 | 
194 | def update_h(self):
195 | self.mdl.factorize(niter=self._niterH, compute_w=False,
196 | show_progress=self._show_progress)
197 | 
198 | def factorize(self):
199 | """Do factorization s.t. data = dot(dot(data,beta),H), under the convexity constraint
200 | beta >= 0, sum(beta) = 1, H >= 0, sum(H) = 1
201 | """
202 | # compute new coefficients for reconstructing data points
203 | self.update_w()
204 | 
205 | # for CHNMF it is sometimes useful to only compute
206 | # the basis vectors
207 | if self._compute_h:
208 | self.update_h()
209 | 
210 | self.W = self.mdl.W
211 | self.H = self.mdl.H
212 | 
213 | self.ferr = np.zeros(1)
214 | self.ferr[0] = self.mdl.frobenius_norm()
215 | self._logger.info('FN:' + str(self.ferr[0]))
216 | 
217 | if __name__ == "__main__":
218 | import doctest
219 | doctest.testmod()
220 | 
-------------------------------------------------------------------------------- /pymf/gmap.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright (C) Christian Thurau, 2010.
4 | # Licensed under the GNU General Public License (GPL).
5 | # http://www.gnu.org/licenses/gpl.txt
6 | """
7 | PyMF Geometric-Map
8 | 
9 | GMAP: Class for Geometric-Map
10 | """
11 | 
12 | 
13 | import scipy.sparse
14 | import numpy as np
15 | 
16 | from .dist import *
17 | from .aa import AA
18 | from .kmeans import Kmeans
19 | 
20 | __all__ = ["GMAP"]
21 | 
22 | class GMAP(AA):
23 | """
24 | GMAP(data, num_bases=4, method='pca', robust_map=True)
25 | 
26 | 
27 | Geometric-Map. Factorize a data matrix into two matrices s.t.
28 | F = | data - W*H | is minimal. G-MAP can emulate/approximate several
29 | standard methods including PCA, NMF, and AA.
30 | 
31 | Parameters
32 | ----------
33 | data : array_like, shape (_data_dimension, _num_samples)
34 | the input data
35 | num_bases: int, optional
36 | Number of bases to compute (column rank of W and row rank of H).
37 | 4 (default)
38 | method : one of 'pca' ,'nmf', 'aa', default is 'pca' which emulates
39 | Principal Component Analysis using the geometric map method ('nmf'
40 | emulates Non-negative Matrix Factorization, 'aa' emulates Archetypal
41 | Analysis).
42 | robust_map : bool, optional
43 | use robust_map or the standard max-val selection
44 | [see "On FastMap and the Convex Hull of Multivariate Data: Toward
45 | Fast and Robust Dimension Reduction", Ostrouchov and Samatova, PAMI
46 | 2005]
47 | Attributes
48 | ----------
49 | W : "data_dimension x num_bases" matrix of basis vectors
50 | H : "num bases x num_samples" matrix of coefficients
51 | ferr : frobenius norm (after calling .factorize())
52 | 
53 | Example
54 | -------
55 | Applying GMAP to some rather stupid data set:
56 | 
57 | >>> import numpy as np
58 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
59 | >>> gmap_mdl = GMAP(data, num_bases=2)
60 | >>> gmap_mdl.factorize()
61 | 
62 | The basis vectors are now stored in gmap_mdl.W, the coefficients in gmap_mdl.H.
63 | To compute coefficients for an existing set of basis vectors simply copy W
64 | to gmap_mdl.W, and set compute_w to False:
65 | 
66 | >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
67 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
68 | >>> gmap_mdl = GMAP(data, num_bases=2)
69 | >>> gmap_mdl.W = W
70 | >>> gmap_mdl.factorize(compute_w=False)
71 | 
72 | The result is a set of coefficients gmap_mdl.H, s.t. data = W * gmap_mdl.H.
73 | """
74 | 
75 | # always overwrite the default number of iterations
76 | # -> any other value does not make sense.
77 | _NITER = 1
78 | 
79 | def __init__(self, data, num_bases=4, method='pca', robust_map=True):
80 | 
81 | AA.__init__(self, data, num_bases=num_bases)
82 | self.sub = []
83 | self._robust_map = robust_map
84 | self._method = method
85 | 
86 | 
87 | def init_h(self):
88 | self.H = np.zeros((self._num_bases, self._num_samples))
89 | 
90 | def init_w(self):
91 | self.W = np.zeros((self._data_dimension, self._num_bases))
92 | 
93 | def update_w(self):
94 | """ compute new W """
95 | 
96 | def select_next(iterval):
97 | """ select the next best data sample using robust map
98 | or simply the max iterval ...
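(iterval holds the current selection score of every column; the index of
the chosen sample is returned)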
""" 99 | 100 | if self._robust_map: 101 | k = np.argsort(iterval)[::-1] 102 | d_sub = self.data[:,k[:self._robust_nselect]] 103 | self.sub.extend(k[:self._robust_nselect]) 104 | 105 | # cluster d_sub 106 | kmeans_mdl = Kmeans(d_sub, num_bases=self._robust_cluster) 107 | kmeans_mdl.factorize(niter=10) 108 | 109 | # get largest cluster 110 | h = np.histogram(kmeans_mdl.assigned, range(self._robust_cluster+1))[0] 111 | largest_cluster = np.argmax(h) 112 | sel = pdist(kmeans_mdl.W[:, largest_cluster:largest_cluster+1], d_sub) 113 | sel = k[np.argmin(sel)] 114 | else: 115 | sel = np.argmax(iterval) 116 | 117 | return sel 118 | 119 | EPS = 10**-8 120 | 121 | if scipy.sparse.issparse(self.data): 122 | norm_data = np.sqrt(self.data.multiply(self.data).sum(axis=0)) 123 | norm_data = np.array(norm_data).reshape((-1)) 124 | else: 125 | norm_data = np.sqrt(np.sum(self.data**2, axis=0)) 126 | 127 | 128 | self.select = [] 129 | 130 | if self._method == 'pca' or self._method == 'aa': 131 | iterval = norm_data.copy() 132 | 133 | if self._method == 'nmf': 134 | iterval = np.sum(self.data, axis=0)/(np.sqrt(self.data.shape[0])*norm_data) 135 | iterval = 1.0 - iterval 136 | 137 | self.select.append(select_next(iterval)) 138 | 139 | 140 | for l in range(1, self._num_bases): 141 | 142 | if scipy.sparse.issparse(self.data): 143 | c = self.data[:, self.select[-1]:self.select[-1]+1].T * self.data 144 | c = np.array(c.todense()) 145 | else: 146 | c = np.dot(self.data[:,self.select[-1]], self.data) 147 | 148 | c = c/(norm_data * norm_data[self.select[-1]]) 149 | 150 | if self._method == 'pca': 151 | c = 1.0 - np.abs(c) 152 | c = c * norm_data 153 | 154 | elif self._method == 'aa': 155 | c = (c*-1.0 + 1.0)/2.0 156 | c = c * norm_data 157 | 158 | elif self._method == 'nmf': 159 | c = 1.0 - np.abs(c) 160 | 161 | ### update the estimated volume 162 | iterval = c * iterval 163 | 164 | # detect the next best data point 165 | self.select.append(select_next(iterval)) 166 | 167 | self._logger.info('cur_nodes: ' + str(self.select)) 168 | 169 | # sort indices, otherwise h5py won't work 170 | self.W = self.data[:, np.sort(self.select)] 171 | 172 | # "unsort" it again to keep the correct order 173 | self.W = self.W[:, np.argsort(np.argsort(self.select))] 174 | 175 | def factorize(self, show_progress=False, compute_w=True, compute_h=True, 176 | compute_err=True, robust_cluster=3, niter=1, robust_nselect=-1): 177 | """ Factorize s.t. WH = data 178 | 179 | Parameters 180 | ---------- 181 | show_progress : bool 182 | print some extra information to stdout. 183 | False, default 184 | compute_h : bool 185 | iteratively update values for H. 186 | True, default 187 | compute_w : bool 188 | iteratively update values for W. 189 | default, True 190 | compute_err : bool 191 | compute Frobenius norm |data-WH| after each update and store 192 | it to .ferr[k]. 193 | robust_cluster : int, optional 194 | set the number of clusters for robust map selection. 195 | 3, default 196 | robust_nselect : int, optional 197 | set the number of samples to consider for robust map 198 | selection. 199 | -1, default (automatically determine suitable number) 200 | 201 | Updated Values 202 | -------------- 203 | .W : updated values for W. 204 | .H : updated values for H. 205 | .ferr : Frobenius norm |data-WH|. 
206 |         """
207 |         self._robust_cluster = robust_cluster
208 |         self._robust_nselect = robust_nselect
209 | 
210 |         if self._robust_nselect == -1:
211 |             self._robust_nselect = int(np.round(np.log(self.data.shape[1]) * 2))
212 | 
213 |         AA.factorize(self, niter=1, show_progress=show_progress,
214 |                      compute_w=compute_w, compute_h=compute_h,
215 |                      compute_err=compute_err)
216 | 
217 | if __name__ == "__main__":
218 |     import doctest
219 |     doctest.testmod()
220 | 
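GMAP.update_w() above scores all samples, picks one index per round via select_next(), and finally copies the chosen data columns into W. A minimal end-to-end sketch of the robust selection path, assuming GMAP is re-exported at the package level and cvxopt is available; the random data matrix is made up for illustration:

import numpy as np
from pymf import GMAP

# toy data: 2-dimensional samples as columns
data = np.random.rand(2, 100)

# robust_map=True routes every pick through the k-means based
# select_next(); robust_cluster / robust_nselect tune that step
gmap_mdl = GMAP(data, num_bases=3, method='pca', robust_map=True)
gmap_mdl.factorize(robust_cluster=3, robust_nselect=-1)

print(gmap_mdl.select)    # indices of the chosen samples
print(gmap_mdl.W.shape)   # (2, 3) -- the corresponding data columns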
--------------------------------------------------------------------------------
/pymf/chnmf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright (C) Christian Thurau, 2010.
4 | # Licensed under the GNU General Public License (GPL).
5 | # http://www.gnu.org/licenses/gpl.txt
6 | """
7 | PyMF Convex Hull Non-negative Matrix Factorization [1]
8 | 
9 | CHNMF(AA) : Class for Convex-hull NMF
10 | quickhull : Function for finding the convex hull in 2D
11 | 
12 | [1] C. Thurau, K. Kersting, and C. Bauckhage. Convex Non-Negative Matrix
13 | Factorization in the Wild. ICDM 2009.
14 | """
15 | 
16 | 
17 | import numpy as np
18 | 
19 | from itertools import combinations
20 | from .dist import vq
21 | from .pca import PCA
22 | from .aa import AA
23 | 
24 | __all__ = ["CHNMF"]
25 | 
26 | 
27 | def quickhull(sample):
28 |     """ Find data points on the convex hull of a supplied data set
29 | 
30 |     Args:
31 |         sample: data points as row vectors, n x d
32 |             n - number of samples
33 |             d - data dimension (should be two)
34 | 
35 |     Returns:
36 |         a k x d matrix containing the convex hull data points
37 |     """
38 | 
39 |     link = lambda a, b: np.concatenate((a, b[1:]))
40 |     edge = lambda a, b: np.concatenate(([a], [b]))
41 | 
42 |     def dome(sample, base):
43 |         h, t = base
44 |         dists = np.dot(sample - h, np.dot(((0, -1), (1, 0)), (t - h)))
45 |         outer = np.repeat(sample, dists > 0, axis=0)
46 | 
47 |         if len(outer):
48 |             pivot = sample[np.argmax(dists)]
49 |             return link(dome(outer, edge(h, pivot)),
50 |                         dome(outer, edge(pivot, t)))
51 |         else:
52 |             return base
53 | 
54 |     if len(sample) > 2:
55 |         axis = sample[:, 0]
56 |         base = np.take(sample, [np.argmin(axis), np.argmax(axis)], axis=0)
57 |         return link(dome(sample, base),
58 |                     dome(sample, base[::-1]))
59 |     else:
60 |         return sample
61 | 
62 | class CHNMF(AA):
63 |     """
64 |     CHNMF(data, num_bases=4)
65 | 
66 |     Convex Hull Non-negative Matrix Factorization. Factorize a data matrix into
67 |     two matrices s.t. F = | data - W*H | is minimal. H is restricted to convexity
68 |     (H >= 0, sum(H, axis=1) = [1 .. 1]) and W resides on actual data points.
69 |     Factorization is solved via an alternating least squares optimization using
70 |     the quadratic programming solver from cvxopt. The results are usually
71 |     equivalent to Archetypal Analysis (pymf.AA) but CHNMF also works for very
72 |     large datasets.
73 | 
74 |     Parameters
75 |     ----------
76 |     data : array_like, shape (_data_dimension, _num_samples)
77 |         the input data
78 |     num_bases: int, optional
79 |         Number of bases to compute (column rank of W and row rank of H).
80 |         4 (default)
81 |     base_sel: int,
82 |         Number of pairwise basis vector projections. Set to a value < rank(data).
83 |         Computation time scales exponentially with this value; usually rather
84 |         low values are sufficient (3-10).
85 | 
86 |     Attributes
87 |     ----------
88 |     W : "data_dimension x num_bases" matrix of basis vectors
89 |     H : "num bases x num_samples" matrix of coefficients
90 |     ferr : Frobenius norm (after calling .factorize())
91 | 
92 |     Example
93 |     -------
94 |     Applying CHNMF to a simple data set:
95 | 
96 |     >>> import numpy as np
97 |     >>> from pymf.chnmf import CHNMF
98 |     >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
99 | 
100 |     Use 2 basis vectors -> W has shape (data_dimension, 2).
101 | 
102 |     >>> chnmf_mdl = CHNMF(data, num_bases=2)
103 | 
104 |     And start computing the factorization.
105 | 
106 |     >>> chnmf_mdl.factorize()
107 | 
108 |     The basis vectors are now stored in chnmf_mdl.W, the coefficients in
109 |     chnmf_mdl.H. To compute coefficients for an existing set of basis vectors
110 |     simply copy W to chnmf_mdl.W, and set compute_w to False:
111 | 
112 |     >>> data = np.array([[1.5, 2.0], [1.2, 1.8]])
113 |     >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
114 |     >>> chnmf_mdl = CHNMF(data, num_bases=2)
115 |     >>> chnmf_mdl.W = W
116 |     >>> chnmf_mdl.factorize(compute_w=False)
117 | 
118 |     The result is a set of coefficients chnmf_mdl.H, s.t. data = W * chnmf_mdl.H.
119 |     """
120 | 
121 |     def __init__(self, data, num_bases=4, base_sel=3):
122 | 
123 |         # call inherited method
124 |         AA.__init__(self, data, num_bases=num_bases)
125 | 
126 |         # base_sel should never be larger than the actual data dimension
127 |         self._base_sel = base_sel
128 |         if base_sel > self.data.shape[0]:
129 |             self._base_sel = self.data.shape[0]
130 | 
131 |     def init_h(self):
132 |         self.H = np.zeros((self._num_bases, self._num_samples))
133 | 
134 |     def init_w(self):
135 |         self.W = np.zeros((self._data_dimension, self._num_bases))
136 | 
137 |     def _map_w_to_data(self):
138 |         """ Return data points that are most similar to basis vectors W
139 |         """
140 | 
141 |         # assign W to the next best data sample
142 |         self._Wmapped_index = vq(self.data, self.W)
143 |         self.Wmapped = np.zeros(self.W.shape)
144 | 
145 |         # do not directly assign, i.e. Wdist = self.data[:,sel],
146 |         # as sel might be unsorted (not in ascending order)
147 |         # -> sorting sel would break the matching to W if
148 |         # self.data is stored as a hdf5 table (see h5py)
149 |         for i, s in enumerate(self._Wmapped_index):
150 |             self.Wmapped[:,i] = self.data[:,s]
151 | 
152 |     def update_w(self):
153 |         """ compute new W """
154 |         def select_hull_points(data, n=3):
155 |             """ select data points for pairwise projections of the first n
156 |             dimensions """
157 | 
158 |             # iterate over all projections and select data points
159 |             idx = np.array([])
160 | 
161 |             # iterate over some pairwise combinations of dimensions
162 |             for i in combinations(range(n), 2):
163 |                 # sample convex hull points in 2D projection
164 |                 convex_hull_d = quickhull(data[i, :].T)
165 | 
166 |                 # get indices for convex hull data points
167 |                 idx = np.append(idx, vq(data[i, :], convex_hull_d.T))
168 |                 idx = np.unique(idx)
169 | 
170 |             return np.int32(idx)
171 | 
172 |         # determine convex hull data points using either PCA or random
173 |         # projections
174 |         method = 'randomprojection'
175 |         if method == 'pca':
176 |             pcamodel = PCA(self.data)
177 |             pcamodel.factorize(show_progress=False)
178 |             proj = pcamodel.H
179 |         else:
180 |             R = np.random.randn(self._base_sel, self._data_dimension)
181 |             proj = np.dot(R, self.data)
182 | 
183 |         self._hull_idx = select_hull_points(proj, n=self._base_sel)
184 |         aa_mdl = AA(self.data[:, self._hull_idx], num_bases=self._num_bases)
185 | 
186 |         # determine W
187 |         aa_mdl.factorize(niter=50, compute_h=True, compute_w=True,
188 |                          compute_err=True, show_progress=False)
189 | 
190 |         self.W = aa_mdl.W
191 |         self._map_w_to_data()
192 | 
193 |     def factorize(self, show_progress=False, compute_w=True, compute_h=True,
194 |                   compute_err=True, niter=1):
195 |         """ Factorize s.t. WH = data
196 | 
197 |         Parameters
198 |         ----------
199 |         show_progress : bool
200 |             print some extra information to stdout.
201 |         compute_h : bool
202 |             iteratively update values for H.
203 |         compute_w : bool
204 |             iteratively update values for W.
205 |         compute_err : bool
206 |             compute Frobenius norm |data-WH| after each update and store
207 |             it to .ferr[k].
208 | 
209 |         Updated Values
210 |         --------------
211 |         .W : updated values for W.
212 |         .H : updated values for H.
213 |         .ferr : Frobenius norm |data-WH|.
214 |         """
215 | 
216 |         AA.factorize(self, niter=1, show_progress=show_progress,
217 |                      compute_w=compute_w, compute_h=compute_h,
218 |                      compute_err=compute_err)
219 | 
220 | 
221 | if __name__ == "__main__":
222 |     import doctest
223 |     doctest.testmod()
224 | 
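quickhull() above expects points as rows and recursively splits the set along the line through the two horizontal extremes, so it can be exercised independently of CHNMF. A small sketch on a hand-made 2-D point set (toy values, for illustration only):

import numpy as np
from pymf.chnmf import quickhull

# five points as rows; the interior point must not appear on the hull
points = np.array([[0.0, 0.0],
                   [1.0, 0.0],
                   [1.0, 1.0],
                   [0.0, 1.0],
                   [0.5, 0.5]])

# corners of the square in order; the first corner is repeated
# at the end to close the polygon, the center point is dropped
print(quickhull(points))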
--------------------------------------------------------------------------------
/pymf/svd.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright (C) Christian Thurau, 2010.
4 | # Licensed under the GNU General Public License (GPL).
5 | # http://www.gnu.org/licenses/gpl.txt
6 | """
7 | PyMF Singular Value Decomposition.
8 | 
9 | SVD : Class for Singular Value Decomposition
10 | pinv() : Compute the pseudoinverse of a Matrix
11 | 
12 | """
13 | 
14 | 
15 | 
16 | from numpy.linalg import eigh
17 | import scipy.sparse
18 | 
19 | try:
20 |     import scipy.sparse.linalg.eigen.arpack as linalg
21 | except (ImportError, AttributeError):
22 |     import scipy.sparse.linalg as linalg
23 | 
24 | 
25 | import numpy as np
26 | 
27 | def pinv(A, k=-1, eps=10**-8):
28 |     # compute the pseudoinverse of a matrix via its SVD;
29 |     # singular values below eps are treated as zero
30 |     svd_mdl = SVD(A, k=k)
31 |     svd_mdl.factorize()
32 | 
33 |     S = svd_mdl.S
34 |     Sdiag = S.diagonal()
35 |     Sdiag = np.where(Sdiag > eps, 1.0/Sdiag, 0.0)
36 | 
37 |     for i in range(S.shape[0]):
38 |         S[i,i] = Sdiag[i]
39 | 
40 |     if scipy.sparse.issparse(A):
41 |         A_p = svd_mdl.V.T * (S * svd_mdl.U.T)
42 |     else:
43 |         A_p = np.dot(svd_mdl.V.T, np.multiply(np.diag(S)[:,np.newaxis], svd_mdl.U.T))
44 | 
45 |     return A_p
46 | 
47 | 
48 | class SVD():
49 |     """
50 |     SVD(data, k=-1, rrank=0, crank=0)
51 | 
52 | 
53 |     Singular Value Decomposition. Factorize a data matrix into three matrices s.t.
54 |     F = | data - USV | is minimal. U and V correspond to eigenvectors of the matrices
55 |     data*data.T and data.T*data.
56 | 
57 |     Parameters
58 |     ----------
59 |     data : array_like [data_dimension x num_samples]
60 |         the input data
61 | 
62 |     Attributes
63 |     ----------
64 |     U,S,V : submatrices s.t. data = USV
65 | 
66 |     Example
67 |     -------
68 |     >>> import numpy as np
69 |     >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
70 |     >>> svd_mdl = SVD(data)
71 |     >>> svd_mdl.factorize()
72 |     """
73 | 
74 |     _EPS=10**-8
75 | 
76 |     def __init__(self, data, k=-1, rrank=0, crank=0):
77 |         self.data = data
78 |         (self._rows, self._cols) = self.data.shape
79 |         if rrank > 0:
80 |             self._rrank = rrank
81 |         else:
82 |             self._rrank = self._rows
83 | 
84 |         if crank > 0:
85 |             self._crank = crank
86 |         else:
87 |             self._crank = self._cols
88 | 
89 |         # number of eigenvectors to compute (-1 means full rank)
90 |         self._k = k
91 | 
92 |     def frobenius_norm(self):
93 |         """ Frobenius norm (||data - USV||) for a data matrix and a low rank
94 |         approximation given by USV using rank k for U and V
95 | 
96 |         Returns:
97 |             frobenius norm: F = ||data - USV||
98 |         """
99 |         if scipy.sparse.issparse(self.data):
100 |             err = self.data - self.U*self.S*self.V
101 |             err = err.multiply(err)
102 |             err = np.sqrt(err.sum())
103 |         else:
104 |             err = self.data[:,:] - np.dot(np.dot(self.U, self.S), self.V)
105 |             err = np.sqrt(np.sum(err**2))
106 | 
107 |         return err
108 | 
109 | 
110 |     def factorize(self):
111 |         def _right_svd():
112 |             AA = np.dot(self.data[:,:], self.data[:,:].T)
113 |             values, u_vectors = eigh(AA)
114 | 
115 |             # get rid of too low eigenvalues
116 |             u_vectors = u_vectors[:, values > self._EPS]
117 |             values = values[values > self._EPS]
118 | 
119 |             # sort eigenvectors according to largest value
120 |             idx = np.argsort(values)
121 |             values = values[idx[::-1]]
122 | 
123 |             # argsort sorts in ascending order -> access is backwards
124 |             self.U = u_vectors[:,idx[::-1]]
125 | 
126 |             # compute S
127 |             self.S = np.diag(np.sqrt(values))
128 | 
129 |             # and the inverse of it
130 |             S_inv = np.diag(np.sqrt(values)**-1)
131 | 
132 |             # compute V from it
133 |             self.V = np.dot(S_inv, np.dot(self.U[:,:].T, self.data[:,:]))
134 | 
135 | 
136 |         def _left_svd():
137 |             AA = np.dot(self.data[:,:].T, self.data[:,:])
138 |             values, v_vectors = eigh(AA)
139 | 
140 |             # get rid of too low eigenvalues
141 |             v_vectors = v_vectors[:, values > self._EPS]
142 |             values = values[values > self._EPS]
143 | 
144 |             # sort eigenvectors according to largest value
145 |             # argsort sorts in ascending order -> access is backwards
146 |             idx = np.argsort(values)[::-1]
147 |             values = values[idx]
148 | 
149 |             # compute S
150 |             self.S = np.diag(np.sqrt(values))
151 | 
152 |             # and the inverse of it
153 |             S_inv = np.diag(1.0/np.sqrt(values))
154 | 
155 |             Vtmp = v_vectors[:,idx]
156 | 
157 |             self.U = np.dot(np.dot(self.data[:,:], Vtmp), S_inv)
158 |             self.V = Vtmp.T
159 | 
160 |         def _sparse_right_svd():
161 |             # for some reason arpack does not allow computing all rank(A) eigenvectors
162 |             AA = self.data*self.data.transpose()
163 |             if self.data.shape[0] > 1:
164 |                 # do not compute full rank if desired
165 |                 if self._k > 0 and self._k < self.data.shape[0]-1:
166 |                     k = self._k
167 |                 else:
168 |                     k = self.data.shape[0]-1
169 | 
170 |                 try:
171 |                     values, u_vectors = linalg.eigen_symmetric(AA,k=k)
172 |                 except AttributeError:
173 |                     values, u_vectors = linalg.eigsh(AA,k=k)
174 |             else:
175 |                 values, u_vectors = eigh(AA.todense())
176 | 
177 |             # get rid of too low eigenvalues
178 |             u_vectors = u_vectors[:, values > self._EPS]
179 |             values = values[values > self._EPS]
180 | 
181 |             # sort eigenvectors according to largest value
182 |             idx = np.argsort(values)
183 |             values = values[idx[::-1]]
184 | 
185 |             # argsort sorts in ascending order -> access is backwards
186 |             self.U = scipy.sparse.csc_matrix(u_vectors[:,idx[::-1]])
187 | 
188 |             # compute S
189 |             self.S = scipy.sparse.csc_matrix(np.diag(np.sqrt(values)))
190 | 
191 |             # and the inverse of it
192 |             S_inv = scipy.sparse.csc_matrix(np.diag(1.0/np.sqrt(values)))
193 | 
194 |             # compute V from it
195 |             self.V = self.U.transpose() * self.data
196 |             self.V = S_inv * self.V
197 | 
198 |         def _sparse_left_svd():
199 |             # for some reason arpack does not allow computing all rank(A) eigenvectors
200 |             AA = self.data.transpose()*self.data
201 | 
202 |             if self.data.shape[1] > 1:
203 |                 # do not compute full rank if desired
204 |                 if self._k > 0 and self._k < self.data.shape[1]-1:
205 |                     k = self._k
206 |                 else:
207 |                     k = self.data.shape[1]-1
208 |                 try:
209 |                     values, v_vectors = linalg.eigen_symmetric(AA,k=k)
210 |                 except AttributeError:
211 |                     values, v_vectors = linalg.eigsh(AA,k=k)
212 |             else:
213 |                 values, v_vectors = eigh(AA.todense())
214 |             # get rid of too low eigenvalues
215 |             v_vectors = v_vectors[:, values > self._EPS]
216 |             values = values[values > self._EPS]
217 | 
218 |             # sort eigenvectors according to largest value
219 |             idx = np.argsort(values)
220 |             values = values[idx[::-1]]
221 | 
222 |             # argsort sorts in ascending order -> access is backwards
223 |             self.V = scipy.sparse.csc_matrix(v_vectors[:,idx[::-1]])
224 | 
225 |             # compute S
226 |             self.S = scipy.sparse.csc_matrix(np.diag(np.sqrt(values)))
227 | 
228 |             # and the inverse of it
229 |             S_inv = scipy.sparse.csc_matrix(np.diag(1.0/np.sqrt(values)))
230 | 
231 |             self.U = self.data * self.V * S_inv
232 |             self.V = self.V.transpose()
233 | 
234 | 
235 |         if self._rows > self._cols:
236 |             if scipy.sparse.issparse(self.data):
237 |                 _sparse_left_svd()
238 |             else:
239 |                 _left_svd()
240 |         else:
241 |             if scipy.sparse.issparse(self.data):
242 |                 _sparse_right_svd()
243 |             else:
244 |                 _right_svd()
245 | 
246 | if __name__ == "__main__":
247 |     import doctest
248 |     doctest.testmod()
249 | 
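pinv() above inverts the non-negligible singular values and recombines the factors, i.e. it returns V.T * S^-1 * U.T. A quick sanity check of the Moore-Penrose property A * pinv(A) * A = A on a small dense matrix (toy values, for illustration):

import numpy as np
from pymf.svd import pinv

A = np.array([[1.0, 0.0, 2.0],
              [0.0, 1.0, 1.0]])

A_p = pinv(A)   # shape (3, 2)

# the pseudoinverse must reproduce A when sandwiched this way
print(np.allclose(np.dot(np.dot(A, A_p), A), A))   # True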
--------------------------------------------------------------------------------
/pymf/sivm.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright (C) Christian Thurau, 2010.
4 | # Licensed under the GNU General Public License (GPL).
5 | # http://www.gnu.org/licenses/gpl.txt
6 | """
7 | PyMF Simplex Volume Maximization [1]
8 | 
9 | SIVM: class for SiVM
10 | 
11 | [1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume
12 | Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int.
13 | Conf. on Information and Knowledge Management. ACM. 2010.
14 | """
15 | 
16 | 
17 | import scipy.sparse
18 | import numpy as np
19 | 
20 | from .dist import *
21 | from .aa import AA
22 | 
23 | __all__ = ["SIVM"]
24 | 
25 | class SIVM(AA):
26 |     """
27 |     SIVM(data, num_bases=4, dist_measure='l2')
28 | 
29 | 
30 |     Simplex Volume Maximization. Factorize a data matrix into two matrices s.t.
31 |     F = | data - W*H | is minimal. H is restricted to convexity. W is iteratively
32 |     found by maximizing the volume of the resulting simplex (see [1]).
33 | 
34 |     Parameters
35 |     ----------
36 |     data : array_like, shape (_data_dimension, _num_samples)
37 |         the input data
38 |     num_bases: int, optional
39 |         Number of bases to compute (column rank of W and row rank of H).
40 |         4 (default)
41 |     dist_measure : one of 'l2', 'cosine', 'abs_cosine', 'weighted_abs_cosine', 'l1', 'kl'
42 |         The default is 'l2', which maximizes the volume of the simplex. In
43 |         contrast, 'cosine' maximizes the volume of a cone (see [1] for details).
44 |     init : string (default: 'fastmap')
45 |         'fastmap' or 'origin'. Sets the method used for finding the very first
46 |         basis vector. 'origin' assumes the zero vector, 'fastmap' picks one of
47 |         the two vectors that have the largest pairwise distance.
48 |     Attributes
49 |     ----------
50 |     W : "data_dimension x num_bases" matrix of basis vectors
51 |     H : "num bases x num_samples" matrix of coefficients
52 |     ferr : Frobenius norm (after calling .factorize())
53 | 
54 |     Example
55 |     -------
56 |     Applying SIVM to a simple data set:
57 | 
58 |     >>> import numpy as np
59 |     >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
60 |     >>> sivm_mdl = SIVM(data, num_bases=2)
61 |     >>> sivm_mdl.factorize()
62 | 
63 |     The basis vectors are now stored in sivm_mdl.W, the coefficients in sivm_mdl.H.
64 |     To compute coefficients for an existing set of basis vectors simply copy W
65 |     to sivm_mdl.W, and set compute_w to False:
66 | 
67 |     >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
68 |     >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
69 |     >>> sivm_mdl = SIVM(data, num_bases=2)
70 |     >>> sivm_mdl.W = W
71 |     >>> sivm_mdl.factorize(compute_w=False)
72 | 
73 |     The result is a set of coefficients sivm_mdl.H, s.t. data = W * sivm_mdl.H.
74 |     """
75 | 
76 |     # always overwrite the default number of iterations
77 |     # -> any other value does not make sense.
78 |     _NITER = 1
79 | 
80 |     def __init__(self, data, num_bases=4, dist_measure='l2', init='fastmap'):
81 | 
82 |         AA.__init__(self, data, num_bases=num_bases)
83 | 
84 |         self._dist_measure = dist_measure
85 |         self._init = init
86 | 
87 |         # assign the correct distance function
88 |         if self._dist_measure == 'l1':
89 |             self._distfunc = l1_distance
90 | 
91 |         elif self._dist_measure == 'l2':
92 |             self._distfunc = l2_distance
93 | 
94 |         elif self._dist_measure == 'cosine':
95 |             self._distfunc = cosine_distance
96 | 
97 |         elif self._dist_measure == 'abs_cosine':
98 |             self._distfunc = abs_cosine_distance
99 | 
100 |         elif self._dist_measure == 'weighted_abs_cosine':
101 |             self._distfunc = weighted_abs_cosine_distance
102 | 
103 |         elif self._dist_measure == 'kl':
104 |             self._distfunc = kl_divergence
105 |         else:  # fail fast instead of raising an AttributeError later
106 |             raise ValueError('unknown dist_measure: ' + str(dist_measure))
107 |     def _distance(self, idx):
108 |         """ compute distances of a specific data point to all other samples"""
109 | 
110 |         if scipy.sparse.issparse(self.data):
111 |             step = self.data.shape[1]
112 |         else:
113 |             step = 50000
114 | 
115 |         d = np.zeros((self.data.shape[1]))
116 |         if idx == -1:
117 |             # set vec to origin if idx=-1
118 |             vec = np.zeros((self.data.shape[0], 1))
119 |             if scipy.sparse.issparse(self.data):
120 |                 vec = scipy.sparse.csc_matrix(vec)
121 |         else:
122 |             vec = self.data[:, idx:idx+1]
123 | 
124 |         self._logger.info('compute distance to node ' + str(idx))
125 | 
126 |         # slice data into smaller chunks
127 |         for idx_start in range(0, self.data.shape[1], step):
128 |             if idx_start + step > self.data.shape[1]:
129 |                 idx_end = self.data.shape[1]
130 |             else:
131 |                 idx_end = idx_start + step
132 | 
133 |             d[idx_start:idx_end] = self._distfunc(
134 |                 self.data[:,idx_start:idx_end], vec)
135 |             self._logger.info('completed:' +
136 |                 str(idx_end/(self.data.shape[1]/100.0)) + "%")
137 |         return d
138 | 
139 |     def init_h(self):
140 |         self.H = np.zeros((self._num_bases, self._num_samples))
141 | 
142 |     def init_w(self):
143 |         self.W = np.zeros((self._data_dimension, self._num_bases))
144 | 
145 |     def init_sivm(self):
146 |         self.select = []
147 |         if self._init == 'fastmap':
148 |             # FastMap-like initialization
149 |             # set the starting index for fastmap initialization
150 |             cur_p = 0
151 | 
152 |             # after 3 iterations the first "real" index is found
153 |             for i in range(3):
154 |                 d = self._distance(cur_p)
155 |                 cur_p = np.argmax(d)
156 | 
157 |             # store maximal found distance -> later used for "a" (-> update_w)
158 |             self._maxd = np.max(d)
159 |             self.select.append(cur_p)
160 | 
161 |         elif self._init == 'origin':
162 |             # set first vertex to origin
163 |             cur_p = -1
164 |             d = self._distance(cur_p)
165 |             self._maxd = np.max(d)
166 |             self.select.append(cur_p)
167 | 
168 |     def update_w(self):
169 |         """ compute new W """
170 |         EPS = 10**-8
171 |         self.init_sivm()
172 | 
173 |         # initialize the recursively updated distance measures
174 |         d_square = np.zeros((self.data.shape[1]))
175 |         d_sum = np.zeros((self.data.shape[1]))
176 |         d_i_times_d_j = np.zeros((self.data.shape[1]))
177 |         distiter = np.zeros((self.data.shape[1]))
178 |         a = np.log(self._maxd)
179 | 
180 | 
181 |         for l in range(1, self._num_bases):
182 |             d = self._distance(self.select[l-1])
183 | 
184 |             # take the log of d (usually more stable than d)
185 |             d = np.log(d + EPS)
186 | 
187 |             d_i_times_d_j += d * d_sum
188 |             d_sum += d
189 |             d_square += d**2
190 |             distiter = d_i_times_d_j + a*d_sum - (l/2.0) * d_square
191 | 
192 |             # detect the next best data point
193 |             self.select.append(np.argmax(distiter))
194 | 
195 |         self._logger.info('cur_nodes: ' + str(self.select))
196 | 
197 |         # sort indices, otherwise h5py won't work
198 |         self.W = self.data[:, np.sort(self.select)]
199 | 
200 |         # "unsort" it again to keep the correct order
201 |         self.W = self.W[:, np.argsort(np.argsort(self.select))]
202 | 
203 |     def factorize(self, show_progress=False, compute_w=True, compute_h=True,
204 |                   compute_err=True, niter=1):
205 |         """ Factorize s.t. WH = data
206 | 
207 |         Parameters
208 |         ----------
209 |         show_progress : bool
210 |             print some extra information to stdout.
211 |         compute_h : bool
212 |             iteratively update values for H.
213 |         compute_w : bool
214 |             iteratively update values for W.
215 |         compute_err : bool
216 |             compute Frobenius norm |data-WH| after each update and store
217 |             it to .ferr[k].
218 | 
219 |         Updated Values
220 |         --------------
221 |         .W : updated values for W.
222 |         .H : updated values for H.
223 |         .ferr : Frobenius norm |data-WH|.
224 |         """
225 | 
226 |         AA.factorize(self, niter=1, show_progress=show_progress,
227 |                      compute_w=compute_w, compute_h=compute_h,
228 |                      compute_err=compute_err)
229 | 
230 | if __name__ == "__main__":
231 |     import doctest
232 |     doctest.testmod()
233 | 
--------------------------------------------------------------------------------
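Because SIVM restricts W to actual data columns, its selections can be inspected directly via the .select attribute set in update_w(). A short end-to-end sketch on a toy matrix whose extreme points are known by construction (values made up for illustration; computing H requires cvxopt, and SIVM is assumed to be re-exported at the package level):

import numpy as np
from pymf import SIVM

# three extreme points (columns 0, 1, 2) plus two interior points
data = np.array([[0.0, 1.0, 0.5, 0.4, 0.45],
                 [0.0, 0.0, 1.0, 0.2, 0.40]])

sivm_mdl = SIVM(data, num_bases=3, dist_measure='l2')
sivm_mdl.factorize()

print(sivm_mdl.select)   # indices of the selected samples
print(sivm_mdl.W)        # the corresponding data columns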