├── setup.cfg ├── setup.py ├── .gitignore ├── pymf ├── __init__.py ├── vol.py ├── nmfnnls.py ├── greedycur.py ├── cmd.py ├── laesa.py ├── kmeans.py ├── cursl.py ├── snmf.py ├── cmeans.py ├── sivm_cur.py ├── nmfals.py ├── rnmf.py ├── nndsvd.py ├── dist.py ├── pca.py ├── bnmf.py ├── cur.py ├── aa.py ├── sivm_sgreedy.py ├── greedy.py ├── sivm_search.py ├── cnmf.py ├── sivm_gsat.py ├── nmf.py ├── sub.py ├── gmap.py ├── chnmf.py ├── svd.py └── sivm.py ├── README.txt └── tests └── test_pymf.py /setup.cfg: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = --cov-report term-missing --cov pymf 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | setuptools.setup( 4 | name='PyMF', 5 | version='0.1.9', 6 | description='Python Matrix Factorization Module', 7 | author='Christian Thurau', 8 | author_email='cthurau@googlemail.com', 9 | url='http://code.google.com/p/pymf/', 10 | packages=setuptools.find_packages(), 11 | license='OSI Approved :: GNU General Public License (GPL)', 12 | install_requires=[ 13 | 'cvxopt', 14 | 'numpy', 15 | 'scipy', 16 | ], 17 | extras_require={ 18 | 'tests': [ 19 | 'pytest', 20 | 'pytest-cov', 21 | ], 22 | }, 23 | tests_require=[ 24 | 'pytest', 25 | 'pytest-cov', 26 | ], 27 | long_description=open('README.txt').read(), 28 | ) 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | -------------------------------------------------------------------------------- /pymf/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | 7 | '''pymf is a package for several Matrix Factorization variants.- 8 | Detailed documentation is available at http://pymf.googlecode.com 9 | Copyright (C) Christian Thurau, 2010. 
GNU General Public License (GPL) 10 | ''' 11 | 12 | 13 | import numpy as np 14 | from scipy.sparse import issparse 15 | 16 | from .nmf import * 17 | from .nmfals import * 18 | from .nmfnnls import * 19 | from .cnmf import * 20 | from .chnmf import * 21 | from .snmf import * 22 | from .aa import * 23 | 24 | from .laesa import * 25 | from .bnmf import * 26 | 27 | from .sub import * 28 | 29 | from .svd import * 30 | from .pca import * 31 | from .cur import * 32 | from .sivm_cur import * 33 | from .cmd import * 34 | 35 | from .kmeans import * 36 | from .cmeans import * 37 | 38 | from .sivm import * 39 | from .sivm_sgreedy import * 40 | from .sivm_search import * 41 | from .sivm_gsat import * 42 | 43 | from .gmap import * 44 | -------------------------------------------------------------------------------- /pymf/vol.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python2.6 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF functions for computing matrix/simplex volumes 8 | 9 | cmdet(): Cayley-Menger Determinant 10 | simplex_volume(): Ordinary simplex volume 11 | 12 | """ 13 | 14 | 15 | import numpy as np 16 | try: 17 | from scipy.misc.common import factorial 18 | except: 19 | from scipy.misc import factorial 20 | 21 | __all__ = ["cmdet", "simplex"] 22 | 23 | 24 | def cmdet(d): 25 | # compute the CMD determinant of the euclidean distance matrix d 26 | # -> d should not be squared! 27 | D = np.ones((d.shape[0]+1,d.shape[0]+1)) 28 | D[0,0] = 0.0 29 | D[1:,1:] = d**2 30 | j = np.float32(D.shape[0]-2) 31 | f1 = (-1.0)**(j+1) / ( (2**j) * ((factorial(j))**2)) 32 | cmd = f1 * np.linalg.det(D) 33 | # sometimes, for very small values "cmd" might be negative ... 34 | return np.sqrt(np.abs(cmd)) 35 | 36 | 37 | def simplex(d): 38 | # compute the simplex volume using coordinates 39 | D = np.ones((d.shape[0]+1, d.shape[1])) 40 | D[1:,:] = d 41 | vol = np.abs(np.linalg.det(D)) / factorial(d.shape[1] - 1) 42 | return vol 43 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | Matrix Factorization Methods for Python (pymf) 2 | ============================================== 3 | 4 | What is PyMF? 5 | ------------- 6 | 7 | Python Matrix Factorization (PyMF) is a module for several constrained/unconstrained 8 | matrix factorization (and related) methods. The module is early alpha and not very well 9 | tested. 
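
All factorization classes share the same small interface; the following is a
minimal sketch using NMF (most of the methods listed below follow the same
pattern, while the CUR-type decompositions take rrank/crank instead of
num_bases - see the class docstrings for details):

    >>> import numpy as np
    >>> import pymf
    >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
    >>> nmf_mdl = pymf.NMF(data, num_bases=2)
    >>> nmf_mdl.factorize(niter=10)

The basis vectors are then stored in nmf_mdl.W, the coefficients in nmf_mdl.H,
and the Frobenius norm of each iteration in nmf_mdl.ferr.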
10 | 
11 | PyMF currently includes the following methods: 
12 | 
13 | * Non-negative matrix factorization (NMF) 
14 | * Convex non-negative matrix factorization (CNMF) 
15 | * Semi non-negative matrix factorization (SNMF) 
16 | * Archetypal analysis (AA) 
17 | * Simplex volume maximization (SiVM) 
18 | * Convex-hull non-negative matrix factorization (CHNMF) 
19 | * Binary matrix factorization (BNMF) 
20 | * Singular value decomposition (SVD) 
21 | * Principal component analysis (PCA) 
22 | * K-means clustering (Kmeans) 
23 | * CUR decomposition (CUR) 
24 | * Compact matrix decomposition (CMD) 
25 | 
26 | Where to get it 
27 | --------------- 
28 | 
29 | * Main website, documentation: http://pymf.googlecode.com 
30 | * Contact email: cthurau at googlemail.com 
31 | 
32 | 
33 | Requires 
34 | -------- 
35 | 
36 | * Linux, Mac OS-X or Windows 
37 | * Python 2.5 or 2.6 
38 | * NumPy, Cvxopt, Scipy 
--------------------------------------------------------------------------------
/pymf/nmfnnls.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | #
 3 | # Copyright (C) Christian Thurau, 2010.
 4 | # Licensed under the GNU General Public License (GPL).
 5 | # http://www.gnu.org/licenses/gpl.txt
 6 | """
 7 | PyMF Non-negative Matrix Factorization.
 8 | 
 9 | NMFNNLS: Class for Non-negative Matrix Factorization using non-negative
10 | least squares optimization (requires scipy.optimize)
11 | 
12 | [1] Lee, D. D. and Seung, H. S. (1999), Learning the Parts of Objects by Non-negative
13 | Matrix Factorization, Nature 401(6755), 788-799.
14 | """
15 | 
16 | 
17 | 
18 | import scipy.optimize
19 | from .nmf import NMF
20 | 
21 | __all__ = ["NMFNNLS"]
22 | 
23 | class NMFNNLS(NMF):
24 |     """
25 |     NMFNNLS(data, num_bases=4)
26 | 
27 | 
28 |     Non-negative Matrix Factorization. Factorize a data matrix into two matrices
29 |     s.t. F = | data - W*H | is minimal. H and W are restricted to non-negative
30 |     values. Uses Lawson and Hanson's algorithm for non-negative constrained
31 |     least squares (-> also see scipy.optimize.nnls)
32 | 
33 |     Parameters
34 |     ----------
35 |     data : array_like, shape (_data_dimension, _num_samples)
36 |         the input data
37 |     num_bases: int, optional
38 |         Number of bases to compute (column rank of W and row rank of H).
39 |         4 (default)
40 | 
41 |     Attributes
42 |     ----------
43 |     W : "data_dimension x num_bases" matrix of basis vectors
44 |     H : "num bases x num_samples" matrix of coefficients
45 |     ferr : frobenius norm (after calling .factorize())
46 | 
47 |     Example
48 |     -------
49 |     Applying NMF to some rather stupid data set:
50 | 
51 |     >>> import numpy as np
52 |     >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
53 |     >>> nmf_mdl = NMFNNLS(data, num_bases=2)
54 |     >>> nmf_mdl.factorize(niter=10)
55 | 
56 |     The basis vectors are now stored in nmf_mdl.W, the coefficients in nmf_mdl.H.
57 |     To compute coefficients for an existing set of basis vectors simply copy W
58 |     to nmf_mdl.W, and set compute_w to False:
59 | 
60 |     >>> data = np.array([[1.5], [1.2]])
61 |     >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
62 |     >>> nmf_mdl = NMFNNLS(data, num_bases=2)
63 |     >>> nmf_mdl.W = W
64 |     >>> nmf_mdl.factorize(niter=1, compute_w=False)
65 | 
66 |     The result is a set of coefficients nmf_mdl.H, s.t. data = W * nmf_mdl.H.
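
    A quick way to check the quality of the approximation is to reconstruct
    the data from both factors (rec and err below are purely illustrative
    names, not attributes of the model):

    >>> rec = np.dot(nmf_mdl.W, nmf_mdl.H)
    >>> err = np.sqrt(np.sum((data[:,:] - rec)**2))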
67 | """ 68 | 69 | def update_h(self): 70 | def updatesingleH(i): 71 | self.H[:,i] = scipy.optimize.nnls(self.W, self.data[:,i])[0] 72 | 73 | map(updatesingleH, xrange(self._num_samples)) 74 | 75 | 76 | def update_w(self): 77 | def updatesingleW(i): 78 | self.W[i,:] = scipy.optimize.nnls(self.H.T, self.data[i,:].T)[0] 79 | 80 | map(updatesingleW, xrange(self._data_dimension)) 81 | -------------------------------------------------------------------------------- /pymf/greedycur.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python2.6 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | #$Id$ 7 | """ 8 | PyMF CUR-like Sparse Column Based Matrix Reconstruction via Greedy Approximation[1] 9 | 10 | GREEDYCUR: class for CUR-like decompositions using the GREEDY[2] algorithm. 11 | 12 | [1] Drineas, P., Kannan, R. and Mahoney, M. (2006), 'Fast Monte Carlo Algorithms III: 13 | Computing a Compressed Approixmate Matrix Decomposition', SIAM J. Computing 36(1), 184-206. 14 | [2] Ali Civril, Malik Magdon-Ismail. Deterministic Sparse Column Based Matrix 15 | Reconstruction via Greedy Approximation of SVD. ISAAC'2008. 16 | """ 17 | 18 | 19 | import numpy as np 20 | from .greedy import GREEDY 21 | from .cur import CUR 22 | 23 | __all__ = ["GREEDYCUR"] 24 | 25 | class GREEDYCUR(CUR): 26 | ''' 27 | GREEDYCUR(data, data, k=-1, rrank=0, crank=0) 28 | 29 | GREEDY-CUR Decomposition. Factorize a data matrix into three matrices s.t. 30 | F = | data - USV| is minimal. Unlike CUR, GREEDYCUR selects the rows 31 | and columns using GREEDY, i.e. it tries to find rows/columns that are close 32 | to SVD-based solutions. 33 | 34 | Parameters 35 | ---------- 36 | data : array_like [data_dimension x num_samples] 37 | the input data 38 | rrank: int, optional 39 | Number of rows to sample from data. 40 | 4 (default) 41 | crank: int, optional 42 | Number of columns to sample from data. 43 | 4 (default) 44 | show_progress: bool, optional 45 | Print some extra information 46 | False (default) 47 | 48 | Attributes 49 | ---------- 50 | U,S,V : submatrices s.t. data = USV 51 | 52 | Example 53 | ------- 54 | >>> import numpy as np 55 | >>> from greedycur import GREEDYCUR 56 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 57 | >>> cur_mdl = GREEDYCUR(data, show_progress=False, rrank=1, crank=2) 58 | >>> cur_mdl.factorize() 59 | """ 60 | ''' 61 | 62 | def sample(self, A, c): 63 | # set k to a value lower than the number of bases, usually 64 | # gives better results. 65 | k = np.round(c - c/5.0) 66 | greedy_mdl = GREEDY(A, k=k, num_bases=c) 67 | greedy_mdl.factorize(compute_h=False, compute_err=False, niter=1) 68 | return greedy_mdl.select 69 | 70 | 71 | def factorize(self): 72 | # sample row and column indices that maximize the volume of the submatrix 73 | self._rid = self.sample(self.data.transpose(), self._rrank) 74 | self._cid = self.sample(self.data, self._crank) 75 | self._rcnt = np.ones(len(self._rid)) 76 | self._ccnt = np.ones(len(self._cid)) 77 | 78 | self.computeUCR() 79 | 80 | 81 | if __name__ == "__main__": 82 | import doctest 83 | doctest.testmod() 84 | -------------------------------------------------------------------------------- /pymf/cmd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 
5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF Compact Matrix Decomposition [1] 8 | 9 | CMD(CUR): Class for Compact Matrix Decomposition 10 | 11 | [1] Sun, J., Xie, Y., Zhang, H. and Faloutsos, C. (2007), Less is More: Compact Matrix Decomposition for Large 12 | Sparse Graphs, in Proc. SIAM Int. Conf. on Data Mining. 13 | """ 14 | 15 | 16 | import numpy as np 17 | from .cur import CUR 18 | 19 | __all__ = ["CMD"] 20 | 21 | class CMD(CUR): 22 | """ 23 | CMD(data, rrank=0, crank=0) 24 | 25 | 26 | Compact Matrix Decomposition. Factorize a data matrix into three matrices s.t. 27 | F = | data - USV| is minimal. CMD randomly selects rows and columns from 28 | data for building U and V, respectively. 29 | 30 | Parameters 31 | ---------- 32 | data : array_like [data_dimension x num_samples] 33 | the input data 34 | rrank: int, optional 35 | Number of rows to sample from data. Double entries are eliminiated s.t. 36 | the resulting rank might be lower. 37 | 4 (default) 38 | crank: int, optional 39 | Number of columns to sample from data. Double entries are eliminiated s.t. 40 | the resulting rank might be lower. 41 | 4 (default) 42 | 43 | Attributes 44 | ---------- 45 | U,S,V : submatrices s.t. data = USV 46 | 47 | Example 48 | ------- 49 | >>> import numpy as np 50 | >>> from cmd import CMD 51 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 52 | >>> cmd_mdl = CMD(data, show_progress=False, rrank=1, crank=2) 53 | >>> cmd_mdl.factorize() 54 | """ 55 | 56 | def _cmdinit(self): 57 | nrids = np.unique(self._rid) 58 | ncids = np.unique(self._cid) 59 | 60 | self._rcnt = np.zeros(len(nrids)) 61 | self._ccnt = np.zeros(len(ncids)) 62 | 63 | for i,idx in enumerate(nrids): 64 | self._rcnt[i] = len(np.where(self._rid == idx)[0]) 65 | 66 | for i,idx in enumerate(ncids): 67 | self._ccnt[i] = len(np.where(self._cid == idx)[0]) 68 | 69 | self._rid = np.int32(list(nrids)) 70 | self._cid = np.int32(list(ncids)) 71 | 72 | def factorize(self): 73 | """ Factorize s.t. CUR = data 74 | 75 | Updated Values 76 | -------------- 77 | .C : updated values for C. 78 | .U : updated values for U. 79 | .R : updated values for R. 80 | """ 81 | 82 | [prow, pcol] = self.sample_probability() 83 | 84 | self._rid = self.sample(self._rrank, prow) 85 | self._cid = self.sample(self._crank, pcol) 86 | 87 | self._cmdinit() 88 | 89 | self.computeUCR() 90 | 91 | 92 | if __name__ == "__main__": 93 | import doctest 94 | doctest.testmod() 95 | -------------------------------------------------------------------------------- /pymf/laesa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF LAESA 8 | """ 9 | 10 | 11 | import scipy.sparse 12 | import numpy as np 13 | 14 | from .dist import * 15 | from .sivm import SIVM 16 | 17 | __all__ = ["LAESA"] 18 | 19 | class LAESA(SIVM): 20 | """ 21 | LAESA(data, num_bases=4) 22 | 23 | 24 | Simplex Volume Maximization. Factorize a data matrix into two matrices s.t. 25 | F = | data - W*H | is minimal. H is restricted to convexity. W is iteratively 26 | found by maximizing the volume of the resulting simplex (see [1]). 27 | 28 | Parameters 29 | ---------- 30 | data : array_like, shape (_data_dimension, _num_samples) 31 | the input data 32 | num_bases: int, optional 33 | Number of bases to compute (column rank of W and row rank of H). 
34 | 4 (default) 35 | 36 | Attributes 37 | ---------- 38 | W : "data_dimension x num_bases" matrix of basis vectors 39 | H : "num bases x num_samples" matrix of coefficients 40 | ferr : frobenius norm (after calling .factorize()) 41 | 42 | Example 43 | ------- 44 | Applying LAESA to some rather stupid data set: 45 | 46 | >>> import numpy as np 47 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 48 | >>> laesa_mdl = LAESA(data, num_bases=2) 49 | >>> laesa_mdl.factorize() 50 | 51 | The basis vectors are now stored in laesa_mdl.W, the coefficients in laesa_mdl.H. 52 | To compute coefficients for an existing set of basis vectors simply copy W 53 | to laesa_mdl.W, and set compute_w to False: 54 | 55 | >>> data = np.array([[1.5, 1.3], [1.2, 0.3]]) 56 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) 57 | >>> laesa_mdl = LAESA(data, num_bases=2) 58 | >>> laesa_mdl.W = W 59 | >>> laesa_mdl.factorize(niter=1, compute_w=False) 60 | 61 | The result is a set of coefficients laesa_mdl.H, s.t. data = W * laesa_mdl.H. 62 | """ 63 | def update_w(self): 64 | # initialize some of the recursively updated distance measures 65 | self.init_sivm() 66 | distiter = self._distance(self.select[-1]) 67 | 68 | for l in range(self._num_bases-1): 69 | d = self._distance(self.select[-1]) 70 | 71 | # replace distances in distiter 72 | distiter = np.where(d>> import numpy as np 46 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 47 | >>> kmeans_mdl = Kmeans(data, num_bases=2) 48 | >>> kmeans_mdl.factorize(niter=10) 49 | 50 | The basis vectors are now stored in kmeans_mdl.W, the coefficients in kmeans_mdl.H. 51 | To compute coefficients for an existing set of basis vectors simply copy W 52 | to kmeans_mdl.W, and set compute_w to False: 53 | 54 | >>> data = np.array([[1.5], [1.2]]) 55 | >>> W = [[1.0, 0.0], [0.0, 1.0]] 56 | >>> kmeans_mdl = Kmeans(data, num_bases=2) 57 | >>> kmeans_mdl.W = W 58 | >>> kmeans_mdl.factorize(niter=1, compute_w=False) 59 | 60 | The result is a set of coefficients kmeans_mdl.H, s.t. data = W * kmeans_mdl.H. 61 | """ 62 | def init_h(self): 63 | # W has to be present for H to be initialized 64 | self.H = np.zeros((self._num_bases, self._num_samples)) 65 | self.update_h() 66 | 67 | def init_w(self): 68 | # set W to some random data samples 69 | sel = random.sample(xrange(self._num_samples), self._num_bases) 70 | 71 | # sort indices, otherwise h5py won't work 72 | self.W = self.data[:, np.sort(sel)] 73 | 74 | 75 | def update_h(self): 76 | # and assign samples to the best matching centers 77 | self.assigned = dist.vq(self.W, self.data) 78 | self.H = np.zeros(self.H.shape) 79 | self.H[self.assigned, range(self._num_samples)] = 1.0 80 | 81 | 82 | def update_w(self): 83 | for i in range(self._num_bases): 84 | idx = np.where(self.assigned==i)[0] 85 | n = len(idx) 86 | if n > 1: 87 | self.W[:,i] = np.sum(self.data[:,idx], axis=1)/n 88 | -------------------------------------------------------------------------------- /pymf/cursl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | #$Id$ 7 | """ 8 | PyMF CUR Decomposition [1] 9 | 10 | CURSL(SVD) : Class for CUR Decomposition (uses statistical leverage based sampling) 11 | 12 | [1] Drineas, P., Kannan, R. and Mahoney, M. (2006), 'Fast Monte Carlo Algorithms III: Computing 13 | a Compressed Approixmate Matrix Decomposition', SIAM J. 
Computing 36(1), 184-206.
14 | """
15 | 
16 | 
17 | import numpy as np
18 | import scipy.sparse
19 | 
20 | from .svd import pinv, SVD
21 | from .cmd import CMD
22 | 
23 | __all__ = ["CURSL"]
24 | 
25 | class CURSL(CMD):
26 |     """
27 |     CURSL(data, k=-1, rrank=0, crank=0)
28 | 
29 |     CUR/CMD Decomposition. Factorize a data matrix into three matrices s.t.
30 |     F = | data - USV| is minimal. CURSL randomly selects rows and columns from
31 |     data for building U and V, respectively. The importance sampling is based
32 |     on a statistical leverage score from the top-k singular vectors (k is
33 |     currently set to 4/5*rrank and 4/5*crank).
34 | 
35 |     Parameters
36 |     ----------
37 |     data : array_like [data_dimension x num_samples]
38 |         the input data
39 |     rrank: int, optional
40 |         Number of rows to sample from data.
41 |         4 (default)
42 |     crank: int, optional
43 |         Number of columns to sample from data.
44 |         4 (default)
45 |     show_progress: bool, optional
46 |         Print some extra information
47 |         False (default)
48 | 
49 |     Attributes
50 |     ----------
51 |     U,S,V : submatrices s.t. data = USV (or _C _U _R)
52 | 
53 |     Example
54 |     -------
55 |     >>> import numpy as np
56 |     >>> from cursl import CURSL
57 |     >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
58 |     >>> cur_mdl = CURSL(data, show_progress=False, rrank=1, crank=2)
59 |     >>> cur_mdl.factorize()
60 |     """
61 | 
62 |     def __init__(self, data, k=-1, rrank=0, crank=0):
63 |         SVD.__init__(self, data, k=k, rrank=rrank, crank=crank)
64 | 
65 |     def sample_probability(self):
66 |         def comp_prob(d, k):
67 |             # compute statistical leverage score
68 |             c = int(np.round(k - k/5.0))
69 | 
70 |             svd_mdl = SVD(d, k=c)
71 |             svd_mdl.factorize()
72 | 
73 |             if scipy.sparse.issparse(self.data):
74 |                 A = svd_mdl.V.multiply(svd_mdl.V)
75 |                 ## Rule 1
76 |                 pcol = np.array(A.sum(axis=0)/k)
77 |             else:
78 |                 A = svd_mdl.V[:k,:]**2.0
79 |                 ## Rule 1
80 |                 pcol = A.sum(axis=0)/k
81 | 
82 |             #c = k * np.log(k/ (self._eps**2.0))
83 |             #pcol = c * pcol.reshape((-1,1))
84 |             pcol /= np.sum(pcol)
85 |             return pcol
86 | 
87 |         pcol = comp_prob(self.data, self._rrank)
88 |         prow = comp_prob(self.data.transpose(), self._crank)
89 | 
90 | 
91 |         return (prow.reshape(-1,1), pcol.reshape(-1,1))
--------------------------------------------------------------------------------
/pymf/snmf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | #
 3 | # Copyright (C) Christian Thurau, 2010.
 4 | # Licensed under the GNU General Public License (GPL).
 5 | # http://www.gnu.org/licenses/gpl.txt
 6 | """
 7 | PyMF Semi Non-negative Matrix Factorization.
 8 | 
 9 | SNMF(NMF) : Class for semi non-negative matrix factorization
10 | 
11 | [1] Ding, C., Li, T. and Jordan, M. Convex and Semi-Nonnegative Matrix Factorizations.
12 | IEEE Trans. on Pattern Analysis and Machine Intelligence 32(1), 45-55.
13 | """
14 | 
15 | 
16 | 
17 | import numpy as np
18 | 
19 | from .nmf import NMF
20 | 
21 | __all__ = ["SNMF"]
22 | 
23 | class SNMF(NMF):
24 |     """
25 |     SNMF(data, num_bases=4)
26 | 
27 |     Semi Non-negative Matrix Factorization. Factorize a data matrix into two
28 |     matrices s.t. F = | data - W*H | is minimal.
29 | 
30 |     Parameters
31 |     ----------
32 |     data : array_like, shape (_data_dimension, _num_samples)
33 |         the input data
34 |     num_bases: int, optional
35 |         Number of bases to compute (column rank of W and row rank of H).
36 | 4 (default) 37 | 38 | Attributes 39 | ---------- 40 | W : "data_dimension x num_bases" matrix of basis vectors 41 | H : "num bases x num_samples" matrix of coefficients 42 | ferr : frobenius norm (after calling .factorize()) 43 | 44 | Example 45 | ------- 46 | Applying Semi-NMF to some rather stupid data set: 47 | 48 | >>> import numpy as np 49 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 50 | >>> snmf_mdl = SNMF(data, num_bases=2) 51 | >>> snmf_mdl.factorize(niter=10) 52 | 53 | The basis vectors are now stored in snmf_mdl.W, the coefficients in snmf_mdl.H. 54 | To compute coefficients for an existing set of basis vectors simply copy W 55 | to snmf_mdl.W, and set compute_w to False: 56 | 57 | >>> data = np.array([[1.5], [1.2]]) 58 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) 59 | >>> snmf_mdl = SNMF(data, num_bases=2) 60 | >>> snmf_mdl.W = W 61 | >>> snmf_mdl.factorize(niter=1, compute_w=False) 62 | 63 | The result is a set of coefficients snmf_mdl.H, s.t. data = W * snmf_mdl.H. 64 | """ 65 | 66 | 67 | def update_w(self): 68 | W1 = np.dot(self.data[:,:], self.H.T) 69 | W2 = np.dot(self.H, self.H.T) 70 | self.W = np.dot(W1, np.linalg.inv(W2)) 71 | 72 | def update_h(self): 73 | def separate_positive(m): 74 | return (np.abs(m) + m)/2.0 75 | 76 | def separate_negative(m): 77 | return (np.abs(m) - m)/2.0 78 | 79 | XW = np.dot(self.data[:,:].T, self.W) 80 | 81 | WW = np.dot(self.W.T, self.W) 82 | WW_pos = separate_positive(WW) 83 | WW_neg = separate_negative(WW) 84 | 85 | XW_pos = separate_positive(XW) 86 | H1 = (XW_pos + np.dot(self.H.T, WW_neg)).T 87 | 88 | XW_neg = separate_negative(XW) 89 | H2 = (XW_neg + np.dot(self.H.T,WW_pos)).T + 10**-9 90 | 91 | self.H *= np.sqrt(H1/H2) 92 | 93 | if __name__ == "__main__": 94 | import doctest 95 | doctest.testmod() 96 | -------------------------------------------------------------------------------- /pymf/cmeans.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF K-means clustering (unary-convex matrix factorization). 8 | Copyright (C) Christian Thurau, 2010. GNU General Public License (GPL). 9 | """ 10 | 11 | 12 | 13 | import numpy as np 14 | 15 | from . import dist 16 | from .nmf import NMF 17 | 18 | __all__ = ["Cmeans"] 19 | 20 | class Cmeans(NMF): 21 | """ 22 | cmeans(data, num_bases=4) 23 | 24 | 25 | Fuzzy c-means soft clustering. Factorize a data matrix into two matrices s.t. 26 | F = | data - W*H | is minimal. H is restricted to convexity (columns 27 | sum to 1) W is simply the weighted mean over the corresponding samples in 28 | data. Note that the objective function is based on distances (?), hence the 29 | Frobenius norm is probably not a good quality measure. 30 | 31 | Parameters 32 | ---------- 33 | data : array_like, shape (_data_dimension, _num_samples) 34 | the input data 35 | num_bases: int, optional 36 | Number of bases to compute (column rank of W and row rank of H). 
37 | 4 (default) 38 | 39 | 40 | Attributes 41 | ---------- 42 | W : "data_dimension x num_bases" matrix of basis vectors 43 | H : "num bases x num_samples" matrix of coefficients 44 | ferr : frobenius norm (after calling .factorize()) 45 | 46 | Example 47 | ------- 48 | Applying C-means to some rather stupid data set: 49 | 50 | >>> import numpy as np 51 | >>> from cmeans import Cmeans 52 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 53 | >>> cmeans_mdl = Cmeans(data, num_bases=2, niter=10) 54 | >>> cmeans_mdl.initialization() 55 | >>> cmeans_mdl.factorize() 56 | 57 | The basis vectors are now stored in cmeans_mdl.W, the coefficients in cmeans_mdl.H. 58 | To compute coefficients for an existing set of basis vectors simply copy W 59 | to cmeans_mdl.W, and set compute_w to False: 60 | 61 | >>> data = np.array([[1.5], [1.2]]) 62 | >>> W = [[1.0, 0.0], [0.0, 1.0]] 63 | >>> cmeans_mdl = Cmeans(data, num_bases=2) 64 | >>> cmeans_mdl.initialization() 65 | >>> cmeans_mdl.W = W 66 | >>> cmeans_mdl.factorize(compute_w=False, niter=50) 67 | 68 | The result is a set of coefficients kmeans_mdl.H, s.t. data = W * kmeans_mdl.H. 69 | """ 70 | 71 | def update_h(self): 72 | # assign samples to best matching centres ... 73 | m = 1.75 74 | tmp_dist = dist.pdist(self.W, self.data, metric='l2') + self._EPS 75 | self.H[:,:] = 0.0 76 | 77 | for i in range(self._num_bases): 78 | for k in range(self._num_bases): 79 | self.H[i,:] += (tmp_dist[i,:]/tmp_dist[k,:])**(2.0/(m-1)) 80 | 81 | self.H = np.where(self.H>0, 1.0/self.H, 0) 82 | 83 | def update_w(self): 84 | for i in range(self._num_bases): 85 | tmp = (self.H[i:i+1,:] * self.data).sum(axis=1) 86 | self.W[:,i] = tmp/(self.H[i,:].sum() + self._EPS) 87 | -------------------------------------------------------------------------------- /pymf/sivm_cur.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF Simplex Volume Maximization for CUR [1] 8 | 9 | SIVMCUR: class for SiVM-CUR 10 | 11 | [1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume 12 | Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int. 13 | Conf. on Information and Knowledge Management. ACM. 2010. 14 | """ 15 | 16 | 17 | import numpy as np 18 | import scipy 19 | from .sivm import SIVM 20 | from .cur import CUR 21 | 22 | __all__ = ["SIVM_CUR"] 23 | 24 | class SIVM_CUR(CUR): 25 | ''' 26 | SIVM_CUR(data, num_bases=4, dist_measure='l2') 27 | 28 | Simplex Volume based CUR Decomposition. Factorize a data matrix into three 29 | matrices s.t. F = | data - USV| is minimal. Unlike CUR, SIVMCUR selects the 30 | rows and columns using SIVM, i.e. it tries to maximize the volume of the 31 | enclosed simplex. 32 | 33 | Parameters 34 | ---------- 35 | data : array_like [data_dimension x num_samples] 36 | the input data 37 | rrank: int, optional 38 | Number of rows to sample from data. 39 | 4 (default)crank 40 | crank: int, optional 41 | Number of columns to sample from data. 42 | 4 (default) 43 | dist_measure: string, optional 44 | The distance measure for finding the next best candidate that 45 | maximizes the simplex volume ['l2','l1','cosine','sparse_graph_l2'] 46 | 'l2' (default) 47 | 48 | Attributes 49 | ---------- 50 | U,S,V : submatrices s.t. 
data = USV
51 | 
52 |     Example
53 |     -------
54 |     >>> import numpy as np
55 |     >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
56 |     >>> sivmcur_mdl = SIVM_CUR(data, show_progress=False, rrank=1, crank=2)
57 |     >>> sivmcur_mdl.factorize()
58 |     '''
59 | 
60 |     def __init__(self, data, k=-1, rrank=0, crank=0, dist_measure='l2', init='origin'):
61 |         CUR.__init__(self, data, k=k, rrank=rrank, crank=crank)
62 |         self._dist_measure = dist_measure
63 |         self.init = init
64 | 
65 |     def sample(self, A, c):
66 |         # for optimizing the volume of the submatrix, set init to 'origin' (otherwise the volume of
67 |         # the ordinary simplex would be optimized)
68 |         sivm_mdl = SIVM(A, num_bases=c, dist_measure=self._dist_measure,
69 |                         init=self.init)
70 |         sivm_mdl.factorize(show_progress=False, compute_w=True, niter=1,
71 |                            compute_h=False, compute_err=False)
72 | 
73 |         return sivm_mdl.select
74 | 
75 | 
76 |     def factorize(self):
77 |         """ Factorize s.t. CUR = data
78 | 
79 |         Updated Values
80 |         --------------
81 |         .C : updated values for C.
82 |         .U : updated values for U.
83 |         .R : updated values for R.
84 |         """
85 |         # sample row and column indices that maximize the volume of the submatrix
86 |         self._rid = self.sample(self.data.transpose(), self._rrank)
87 |         self._cid = self.sample(self.data, self._crank)
88 | 
89 |         self._rcnt = np.ones(len(self._rid))
90 |         self._ccnt = np.ones(len(self._cid))
91 | 
92 |         self.computeUCR()
93 | 
94 | 
95 | if __name__ == "__main__":
96 |     import doctest
97 |     doctest.testmod()
--------------------------------------------------------------------------------
/pymf/nmfals.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | #
 3 | # Copyright (C) Christian Thurau, 2010.
 4 | # Licensed under the GNU General Public License (GPL).
 5 | # http://www.gnu.org/licenses/gpl.txt
 6 | """
 7 | PyMF Non-negative Matrix Factorization.
 8 | 
 9 | NMFALS: Class for Non-negative Matrix Factorization using alternating least
10 | squares optimization (requires cvxopt)
11 | 
12 | [1] Lee, D. D. and Seung, H. S. (1999), Learning the Parts of Objects by Non-negative
13 | Matrix Factorization, Nature 401(6755), 788-799.
14 | """
15 | 
16 | 
17 | 
18 | import numpy as np
19 | from cvxopt import solvers, base
20 | from .nmf import NMF
21 | 
22 | __all__ = ["NMFALS"]
23 | 
24 | class NMFALS(NMF):
25 |     """
26 |     NMFALS(data, num_bases=4)
27 | 
28 | 
29 |     Non-negative Matrix Factorization. Factorize a data matrix into two matrices
30 |     s.t. F = | data - W*H | is minimal. H and W are restricted to non-negative
31 |     values. Uses an alternating least squares procedure (quite slow for larger
32 |     data sets)
33 | 
34 |     Parameters
35 |     ----------
36 |     data : array_like, shape (_data_dimension, _num_samples)
37 |         the input data
38 |     num_bases: int, optional
39 |         Number of bases to compute (column rank of W and row rank of H).
40 |         4 (default)
41 | 
42 |     Attributes
43 |     ----------
44 |     W : "data_dimension x num_bases" matrix of basis vectors
45 |     H : "num bases x num_samples" matrix of coefficients
46 |     ferr : frobenius norm (after calling .factorize())
47 | 
48 |     Example
49 |     -------
50 |     Applying NMF to some rather stupid data set:
51 | 
52 |     >>> import numpy as np
53 |     >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
54 |     >>> nmf_mdl = NMFALS(data, num_bases=2)
55 |     >>> nmf_mdl.factorize(niter=10)
56 | 
57 |     The basis vectors are now stored in nmf_mdl.W, the coefficients in nmf_mdl.H.
58 | To compute coefficients for an existing set of basis vectors simply copy W 59 | to nmf_mdl.W, and set compute_w to False: 60 | 61 | >>> data = np.array([[1.5], [1.2]]) 62 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) 63 | >>> nmf_mdl = NMFALS(data, num_bases=2) 64 | >>> nmf_mdl.W = W 65 | >>> nmf_mdl.factorize(niter=1, compute_w=False) 66 | 67 | The result is a set of coefficients nmf_mdl.H, s.t. data = W * nmf_mdl.H. 68 | """ 69 | 70 | def update_h(self): 71 | def updatesingleH(i): 72 | # optimize alpha using qp solver from cvxopt 73 | FA = base.matrix(np.float64(np.dot(-self.W.T, self.data[:,i]))) 74 | al = solvers.qp(HA, FA, INQa, INQb) 75 | self.H[:,i] = np.array(al['x']).reshape((1,-1)) 76 | 77 | # float64 required for cvxopt 78 | HA = base.matrix(np.float64(np.dot(self.W.T, self.W))) 79 | INQa = base.matrix(-np.eye(self._num_bases)) 80 | INQb = base.matrix(0.0, (self._num_bases,1)) 81 | 82 | map(updatesingleH, xrange(self._num_samples)) 83 | 84 | 85 | def update_w(self): 86 | def updatesingleW(i): 87 | # optimize alpha using qp solver from cvxopt 88 | FA = base.matrix(np.float64(np.dot(-self.H, self.data[i,:].T))) 89 | al = solvers.qp(HA, FA, INQa, INQb) 90 | self.W[i,:] = np.array(al['x']).reshape((1,-1)) 91 | 92 | # float64 required for cvxopt 93 | HA = base.matrix(np.float64(np.dot(self.H, self.H.T))) 94 | INQa = base.matrix(-np.eye(self._num_bases)) 95 | INQb = base.matrix(0.0, (self._num_bases,1)) 96 | 97 | map(updatesingleW, xrange(self._data_dimension)) 98 | -------------------------------------------------------------------------------- /pymf/rnmf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF Non-negative Matrix Factorization. 8 | 9 | NMF: Class for Non-negative Matrix Factorization 10 | 11 | [1] Lee, D. D. and Seung, H. S. (1999), Learning the Parts of Objects by Non-negative 12 | Matrix Factorization, Nature 401(6755), 788-799. 13 | """ 14 | 15 | 16 | import numpy as np 17 | import logging 18 | import logging.config 19 | import scipy.sparse 20 | 21 | from .nmf import NMF 22 | 23 | __all__ = ["RNMF"] 24 | 25 | class RNMF(NMF): 26 | """ 27 | RNMF(data, num_bases=4) 28 | 29 | 30 | Non-negative Matrix Factorization. Factorize a data matrix into two matrices 31 | s.t. F = | data - W*H | = | is minimal. H, and W are restricted to non-negative 32 | data. Uses the classicial multiplicative update rule. 33 | 34 | Parameters 35 | ---------- 36 | data : array_like, shape (_data_dimension, _num_samples) 37 | the input data 38 | num_bases: int, optional 39 | Number of bases to compute (column rank of W and row rank of H). 40 | 4 (default) 41 | 42 | Attributes 43 | ---------- 44 | W : "data_dimension x num_bases" matrix of basis vectors 45 | H : "num bases x num_samples" matrix of coefficients 46 | ferr : frobenius norm (after calling .factorize()) 47 | 48 | Example 49 | ------- 50 | Applying NMF to some rather stupid data set: 51 | 52 | >>> import numpy as np 53 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 54 | >>> nmf_mdl = NMF(data, num_bases=2, niter=10) 55 | >>> nmf_mdl.factorize() 56 | 57 | The basis vectors are now stored in nmf_mdl.W, the coefficients in nmf_mdl.H. 
58 | To compute coefficients for an existing set of basis vectors simply copy W 59 | to nmf_mdl.W, and set compute_w to False: 60 | 61 | >>> data = np.array([[1.5], [1.2]]) 62 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) 63 | >>> nmf_mdl = NMF(data, num_bases=2) 64 | >>> nmf_mdl.W = W 65 | >>> nmf_mdl.factorize(niter=20, compute_w=False) 66 | 67 | The result is a set of coefficients nmf_mdl.H, s.t. data = W * nmf_mdl.H. 68 | """ 69 | 70 | def __init__(self, data, num_bases=4, lamb=2.0): 71 | # call inherited method 72 | NMF.__init__(self, data, num_bases=num_bases) 73 | self._lamb = lamb 74 | 75 | def soft_thresholding(self, X, lamb): 76 | X = np.where(np.abs(X) <= lamb, 0.0, X) 77 | X = np.where(X > lamb, X - lamb, X) 78 | X = np.where(X < -1.0*lamb, X + lamb, X) 79 | return X 80 | 81 | def init_w(self): 82 | self.W = np.random.random((self._data_dimension, self._num_bases)) 83 | 84 | def init_h(self): 85 | self.H = np.random.random((self._num_bases, self._num_samples)) 86 | self.H[:,:] = 1.0 87 | # normalized bases 88 | Wnorm = np.sqrt(np.sum(self.W**2.0, axis=0)) 89 | self.W /= Wnorm 90 | 91 | for i in range(self.H.shape[0]): 92 | self.H[i,:] *= Wnorm[i] 93 | 94 | self.update_s() 95 | 96 | def update_s(self): 97 | self.S = self.data - np.dot(self.W, self.H) 98 | self.S = self.soft_thresholding(self.S, self._lamb) 99 | 100 | def update_h(self): 101 | # pre init H1, and H2 (necessary for storing matrices on disk) 102 | H1 = np.dot(self.W.T, self.S - self.data) 103 | H1 = np.abs(H1) - H1 104 | H1 /= (2.0* np.dot(self.W.T, np.dot(self.W, self.H))) 105 | self.H *= H1 106 | 107 | # adapt S 108 | self.update_s() 109 | 110 | def update_w(self): 111 | # pre init W1, and W2 (necessary for storing matrices on disk) 112 | W1 = np.dot(self.S - self.data, self.H.T) 113 | #W1 = np.dot(self.data - self.S, self.H.T) 114 | W1 = np.abs(W1) - W1 115 | W1 /= (2.0 * (np.dot(self.W, np.dot(self.H, self.H.T)))) 116 | self.W *= W1 117 | 118 | if __name__ == "__main__": 119 | import doctest 120 | doctest.testmod() 121 | -------------------------------------------------------------------------------- /pymf/nndsvd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | #$Id$ 7 | """ 8 | PyMF Non-negative Double Singular Value Decompositions. 9 | 10 | NNDSVD: Class for Non-negative Double Singular Value Decompositions [1] 11 | 12 | [1] C. Boutsidis and E. Gallopoulos (2008), SVD based initialization: A head 13 | start for nonnegative matrix factorization, Pattern Recognition, 41, 1350-1362 14 | """ 15 | 16 | 17 | import numpy as np 18 | 19 | from .nmf import NMF 20 | from .svd import SVD 21 | 22 | __all__ = ["NNDSVD"] 23 | 24 | class NNDSVD(NMF): 25 | """ 26 | NNDSVD(data, num_bases=4) 27 | 28 | 29 | Non-negative Double Singular Value Decompositions. Factorize a data 30 | matrix into two matrices s.t. F = | data - W*H | = | is minimal. H, and 31 | W are restricted to non-negative data. NNDSVD is primarily used for 32 | initializing NMF. 33 | 34 | Parameters 35 | ---------- 36 | data : array_like, shape (_data_dimension, _num_samples) 37 | the input data 38 | num_bases: int, optional 39 | Number of bases to compute (column rank of W and row rank of H). 
40 | 4 (default) 41 | 42 | Attributes 43 | ---------- 44 | W : "data_dimension x num_bases" matrix of basis vectors 45 | H : "num bases x num_samples" matrix of coefficients 46 | ferr : frobenius norm (after calling .factorize()) 47 | 48 | Example 49 | ------- 50 | Applying NNDSVD to some rather stupid data set: 51 | 52 | >>> import numpy as np 53 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 54 | >>> nndsvd_mdl = NNDSVD(data, num_bases=2) 55 | >>> nndsvd_mdl.factorize() 56 | 57 | The basis vectors are now stored in nndsvd_mdl.W, the coefficients in 58 | nndsvd_mdl.H. To initialize NMF with nndsvd_mdl.W, nndsvd_mdl.H 59 | simply copy W to nmf_mdl.W and H to nmf_mdl.H: 60 | 61 | >>> data = np.array([[1.5], [1.2]]) 62 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) 63 | >>> nmf_mdl = NMF(data, num_bases=2) 64 | >>> nmf_mdl.W = nndsvd_mdl.W 65 | >>> nmf_mdl.H = nndsvd_mdl.H 66 | >>> nmf_mdl.factorize(niter=20) 67 | 68 | The result is a set of (more optimal) coefficients nmf_mdl.H, nmf_mdl.W. 69 | """ 70 | def init_w(self): 71 | self.W = np.zeros((self._data_dimension, self._num_bases)) 72 | 73 | def init_h(self): 74 | self.H = np.zeros((self._num_bases, self._num_samples)) 75 | 76 | def update_h(self): 77 | pass 78 | 79 | def update_w(self): 80 | svd_mdl = SVD(self.data) 81 | svd_mdl.factorize() 82 | 83 | U, S, V = svd_mdl.U, svd_mdl.S, svd_mdl.V 84 | 85 | # The first left singular vector is nonnegative 86 | # (abs is only used as values could be all negative) 87 | self.W[:,0] = np.sqrt(S[0,0]) * np.abs(U[:,0]) 88 | 89 | #The first right singular vector is nonnegative 90 | self.H[0,:] = np.sqrt(S[0,0]) * np.abs(V[0,:].T) 91 | 92 | for i in range(1,self._num_bases): 93 | # Form the rank one factor 94 | Tmp = np.dot(U[:,i:i+1]*S[i,i], V[i:i+1,:]) 95 | 96 | # zero out the negative elements 97 | Tmp = np.where(Tmp < 0, 0.0, Tmp) 98 | 99 | # Apply 2nd SVD 100 | svd_mdl_2 = SVD(Tmp) 101 | svd_mdl_2.factorize() 102 | u, s, v = svd_mdl_2.U, svd_mdl_2.S, svd_mdl_2.V 103 | 104 | # The first left singular vector is nonnegative 105 | self.W[:,i] = np.sqrt(s[0,0]) * np.abs(u[:,0]) 106 | 107 | #The first right singular vector is nonnegative 108 | self.H[i,:] = np.sqrt(s[0,0]) * np.abs(v[0,:].T) 109 | 110 | def factorize(self, niter=1, show_progress=False, 111 | compute_w=True, compute_h=True, compute_err=True): 112 | 113 | # enforce certain default values, otherwise it won't work 114 | NMF.factorize(self, niter=1, show_progress=show_progress, 115 | compute_w=True, compute_h=True, compute_err=compute_err) 116 | 117 | if __name__ == "__main__": 118 | import doctest 119 | doctest.testmod() 120 | -------------------------------------------------------------------------------- /tests/test_pymf.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | ## pymf - Python Matrix Factorization library 3 | ## Copyright (C) 2010 Christian Thurau 4 | ## 5 | ## This library is free software; you can redistribute it and/or 6 | ## modify it under the terms of the GNU Library General Public 7 | ## License as published by the Free Software Foundation; either 8 | ## version 2 of the License, or (at your option) any later version. 9 | ## 10 | ## This library is distributed in the hope that it will be useful, 11 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | ## Library General Public License for more details. 
14 | ##
15 | ## You should have received a copy of the GNU Library General Public
16 | ## License along with this library; if not, write to the Free Software
17 | ## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 | ##
19 | ## Christian Thurau
20 | ## cthurau@gmail.com
21 | """
22 | 
23 | """
24 | 
25 | import pytest
26 | import pymf
27 | import time
28 | import numpy as np
29 | import scipy.sparse
30 | 
31 | 
32 | np.random.seed(400401)
33 | A = np.random.random((3, 50)) + 2.0
34 | B = scipy.sparse.csc_matrix(A)
35 | 
36 | 
37 | @pytest.mark.parametrize("A", [A, B])
38 | def test_pinv(A):
39 |     pymf.pinv(A)
40 | 
41 | 
42 | @pytest.mark.parametrize("A,func", [
43 |     (A, pymf.SVD),       # 'Singular Value Decomposition (SVD)', 'c<'
44 |     (A.T, pymf.SVD),     # 'Singular Value Decomposition (SVD)', 'c<'
45 |     (B, pymf.SVD),       # 'svd sparse', 'c<'
46 |     (A, pymf.CUR),       # 'CUR Matrix Decomposition', 'b<'
47 |     (B, pymf.CUR),       # 'CUR Matrix Decomposition (sparse data)', 'b<'
48 |     (A, pymf.CMD),       # 'Compact Matrix Decomposition (CMD)', 'm<'
49 |     (B, pymf.CMD),       # 'Compact Matrix Decomposition (CMD - sparse data)', 'm<'
50 |     (A, pymf.SIVM_CUR),  # 'Simplex Volume Maximization f. CUR (SIVMCUR)', 'm<'
51 |     (A, pymf.SIVM_CUR),  # 'Simplex Volume Maximization f. CUR (SIVMCUR)', 'm<'
52 | ])
53 | def test_svd(A, func):
54 |     stime = time.time()
55 |     m = func(A, rrank=2, crank=2)
56 |     m.factorize()
57 |     fro_norm = m.frobenius_norm()/(A.shape[0] + A.shape[1])
58 | 
59 |     assert fro_norm < 0.1
60 |     print 'Fro.: %f, elapsed %f' % (fro_norm, time.time() - stime)
61 | 
62 | 
63 | @pytest.mark.parametrize("A,func,niter,num_bases", [
64 |     (A, pymf.SIVM_SEARCH, 20, 2),   # 'SIVM_SEARCH', 'c<', num_bases=2
65 |     (A, pymf.SIVM_GSAT, 20, 4),     # 'SIVM_GSAT ', 'c<'
66 |     (A, pymf.SIVM_SGREEDY, 20, 4),  # 'SIVM Greedy ', 'c<'
67 |     (A, pymf.GMAP, 20, 4),          # 'GMAP ', 'c<'
68 |     (A, pymf.PCA, 20, 4),           # 'Principal Component Analysis (PCA)', 'c<'
69 |     (A, pymf.NMF, 20, 4),           # 'Non-negative Matrix Factorization (NMF)', 'rs'
70 |     (A, pymf.NMFALS, 10, 4),        # 'NMF u. alternating least squares (NMFALS)', 'rs', niter=10
71 |     (A, pymf.NMFNNLS, 10, 4),       # 'NMF u. non-neg.
least squares (NMFNNLS)', 'rs', niter=10 72 | (A, pymf.LAESA, 20, 4), # 'Linear Approximating Eliminating Search Algorithm (LAESA)', 'rs' 73 | (A, pymf.SIVM, 20, 4), # 'Simplex Volume Maximization (SIVM)', 'bs' 74 | (A, pymf.Kmeans, 20, 4), # 'K-means clustering (Kmeans)', 'b*' 75 | (A, pymf.Cmeans, 20, 4), # 'C-means clustering (Cmeans)', 'b*' 76 | (A, pymf.AA, 20, 4), # 'Archetypal Analysis (AA)', 'bs' 77 | (A, pymf.SNMF, 20, 4), # 'Semi Non-negative Matrix Factorization (SNMF)', 'bo' 78 | (A, pymf.CNMF, 20, 4), # 'Convex non-negative Matrix Factorization (CNMF)', 'c<' 79 | (A, pymf.CHNMF, 20, 4), # 'Convex-hull non-negative Matrix Factorization (CHNMF)', 'm*' 80 | (np.round(A-2.0), pymf.BNMF, 20, 4), # 'Binary Matrix Factorization (BNMF)', 'b>' 81 | ]) 82 | def test(A, func, niter, num_bases): 83 | stime = time.time() 84 | m = func(A, num_bases=num_bases) 85 | m.factorize(show_progress=True, niter=niter) 86 | fro_norm = m.ferr[-1]/(A.shape[0] + A.shape[1]) 87 | 88 | assert fro_norm < 0.1 89 | print 'Fro.: %d, elapsed %d' % (fro_norm, time.time() - stime) 90 | 91 | stime = time.time() 92 | m.factorize(show_progress=False, compute_h=False, niter=niter) 93 | m.factorize(show_progress=False, compute_w=False, niter=niter) 94 | m.factorize(show_progress=False, compute_err=False, niter=niter) 95 | m.factorize(show_progress=True, niter=20) 96 | 97 | print ' additional tests - elapsed:', time.time() - stime 98 | -------------------------------------------------------------------------------- /pymf/dist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF several distance functions 8 | 9 | kl_divergence(): KL Divergence 10 | l1_distance(): L1 distance 11 | l2_distance(): L2 distance 12 | cosine_distance(): Cosine distance 13 | pdist(): Pairwise distance computation 14 | vq(): Vector quantization 15 | 16 | """ 17 | 18 | 19 | import numpy as np 20 | import scipy.sparse 21 | 22 | __all__ = ["abs_cosine_distance", "kl_divergence", "l1_distance", "l2_distance", 23 | "weighted_abs_cosine_distance","cosine_distance","vq", "pdist"] 24 | 25 | def kl_divergence(d, vec): 26 | b = vec*(1/d) 27 | b = np.where(b>0, np.log(b),0) 28 | b = vec * b 29 | b = np.sum(b - vec + d, axis=0).reshape((-1)) 30 | return b 31 | 32 | def l1_distance(d, vec): 33 | ret_val = np.sum(np.abs(d - vec), axis=0) 34 | ret_val = ret_val.reshape((-1)) 35 | return ret_val 36 | 37 | def sparse_l2_distance(d, vec): 38 | # compute the norm of d 39 | nd = (d.multiply(d)).sum(axis=0) 40 | nv = (vec.multiply(vec)).sum(axis=0) 41 | ret_val = nd + nv - 2.0*(d.T * vec).T 42 | return np.sqrt(ret_val) 43 | 44 | def approx_l2_distance(d, vec): 45 | # Use random projections to approximate the conventional l2 distance 46 | k = np.round(np.log(d.shape[0])) 47 | #k = d.shape[0] 48 | R = np.random.randn(k, d.shape[0]) 49 | R = R / np.sqrt((R**2).sum(axis=0)) 50 | A = np.dot(R,d) 51 | B = np.dot(R, vec) 52 | ret_val = np.sum( (A - B)**2, axis=0) 53 | ret_val = np.sqrt(R.shape[1]/R.shape[0]) * np.sqrt(ret_val) 54 | ret_val = ret_val.reshape((-1)) 55 | return ret_val 56 | 57 | def l2_distance(d, vec): 58 | if scipy.sparse.issparse(d): 59 | ret_val = sparse_l2_distance(d, vec) 60 | else: 61 | ret_val = np.sqrt(((d[:,:] - vec)**2).sum(axis=0)) 62 | 63 | return ret_val.reshape((-1)) 64 | 65 | def l2_distance_new(d,vec): 66 | # compute the 
norm of d 67 | nd = (d**2).sum(axis=0) 68 | nv = (vec**2).sum(axis=0) 69 | ret_val = nd + nv - 2.0*np.dot(d.T,vec.reshape((-1,1))).T 70 | 71 | return np.sqrt(ret_val) 72 | 73 | def cosine_distance(d, vec): 74 | tmp = np.dot(np.transpose(d), vec) 75 | a = np.sqrt(np.sum(d**2, axis=0)) 76 | b = np.sqrt(np.sum(vec**2)) 77 | k = (a*b).reshape(-1) + (10**-9) 78 | 79 | # compute distance 80 | ret_val = 1.0 - tmp/k 81 | 82 | return ret_val.reshape((-1)) 83 | 84 | def abs_cosine_distance(d, vec, weighted=False): 85 | if scipy.sparse.issparse(d): 86 | tmp = np.array((d.T * vec).todense(), dtype=np.float32).reshape(-1) 87 | a = np.sqrt(np.array(d.multiply(d).sum(axis=0), dtype=np.float32).reshape(-1)) 88 | b = np.sqrt(np.array(vec.multiply(vec).sum(axis=0), dtype=np.float32).reshape(-1)) 89 | else: 90 | tmp = np.dot(np.transpose(d), vec).reshape(-1) 91 | a = np.sqrt(np.sum(d**2, axis=0)).reshape(-1) 92 | b = np.sqrt(np.sum(vec**2)).reshape(-1) 93 | 94 | k = (a*b).reshape(-1) + 10**-9 95 | 96 | # compute distance 97 | ret_val = 1.0 - np.abs(tmp/k) 98 | 99 | if weighted: 100 | ret_val = ret_val * a 101 | return ret_val.reshape((-1)) 102 | 103 | def weighted_abs_cosine_distance(d, vec): 104 | ret_val = abs_cosine_distance(d, vec, weighted=True) 105 | return ret_val 106 | 107 | def pdist(A, B, metric='l2' ): 108 | # compute pairwise distance between a data matrix A (d x n) and B (d x m). 109 | # Returns a distance matrix d (n x m). 110 | d = np.zeros((A.shape[1], B.shape[1])) 111 | if A.shape[1] <= B.shape[1]: 112 | for aidx in xrange(A.shape[1]): 113 | if metric == 'l2': 114 | d[aidx:aidx+1,:] = l2_distance(B[:,:], A[:,aidx:aidx+1]).reshape((1,-1)) 115 | if metric == 'l1': 116 | d[aidx:aidx+1,:] = l1_distance(B[:,:], A[:,aidx:aidx+1]).reshape((1,-1)) 117 | else: 118 | for bidx in xrange(B.shape[1]): 119 | if metric == 'l2': 120 | d[:, bidx:bidx+1] = l2_distance(A[:,:], B[:,bidx:bidx+1]).reshape((-1,1)) 121 | if metric == 'l1': 122 | d[:, bidx:bidx+1] = l1_distance(A[:,:], B[:,bidx:bidx+1]).reshape((-1,1)) 123 | 124 | return d 125 | 126 | def vq(A, B, metric='l2'): 127 | # assigns data samples in B to cluster centers A and 128 | # returns an index list [assume n column vectors, d x n] 129 | assigned = np.argmin(pdist(A,B, metric=metric), axis=0) 130 | return assigned 131 | -------------------------------------------------------------------------------- /pymf/pca.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF Principal Component Analysis. 8 | 9 | PCA: Class for Principal Component Analysis 10 | """ 11 | 12 | 13 | 14 | import numpy as np 15 | 16 | from .nmf import NMF 17 | from .svd import SVD 18 | 19 | 20 | __all__ = ["PCA"] 21 | 22 | class PCA(NMF): 23 | """ 24 | PCA(data, num_bases=4, center_mean=True) 25 | 26 | 27 | Archetypal Analysis. Factorize a data matrix into two matrices s.t. 28 | F = | data - W*H | is minimal. W is set to the eigenvectors of the 29 | data covariance. 30 | 31 | Parameters 32 | ---------- 33 | data : array_like, shape (_data_dimension, _num_samples) 34 | the input data 35 | num_bases: int, optional 36 | Number of bases to compute (column rank of W and row rank of H). 37 | 4 (default) 38 | center_mean: bool, True 39 | Make sure that the data is centred around the mean. 
40 | 41 | Attributes 42 | ---------- 43 | W : "data_dimension x num_bases" matrix of basis vectors 44 | H : "num bases x num_samples" matrix of coefficients 45 | ferr : frobenius norm (after calling .factorize()) 46 | 47 | Example 48 | ------- 49 | Applying PCA to some rather stupid data set: 50 | 51 | >>> import numpy as np 52 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 53 | >>> pca_mdl = PCA(data, num_bases=2) 54 | >>> pca_mdl.factorize() 55 | 56 | The basis vectors are now stored in pca_mdl.W, the coefficients in pca_mdl.H. 57 | To compute coefficients for an existing set of basis vectors simply copy W 58 | to pca_mdl.W, and set compute_w to False: 59 | 60 | >>> data = np.array([[1.5], [1.2]]) 61 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) 62 | >>> pca_mdl = PCA(data, num_bases=2) 63 | >>> pca_mdl.W = W 64 | >>> pca_mdl.factorize(compute_w=False) 65 | 66 | The result is a set of coefficients pca_mdl.H, s.t. data = W * pca_mdl.H. 67 | """ 68 | 69 | def __init__(self, data, num_bases=0, center_mean=True): 70 | 71 | NMF.__init__(self, data, num_bases=num_bases) 72 | 73 | # center the data around the mean first 74 | self._center_mean = center_mean 75 | 76 | if self._center_mean: 77 | # copy the data before centering it 78 | self._data_orig = data 79 | self._meanv = self._data_orig[:,:].mean(axis=1).reshape(data.shape[0],-1) 80 | self.data = self._data_orig - self._meanv 81 | else: 82 | self.data = data 83 | 84 | def init_h(self): 85 | pass 86 | 87 | def init_w(self): 88 | pass 89 | 90 | def update_h(self): 91 | self.H = np.dot(self.W.T, self.data[:,:]) 92 | 93 | def update_w(self): 94 | # compute eigenvectors and eigenvalues using SVD 95 | svd_mdl = SVD(self.data) 96 | svd_mdl.factorize() 97 | 98 | # argsort sorts in ascending order -> do reverese indexing 99 | # for accesing values in descending order 100 | S = np.diag(svd_mdl.S) 101 | order = np.argsort(S)[::-1] 102 | 103 | # select only a few eigenvectors ... 104 | if self._num_bases >0: 105 | order = order[:self._num_bases] 106 | 107 | self.W = svd_mdl.U[:,order] 108 | self.eigenvalues = S[order] 109 | 110 | def factorize(self, show_progress=False, compute_w=True, compute_h=True, 111 | compute_err=True, niter=1): 112 | """ Factorize s.t. WH = data 113 | 114 | Parameters 115 | ---------- 116 | show_progress : bool 117 | print some extra information to stdout. 118 | compute_h : bool 119 | iteratively update values for H. 120 | compute_w : bool 121 | iteratively update values for W. 122 | compute_err : bool 123 | compute Frobenius norm |data-WH| after each update and store 124 | it to .ferr[k]. 125 | 126 | Updated Values 127 | -------------- 128 | .W : updated values for W. 129 | .H : updated values for H. 130 | .ferr : Frobenius norm |data-WH|. 131 | """ 132 | 133 | NMF.factorize(self, niter=1, show_progress=show_progress, 134 | compute_w=compute_w, compute_h=compute_h, 135 | compute_err=compute_err) 136 | 137 | 138 | if __name__ == "__main__": 139 | import doctest 140 | doctest.testmod() 141 | -------------------------------------------------------------------------------- /pymf/bnmf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF Binary Matrix Factorization [1] 8 | 9 | BNMF(NMF) : Class for binary matrix factorization 10 | 11 | [1]Z. Zhang, T. Li, C. H. Q. Ding, X. 
Zhang: Binary Matrix Factorization with 12 | Applications. ICDM 2007 13 | """ 14 | 15 | 16 | import numpy as np 17 | from .nmf import NMF 18 | 19 | __all__ = ["BNMF"] 20 | 21 | class BNMF(NMF): 22 | """ 23 | BNMF(data, data, num_bases=4) 24 | Binary Matrix Factorization. Factorize a data matrix into two matrices s.t. 25 | F = | data - W*H | is minimal. H and W are restricted to binary values. 26 | 27 | Parameters 28 | ---------- 29 | data : array_like, shape (_data_dimension, _num_samples) 30 | the input data 31 | num_bases: int, optional 32 | Number of bases to compute (column rank of W and row rank of H). 33 | 4 (default) 34 | 35 | Attributes 36 | ---------- 37 | W : "data_dimension x num_bases" matrix of basis vectors 38 | H : "num bases x num_samples" matrix of coefficients 39 | ferr : frobenius norm (after calling .factorize()) 40 | 41 | Example 42 | ------- 43 | Applying BNMF to some rather stupid data set: 44 | 45 | >>> import numpy as np 46 | >>> from bnmf import BNMF 47 | >>> data = np.array([[1.0, 0.0, 1.0], [0.0, 1.0, 1.0]]) 48 | 49 | Use 2 basis vectors -> W shape(data_dimension, 2). 50 | 51 | >>> bnmf_mdl = BNMF(data, num_bases=2) 52 | 53 | Set number of iterations to 5 and start computing the factorization. 54 | 55 | >>> bnmf_mdl.factorize(niter=5) 56 | 57 | The basis vectors are now stored in bnmf_mdl.W, the coefficients in bnmf_mdl.H. 58 | To compute coefficients for an existing set of basis vectors simply copy W 59 | to bnmf_mdl.W, and set compute_w to False: 60 | 61 | >>> data = np.array([[0.0], [1.0]]) 62 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) 63 | >>> bnmf_mdl = BNMF(data, num_bases=2) 64 | >>> bnmf_mdl.W = W 65 | >>> bnmf_mdl.factorize(niter=10, compute_w=False) 66 | 67 | The result is a set of coefficients bnmf_mdl.H, s.t. data = W * bnmf_mdl.H. 68 | """ 69 | 70 | # controls how fast lambda should increase: 71 | # this influence convergence to binary values during the update. A value 72 | # <1 will result in non-binary decompositions as the update rule effectively 73 | # is a conventional nmf update rule. Values >1 give more weight to making the 74 | # factorization binary with increasing iterations. 75 | # setting either W or H to 0 results make the resulting matrix non binary. 76 | _LAMB_INCREASE_W = 1.1 77 | _LAMB_INCREASE_H = 1.1 78 | 79 | def update_h(self): 80 | H1 = np.dot(self.W.T, self.data[:,:]) + 3.0*self._lamb_H*(self.H**2) 81 | H2 = np.dot(np.dot(self.W.T,self.W), self.H) + 2*self._lamb_H*(self.H**3) + self._lamb_H*self.H + 10**-9 82 | self.H *= H1/H2 83 | 84 | self._lamb_W = self._LAMB_INCREASE_W * self._lamb_W 85 | self._lamb_H = self._LAMB_INCREASE_H * self._lamb_H 86 | 87 | def update_w(self): 88 | W1 = np.dot(self.data[:,:], self.H.T) + 3.0*self._lamb_W*(self.W**2) 89 | W2 = np.dot(self.W, np.dot(self.H, self.H.T)) + 2.0*self._lamb_W*(self.W**3) + self._lamb_W*self.W + 10**-9 90 | self.W *= W1/W2 91 | 92 | def factorize(self, niter=10, compute_w=True, compute_h=True, 93 | show_progress=False, compute_err=True): 94 | """ Factorize s.t. WH = data 95 | 96 | Parameters 97 | ---------- 98 | niter : int 99 | number of iterations. 100 | show_progress : bool 101 | print some extra information to stdout. 102 | compute_h : bool 103 | iteratively update values for H. 104 | compute_w : bool 105 | iteratively update values for W. 106 | compute_err : bool 107 | compute Frobenius norm |data-WH| after each update and store 108 | it to .ferr[k]. 109 | 110 | Updated Values 111 | -------------- 112 | .W : updated values for W. 113 | .H : updated values for H. 
114 | .ferr : Frobenius norm |data-WH| for each iteration.
115 | """
116 | 
117 | # init some learning parameters
118 | self._lamb_W = 1.0/niter
119 | self._lamb_H = 1.0/niter
120 | 
121 | NMF.factorize(self, niter=niter, compute_w=compute_w,
122 | compute_h=compute_h, show_progress=show_progress,
123 | compute_err=compute_err)
124 | 
125 | if __name__ == "__main__":
126 | import doctest
127 | doctest.testmod()
128 | 
-------------------------------------------------------------------------------- /pymf/cur.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright (C) Christian Thurau, 2010.
4 | # Licensed under the GNU General Public License (GPL).
5 | # http://www.gnu.org/licenses/gpl.txt
6 | """
7 | PyMF CUR Decomposition [1]
8 | 
9 | CUR(SVD) : Class for CUR Decomposition
10 | 
11 | [1] Drineas, P., Kannan, R. and Mahoney, M. (2006), 'Fast Monte Carlo Algorithms for Matrices III: Computing
12 | a Compressed Approximate Matrix Decomposition', SIAM J. Computing 36(1), 184-206.
13 | """
14 | 
15 | 
16 | import numpy as np
17 | import scipy.sparse
18 | 
19 | from .svd import pinv, SVD
20 | 
21 | 
22 | __all__ = ["CUR"]
23 | 
24 | class CUR(SVD):
25 | """
26 | CUR(data, k=-1, rrank=0, crank=0)
27 | 
28 | CUR Decomposition. Factorize a data matrix into three matrices s.t.
29 | F = | data - USV| is minimal. CUR randomly selects columns and rows from
30 | data for building U and V, respectively.
31 | 
32 | Parameters
33 | ----------
34 | data : array_like [data_dimension x num_samples]
35 | the input data
36 | rrank: int, optional
37 | Number of rows to sample from data.
38 | 0 (default)
39 | crank: int, optional
40 | Number of columns to sample from data.
41 | 0 (default)
42 | k: int, optional
43 | Number of singular values used for the pseudo-inverse computations.
44 | -1 (default)
45 | 
46 | Attributes
47 | ----------
48 | U,S,V : submatrices s.t. data = USV
49 | 
50 | Example
51 | -------
52 | >>> import numpy as np
53 | >>> from cur import CUR
54 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
55 | >>> cur_mdl = CUR(data, rrank=1, crank=2)
56 | >>> cur_mdl.factorize()
57 | """
58 | 
59 | def __init__(self, data, k=-1, rrank=0, crank=0):
60 | SVD.__init__(self, data, k=k, rrank=rrank, crank=crank)
61 | 
62 | # select all data samples for computing the error:
63 | # note that this might take very long, adjust self._rset and self._cset
64 | # for faster computations.
65 | self._rset = range(self._rows)
66 | self._cset = range(self._cols)
67 | 
68 | 
69 | def sample(self, s, probs):
70 | prob_rows = np.cumsum(probs.flatten())
71 | temp_ind = np.zeros(s, np.int32)
72 | 
73 | for i in range(s):
74 | v = np.random.rand()
75 | 
76 | try:
77 | tempI = np.where(prob_rows >= v)[0]
78 | temp_ind[i] = tempI[0]
79 | except IndexError:
80 | temp_ind[i] = len(prob_rows) - 1
81 | 
82 | return np.sort(temp_ind)
83 | 
84 | def sample_probability(self):
85 | 
86 | if scipy.sparse.issparse(self.data):
87 | dsquare = self.data.multiply(self.data)
88 | else:
89 | dsquare = self.data[:,:]**2
90 | 
91 | prow = np.array(dsquare.sum(axis=1), np.float64)
92 | pcol = np.array(dsquare.sum(axis=0), np.float64)
93 | 
94 | prow /= prow.sum()
95 | pcol /= pcol.sum()
96 | 
97 | return (prow.reshape(-1,1), pcol.reshape(-1,1))
98 | 
99 | def computeUCR(self):
100 | # the next lines do NOT work with h5py if CUR is used -> double indices in self._cid or self._rid
101 | # can occur and are not supported by h5py. When using h5py data, always use CMD which ignores
102 | # recurring row/column selections.
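# The sampled columns/rows are rescaled by the square root of their
# selection counts (all ones here, see .factorize()) and the core matrix
# is computed as U = pinv(C) * data * pinv(R).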
103 | 104 | if scipy.sparse.issparse(self.data): 105 | self._C = self.data[:, self._cid] * scipy.sparse.csc_matrix(np.diag(self._ccnt**(1/2))) 106 | self._R = scipy.sparse.csc_matrix(np.diag(self._rcnt**(1/2))) * self.data[self._rid,:] 107 | 108 | self._U = pinv(self._C, self._k) * self.data[:,:] * pinv(self._R, self._k) 109 | 110 | else: 111 | self._C = np.dot(self.data[:, self._cid].reshape((self._rows, len(self._cid))), np.diag(self._ccnt**(1/2))) 112 | self._R = np.dot(np.diag(self._rcnt**(1/2)), self.data[self._rid,:].reshape((len(self._rid), self._cols))) 113 | 114 | self._U = np.dot(np.dot(pinv(self._C, self._k), self.data[:,:]), 115 | pinv(self._R, self._k)) 116 | 117 | # set some standard (with respect to SVD) variable names 118 | self.U = self._C 119 | self.S = self._U 120 | self.V = self._R 121 | 122 | def factorize(self): 123 | """ Factorize s.t. CUR = data 124 | 125 | Updated Values 126 | -------------- 127 | .C : updated values for C. 128 | .U : updated values for U. 129 | .R : updated values for R. 130 | """ 131 | [prow, pcol] = self.sample_probability() 132 | self._rid = self.sample(self._rrank, prow) 133 | self._cid = self.sample(self._crank, pcol) 134 | 135 | self._rcnt = np.ones(len(self._rid)) 136 | self._ccnt = np.ones(len(self._cid)) 137 | 138 | self.computeUCR() 139 | 140 | 141 | if __name__ == "__main__": 142 | import doctest 143 | doctest.testmod() 144 | -------------------------------------------------------------------------------- /pymf/aa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF Archetypal Analysis [1] 8 | 9 | AA: class for Archetypal Analysis 10 | 11 | [1] Cutler, A. Breiman, L. (1994), "Archetypal Analysis", Technometrics 36(4), 12 | 338-347. 13 | """ 14 | 15 | 16 | import numpy as np 17 | from .dist import vq 18 | from cvxopt import solvers, base 19 | 20 | from .svd import pinv 21 | from .nmf import NMF 22 | 23 | __all__ = ["AA"] 24 | 25 | class AA(NMF): 26 | """ 27 | AA(data, num_bases=4) 28 | 29 | Archetypal Analysis. Factorize a data matrix into two matrices s.t. 30 | F = | data - W*H | = | data - data*beta*H| is minimal. H and beta 31 | are restricted to convexity (beta >=0, sum(beta, axis=1) = [1 .. 1]). 32 | Factorization is solved via an alternating least squares optimization 33 | using the quadratic programming solver from cvxopt. 34 | 35 | Parameters 36 | ---------- 37 | data : array_like, shape (_data_dimension, _num_samples) 38 | the input data 39 | num_bases: int, optional 40 | Number of bases to compute (column rank of W and row rank of H). 41 | 4 (default) 42 | 43 | 44 | Attributes 45 | ---------- 46 | W : "data_dimension x num_bases" matrix of basis vectors 47 | H : "num bases x num_samples" matrix of coefficients 48 | beta : "num_bases x num_samples" matrix of basis vector coefficients 49 | (for constructing W s.t. W = beta * data.T ) 50 | ferr : frobenius norm (after calling .factorize()) 51 | 52 | Example 53 | ------- 54 | Applying AA to some rather stupid data set: 55 | 56 | >>> import numpy as np 57 | >>> from aa import AA 58 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 59 | 60 | Use 2 basis vectors -> W shape(data_dimension, 2). 61 | 62 | >>> aa_mdl = AA(data, num_bases=2) 63 | 64 | Set number of iterations to 5 and start computing the factorization. 
65 | 
66 | >>> aa_mdl.factorize(niter=5)
67 | 
68 | The basis vectors are now stored in aa_mdl.W, the coefficients in aa_mdl.H.
69 | To compute coefficients for an existing set of basis vectors simply copy W
70 | to aa_mdl.W, and set compute_w to False:
71 | 
72 | >>> data = np.array([[1.5], [1.2]])
73 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
74 | >>> aa_mdl = AA(data, num_bases=2)
75 | >>> aa_mdl.W = W
76 | >>> aa_mdl.factorize(niter=5, compute_w=False)
77 | 
78 | The result is a set of coefficients aa_mdl.H, s.t. data = W * aa_mdl.H.
79 | """
80 | # set cvxopt options
81 | solvers.options['show_progress'] = False
82 | 
83 | def init_h(self):
84 | self.H = np.random.random((self._num_bases, self._num_samples))
85 | self.H /= self.H.sum(axis=0)
86 | 
87 | def init_w(self):
88 | self.beta = np.random.random((self._num_bases, self._num_samples))
89 | self.beta /= self.beta.sum(axis=0)
90 | self.W = np.random.random((self._data_dimension, self._num_bases))
91 | 
92 | def update_h(self):
93 | """ alternating least squares step, update H under the convexity
94 | constraint """
95 | def update_single_h(i):
96 | """ compute single H[:,i] """
97 | # optimize alpha using qp solver from cvxopt
98 | FA = base.matrix(np.float64(np.dot(-self.W.T, self.data[:,i])))
99 | al = solvers.qp(HA, FA, INQa, INQb, EQa, EQb)
100 | self.H[:,i] = np.array(al['x']).reshape((1, self._num_bases))
101 | 
102 | EQb = base.matrix(1.0, (1,1))
103 | # float64 required for cvxopt
104 | HA = base.matrix(np.float64(np.dot(self.W.T, self.W)))
105 | INQa = base.matrix(-np.eye(self._num_bases))
106 | INQb = base.matrix(0.0, (self._num_bases,1))
107 | EQa = base.matrix(1.0, (1, self._num_bases))
108 | 
109 | for i in range(self._num_samples):
110 | update_single_h(i)
111 | 
112 | def update_w(self):
113 | """ alternating least squares step, update W under the convexity
114 | constraint """
115 | def update_single_w(i):
116 | """ compute single W[:,i] """
117 | # optimize beta using qp solver from cvxopt
118 | FB = base.matrix(np.float64(np.dot(-self.data.T, W_hat[:,i])))
119 | be = solvers.qp(HB, FB, INQa, INQb, EQa, EQb)
120 | self.beta[i,:] = np.array(be['x']).reshape((1, self._num_samples))
121 | 
122 | # float64 required for cvxopt
123 | HB = base.matrix(np.float64(np.dot(self.data[:,:].T, self.data[:,:])))
124 | EQb = base.matrix(1.0, (1, 1))
125 | W_hat = np.dot(self.data, pinv(self.H))
126 | INQa = base.matrix(-np.eye(self._num_samples))
127 | INQb = base.matrix(0.0, (self._num_samples, 1))
128 | EQa = base.matrix(1.0, (1, self._num_samples))
129 | 
130 | for i in range(self._num_bases):
131 | update_single_w(i)
132 | 
133 | self.W = np.dot(self.beta, self.data.T).T
134 | 
135 | if __name__ == "__main__":
136 | import doctest
137 | doctest.testmod()
138 | 
-------------------------------------------------------------------------------- /pymf/sivm_sgreedy.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python2.6
2 | #
3 | # Copyright (C) Christian Thurau, 2010.
4 | # Licensed under the GNU General Public License (GPL).
5 | # http://www.gnu.org/licenses/gpl.txt
6 | """
7 | PyMF Simplex Volume Maximization [1]
8 | 
9 | SIVM_SGREEDY: class for greedy-search SiVM
10 | 
11 | [1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume
12 | Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int.
13 | Conf. on Information and Knowledge Management. ACM. 2010.
14 | """ 15 | 16 | 17 | import numpy as np 18 | import time 19 | 20 | from .dist import * 21 | from .vol import * 22 | from .sivm_search import SIVM_SEARCH 23 | 24 | __all__ = ["SIVM_SGREEDY"] 25 | 26 | class SIVM_SGREEDY(SIVM_SEARCH): 27 | """ 28 | SIVM(data, num_bases=4, niter=100, show_progress=True, compW=True) 29 | 30 | 31 | Simplex Volume Maximization. Factorize a data matrix into two matrices s.t. 32 | F = | data - W*H | is minimal. H is restricted to convexity. W is iteratively 33 | found by maximizing the volume of the resulting simplex (see [1]). A solution 34 | is found by employing a simple greedy max-vol strategy. 35 | 36 | Parameters 37 | ---------- 38 | data : array_like 39 | the input data 40 | num_bases: int, optional 41 | Number of bases to compute (column rank of W and row rank of H). 42 | 4 (default) 43 | niter: int, optional 44 | Number of iterations of the alternating optimization. 45 | 100 (default) 46 | show_progress: bool, optional 47 | Print some extra information 48 | False (default) 49 | compW: bool, optional 50 | Compute W (True) or only H (False). Useful for using basis vectors 51 | from another convexity constrained matrix factorization function 52 | (e.g. svmnmf) (if set to "True" niter can be set to "1") 53 | compH: bool, optional 54 | Compute H (True) or only H (False). Useful for using precomputed 55 | basis vectors. 56 | dist_measure: string, optional 57 | The distance measure for finding the next best candidate that 58 | maximizes the simplex volume ['l2','l1','cosine','sparse_graph_l2'] 59 | 'l2' (default) 60 | optimize_lower_bound: bool, optional 61 | Use the alternative selection criterion that optimizes the lower 62 | bound (see [1]) 63 | False (default) 64 | 65 | Attributes 66 | ---------- 67 | W : "data_dimension x num_bases" matrix of basis vectors 68 | H : "num bases x num_samples" matrix of coefficients 69 | 70 | ferr : frobenius norm (after applying .factoriz()) 71 | 72 | Example 73 | ------- 74 | Applying SIVM to some rather stupid data set: 75 | 76 | >>> import numpy as np 77 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 78 | >>> sivm_mdl = SIVM_SGREEDY(data, num_bases=2, niter=10) 79 | >>> sivm_mdl.initialization() 80 | >>> sivm_mdl.factorize() 81 | 82 | The basis vectors are now stored in sivm_mdl.W, the coefficients in sivm_mdl.H. 83 | To compute coefficients for an existing set of basis vectors simply copy W 84 | to sivm_mdl.W, and set compW to False: 85 | 86 | >>> data = np.array([[1.5, 1.3], [1.2, 0.3]]) 87 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) 88 | >>> sivm_mdl = SIVM_SGREEDY(data, num_bases=2, niter=1, compW=False) 89 | >>> sivm_mdl.initialization() 90 | >>> sivm_mdl.W = W 91 | >>> sivm_mdl.factorize() 92 | 93 | The result is a set of coefficients sivm_mdl.H, s.t. data = W * sivm_mdl.H. 
94 | """ 95 | 96 | def update_w(self): 97 | # compute distance matrix -> requiresd for the volume 98 | self.init_sivm() 99 | next_sel = list([self.select[0]]) 100 | self.select = [] 101 | 102 | self._v = [] 103 | self._t = [] 104 | stime = time.time() 105 | 106 | for iter in range(self._num_bases-1): 107 | # add new selections to openset 108 | next_sel = list(np.sort(next_sel)) 109 | D = pdist(self.data[:, next_sel], self.data[:, next_sel]) 110 | V = np.zeros(self.data.shape[1]) 111 | d = np.zeros((D.shape[0]+1,D.shape[1]+1)) 112 | d[:D.shape[0], :D.shape[1]] = D[:,:] 113 | 114 | for i in range(self.data.shape[1]): 115 | # create a temp selection 116 | dtmp = l2_distance(self.data[:,next_sel], self.data[:,i:i+1]) 117 | d[:-1,-1] = dtmp 118 | d[-1,:-1] = dtmp 119 | # compute volume for temp selection 120 | V[i] = cmdet(d) 121 | 122 | next_index = np.argmax(V) 123 | next_sel.append(next_index) 124 | self._v.append(np.max(V)) 125 | 126 | self._logger.info('Iter:' + str(iter)) 127 | self._logger.info('Current selection:' + str(next_sel)) 128 | self._logger.info('Current volume:' + str(self._v[-1])) 129 | self._t.append(time.time() - stime) 130 | 131 | # update some values ... 132 | self.select = list(next_sel) 133 | self.W = self.data[:, self.select] 134 | 135 | 136 | 137 | if __name__ == "__main__": 138 | import doctest 139 | doctest.testmod() 140 | -------------------------------------------------------------------------------- /pymf/greedy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python2.6 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | #$Id$ 7 | """ 8 | PyMF GREEDY[1] 9 | 10 | GREEDY: class for a deterministic SVD based greedy matrix reconstruction [1]. 11 | 12 | 13 | [1] Ali Civril, Malik Magdon-Ismail. Deterministic Sparse Column Based Matrix 14 | Reconstruction via Greedy Approximation of SVD. ISAAC'2008. 15 | """ 16 | 17 | 18 | import time 19 | import scipy.sparse 20 | import numpy as np 21 | from .svd import * 22 | from .nmf import NMF 23 | 24 | __all__ = ["GREEDY"] 25 | 26 | class GREEDY(NMF): 27 | """ 28 | GREEDYVOL(data, num_bases=4, niter=100, show_progress=True, compW=True) 29 | 30 | 31 | Deterministic Sparse Column Based Matrix Reconstruction via Greedy 32 | Approximation of SVD. Factorize a data matrix into two matrices s.t. 33 | F = | data - W*H | is minimal. W is iteratively selected as columns 34 | of data. 35 | 36 | Parameters 37 | ---------- 38 | data : array_like, shape (_data_dimension, _num_samples) 39 | the input data 40 | num_bases: int, optional 41 | Number of bases to compute (column rank of W and row rank of H). 42 | 4 (default) 43 | k : number of singular vectors for the SVD step of the algorithm 44 | num_bases (default) 45 | 46 | Attributes 47 | ---------- 48 | W : "data_dimension x num_bases" matrix of basis vectors 49 | H : "num bases x num_samples" matrix of coefficients 50 | ferr : frobenius norm (after calling .factorize()) 51 | 52 | Example 53 | ------- 54 | Applying GREEDY to some rather stupid data set: 55 | 56 | >>> import numpy as np 57 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 58 | >>> greedy_mdl = GREEDY(data, num_bases=2, niter=10) 59 | >>> greedy_mdl.factorize() 60 | 61 | The basis vectors are now stored in greedy_mdl.W, the coefficients in 62 | greedy_mdl.H. 
To compute coefficients for an existing set of basis
63 | vectors simply copy W to greedy_mdl.W, and set compute_w to False:
64 | 
65 | >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
66 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
67 | >>> greedy_mdl = GREEDY(data, num_bases=2)
68 | >>> greedy_mdl.W = W
69 | >>> greedy_mdl.factorize(compute_w=False)
70 | 
71 | The result is a set of coefficients greedy_mdl.H, s.t. data = W * greedy_mdl.H.
72 | """
73 | 
74 | 
75 | def __init__(self, data, k=-1, num_bases=4):
76 | # call inherited method
77 | NMF.__init__(self, data, num_bases=num_bases)
78 | self._k = k
79 | if self._k == -1:
80 | self._k = num_bases
81 | 
82 | def update_h(self):
83 | if scipy.sparse.issparse(self.data):
84 | self.H = pinv(self.W) * self.data
85 | else:
86 | self.H = np.dot(pinv(self.W), self.data)
87 | 
88 | def update_w(self):
89 | def normalize_matrix(K):
90 | """ Normalize a matrix K s.t. columns have Euclidean-norm |1|
91 | """
92 | if scipy.sparse.issparse(K):
93 | L = np.sqrt(np.array(K.multiply(K).sum(axis=0)))[0,:]
94 | s = np.where(L > 0.0)[0]
95 | L[s] = L[s]**-1
96 | KN = scipy.sparse.spdiags(L,0,len(L),len(L),format='csc')
97 | K = K*KN
98 | else:
99 | L = np.sqrt((K**2).sum(axis=0))
100 | s = np.where(L > 0.0)[0]
101 | L[s] = L[s]**-1
102 | K = K*L
103 | return K
104 | 
105 | self._t = np.zeros((self._num_bases))
106 | t0 = time.time()
107 | self.select = []
108 | 
109 | # work on a copy of the data (it is normalized in the loop below)
110 | A = self.data.copy()
111 | 
112 | svd_mdl = SVD(A, k=self._k)
113 | svd_mdl.factorize()
114 | 
115 | if scipy.sparse.issparse(self.data):
116 | B = svd_mdl.U * svd_mdl.S
117 | B = B.tocsc()
118 | else:
119 | B = np.dot(svd_mdl.U, svd_mdl.S)
120 | B = B[:, :self._num_bases]
121 | 
122 | for i in range(self._num_bases):
123 | A = normalize_matrix(A)
124 | 
125 | if scipy.sparse.issparse(self.data):
126 | T = B.transpose() * A
127 | T = np.array(T.multiply(T).sum(axis=0))[0,:]
128 | 
129 | # next selected column index
130 | T[self.select] = 0.0
131 | idx = np.argmax(T)
132 | Aidx = A[:, idx].copy()
133 | self.select.append(idx)
134 | 
135 | # update B
136 | BC = Aidx.transpose() * B
137 | B = B - (Aidx*BC)
138 | 
139 | # update A
140 | AC = Aidx.transpose() * A
141 | A = A - (Aidx*AC)
142 | 
143 | else:
144 | T = np.dot(B.transpose(), A)
145 | T = np.sum(T**2.0, axis=0)
146 | 
147 | # next selected column index
148 | T[self.select] = 0.0
149 | idx = np.argmax(T)
150 | self.select.append(idx)
151 | 
152 | # update B
153 | BC = np.dot(B.transpose(),A[:,idx])
154 | B -= np.dot(A[:,idx].reshape(-1,1), BC.reshape(1,-1))
155 | 
156 | # and A
157 | AC = np.dot(A.transpose(),A[:,idx])
158 | A -= np.dot(A[:,idx].reshape(-1,1), AC.reshape(1,-1))
159 | 
160 | 
161 | # detect the next best data point
162 | self._logger.info('searching for next best column ...')
163 | self._logger.info('cur_columns: ' + str(self.select))
164 | self._t[i] = time.time() - t0
165 | 
166 | # sort indices, otherwise h5py won't work
167 | self.W = self.data[:, np.sort(self.select)]
168 | 
169 | # "unsort" it again to keep the correct order
170 | self.W = self.W[:, np.argsort(np.argsort(self.select))]
171 | 
172 | if __name__ == "__main__":
173 | import doctest
174 | doctest.testmod()
175 | 
-------------------------------------------------------------------------------- /pymf/sivm_search.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python2.6
2 | #
3 | # Copyright (C) Christian Thurau, 2010.
4 | # Licensed under the GNU General Public License (GPL).
5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF Simplex Volume Maximization [1] 8 | 9 | SIVM_SEARCH: class for search-SiVM 10 | 11 | [1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume 12 | Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int. 13 | Conf. on Information and Knowledge Management. ACM. 2010. 14 | """ 15 | 16 | 17 | import scipy.sparse 18 | import numpy as np 19 | from scipy import inf 20 | try: 21 | from scipy.misc.common import factorial 22 | except: 23 | from scipy.misc import factorial 24 | 25 | from .dist import * 26 | from .vol import * 27 | from .sivm import SIVM 28 | 29 | __all__ = ["SIVM_SEARCH"] 30 | 31 | class SIVM_SEARCH(SIVM): 32 | """ 33 | SIVM_SEARCH(data, num_bases=4, dist_measure='l2') 34 | 35 | 36 | Simplex Volume Maximization. Factorize a data matrix into two matrices s.t. 37 | F = | data - W*H | is minimal. H is restricted to convexity. W is iteratively 38 | found by maximizing the volume of the resulting simplex (see [1]). A solution 39 | is found by employing a simple A-star like search strategy. 40 | 41 | Parameters 42 | ---------- 43 | data : array_like, shape (_data_dimension, _num_samples) 44 | the input data 45 | num_bases: int, optional 46 | Number of bases to compute (column rank of W and row rank of H). 47 | 4 (default) 48 | dist_measure : one of 'l2' ,'cosine', 'l1', 'kl' 49 | Standard is 'l2' which maximizes the volume of the simplex. In contrast, 50 | 'cosine' maximizes the volume of a cone (see [1] for details). 51 | init : string (default: 'fastmap') 52 | 'fastmap' or 'origin'. Sets the method used for finding the very first 53 | basis vector. 'Origin' assumes the zero vector, 'Fastmap' picks one of 54 | the two vectors that have the largest pairwise distance. 55 | Attributes 56 | ---------- 57 | W : "data_dimension x num_bases" matrix of basis vectors 58 | H : "num bases x num_samples" matrix of coefficients 59 | ferr : frobenius norm (after calling .factorize()) 60 | 61 | Example 62 | ------- 63 | Applying SIVM to some rather stupid data set: 64 | 65 | >>> import numpy as np 66 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 67 | >>> sivm_mdl = SIVM_SEARCH(data, num_bases=2) 68 | >>> sivm_mdl.factorize() 69 | 70 | The basis vectors are now stored in sivm_mdl.W, the coefficients in sivm_mdl.H. 71 | To compute coefficients for an existing set of basis vectors simply copy W 72 | to sivm_mdl.W, and set compute_w to False: 73 | 74 | >>> data = np.array([[1.5, 1.3], [1.2, 0.3]]) 75 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) 76 | >>> sivm_mdl = SIVM_SEARCH(data, num_bases=2) 77 | >>> sivm_mdl.W = W 78 | >>> sivm_mdl.factorize(compute_w=False) 79 | 80 | The result is a set of coefficients sivm_mdl.H, s.t. data = W * sivm_mdl.H. 
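The volume of the best selection found at each search step is stored in
sivm_mdl._v.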
81 | """ 82 | 83 | def update_w(self): 84 | def h(sel,D,k): 85 | # compute the volume for a selection of sel columns 86 | # and a k-1 simplex (-> k columns have to be selected) 87 | mv = np.max(D) 88 | 89 | # fill the remaining distance by the maximal overall found distance 90 | d = np.zeros((k,k)) + mv 91 | for i in range(k): 92 | d[i,i] = 0.0 93 | 94 | for idx_i,i in enumerate(sel): 95 | for idx_j,j in enumerate(sel): 96 | d[idx_i,idx_j] = D[i,j] 97 | 98 | return d 99 | 100 | # compute distance matrix -> required for the volume 101 | D = pdist(self.data, self.data) 102 | Openset = {} 103 | 104 | for i in range(self._num_samples): 105 | # compute volume for temp selection 106 | d = h([i],D,self._num_bases) 107 | Vtmp = cmdet(d) 108 | Openset[tuple([i])] = Vtmp 109 | 110 | Closedset = {} 111 | finished = False 112 | self._v = [] 113 | self.init_sivm() 114 | next_sel = np.array([self.select[0]]) 115 | iter = 0 116 | 117 | while not finished: 118 | # add the current selection to closedset 119 | Closedset[(tuple(next_sel))] = [] 120 | 121 | for i in range(D.shape[0]): 122 | # create a temp selection 123 | tmp_sel = np.array(next_sel).flatten() 124 | tmp_sel = np.concatenate((tmp_sel, [i]),axis=0) 125 | tmp_sel = np.unique(tmp_sel) 126 | tmp_sel = list(tmp_sel) 127 | hkey = tuple(tmp_sel) 128 | 129 | if len(tmp_sel) > len(next_sel) and ( 130 | not Closedset.has_key(hkey)) and ( 131 | not Openset.has_key(hkey)): 132 | 133 | # compute volume for temp selection 134 | d = h(tmp_sel, D, self._num_bases) 135 | Vtmp = cmdet(d) 136 | 137 | # add to openset 138 | Openset[hkey] = Vtmp 139 | 140 | # get next best tuple 141 | vmax = 0.0 142 | for (k,v) in Openset.iteritems(): 143 | if v > vmax: 144 | next_sel = k 145 | vmax = v 146 | 147 | self._logger.info('Iter:' + str(iter)) 148 | self._logger.info('Current selection:' + str(next_sel)) 149 | self._logger.info('Current volume:' + str(vmax)) 150 | self._v.append(vmax) 151 | 152 | # remove next_sel from openset 153 | Openset.pop(next_sel) 154 | 155 | if len(list(next_sel)) == self._num_bases: 156 | finished = True 157 | iter += 1 158 | 159 | # update some values ... 160 | self.select = list(next_sel) 161 | self.W = self.data[:, self.select] 162 | 163 | if __name__ == "__main__": 164 | import doctest 165 | doctest.testmod() 166 | -------------------------------------------------------------------------------- /pymf/cnmf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF Convex Matrix Factorization [1] 8 | 9 | CNMF(NMF) : Class for convex matrix factorization 10 | 11 | [1] Ding, C., Li, T. and Jordan, M.. Convex and Semi-Nonnegative Matrix Factorizations. 12 | IEEE Trans. on Pattern Analysis and Machine Intelligence 32(1), 45-55. 13 | """ 14 | 15 | 16 | import numpy as np 17 | import logging 18 | from .nmf import NMF 19 | from .kmeans import Kmeans 20 | 21 | 22 | __all__ = ["CNMF"] 23 | 24 | class CNMF(NMF): 25 | """ 26 | CNMF(data, num_bases=4) 27 | 28 | 29 | Convex NMF. Factorize a data matrix into two matrices s.t. 30 | F = | data - W*H | = | data - data*beta*H| is minimal. H and beta 31 | are restricted to convexity (beta >=0, sum(beta, axis=1) = [1 .. 1]). 
32 | 
33 | Parameters
34 | ----------
35 | data : array_like, shape (_data_dimension, _num_samples)
36 | the input data
37 | num_bases: int, optional
38 | Number of bases to compute (column rank of W and row rank of H).
39 | 4 (default)
40 | 
41 | Attributes
42 | ----------
43 | W : "data_dimension x num_bases" matrix of basis vectors
44 | H : "num bases x num_samples" matrix of coefficients
45 | ferr : frobenius norm (after calling .factorize())
46 | 
47 | Example
48 | -------
49 | Applying CNMF to some rather stupid data set:
50 | 
51 | >>> import numpy as np
52 | >>> from cnmf import CNMF
53 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
54 | >>> cnmf_mdl = CNMF(data, num_bases=2)
55 | >>> cnmf_mdl.factorize(niter=10)
56 | 
57 | The basis vectors are now stored in cnmf_mdl.W, the coefficients in cnmf_mdl.H.
58 | To compute coefficients for an existing set of basis vectors simply copy W
59 | to cnmf_mdl.W, and set compute_w to False:
60 | 
61 | >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
62 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
63 | >>> cnmf_mdl = CNMF(data, num_bases=2)
64 | >>> cnmf_mdl.W = W
65 | >>> cnmf_mdl.factorize(compute_w=False, niter=1)
66 | 
67 | The result is a set of coefficients cnmf_mdl.H, s.t. data = W * cnmf_mdl.H.
68 | """
69 | 
70 | # see .factorize() for the update of W and H
71 | # -> proper decoupling of W/H not possible ...
72 | def update_w(self):
73 | pass
74 | 
75 | def update_h(self):
76 | pass
77 | 
78 | def init_h(self):
79 | if not hasattr(self, 'H'):
80 | # init basic matrices
81 | self.H = np.zeros((self._num_bases, self._num_samples))
82 | 
83 | # initialize using k-means
84 | km = Kmeans(self.data[:,:], num_bases=self._num_bases)
85 | km.factorize(niter=10)
86 | assign = km.assigned
87 | 
88 | num_i = np.zeros(self._num_bases)
89 | for i in range(self._num_bases):
90 | num_i[i] = len(np.where(assign == i)[0])
91 | 
92 | self.H.T[range(len(assign)), assign] = 1.0
93 | self.H += 0.2*np.ones((self._num_bases, self._num_samples))
94 | 
95 | if not hasattr(self, 'G'):
96 | self.G = np.zeros((self._num_samples, self._num_bases))
97 | 
98 | self.G[range(len(assign)), assign] = 1.0
99 | self.G += 0.01
100 | self.G /= np.tile(np.reshape(num_i[assign],(-1,1)), self.G.shape[1])
101 | 
102 | if not hasattr(self,'W'):
103 | self.W = np.dot(self.data[:,:], self.G)
104 | 
105 | def init_w(self):
106 | pass
107 | 
108 | def factorize(self, niter=10, compute_w=True, compute_h=True,
109 | compute_err=True, show_progress=False):
110 | """ Factorize s.t. WH = data
111 | 
112 | Parameters
113 | ----------
114 | niter : int
115 | number of iterations.
116 | show_progress : bool
117 | print some extra information to stdout.
118 | compute_h : bool
119 | iteratively update values for H.
120 | compute_w : bool
121 | iteratively update values for W.
122 | compute_err : bool
123 | compute Frobenius norm |data-WH| after each update and store
124 | it to .ferr[k].
125 | 
126 | Updated Values
127 | --------------
128 | .W : updated values for W.
129 | .H : updated values for H.
130 | .ferr : Frobenius norm |data-WH| for each iteration.
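.G : updated values for the convex combination matrix G (W = data*G).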
131 | """ 132 | 133 | if not hasattr(self,'W'): 134 | self.init_w() 135 | 136 | if not hasattr(self,'H'): 137 | self.init_h() 138 | 139 | def separate_positive(m): 140 | return (np.abs(m) + m)/2.0 141 | 142 | def separate_negative(m): 143 | return (np.abs(m) - m)/2.0 144 | 145 | if show_progress: 146 | self._logger.setLevel(logging.INFO) 147 | else: 148 | self._logger.setLevel(logging.ERROR) 149 | 150 | XtX = np.dot(self.data[:,:].T, self.data[:,:]) 151 | XtX_pos = separate_positive(XtX) 152 | XtX_neg = separate_negative(XtX) 153 | 154 | self.ferr = np.zeros(niter) 155 | # iterate over W and H 156 | 157 | for i in xrange(niter): 158 | # update H 159 | XtX_neg_x_W = np.dot(XtX_neg, self.G) 160 | XtX_pos_x_W = np.dot(XtX_pos, self.G) 161 | 162 | if compute_h: 163 | H_x_WT = np.dot(self.H.T, self.G.T) 164 | ha = XtX_pos_x_W + np.dot(H_x_WT, XtX_neg_x_W) 165 | hb = XtX_neg_x_W + np.dot(H_x_WT, XtX_pos_x_W) + 10**-9 166 | self.H = (self.H.T*np.sqrt(ha/hb)).T 167 | 168 | # update W 169 | if compute_w: 170 | HT_x_H = np.dot(self.H, self.H.T) 171 | wa = np.dot(XtX_pos, self.H.T) + np.dot(XtX_neg_x_W, HT_x_H) 172 | wb = np.dot(XtX_neg, self.H.T) + np.dot(XtX_pos_x_W, HT_x_H) + 10**-9 173 | 174 | self.G *= np.sqrt(wa/wb) 175 | self.W = np.dot(self.data[:,:], self.G) 176 | 177 | if compute_err: 178 | self.ferr[i] = self.frobenius_norm() 179 | self._logger.info('Iteration ' + str(i+1) + '/' + str(niter) + 180 | ' FN:' + str(self.ferr[i])) 181 | else: 182 | self._logger.info('Iteration ' + str(i+1) + '/' + str(niter)) 183 | 184 | if i > 1 and compute_err: 185 | if self.converged(i): 186 | self.ferr = self.ferr[:i] 187 | break 188 | 189 | if __name__ == "__main__": 190 | import doctest 191 | doctest.testmod() 192 | -------------------------------------------------------------------------------- /pymf/sivm_gsat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python2.6 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF Simplex Volume Maximization [1] 8 | 9 | SIVM_GSAT: class for gsat-SiVM 10 | 11 | [1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume 12 | Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int. 13 | Conf. on Information and Knowledge Management. ACM. 2010. 14 | """ 15 | 16 | 17 | import logging 18 | import numpy as np 19 | from .dist import * 20 | from .vol import cmdet 21 | from .sivm import SIVM 22 | 23 | __all__ = ["SIVM_GSAT"] 24 | 25 | class SIVM_GSAT(SIVM): 26 | """ 27 | SIVM(data, num_bases=4, dist_measure='l2') 28 | 29 | 30 | Simplex Volume Maximization. Factorize a data matrix into two matrices s.t. 31 | F = | data - W*H | is minimal. H is restricted to convexity. W is iteratively 32 | found by maximizing the volume of the resulting simplex (see [1]). Can be 33 | applied to data streams using the .online_update_w(vec) function which decides 34 | on adding data sample "vec" to the already selected basis vectors. 35 | 36 | Parameters 37 | ---------- 38 | data : array_like, shape (_data_dimension, _num_samples) 39 | the input data 40 | num_bases: int, optional 41 | Number of bases to compute (column rank of W and row rank of H). 42 | 4 (default) 43 | dist_measure : one of 'l2' ,'cosine', 'l1', 'kl' 44 | Standard is 'l2' which maximizes the volume of the simplex. In contrast, 45 | 'cosine' maximizes the volume of a cone (see [1] for details). 
46 | init : string (default: 'fastmap')
47 | 'fastmap' or 'origin'. Sets the method used for finding the very first
48 | basis vector. 'Origin' assumes the zero vector, 'Fastmap' picks one of
49 | the two vectors that have the largest pairwise distance.
50 | Attributes
51 | ----------
52 | W : "data_dimension x num_bases" matrix of basis vectors
53 | H : "num bases x num_samples" matrix of coefficients
54 | ferr : frobenius norm (after calling .factorize())
55 | 
56 | Example
57 | -------
58 | Applying SIVM_GSAT to some rather stupid data set:
59 | 
60 | >>> import numpy as np
61 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
62 | >>> sivm_mdl = SIVM_GSAT(data, num_bases=2)
63 | >>> sivm_mdl.factorize()
64 | 
65 | The basis vectors are now stored in sivm_mdl.W, the coefficients in sivm_mdl.H.
66 | To compute coefficients for an existing set of basis vectors simply copy W
67 | to sivm_mdl.W, and set compute_w to False:
68 | 
69 | >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
70 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
71 | >>> sivm_mdl = SIVM_GSAT(data, num_bases=2)
72 | >>> sivm_mdl.W = W
73 | >>> sivm_mdl.factorize(compute_w=False)
74 | 
75 | The result is a set of coefficients sivm_mdl.H, s.t. data = W * sivm_mdl.H.
76 | """
77 | 
78 | def init_w(self):
79 | self.select = list(range(self._num_bases))
80 | self.W = self.data[:, self.select]
81 | 
82 | def online_update_w(self, vec):
83 | # update D if it does not exist
84 | k = self._num_bases
85 | if not hasattr(self, 'D'):
86 | self.D = np.zeros((k + 1, k + 1))
87 | self.D[:k, :k] = pdist(self.W, self.W)
88 | self.V = cmdet(self.D[:k, :k])
89 | 
90 | tmp_d = self._distfunc(self.W, vec.reshape((-1,1)))
91 | self.D[k, :-1] = tmp_d
92 | self.D[:-1, k] = tmp_d
93 | 
94 | v = np.zeros((self._num_bases + 1))
95 | 
96 | for i in range(self._num_bases):
97 | # compute volume for each combination...
98 | s = np.setdiff1d(range(self._num_bases + 1), [i])
99 | v[i] = cmdet((self.D[s,:])[:,s])
100 | 
101 | # select index that maximizes the volume
102 | v[-1] = self.V
103 | s = np.argmax(v)
104 | 
105 | if s < self._num_bases:
106 | self.W[:,s] = vec
107 | self.D[:self._num_bases, :self._num_bases] = pdist(self.W, self.W)
108 | 
109 | if not hasattr(self, '_v'):
110 | self._v = [self.V]
111 | self.V = v[s]
112 | self._v.append(v[s])
113 | 
114 | self._logger.info('Volume increased:' + str(self.V))
115 | return True, s
116 | 
117 | return False,-1
118 | 
119 | def update_w(self):
120 | n = int(np.floor(np.random.random() * self._num_samples))
121 | if n not in self.select:
122 | updated, s = self.online_update_w(self.data[:,n])
123 | if updated:
124 | self.select[s] = n
125 | self._logger.info('Current selection:' + str(self.select))
126 | 
127 | 
128 | def factorize(self, show_progress=False, compute_w=True, compute_h=True,
129 | compute_err=True, niter=1):
130 | """ Factorize s.t. WH = data
131 | 
132 | Parameters
133 | ----------
134 | show_progress : bool
135 | print some extra information to stdout.
136 | niter : int
137 | number of iterations.
138 | compute_h : bool
139 | iteratively update values for H.
140 | compute_w : bool
141 | iteratively update values for W.
142 | compute_err : bool
143 | compute Frobenius norm |data-WH| after each update and store
144 | it to .ferr[k].
145 | 
146 | Updated Values
147 | --------------
148 | .W : updated values for W.
149 | .H : updated values for H.
150 | .ferr : Frobenius norm |data-WH|.
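Each iteration draws a single random data sample and swaps it into W
whenever the swap increases the simplex volume (see update_w).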
151 | """ 152 | if show_progress: 153 | self._logger.setLevel(logging.INFO) 154 | else: 155 | self._logger.setLevel(logging.ERROR) 156 | 157 | # create W and H if they don't already exist 158 | # -> any custom initialization to W,H should be done before 159 | if not hasattr(self,'W'): 160 | self.init_w() 161 | 162 | if not hasattr(self,'H'): 163 | self.init_h() 164 | 165 | if compute_err: 166 | self.ferr = np.zeros(niter) 167 | 168 | for i in xrange(niter): 169 | if compute_w: 170 | self.update_w() 171 | 172 | if compute_h: 173 | self.update_h() 174 | 175 | if compute_err: 176 | self.ferr[i] = self.frobenius_norm() 177 | self._logger.info('Iteration ' + str(i+1) + '/' + str(niter) + 178 | ' FN:' + str(self.ferr[i])) 179 | else: 180 | self._logger.info('Iteration ' + str(i+1) + '/' + str(niter)) 181 | 182 | 183 | if __name__ == "__main__": 184 | import doctest 185 | doctest.testmod() 186 | -------------------------------------------------------------------------------- /pymf/nmf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF Non-negative Matrix Factorization. 8 | 9 | NMF: Class for Non-negative Matrix Factorization 10 | 11 | [1] Lee, D. D. and Seung, H. S. (1999), Learning the Parts of Objects by Non-negative 12 | Matrix Factorization, Nature 401(6755), 788-799. 13 | """ 14 | 15 | 16 | import numpy as np 17 | import logging 18 | import logging.config 19 | import scipy.sparse 20 | 21 | __all__ = ["NMF"] 22 | 23 | class NMF(): 24 | """ 25 | NMF(data, num_bases=4) 26 | 27 | 28 | Non-negative Matrix Factorization. Factorize a data matrix into two matrices 29 | s.t. F = | data - W*H | = | is minimal. H, and W are restricted to non-negative 30 | data. Uses the classicial multiplicative update rule. 31 | 32 | Parameters 33 | ---------- 34 | data : array_like, shape (_data_dimension, _num_samples) 35 | the input data 36 | num_bases: int, optional 37 | Number of bases to compute (column rank of W and row rank of H). 38 | 4 (default) 39 | 40 | Attributes 41 | ---------- 42 | W : "data_dimension x num_bases" matrix of basis vectors 43 | H : "num bases x num_samples" matrix of coefficients 44 | ferr : frobenius norm (after calling .factorize()) 45 | 46 | Example 47 | ------- 48 | Applying NMF to some rather stupid data set: 49 | 50 | >>> import numpy as np 51 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) 52 | >>> nmf_mdl = NMF(data, num_bases=2, niter=10) 53 | >>> nmf_mdl.factorize() 54 | 55 | The basis vectors are now stored in nmf_mdl.W, the coefficients in nmf_mdl.H. 56 | To compute coefficients for an existing set of basis vectors simply copy W 57 | to nmf_mdl.W, and set compute_w to False: 58 | 59 | >>> data = np.array([[1.5], [1.2]]) 60 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) 61 | >>> nmf_mdl = NMF(data, num_bases=2) 62 | >>> nmf_mdl.W = W 63 | >>> nmf_mdl.factorize(niter=20, compute_w=False) 64 | 65 | The result is a set of coefficients nmf_mdl.H, s.t. data = W * nmf_mdl.H. 
66 | """ 67 | 68 | # some small value 69 | _EPS = 10**-8 70 | 71 | def __init__(self, data, num_bases=4): 72 | 73 | def setup_logging(): 74 | # create logger 75 | self._logger = logging.getLogger("pymf") 76 | 77 | # add ch to logger 78 | if len(self._logger.handlers) < 1: 79 | # create console handler and set level to debug 80 | ch = logging.StreamHandler() 81 | ch.setLevel(logging.DEBUG) 82 | # create formatter 83 | formatter = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s") 84 | 85 | # add formatter to ch 86 | ch.setFormatter(formatter) 87 | 88 | self._logger.addHandler(ch) 89 | 90 | setup_logging() 91 | 92 | # set variables 93 | self.data = data 94 | self._num_bases = num_bases 95 | 96 | # initialize H and W to random values 97 | (self._data_dimension, self._num_samples) = self.data.shape 98 | 99 | 100 | def frobenius_norm(self): 101 | """ Frobenius norm (||data - WH||) of a data matrix and a low rank 102 | approximation given by WH 103 | 104 | Returns: 105 | frobenius norm: F = ||data - WH|| 106 | """ 107 | 108 | # check if W and H exist 109 | if hasattr(self,'H') and hasattr(self,'W') and not scipy.sparse.issparse(self.data): 110 | err = np.sqrt( np.sum((self.data[:,:] - np.dot(self.W, self.H))**2 )) 111 | else: 112 | err = -123456 113 | 114 | return err 115 | 116 | def init_w(self): 117 | self.W = np.random.random((self._data_dimension, self._num_bases)) 118 | 119 | def init_h(self): 120 | self.H = np.random.random((self._num_bases, self._num_samples)) 121 | 122 | def update_h(self): 123 | # pre init H1, and H2 (necessary for storing matrices on disk) 124 | H2 = np.dot(np.dot(self.W.T, self.W), self.H) + 10**-9 125 | self.H *= np.dot(self.W.T, self.data[:,:]) 126 | self.H /= H2 127 | 128 | def update_w(self): 129 | # pre init W1, and W2 (necessary for storing matrices on disk) 130 | W2 = np.dot(np.dot(self.W, self.H), self.H.T) + 10**-9 131 | self.W *= np.dot(self.data[:,:], self.H.T) 132 | self.W /= W2 133 | 134 | def converged(self, i): 135 | derr = np.abs(self.ferr[i] - self.ferr[i-1])/self._num_samples 136 | if derr < self._EPS: 137 | return True 138 | else: 139 | return False 140 | 141 | def factorize(self, niter=1, show_progress=False, 142 | compute_w=True, compute_h=True, compute_err=True): 143 | """ Factorize s.t. WH = data 144 | 145 | Parameters 146 | ---------- 147 | niter : int 148 | number of iterations. 149 | show_progress : bool 150 | print some extra information to stdout. 151 | compute_h : bool 152 | iteratively update values for H. 153 | compute_w : bool 154 | iteratively update values for W. 155 | compute_err : bool 156 | compute Frobenius norm |data-WH| after each update and store 157 | it to .ferr[k]. 158 | 159 | Updated Values 160 | -------------- 161 | .W : updated values for W. 162 | .H : updated values for H. 163 | .ferr : Frobenius norm |data-WH| for each iteration. 
164 | """ 165 | 166 | if show_progress: 167 | self._logger.setLevel(logging.INFO) 168 | else: 169 | self._logger.setLevel(logging.ERROR) 170 | 171 | # create W and H if they don't already exist 172 | # -> any custom initialization to W,H should be done before 173 | if not hasattr(self,'W'): 174 | self.init_w() 175 | 176 | if not hasattr(self,'H'): 177 | self.init_h() 178 | 179 | if compute_err: 180 | self.ferr = np.zeros(niter) 181 | 182 | for i in xrange(niter): 183 | if compute_w: 184 | self.update_w() 185 | 186 | if compute_h: 187 | self.update_h() 188 | 189 | if compute_err: 190 | self.ferr[i] = self.frobenius_norm() 191 | self._logger.info('Iteration ' + str(i+1) + '/' + str(niter) + 192 | ' FN:' + str(self.ferr[i])) 193 | else: 194 | self._logger.info('Iteration ' + str(i+1) + '/' + str(niter)) 195 | 196 | 197 | # check if the err is not changing anymore 198 | if i > 1 and compute_err: 199 | if self.converged(i): 200 | # adjust the error measure 201 | self.ferr = self.ferr[:i] 202 | break 203 | 204 | if __name__ == "__main__": 205 | import doctest 206 | doctest.testmod() 207 | -------------------------------------------------------------------------------- /pymf/sub.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (C) Christian Thurau, 2010. 4 | # Licensed under the GNU General Public License (GPL). 5 | # http://www.gnu.org/licenses/gpl.txt 6 | """ 7 | PyMF Matrix sampling methods 8 | 9 | SUB: apply one of the matrix factorization methods of PyMF 10 | on sampled data for computing W, then compute H. 11 | 12 | Copyright (C) Christian Thurau, 2010. GNU General Public License (GPL). 13 | """ 14 | 15 | 16 | 17 | import numpy as np 18 | import random 19 | #from itertools import combinations 20 | from .chnmf import combinations 21 | 22 | from . import dist 23 | from .chnmf import quickhull 24 | from .nmf import NMF 25 | from .pca import PCA 26 | from .kmeans import Kmeans 27 | from .laesa import LAESA 28 | from .sivm import SIVM 29 | 30 | __all__ = ["SUB"] 31 | 32 | class SUB(NMF): 33 | """ 34 | SUB(data, mfmethod, sstrategy='rand', nsub=20, show_progress=True, mapW=False, 35 | base_sel=2, num_bases=3 , niterH=1, niter=100, compute_h=True, compute_w=True, ) 36 | 37 | Evaluate a matrix factorization method "mfmethod" for a certain sampling 38 | strategy "sstrategy". This is particular useful for very large datasets. 39 | 40 | Parameters 41 | ---------- 42 | todo ... 43 | 44 | Attributes 45 | ---------- 46 | todo .... 
47 | """ 48 | 49 | def __init__(self, data, mfmethod, nsub=20, show_progress=True, mapW=False, base_sel=2, 50 | num_bases=3 , niterH=1, compute_h=True, compute_w=True, sstrategy='rand'): 51 | NMF.__init__(self, data, num_bases=num_bases, compute_h=compute_h, show_progress=show_progress, compute_w=compute_w) 52 | 53 | self._niterH = niterH 54 | self._nsub = nsub 55 | self.data = data 56 | self._mfmethod = mfmethod 57 | self._mapW = mapW 58 | self._sstrategy = sstrategy 59 | self._base_sel = base_sel 60 | 61 | # assign the correct distance function 62 | if self._sstrategy == 'cur': 63 | self._subfunc = self.curselect 64 | 65 | elif self._sstrategy == 'kmeans': 66 | self._subfunc = self.kmeansselect 67 | 68 | elif self._sstrategy == 'hull': 69 | self._subfunc = self.hullselect 70 | 71 | elif self._sstrategy == 'laesa': 72 | self._subfunc = self.laesaselect 73 | 74 | elif self._sstrategy == 'sivm': 75 | self._subfunc = self.sivmselect 76 | 77 | else: 78 | self._subfunc = self.randselect 79 | 80 | def hullselect(self): 81 | 82 | def selectHullPoints(data, n=20): 83 | """ select data points for pairwise projections of the first n 84 | dimensions """ 85 | 86 | # iterate over all projections and select data points 87 | idx = np.array([]) 88 | 89 | # iterate over some pairwise combinations of dimensions 90 | for i in combinations(range(n), 2): 91 | 92 | # sample convex hull points in 2D projection 93 | convex_hull_d = quickhull(data[i, :].T) 94 | 95 | # get indices for convex hull data points 96 | idx = np.append(idx, dist.vq(data[i, :], convex_hull_d.T)) 97 | idx = np.unique(idx) 98 | 99 | return np.int32(idx) 100 | 101 | 102 | # determine convex hull data points only if the total 103 | # amount of available data is >50 104 | #if self.data.shape[1] > 50: 105 | pcamodel = PCA(self.data, show_progress=self._show_progress) 106 | pcamodel.factorize() 107 | 108 | idx = selectHullPoints(pcamodel.H, n=self._base_sel) 109 | 110 | # set the number of subsampled data 111 | self.nsub = len(idx) 112 | 113 | return idx 114 | 115 | def kmeansselect(self): 116 | kmeans_mdl = Kmeans(self.data, num_bases=self._nsub) 117 | kmeans_mdl.initialization() 118 | kmeans_mdl.factorize() 119 | 120 | # pick data samples closest to the centres 121 | idx = dist.vq(kmeans_mdl.data, kmeans_mdl.W) 122 | return idx 123 | 124 | def curselect(self): 125 | def sample_probability(): 126 | dsquare = self.data[:,:]**2 127 | 128 | pcol = np.array(dsquare.sum(axis=0)) 129 | pcol /= pcol.sum() 130 | 131 | return (pcol.reshape(-1,1)) 132 | 133 | probs = sample_probability() 134 | prob_cols = np.cumsum(probs.flatten()) #.flatten() 135 | temp_ind = np.zeros(self._nsub, np.int32) 136 | 137 | for i in range(self._nsub): 138 | tempI = np.where(prob_cols >= np.random.rand())[0] 139 | temp_ind[i] = tempI[0] 140 | 141 | return np.sort(temp_ind) 142 | 143 | def sivmselect(self): 144 | sivmmdl = SIVM(self.data, num_bases=self._nsub, compute_w=True, compute_h=False, dist_measure='cosine') 145 | 146 | sivmmdl.initialization() 147 | sivmmdl.factorize() 148 | idx = sivmmdl.select 149 | return idx 150 | 151 | def laesaselect(self): 152 | laesamdl = LAESA(self.data, num_bases=self._nsub, compute_w=True, compute_h=False, dist_measure='cosine') 153 | laesamdl.initialization() 154 | laesamdl.factorize() 155 | idx = laesamdl.select 156 | return idx 157 | 158 | 159 | def randselect(self): 160 | idx = random.sample(xrange(self._num_samples), self._nsub) 161 | return np.sort(np.int32(idx)) 162 | 163 | def update_w(self): 164 | 165 | idx = self._subfunc() 166 | 
169 | idx = np.sort(np.int32(idx))
170 | 
171 | # factorize the sampled columns to obtain the basis vectors W
172 | mdl_small = self._mfmethod(self.data[:, idx],
173 | num_bases=self._num_bases)
174 | mdl_small.factorize(show_progress=self._show_progress)
175 | 
176 | # the full model only needs to compute H for the given W
177 | self.mdl = self._mfmethod(self.data[:, :],
178 | num_bases=self._num_bases)
179 | 
180 | if self._mapW:
181 | # compute pairwise distances
182 | #distance = vq(self.data, self.W)
183 | _Wmapped_index = dist.vq(self.mdl.data, mdl_small.W)
184 | 
185 | # do not directly assign, i.e. Wdist = self.data[:,sel]
186 | # as self might be unsorted (in non ascending order)
187 | # -> sorting sel would screw the matching to W if
188 | # self.data is stored as a hdf5 table (see h5py)
189 | for i,s in enumerate(_Wmapped_index):
190 | self.mdl.W[:,i] = self.mdl.data[:,s]
191 | else:
192 | self.mdl.W = np.copy(mdl_small.W)
193 | 
194 | def update_h(self):
195 | self.mdl.factorize(niter=self._niterH, compute_w=False,
196 | show_progress=self._show_progress)
197 | 
198 | def factorize(self):
199 | """Do factorization s.t. data = dot(dot(data,beta),H), under the convexity constraint
200 | beta >= 0, sum(beta) = 1, H >= 0, sum(H) = 1
201 | """
202 | # compute new coefficients for reconstructing data points
203 | self.update_w()
204 | 
205 | # for CHNMF it is sometimes useful to only compute
206 | # the basis vectors
207 | if self._compute_h:
208 | self.update_h()
209 | 
210 | self.W = self.mdl.W
211 | self.H = self.mdl.H
212 | 
213 | self.ferr = np.zeros(1)
214 | self.ferr[0] = self.mdl.frobenius_norm()
215 | self._logger.info('FN:' + str(self.ferr[0]))
216 | 
217 | if __name__ == "__main__":
218 | import doctest
219 | doctest.testmod()
220 | 
-------------------------------------------------------------------------------- /pymf/gmap.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright (C) Christian Thurau, 2010.
4 | # Licensed under the GNU General Public License (GPL).
5 | # http://www.gnu.org/licenses/gpl.txt
6 | """
7 | PyMF Geometric-Map
8 | 
9 | GMAP: Class for Geometric-Map
10 | """
11 | 
12 | 
13 | import scipy.sparse
14 | import numpy as np
15 | 
16 | from .dist import *
17 | from .aa import AA
18 | from .kmeans import Kmeans
19 | 
20 | __all__ = ["GMAP"]
21 | 
22 | class GMAP(AA):
23 | """
24 | GMAP(data, num_bases=4, method='pca', robust_map=True)
25 | 
26 | 
27 | Geometric-Map. Factorize a data matrix into two matrices s.t.
28 | F = | data - W*H | is minimal. G-MAP can emulate/approximate several
29 | standard methods including PCA, NMF, and AA.
30 | 
31 | Parameters
32 | ----------
33 | data : array_like, shape (_data_dimension, _num_samples)
34 | the input data
35 | num_bases: int, optional
36 | Number of bases to compute (column rank of W and row rank of H).
37 | 4 (default)
38 | method : one of 'pca' ,'nmf', 'aa', default is 'pca' which emulates
39 | Principal Component Analysis using the geometric map method ('nmf'
40 | emulates Non-negative Matrix Factorization, 'aa' emulates Archetypal
41 | Analysis).
42 | robust_map : bool, optional
43 | use robust_map or the standard max-val selection
44 | [see "On FastMap and the Convex Hull of Multivariate Data: Toward
45 | Fast and Robust Dimension Reduction", Ostrouchov and Samatova, PAMI
46 | 2005]
47 | Attributes
48 | ----------
49 | W : "data_dimension x num_bases" matrix of basis vectors
50 | H : "num bases x num_samples" matrix of coefficients
51 | ferr : frobenius norm (after calling .factorize())
52 | 
53 | Example
54 | -------
55 | Applying GMAP to some rather stupid data set:
56 | 
57 | >>> import numpy as np
58 | >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
59 | >>> gmap_mdl = GMAP(data, num_bases=2)
60 | >>> gmap_mdl.factorize()
61 | 
62 | The basis vectors are now stored in gmap_mdl.W, the coefficients in gmap_mdl.H.
63 | To compute coefficients for an existing set of basis vectors simply copy W
64 | to gmap_mdl.W, and set compute_w to False:
65 | 
66 | >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
67 | >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
68 | >>> gmap_mdl = GMAP(data, num_bases=2)
69 | >>> gmap_mdl.W = W
70 | >>> gmap_mdl.factorize(compute_w=False)
71 | 
72 | The result is a set of coefficients gmap_mdl.H, s.t. data = W * gmap_mdl.H.
73 | """
74 | 
75 | # always overwrite the default number of iterations
76 | # -> any other value does not make sense.
77 | _NITER = 1
78 | 
79 | def __init__(self, data, num_bases=4, method='pca', robust_map=True):
80 | 
81 | AA.__init__(self, data, num_bases=num_bases)
82 | self.sub = []
83 | self._robust_map = robust_map
84 | self._method = method
85 | 
86 | 
87 | def init_h(self):
88 | self.H = np.zeros((self._num_bases, self._num_samples))
89 | 
90 | def init_w(self):
91 | self.W = np.zeros((self._data_dimension, self._num_bases))
92 | 
93 | def update_w(self):
94 | """ compute new W """
95 | 
96 | def select_next(iterval):
97 | """ select the next best data sample using robust map
98 | or simply the max iterval ...
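(iterval holds the current selection score of every column; the index of
the chosen sample is returned)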
""" 99 | 100 | if self._robust_map: 101 | k = np.argsort(iterval)[::-1] 102 | d_sub = self.data[:,k[:self._robust_nselect]] 103 | self.sub.extend(k[:self._robust_nselect]) 104 | 105 | # cluster d_sub 106 | kmeans_mdl = Kmeans(d_sub, num_bases=self._robust_cluster) 107 | kmeans_mdl.factorize(niter=10) 108 | 109 | # get largest cluster 110 | h = np.histogram(kmeans_mdl.assigned, range(self._robust_cluster+1))[0] 111 | largest_cluster = np.argmax(h) 112 | sel = pdist(kmeans_mdl.W[:, largest_cluster:largest_cluster+1], d_sub) 113 | sel = k[np.argmin(sel)] 114 | else: 115 | sel = np.argmax(iterval) 116 | 117 | return sel 118 | 119 | EPS = 10**-8 120 | 121 | if scipy.sparse.issparse(self.data): 122 | norm_data = np.sqrt(self.data.multiply(self.data).sum(axis=0)) 123 | norm_data = np.array(norm_data).reshape((-1)) 124 | else: 125 | norm_data = np.sqrt(np.sum(self.data**2, axis=0)) 126 | 127 | 128 | self.select = [] 129 | 130 | if self._method == 'pca' or self._method == 'aa': 131 | iterval = norm_data.copy() 132 | 133 | if self._method == 'nmf': 134 | iterval = np.sum(self.data, axis=0)/(np.sqrt(self.data.shape[0])*norm_data) 135 | iterval = 1.0 - iterval 136 | 137 | self.select.append(select_next(iterval)) 138 | 139 | 140 | for l in range(1, self._num_bases): 141 | 142 | if scipy.sparse.issparse(self.data): 143 | c = self.data[:, self.select[-1]:self.select[-1]+1].T * self.data 144 | c = np.array(c.todense()) 145 | else: 146 | c = np.dot(self.data[:,self.select[-1]], self.data) 147 | 148 | c = c/(norm_data * norm_data[self.select[-1]]) 149 | 150 | if self._method == 'pca': 151 | c = 1.0 - np.abs(c) 152 | c = c * norm_data 153 | 154 | elif self._method == 'aa': 155 | c = (c*-1.0 + 1.0)/2.0 156 | c = c * norm_data 157 | 158 | elif self._method == 'nmf': 159 | c = 1.0 - np.abs(c) 160 | 161 | ### update the estimated volume 162 | iterval = c * iterval 163 | 164 | # detect the next best data point 165 | self.select.append(select_next(iterval)) 166 | 167 | self._logger.info('cur_nodes: ' + str(self.select)) 168 | 169 | # sort indices, otherwise h5py won't work 170 | self.W = self.data[:, np.sort(self.select)] 171 | 172 | # "unsort" it again to keep the correct order 173 | self.W = self.W[:, np.argsort(np.argsort(self.select))] 174 | 175 | def factorize(self, show_progress=False, compute_w=True, compute_h=True, 176 | compute_err=True, robust_cluster=3, niter=1, robust_nselect=-1): 177 | """ Factorize s.t. WH = data 178 | 179 | Parameters 180 | ---------- 181 | show_progress : bool 182 | print some extra information to stdout. 183 | False, default 184 | compute_h : bool 185 | iteratively update values for H. 186 | True, default 187 | compute_w : bool 188 | iteratively update values for W. 189 | default, True 190 | compute_err : bool 191 | compute Frobenius norm |data-WH| after each update and store 192 | it to .ferr[k]. 193 | robust_cluster : int, optional 194 | set the number of clusters for robust map selection. 195 | 3, default 196 | robust_nselect : int, optional 197 | set the number of samples to consider for robust map 198 | selection. 199 | -1, default (automatically determine suitable number) 200 | 201 | Updated Values 202 | -------------- 203 | .W : updated values for W. 204 | .H : updated values for H. 205 | .ferr : Frobenius norm |data-WH|. 
206 |         """
207 |         self._robust_cluster = robust_cluster
208 |         self._robust_nselect = robust_nselect
209 | 
210 |         if self._robust_nselect == -1:
211 |             self._robust_nselect = int(np.round(np.log(self.data.shape[1]) * 2))
212 | 
213 |         AA.factorize(self, niter=1, show_progress=show_progress,
214 |                      compute_w=compute_w, compute_h=compute_h,
215 |                      compute_err=compute_err)
216 | 
217 | if __name__ == "__main__":
218 |     import doctest
219 |     doctest.testmod()
220 | 
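GMAP.update_w() above scores all samples, picks one index per round via select_next(), and finally copies the chosen data columns into W. A minimal end-to-end sketch of the robust selection path, assuming GMAP is re-exported at the package level and cvxopt is available; the random data matrix is made up for illustration:

import numpy as np
from pymf import GMAP

# toy data: 2-dimensional samples as columns
data = np.random.rand(2, 100)

# robust_map=True routes every pick through the k-means based
# select_next(); robust_cluster / robust_nselect tune that step
gmap_mdl = GMAP(data, num_bases=3, method='pca', robust_map=True)
gmap_mdl.factorize(robust_cluster=3, robust_nselect=-1)

print(gmap_mdl.select)    # indices of the chosen samples
print(gmap_mdl.W.shape)   # (2, 3) -- the corresponding data columns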
--------------------------------------------------------------------------------
/pymf/chnmf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright (C) Christian Thurau, 2010.
4 | # Licensed under the GNU General Public License (GPL).
5 | # http://www.gnu.org/licenses/gpl.txt
6 | """
7 | PyMF Convex Hull Non-negative Matrix Factorization [1]
8 | 
9 | CHNMF(AA) : Class for Convex-hull NMF
10 | quickhull : Function for finding the convex hull in 2D
11 | 
12 | [1] C. Thurau, K. Kersting, and C. Bauckhage. Convex Non-Negative Matrix
13 | Factorization in the Wild. ICDM 2009.
14 | """
15 | 
16 | 
17 | import numpy as np
18 | 
19 | from itertools import combinations
20 | from .dist import vq
21 | from .pca import PCA
22 | from .aa import AA
23 | 
24 | __all__ = ["CHNMF"]
25 | 
26 | 
27 | def quickhull(sample):
28 |     """ Find data points on the convex hull of a supplied data set
29 | 
30 |     Args:
31 |         sample: data points as row vectors, n x d
32 |             n - number of samples
33 |             d - data dimension (should be two)
34 | 
35 |     Returns:
36 |         a k x d matrix containing the convex hull data points
37 |     """
38 | 
39 |     link = lambda a, b: np.concatenate((a, b[1:]))
40 |     edge = lambda a, b: np.concatenate(([a], [b]))
41 | 
42 |     def dome(sample, base):
43 |         h, t = base
44 |         dists = np.dot(sample - h, np.dot(((0, -1), (1, 0)), (t - h)))
45 |         outer = np.repeat(sample, dists > 0, axis=0)
46 | 
47 |         if len(outer):
48 |             pivot = sample[np.argmax(dists)]
49 |             return link(dome(outer, edge(h, pivot)),
50 |                         dome(outer, edge(pivot, t)))
51 |         else:
52 |             return base
53 | 
54 |     if len(sample) > 2:
55 |         axis = sample[:, 0]
56 |         base = np.take(sample, [np.argmin(axis), np.argmax(axis)], axis=0)
57 |         return link(dome(sample, base),
58 |                     dome(sample, base[::-1]))
59 |     else:
60 |         return sample
61 | 
62 | class CHNMF(AA):
63 |     """
64 |     CHNMF(data, num_bases=4)
65 | 
66 |     Convex Hull Non-negative Matrix Factorization. Factorize a data matrix into
67 |     two matrices s.t. F = | data - W*H | is minimal. H is restricted to convexity
68 |     (H >= 0, sum(H, axis=1) = [1 .. 1]) and W resides on actual data points.
69 |     Factorization is solved via an alternating least squares optimization using
70 |     the quadratic programming solver from cvxopt. The results are usually
71 |     equivalent to Archetypal Analysis (pymf.AA) but CHNMF also works for very
72 |     large datasets.
73 | 
74 |     Parameters
75 |     ----------
76 |     data : array_like, shape (_data_dimension, _num_samples)
77 |         the input data
78 |     num_bases: int, optional
79 |         Number of bases to compute (column rank of W and row rank of H).
80 |         4 (default)
81 |     base_sel: int,
82 |         Number of pairwise basis vector projections. Set to a value < rank(data).
83 |         Computation time scales exponentially with this value; usually rather
84 |         low values are sufficient (3-10).
85 | 
86 |     Attributes
87 |     ----------
88 |     W : "data_dimension x num_bases" matrix of basis vectors
89 |     H : "num bases x num_samples" matrix of coefficients
90 |     ferr : Frobenius norm (after calling .factorize())
91 | 
92 |     Example
93 |     -------
94 |     Applying CHNMF to a simple data set:
95 | 
96 |     >>> import numpy as np
97 |     >>> from pymf.chnmf import CHNMF
98 |     >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
99 | 
100 |     Use 2 basis vectors -> W has shape (data_dimension, 2).
101 | 
102 |     >>> chnmf_mdl = CHNMF(data, num_bases=2)
103 | 
104 |     And start computing the factorization.
105 | 
106 |     >>> chnmf_mdl.factorize()
107 | 
108 |     The basis vectors are now stored in chnmf_mdl.W, the coefficients in
109 |     chnmf_mdl.H. To compute coefficients for an existing set of basis vectors
110 |     simply copy W to chnmf_mdl.W, and set compute_w to False:
111 | 
112 |     >>> data = np.array([[1.5, 2.0], [1.2, 1.8]])
113 |     >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
114 |     >>> chnmf_mdl = CHNMF(data, num_bases=2)
115 |     >>> chnmf_mdl.W = W
116 |     >>> chnmf_mdl.factorize(compute_w=False)
117 | 
118 |     The result is a set of coefficients chnmf_mdl.H, s.t. data = W * chnmf_mdl.H.
119 |     """
120 | 
121 |     def __init__(self, data, num_bases=4, base_sel=3):
122 | 
123 |         # call inherited method
124 |         AA.__init__(self, data, num_bases=num_bases)
125 | 
126 |         # base_sel should never be larger than the actual data dimension
127 |         self._base_sel = base_sel
128 |         if base_sel > self.data.shape[0]:
129 |             self._base_sel = self.data.shape[0]
130 | 
131 |     def init_h(self):
132 |         self.H = np.zeros((self._num_bases, self._num_samples))
133 | 
134 |     def init_w(self):
135 |         self.W = np.zeros((self._data_dimension, self._num_bases))
136 | 
137 |     def _map_w_to_data(self):
138 |         """ Return data points that are most similar to basis vectors W
139 |         """
140 | 
141 |         # assign W to the next best data sample
142 |         self._Wmapped_index = vq(self.data, self.W)
143 |         self.Wmapped = np.zeros(self.W.shape)
144 | 
145 |         # do not directly assign, i.e. Wdist = self.data[:,sel],
146 |         # as sel might be unsorted (not in ascending order)
147 |         # -> sorting sel would break the matching to W if
148 |         # self.data is stored as a hdf5 table (see h5py)
149 |         for i, s in enumerate(self._Wmapped_index):
150 |             self.Wmapped[:,i] = self.data[:,s]
151 | 
152 |     def update_w(self):
153 |         """ compute new W """
154 |         def select_hull_points(data, n=3):
155 |             """ select data points for pairwise projections of the first n
156 |             dimensions """
157 | 
158 |             # iterate over all projections and select data points
159 |             idx = np.array([])
160 | 
161 |             # iterate over some pairwise combinations of dimensions
162 |             for i in combinations(range(n), 2):
163 |                 # sample convex hull points in 2D projection
164 |                 convex_hull_d = quickhull(data[i, :].T)
165 | 
166 |                 # get indices for convex hull data points
167 |                 idx = np.append(idx, vq(data[i, :], convex_hull_d.T))
168 |                 idx = np.unique(idx)
169 | 
170 |             return np.int32(idx)
171 | 
172 |         # determine convex hull data points using either PCA or random
173 |         # projections
174 |         method = 'randomprojection'
175 |         if method == 'pca':
176 |             pcamodel = PCA(self.data)
177 |             pcamodel.factorize(show_progress=False)
178 |             proj = pcamodel.H
179 |         else:
180 |             R = np.random.randn(self._base_sel, self._data_dimension)
181 |             proj = np.dot(R, self.data)
182 | 
183 |         self._hull_idx = select_hull_points(proj, n=self._base_sel)
184 |         aa_mdl = AA(self.data[:, self._hull_idx], num_bases=self._num_bases)
185 | 
186 |         # determine W
187 |         aa_mdl.factorize(niter=50, compute_h=True, compute_w=True,
188 |                          compute_err=True, show_progress=False)
189 | 
190 |         self.W = aa_mdl.W
191 |         self._map_w_to_data()
192 | 
193 |     def factorize(self, show_progress=False, compute_w=True, compute_h=True,
194 |                   compute_err=True, niter=1):
195 |         """ Factorize s.t. WH = data
196 | 
197 |         Parameters
198 |         ----------
199 |         show_progress : bool
200 |             print some extra information to stdout.
201 |         compute_h : bool
202 |             iteratively update values for H.
203 |         compute_w : bool
204 |             iteratively update values for W.
205 |         compute_err : bool
206 |             compute Frobenius norm |data-WH| after each update and store
207 |             it to .ferr[k].
208 | 
209 |         Updated Values
210 |         --------------
211 |         .W : updated values for W.
212 |         .H : updated values for H.
213 |         .ferr : Frobenius norm |data-WH|.
214 |         """
215 | 
216 |         AA.factorize(self, niter=1, show_progress=show_progress,
217 |                      compute_w=compute_w, compute_h=compute_h,
218 |                      compute_err=compute_err)
219 | 
220 | 
221 | if __name__ == "__main__":
222 |     import doctest
223 |     doctest.testmod()
224 | 
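quickhull() above expects points as rows and recursively splits the set along the line through the two horizontal extremes, so it can be exercised independently of CHNMF. A small sketch on a hand-made 2-D point set (toy values, for illustration only):

import numpy as np
from pymf.chnmf import quickhull

# five points as rows; the interior point must not appear on the hull
points = np.array([[0.0, 0.0],
                   [1.0, 0.0],
                   [1.0, 1.0],
                   [0.0, 1.0],
                   [0.5, 0.5]])

# corners of the square in order; the first corner is repeated
# at the end to close the polygon, the center point is dropped
print(quickhull(points))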
--------------------------------------------------------------------------------
/pymf/svd.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright (C) Christian Thurau, 2010.
4 | # Licensed under the GNU General Public License (GPL).
5 | # http://www.gnu.org/licenses/gpl.txt
6 | """
7 | PyMF Singular Value Decomposition.
8 | 
9 | SVD : Class for Singular Value Decomposition
10 | pinv() : Compute the pseudoinverse of a Matrix
11 | 
12 | """
13 | 
14 | 
15 | 
16 | from numpy.linalg import eigh
17 | import scipy.sparse
18 | 
19 | try:
20 |     import scipy.sparse.linalg.eigen.arpack as linalg
21 | except (ImportError, AttributeError):
22 |     import scipy.sparse.linalg as linalg
23 | 
24 | 
25 | import numpy as np
26 | 
27 | def pinv(A, k=-1, eps=10**-8):
28 |     # compute the pseudoinverse of a matrix via its SVD;
29 |     # singular values below eps are treated as zero
30 |     svd_mdl = SVD(A, k=k)
31 |     svd_mdl.factorize()
32 | 
33 |     S = svd_mdl.S
34 |     Sdiag = S.diagonal()
35 |     Sdiag = np.where(Sdiag > eps, 1.0/Sdiag, 0.0)
36 | 
37 |     for i in range(S.shape[0]):
38 |         S[i,i] = Sdiag[i]
39 | 
40 |     if scipy.sparse.issparse(A):
41 |         A_p = svd_mdl.V.T * (S * svd_mdl.U.T)
42 |     else:
43 |         A_p = np.dot(svd_mdl.V.T, np.multiply(np.diag(S)[:,np.newaxis], svd_mdl.U.T))
44 | 
45 |     return A_p
46 | 
47 | 
48 | class SVD():
49 |     """
50 |     SVD(data, k=-1, rrank=0, crank=0)
51 | 
52 | 
53 |     Singular Value Decomposition. Factorize a data matrix into three matrices s.t.
54 |     F = | data - USV | is minimal. U and V correspond to eigenvectors of the matrices
55 |     data*data.T and data.T*data.
56 | 
57 |     Parameters
58 |     ----------
59 |     data : array_like [data_dimension x num_samples]
60 |         the input data
61 | 
62 |     Attributes
63 |     ----------
64 |     U,S,V : submatrices s.t. data = USV
65 | 
66 |     Example
67 |     -------
68 |     >>> import numpy as np
69 |     >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
70 |     >>> svd_mdl = SVD(data)
71 |     >>> svd_mdl.factorize()
72 |     """
73 | 
74 |     _EPS=10**-8
75 | 
76 |     def __init__(self, data, k=-1, rrank=0, crank=0):
77 |         self.data = data
78 |         (self._rows, self._cols) = self.data.shape
79 |         if rrank > 0:
80 |             self._rrank = rrank
81 |         else:
82 |             self._rrank = self._rows
83 | 
84 |         if crank > 0:
85 |             self._crank = crank
86 |         else:
87 |             self._crank = self._cols
88 | 
89 |         # number of eigenvectors to compute (-1 means full rank)
90 |         self._k = k
91 | 
92 |     def frobenius_norm(self):
93 |         """ Frobenius norm (||data - USV||) for a data matrix and a low rank
94 |         approximation given by USV using rank k for U and V
95 | 
96 |         Returns:
97 |             frobenius norm: F = ||data - USV||
98 |         """
99 |         if scipy.sparse.issparse(self.data):
100 |             err = self.data - self.U*self.S*self.V
101 |             err = err.multiply(err)
102 |             err = np.sqrt(err.sum())
103 |         else:
104 |             err = self.data[:,:] - np.dot(np.dot(self.U, self.S), self.V)
105 |             err = np.sqrt(np.sum(err**2))
106 | 
107 |         return err
108 | 
109 | 
110 |     def factorize(self):
111 |         def _right_svd():
112 |             AA = np.dot(self.data[:,:], self.data[:,:].T)
113 |             values, u_vectors = eigh(AA)
114 | 
115 |             # get rid of too low eigenvalues
116 |             u_vectors = u_vectors[:, values > self._EPS]
117 |             values = values[values > self._EPS]
118 | 
119 |             # sort eigenvectors according to largest value
120 |             idx = np.argsort(values)
121 |             values = values[idx[::-1]]
122 | 
123 |             # argsort sorts in ascending order -> access is backwards
124 |             self.U = u_vectors[:,idx[::-1]]
125 | 
126 |             # compute S
127 |             self.S = np.diag(np.sqrt(values))
128 | 
129 |             # and the inverse of it
130 |             S_inv = np.diag(np.sqrt(values)**-1)
131 | 
132 |             # compute V from it
133 |             self.V = np.dot(S_inv, np.dot(self.U[:,:].T, self.data[:,:]))
134 | 
135 | 
136 |         def _left_svd():
137 |             AA = np.dot(self.data[:,:].T, self.data[:,:])
138 |             values, v_vectors = eigh(AA)
139 | 
140 |             # get rid of too low eigenvalues
141 |             v_vectors = v_vectors[:, values > self._EPS]
142 |             values = values[values > self._EPS]
143 | 
144 |             # sort eigenvectors according to largest value
145 |             # argsort sorts in ascending order -> access is backwards
146 |             idx = np.argsort(values)[::-1]
147 |             values = values[idx]
148 | 
149 |             # compute S
150 |             self.S = np.diag(np.sqrt(values))
151 | 
152 |             # and the inverse of it
153 |             S_inv = np.diag(1.0/np.sqrt(values))
154 | 
155 |             Vtmp = v_vectors[:,idx]
156 | 
157 |             self.U = np.dot(np.dot(self.data[:,:], Vtmp), S_inv)
158 |             self.V = Vtmp.T
159 | 
160 |         def _sparse_right_svd():
161 |             # for some reason arpack does not allow computing all rank(A) eigenvectors
162 |             AA = self.data*self.data.transpose()
163 |             if self.data.shape[0] > 1:
164 |                 # do not compute full rank if desired
165 |                 if self._k > 0 and self._k < self.data.shape[0]-1:
166 |                     k = self._k
167 |                 else:
168 |                     k = self.data.shape[0]-1
169 | 
170 |                 try:
171 |                     values, u_vectors = linalg.eigen_symmetric(AA,k=k)
172 |                 except AttributeError:
173 |                     values, u_vectors = linalg.eigsh(AA,k=k)
174 |             else:
175 |                 values, u_vectors = eigh(AA.todense())
176 | 
177 |             # get rid of too low eigenvalues
178 |             u_vectors = u_vectors[:, values > self._EPS]
179 |             values = values[values > self._EPS]
180 | 
181 |             # sort eigenvectors according to largest value
182 |             idx = np.argsort(values)
183 |             values = values[idx[::-1]]
184 | 
185 |             # argsort sorts in ascending order -> access is backwards
186 |             self.U = scipy.sparse.csc_matrix(u_vectors[:,idx[::-1]])
187 | 
188 |             # compute S
189 |             self.S = scipy.sparse.csc_matrix(np.diag(np.sqrt(values)))
190 | 
191 |             # and the inverse of it
192 |             S_inv = scipy.sparse.csc_matrix(np.diag(1.0/np.sqrt(values)))
193 | 
194 |             # compute V from it
195 |             self.V = self.U.transpose() * self.data
196 |             self.V = S_inv * self.V
197 | 
198 |         def _sparse_left_svd():
199 |             # for some reason arpack does not allow computing all rank(A) eigenvectors
200 |             AA = self.data.transpose()*self.data
201 | 
202 |             if self.data.shape[1] > 1:
203 |                 # do not compute full rank if desired
204 |                 if self._k > 0 and self._k < self.data.shape[1]-1:
205 |                     k = self._k
206 |                 else:
207 |                     k = self.data.shape[1]-1
208 |                 try:
209 |                     values, v_vectors = linalg.eigen_symmetric(AA,k=k)
210 |                 except AttributeError:
211 |                     values, v_vectors = linalg.eigsh(AA,k=k)
212 |             else:
213 |                 values, v_vectors = eigh(AA.todense())
214 |             # get rid of too low eigenvalues
215 |             v_vectors = v_vectors[:, values > self._EPS]
216 |             values = values[values > self._EPS]
217 | 
218 |             # sort eigenvectors according to largest value
219 |             idx = np.argsort(values)
220 |             values = values[idx[::-1]]
221 | 
222 |             # argsort sorts in ascending order -> access is backwards
223 |             self.V = scipy.sparse.csc_matrix(v_vectors[:,idx[::-1]])
224 | 
225 |             # compute S
226 |             self.S = scipy.sparse.csc_matrix(np.diag(np.sqrt(values)))
227 | 
228 |             # and the inverse of it
229 |             S_inv = scipy.sparse.csc_matrix(np.diag(1.0/np.sqrt(values)))
230 | 
231 |             self.U = self.data * self.V * S_inv
232 |             self.V = self.V.transpose()
233 | 
234 | 
235 |         if self._rows > self._cols:
236 |             if scipy.sparse.issparse(self.data):
237 |                 _sparse_left_svd()
238 |             else:
239 |                 _left_svd()
240 |         else:
241 |             if scipy.sparse.issparse(self.data):
242 |                 _sparse_right_svd()
243 |             else:
244 |                 _right_svd()
245 | 
246 | if __name__ == "__main__":
247 |     import doctest
248 |     doctest.testmod()
249 | 
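pinv() above inverts the non-negligible singular values and recombines the factors, i.e. it returns V.T * S^-1 * U.T. A quick sanity check of the Moore-Penrose property A * pinv(A) * A = A on a small dense matrix (toy values, for illustration):

import numpy as np
from pymf.svd import pinv

A = np.array([[1.0, 0.0, 2.0],
              [0.0, 1.0, 1.0]])

A_p = pinv(A)   # shape (3, 2)

# the pseudoinverse must reproduce A when sandwiched this way
print(np.allclose(np.dot(np.dot(A, A_p), A), A))   # True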
--------------------------------------------------------------------------------
/pymf/sivm.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright (C) Christian Thurau, 2010.
4 | # Licensed under the GNU General Public License (GPL).
5 | # http://www.gnu.org/licenses/gpl.txt
6 | """
7 | PyMF Simplex Volume Maximization [1]
8 | 
9 | SIVM: class for SiVM
10 | 
11 | [1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume
12 | Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int.
13 | Conf. on Information and Knowledge Management. ACM. 2010.
14 | """
15 | 
16 | 
17 | import scipy.sparse
18 | import numpy as np
19 | 
20 | from .dist import *
21 | from .aa import AA
22 | 
23 | __all__ = ["SIVM"]
24 | 
25 | class SIVM(AA):
26 |     """
27 |     SIVM(data, num_bases=4, dist_measure='l2')
28 | 
29 | 
30 |     Simplex Volume Maximization. Factorize a data matrix into two matrices s.t.
31 |     F = | data - W*H | is minimal. H is restricted to convexity. W is iteratively
32 |     found by maximizing the volume of the resulting simplex (see [1]).
33 | 
34 |     Parameters
35 |     ----------
36 |     data : array_like, shape (_data_dimension, _num_samples)
37 |         the input data
38 |     num_bases: int, optional
39 |         Number of bases to compute (column rank of W and row rank of H).
40 |         4 (default)
41 |     dist_measure : one of 'l2', 'cosine', 'abs_cosine', 'weighted_abs_cosine', 'l1', 'kl'
42 |         The default is 'l2', which maximizes the volume of the simplex. In
43 |         contrast, 'cosine' maximizes the volume of a cone (see [1] for details).
44 |     init : string (default: 'fastmap')
45 |         'fastmap' or 'origin'. Sets the method used for finding the very first
46 |         basis vector. 'origin' assumes the zero vector, 'fastmap' picks one of
47 |         the two vectors that have the largest pairwise distance.
48 |     Attributes
49 |     ----------
50 |     W : "data_dimension x num_bases" matrix of basis vectors
51 |     H : "num bases x num_samples" matrix of coefficients
52 |     ferr : Frobenius norm (after calling .factorize())
53 | 
54 |     Example
55 |     -------
56 |     Applying SIVM to a simple data set:
57 | 
58 |     >>> import numpy as np
59 |     >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
60 |     >>> sivm_mdl = SIVM(data, num_bases=2)
61 |     >>> sivm_mdl.factorize()
62 | 
63 |     The basis vectors are now stored in sivm_mdl.W, the coefficients in sivm_mdl.H.
64 |     To compute coefficients for an existing set of basis vectors simply copy W
65 |     to sivm_mdl.W, and set compute_w to False:
66 | 
67 |     >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
68 |     >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
69 |     >>> sivm_mdl = SIVM(data, num_bases=2)
70 |     >>> sivm_mdl.W = W
71 |     >>> sivm_mdl.factorize(compute_w=False)
72 | 
73 |     The result is a set of coefficients sivm_mdl.H, s.t. data = W * sivm_mdl.H.
74 |     """
75 | 
76 |     # always overwrite the default number of iterations
77 |     # -> any other value does not make sense.
78 |     _NITER = 1
79 | 
80 |     def __init__(self, data, num_bases=4, dist_measure='l2', init='fastmap'):
81 | 
82 |         AA.__init__(self, data, num_bases=num_bases)
83 | 
84 |         self._dist_measure = dist_measure
85 |         self._init = init
86 | 
87 |         # assign the correct distance function
88 |         if self._dist_measure == 'l1':
89 |             self._distfunc = l1_distance
90 | 
91 |         elif self._dist_measure == 'l2':
92 |             self._distfunc = l2_distance
93 | 
94 |         elif self._dist_measure == 'cosine':
95 |             self._distfunc = cosine_distance
96 | 
97 |         elif self._dist_measure == 'abs_cosine':
98 |             self._distfunc = abs_cosine_distance
99 | 
100 |         elif self._dist_measure == 'weighted_abs_cosine':
101 |             self._distfunc = weighted_abs_cosine_distance
102 | 
103 |         elif self._dist_measure == 'kl':
104 |             self._distfunc = kl_divergence
105 |         else:  # fail fast instead of raising an AttributeError later
106 |             raise ValueError('unknown dist_measure: ' + str(dist_measure))
107 |     def _distance(self, idx):
108 |         """ compute distances of a specific data point to all other samples"""
109 | 
110 |         if scipy.sparse.issparse(self.data):
111 |             step = self.data.shape[1]
112 |         else:
113 |             step = 50000
114 | 
115 |         d = np.zeros((self.data.shape[1]))
116 |         if idx == -1:
117 |             # set vec to origin if idx=-1
118 |             vec = np.zeros((self.data.shape[0], 1))
119 |             if scipy.sparse.issparse(self.data):
120 |                 vec = scipy.sparse.csc_matrix(vec)
121 |         else:
122 |             vec = self.data[:, idx:idx+1]
123 | 
124 |         self._logger.info('compute distance to node ' + str(idx))
125 | 
126 |         # slice data into smaller chunks
127 |         for idx_start in range(0, self.data.shape[1], step):
128 |             if idx_start + step > self.data.shape[1]:
129 |                 idx_end = self.data.shape[1]
130 |             else:
131 |                 idx_end = idx_start + step
132 | 
133 |             d[idx_start:idx_end] = self._distfunc(
134 |                 self.data[:,idx_start:idx_end], vec)
135 |             self._logger.info('completed:' +
136 |                 str(idx_end/(self.data.shape[1]/100.0)) + "%")
137 |         return d
138 | 
139 |     def init_h(self):
140 |         self.H = np.zeros((self._num_bases, self._num_samples))
141 | 
142 |     def init_w(self):
143 |         self.W = np.zeros((self._data_dimension, self._num_bases))
144 | 
145 |     def init_sivm(self):
146 |         self.select = []
147 |         if self._init == 'fastmap':
148 |             # FastMap-like initialization
149 |             # set the starting index for fastmap initialization
150 |             cur_p = 0
151 | 
152 |             # after 3 iterations the first "real" index is found
153 |             for i in range(3):
154 |                 d = self._distance(cur_p)
155 |                 cur_p = np.argmax(d)
156 | 
157 |             # store maximal found distance -> later used for "a" (-> update_w)
158 |             self._maxd = np.max(d)
159 |             self.select.append(cur_p)
160 | 
161 |         elif self._init == 'origin':
162 |             # set first vertex to origin
163 |             cur_p = -1
164 |             d = self._distance(cur_p)
165 |             self._maxd = np.max(d)
166 |             self.select.append(cur_p)
167 | 
168 |     def update_w(self):
169 |         """ compute new W """
170 |         EPS = 10**-8
171 |         self.init_sivm()
172 | 
173 |         # initialize the recursively updated distance measures
174 |         d_square = np.zeros((self.data.shape[1]))
175 |         d_sum = np.zeros((self.data.shape[1]))
176 |         d_i_times_d_j = np.zeros((self.data.shape[1]))
177 |         distiter = np.zeros((self.data.shape[1]))
178 |         a = np.log(self._maxd)
179 | 
180 | 
181 |         for l in range(1, self._num_bases):
182 |             d = self._distance(self.select[l-1])
183 | 
184 |             # take the log of d (usually more stable than d)
185 |             d = np.log(d + EPS)
186 | 
187 |             d_i_times_d_j += d * d_sum
188 |             d_sum += d
189 |             d_square += d**2
190 |             distiter = d_i_times_d_j + a*d_sum - (l/2.0) * d_square
191 | 
192 |             # detect the next best data point
193 |             self.select.append(np.argmax(distiter))
194 | 
195 |         self._logger.info('cur_nodes: ' + str(self.select))
196 | 
197 |         # sort indices, otherwise h5py won't work
198 |         self.W = self.data[:, np.sort(self.select)]
199 | 
200 |         # "unsort" it again to keep the correct order
201 |         self.W = self.W[:, np.argsort(np.argsort(self.select))]
202 | 
203 |     def factorize(self, show_progress=False, compute_w=True, compute_h=True,
204 |                   compute_err=True, niter=1):
205 |         """ Factorize s.t. WH = data
206 | 
207 |         Parameters
208 |         ----------
209 |         show_progress : bool
210 |             print some extra information to stdout.
211 |         compute_h : bool
212 |             iteratively update values for H.
213 |         compute_w : bool
214 |             iteratively update values for W.
215 |         compute_err : bool
216 |             compute Frobenius norm |data-WH| after each update and store
217 |             it to .ferr[k].
218 | 
219 |         Updated Values
220 |         --------------
221 |         .W : updated values for W.
222 |         .H : updated values for H.
223 |         .ferr : Frobenius norm |data-WH|.
224 |         """
225 | 
226 |         AA.factorize(self, niter=1, show_progress=show_progress,
227 |                      compute_w=compute_w, compute_h=compute_h,
228 |                      compute_err=compute_err)
229 | 
230 | if __name__ == "__main__":
231 |     import doctest
232 |     doctest.testmod()
233 | 
--------------------------------------------------------------------------------
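Because SIVM restricts W to actual data columns, its selections can be inspected directly via the .select attribute set in update_w(). A short end-to-end sketch on a toy matrix whose extreme points are known by construction (values made up for illustration; computing H requires cvxopt, and SIVM is assumed to be re-exported at the package level):

import numpy as np
from pymf import SIVM

# three extreme points (columns 0, 1, 2) plus two interior points
data = np.array([[0.0, 1.0, 0.5, 0.4, 0.45],
                 [0.0, 0.0, 1.0, 0.2, 0.40]])

sivm_mdl = SIVM(data, num_bases=3, dist_measure='l2')
sivm_mdl.factorize()

print(sivm_mdl.select)   # indices of the selected samples
print(sivm_mdl.W)        # the corresponding data columns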