├── .gitignore ├── BestMap.py ├── BuildAdjacency.py ├── DataProjection.py ├── Hungarian.py ├── LICENSE ├── OutlierDetection.py ├── ReadMe.md ├── SSC.py ├── SparseCoefRecovery.py └── SpectralClustering.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /BestMap.py: -------------------------------------------------------------------------------- 1 | # bestmap: permute labels of L2 to match L1 as good as possible 2 | 3 | import numpy as np 4 | import sys 5 | from Hungarian import * 6 | 7 | 8 | def BestMap(L1, L2): 9 | 10 | L1 = L1.flatten(order='F').astype(float) 11 | L2 = L2.flatten(order='F').astype(float) 12 | if L1.size != L2.size: 13 | sys.exit('size(L1) must == size(L2)') 14 | Label1 = np.unique(L1) 15 | nClass1 = Label1.size 16 | Label2 = np.unique(L2) 17 | nClass2 = Label2.size 18 | nClass = max(nClass1, nClass2) 19 | 20 | # For Hungarian - Label2 are Workers, Label1 are Tasks. 21 | G = np.zeros([nClass, nClass]).astype(float) 22 | for i in range(0, nClass2): 23 | for j in range(0, nClass1): 24 | G[i, j] = np.sum(np.logical_and(L2 == Label2[i], L1 == Label1[j])) 25 | 26 | c = Hungarian(-G) 27 | newL2 = np.zeros(L2.shape) 28 | for i in range(0, nClass2): 29 | newL2[L2 == Label2[i]] = Label1[c[i]] 30 | return newL2 31 | 32 | 33 | if __name__ == "__main__": 34 | pass 35 | -------------------------------------------------------------------------------- /BuildAdjacency.py: -------------------------------------------------------------------------------- 1 | # This function takes a NxN coefficient matrix and returns a NxN adjacency 2 | # matrix by choosing only the K strongest connections in the similarity graph 3 | # CMat: NxN coefficient matrix 4 | # K: number of strongest edges to keep; if K=0 use all the coefficients 5 | # CKSym: NxN symmetric adjacency matrix 6 | 7 | 8 | import numpy as np 9 | 10 | 11 | def BuildAdjacency(CMat, K): 12 | CMat = CMat.astype(float) 13 | CKSym = None 14 | N, _ = CMat.shape 15 | CAbs = np.absolute(CMat).astype(float) 16 | for i in range(0, N): 17 | c = CAbs[:, i] 18 | PInd = np.flip(np.argsort(c), 0) 19 | CAbs[:, i] = CAbs[:, i] / float(np.absolute(c[PInd[0]])) 20 | CSym = np.add(CAbs, CAbs.T).astype(float) 21 | if K != 0: 22 | Ind = np.flip(np.argsort(CSym, axis=0), 0) 23 | CK = np.zeros([N, N]).astype(float) 24 | for i in range(0, N): 25 | for j in range(0, K): 26 | CK[Ind[j, i], i] = CSym[Ind[j, i], i] / float(np.absolute(CSym[Ind[0, i], i])) 27 | CKSym = np.add(CK, CK.T) 28 | else: 29 | CKSym = CSym 30 | return CKSym 31 | 32 | 33 | if __name__ == "__main__": 34 | pass 35 | -------------------------------------------------------------------------------- /DataProjection.py: -------------------------------------------------------------------------------- 1 | # This function takes the D x N data matrix with columns indicating 2 | # different data points and project the D dimensional data into the r 3 | # dimensional space. Different types of projections are possible: 4 | # (1) Projection using PCA 5 | # (2) Projection using random projections with iid elements from N(0,1/r) 6 | # (3) Projection using random projections with iid elements from symmetric 7 | # bernoulli distribution: +1/sqrt(r),-1/sqrt(r) elements with same probability 8 | # X: D x N data matrix of N data points 9 | # r: dimension of the space to project the data to 10 | # type: type of projection, {'PCA','NormalProj','BernoulliProj'} 11 | # Xp: r x N data matrix of N projectred data points 12 | 13 | import numpy as np 14 | import math 15 | 16 | 17 | def DataProjection(X, r, type='NormalProj'): 18 | Xp = None 19 | D, N = X.shape 20 | if r == 0: 21 | Xp = X 22 | else: 23 | if type == 'PCA': 24 | isEcon = False 25 | if D > N: 26 | isEcon = True 27 | U, S, V = np.linalg.svd(X.T, full_matrices=isEcon) 28 | Xp = U[:, 0:r].T 29 | if type == 'NormalProj': 30 | normP = (1.0 / math.sqrt(r)) * np.random.randn(r * D, 1) 31 | PrN = normP.reshape(r, D, order='F') 32 | Xp = np.matmul(PrN, X) 33 | if type == 'BernoulliProj': 34 | bp = np.random.rand(r * D, 1) 35 | Bp = (1.0 / math.sqrt(r)) * (bp >= .5) - (1.0 / math.sqrt(r)) * (bp < .5) 36 | PrB = Bp.reshape(r, D, order='F') 37 | Xp = np.matmul(PrB, X) 38 | return Xp 39 | 40 | 41 | if __name__ == "__main__": 42 | pass 43 | -------------------------------------------------------------------------------- /Hungarian.py: -------------------------------------------------------------------------------- 1 | # Solve the Assignment problem using the Hungarian method. 2 | # input A - square cost matrix 3 | # return - the optimal assignment 4 | 5 | import numpy as np 6 | from scipy.optimize import linear_sum_assignment 7 | 8 | 9 | def Hungarian(A): 10 | _, col_ind = linear_sum_assignment(A) 11 | # Cost can be found as A[row_ind, col_ind].sum() 12 | return col_ind 13 | 14 | 15 | if __name__ == "__main__": 16 | pass 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Abhinav Garg 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /OutlierDetection.py: -------------------------------------------------------------------------------- 1 | # This function takes the coefficient matrix resulted from sparse 2 | # representation using \ell_1 minimization. If a point cannot be written as 3 | # a linear combination of other points, it should be an outlier. The 4 | # function detects the indices of outliers and modifies the coefficient 5 | # matrix and the ground-truth accordingly. 6 | # CMat: NxN coefficient matrix 7 | # s: Nx1 ground-truth vector 8 | # CMatC: coefficient matrix after eliminating Nans 9 | # sc: ground-truth after eliminating outliers 10 | # OutlierIndx: indices of outliers in {1,2,...,N} 11 | # Fail: True if number of inliers is less than number of groups, False otherwise 12 | 13 | import numpy as np 14 | 15 | 16 | def OutlierDetection(CMat, s): 17 | n = np.amax(s) 18 | _, N = CMat.shape 19 | OutlierIndx = list() 20 | FailCnt = 0 21 | Fail = False 22 | 23 | for i in range(0, N): 24 | c = CMat[:, i] 25 | if np.sum(np.isnan(c)) >= 1: 26 | OutlierIndx.append(i) 27 | FailCnt += 1 28 | sc = s.astype(float) 29 | sc[OutlierIndx] = np.nan 30 | CMatC = CMat.astype(float) 31 | CMatC[OutlierIndx, :] = np.nan 32 | CMatC[:, OutlierIndx] = np.nan 33 | OutlierIndx = OutlierIndx 34 | 35 | if FailCnt > (N - n): 36 | CMatC = np.nan 37 | sc = np.nan 38 | Fail = True 39 | return CMatC, sc, OutlierIndx, Fail 40 | 41 | 42 | if __name__ == "__main__": 43 | pass 44 | -------------------------------------------------------------------------------- /ReadMe.md: -------------------------------------------------------------------------------- 1 | ### Python implementation of Sparse Subspace Clustering algorithm 2 | 3 | Sparse Subspace Clustering is a subspace clustering algorithm based on techniques from sparse representation theory. 4 | 5 | - See [Sparse Subspace Clustering](http://www.vision.jhu.edu/ssc.htm) for more information. 6 | - This implementation is based on [SSC code for MATLAB (using CVX)](http://www.vision.jhu.edu/code/fetchcode.php?id=3) provided by [JHU Vision Lab](http://www.vision.jhu.edu/code/). 7 | - Requirements - numpy, scipy, sklearn, cvxpy. Tested with Python 3. 8 | - cvxpy python package can be installed from [cvxpy page](https://cvxgrp.github.io/cvxpy/index.html). 9 | - Start exploring with `SSC.py`. `SSC_test()` method in this file provides a basic example of subspace clustering. To run: `python SSC.py`. 10 | 11 | Note: A significant effort went into this code. If you decide to use this code, I would really appreciate an email at [garg26@wisc.edu](mailto:garg26@wisc.edu). -------------------------------------------------------------------------------- /SSC.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from DataProjection import * 3 | from BuildAdjacency import * 4 | from OutlierDetection import * 5 | from BestMap import * 6 | from SpectralClustering import * 7 | from SparseCoefRecovery import * 8 | 9 | 10 | def SSC_test(): 11 | # Basic test to check SSC. 12 | 13 | D = 30 # Dimension of ambient space 14 | n = 2 # Number of subspaces 15 | d1 = 1 16 | d2 = 1 # d1 and d2: dimension of subspace 1 and 2 17 | N1 = 50 18 | N2 = 50 # N1 and N2: number of points in subspace 1 and 2 19 | # Generating N1 points in a d1 dim. subspace 20 | X1 = np.random.randn(D, d1) * np.random.randn(d1, N1) 21 | # Generating N2 points in a d2 dim. subspace 22 | X2 = np.random.randn(D, d2) * np.random.randn(d2, N2) 23 | X = np.concatenate((X1, X2), axis=1) 24 | 25 | # Generating the ground-truth for evaluating clustering results 26 | s = np.concatenate((1 * np.ones([1, N1]), 2 * np.ones([1, N2])), axis=1) 27 | r = 0 # Enter the projection dimension e.g. r = d*n, enter r = 0 to not project 28 | Cst = 0 # Enter 1 to use the additional affine constraint sum(c) == 1 29 | OptM = 'L1Perfect' # OptM can be {'L1Perfect','L1Noise','Lasso','L1ED'} 30 | lmbda = 0.001 # Regularization parameter in 'Lasso' or the noise level for 'L1Noise' 31 | # Number of top coefficients to build the similarity graph, enter K=0 for using the whole coefficients 32 | K = max(d1, d2) 33 | if Cst == 1: 34 | K = max(d1, d2) + 1 # For affine subspaces, the number of coefficients to pick is dimension + 1 35 | 36 | Xp = DataProjection(X, r, 'NormalProj') 37 | CMat = SparseCoefRecovery(Xp, Cst, OptM, lmbda) 38 | # Make small values 0 39 | eps = np.finfo(float).eps 40 | CMat[np.abs(CMat) < eps] = 0 41 | 42 | CMatC, sc, OutlierIndx, Fail = OutlierDetection(CMat, s) 43 | 44 | if Fail == False: 45 | CKSym = BuildAdjacency(CMatC, K) 46 | Grps = SpectralClustering(CKSym, n) 47 | Grps = BestMap(sc, Grps) 48 | Missrate = float(np.sum(sc != Grps)) / sc.size 49 | print("Misclassification rate: {:.4f} %".format(Missrate * 100)) 50 | else: 51 | print("Something failed") 52 | 53 | 54 | if __name__ == "__main__": 55 | SSC_test() 56 | -------------------------------------------------------------------------------- /SparseCoefRecovery.py: -------------------------------------------------------------------------------- 1 | # This function takes the D x N matrix of N data points and write every 2 | # point as a sparse linear combination of other points. 3 | # Xp: D x N matrix of N data points 4 | # cst: 1 if using the affine constraint sum(c)=1, else 0 5 | # Opt: type of optimization, {'L1Perfect','L1Noisy','Lasso','L1ED'} 6 | # lambda: regularizartion parameter of LASSO, typically between 0.001 and 7 | # 0.1 or the noise level for 'L1Noise' 8 | # CMat: N x N matrix of coefficients, column i correspond to the sparse 9 | # coefficients of data point in column i of Xp 10 | 11 | # For this to work install cvxpy from: 12 | # https://cvxgrp.github.io/cvxpy/install/index.html 13 | 14 | import numpy as np 15 | import cvxpy as cvx 16 | 17 | 18 | def SparseCoefRecovery(Xp, cst=0, Opt='Lasso', lmbda=0.001): 19 | D, N = Xp.shape 20 | CMat = np.zeros([N, N]) 21 | for i in range(0, N): 22 | y = Xp[:, i] 23 | if i == 0: 24 | Y = Xp[:, i + 1:] 25 | elif i > 0 and i < N - 1: 26 | Y = np.concatenate((Xp[:, 0:i], Xp[:, i + 1:N]), axis=1) 27 | else: 28 | Y = Xp[:, 0:N - 1] 29 | 30 | if cst == 1: 31 | if Opt == 'Lasso': 32 | c = cvx.Variable(N - 1, 1) 33 | obj = cvx.Minimize(cvx.norm(c, 1) + lmbda * cvx.norm(Y * c - y)) 34 | constraint = [cvx.sum(c) == 1] 35 | prob = cvx.Problem(obj, constraint) 36 | prob.solve() 37 | elif Opt == 'L1Perfect': 38 | c = cvx.Variable(N - 1, 1) 39 | obj = cvx.Minimize(cvx.norm(c, 1)) 40 | constraint = [Y * c == y, cvx.sum(c) == 1] 41 | prob = cvx.Problem(obj, constraint) 42 | prob.solve() 43 | elif Opt == 'L1Noise': 44 | c = cvx.Variable(N - 1, 1) 45 | obj = cvx.Minimize(cvx.norm(c, 1)) 46 | constraint = [(Y * c - y) <= lmbda, cvx.sum(c) == 1] 47 | prob = cvx.Problem(obj, constraint) 48 | prob.solve() 49 | elif Opt == 'L1ED': 50 | c = cvx.Variable(N - 1 + D, 1) 51 | obj = cvx.Minimize(cvx.norm(c, 1)) 52 | constraint = [np.concatenate((Y, np.identity(D)), axis=1) 53 | * c == y, cvx.sum(c[0:N - 1]) == 1] 54 | prob = cvx.Problem(obj, constraint) 55 | prob.solve() 56 | else: 57 | if Opt == 'Lasso': 58 | c = cvx.Variable(N - 1, 1) 59 | obj = cvx.Minimize(cvx.norm(c, 1) + lmbda * cvx.norm(Y * c - y)) 60 | prob = cvx.Problem(obj) 61 | prob.solve() 62 | elif Opt == 'L1Perfect': 63 | c = cvx.Variable(N - 1, 1) 64 | obj = cvx.Minimize(cvx.norm(c, 1)) 65 | constraint = [Y * c == y] 66 | prob = cvx.Problem(obj, constraint) 67 | prob.solve() 68 | elif Opt == 'L1Noise': 69 | c = cvx.Variable(N - 1, 1) 70 | obj = cvx.Minimize(cvx.norm(c, 1)) 71 | constraint = [(Y * c - y) <= lmbda] 72 | prob = cvx.Problem(obj, constraint) 73 | prob.solve() 74 | elif Opt == 'L1ED': 75 | c = cvx.Variable(N - 1 + D, 1) 76 | obj = cvx.Minimize(cvx.norm(c, 1)) 77 | constraint = [np.concatenate((Y, np.identity(D)), axis=1) * c == y] 78 | prob = cvx.Problem(obj, constraint) 79 | prob.solve() 80 | 81 | if i == 0: 82 | CMat[0, 0] = 0 83 | CMat[1:N, 0] = c.value[0: N - 1] 84 | elif i > 0 and i < N - 1: 85 | CMat[0:i, i] = c.value[0:i] 86 | CMat[i, i] = 0 87 | CMat[i + 1:N, i] = c.value[i:N - 1] 88 | else: 89 | CMat[0:N - 1, N - 1] = c.value[0:N - 1] 90 | CMat[N - 1, N - 1] = 0 91 | return CMat 92 | 93 | 94 | if __name__ == "__main__": 95 | pass 96 | -------------------------------------------------------------------------------- /SpectralClustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.cluster import KMeans 3 | from scipy.sparse import identity 4 | 5 | 6 | def SpectralClustering(CKSym, n): 7 | # This is direct port of JHU vision lab code. Could probably use sklearn SpectralClustering. 8 | CKSym = CKSym.astype(float) 9 | N, _ = CKSym.shape 10 | MAXiter = 1000 # Maximum number of iterations for KMeans 11 | REPlic = 20 # Number of replications for KMeans 12 | 13 | DN = np.diag(np.divide(1, np.sqrt(np.sum(CKSym, axis=0) + np.finfo(float).eps))) 14 | LapN = identity(N).toarray().astype(float) - np.matmul(np.matmul(DN, CKSym), DN) 15 | _, _, vN = np.linalg.svd(LapN) 16 | vN = vN.T 17 | kerN = vN[:, N - n:N] 18 | normN = np.sqrt(np.sum(np.square(kerN), axis=1)) 19 | kerNS = np.divide(kerN, normN.reshape(len(normN), 1) + np.finfo(float).eps) 20 | km = KMeans(n_clusters=n, n_init=REPlic, max_iter=MAXiter, n_jobs=-1).fit(kerNS) 21 | return km.labels_ 22 | 23 | 24 | if __name__ == "__main__": 25 | pass 26 | --------------------------------------------------------------------------------