├── .gitignore
├── BestMap.py
├── BuildAdjacency.py
├── DataProjection.py
├── Hungarian.py
├── LICENSE
├── OutlierDetection.py
├── ReadMe.md
├── SSC.py
├── SparseCoefRecovery.py
└── SpectralClustering.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/BestMap.py:
--------------------------------------------------------------------------------
 1 | # bestmap: permute the labels of L2 to match L1 as well as possible
 2 | 
 3 | import numpy as np
 4 | import sys
 5 | from Hungarian import *
 6 | 
 7 | 
 8 | def BestMap(L1, L2):
 9 | 
10 |     L1 = L1.flatten(order='F').astype(float)
11 |     L2 = L2.flatten(order='F').astype(float)
12 |     if L1.size != L2.size:
13 |         sys.exit('size(L1) must == size(L2)')
14 |     Label1 = np.unique(L1)
15 |     nClass1 = Label1.size
16 |     Label2 = np.unique(L2)
17 |     nClass2 = Label2.size
18 |     nClass = max(nClass1, nClass2)
19 | 
20 |     # For Hungarian - Label2 are Workers, Label1 are Tasks.
21 |     G = np.zeros([nClass, nClass]).astype(float)
22 |     for i in range(0, nClass2):
23 |         for j in range(0, nClass1):
24 |             G[i, j] = np.sum(np.logical_and(L2 == Label2[i], L1 == Label1[j]))
25 | 
26 |     c = Hungarian(-G)
27 |     newL2 = np.zeros(L2.shape)
28 |     for i in range(0, nClass2):
29 |         newL2[L2 == Label2[i]] = Label1[c[i]]
30 |     return newL2
31 | 
32 | 
# Running this module directly does nothing; it only provides BestMap for import.
if __name__ == "__main__":
    pass
35 | 


--------------------------------------------------------------------------------
/BuildAdjacency.py:
--------------------------------------------------------------------------------
 1 | # This function takes a NxN coefficient matrix and returns a NxN adjacency
 2 | # matrix by choosing only the K strongest connections in the similarity graph
 3 | # CMat: NxN coefficient matrix
 4 | # K: number of strongest edges to keep; if K=0 use all the coefficients
 5 | # CKSym: NxN symmetric adjacency matrix
 6 | 
 7 | 
 8 | import numpy as np
 9 | 
10 | 
def BuildAdjacency(CMat, K):
    """Build a symmetric adjacency matrix from a coefficient matrix.

    Every column of ``|CMat|`` is scaled by its largest entry, the result is
    symmetrised, and optionally only the ``K`` strongest entries of each
    column are kept (again scaled by the column maximum) before a final
    symmetrisation.

    Parameters
    ----------
    CMat : numpy.ndarray
        N x N coefficient matrix.
    K : int
        Number of strongest edges to keep per column; ``0`` keeps all
        coefficients.  Values larger than N are treated as N.

    Returns
    -------
    numpy.ndarray
        N x N symmetric adjacency matrix.
    """
    CAbs = np.absolute(CMat.astype(float))
    N, _ = CAbs.shape
    if N == 0:
        return CAbs

    # Scale each column by its largest magnitude.  An all-zero column is left
    # untouched instead of producing 0/0 = NaN; NaN columns stay NaN.
    col_peak = np.max(CAbs, axis=0)
    CAbs = CAbs / np.where(col_peak == 0, 1.0, col_peak)
    CSym = CAbs + CAbs.T
    if K == 0:
        return CSym

    # Row indices of every column sorted from strongest to weakest entry.
    Ind = np.flip(np.argsort(CSym, axis=0), 0)
    cols = np.arange(N)
    strongest = np.absolute(CSym[Ind[0], cols])
    strongest = np.where(strongest == 0, 1.0, strongest)

    # Clamp K so that asking for more edges than exist keeps all of them.
    keep = max(0, min(K, N))
    rows = Ind[:keep]
    CK = np.zeros([N, N])
    CK[rows, cols] = CSym[rows, cols] / strongest
    return CK + CK.T
31 | 
32 | 
# Running this module directly does nothing; it only provides BuildAdjacency for import.
if __name__ == "__main__":
    pass
35 | 


--------------------------------------------------------------------------------
/DataProjection.py:
--------------------------------------------------------------------------------
 1 | # This function takes the D x N data matrix with columns indicating
 2 | # different data points and projects the D dimensional data into the r
 3 | # dimensional space. Different types of projections are possible:
 4 | # (1) Projection using PCA
 5 | # (2) Projection using random projections with iid elements from N(0,1/r)
 6 | # (3) Projection using random projections with iid elements from symmetric
 7 | # bernoulli distribution: +1/sqrt(r),-1/sqrt(r) elements with same probability
 8 | # X: D x N data matrix of N data points
 9 | # r: dimension of the space to project the data to
10 | # type: type of projection, {'PCA','NormalProj','BernoulliProj'}
11 | # Xp: r x N data matrix of N projected data points
12 | 
13 | import numpy as np
14 | import math
15 | 
16 | 
def DataProjection(X, r, type='NormalProj'):
    """Project the columns of ``X`` into an ``r``-dimensional space.

    Parameters
    ----------
    X : numpy.ndarray
        D x N data matrix, one data point per column.
    r : int
        Target dimension; ``0`` disables the projection and returns ``X``.
    type : {'NormalProj', 'PCA', 'BernoulliProj'}
        ``'PCA'`` returns the first ``r`` left singular vectors of ``X.T``
        (transposed); ``'NormalProj'`` multiplies by a random matrix with
        N(0, 1/r) entries; ``'BernoulliProj'`` multiplies by a random matrix
        whose entries are +1/sqrt(r) or -1/sqrt(r) with equal probability.

    Returns
    -------
    numpy.ndarray
        r x N matrix of projected points (or ``X`` itself when ``r == 0``).

    Raises
    ------
    ValueError
        If ``r != 0`` and ``type`` is not one of the supported names
        (previously ``None`` was returned silently).
    """
    D, N = X.shape
    if r == 0:
        return X

    if type == 'PCA':
        # Only the leading columns of U are used, so the reduced SVD is
        # always sufficient and avoids building the full D x D factor.
        U, _, _ = np.linalg.svd(X.T, full_matrices=False)
        return U[:, 0:r].T

    scale = 1.0 / math.sqrt(r)
    if type == 'NormalProj':
        # Drawn as a column and reshaped column-major so seeded runs give
        # the same projection matrix as before.
        normP = scale * np.random.randn(r * D, 1)
        return np.matmul(normP.reshape(r, D, order='F'), X)
    if type == 'BernoulliProj':
        bp = np.random.rand(r * D, 1)
        Bp = np.where(bp >= .5, scale, -scale)
        return np.matmul(Bp.reshape(r, D, order='F'), X)

    raise ValueError(
        "type must be 'PCA', 'NormalProj' or 'BernoulliProj', got %r" % (type,))
39 | 
40 | 
# Running this module directly does nothing; it only provides DataProjection for import.
if __name__ == "__main__":
    pass
43 | 


--------------------------------------------------------------------------------
/Hungarian.py:
--------------------------------------------------------------------------------
 1 | # Solve the Assignment problem using the Hungarian method.
 2 | # input A - square cost matrix
 3 | # return - the optimal assignment
 4 | 
 5 | import numpy as np
 6 | from scipy.optimize import linear_sum_assignment
 7 | 
 8 | 
 9 | def Hungarian(A):
10 |     _, col_ind = linear_sum_assignment(A)
11 |     # Cost can be found as A[row_ind, col_ind].sum()
12 |     return col_ind
13 | 
14 | 
# Running this module directly does nothing; it only provides Hungarian for import.
if __name__ == "__main__":
    pass
17 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Abhinav Garg
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/OutlierDetection.py:
--------------------------------------------------------------------------------
 1 | # This function takes the coefficient matrix resulting from sparse
 2 | # representation using \ell_1 minimization. If a point cannot be written as
 3 | # a linear combination of other points, it should be an outlier. The
 4 | # function detects the indices of outliers and modifies the coefficient
 5 | # matrix and the ground-truth accordingly.
 6 | # CMat: NxN coefficient matrix
 7 | # s: Nx1 ground-truth vector
 8 | # CMatC: coefficient matrix after eliminating Nans
 9 | # sc: ground-truth after eliminating outliers
10 | # OutlierIndx: indices of outliers in {0,1,...,N-1} (0-based column indices)
11 | # Fail: True if number of inliers is less than number of groups, False otherwise
12 | 
13 | import numpy as np
14 | 
15 | 
def OutlierDetection(CMat, s):
    """Flag columns of ``CMat`` containing NaN as outliers.

    Parameters
    ----------
    CMat : numpy.ndarray
        N x N coefficient matrix; a column with at least one NaN marks the
        corresponding point as an outlier.
    s : numpy.ndarray
        Ground-truth labels with N elements (1-D, N x 1 or 1 x N).  Its
        maximum is taken as the number of groups.

    Returns
    -------
    CMatC : numpy.ndarray or float
        Float copy of ``CMat`` with the rows and columns of all outliers set
        to NaN, or ``nan`` when ``Fail`` is True.
    sc : numpy.ndarray or float
        Float copy of ``s`` (same shape) with outlier entries set to NaN, or
        ``nan`` when ``Fail`` is True.
    OutlierIndx : list of int
        0-based indices of the outlier columns.
    Fail : bool
        True if fewer inliers remain than there are groups.
    """
    n = np.amax(s)
    _, N = CMat.shape

    outlier_pos = np.flatnonzero(np.isnan(CMat).any(axis=0))
    OutlierIndx = outlier_pos.tolist()

    # More outliers than N - n means fewer than n inliers are left.
    if len(OutlierIndx) > (N - n):
        return np.nan, np.nan, OutlierIndx, True

    sc = s.astype(float)
    # np.put addresses the flattened array, so row vectors, column vectors
    # and 1-D label arrays are all handled; plain sc[idx] would index axis 0
    # and fail for a 1 x N ground truth.
    np.put(sc, outlier_pos, np.nan)

    CMatC = CMat.astype(float)
    CMatC[outlier_pos, :] = np.nan
    CMatC[:, outlier_pos] = np.nan
    return CMatC, sc, OutlierIndx, False
40 | 
41 | 
# Running this module directly does nothing; it only provides OutlierDetection for import.
if __name__ == "__main__":
    pass
44 | 


--------------------------------------------------------------------------------
/ReadMe.md:
--------------------------------------------------------------------------------
 1 | ### Python implementation of Sparse Subspace Clustering algorithm
 2 | 
 3 | Sparse Subspace Clustering is a subspace clustering algorithm based on techniques from sparse representation theory.
 4 | 
 5 | - See [Sparse Subspace Clustering](http://www.vision.jhu.edu/ssc.htm) for more information.
 6 | - This implementation is based on [SSC code for MATLAB (using CVX)](http://www.vision.jhu.edu/code/fetchcode.php?id=3) provided by [JHU Vision Lab](http://www.vision.jhu.edu/code/).
 7 | - Requirements - numpy, scipy, sklearn, cvxpy. Tested with Python 3.
 8 | - cvxpy python package can be installed from [cvxpy page](https://cvxgrp.github.io/cvxpy/index.html).
 9 | - Start exploring with `SSC.py`. The `SSC_test()` function in this file provides a basic example of subspace clustering. To run: `python SSC.py`.
10 | 
11 | Note: A significant effort went into this code. If you decide to use this code, I would really appreciate an email at [garg26@wisc.edu](mailto:garg26@wisc.edu).


--------------------------------------------------------------------------------
/SSC.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from DataProjection import *
 3 | from BuildAdjacency import *
 4 | from OutlierDetection import *
 5 | from BestMap import *
 6 | from SpectralClustering import *
 7 | from SparseCoefRecovery import *
 8 | 
 9 | 
def SSC_test():
    """Run a small synthetic end-to-end check of the SSC pipeline.

    Points are sampled from two random low-dimensional subspaces of a
    30-dimensional space, clustered with sparse subspace clustering, and the
    misclassification rate against the known ground truth is printed.
    Nothing is returned.
    """
    D = 30  # Dimension of ambient space
    n = 2  # Number of subspaces
    d1 = 1
    d2 = 1  # d1 and d2: dimension of subspace 1 and 2
    N1 = 50
    N2 = 50  # N1 and N2: number of points in subspace 1 and 2

    # Each data set is (random basis) x (random coefficients).  This has to
    # be a matrix product: the element-wise `*` only coincides with it when
    # the subspace dimension is 1.
    # Generating N1 points in a d1 dim. subspace
    X1 = np.matmul(np.random.randn(D, d1), np.random.randn(d1, N1))
    # Generating N2 points in a d2 dim. subspace
    X2 = np.matmul(np.random.randn(D, d2), np.random.randn(d2, N2))
    X = np.concatenate((X1, X2), axis=1)

    # Generating the ground-truth for evaluating clustering results
    s = np.concatenate((1 * np.ones([1, N1]), 2 * np.ones([1, N2])), axis=1)
    r = 0  # Enter the projection dimension e.g. r = d*n, enter r = 0 to not project
    Cst = 0  # Enter 1 to use the additional affine constraint sum(c) == 1
    OptM = 'L1Perfect'  # OptM can be {'L1Perfect','L1Noise','Lasso','L1ED'}
    lmbda = 0.001  # Regularization parameter in 'Lasso' or the noise level for 'L1Noise'
    # Number of top coefficients to build the similarity graph, enter K=0 for using the whole coefficients
    K = max(d1, d2)
    if Cst == 1:
        K = max(d1, d2) + 1  # For affine subspaces, the number of coefficients to pick is dimension + 1

    Xp = DataProjection(X, r, 'NormalProj')
    CMat = SparseCoefRecovery(Xp, Cst, OptM, lmbda)
    # Make small values 0
    eps = np.finfo(float).eps
    CMat[np.abs(CMat) < eps] = 0

    CMatC, sc, OutlierIndx, Fail = OutlierDetection(CMat, s)

    if not Fail:
        CKSym = BuildAdjacency(CMatC, K)
        Grps = SpectralClustering(CKSym, n)
        # Cluster ids are arbitrary; align them with the ground truth first.
        Grps = BestMap(sc, Grps)
        Missrate = float(np.sum(sc != Grps)) / sc.size
        print("Misclassification rate: {:.4f} %".format(Missrate * 100))
    else:
        print("Something failed")
52 | 
53 | 
# Run the synthetic clustering demo when this file is executed as a script.
if __name__ == "__main__":
    SSC_test()
56 | 


--------------------------------------------------------------------------------
/SparseCoefRecovery.py:
--------------------------------------------------------------------------------
 1 | # This function takes the D x N matrix of N data points and writes every
 2 | # point as a sparse linear combination of other points.
 3 | # Xp: D x N matrix of N data points
 4 | # cst: 1 if using the affine constraint sum(c)=1, else 0
 5 | # Opt: type of optimization, {'L1Perfect','L1Noise','Lasso','L1ED'}
 6 | # lmbda: regularization parameter of LASSO, typically between 0.001 and
 7 | # 0.1 or the noise level for 'L1Noise'
 8 | # CMat: N x N matrix of coefficients, column i corresponds to the sparse
 9 | # coefficients of data point in column i of Xp
10 | 
11 | # For this to work install cvxpy from:
12 | # https://cvxgrp.github.io/cvxpy/install/index.html
13 | 
14 | import numpy as np
15 | import cvxpy as cvx
16 | 
17 | 
def SparseCoefRecovery(Xp, cst=0, Opt='Lasso', lmbda=0.001):
    """Express every data point as a sparse combination of the other points.

    For each column ``y`` of ``Xp`` an l1-minimisation problem over the
    remaining columns ``Y`` is solved with CVXPY.

    Parameters
    ----------
    Xp : numpy.ndarray
        D x N matrix of N data points (one per column).
    cst : int
        ``1`` adds the affine constraint that the coefficients on the other
        data points sum to one; any other value leaves it out.
    Opt : {'Lasso', 'L1Perfect', 'L1Noise', 'L1ED'}
        ``'Lasso'``     minimise ``||c||_1 + lmbda * ||Y c - y||_2``;
        ``'L1Perfect'`` minimise ``||c||_1`` subject to ``Y c == y``;
        ``'L1Noise'``   minimise ``||c||_1`` subject to
        ``||Y c - y||_2 <= lmbda``;
        ``'L1ED'``      as ``'L1Perfect'`` with the dictionary ``[Y, I]``.
    lmbda : float
        Regularisation weight for ``'Lasso'`` or noise level for
        ``'L1Noise'``.

    Returns
    -------
    numpy.ndarray
        N x N coefficient matrix with zero diagonal; column ``i`` holds the
        coefficients of point ``i``.  If the solver produces no solution for
        a point, the off-diagonal entries of its column are NaN.

    Raises
    ------
    ValueError
        If ``Opt`` is not one of the supported names.
    """
    if Opt not in ('Lasso', 'L1Perfect', 'L1Noise', 'L1ED'):
        raise ValueError(
            "Opt must be 'Lasso', 'L1Perfect', 'L1Noise' or 'L1ED', got %r"
            % (Opt,))

    D, N = Xp.shape
    CMat = np.zeros([N, N])
    for i in range(N):
        y = Xp[:, i]
        # Dictionary of all points except the i-th one.
        Y = np.delete(Xp, i, axis=1)
        if Opt == 'L1ED':
            # Identity columns appended so sparse errors can be absorbed.
            A = np.concatenate((Y, np.identity(D)), axis=1)
        else:
            A = Y

        c = cvx.Variable(A.shape[1])
        constraints = []
        if Opt == 'Lasso':
            objective = cvx.norm(c, 1) + lmbda * cvx.norm(A @ c - y)
        else:
            objective = cvx.norm(c, 1)
            if Opt == 'L1Noise':
                # The residual *norm* is bounded by the noise level; an
                # element-wise bound would only cap it from above.
                constraints.append(cvx.norm(A @ c - y) <= lmbda)
            else:
                constraints.append(A @ c == y)
        if cst == 1:
            # Only the coefficients on data points take part in the affine
            # constraint (for 'L1ED' the trailing D entries are the error).
            constraints.append(cvx.sum(c[0:N - 1]) == 1)

        cvx.Problem(cvx.Minimize(objective), constraints).solve()

        if c.value is None:
            # No solution: mark the point with NaN so that downstream
            # outlier detection can pick it up instead of crashing here.
            coef = np.full(N - 1, np.nan)
        else:
            coef = np.asarray(c.value, dtype=float).reshape(-1)[0:N - 1]

        # Re-insert the skipped position; the diagonal stays zero.
        CMat[0:i, i] = coef[0:i]
        CMat[i + 1:N, i] = coef[i:N - 1]
    return CMat
92 | 
93 | 
# Running this module directly does nothing; it only provides SparseCoefRecovery for import.
if __name__ == "__main__":
    pass
96 | 


--------------------------------------------------------------------------------
/SpectralClustering.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from sklearn.cluster import KMeans
 3 | from scipy.sparse import identity
 4 | 
 5 | 
 6 | def SpectralClustering(CKSym, n):
 7 |     # This is direct port of JHU vision lab code. Could probably use sklearn SpectralClustering.
 8 |     CKSym = CKSym.astype(float)
 9 |     N, _ = CKSym.shape
10 |     MAXiter = 1000  # Maximum number of iterations for KMeans
11 |     REPlic = 20  # Number of replications for KMeans
12 | 
13 |     DN = np.diag(np.divide(1, np.sqrt(np.sum(CKSym, axis=0) + np.finfo(float).eps)))
14 |     LapN = identity(N).toarray().astype(float) - np.matmul(np.matmul(DN, CKSym), DN)
15 |     _, _, vN = np.linalg.svd(LapN)
16 |     vN = vN.T
17 |     kerN = vN[:, N - n:N]
18 |     normN = np.sqrt(np.sum(np.square(kerN), axis=1))
19 |     kerNS = np.divide(kerN, normN.reshape(len(normN), 1) + np.finfo(float).eps)
20 |     km = KMeans(n_clusters=n, n_init=REPlic, max_iter=MAXiter, n_jobs=-1).fit(kerNS)
21 |     return km.labels_
22 | 
23 | 
# Running this module directly does nothing; it only provides SpectralClustering for import.
if __name__ == "__main__":
    pass
26 | 


--------------------------------------------------------------------------------