├── .gitignore
├── .gitmodules
├── README.md
├── background_subtraction.py
├── background_subtraction_visualize.py
├── plot_benchmark.py
├── pypropack
├── robustpca.py
├── test_robustpca.py
└── topic_extraction.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "lib/pypropack"]
2 | 	path = lib/pypropack
3 | 	url = git://github.com/jakevdp/pypropack.git
4 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | PyRPCA
 2 | ======
 3 | 
 4 | Robust PCA in Python. Methods are from the http://perception.csl.illinois.edu/matrix-rank/sample_code.html and papers therein.
 5 | 
 6 | Requirement
 7 | ===========
 8 |  * scipy
 9 |  * numpy
10 |  * pypropack(optional)
11 |  * scikit-learn
12 |  * nosetest
13 | 
14 | Scripts
15 | =======
16 |  * `test_robustpca.py` test whether the algorithms included can recovery the synthetic data successfully. Use `nosetest test_robustpca.py`
17 |  * `plot_benchmark.py` plot the benchmarks with synthetic data generated with different parameters. Use `python2 plot_benchmark.py`
18 |  * `background_subtraction.py` generate the result using the escalator dataset. Use `python2 background_subtraction.py`. This will generate the `.mat` files with respect to each algorithms and can be directly readable from matlab. Furthermore, `background_subtraction_visualize.py` could be used to generate a video. The temporary image files are located in `/tmp/robust_pca_tmp/` which should be created first.
19 |  * `topic_extraction.py` extracts the keywords from the 20newsgroup dataset. It will generate two files, one is `origin.txt` and another is `keyword.txt`. The keyword and the original text on the same line is one-one mapped.
20 | 
21 | Aknowledgement
22 | ==============
23 | Special thanks for the following two resources and their authors.
24 |  * http://perception.csl.illinois.edu/matrix-rank/sample_code.html
25 |  * www.stanford.edu/~peleato/math301_slides.pdf‎
26 | 


--------------------------------------------------------------------------------
/background_subtraction.py:
--------------------------------------------------------------------------------
 1 | import urllib2
 2 | from scipy.io import loadmat, savemat
 3 | from scipy.misc import imresize
 4 | import os
 5 | import numpy as np
 6 | from robustpca import *
 7 | from numpy.linalg import svd
 8 | ### urllib
 9 | 
10 | videoFname = "escalator_data.mat"
11 | 
12 | 
13 | def download_video_clip():
14 |     webFile = urllib2.urlopen(
15 |         r'http://cvxr.com/tfocs/demos/rpca/escalator_data.mat')
16 |     filename = './' + videoFname
17 |     localFile = open(filename, 'w')
18 |     localFile.write(webFile.read())
19 |     localFile.close()
20 | 
21 | if __name__ == "__main__":
22 |     if not os.path.exists('./' + videoFname):
23 |         print 'Video file not found, downloading'
24 |         download_video_clip()
25 |     X = loadmat(videoFname)['X'].astype(np.double)/255.
26 |     nclip = X.shape[1]
27 |     # lmbda = .01
28 |     lmbda = .01
29 |     # lmbdas = {'APG':.01,
30 |     #           'ALM':.01,
31 |     #           'ADMM':.01,
32 |     #           'SVT':.01}
33 |     # save a basic value
34 |     ################################### Mean ################################
35 |     Xmean = X.mean(axis=1)
36 |     A = np.tile(Xmean, (nclip, 1)).T
37 |     E = X - A
38 |     A = A.reshape(160, 130, X.shape[1]).swapaxes(0, 1) * 255.
39 |     E = E.reshape(160, 130, X.shape[1]).swapaxes(0, 1) * 255.
40 |     savemat("./MEAN_background_subtraction.mat", {"A": A, "E": E})
41 |     ################################## SVD ##################################
42 |     U, S, V = svd(X, full_matrices=False)
43 |     r = 20
44 |     A = np.dot(np.dot(U[:, :r], np.diag(S[:r])), V[:r, :])
45 |     E = X - A
46 |     A = A.reshape(160, 130, X.shape[1]).swapaxes(0, 1) * 255.
47 |     E = E.reshape(160, 130, X.shape[1]).swapaxes(0, 1) * 255.
48 |     savemat("./PCA_background_subtraction.mat", {"A": A, "E": E})
49 |     ################################# Robust PCA ############################
50 |     for fname in method.keys():
51 |         m = method[fname]
52 |         A, E = m(X, lmbda=lmbda, maxiter=100, verbose=0)
53 |         A = A.reshape(160, 130, X.shape[1]).swapaxes(0, 1) * 255.
54 |         E = E.reshape(160, 130, X.shape[1]).swapaxes(0, 1) * 255.
55 |         savemat("./%s_background_subtraction.mat"%(fname), {"A": A, "E": E})
56 | 


--------------------------------------------------------------------------------
/background_subtraction_visualize.py:
--------------------------------------------------------------------------------
 1 | ## making a movie
 2 | 
 3 | 
 4 | import os
 5 | import sys
 6 | import matplotlib.pyplot as plt
 7 | from scipy.io import loadmat
 8 | import numpy as np
 9 | from matplotlib import cm
10 | import matplotlib
11 | from robustpca import method
12 | from plot_benchmark import mlabdefaults
13 | 
14 | if __name__ == "__main__":
15 |     mlabdefaults()
16 |     matplotlib.rcParams['savefig.dpi'] = 200
17 |     files = []
18 |     cache_path = '/tmp/robust_pca_tmp'
19 |     if not os.path.exists(cache_path):
20 |         os.mkdir(cache_path)
21 |     all_methods = method.keys()
22 |     all_methods.append('MEAN')
23 |     all_methods.append('PCA')
24 |     for fname in all_methods:
25 |         if not os.path.exists('%s/%s_tmp'%(cache_path, fname)):
26 |             os.mkdir("%s/%s_tmp"%(cache_path, fname))
27 |             mat = loadmat('./%s_background_subtraction.mat'%(fname))
28 |             org = loadmat('./escalator_data.mat')['X'].reshape(
29 |                 160, 130, 200).swapaxes(0, 1)
30 |             fig = plt.figure()
31 |             ax = fig.add_subplot(111)
32 |             for i in range(200):  # 50 frames
33 |                 ax.cla()
34 |                 ax.axis("off")
35 |                 ax.imshow(np.hstack((mat['A'][:, :, i],
36 |                                      mat['E'][:, :, i], org[:, :, i])), cm.gray)
37 |                 fname_ = '%s/%s_tmp/_tmp%03d.png'%(cache_path, fname, i)
38 |                 print 'Saving frame', fname_
39 |                 fig.tight_layout()
40 |                 fig.savefig(fname_, bbox_inches="tight")
41 |                 files.append(fname_)
42 |         print 'Making movie animation.mpg - this make take a while'
43 |         os.system("mencoder 'mf://%s/%s_tmp/_tmp*.png' -mf type=png:fps=10 -ovc lavc -lavcopts vcodec=wmv2 -oac copy -o %s_animation.mpg"%(cache_path, fname, fname))
44 | 


--------------------------------------------------------------------------------
/plot_benchmark.py:
--------------------------------------------------------------------------------
  1 | from robustpca import *
  2 | import robustpca as rp
  3 | from time import time
  4 | from numpy.random import randn
  5 | from numpy.random import shuffle
  6 | import numpy as np
  7 | import matplotlib
  8 | import matplotlib.cm as cm
  9 | import cPickle as cP
 10 | k_colors = ["r", "b", "y", "m", "c", "g", "#FFA500", "k"];
 11 | k_markers = "o*dxs^vD";
 12 | from pylab import gcf
 13 | import pylab
 14 | import os
 15 | 
 16 | def mlabdefaults():
 17 |     matplotlib.rcParams['lines.linewidth'] = 1.5
 18 |     matplotlib.rcParams['savefig.dpi'] = 300
 19 |     matplotlib.rcParams['font.size'] = 22.
 20 |     matplotlib.rcParams['font.family'] = "Times New Roman"
 21 |     matplotlib.rcParams['legend.fontsize'] = "small"
 22 |     matplotlib.rcParams['legend.fancybox'] = True
 23 |     matplotlib.rcParams['lines.markersize'] = 10
 24 |     matplotlib.rcParams['figure.figsize'] = 8, 5.6
 25 |     matplotlib.rcParams['legend.labelspacing'] = 0.1
 26 |     matplotlib.rcParams['legend.borderpad'] = 0.1
 27 |     matplotlib.rcParams['legend.borderaxespad'] = 0.2
 28 |     matplotlib.rcParams['font.monospace'] = "Courier New"
 29 | 
 30 | 
 31 | def synthesized_data(rank, dim, n_sample, sparse_ratio, sparse_mag):
 32 |     Basis = randn(rank, dim)
 33 |     W = randn(n_sample, rank)
 34 |     TrueMat = np.dot(W, Basis)
 35 |     # initalize a sparse matrix
 36 |     E = randn(TrueMat.size) * sparse_mag
 37 |     idx = np.arange(E.size)
 38 |     shuffle(idx)
 39 |     E[idx[int(sparse_ratio * E.size):]] = 0
 40 |     E = E.reshape(TrueMat.shape)
 41 |     # calculate the observation
 42 |     Observed = TrueMat + E  # 1000 x 100
 43 |     return Observed, TrueMat, E
 44 | 
 45 | 
 46 | def savefig(filename, fig=None):
 47 |     if fig is None:
 48 |         gcf().savefig(filename, bbox_inches='tight')
 49 |     else:
 50 |         fig.savefig(filename, bbox_inches='tight')
 51 | 
 52 | 
 53 | def generate_plot(x, results, xlabel=' ', ylabel=' ', keys=None, fname=None, me=1, title=None):
 54 |     mlabdefaults()
 55 |     fig = pylab.figure()
 56 |     ax = fig.add_subplot(111)
 57 |     if keys is None:
 58 |         keys = results.keys()
 59 |     for i, m in enumerate(keys):
 60 |         y = results[m]
 61 |         ax.set_yscale('log')
 62 |         ax.plot(x[:len(y)], y,
 63 |                 color=k_colors[i],
 64 |                 linestyle="-",
 65 |                 marker=k_markers[i], markevery=me)
 66 |     ax.set_xlabel(xlabel)
 67 |     ax.set_ylabel(ylabel)
 68 |     # ax.set_xlim(1,100)
 69 |     # ax.set_ylim(0,1.0)
 70 |     # ax.set_yticks(np.arange(0.0,1.1,0.1))
 71 |     ax.legend(keys, loc=4)
 72 |     ax.yaxis.grid(color='gray', linestyle='dashed')
 73 |     ax.xaxis.grid(color='gray', linestyle='dashed')
 74 |     if title is not None:
 75 |         fig.suptitle(title)
 76 |     if fname is None:
 77 |         pylab.show()
 78 |     else:
 79 |         savefig(fname, fig)
 80 | 
 81 | figurename = lambda x: "./benchfigures/%s"%(x)
 82 | 
 83 | 
 84 | def rank_experiment():
 85 |     for name in method.keys():
 86 |         result[name] = []
 87 |     x = [3, 5, 7, 9, 12]
 88 |     for rank in x:
 89 |         print "Rank %d"%(rank)
 90 |         mat, A, E = synthesized_data(rank=rank, dim=30,
 91 |                                      n_sample=1000, sparse_ratio=.05, sparse_mag=10)
 92 |         for name in method.keys():
 93 |             print "\t %s"%(name)
 94 |             time0 = time()
 95 |             m = method[name]
 96 |             A_, E_ = m(mat.T, lmbda=.1, verbose=0)
 97 |             escaped = time() - time0
 98 |             print "\t escaped %f"%(escaped)
 99 |             result[name].append(escaped)
100 |     cP.dump({'x': x, 'results': result}, open(figurename(
101 |         "rank_exp.pk"), 'w'), protocol=-1)
102 |     generate_plot(x, results=result, xlabel='Rank', ylabel='Time',
103 |                   keys=None, fname=figurename("rank_exp"))
104 | 
105 | 
106 | def sparse_ratio_experiment():
107 |     for name in method.keys():
108 |         result[name] = []
109 |     x = [0.01, 0.05, 0.1, 0.15, 0.2]
110 |     for sr in x:
111 |         print "sparse ratio %f"%(sr)
112 |         mat, A, E = synthesized_data(rank=5, dim=30,
113 |                                      n_sample=1000, sparse_ratio=sr, sparse_mag=10)
114 |         for name in method.keys():
115 |             print "\t %s"%(name)
116 |             time0 = time()
117 |             m = method[name]
118 |             A_, E_ = m(mat.T, lmbda=.1, verbose=0)
119 |             escaped = time() - time0
120 |             print "\t escaped %f"%(escaped)
121 |             result[name].append(escaped)
122 |     cP.dump({'x': x, 'results': result}, open(figurename(
123 |         "sparse_ratio_exp.pk"), 'w'), protocol=-1)
124 |     generate_plot(x, results=result, xlabel='Sparsity Ratio',
125 |                   ylabel='Time', keys=None, fname=figurename("sparse_ratio_exp"))
126 | 
127 | 
128 | def sample_number_experiment():
129 |     for name in method.keys():
130 |         result[name] = []
131 |     x = [300, 500, 800, 1000, 1500]
132 |     for sr in x:
133 |         print "number of samples %d"%(sr)
134 |         mat, A, E = synthesized_data(rank=5, dim=30,
135 |                                      n_sample=sr, sparse_ratio=0.05, sparse_mag=10)
136 |         for name in method.keys():
137 |             print "\t %s"%(name)
138 |             time0 = time()
139 |             m = method[name]
140 |             A_, E_ = m(mat.T, lmbda=.1, verbose=0)
141 |             escaped = time() - time0
142 |             print "\t escaped %f"%(escaped)
143 |             result[name].append(escaped)
144 |     cP.dump({'x': x, 'results': result}, open(figurename(
145 |         "data_number_exp.pk"), 'w'), protocol=-1)
146 |     generate_plot(x, results=result, xlabel='Number of Training Samples',
147 |                   ylabel='Time', keys=None, fname=figurename("data_number_exp"))
148 | 
149 | 
150 | def convergency_experiment():
151 |     result = {}
152 |     for name in method.keys():
153 |         result[name] = []
154 |     maxiter = 100
155 |     for sparse_ratio in [0.05, 0.1, 0.2]:
156 |         for rank in [5, 10, 20]:
157 |             cachename = figurename("iteration_exp_rank%d_sparse%0.2f.pk"%(rank, sparse_ratio));
158 |             if not os.path.exists(cachename):
159 |                 mat, A, E = synthesized_data(rank=rank, dim=30,
160 |                                              n_sample=1000, sparse_ratio=sparse_ratio, sparse_mag=10)
161 |                 for name in method.keys():
162 |                     print "\t %s"%(name)
163 |                     time0 = time()
164 |                     m = method[name]
165 |                     A_, E_, obj = m(
166 |                         mat.T, lmbda=.1, verbose=2, maxiter=maxiter)
167 |                     escaped = time() - time0
168 |                     print "\t escaped %f"%(escaped)
169 |                     result[name] = obj
170 |                 result['Optimal'] = np.ones(maxiter) * rp._monitor(A, E, A+E)
171 |                 cP.dump({'x': np.arange(maxiter), 'results': result}, open(cachename, 'w'), protocol=-1)
172 |             else:
173 |                 tmp = cP.load(open(cachename, 'rb'))
174 |                 result = tmp['results']
175 |             generate_plot(np.arange(maxiter), results=result, xlabel='Iteration', ylabel='Objective Value', keys=None, fname=figurename("iteration_exp_rank%d_sparse%0.2f.eps"%(rank, sparse_ratio)), me=3, title="Rank = %d, Sparse Error Ratio = %0.2f"%(rank, sparse_ratio))
176 | 
177 | 
178 | if __name__ == "__main__":
179 |     rank_experiment()
180 |     sparse_ratio_experiment()
181 |     sample_number_experiment()
182 |     convergency_experiment()
183 | 


--------------------------------------------------------------------------------
/pypropack:
--------------------------------------------------------------------------------
1 | lib/pypropack/pypropack/


--------------------------------------------------------------------------------
/robustpca.py:
--------------------------------------------------------------------------------
  1 | from sklearn.base import TransformerMixin, BaseEstimator
  2 | import numpy as np
  3 | import scipy.sparse as sp
  4 | 
  5 | try:
  6 |     from pypropack import svdp
  7 |     raise ValueError
  8 |     svd = lambda X, k: svdp(X, k, 'L', kmax=max(100, 10 * k))
  9 |     import warnings
 10 |     with warnings.catch_warnings():
 11 |         warnings.simplefilter("ignore")
 12 | except:
 13 |     from scipy.linalg import svd as svd_
 14 | 
 15 |     def svd(X, k=-1):
 16 |         U, S, V = svd_(X, full_matrices=False)
 17 |         if k < 0:
 18 |             return U, S, V
 19 |         else:
 20 |             return U[:, :k], S[:k], V[:k, :]
 21 | 
 22 | # The problem solved is
 23 | #                   min  : tau * (|A|_* + \lmbda |E|_1) + .5 * |(A,E)|_F^2
 24 | #              subject to: A + E = D
 25 | 
 26 | 
 27 | def _monitor(A, E, D, lmbda=0.1):
 28 |     diags = svd(A, min(A.shape))[1]
 29 |     print "|A|_*", np.abs(diags).sum()
 30 |     print "|A|_0", (np.abs(diags) > 1e-6).sum()
 31 |     print "|E|_1", np.abs(D - A).sum()
 32 |     print "|D-A-E|_F", _fro(D - A - E)
 33 |     return np.abs(diags).sum() + lmbda * np.abs(D - A).sum()
 34 | 
 35 | 
 36 | def _pos(A):
 37 |     return A * (A > 0)
 38 | 
 39 | 
 40 | def _fro(A):
 41 |     return np.sqrt((A * A).sum())
 42 | 
 43 | 
 44 | def singular_value_thresholding(D, maxiter=25000, lmbda=1.0, tau=1e4, delta=.9, verbose=2):
 45 |     """
 46 |     Singular Value Thresholding
 47 |     """
 48 |     # initialization
 49 |     _matshape = D.shape
 50 |     primal_tol = 1e-5
 51 |     Y = np.zeros(shape=_matshape)
 52 |     A = np.zeros(shape=_matshape)
 53 |     E = np.zeros(shape=_matshape)
 54 |     rankA = 0
 55 |     obj = []
 56 |     for iter in range(maxiter):
 57 |         U, S, V = svd(Y, rankA+1)
 58 |         A = np.dot(np.dot(U, np.diag(_pos(S - tau))), V)
 59 |         E = np.sign(Y) * _pos(np.abs(Y) - lmbda * tau)
 60 |         M = D - A - E
 61 |         rankA = (S > tau).sum()
 62 |         Y = Y + delta * M
 63 |         if verbose >= 2:
 64 |             obj.append(_monitor(A, E, D))
 65 |         if _fro(D-A-E)/_fro(D) < primal_tol:
 66 |             if verbose >= 2:
 67 |                 print "Converged at iter %d"%iter
 68 |             break
 69 |     if verbose >= 2:
 70 |         return A, E, obj
 71 |     else:
 72 |         return A, E
 73 | 
 74 | 
 75 | def accelerate_proximal_gradient(D, lmbda, maxiter=25000, tol=1e-7,
 76 |                                  continuation=True,
 77 |                                  eta=.9, mu=1e-3, verbose=2):
 78 |     """
 79 |     Accelerated Proximal Gradient (Partial SVD Version)
 80 |     """
 81 |     obj = []
 82 |     m, n = D.shape
 83 | 
 84 |     t_k = 1.
 85 |     tk_old = 1.
 86 |     tau_0 = 2.
 87 | 
 88 |     A_old = np.zeros(D.shape)
 89 |     E_old = np.zeros(D.shape)
 90 |     A = np.zeros(D.shape)
 91 |     E = np.zeros(D.shape)
 92 | 
 93 |     # This comes from the code
 94 |     if continuation:
 95 |         mu_0 = svd(D, 1)[1]
 96 |         mu_k = mu_0;
 97 |         mu_bar = 1e-9 * mu_0
 98 |     else:
 99 |         mu_k = mu;
100 | 
101 |     tau_k = tau_0;
102 |     converged = False;
103 |     sv = 5.;
104 | 
105 |     for iter in range(maxiter):
106 |         YA = A + ((tk_old - 1)/t_k)*(A-A_old);
107 |         YE = E + ((tk_old - 1)/t_k)*(E-E_old);
108 | 
109 |         A_old = YA - (1/tau_k)*(YA+YE-D);
110 |         E_old = YE - (1/tau_k)*(YA+YE-D);
111 | 
112 |         U, S, V = svd(A_old);
113 | 
114 |         svp = (S > mu_k/tau_k).sum();
115 |         # this line to update the number of singular values comes from the code
116 |         if svp < sv:
117 |             sv = min(svp + 1, n);
118 |         else:
119 |             sv = min(svp + round(0.05*n), n);
120 | 
121 |         A_new = np.dot(
122 |             np.dot(U[:, :svp], np.diag(S[:svp] - mu_k/tau_k)), V[:svp, :])
123 |         E_new = np.sign(E_old) * _pos(np.abs(E_old) - lmbda * mu_k / tau_k);
124 | 
125 |         t_kp1 = 0.5*(1+np.sqrt(1+4*t_k*t_k));
126 | 
127 |         A_old = A_new + E_new - YA - YE;
128 |         YA = tau_k*(YA-A_new) + A_old;
129 |         YE = tau_k*(YE-E_new) + A_old;
130 | 
131 |         s1 = np.sqrt((YA**2).sum()+(YE**2).sum())
132 |         s2 = np.sqrt((A_new**2).sum()+(E_new**2).sum());
133 | 
134 |         if s1 / (tau_k*max(1, s2)) <= tol and iter > 10:
135 |             break;
136 | 
137 |         if continuation:
138 |             mu_k = max(0.9*mu_k, mu_bar);
139 | 
140 |         tk_old = t_k;
141 |         t_k = t_kp1;
142 |         A_old = A;
143 |         E_old = E;
144 |         A = A_new;
145 |         E = E_new;
146 | 
147 |         if verbose >= 2:
148 |             obj.append(_monitor(A, E, D))
149 |         if (not converged) and iter >= maxiter:
150 |             print 'Maximum iterations reached'
151 |             converged = True;
152 |     if verbose >= 2:
153 |         return A, E, obj
154 |     else:
155 |         return A, E
156 | 
157 | 
158 | def dual_method():
159 |     """
160 |     Dual Method
161 |     """
162 | 
163 |     pass
164 | 
165 | 
166 | def augmented_largrange_multiplier(D, lmbda, tol=1e-7, maxiter=25000, verbose=2, inexact=True):
167 |     """
168 |     Augmented Lagrange Multiplier
169 |     """
170 |     obj = []
171 |     Y = np.sign(D)
172 |     norm_two = svd(Y, 1)[1]
173 |     norm_inf = np.abs(Y).max() / lmbda
174 |     dual_norm = np.max([norm_two, norm_inf])
175 |     Y = Y / dual_norm
176 |     A = np.zeros(Y.shape)
177 |     E = np.zeros(Y.shape)
178 |     dnorm = _fro(D)
179 |     tol_primal = 1e-6 * dnorm
180 |     total_svd = 0
181 |     mu = .5/norm_two
182 |     rho = 6
183 | 
184 |     sv = 5
185 |     svp = sv
186 | 
187 |     n = Y.shape[0]
188 | 
189 |     for iter in range(maxiter):
190 |         primal_converged = False
191 |         sv = sv + np.round(n * 0.1)
192 |         primal_iter = 0
193 |         while not primal_converged:
194 |             Eraw = D - A + (1/mu) * Y
195 |             Eupdate = np.maximum(
196 |                 Eraw - lmbda/mu, 0) + np.minimum(Eraw + lmbda / mu, 0)
197 |             U, S, V = svd(D - Eupdate + (1 / mu) * Y, sv)
198 |             svp = (S > 1/mu).sum()
199 |             if svp < sv:
200 |                 sv = np.min([svp + 1, n])
201 |             else:
202 |                 sv = np.min([svp + round(.05 * n), n])
203 |             Aupdate = np.dot(
204 |                 np.dot(U[:, :svp], np.diag(S[:svp] - 1/mu)), V[:svp, :])
205 |             if primal_iter % 10 == 0 and verbose >= 2:
206 |                 print _fro(A - Aupdate)
207 |             if (_fro(A - Aupdate) < tol_primal and _fro(E - Eupdate) < tol_primal) or (inexact and primal_iter > 5):
208 |                 primal_converged = True
209 |                 if verbose >= 2:
210 |                     print "Primal Converged at Iter %d"%(primal_iter)
211 |             A = Aupdate
212 |             E = Eupdate
213 |             primal_iter = primal_iter + 1
214 |             total_svd = total_svd + 1
215 |         Z = D - A - E
216 |         Y = Y + mu * Z
217 |         mu = rho * mu
218 |         if np.sqrt((Z**2).sum()) / dnorm < tol:
219 |             if verbose >= 2:
220 |                 print "Converged at Iter %d" % (iter)
221 |             break
222 |         else:
223 |             if verbose >= 2:
224 |                 obj.append(_monitor(A, E, D))
225 |     if verbose >= 2:
226 |         return A, E, obj
227 |     else:
228 |         return A, E
229 | 
230 | 
231 | def alternating_direction_method_of_multipliers(D, lmbda, rho=1., maxiter=25000, verbose=2, tol=1e-2):
232 |     def soft_thresh(X, sigma):
233 |         return np.maximum(X - sigma, 0) - np.maximum(-X - sigma, 0);
234 |     obj = []
235 |     m, n = D.shape
236 |     A = D;
237 |     E = D - A;
238 |     W = np.ones(D.shape)/rho;
239 |     rhoupdate = rho;
240 |     for k in range(maxiter):
241 |         U, S, V = svd(D-E-W);
242 |         Aupdate = np.dot(np.dot(U, np.diag(soft_thresh(S, 1/rho))), V);
243 |         Eupdate = soft_thresh(D-Aupdate-W, lmbda/rho);
244 |         Wupdate = W + (Aupdate + Eupdate - D);
245 |         primal_resid = _fro(Aupdate + Eupdate - D)
246 |         dual_resid = rho*_fro(Eupdate - E)
247 |         # this is from the stanford slide
248 |         if primal_resid > 10*dual_resid:
249 |             rhoupdate = 2*rho;
250 |             Wupdate = Wupdate/2;
251 |         elif dual_resid > 10*primal_resid:
252 |             rhoupdate = rho/2;
253 |             Wupdate = 2*Wupdate;
254 |         else:
255 |             rhoupdate = rho
256 |         A = Aupdate;
257 |         E = Eupdate;
258 |         W = Wupdate;
259 |         rho = rhoupdate;
260 |         if primal_resid <= tol and dual_resid <= tol:
261 |             if verbose >= 2:
262 |                 print 'Converged to tol=%e in %d iterations\n'%(tol, k)
263 |             break;
264 |         if verbose >= 2:
265 |             obj.append(_monitor(A, E, D))
266 |     if verbose >= 2:
267 |         return A, E, obj
268 |     else:
269 |         return A, E
270 | 
271 | method = {"SVT": singular_value_thresholding,
272 |           "ALM": augmented_largrange_multiplier,
273 |           "ADMM": alternating_direction_method_of_multipliers,
274 |           "APG": accelerate_proximal_gradient}
275 | 
276 | 
277 | class RobustPCA(BaseEstimator, TransformerMixin):
278 |     """
279 |     Robust PCA
280 |     """
281 |     def __init__(self, alpha=.1, copy=True, method='svt'):
282 |         """
283 |         """
284 |         pass
285 | 
286 |     def tranform():
287 |         """
288 |         Tranform
289 |         """
290 |         pass
291 | 


--------------------------------------------------------------------------------
/test_robustpca.py:
--------------------------------------------------------------------------------
 1 | from robustpca import *
 2 | from numpy.random import randn
 3 | from numpy.random import shuffle
 4 | import numpy as np
 5 | from numpy.testing import assert_almost_equal
 6 | 
 7 | # initalize a basis of rank 10
 8 | Basis = randn(5, 100)
 9 | W = randn(300, 5)
10 | TrueMat = np.dot(W, Basis)
11 | # initalize a sparse matrix
12 | sparse_ratio = 0.05
13 | sparse_mag = 10
14 | E = randn(TrueMat.size) * sparse_mag
15 | idx = np.arange(E.size)
16 | shuffle(idx)
17 | E[idx[int(sparse_ratio * E.size):]] = 0
18 | E = E.reshape(TrueMat.shape)
19 | # calculate the observation
20 | Observed = TrueMat + E  # 1000 x 100
21 | 
22 | 
23 | def test_singular_value_thresholding():
24 |     return
25 |     A_, E_ = singular_value_thresholding(
26 |         Observed.T, lmbda=0.1, tau=1e3, maxiter=100, verbose=0)
27 |     assert (
28 |         np.sqrt(((A_ - TrueMat.T)**2).sum()) / np.sqrt((A_**2).sum())) < .01
29 |     assert (np.abs(E_ - E.T).sum() / np.abs(E).sum()) < .01
30 | 
31 | 
32 | def test_augmented_largrange_multiplier():
33 |     return
34 |     A_, E_ = augmented_largrange_multiplier(
35 |         Observed.T, lmbda=.1, inexact=True, verbose=0)
36 |     assert_almost_equal(A_, TrueMat.T, decimal=2)
37 |     assert_almost_equal(E_, E.T, decimal=2)
38 |     A_, E_ = augmented_largrange_multiplier(
39 |         Observed.T, lmbda=.1, inexact=False, verbose=0)
40 |     assert_almost_equal(A_, TrueMat.T, decimal=2)
41 |     assert_almost_equal(E_, E.T, decimal=2)
42 | 
43 | 
44 | def test_accelerate_proximal_gradient():
45 |     return
46 |     A_, E_ = accelerate_proximal_gradient(Observed.T, lmbda=.1, verbose=0)
47 |     assert_almost_equal(A_, TrueMat.T, decimal=2)
48 |     assert_almost_equal(E_, E.T, decimal=2)
49 | 
50 | 
51 | def test_alternating_direction_method_of_multipliers():
52 |     A_, E_ = alternating_direction_method_of_multipliers(
53 |         Observed.T, lmbda=.1, verbose=0)
54 |     assert_almost_equal(A_, TrueMat.T, decimal=2)
55 |     assert_almost_equal(E_, E.T, decimal=2)
56 | 


--------------------------------------------------------------------------------
/topic_extraction.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from time import time
 3 | from sklearn.feature_extraction import text
 4 | from sklearn import decomposition
 5 | from sklearn import datasets
 6 | from robustpca import *
 7 | 
 8 | n_samples = 5000
 9 | n_features = 2000
10 | n_topics = 10
11 | n_top_words = 5
12 | 
13 | # Load the 20 newsgroups dataset and vectorize it using the most common word
14 | # frequency with TF-IDF weighting (without top 5% stop words)
15 | 
16 | t0 = time()
17 | print("Loading dataset and extracting TF-IDF features...")
18 | dataset = datasets.fetch_20newsgroups(shuffle=True, random_state=1)
19 | 
20 | vectorizer = text.CountVectorizer(max_df=0.95, max_features=n_features)
21 | counts = vectorizer.fit_transform(dataset.data[:n_samples])
22 | tfidf = text.TfidfTransformer().fit_transform(counts)
23 | print("done in %0.3fs." % (time() - t0))
24 | 
25 | # Fit the NMF model
26 | print("Fitting the NMF model on with n_samples=%d and n_features=%d..."
27 |       % (n_samples, n_features))
28 | # import pdb; pdb.set_trace()
29 | A, E = augmented_largrange_multiplier(np.array(tfidf.todense().T), lmbda=.1, maxiter=20, inexact=True)  # decomposition.NMF(n_components=n_topics).fit(tfidf)
30 | print("done in %0.3fs." % (time() - t0))
31 | # Inverse the vectorizer vocabulary to be able
32 | feature_names = vectorizer.get_feature_names()
33 | 
34 | original_text = open("original.txt", "w")
35 | subtract_text = open("keywords.txt", "w")
36 | 
37 | for topic_idx, topic in enumerate(np.abs(E.T)):
38 |     print("Topic #%d:" % topic_idx)
39 |     subtract_text.write(" ".join([feature_names[i]
40 |                                   for i in topic.argsort()[:-n_top_words - 1:-1] if topic[i] != 0]))
41 |     original_text.write(" ".join([feature_names[i]
42 |                                   for i in xrange(n_features) if tfidf[topic_idx, i] != 0]))
43 |     subtract_text.write("\n")
44 |     original_text.write("\n")
45 |     print()
46 | 


--------------------------------------------------------------------------------