├── .gitignore ├── .gitmodules ├── README.md ├── background_subtraction.py ├── background_subtraction_visualize.py ├── plot_benchmark.py ├── pypropack ├── robustpca.py ├── test_robustpca.py └── topic_extraction.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "lib/pypropack"] 2 | path = lib/pypropack 3 | url = git://github.com/jakevdp/pypropack.git 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | PyRPCA 2 | ====== 3 | 4 | Robust PCA in Python. Methods are from the http://perception.csl.illinois.edu/matrix-rank/sample_code.html and papers therein. 5 | 6 | Requirement 7 | =========== 8 | * scipy 9 | * numpy 10 | * pypropack(optional) 11 | * scikit-learn 12 | * nosetest 13 | 14 | Scripts 15 | ======= 16 | * `test_robustpca.py` test whether the algorithms included can recovery the synthetic data successfully. Use `nosetest test_robustpca.py` 17 | * `plot_benchmark.py` plot the benchmarks with synthetic data generated with different parameters. Use `python2 plot_benchmark.py` 18 | * `background_subtraction.py` generate the result using the escalator dataset. Use `python2 background_subtraction.py`. This will generate the `.mat` files with respect to each algorithms and can be directly readable from matlab. Furthermore, `background_subtraction_visualize.py` could be used to generate a video. The temporary image files are located in `/tmp/robust_pca_tmp/` which should be created first. 19 | * `topic_extraction.py` extracts the keywords from the 20newsgroup dataset. It will generate two files, one is `origin.txt` and another is `keyword.txt`. The keyword and the original text on the same line is one-one mapped. 20 | 21 | Aknowledgement 22 | ============== 23 | Special thanks for the following two resources and their authors. 24 | * http://perception.csl.illinois.edu/matrix-rank/sample_code.html 25 | * www.stanford.edu/~peleato/math301_slides.pdf‎ 26 | -------------------------------------------------------------------------------- /background_subtraction.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | from scipy.io import loadmat, savemat 3 | from scipy.misc import imresize 4 | import os 5 | import numpy as np 6 | from robustpca import * 7 | from numpy.linalg import svd 8 | ### urllib 9 | 10 | videoFname = "escalator_data.mat" 11 | 12 | 13 | def download_video_clip(): 14 | webFile = urllib2.urlopen( 15 | r'http://cvxr.com/tfocs/demos/rpca/escalator_data.mat') 16 | filename = './' + videoFname 17 | localFile = open(filename, 'w') 18 | localFile.write(webFile.read()) 19 | localFile.close() 20 | 21 | if __name__ == "__main__": 22 | if not os.path.exists('./' + videoFname): 23 | print 'Video file not found, downloading' 24 | download_video_clip() 25 | X = loadmat(videoFname)['X'].astype(np.double)/255. 26 | nclip = X.shape[1] 27 | # lmbda = .01 28 | lmbda = .01 29 | # lmbdas = {'APG':.01, 30 | # 'ALM':.01, 31 | # 'ADMM':.01, 32 | # 'SVT':.01} 33 | # save a basic value 34 | ################################### Mean ################################ 35 | Xmean = X.mean(axis=1) 36 | A = np.tile(Xmean, (nclip, 1)).T 37 | E = X - A 38 | A = A.reshape(160, 130, X.shape[1]).swapaxes(0, 1) * 255. 39 | E = E.reshape(160, 130, X.shape[1]).swapaxes(0, 1) * 255. 40 | savemat("./MEAN_background_subtraction.mat", {"A": A, "E": E}) 41 | ################################## SVD ################################## 42 | U, S, V = svd(X, full_matrices=False) 43 | r = 20 44 | A = np.dot(np.dot(U[:, :r], np.diag(S[:r])), V[:r, :]) 45 | E = X - A 46 | A = A.reshape(160, 130, X.shape[1]).swapaxes(0, 1) * 255. 47 | E = E.reshape(160, 130, X.shape[1]).swapaxes(0, 1) * 255. 48 | savemat("./PCA_background_subtraction.mat", {"A": A, "E": E}) 49 | ################################# Robust PCA ############################ 50 | for fname in method.keys(): 51 | m = method[fname] 52 | A, E = m(X, lmbda=lmbda, maxiter=100, verbose=0) 53 | A = A.reshape(160, 130, X.shape[1]).swapaxes(0, 1) * 255. 54 | E = E.reshape(160, 130, X.shape[1]).swapaxes(0, 1) * 255. 55 | savemat("./%s_background_subtraction.mat"%(fname), {"A": A, "E": E}) 56 | -------------------------------------------------------------------------------- /background_subtraction_visualize.py: -------------------------------------------------------------------------------- 1 | ## making a movie 2 | 3 | 4 | import os 5 | import sys 6 | import matplotlib.pyplot as plt 7 | from scipy.io import loadmat 8 | import numpy as np 9 | from matplotlib import cm 10 | import matplotlib 11 | from robustpca import method 12 | from plot_benchmark import mlabdefaults 13 | 14 | if __name__ == "__main__": 15 | mlabdefaults() 16 | matplotlib.rcParams['savefig.dpi'] = 200 17 | files = [] 18 | cache_path = '/tmp/robust_pca_tmp' 19 | if not os.path.exists(cache_path): 20 | os.mkdir(cache_path) 21 | all_methods = method.keys() 22 | all_methods.append('MEAN') 23 | all_methods.append('PCA') 24 | for fname in all_methods: 25 | if not os.path.exists('%s/%s_tmp'%(cache_path, fname)): 26 | os.mkdir("%s/%s_tmp"%(cache_path, fname)) 27 | mat = loadmat('./%s_background_subtraction.mat'%(fname)) 28 | org = loadmat('./escalator_data.mat')['X'].reshape( 29 | 160, 130, 200).swapaxes(0, 1) 30 | fig = plt.figure() 31 | ax = fig.add_subplot(111) 32 | for i in range(200): # 50 frames 33 | ax.cla() 34 | ax.axis("off") 35 | ax.imshow(np.hstack((mat['A'][:, :, i], 36 | mat['E'][:, :, i], org[:, :, i])), cm.gray) 37 | fname_ = '%s/%s_tmp/_tmp%03d.png'%(cache_path, fname, i) 38 | print 'Saving frame', fname_ 39 | fig.tight_layout() 40 | fig.savefig(fname_, bbox_inches="tight") 41 | files.append(fname_) 42 | print 'Making movie animation.mpg - this make take a while' 43 | os.system("mencoder 'mf://%s/%s_tmp/_tmp*.png' -mf type=png:fps=10 -ovc lavc -lavcopts vcodec=wmv2 -oac copy -o %s_animation.mpg"%(cache_path, fname, fname)) 44 | -------------------------------------------------------------------------------- /plot_benchmark.py: -------------------------------------------------------------------------------- 1 | from robustpca import * 2 | import robustpca as rp 3 | from time import time 4 | from numpy.random import randn 5 | from numpy.random import shuffle 6 | import numpy as np 7 | import matplotlib 8 | import matplotlib.cm as cm 9 | import cPickle as cP 10 | k_colors = ["r", "b", "y", "m", "c", "g", "#FFA500", "k"]; 11 | k_markers = "o*dxs^vD"; 12 | from pylab import gcf 13 | import pylab 14 | import os 15 | 16 | def mlabdefaults(): 17 | matplotlib.rcParams['lines.linewidth'] = 1.5 18 | matplotlib.rcParams['savefig.dpi'] = 300 19 | matplotlib.rcParams['font.size'] = 22. 20 | matplotlib.rcParams['font.family'] = "Times New Roman" 21 | matplotlib.rcParams['legend.fontsize'] = "small" 22 | matplotlib.rcParams['legend.fancybox'] = True 23 | matplotlib.rcParams['lines.markersize'] = 10 24 | matplotlib.rcParams['figure.figsize'] = 8, 5.6 25 | matplotlib.rcParams['legend.labelspacing'] = 0.1 26 | matplotlib.rcParams['legend.borderpad'] = 0.1 27 | matplotlib.rcParams['legend.borderaxespad'] = 0.2 28 | matplotlib.rcParams['font.monospace'] = "Courier New" 29 | 30 | 31 | def synthesized_data(rank, dim, n_sample, sparse_ratio, sparse_mag): 32 | Basis = randn(rank, dim) 33 | W = randn(n_sample, rank) 34 | TrueMat = np.dot(W, Basis) 35 | # initalize a sparse matrix 36 | E = randn(TrueMat.size) * sparse_mag 37 | idx = np.arange(E.size) 38 | shuffle(idx) 39 | E[idx[int(sparse_ratio * E.size):]] = 0 40 | E = E.reshape(TrueMat.shape) 41 | # calculate the observation 42 | Observed = TrueMat + E # 1000 x 100 43 | return Observed, TrueMat, E 44 | 45 | 46 | def savefig(filename, fig=None): 47 | if fig is None: 48 | gcf().savefig(filename, bbox_inches='tight') 49 | else: 50 | fig.savefig(filename, bbox_inches='tight') 51 | 52 | 53 | def generate_plot(x, results, xlabel=' ', ylabel=' ', keys=None, fname=None, me=1, title=None): 54 | mlabdefaults() 55 | fig = pylab.figure() 56 | ax = fig.add_subplot(111) 57 | if keys is None: 58 | keys = results.keys() 59 | for i, m in enumerate(keys): 60 | y = results[m] 61 | ax.set_yscale('log') 62 | ax.plot(x[:len(y)], y, 63 | color=k_colors[i], 64 | linestyle="-", 65 | marker=k_markers[i], markevery=me) 66 | ax.set_xlabel(xlabel) 67 | ax.set_ylabel(ylabel) 68 | # ax.set_xlim(1,100) 69 | # ax.set_ylim(0,1.0) 70 | # ax.set_yticks(np.arange(0.0,1.1,0.1)) 71 | ax.legend(keys, loc=4) 72 | ax.yaxis.grid(color='gray', linestyle='dashed') 73 | ax.xaxis.grid(color='gray', linestyle='dashed') 74 | if title is not None: 75 | fig.suptitle(title) 76 | if fname is None: 77 | pylab.show() 78 | else: 79 | savefig(fname, fig) 80 | 81 | figurename = lambda x: "./benchfigures/%s"%(x) 82 | 83 | 84 | def rank_experiment(): 85 | for name in method.keys(): 86 | result[name] = [] 87 | x = [3, 5, 7, 9, 12] 88 | for rank in x: 89 | print "Rank %d"%(rank) 90 | mat, A, E = synthesized_data(rank=rank, dim=30, 91 | n_sample=1000, sparse_ratio=.05, sparse_mag=10) 92 | for name in method.keys(): 93 | print "\t %s"%(name) 94 | time0 = time() 95 | m = method[name] 96 | A_, E_ = m(mat.T, lmbda=.1, verbose=0) 97 | escaped = time() - time0 98 | print "\t escaped %f"%(escaped) 99 | result[name].append(escaped) 100 | cP.dump({'x': x, 'results': result}, open(figurename( 101 | "rank_exp.pk"), 'w'), protocol=-1) 102 | generate_plot(x, results=result, xlabel='Rank', ylabel='Time', 103 | keys=None, fname=figurename("rank_exp")) 104 | 105 | 106 | def sparse_ratio_experiment(): 107 | for name in method.keys(): 108 | result[name] = [] 109 | x = [0.01, 0.05, 0.1, 0.15, 0.2] 110 | for sr in x: 111 | print "sparse ratio %f"%(sr) 112 | mat, A, E = synthesized_data(rank=5, dim=30, 113 | n_sample=1000, sparse_ratio=sr, sparse_mag=10) 114 | for name in method.keys(): 115 | print "\t %s"%(name) 116 | time0 = time() 117 | m = method[name] 118 | A_, E_ = m(mat.T, lmbda=.1, verbose=0) 119 | escaped = time() - time0 120 | print "\t escaped %f"%(escaped) 121 | result[name].append(escaped) 122 | cP.dump({'x': x, 'results': result}, open(figurename( 123 | "sparse_ratio_exp.pk"), 'w'), protocol=-1) 124 | generate_plot(x, results=result, xlabel='Sparsity Ratio', 125 | ylabel='Time', keys=None, fname=figurename("sparse_ratio_exp")) 126 | 127 | 128 | def sample_number_experiment(): 129 | for name in method.keys(): 130 | result[name] = [] 131 | x = [300, 500, 800, 1000, 1500] 132 | for sr in x: 133 | print "number of samples %d"%(sr) 134 | mat, A, E = synthesized_data(rank=5, dim=30, 135 | n_sample=sr, sparse_ratio=0.05, sparse_mag=10) 136 | for name in method.keys(): 137 | print "\t %s"%(name) 138 | time0 = time() 139 | m = method[name] 140 | A_, E_ = m(mat.T, lmbda=.1, verbose=0) 141 | escaped = time() - time0 142 | print "\t escaped %f"%(escaped) 143 | result[name].append(escaped) 144 | cP.dump({'x': x, 'results': result}, open(figurename( 145 | "data_number_exp.pk"), 'w'), protocol=-1) 146 | generate_plot(x, results=result, xlabel='Number of Training Samples', 147 | ylabel='Time', keys=None, fname=figurename("data_number_exp")) 148 | 149 | 150 | def convergency_experiment(): 151 | result = {} 152 | for name in method.keys(): 153 | result[name] = [] 154 | maxiter = 100 155 | for sparse_ratio in [0.05, 0.1, 0.2]: 156 | for rank in [5, 10, 20]: 157 | cachename = figurename("iteration_exp_rank%d_sparse%0.2f.pk"%(rank, sparse_ratio)); 158 | if not os.path.exists(cachename): 159 | mat, A, E = synthesized_data(rank=rank, dim=30, 160 | n_sample=1000, sparse_ratio=sparse_ratio, sparse_mag=10) 161 | for name in method.keys(): 162 | print "\t %s"%(name) 163 | time0 = time() 164 | m = method[name] 165 | A_, E_, obj = m( 166 | mat.T, lmbda=.1, verbose=2, maxiter=maxiter) 167 | escaped = time() - time0 168 | print "\t escaped %f"%(escaped) 169 | result[name] = obj 170 | result['Optimal'] = np.ones(maxiter) * rp._monitor(A, E, A+E) 171 | cP.dump({'x': np.arange(maxiter), 'results': result}, open(cachename, 'w'), protocol=-1) 172 | else: 173 | tmp = cP.load(open(cachename, 'rb')) 174 | result = tmp['results'] 175 | generate_plot(np.arange(maxiter), results=result, xlabel='Iteration', ylabel='Objective Value', keys=None, fname=figurename("iteration_exp_rank%d_sparse%0.2f.eps"%(rank, sparse_ratio)), me=3, title="Rank = %d, Sparse Error Ratio = %0.2f"%(rank, sparse_ratio)) 176 | 177 | 178 | if __name__ == "__main__": 179 | rank_experiment() 180 | sparse_ratio_experiment() 181 | sample_number_experiment() 182 | convergency_experiment() 183 | -------------------------------------------------------------------------------- /pypropack: -------------------------------------------------------------------------------- 1 | lib/pypropack/pypropack/ -------------------------------------------------------------------------------- /robustpca.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import TransformerMixin, BaseEstimator 2 | import numpy as np 3 | import scipy.sparse as sp 4 | 5 | try: 6 | from pypropack import svdp 7 | raise ValueError 8 | svd = lambda X, k: svdp(X, k, 'L', kmax=max(100, 10 * k)) 9 | import warnings 10 | with warnings.catch_warnings(): 11 | warnings.simplefilter("ignore") 12 | except: 13 | from scipy.linalg import svd as svd_ 14 | 15 | def svd(X, k=-1): 16 | U, S, V = svd_(X, full_matrices=False) 17 | if k < 0: 18 | return U, S, V 19 | else: 20 | return U[:, :k], S[:k], V[:k, :] 21 | 22 | # The problem solved is 23 | # min : tau * (|A|_* + \lmbda |E|_1) + .5 * |(A,E)|_F^2 24 | # subject to: A + E = D 25 | 26 | 27 | def _monitor(A, E, D, lmbda=0.1): 28 | diags = svd(A, min(A.shape))[1] 29 | print "|A|_*", np.abs(diags).sum() 30 | print "|A|_0", (np.abs(diags) > 1e-6).sum() 31 | print "|E|_1", np.abs(D - A).sum() 32 | print "|D-A-E|_F", _fro(D - A - E) 33 | return np.abs(diags).sum() + lmbda * np.abs(D - A).sum() 34 | 35 | 36 | def _pos(A): 37 | return A * (A > 0) 38 | 39 | 40 | def _fro(A): 41 | return np.sqrt((A * A).sum()) 42 | 43 | 44 | def singular_value_thresholding(D, maxiter=25000, lmbda=1.0, tau=1e4, delta=.9, verbose=2): 45 | """ 46 | Singular Value Thresholding 47 | """ 48 | # initialization 49 | _matshape = D.shape 50 | primal_tol = 1e-5 51 | Y = np.zeros(shape=_matshape) 52 | A = np.zeros(shape=_matshape) 53 | E = np.zeros(shape=_matshape) 54 | rankA = 0 55 | obj = [] 56 | for iter in range(maxiter): 57 | U, S, V = svd(Y, rankA+1) 58 | A = np.dot(np.dot(U, np.diag(_pos(S - tau))), V) 59 | E = np.sign(Y) * _pos(np.abs(Y) - lmbda * tau) 60 | M = D - A - E 61 | rankA = (S > tau).sum() 62 | Y = Y + delta * M 63 | if verbose >= 2: 64 | obj.append(_monitor(A, E, D)) 65 | if _fro(D-A-E)/_fro(D) < primal_tol: 66 | if verbose >= 2: 67 | print "Converged at iter %d"%iter 68 | break 69 | if verbose >= 2: 70 | return A, E, obj 71 | else: 72 | return A, E 73 | 74 | 75 | def accelerate_proximal_gradient(D, lmbda, maxiter=25000, tol=1e-7, 76 | continuation=True, 77 | eta=.9, mu=1e-3, verbose=2): 78 | """ 79 | Accelerated Proximal Gradient (Partial SVD Version) 80 | """ 81 | obj = [] 82 | m, n = D.shape 83 | 84 | t_k = 1. 85 | tk_old = 1. 86 | tau_0 = 2. 87 | 88 | A_old = np.zeros(D.shape) 89 | E_old = np.zeros(D.shape) 90 | A = np.zeros(D.shape) 91 | E = np.zeros(D.shape) 92 | 93 | # This comes from the code 94 | if continuation: 95 | mu_0 = svd(D, 1)[1] 96 | mu_k = mu_0; 97 | mu_bar = 1e-9 * mu_0 98 | else: 99 | mu_k = mu; 100 | 101 | tau_k = tau_0; 102 | converged = False; 103 | sv = 5.; 104 | 105 | for iter in range(maxiter): 106 | YA = A + ((tk_old - 1)/t_k)*(A-A_old); 107 | YE = E + ((tk_old - 1)/t_k)*(E-E_old); 108 | 109 | A_old = YA - (1/tau_k)*(YA+YE-D); 110 | E_old = YE - (1/tau_k)*(YA+YE-D); 111 | 112 | U, S, V = svd(A_old); 113 | 114 | svp = (S > mu_k/tau_k).sum(); 115 | # this line to update the number of singular values comes from the code 116 | if svp < sv: 117 | sv = min(svp + 1, n); 118 | else: 119 | sv = min(svp + round(0.05*n), n); 120 | 121 | A_new = np.dot( 122 | np.dot(U[:, :svp], np.diag(S[:svp] - mu_k/tau_k)), V[:svp, :]) 123 | E_new = np.sign(E_old) * _pos(np.abs(E_old) - lmbda * mu_k / tau_k); 124 | 125 | t_kp1 = 0.5*(1+np.sqrt(1+4*t_k*t_k)); 126 | 127 | A_old = A_new + E_new - YA - YE; 128 | YA = tau_k*(YA-A_new) + A_old; 129 | YE = tau_k*(YE-E_new) + A_old; 130 | 131 | s1 = np.sqrt((YA**2).sum()+(YE**2).sum()) 132 | s2 = np.sqrt((A_new**2).sum()+(E_new**2).sum()); 133 | 134 | if s1 / (tau_k*max(1, s2)) <= tol and iter > 10: 135 | break; 136 | 137 | if continuation: 138 | mu_k = max(0.9*mu_k, mu_bar); 139 | 140 | tk_old = t_k; 141 | t_k = t_kp1; 142 | A_old = A; 143 | E_old = E; 144 | A = A_new; 145 | E = E_new; 146 | 147 | if verbose >= 2: 148 | obj.append(_monitor(A, E, D)) 149 | if (not converged) and iter >= maxiter: 150 | print 'Maximum iterations reached' 151 | converged = True; 152 | if verbose >= 2: 153 | return A, E, obj 154 | else: 155 | return A, E 156 | 157 | 158 | def dual_method(): 159 | """ 160 | Dual Method 161 | """ 162 | 163 | pass 164 | 165 | 166 | def augmented_largrange_multiplier(D, lmbda, tol=1e-7, maxiter=25000, verbose=2, inexact=True): 167 | """ 168 | Augmented Lagrange Multiplier 169 | """ 170 | obj = [] 171 | Y = np.sign(D) 172 | norm_two = svd(Y, 1)[1] 173 | norm_inf = np.abs(Y).max() / lmbda 174 | dual_norm = np.max([norm_two, norm_inf]) 175 | Y = Y / dual_norm 176 | A = np.zeros(Y.shape) 177 | E = np.zeros(Y.shape) 178 | dnorm = _fro(D) 179 | tol_primal = 1e-6 * dnorm 180 | total_svd = 0 181 | mu = .5/norm_two 182 | rho = 6 183 | 184 | sv = 5 185 | svp = sv 186 | 187 | n = Y.shape[0] 188 | 189 | for iter in range(maxiter): 190 | primal_converged = False 191 | sv = sv + np.round(n * 0.1) 192 | primal_iter = 0 193 | while not primal_converged: 194 | Eraw = D - A + (1/mu) * Y 195 | Eupdate = np.maximum( 196 | Eraw - lmbda/mu, 0) + np.minimum(Eraw + lmbda / mu, 0) 197 | U, S, V = svd(D - Eupdate + (1 / mu) * Y, sv) 198 | svp = (S > 1/mu).sum() 199 | if svp < sv: 200 | sv = np.min([svp + 1, n]) 201 | else: 202 | sv = np.min([svp + round(.05 * n), n]) 203 | Aupdate = np.dot( 204 | np.dot(U[:, :svp], np.diag(S[:svp] - 1/mu)), V[:svp, :]) 205 | if primal_iter % 10 == 0 and verbose >= 2: 206 | print _fro(A - Aupdate) 207 | if (_fro(A - Aupdate) < tol_primal and _fro(E - Eupdate) < tol_primal) or (inexact and primal_iter > 5): 208 | primal_converged = True 209 | if verbose >= 2: 210 | print "Primal Converged at Iter %d"%(primal_iter) 211 | A = Aupdate 212 | E = Eupdate 213 | primal_iter = primal_iter + 1 214 | total_svd = total_svd + 1 215 | Z = D - A - E 216 | Y = Y + mu * Z 217 | mu = rho * mu 218 | if np.sqrt((Z**2).sum()) / dnorm < tol: 219 | if verbose >= 2: 220 | print "Converged at Iter %d" % (iter) 221 | break 222 | else: 223 | if verbose >= 2: 224 | obj.append(_monitor(A, E, D)) 225 | if verbose >= 2: 226 | return A, E, obj 227 | else: 228 | return A, E 229 | 230 | 231 | def alternating_direction_method_of_multipliers(D, lmbda, rho=1., maxiter=25000, verbose=2, tol=1e-2): 232 | def soft_thresh(X, sigma): 233 | return np.maximum(X - sigma, 0) - np.maximum(-X - sigma, 0); 234 | obj = [] 235 | m, n = D.shape 236 | A = D; 237 | E = D - A; 238 | W = np.ones(D.shape)/rho; 239 | rhoupdate = rho; 240 | for k in range(maxiter): 241 | U, S, V = svd(D-E-W); 242 | Aupdate = np.dot(np.dot(U, np.diag(soft_thresh(S, 1/rho))), V); 243 | Eupdate = soft_thresh(D-Aupdate-W, lmbda/rho); 244 | Wupdate = W + (Aupdate + Eupdate - D); 245 | primal_resid = _fro(Aupdate + Eupdate - D) 246 | dual_resid = rho*_fro(Eupdate - E) 247 | # this is from the stanford slide 248 | if primal_resid > 10*dual_resid: 249 | rhoupdate = 2*rho; 250 | Wupdate = Wupdate/2; 251 | elif dual_resid > 10*primal_resid: 252 | rhoupdate = rho/2; 253 | Wupdate = 2*Wupdate; 254 | else: 255 | rhoupdate = rho 256 | A = Aupdate; 257 | E = Eupdate; 258 | W = Wupdate; 259 | rho = rhoupdate; 260 | if primal_resid <= tol and dual_resid <= tol: 261 | if verbose >= 2: 262 | print 'Converged to tol=%e in %d iterations\n'%(tol, k) 263 | break; 264 | if verbose >= 2: 265 | obj.append(_monitor(A, E, D)) 266 | if verbose >= 2: 267 | return A, E, obj 268 | else: 269 | return A, E 270 | 271 | method = {"SVT": singular_value_thresholding, 272 | "ALM": augmented_largrange_multiplier, 273 | "ADMM": alternating_direction_method_of_multipliers, 274 | "APG": accelerate_proximal_gradient} 275 | 276 | 277 | class RobustPCA(BaseEstimator, TransformerMixin): 278 | """ 279 | Robust PCA 280 | """ 281 | def __init__(self, alpha=.1, copy=True, method='svt'): 282 | """ 283 | """ 284 | pass 285 | 286 | def tranform(): 287 | """ 288 | Tranform 289 | """ 290 | pass 291 | -------------------------------------------------------------------------------- /test_robustpca.py: -------------------------------------------------------------------------------- 1 | from robustpca import * 2 | from numpy.random import randn 3 | from numpy.random import shuffle 4 | import numpy as np 5 | from numpy.testing import assert_almost_equal 6 | 7 | # initalize a basis of rank 10 8 | Basis = randn(5, 100) 9 | W = randn(300, 5) 10 | TrueMat = np.dot(W, Basis) 11 | # initalize a sparse matrix 12 | sparse_ratio = 0.05 13 | sparse_mag = 10 14 | E = randn(TrueMat.size) * sparse_mag 15 | idx = np.arange(E.size) 16 | shuffle(idx) 17 | E[idx[int(sparse_ratio * E.size):]] = 0 18 | E = E.reshape(TrueMat.shape) 19 | # calculate the observation 20 | Observed = TrueMat + E # 1000 x 100 21 | 22 | 23 | def test_singular_value_thresholding(): 24 | return 25 | A_, E_ = singular_value_thresholding( 26 | Observed.T, lmbda=0.1, tau=1e3, maxiter=100, verbose=0) 27 | assert ( 28 | np.sqrt(((A_ - TrueMat.T)**2).sum()) / np.sqrt((A_**2).sum())) < .01 29 | assert (np.abs(E_ - E.T).sum() / np.abs(E).sum()) < .01 30 | 31 | 32 | def test_augmented_largrange_multiplier(): 33 | return 34 | A_, E_ = augmented_largrange_multiplier( 35 | Observed.T, lmbda=.1, inexact=True, verbose=0) 36 | assert_almost_equal(A_, TrueMat.T, decimal=2) 37 | assert_almost_equal(E_, E.T, decimal=2) 38 | A_, E_ = augmented_largrange_multiplier( 39 | Observed.T, lmbda=.1, inexact=False, verbose=0) 40 | assert_almost_equal(A_, TrueMat.T, decimal=2) 41 | assert_almost_equal(E_, E.T, decimal=2) 42 | 43 | 44 | def test_accelerate_proximal_gradient(): 45 | return 46 | A_, E_ = accelerate_proximal_gradient(Observed.T, lmbda=.1, verbose=0) 47 | assert_almost_equal(A_, TrueMat.T, decimal=2) 48 | assert_almost_equal(E_, E.T, decimal=2) 49 | 50 | 51 | def test_alternating_direction_method_of_multipliers(): 52 | A_, E_ = alternating_direction_method_of_multipliers( 53 | Observed.T, lmbda=.1, verbose=0) 54 | assert_almost_equal(A_, TrueMat.T, decimal=2) 55 | assert_almost_equal(E_, E.T, decimal=2) 56 | -------------------------------------------------------------------------------- /topic_extraction.py: -------------------------------------------------------------------------------- 1 | 2 | from time import time 3 | from sklearn.feature_extraction import text 4 | from sklearn import decomposition 5 | from sklearn import datasets 6 | from robustpca import * 7 | 8 | n_samples = 5000 9 | n_features = 2000 10 | n_topics = 10 11 | n_top_words = 5 12 | 13 | # Load the 20 newsgroups dataset and vectorize it using the most common word 14 | # frequency with TF-IDF weighting (without top 5% stop words) 15 | 16 | t0 = time() 17 | print("Loading dataset and extracting TF-IDF features...") 18 | dataset = datasets.fetch_20newsgroups(shuffle=True, random_state=1) 19 | 20 | vectorizer = text.CountVectorizer(max_df=0.95, max_features=n_features) 21 | counts = vectorizer.fit_transform(dataset.data[:n_samples]) 22 | tfidf = text.TfidfTransformer().fit_transform(counts) 23 | print("done in %0.3fs." % (time() - t0)) 24 | 25 | # Fit the NMF model 26 | print("Fitting the NMF model on with n_samples=%d and n_features=%d..." 27 | % (n_samples, n_features)) 28 | # import pdb; pdb.set_trace() 29 | A, E = augmented_largrange_multiplier(np.array(tfidf.todense().T), lmbda=.1, maxiter=20, inexact=True) # decomposition.NMF(n_components=n_topics).fit(tfidf) 30 | print("done in %0.3fs." % (time() - t0)) 31 | # Inverse the vectorizer vocabulary to be able 32 | feature_names = vectorizer.get_feature_names() 33 | 34 | original_text = open("original.txt", "w") 35 | subtract_text = open("keywords.txt", "w") 36 | 37 | for topic_idx, topic in enumerate(np.abs(E.T)): 38 | print("Topic #%d:" % topic_idx) 39 | subtract_text.write(" ".join([feature_names[i] 40 | for i in topic.argsort()[:-n_top_words - 1:-1] if topic[i] != 0])) 41 | original_text.write(" ".join([feature_names[i] 42 | for i in xrange(n_features) if tfidf[topic_idx, i] != 0])) 43 | subtract_text.write("\n") 44 | original_text.write("\n") 45 | print() 46 | --------------------------------------------------------------------------------