├── README.md
├── CompareGRQI.m
├── GPower.m
├── SSVD.m
└── GRQI.m

/README.md:
--------------------------------------------------------------------------------
Generalized Rayleigh Quotient Iteration
=======================================

MATLAB code for the paper:

V. Kuleshov, Fast algorithms for sparse principal component analysis
based on Rayleigh quotient iteration. Proceedings of the 30th International
Conference on Machine Learning, Atlanta, GA, 2013.

Send feedback to [Volodymyr Kuleshov](http://web.stanford.edu/~kuleshov/).

Contents
--------

`GRQI.m`: An implementation of Algorithm 2 in the ICML paper. Function `GRQI`
computes K sparse principal components using generalized Rayleigh quotient
iteration.

`GPower.m`: An implementation of Algorithms 3 and 4 in the ICML paper.
Function `GPower` computes K sparse principal components using the generalized
power method of Journee et al.

`SSVD.m`: An implementation of Algorithm 5 in the ICML paper. Function `SSVD`
computes a pair of sparse singular vectors.

`CompareGRQI.m`: Script that compares `GRQI` with `GPower` and generates a
series of plots.
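
Usage
-----

A minimal usage sketch (the random data and all parameter values below are
illustrative, not recommendations):

    % Build a symmetric input matrix from random data
    A = randn(200,100);
    X = A'*A;

    % GRQI(X, k, K, J, alpha, maxit, thr): 3 components with at most
    % 10 non-zero entries each
    [Q1, log1] = GRQI(X, 10, 3, 5, 1, 200, 1e-6);

    % GPower(X, rho, K, alpha, maxit, thr): 3 components with l1
    % penalty rho = 1.5
    [Q2, log2] = GPower(X, 1.5, 3, 1, 200, 1e-6);

See `CompareGRQI.m` for a complete example that also plots convergence
statistics.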
--------------------------------------------------------------------------------
/CompareGRQI.m:
--------------------------------------------------------------------------------
%%% This script compares GRQI vs. GPower on random matrices.

randn('seed',1)
n = 500;

A = randn(n,n);
A = A'*A;

[~, log0] = GRQI(A,44,1,Inf,0,150,1e-6);
[~, log1] = GPower(A,1.7,1,0,150,1e-6);

nm_errors = log0{1}.errors;
nm_variances = log0{1}.variances;
nm_sparsities = log0{1}.sparsities;

power_errors = log1{1}.errors;
power_variances = log1{1}.variances;
power_sparsities = log1{1}.sparsities;

subplot(3,2,1);
plot(nm_errors);
t = title('Convergence rate (GRQI)', 'FontWeight','bold');
set(t, 'FontSize', 11);
xlabel('Iterations')
ylabel('|| x - x_{prev} ||')
xlim([1,6])
ylim([0,0.8])

subplot(3,2,2);
plot(power_errors);
t = title('Convergence rate (GPower)', 'FontWeight','bold');
set(t, 'FontSize', 11);
xlabel('Iterations')
ylabel('|| x - x_{prev} ||')
xlim([0 150]);

subplot(3,2,3);
plot(nm_variances);
t = title('Variance (GRQI)', 'FontWeight','bold');
set(t, 'FontSize', 11);
xlabel('Iterations')
ylabel('Variance')
xlim([1,6])

subplot(3,2,4);
plot(power_variances);
t = title('Variance (GPower)', 'FontWeight','bold');
set(t, 'FontSize', 11);
xlabel('Iterations')
ylabel('Variance')
xlim([0 150]);

subplot(3,2,5);
plot(nm_sparsities);
t = title('Sparsity (GRQI)', 'FontWeight','bold');
set(t, 'FontSize', 11);
xlabel('Iterations')
ylabel('Number of non-zero entries')
xlim([1,6])

subplot(3,2,6);
plot(power_sparsities);
t = title('Sparsity (GPower)', 'FontWeight','bold');
set(t, 'FontSize', 11);
xlabel('Iterations')
ylabel('Number of non-zero entries')
xlim([0 150]);
ylim([35, 60]);
--------------------------------------------------------------------------------
/GPower.m:
--------------------------------------------------------------------------------
function [Q, full_log] = GPower(X,rho,K,alpha,maxit,thr)

% GPower Computes sparse principal components
% [Q, log] = GPower(X, rho, K, alpha, maxit, thr) computes K sparse
% principal components of the symmetric matrix X.
%
% INPUTS:
% X: Data matrix
% rho: Regularization parameter
% K: Number of sparse principal components to compute
% alpha: Deflation parameter
% maxit: Maximum number of iterations to take
% thr: Accuracy threshold
%
% OUTPUTS:
% Q: Matrix of sparse principal components
% log: Runtime stats
%
% Principal components are computed using a technique called the
% generalized power method. For more information on the method, see
%
% M. Journee, Y. Nesterov, P. Richtarik, R. Sepulchre, Generalized power
% method for sparse principal component analysis, arXiv:0811.4724v1, 2008.
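%
% Example (a hypothetical call; the data and parameter values are
% illustrative assumptions, not recommendations):
%
%   A = randn(200,100);                    % data matrix
%   X = A'*A;                              % covariance-type matrix
%   [Q, log] = GPower(X, 1.5, 3, 1, 200, 1e-6);
%   nnz(Q(:,1))                            % number of non-zero loadings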

n = size(X,1);
Q = zeros(n,K);
full_log = cell(K,1);

% This is necessary for comparison to GRQI. GPower assumes that the
% symmetric matrix X is factored as X = A'*A. In typical scenarios, X is
% going to be the covariance matrix and A will be the data matrix.
A = chol(X);
assert(all(all(abs(A'*A - X) < 1e-4)));

for i=1:K
    % Log statistics for this run:
    log = struct('errors', [], 'variances', [], 'sparsities', []);

    % First, initialize x_0 with the normalized column of X that has the
    % largest norm
    column_norms = sqrt(sum(X.^2,1));
    [~, col_i] = max(column_norms);
    x = X(:,col_i)/norm(X(:,col_i));

    err = 1; iter = 0;
    disp(i);

    while err > thr && iter < maxit
        oldx = x;

        % Gradient step for the l1-penalized objective (Algorithm 3):
        % soft-threshold the correlations A'*x, map back through A, and
        % renormalize.
        Ax = A'*x;
        tresh = sign(Ax).*max(abs(Ax)-rho,0);
        x = A*tresh;
        x = x/norm(x);

        variance = x'*X*x;

        % The l0-penalized variant (Algorithm 4) uses a hard threshold
        % instead:
        % grad = A*(((Ax).^2-rho>0).*Ax);
        % x = grad/norm(grad);

        err = norm(x-oldx);

        % Save run statistics
        log.errors = [log.errors err];
        log.variances = [log.variances variance];
        log.sparsities = [log.sparsities nnz(tresh)];

        iter = iter + 1;

        fprintf('%d \t %d: %f \t %f \t %d\n', i, iter, err, variance, ...
            nnz(tresh));
    end

    % Recover the sparse loading vector from the final iterate
    Ax = A'*x;
    z = sign(Ax).*max(abs(Ax)-rho,0);
    if any(z)
        z = z/norm(z);
    end
    x = z;

    % Save run statistics
    log.errors = [log.errors err];
    log.variances = [log.variances variance];
    log.sparsities = [log.sparsities nnz(tresh)];
    full_log{i} = log;

    fprintf('RESULTS:\n\tPrecision: %f\n\tVariance: %f\n\tSparsity: %d\n',...
        err, variance, nnz(x));

    Q(:,i) = x;

    % Perform partial deflation
    X = X - alpha*variance*x*x';
end
end
--------------------------------------------------------------------------------
/SSVD.m:
--------------------------------------------------------------------------------
function [u, v, d, iter] = SSVD(X,k_u,k_v,J,thr,maxit)

% SSVD Computes sparse singular vectors
% [u, v, d, iter] = SSVD(X, k_u, k_v, J, thr, maxit) computes a pair of
% sparse singular vectors of the rectangular matrix X, having at most k_u
% and k_v non-zero indices, respectively.
%
% INPUTS:
% X: Data matrix
% k_u: Maximum number of non-zero indices in u
% k_v: Maximum number of non-zero indices in v
% J: Number of power method steps to be taken
% maxit: Maximum number of iterations to take
% thr: Accuracy threshold
%
% OUTPUTS:
% u: Left singular vector
% v: Right singular vector
% d: Variance explained
% iter: Number of iterations taken
%
% Singular vectors are computed using a technique called generalized
% Rayleigh quotient iteration. At every iteration, the non-zero indices
% are updated using Rayleigh quotient iteration. For the first J
% iterations, every index is also updated using a step of the power
% method. Afterwards, the iterate is projected on the set defined by
% the sparsity constraints.
%
% In order to handle rectangular matrices, generalized Rayleigh quotient
% iteration is applied to the symmetric matrix Y = [0 X'; X 0]. However,
% Y is never explicitly formed. Instead, we perform inversions on
% submatrices of Y using the matrix inversion lemma.
%
% For more information on the method see the paper
%
% V. Kuleshov, Fast algorithms for sparse principal component analysis
% based on Rayleigh quotient iteration. Proceedings of the 30th
% International Conference on Machine Learning, Atlanta, GA, 2013.
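%
% Example (a hypothetical call; the data and parameter values are
% illustrative assumptions, not recommendations):
%
%   X = randn(150,100);                    % rectangular data matrix
%   [u, v, d, iter] = SSVD(X, 15, 10, 5, 1e-6, 200);
%   [nnz(u), nnz(v)]                       % at most [15, 10]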

% First, initialize u_0, v_0, mu

column_norms = sqrt(sum(X.^2,1));
[~, idx] = max(column_norms);
u = X(:,idx)/norm(X(:,idx));
u = l0_project(u,k_u);

row_norms = sqrt(sum(X.^2,2));
[~, idx] = max(row_norms);
v = (X(idx,:)/norm(X(idx,:)))';
v = l0_project(v,k_v);

mu = u'*X*v/(norm(u)*norm(v));

err = 1; iter = 0;

while err > thr && iter < maxit

    oldu = u;
    oldv = v;

    % Compute working sets

    Wo_u = find(u ~= 0);
    Wo_v = find(v ~= 0);

    % Perform a step of Rayleigh quotient iteration on the working set

    A = X(Wo_u,Wo_v);
    [m, ~] = size(A);
    u_Wo = u(Wo_u);
    v_Wo = v(Wo_v);

    % We now invert [0 A'; A 0] - mu*I using the matrix inversion
    % lemma:
    %
    % B = [-mu*eye(n) A'; A -mu*eye(m)];
    % Binv = [(1/mu^2)*A'*inv(S)*A - (1/mu)*eye(n)  (1/mu)*A'*inv(S);
    %         (1/mu)*inv(S)*A                       inv(S)          ]
    %
    % where S is the Schur complement:
    S = (A*A')/mu - mu*eye(m);

    % (1,1) block
    Av = A*v_Wo;
    SiAv = S \ Av;
    AtSiAv = A'*SiAv;
    v_part1 = AtSiAv / (mu^2) - v_Wo/mu;

    % (1,2) block
    Siu = S \ u_Wo;
    AtSiu = A'*Siu;
    v_part2 = AtSiu / mu;
    v(Wo_v) = v_part1 + v_part2;

    % (2,1) block
    u_part1 = SiAv / mu;

    % (2,2) block
    u_part2 = Siu;

    u(Wo_u) = u_part1 + u_part2;
    mu = u'*X*v/(norm(u)*norm(v));

    u = u / norm(u);
    v = v / norm(v);

    % Perform a step of the power method on all indices
    if iter < J
        u = X*v;
        v = X'*u;
    end

    % Project on the intersection of the l0 and l2 balls

    u = l0_project(u,k_u);
    v = l0_project(v,k_v);

    [~, n] = size(X);
    x = [v; u];
    x = x/norm(x);
    v = x(1:n);
    u = x(n+1:end);

    erru = norm(oldu-u,2);
    errv = norm(oldv-v,2);
    err = erru + errv;

    variance = u'*X*v/(norm(v)*norm(u));

    fprintf('%d: %f \t%f\n',iter,err,variance);

    iter = iter + 1;
end

d = variance;

fprintf('RESULTS:\n\tVariance: %f\n\tSparsity: %d, %d\n',...
    variance, nnz(u), nnz(v));

end

function x = l0_project(x, k)
    [~, idx] = sort(abs(x),'descend');
    idx_to_zero = idx(k+1:end);
    x(idx_to_zero) = 0;
    x = x / norm(x);
end
--------------------------------------------------------------------------------
/GRQI.m:
--------------------------------------------------------------------------------
function [Q, full_log] = GRQI(X,k,K,J,alpha,maxit,thr)

% GRQI Computes sparse principal components
% [Q, log] = GRQI(X, k, K, J, alpha, maxit, thr) computes K principal
% components of the symmetric matrix X, each having at most k non-zero
% indices.
%
% INPUTS:
% X: Data matrix
% k: Maximum number of non-zero indices
% K: Number of sparse principal components to compute
% J: Number of power method steps to be taken
% alpha: Deflation parameter
% maxit: Maximum number of iterations to take
% thr: Accuracy threshold
%
% OUTPUTS:
% Q: Matrix of sparse principal components
% log: Runtime stats
%
% Principal components are computed using a technique called generalized
% Rayleigh quotient iteration. At every iteration, the non-zero indices
% are updated using Rayleigh quotient iteration. For the first J
% iterations, every index is also updated using a step of the power
% method. Afterwards, the iterate is projected on the set defined by
% ||x||_0 <= k and ||x||_2 <= 1.
%
% After computing each principal component, the algorithm deflates the
% matrix X by removing a fraction alpha of the variance explained by the
% last component.
%
% For more information on the method see the paper
%
% V. Kuleshov, Fast algorithms for sparse principal component analysis
% based on Rayleigh quotient iteration. Proceedings of the 30th
% International Conference on Machine Learning, Atlanta, GA, 2013.
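%
% Example (a hypothetical call; the data and parameter values are
% illustrative assumptions, not recommendations):
%
%   A = randn(200,100);
%   X = A'*A;                              % symmetric input matrix
%   [Q, log] = GRQI(X, 10, 3, 5, 1, 200, 1e-6);
%   nnz(Q(:,1))                            % at most 10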

n = size(X,1);
Q = zeros(n,K);
full_log = cell(K,1);

for i=1:K
    % Log statistics for this run:
    log = struct('errors', [], 'variances', [], 'sparsities', []);

    % First, initialize x_0

    % We recommend taking the column of the input matrix with the largest
    % norm:
    column_norms = sqrt(sum(X.^2,1));
    [~, col_i] = max(column_norms);
    x = X(:,col_i);
    x = l0_project(x,k);
    mu = x'*X*x/(x'*x);

    % Another option is to initialize randomly, and use a mu that is close
    % to the largest eigenvalue. The value of mu can be guessed or
    % computed.
    % x = randn(n,1); x = x/norm(x);
    % mu = 1900;

    err = 1; iter = 0;
    disp(i);

    while err > thr && iter < maxit
        oldx = x;

        % Compute working set
        Wo = find(x~=0);

        % Perform a Rayleigh quotient iteration update across the working
        % set.
        A = X(Wo,Wo);

        % The code below is a more numerically stable way of performing
        % the update
        %     x(Wo) = (A-mu*eye(size(A))) \ x(Wo);
        % It performs a step of Newton's method on the KKT conditions of
        % the problem max x'*A*x s.t. 0.5*x'*x == 1. One can check using
        % the matrix inversion lemma that the two versions ultimately
        % produce the same update to x(Wo).

        G = -(A-mu*eye(size(A)));
        DF = [G, x(Wo); x(Wo)', 0];
        F = [G*x(Wo); 0.5*(x(Wo)'*x(Wo) - 1)];
        delta = DF \ -F;
        x(Wo) = x(Wo) + delta(1:end-1);

        % Update mu and renormalize
        mu = x(Wo)'*X(Wo,Wo)*x(Wo)/(x(Wo)'*x(Wo));
        x = x/norm(x);

        % Perform a power method step across all indices
        if (iter < J)
            x = X*x;
        end

        % Project on the intersection of the l2 and l0 balls
        x = l0_project(x,k);

        err = norm(oldx - x,2);
        variance = x'*X*x/(x'*x);

        iter = iter + 1;

        % Save run statistics
        log.errors = [log.errors err];
        log.variances = [log.variances variance];
        log.sparsities = [log.sparsities nnz(x)];

        % Print current state
        fprintf('%d \t %d: %f \t %f \t %d\n', i, iter, err, ...
            variance, nnz(x));
    end

    log.errors = [log.errors err];
    log.variances = [log.variances variance];
    log.sparsities = [log.sparsities nnz(x)];
    full_log{i} = log;

    fprintf('RESULTS:\n\tPrecision: %f\n\tVariance: %f\n\tSparsity: %d\n',...
        err, variance, nnz(x));

    Q(:,i) = x;

    % Perform partial deflation
    X = X - alpha*variance*x*x';

end
end

function x = l0_project(x, k)
    [~, idx] = sort(abs(x),'descend');
    idx_to_zero = idx(k+1:end);
    x(idx_to_zero) = 0;
    x = x / norm(x);
end
--------------------------------------------------------------------------------