├── README.md
├── CompareGRQI.m
├── GPower.m
├── SSVD.m
└── GRQI.m

/README.md:
--------------------------------------------------------------------------------
Generalized Rayleigh Quotient Iteration
=======================================

MATLAB code for the paper:

V. Kuleshov, Fast algorithms for sparse principal component analysis
based on Rayleigh quotient iteration. Proceedings of the 30th International
Conference on Machine Learning, Atlanta, GA, 2013.

Send feedback to [Volodymyr Kuleshov](http://web.stanford.edu/~kuleshov/).

Contents
--------

`GRQI.m`: An implementation of Algorithm 2 in the ICML paper. Function `GRQI`
computes K sparse principal components using generalized Rayleigh quotient
iteration.

`GPower.m`: An implementation of Algorithms 3 and 4 in the ICML paper.
Function `GPower` computes K sparse principal components using the generalized
power method of Journee et al.

`SSVD.m`: An implementation of Algorithm 5 in the ICML paper. Function `SSVD`
computes a pair of sparse singular vectors.

`CompareGRQI.m`: Script that compares `GRQI` with `GPower` and generates a
series of plots.
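
Usage
-----

A minimal usage sketch (the random data and all parameter values below are
illustrative, not recommendations):

    % Build a symmetric input matrix from random data
    A = randn(200,100);
    X = A'*A;

    % GRQI(X, k, K, J, alpha, maxit, thr): 3 components with at most
    % 10 non-zero entries each
    [Q1, log1] = GRQI(X, 10, 3, 5, 1, 200, 1e-6);

    % GPower(X, rho, K, alpha, maxit, thr): 3 components with l1
    % penalty rho = 1.5
    [Q2, log2] = GPower(X, 1.5, 3, 1, 200, 1e-6);

See `CompareGRQI.m` for a complete example that also plots convergence
statistics.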
--------------------------------------------------------------------------------
/CompareGRQI.m:
--------------------------------------------------------------------------------
%%% This script compares GRQI vs. GPower on random matrices.

randn('seed',1)
n = 500;

A = randn(n,n);
A = A'*A;

[~, log0] = GRQI(A,44,1,Inf,0,150,1e-6);
[~, log1] = GPower(A,1.7,1,0,150,1e-6);

nm_errors = log0{1}.errors;
nm_variances = log0{1}.variances;
nm_sparsities = log0{1}.sparsities;

power_errors = log1{1}.errors;
power_variances = log1{1}.variances;
power_sparsities = log1{1}.sparsities;

subplot(3,2,1);
plot(nm_errors);
t = title('Convergence rate (GRQI)', 'FontWeight','bold');
set(t, 'FontSize', 11);
xlabel('Iterations')
ylabel('|| x - x_{prev} ||')
xlim([1,6])
ylim([0,0.8])

subplot(3,2,2);
plot(power_errors);
t = title('Convergence rate (GPower)', 'FontWeight','bold');
set(t, 'FontSize', 11);
xlabel('Iterations')
ylabel('|| x - x_{prev} ||')
xlim([0 150]);

subplot(3,2,3);
plot(nm_variances);
t = title('Variance (GRQI)', 'FontWeight','bold');
set(t, 'FontSize', 11);
xlabel('Iterations')
ylabel('Variance')
xlim([1,6])

subplot(3,2,4);
plot(power_variances);
t = title('Variance (GPower)', 'FontWeight','bold');
set(t, 'FontSize', 11);
xlabel('Iterations')
ylabel('Variance')
xlim([0 150]);

subplot(3,2,5);
plot(nm_sparsities);
t = title('Sparsity (GRQI)', 'FontWeight','bold');
set(t, 'FontSize', 11);
xlabel('Iterations')
ylabel('Number of non-zero entries')
xlim([1,6])

subplot(3,2,6);
plot(power_sparsities);
t = title('Sparsity (GPower)', 'FontWeight','bold');
set(t, 'FontSize', 11);
xlabel('Iterations')
ylabel('Number of non-zero entries')
xlim([0 150]);
ylim([35, 60]);
--------------------------------------------------------------------------------
/GPower.m:
--------------------------------------------------------------------------------
function [Q, full_log] = GPower(X,rho,K,alpha,maxit,thr)

% GPower Computes sparse principal components
% [Q, log] = GPower(X, rho, K, alpha, maxit, thr) computes K sparse
% principal components of the symmetric matrix X.
%
% INPUTS:
% X: Data matrix
% rho: Regularization parameter
% K: Number of sparse principal components to compute
% alpha: Deflation parameter
% maxit: Maximum number of iterations to take
% thr: Accuracy threshold
%
% OUTPUTS:
% Q: Matrix of sparse principal components
% log: Runtime stats
%
% Principal components are computed using a technique called the
% generalized power method. For more information on the method, see
%
% M. Journee, Y. Nesterov, P. Richtarik, R. Sepulchre, Generalized power
% method for sparse principal component analysis, arXiv:0811.4724v1, 2008.
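%
% Example (a hypothetical call; the data and parameter values are
% illustrative assumptions, not recommendations):
%
%   A = randn(200,100);                    % data matrix
%   X = A'*A;                              % covariance-type matrix
%   [Q, log] = GPower(X, 1.5, 3, 1, 200, 1e-6);
%   nnz(Q(:,1))                            % number of non-zero loadings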

n = size(X,1);
Q = zeros(n,K);
full_log = cell(K,1);

% This is necessary for comparison to GRQI. GPower assumes that the
% symmetric matrix X is factored as X = A'*A. In typical scenarios, X is
% going to be the covariance matrix and A will be the data matrix.
A = chol(X);
assert(all(all(abs(A'*A - X) < 1e-4)));

for i=1:K
    % Log statistics for this run:
    log = struct('errors', [], 'variances', [], 'sparsities', []);

    % First, initialize x_0 with the normalized column of X that has the
    % largest norm
    column_norms = sqrt(sum(X.^2,1));
    [~, col_i] = max(column_norms);
    x = X(:,col_i)/norm(X(:,col_i));

    err = 1; iter = 0;
    disp(i);

    while err > thr && iter < maxit
        oldx = x;

        % Gradient step for the l1-penalized objective (Algorithm 3):
        % soft-threshold the correlations A'*x, map back through A, and
        % renormalize.
        Ax = A'*x;
        tresh = sign(Ax).*max(abs(Ax)-rho,0);
        x = A*tresh;
        x = x/norm(x);

        variance = x'*X*x;

        % The l0-penalized variant (Algorithm 4) uses a hard threshold
        % instead:
        % grad = A*(((Ax).^2-rho>0).*Ax);
        % x = grad/norm(grad);

        err = norm(x-oldx);

        % Save run statistics
        log.errors = [log.errors err];
        log.variances = [log.variances variance];
        log.sparsities = [log.sparsities nnz(tresh)];

        iter = iter + 1;

        fprintf('%d \t %d: %f \t %f \t %d\n', i, iter, err, variance, ...
            nnz(tresh));
    end

    % Recover the sparse loading vector from the final iterate
    Ax = A'*x;
    z = sign(Ax).*max(abs(Ax)-rho,0);
    if any(z)
        z = z/norm(z);
    end
    x = z;

    % Save run statistics
    log.errors = [log.errors err];
    log.variances = [log.variances variance];
    log.sparsities = [log.sparsities nnz(tresh)];
    full_log{i} = log;

    fprintf('RESULTS:\n\tPrecision: %f\n\tVariance: %f\n\tSparsity: %d\n',...
        err, variance, nnz(x));

    Q(:,i) = x;

    % Perform partial deflation
    X = X - alpha*variance*x*x';
end
end
--------------------------------------------------------------------------------
/SSVD.m:
--------------------------------------------------------------------------------
function [u, v, d, iter] = SSVD(X,k_u,k_v,J,thr,maxit)

% SSVD Computes sparse singular vectors
% [u, v, d, iter] = SSVD(X, k_u, k_v, J, thr, maxit) computes a pair of
% sparse singular vectors of the rectangular matrix X, having at most k_u
% and k_v non-zero indices, respectively.
%
% INPUTS:
% X: Data matrix
% k_u: Maximum number of non-zero indices in u
% k_v: Maximum number of non-zero indices in v
% J: Number of power method steps to be taken
% maxit: Maximum number of iterations to take
% thr: Accuracy threshold
%
% OUTPUTS:
% u: Left singular vector
% v: Right singular vector
% d: Variance explained
% iter: Number of iterations taken
%
% Singular vectors are computed using a technique called generalized
% Rayleigh quotient iteration. At every iteration, the non-zero indices
% are updated using Rayleigh quotient iteration. For the first J
% iterations, every index is also updated using a step of the power
% method. Afterwards, the iterate is projected on the set defined by
% the sparsity constraints.
%
% In order to handle rectangular matrices, generalized Rayleigh quotient
% iteration is applied to the symmetric matrix Y = [0 X'; X 0]. However,
% Y is never explicitly formed. Instead, we perform inversions on
% submatrices of Y using the matrix inversion lemma.
%
% For more information on the method see the paper
%
% V. Kuleshov, Fast algorithms for sparse principal component analysis
% based on Rayleigh quotient iteration. Proceedings of the 30th
% International Conference on Machine Learning, Atlanta, GA, 2013.
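%
% Example (a hypothetical call; the data and parameter values are
% illustrative assumptions, not recommendations):
%
%   X = randn(150,100);                    % rectangular data matrix
%   [u, v, d, iter] = SSVD(X, 15, 10, 5, 1e-6, 200);
%   [nnz(u), nnz(v)]                       % at most [15, 10]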

% First, initialize u_0, v_0, mu

column_norms = sqrt(sum(X.^2,1));
[~, idx] = max(column_norms);
u = X(:,idx)/norm(X(:,idx));
u = l0_project(u,k_u);

row_norms = sqrt(sum(X.^2,2));
[~, idx] = max(row_norms);
v = (X(idx,:)/norm(X(idx,:)))';
v = l0_project(v,k_v);

mu = u'*X*v/(norm(u)*norm(v));

err = 1; iter = 0;

while err > thr && iter < maxit

    oldu = u;
    oldv = v;

    % Compute working sets

    Wo_u = find(u ~= 0);
    Wo_v = find(v ~= 0);

    % Perform a step of Rayleigh quotient iteration on the working set

    A = X(Wo_u,Wo_v);
    [m, ~] = size(A);
    u_Wo = u(Wo_u);
    v_Wo = v(Wo_v);

    % We now invert [0 A'; A 0] - mu*I using the matrix inversion
    % lemma:
    %
    % B = [-mu*eye(n) A'; A -mu*eye(m)];
    % Binv = [(1/mu^2)*A'*inv(S)*A - (1/mu)*eye(n)  (1/mu)*A'*inv(S);
    %         (1/mu)*inv(S)*A                       inv(S)          ]
    %
    % where S is the Schur complement:
    S = (A*A')/mu - mu*eye(m);

    % (1,1) block
    Av = A*v_Wo;
    SiAv = S \ Av;
    AtSiAv = A'*SiAv;
    v_part1 = AtSiAv / (mu^2) - v_Wo/mu;

    % (1,2) block
    Siu = S \ u_Wo;
    AtSiu = A'*Siu;
    v_part2 = AtSiu / mu;
    v(Wo_v) = v_part1 + v_part2;

    % (2,1) block
    u_part1 = SiAv / mu;

    % (2,2) block
    u_part2 = Siu;

    u(Wo_u) = u_part1 + u_part2;
    mu = u'*X*v/(norm(u)*norm(v));

    u = u / norm(u);
    v = v / norm(v);

    % Perform a step of the power method on all indices
    if iter < J
        u = X*v;
        v = X'*u;
    end

    % Project on the intersection of the l0 and l2 balls

    u = l0_project(u,k_u);
    v = l0_project(v,k_v);

    [~, n] = size(X);
    x = [v; u];
    x = x/norm(x);
    v = x(1:n);
    u = x(n+1:end);

    erru = norm(oldu-u,2);
    errv = norm(oldv-v,2);
    err = erru + errv;

    variance = u'*X*v/(norm(v)*norm(u));

    fprintf('%d: %f \t%f\n',iter,err,variance);

    iter = iter + 1;
end

d = variance;

fprintf('RESULTS:\n\tVariance: %f\n\tSparsity: %d, %d\n',...
    variance, nnz(u), nnz(v));

end

function x = l0_project(x, k)
    [~, idx] = sort(abs(x),'descend');
    idx_to_zero = idx(k+1:end);
    x(idx_to_zero) = 0;
    x = x / norm(x);
end
--------------------------------------------------------------------------------
/GRQI.m:
--------------------------------------------------------------------------------
function [Q, full_log] = GRQI(X,k,K,J,alpha,maxit,thr)

% GRQI Computes sparse principal components
% [Q, log] = GRQI(X, k, K, J, alpha, maxit, thr) computes K principal
% components of the symmetric matrix X, each having at most k non-zero
% indices.
%
% INPUTS:
% X: Data matrix
% k: Maximum number of non-zero indices
% K: Number of sparse principal components to compute
% J: Number of power method steps to be taken
% alpha: Deflation parameter
% maxit: Maximum number of iterations to take
% thr: Accuracy threshold
%
% OUTPUTS:
% Q: Matrix of sparse principal components
% log: Runtime stats
%
% Principal components are computed using a technique called generalized
% Rayleigh quotient iteration. At every iteration, the non-zero indices
% are updated using Rayleigh quotient iteration. For the first J
% iterations, every index is also updated using a step of the power
% method. Afterwards, the iterate is projected on the set defined by
% ||x||_0 <= k and ||x||_2 <= 1.
%
% After computing each principal component, the algorithm deflates the
% matrix X by removing a fraction alpha of the variance explained by the
% last component.
%
% For more information on the method see the paper
%
% V. Kuleshov, Fast algorithms for sparse principal component analysis
% based on Rayleigh quotient iteration. Proceedings of the 30th
% International Conference on Machine Learning, Atlanta, GA, 2013.
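%
% Example (a hypothetical call; the data and parameter values are
% illustrative assumptions, not recommendations):
%
%   A = randn(200,100);
%   X = A'*A;                              % symmetric input matrix
%   [Q, log] = GRQI(X, 10, 3, 5, 1, 200, 1e-6);
%   nnz(Q(:,1))                            % at most 10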

n = size(X,1);
Q = zeros(n,K);
full_log = cell(K,1);

for i=1:K
    % Log statistics for this run:
    log = struct('errors', [], 'variances', [], 'sparsities', []);

    % First, initialize x_0

    % We recommend taking the column of the input matrix with the largest
    % norm:
    column_norms = sqrt(sum(X.^2,1));
    [~, col_i] = max(column_norms);
    x = X(:,col_i);
    x = l0_project(x,k);
    mu = x'*X*x/(x'*x);

    % Another option is to initialize randomly, and use a mu that is close
    % to the largest eigenvalue. The value of mu can be guessed or
    % computed.
    % x = randn(n,1); x = x/norm(x);
    % mu = 1900;

    err = 1; iter = 0;
    disp(i);

    while err > thr && iter < maxit
        oldx = x;

        % Compute working set
        Wo = find(x~=0);

        % Perform a Rayleigh quotient iteration update across the working
        % set.
        A = X(Wo,Wo);

        % The code below is a more numerically stable way of performing
        % the update
        %     x(Wo) = (A-mu*eye(size(A))) \ x(Wo);
        % It performs a step of Newton's method on the KKT conditions of
        % the problem max x'*A*x s.t. 0.5*x'*x == 1. One can check using
        % the matrix inversion lemma that the two versions ultimately
        % produce the same update to x(Wo).

        G = -(A-mu*eye(size(A)));
        DF = [G, x(Wo); x(Wo)', 0];
        F = [G*x(Wo); 0.5*(x(Wo)'*x(Wo) - 1)];
        delta = DF \ -F;
        x(Wo) = x(Wo) + delta(1:end-1);

        % Update mu and renormalize
        mu = x(Wo)'*X(Wo,Wo)*x(Wo)/(x(Wo)'*x(Wo));
        x = x/norm(x);

        % Perform a power method step across all indices
        if (iter < J)
            x = X*x;
        end

        % Project on the intersection of the l2 and l0 balls
        x = l0_project(x,k);

        err = norm(oldx - x,2);
        variance = x'*X*x/(x'*x);

        iter = iter + 1;

        % Save run statistics
        log.errors = [log.errors err];
        log.variances = [log.variances variance];
        log.sparsities = [log.sparsities nnz(x)];

        % Print current state
        fprintf('%d \t %d: %f \t %f \t %d\n', i, iter, err, ...
            variance, nnz(x));
    end

    log.errors = [log.errors err];
    log.variances = [log.variances variance];
    log.sparsities = [log.sparsities nnz(x)];
    full_log{i} = log;

    fprintf('RESULTS:\n\tPrecision: %f\n\tVariance: %f\n\tSparsity: %d\n',...
        err, variance, nnz(x));

    Q(:,i) = x;

    % Perform partial deflation
    X = X - alpha*variance*x*x';

end
end

function x = l0_project(x, k)
    [~, idx] = sort(abs(x),'descend');
    idx_to_zero = idx(k+1:end);
    x(idx_to_zero) = 0;
    x = x / norm(x);
end
--------------------------------------------------------------------------------