├── +utils ├── zerodiag.m ├── putdiag.m ├── spdmat.m ├── pearsonIIIpval.m ├── distkern.m ├── sqdist.m ├── kernel.m ├── fwht.m ├── rotation_angle_axis.m ├── whiten.m ├── ucenter.m ├── spatialMedian.m ├── dcenter.m ├── jbld.m ├── DyadUpdate.c ├── nystrom.m ├── sigest.m ├── rbf.m ├── mexDyadUpdate.c ├── poldecomp.m ├── tri2sqind.m ├── permMoments.m ├── approxmtimes.m ├── mexHadamard.c └── rfm.m ├── +sphere ├── vpq.m ├── spatialSign.m ├── ajne.m ├── psivec.m ├── gine.m ├── rpcdf.m ├── gine3.m ├── gineajne.m ├── rp.m ├── rayleigh.m ├── bingham.m ├── rppdf.m ├── rptest.m ├── sumchi2cdf.m ├── signtest.m ├── vmfrnd.m └── jsn.m ├── .gitignore ├── +diff ├── mmd_.m ├── mmd.m ├── hotell2.m ├── mmdtest.m ├── covtest.m ├── kstest2d.m └── minentest.m ├── +dim ├── krztest.m ├── krzsim.m └── cpca.m ├── Testing ├── test_uniSphereTestPower_plot.m ├── test_sphericity.m ├── test_uniSphereTestPower2.m ├── test_dcorr.m ├── test_sphericity3.m ├── Test_dcov_dcorr.m ├── test_uniSphereTestPower.m ├── test_rank.m ├── test_PAIRS.m ├── test_uniSphereTestNull.m ├── test_covtest.m ├── test_sphericity4.m ├── test_sphericity2.m └── Test_rv.m ├── +dep ├── rdc.m ├── rank.m ├── dcorr.m ├── rvtest.m ├── rv.m ├── fdcov.m ├── ranktest.m ├── rpdcov.m ├── dcov.m ├── hsic.m ├── dcorrtest.m └── dcovtest.m ├── setup_highdim.m ├── README.md ├── DepTest1.m └── UniSphereTest.m /+utils/zerodiag.m: -------------------------------------------------------------------------------- 1 | function M = zerodiag(M) 2 | 3 | M = utils.putdiag(M,0); 4 | -------------------------------------------------------------------------------- /+sphere/vpq.m: -------------------------------------------------------------------------------- 1 | function v = vpq(p,q) 2 | 3 | v = nchoosek(p+q-2,p-1) + nchoosek(p+q-2,p-1); 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.mat 3 | temp/* 4 | *.mexmaci64 5 | *.mexa64 6 | *.mexw64 7 | *.mexw32 8 | 9 | -------------------------------------------------------------------------------- /+utils/putdiag.m: -------------------------------------------------------------------------------- 1 | function M = putdiag(M,x) 2 | 3 | [m,n] = size(M); 4 | 5 | assert((numel(x)==1)||(numel(x)==min(m,n)),'Wrong # of elements for diagonal'); 6 | 7 | M(1:(m+1):min(m*m,m*n)) = x; 8 | -------------------------------------------------------------------------------- /+utils/spdmat.m: -------------------------------------------------------------------------------- 1 | % Generate a dense n x n symmetric, positive definite matrix 2 | function A = spdmat(n) 3 | 4 | A = rand(n,n); 5 | A = A+A'; 6 | % since A(i,j) < 1 by construction and a symmetric diagonally dominant matrix 7 | % is symmetric positive definite, which can be ensured by adding nI 8 | A = A + n*eye(n); 9 | -------------------------------------------------------------------------------- /+diff/mmd_.m: -------------------------------------------------------------------------------- 1 | % TODO 2 | % o Looks like the definition of MMD in Gretton's publicly available code 3 | % differs slightly from their paper (re. 
the diagonal terms) 4 | 5 | function stat = mmd_(K,L,KL,m,n,biased) 6 | 7 | if biased 8 | stat = (sum(K(:))+m)/m^2 + (sum(L(:))+n)/n^2 - 2*sum(KL(:))/m/n; 9 | else 10 | stat = sum(K(:))/m/(m-1) + sum(L(:))/n/(n-1) - 2*sum(KL(:))/m/n; 11 | end -------------------------------------------------------------------------------- /+dim/krztest.m: -------------------------------------------------------------------------------- 1 | function [pval,stat,delta] = krztest(x,y,s) 2 | 3 | nboot = 500; 4 | 5 | [m,p] = size(x); 6 | [n,q] = size(x); 7 | 8 | [k,stat,delta,R] = dim.krzsim(x,y,s); 9 | yR = y*R'; 10 | 11 | for i = 1:nboot 12 | ind = unidrnd(m,m,1); 13 | xb = x(ind,:); 14 | ind = unidrnd(n,n,1); 15 | yb = yR(ind,:); 16 | [~,Tm(i)] = dim.krzsim(xb,yb,s); 17 | end 18 | %hist(Tm); 19 | pval = sum(Tm<=stat)/nboot; 20 | 21 | -------------------------------------------------------------------------------- /+utils/pearsonIIIpval.m: -------------------------------------------------------------------------------- 1 | % Calculate p-value for statistic of the form trace(A*B) using Pearson 2 | % Type III approximation using exact first three moments of the 3 | % permutation distribution 4 | function [pval,stat] = pearsonIIIpval(A,B,stat) 5 | 6 | if nargin < 3 7 | stat = sum(sum(A.*B)); 8 | end 9 | 10 | % Exact moments of permutation distribution 11 | [mu,sigma2,skew] = utils.permMoments(A,B); 12 | 13 | stat = (stat - mu)/sqrt(sigma2); 14 | 15 | if skew >= 0 16 | pval = gamcdf(stat - (-2/skew),4/skew^2,skew/2,'upper'); 17 | else 18 | as = abs(skew); 19 | pval = gamcdf(skew/as*stat + 2/as,4/skew^2,as/2); 20 | end 21 | -------------------------------------------------------------------------------- /+utils/distkern.m: -------------------------------------------------------------------------------- 1 | % Sejdinovic et al, pg. 
2272, example 15 2 | % Brownian distance kernel 3 | function k = distkern(X,Y,varargin) 4 | 5 | par = inputParser; 6 | par.KeepUnmatched = true; 7 | addRequired(par,'X',@isnumeric); 8 | addRequired(par,'Y',@isnumeric); 9 | addParamValue(par,'index',1,@(x) isscalar(x) && (x>0) && (x<=2)); 10 | parse(par,X,Y,varargin{:}); 11 | 12 | Yt = Y'; 13 | XX = sqrt(sum(X.*X,2)); 14 | YY = sqrt(sum(Yt.*Yt)); 15 | D = sqrt(utils.sqdist(X,Y)); 16 | 17 | if par.Results.index ~= 1 18 | XX = XX.^par.Results.index; 19 | YY = YY.^par.Results.index; 20 | D = D.^par.Results.index; 21 | end 22 | 23 | k = 0.5 * (bsxfun(@plus,XX,YY) - D); 24 | -------------------------------------------------------------------------------- /+dim/krzsim.m: -------------------------------------------------------------------------------- 1 | % Krzanowski similarity 2 | function [k,Tm,delta,R] = krzsim(x,y,m) 3 | 4 | S1 = cov(x); 5 | S2 = cov(y); 6 | 7 | [Q1,D1] = eig(S1); 8 | [Q2,D2] = eig(S2); 9 | 10 | Q11 = Q1(:,1:m); 11 | Q12 = Q1(:,(m+1):end); 12 | Q21 = Q2(:,1:m); 13 | Q22 = Q2(:,(m+1):end); 14 | 15 | [k,delta,u,v] = princvec(Q11,Q21); 16 | [~,~,u2,v2] = princvec(Q12,Q22); 17 | 18 | R = [u u2]*[v';v2']; 19 | Tm = m - k; 20 | %Tm = trace(Q12'*Q21*Q21'*Q12); 21 | 22 | function [k,delta,u,v] = princvec(L,M) 23 | N = L'*M*M'*L; 24 | [V,D] = eig(N); 25 | lambda = diag(D); 26 | % Krzanowski similarity 27 | %k = trace(N) 28 | k = sum(lambda); 29 | sl = lambda.^0.5; 30 | delta = real(rad2deg(acos(sl))); 31 | 32 | u = L*V; 33 | v = M*M'*u; 34 | -------------------------------------------------------------------------------- /+utils/sqdist.m: -------------------------------------------------------------------------------- 1 | % Squared euclidean distance matrix 2 | % Faster than pdist2(x,x) & squareform(pdist(x)) 3 | % 4 | % x = randn(5000,1000); 5 | % y = randn(200,1000); 6 | % tic; sqrt(utils.sqdist(x)); toc 7 | % tic; pdist2(x,x); toc 8 | % norm(utils.sqdist(x) - pdist2(x,x).^2,'fro') 9 | % norm(utils.sqdist(x,y) - pdist2(x,y).^2,'fro') 10 | function D = sqdist(X,Y) 11 | 12 | if (nargin == 1) || isempty(Y) 13 | XX = sum(X.*X,2); 14 | D = bsxfun(@plus,XX,XX') - 2*(X*X'); 15 | else 16 | [m,p] = size(X); 17 | [n,q] = size(Y); 18 | assert(p==q,'Input dimensions must match'); 19 | 20 | Yt = Y'; 21 | XX = sum(X.*X,2); 22 | YY = sum(Yt.*Yt,1); 23 | D = bsxfun(@plus,XX,YY) - 2*(X*Yt); 24 | end 25 | 26 | %D(D<0) = 0; 27 | -------------------------------------------------------------------------------- /+utils/kernel.m: -------------------------------------------------------------------------------- 1 | function [K,varargout] = kernel(X,Y,varargin) 2 | 3 | par = inputParser; 4 | par.KeepUnmatched = true; 5 | addRequired(par,'X',@isnumeric); 6 | addRequired(par,'Y',@isnumeric); 7 | addParamValue(par,'kernel','rbf',@ischar); 8 | parse(par,X,Y,varargin{:}); 9 | 10 | switch lower(par.Results.kernel) 11 | case {'linear'} 12 | if isempty(Y) 13 | K = X*X'; 14 | else 15 | K = X*Y'; 16 | end 17 | case {'poly'} 18 | % TODO 19 | case {'rbf' 'gaussian' 'gauss'} 20 | [K,sigma] = utils.rbf(X,Y,par.Unmatched); 21 | if nargout > 1 22 | varargout{1} = sigma; 23 | end 24 | case {'brownian' 'dist' 'distance'} 25 | if isempty(Y) 26 | K = utils.distkern(X,X); 27 | else 28 | K = utils.distkern(X,Y); 29 | end 30 | end -------------------------------------------------------------------------------- /Testing/test_uniSphereTestPower_plot.m: -------------------------------------------------------------------------------- 1 | 
load('/Users/brian/Dropbox/Temp/sphere/Testing/test_uniSphereTestPower_n80_1.mat'); 2 | 3 | prob_r1 = prob_r; 4 | prob_ga1 = prob_ga; 5 | prob_p1 = prob_p; 6 | 7 | load('/Users/brian/Dropbox/Temp/sphere/Testing/test_uniSphereTestPower_n80_2.mat'); 8 | 9 | prob_r2 = prob_r; 10 | prob_ga2 = prob_ga; 11 | prob_p2 = prob_p; 12 | 13 | prob_r = (prob_r1+prob_r2)/2; 14 | prob_ga = (prob_ga1+prob_ga2)/2; 15 | prob_p = (prob_p1+prob_p2)/2; 16 | 17 | 18 | figure; 19 | for i = 1:3 20 | subplot(3,1,i); hold on 21 | plot(kappa,prob_r(:,i),'-',kappa,prob_ga(:,i),'-',kappa,prob_p(:,i),'--'); 22 | % plot(kappa,prob_ga(:,i),'--'); 23 | % plot(kappa,prob_p(:,i),':'); 24 | title(sprintf('dimension = %g',p(i))); 25 | if i == 1 26 | legend({'Rayleigh','Gine-Ajne','PAIRS'}) 27 | end 28 | end 29 | 30 | ylabel('Empirical power') 31 | xlabel('Kappa'); -------------------------------------------------------------------------------- /+utils/fwht.m: -------------------------------------------------------------------------------- 1 | % FWHT Fast Discrete Walsh-Hadamard Transform 2 | % 3 | % Y = fwht(X) 4 | % 5 | % Wrapper for efficient mex version of FWHT (mexHadamard.c). 6 | % 7 | % INPUTS 8 | % X - input matrix or column vector 9 | % 10 | % OUTPUTS 11 | % Y - transformed data 12 | 13 | function Y = fwht(X) 14 | 15 | [n,m] = size(X); 16 | n2 = nextpow2(n); 17 | 18 | % Zero-pad to nextpow2 19 | if n ~= 2^n2 20 | X = [X ; zeros(2^n2-n,m)]; 21 | end 22 | 23 | try 24 | % Scaled to match Matlab fwht 25 | Y = utils.mexHadamard(X)/2^n2; 26 | catch err 27 | if strcmp(err.identifier,'MATLAB:UndefinedFunction') 28 | warning('fwht:mex',... 29 | sprintf(['Mex file ''mexHadamard.c'' has not be compiled\n'... 30 | 'Transform will be done with slow Matlab version.'])); 31 | Y = fwht(X,2^n2,'hadamard'); 32 | else 33 | rethrow(err); 34 | end 35 | end -------------------------------------------------------------------------------- /+sphere/spatialSign.m: -------------------------------------------------------------------------------- 1 | % SPATIALSIGN Project data onto unit hypersphere 2 | % 3 | % U = spatialSign(x) 4 | % 5 | % INPUTS 6 | % x - [n x p] matrix, p being data-dimensionality 7 | % 8 | % OUTPUTS 9 | % U - [n x p] matrix, each row normalized to unit length 10 | 11 | % $ Copyright (C) 2014 Brian Lau http://www.subcortex.net/ $ 12 | % The full license and most recent version of the code can be found at: 13 | % https://github.com/brian-lau/highdim 14 | % 15 | % This program is free software: you can redistribute it and/or modify 16 | % it under the terms of the GNU General Public License as published by 17 | % the Free Software Foundation, either version 3 of the License, or 18 | % (at your option) any later version. 19 | % 20 | % This program is distributed in the hope that it will be useful, 21 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 22 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 23 | % GNU General Public License for more details. 24 | 25 | function U = spatialSign(x) 26 | 27 | U = bsxfun(@rdivide,x,sqrt(sum(x.^2,2))); 28 | U(isnan(U)) = 0; -------------------------------------------------------------------------------- /+utils/rotation_angle_axis.m: -------------------------------------------------------------------------------- 1 | function R = rotation_angle_axis(theta,u) 2 | %ROTATION_ANGLE_AXIS The Rodrigues' formula for rotation matrices. 
3 | % R = ROTATION_ANGLE_AXIS(THETA,U) 4 | % 5 | % The formula recieves an angle of rotation given by theta and a unit vector, 6 | % u, that defines the axis of rotation. 7 | % 8 | % ARGUMENT DESCRIPTION: 9 | % THETA - angle of rotation (radians). 10 | % U - unit vector 11 | % 12 | % OUTPUT DESCRIPTION: 13 | % R - rotation matrix. 14 | % 15 | % Example 16 | % ------------- 17 | % R = rotation_angle_axis(deg2rad(pi/6),[sqrt(2)/2, 0.0, sqrt(2)/2]) 18 | % 19 | 20 | % Credits: 21 | % Daniel Simoes Lopes 22 | % IDMEC 23 | % Instituto Superior Tecnico - Universidade Tecnica de Lisboa 24 | % danlopes (at) dem ist utl pt 25 | % http://web.ist.utl.pt/daniel.s.lopes/ 26 | % 27 | % July 2011 original version. 28 | 29 | 30 | %__________________________________________________________________________ 31 | % Rodrigues' rotation formula. 32 | u = u./norm(u,2); 33 | S = [ 0 u(3) -u(2); 34 | -u(3) 0 u(1); 35 | u(2) -u(1) 0 ]; 36 | R = eye(3) + sin(theta)*S + (1-cos(theta))*S^2; -------------------------------------------------------------------------------- /Testing/test_sphericity.m: -------------------------------------------------------------------------------- 1 | % Zou et al (2014). Multivariate sign-based high-dimensional tests for 2 | % sphericity. Biometrika 101: 229-236 3 | 4 | % bias-corrected sign test 5 | % Check null distribution approximation 6 | n = 1000; 7 | p1 = zeros(n,1); 8 | s1 = zeros(n,1); 9 | p0 = zeros(n,1); 10 | s0 = zeros(n,1); 11 | for i = 1:n 12 | %x = randn(40,100); 13 | x = trnd(4,40,100); 14 | [p1(i),s1(i)] = sphere.signtest(x,'test','bcs','approx',false); 15 | [p0(i),s0(i)] = sphere.signtest(x,'test','bcs','approx',true); 16 | end 17 | 18 | figure; 19 | dx = 0.1; xx = -3:dx:3; 20 | n = histc(s0,xx); 21 | subplot(211);hold on 22 | bar(xx,n./sum(n),'histc'); 23 | plot(xx,normpdf(xx)*dx,'m'); 24 | title('normal approximation'); 25 | n = histc(s1,xx); 26 | subplot(212);hold on 27 | bar(xx,n./sum(n),'histc'); 28 | plot(xx,normpdf(xx)*dx,'m'); 29 | title('exact'); 30 | 31 | % Standard sign test 32 | % Check null distribution approximation 33 | n = 1000; 34 | p = zeros(n,1); 35 | s = zeros(n,1); 36 | for i = 1:n 37 | x = randn(10,3); 38 | [p(i),s(i)] = sphere.signtest(x,'test','sign'); 39 | end 40 | 41 | figure; 42 | dx = 1; xx = 0:1:25; 43 | n = histc(s,xx); 44 | hold on 45 | bar(xx,n./sum(n),'histc'); 46 | plot(xx,chi2pdf(xx,(3+2)*(3-1)/2)*dx,'m') 47 | -------------------------------------------------------------------------------- /Testing/test_uniSphereTestPower2.m: -------------------------------------------------------------------------------- 1 | % pairsClusterTest from here: https://sites.google.com/site/antimatt/software 2 | % randvonMisesFisherm from here: http://www.stat.pitt.edu/sungkyu/MiscPage.html 3 | clear all; 4 | n = 60; 5 | p = [4 8 16];%[4 10 20]; 6 | sigma = [1 10 20 40];%[0 1 2 4]; 7 | reps = 50;%2500; 8 | 9 | prob_ga = zeros(numel(sigma),numel(p)); 10 | prob_p = zeros(numel(sigma),numel(p)); 11 | 12 | test = UniSphereTest('autoRun',false); 13 | test.params.nboot = 500; 14 | for i = 1:numel(sigma) 15 | for j = 1:numel(p) 16 | for k = 1:reps 17 | 18 | x = zeros(n,p(j)); 19 | count = 0; 20 | for m = 1:p(j) 21 | S = eye(p(j)); 22 | if (rand < 0.25) && (count <=6) 23 | S(m,m) = sigma(i); 24 | count = count + 1; 25 | end 26 | x = x + mvnrnd(zeros(1,p(j)),S,n); 27 | end 28 | 29 | test.x = x; 30 | 31 | test.test = 'gine-ajne'; test.run(); 32 | h_ga(k) = test.h; 33 | 34 | [clusteriness, temp, dists, k2] = pairsClusterTest(x); 35 | pv(k) = temp; 36 | 37 | end 38 | 
prob_ga(i,j) = mean(h_ga); 39 | prob_p(i,j) = mean(pv<=0.05); 40 | end 41 | i 42 | end 43 | -------------------------------------------------------------------------------- /+sphere/ajne.m: -------------------------------------------------------------------------------- 1 | % AJNE Ajne statistic for spherical uniformity 2 | % 3 | % A = ajne(U) 4 | % 5 | % INPUTS 6 | % U - [n x p] matrix, n samples with dimensionality p 7 | % the data should already be projected to the unit hypersphere 8 | % 9 | % OUTPUTS 10 | % A - statistic 11 | % 12 | % REFERENCE 13 | % Mardia, KV, Jupp, PE (2000). Directional Statistics. John Wiley 14 | % 15 | % SEE ALSO 16 | % UniSphereTest, spatialSign 17 | 18 | % $ Copyright (C) 2014 Brian Lau http://www.subcortex.net/ $ 19 | % The full license and most recent version of the code can be found at: 20 | % https://github.com/brian-lau/highdim 21 | % 22 | % This program is free software: you can redistribute it and/or modify 23 | % it under the terms of the GNU General Public License as published by 24 | % the Free Software Foundation, either version 3 of the License, or 25 | % (at your option) any later version. 26 | % 27 | % This program is distributed in the hope that it will be useful, 28 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | % GNU General Public License for more details. 31 | 32 | function A = ajne(U) 33 | 34 | [n,p] = size(U); 35 | 36 | psi = sphere.psivec(U,n); 37 | % eq. 10.4.10 38 | A = (n/4) - (1/(n*pi))*sum(psi); 39 | 40 | -------------------------------------------------------------------------------- /+sphere/psivec.m: -------------------------------------------------------------------------------- 1 | % PSIVEC Vector pairwise angles, i < j 2 | % 3 | % psi = psivec(U,n) 4 | % 5 | % INPUTS 6 | % U - [n x p] matrix, n samples with dimensionality p 7 | % the data should already be projected to the unit hypersphere 8 | % n - number of samples 9 | % 10 | % OUTPUTS 11 | % psi - vector from psi matrix (U*U'), i < j 12 | % 13 | % REFERENCE 14 | % Mardia, KV, Jupp, PE (2000). Directional Statistics. John Wiley 15 | % 16 | % SEE ALSO 17 | % gine, gine3, ajne, gineajne 18 | 19 | % $ Copyright (C) 2014 Brian Lau http://www.subcortex.net/ $ 20 | % The full license and most recent version of the code can be found at: 21 | % https://github.com/brian-lau/highdim 22 | % 23 | % This program is free software: you can redistribute it and/or modify 24 | % it under the terms of the GNU General Public License as published by 25 | % the Free Software Foundation, either version 3 of the License, or 26 | % (at your option) any later version. 27 | % 28 | % This program is distributed in the hope that it will be useful, 29 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 30 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 31 | % GNU General Public License for more details. 
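% EXAMPLE (a minimal usage sketch, assuming the +sphere package is on the path)
%   U = sphere.spatialSign(randn(100,5)); % 100 samples projected onto the unit sphere in R^5
%   psi = sphere.psivec(U,size(U,1));     % n*(n-1)/2 pairwise angles, i < j, each in [0,pi]
%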
32 | 33 | function psi = psivec(U,n) 34 | 35 | xx = triu(U*U',1); 36 | ind = triu(ones(n,n),1); 37 | psi = acos(xx(ind==1)); 38 | -------------------------------------------------------------------------------- /+sphere/gine.m: -------------------------------------------------------------------------------- 1 | % GINE Gine statistic for spherical uniformity 2 | % 3 | % G = gine(U) 4 | % 5 | % INPUTS 6 | % U - [n x p] matrix, n samples with dimensionality p 7 | % the data should already be projected to the unit hypersphere 8 | % 9 | % OUTPUTS 10 | % G - statistic 11 | % 12 | % REFERENCE 13 | % Mardia, KV, Jupp, PE (2000). Directional Statistics. John Wiley 14 | % 15 | % SEE ALSO 16 | % UniSphereTest, spatialSign 17 | 18 | % $ Copyright (C) 2017 Brian Lau, brian.lau@upmc.fr $ 19 | % The full license and most recent version of the code can be found at: 20 | % https://github.com/brian-lau/highdim 21 | % 22 | % This program is free software: you can redistribute it and/or modify 23 | % it under the terms of the GNU General Public License as published by 24 | % the Free Software Foundation, either version 3 of the License, or 25 | % (at your option) any later version. 26 | % 27 | % This program is distributed in the hope that it will be useful, 28 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | % GNU General Public License for more details. 31 | 32 | function G = gine(U) 33 | 34 | [n,p] = size(U); 35 | 36 | psi = sphere.psivec(U,n); 37 | % eq. 10.7.5, avoiding overflow 38 | G = n/2 - (p-1)/(2*n) * ( exp(gammaln((p-1)/2) - gammaln(p/2)) )^2 * sum(sin(psi)); 39 | -------------------------------------------------------------------------------- /+utils/whiten.m: -------------------------------------------------------------------------------- 1 | %function [X,mu,invMat] = whiten(X,epsilon) 2 | % 3 | % ZCA whitening of a data matrix (make the covariance matrix identity) 4 | % 5 | % WARNING 6 | % This form of whitening performs poorly if the number of dimensions are 7 | % much greater than the number of instances 8 | % 9 | % INPUT 10 | % X: rows are the instances, columns are the features 11 | % epsilon: small number to compensate for nearly 0 eigenvalue [DEFAULT = 12 | % 0.0001] 13 | % 14 | % OUTPUT 15 | % Xwh: whitened data, rows are instances, columns are features 16 | % whMat: the whitening matrix 17 | 18 | % Copyright (c) 2012, Colorado Reed 19 | % All rights reserved. 20 | % 21 | % Redistribution and use in source and binary forms, with or without 22 | % modification, are permitted provided that the following conditions are 23 | % met: 24 | % 25 | % * Redistributions of source code must retain the above copyright 26 | % notice, this list of conditions and the following disclaimer. 
27 | % * Redistributions in binary form must reproduce the above copyright 28 | % notice, this list of conditions and the following disclaimer in 29 | % the documentation and/or other materials provided with the distribution 30 | 31 | function [X,whMat] = whiten(X,epsilon) 32 | 33 | if nargin < 2 34 | epsilon = 0.0001; 35 | end 36 | 37 | mu = mean(X); 38 | X = bsxfun(@minus, X, mu); 39 | A = X'*X; 40 | [V,D,~] = svd(A); 41 | whMat = sqrt(size(X,1)-1)*V*sqrtm(inv(D + eye(size(D))*epsilon))*V'; 42 | X = X*whMat; 43 | -------------------------------------------------------------------------------- /+sphere/rpcdf.m: -------------------------------------------------------------------------------- 1 | % RPCDF CDF of angles on a uniform hypersphere 2 | % 3 | % c = rpcdf(theta,p,dx) 4 | % 5 | % INPUTS 6 | % theta - angles (radians) to evaluate pdf 7 | % p - dimensionality (R^p) 8 | % 9 | % OPTIONAL 10 | % dx - resolution (default = 0.001); 11 | % 12 | % OUTPUTS 13 | % h - cdf 14 | % 15 | % REFERENCE 16 | % Cai, T et al (2013). Distribution of angles in random packing on 17 | % spheres. J of Machine Learning Research 14: 1837-1864. 18 | % 19 | % SEE ALSO 20 | % rppdf, rp 21 | 22 | % $ Copyright (C) 2014 Brian Lau http://www.subcortex.net/ $ 23 | % The full license and most recent version of the code can be found at: 24 | % https://github.com/brian-lau/highdim 25 | % 26 | % This program is free software: you can redistribute it and/or modify 27 | % it under the terms of the GNU General Public License as published by 28 | % the Free Software Foundation, either version 3 of the License, or 29 | % (at your option) any later version. 30 | % 31 | % This program is distributed in the hope that it will be useful, 32 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 33 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 34 | % GNU General Public License for more details. 35 | 36 | function c = rpcdf(theta,p,dx) 37 | 38 | if nargin < 3 39 | dx = 0.001; 40 | end 41 | 42 | x = 0:dx:pi; 43 | h = sphere.rppdf(x,p); 44 | 45 | c = cumtrapz(x,h); 46 | c = interp1(x,c,theta); -------------------------------------------------------------------------------- /+sphere/gine3.m: -------------------------------------------------------------------------------- 1 | % GINE3 Gine test for spherical uniformity (p=3) 2 | % 3 | % [pval,Fn] = gine3(U) 4 | % 5 | % INPUTS 6 | % U - [n x 3] matrix, n samples with dimensionality 3 7 | % the data should already be projected to the unit hypersphere 8 | % 9 | % OUTPUTS 10 | % pval - p-value 11 | % Fn - statistic 12 | % 13 | % REFERENCE 14 | % Mardia, KV, Jupp, PE (2000). Directional Statistics. John Wiley 15 | % 16 | % SEE ALSO 17 | % UniSphereTest, spatialSign 18 | 19 | % $ Copyright (C) 2014 Brian Lau http://www.subcortex.net/ $ 20 | % The full license and most recent version of the code can be found at: 21 | % https://github.com/brian-lau/highdim 22 | % 23 | % This program is free software: you can redistribute it and/or modify 24 | % it under the terms of the GNU General Public License as published by 25 | % the Free Software Foundation, either version 3 of the License, or 26 | % (at your option) any later version. 27 | % 28 | % This program is distributed in the hope that it will be useful, 29 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 30 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 31 | % GNU General Public License for more details. 
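% EXAMPLE (a minimal usage sketch, assuming the +sphere package is on the path)
%   U = sphere.spatialSign(randn(200,3)); % 200 points on the unit sphere in R^3
%   [pval,Fn] = sphere.gine3(U);          % pval should be roughly uniform under uniformity
%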
32 | 33 | function [pval,Fn] = gine3(U) 34 | 35 | [n,p] = size(U); 36 | 37 | if p ~= 3 38 | error('Only valid for p = 3'); 39 | end 40 | 41 | psi = sphere.psivec(U,n); 42 | % eq. 10.4.8 43 | Fn = (3*n)/2 - (4/(n*pi)) * sum(psi + sin(psi)); 44 | 45 | pval = 1 - sphere.sumchi2cdf(Fn,3); 46 | -------------------------------------------------------------------------------- /+utils/ucenter.m: -------------------------------------------------------------------------------- 1 | % UCENTER U-center distance matrix 2 | % 3 | % [X,X_j,X__] = ucenter(X) 4 | % 5 | % U-center distance matrix 6 | % 7 | % X_{ij} - X_{i.}/(n-2) - X_{.j}/(n-2) + X_{..}/((n-1)(n-2)), i \neq j 8 | % 9 | % and zero diagonal 10 | % 11 | % INPUTS 12 | % X - [n x n] symmetric distance matrix 13 | % 14 | % OUTPUTS 15 | % X - centered distance matrix 16 | % X_j - column means of X (input) 17 | % X__ - mean of X (input) 18 | % 19 | % SEE ALSO 20 | % pcenter 21 | 22 | % $ Copyright (C) 2017 Brian Lau, brian.lau@upmc.fr $ 23 | % The full license and most recent version of the code can be found at: 24 | % https://github.com/brian-lau/highdim 25 | % 26 | % This program is free software: you can redistribute it and/or modify 27 | % it under the terms of the GNU General Public License as published by 28 | % the Free Software Foundation, either version 3 of the License, or 29 | % (at your option) any later version. 30 | % 31 | % This program is distributed in the hope that it will be useful, 32 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 33 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 34 | % GNU General Public License for more details. 35 | 36 | function [X,X_j,X__] = ucenter(X) 37 | 38 | [n,m] = size(X); 39 | assert(m==n,'UCENTER operates on square, symmetric distance matrices'); 40 | 41 | X_j = sum(X); 42 | X__ = sum(X_j); % sum(X(:)) 43 | X = X - bsxfun(@plus,X_j,X_j')/(n-2) + X__/((n-1)*(n-2)); 44 | X(1:(n+1):n*n) = 0; 45 | -------------------------------------------------------------------------------- /+utils/spatialMedian.m: -------------------------------------------------------------------------------- 1 | function y = spatialMedian(X,tol,y,max_iter) 2 | % Calculate the geometric median for a set of observations (mean under a 3 | % Laplacian noise distribution) This is using Weiszfeld's algorithm. 4 | % 5 | % In: 6 | % X : the data, as in mean 7 | % tol : tolerance (default: 1.e-5) 8 | % y : initial value (default: median(X)) 9 | % max_iter : max number of iterations (default: 500) 10 | % 11 | % Out: 12 | % g : geometric median over X 13 | 14 | % https://github.com/sccn/BCILAB/code/misc/geometric_median.m 15 | % Copyright (C) Christian Kothe, SCCN, 2012, christian@sccn.ucsd.edu 16 | % 17 | % This program is free software; you can redistribute it and/or modify it 18 | % under the terms of the GNU General Public License as published by the 19 | % Free Software Foundation; either version 2 of the License, or (at your 20 | % option) any later version. 21 | % 22 | % This program is distributed in the hope that it will be useful, but 23 | % WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 | % or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 | % for more details. 
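% EXAMPLE (a minimal usage sketch; the contaminated data are arbitrary)
%   X = [randn(100,3); 10 + randn(5,3)];  % a few gross outliers
%   m = utils.spatialMedian(X);           % far less affected by the outliers than mean(X)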
26 | 27 | if ~exist('tol','var') || isempty(tol) 28 | tol = 1.e-5; end 29 | if ~exist('y','var') || isempty(y) 30 | y = median(X); end 31 | if ~exist('max_iter','var') || isempty(max_iter) 32 | max_iter = 500; end 33 | 34 | for i=1:max_iter 35 | invnorms = 1./sqrt(sum(bsxfun(@minus,X,y).^2,2)); 36 | [y,oldy] = deal(sum(bsxfun(@times,X,invnorms)) / sum(invnorms),y); 37 | if norm(y-oldy)/norm(y) < tol 38 | break; end 39 | end -------------------------------------------------------------------------------- /+utils/dcenter.m: -------------------------------------------------------------------------------- 1 | % DCENTER Double-center distance matrix 2 | % 3 | % [X,X_j,X__] = dcenter(X) 4 | % 5 | % Double-centers distance matrix X: 6 | % 7 | % X_{ij} - X_{i.}/n - X_{.j}/n + X_{..}/n^2, all i, j 8 | % 9 | % Faster & more memory-efficient than using a centering matrix 10 | % H = eye(n) - ones(n)/n; X = H*X*H; 11 | % 12 | % INPUTS 13 | % X - [n x n] symmetric distance matrix 14 | % 15 | % OUTPUTS 16 | % X - centered distance matrix 17 | % X_j - column means of X (input) 18 | % X__ - mean of X (input) 19 | % 20 | % SEE ALSO 21 | % ucenter 22 | 23 | % $ Copyright (C) 2017 Brian Lau, brian.lau@upmc.fr $ 24 | % The full license and most recent version of the code can be found at: 25 | % https://github.com/brian-lau/highdim 26 | % 27 | % This program is free software: you can redistribute it and/or modify 28 | % it under the terms of the GNU General Public License as published by 29 | % the Free Software Foundation, either version 3 of the License, or 30 | % (at your option) any later version. 31 | % 32 | % This program is distributed in the hope that it will be useful, 33 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 34 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 35 | % GNU General Public License for more details. 36 | 37 | function [X,X_j,X__] = dcenter(X) 38 | 39 | [n,m] = size(X); 40 | assert(m==n,'DCENTER operates on square, symmetric distance matrices'); 41 | 42 | X_j = mean(X); 43 | X__ = mean(X_j); % mean(X(:)) 44 | X = X - bsxfun(@plus,X_j,X_j') + X__; -------------------------------------------------------------------------------- /Testing/test_dcorr.m: -------------------------------------------------------------------------------- 1 | 2 | %% Table 1 from 3 | % Szekely & Rizzo (2013). The distance correlation t-test of independence 4 | % in high dimension. J Multiv Analysis 117: 193-213 5 | % Note that their table is a single sample 6 | clear; 7 | n = 30; 8 | p = [1 2 4 8 16 32 64 128 256 512 1024 2048 4096]; 9 | reps = 1; 10 | 11 | for i = 1:numel(p) 12 | for j = 1:reps 13 | x = rand(30,p(i)); 14 | y = rand(30,p(i)); 15 | r(j,i) = dep.dcorr(x,y); 16 | 17 | rstar(j,i) = dep.dcorr(x,y,true); 18 | T(j,i) = sqrt(n*(n-3)/2-1)*rstar(j,i)/sqrt(1-rstar(j,i)^2); 19 | end 20 | end 21 | 22 | table(p',mean(r,1)',mean(rstar,1)',mean(T,1)',... 
23 | 'VariableNames',{'pq','R','Rstar','T'}) 24 | 25 | % [pval,r,T] =dep.dcorrtest([1 2 3 4 5]',[1.4 1.4 3.5 4.2 4.8]') 26 | % DepTest2([1 2 3 4 5]',[1.4 1.4 3.5 4.2 4.8]','test','dcorr') 27 | % % Replicate using R 'energy' package 28 | % dcor.ttest(c(1,2,3,4,5),c(1.4,1.4,3.5,4.2,4.8)) 29 | % 30 | % dcor t-test of independence 31 | % 32 | % data: c(1, 2, 3, 4, 5) and c(1.4, 1.4, 3.5, 4.2, 4.8) 33 | % T = 5.6569, df = 4, p-value = 0.002406 34 | % sample estimates: 35 | % Bias corrected dcor 36 | % 0.942809 37 | 38 | % Section 3, example 1, page 200 39 | clear; 40 | n = 30; 41 | p = 30; 42 | q = 30; 43 | reps = 1000; 44 | 45 | for i = 1:reps 46 | x = rand(n,p); 47 | y = rand(n,q); 48 | [pval(i),~,T(i)] = dep.dcorrtest(x,y); 49 | end 50 | 51 | clear; 52 | n = 30; 53 | p = 30; 54 | q = 30; 55 | reps = 1000; 56 | 57 | for i = 1:reps 58 | x = rand(n,p); 59 | y = x + sqrt(.2)*randn(n,q); % I think there is a typo in the paper 60 | [pval(i),~,T(i)] = dep.dcorrtest(x,y); 61 | end -------------------------------------------------------------------------------- /+utils/jbld.m: -------------------------------------------------------------------------------- 1 | % JBLD Jensen-Bregman LogDet Divergence 2 | % 3 | % div = jbld(x,y) 4 | % 5 | % INPUTS 6 | % x - [n x n] positive semi-definite matrix 7 | % y - [n x n] positive semi-definite matrix 8 | % 9 | % OUTPUTS 10 | % div - Jensen-Bregman LogDet Divergence 11 | % 12 | % REFERENCE 13 | % Cherian et al (2012). Jensen-Bregman LogDet Divergence with Application 14 | % to Efficient Similarity Search for Covariance Matrices. 15 | % Trans Pattern Analysis & Machine Intelligence 16 | 17 | % $ Copyright (C) 2014 Brian Lau http://www.subcortex.net/ $ 18 | % The full license and most recent version of the code can be found at: 19 | % https://github.com/brian-lau/highdim 20 | % 21 | % This program is free software: you can redistribute it and/or modify 22 | % it under the terms of the GNU General Public License as published by 23 | % the Free Software Foundation, either version 3 of the License, or 24 | % (at your option) any later version. 25 | % 26 | % This program is distributed in the hope that it will be useful, 27 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 28 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 29 | % GNU General Public License for more details. 30 | 31 | function div = jbld(x,y) 32 | 33 | [m,p] = size(x); 34 | [n,q] = size(y); 35 | 36 | if (m~=n) || (p~=q) 37 | error('x and y must be the same size'); 38 | end 39 | 40 | cxy = chol((x+y)/2); 41 | cx = chol(x); 42 | cy = chol(y); 43 | div = log(prod(diag(cxy).^2)) - log(prod(diag(cx).^2)*prod(diag(cy).^2))/2; 44 | 45 | % div2 = log(det((x+y)/2)) - log(det(x*y))/2; -------------------------------------------------------------------------------- /+sphere/gineajne.m: -------------------------------------------------------------------------------- 1 | % GINEAJNE Weighted Gine/Ajne statistic for spherical uniformity 2 | % 3 | % F = gineajne(U) 4 | % 5 | % A weighted sum of Gine's and Anje's statistics is consistent against 6 | % all alternatives to uniformity on S^(p-1), the unit sphere in R^p. 7 | % 8 | % INPUTS 9 | % U - [n x p] matrix, n samples with dimensionality p 10 | % the data should already be projected to the unit hypersphere 11 | % 12 | % OUTPUTS 13 | % F - statistic 14 | % 15 | % REFERENCE 16 | % Prentice, MJ (1978). On invariant tests of uniformity for directions 17 | % and orientations. Annals of Statistics 6: 169-176. 
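% EXAMPLE (a minimal usage sketch, assuming the +sphere package is on the path)
%   U = sphere.spatialSign(randn(100,8)); % 100 points on the unit sphere in R^8
%   F = sphere.gineajne(U);               % statistic only; see UniSphereTest for p-values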
18 | % 19 | % SEE ALSO 20 | % UniSphereTest, gine, ajne, spatialSign 21 | 22 | % $ Copyright (C) 2017 Brian Lau, brian.lau@upmc.fr $ 23 | % The full license and most recent version of the code can be found at: 24 | % https://github.com/brian-lau/highdim 25 | % 26 | % This program is free software: you can redistribute it and/or modify 27 | % it under the terms of the GNU General Public License as published by 28 | % the Free Software Foundation, either version 3 of the License, or 29 | % (at your option) any later version. 30 | % 31 | % This program is distributed in the hope that it will be useful, 32 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 33 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 34 | % GNU General Public License for more details. 35 | 36 | function F = gineajne(U) 37 | 38 | [n,p] = size(U); 39 | 40 | psi = sphere.psivec(U,n); 41 | G = n/2 - (p-1)/(2*n) * ( exp(gammaln((p-1)/2) - gammaln(p/2)) )^2 * sum(sin(psi)); 42 | A = (n/4) - (1/(n*pi))*sum(psi); 43 | F = G + A; 44 | -------------------------------------------------------------------------------- /Testing/test_sphericity3.m: -------------------------------------------------------------------------------- 1 | %% Compare bias-corrected sign test size & power with table 1 from 2 | % Zou et al (2014). Multivariate sign-based high-dimensional tests for 3 | % sphericity. Biometrika 101: 229-236 4 | clear all; 5 | n = [40 80]; 6 | p = [55 181 642]; 7 | reps = 100; 8 | v = [0 0.125 0.250]; 9 | 10 | tic; 11 | for i = 1:numel(n) 12 | for j = 1:numel(p) 13 | for k = 1:numel(v) 14 | for m = 1:reps 15 | y = randn(n(i),p(j)); 16 | vp = round(v(k)*p(j)); 17 | A = [sqrt(2)*ones(vp,1) ; ones(p(j)-vp,1)]; 18 | x = (diag(A)*y')'; 19 | pval(m) = sphere.signtest(x,'test','bcs'); 20 | end 21 | prob(i,j,k) = mean(pval<=0.05); 22 | end 23 | toc 24 | end 25 | end 26 | 27 | 100*prob 28 | 29 | % reps = 2000 % 24.11.2014 30 | % approx = true 31 | % ans(:,:,1) = 32 | % 33 | % 4.7000 5.7500 5.8000 34 | % 6.2500 3.9500 4.7000 35 | % 36 | % ans(:,:,2) = 37 | % 38 | % 45.1500 47.8500 49.9000 39 | % 87.6500 93.3000 94.1500 40 | % 41 | % ans(:,:,3) = 42 | % 43 | % 64.7000 69.6500 72.4500 44 | % 98.8000 99.3500 99.6500 45 | 46 | % reps = 2000 % 25.11.2014 47 | % approx = false 48 | % ans(:,:,1) = 49 | % 50 | % 5.8000 5.8000 5.9000 51 | % 5.5500 5.1500 4.4000 52 | % 53 | % ans(:,:,2) = 54 | % 55 | % 43.9000 50.4500 50.4500 56 | % 86.6000 92.8500 94.6500 57 | % 58 | % ans(:,:,3) = 59 | % 60 | % 67.2000 68.7000 71.8500 61 | % 98.7500 99.6500 99.6000 62 | 63 | % values from Zou et al. Table 1 64 | pZ(:,:,1) = [... 65 | 4.9 4.9 5.1;... 66 | 4.7 5.2 5.1]; 67 | pZ(:,:,2) = [... 68 | 41 47 49;... 69 | 84 91 94]; 70 | pZ(:,:,3) = [... 71 | 64 68 72;... 72 | 99 100 100] 73 | -------------------------------------------------------------------------------- /+sphere/rp.m: -------------------------------------------------------------------------------- 1 | % RP Random projection stat for spherical uniformity 2 | % 3 | % stat = rp(U,k) 4 | % 5 | % INPUTS 6 | % U - [n x p] matrix, n samples with dimensionality p 7 | % the data should already be projected to the unit hypersphere 8 | % k - number of random vectors to project onto 9 | % 10 | % OUTPUTS 11 | % stat - [n x k] vector of of angles between data and k random vectors 12 | % 13 | % REFERENCE 14 | % Cuesta-Albertos, JA et al (2009). On projection-based tests for 15 | % directional and compositional data. Stat Comput 19: 367-380 16 | % Cuesta-Albertos, JA et al (2007). 
A sharp form of the Cramer-Wold 17 | % theorem. J Theor Probab 20: 201-209 18 | % 19 | % SEE ALSO 20 | % UniSphereTest, spatialSign 21 | 22 | % $ Copyright (C) 2014 Brian Lau http://www.subcortex.net/ $ 23 | % The full license and most recent version of the code can be found at: 24 | % https://github.com/brian-lau/highdim 25 | % 26 | % This program is free software: you can redistribute it and/or modify 27 | % it under the terms of the GNU General Public License as published by 28 | % the Free Software Foundation, either version 3 of the License, or 29 | % (at your option) any later version. 30 | % 31 | % This program is distributed in the hope that it will be useful, 32 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 33 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 34 | % GNU General Public License for more details. 35 | 36 | function stat = rp(U,k) 37 | 38 | [n,p] = size(U); 39 | 40 | % Uniform random directions 41 | u0 = sphere.spatialSign(randn(k,p)); 42 | stat = zeros(n,k); 43 | 44 | for i = 1:k 45 | stat(:,i) = acos(U*u0(i,:)'); 46 | end 47 | -------------------------------------------------------------------------------- /+sphere/rayleigh.m: -------------------------------------------------------------------------------- 1 | % RAYLEIGH Rayleigh statistic for spherical uniformity 2 | % 3 | % [pval,R] = rayleigh(U) 4 | % 5 | % Most powerful invariant test against von Mises alternative. 6 | % Not consistent against alternatives with zero resultant length 7 | % (Mardia & Jupp, pg 209). 8 | % 9 | % INPUTS 10 | % U - [n x p] matrix, n samples with dimensionality p 11 | % the data should already be projected to the unit hypersphere 12 | % 13 | % OUTPUTS 14 | % pval - p-value 15 | % R - statistic 16 | % 17 | % REFERENCE 18 | % Mardia, KV, Jupp, PE (2000). Directional Statistics. John Wiley 19 | % 20 | % SEE ALSO 21 | % UniSphereTest, spatialSign 22 | 23 | % $ Copyright (C) 2014 Brian Lau http://www.subcortex.net/ $ 24 | % The full license and most recent version of the code can be found at: 25 | % https://github.com/brian-lau/highdim 26 | % 27 | % This program is free software: you can redistribute it and/or modify 28 | % it under the terms of the GNU General Public License as published by 29 | % the Free Software Foundation, either version 3 of the License, or 30 | % (at your option) any later version. 31 | % 32 | % This program is distributed in the hope that it will be useful, 33 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 34 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 35 | % GNU General Public License for more details. 36 | 37 | function [pval,R] = rayleigh(U) 38 | 39 | [n,p] = size(U); 40 | 41 | if 0 42 | R = (p/n)*sum(sum(U*U')); 43 | else 44 | % Modified Rayleigh test statistic (Mardia & Jupp, eq. 10.4.6) 45 | Ubar = mean(U); 46 | T = n*p*sum(Ubar.^2); 47 | R = (1-1/(2*n))*T + (1/(2*n*(p+2)))*T^2; 48 | end 49 | 50 | pval = 1 - chi2cdf(R,p); 51 | -------------------------------------------------------------------------------- /+utils/DyadUpdate.c: -------------------------------------------------------------------------------- 1 | /* 2 | * The main routine for DyadUpdate.c 3 | * 4 | * Huo & Szekely (2017). Fast Computing for Distance Covariance, 5 | * Technometrics, 2016, 58, 435?447. 
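 *
 * (What the routine below computes, as a sketch: for ii = 2..n it forms the
 * partial sums GAMMA(ii) = sum of C(j) over j < ii with Y(j) < Y(ii), using a
 * dyadic/binary-tree table of sums so the whole pass costs O(n log n) rather
 * than O(n^2); Y is assumed to hold positive integer ranks in 1..n.)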
6 | * 7 | * Copyright (c) 2014 Xiaoming Huo 8 | */ 9 | 10 | #ifdef MATLAB_MEX_FILE 11 | #include "mex.h" 12 | #define calloc mxCalloc 13 | #define free mxFree 14 | #endif 15 | 16 | #define S(i) s_p[i-1] 17 | #define Y(i) Y_p[i-1] 18 | #define C(i) C_p[i-1] 19 | #define GAMMA(i) GAMMA_p[i-1] 20 | 21 | void DyadUpdate(double GAMMA_p[],double Y_p[],double C_p[],const int n) 22 | { 23 | int L,ii,ell,k,pos,scale,s_length; 24 | double *s_p; 25 | 26 | L = (int) ceil(log((double) n)/log((double) 2)); 27 | s_length = (int) pow(((double) 2), ((double) L+1)); 28 | s_p = (double *) calloc(s_length, sizeof(double)); 29 | 30 | for(ii=2;ii<=n;ii++){ 31 | for(ell=0;ell<=L-1;ell++){ 32 | k = (int) ceil(Y(ii-1)/((int) pow(((double) 2), ((double) ell)))); 33 | pos = k; 34 | if(ell>0){ 35 | for(scale=ell-1;scale>=0;scale--){ 36 | pos = pos + (int) pow(((double) 2), ((double) L-scale)); 37 | } 38 | } 39 | S(pos) = S(pos) + C(ii-1); 40 | } 41 | for(ell=0;ell<=L-1;ell++){ 42 | k = (int) floor((double) (Y(ii)-1)/((int) pow(((double) 2), ((double) ell)))); 43 | if((double) k/2 > (int) floor(((double) k)/2)){ 44 | pos = k; 45 | if(ell>0){ 46 | for(scale=ell-1;scale>=0;scale--){ 47 | pos = pos + (int) pow(((double) 2), ((double) L-scale)); 48 | } 49 | } 50 | GAMMA(ii) = GAMMA(ii) + S(pos); 51 | } 52 | } 53 | } 54 | 55 | free(s_p); 56 | } 57 | 58 | #undef S 59 | #undef Y 60 | #undef C 61 | #undef GAMMA 62 | -------------------------------------------------------------------------------- /Testing/Test_dcov_dcorr.m: -------------------------------------------------------------------------------- 1 | % xUnit framework required 2 | % https://psexton.github.io/matlab-xunit/ 3 | 4 | % energy package 1.6.2 5 | % > dcov(c(1,2,3,4),c(1,1,2,6)) 6 | % [1] 1.118034 7 | % > dcor(c(1,2,3,4),c(1,1,2,6)) 8 | % [1] 0.8947853 9 | % > dcov(c(1,2,3),c(.5,2,3.4)) 10 | % [1] 0.846197 11 | % > dcor(c(1,2,3),c(.5,2,3.4)) 12 | % [1] 0.9998217 13 | % > dcov(c(-11,2,3),c(.5,2,3.4)) 14 | % [1] 2.258591 15 | % > dcor(c(-11,2,3),c(.5,2,3.4)) 16 | % [1] 0.9206351 17 | 18 | classdef Test_dcov_dcorr < TestCase 19 | properties 20 | end 21 | 22 | methods 23 | function self = Test_dcov_dcorr(name) 24 | self = self@TestCase(name); 25 | end 26 | 27 | function setUp(self) 28 | end 29 | 30 | function test_dcov1(self) 31 | d = dep.dcov([1 2 3 4]',[1 1 2 6]'); 32 | assertElementsAlmostEqual(d,1.118034,'absolute',1e-5); 33 | end 34 | 35 | function test_dcov2(self) 36 | d = dep.dcov([1 2 3]',[.5 2 3.4]'); 37 | assertElementsAlmostEqual(d,0.846197,'absolute',1e-5); 38 | end 39 | 40 | function test_dcov3(self) 41 | d = dep.dcov([-11 2 3]',[.5 2 3.4]'); 42 | assertElementsAlmostEqual(d,2.258591,'absolute',1e-5); 43 | end 44 | 45 | function test_dcorr1(self) 46 | d = dep.dcorr([1 2 3 4]',[1 1 2 6]'); 47 | assertElementsAlmostEqual(d,0.8947853,'absolute',1e-5); 48 | end 49 | 50 | function test_dcorr2(self) 51 | d = dep.dcorr([1 2 3]',[.5 2 3.4]'); 52 | assertElementsAlmostEqual(d,0.9998217,'absolute',1e-5); 53 | end 54 | 55 | function test_dcorr3(self) 56 | d = dep.dcorr([-11 2 3]',[.5 2 3.4]'); 57 | assertElementsAlmostEqual(d,0.9206351,'absolute',1e-5); 58 | end 59 | 60 | function tearDown(self) 61 | end 62 | end 63 | end -------------------------------------------------------------------------------- /+utils/nystrom.m: -------------------------------------------------------------------------------- 1 | % NYSTROM Nystrom approximation of kernel matrix 2 | % 3 | % [phi,K] = nystrom(X,varargin) 4 | % 5 | % INPUTS 6 | % X - [n x p] n samples of dimensionality p 7 | 
% 8 | % OPTIONAL 9 | % c - scalar, number of columns to sample (without replacement) 10 | % rsvd - boolean indicating whether to use randomized SVD 11 | % tol - scalar tolerance for truncating small singular values 12 | % 13 | % Additional name/value pairs are passed through to RSVD if true. 14 | % 15 | % OUTPUTS 16 | % phi - approximate feature mapped data 17 | % K - approximate Gram matrix 18 | % 19 | % REFERENCES 20 | % Wang (2015). A Practical Guide to Randomized Matrix Computations with 21 | % MATLAB Implementations. https://arxiv.org/abs/1505.07570 22 | % 23 | % SEE ALSO 24 | % rsvd 25 | 26 | function [phi,K] = nystrom(X,varargin) 27 | 28 | par = inputParser; 29 | par.KeepUnmatched = true; 30 | addRequired(par,'X',@isnumeric); 31 | addParamValue(par,'c',[],@(x) isnumeric(x) && isscalar(x)); 32 | addParamValue(par,'rsvd',false,@islogical); 33 | addParamValue(par,'tol',[],@(x) isnumeric(x) && isscalar(x)); 34 | parse(par,X,varargin{:}); 35 | 36 | [n,p] = size(X); 37 | if isempty(par.Results.c) 38 | c = fix(0.25*n); % Default to 25% columns 39 | else 40 | c = min(par.Results.c,n); 41 | end 42 | 43 | ind = randperm(n); 44 | ind = ind(1:c); 45 | C = utils.kernel(X,X(ind,:),par.Unmatched); % C = K(:,ind) 46 | W = C(ind,:); 47 | 48 | if par.Results.rsvd 49 | %[U,S] = utils.rsvd(W,par.Unmatched); 50 | [U,S] = utils.rsvd(W,30,10,3); 51 | else 52 | [U,S] = svd(W); 53 | end 54 | s = diag(S); 55 | if isempty(par.Results.tol) 56 | tol = max(size(W)) * eps(norm(s,inf)); % from pinv 57 | else 58 | tol = par.Results.tol; 59 | end 60 | c = sum(s > tol); 61 | s = 1./sqrt(s(1:c)); 62 | UW = bsxfun(@times,U(:,1:c),s'); 63 | 64 | phi = C*UW; 65 | 66 | if nargout == 2 67 | K = phi*phi'; 68 | end -------------------------------------------------------------------------------- /+dep/rdc.m: -------------------------------------------------------------------------------- 1 | % RDC Randomized dependence coefficient 2 | % 3 | % r = rdc(x,y,varargin) 4 | % 5 | % RDC is the largest canonical correlation as computed by RCCA on random 6 | % features of the copula transformations of two random samples 7 | % 8 | % INPUTS 9 | % x - [n x p] n samples of dimensionality p 10 | % y - [n x q] n samples of dimensionality q 11 | % 12 | % OPTIONAL 13 | % k 14 | % s 15 | % f 16 | % demean 17 | % 18 | % OUTPUTS 19 | % 20 | % REFERENCE 21 | % 22 | 23 | % Based on R code: 24 | % https://github.com/lopezpaz/randomized_dependence_coefficient/blob/master/code/algorithms.r 25 | % rdc <- function(x,y,k=20,s=1/6,f=sin) { 26 | % x <- cbind(apply(as.matrix(x),2,function(u)rank(u)/length(u)),1) 27 | % y <- cbind(apply(as.matrix(y),2,function(u)rank(u)/length(u)),1) 28 | % x <- s/ncol(x)*x%*%matrix(rnorm(ncol(x)*k),ncol(x)) 29 | % y <- s/ncol(y)*y%*%matrix(rnorm(ncol(y)*k),ncol(y)) 30 | % cancor(cbind(f(x),1),cbind(f(y),1))$cor[1] 31 | % } 32 | 33 | function r = rdc(x,y,varargin) 34 | 35 | par = inputParser; 36 | par.KeepUnmatched = true; 37 | addRequired(par,'x',@isnumeric); 38 | addRequired(par,'y',@isnumeric); 39 | addParamValue(par,'k',20,@isscalar); 40 | addParamValue(par,'s',1/6,@isscalar); 41 | addParamValue(par,'f',@sin,@(x) isa(x,'function_handle')); 42 | addParamValue(par,'demean',false,@islogical); 43 | parse(par,x,y,varargin{:}); 44 | 45 | n = size(x,1); 46 | if par.Results.demean 47 | x = bsxfun(@minus,x,mean(x)); 48 | y = bsxfun(@minus,y,mean(y)); 49 | end 50 | 51 | x = [tiedrank(x)/n ones(n,1)]; 52 | y = [tiedrank(y)/n ones(n,1)]; 53 | 54 | f = par.Results.f; 55 | s = par.Results.s; 56 | k = par.Results.k; 57 | x = 
f(s/size(x,2)*x*randn(size(x,2),k)); 58 | y = f(s/size(y,2)*y*randn(size(y,2),k)); 59 | 60 | warning('off','stats:canoncorr:NotFullRank'); 61 | [~,~,r] = canoncorr([x ones(n,1)],[y ones(n,1)]); 62 | warning('on','stats:canoncorr:NotFullRank'); 63 | 64 | r = r(1); 65 | 66 | -------------------------------------------------------------------------------- /+sphere/bingham.m: -------------------------------------------------------------------------------- 1 | % BINGHAM Bingham test for spherical uniformity 2 | % 3 | % [pval,B] = bingham(U) 4 | % 5 | % Antipodially symmetric 6 | % Not consistent against alternatives with E[xx'] = (1/p)*Ip 7 | % 8 | % INPUTS 9 | % U - [n x p] matrix, n samples with dimensionality p 10 | % the data should already be projected to the unit hypersphere 11 | % 12 | % OUTPUTS 13 | % pval - p-value 14 | % B - statistic 15 | % 16 | % REFERENCE 17 | % Mardia, KV, Jupp, PE (2000). Directional Statistics. John Wiley 18 | % 19 | % SEE ALSO 20 | % UniSphereTest, spatialSign 21 | 22 | % $ Copyright (C) 2014 Brian Lau http://www.subcortex.net/ $ 23 | % The full license and most recent version of the code can be found at: 24 | % https://github.com/brian-lau/highdim 25 | % 26 | % This program is free software: you can redistribute it and/or modify 27 | % it under the terms of the GNU General Public License as published by 28 | % the Free Software Foundation, either version 3 of the License, or 29 | % (at your option) any later version. 30 | % 31 | % This program is distributed in the hope that it will be useful, 32 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 33 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 34 | % GNU General Public License for more details. 35 | 36 | function [pval,B] = bingham(U) 37 | 38 | [n,p] = size(U); 39 | 40 | if 1 41 | % eq. 10.7.1 42 | T = (1/n)*U'*U; 43 | B = ((n*p*(p+2))/2)*(trace(T^2) - 1/p); 44 | else 45 | % Modified Bingham test statistic (Mardia & Jupp, eq. 10.7.3) 46 | % seems to blow up for certain data? 47 | T = (1/n)*U'*U; 48 | B = ((n*p*(p+2))/2)*(trace(T^2) - 1/p); 49 | B0 = (2*p^2+3*p+4)/(6*(p+4)); 50 | B1 = -(4*p^2+3*p-4)/(3*(p+4)*(p^2+p+2)); 51 | B2 = 4*(p^2-4)/(3*(p+4)*(p^2+p+2)*(p^2+p+6)); 52 | B = B*(1 - (1/n)*(B0 + B1*B + B2*B^2)); 53 | end 54 | 55 | pval = 1 - chi2cdf(B,((p-1)*(p+2))/2); 56 | -------------------------------------------------------------------------------- /+dep/rank.m: -------------------------------------------------------------------------------- 1 | % RANK Rank-based statistics for testing independence 2 | % 3 | % r = rank(x,type) 4 | % 5 | % INPUTS 6 | % x - [n x p] matrix, n samples with dimensionality p 7 | % 8 | % OPTIONAL 9 | % type - 'spearman' - R1 from Han & Liu (DEFAULT) 10 | % 'kendall' - R2 from Han & Liu 11 | % 12 | % OUTPUTS 13 | % r - rank statistic 14 | % 15 | % REFERENCE 16 | % Han & Liu (2014). Distribution-free tests of independence with 17 | % applications to testing more structures. arXiv:1410.4179v1 18 | % 19 | % SEE ALSO 20 | % DepTest1, ranktest 21 | 22 | % $ Copyright (C) 2014 Brian Lau http://www.subcortex.net/ $ 23 | % The full license and most recent version of the code can be found at: 24 | % https://github.com/brian-lau/highdim 25 | % 26 | % This program is free software: you can redistribute it and/or modify 27 | % it under the terms of the GNU General Public License as published by 28 | % the Free Software Foundation, either version 3 of the License, or 29 | % (at your option) any later version. 
30 | % 31 | % This program is distributed in the hope that it will be useful, 32 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 33 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 34 | % GNU General Public License for more details. 35 | 36 | function r = rank(x,type) 37 | 38 | if nargin < 2 39 | type = 'spearman'; 40 | end 41 | 42 | [n,p] = size(x); 43 | 44 | switch lower(type) 45 | case {'spearman','s'} 46 | rho = corr(x,'type','spearman'); 47 | rho2 = rho.^2; 48 | rho2 = tril(rho2,-1); 49 | r = (n-1)*max(rho2(:)) - 4*log(p) + log(log(p)); 50 | case {'kendall','k'} 51 | %tau = corr(x,'type','kendall'); 52 | tau = dep.kendalltau(x); 53 | tau2 = tau.^2; 54 | tau2 = tril(tau2,-1); 55 | r = ((9*n*(n-1))/(2*(2*n+5)))*max(tau2(:)) - 4*log(p) + log(log(p)); 56 | otherwise 57 | error('Unknown type'); 58 | end -------------------------------------------------------------------------------- /+dep/dcorr.m: -------------------------------------------------------------------------------- 1 | % DCORR Distance correlation 2 | % 3 | % r = dcorr(x,y,varargin) 4 | % 5 | % INPUTS 6 | % x - [n x p] n samples of dimensionality p 7 | % y - [n x q] n samples of dimensionality q 8 | % 9 | % OPTIONAL (as name/value pairs, order irrelevant) 10 | % unbiased - true indicates bias-corrected estimate (default=false) 11 | % dist - true indicates x & y are distance matrices (default=false) 12 | % doublecenter - true indicates x & y are double-centered distance 13 | % matrices (default=false) 14 | % 15 | % OUTPUTS 16 | % r - distance correlation between x,y 17 | % 18 | % REFERENCE 19 | % Szekely et al (2007). Measuring and testing independence by correlation 20 | % of distances. Ann Statist 35: 2769-2794 21 | % Szekely & Rizzo (2013). The distance correlation t-test of independence 22 | % in high dimension. J Multiv Analysis 117: 193-213 23 | % 24 | % SEE ALSO 25 | % dcorrtest, dcov 26 | 27 | % $ Copyright (C) 2017 Brian Lau, brian.lau@upmc.fr $ 28 | % The full license and most recent version of the code can be found at: 29 | % https://github.com/brian-lau/highdim 30 | % 31 | % This program is free software: you can redistribute it and/or modify 32 | % it under the terms of the GNU General Public License as published by 33 | % the Free Software Foundation, either version 3 of the License, or 34 | % (at your option) any later version. 35 | % 36 | % This program is distributed in the hope that it will be useful, 37 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 38 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 39 | % GNU General Public License for more details. 40 | 41 | function r = dcorr(x,y,varargin) 42 | 43 | par = inputParser; 44 | par.KeepUnmatched = true; 45 | addRequired(par,'x',@isnumeric); 46 | addRequired(par,'y',@isnumeric); 47 | parse(par,x,y,varargin{:}); 48 | 49 | [d,dvx,dvy] = dep.dcov(x,y,par.Unmatched); 50 | if (dvx*dvy) > eps 51 | r = d/sqrt(dvx*dvy); 52 | else 53 | r = 0; 54 | end 55 | -------------------------------------------------------------------------------- /+sphere/rppdf.m: -------------------------------------------------------------------------------- 1 | % RPPDF Distribution of angles on a uniform hypersphere 2 | % 3 | % h = rppdf(theta,p) 4 | % 5 | % The distribution of pairwise angles between vectors X1,...,Xn that 6 | % are random points independently chosen with the uniform distribution 7 | % on S^(p-1), the unit sphere in R^p. 
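% Concretely, the density implemented below is
%   h(theta) = (1/sqrt(pi)) * (gamma(p/2)/gamma((p-1)/2)) * sin(theta)^(p-2), 0 <= theta <= pi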
8 | % 9 | % INPUTS 10 | % theta - angles (radians) to evaluate pdf 11 | % p - dimensionality (R^p) 12 | % 13 | % OUTPUTS 14 | % h - pdf 15 | % 16 | % EXAMPLE 17 | % p = 8; 18 | % x = randn(50000,p); 19 | % U = sphere.spatialSign(x); 20 | % u0 = sphere.spatialSign(randn(1,p)); 21 | % dx = 0.05; xx = 0:dx:pi; 22 | % n = histc(acos(U*u0'),xx); 23 | % hold on 24 | % bar(xx,n./sum(n),'histc'); 25 | % plot(xx,sphere.rppdf(xx,p)*dx,'m') 26 | % 27 | % integral(@(x) sphere.rppdf(x,p),0,pi) 28 | % 29 | % REFERENCE 30 | % Cai, T et al (2013). Distribution of angles in random packing on 31 | % spheres. J of Machine Learning Research 14: 1837-1864. 32 | % 33 | % SEE ALSO 34 | % rp, rpcdf 35 | 36 | % $ Copyright (C) 2017 Brian Lau, brian.lau@upmc.fr $ 37 | % The full license and most recent version of the code can be found at: 38 | % https://github.com/brian-lau/highdim 39 | % 40 | % This program is free software: you can redistribute it and/or modify 41 | % it under the terms of the GNU General Public License as published by 42 | % the Free Software Foundation, either version 3 of the License, or 43 | % (at your option) any later version. 44 | % 45 | % This program is distributed in the hope that it will be useful, 46 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 47 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 48 | % GNU General Public License for more details. 49 | 50 | function h = rppdf(theta,p) 51 | 52 | assert(all(theta>=0)&&all(theta<=pi),'theta must be 0<=theta<=pi.'); 53 | assert((mod(p,1)==0)&&(p>1),'p must be integer > 0.'); 54 | 55 | h = (1/sqrt(pi)) * exp( gammaln(p/2) - gammaln((p-1)/2) )*... 56 | (sin(theta).^(p-2)); 57 | -------------------------------------------------------------------------------- /+utils/sigest.m: -------------------------------------------------------------------------------- 1 | % SIGEST Estimate bandwidth of Gaussian kernel 2 | % 3 | % sigma = sigest(X,varargin) 4 | % 5 | % INPUTS 6 | % X - [n x p] m samples of dimensionality p 7 | % 8 | % OPTIONAL 9 | % sigest - string indicating method for estimating sigma, 10 | % 'median' - Median heuristic, Gretton et al. 2012 11 | % 'adapt' - 12 | % frac - scalar (0,1] indicating fraction of data to use for sigest 13 | % 14 | % OUTPUTS 15 | % sigma - standard deviation of Gaussian kernel 16 | % 17 | % SEE ALSO 18 | % rbf 19 | 20 | % $ Copyright (C) 2017 Brian Lau, brian.lau@upmc.fr $ 21 | % The full license and most recent version of the code can be found at: 22 | % https://github.com/brian-lau/highdim 23 | % 24 | % This program is free software: you can redistribute it and/or modify 25 | % it under the terms of the GNU General Public License as published by 26 | % the Free Software Foundation, either version 3 of the License, or 27 | % (at your option) any later version. 28 | % 29 | % This program is distributed in the hope that it will be useful, 30 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 31 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 32 | % GNU General Public License for more details. 
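% EXAMPLE (a minimal usage sketch, assuming the +utils package is on the path)
%   X = randn(500,10);
%   sigma = utils.sigest(X);            % median-heuristic bandwidth
%   K = utils.rbf(X,X,'sigma',sigma);   % Gaussian kernel built with that bandwidth
%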
33 | 34 | function sigma = sigest(X,varargin) 35 | 36 | par = inputParser; 37 | par.KeepUnmatched = true; 38 | addRequired(par,'X',@isnumeric); 39 | addParamValue(par,'frac',[],@(x) isscalar(x) && (x>0) && (x<=1)); 40 | addParamValue(par,'sigest','median',@ischar); 41 | parse(par,X,varargin{:}); 42 | 43 | [n,p] = size(X); 44 | if isempty(par.Results.frac) 45 | ind = ceil(n*0.1); 46 | X = X(1:min(n,ind),:); 47 | elseif par.Results.frac ~= 1 48 | ind = ceil(n*par.Results.frac); 49 | X = X(1:min(n,ind),:); 50 | end 51 | 52 | switch lower(par.Results.sigest) 53 | case {'median'} 54 | % Median heuristic, Gretton et al. 2012 55 | sigma = sqrt(0.5*median(pdist(X).^2)); 56 | case {'adapt'} 57 | % TODO 58 | otherwise 59 | error('Unknown sigma estimator'); 60 | end -------------------------------------------------------------------------------- /setup_highdim.m: -------------------------------------------------------------------------------- 1 | function setup_highdim 2 | 3 | %% Setup path 4 | baseDirectory = fileparts(mfilename('fullpath')); 5 | addpath(genpath_ignoreHiddenDir(baseDirectory)); 6 | 7 | %% Compile DyadUpdate 8 | if exist('+utils/mexDyadUpdate','file')~=3 9 | here = pwd; 10 | cd( fullfile(baseDirectory,'+utils') ); 11 | disp('Compiling DyadUpdate code'); 12 | mex -largeArrayDims -O mexDyadUpdate.c 13 | cd(here); 14 | end 15 | 16 | %% FJLT (Fast Hadamard) code 17 | if exist('+utils/mexHadamard','file')~=3 18 | here = pwd; 19 | cd( fullfile(baseDirectory,'+utils') ); 20 | disp('Compiling fast Hadamard code'); 21 | if isunix 22 | % Assuming we are using gcc, so I know some fancier flags 23 | % This might make a difference on new computers (> 2012) that have AVX 24 | mex -O CFLAGS="\$CFLAGS -march=native -O3" mexHadamard.c -DNO_UCHAR 25 | else 26 | mex -O mexHadamard.c 27 | end 28 | cd(here); 29 | end 30 | 31 | 32 | function p = genpath_ignoreHiddenDir(d) 33 | %% 34 | % initialise variables 35 | classsep = '@'; % qualifier for overloaded class directories 36 | packagesep = '+'; % qualifier for overloaded package directories 37 | p = ''; % path to be returned 38 | 39 | % Generate path based on given root directory 40 | files = dir(d); 41 | if isempty(files) 42 | return 43 | end 44 | 45 | % Add d to the path even if it is empty. 46 | p = [p d pathsep]; 47 | 48 | % set logical vector for subdirectory entries in d 49 | isdir = logical(cat(1,files.isdir)); 50 | % 51 | % Recursively descend through directories which are neither 52 | % private nor "class" directories. 53 | % 54 | dirs = files(isdir); % select only directory entries from the current listing 55 | 56 | for i=1:length(dirs) 57 | dirname = dirs(i).name; 58 | if ~strcmp( dirname,'.') && ... 59 | ~strcmp( dirname,'..') && ... 60 | ~strncmp( dirname,classsep,1) && ... 61 | ~strncmp( dirname,packagesep,1) && ... 62 | ~strcmp( dirname,'private') && ... 63 | ~strcmpi( dirname(1), '.' ) % added in order to exclude .git/ files 64 | p = [p genpath(fullfile(d,dirname))]; % recursive calling of this function. 
65 | end 66 | end 67 | -------------------------------------------------------------------------------- /+utils/rbf.m: -------------------------------------------------------------------------------- 1 | % RBF Kernel matrix using Gaussian radial basis 2 | % 3 | % [k,sigma] = rbf(x,y,varargin) 4 | % 5 | % INPUTS 6 | % x - [m x p] m samples of dimensionality p 7 | % y - [n x p] n samples of dimensionality p 8 | % OR [], empty 9 | % 10 | % OPTIONAL 11 | % sigma - scalar, standard deviation of Gaussian kernel, default = [] 12 | % Only valid when sigma = [] 13 | % sigest - string indicating method for estimating sigma, 14 | % 'median' - Median heuristic, Gretton et al. 2012 15 | % 'adapt' - 16 | % frac - scalar (0,1] indicating fraction of data to use for sigest 17 | % 18 | % Additional name/value pairs are passed through to function for 19 | % estimating the kernel when using an approximation method. 20 | % 21 | % OUTPUTS 22 | % k - kernel matrix 23 | % sigma - standard deviation of Gaussian kernel 24 | % 25 | % SEE ALSO 26 | % sigest 27 | 28 | % $ Copyright (C) 2017 Brian Lau, brian.lau@upmc.fr $ 29 | % The full license and most recent version of the code can be found at: 30 | % https://github.com/brian-lau/highdim 31 | % 32 | % This program is free software: you can redistribute it and/or modify 33 | % it under the terms of the GNU General Public License as published by 34 | % the Free Software Foundation, either version 3 of the License, or 35 | % (at your option) any later version. 36 | % 37 | % This program is distributed in the hope that it will be useful, 38 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 39 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 40 | % GNU General Public License for more details. 41 | 42 | function [k,sigma] = rbf(x,y,varargin) 43 | 44 | par = inputParser; 45 | par.KeepUnmatched = true; 46 | addRequired(par,'x',@isnumeric); 47 | addRequired(par,'y',@isnumeric); 48 | addParamValue(par,'sigma',[],@(x) isnumeric(x) && isscalar(x)); 49 | parse(par,x,y,varargin{:}); 50 | 51 | if isempty(par.Results.sigma) 52 | % Set sigma based on first input 53 | sigma = utils.sigest(x,par.Unmatched); 54 | else 55 | sigma = par.Results.sigma; 56 | end 57 | 58 | k = exp(-utils.sqdist(x,y)/(2*sigma^2)); -------------------------------------------------------------------------------- /+diff/mmd.m: -------------------------------------------------------------------------------- 1 | % MMD Maximal mean discrepancy 2 | % 3 | % [m,sigma] = mmd(x,y,varargin) 4 | % 5 | % INPUTS 6 | % x - [m x p] m samples of dimensionality p 7 | % y - [n x p] n samples of dimensionality p 8 | % 9 | % OPTIONAL (name/value pairs) 10 | % sigma - gaussian bandwidth, default = median heuristic 11 | % biased - boolean indicated biased estimator (default=false) 12 | % 13 | % OUTPUTS 14 | % stat - maximal mean discrepancy 15 | % 16 | % REFERENCE 17 | % Gretton et al (2012). A kernel two-sample test. 18 | % Journal of Machine Learning Research 13: 723-773 19 | % 20 | % SEE ALSO 21 | % mmdtest 22 | 23 | % $ Copyright (C) 2014 Brian Lau http://www.subcortex.net/ $ 24 | % The full license and most recent version of the code can be found at: 25 | % https://github.com/brian-lau/highdim 26 | % 27 | % This program is free software: you can redistribute it and/or modify 28 | % it under the terms of the GNU General Public License as published by 29 | % the Free Software Foundation, either version 3 of the License, or 30 | % (at your option) any later version. 
31 | % 32 | % This program is distributed in the hope that it will be useful, 33 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 34 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 35 | % GNU General Public License for more details. 36 | 37 | function [stat,K,L,KL,sigma,biased] = mmd(x,y,varargin) 38 | 39 | par = inputParser; 40 | par.KeepUnmatched = true; 41 | addRequired(par,'x',@isnumeric); 42 | addRequired(par,'y',@isnumeric); 43 | addParamValue(par,'sigma',[],@isnumeric); 44 | addParamValue(par,'biased',false,@(x) isnumeric(x) || islogical(x)); 45 | parse(par,x,y,varargin{:}); 46 | 47 | [m,p] = size(x); 48 | [n,q] = size(y); 49 | if p ~= q 50 | error('x and y must have same dimensionality (# of columns)'); 51 | end 52 | 53 | if isempty(par.Results.sigma) 54 | % Median heuristic, Gretton et al. 2012 55 | sigma = sqrt(0.5*median(pdist([x;y]).^2)); 56 | else 57 | sigma = par.Results.sigma; 58 | end 59 | 60 | K = utils.rbf(x,x,'sigma',sigma); % kernel within x 61 | L = utils.rbf(y,y,'sigma',sigma); % kernel within y 62 | KL = utils.rbf(x,y,'sigma',sigma); % cross kernel between x and y 63 | K = utils.zerodiag(K); 64 | L = utils.zerodiag(L); 65 | 66 | biased = par.Results.biased; 67 | stat = diff.mmd_(K,L,KL,m,n,biased); 68 | -------------------------------------------------------------------------------- /Testing/test_uniSphereTestPower.m: -------------------------------------------------------------------------------- 1 | % pairsClusterTest from here: https://sites.google.com/site/antimatt/software 2 | % randvonMisesFisherm from here: http://www.stat.pitt.edu/sungkyu/MiscPage.html 3 | clear all; 4 | n = 80;% 1000]; 5 | p = [4 8 16];%[4 10 20]; 6 | kappa = [0 0.25 0.5 1 2 4];%[0 1 2 4]; 7 | reps = 500;%2500; 8 | 9 | prob_r = zeros(numel(kappa),numel(p)); 10 | prob_rp = zeros(numel(kappa),numel(p)); 11 | prob_b = zeros(numel(kappa),numel(p)); 12 | prob_g = zeros(numel(kappa),numel(p)); 13 | prob_a = zeros(numel(kappa),numel(p)); 14 | prob_ga = zeros(numel(kappa),numel(p)); 15 | prob_p = zeros(numel(kappa),numel(p)); 16 | 17 | test = UniSphereTest('autoRun',false); 18 | test.params.nboot = 500; 19 | tic; 20 | for i = 1:numel(kappa) 21 | for j = 1:numel(p) 22 | for k = 1:reps 23 | x = sphere.vmfrnd(p(j),n,kappa(i))'; 24 | 25 | % with noise 26 | %x = [randn(n,p(j)) ; sphere.vmfrnd(p(j),n,kappa(i))']; 27 | 28 | % antipodally symmetric 29 | % mu = zeros(1,p(j)); 30 | % mu(end) = 1; 31 | % x = [sphere.vmfrnd(p(j),n/2,kappa(i),mu)' ;... 32 | % sphere.vmfrnd(p(j),n/2,kappa(i),-mu)']; 33 | % mixture of vmf 34 | % mu = zeros(1,p(j)); 35 | % mu(end) = 1; 36 | % x = [sphere.vmfrnd(p(j),n/3,kappa(i),mu)' ;... 37 | % sphere.vmfrnd(p(j),n/3,kappa(i),-mu)' ;...
38 | % sphere.vmfrnd(p(j),n/3,kappa(i),rand(size(mu)))']; 39 | 40 | test.x = x; 41 | 42 | test.test = 'rayleigh'; test.run(); 43 | h_r(k) = test.h; 44 | test.test = 'randproj'; test.run(); 45 | h_rp(k) = test.h; 46 | test.test = 'bingham'; test.run(); 47 | h_b(k) = test.h; 48 | test.test = 'gine'; test.run(); 49 | h_g(k) = test.h; 50 | test.test = 'ajne'; test.run(); 51 | h_a(k) = test.h; 52 | test.test = 'gine-ajne'; test.run(); 53 | h_ga(k) = test.h; 54 | 55 | [clusteriness, temp, dists, k2] = pairsClusterTest(x); 56 | pv(k) = temp; 57 | 58 | end 59 | prob_r(i,j) = mean(h_r); 60 | prob_rp(i,j) = mean(h_rp); 61 | prob_b(i,j) = mean(h_b); 62 | prob_g(i,j) = mean(h_g); 63 | prob_a(i,j) = mean(h_a); 64 | prob_ga(i,j) = mean(h_ga); 65 | prob_p(i,j) = mean(pv<=0.05); 66 | end 67 | toc 68 | i 69 | end 70 | -------------------------------------------------------------------------------- /+dep/rvtest.m: -------------------------------------------------------------------------------- 1 | % RVTEST Test RV coefficient of dependence 2 | % 3 | % [pval,rv,stat] = rvtest(x,y) 4 | % 5 | % INPUTS 6 | % x - [n x p] n samples of dimensionality p 7 | % y - [n x q] n samples of dimensionality q 8 | % 9 | % OUTPUTS 10 | % pval - p-value from Pearson type III approximation 11 | % rv - RV coefficient 12 | % stat - test statistic, normalized RV coefficient 13 | % 14 | % REFERENCE 15 | % Josse et al (2008). Testing the significance of the RV coefficient. 16 | % Computational Statistics and Data Analysis 53: 82-91 17 | % 18 | % SEE ALSO 19 | % rv, dcorr, dcorrtest, DepTest2 20 | 21 | % $ Copyright (C) 2017 Brian Lau, brian.lau@upmc.fr $ 22 | % The full license and most recent version of the code can be found at: 23 | % https://github.com/brian-lau/highdim 24 | % 25 | % This program is free software: you can redistribute it and/or modify 26 | % it under the terms of the GNU General Public License as published by 27 | % the Free Software Foundation, either version 3 of the License, or 28 | % (at your option) any later version. 29 | % 30 | % This program is distributed in the hope that it will be useful, 31 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 32 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 33 | % GNU General Public License for more details. 34 | 35 | function [pval,rv,stat] = rvtest(x,y) 36 | 37 | [n,~] = size(x); 38 | assert(n == size(y,1),'RVTEST requires x and y to have the same # of samples'); 39 | 40 | [rv,xx,yy] = dep.rv(x,y); 41 | 42 | % mean 43 | bx = trace(xx)^2/trace(xx^2); 44 | by = trace(yy)^2/trace(yy^2); 45 | mu_rv = sqrt(bx*by)/(n-1); 46 | 47 | % variance 48 | tx = (n-1)/((n-3)*(n-1-bx)) * ... 49 | (n*(n+1)*(sum(diag(xx).^2)/trace(xx^2)) - (n-1)*(bx+2)); 50 | ty = (n-1)/((n-3)*(n-1-by)) * ... 51 | (n*(n+1)*(sum(diag(yy).^2)/trace(yy^2)) - (n-1)*(by+2)); 52 | var_rv = (2*(n-1-bx)*(n-1-by))/((n+1)*(n-1)^2*(n-2)) *... 
53 | (1 + ((n-3)/(2*n*(n-1)))*tx*ty); 54 | 55 | % Standardized RV coefficient 56 | stat = (rv - mu_rv)/sqrt(var_rv); 57 | 58 | % Skewness estimate for Pearson III approximation 59 | [~,~,skew] = utils.permMoments(xx,yy); 60 | 61 | if skew >= 0 62 | pval = gamcdf(stat - (-2/skew),4/skew^2,skew/2,'upper'); 63 | else 64 | as = abs(skew); 65 | pval = gamcdf(skew/as*stat + 2/as,4/skew^2,as/2); 66 | end 67 | 68 | end -------------------------------------------------------------------------------- /+utils/mexDyadUpdate.c: -------------------------------------------------------------------------------- 1 | /* function gamma = DyadUpdate(y,c) 2 | * 3 | * Inputs 4 | * y 5 | * c 6 | * Output 7 | * gamma 8 | * 9 | * Huo & Szekely (2017). Fast Computing for Distance Covariance, 10 | * Technometrics, 2016, 58, 435?447. 11 | * 12 | * Copyright (c) 2014 Xiaoming Huo 13 | */ 14 | 15 | #include 16 | #include 17 | #include "mex.h" 18 | #include "matrix.h" 19 | 20 | /* Input Arguments */ 21 | #define Y prhs[0] 22 | #define C prhs[1] 23 | 24 | /* Output Arguments */ 25 | #define GAMMA plhs[0] 26 | 27 | /* subroutines declaration */ 28 | void DyadUpdate(double GAMMA_p[],double Y_p[],double C_p[],const int n); 29 | 30 | /* Gateway Routine */ 31 | void mexFunction(int nlhs,mxArray *plhs[],int nrhs,const mxArray *prhs[]) 32 | { 33 | /* Variables declarations */ 34 | int n,m_Y,n_Y,m_c,n_c; 35 | double *Y_p,*C_p,*GAMMA_p; 36 | 37 | /* Check for proper number of arguments. */ 38 | if (nrhs != 2) { 39 | mexErrMsgTxt("DyadUpdate requires 2 input arguments."); 40 | } else if (nlhs != 1) { 41 | mexErrMsgTxt("DyadUpdate requires 1 output arguments."); 42 | } 43 | 44 | /* i1. first input */ 45 | /* Get dimensions for 1st input. It should be a column vector. */ 46 | m_Y =(int) mxGetM(Y); n_Y =(int) mxGetN(Y); 47 | if (n_Y > 1) 48 | mexErrMsgTxt("'Y' must be a column vector."); 49 | if (mxIsComplex(Y)) 50 | mexErrMsgTxt("'Y' must be a Real vector."); 51 | 52 | /* Get pointers to the inputs */ 53 | Y_p = mxGetPr(Y); n = m_Y; 54 | 55 | /* i2. second input */ 56 | /* Get dimensions for 2nd input. It should be a vector. */ 57 | m_c =(int) mxGetM(C); n_c =(int) mxGetN(C); 58 | if (n_c > 1) 59 | mexErrMsgTxt("'C' must a column vector."); 60 | if (mxIsComplex(C)) 61 | mexErrMsgTxt("'C' must be a Real vector."); 62 | if (m_c != m_Y) 63 | mexErrMsgTxt("Inputs Y and C must have the same dimensions."); 64 | 65 | /* Get pointers to the inputs */ 66 | C_p = mxGetPr(C); 67 | 68 | /* o1. 
output */ 69 | GAMMA = mxCreateDoubleMatrix((int) n, (int) 1, mxREAL); 70 | if (GAMMA == NULL) 71 | mexErrMsgTxt("Could not allocate memory for GAMMA."); 72 | 73 | GAMMA_p = mxGetPr(GAMMA); 74 | 75 | /* Call subroutine to do the computation */ 76 | DyadUpdate(GAMMA_p,Y_p,C_p,n); 77 | return; 78 | } 79 | 80 | #undef Y 81 | #undef C 82 | #undef GAMMA 83 | 84 | #include "DyadUpdate.c" 85 | -------------------------------------------------------------------------------- /+dep/rv.m: -------------------------------------------------------------------------------- 1 | % RV RV coefficient of dependence 2 | % 3 | % [r,xx,yy] = rv(x,y,varargin) 4 | % 5 | % INPUTS 6 | % x - [n x p] n samples of dimensionality p 7 | % y - [n x q] n samples of dimensionality q 8 | % 9 | % OPTIONAL (name/value pairs) 10 | % type - 'mod' to calculate modified RV (Smiles et al), default='standard' 11 | % demean - boolean indicating to subtract mean for each var, default=TRUE 12 | % 13 | % OUTPUTS 14 | % r - RV coefficient 15 | % xx - inner product matrix of x 16 | % yy - inner product matrix of y 17 | % 18 | % REFERENCE 19 | % Smilde et al (2009). Matrix correlations for high-dimensional data: 20 | % the modified RV-coefficient. Bioinformatics 25: 401-405 21 | % 22 | % SEE ALSO 23 | % rvtest, dcorr, dcorrtest, DepTest2 24 | 25 | % $ Copyright (C) 2017 Brian Lau, brian.lau@upmc.fr $ 26 | % The full license and most recent version of the code can be found at: 27 | % https://github.com/brian-lau/highdim 28 | % 29 | % This program is free software: you can redistribute it and/or modify 30 | % it under the terms of the GNU General Public License as published by 31 | % the Free Software Foundation, either version 3 of the License, or 32 | % (at your option) any later version. 33 | % 34 | % This program is distributed in the hope that it will be useful, 35 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 36 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 37 | % GNU General Public License for more details. 38 | 39 | function [r,xx,yy] = rv(x,y,varargin) 40 | 41 | par = inputParser; 42 | par.KeepUnmatched = true; 43 | addRequired(par,'x',@isnumeric); 44 | addRequired(par,'y',@isnumeric); 45 | addParamValue(par,'type','standard',@ischar); 46 | addParamValue(par,'demean',true,@islogical); 47 | parse(par,x,y,varargin{:}); 48 | 49 | [n,~] = size(x); 50 | assert(n == size(y,1),'RV requires x and y to have the same # of samples'); 51 | 52 | if par.Results.demean 53 | x = bsxfun(@minus,x,mean(x)); 54 | y = bsxfun(@minus,y,mean(y)); 55 | end 56 | xx = x*x'; 57 | yy = y*y'; 58 | 59 | switch lower(par.Results.type) 60 | case {'mod'} 61 | dind = 1:(n+1):n*n; 62 | xx(dind) = xx(dind)' - diag(xx); 63 | yy(dind) = yy(dind)' - diag(yy); 64 | r = trace(xx*yy) / sqrt(trace(xx^2)*trace(yy^2)); 65 | otherwise 66 | r = trace(xx*yy) / sqrt(trace(xx^2)*trace(yy^2)); 67 | end 68 | -------------------------------------------------------------------------------- /+diff/hotell2.m: -------------------------------------------------------------------------------- 1 | % HOTELL2 Hotelling's T-Squared test for two multivariate samples 2 | % 3 | % [pval,T2] = hotell2(x,y) 4 | % 5 | % Hotelling's T-Squared test for comparing d-dimensional data from two 6 | % independent samples, assuming normality w/ common covariance matrix. 
7 | % 8 | % INPUTS 9 | % x - [n1 x d] matrix 10 | % y - [n2 x d] matrix 11 | % 12 | % OUTPUTS 13 | % pval - asymptotic p-value 14 | % T2 - Hotelling T^2 statistic 15 | % 16 | % REFERENCE 17 | % Mardia, K, Kent, J, Bibby J (1979) Multivariate Analysis. Section 3.6.1 18 | % 19 | % SEE ALSO 20 | % kstest2d, minentest 21 | 22 | % $ Copyright (C) 2014 Brian Lau http://www.subcortex.net/ $ 23 | % The full license and most recent version of the code can be found on GitHub: 24 | % https://github.com/brian-lau/highdim 25 | % 26 | % This program is free software: you can redistribute it and/or modify 27 | % it under the terms of the GNU General Public License as published by 28 | % the Free Software Foundation, either version 3 of the License, or 29 | % (at your option) any later version. 30 | % 31 | % This program is distributed in the hope that it will be useful, 32 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 33 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 34 | % GNU General Public License for more details. 35 | 36 | function [pval,T2] = hotell2(x,y) 37 | 38 | [nx,px] = size(x); 39 | [ny,py] = size(y); 40 | 41 | if px ~= py 42 | error('# of columns in X and Y must match'); 43 | else 44 | p = px; 45 | end 46 | 47 | n = nx + ny; 48 | mux = mean(x); 49 | muy = mean(y); 50 | 51 | Sx = cov(x); 52 | Sy = cov(y); 53 | 54 | % Hotelling T2 statistic, Section 3.6.1 Mardia et al. 55 | %Su = ((nx-1)*Sx + (ny-1)*Sy) / (n-2); 56 | Su = (nx*Sx + ny*Sy) / (n-2); % unbiased estimate 57 | d = mux - muy; 58 | D2 = d*inv(Su)*d'; 59 | T2 = ((nx*ny)/n)*D2; 60 | F = T2 * (n-p-1) / ((n-2)*p); 61 | 62 | pval = 1 - fcdf(F,p,n-p-1); 63 | 64 | if nargout == 0 65 | fprintf('-------------------------------\n'); 66 | fprintf(' nx = %g\n',nx); 67 | fprintf(' ny = %g\n',ny); 68 | fprintf(' mean(x) = '); 69 | fprintf('%1.3f, ',mux); 70 | fprintf('\n'); 71 | fprintf(' mean(y) = '); 72 | fprintf('%1.3f, ',muy); 73 | fprintf('\n'); 74 | fprintf(' T2 = %5.3f\n',T2); 75 | fprintf(' F(%g,%g) = %5.3f\n',p,n-p-1,F); 76 | fprintf(' p = %5.5f\n',pval); 77 | fprintf('-------------------------------\n'); 78 | end -------------------------------------------------------------------------------- /Testing/test_rank.m: -------------------------------------------------------------------------------- 1 | %% Reproduce size and power from: 2 | % Han & Liu (2014). Distribution-free tests of independence with 3 | % applications to testing more structures. 
arXiv:1410.4179v1 4 | % Table 1 5 | 6 | % Model 1 7 | n = [60 100]; 8 | p = [50 100 200 400 800]; 9 | reps = 10; 10 | 11 | d = DepTest1(); 12 | 13 | tic; 14 | for i = 1:numel(n) 15 | for j = 1:numel(p) 16 | for k = 1:reps 17 | x = randn(n(i),p(j)); 18 | 19 | d.x = x; 20 | h(k) = d.h; 21 | end 22 | prob(i,j) = mean(h); 23 | end 24 | toc 25 | end 26 | % prob = 27 | % 28 | % 0.0240 0.0110 0.0070 0.0060 0.0030 29 | % 0.0230 0.0200 0.0180 0.0150 0.0050 30 | 31 | % Model 5 32 | n = [60 100]; 33 | p = [50 100 200 400 800]; 34 | reps = 100; 35 | 36 | d = DepTest1(); 37 | 38 | tic; 39 | for i = 1:numel(n) 40 | for j = 1:numel(p) 41 | for k = 1:reps 42 | dim = p(j); 43 | ind = triu(ones(dim,dim),1); 44 | f_ind = find(ind); 45 | r = randperm(numel(f_ind)); 46 | nz = f_ind(r(1:4)); 47 | t = zeros(dim,dim); 48 | t(nz) = rand(4,1); 49 | t = t + t'; 50 | 51 | [~,D] = eig(eye(dim)+t); 52 | lambdamin = min(diag(D)); 53 | delta = (-lambdamin+0.05)*(lambdamin<=0); 54 | R = eye(dim) + t + delta*eye(dim); 55 | 56 | x = mvnrnd(zeros(p(j),1),R,n(i)); 57 | 58 | d.x = x; 59 | h(k) = d.h; 60 | end 61 | prob(i,j) = mean(h); 62 | end 63 | toc 64 | end 65 | 66 | % Model 7 67 | n = [60 100]; 68 | p = [50 100 200 400 800]; 69 | reps = 100; 70 | 71 | d = DepTest1(); 72 | 73 | tic; 74 | for i = 1:numel(n) 75 | for j = 1:numel(p) 76 | for k = 1:reps 77 | dim = p(j); 78 | ind = triu(ones(dim,dim),1); 79 | f_ind = find(ind); 80 | r = randperm(numel(f_ind)); 81 | nz = f_ind(r(1:4)); 82 | t = zeros(dim,dim); 83 | t(nz) = rand(4,1); 84 | t = t + t'; 85 | 86 | [~,D] = eig(eye(dim)+t); 87 | lambdamin = min(diag(D)); 88 | delta = (-lambdamin+0.05)*(lambdamin<=0); 89 | R = eye(dim) + t + delta*eye(dim); 90 | 91 | x = mvnrnd(zeros(p(j),1),R,n(i)); 92 | 93 | d.x = x.^3; 94 | h(k) = d.h; 95 | end 96 | prob(i,j) = mean(h); 97 | end 98 | toc 99 | end 100 | -------------------------------------------------------------------------------- /+dep/fdcov.m: -------------------------------------------------------------------------------- 1 | % FDCOV Fast distance covariance 2 | % 3 | % d = fdcov(x,y) 4 | % 5 | % Estimate (unbiased) distance covariance using Huo & Szekely algorithm, 6 | % which has O(n log n) complexity and O(n) storage compared to 7 | % O(n^2) complexity and O(n^2) storage of the naive estimator. 8 | % Valid for univariate and real inputs. 9 | % 10 | % INPUTS 11 | % x - [n x 1] samples 12 | % y - [n x 1] samples 13 | % 14 | % OUTPUTS 15 | % d - distance covariance between x,y 16 | % 17 | % REFERENCE 18 | % Huo & Szekely (2016). Fast Computing for Distance Covariance, 19 | % Technometrics, 58, 435?447. 
DOI:10.1080/00401706.2015.1054435 20 | % 21 | % SEE ALSO 22 | % fdcorr, rpdcov 23 | 24 | % Modified from supplementary materials of Huo & Szekely 25 | % $ Copyright (C) 2014 Xiaoming Huo $ 26 | 27 | function d = fdcov(x,y) 28 | 29 | n = length(x); 30 | assert(isvector(x) && isvector(y),'FDCOV requires x & y to be univariate'); 31 | assert(n == numel(y),'FDCOV requires x & y to be the same length'); 32 | 33 | if isrow(x) 34 | x = x'; 35 | end 36 | 37 | if isrow(y) 38 | y = y'; 39 | end 40 | 41 | temp = (1:n)'; 42 | [vx,Ix0] = sort(x); Ix(Ix0) = temp; Ix = Ix'; 43 | [vy,Iy0] = sort(y); Iy(Iy0) = temp; Iy = Iy'; 44 | sx = cumsum(vx); 45 | sy = cumsum(vy); 46 | alphax = Ix - 1; 47 | alphay = Iy - 1; 48 | betax = sx(Ix) - vx(Ix); 49 | betay = sy(Iy) - vy(Iy); 50 | xdot = sum(x); 51 | ydot = sum(y); 52 | 53 | aidot = xdot + (2*alphax-n).*x - 2*betax; 54 | bidot = ydot + (2*alphay-n).*y - 2*betay; 55 | Sab = sum(aidot.*bidot); 56 | 57 | adotdot = 2*sum(alphax.*x) - 2*sum(betax); 58 | bdotdot = 2*sum(alphay.*y) - 2*sum(betay); 59 | 60 | gamma_1 = partialSum2D(x,y,ones(n,1)); 61 | gamma_x = partialSum2D(x,y,x); 62 | gamma_y = partialSum2D(x,y,y); 63 | gamma_xy = partialSum2D(x,y,x.*y); 64 | 65 | aijbij = sum(x.*y.*gamma_1 + gamma_xy - x.*gamma_y - y.*gamma_x); 66 | d = aijbij/n/(n-3) - 2*Sab/n/(n-2)/(n-3) + adotdot*bdotdot/n/(n-1)/(n-2)/(n-3); 67 | 68 | function gamma = partialSum2D(x,y,c) 69 | 70 | n = length(x); 71 | temp = (1:n)'; 72 | 73 | [~,Ix0] = sort(x); 74 | Ix(Ix0) = temp; % Ix = order stat 75 | 76 | y = y(Ix0); 77 | c = c(Ix0); % so x is at increasing order 78 | [~,Iy0] = sort(y); 79 | Iy(Iy0) = temp; 80 | y = Iy'; % y is a perm of {1,...,n} 81 | 82 | sy = cumsum(c(Iy0)) - c(Iy0); 83 | sx = cumsum(c) - c; 84 | cdot = sum(c); 85 | 86 | gamma1 = utils.mexDyadUpdate(y,c); 87 | 88 | gamma = cdot - c - 2*sy(Iy) - 2*sx + 4*gamma1; 89 | gamma = gamma(Ix); -------------------------------------------------------------------------------- /Testing/test_PAIRS.m: -------------------------------------------------------------------------------- 1 | % Run some simulations to test the power of the test used by Raposo et al 2 | % to detect non-uniform distributions on a hypersphere. 3 | 4 | % You will need the highdim library here: 5 | 6 | % as well as the following functions 7 | % fdr_bh from here: 8 | % http://www.mathworks.com/matlabcentral/fileexchange/27418-benjamini---hochberg-yekutieli-procedure-for-controlling-false-discovery-rate 9 | % pairsClusterTest from here: 10 | % https://sites.google.com/site/antimatt/RaposoKaufmanChurchland2014.zip 11 | % randvonMisesFisherm from here: 12 | % http://www.stat.pitt.edu/sungkyu/software/randvonMisesFisherm.zip 13 | 14 | n = 100; % sample size 15 | p = [4 8 16 32]; % dimensionality 16 | kappa = [0 1 2]; % von-Mises concentration, 0 is uniform for checking size 17 | reps = 100; % repetitions of experiment 18 | 19 | prob_r = zeros(numel(kappa),numel(p)); 20 | prob_rp = zeros(numel(kappa),numel(p)); 21 | prob_ga = zeros(numel(kappa),numel(p)); 22 | prob_p = zeros(numel(kappa),numel(p)); 23 | 24 | tic; 25 | for i = 1:numel(kappa) 26 | for j = 1:numel(p) 27 | for k = 1:reps 28 | % Simple unimodal model 29 | x = randvonMisesFisherm(p(j),n,kappa(i))'; 30 | 31 | pval_r(k) = uniSphereTest(x,'rayleigh'); 32 | pval_rp(k) = uniSphereTest(x,'rp'); 33 | pval_ga(k) = uniSphereTest(x,'ga'); 34 | 35 | [clusteriness, temp, dists, k2] = pairsClusterTest(x); 36 | pval_p(k) = temp; 37 | 38 | % PCA reduce first? 
39 | % [~, ~, latent] = princomp(x); 40 | % vaf = cumsum(latent)./sum(latent); 41 | % ind = find(vaf>=.9); 42 | % [clusteriness, temp, dists, k2] = pairsClusterTest(x(:,1:ind(1))); 43 | % pval_p(i,j,k) = temp; 44 | end 45 | prob_r(i,j) = mean(pval_r<0.05); 46 | prob_rp(i,j) = mean(pval_rp<0.05); 47 | prob_ga(i,j) = mean(pval_ga<0.05); 48 | prob_p(i,j) = mean(pval_p<0.05); 49 | end 50 | toc 51 | end 52 | 53 | figure; 54 | subplot(221); hold on 55 | plot(kappa,prob_r,'--'); 56 | title('Rayleigh test'); 57 | legend('p=4','8','16','32') 58 | axis([kappa(1) kappa(end) 0 1]); 59 | subplot(222); hold on 60 | plot(kappa,prob_r,'--'); 61 | plot(kappa,prob_ga); 62 | title('Gine-Ajne test (solid)'); 63 | axis([kappa(1) kappa(end) 0 1]); 64 | subplot(223); hold on 65 | plot(kappa,prob_r,'--'); 66 | plot(kappa,prob_rp); 67 | title('Random projection test (solid)'); 68 | axis([kappa(1) kappa(end) 0 1]); 69 | subplot(224); hold on 70 | plot(kappa,prob_r,'--'); 71 | plot(kappa,prob_p); 72 | title('PAIRS test (solid)'); 73 | axis([kappa(1) kappa(end) 0 1]); 74 | 75 | -------------------------------------------------------------------------------- /+utils/poldecomp.m: -------------------------------------------------------------------------------- 1 | function [R U V] = poldecomp(F) 2 | %POLDECOMP Performs the polar decomposition of a regular square matrix. 3 | % [R U V] = POLDECOMP(F) factorizes a non-singular square matrix F such 4 | % that F=R*U and F=V*R, where 5 | % U and V are symmetric, positive definite matrices and 6 | % R is a rotational matrix 7 | % 8 | % See also EIG, DIAG, REPMAT 9 | 10 | % Copyright (c) 2014, Zoltan Csati 11 | % All rights reserved. 12 | % 13 | % Redistribution and use in source and binary forms, with or without 14 | % modification, are permitted provided that the following conditions are 15 | % met: 16 | % 17 | % * Redistributions of source code must retain the above copyright 18 | % notice, this list of conditions and the following disclaimer. 19 | % * Redistributions in binary form must reproduce the above copyright 20 | % notice, this list of conditions and the following disclaimer in 21 | % the documentation and/or other materials provided with the distribution 22 | % 23 | % THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 24 | % AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 | % IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 | % ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 27 | % LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 28 | % CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 29 | % SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 30 | % INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 31 | % CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 32 | % ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 33 | % POSSIBILITY OF SUCH DAMAGE. 34 | 35 | % This kind of decomposition is often used in continuum mechanics so it is 36 | % convenient to comment the code that way. From now, we use the matrix 37 | % formalism of tensors. C is the right Cauchy-Green deformation tensor, 38 | % F is the deformation tensor, lambda is the stretch. 
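%
% A small usage sketch (added for illustration, not from the original author):
% for an arbitrary non-singular F, the factors should satisfy F = R*U and
% F = V*R, with R orthogonal and U, V symmetric positive definite.
%
% F = randn(3);                 % hypothetical input, almost surely non-singular
% [R,U,V] = utils.poldecomp(F);
% norm(F - R*U)                 % ~ 0
% norm(F - V*R)                 % ~ 0
% norm(R'*R - eye(3))           % ~ 0, R is orthogonal
%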
39 | 40 | % Check input 41 | [m n] = size(F); 42 | if m ~= n 43 | error('Matrix must be square.'); 44 | end 45 | 46 | C = F'*F; 47 | [Q0 lambdasquare] = eig(C); 48 | lambda = sqrt(diag((lambdasquare))); % extract the components 49 | % Uinv is the inverse of U and is constructed with the help of Q0. Uinv is 50 | % produced in the same base as F not in the base of its eigenvectors. 51 | Uinv = repmat(1./lambda',size(F,1),1).*Q0*Q0'; 52 | % Using the definition, R, U and V can now be calculated 53 | R = F*Uinv; 54 | U = R'*F; 55 | V = F*R'; -------------------------------------------------------------------------------- /+diff/mmdtest.m: -------------------------------------------------------------------------------- 1 | % MMDTEST Two-sample maximal mean discrepancy test 2 | % 3 | % [pval,stat,boot] = mmdtest(x,y,varargin) 4 | % 5 | % Given a sample X1,...,Xm from a p-dimensional multivariate distribution, 6 | % and a sample Y1,...,Xn from a q-dimensional multivariate distribution, 7 | % test the hypothesis: 8 | % 9 | % H0 : X and Y are drawn from the same distribution 10 | % 11 | % INPUTS 12 | % x - [m x p] m samples of dimensionality p 13 | % y - [n x p] n samples of dimensionality p 14 | % 15 | % OPTIONAL 16 | % nboot - # bootstrap samples (default = 1000) 17 | % sigma - gaussian bandwidth (default = median heuristic) 18 | % biased - boolean indicated biased estimator (default = false) 19 | % 20 | % OUTPUTS 21 | % pval - p-value 22 | % stat - maximal mean discrepancy 23 | % boot - bootstrap samples 24 | % 25 | % REFERENCE 26 | % Gretton et al (2012). A kernel two-sample test. 27 | % Journal of Machine Learning Research 13: 723-773 28 | % 29 | % SEE ALSO 30 | % mmd, DepTest2 31 | 32 | % $ Copyright (C) 2014 Brian Lau http://www.subcortex.net/ $ 33 | % The full license and most recent version of the code can be found at: 34 | % https://github.com/brian-lau/highdim 35 | % 36 | % This program is free software: you can redistribute it and/or modify 37 | % it under the terms of the GNU General Public License as published by 38 | % the Free Software Foundation, either version 3 of the License, or 39 | % (at your option) any later version. 40 | % 41 | % This program is distributed in the hope that it will be useful, 42 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 43 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 44 | % GNU General Public License for more details. 
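%
% A minimal usage sketch (added for illustration; sample sizes and the shift
% are arbitrary). Under H0 the permutation p-value is roughly uniform, while a
% mean shift should drive it toward zero:
%
% x = randn(100,3);
% y = randn(120,3);
% pval0 = diff.mmdtest(x,y,'nboot',500);   % same distribution
% pval1 = diff.mmdtest(x,y+1,'nboot',500); % shifted mean, typically small
%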
45 | 46 | function [pval,stat,boot] = mmdtest(x,y,varargin) 47 | 48 | par = inputParser; 49 | par.KeepUnmatched = true; 50 | addRequired(par,'x',@isnumeric); 51 | addRequired(par,'y',@isnumeric); 52 | addParamValue(par,'nboot',1000,@(x) isscalar(x)&&isnumeric(x)); 53 | parse(par,x,y,varargin{:}); 54 | 55 | [m,p] = size(x); 56 | [n,q] = size(y); 57 | if p ~= q 58 | error('x and y must have same dimensionality (# of columns)'); 59 | end 60 | 61 | [stat,K,L,KL,sigma,biased] = diff.mmd(x,y,par.Unmatched); 62 | 63 | nboot = par.Results.nboot; 64 | boot = zeros(nboot,1); 65 | % aggregated kernel matrix 66 | M = [K KL; KL' L]; 67 | for i = 1:nboot 68 | ind = randperm(n+m); 69 | K = M(ind(1:m),ind(1:m)); 70 | L = M(ind(m+1:end),ind(m+1:end)); 71 | KL = M(ind(1:m),ind(m+1:end)); % cross block between the two permuted samples 72 | boot(i) = diff.mmd_(K,L,KL,m,n,biased); 73 | end 74 | 75 | pval = sum(boot>=stat)./nboot; 76 | -------------------------------------------------------------------------------- /+sphere/rptest.m: -------------------------------------------------------------------------------- 1 | % RPTEST Random projection test for spherical uniformity 2 | % 3 | % [pval,stat] = rptest(U,varargin) 4 | % 5 | % INPUTS 6 | % U - [n x p] matrix, n samples with dimensionality p 7 | % the data should already be projected to the unit hypersphere 8 | % 9 | % OPTIONAL (name/value pairs) 10 | % k - # of random projections (default=20); correction - 'fdr' (default) or 'bonferroni'; nmc - # of null samples when dist='empirical' (default=2000); dist - 'empirical' (default) or 'asymp' 11 | % 12 | % OUTPUTS 13 | % pval - p-value 14 | % stat - statistic, projections onto k random p-vectors 15 | % 16 | % REFERENCE 17 | % Cuesta-Albertos, JA et al (2009). On projection-based tests for 18 | % directional and compositional data. Stat Comput 19: 367-380 19 | % Cuesta-Albertos, JA et al (2007). A sharp form of the Cramer-Wold 20 | % theorem. J Theor Probab 20: 201-209 21 | % 22 | % SEE ALSO 23 | % UniSphereTest, rp, rppdf, rpcdf 24 | 25 | % $ Copyright (C) 2014 Brian Lau http://www.subcortex.net/ $ 26 | % The full license and most recent version of the code can be found at: 27 | % https://github.com/brian-lau/highdim 28 | % 29 | % This program is free software: you can redistribute it and/or modify 30 | % it under the terms of the GNU General Public License as published by 31 | % the Free Software Foundation, either version 3 of the License, or 32 | % (at your option) any later version. 33 | % 34 | % This program is distributed in the hope that it will be useful, 35 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 36 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 37 | % GNU General Public License for more details.
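%
% A minimal usage sketch (added for illustration; it assumes only the
% interface documented above and uses the Bonferroni correction, which avoids
% the optional fdr_bh dependency):
%
% U = sphere.spatialSign(randn(200,5));               % uniform on S^4
% pval0 = sphere.rptest(U,'correction','bonferroni'); % typically > 0.05
% V = sphere.spatialSign(sphere.vmfrnd(5,200,4)');    % concentrated vMF sample
% pval1 = sphere.rptest(V,'correction','bonferroni'); % typically small
%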
38 | 39 | function [pval,stat] = rptest(U,varargin) 40 | 41 | import sphere.* 42 | 43 | par = inputParser; 44 | par.KeepUnmatched = true; 45 | addRequired(par,'U',@isnumeric); 46 | addParamValue(par,'correction','fdr',@ischar); 47 | addParamValue(par,'nmc',2000,@isnumeric); 48 | addParamValue(par,'k',20,@isnumeric); 49 | addParamValue(par,'dist','empirical',@ischar); 50 | parse(par,U,varargin{:}); 51 | k = par.Results.k; 52 | 53 | [n,p] = size(U); 54 | stat = rp(U,k); 55 | 56 | switch lower(par.Results.dist) 57 | case 'asymp' 58 | pval = zeros(k,1); 59 | for i = 1:k 60 | test_cdf = [ stat(:,i) , rpcdf(stat(:,i),p)]; 61 | [~,pval(i)] = kstest(stat(:,i),'CDF',test_cdf); 62 | end 63 | otherwise % empirical 64 | Umc = spatialSign(randn(par.Results.nmc,p)); 65 | u0 = spatialSign(randn(1,p)); 66 | Ymc = acos(Umc*u0'); 67 | pval = zeros(k,1); 68 | for i = 1:k 69 | [~,pval(i)] = kstest2(stat(:,i),Ymc); 70 | end 71 | end 72 | 73 | switch lower(par.Results.correction) 74 | case 'bonferroni' 75 | adj_p = pval*k; 76 | case 'fdr' 77 | [~,~,adj_p] = utils.fdr_bh(pval,.05,'pdep'); 78 | otherwise 79 | error('Invalid p-value correction'); 80 | end 81 | pval = min(adj_p); 82 | -------------------------------------------------------------------------------- /+utils/tri2sqind.m: -------------------------------------------------------------------------------- 1 | function [i,j,k] = tri2sqind( m, k ) 2 | %TRI2SQIND subscript and linear indices for upper tri portion of matrix 3 | % 4 | % get indices into a square matrix for a vector representing a the upper 5 | % triangular portion of a matrix such as those returned by pdist. 6 | % 7 | % [i,j,k] = tri2sqind( m, k ) 8 | % If V is a hypothetical vector representing the upper triangular portion 9 | % of a matrix (not including the diagonal) and 10 | % M is the size of a square matrix and 11 | % K is an optional vector of indices into V then tri2sqind returns 12 | % (i,j) the subscripted indices into the equivalent square matrix. 13 | % K is an integer index into the equivalent square matrix 14 | % 15 | % Example 16 | % X = randn(5, 20); 17 | % Y = pdist(X, 'euclidean'); 18 | % [i,j,k] = tri2sqind( 5 ); 19 | % S = squareform(Y); 20 | % isequal( Y(:), S(k) ); 21 | % Z = zeros(5); 22 | % Z(k) = Y; 23 | % 24 | % Copyright 2012 Mike Boedigheimer 25 | % Amgen Inc. 26 | % Department of Computational Biology 27 | % mboedigh@amgen.com 28 | 29 | % Copyright (c) 2013, Michael Boedigheimer; Chris Rorden 30 | % All rights reserved. 31 | % 32 | % Redistribution and use in source and binary forms, with or without 33 | % modification, are permitted provided that the following conditions are 34 | % met: 35 | % 36 | % * Redistributions of source code must retain the above copyright 37 | % notice, this list of conditions and the following disclaimer. 38 | % * Redistributions in binary form must reproduce the above copyright 39 | % notice, this list of conditions and the following disclaimer in 40 | % the documentation and/or other materials provided with the distribution 41 | % 42 | % THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 43 | % AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 44 | % IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 45 | % ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 46 | % LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 47 | % CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 48 | % SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 49 | % INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 50 | % CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 51 | % ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 52 | % POSSIBILITY OF SUCH DAMAGE. 53 | 54 | max_k = m*(m-1)/2; 55 | 56 | if ( nargin < 2 ) 57 | k = (1:max_k)'; 58 | end; 59 | 60 | if any( k > max_k ) 61 | error('linstats:tri2sqind:InvalidArgument', 'ind2subl:Out of range subscript'); 62 | end; 63 | 64 | 65 | i = floor(m+1/2-sqrt(m^2-m+1/4-2.*(k-1))); 66 | j = k - (i-1).*(m-i/2)+i; 67 | k = sub2ind( [m m], i, j ); 68 | %end tri2sqind() -------------------------------------------------------------------------------- /Testing/test_uniSphereTestNull.m: -------------------------------------------------------------------------------- 1 | %% Check the 95th percentiles of the statistics under uniformity 2 | 3 | %% Gine & Bingham 4 | clear all; 5 | n = [10 30 50 100 150]; 6 | p = [10 20 30 40 50 100]; 7 | reps = 200; 8 | 9 | tic; 10 | for i = 1:numel(n) 11 | for j = 1:numel(p) 12 | for k = 1:reps 13 | x = randn(n(i),p(j)); 14 | U = sphere.spatialSign(x); 15 | G(i,j,k) = sphere.gine(U); 16 | [~,B(i,j,k)] = sphere.bingham(U); 17 | end 18 | pctlG(i,j) = prctile(G(i,j,:),95); 19 | pctlB(i,j) = prctile(B(i,j,:),95); 20 | end 21 | toc 22 | end 23 | 24 | % Note that n,p refer to dim,samples in 25 | % Figueiredo & Gomes (2003). Power of Tests of Uniformity Defined on the 26 | % Hypersphere. Communications in Statistics 32: 87-94 27 | pB = ... 28 | [71.249, 243.558, 514.535, NaN, NaN, NaN;... 29 | 71.631, 243.709, 515.788, 885.647, NaN, NaN;... 30 | 72.040, 244.300, 515.409, 886.401, 1356.267, 5214.739;... 31 | NaN, NaN, NaN, 885.969, 1355.913, 5219.373;... 32 | NaN, NaN, NaN, NaN, 1357.249, 5215.061]; 33 | 34 | pG = ... 35 | [0.588, 0.543, 0.528, NaN, NaN, NaN;... 36 | 0.590, 0.544, 0.529, 0.521, NaN, NaN;... 37 | 0.592, 0.544, 0.529, 0.521, 0.516, 0.508;... 38 | NaN, NaN, NaN, 0.521, 0.516, 0.508;... 39 | NaN, NaN, NaN, NaN, 0.517, 0.509]; 40 | 41 | pctlB-pB 42 | pctlG-pG 43 | 44 | %% Rayleigh & Anje 45 | clear all; 46 | n = [10 30 50 70 100 150]; 47 | p = [10 20 30 40 50 100]; 48 | reps = 500; 49 | 50 | tic; 51 | for i = 1:numel(n) 52 | for j = 1:numel(p) 53 | for k = 1:reps 54 | x = randn(n(i),p(j)); 55 | U = sphere.spatialSign(x); 56 | [~,R(i,j,k)] = sphere.rayleigh(U); 57 | A(i,j,k) = sphere.ajne(U); 58 | end 59 | pctlR(i,j) = prctile(R(i,j,:),95); 60 | pctlA(i,j) = prctile(A(i,j,:),95); 61 | end 62 | toc 63 | end 64 | 65 | % Note that n,p refer to samples,dim in 66 | % Figueiredo (2007) Comparison of tests of uniformity defined on the 67 | % hypersphere. Statistics & Probability Letters 77: 329-334 68 | % 69 | pR = ... 70 | [17.763, 30.694, 42.818, 54.723, 66.227, 122.647;... 71 | 18.168, 31.193, 43.373, 55.625, 66.896, 124.296;... 72 | 18.051, 31.305, 43.923, 55.631, 66.609, 124.318;... 73 | 18.045, 31.317, 43.806, 55.820, 67.162, 123.986;... 74 | 18.176, 31.195, 43.753, 55.557, 67.356, 123.091;... 75 | 18.335, 31.511, 43.699, 55.551, 67.681, 124.109]; 76 | 77 | pA = ... 78 | [0.379, 0.337, 0.319, 0.309, 0.302, 0.286;... 79 | 0.387, 0.341, 0.322, 0.313, 0.304, 0.289;... 80 | 0.385, 0.342, 0.325, 0.313, 0.303, 0.289;... 
81 | 0.384, 0.342, 0.324, 0.314, 0.305, 0.288;... 82 | 0.387, 0.341, 0.324, 0.313, 0.306, 0.288;... 83 | 0.390, 0.344, 0.324, 0.313, 0.307, 0.289]; 84 | 85 | pctlR-pR 86 | pctlA-pA -------------------------------------------------------------------------------- /+dep/ranktest.m: -------------------------------------------------------------------------------- 1 | % RANKTEST Rank-based tests high-dimensional independence 2 | % 3 | % [pval,r,rmc] = ranktest(x,varargin) 4 | % 5 | % Given a sample X1,...,Xn from a p-dimensional multivariate distribution, 6 | % test the hypothesis: 7 | % 8 | % H0 : X1,...,Xp are mutually independent 9 | % 10 | % INPUTS 11 | % x - [n x p] matrix, n samples with dimensionality p 12 | % 13 | % OPTIONAL (name/value pairs) 14 | % test - 'spearman' - R1 from Han & Liu (default) 15 | % 'kendall' - R2 from Han & Liu 16 | % empirical - boolean to monte-carlo sample null distribution 17 | % DEFAULT=FALSE, which uses asymptotic distribution 18 | % nmc - number of monte-carlo samples, if empirical=true 19 | % rmc - vector of monte-carlo samples. Since the null distribution is 20 | % distribution-free (does not depend on data other than size), if 21 | % you have already estimated the empirical, you can avoid doing 22 | % it again 23 | % 24 | % OUTPUTS 25 | % pval - p-value 26 | % r - rank statistic 27 | % rmc - monte-carlo samples of empirical null 28 | % 29 | % REFERENCE 30 | % Han & Liu (2014). Distribution-free tests of independence with 31 | % applications to testing more structures. arXiv:1410.4179v1 32 | % 33 | % SEE ALSO 34 | % rank, DepTest1 35 | 36 | % $ Copyright (C) 2017 Brian Lau, brian.lau@upmc.fr $ 37 | % The full license and most recent version of the code can be found at: 38 | % https://github.com/brian-lau/highdim 39 | % 40 | % This program is free software: you can redistribute it and/or modify 41 | % it under the terms of the GNU General Public License as published by 42 | % the Free Software Foundation, either version 3 of the License, or 43 | % (at your option) any later version. 44 | % 45 | % This program is distributed in the hope that it will be useful, 46 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 47 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 48 | % GNU General Public License for more details. 49 | 50 | function [pval,r,rmc] = ranktest(x,varargin) 51 | 52 | par = inputParser; 53 | par.KeepUnmatched = true; 54 | addRequired(par,'x',@isnumeric); 55 | addParamValue(par,'test','spearman',@ischar); 56 | addParamValue(par,'empirical',false,@(x) isnumeric(x) || islogical(x)); 57 | addParamValue(par,'nmc',1000,@(x) isnumeric(x) && isscalar(x)); 58 | addParamValue(par,'rmc',[],@isnumeric); 59 | parse(par,x,varargin{:}); 60 | 61 | [n,p] = size(x); 62 | r = dep.rank(x,par.Results.test); 63 | 64 | if par.Results.empirical 65 | nmc = par.Results.nmc; 66 | rmc = par.Results.rmc; 67 | if isempty(rmc) 68 | rmc = zeros(nmc,1); 69 | else 70 | pval = sum(rmc>=r)/nmc; 71 | return; 72 | end 73 | % Otherwise re-estimate, TODO: check whether this depends on n,p? 
74 | for i = 1:nmc 75 | xmc = randn(n,p); 76 | rmc(i) = dep.rank(xmc,par.Results.test); 77 | end 78 | pval = sum(rmc>=r)/nmc; 79 | else 80 | % Asymptotic, extreme value type 1 cdf 81 | cdf = @(y) exp(-exp(-y/2)/sqrt(8*pi)); 82 | pval = 1 - cdf(r); 83 | rmc = []; 84 | end -------------------------------------------------------------------------------- /+sphere/sumchi2cdf.m: -------------------------------------------------------------------------------- 1 | % SUMCHI2CDF CDF for infinite weighted sums of chi-square 2 | % 3 | % Fxval = sumchi2cdf(xval,p) 4 | % 5 | % INPUTS 6 | % xval 7 | % p 8 | % 9 | % OUTPUTS 10 | % Fxval - CDF value 11 | % 12 | % REFERENCE 13 | % Keilson J et al (1983). Significance points for some tests of uniformity 14 | % on the sphere. J Statist Comput Simul 17: 195-218. 15 | 16 | % $ Copyright (C) 2014 Brian Lau http://www.subcortex.net/ $ 17 | % The full license and most recent version of the code can be found at: 18 | % https://github.com/brian-lau/highdim 19 | % 20 | % This program is free software: you can redistribute it and/or modify 21 | % it under the terms of the GNU General Public License as published by 22 | % the Free Software Foundation, either version 3 of the License, or 23 | % (at your option) any later version. 24 | % 25 | % This program is distributed in the hope that it will be useful, 26 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 27 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 28 | % GNU General Public License for more details. 29 | 30 | function Fxval = sumchi2cdf(xval,p) 31 | 32 | switch p 33 | case 3 34 | % Table V from Keilson et al (1983) 35 | if xval > 5 36 | Fxval = 1; 37 | elseif xval < 0 38 | Fxval = 0; 39 | else 40 | x = 0:.05:5; 41 | Fx = [0 0 0 0 0 0 0 0 ,... 42 | 0.00006,0.00040,0.00170,0.00500,0.01152,0.02228,0.03792,0.05860,... 43 | 0.08408,0.11385,0.14718,0.18331,0.22144,0.26083,0.30083,0.34085,... 44 | 0.38043,0.41917,0.45670,0.49303,0.52776,0.56085,0.59224,0.62191,... 45 | 0.64985,0.67609,0.70067,0.72364,0.74506,0.76501,0.78355,0.80076,... 46 | 0.81671,0.83148,0.84514,0.85777,0.86942,0.88017,0.89008,0.89921,... 47 | 0.90762,0.91535,0.92246,0.92899,0.93500,0.94051,0.94557,0.95021,... 48 | 0.95446,0.95836,0.96194,0.96522,0.96822,0.97096,0.97348,0.97578,... 49 | 0.97788,0.97981,0.98157,0.98318,0.98465,0.98600,0.98723,0.98835,... 50 | 0.98937,0.99031,0.99116,0.99194,0.99266,0.99331,0.99390,0.99444,... 51 | 0.99493,0.99538,0.99580,0.99617,0.99651,0.99682,0.99711,0.99737,... 52 | 0.99760,0.99782,0.99801,0.99819,0.99835,0.99850,0.99863,0.99876,... 53 | 0.99887,0.99897,0.99906,0.99915,0.99923]; 54 | Fxval = interp1(x,Fx,xval,'linear'); 55 | end 56 | otherwise 57 | error('No approximation for p requested'); 58 | end 59 | 60 | % 61 | % alpha = (p-1)/2; 62 | % q = 1:10; 63 | % a2 = (p*(2*q-1))/(8*pi*(2*q+p)) *... 64 | % (gamma(alpha + 0.5)*gamma(q-0.5)) ./... 65 | % (gamma(q+alpha+0.5)).^2; 66 | % 67 | % temp = 0; 68 | % for i = 1:numel(q) 69 | % vp2q = vpq(p,2*q(i)); 70 | % temp = temp + a2(i) * chi2pdf(xval,vp2q); 71 | % end 72 | % 73 | 74 | % 75 | % p = 3 76 | % syms theta 77 | % hp = (1/sqrt(pi)) * (gamma(p/2)/(gamma((p-1)/2)*sqrt(2)))*... 
78 | % (sin(theta).^(p-2)); 79 | % qsym = simplify(int(hp,theta,p)); % Solve integral symbolically 80 | % pretty(qsym) 81 | % 82 | % double(subs(qsym,{theta},{0:.1:pi})) 83 | -------------------------------------------------------------------------------- /+dep/rpdcov.m: -------------------------------------------------------------------------------- 1 | % RPDCOV Randomly projected distance covariance 2 | % 3 | % [d,omega_k] = rpdcov(x,y,k) 4 | % 5 | % Estimate (unbiased) distance covariance using Huang & Huo algorithm, 6 | % which has O(nk (log n + p + q)) complexity and O(max(n,k)) storage 7 | % compared to O(n^2(p + q)) complexity and O(n^2) storage of the naive 8 | % estimator. 9 | % 10 | % The random projection estimator is an unbiased estimator of distance 11 | % covariance (bias-corrected variant). The difference converges to zero 12 | % at a rate no worse than O(1/sqrt(k)), where k is the number of random 13 | % projections. 14 | % 15 | % The direct estimator will perform better when high-dimensional data 16 | % have low-dimensional dependency structure. 17 | % 18 | % INPUTS 19 | % x - [n x p] n samples of dimensionality p 20 | % y - [n x q] n samples of dimensionality q 21 | % 22 | % OPTIONAL 23 | % k - scalar integer, number of random projections, default = 50 24 | % 25 | % OUTPUTS 26 | % d - distance covariance between x,y 27 | % omega_k - distance covariance of k univariate random projections 28 | % 29 | % EXAMPLE 30 | % rng(1234) 31 | % n = 10000; p = 500; q = p; 32 | % x = rand(n,p); 33 | % y = x.^2; 34 | % tic; dep.dcov(x,y,'unbiased',true) % naive (unbiased) estimator 35 | % toc 36 | % tic; dep.rpdcov(x,y) 37 | % toc 38 | % tic; dep.rpdcov(x,y,100) 39 | % toc 40 | % 41 | % REFERENCE 42 | % Huang & Huo (2017). A statistically and numerically efficient 43 | % independence test based on random projections and distance 44 | % covariance. arxiv.org/abs/1701.06054v1 45 | % 46 | % SEE ALSO 47 | % fdcov, dcov 48 | 49 | % $ Copyright (C) 2017 Brian Lau, brian.lau@upmc.fr $ 50 | % The full license and most recent version of the code can be found at: 51 | % https://github.com/brian-lau/highdim 52 | % 53 | % This program is free software: you can redistribute it and/or modify 54 | % it under the terms of the GNU General Public License as published by 55 | % the Free Software Foundation, either version 3 of the License, or 56 | % (at your option) any later version. 57 | % 58 | % This program is distributed in the hope that it will be useful, 59 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 60 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 61 | % GNU General Public License for more details. 
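%
% Note added for clarity (not from the original author): the constants Cp and
% Cq defined below come from the identity, for u uniform on the unit sphere
% in R^p,
%
%   E|u'x| = |x| / Cp,   Cp = sqrt(pi)*gamma((p+1)/2)/gamma(p/2),
%
% which is why the averaged univariate distance covariances are rescaled by
% Cp*Cq. For p = 1, Cp = sqrt(pi)*gamma(1)/gamma(1/2) = 1, and the estimator
% reduces to the univariate fast estimator dep.fdcov.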
62 | 63 | % o parfor 64 | % o data too large to fit in memory 65 | 66 | function [d,omega_k] = rpdcov(x,y,k) 67 | 68 | if nargin < 3 69 | k = 50; 70 | end 71 | 72 | [nx,p] = size(x); 73 | [ny,q] = size(y); 74 | assert(nx == ny,'RPDCOV requires x and y to have the same # of samples'); 75 | 76 | % Normalization constants, avoiding overflow 77 | Cp = sqrt(pi) * exp(gammaln((p+1)/2) - gammaln(p/2)); 78 | Cq = sqrt(pi) * exp(gammaln((q+1)/2) - gammaln(q/2)); 79 | 80 | omega_k = zeros(k,1); 81 | for kk = 1:k 82 | % Project onto random basis on unit hypersphere 83 | ux = x * sphere.spatialSign(randn(1,p))'; 84 | vy = y * sphere.spatialSign(randn(1,q))'; 85 | 86 | % Fast O(n log n) distance covariance 87 | omega_k(kk) = dep.fdcov(ux,vy); 88 | end 89 | 90 | d = mean(Cp*Cq*omega_k); -------------------------------------------------------------------------------- /+diff/covtest.m: -------------------------------------------------------------------------------- 1 | % COVTEST Two-sample covariance matrix test 2 | % 3 | % [pval,stat,Mthresh] = covtest(x,y,varargin) 4 | % 5 | % Given a sample X1,...,Xm from a p-dimensional multivariate distribution, 6 | % and a sample Y1,...,Xn from a q-dimensional multivariate distribution, 7 | % test one of the hypotheses: 8 | % 9 | % H0 : cov(X) = cov(Y) 10 | % 11 | % It is also possible to test the support of cov(x) ~= cov(y), which is 12 | % controlled at family-wise error rate = alpha. 13 | % 14 | % INPUTS 15 | % x - [m x p] m samples of dimensionality p 16 | % y - [n x p] n samples of dimensionality p 17 | % 18 | % OPTIONAL 19 | % alpha - level for test of support cov(x) ~= cov(y) (default = 0.05) 20 | % 21 | % OUTPUTS 22 | % pval - p-value 23 | % stat - statistic 24 | % Mthresh - support cov(x) ~= cov(y), indicating significantly different 25 | % entries at level alpha 26 | % 27 | % REFERENCE 28 | % Cai et al (2013). Two-sample covariance matrix testing and support 29 | % recovery in high-dimensional and sparse settings. Journal of the 30 | % American Statistical Association 108: 265-277 31 | 32 | % $ Copyright (C) 2014 Brian Lau http://www.subcortex.net/ $ 33 | % The full license and most recent version of the code can be found at: 34 | % https://github.com/brian-lau/highdim 35 | % 36 | % This program is free software: you can redistribute it and/or modify 37 | % it under the terms of the GNU General Public License as published by 38 | % the Free Software Foundation, either version 3 of the License, or 39 | % (at your option) any later version. 40 | % 41 | % This program is distributed in the hope that it will be useful, 42 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 43 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 44 | % GNU General Public License for more details. 
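%
% A minimal usage sketch (added for illustration; dimensions and the
% alternative are arbitrary). Equal covariances should give a large p-value,
% and inflating one variance should show up in the recovered support:
%
% x = randn(100,20);
% y = randn(150,20);
% pval0 = diff.covtest(x,y);                             % H0 true
% y(:,1) = 3*y(:,1);                                     % inflate one variance
% [pval1,stat,Mthresh] = diff.covtest(x,y,'alpha',0.05); % Mthresh marks entries
%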
45 | 46 | % TODO 47 | % o small sample (n<30) modification 48 | % x support test 49 | % o row test 50 | 51 | function [pval,stat,Mthresh] = covtest(x,y,varargin) 52 | 53 | par = inputParser; 54 | par.KeepUnmatched = true; 55 | addRequired(par,'x',@isnumeric); 56 | addRequired(par,'y',@isnumeric); 57 | addParamValue(par,'alpha',0.05,@(x) isscalar(x)&&isnumeric(x)); 58 | addParamValue(par,'row',[],@(x) isscalar(x)&&isnumeric(x)); 59 | parse(par,x,y,varargin{:}); 60 | 61 | [m,p] = size(x); 62 | [n,q] = size(y); 63 | 64 | if ne(p,q) 65 | error('Dimensions must match'); 66 | end 67 | 68 | Sx = cov(x,1); 69 | Sy = cov(y,1); 70 | x_theta = normvar(x,p,m,Sx); 71 | y_theta = normvar(y,p,n,Sy); 72 | M = (Sx - Sy).^2 ./ (x_theta/m + y_theta/n); % eq 2 73 | Mn = max(max(triu(M))); 74 | 75 | stat = Mn - 4*log(p) + log(log(p)); 76 | cdf = @(y) exp(-exp(-y/2)/sqrt(8*pi)); 77 | pval = 1 - cdf(stat); 78 | 79 | if nargout > 2 80 | % Support Sx-Sy 81 | Mthresh = M; 82 | q = -log(8*pi) - 2*log(log(1/(1 - par.Results.alpha))); % q_alpha threshold, Cai et al. (2013) 83 | Mthresh = Mthresh >= (4*log(p) - log(log(p)) + q); 84 | Mthresh = utils.putdiag(Mthresh,diag(M) >= 2*log(p)); 85 | end 86 | 87 | function theta = normvar(x,p,n,S) 88 | mu = mean(x); 89 | theta = zeros(p,p); 90 | for i = 1:p 91 | for j = 1:p 92 | for k = 1:n 93 | theta(i,j) = theta(i,j) +... 94 | ((x(k,i) - mu(i))*(x(k,j) - mu(j)) - S(i,j))^2; 95 | end 96 | end 97 | end 98 | theta = theta/n; 99 | -------------------------------------------------------------------------------- /+sphere/signtest.m: -------------------------------------------------------------------------------- 1 | % SIGNTEST Nonparametric test for high-dimensional sphericity 2 | % 3 | % [pval,stat] = signtest(x,varargin) 4 | % 5 | % Tests whether the covariance matrix of a sample X1, ..., Xn from a 6 | % p-dimensional multivariate distribution is proportional to the identity. 7 | % This test is non-parametric, relying only on the spatial sign of the 8 | % data. 9 | % 10 | % INPUTS 11 | % x - [n x p] matrix, n samples with dimensionality p 12 | % 13 | % OPTIONAL (name/value pairs) 14 | % test - 'sign' - standard multivariate sign, biased if p grows 15 | % 'bcs' - corrected sign, p can increase as n^2, (DEFAULT) 16 | % approx - multivariate normal approximation (DEFAULT=true) 17 | % 18 | % OUTPUTS 19 | % pval - p-value 20 | % stat - statistic 21 | % 22 | % REFERENCE 23 | % Zou et al (2014). Multivariate sign-based high-dimensional tests for 24 | % sphericity. Biometrika 101: 229-236 25 | % 26 | % SEE ALSO 27 | % UniSphereTest 28 | 29 | % $ Copyright (C) 2014 Brian Lau http://www.subcortex.net/ $ 30 | % The full license and most recent version of the code can be found at: 31 | % https://github.com/brian-lau/highdim 32 | % 33 | % This program is free software: you can redistribute it and/or modify 34 | % it under the terms of the GNU General Public License as published by 35 | % the Free Software Foundation, either version 3 of the License, or 36 | % (at your option) any later version. 37 | % 38 | % This program is distributed in the hope that it will be useful, 39 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 40 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 41 | % GNU General Public License for more details.
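%
% A minimal usage sketch (added for illustration, using only the interface
% documented above):
%
% x = randn(50,100);                       % spherical data, p > n is fine here
% pval0 = sphere.signtest(x);              % bias-corrected sign test (default)
% y = bsxfun(@times,randn(50,100),linspace(1,3,100)); % unequal column scales
% pval1 = sphere.signtest(y,'test','bcs'); % should tend to be small
%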
42 | 43 | function [pval,stat] = signtest(x,varargin) 44 | 45 | import sphere.* 46 | 47 | par = inputParser; 48 | par.KeepUnmatched = true; 49 | addRequired(par,'x',@isnumeric); 50 | addParamValue(par,'test','bcs',@ischar); 51 | addParamValue(par,'approx',true,@(x) isnumeric(x) || islogical(x)); 52 | parse(par,x,varargin{:}); 53 | 54 | [n,p] = size(x); 55 | theta = utils.spatialMedian(x); 56 | U = spatialSign(bsxfun(@minus,x,theta)); 57 | 58 | % TODO, block process for n large 59 | UtU = U*U'; 60 | UtU(sub2ind([n n],1:n,1:n)) = 0; 61 | UtU = sum(UtU(:).^2); 62 | 63 | switch lower(par.Results.test) 64 | case {'sign','s'} 65 | Q = p/n + (n*(n-1)/n^2) * (p/(n*(n-1))) * UtU - 1; 66 | stat = n*(p+2)*Q/2; 67 | pval = 1 - chi2cdf(stat,(p+2)*(p-1)/2); 68 | case {'bcs','b'} 69 | % Bias-corrected sign test, p = O(n^2) 70 | Q = (p/(n*(n-1))) * UtU - 1; 71 | sigma0 = sqrt( 4*(p-1)/(n*(n-1)*(p+2)) ); 72 | 73 | if par.Results.approx 74 | % Approximation when x is multivariate normal 75 | deltanp = n^(-2) + 2*n^(-3); 76 | else 77 | % General case (Theorem 1, Zou et al.) 78 | R = sqrt(sum(bsxfun(@minus,x,theta).^2,2)); 79 | Rstar = R + U*theta' - sum(theta.^2)./(2*R); 80 | erk2 = erk(Rstar,2,n); 81 | deltanp = (1/n^2) * (2 - 2*erk2 + erk2^2) ... 82 | + (1/n^3) * (8*erk2 - 6*erk2^2 ... 83 | + 2*erk2*erk(Rstar,3,n) - 2*erk(Rstar,3,n)); 84 | end 85 | 86 | stat = (Q - p*deltanp) / sigma0; 87 | pval = 1 - normcdf(stat); 88 | otherwise 89 | error('Unknown test.'); 90 | end 91 | 92 | function y = erk(Rstar,k,n) 93 | d = sum(1./Rstar)^k; 94 | y = n^(k-1) * sum( Rstar.^(-k) ./ d); 95 | -------------------------------------------------------------------------------- /+dim/cpca.m: -------------------------------------------------------------------------------- 1 | % CPCA Common principal component analysis 2 | % 3 | % [Q,D,iter] = cpca(S,n,varargin) 4 | % 5 | % INPUTS 6 | % S - covariance matrices, [n x n x groups] matrix or cell array 7 | % n - sample size for each S_i, vector or cell array 8 | % 9 | % OPTIONAL 10 | % k - number of common components to return (default = all) 11 | % maxit - maximum number of iterations (default = 100) 12 | % tol - stopping criteria (default = 1e-6) 13 | % 14 | % OUTPUTS 15 | % 16 | % REFERENCE 17 | % Trendafilov (2010). Stepwise estimation of common principal 18 | % components. Computational Statistics & Data Analysis 54: 3446-3457 19 | % 20 | % Based on Matlab code provided by Dr. Trendafilov, modified to include 21 | % stopping criterion. 22 | 23 | % $ Copyright (C) 2014 Brian Lau http://www.subcortex.net/ $ 24 | % The full license and most recent version of the code can be found at: 25 | % https://github.com/brian-lau/highdim 26 | % 27 | % This program is free software: you can redistribute it and/or modify 28 | % it under the terms of the GNU General Public License as published by 29 | % the Free Software Foundation, either version 3 of the License, or 30 | % (at your option) any later version. 31 | % 32 | % This program is distributed in the hope that it will be useful, 33 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 34 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 35 | % GNU General Public License for more details. 
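%
% A minimal usage sketch (added for illustration; the two covariance matrices
% share eigenvectors by construction, so the common components in Q should
% match Q0 up to sign and ordering):
%
% p = 5;
% Q0 = orth(randn(p));                  % hypothetical common eigenvectors
% S1 = Q0*diag([5 4 3 2 1])*Q0';
% S2 = Q0*diag([1 2 3 4 5])*Q0';
% [Q,D,iter] = dim.cpca(cat(3,S1,S2),[100 120],'k',3);
%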
36 | 37 | function [Q,D,iter] = cpca(S,n,varargin) 38 | 39 | par = inputParser; 40 | par.KeepUnmatched = true; 41 | addRequired(par,'S',@(x) isnumeric(x)||iscell(x)); 42 | addRequired(par,'n',@(x) isnumeric(x)||iscell(x)); 43 | addParamValue(par,'k',[],@isnumeric); 44 | addParamValue(par,'maxit',100,@(x) isnumeric(x) && isscalar(x)); 45 | addParamValue(par,'tol',1e-6,@isnumeric); 46 | parse(par,S,n,varargin{:}); 47 | 48 | if iscell(S) 49 | S = cat(3,S{:}); 50 | end 51 | 52 | if iscell(n) 53 | n = cat(2,n{:}); 54 | end 55 | 56 | p = size(S,1); 57 | nS = size(S,3); 58 | if nS ~= numel(n) 59 | error('n should indicate the # of samples for each group'); 60 | end 61 | 62 | if isempty(par.Results.k) 63 | k = p; 64 | elseif par.Results.k <= p 65 | k = par.Results.k; 66 | else 67 | error('k must be less than dimensionality'); 68 | end 69 | 70 | nf = n./sum(n); 71 | D = zeros(k,nS); 72 | Q = zeros(p,k); 73 | Qw = eye(p); 74 | s = zeros(p); 75 | for j = 1:nS 76 | s = s + nf(j)*S(:,:,j); 77 | end 78 | 79 | [q0,d0] = eig(s); 80 | if d0(1,1) < d0(p,p) 81 | q0 = q0(:,p:-1:1); 82 | end 83 | 84 | iter = zeros(1,k); 85 | for i = 1:k 86 | q = q0(:,i); 87 | d = zeros(1,nS); 88 | for j = 1:nS 89 | d(j) = q'*S(:,:,j)*q; 90 | end 91 | 92 | crit = 1; 93 | while (iter(i) < par.Results.maxit) && (crit > par.Results.tol) 94 | s = zeros(p); 95 | for j = 1:nS 96 | s = s + n(j)*S(:,:,j)/d(j); 97 | end 98 | 99 | w = s*q; 100 | if i ~= 1 101 | w = Qw*w; 102 | end 103 | q = w/((w'*w)^.5); 104 | 105 | for j = 1:nS 106 | d(j) = q'*S(:,:,j)*q; 107 | end 108 | 109 | if iter(i) > 1 110 | crit = old - norm(d); 111 | end 112 | old = norm(d); 113 | iter(i) = iter(i) + 1; 114 | end 115 | 116 | D(i,:) = d; 117 | Q(:,i) = q; 118 | Qw = Qw - q*q'; 119 | end 120 | -------------------------------------------------------------------------------- /+utils/permMoments.m: -------------------------------------------------------------------------------- 1 | % PERMMOMENTS Exact moments of permutation distribution 2 | % 3 | % [mu,sigma2,skew] = permMoments(A1,A2,approx) 4 | % 5 | % Returns the first three moments of the permutation distribution of 6 | % T = trace(A1*A2). Exact expressions have been obtained by Kazi-Aoual 7 | % et al (1995). The specific formulation used here comes from Bilodeau 8 | % and Guetsop Nangue (2017). 9 | % 10 | % INPUTS 11 | % A1 - [n x n] matrix 12 | % A2 - [n x n] matrix 13 | % 14 | % OPTIONAL 15 | % approx - scalar integer >= 0, positive integers determine rank of 16 | % approximate multiplication A1*A2, default = 0 (exact) 17 | % 18 | % REFERENCE 19 | % Bilodeau & Guetsop Nangue (2017). Approximations to permutation tests 20 | % of independence between two random vectors. 21 | % Computational Statistics & Data Analysis, submitted. 22 | % Kazi-Aoual et al (1995). Refined approximations to permutation tests 23 | % for multivariate inference. Computational Statistics & Data Analysis. 24 | % 20: 643-656 25 | 26 | % $ Copyright (C) 2017 Brian Lau, brian.lau@upmc.fr $ 27 | % The full license and most recent version of the code can be found at: 28 | % https://github.com/brian-lau/highdim 29 | % 30 | % This program is free software: you can redistribute it and/or modify 31 | % it under the terms of the GNU General Public License as published by 32 | % the Free Software Foundation, either version 3 of the License, or 33 | % (at your option) any later version. 
34 | % 35 | % This program is distributed in the hope that it will be useful, 36 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 37 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 38 | % GNU General Public License for more details. 39 | 40 | function [mu,sigma2,skew] = permMoments(A1,A2,approx) 41 | 42 | if nargin < 3 43 | approx = 0; 44 | end 45 | 46 | assert(all(size(A1)==size(A2)),'A1 and A2 must have the same size.'); 47 | [m,n] = size(A1); 48 | assert(m==n,'A1 and A2 must be square.'); 49 | 50 | [T(1),T2(1),S2(1),T3(1),S3(1),U(1),R(1),B(1)] = useful(A1,approx); 51 | [T(2),T2(2),S2(2),T3(2),S3(2),U(2),R(2),B(2)] = useful(A2,approx); 52 | 53 | % First moment 54 | m1 = prod(T)/n + prod(-T)/(n*(n-1)); 55 | 56 | % Second moment 57 | m2 = prod(S2)/n... 58 | + ( prod(T.^2-S2) + 2*prod(T2-S2) + 4*prod(-S2) ) / (n*(n-1))... 59 | + ( 4*prod(2*S2-T2) + 2*prod(2*S2-T.^2) ) / (n*(n-1)*(n-2))... 60 | + prod(2*T2-6*S2+T.^2) / (n*(n-1)*(n-2)*(n-3)); 61 | 62 | % Third moment 63 | SP1 = prod(S3)/n; 64 | SP2 = ( 4*prod(-S3+U) + 3*prod(T.*S2-S3) + 6*prod(-S3)... 65 | + 12*prod(-S3+R) + 6*prod(-S3+B) ) / (n*(n-1)); 66 | SP3 = ( 3*prod(-T.*S2+2*S3) + prod(T.^3-3*T.*S2+2*S3)... 67 | + 12*prod(-T.*S2+2*S3-B) + 12*prod(2*S3-R) + 24*prod(2*S3-R-B)... 68 | + 6*prod(T.*(T2-S2)+2*S3-2*R) + 24*prod(2*S3-U-R)... 69 | + 8*prod(T3+2*S3-3*R) ) / (n*(n-1)*(n-2)); 70 | SP4 = ( 12*prod(T.*S2-6*S3+2*R+2*B) + 6*prod(T.*(-T2+S2)-6*S3+2*U+4*R)... 71 | + 3*prod(-T.^3+5*T.*S2-6*S3+2*B) + 12*prod(T.*(-T2+2*S2)-6*S3+3*R+2*B)... 72 | + 8*prod(-6*S3+2*U+3*R) + 24*prod(-T3-6*S3+U+5*R+B) ) / (n*(n-1)*(n-2)*(n-3)); 73 | SP5 = ( 3*prod(T.^3+2*T.*(T2-5*S2) + 24*S3-8*R-8*B)... 74 | + 12*prod(T.*(T2-2*S2) + 2*T3+24*S3-4*U-16*R-4*B) ) / (n*(n-1)*(n-2)*(n-3)*(n-4)); 75 | SP6 = prod(-T.^3-6*T.*(T2-3*S2)-8*T3-120*S3+16*U+72*R+24*B)... 76 | / (n*(n-1)*(n-2)*(n-3)*(n-4)*(n-5)); 77 | m3 = SP1 + SP2 + SP3 + SP4 + SP5 + SP6; 78 | 79 | mu = m1; 80 | sigma2 = m2 - m1^2; 81 | skew = (m3 - 3*sigma2*m1 - m1^3) / (sigma2^(3/2)); 82 | 83 | function [T,T2,S2,T3,S3,U,R,B] = useful(A,approx) 84 | T = trace(A); 85 | if approx 86 | AA = utils.approxmtimes(A,A,approx); 87 | else 88 | AA = A*A; 89 | end 90 | T2 = sum(sum(A.^2)); 91 | S2 = sum(diag(A.^2)); 92 | T3 = sum(sum(AA.*A)); 93 | S3 = sum(diag(A).^3); 94 | U = sum(sum(A.^2.*A)); 95 | R = diag(A)'*diag(AA); 96 | B = diag(A)'*A*diag(A); 97 | -------------------------------------------------------------------------------- /Testing/test_covtest.m: -------------------------------------------------------------------------------- 1 | % Check empirical size and power against table 1 from 2 | % Cai et al (2013). Two-sample covariance matrix testing and support 3 | % recovery in high-dimensional and sparse settings. 
Journal of the 4 | % American Statistical Association 108: 265-277 5 | 6 | clear all; 7 | p = 200; 8 | n = 60; 9 | model = 3; 10 | 11 | for i = 1:500 12 | if model == 3 13 | sigma = zeros(p); 14 | for ii = 1:p 15 | for jj = 1:p 16 | if ii < jj 17 | if rand < 0.05 18 | sigma(ii,jj) = 0.5; 19 | end 20 | end 21 | end 22 | end 23 | sigma = sigma + sigma'; 24 | sigma = utils.putdiag(sigma,1); 25 | [~,ds] = eig(sigma); 26 | d = abs(min(diag(ds))) + 0.05; 27 | D = diag(unifrnd(0.5,2.5,p,1)); 28 | S = sqrt(D)*((sigma+d*eye(p))/(1+d))*sqrt(D); 29 | elseif model == 2 30 | % Model 2 31 | for ii = 1:p 32 | for jj = 1:p 33 | sigma(ii,jj) = 0.5^abs(ii-jj); 34 | end 35 | end 36 | D = diag(unifrnd(0.5,2.5,p,1)); 37 | S = D^.5*sigma*D^.5; 38 | elseif model == 4 39 | % Model 4 40 | for ii = 1:p 41 | for jj = 1:p 42 | delta(ii,jj) = (-1)^(ii+jj)*0.4^(abs(ii-jj)^(1/10)); 43 | end 44 | end 45 | O = diag(unifrnd(1,5,p,1)); 46 | S = O*delta*O; 47 | end 48 | U = zeros(p,p); 49 | [~,~,k] = utils.tri2sqind(p); 50 | r = randperm(numel(k)); 51 | U(k(r(1:4))) = unifrnd(0,4,4,1)*max(diag(S)); 52 | U = U + U'; 53 | [~,da] = eig(S); 54 | [~,db] = eig(S+U); 55 | d = abs(min([diag(da);diag(db)])) + 0.05; 56 | 57 | S1 = S + d*eye(p); 58 | S2 = S + U + d*(eye(p)); 59 | 60 | x = mvnrnd(zeros(1,p),S1,n); 61 | y = mvnrnd(zeros(1,p),S2,n); 62 | [pval(i),stat(i)] = diff.covtest(x,y); 63 | end 64 | 65 | %% Support recovery 66 | % Not quite matching yet. I think this is due to a problem generating exactly 67 | % the same covariance matrices as Cai et al. The off diagonal terms do not fall 68 | % into the same range (pg 272 of paper). 69 | clear all; 70 | p = 50; 71 | n = 100; 72 | model = 4; 73 | 74 | if model == 3 75 | sigma = zeros(p); 76 | for ii = 1:p 77 | for jj = 1:p 78 | if ii < jj 79 | if rand < 0.05 80 | sigma(ii,jj) = 0.5; 81 | end 82 | end 83 | end 84 | end 85 | sigma = sigma + sigma'; 86 | sigma = utils.putdiag(sigma,1); 87 | [~,ds] = eig(sigma); 88 | d = abs(min(diag(ds))) + 0.05; 89 | D = eye(p); 90 | S = D^.5*((sigma+d*eye(p))/(1+d))*D^.5; 91 | elseif model == 2 92 | % Model 2 93 | for ii = 1:p 94 | for jj = 1:p 95 | sigma(ii,jj) = 0.5^abs(ii-jj); 96 | end 97 | end 98 | D = eye(p); 99 | S = D^.5*sigma*D^.5; 100 | elseif model == 4 101 | % Model 4 102 | for ii = 1:p 103 | for jj = 1:p 104 | delta(ii,jj) = (-1)^(ii+jj)*0.4^(abs(ii-jj)^(1/10)); 105 | end 106 | end 107 | O = eye(p); 108 | S = O*delta*O; 109 | end 110 | U = zeros(p,p); 111 | [~,~,k] = utils.tri2sqind(p); 112 | r = randperm(numel(k)); 113 | U(k(r(1:25))) = 2; 114 | U = U + U'; 115 | [~,da] = eig(S); 116 | [~,db] = eig(S+U); 117 | d = abs(min([diag(da);diag(db)])) + 0.05; 118 | 119 | S1 = (S + d*eye(p))/(1+d); 120 | S2 = (S + U + d*(eye(p)))/(1+d); 121 | sdiff = S2-S1; 122 | min(sdiff(sdiff>0)) 123 | sd = (S2-S1)>0; 124 | for i = 1:100 125 | x = mvnrnd(zeros(1,p),S1,n); 126 | y = mvnrnd(zeros(1,p),S2,n); 127 | [pval(i),stat(i),Mthresh] = diff.covtest(x,y); 128 | temp = Mthresh & sd; 129 | s(i) = sum(temp(:))/sqrt(sum(Mthresh(:))*sum(sd(:))); 130 | end 131 | 132 | % Check aymptotic distribution 133 | % figure; 134 | % dx = 0.1; xx = -5:dx:25; 135 | % n = histc(stat,xx); 136 | % hold on 137 | % plot(xx,cumsum(n)./sum(n)); 138 | % plot(xx,exp((-1/sqrt(8*pi))*exp(-xx/2)),'r') 139 | -------------------------------------------------------------------------------- /+sphere/vmfrnd.m: -------------------------------------------------------------------------------- 1 | function [ X ] = vmfrnd(m, n, kappa, mu) 2 | % RANDVONMISESFISHERM Random number generation from 
von Mises Fisher 3 | % distribution. 4 | % X = randvonMisesFisherm(m, n, kappa) returns n samples of random unit 5 | % directions in m dimensional space, with concentration parameter kappa, 6 | % and the direction parameter mu = e_m 7 | % X = randvonMisesFisherm(m, n, kappa, mu) with direction parameter mu 8 | % (m-dimensional column unit vector) 9 | % 10 | % Sungkyu Jung, Feb 3, 2010. 11 | 12 | if nargin < 3, help randvonMisesFisher3, return, end 13 | if nargin == 3, muflag = false; 14 | else muflag = true; 15 | end 16 | 17 | if m < 2; 18 | disp('Message from randvonMisesFisherm.m: dimension m must be > 2'); 19 | disp('Message from randvonMisesFisherm.m: Set m to be 2'); 20 | m = 2; 21 | end 22 | 23 | if kappa < 0; 24 | disp('Message from randvonMisesFisherm.m: kappa must be >= 0'); 25 | disp('Message from randvonMisesFisherm.m: Set kappa to be 0'); 26 | kappa = 0; 27 | end 28 | 29 | % 30 | % the following algorithm is following the modified Ulrich's algorithm 31 | % discussed by Andrew T.A. Wood in "SIMULATION OF THE VON MISES FISHER 32 | % DISTRIBUTION", COMMUN. STATIST 23(1), 1994. 33 | 34 | % step 0 : initialize 35 | b = (-2*kappa + sqrt(4*kappa^2 + (m-1)^2))/(m-1); 36 | x0 = (1-b)/(1+b); 37 | c = kappa*x0 + (m-1)*log(1-x0^2); 38 | 39 | % step 1 & step 2 40 | nnow = n; w = []; 41 | %cnt = 0; 42 | while(true) 43 | ntrial = max(round(nnow*1.2),nnow+10) ; 44 | Z = betarnd((m-1)/2,(m-1)/2,ntrial,1); 45 | U = rand(ntrial,1); 46 | W = (1-(1+b)*Z)./(1-(1-b)*Z); 47 | 48 | indicator = kappa*W + (m-1)*log(1-x0*W) - c >= log(U); 49 | if sum(indicator) >= nnow 50 | w1 = W(indicator); 51 | w = [w ;w1(1:nnow)]; 52 | break; 53 | else 54 | w = [w ; W(indicator)]; 55 | nnow = nnow-sum(indicator); 56 | %cnt = cnt+1;disp(['retrial' num2str(cnt) '.' num2str(sum(indicator))]); 57 | end 58 | end 59 | 60 | % step 3 61 | V = UNIFORMdirections(m-1,n); 62 | X = [repmat(sqrt(1-w'.^2),m-1,1).*V ;w']; 63 | 64 | if muflag 65 | mu = mu / norm(mu); 66 | X = rotMat(mu)'*X; 67 | end 68 | end 69 | 70 | 71 | function V = UNIFORMdirections(m,n) 72 | % generate n uniformly distributed m dim'l random directions 73 | % Using the logic: "directions of Normal distribution are uniform on sphere" 74 | 75 | V = zeros(m,n); 76 | nr = randn(m,n); %Normal random 77 | for i=1:n 78 | while 1 79 | ni=nr(:,i)'*nr(:,i); % length of ith vector 80 | % exclude too small values to avoid numerical discretization 81 | if ni<1e-10 82 | % so repeat random generation 83 | nr(:,i)=randn(m,1); 84 | else 85 | V(:,i)=nr(:,i)/sqrt(ni); 86 | break; 87 | end 88 | end 89 | end 90 | 91 | end 92 | 93 | function rot = rotMat(b,a,alpha) 94 | % ROTMAT returns a rotation matrix that rotates unit vector b to a 95 | % 96 | % rot = rotMat(b) returns a d x d rotation matrix that rotate 97 | % unit vector b to the north pole (0,0,...,0,1) 98 | % 99 | % rot = rotMat(b,a ) returns a d x d rotation matrix that rotate 100 | % unit vector b to a 101 | % 102 | % rot = rotMat(b,a,alpha) returns a d x d rotation matrix that rotate 103 | % unit vector b towards a by alpha (in radian) 104 | % 105 | % See also . 
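%
% Example (illustrative only, not from the original file):
%   b = [1 1 1]'/sqrt(3);     % unit vector in R^3
%   rot = rotMat(b);          % rot*b is (numerically) the north pole [0;0;1]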
106 | 107 | % Last updated Nov 7, 2009 108 | % Sungkyu Jung 109 | 110 | 111 | [s1 s2]=size(b); 112 | d = max(s1,s2); 113 | b= b/norm(b); 114 | if min(s1,s2) ~= 1 || nargin==0 , help rotMat, return, end 115 | 116 | if s1<=s2; b = b'; end 117 | 118 | if nargin == 1; 119 | a = [zeros(d-1,1); 1]; 120 | alpha = acos(a'*b); 121 | end 122 | 123 | if nargin == 2; 124 | alpha = acos(a'*b); 125 | end 126 | if abs(a'*b - 1) < 1e-15; rot = eye(d); return, end 127 | if abs(a'*b + 1) < 1e-15; rot = -eye(d); return, end 128 | 129 | c = b - a * (a'*b); c = c / norm(c); 130 | A = a*c' - c*a' ; 131 | 132 | rot = eye(d) + sin(alpha)*A + (cos(alpha) - 1)*(a*a' +c*c'); 133 | end 134 | 135 | -------------------------------------------------------------------------------- /+sphere/jsn.m: -------------------------------------------------------------------------------- 1 | % JSN John, Sugiura, Nagao test of high-deminsional sphericity 2 | % 3 | % [pval,stat] = jsn(x,varargin) 4 | % 5 | % Given a sample X1,...,Xn from a p-dimensional multivariate distribution, 6 | % test the hypothesis: 7 | % 8 | % H0 : Covariance matrix of sample is proportional to the identity 9 | % 10 | % This test is the locally most powerful invariant test for sphericity, 11 | % is n-consistent, and remains valid even when n and p grow together 12 | % (method='john' or 'nagao'). Moreover, the n,p-consistent variant 13 | % (method = 'wang') only requires the existence of fourth moments. 14 | % 15 | % INPUTS 16 | % x - [n x p] matrix, n samples with dimensionality p 17 | % 18 | % OPTIONAL (name/value pairs) 19 | % test - 'john' - fixed p, n goes to infinity (DEFAULT) 20 | % 'nagao' - Box-Bartlett like refinements to asymptotic dist 21 | % 'wang' - p,n -> inf, p/n -> y>0, universal 22 | % 23 | % OUTPUTS 24 | % pval - p-value 25 | % stat - statistic 26 | % 27 | % REFERENCE 28 | % Ledoit & Wolf (2002). Some hypothesis tests for the covariance matrix 29 | % when the dimension is large compared to the sample size. Annals of 30 | % Statistics 30: 1081-1102 31 | % Wang, Q and Yao J (2013). On the sphericity test with large-dimensional 32 | % observations. Electronic Journal of Statistics 7: 2164-2192 33 | % 34 | % SEE ALSO 35 | % DepTest1 36 | 37 | % $ Copyright (C) 2014 Brian Lau http://www.subcortex.net/ $ 38 | % The full license and most recent version of the code can be found at: 39 | % https://github.com/brian-lau/highdim 40 | % 41 | % This program is free software: you can redistribute it and/or modify 42 | % it under the terms of the GNU General Public License as published by 43 | % the Free Software Foundation, either version 3 of the License, or 44 | % (at your option) any later version. 45 | % 46 | % This program is distributed in the hope that it will be useful, 47 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 48 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 49 | % GNU General Public License for more details. 
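%
% EXAMPLE (illustrative sketch, not part of the original help; sample size
% and dimension are arbitrary)
%   x = randn(64,128);                          % n = 64 samples, p = 128
%   [pval,stat] = sphere.jsn(x);                % John's test (default)
%   [pval,stat] = sphere.jsn(x,'test','wang');  % n,p-consistent variant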
50 | 51 | function [pval,stat] = jsn(x,varargin) 52 | 53 | par = inputParser; 54 | par.KeepUnmatched = true; 55 | addRequired(par,'x',@isnumeric); 56 | addParamValue(par,'test','john',@ischar); 57 | parse(par,x,varargin{:}); 58 | 59 | [n,p] = size(x); 60 | 61 | % Ledoit & Wolf (2002) 62 | S = cov(x,0); 63 | U = (1/p)*trace((S/((1/p)*trace(S)) - eye(p))^2); 64 | T2 = (n-1)*p/2*U; 65 | 66 | switch lower(par.Results.test) 67 | case {'john','j'} 68 | f = 0.5*p*(p+1) - 1; 69 | pval = 1 - chi2cdf(T2,f); 70 | stat = T2; 71 | case {'nagao','n'} 72 | f = 0.5*p*(p+1) - 1; 73 | % From Nagao (1973) theorem 5.1 74 | ap = (1/12)*(p^3+3*p^2-8*p-12-200/p); 75 | bp = (1/8)*(-2*p^3-5*p^2+7*p+12+420/p); 76 | cp = (1/4)*(p^3+2*p^2-p-2-216/p); 77 | dp = (1/24)*(-2*p^3-3*p^2+p+436/p); 78 | 79 | Pf = chi2cdf(T2,f); 80 | Pf2 = chi2cdf(T2,f+2); 81 | Pf4 = chi2cdf(T2,f+4); 82 | Pf6 = chi2cdf(T2,f+6); 83 | P = Pf + (1/n)*(ap*Pf6 + bp*Pf4 + cp*Pf2 + dp*Pf); 84 | % Truncate negative p-values 85 | pval = max(0,1 - P); 86 | stat = T2; 87 | case {'wang','w'} 88 | % Wang & Yao (2013), theorem 2.2 89 | N = n-1; 90 | if all(isreal(x)) 91 | k = 2; 92 | else 93 | k = 1; 94 | end 95 | b = (1/(N*p)) * sum(abs(x(:)).^4) - k - 1; 96 | stat = N*U-p; 97 | pval = 1 - normcdf(stat,k+b-1,sqrt(2*k)); 98 | otherwise 99 | error('Unknown method'); 100 | end 101 | 102 | %% Various equivalent definitions of T2 103 | % % John (1972) 104 | % U = (trace(S^2)) / (trace(S))^2; 105 | % T = (p*U-1)/(p-1); 106 | % T2 = (0.5*n*p)*(p-1)*T; 107 | % % Wang & Yao (2013) 108 | % [~,D] = eig(S); 109 | % l = diag(D); 110 | % lbar = mean(l); 111 | % T2 = ((n*p)/2) * (sum((l-lbar).^2)/p) / lbar^2; 112 | % % Nagao (1973) 3.6 113 | % T2 = ((p^2*n)/2) * trace((S./trace(S) - eye(p)./p)^2); 114 | -------------------------------------------------------------------------------- /+diff/kstest2d.m: -------------------------------------------------------------------------------- 1 | % kstest2d Two-dimensional, 2-sample Kolmorogov-Smirnov test 2 | % 3 | % [p,D] = kstest2d(s1,s2); 4 | % 5 | % Compare two, 2-dimensional distributions using Fasano & Franceschini's 6 | % generalization of the KS-test. 7 | % 8 | % The analytic distribution of the statistic is unknown, and p-values 9 | % are estimated using an approximation (Press et al., 1992) to FF's Monte 10 | % Carlo simulations. 11 | % 12 | % INPUTS 13 | % s1 - [n1 x 2] matrix 14 | % s2 - [n2 x 2] matrix 15 | % 16 | % OUTPUTS 17 | % p - approximate p-value 18 | % D - K-S statistic 19 | % 20 | % REFERENCE 21 | % Fasano, G, Franceschini, A (1987) A multidimensional version of the 22 | % Kolmorogov-Smirnov test. Mon Not R astr Soc 225: 155-170 23 | % Press et al (1992). Numerical Recipes in C, section 14.7 24 | % 25 | % SEE ALSO 26 | % minentest, hotell2, DepTest2 27 | 28 | % $ Copyright (C) 2014 Brian Lau http://www.subcortex.net/ $ 29 | % The full license and most recent version of the code can be found on GitHub: 30 | % https://github.com/brian-lau/highdim 31 | % 32 | % This program is free software: you can redistribute it and/or modify 33 | % it under the terms of the GNU General Public License as published by 34 | % the Free Software Foundation, either version 3 of the License, or 35 | % (at your option) any later version. 36 | % 37 | % This program is distributed in the hope that it will be useful, 38 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 39 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 40 | % GNU General Public License for more details. 
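%
% EXAMPLE (illustrative sketch, not part of the original help; the mean
% shift of 0.5 in the first coordinate is arbitrary)
%   s1 = randn(100,2);                          % sample 1, 100 points in 2-D
%   s2 = bsxfun(@plus,randn(150,2),[0.5 0]);    % sample 2, shifted mean
%   [p,D] = diff.kstest2d(s1,s2);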
41 | % 42 | % REVISION HISTORY: 43 | % brian 03.14.06 written 44 | % brian 08.23.11 added flag to assign point to quadrant that maximizes D 45 | % http://www.nr.com/forum/showthread.php?t=576 46 | 47 | function [p,D] = kstest2d(s1,s2) 48 | 49 | assign_point = false; % Set true to assign center point to maximizing quadrant 50 | % Leave this false if you want FF's original procedure 51 | 52 | [n1,m1] = size(s1); 53 | [n2,m2] = size(s2); 54 | 55 | if ~all([m1,m2]==2) 56 | error('# of columns in X and Y must equal 2'); 57 | end 58 | 59 | D = zeros(n1+n2,4); 60 | count = 0; 61 | for i = 1:n1 62 | count = count + 1; 63 | [a1,b1,c1,d1] = quadcnt(s1(i,1),s1(i,2),s1,n1-1); 64 | [a2,b2,c2,d2] = quadcnt(s1(i,1),s1(i,2),s2,n2); 65 | 66 | temp = abs([a1-a2 , b1-b2 , c1-c2 , d1-d2]); 67 | if assign_point 68 | % Assign point to quadrant where it maximizes difference 69 | ind = find(max(temp)); 70 | if length(ind) >= 1 71 | ind = ind(1); % take first maximum 72 | temp(ind) = temp(ind) + 1/length(s1); 73 | end 74 | end 75 | D(count,:) = temp; 76 | end 77 | for i = 1:n2 78 | count = count + 1; 79 | [a1,b1,c1,d1] = quadcnt(s2(i,1),s2(i,2),s1,n1); 80 | [a2,b2,c2,d2] = quadcnt(s2(i,1),s2(i,2),s2,n2-1); 81 | 82 | temp = abs([a1-a2 , b1-b2 , c1-c2 , d1-d2]); 83 | if assign_point 84 | % Assign point to quadrant where it maximizes difference 85 | ind = find(max(temp)); 86 | if length(ind) >= 1 87 | ind = ind(1); % take first maximum 88 | temp(ind) = temp(ind) + 1/length(s2); 89 | end 90 | end 91 | D(count,:) = temp; 92 | end 93 | 94 | D = max(max(D)); 95 | 96 | % Average correlation coefficients 97 | r1 = corrcoef(s1); r1 = r1(1,2); 98 | r2 = corrcoef(s2); r2 = r2(1,2); 99 | rr = 0.5*(r1*r1 + r2*r2); 100 | 101 | p = probks(n1,n2,D,rr); 102 | 103 | %----- Count fractions of points in s in quadrants defined around point (x,y). 
104 | % s is a nx2 matrix 105 | % 106 | % a|b 107 | %----- 108 | % c|d 109 | % 110 | % Currently, the point x,y is not counted in any fraction 111 | function [a,b,c,d] = quadcnt(x,y,s,d) 112 | 113 | slx = s(:,1)<x; 114 | sgx = s(:,1)>x; 115 | sly = s(:,2)<y; 116 | sgy = s(:,2)>y; 117 | 118 | inda = slx & sgy; 119 | indb = sgx & sgy; 120 | indc = slx & sly; 121 | indd = sgx & sly; 122 | 123 | a = sum(inda)/d; 124 | b = sum(indb)/d; 125 | c = sum(indc)/d; 126 | d = sum(indd)/d; 127 | 128 | %----- Asymptotic Q-function to approximate the 2-sided P-value 129 | function p = probks(n1,n2,D,rr) 130 | 131 | % Numerical Recipes in C, section 14.7 132 | N = (n1*n2)/(n1+n2); 133 | lambda = (sqrt(N)*D) / (1 + sqrt(1 - rr)*(.25 - .75/sqrt(N))); 134 | 135 | j = (1:101)'; 136 | p = 2 * sum((-1).^(j-1).*exp(-2*lambda*lambda*j.^2)); 137 | p = min(max(p,0),1); 138 | 139 | 140 | -------------------------------------------------------------------------------- /+dep/dcov.m: -------------------------------------------------------------------------------- 1 | % DCOV Distance covariance 2 | % 3 | % [d,dvx,dvy,A,B] = dcov(x,y,varargin) 4 | % 5 | % INPUTS 6 | % x - [n x p] n samples of dimensionality p 7 | % y - [n x q] n samples of dimensionality q 8 | % 9 | % OPTIONAL (as name/value pairs, order irrelevant) 10 | % unbiased - true indicates bias-corrected estimate (default=false) 11 | % index - scalar in (0,2], exponent on Euclidean distance, default = 1 12 | % dist - true indicates x & y are distance matrices (default=false) 13 | % doublecenter - true indicates x & y are double-centered distance 14 | % matrices (default=false) 15 | % 16 | % OUTPUTS 17 | % d - distance covariance between x & y 18 | % dvx - x sample distance variance 19 | % dvy - y sample distance variance 20 | % A - double-centered or U-centered distance matrix for x 21 | % B - double-centered or U-centered distance matrix for y 22 | % 23 | % EXAMPLE 24 | % rng(1234) 25 | % n = 1000; p = 50; q = p; 26 | % x = rand(n,p); 27 | % y = x.^2; 28 | % d = dep.dcov(x,y) 29 | % 30 | % % Equivalence between distance covariance (squared) & HSIC 31 | % h = dep.hsic(x,y,'kernel','brownian'); 32 | % [4*h d^2] 33 | % 34 | % REFERENCE 35 | % Szekely et al (2007). Measuring and testing independence by correlation 36 | % of distances. Ann Statist 35: 2769-2794 37 | % Szekely & Rizzo (2013). The distance correlation t-test of independence 38 | % in high dimension. J Multiv Analysis 117: 193-213 39 | % 40 | % SEE ALSO 41 | % dcovtest, dcorr, dcorrtest, rpdcov, fdcov 42 | 43 | % $ Copyright (C) 2017 Brian Lau, brian.lau@upmc.fr $ 44 | % The full license and most recent version of the code can be found at: 45 | % https://github.com/brian-lau/highdim 46 | % 47 | % This program is free software: you can redistribute it and/or modify 48 | % it under the terms of the GNU General Public License as published by 49 | % the Free Software Foundation, either version 3 of the License, or 50 | % (at your option) any later version. 51 | % 52 | % This program is distributed in the hope that it will be useful, 53 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 54 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 55 | % GNU General Public License for more details.
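%
% (Additional sketch, not part of the original help: the 2nd and 3rd outputs
%  are the distance variances, which give the distance correlation directly;
%  x and y as in the EXAMPLE above)
%   [d,dvx,dvy] = dep.dcov(x,y,'unbiased',true);
%   r = d/sqrt(dvx*dvy);                        % cf. dep.dcorr, dep.dcorrtest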
56 | 57 | function [d,dvx,dvy,A,B] = dcov(x,y,varargin) 58 | 59 | par = inputParser; 60 | par.KeepUnmatched = true; 61 | par.PartialMatching = false; 62 | addRequired(par,'x',@isnumeric); 63 | addRequired(par,'y',@isnumeric); 64 | addParamValue(par,'approx','none',@ischar); 65 | addParamValue(par,'unbiased',false,@isscalar); 66 | addParamValue(par,'index',1,@(x) isscalar(x) && (x>0) && (x<=2)); 67 | addParamValue(par,'dist',false,@isscalar); 68 | addParamValue(par,'doublecenter',false,@isscalar); 69 | parse(par,x,y,varargin{:}); 70 | 71 | [n,~] = size(x); 72 | assert(n == size(y,1),'DCOV requires x and y to have the same # of samples'); 73 | 74 | if par.Results.doublecenter 75 | % Inputs are already double-centered distance matrices 76 | A = x; 77 | B = y; 78 | else 79 | if par.Results.dist 80 | % Inputs are euclidean distance matrices 81 | a = x; 82 | b = y; 83 | elseif strcmp(par.Results.approx,'nystrom') 84 | % Looks like A&B are scaled versions of K&L 85 | % utils.dcenter(L)*2+B 86 | [h,K,L] = dep.hsic(x,y,'approx','nys','kernel','brownian',par.Unmatched); 87 | d = sqrt(4*h); 88 | if nargout > 1 89 | A = -2*utils.dcenter(K*K'); 90 | B = -2*utils.dcenter(L*L'); 91 | dvx = sqrt(sum(sum(A.*A))/n^2); 92 | dvy = sqrt(sum(sum(B.*B))/n^2); 93 | end 94 | return; 95 | % elseif any(strcmp(par.Results.approx,{'rp' 'randomproj'})) 96 | % 97 | else 98 | % Distance matrices 99 | a = sqrt(utils.sqdist(x,x)); 100 | b = sqrt(utils.sqdist(y,y)); 101 | end 102 | 103 | if par.Results.index ~= 1 104 | a = a.^par.Results.index; 105 | b = b.^par.Results.index; 106 | end 107 | end 108 | 109 | if par.Results.unbiased 110 | A = utils.ucenter(a); 111 | B = utils.ucenter(b); 112 | 113 | d = sum(sum(A.*B))/(n*(n-3)); 114 | if nargout > 1 115 | dvx = sum(sum(A.*A))/(n*(n-3)); 116 | dvy = sum(sum(B.*B))/(n*(n-3)); 117 | end 118 | else 119 | A = utils.dcenter(a); 120 | B = utils.dcenter(b); 121 | 122 | d = sqrt(sum(sum(A.*B))/n^2); 123 | if nargout > 1 124 | dvx = sqrt(sum(sum(A.*A))/n^2); 125 | dvy = sqrt(sum(sum(B.*B))/n^2); 126 | end 127 | end -------------------------------------------------------------------------------- /+utils/approxmtimes.m: -------------------------------------------------------------------------------- 1 | % APPROXMTIMES Approximate matrix multiplication 2 | % 3 | % AB = approxmtimes(A,B,c,method,uni) 4 | % 5 | % Given matrices A [m x n] and B [n x p], approximates the product A*B 6 | % with a sum of rank-one matrices by selecting c columns (rows) of A (B) 7 | % 8 | % A*B \approx \sum_{c in C} A(:,c)*B(c,:) 9 | % 10 | % Two algorithms are available, one using randomized selection (Drineas 11 | % et al) and the other using greedy deterministic selection (Belabbas & 12 | % Wolfe). 
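%
% (Illustrative note, not from the original help: e.g. with c = 2 and
%  selected indices j1,j2, the approximation is a sum of two rank-one terms,
%      AB = w1*A(:,j1)*B(j1,:) + w2*A(:,j2)*B(j2,:),
%  where the weight/rescaling of each term depends on the selection scheme
%  described below.)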
13 | % 14 | % Complexity: 15 | % sampling - O(c(m+n+p)) 16 | % greedy - O(m(n+c^2) + c^3) 17 | % 18 | % INPUTS 19 | % A - [m x n] matrix 20 | % B - [n x p] matrix 21 | % c - scalar < n, approximant rank 22 | % 23 | % OPTIONAL 24 | % method - string indicating approximation algorithm (default = 'sampling') 25 | % 'sampling' - monte-carlo column-row selections using either 26 | % uniform probabilities or probabilities that 27 | % minimize expected normwise absolute error 28 | % 'greedy' - deterministic approximation to optimal subset 29 | % uni - boolean indicating uniform sampling (default = false) 30 | % only applies for method = 'random' 31 | % 32 | % OUTPUTS 33 | % AB - approximation of A*B 34 | % 35 | % EXAMPLE 36 | % import utils.* 37 | % rng(1); 38 | % m = 3000; 39 | % n = m; 40 | % A = [randn(m/2,n) ; rand(m/2,n)*20]; 41 | % B = [rand(m/2,n)*20 ; randn(m/2,n)]; 42 | % 43 | % tic; AB = A*B; toc 44 | % tic; AB1 = approxmtimes(A,B,25); toc 45 | % tic; AB2 = approxmtimes(A,B,25,'greedy'); toc 46 | % 47 | % norm(AB1-AB,'fro')^2/norm(AB,'fro')^2 48 | % norm(AB2-AB,'fro')^2/norm(AB,'fro')^2 49 | % 50 | % REFERENCE 51 | % Drineas et al. (2006). Fast Monte Carlo algorithms for matrices I: 52 | % Approximating matrix multiplication. SIAM Journal on Computing, 53 | % 36, 132-157 54 | % Belabbas & Wolfe (2008). On sparse representations of linear operators 55 | % and the approximation of matrix products. In Information Sciences 56 | % and Systems. CISS 2008, 258-263 57 | 58 | % $ Copyright (C) 2017 Brian Lau, brian.lau@upmc.fr $ 59 | % The full license and most recent version of the code can be found at: 60 | % https://github.com/brian-lau/highdim 61 | % 62 | % This program is free software: you can redistribute it and/or modify 63 | % it under the terms of the GNU General Public License as published by 64 | % the Free Software Foundation, either version 3 of the License, or 65 | % (at your option) any later version. 66 | % 67 | % This program is distributed in the hope that it will be useful, 68 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 69 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 70 | % GNU General Public License for more details. 
71 | 72 | % TODO 73 | % o B = A 74 | % o B = A' 75 | % o streaming (one-pass for uni with n known, two-pass for other cases) 76 | % o nargout == 2 should return C & R for all algs, AB \approx C*R 77 | % o faster randsample 78 | 79 | function AB = approxmtimes(A,B,c,method,uni) 80 | 81 | if nargin < 5 82 | uni = false; 83 | end 84 | 85 | if nargin < 4 86 | method = 'sampling'; 87 | end 88 | 89 | [m,n] = size(A); 90 | [n2,p] = size(B); 91 | c = fix(c); 92 | 93 | assert(n==n2,'Inner matrix dimensions must agree.'); 94 | assert(c>=1,'c must be >= 1.'); 95 | 96 | switch lower(method) 97 | case {'greedy'} 98 | A2 = A.^2; 99 | An = sum(A2)'; 100 | B2 = B.^2; 101 | Bn = sum(B2,2); 102 | [~,J] = sort(An.*Bn,'descend'); 103 | J = J(1:c); 104 | 105 | Q = (A(:,J)'*A(:,J)) .* (B(J,:)*B(J,:)'); 106 | r = sum( (A'*A(:,J)) .* (B*B(J,:)') )'; 107 | w = Q\r; 108 | 109 | AB = A(:,J)*diag(w)*B(J,:); 110 | case {'sampling'} 111 | if uni 112 | p_k = repmat(1/n,n,1); 113 | else 114 | % Probabilities that minimize expected normwise absolute error 115 | A2 = A.^2; 116 | An = sqrt(sum(A2))'; 117 | B2 = B.^2; 118 | Bn = sqrt(sum(B2,2)); 119 | 120 | p_k = An.*Bn; 121 | p_k = p_k/sum(p_k); 122 | end 123 | 124 | J = randsample(1:n,c,true,p_k); 125 | 126 | cp = sqrt(c*p_k(J)); 127 | C = bsxfun(@rdivide,A(:,J),cp'); 128 | R = bsxfun(@rdivide,B(J,:),cp); 129 | 130 | AB = C*R; 131 | otherwise 132 | error('Unrecognized method for approximate matrix multiplication'); 133 | end -------------------------------------------------------------------------------- /+diff/minentest.m: -------------------------------------------------------------------------------- 1 | % MINENTEST N-dimensional, 2-sample comparison of 2 distributions 2 | % 3 | % [p,e_n,e_n_boot] = minentest(x,y,varargin) 4 | % 5 | % Compares d-dimensional data from two samples using a measure based on 6 | % statistical energy. The test is non-parametric, does not require binning 7 | % and easily scales to arbitrary dimensions. 8 | % 9 | % The analytic distribution of the statistic is unknown, and p-values 10 | % are estimated using a permutation procedure, which works well 11 | % according to simulations by Aslan & Zech. 12 | % 13 | % INPUTS 14 | % x - [n1 x d] matrix 15 | % y - [n2 x d] matrix 16 | % 17 | % OPTIONAL (name/value pairs) 18 | % flag - 'sr', Szekely & Rizzo energy statistic 19 | % 'az', Aslan & Zech energy statistic (default) 20 | % nboot - # of bootstrap resamples (default = 1000) 21 | % replace - boolean for sampling with replacement (default = false) 22 | % 23 | % OUTPUTS 24 | % p - p-value by permutation 25 | % e_n - minimum energy statistic 26 | % e_n_boot - bootstrap samples 27 | % 28 | % REFERENCE 29 | % Aslan, B, Zech, G (2005) Statistical energy as a tool for binning-free 30 | % multivariate goodness-of-fit tests, two-sample comparison and unfolding. 31 | % Nuc Instr and Meth in Phys Res A 537: 626-636 32 | % Szekely, G, Rizzo, M (2014) Energy statistics: A class of statistics 33 | % based on distances. 
J Stat Planning & Infer 143: 1249-1272 34 | % 35 | % SEE ALSO 36 | % kstest2d, hotell2, DepTest2 37 | 38 | % $ Copyright (C) 2014 Brian Lau http://www.subcortex.net/ $ 39 | % The full license and most recent version of the code can be found on GitHub: 40 | % https://github.com/brian-lau/highdim 41 | % 42 | % This program is free software: you can redistribute it and/or modify 43 | % it under the terms of the GNU General Public License as published by 44 | % the Free Software Foundation, either version 3 of the License, or 45 | % (at your option) any later version. 46 | % 47 | % This program is distributed in the hope that it will be useful, 48 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 49 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 50 | % GNU General Public License for more details. 51 | % 52 | % REVISION HISTORY: 53 | % brian 08.25.11 written 54 | 55 | % TODO 56 | % o calculate distance matrix once and cache, permute index 57 | % attempted once, https://github.com/brian-lau/multdist/commit/ae58496848464cea50fe134ab6f1e2f929632c88 58 | % o k-sample version 59 | % o incomplete V-statistic 60 | 61 | 62 | function [p,e_n,e_n_boot] = minentest(x,y,varargin) 63 | 64 | par = inputParser; 65 | par.KeepUnmatched = true; 66 | addRequired(par,'x',@isnumeric); 67 | addRequired(par,'y',@isnumeric); 68 | addParamValue(par,'flag','sr',@ischar); 69 | addParamValue(par,'nboot',1000,@(x) isscalar(x)&&isnumeric(x)); 70 | addParamValue(par,'replace',false,@(x) islogical(x)||isnumeric(x)); 71 | parse(par,x,y,varargin{:}); 72 | 73 | [n,ny] = size(x); 74 | [m,my] = size(y); 75 | 76 | if ny ~= my 77 | error('# of columns in X and Y must match'); 78 | end 79 | 80 | pooled = [x ; y]; 81 | 82 | flag = par.Results.flag; 83 | nboot = par.Results.nboot; 84 | replace = par.Results.replace; 85 | e_n = energy(x,y,flag); 86 | e_n_boot = zeros(nboot,1); 87 | e_n_boot(1) = e_n; 88 | for i = 2:nboot 89 | if replace 90 | ind = unidrnd(n+m,1,n+m); 91 | else 92 | ind = randperm(n+m); 93 | end 94 | e_n_boot(i) = energy(pooled(ind(1:n),:),pooled(ind(n+1:end),:),flag); 95 | end 96 | 97 | p = sum(e_n_boot>=e_n)./nboot; 98 | 99 | function [dx,dy,dxy] = dist(x,y) 100 | dx = pdist(x,'euclidean'); 101 | dy = pdist(y,'euclidean'); 102 | dxy = pdist2(x,y,'euclidean'); 103 | 104 | function z = energy(x,y,flag) 105 | % FIXME, equal samples will generate infinite values, will produce 106 | % unreliable results, more of a problem for discrete data. 107 | n = size(x,1); 108 | m = size(y,1); 109 | [dx,dy,dxy] = dist(x,y); 110 | switch flag 111 | case 'az' 112 | % Aslan & Zech definition of energy statistic 113 | z = (1/(n*(n-1)))*sum(-log(dx)) + (1/(m*(m-1)))*sum(-log(dy))... 114 | - (1/(n*m))*sum(-log(dxy(:))); 115 | case 'sr' 116 | % Szekely & Rizzo definition of energy statistic 117 | % Verified against their R package 'energy' 118 | % in R: 119 | % data(iris) 120 | % eqdist.etest(iris[,1:4], c(75,75), R = 199) 121 | % E-statistic = 126.0453, p-value = 0.005 122 | % in Matlab: 123 | % load fisheriris; 124 | % [p,en] = minentest(meas(1:75,:),meas(76:end,:),'sr',200) 125 | z = (2/(n*m))*sum(dxy(:)) - (1/(n^2))*sum(2*dx) - (1/(m^2))*sum(2*dy); 126 | z = ((n*m)/(n+m)) * z; 127 | otherwise 128 | error('Bad FLAG'); 129 | end -------------------------------------------------------------------------------- /Testing/test_sphericity4.m: -------------------------------------------------------------------------------- 1 | %% Compare JNS to simulations of size and power in 2 | % Wang, Q and Yao J (2013). 
On the sphericity test with large-dimensional 3 | % observations. Electronic Journal of Statistics 7: 2164-2192 4 | 5 | %% Table 2 6 | clear all; 7 | n = [64 128 256 512]; 8 | p{1} = [4 8 16 32 48 56 60]; 9 | p{2} = [8 16 32 64 96 112 120]; 10 | p{3} = [16 32 64 128 192 224 240]; 11 | p{4} = [32 64 128 256 384 448 480]; 12 | 13 | reps = 20; 14 | tic; 15 | for i = 1:numel(n) 16 | for j = 1:numel(p{i}) 17 | for k = 1:reps 18 | x = gamrnd(4,1/2,n(i),p{i}(j))-2; 19 | %x = randn(n(i),p{i}(j)); 20 | pval(k) = sphere.jsn(x,'test','wang'); 21 | end 22 | prob(i,j) = mean(pval<=0.05); 23 | end 24 | toc 25 | end 26 | 27 | pN = [... % Normal(0,1) 28 | 0.0498 0.0545 0.0539 0.0558 0.0551 0.0547 0.0523;... 29 | 0.0539 0.0523 0.051 0.0538 0.055 0.0543 0.0545;... 30 | 0.0544 0.0534 0.0519 0.0507 0.0507 0.0503 0.0494;... 31 | 0.0542 0.0512 0.0519 0.0491 0.0487 0.0496 0.0488] 32 | 33 | % Normal, reps = 2000, method = 'wang' 34 | % prob = 35 | % 36 | % 0.0585 0.0645 0.0570 0.0525 0.0485 0.0520 0.0565 37 | % 0.0625 0.0620 0.0605 0.0570 0.0510 0.0480 0.0525 38 | % 0.0570 0.0565 0.0505 0.0550 0.0465 0.0550 0.0545 39 | % 0.0555 0.0540 0.0490 0.0420 0.0510 0.0490 0.0435 40 | 41 | % Normal, reps = 2000, method = 'wang' 42 | % prob = 43 | % 44 | % 0.0565 0.0600 0.0495 0.0445 0.0590 0.0515 0.0605 45 | % 0.0550 0.0575 0.0545 0.0525 0.0470 0.0505 0.0510 46 | % 0.0605 0.0485 0.0540 0.0555 0.0515 0.0545 0.0395 47 | % 0.0530 0.0555 0.0480 0.0485 0.0445 0.0460 0.0525 48 | 49 | pG = [... % Gamma(4,2)-2, note parametrization in matlab = gamrnd(4,1/2) 50 | 0.0698 0.0804 0.078 0.0703 0.0685 0.0615 0.0615;... 51 | 0.075 0.0724 0.0695 0.0603 0.0577 0.0591 0.0598;... 52 | 0.0719 0.0634 0.0598 0.0555 0.052 0.0541 0.0533;... 53 | 0.0606 0.0579 0.0507 0.0495 0.0502 0.0482 0.053] 54 | 55 | % Gamma, reps = 2000, method = 'wang' 56 | % prob = 57 | % 58 | % 0.0650 0.0590 0.0650 0.0620 0.0580 0.0565 0.0615 59 | % 0.0675 0.0595 0.0680 0.0525 0.0495 0.0580 0.0525 60 | % 0.0645 0.0580 0.0720 0.0495 0.0505 0.0410 0.0540 61 | % 0.0550 0.0575 0.0490 0.0560 0.0535 0.0595 0.0420 62 | 63 | % Gamma, reps = 2000, method = 'wang' 64 | % prob = 65 | % 66 | % 0.0795 0.0660 0.0595 0.0590 0.0520 0.0530 0.0475 67 | % 0.0715 0.0540 0.0645 0.0525 0.0540 0.0585 0.0550 68 | % 0.0615 0.0620 0.0520 0.0610 0.0540 0.0535 0.0580 69 | % 0.0505 0.0610 0.0530 0.0475 0.0520 0.0455 0.0490 70 | 71 | %% Table 3 72 | clear all; 73 | n = [64 128]; 74 | p{1} = [4 8 16 32 48 56 60]; 75 | p{2} = [8 16 32 64 96 112 120]; 76 | 77 | reps = 250; 78 | tic; 79 | for i = 1:numel(n) 80 | for j = 1:numel(p{i}) 81 | for k = 1:reps 82 | %x = gamrnd(4,1/2,n(i),p{i}(j))-2; 83 | x = randn(n(i),p{i}(j)); 84 | v = round(p{i}(j)/2); 85 | sigma = [0.5*ones(v,1);ones(p{i}(j)-v,1)]'; 86 | x = (diag(sqrt(sigma))*x')'; 87 | pval(k) = sphere.jsn(x,'test','n'); 88 | end 89 | prob(i,j) = mean(pval<=0.05); 90 | end 91 | toc 92 | end 93 | 94 | pN = [... % Normal, Power 1, CJ 95 | 0.7754 0.8662 0.912 0.9384 0.9471 0.949 0.9501;... 
96 | 0.9984 0.9998 1 1 1 1 1] 97 | 98 | % Normal, Power 1, reps = 2500, method = 'wang' 99 | % prob = 100 | % 101 | % 0.8640 0.9256 0.9664 0.9744 0.9840 0.9812 0.9820 102 | % 0.9996 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 103 | 104 | % Normal, Power 1, reps = 2500, method = 'john' 105 | % prob = 106 | % 107 | % 0.7348 0.8336 0.8876 0.9304 0.9284 0.9364 0.9392 108 | % 0.9980 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 109 | 110 | % Normal, Power 1, reps = 2500, method = 'nagao' 111 | % prob = 112 | % 113 | % 0.7516 0.8468 0.8876 0.9404 0.9336 0.9308 0.9392 114 | % 0.9992 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 115 | 116 | pN = [... % Normal, Power 2, CJ 117 | 0.4694 0.5313 0.5732 0.5868 0.6035 0.6025 0.6048; 118 | 0.9424 0.9698 0.9781 0.9823 0.9824 0.9841 0.9844] 119 | 120 | % Normal, Power 2, reps = 2500, method = 'wang' 121 | % prob = 122 | % 123 | % 0.5400 0.5880 0.6460 0.6700 0.6660 0.6752 0.6900 124 | % 0.9720 0.9856 0.9844 0.9908 0.9904 0.9892 0.9920 125 | 126 | % Normal, Power 2, reps = 2500, method = 'john' 127 | % prob = 128 | % 129 | % 0.4412 0.4828 0.5360 0.5720 0.5732 0.5852 0.5696 130 | % 0.9444 0.9584 0.9724 0.9852 0.9836 0.9864 0.9836 131 | 132 | % Normal, Power 2, reps = 2500, method = 'nagao' 133 | % prob = 134 | % 135 | % 0.4596 0.5008 0.5260 0.5520 0.5788 0.5924 0.6016 136 | % 0.9384 0.9540 0.9756 0.9756 0.9836 0.9776 0.9820 137 | -------------------------------------------------------------------------------- /Testing/test_sphericity2.m: -------------------------------------------------------------------------------- 1 | %% Compare jns.m to size & power from 2 | % Ledoit & Wolf (2002). Some hypothesis tests for the covariance matrix 3 | % when the dimension is large compared to the sample size. 4 | % Annals of Statistics 30: 1081-1102 5 | 6 | %% Table 1 7 | clear all; 8 | n = [4 8 16 32 64 128 256]; 9 | p = [4 8 16 32 64 128 256]; 10 | reps = 200; 11 | tic; 12 | for i = 1:numel(p) 13 | for j = 1:numel(n) 14 | for k = 1:reps 15 | x = randn(n(j),p(i)); 16 | pval(k) = sphere.jsn(x,'test','wang'); 17 | end 18 | prob(i,j) = mean(pval<=0.05); 19 | end 20 | toc 21 | end 22 | 23 | % Ledoit & Wolf (method is John's test) 24 | pL = [... 25 | 0.01 0.03 0.04 0.05 0.05 0.05 0.05;... 26 | 0.03 0.04 0.04 0.05 0.05 0.05 0.05;... 27 | 0.04 0.05 0.05 0.05 0.05 0.05 0.05;... 28 | 0.05 0.05 0.05 0.05 0.05 0.05 0.05;... 29 | 0.05 0.05 0.05 0.05 0.05 0.05 0.05;... 30 | 0.05 0.05 0.05 0.05 0.05 0.05 0.05;... 
31 | 0.05 0.05 0.05 0.05 0.05 0.05 0.05]; 32 | 33 | % method = 'john', reps = 2000; 25.11.14 34 | % prob = 35 | % 36 | % 0.0010 0.0230 0.0400 0.0500 0.0415 0.0505 0.0535 37 | % 0.0220 0.0370 0.0435 0.0485 0.0520 0.0490 0.0635 38 | % 0.0335 0.0415 0.0460 0.0515 0.0605 0.0460 0.0470 39 | % 0.0445 0.0505 0.0490 0.0460 0.0505 0.0505 0.0460 40 | % 0.0470 0.0515 0.0560 0.0535 0.0530 0.0480 0.0500 41 | % 0.0565 0.0545 0.0505 0.0515 0.0520 0.0515 0.0605 42 | % 0.0580 0.0510 0.0530 0.0530 0.0575 0.0550 0.0490 43 | 44 | % method = 'nagao', reps = 2000; 25.11.14 45 | % prob = 46 | % 47 | % 0.0420 0.0400 0.0540 0.0485 0.0495 0.0455 0.0510 48 | % 0.0720 0.0585 0.0540 0.0490 0.0475 0.0455 0.0550 49 | % 0.0310 0.0510 0.0485 0.0410 0.0555 0.0530 0.0530 50 | % 0.0280 0.0430 0.0430 0.0510 0.0475 0.0455 0.0505 51 | % 0.0315 0.0475 0.0480 0.0485 0.0530 0.0450 0.0495 52 | % 0.0370 0.0430 0.0410 0.0475 0.0565 0.0520 0.0535 53 | % 0.0355 0.0415 0.0450 0.0495 0.0435 0.0525 0.0370 54 | 55 | % method = 'wang', reps = 2000; 25.11.14 56 | % prob = 57 | % 58 | % 0.0270 0.0285 0.0445 0.0495 0.0485 0.0515 0.0620 59 | % 0.0505 0.0595 0.0380 0.0495 0.0485 0.0575 0.0535 60 | % 0.0495 0.0750 0.0635 0.0490 0.0695 0.0555 0.0570 61 | % 0.0500 0.0780 0.0965 0.0780 0.0575 0.0585 0.0550 62 | % 0.0475 0.0900 0.1030 0.1090 0.0725 0.0485 0.0480 63 | % 0.0525 0.0860 0.1030 0.1125 0.1215 0.0845 0.0455 64 | % 0.0585 0.0795 0.1070 0.1135 0.1250 0.1140 0.0885 65 | 66 | %% Table 2 67 | clear all; 68 | n = [4 8 16 32 64 128 256]; 69 | p = [4 8 16 32 64 128 256]; 70 | reps = 200; 71 | tic; 72 | for i = 1:numel(p) 73 | for j = 1:numel(n) 74 | for k = 1:reps 75 | sigma = [0.5*ones(round(p(i)/2),1);ones(p(i)-round(p(i)/2),1)]'; 76 | x = (diag(sqrt(sigma))*randn(n(j),p(i))')'; 77 | pval(k) = sphere.jsn(x,'test','w'); 78 | end 79 | prob(i,j) = mean(pval<=0.05); 80 | end 81 | toc 82 | end 83 | 84 | % Ledoit & Wolf (method is John's test) 85 | pL = [... 86 | 0.02 0.06 0.15 0.37 0.76 0.98 1;... 87 | 0.05 0.09 0.18 0.42 0.85 1.00 1;... 88 | 0.06 0.11 0.20 0.48 0.90 1.00 1;... 89 | 0.08 0.13 0.22 0.50 0.93 1.00 1;... 90 | 0.09 0.13 0.24 0.52 0.95 1.00 1;... 91 | 0.09 0.14 0.23 0.53 0.95 1.00 1;... 
92 | 0.09 0.14 0.24 0.54 0.96 1.00 1]; 93 | 94 | % method = 'john', reps = 2000; 25.11.14 95 | % prob = 96 | % 97 | % 0.0010 0.0505 0.1310 0.3425 0.7420 0.9880 1.0000 98 | % 0.0360 0.0830 0.1785 0.4155 0.8385 0.9980 1.0000 99 | % 0.0535 0.0935 0.1960 0.4605 0.9045 1.0000 1.0000 100 | % 0.0585 0.1175 0.2025 0.4970 0.9225 1.0000 1.0000 101 | % 0.0755 0.1145 0.2115 0.5000 0.9390 1.0000 1.0000 102 | % 0.0750 0.1165 0.2285 0.5185 0.9445 1.0000 1.0000 103 | % 0.0890 0.1205 0.2375 0.5235 0.9625 1.0000 1.0000 104 | 105 | % method = 'nagao', reps = 2000; 25.11.14 106 | % prob = 107 | % 108 | % 0.0555 0.1050 0.1780 0.3995 0.7700 0.9795 1.0000 109 | % 0.0815 0.0905 0.1950 0.4260 0.8470 0.9990 1.0000 110 | % 0.0570 0.1005 0.1995 0.4605 0.9005 1.0000 1.0000 111 | % 0.0435 0.1065 0.2150 0.4700 0.9335 1.0000 1.0000 112 | % 0.0475 0.1035 0.2085 0.4815 0.9370 1.0000 1.0000 113 | % 0.0535 0.1110 0.2175 0.5090 0.9490 1.0000 1.0000 114 | % 0.0560 0.0995 0.1855 0.4890 0.9525 1.0000 1.0000 115 | 116 | % method = 'wang', reps = 2000; 25.11.14 117 | % prob = 118 | % 119 | % 0.0580 0.1025 0.2190 0.5230 0.8735 0.9945 1.0000 120 | % 0.1220 0.1930 0.2915 0.5815 0.9265 1.0000 1.0000 121 | % 0.1375 0.2855 0.4100 0.6500 0.9625 1.0000 1.0000 122 | % 0.1570 0.3140 0.4950 0.7425 0.9810 1.0000 1.0000 123 | % 0.1565 0.3085 0.5345 0.8165 0.9900 1.0000 1.0000 124 | % 0.1775 0.3290 0.5225 0.8010 0.9970 1.0000 1.0000 125 | % 0.1560 0.3235 0.5290 0.8190 0.9965 1.0000 1.0000 126 | % 127 | -------------------------------------------------------------------------------- /Testing/Test_rv.m: -------------------------------------------------------------------------------- 1 | % xUnit framework required 2 | % https://psexton.github.io/matlab-xunit/ 3 | 4 | % R package example, uses Pearson III approx for p-value 5 | % library(FactoMineR) 6 | % data(wine) 7 | % X <- wine[,3:7] 8 | % Y <- wine[,11:20] 9 | % coeffRV(X,Y) 10 | % $rv 11 | % [1] 0.6220991 12 | % $rvstd 13 | % [1] 8.100868 14 | % $mean 15 | % [1] 0.1307783 16 | % $variance 17 | % [1] 0.003678469 18 | % $skewness 19 | % [1] 1.390012 20 | % $p.value 21 | % [1] 1.885726e-05 22 | 23 | classdef Test_rv < TestCase 24 | properties 25 | x 26 | y 27 | end 28 | 29 | methods 30 | function self = Test_rv(name) 31 | self = self@TestCase(name); 32 | self.x = [... 33 | 3.074 3.000 2.714 2.280 1.960;... 34 | 2.964 2.821 2.375 2.280 1.680;... 35 | 2.857 2.929 2.560 1.960 2.077;... 36 | 2.808 2.593 2.417 1.913 2.160;... 37 | 3.607 3.429 3.154 2.154 2.040;... 38 | 2.857 3.111 2.577 2.040 2.077;... 39 | 3.214 3.222 2.962 2.115 2.040;... 40 | 3.120 2.852 2.500 2.200 2.185;... 41 | 2.857 2.815 2.808 1.923 2.074;... 42 | 2.893 3.000 2.571 1.846 1.680;... 43 | 3.250 3.286 2.714 1.926 1.962;... 44 | 3.393 3.179 2.769 2.038 1.920;... 45 | 3.179 3.286 2.778 2.231 1.760;... 46 | 3.071 3.107 2.731 2.120 1.800;... 47 | 3.107 3.143 2.846 2.185 1.962;... 48 | 2.929 3.179 2.852 2.000 2.037;... 49 | 3.036 3.179 3.037 2.231 1.667;... 50 | 3.071 2.926 2.741 2.000 1.880;... 51 | 2.643 2.786 2.536 1.889 1.808;... 52 | 3.696 3.192 2.833 1.826 2.385;... 53 | 3.708 2.926 2.520 2.040 2.667]; 54 | 55 | self.y = [... 56 | 3.407 3.308 2.885 2.320 1.840 2.000 1.650 3.259 2.963 3.200;... 57 | 3.370 3.000 2.560 2.440 1.739 2.000 1.381 2.962 2.808 2.926;... 58 | 3.250 2.929 2.769 2.192 2.250 1.750 1.250 3.077 2.800 3.077;... 59 | 3.160 2.880 2.391 2.083 2.167 2.304 1.476 2.542 2.583 2.478;... 60 | 3.536 3.360 3.160 2.231 2.148 1.762 1.600 3.615 3.296 3.462;... 
61 | 3.179 3.385 2.800 2.240 2.148 1.750 1.476 3.214 3.148 3.321;... 62 | 3.429 3.500 3.038 2.200 2.385 1.826 1.476 3.250 3.222 3.385;... 63 | 3.654 3.077 2.520 2.320 2.444 2.080 1.905 3.280 3.160 2.962;... 64 | 3.357 3.346 3.000 2.040 2.125 1.875 1.524 3.148 2.893 3.308;... 65 | 3.222 3.259 2.926 2.040 2.042 2.000 1.773 3.077 2.704 2.778;... 66 | 3.607 3.385 2.889 2.115 2.160 1.955 1.571 3.286 3.036 3.222;... 67 | 3.481 3.385 2.962 2.000 2.200 2.042 1.545 3.321 3.071 3.143;... 68 | 3.481 3.423 2.963 2.269 2.154 1.957 1.571 3.481 3.259 3.269;... 69 | 3.357 3.444 2.885 2.120 2.346 1.826 1.550 3.269 3.080 3.192;... 70 | 3.357 3.370 2.846 2.240 2.280 1.750 1.524 3.333 3.037 3.370;... 71 | 3.286 3.308 3.115 2.269 2.000 1.917 1.400 3.040 2.960 3.200;... 72 | 3.444 3.500 3.185 2.160 2.240 1.913 1.750 3.520 3.296 3.462;... 73 | 3.370 3.360 2.963 2.308 1.917 2.000 1.429 3.250 2.920 2.880;... 74 | 2.889 2.800 2.500 1.962 2.111 2.080 1.318 2.680 2.308 2.556;... 75 | 3.737 3.080 2.833 1.773 2.440 2.292 1.571 3.437 2.958 2.600;... 76 | 3.727 2.885 2.600 2.083 2.609 2.174 1.650 3.095 3.136 2.545]; 77 | end 78 | 79 | function setUp(self) 80 | end 81 | 82 | function test(self) 83 | [pval,rv,rvstd] = dep.rvtest(self.x,self.y); 84 | assertElementsAlmostEqual(pval,1.885726e-05,'absolute',1e-5); 85 | assertElementsAlmostEqual(rv,0.6220991,'absolute',1e-5); 86 | assertElementsAlmostEqual(rvstd,8.100868,'absolute',1e-5); 87 | end 88 | 89 | function tearDown(self) 90 | end 91 | end 92 | end -------------------------------------------------------------------------------- /+utils/mexHadamard.c: -------------------------------------------------------------------------------- 1 | /* Hadamard Transform 2 | mex function to take hadamard transform 3 | 4 | Usage: w = hadamard(x) 5 | x must be a REAL VALUED COLUMN VECTOR or MATRIX 6 | m = size(x,1) must be a POWER OF TWO 7 | 8 | Notes: 9 | 1) This implementation uses exactly m*log2(m) additions/subtractions. 10 | 2) This is symmetric and orthogonal. To invert, apply again and 11 | divide by vector length. 12 | 13 | Written by: Peter Stobbe, Caltech 14 | Email: stobbe@acm.caltech.edu 15 | Created: August 2008 16 | Edits by Stephen Becker, 2009--2014 17 | 18 | Note: in R2008b, Matlab added "fwht" and "ifwht" (the Fast Walsh- 19 | Hadamart Transform and the inverse) to its Signal Processing 20 | Toolbox. With the default ordering and scaling, it's not 21 | equivalent to this, but you can change this with the following: 22 | y = length(x) * fwht( x, [], 'hadamard' ); 23 | Then y should be the same as hadamard(x) up to roundoff. 24 | However, it appears that this code is faster than fwht. 25 | 26 | Update Stephen Becker, Feb 27 2014, fix compiling issue for Mac OS X 27 | Update Stephen Becker, Mar 3 2014, issue error if input data is sparse 28 | https://github.com/stephenbeckr/SparsifiedKMeans 29 | 30 | Copyright (c) 2011, Peter Stobbe 31 | All rights reserved. 32 | 33 | Redistribution and use in source and binary forms, with or without 34 | modification, are permitted provided that the following conditions are 35 | met: 36 | 37 | * Redistributions of source code must retain the above copyright 38 | notice, this list of conditions and the following disclaimer. 
39 | * Redistributions in binary form must reproduce the above copyright 40 | notice, this list of conditions and the following disclaimer in 41 | the documentation and/or other materials provided with the distribution 42 | 43 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 44 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 45 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 46 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 47 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 48 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 49 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 50 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 51 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 52 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 53 | POSSIBILITY OF SUCH DAMAGE. 54 | 55 | */ 56 | 57 | #include 58 | 59 | 60 | /* SRB: Feb 27 2014, gcc-4.8 has problems with char16_t not being defined. 61 | * This seems to fix it 62 | * (and do this BEFORE including mex.h) */ 63 | /* See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=56086#c4 64 | (but for, e.g., Mac w/ Xcode and Clang, this fails, so test 65 | for gcc. more possibilities here: 66 | https://gcc.gnu.org/onlinedocs/cpp/Common-Predefined-Macros.html 67 | but clang defines GNUC too! 68 | http://nadeausoftware.com/articles/2012/10/c_c_tip_how_detect_compiler_name_and_version_using_compiler_predefined_macros 69 | */ 70 | #ifndef NO_UCHAR 71 | #define UCHAR_OK 72 | #endif 73 | #if defined(__GNUC__) && !(defined(__clang__)) && defined(UCHAR_OK) 74 | #include 75 | #endif 76 | 77 | #include "mex.h" 78 | 79 | /* 80 | y - output 81 | x - input 82 | m - length of vector 83 | */ 84 | void hadamard_apply_vector(double *y, double *x, unsigned m) 85 | { 86 | unsigned bit, j, k; 87 | double temp; 88 | 89 | for (j = 0; j < m; j+=2) { 90 | k = j+1; 91 | y[j] = x[j] + x[k]; 92 | y[k] = x[j] - x[k]; 93 | } 94 | 95 | for (bit = 2; bit < m; bit <<= 1) { 96 | for (j = 0; j < m; j++) { 97 | if( (bit & j) == 0 ) { 98 | k = j | bit; 99 | temp = y[j]; 100 | y[j] = y[j] + y[k]; 101 | y[k] = temp - y[k]; 102 | } 103 | } 104 | } 105 | } 106 | 107 | /* 108 | y - output 109 | x - input 110 | m - length of vectors (number of rows) 111 | n - number of vectors (number of columns) 112 | */ 113 | void hadamard_apply_matrix(double *y, double *x, unsigned m, unsigned n) 114 | { 115 | unsigned j; 116 | for(j = 0; j < n; j++) { 117 | hadamard_apply_vector(y + j*m, x + j*m, m); 118 | } 119 | } 120 | 121 | 122 | /* check that the vector length is a power of 2, 123 | just using bitshifting instead of log */ 124 | void checkPowerTwo(unsigned m) 125 | { 126 | /* check that it's not a degenerate 0 by 1 vector or singleton */ 127 | if (m <= 1) { 128 | mexErrMsgTxt("Vector length must be greater than 1."); 129 | } 130 | /* keep dividing by two until result is odd */ 131 | while( (m & 1) == 0 ){ 132 | m >>= 1; 133 | } 134 | /* check that m is not a multiple of an odd number greater than 1 */ 135 | if (m > 1) { 136 | mexErrMsgTxt("Vector length must be power of 2."); 137 | } 138 | } 139 | 140 | 141 | /* The gateway routine. */ 142 | void mexFunction(int nlhs, mxArray *plhs[], 143 | int nrhs, const mxArray *prhs[]) 144 | { 145 | double *x, *y; 146 | unsigned m, n; 147 | 148 | /* Check for the proper number of arguments. 
*/ 149 | if (nrhs != 1) { 150 | mexErrMsgTxt("One and only one input required; must be a column vector or matrix, with # rows a power of 2."); 151 | } 152 | if (nlhs > 1) { 153 | mexErrMsgTxt("Too many output arguments."); 154 | } 155 | 156 | /* input size */ 157 | m = mxGetM(prhs[0]); 158 | checkPowerTwo(m); 159 | n = mxGetN(prhs[0]); 160 | 161 | if (mxIsComplex(prhs[0])) { 162 | mexErrMsgTxt("Input must be real."); 163 | } else if (mxIsSparse(prhs[0])) { 164 | mexErrMsgTxt("Input must be a full matrix, not sparse."); 165 | } else if (!mxIsDouble(prhs[0])) { 166 | mexErrMsgTxt("Input must be of type double."); 167 | } 168 | 169 | /* Create matrix for the return argument. */ 170 | plhs[0] = mxCreateDoubleMatrix(m, n, mxREAL); 171 | 172 | /* Assign pointers to each input and output. */ 173 | x = mxGetPr(prhs[0]); 174 | y = mxGetPr(plhs[0]); 175 | 176 | /* Call the C subroutine. */ 177 | hadamard_apply_matrix(y, x, m, n); 178 | return; 179 | } 180 | -------------------------------------------------------------------------------- /+dep/hsic.m: -------------------------------------------------------------------------------- 1 | % HSIC Hilbert-Schmidt Independence Criterion 2 | % 3 | % [stat,K,L,varargout] = hsic(x,y,varargin) 4 | % 5 | % Estimate the Hilbert-Schmidt Independence Criterion (HSIC). 6 | % 7 | % INPUTS 8 | % x - [n x p] n samples of dimensionality p 9 | % y - [n x q] n samples of dimensionality q 10 | % 11 | % OPTIONAL (name/value pairs) 12 | % kernel - string indicating kernel type 13 | % approx - string indicating approximation 14 | % unbiased - boolean indicating unbiased estimator (default=false) 15 | % gram - true indicates x & y are Gram matrices (default=false) 16 | % doublecenter - true indicates x & y are double-centered Gram 17 | % matrices (default=false) 18 | % 19 | % Additional name/value pairs are passed through to the kernel function. 20 | % 21 | % OUTPUTS 22 | % h - Hilbert-Schmidt Independence Criterion 23 | % K - [n x n] Gram matrix for x 24 | % L - [n x n] Gram matrix for y 25 | % params - 26 | % 27 | % EXAMPLE 28 | % rng(1234) 29 | % n = 1000; p = 50; q = p; 30 | % x = rand(n,p); 31 | % y = x.^2; 32 | % h = dep.hsic(x,y) % default Gaussian kernel with median heuristic 33 | % 34 | % % Equivalence between distance covariance (squared) & HSIC 35 | % h = dep.hsic(x,y,'kernel','brownian'); 36 | % d = dep.dcov(x,y); 37 | % [4*h d^2] 38 | % 39 | % % Approximate using random fourier features 40 | % h = dep.hsic(x,y,'approx','rfm','D',100,'sigma',2) 41 | % 42 | % % Approximate using Nystrom 43 | % h = dep.hsic(x,y,'approx','nystrom','k',100,'sigma',2) 44 | % 45 | % REFERENCE 46 | % Gretton et al (2008). A kernel statistical test of independence. In 47 | % Advances in neural information processing systems, 585-592 48 | % Sejdinovic et al (2013). Equivalence of distance-based and RKHS-based 49 | % statistics in hypothesis testing. Annals of Statistics 41: 2263-2291 50 | % Song et al (2012). Feature Selection via Dependence Maximization. 
51 | % Journal of Machine Learning Research 13: 1393-1434 52 | % 53 | % SEE ALSO 54 | % hsictest, rfm 55 | 56 | % $ Copyright (C) 2017 Brian Lau, brian.lau@upmc.fr $ 57 | % The full license and most recent version of the code can be found at: 58 | % https://github.com/brian-lau/highdim 59 | % 60 | % This program is free software: you can redistribute it and/or modify 61 | % it under the terms of the GNU General Public License as published by 62 | % the Free Software Foundation, either version 3 of the License, or 63 | % (at your option) any later version. 64 | % 65 | % This program is distributed in the hope that it will be useful, 66 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 67 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 68 | % GNU General Public License for more details. 69 | 70 | % TODO 71 | % o error for unbiased && approx, I don't know how to estimate the unbiased 72 | % version using feature maps, could potentially reconstruct full Gram 73 | % matrix? 74 | function [h,K,L,params] = hsic(x,y,varargin) 75 | 76 | par = inputParser; 77 | par.KeepUnmatched = true; 78 | par.PartialMatching = false; 79 | addRequired(par,'x',@isnumeric); 80 | addRequired(par,'y',@isnumeric); 81 | addParamValue(par,'kernel','rbf',@ischar); 82 | addParamValue(par,'approx','none',@ischar); 83 | addParamValue(par,'unbiased',false,@(x) isnumeric(x) || islogical(x)); 84 | addParamValue(par,'gram',false,@isscalar); 85 | addParamValue(par,'doublecenter',false,@isscalar); 86 | parse(par,x,y,varargin{:}); 87 | 88 | [m,p] = size(x); 89 | [n,q] = size(y); 90 | 91 | assert(m == n,'HSIC requires x and y to have the same # of samples'); 92 | assert(~(par.Results.doublecenter&&par.Results.unbiased),... 93 | 'Cannot compute unbiased HSIC estimate with double-centered Gram matrices.'); 94 | 95 | if par.Results.doublecenter 96 | Kc = x; 97 | Lc = y; 98 | elseif par.Results.gram 99 | K = x; 100 | L = y; 101 | else 102 | [K,L,params] = getKL(x,y,par.Results.kernel,par.Results.approx,par.Unmatched); 103 | end 104 | 105 | if par.Results.unbiased % U-statistic 106 | K = utils.zerodiag(K); 107 | L = utils.zerodiag(L); 108 | 109 | % l = ones(m,1); 110 | % h = trace(K*L) + (l'*K*l*l'*L*l)/(n-1)/(n-2) - 2*(l'*K*L*l)/(n-2); 111 | % h = h/(n*(n-3)); 112 | 113 | % Equivalent, but faster 114 | Kc = utils.ucenter(K); 115 | Lc = utils.ucenter(L); 116 | h = sum(sum(Kc.*Lc))/(n*(n-3)); 117 | else % V-statistic 118 | if any(strcmp(par.Results.approx,{'rfm' 'nys' 'nystrom'})) 119 | % K & L are feature maps 120 | phiXc = bsxfun(@minus,K,mean(K)); 121 | phiYc = bsxfun(@minus,L,mean(L)); 122 | h = (norm(phiXc'*phiYc,'fro')/n)^2; 123 | if nargin > 1 124 | K = K*K'; 125 | L = L*L'; 126 | end 127 | else 128 | % K & L are Gram matrices 129 | 130 | % H = eye(n) - ones(n)/n; 131 | % h = trace(K*H*L*H)/n^2 132 | 133 | % Equivalent, but faster 134 | if ~exist('Kc','var') 135 | Kc = utils.dcenter(K); 136 | Lc = utils.dcenter(L); 137 | end 138 | h = sum(sum(Kc.*Lc))/n^2; 139 | end 140 | end 141 | 142 | %% 143 | function [K,L,params] = getKL(x,y,kernel,approx,par) 144 | 145 | switch lower(kernel) 146 | case {'rbf' 'gauss' 'gaussian'} 147 | switch lower(approx) 148 | case {'rfm'} 149 | K = utils.rfm(x,par); 150 | L = utils.rfm(y,par); 151 | case {'nys' 'nystrom'} 152 | K = utils.nystrom(x,'kernel','rbf',par); 153 | L = utils.nystrom(y,'kernel','rbf',par); 154 | case {'none'} 155 | [K,sigmax] = utils.rbf(x,[],par); 156 | [L,sigmay] = utils.rbf(y,[],par); 157 | otherwise 158 | error('Unknown approximation for rbf 
kernel'); 159 | end 160 | case {'distance' 'brownian'} 161 | switch lower(approx) 162 | case {'nys' 'nystrom'} 163 | K = utils.nystrom(x,'kernel','brownian',par); 164 | L = utils.nystrom(y,'kernel','brownian',par); 165 | case {'none'} 166 | K = utils.distkern(x,x,par); 167 | L = utils.distkern(y,y,par); 168 | otherwise 169 | error('Unknown approximation for brownian kernel'); 170 | end 171 | otherwise 172 | error('Unsupported kernel'); 173 | end 174 | 175 | if exist('sigmax','var') 176 | params.sigmax = sigmax; 177 | end 178 | if exist('sigmay','var') 179 | params.sigmay = sigmay; 180 | end 181 | if ~exist('params','var') 182 | params = struct(); 183 | end 184 | -------------------------------------------------------------------------------- /+dep/dcorrtest.m: -------------------------------------------------------------------------------- 1 | % DCORRTEST Distance correlation test of independence 2 | % 3 | % [pval,r,stat,null] = dcorrtest(x,y,varargin) 4 | % 5 | % Given a sample X1,...,Xn from a p-dimensional multivariate distribution, 6 | % and a sample Y1,...,Xn from a q-dimensional multivariate distribution, 7 | % test the hypothesis: 8 | % 9 | % H0 : X and Y are mutually independent 10 | % 11 | % The default test is based on a modified distance correlation statistic 12 | % that when suitably transformed converges to a Student t distribution 13 | % under independence (Szekely & Rizzo 2013). The resulting t-test is 14 | % unbiased for sample sizes greater than three and all significance 15 | % levels. 16 | % 17 | % Several different permutation methods are also available. See DCOVTEST 18 | % for details. These are included mostly for testing since the t-test 19 | % is well-behaved even in small samples, and very computationally efficient. 20 | % 21 | % INPUTS 22 | % x - [n x p] n samples of dimensionality p 23 | % y - [n x q] n samples of dimensionality q 24 | % 25 | % OPTIONAL (as name/value pairs, order irrelevant) 26 | % method - 't' - t-test from Szekely & Rizzo (2013), DEFAULT 27 | % 'pearson' - Pearson type III approx by moment matching 28 | % 'perm-dist' - randomization using permutation of the rows & 29 | % columns of distance matrices 30 | % 'perm-brute' - brute force randomization, directly permuting 31 | % one of the inputs, which requires recalculating 32 | % and centering distance matrices 33 | % nboot - # permutations if not t-test 34 | % 35 | % OUTPUTS 36 | % pval - p-value 37 | % r - distance correlation 38 | % stat - test statistic 39 | % null - permutation statistics 40 | % 41 | % EXAMPLE 42 | % rng(1234); 43 | % p = 100; 44 | % n = 2000; 45 | % X = rand(n,p); Y = X.^2 + 15*randn(n,p); 46 | % 47 | % tic;[pval,r] = dep.dcorrtest(X,Y); toc % default t-test 48 | % [pval , r] 49 | % tic;[pval,r] = dep.dcorrtest(X,Y,'method','pearson'); toc 50 | % [pval , r] 51 | % tic;[pval,r] = dep.dcorrtest(X,Y,'method','perm-dist','nboot',200);toc 52 | % [pval , r] 53 | % tic;[pval,r] = dep.dcorrtest(X,Y,'method','perm-brute','nboot',200);toc 54 | % [pval , r] 55 | % 56 | % REFERENCE 57 | % Szekely et al (2007). Measuring and testing independence by correlation 58 | % of distances. Ann Statist 35: 2769-2794 59 | % Szekely & Rizzo (2013). The distance correlation t-test of independence 60 | % in high dimension. 
J Multiv Analysis 117: 193-213 61 | % 62 | % SEE ALSO 63 | % dcorr, DepTest2 64 | 65 | % $ Copyright (C) 2017 Brian Lau, brian.lau@upmc.fr $ 66 | % The full license and most recent version of the code can be found at: 67 | % https://github.com/brian-lau/highdim 68 | % 69 | % This program is free software: you can redistribute it and/or modify 70 | % it under the terms of the GNU General Public License as published by 71 | % the Free Software Foundation, either version 3 of the License, or 72 | % (at your option) any later version. 73 | % 74 | % This program is distributed in the hope that it will be useful, 75 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 76 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 77 | % GNU General Public License for more details. 78 | 79 | function [pval,r,stat,varargout] = dcorrtest(x,y,varargin) 80 | 81 | par = inputParser; 82 | par.KeepUnmatched = true; 83 | addRequired(par,'x',@isnumeric); 84 | addRequired(par,'y',@isnumeric); 85 | addParamValue(par,'method','t',@ischar); 86 | addParamValue(par,'nboot',999,@(x) isnumeric(x) && isscalar(x)); 87 | parse(par,x,y,varargin{:}); 88 | 89 | [n,~] = size(x); 90 | assert(n == size(y,1),'DCORRTEST requires x and y to have the same # of samples'); 91 | 92 | permMethods = {'perm-dist' 'perm-brute'}; 93 | nboot = par.Results.nboot; 94 | method = lower(par.Results.method); 95 | 96 | switch method 97 | case {'pearson'} 98 | if ~isfield(par.Unmatched,'unbiased') 99 | % Override dcov default, we generally want unbiased dcorr 100 | [d,dvx,dvy,A,B] = dep.dcov(x,y,'unbiased',true,par.Unmatched); 101 | else 102 | [d,dvx,dvy,A,B] = dep.dcov(x,y,par.Unmatched); 103 | end 104 | r = d/sqrt(dvx*dvy); 105 | 106 | if isfield(par.Unmatched,'unbiased') && par.Unmatched.unbiased 107 | stat = (n*(n-3))*d; % = sum(sum(A.*B)) for unbiased estimator 108 | elseif ~isfield(par.Unmatched,'unbiased') 109 | stat = (n*(n-3))*d; % = sum(sum(A.*B)) for unbiased estimator 110 | else 111 | stat = (n^2)*d^2; % = sum(sum(A.*B)) for biased estimator 112 | end 113 | 114 | [pval,stat] = utils.pearsonIIIpval(A,B,stat); 115 | return; 116 | case {'t','ttest','t-test'} 117 | if isfield(par.Unmatched,'unbiased') && ~par.Unmatched.unbiased 118 | error('This method is only valid for UNBIASED estimator'); 119 | elseif ~isfield(par.Unmatched,'unbiased') 120 | r = dep.dcorr(x,y,'unbiased',true,par.Unmatched); 121 | else 122 | r = dep.dcorr(x,y,par.Unmatched); 123 | end 124 | 125 | v = n*(n-3)/2; 126 | stat = sqrt(v-1) * r/sqrt(1-r^2); 127 | pval = tcdf(stat,v-1,'upper'); 128 | return; 129 | case {'perm-dist'} 130 | a = sqrt(utils.sqdist(x,x)); 131 | b = sqrt(utils.sqdist(y,y)); 132 | [d,dvx,dvy] = dep.dcov(a,b,'dist',true,'unbiased',true); 133 | r = d/sqrt(dvx*dvy); 134 | 135 | null = zeros(nboot,1); 136 | for i = 1:nboot 137 | ind = randperm(n); 138 | [d2,dvx2,dvy2] = dep.dcov(a,b(ind,ind),'dist',true,'unbiased',true); 139 | null(i) = d2/sqrt(dvx2*dvy2); 140 | end 141 | case {'perm-brute'} 142 | [d,dvx,dvy] = dep.dcov(x,y,'unbiased',true); 143 | r = d/sqrt(dvx*dvy); 144 | 145 | null = zeros(nboot,1); 146 | for i = 1:nboot 147 | ind = randperm(n); 148 | [d2,dvx2,dvy2] = dep.dcov(x,y(ind,:),'unbiased',true); 149 | null(i) = d2/sqrt(dvx2*dvy2); 150 | end 151 | otherwise 152 | error('Unrecognized test method'); 153 | end 154 | 155 | % One of the permutation methods 156 | if any(strcmp(method,permMethods)) 157 | if ~exist('stat','var') 158 | stat = r; 159 | end 160 | pval = (1 + sum(null>stat)) / (1 + nboot); 161 | end 162 | 163 | if 
nargout == 4
164 |    if exist('null','var')
165 |       varargout{1} = null;
166 |    else
167 |       varargout{1} = [];
168 |    end
169 | end
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | highdim
2 | ==========
3 | A Matlab library for statistical testing of high-dimensional data, including
4 | one and two-sample tests for homogeneity, uniformity, sphericity and
5 | independence. Of note are implementations of some modern tests
6 | appropriate for data where dimensionality grows with sample size, possibly
7 | exceeding the number of samples.
8 |
9 | # Installation
10 | Download [highdim](https://github.com/brian-lau/highdim/archive/master.zip) and
11 | add the resulting folder to your Matlab path.
12 | Folders prefixed by a `+` are packages that should not be explicitly added to your path,
13 | although their [parent folder should be](http://www.mathworks.com/help/matlab/matlab_oop/scoping-classes-with-packages.html#brfynt_-3).
14 |
15 | The Statistics toolbox is required.
16 |
17 | # Examples
18 | The various tests are most easily accessed through three interfaces: `DepTest1`,
19 | `DepTest2` and `UniSphereTest` for one-sample tests, two-sample tests and
20 | one-sample tests on the sphere, respectively.
21 |
22 | Detailed simulations of size, power and comparisons between tests are available
23 | in the [wiki](https://github.com/brian-lau/highdim/wiki). The examples below
24 | give an idea of what's available.
25 |
26 | ### Multivariate (In)dependence, Sphericity and Homogeneity
27 | ```
28 | % Independent, but non-spherical data
29 | sigma = diag([ones(1,25),0.5*ones(1,5)]);
30 | x = (sigma*randn(50,30)')';
31 |
32 | % Independence tests (Han & Liu, 2014)
33 | DepTest1(x,'test','spearman')
34 | DepTest1(x,'test','kendall')
35 |
36 | % Sphericity tests (Ledoit & Wolf, 2002; Wang & Yao, 2013; Zou et al., 2014)
37 | DepTest1(x,'test','john')
38 | DepTest1(x,'test','wang')
39 | DepTest1(x,'test','sign')
40 | DepTest1(x,'test','bcs')
41 | ```
42 | * Han, F & Liu, H (2014). Distribution-free tests of independence with
43 | applications to testing more structures. [arXiv:1410.4179](http://arxiv.org/abs/1410.4179)
44 | * Ledoit, O & Wolf, M (2002). Some hypothesis tests for the covariance matrix
45 | when the dimension is large compared to the sample size.
46 | [Annals of Statistics 30: 1081-1102](http://projecteuclid.org/euclid.aos/1031689018)
47 | * Wang, Q & Yao, J (2013). On the sphericity test with large-dimensional
48 | observations. [Electronic Journal of Statistics 7: 2164-2192](http://projecteuclid.org/euclid.ejs/1378817880)
49 | * Zou, C et al (2014). Multivariate sign-based high-dimensional tests for
50 | sphericity. [Biometrika 101: 229-236](http://biomet.oxfordjournals.org/content/101/1/229)
51 |
52 | ```
53 | % Non-independent data, with ~0 correlation, from the same distribution
54 | x = rand(200,1); y = rand(200,1);
55 | xx = 0.5*(x+y)-0.5; yy = 0.5*(x-y);
56 | corr(xx,yy)
57 |
58 | % Two-sample independence tests (Gretton et al, 2008; Szekely & Rizzo, 2013)
59 | DepTest2(xx,yy,'test','dcorr') % Distance correlation t-test
60 | DepTest2(xx,yy,'test','hsic') % Hilbert Schmidt Independence Criterion
61 |
62 | % Do the samples come from the same distribution? (Gretton et al, 2012; Szekely et al, 2007)
63 | DepTest2(xx,yy,'test','mmd') % Maximum mean discrepancy
64 | DepTest2(xx,yy,'test','energy') % statistical energy
65 | ```
66 | * Gretton, A et al (2008).
A kernel statistical test of independence. [Neural Information Processing Systems](http://papers.nips.cc/paper/3201-a-kernel-statistical-test-of-independence.pdf) 67 | * Gretton, A et al (2012). A kernel two-sample test. [Journal of Machine Learning Research 13: 723-773](http://www.jmlr.org/papers/volume13/gretton12a/gretton12a.pdf) 68 | * Szekely, G et al (2007). Measuring and testing independence by correlation of distances. [Annals of Statistics 35: 2769-2794](http://projecteuclid.org/euclid.aos/1201012979) 69 | * Szekely, G & Rizzo, M (2013). The distance correlation t-test of independence 70 | in high dimension. [Journal of Multivariate Analysis 117: 193-213](http://dx.doi.org/10.1016/j.jmva.2013.02.012) 71 | 72 | ``` 73 | % Independent data, different distributions 74 | x = randn(200,1); y = rand(200,1); 75 | 76 | % Two-sample Independence tests 77 | DepTest2(x,y,'test','dcorr') 78 | DepTest2(x,y,'test','hsic') 79 | 80 | % Do the samples come from the same distribution? 81 | DepTest2(x,y,'test','mmd') 82 | DepTest2(x,y,'test','energy') 83 | ``` 84 | ### Differences in multivariate means and covariances 85 | ``` 86 | % Two high-dimensional samples with sparse difference in covariance matrix (4 entries) 87 | p = 50; n = 100; 88 | for ii = 1:p 89 | for jj = 1:p 90 | sigma(ii,jj) = 0.5^abs(ii-jj); 91 | end 92 | end 93 | D = diag(unifrnd(0.5,2.5,p,1)); 94 | S = D^.5*sigma*D^.5; U = zeros(p,p); 95 | [~,~,k] = utils.tri2sqind(p); 96 | r = randperm(numel(k)); 97 | U(k(r(1:4))) = unifrnd(0,4,4,1)*max(diag(S)); 98 | U = U + U'; 99 | [~,da] = eig(S); [~,db] = eig(S+U); 100 | d = abs(min([diag(da);diag(db)])) + 0.05; 101 | 102 | x = mvnrnd(zeros(1,p),S+d*eye(p),n); 103 | y = mvnrnd(zeros(1,p),S+U+d*(eye(p)),n); 104 | 105 | DepTest2(x,y,'test','covdiff') 106 | 107 | % Directly calling the test returns M, a matrix indicating where covariance 108 | % elements are significantly different (FWER controlled at alpha) 109 | [pval,stat,M] = diff.covtest(x,y); 110 | ``` 111 | * Cai, T et al (2013). Two-sample covariance matrix testing and support 112 | recovery in high-dimensional and sparse settings. [Journal of the 113 | American Statistical Association 108: 265-277](http://www.tandfonline.com/doi/abs/10.1080/01621459.2012.758041) 114 | 115 | ### Uniformity on hypersphere 116 | ``` 117 | % Non-uniform samples, antipodally distributed on the sphere 118 | sigma = diag([1 5 1]); 119 | x = (sigma*randn(50,3)')'; 120 | 121 | % Is projection onto unit hypersphere uniformly distributed? 122 | UniSphereTest(x,'test','rayleigh') % Rayleigh test fails since resultant is zero 123 | UniSphereTest(x,'test','gine-ajne') % Weighted Gine-Ajne 124 | UniSphereTest(x,'test','randproj') % random projection 125 | UniSphereTest(x,'test','bingham') % Bingham 126 | ``` 127 | * Cai, T et al (2013). Distribution of angles in random packing on spheres. [Journal of Machine Learning Research 14: 1837-1864](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4196685/) 128 | * Cuesta-Albertos, J et al (2009). On projection-based tests for 129 | directional and compositional data. [Statistics & Computing 19: 367-380](http://link.springer.com/article/10.1007%2Fs11222-008-9098-3#page-1) 130 | * Gine, E (1975) Invariant tests for uniformity on compact Riemannian manifolds based on Sobolev norms. [Annals of Statistics 3: 1243-1266](http://www.jstor.org/discover/10.2307/2958247) 131 | * Mardia, K & Jupp, P (2000). 
[Directional Statistics](https://books.google.fr/books?id=PTNiCm4Q-M0C&printsec=frontcover&source=gbs_ge_summary_r&cad=0#v=onepage&q&f=false). John Wiley 132 | * Prentice, M (1978). On invariant tests of uniformity for directions 133 | and orientations. [Annals of Statistics 6: 169-176](http://projecteuclid.org/euclid.aos/1176344075) 134 | 135 | Contributions 136 | -------------------------------- 137 | Copyright (c) 2017 Brian Lau [brian.lau@upmc.fr](mailto:brian.lau@upmc.fr), see [LICENSE](https://github.com/brian-lau/highdim/blob/master/LICENSE) 138 | 139 | Please feel free to [fork](https://github.com/brian-lau/highdim/fork) and contribute! 140 | -------------------------------------------------------------------------------- /DepTest1.m: -------------------------------------------------------------------------------- 1 | % DEPTEST1 Interface for one-sample tests 2 | % 3 | % Given a sample X1,...,Xn from a p-dimensional multivariate distribution, 4 | % test one of the hypotheses: 5 | % 6 | % H0 : Covariance matrix of sample is proportional to the identity 7 | % 8 | % using the following tests, 9 | % 'john' - John, Sugiura, Nagao test (JSN) 10 | % 'nagao' - JSN with Box-Bartlett correction 11 | % 'wang' - JSN with correction for large p 12 | % 'sign' - multivariate sign, non-parametric 13 | % 'bcs' - multivariate sign, correction for large p 14 | % 15 | % H0 : X1,...,Xp are mutually independent 16 | % 17 | % using the following rank-based tests suitable for high-dimensional data 18 | % 'spearman' - R1 from Han & Liu (default) 19 | % 'kendall' - R2 from Han & Liu 20 | % 21 | % PROPERTIES 22 | % x - [n x p] matrix, n samples with dimensionality p 23 | % n - # of samples 24 | % p - # of dimensions 25 | % test - string (see above, default = 'bcs') 26 | % params - parameters passed through for specific tests 27 | % alpha - alpha level (default = 0.05) 28 | % stat - corresponding statistic 29 | % pval - p-value 30 | % h - boolean, 1 indicates rejection of null at alpha 31 | % runtime - elapsed time for running test, in seconds 32 | % 33 | % EXAMPLE 34 | % % Independent, but non-spherical data 35 | % sigma = diag([ones(1,25),0.5*ones(1,5)]); 36 | % x = (sigma*randn(50,30)')'; 37 | % % Sphericity tests 38 | % DepTest1(x,'test','john') 39 | % DepTest1(x,'test','wang') 40 | % DepTest1(x,'test','sign') 41 | % DepTest1(x,'test','bcs') 42 | % % Independence tests 43 | % DepTest1(x,'test','spearman') 44 | % DepTest1(x,'test','kendall') 45 | % 46 | % REFERENCE 47 | % Han & Liu (2014). Distribution-free tests of independence with 48 | % applications to testing more structures. arXiv:1410.4179v1 49 | % Ledoit & Wolf (2002). Some hypothesis tests for the covariance matrix 50 | % when the dimension is large compared to the sample size. Annals of 51 | % Statistics 30: 1081-1102 52 | % Wang, Q and Yao J (2013). On the sphericity test with large-dimensional 53 | % observations. Electronic Journal of Statistics 7: 2164-2192 54 | % Zou et al (2014). Multivariate sign-based high-dimensional tests for 55 | % sphericity. 
Biometrika 101: 229-236 56 | % 57 | % SEE ALSO 58 | % DepTest2, UniSphereTest 59 | 60 | % $ Copyright (C) 2017 Brian Lau, brian.lau@upmc.fr $ 61 | % The full license and most recent version of the code can be found 62 | % https://github.com/brian-lau/highdim 63 | % 64 | % This program is free software: you can redistribute it and/or modify 65 | % it under the terms of the GNU General Public License as published by 66 | % the Free Software Foundation, either version 3 of the License, or 67 | % (at your option) any later version. 68 | % 69 | % This program is distributed in the hope that it will be useful, 70 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 71 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 72 | % GNU General Public License for more details. 73 | 74 | classdef DepTest1 < handle 75 | properties 76 | x 77 | end 78 | properties (Dependent=true,SetAccess=private) 79 | n 80 | p 81 | end 82 | properties 83 | test 84 | params 85 | alpha = 0.05; 86 | end 87 | properties (SetAccess=private) 88 | stat 89 | pval 90 | h 91 | runtime 92 | end 93 | properties (Hidden=true,SetAccess=private) 94 | mc % monte carlo samples of empirical null distribution 95 | autoRun 96 | validTests = {'spearman' 'kendall' 'sign' 'bcs' ... 97 | 'john' 'nagao' 'wang'}; 98 | end 99 | properties(SetAccess = protected) 100 | version = '0.1.0' 101 | end 102 | 103 | methods 104 | function self = DepTest1(varargin) 105 | if (nargin == 1) || (rem(nargin,2) == 1) 106 | varargin = {'x' varargin{:}}; 107 | end 108 | 109 | par = inputParser; 110 | par.KeepUnmatched = true; 111 | addParamValue(par,'x',[],@isnumeric); 112 | addParamValue(par,'autoRun',true,@islogical); 113 | addParamValue(par,'test','spearman',@ischar); 114 | parse(par,varargin{:}); 115 | 116 | self.autoRun = par.Results.autoRun; 117 | self.params = par.Unmatched; 118 | self.test = par.Results.test; 119 | self.x = par.Results.x; 120 | end 121 | 122 | function set.x(self,x) 123 | [n,p] = size(x); 124 | % Clear cache of monte-carlo samples if dimensions change 125 | % Only applies for rank-based tests of independence 126 | if (self.n~=n) || (self.p~=p) 127 | self.mc = []; 128 | end 129 | self.x = x; 130 | if ~isempty(self.x) && self.autoRun 131 | self.run(); 132 | end 133 | end 134 | 135 | function set.test(self,test) 136 | test = lower(test); 137 | if any(strcmp(test,self.validTests)) 138 | self.test = test; 139 | if ~isempty(self.x) && self.autoRun 140 | self.run(); 141 | end 142 | else 143 | error('Invalid test'); 144 | end 145 | end 146 | 147 | function set.params(self,params) 148 | self.params = params; 149 | if ~isempty(self.x) && self.autoRun 150 | self.run(); 151 | end 152 | end 153 | 154 | function set.alpha(self,alpha) 155 | assert((alpha>0)&&(alpha<1),'00)&&(alpha<1),'0=stat)/nboot; 197 | end 198 | end 199 | end -------------------------------------------------------------------------------- /+dep/dcovtest.m: -------------------------------------------------------------------------------- 1 | % DCOVTEST Distance covariance test of independence 2 | % 3 | % [pval,r,stat,null] = dcovtest(x,y,varargin) 4 | % 5 | % Given a sample X1,...,Xn from a p-dimensional multivariate distribution, 6 | % and a sample Y1,...,Xn from a q-dimensional multivariate distribution, 7 | % test the hypothesis: 8 | % 9 | % H0 : X and Y are mutually independent 10 | % 11 | % This hypothesis is tested using several different permutation methods. 
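% In its simplest form ('perm-brute' below), such a test is just a few lines;
% the following sketch is illustrative only, the methods below implement
% faster equivalents of the same idea:
%
%    d = dep.dcov(x,y);                          % observed statistic
%    null = zeros(nboot,1);
%    for i = 1:nboot
%       null(i) = dep.dcov(x,y(randperm(n),:));  % recompute with permuted y
%    end
%    pval = (1 + sum(null > d)) / (1 + nboot);
%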
12 | % 13 | % The default permutation method avoids permuting the data altogether 14 | % by approximating the permutation distribution using a moment-matched 15 | % Pearson Type III distribution (Bilodeau & Guetsop Nangue 2017; Josse 16 | % et al 2008; Minas & Montana 2014). The first three moments of the 17 | % permutation distribution can be calculated exactly for distance 18 | % covariance and related statistics (Kazi-Aoual et al 1995), and the 19 | % Pearson type III fit using these moments is a robust and accurate 20 | % approximation to the null distribution (Josse et al 2008). Since this 21 | % method does not actually permute the data, it is very fast, achieving 22 | % the same statistical power that would otherwise require millions of 23 | % permutations (Minas & Montana, 2014). 24 | % 25 | % Testing using actual permutations of the data are also implemented. 26 | % Naive permutation of the rows of X or Y is expensive due to O(n^2) 27 | % distance calculations. This can be avoided since it is equivalent to 28 | % simultaneously permuting the rows and columns of the distance matrix, 29 | % and recomputing the statistic with the permuted distance matrix. 30 | % 31 | % INPUTS 32 | % x - [n x p] n samples of dimensionality p 33 | % y - [n x q] n samples of dimensionality q 34 | % 35 | % OPTIONAL (as name/value pairs, order irrelevant) 36 | % method - 'pearson' - Pearson type III approx by moment matching (DEFAULT) 37 | % 'perm' - randomization using permutation of the rows & 38 | % columns of the double-centered distance matrices 39 | % 'perm-dist' - randomization using permutation of the rows & 40 | % columns of distance matrices 41 | % 'perm-brute' - brute force randomization, directly permuting 42 | % one of the inputs, which requires recalculating 43 | % and centering distance matrices 44 | % nboot - # permutations if method != 'pearson' 45 | % 46 | % OUTPUTS 47 | % pval - p-value 48 | % d - distance covariance 49 | % stat - test statistic 50 | % null - permutation statistics 51 | % 52 | % EXAMPLE 53 | % rng(1234); 54 | % p = 100; 55 | % n = 2000; 56 | % X = rand(n,p); Y = X.^2 + 15*randn(n,p); 57 | % 58 | % tic;[pval,d] = dep.dcovtest(X,Y,'method','pearson'); toc 59 | % [pval, d] 60 | % tic;[pval,d] = dep.dcovtest(X,Y,'method','pearson','unbiased',true); toc 61 | % [pval, d] 62 | % tic;[pval,d] = dep.dcovtest(X,Y,'method','perm','nboot',200);toc 63 | % [pval, d] 64 | % tic;[pval,d] = dep.dcovtest(X,Y,'method','perm-brute','nboot',200);toc 65 | % [pval, d] 66 | % 67 | % REFERENCE 68 | % Bilodeau & Guetsop Nangue (2017). Approximations to permutation tests 69 | % of independence between two random vectors. 70 | % Computational Statistics & Data Analysis, submitted. 71 | % Josse, Pages & Husson (2008). Testing the significance of the RV 72 | % coefficient. Computational Statistics and Data Analysis. 53: 82-91 73 | % Kazi-Aoual et al (1995). Refined approximations to permutation tests 74 | % for multivariate inference. Computational Statistics & Data Analysis. 75 | % 20: 643-656 76 | % Minas & Montana (2014). Distance-based analysis of variance: 77 | % Approximate inference. Statistical Analysis & Data Mining. 
7: 450-470 78 | % 79 | % SEE ALSO 80 | % dcov, dcorr, dcorrtest, DepTest2 81 | 82 | % $ Copyright (C) 2017 Brian Lau, brian.lau@upmc.fr $ 83 | % The full license and most recent version of the code can be found at: 84 | % https://github.com/brian-lau/highdim 85 | % 86 | % This program is free software: you can redistribute it and/or modify 87 | % it under the terms of the GNU General Public License as published by 88 | % the Free Software Foundation, either version 3 of the License, or 89 | % (at your option) any later version. 90 | % 91 | % This program is distributed in the hope that it will be useful, 92 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 93 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 94 | % GNU General Public License for more details. 95 | 96 | function [pval,d,stat,varargout] = dcovtest(x,y,varargin) 97 | 98 | par = inputParser; 99 | par.KeepUnmatched = true; 100 | addRequired(par,'x',@isnumeric); 101 | addRequired(par,'y',@isnumeric); 102 | addParamValue(par,'method','pearson',@ischar); 103 | addParamValue(par,'nboot',999,@(x) isnumeric(x) && isscalar(x)); 104 | parse(par,x,y,varargin{:}); 105 | 106 | [n,~] = size(x); 107 | assert(n == size(y,1),'DCOVTEST requires x and y to have the same # of samples'); 108 | 109 | permMethods = {'perm' 'perm-dist' 'perm-brute'}; 110 | nboot = par.Results.nboot; 111 | method = lower(par.Results.method); 112 | 113 | switch method 114 | case {'pearson'} 115 | [d,~,~,A,B] = dep.dcov(x,y,par.Unmatched); 116 | 117 | if isfield(par.Unmatched,'unbiased') && par.Unmatched.unbiased 118 | stat = (n*(n-3))*d; % = sum(sum(A.*B)) for unbiased estimator 119 | else 120 | stat = (n^2)*d^2; % = sum(sum(A.*B)) for biased estimator 121 | end 122 | 123 | [pval,stat] = utils.pearsonIIIpval(A,B,stat); 124 | case {'perm'} 125 | if isfield(par.Unmatched,'unbiased') && par.Unmatched.unbiased 126 | % This only works for BIASED estimator, since distance matrices are 127 | % necessary for calculating the UNBIASED estimator 128 | error('Cannot use unbiased estimator for method = ''perm'''); 129 | end 130 | [d,~,~,A,B] = dep.dcov(x,y); 131 | 132 | null = zeros(nboot,1); 133 | for i = 1:nboot 134 | ind = randperm(n); 135 | null(i) = dep.dcov(A,B(ind,ind),'doublecenter',true); 136 | end 137 | case {'perm-dist'} 138 | a = sqrt(utils.sqdist(x,x)); 139 | b = sqrt(utils.sqdist(y,y)); 140 | d = dep.dcov(a,b,'dist',true); 141 | 142 | null = zeros(nboot,1); 143 | for i = 1:nboot 144 | ind = randperm(n); 145 | null(i) = dep.dcov(a,b(ind,ind),'dist',true); 146 | end 147 | case {'perm-brute'} 148 | d = dep.dcov(x,y,par.Unmatched); 149 | 150 | null = zeros(nboot,1); 151 | for i = 1:nboot 152 | ind = randperm(n); 153 | null(i) = dep.dcov(x,y(ind,:),par.Unmatched); 154 | end 155 | otherwise 156 | error('Unrecognized test method'); 157 | end 158 | 159 | % One of the permutation methods 160 | if any(strcmp(method,permMethods)) 161 | if ~exist('stat','var') 162 | stat = d; 163 | end 164 | pval = (1 + sum(null>stat)) / (1 + nboot); 165 | end 166 | 167 | if nargout == 4 168 | if exist('null','var') 169 | varargout{1} = null; 170 | else 171 | varargout{1} = []; 172 | end 173 | end -------------------------------------------------------------------------------- /+utils/rfm.m: -------------------------------------------------------------------------------- 1 | % RFM Random feature maps for Gaussian kernel 2 | % 3 | % [phi,W,rngState] = rfm(X,varargin) 4 | % 5 | % INPUTS 6 | % X - [n x d] n samples of dimensionality d 7 | % 8 | % OPTIONAL 9 | % sigma 
- scalar, standard deviation of Gaussian kernel, default = 1
10 | %     sampling - string indicating method for sampling random features
11 | %                'uniform' - Classic random Fourier features (DEFAULT)
12 | %                'qmc'     - Quasi-Monte Carlo using Halton sequence
13 | %                'orf'     - Orthogonal Random Features
14 | %                'sorf'    - Structured Orthogonal Random Features
15 | %                'mm'      - Moment-Matched
16 | %     D        - scalar, target dimensionality of feature map
17 | %     W        - [D x d] pre-computed feature map, convenience for
18 | %                applying the feature map to new data
19 | %     complex  - boolean, true returns map as complex
20 | %     sincos   - boolean, true returns sin/cos embedding, default = true
21 | %     The following parameters are specific to sampling = 'qmc'
22 | %     skip     - scalar, # initial points to omit, default = 1000
23 | %     leap     - scalar, # points in between sets, default = 700
24 | %     scramble - boolean, scramble sequence, default = true
25 | %     state    - scalar, state of qmc generator
26 | %
27 | % OUTPUTS
28 | %     phi      - feature mapped data
29 | %                [n x D] when 'complex' = true
30 | %                [n x 2D] when 'complex' = false, cos and sin components stacked
31 | %     W        - [D x d] feature map
32 | %     rngState - state of the RNG before sampling
33 | %
34 | % REFERENCES
35 | %   Felix et al (2016). Orthogonal random features. Advances in Neural
36 | %     Information Processing Systems, 1975-1983
37 | %   Rahimi & Recht (2007). Random features for large-scale kernel machines.
38 | %     Proc 20th Int Conf on Neural Information Processing Systems, 1177-1184
39 | %   Shen et al (2017). Random features for shift-invariant kernels with
40 | %     moment matching. Proc 31st AAAI Conf on AI, 2520-2526
41 | %   Sutherland & Schneider (2015). On the error of random fourier features.
42 | %     UAI'15 Proc 31st Conf on Uncertainty in AI, 862-871
43 | %   Yang et al (2014). Quasi-Monte Carlo feature maps for shift-invariant
44 | %     kernels. Proc 31st Int Conf on Machine Learning (ICML-14), 485-493
45 |
46 | % $ Copyright (C) 2017 Brian Lau, brian.lau@upmc.fr $
47 | % The full license and most recent version of the code can be found at:
48 | % https://github.com/brian-lau/highdim
49 | %
50 | % This program is free software: you can redistribute it and/or modify
51 | % it under the terms of the GNU General Public License as published by
52 | % the Free Software Foundation, either version 3 of the License, or
53 | % (at your option) any later version.
54 | %
55 | % This program is distributed in the hope that it will be useful,
56 | % but WITHOUT ANY WARRANTY; without even the implied warranty of
57 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
58 | % GNU General Public License for more details.
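% NOTE (a brief sketch of why this map approximates the Gaussian kernel; the
% identity follows Rahimi & Recht 2007 and matches the defaults used here):
%   rows of W are drawn as w ~ N(0, I/sigma^2), and the sin/cos embedding
%     phi(x) = sqrt(1/D) * [cos(x*W') , sin(x*W')]
%   satisfies
%     E[ phi(x)*phi(y)' ] = E[ cos(w'*(x-y)) ] = exp(-||x-y||^2/(2*sigma^2)),
%   so phi(x)*phi(y)' is an unbiased estimate of the Gaussian kernel, with
%   error decreasing as D grows (see Sutherland & Schneider 2015 for rates).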
59 |
60 | % TODO
61 | % o ORF, SORF should probably be run in blocks
62 | %   currently generates W that is dxd and extracts Dxd segment
63 | % o fastfood
64 | % o better default D
65 |
66 | function [phi,W,rngState] = rfm(X,varargin)
67 | persistent pstream; % for qmc
68 |
69 | par = inputParser;
70 | par.KeepUnmatched = true;
71 | addRequired(par,'X',@isnumeric);
72 | addParamValue(par,'sigma',[],@(x) isnumeric(x) && isscalar(x));
73 | addParamValue(par,'sampling','uniform',@ischar);
74 | addParamValue(par,'W',[],@ismatrix);
75 | addParamValue(par,'D',2^4,@(x) isnumeric(x) && isscalar(x));
76 | addParamValue(par,'complex',false,@islogical);
77 | addParamValue(par,'sincos',true,@islogical);
78 | addParamValue(par,'skip',1000,@(x) isnumeric(x) && isscalar(x));
79 | addParamValue(par,'leap',700,@(x) isnumeric(x) && isscalar(x));
80 | addParamValue(par,'scramble',true,@(x) isnumeric(x) || islogical(x));
81 | addParamValue(par,'state',[],@(x) isnumeric(x) && isscalar(x));
82 | parse(par,X,varargin{:});
83 |
84 | [n,d] = size(X); % n samples of dimensionality d
85 | D = par.Results.D; % # of random bases
86 | if isempty(par.Results.sigma)
87 |    sigma = utils.sigest(X,par.Unmatched);
88 | else
89 |    sigma = par.Results.sigma;
90 | end
91 |
92 | if nargout == 3
93 |    rngState = rng;
94 | end
95 |
96 | if ~isempty(par.Results.W)
97 |    assert(size(par.Results.W,2)==d,'Feature map dimensionality must match input data');
98 |    W = par.Results.W;
99 | else
100 |    switch lower(par.Results.sampling)
101 |       case {'uniform' 'uni' 'mc' 'rff'}
102 |          % Random fourier features
103 |          W = randn(D,d)/sigma;
104 |       case {'mm'}
105 |          if D < d
106 |             warning('Risk of poor approximation for D << d');
107 |          end
108 |          G = randn(D,d);
109 |          W = utils.whiten(G)/sigma;
110 |       case {'qmc'}
111 |          if isempty(pstream) ...
112 |                || ~isa(pstream,'qrandstream') ...
113 |                || (pstream.PointSet.size(2) ~= d)
114 |             pset = haltonset(d,'Skip',par.Results.skip,...
115 |                'Leap',par.Results.leap);
116 |             if par.Results.scramble
117 |                pset = scramble(pset,'RR2');
118 |             end
119 |             % Persistent stream to properly increment draws on subsequent calls
120 |             pstream = qrandstream(pset);
121 |             %fprintf('Halton random stream opened\n')
122 |          end
123 |
124 |          if ~isempty(par.Results.state)
125 |             pstream.State = par.Results.state;
126 |          end
127 |          %fprintf('Stream state: %g\n',pstream.State);
128 |          omega = pstream.qrand(D);
129 |          W = norminv(omega,0,1)/sigma;
130 |       case {'orf'}
131 |          G = randn(max(d,D),max(d,D));
132 |          [Q,~] = qr(G);
133 |
134 |          % Chi-distributed with max(d,D) degrees of freedom
135 |          s = sqrt(chi2rnd(max(d,D),max(d,D),1));
136 |          % S ensures that the row norms of SQ & G are identically distributed
137 |          S = diag(s);
138 |
139 |          W = (S*Q)/sigma;
140 |          W = W(1:D,1:d);
141 |       case {'sorf'}
142 |          n2 = nextpow2(max(D,d));
143 |          % Brute-force matrix multiplication, O(d^2)
144 |          % H = (1/sqrt(2^n2))*hadamard(2^n2);
145 |          % D1 = diag(2*(rand(2^n2,1)<0.5) - 1);
146 |          % D2 = diag(2*(rand(2^n2,1)<0.5) - 1);
147 |          % D3 = diag(2*(rand(2^n2,1)<0.5) - 1);
148 |          % W = sqrt(2^n2)*H*D1*H*D2*H*D3;
149 |
150 |          % Using Fast Hadamard transform, O(d log d)
151 |          Ds = 2*(rand(2^n2,3)<0.5) - 1; % Rademacher distributed diagonals
152 |          HD1 = sqrt(2^n2)*utils.fwht( diag(Ds(:,1)) );
153 |          HD2 = sqrt(2^n2)*utils.fwht( diag(Ds(:,2)) );
154 |          HD3 = sqrt(2^n2)*utils.fwht( diag(Ds(:,3)) );
155 |
156 |          W = sqrt(2^n2)*HD1*HD2*HD3;
157 |          W = W(1:D,1:d)/sigma;
158 |       case {'sc'}
159 |          % Signed Circulant Matrix Projection
160 |          % http://felixyu.org/pdf/cbe_slides.pdf
161 |       otherwise
162 |          error('Unrecognized sampling method');
163 |    end
164 | end
165 |
166 | Z = X*W'; % [n x d] * [D x d]'
167 |
168 | if par.Results.sincos
169 |    % Use the version with sin & cos features, which is more accurate,
170 |    % see Sutherland & Schneider (2015)
171 |    if par.Results.complex
172 |       phi = (cos(Z) - 1i*sin(Z)) * sqrt(1/D);
173 |    else
174 |       phi = [cos(Z) , sin(Z)] * sqrt(1/D);
175 |    end
176 | else
177 |    b = rand(1,D)*2*pi;
178 |    phi = cos(bsxfun(@plus,Z,b)) * sqrt(1/D);
179 | end
180 |
--------------------------------------------------------------------------------
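A quick way to sanity-check the feature maps produced by `rfm` is to compare the approximate Gram matrix `phi*phi'` against the exact Gaussian kernel. The following is a minimal sketch, assuming the kernel convention exp(-||x-y||^2/(2*sigma^2)) noted above and using `utils.sqdist` only to build the exact Gram matrix:
```
% Relative error of the random Fourier feature approximation
n = 500; d = 10; sigma = 2;
X = randn(n,d);
phi = utils.rfm(X,'sigma',sigma,'D',2^10);    % [n x 2D] cos/sin features
Kapprox = phi*phi';                           % approximate Gram matrix
Kexact = exp(-utils.sqdist(X,X)/(2*sigma^2)); % exact Gaussian Gram matrix
norm(Kapprox-Kexact,'fro')/norm(Kexact,'fro') % relative error
```
Increasing `D`, or switching `sampling` to 'qmc' or 'orf', should reduce this error at the cost of a larger or more expensive map.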