function clusterLabels = Fair_SC_normalized(adj,k,sensitive)
%implementation of fair normalized SC as stated in Alg. 3
%
%INPUT:
%adj ... (weighted) adjacency matrix of size n x n
%k ... number of clusters
%sensitive ... vector of length n encoding the sensitive attribute; any
%              type accepted by UNIQUE (numeric, logical, char, ...)
%
%OUTPUT:
%clusterLabels ... vector of length n comprising the cluster label for each
%                  data point
%
%ERRORS:
%Fair_SC_normalized:sensitiveLength ... sensitive does not have n entries


n = size(adj, 1);

if numel(sensitive) ~= n
    error('Fair_SC_normalized:sensitiveLength', ...
        'sensitive must have one entry per vertex (expected %d, got %d).', ...
        n, numel(sensitive));
end

% converting sensitive to group indices in [h] and building F %%%
% The third output of unique is always a double index vector, so the
% relabeling cannot be truncated by the class of the input (a logical
% input used to collapse every group onto the value "true").
[groups, ~, groupIdx] = unique(sensitive);
h = numel(groups);
groupIdx = groupIdx(:);

% Column ell of F is the indicator of group ell minus that group's share
% of the data set; the last group is omitted.
F = double(groupIdx == 1:(h-1));
F = F - sum(F, 1) / n;
%%%%


degrees = sum(adj, 1);
D = diag(degrees);
L = D - adj;

% orthonormal basis of the null space of F'
Z = null(F');

% Symmetrize before taking the square root: round-off in Z'*D*Z can leave
% it slightly non-symmetric.
ZDZ = Z' * D * Z;
ZDZ = (ZDZ + ZDZ') / 2;
Q = sqrtm(ZDZ);

% Same quantity as inv(Q)' * Z' * L * Z * inv(Q), computed with linear
% solves instead of an explicit inverse.
Msymm = (Q') \ (Z' * L * Z) / Q;
Msymm = (Msymm + Msymm') / 2;

subspaceDim = min(size(Msymm, 1), max(2*k, 25));
try
    [Y, ~] = eigs(Msymm, k, 'smallestabs', 'MaxIterations', 500, 'SubspaceDimension', subspaceDim);
catch
    % deliberate best-effort fallback when the 'smallestabs' call fails
    [Y, ~] = eigs(Msymm, k, 'smallestreal', 'MaxIterations', 1000, 'SubspaceDimension', subspaceDim);
end

% map the eigenvectors back to the original n-dimensional space
H = Z * (Q \ Y);

clusterLabels = kmeans(H, k, 'Replicates', 10);
end
function clusterLabels = Fair_SC_unnormalized(adj,k,sensitive)
%implementation of fair unnormalized SC as stated in Alg. 2
%
%INPUT:
%adj ... (weighted) adjacency matrix of size n x n
%k ... number of clusters
%sensitive ... vector of length n encoding the sensitive attribute; any
%              type accepted by UNIQUE (numeric, logical, char, ...)
%
%OUTPUT:
%clusterLabels ... vector of length n comprising the cluster label for each
%                  data point
%
%ERRORS:
%Fair_SC_unnormalized:sensitiveLength ... sensitive does not have n entries


n = size(adj, 1);

if numel(sensitive) ~= n
    error('Fair_SC_unnormalized:sensitiveLength', ...
        'sensitive must have one entry per vertex (expected %d, got %d).', ...
        n, numel(sensitive));
end

% converting sensitive to group indices in [h] and building F %%%
% The third output of unique is always a double index vector, so the
% relabeling cannot be truncated by the class of the input (a logical
% input used to collapse every group onto the value "true").
[groups, ~, groupIdx] = unique(sensitive);
h = numel(groups);
groupIdx = groupIdx(:);

% Column ell of F is the indicator of group ell minus that group's share
% of the data set; the last group is omitted.
F = double(groupIdx == 1:(h-1));
F = F - sum(F, 1) / n;
%%%%


degrees = sum(adj, 1);
D = diag(degrees);
L = D - adj;

% orthonormal basis of the null space of F'
Z = null(F');

% Laplacian restricted to the null space of F', symmetrized against round-off
Msymm = Z' * L * Z;
Msymm = (Msymm + Msymm') / 2;

subspaceDim = min(size(Msymm, 1), max(2*k, 25));
try
    [Y, ~] = eigs(Msymm, k, 'smallestabs', 'MaxIterations', 500, 'SubspaceDimension', subspaceDim);
catch
    % deliberate best-effort fallback when the 'smallestabs' call fails
    [Y, ~] = eigs(Msymm, k, 'smallestreal', 'MaxIterations', 1000, 'SubspaceDimension', subspaceDim);
end

% map the eigenvectors back to the original n-dimensional space
H = Z * Y;

clusterLabels = kmeans(H, k, 'Replicates', 10);
end
function clusterLabels = SC_normalized(adj,k)
%implementation of normalized SC as described in Appendix A
%
%INPUT:
%adj ... (weighted) adjacency matrix of size n x n; every vertex must have
%        strictly positive degree
%k ... number of clusters
%
%OUTPUT:
%clusterLabels ... vector of length n comprising the cluster label for each
%                  data point
%
%ERRORS:
%SC_normalized:isolatedVertex ... some vertex has non-positive degree


n = size(adj, 1);

degrees = sum(adj, 1);

% D^(-1/2) is undefined for a zero degree: 1./sqrt(0) is Inf and would put
% NaN/Inf into the normalized Laplacian or the embedding. Fail early with a
% clear message instead.
if any(degrees <= 0)
    error('SC_normalized:isolatedVertex', ...
        'Every vertex must have positive degree; %d vertices do not.', ...
        nnz(degrees <= 0));
end

D = diag(degrees);
Dsqinv = diag(1 ./ sqrt(degrees));

L = D - adj;

% D^(-1/2) * L * D^(-1/2), symmetrized against round-off
Msymm = Dsqinv * L * Dsqinv;
Msymm = (Msymm + Msymm') / 2;

subspaceDim = min(n, max(2*k, 25));
try
    [H, ~] = eigs(Msymm, k, 'smallestabs', 'MaxIterations', 500, 'SubspaceDimension', subspaceDim);
catch
    % deliberate best-effort fallback when the 'smallestabs' call fails
    [H, ~] = eigs(Msymm, k, 'smallestreal', 'MaxIterations', 1000, 'SubspaceDimension', subspaceDim);
end

% rows of D^(-1/2) * H are the points handed to k-means
clusterLabels = kmeans(Dsqinv * H, k, 'Replicates', 10);
end
function clusterLabels = SC_unnormalized(adj,k)
%implementation of unnormalized SC as stated in Alg. 1
%
%INPUT:
%adj ... (weighted) adjacency matrix of size n x n
%k ... number of clusters
%
%OUTPUT:
%clusterLabels ... vector of length n comprising the cluster label for each
%                  data point


numVertices = size(adj, 1);

% unnormalized graph Laplacian: degree matrix minus adjacency matrix
laplacian = diag(sum(adj, 1)) - adj;

% Krylov subspace size shared by both eigs attempts
subspaceDim = min(numVertices, max(2*k, 25));

try
    [embedding, ~] = eigs(laplacian, k, 'smallestabs', ...
        'MaxIterations', 500, 'SubspaceDimension', subspaceDim);
catch
    % second attempt with a different selection rule and more iterations
    [embedding, ~] = eigs(laplacian, k, 'smallestreal', ...
        'MaxIterations', 1000, 'SubspaceDimension', subspaceDim);
end

% cluster the rows of the eigenvector matrix
clusterLabels = kmeans(embedding, k, 'Replicates', 10);
end