├── README.md
├── algorithm_matroid_center.py
├── algorithms.py
├── experiments_adult_data_set.py
├── experiments_artificial_data.py
└── experiments_matroid_center.py


/README.md:
--------------------------------------------------------------------------------
 1 | # fair_k_center_clustering
 2 | 
 3 | Code for our paper "Fair k-Center Clustering for Data Summarization" (https://arxiv.org/abs/1901.08628).
 4 | 
 5 | To try it out and reproduce the boxplots (based on 10 runs) of the experiments of Figures 4 to 6 on artificial data, simply run
 6 | 
 7 | ```
 8 | python experiments_artificial_data.py 
 9 | ```
10 | 
11 | If you want to obtain the boxplots based on 50 runs, say, then run
12 | 
13 | ```
14 | python experiments_artificial_data.py 50
15 | ```
16 | 
17 | Similarly, in order to reproduce the boxplots of the experiments of Figures 5 and 6 on the Adult data set, run
18 | 
19 | ```
20 | python experiments_adult_data_set.py 50
21 | ```
22 | 
23 | If you want to compare our algorithm to the algorithm for the matroid center problem by Chen et al. (https://arxiv.org/abs/1301.0745), you need to have SageMath (http://www.sagemath.org/) installed on your system. Then simply run
24 | 
25 | ```
26 | sage -python experiments_matroid_center.py 50
27 | ```
28 | 
29 | 
30 | The code has been tested with the following software versions:
31 | - Python 2.7.10
32 | - Numpy 1.16.2
33 | - Scipy 1.1.0
34 | - Scikit-learn 0.19.1
35 | - Pandas 0.23.0
36 | - SageMath 8.2
37 | 


--------------------------------------------------------------------------------
/algorithm_matroid_center.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import scipy.sparse.csgraph
  3 | 
  4 | from sage.all import *
  5 | import sage.matroids.matroid
  6 | import sage.matroids.constructor
  7 | 
  8 | 
  9 | 
 10 | ########################################################################################################################
 11 | ### Implementation of the 3-approximation algorithm for the matroid center problem proposed by Chen et al.
 12 | ### (Danny Z. Chen, Jian Li, Hongyu Liang, Haitao Wang. Matroid and Knapsack Center Problems. Algorithmica, 2016)
 13 | ########################################################################################################################
 14 | 
 15 | 
 16 | class PartitionMatroid_adapted(sage.matroids.matroid.Matroid):
 17 |     '''Adaptation of the class PartitionMatroid as presented in the 'Sage Reference Manual: MatroidTheory' to the matroid
 18 |     required in the algorithm by Chen et al..
 19 | 
 20 |     partition ... list of lists specifying the partition of the groundset'''
 21 | 
 22 | 
 23 |     def __init__(self, partition):
 24 |         self.partition = partition
 25 |         E = set()
 26 |         for P in partition:
 27 |             E.update(P)
 28 |         self.E = frozenset(E)
 29 |     def groundset(self):
 30 |         return self.E
 31 |     def _rank(self, X):
 32 |         X2 = set(X)
 33 |         used_indices = set()
 34 |         rk = 0
 35 |         while len(X2) > 0:
 36 |             e = X2.pop()
 37 |             for i in range(len(self.partition)-1):
 38 |                 if e in self.partition[i]:
 39 |                     if i not in used_indices:
 40 |                         used_indices.add(i)
 41 |                         rk = rk + 1
 42 |                     break
 43 |         return rk
 44 | 
 45 | 
 46 | 
 47 | class ValidCentersMatroid(sage.matroids.matroid.Matroid):
 48 |     '''Partition matroid encoding the constraints on the centers.
 49 | 
 50 |     sexes ... integer-vector of length n with entries in 0,...,m-1, where m is the number of groups
 51 |     nr_centers_per_sex ... integer-vector of length m with entries in 0,...,k and sum over entries equaling k'''
 52 | 
 53 | 
 54 |     def __init__(self, sexes,nr_centers_per_sex):
 55 |         self.n = sexes.size
 56 |         self.sexes=sexes
 57 |         self.m=nr_centers_per_sex.size
 58 |         self.nr_centers_per_sex=nr_centers_per_sex
 59 |         self.E = frozenset(np.arange(self.n))
 60 |     def groundset(self):
 61 |         return self.E
 62 |     def _rank(self, X):
 63 |         X2 = set(X)
 64 |         nr_elem_in_groups=np.zeros(self.m,dtype=int)
 65 |         while len(X2) > 0:
 66 |             e = X2.pop()
 67 |             nr_elem_in_groups[self.sexes[e]]+=1
 68 |         return np.sum(np.minimum(nr_elem_in_groups,self.nr_centers_per_sex))
 69 | 
 70 | 
 71 | 
 72 | 
 73 | 
 74 | def MatCenter_binary_search(dmat, sexes, nr_centers_per_sex):
 75 |     '''Implementation of the algorithm by Chen et al..
 76 | 
 77 |     *) Rather than testing all distance values as threshold as suggested by Chen et al., we implement binary search to
 78 |     look for the optimal value.
 79 |     *) There might be a faster way than running Floyd-Warshall for every distance value that we are testing, however,
 80 |     in our experiments the time for doing so is negligible (for n<=250, the execution of the first five commands within the
 81 |     while-loop never takes more than 0.02 seconds).
 82 | 
 83 |     INPUT:
 84 |     dmat ... distance matrix of size nxn
 85 |     sexes ... integer-vector of length n with entries in 0,...,m-1, where m is the number of groups
 86 |     nr_centers_per_sex ... integer-vector of length m with entries in 0,...,k and sum over entries equaling k
 87 | 
 88 |     RETURNS: approx. optimal centers'''
 89 | 
 90 | 
 91 |     n = dmat.shape[0]
 92 |     m = nr_centers_per_sex.size
 93 |     k = np.sum(nr_centers_per_sex)
 94 | 
 95 |     ConstrMatr =ValidCentersMatroid(sexes,nr_centers_per_sex)
 96 | 
 97 |     iu = np.triu_indices(n, 1)
 98 |     dval=np.sort(dmat[iu])
 99 | 
100 |     best_cost=np.inf
101 |     best_centers=np.array([],dtype=int)
102 | 
103 | 
104 |     while dval.size>0:
105 | 
106 |         thelp=int(np.floor(dval.size/2))
107 |         cand_dist=dval[thelp]
108 | 
109 |         dmat_c=dmat.copy()
110 |         dmat_c[dmat_c>cand_dist]=0
111 |         scipy.sparse.csgraph.floyd_warshall(dmat_c, directed=False, overwrite=True)
112 | 
113 |         CC=np.array([],dtype=int)
114 |         VV=np.zeros(n)
115 |         parti=[]
116 |         parti2=np.arange(n)
117 | 
118 |         while np.sum(VV)<n:
119 |             v=np.where(VV==0)[0][0]
120 |             np.append(CC,v)
121 |             temp=np.where(dmat_c[v, :] <= (cand_dist))[0]
122 |             parti.append(temp)
123 |             parti2=np.setdiff1d(parti2,temp)
124 |             VV[dmat_c[v,:]<=(2*cand_dist)]=1
125 | 
126 |         parti.append(parti2)
127 |         PartMatr=PartitionMatroid_adapted(parti)
128 | 
129 |         ttemp=np.array(list(ConstrMatr.intersection_unweighted(PartMatr)),dtype=int)
130 | 
131 |         if ttemp.size<CC.size:
132 |             curr_cost=np.inf
133 |             dval = dval[(thelp+1):]
134 |         else:
135 |             curr_cost = np.amax(np.amin(dmat[np.ix_(ttemp, np.arange(n))], axis=0))
136 |             dval=dval[0:thelp]
137 | 
138 |         if curr_cost<best_cost:
139 |             best_cost=curr_cost
140 |             best_centers=ttemp
141 | 
142 | 
143 |     for ell in np.arange(m):
144 |         if np.sum(sexes[best_centers] == ell) < nr_centers_per_sex[ell]:
145 |             toadd = nr_centers_per_sex[ell] - np.sum(sexes[best_centers] == ell)
146 |             toadd_pot = np.setdiff1d(np.where(sexes == ell)[0], best_centers)
147 |             if toadd_pot.size > toadd:
148 |                 best_centers = np.hstack((best_centers, toadd_pot[0:toadd]))
149 |             else:
150 |                 best_centers = np.hstack((best_centers, toadd_pot))
151 | 
152 |     return best_centers
153 | 
154 | 
155 | 
def MatCenter_binary_search_WithGivenCenters(dmat, sexes, nr_centers_per_sex, given_centers):
    '''Wrapper that runs the algorithm by Chen et al. when some centers are given in advance.

    The given centers are moved into an additional group (with index m) whose required number of centers equals
    the number of given centers; the given centers are removed from the result again.

    INPUT:
    dmat ... distance matrix of size nxn
    sexes ... integer-vector of length n with entries in 0,...,m-1, where m is the number of groups
    nr_centers_per_sex ... integer-vector of length m with entries in 0,...,k and sum over entries equaling k
    given_centers ... integer-vector with entries in 0,...,n-1

    RETURNS: approx. optimal centers'''


    extra_group = nr_centers_per_sex.size

    # work on a copy so that the caller's group vector is not modified
    relabeled_sexes = sexes.copy()
    relabeled_sexes[given_centers] = extra_group

    extended_requirements = np.hstack((nr_centers_per_sex, given_centers.size))

    all_centers = MatCenter_binary_search(dmat, relabeled_sexes, extended_requirements)
    return np.setdiff1d(all_centers, given_centers)
174 | 
175 | 
176 | 


--------------------------------------------------------------------------------
/algorithms.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import scipy.sparse.csgraph
  3 | import itertools
  4 | 
  5 | 
  6 | 
  7 | ########################################################################################################################
  8 | def fair_k_center_exact(dmat,sexes,nr_centers_per_sex,given_centers):
  9 |     '''Exhaustive search to exactly solve the fair k-center problem (2) --- only works for small problem instances.
 10 | 
 11 |     INPUT:
 12 |     dmat ... distance matrix of size nxn
 13 |     sexes ... integer-vector of length n with entries in 0,...,m-1, where m is the number of groups
 14 |     nr_centers_per_sex ... integer-vector of length m with entries in 0,...,k and sum over entries equaling k
 15 |     given_centers ... integer-vector with entries in 0,...,n-1
 16 | 
 17 |     RETURNS: (optimal centers, clustering, optimal fair k-center cost)'''
 18 | 
 19 | 
 20 |     n = dmat.shape[0]
 21 |     m = nr_centers_per_sex.size
 22 |     k = np.sum(nr_centers_per_sex)
 23 | 
 24 |     cost = np.inf
 25 |     best_choice = []
 26 | 
 27 | 
 28 |     for mmm in itertools.combinations(np.arange(n),k):
 29 | 
 30 |         cluster_centers = np.array(mmm)
 31 | 
 32 |         curr_nr_clusters_per_sex = np.zeros(m)
 33 |         for ell in np.arange(m):
 34 |             curr_nr_clusters_per_sex[ell] = np.sum(sexes[cluster_centers]==ell)
 35 | 
 36 |         if sum(curr_nr_clusters_per_sex==nr_centers_per_sex)==m:
 37 |             curr_cost = np.amax(np.amin(dmat[np.ix_(np.hstack((cluster_centers,given_centers)), np.arange(n))],axis=0))
 38 |         else:
 39 |             curr_cost = np.inf
 40 | 
 41 |         if curr_cost<cost:
 42 |             cost = curr_cost
 43 |             best_choice = cluster_centers.copy()
 44 | 
 45 |     clustering = np.array([np.argmin(dmat[ell, np.hstack((best_choice,given_centers))]) for ell in np.arange(n)])
 46 | 
 47 |     return best_choice, clustering, cost
 48 | ########################################################################################################################
 49 | 
 50 | 
 51 | 
 52 | ########################################################################################################################
 53 | def k_center_greedy_with_given_centers(dmat,k,given_centers):
 54 |     '''Implementation of Algorithm 1.
 55 | 
 56 |     INPUT:
 57 |     dmat ... distance matrix of size nxn
 58 |     k ... integer smaller than n
 59 |     given_centers ... integer-vector with entries in 0,...,n-1
 60 | 
 61 |     RETURNS: approx. optimal centers'''
 62 | 
 63 | 
 64 |     n=dmat.shape[0]
 65 | 
 66 |     if k==0:
 67 |         cluster_centers = np.array([],dtype=int)
 68 |     else:
 69 |         if given_centers.size==0:
 70 |             cluster_centers = np.random.choice(n,1,replace=False)
 71 |             kk = 1
 72 |         else:
 73 |             cluster_centers = given_centers
 74 |             kk = 0
 75 | 
 76 |         distance_to_closest = np.amin(dmat[np.ix_(cluster_centers,np.arange(n))],axis=0)
 77 |         while kk<k:
 78 |             temp = np.argmax(distance_to_closest)
 79 |             cluster_centers = np.append(cluster_centers,temp)
 80 |             distance_to_closest = np.amin(np.vstack((distance_to_closest,dmat[temp,:])),axis=0)
 81 |             kk+=1
 82 | 
 83 |         cluster_centers = cluster_centers[given_centers.size:]
 84 | 
 85 | 
 86 |     return cluster_centers
 87 | ########################################################################################################################
 88 | 
 89 | 
 90 | 
 91 | ########################################################################################################################
def fair_k_center_APPROX(dmat,sexes,nr_centers_per_sex,given_centers):
    '''Implementation of Algorithm 4.

    Runs the unconstrained greedy strategy, tries to repair the group counts of the resulting centers via
    swapping_graph, and, if that does not fully succeed, recurses on the sub-problem formed by the clusters whose
    centers belong to the groups returned by swapping_graph.

    INPUT:
    dmat ... distance matrix of size nxn
    sexes ... integer-vector of length n with entries in 0,...,m-1, where m is the number of groups
    nr_centers_per_sex ... integer-vector of length m with entries in 0,...,k and sum over entries equaling k
    given_centers ... integer-vector with entries in 0,...,n-1

    RETURNS: approx. optimal centers'''


    n = dmat.shape[0]
    m = nr_centers_per_sex.size
    k = np.sum(nr_centers_per_sex)


    if m==1:
        # only one group: there is no fairness constraint to take care of, plain greedy suffices
        cluster_centers = k_center_greedy_with_given_centers(dmat, k, given_centers)

    else:
        # step 1: unconstrained greedy centers
        cluster_centersTE = k_center_greedy_with_given_centers(dmat, k, given_centers)

        # number of greedy centers per group (not used further within this function)
        CURRENT_nr_clusters_per_sex = np.zeros(m, dtype=int)
        for ell in np.arange(k):
            CURRENT_nr_clusters_per_sex[sexes[cluster_centersTE[ell]]] += 1

        # assign every point to its closest center; values <k refer to new centers, values >=k to given centers
        partition = np.array([np.argmin(dmat[ell, np.hstack((cluster_centersTE,given_centers))]) for ell in np.arange(n)])
        # step 2: swapping_graph only sees the points assigned to a NEW center, so the centers are passed as
        # positions within that restricted point set ...
        G,centersTE = swapping_graph(partition[partition<k], np.array([np.where(np.arange(n)[partition < k] ==
                            cluster_centersTE[ell])[0][0] for ell in np.arange(k)]),sexes[partition<k], nr_centers_per_sex)
        # ... and the returned (possibly swapped) centers are mapped back to indices in 0,...,n-1
        cluster_centersTE=np.arange(n)[partition < k][centersTE]


        if G.size==0:
            # swapping_graph reports that the centers meet the requirement for every group
            cluster_centers=cluster_centersTE
        else:
            # step 3: build the sub-problem; clusters whose center's group is in G are solved recursively,
            # the centers of all other clusters are kept and act as given centers in the recursion
            new_data_set=np.array([],dtype=int)
            new_given_centersT=np.array([],dtype=int)
            for ell in np.arange(k):
                if np.isin(sexes[cluster_centersTE[ell]],G):
                    new_data_set=np.hstack((new_data_set,np.where(partition==ell)[0]))
                else:
                    new_given_centersT=np.hstack((new_given_centersT,cluster_centersTE[ell]))
            new_given_centers=np.hstack((new_given_centersT,given_centers))
            # relabel the groups in G as 0,...,len(G)-1 (in the order in which they appear in G)
            sexes_new = sexes[new_data_set]
            sexes_newT=np.zeros(new_data_set.size,dtype=int)
            cc=0
            for ell in G:
                sexes_newT[sexes_new==ell]=cc
                cc+=1
            # the given centers are appended at the end of the sub-problem's data set; they get group label 0
            new_data_set=np.hstack((new_data_set,new_given_centers))
            sexes_newT=np.hstack((sexes_newT,np.zeros(new_given_centers.size,dtype=int)))

            # recursive call; the last new_given_centers.size points of the sub-problem are its given centers
            cluster_centers_rek=fair_k_center_APPROX(dmat[np.ix_(new_data_set,new_data_set)], sexes_newT,
                                    nr_centers_per_sex[G],np.arange(new_data_set.size-new_given_centers.size,new_data_set.size))



            # groups outside G that have fewer kept centers than requested are filled up with further points
            # of that group (the first ones in index order that are not yet a kept center)
            new_given_centersT_additional= np.array([],dtype=int)
            for ell in np.setdiff1d(np.arange(m),G):
                if np.sum(sexes[new_given_centersT]==ell)<nr_centers_per_sex[ell]:
                    toadd=nr_centers_per_sex[ell]-np.sum(sexes[new_given_centersT]==ell)
                    toadd_pot=np.setdiff1d(np.where(sexes == ell)[0], new_given_centersT)
                    if toadd_pot.size>toadd:
                        new_given_centersT_additional=np.hstack((new_given_centersT_additional,toadd_pot[0:toadd]))
                    else:
                        new_given_centersT_additional = np.hstack((new_given_centersT_additional, toadd_pot))

            # kept centers + fill-up centers + centers of the recursive call (mapped back to original indices)
            cluster_centers=np.hstack((new_given_centersT,new_given_centersT_additional,new_data_set[cluster_centers_rek]))


    return cluster_centers
164 | ########################################################################################################################
165 | 
166 | 
167 | 
168 | ########################################################################################################################
def _group_reachability(sex_of_assigned_center, sexes, m):
    '''Shortest-path distances and predecessors in the directed graph on the m groups that has an edge
    (a,b) whenever some point of group b is assigned to a center of group a.'''
    adjacency = np.zeros((m, m))
    adjacency[sex_of_assigned_center, sexes] = 1
    return scipy.sparse.csgraph.shortest_path(adjacency, directed=True, return_predecessors=True)


def _find_swap_path(group_dist, predec, current_counts, required_counts):
    '''Return the first path (as array of groups) from a group with too many centers to a group with too few
    centers, or None if no such path exists.'''
    m = required_counts.size
    for source in np.arange(m):
        if not (current_counts[source] > required_counts[source]):
            continue
        for target in np.arange(m):
            if current_counts[target] < required_counts[target] and group_dist[source, target] != np.inf:
                # walk the predecessor matrix backwards from target until source is reached
                path = np.array([target])
                while path[0] != source:
                    path = np.hstack((predec[source, path[0]], path))
                return path
    return None


def swapping_graph(partition,centers,sexes,nr_centers_per_sex):
    '''Implementation of Algorithm 3.

    Repeatedly moves centers along paths in the graph on the groups until no group with too many centers can
    reach a group with too few centers anymore. NOTE: centers is modified in place (and also returned).

    INPUT:
    partition ... integer-vector of length n with entries in 0 ... k-1
    centers ... integer-vector of length k with entries in 0 ... n-1
    sexes ... integer-vector of length n with entries in 0 ... m-1
    nr_centers_per_sex ... integer-vector of length m with entries in 0,...,k and sum over entries equaling k

    RETURNS: (G, swapped centers), where G is empty if every group has exactly the required number of centers
    and otherwise holds the groups with too many centers together with all groups reachable from them'''


    m = nr_centers_per_sex.size

    # how many centers every group currently has
    counts = np.zeros(m, dtype=int)
    for center in centers:
        counts[sexes[center]] += 1

    # group of the center that each point is assigned to
    sex_of_assigned_center = sexes[centers[partition]]

    dmat_gr, predec = _group_reachability(sex_of_assigned_center, sexes, m)
    path = _find_swap_path(dmat_gr, predec, counts, nr_centers_per_sex)

    while path is not None:

        # along every edge (a,b) of the path, the first point of group b in a cluster with a center of
        # group a becomes the new center of its cluster
        for from_group, to_group in zip(path[:-1], path[1:]):
            movable = np.where((sexes == to_group) & (sex_of_assigned_center == from_group))[0]
            if movable.size > 0:
                point = movable[0]
                centers[partition[point]] = point
                sex_of_assigned_center[partition == partition[point]] = sexes[point]

        # net effect: first group of the path loses a center, last group gains one
        counts[path[0]] -= 1
        counts[path[-1]] += 1

        dmat_gr, predec = _group_reachability(sex_of_assigned_center, sexes, m)
        path = _find_swap_path(dmat_gr, predec, counts, nr_centers_per_sex)

    if np.sum(counts == nr_centers_per_sex) == m:
        return np.array([]), centers

    # groups with too many centers, extended by every group reachable from a group already in G
    G = np.where(counts > nr_centers_per_sex)[0]
    for source in np.arange(m):
        for target in np.arange(m):
            if (dmat_gr[source, target] != np.inf) and np.isin(source, G) and (not np.isin(target, G)):
                G = np.hstack((G, target))

    return G, centers
257 | ########################################################################################################################
258 | 
259 | 
260 | 
261 | ########################################################################################################################
def heuristic_greedy_on_each_group(dmat,sexes,nr_centers_per_sex,given_centers):
    '''Implementation of Heuristic A as described in Section 5.3: the greedy strategy is run separately on every
    group, using the given centers that belong to that group as its initially given centers.

    INPUT:
    dmat ... distance matrix of size nxn
    sexes ... integer-vector of length n with entries in 0,...,m-1, where m is the number of groups
    nr_centers_per_sex ... integer-vector of length m with entries in 0,...,k and sum over entries equaling k
    given_centers ... integer-vector with entries in 0,...,n-1

    RETURNS: heuristically chosen centers'''


    # start with an empty integer array so that the result is an integer array even if nothing is chosen
    chosen_per_group = [np.array([],dtype=int)]

    for group in np.arange(nr_centers_per_sex.size):
        members = np.where(sexes==group)[0]
        # positions (within members) of those group members that are given centers
        given_within_group = np.where(np.isin(members,given_centers))[0]

        local_centers = k_center_greedy_with_given_centers(dmat[np.ix_(members, members)],
                                                           nr_centers_per_sex[group], given_within_group)

        # translate positions within the group back to indices in 0,...,n-1
        chosen_per_group.append(members[local_centers])

    return np.hstack(chosen_per_group)
288 | ########################################################################################################################
289 | 
290 | 
291 | 
292 | ########################################################################################################################
def heuristic_greedy_till_constraint_is_satisfied(dmat,sexes,nr_centers_per_sex,given_centers):
    '''Implementation of Heuristic B as described in Section 5.3: the greedy strategy, where in every round only
    points from groups that have not yet reached their required number of centers may be chosen.

    INPUT:
    dmat ... distance matrix of size nxn
    sexes ... integer-vector of length n with entries in 0,...,m-1, where m is the number of groups
    nr_centers_per_sex ... integer-vector of length m with entries in 0,...,k and sum over entries equaling k
    given_centers ... integer-vector with entries in 0,...,n-1

    RETURNS: heuristically chosen centers

    SINCE WE DO NOT REQUIRE THE GENERAL CASE IN OUR EXPERIMENTS AND IT ALLOWS FOR A SLIGHTLY SIMPLER CODE,
    HERE WE ASSUME THAT EITHER length(given_centers)>0 OR nr_centers_per_sex[i]>0 FOR ALL i'''


    k = np.sum(nr_centers_per_sex)
    if k==0:
        return np.array([], dtype=int)

    n = dmat.shape[0]
    m = nr_centers_per_sex.size

    # number of centers chosen so far in every group (given centers are not counted)
    chosen_per_group = np.zeros(m)

    if given_centers.size==0:
        # random first center; by the assumption above its group requires at least one center
        centers = np.random.choice(n,1,replace=False)
        chosen_per_group[sexes[centers]] += 1
        nr_chosen = 1
    else:
        centers = given_centers
        nr_chosen = 0

    dist_to_centers = np.amin(dmat[np.ix_(centers, np.arange(n))], axis=0)

    for _ in range(nr_chosen, k):
        # only groups that still need centers may contribute the next center
        open_groups = np.where(chosen_per_group<nr_centers_per_sex)[0]
        candidates = np.where(np.isin(sexes,open_groups))[0]
        farthest = candidates[np.argmax(dist_to_centers[candidates])]

        chosen_per_group[sexes[farthest]] += 1
        centers = np.append(centers, farthest)
        dist_to_centers = np.minimum(dist_to_centers, dmat[farthest, :])

    # drop the given centers, which occupy the leading positions
    return centers[given_centers.size:]
338 | ########################################################################################################################
339 | 
340 | 
341 | 
342 | 
343 | 


--------------------------------------------------------------------------------
/experiments_adult_data_set.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from sklearn.metrics.pairwise import pairwise_distances
  3 | from sklearn.preprocessing import scale
  4 | import matplotlib.pyplot as plt
  5 | import sys
  6 | from algorithms import *
  7 | import os.path
  8 | import requests
  9 | import pandas
 10 | 
 11 | #####################################################################################
 12 | ### Experiments using the Adult data set from the UCI Machine Learning Repository ###
 13 | ### https://archive.ics.uci.edu/ml/datasets/adult                                 ###
 14 | #####################################################################################
 15 | 
 16 | ###############################################################
 17 | ## Experiment shown in middle and right plot of Figure 6
 18 | ###############################################################
 19 | def exp_comparison_heuristics_adult_data_set(nr_of_runs,race_is_sensitive_attribute):
 20 | # if race_is_sensitive_attribute==1, then we use race as sensitive attribute, otherwise we use gender
 21 | 
 22 |     if race_is_sensitive_attribute==1:
 23 |         m = 5
 24 |     else:
 25 |         m=2
 26 | 
 27 |     print '-------------------------------------------'
 28 |     print 'exp_comparison_heuristics_adult_data_set with '+str(m)+' groups'
 29 |     print '-------------------------------------------'
 30 | 
 31 | 
 32 |     n = 25000
 33 |     nr_initially_given = 100
 34 | 
 35 |     if (not os.path.exists('adult.data')):
 36 |         print('Adult data set does not exist in current folder --- Have to download it')
 37 |         r = requests.get('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', allow_redirects=True)
 38 |         if r.status_code == requests.codes.ok:
 39 |             print('Download successful')
 40 |         else:
 41 |             print('Could not download Adult data set - please download it manually')
 42 |             sys.exit()
 43 |         open('adult.data', 'wb').write(r.content)
 44 | 
 45 | 
 46 |     df=pandas.read_csv('adult.data', sep=',',header=None)
 47 |     df=df[:n]
 48 | 
 49 |     if race_is_sensitive_attribute==1:
 50 |         sens_attr = 8
 51 |         sex = df[sens_attr]
 52 |         df = df.drop(columns=[sens_attr])
 53 |         sens_attributes = list(set(sex.astype(str).values))   # =[' Asian-Pac-Islander', ' White', ' Other', ' Amer-Indian-Eskimo', ' Black']
 54 |         sex_num = np.zeros(n, dtype=int)
 55 |         for rrr,ttt in enumerate(sens_attributes):
 56 |             sex_num[sex.astype(str).values == ttt]=rrr
 57 | 
 58 |         m=len(sens_attributes)   #m=5
 59 |         req_nr_per_sex=np.repeat(50, m)
 60 | 
 61 |     else:
 62 |         sens_attr = 9
 63 |         sex = df[sens_attr]
 64 |         sens_attributes = list(set(sex.astype(str).values))   # =[' Male', ' Female']
 65 |         df = df.drop(columns=[sens_attr])
 66 |         sex_num = np.zeros(n, dtype=int)
 67 |         sex_num[sex.astype(str).values == sens_attributes[1]] = 1
 68 | 
 69 |         m = len(sens_attributes)  # m=2
 70 |         req_nr_per_sex = np.repeat(200, m)
 71 | 
 72 | 
 73 |     #dropping non-numerical features and normalizing data
 74 |     cont_types=np.where(df.dtypes=='int')[0]   # =[0,2,4,9,10,11]
 75 |     df = df.iloc[:,cont_types]
 76 |     data = np.array(df.values, dtype=float)
 77 |     data = scale(data, axis=0)
 78 | 
 79 |     dmat = pairwise_distances(data,metric='l1')
 80 | 
 81 |     plot_data = np.zeros((nr_of_runs, 3))
 82 | 
 83 | 
 84 |     for rrr in np.arange(nr_of_runs):
 85 | 
 86 |         print 'run=', rrr
 87 | 
 88 |         initially_given = np.random.choice(n, size=nr_initially_given, replace=False)
 89 | 
 90 |         centers_approx = fair_k_center_APPROX(dmat, sex_num, req_nr_per_sex, initially_given)
 91 |         cost_approx = np.amax(
 92 |             np.amin(dmat[np.ix_(np.hstack((centers_approx, initially_given)), np.arange(n))], axis=0))
 93 | 
 94 |         centers_heuristic1 = heuristic_greedy_on_each_group(dmat, sex_num, req_nr_per_sex, initially_given)
 95 |         cost_heuristic1 = np.amax(
 96 |             np.amin(dmat[np.ix_(np.hstack((centers_heuristic1, initially_given)), np.arange(n))], axis=0))
 97 | 
 98 |         centers_heuristic2 = heuristic_greedy_till_constraint_is_satisfied(dmat, sex_num, req_nr_per_sex,
 99 |                                                                            initially_given)
100 |         cost_heuristic2 = np.amax(
101 |             np.amin(dmat[np.ix_(np.hstack((centers_heuristic2, initially_given)), np.arange(n))], axis=0))
102 | 
103 |         plot_data[rrr, 0] = cost_approx
104 |         plot_data[rrr, 1] = cost_heuristic1
105 |         plot_data[rrr, 2] = cost_heuristic2
106 | 
107 | 
108 | 
109 |     data = [plot_data[:, ccc] for ccc in np.arange(3)]
110 |     fig, ax = plt.subplots(figsize=(3.4, 4.5))
111 |     ax.set_title('Adult data set, |S|=' + str(n), fontsize=16)
112 |     ax.boxplot(data)
113 |     fig.tight_layout()
114 |     plt.xticks([ggg for ggg in (1 + np.arange(3))], ['Our Alg.', 'Heur. A', 'Heur. B'], fontsize=12)
115 |     plt.ylabel('Cost', fontsize=14)
116 | 
117 |     fig.savefig('plot_exp_comparison_heuristics_adult_data_set_m='+str(m)+'.pdf', bbox_inches='tight')
118 |     plt.close()
119 | 
120 | 
121 | 
122 | 
123 | 
124 | ###############################################################
125 | ## Experiment shown in middle and right plot of Figure 5
126 | ###############################################################
def exp_comparison_greedy_strategy_adult_data_set(nr_of_runs,race_is_sensitive_attribute):
    """Compare fair_k_center_APPROX with the unfair greedy strategy on the Adult data set.

    The first 25000 records of 'adult.data' are used; the file is downloaded
    from the UCI repository if it is not present in the current folder. In
    every run, 100 initially given centers are drawn at random, both
    algorithms are executed, and the clustering cost of each as well as the
    imbalance of the greedy centers across groups is recorded.

    Parameters:
        nr_of_runs: number of repetitions of the experiment.
        race_is_sensitive_attribute: if equal to 1, column 8 (race) defines
            the groups and 50 centers per group are requested; otherwise
            column 9 (sex) is used and 200 centers per group are requested.

    Side effects:
        Writes 'exp_comparison_greedy_strategy_adult_data_set_m=<m>.pdf' and
        prints progress. Calls sys.exit() if the data set cannot be downloaded.
    """
    use_race = (race_is_sensitive_attribute == 1)
    m = 5 if use_race else 2

    # Single-argument print(...) yields the same output on Python 2 and 3.
    print('-------------------------------------------')
    print('exp_comparison_greedy_strategy_adult_data_set with '+str(m)+' groups')
    print('-------------------------------------------')

    n = 25000
    nr_initially_given = 100

    if not os.path.exists('adult.data'):
        print('Adult data set does not exist in current folder --- Have to download it')
        r = requests.get('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', allow_redirects=True)
        if r.status_code == requests.codes.ok:
            print('Download successful')
        else:
            print('Could not download Adult data set - please download it manually')
            sys.exit()
        # Context manager guarantees the file is flushed and closed before it is read back.
        with open('adult.data', 'wb') as out_file:
            out_file.write(r.content)

    df = pandas.read_csv('adult.data', sep=',', header=None)
    df = df[:n]

    # Column 8 takes the values ' Asian-Pac-Islander', ' White', ' Other',
    # ' Amer-Indian-Eskimo', ' Black'; column 9 takes ' Male', ' Female'.
    sens_attr = 8 if use_race else 9
    centers_per_group = 50 if use_race else 200

    group_values = df[sens_attr].astype(str).values
    df = df.drop(columns=[sens_attr])
    sens_attributes = list(set(group_values))

    # Encode the group of every record as an integer label 0..m-1.
    sex_num = np.zeros(n, dtype=int)
    for label, value in enumerate(sens_attributes):
        sex_num[group_values == value] = label

    m = len(sens_attributes)
    req_nr_per_sex = np.repeat(centers_per_group, m)

    # dropping non-numerical features and normalizing data
    cont_types = np.where(df.dtypes == 'int')[0]  # =[0,2,4,9,10,11]
    df = df.iloc[:, cont_types]
    data = np.array(df.values, dtype=float)
    data = scale(data, axis=0)

    dmat = pairwise_distances(data, metric='l1')

    def clustering_cost(centers, given):
        # Largest distance of any point to its closest (chosen or given) center.
        return np.amax(np.amin(dmat[np.ix_(np.hstack((centers, given)), np.arange(n))], axis=0))

    plot_data = np.zeros((nr_of_runs, 3))

    for run in range(nr_of_runs):

        print('run= ' + str(run))

        initially_given = np.random.choice(n, size=nr_initially_given, replace=False)

        centers_approx = fair_k_center_APPROX(dmat, sex_num, req_nr_per_sex, initially_given)
        centers_greedy = k_center_greedy_with_given_centers(dmat, np.sum(req_nr_per_sex), initially_given)

        plot_data[run, 0] = clustering_cost(centers_approx, initially_given)
        plot_data[run, 1] = clustering_cost(centers_greedy, initially_given)

        # max_{i,j} |count_i - count_j| equals max count minus min count
        # (and reduces to |count_0 - count_1| for two groups).
        counts = np.bincount(sex_num[centers_greedy], minlength=m)
        plot_data[run, 2] = counts.max() - counts.min()

    if m == 2:
        title = 'Adult data set, |S|=|S$_1$|+|S$_2$|='
        deviation_label = '|# centers in S$_1$ - # centers in S$_2$|'
        deviation_fontsize = 14
    else:
        title = 'Adult data set, |S|=|S$_1$|+...+|S$_5$|='
        deviation_label = 'max$_{i,j}$ |# centers in S$_i$ - # centers in S$_j$|'
        deviation_fontsize = 13

    fig = plt.figure()
    st = fig.suptitle(title + str(n), fontsize=16)

    # Left panel: cost of both algorithms.
    ax1 = fig.add_subplot(121)
    ax1.boxplot([plot_data[:, 0], plot_data[:, 1]])
    plt.xticks([1, 2], ['Our Alg.', 'Unfair Greedy'], fontsize=12)
    plt.ylabel('Cost', fontsize=14)

    # Right panel: group imbalance of the centers chosen by unfair greedy.
    ax2 = fig.add_subplot(122)
    ax2.boxplot(plot_data[:, 2])
    plt.xticks([1], ['Unfair Greedy'], fontsize=12)
    plt.ylabel(deviation_label, fontsize=deviation_fontsize)

    fig.tight_layout()

    st.set_y(0.95)
    fig.subplots_adjust(top=0.85)

    fig.savefig('exp_comparison_greedy_strategy_adult_data_set_m='+str(m)+'.pdf', bbox_inches='tight')
    plt.close()
265 | 
266 | 
267 | 
268 | 
269 | 
if __name__ == "__main__":
    # Optional first command-line argument: number of repetitions (default 10).
    cli_args = sys.argv[1:]
    number_of_runs = int(cli_args[0]) if cli_args else 10

    # Each experiment is run with sex (0) and then race (1) as sensitive attribute.
    exp_comparison_heuristics_adult_data_set(number_of_runs, 0)
    exp_comparison_heuristics_adult_data_set(number_of_runs, 1)
    exp_comparison_greedy_strategy_adult_data_set(number_of_runs, 0)
    exp_comparison_greedy_strategy_adult_data_set(number_of_runs, 1)
280 | 


--------------------------------------------------------------------------------
/experiments_artificial_data.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import scipy.sparse.csgraph
  3 | from sklearn.metrics.pairwise import pairwise_distances
  4 | import matplotlib.pyplot as plt
  5 | import sys
  6 | from algorithms import *
  7 | 
  8 | 
  9 | 
 10 | ###############################################################
 11 | ## Experiment shown in Figure 4
 12 | ###############################################################
 13 | def exp_approx_factor_artificial_data(nr_of_runs):
 14 | 
 15 |     print '-------------------------------------------'
 16 |     print 'exp_approx_factor_artificial_data'
 17 |     print '-------------------------------------------'
 18 | 
 19 |     setting_list = [ell for ell in 2 + np.arange(19)]
 20 |     plot_data = np.zeros((nr_of_runs, len(setting_list)))
 21 | 
 22 |     n = 10000
 23 |     initially_given = np.array([], dtype=int)
 24 | 
 25 |     centersTRUE = np.zeros((100, 2))
 26 |     ccc = 0
 27 |     for zzz in np.arange(10):
 28 |         for rrr in np.arange(10):
 29 |             centersTRUE[ccc, 0] = zzz
 30 |             centersTRUE[ccc, 1] = rrr
 31 |             ccc += 1
 32 | 
 33 | 
 34 |     for ccc, m in enumerate(setting_list):
 35 | 
 36 |         print 'm = ',m
 37 | 
 38 |         for rrr in np.arange(nr_of_runs):
 39 |             sexes = np.random.randint(m, size=n)
 40 |             points = np.random.normal(size=(n, 2))
 41 | 
 42 |             points_ce = np.random.choice(centersTRUE.shape[0], n)
 43 |             for zzz in np.arange(centersTRUE.shape[0]):
 44 |                 cluster = np.where(points_ce == zzz)[0]
 45 |                 radius_cluster = np.max(np.sum((points[cluster, :]) ** 2, axis=1) ** (0.5))
 46 |                 points[cluster, :] = 0.5 * points[cluster, :] / radius_cluster + np.repeat(centersTRUE[zzz, :].reshape(1, 2),
 47 |                                                                  cluster.size, axis=0)
 48 | 
 49 |             sex_centersTRUE = np.random.randint(m,size=centersTRUE.shape[0])
 50 |             req_nr_per_sex = np.zeros(m,dtype=int)
 51 |             for ell in np.arange(m):
 52 |                 req_nr_per_sex[ell] = np.sum(sex_centersTRUE == ell)
 53 | 
 54 |             points = np.vstack((points, centersTRUE))
 55 |             sexes = np.hstack((sexes, sex_centersTRUE))
 56 |             hh = np.random.permutation(sexes.size)
 57 |             sexes = sexes[hh]
 58 |             points = points[hh, :]
 59 |             dmat = pairwise_distances(points)
 60 | 
 61 |             centers_approx = fair_k_center_APPROX(dmat, sexes, req_nr_per_sex, initially_given)
 62 |             cost_approx = np.amax(
 63 |                 np.amin(dmat[np.ix_(np.hstack((centers_approx, initially_given)), np.arange(dmat.shape[0]))], axis=0))
 64 |             cost_exact = 0.5
 65 |             fac = cost_approx / cost_exact
 66 |             plot_data[rrr, ccc] = fac
 67 | 
 68 |             print 'Approximation factor =',fac
 69 | 
 70 | 
 71 | 
 72 |     data = [plot_data[:, ccc] for ccc in np.arange(len(setting_list))]
 73 |     fig, ax = plt.subplots(figsize=(13, 4.5))
 74 |     ax.set_title('Simulated data with known optimal solution, S=' + str(dmat.shape[0]), fontsize=16)
 75 |     ax.boxplot(data)
 76 |     fig.tight_layout()
 77 |     plt.xticks([ggg for ggg in (1 + np.arange(len(setting_list)))],
 78 |                ['m=' + str(setting_list[ggg - 1]) for ggg in (1 + np.arange(len(setting_list)))], fontsize=12)
 79 |     plt.ylabel('Approximation factor', fontsize=14)
 80 | 
 81 |     fig.savefig('plot_exp_approx_factor_artificial_data.pdf', bbox_inches='tight')
 82 |     plt.close()
 83 | 
 84 | 
 85 | 
 86 | 
 87 | 
 88 | ###############################################################
 89 | ## Experiment shown in left plot of Figure 6
 90 | ###############################################################
 91 | def exp_comparison_heuristics_artificial_data(nr_of_runs):
 92 | 
 93 |     print '-------------------------------------------'
 94 |     print 'exp_comparison_heuristics_artificial_data'
 95 |     print '-------------------------------------------'
 96 | 
 97 |     plot_data = np.zeros((nr_of_runs, 3))
 98 | 
 99 |     n = 2000
100 |     m = 10
101 |     nr_initially_given = 10
102 |     req_nr_per_sex = np.repeat(4,m)
103 | 
104 | 
105 |     for rrr in np.arange(nr_of_runs):
106 | 
107 |         print 'run=',rrr
108 | 
109 |         indi_sexes = 0
110 |         while indi_sexes == 0:
111 |             sexes = np.random.randint(m, size=n)
112 | 
113 |             elem_per_sex = np.zeros(m, dtype=int)
114 |             for ell in np.arange(m):
115 |                 elem_per_sex[ell] = np.sum(sexes == ell)
116 | 
117 |             if np.sum(elem_per_sex >= req_nr_per_sex) == m:
118 |                 indi_sexes = 1
119 | 
120 |         initially_given = np.random.choice(n, size=nr_initially_given, replace=False)
121 | 
122 |         indi_dmat = 0
123 |         while indi_dmat == 0:
124 |             dmat = np.random.binomial(1, 2 * np.log(n) / n, (n, n)) * np.random.randint(1, high=100 + 1,
125 |                                                                                         size=(n, n)) + 0.0
126 |             dmat = np.triu(dmat, 1)
127 |             dmat = dmat + dmat.T
128 | 
129 |             scipy.sparse.csgraph.floyd_warshall(dmat, directed=False, overwrite=True)
130 | 
131 |             if not np.any(np.isinf(dmat)):
132 |                 indi_dmat = 1
133 | 
134 | 
135 |         centers_approx = fair_k_center_APPROX(dmat, sexes, req_nr_per_sex, initially_given)
136 |         cost_approx = np.amax(
137 |             np.amin(dmat[np.ix_(np.hstack((centers_approx, initially_given)), np.arange(n))], axis=0))
138 | 
139 |         centers_heuristic1 = heuristic_greedy_on_each_group(dmat, sexes, req_nr_per_sex, initially_given)
140 |         cost_heuristic1 = np.amax(
141 |             np.amin(dmat[np.ix_(np.hstack((centers_heuristic1, initially_given)), np.arange(n))], axis=0))
142 | 
143 |         centers_heuristic2 = heuristic_greedy_till_constraint_is_satisfied(dmat, sexes, req_nr_per_sex,
144 |                                                                            initially_given)
145 |         cost_heuristic2 = np.amax(
146 |             np.amin(dmat[np.ix_(np.hstack((centers_heuristic2, initially_given)), np.arange(n))], axis=0))
147 | 
148 |         plot_data[rrr, 0] = cost_approx
149 |         plot_data[rrr, 1] = cost_heuristic1
150 |         plot_data[rrr, 2] = cost_heuristic2
151 | 
152 | 
153 | 
154 |     data = [plot_data[:, ccc] for ccc in np.arange(3)]
155 |     fig, ax = plt.subplots(figsize=(3.4, 4.5))
156 |     ax.set_title('Simulated data, |S|=' + str(n), fontsize=16)
157 |     ax.boxplot(data)
158 |     fig.tight_layout()
159 |     plt.xticks([ggg for ggg in (1 + np.arange(3))], ['Our Alg.', 'Heur. A', 'Heur. B'], fontsize=12)
160 |     plt.ylabel('Cost', fontsize=14)
161 | 
162 |     fig.savefig('plot_exp_comparison_heuristics_artificial_data.pdf', bbox_inches='tight')
163 |     plt.close()
164 | 
165 | 
166 | 
167 | 
168 | 
169 | ###############################################################
170 | ## Experiment shown in left plot of Figure 5
171 | ###############################################################
def exp_comparison_greedy_strategy_artificial_data(nr_of_runs):
    """Compare fair_k_center_APPROX with the unfair greedy strategy on random graph metrics.

    In every run, 2000 elements get a random group label out of 10 groups
    (4 centers required per group), 10 initially given centers are drawn at
    random, and a distance matrix is generated as the shortest-path metric of
    a random weighted graph (regenerated until the graph is connected). The
    cost of both algorithms and the imbalance of the greedy centers across
    groups are collected for boxplots.

    Parameters:
        nr_of_runs: number of repetitions of the experiment.

    Side effects:
        Writes 'exp_comparison_greedy_strategy_artificial_data.pdf' and prints progress.
    """
    # Single-argument print(...) yields the same output on Python 2 and 3.
    print('-------------------------------------------')
    print('exp_comparison_greedy_strategy_artificial_data')
    print('-------------------------------------------')

    plot_data = np.zeros((nr_of_runs, 3))

    n = 2000
    m = 10
    nr_initially_given = 10
    req_nr_per_sex = np.repeat(4, m)

    def clustering_cost(dist, centers, given):
        # Largest distance of any point to its closest (chosen or given) center.
        return np.amax(np.amin(dist[np.ix_(np.hstack((centers, given)), np.arange(n))], axis=0))

    for run in range(nr_of_runs):

        print('run= ' + str(run))

        # Redraw group labels until every group has enough elements.
        while True:
            sexes = np.random.randint(m, size=n)
            elem_per_sex = np.array([np.sum(sexes == ell) for ell in range(m)], dtype=int)
            if np.all(elem_per_sex >= req_nr_per_sex):
                break

        initially_given = np.random.choice(n, size=nr_initially_given, replace=False)

        # Redraw the random graph until it is connected (no infinite distances).
        while True:
            edges = np.random.binomial(1, 2 * np.log(n) / n, (n, n))
            weights = np.random.randint(1, high=100 + 1, size=(n, n))
            dmat = np.triu(edges * weights + 0.0, 1)
            dmat = dmat + dmat.T

            # Use the returned matrix rather than relying on the in-place
            # side effect of overwrite=True, which only applies to dense
            # C-ordered float64 input.
            dmat = scipy.sparse.csgraph.floyd_warshall(dmat, directed=False, overwrite=True)

            if not np.any(np.isinf(dmat)):
                break

        centers_approx = fair_k_center_APPROX(dmat, sexes, req_nr_per_sex, initially_given)
        centers_greedy = k_center_greedy_with_given_centers(dmat, np.sum(req_nr_per_sex), initially_given)

        plot_data[run, 0] = clustering_cost(dmat, centers_approx, initially_given)
        plot_data[run, 1] = clustering_cost(dmat, centers_greedy, initially_given)

        # max_{i,j} |count_i - count_j| equals max count minus min count
        # (and reduces to |count_0 - count_1| for two groups).
        counts = np.bincount(sexes[centers_greedy], minlength=m)
        plot_data[run, 2] = counts.max() - counts.min()

    if m == 2:
        title = 'Simulated data, |S|=|S$_1$|+|S$_2$|='
        deviation_label = '|# centers in S$_1$ - # centers in S$_2$|'
        deviation_fontsize = 14
    else:
        title = 'Simulated data, |S|=|S$_1$|+...+|S$_{10}$|='
        deviation_label = 'max$_{i,j}$ |# centers in S$_i$ - # centers in S$_j$|'
        deviation_fontsize = 13

    fig = plt.figure()
    st = fig.suptitle(title + str(n), fontsize=16)

    # Left panel: cost of both algorithms.
    ax1 = fig.add_subplot(121)
    ax1.boxplot([plot_data[:, 0], plot_data[:, 1]])
    plt.xticks([1, 2], ['Our Alg.', 'Unfair Greedy'], fontsize=12)
    plt.ylabel('Cost', fontsize=14)

    # Right panel: group imbalance of the centers chosen by unfair greedy.
    ax2 = fig.add_subplot(122)
    ax2.boxplot(plot_data[:, 2])
    plt.xticks([1], ['Unfair Greedy'], fontsize=12)
    plt.ylabel(deviation_label, fontsize=deviation_fontsize)

    fig.tight_layout()
    st.set_y(0.95)
    fig.subplots_adjust(top=0.85)
    fig.savefig('exp_comparison_greedy_strategy_artificial_data.pdf', bbox_inches='tight')
    plt.close()
277 | 
278 | 
279 | 
280 | 
281 | 
if __name__ == "__main__":
    # Optional first command-line argument: number of repetitions (default 10).
    cli_args = sys.argv[1:]
    number_of_runs = int(cli_args[0]) if cli_args else 10

    exp_approx_factor_artificial_data(number_of_runs)
    exp_comparison_heuristics_artificial_data(number_of_runs)
    exp_comparison_greedy_strategy_artificial_data(number_of_runs)
291 | 


--------------------------------------------------------------------------------
/experiments_matroid_center.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import scipy.sparse.csgraph
  3 | import matplotlib.pyplot as plt
  4 | import sys
  5 | import time
  6 | from algorithms import *
  7 | from algorithm_matroid_center import *
  8 | 
  9 | 
 10 | ###############################################################
 11 | ## Experiment shown in the left plot of Figure 3
 12 | ###############################################################
 13 | def exp_compare_to_optimal_solution(nr_of_runs):
 14 | 
 15 |     print '-------------------------------------------'
 16 |     print 'exp_compare_to_optimal_solution'
 17 |     print '-------------------------------------------'
 18 | 
 19 |     setting_list=[[25, 2, 2, [2,2]],
 20 |                   [25, 2, 2, [4,2]],
 21 |                   [25, 3, 2, [2,2,2]],
 22 |                   [25, 3, 1, [5, 1, 1]],
 23 |                   [25, 4, 0, [2, 2, 2,2]],
 24 |                   [25, 4, 0, [3, 3, 1, 1]],
 25 |                   [25, 5, 0, [2, 2, 2, 1,1]]]
 26 | 
 27 | 
 28 |     pl_data=np.zeros((nr_of_runs,len(setting_list)))
 29 |     pl_data_MATROID = np.zeros((nr_of_runs, len(setting_list)))
 30 |     pl_time = np.zeros((nr_of_runs, len(setting_list)))
 31 |     pl_time_MATROID = np.zeros((nr_of_runs, len(setting_list)))
 32 |     settings_as_vec=np.array([])
 33 | 
 34 | 
 35 |     for tr,sl in enumerate(setting_list):
 36 | 
 37 |         n=sl[0]
 38 |         m=sl[1]
 39 |         nr_initially_given=sl[2]
 40 |         req_nr_per_sex=np.array(sl[3])
 41 | 
 42 |         print ''
 43 |         print 'n='+str(n)+', m='+str(m)+', (k_{S_1},...,k_{S_m})='+str(tuple(req_nr_per_sex))+', |C_0|='+str(nr_initially_given)
 44 | 
 45 |         settings_as_vec=np.hstack((settings_as_vec,np.array([n,m,nr_initially_given]),req_nr_per_sex))
 46 | 
 47 | 
 48 |         for rrr in np.arange(nr_of_runs):
 49 | 
 50 |             indi_sexes = 0
 51 |             while indi_sexes == 0:
 52 |                 sexes = np.random.randint(m, size=n)
 53 | 
 54 |                 elem_per_sex = np.zeros(m, dtype=int)
 55 |                 for ell in np.arange(m):
 56 |                     elem_per_sex[ell] = np.sum(sexes == ell)
 57 | 
 58 |                 if np.sum(elem_per_sex >= req_nr_per_sex) == m:
 59 |                     indi_sexes = 1
 60 | 
 61 |             initially_given = np.random.choice(n, size=nr_initially_given, replace=False)
 62 | 
 63 |             indi_dmat = 0
 64 |             while indi_dmat == 0:
 65 |                 dmat = np.random.binomial(1, 2 * np.log(n) / n, (n, n)) * np.random.randint(1, high=100 + 1,
 66 |                                                                                             size=(n, n)) + 0.0
 67 |                 dmat = np.triu(dmat, 1)
 68 |                 dmat = dmat + dmat.T
 69 |                 scipy.sparse.csgraph.floyd_warshall(dmat, directed=False, overwrite=True)
 70 |                 if not np.any(np.isinf(dmat)):
 71 |                     indi_dmat = 1
 72 | 
 73 | 
 74 |             start = time.time()
 75 |             centers_approx = fair_k_center_APPROX(dmat, sexes, req_nr_per_sex, initially_given)
 76 |             cost_approx = np.amax(np.amin(dmat[np.ix_(np.hstack((centers_approx,initially_given)), np.arange(n))], axis=0))
 77 |             end = time.time()
 78 |             pl_time[rrr,tr] = end - start
 79 | 
 80 |             start = time.time()
 81 |             centers_approx_MATROID = MatCenter_binary_search_WithGivenCenters(dmat, sexes, req_nr_per_sex, initially_given)
 82 |             cost_approx_MATROID = np.amax(np.amin(dmat[np.ix_(np.hstack((centers_approx_MATROID, initially_given)), np.arange(n))], axis=0))
 83 |             end = time.time()
 84 |             pl_time_MATROID[rrr, tr] = end - start
 85 | 
 86 |             centers_exact, cl_exact, cost_exact = fair_k_center_exact(dmat, sexes, req_nr_per_sex,initially_given)
 87 | 
 88 | 
 89 |             fac = cost_approx / cost_exact
 90 |             pl_data[rrr,tr]=fac
 91 |             fac_MATROID = cost_approx_MATROID / cost_exact
 92 |             pl_data_MATROID[rrr, tr] = fac_MATROID
 93 |             print 'Approximation factor Alg. 4='+str(fac)+' ---- Approximation factor M.C.='+str(fac_MATROID)
 94 | 
 95 | 
 96 | 
 97 |     data=[]
 98 |     XT=[]
 99 |     for tr in np.arange(len(setting_list)):
100 |         data.append(pl_data[:,tr])
101 |         data.append(pl_data_MATROID[:, tr])
102 |         XT.append('Alg. 4')
103 |         XT.append('M.C.')
104 |     fig, ax = plt.subplots(figsize=(12,4.5))
105 |     ax.set_title('Simulated data with computable optimal solution, |S|='+str(n),fontsize=16)
106 |     ax.boxplot(data)
107 |     fig.tight_layout()
108 |     plt.xticks([ggg for ggg in (1 + np.arange(2*len(setting_list)))],XT,fontsize=12)
109 |     plt.ylabel('Approximation factor',fontsize=14)
110 |     ylim = ax.get_ylim()
111 |     new_ylim = (0.95,ylim[1])
112 |     ax.set_ylim(new_ylim)
113 |     fig.savefig('exp_compare_to_optimal_solution_APPROXFACTOR.pdf',bbox_inches='tight')
114 |     plt.close()
115 | 
116 | 
117 |     data = []
118 |     XT = []
119 |     for tr in np.arange(len(setting_list)):
120 |         data.append(pl_time[:, tr])
121 |         data.append(pl_time_MATROID[:, tr])
122 |         XT.append('Alg. 4')
123 |         XT.append('M.C.')
124 |     fig, ax = plt.subplots(figsize=(12, 4.5))
125 |     ax.set_title('Simulated data with computable optimal solution, |S|=' + str(n), fontsize=16)
126 |     ax.boxplot(data)
127 |     fig.tight_layout()
128 |     plt.xticks([ggg for ggg in (1 + np.arange(2 * len(setting_list)))], XT, fontsize=12)
129 |     plt.ylabel('Running time [s]', fontsize=14)
130 |     fig.savefig('exp_compare_to_optimal_solution_TIME.pdf', bbox_inches='tight')
131 |     plt.close()
132 | 
133 | 
134 | 
135 | 
136 | 
137 | 
138 | 
139 | ###############################################################
140 | ## Experiment shown in the right plot of Figure 3 and in
141 | ## Figure 9 in Appendix B
142 | ###############################################################
def exp_compare_to_each_other(nr_of_runs):
    """Compare cost and running time of fair_k_center_APPROX ('Alg. 4') and the matroid center algorithm ('M.C.').

    For n in {50, 100, 150, 200, 250} with 5 groups, 4 required centers per
    group and no initially given centers, random instances are generated in
    every run: random group labels and the shortest-path metric of a random
    weighted graph (regenerated until connected). Cost and wall-clock time
    (including the cost evaluation) of both algorithms are recorded.

    Parameters:
        nr_of_runs: number of repetitions per setting.

    Side effects:
        Writes 'exp_compare_to_each_other_COST.pdf' and
        'exp_compare_to_each_other_TIME.pdf' and prints progress.
    """
    # Single-argument print(...) yields the same output on Python 2 and 3.
    print('-------------------------------------------')
    print('exp_compare_to_each_other')
    print('-------------------------------------------')

    # Each entry: [n, m, |C_0|, (k_{S_1},...,k_{S_m})]
    setting_list=[[50, 5, 0, [4,4,4,4,4]],
                  [100, 5, 0, [4,4,4,4,4]],
                  [150, 5, 0, [4, 4, 4, 4, 4]],
                  [200, 5, 0, [4, 4, 4, 4, 4]],
                  [250, 5, 0, [4, 4, 4, 4, 4]]]

    nr_settings = len(setting_list)
    # Float array of the problem sizes, used as x-axis of the running-time plot.
    n_list = np.array([sl[0] for sl in setting_list], dtype=float)

    pl_data = np.zeros((nr_of_runs, nr_settings))
    pl_data_MATROID = np.zeros((nr_of_runs, nr_settings))
    pl_time = np.zeros((nr_of_runs, nr_settings))
    pl_time_MATROID = np.zeros((nr_of_runs, nr_settings))

    for tr, (n, m, nr_initially_given, required) in enumerate(setting_list):

        req_nr_per_sex = np.array(required)

        print('')
        print('n='+str(n))

        for run in range(nr_of_runs):

            # Redraw group labels until every group has enough elements.
            while True:
                sexes = np.random.randint(m, size=n)
                elem_per_sex = np.array([np.sum(sexes == ell) for ell in range(m)], dtype=int)
                if np.all(elem_per_sex >= req_nr_per_sex):
                    break

            initially_given = np.random.choice(n, size=nr_initially_given,replace=False)

            # Redraw the random graph until it is connected (no infinite distances).
            while True:
                edges = np.random.binomial(1, 2*np.log(n)/n, (n, n))
                weights = np.random.randint(1, high=100+1, size=(n, n))
                dmat = np.triu(edges * weights + 0.0, 1)
                dmat = dmat + dmat.T
                # Use the returned matrix rather than relying on the in-place
                # side effect of overwrite=True, which only applies to dense
                # C-ordered float64 input.
                dmat = scipy.sparse.csgraph.floyd_warshall(dmat, directed=False, overwrite=True)
                if not np.any(np.isinf(dmat)):
                    break

            start = time.time()
            centers_approx = fair_k_center_APPROX(dmat, sexes, req_nr_per_sex, initially_given)
            cost_approx = np.amax(np.amin(dmat[np.ix_(np.hstack((centers_approx,initially_given)), np.arange(n))], axis=0))
            end = time.time()
            pl_data[run, tr] = cost_approx
            pl_time[run, tr] = end - start

            start = time.time()
            centers_approx_MATROID = MatCenter_binary_search_WithGivenCenters(dmat, sexes, req_nr_per_sex, initially_given)
            cost_approx_MATROID = np.amax(np.amin(dmat[np.ix_(np.hstack((centers_approx_MATROID, initially_given)), np.arange(n))], axis=0))
            end = time.time()
            pl_data_MATROID[run, tr] = cost_approx_MATROID
            pl_time_MATROID[run, tr] = end - start

            print('Cost Alg. 4='+str(cost_approx)+', Time Alg. 4='+str(pl_time[run,tr])+' ---- Cost M.C.='+str(cost_approx_MATROID)+
                  ', Time M.C.='+str(pl_time_MATROID[run, tr]))

    # Boxplot columns alternate: Alg. 4 and M.C. for setting 0, then setting 1, ...
    data = []
    for tr in range(nr_settings):
        data.append(pl_data[:, tr])
        data.append(pl_data_MATROID[:, tr])
    XT = ['Alg. 4', 'M.C.'] * nr_settings

    # m and req_nr_per_sex are those of the last setting (identical for all settings).
    plot_title = 'Simulated data, m=' + str(m) + ', k=' + str(np.sum(req_nr_per_sex))

    fig, ax = plt.subplots(figsize=(12,4.5))
    ax.set_title(plot_title, fontsize=16)
    ax.boxplot(data)
    fig.tight_layout()
    plt.xticks(list(range(1, 2 * nr_settings + 1)), XT, fontsize=12)
    plt.ylabel('Cost',fontsize=14)
    ylim = ax.get_ylim()
    ax.set_ylim((0.95, ylim[1]))
    fig.savefig('exp_compare_to_each_other_COST.pdf',bbox_inches='tight')
    plt.close()

    mean_time = np.mean(pl_time, 0)
    mean_time_MATROID = np.mean(pl_time_MATROID, 0)

    plt.figure(figsize=(7.5, 4.5))
    plt.plot(n_list, mean_time, label='Alg. 4', marker="x", color='b')
    plt.plot(n_list, mean_time_MATROID, label='M.C.', marker="x", color='r')
    # Reference curves, scaled to pass through the first M.C. measurement.
    plt.plot(n_list,
             (mean_time_MATROID[0] / (((n_list[0]) ** 2) * np.log(n_list[0]))) * (n_list ** 2) * np.log(
                 n_list), label=r'~ $|S|^2 \cdot \ln(|S|)$', linestyle="--", color='m')
    plt.plot(n_list, (mean_time_MATROID[0] / ((n_list[0]) ** 2.5)) * (n_list ** 2.5),
             label=r'~ $|S|^{5/2}$', linestyle="--", color='g')

    plt.title(plot_title, fontsize=16)
    plt.legend()
    plt.xlabel('|S|', fontsize=14)
    plt.ylabel('Running time [s]', fontsize=14)
    plt.savefig('exp_compare_to_each_other_TIME.pdf', bbox_inches='tight')
    plt.close()
259 | 
260 | 
261 | 
262 | 
263 | 
264 | 
if __name__ == "__main__":
    # Optional first command-line argument: number of repetitions (default 10).
    cli_args = sys.argv[1:]
    number_of_runs = int(cli_args[0]) if cli_args else 10

    exp_compare_to_optimal_solution(number_of_runs)
    exp_compare_to_each_other(number_of_runs)
273 | 


--------------------------------------------------------------------------------