├── README.md ├── algorithm_matroid_center.py ├── algorithms.py ├── experiments_adult_data_set.py ├── experiments_artificial_data.py └── experiments_matroid_center.py /README.md: -------------------------------------------------------------------------------- 1 | # fair_k_center_clustering 2 | 3 | Code for our paper "Fair k-Center Clustering for Data Summarization" (https://arxiv.org/abs/1901.08628). 4 | 5 | To try it out and reproduce the boxplots (based on 10 runs) of the experiments of Figures 4 to 6 on artificial data, simply run 6 | 7 | ``` 8 | python experiments_artificial_data.py 9 | ``` 10 | 11 | If you want to obtain the boxplots based on 50 runs, say, then run 12 | 13 | ``` 14 | python experiments_artificial_data.py 50 15 | ``` 16 | 17 | Similarly, in order to reproduce the boxplots of the experiments of Figures 5 and 6 on the Adult data set, run 18 | 19 | ``` 20 | python experiments_adult_data_set.py 50 21 | ``` 22 | 23 | If you want to compare our algorithm to the algorithm for the matroid center problem by Chen et al. (https://arxiv.org/abs/1301.0745), you need to have SageMath (http://www.sagemath.org/) installed on your system. Then simply run 24 | 25 | ``` 26 | sage -python experiments_matroid_center.py 50 27 | ``` 28 | 29 | 30 | The code has been tested with the following software versions: 31 | - Python 2.7.10 32 | - Numpy 1.16.2 33 | - Scipy 1.1.0 34 | - Scikit-learn 0.19.1 35 | - Pandas 0.23.0 36 | - SageMath 8.2 37 | -------------------------------------------------------------------------------- /algorithm_matroid_center.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse.csgraph 3 | 4 | from sage.all import * 5 | import sage.matroids.matroid 6 | import sage.matroids.constructor 7 | 8 | 9 | 10 | ######################################################################################################################## 11 | ### Implementation of the 3-approximation algorithm for the matroid center problem proposed by Chen et al. 12 | ### (Danny Z. Chen, Jian Li, Hongyu Liang, Haitao Wang. Matroid and Knapsack Center Problems. Algorithmica, 2016) 13 | ######################################################################################################################## 14 | 15 | 16 | class PartitionMatroid_adapted(sage.matroids.matroid.Matroid): 17 | '''Adaptation of the class PartitionMatroid as presented in the 'Sage Reference Manual: MatroidTheory' to the matroid 18 | required in the algorithm by Chen et al.. 19 | 20 | partition ... list of lists specifying the partition of the groundset''' 21 | 22 | 23 | def __init__(self, partition): 24 | self.partition = partition 25 | E = set() 26 | for P in partition: 27 | E.update(P) 28 | self.E = frozenset(E) 29 | def groundset(self): 30 | return self.E 31 | def _rank(self, X): 32 | X2 = set(X) 33 | used_indices = set() 34 | rk = 0 35 | while len(X2) > 0: 36 | e = X2.pop() 37 | for i in range(len(self.partition)-1): 38 | if e in self.partition[i]: 39 | if i not in used_indices: 40 | used_indices.add(i) 41 | rk = rk + 1 42 | break 43 | return rk 44 | 45 | 46 | 47 | class ValidCentersMatroid(sage.matroids.matroid.Matroid): 48 | '''Partition matroid encoding the constraints on the centers. 49 | 50 | sexes ... integer-vector of length n with entries in 0,...,m-1, where m is the number of groups 51 | nr_centers_per_sex ... integer-vector of length m with entries in 0,...,k and sum over entries equaling k''' 52 | 53 | 54 | def __init__(self, sexes,nr_centers_per_sex): 55 | self.n = sexes.size 56 | self.sexes=sexes 57 | self.m=nr_centers_per_sex.size 58 | self.nr_centers_per_sex=nr_centers_per_sex 59 | self.E = frozenset(np.arange(self.n)) 60 | def groundset(self): 61 | return self.E 62 | def _rank(self, X): 63 | X2 = set(X) 64 | nr_elem_in_groups=np.zeros(self.m,dtype=int) 65 | while len(X2) > 0: 66 | e = X2.pop() 67 | nr_elem_in_groups[self.sexes[e]]+=1 68 | return np.sum(np.minimum(nr_elem_in_groups,self.nr_centers_per_sex)) 69 | 70 | 71 | 72 | 73 | 74 | def MatCenter_binary_search(dmat, sexes, nr_centers_per_sex): 75 | '''Implementation of the algorithm by Chen et al.. 76 | 77 | *) Rather than testing all distance values as threshold as suggested by Chen et al., we implement binary search to 78 | look for the optimal value. 79 | *) There might be a faster way than running Floyd-Warshall for every distance value that we are testing, however, 80 | in our experiments the time for doing so is negligible (for n<=250, the execution of the first five commands within the 81 | while-loop never takes more than 0.02 seconds). 82 | 83 | INPUT: 84 | dmat ... distance matrix of size nxn 85 | sexes ... integer-vector of length n with entries in 0,...,m-1, where m is the number of groups 86 | nr_centers_per_sex ... integer-vector of length m with entries in 0,...,k and sum over entries equaling k 87 | 88 | RETURNS: approx. optimal centers''' 89 | 90 | 91 | n = dmat.shape[0] 92 | m = nr_centers_per_sex.size 93 | k = np.sum(nr_centers_per_sex) 94 | 95 | ConstrMatr =ValidCentersMatroid(sexes,nr_centers_per_sex) 96 | 97 | iu = np.triu_indices(n, 1) 98 | dval=np.sort(dmat[iu]) 99 | 100 | best_cost=np.inf 101 | best_centers=np.array([],dtype=int) 102 | 103 | 104 | while dval.size>0: 105 | 106 | thelp=int(np.floor(dval.size/2)) 107 | cand_dist=dval[thelp] 108 | 109 | dmat_c=dmat.copy() 110 | dmat_c[dmat_c>cand_dist]=0 111 | scipy.sparse.csgraph.floyd_warshall(dmat_c, directed=False, overwrite=True) 112 | 113 | CC=np.array([],dtype=int) 114 | VV=np.zeros(n) 115 | parti=[] 116 | parti2=np.arange(n) 117 | 118 | while np.sum(VV) toadd: 148 | best_centers = np.hstack((best_centers, toadd_pot[0:toadd])) 149 | else: 150 | best_centers = np.hstack((best_centers, toadd_pot)) 151 | 152 | return best_centers 153 | 154 | 155 | 156 | def MatCenter_binary_search_WithGivenCenters(dmat, sexes, nr_centers_per_sex, given_centers): 157 | '''Wrapper function that allows us to run the algorithm by Chen et al. with initially given centers. 158 | 159 | INPUT: 160 | dmat ... distance matrix of size nxn 161 | sexes ... integer-vector of length n with entries in 0,...,m-1, where m is the number of groups 162 | nr_centers_per_sex ... integer-vector of length m with entries in 0,...,k and sum over entries equaling k 163 | given_centers ... integer-vector with entries in 0,...,n-1 164 | 165 | RETURNS: approx. optimal centers''' 166 | 167 | 168 | m=nr_centers_per_sex.size 169 | sexesNEW=sexes.copy() 170 | sexesNEW[given_centers]=m 171 | nr_centers_per_sexNEW=np.hstack((nr_centers_per_sex,given_centers.size)) 172 | 173 | return np.setdiff1d(MatCenter_binary_search(dmat, sexesNEW, nr_centers_per_sexNEW),given_centers) 174 | 175 | 176 | -------------------------------------------------------------------------------- /algorithms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse.csgraph 3 | import itertools 4 | 5 | 6 | 7 | ######################################################################################################################## 8 | def fair_k_center_exact(dmat,sexes,nr_centers_per_sex,given_centers): 9 | '''Exhaustive search to exactly solve the fair k-center problem (2) --- only works for small problem instances. 10 | 11 | INPUT: 12 | dmat ... distance matrix of size nxn 13 | sexes ... integer-vector of length n with entries in 0,...,m-1, where m is the number of groups 14 | nr_centers_per_sex ... integer-vector of length m with entries in 0,...,k and sum over entries equaling k 15 | given_centers ... integer-vector with entries in 0,...,n-1 16 | 17 | RETURNS: (optimal centers, clustering, optimal fair k-center cost)''' 18 | 19 | 20 | n = dmat.shape[0] 21 | m = nr_centers_per_sex.size 22 | k = np.sum(nr_centers_per_sex) 23 | 24 | cost = np.inf 25 | best_choice = [] 26 | 27 | 28 | for mmm in itertools.combinations(np.arange(n),k): 29 | 30 | cluster_centers = np.array(mmm) 31 | 32 | curr_nr_clusters_per_sex = np.zeros(m) 33 | for ell in np.arange(m): 34 | curr_nr_clusters_per_sex[ell] = np.sum(sexes[cluster_centers]==ell) 35 | 36 | if sum(curr_nr_clusters_per_sex==nr_centers_per_sex)==m: 37 | curr_cost = np.amax(np.amin(dmat[np.ix_(np.hstack((cluster_centers,given_centers)), np.arange(n))],axis=0)) 38 | else: 39 | curr_cost = np.inf 40 | 41 | if curr_costtoadd: 156 | new_given_centersT_additional=np.hstack((new_given_centersT_additional,toadd_pot[0:toadd])) 157 | else: 158 | new_given_centersT_additional = np.hstack((new_given_centersT_additional, toadd_pot)) 159 | 160 | cluster_centers=np.hstack((new_given_centersT,new_given_centersT_additional,new_data_set[cluster_centers_rek])) 161 | 162 | 163 | return cluster_centers 164 | ######################################################################################################################## 165 | 166 | 167 | 168 | ######################################################################################################################## 169 | def swapping_graph(partition,centers,sexes,nr_centers_per_sex): 170 | '''Implementation of Algorithm 3. 171 | 172 | INPUT: 173 | partition ... integer-vector of length n with entries in 0 ... k-1 174 | centers ... integer-vector of length k with entries in 0 ... n-1 175 | sexes ... integer-vector of length n with entries in 0 ... m-1 176 | nr_centers_per_sex ... integer-vector of length m with entries in 0,...,k and sum over entries equaling k 177 | 178 | RETURNS: (G, swapped centers)''' 179 | 180 | 181 | n = partition.size 182 | m = nr_centers_per_sex.size 183 | k = centers.size 184 | 185 | 186 | CURRENT_nr_clusters_per_sex = np.zeros(m, dtype=int) 187 | for ell in np.arange(k): 188 | CURRENT_nr_clusters_per_sex[sexes[centers[ell]]] += 1 189 | 190 | 191 | sex_of_assigned_center = sexes[centers[partition]] 192 | Adja = np.zeros((m, m)) 193 | for ell in np.arange(n): 194 | Adja[sex_of_assigned_center[ell],sexes[ell]] = 1 195 | 196 | dmat_gr,predec = scipy.sparse.csgraph.shortest_path(Adja, directed=True, return_predecessors=True) 197 | 198 | is_there_a_path=0 199 | for ell in np.arange(m): 200 | for zzz in np.arange(m): 201 | if ((CURRENT_nr_clusters_per_sex[ell]>nr_centers_per_sex[ell]) and (CURRENT_nr_clusters_per_sex[zzz] nr_centers_per_sex[ell]) and (CURRENT_nr_clusters_per_sex[zzz] < nr_centers_per_sex[zzz])): 235 | if dmat_gr[ell, zzz] != np.inf: 236 | path = np.array([zzz]) 237 | while path[0] != ell: 238 | path = np.hstack((predec[ell, path[0]], path)) 239 | is_there_a_path = 1 240 | break 241 | if is_there_a_path == 1: 242 | break 243 | 244 | 245 | 246 | if sum(CURRENT_nr_clusters_per_sex==nr_centers_per_sex)==m: 247 | return np.array([]), centers 248 | else: 249 | 250 | G = np.where(CURRENT_nr_clusters_per_sex > nr_centers_per_sex)[0] 251 | for ell in np.arange(m): 252 | for zzz in np.arange(m): 253 | if (((dmat_gr[ell, zzz] != np.inf) and np.isin(ell, G)) and (not np.isin(zzz, G))): 254 | G = np.hstack((G, zzz)) 255 | 256 | return G,centers 257 | ######################################################################################################################## 258 | 259 | 260 | 261 | ######################################################################################################################## 262 | def heuristic_greedy_on_each_group(dmat,sexes,nr_centers_per_sex,given_centers): 263 | '''Implementation of Heuristic A as described in Section 5.3. 264 | 265 | INPUT: 266 | dmat ... distance matrix of size nxn 267 | sexes ... integer-vector of length n with entries in 0,...,m-1, where m is the number of groups 268 | nr_centers_per_sex ... integer-vector of length m with entries in 0,...,k and sum over entries equaling k 269 | given_centers ... integer-vector with entries in 0,...,n-1 270 | 271 | RETURNS: heuristically chosen centers''' 272 | 273 | 274 | m = nr_centers_per_sex.size 275 | 276 | cluster_centers=np.array([],dtype=int) 277 | 278 | for ell in np.arange(m): 279 | subgroup=np.where(sexes==ell)[0] 280 | given_centers_subgroup=np.where(np.isin(subgroup,given_centers))[0] 281 | 282 | cent_subgroup=k_center_greedy_with_given_centers(dmat[np.ix_(subgroup, subgroup)], 283 | nr_centers_per_sex[ell], given_centers_subgroup) 284 | 285 | cluster_centers=np.hstack((cluster_centers,subgroup[cent_subgroup])) 286 | 287 | return cluster_centers 288 | ######################################################################################################################## 289 | 290 | 291 | 292 | ######################################################################################################################## 293 | def heuristic_greedy_till_constraint_is_satisfied(dmat,sexes,nr_centers_per_sex,given_centers): 294 | '''Implementation of Heuristic B as described in Section 5.3. 295 | 296 | INPUT: 297 | dmat ... distance matrix of size nxn 298 | sexes ... integer-vector of length n with entries in 0,...,m-1, where m is the number of groups 299 | nr_centers_per_sex ... integer-vector of length m with entries in 0,...,k and sum over entries equaling k 300 | given_centers ... integer-vector with entries in 0,...,n-1 301 | 302 | RETURNS: heuristically chosen centers 303 | 304 | SINCE WE DO NOT REQUIRE THE GENERAL CASE IN OUR EXPERIMENTS AND IT ALLOWS FOR A SLIGHTLY SIMPLER CODE, 305 | HERE WE ASSUME THAT EITHER length(given_centers)>0 OR nr_centers_per_sex only[i]>0 FOR ALL i''' 306 | 307 | 308 | n = dmat.shape[0] 309 | m = nr_centers_per_sex.size 310 | k=np.sum(nr_centers_per_sex) 311 | 312 | current_nr_per_sex=np.zeros(m) 313 | 314 | if k==0: 315 | cluster_centers = np.array([], dtype=int) 316 | else: 317 | if given_centers.size==0: 318 | cluster_centers=np.random.choice(n,1,replace=False) 319 | current_nr_per_sex[sexes[cluster_centers]]+=1 320 | kk=1 321 | else: 322 | cluster_centers = given_centers 323 | kk=0 324 | 325 | distance_to_closest = np.amin(dmat[np.ix_(cluster_centers, np.arange(n))], axis=0) 326 | while kk1: 272 | number_of_runs=int(sys.argv[1]) 273 | else: 274 | number_of_runs=10 275 | 276 | exp_comparison_heuristics_adult_data_set(number_of_runs,0) 277 | exp_comparison_heuristics_adult_data_set(number_of_runs, 1) 278 | exp_comparison_greedy_strategy_adult_data_set(number_of_runs, 0) 279 | exp_comparison_greedy_strategy_adult_data_set(number_of_runs,1) 280 | -------------------------------------------------------------------------------- /experiments_artificial_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse.csgraph 3 | from sklearn.metrics.pairwise import pairwise_distances 4 | import matplotlib.pyplot as plt 5 | import sys 6 | from algorithms import * 7 | 8 | 9 | 10 | ############################################################### 11 | ## Experiment shown in Figure 4 12 | ############################################################### 13 | def exp_approx_factor_artificial_data(nr_of_runs): 14 | 15 | print '-------------------------------------------' 16 | print 'exp_approx_factor_artificial_data' 17 | print '-------------------------------------------' 18 | 19 | setting_list = [ell for ell in 2 + np.arange(19)] 20 | plot_data = np.zeros((nr_of_runs, len(setting_list))) 21 | 22 | n = 10000 23 | initially_given = np.array([], dtype=int) 24 | 25 | centersTRUE = np.zeros((100, 2)) 26 | ccc = 0 27 | for zzz in np.arange(10): 28 | for rrr in np.arange(10): 29 | centersTRUE[ccc, 0] = zzz 30 | centersTRUE[ccc, 1] = rrr 31 | ccc += 1 32 | 33 | 34 | for ccc, m in enumerate(setting_list): 35 | 36 | print 'm = ',m 37 | 38 | for rrr in np.arange(nr_of_runs): 39 | sexes = np.random.randint(m, size=n) 40 | points = np.random.normal(size=(n, 2)) 41 | 42 | points_ce = np.random.choice(centersTRUE.shape[0], n) 43 | for zzz in np.arange(centersTRUE.shape[0]): 44 | cluster = np.where(points_ce == zzz)[0] 45 | radius_cluster = np.max(np.sum((points[cluster, :]) ** 2, axis=1) ** (0.5)) 46 | points[cluster, :] = 0.5 * points[cluster, :] / radius_cluster + np.repeat(centersTRUE[zzz, :].reshape(1, 2), 47 | cluster.size, axis=0) 48 | 49 | sex_centersTRUE = np.random.randint(m,size=centersTRUE.shape[0]) 50 | req_nr_per_sex = np.zeros(m,dtype=int) 51 | for ell in np.arange(m): 52 | req_nr_per_sex[ell] = np.sum(sex_centersTRUE == ell) 53 | 54 | points = np.vstack((points, centersTRUE)) 55 | sexes = np.hstack((sexes, sex_centersTRUE)) 56 | hh = np.random.permutation(sexes.size) 57 | sexes = sexes[hh] 58 | points = points[hh, :] 59 | dmat = pairwise_distances(points) 60 | 61 | centers_approx = fair_k_center_APPROX(dmat, sexes, req_nr_per_sex, initially_given) 62 | cost_approx = np.amax( 63 | np.amin(dmat[np.ix_(np.hstack((centers_approx, initially_given)), np.arange(dmat.shape[0]))], axis=0)) 64 | cost_exact = 0.5 65 | fac = cost_approx / cost_exact 66 | plot_data[rrr, ccc] = fac 67 | 68 | print 'Approximation factor =',fac 69 | 70 | 71 | 72 | data = [plot_data[:, ccc] for ccc in np.arange(len(setting_list))] 73 | fig, ax = plt.subplots(figsize=(13, 4.5)) 74 | ax.set_title('Simulated data with known optimal solution, S=' + str(dmat.shape[0]), fontsize=16) 75 | ax.boxplot(data) 76 | fig.tight_layout() 77 | plt.xticks([ggg for ggg in (1 + np.arange(len(setting_list)))], 78 | ['m=' + str(setting_list[ggg - 1]) for ggg in (1 + np.arange(len(setting_list)))], fontsize=12) 79 | plt.ylabel('Approximation factor', fontsize=14) 80 | 81 | fig.savefig('plot_exp_approx_factor_artificial_data.pdf', bbox_inches='tight') 82 | plt.close() 83 | 84 | 85 | 86 | 87 | 88 | ############################################################### 89 | ## Experiment shown in left plot of Figure 6 90 | ############################################################### 91 | def exp_comparison_heuristics_artificial_data(nr_of_runs): 92 | 93 | print '-------------------------------------------' 94 | print 'exp_comparison_heuristics_artificial_data' 95 | print '-------------------------------------------' 96 | 97 | plot_data = np.zeros((nr_of_runs, 3)) 98 | 99 | n = 2000 100 | m = 10 101 | nr_initially_given = 10 102 | req_nr_per_sex = np.repeat(4,m) 103 | 104 | 105 | for rrr in np.arange(nr_of_runs): 106 | 107 | print 'run=',rrr 108 | 109 | indi_sexes = 0 110 | while indi_sexes == 0: 111 | sexes = np.random.randint(m, size=n) 112 | 113 | elem_per_sex = np.zeros(m, dtype=int) 114 | for ell in np.arange(m): 115 | elem_per_sex[ell] = np.sum(sexes == ell) 116 | 117 | if np.sum(elem_per_sex >= req_nr_per_sex) == m: 118 | indi_sexes = 1 119 | 120 | initially_given = np.random.choice(n, size=nr_initially_given, replace=False) 121 | 122 | indi_dmat = 0 123 | while indi_dmat == 0: 124 | dmat = np.random.binomial(1, 2 * np.log(n) / n, (n, n)) * np.random.randint(1, high=100 + 1, 125 | size=(n, n)) + 0.0 126 | dmat = np.triu(dmat, 1) 127 | dmat = dmat + dmat.T 128 | 129 | scipy.sparse.csgraph.floyd_warshall(dmat, directed=False, overwrite=True) 130 | 131 | if not np.any(np.isinf(dmat)): 132 | indi_dmat = 1 133 | 134 | 135 | centers_approx = fair_k_center_APPROX(dmat, sexes, req_nr_per_sex, initially_given) 136 | cost_approx = np.amax( 137 | np.amin(dmat[np.ix_(np.hstack((centers_approx, initially_given)), np.arange(n))], axis=0)) 138 | 139 | centers_heuristic1 = heuristic_greedy_on_each_group(dmat, sexes, req_nr_per_sex, initially_given) 140 | cost_heuristic1 = np.amax( 141 | np.amin(dmat[np.ix_(np.hstack((centers_heuristic1, initially_given)), np.arange(n))], axis=0)) 142 | 143 | centers_heuristic2 = heuristic_greedy_till_constraint_is_satisfied(dmat, sexes, req_nr_per_sex, 144 | initially_given) 145 | cost_heuristic2 = np.amax( 146 | np.amin(dmat[np.ix_(np.hstack((centers_heuristic2, initially_given)), np.arange(n))], axis=0)) 147 | 148 | plot_data[rrr, 0] = cost_approx 149 | plot_data[rrr, 1] = cost_heuristic1 150 | plot_data[rrr, 2] = cost_heuristic2 151 | 152 | 153 | 154 | data = [plot_data[:, ccc] for ccc in np.arange(3)] 155 | fig, ax = plt.subplots(figsize=(3.4, 4.5)) 156 | ax.set_title('Simulated data, |S|=' + str(n), fontsize=16) 157 | ax.boxplot(data) 158 | fig.tight_layout() 159 | plt.xticks([ggg for ggg in (1 + np.arange(3))], ['Our Alg.', 'Heur. A', 'Heur. B'], fontsize=12) 160 | plt.ylabel('Cost', fontsize=14) 161 | 162 | fig.savefig('plot_exp_comparison_heuristics_artificial_data.pdf', bbox_inches='tight') 163 | plt.close() 164 | 165 | 166 | 167 | 168 | 169 | ############################################################### 170 | ## Experiment shown in left plot of Figure 5 171 | ############################################################### 172 | def exp_comparison_greedy_strategy_artificial_data(nr_of_runs): 173 | 174 | print '-------------------------------------------' 175 | print 'exp_comparison_greedy_strategy_artificial_data' 176 | print '-------------------------------------------' 177 | 178 | plot_data = np.zeros((nr_of_runs, 3)) 179 | 180 | n = 2000 181 | m = 10 182 | nr_initially_given = 10 183 | req_nr_per_sex = np.repeat(4, m) 184 | 185 | 186 | 187 | for rrr in np.arange(nr_of_runs): 188 | 189 | print 'run=',rrr 190 | 191 | indi_sexes = 0 192 | while indi_sexes == 0: 193 | sexes = np.random.randint(m, size=n) 194 | 195 | elem_per_sex = np.zeros(m, dtype=int) 196 | for ell in np.arange(m): 197 | elem_per_sex[ell] = np.sum(sexes == ell) 198 | 199 | if np.sum(elem_per_sex >= req_nr_per_sex) == m: 200 | indi_sexes = 1 201 | 202 | initially_given = np.random.choice(n, size=nr_initially_given, replace=False) 203 | 204 | indi_dmat = 0 205 | while indi_dmat == 0: 206 | dmat = np.random.binomial(1, 2 * np.log(n) / n, (n, n)) * np.random.randint(1, high=100 + 1, 207 | size=(n, n)) + 0.0 208 | dmat = np.triu(dmat, 1) 209 | dmat = dmat + dmat.T 210 | 211 | scipy.sparse.csgraph.floyd_warshall(dmat, directed=False, overwrite=True) 212 | 213 | if not np.any(np.isinf(dmat)): 214 | indi_dmat = 1 215 | 216 | 217 | centers_approx = fair_k_center_APPROX(dmat, sexes, req_nr_per_sex, initially_given) 218 | cost_approx = np.amax( 219 | np.amin(dmat[np.ix_(np.hstack((centers_approx, initially_given)), np.arange(n))], axis=0)) 220 | 221 | centers_greedy = k_center_greedy_with_given_centers(dmat, np.sum(req_nr_per_sex), initially_given) 222 | cost_greedy = np.amax( 223 | np.amin(dmat[np.ix_(np.hstack((centers_greedy, initially_given)), np.arange(n))], axis=0)) 224 | 225 | plot_data[rrr, 0] = cost_approx 226 | plot_data[rrr, 1] = cost_greedy 227 | 228 | if m == 2: 229 | plot_data[rrr, 2] = np.abs(np.sum(sexes[centers_greedy] == 0) - np.sum(sexes[centers_greedy] == 1)) 230 | else: 231 | maxdev = 0 232 | for der in np.arange(m): 233 | for das in np.arange(m): 234 | maxdev = np.max([maxdev, np.abs( 235 | np.sum(sexes[centers_greedy] == der) - np.sum(sexes[centers_greedy] == das))]) 236 | plot_data[rrr, 2] = maxdev 237 | 238 | 239 | 240 | if m == 2: 241 | data = [plot_data[:, ccc] for ccc in np.arange(2)] 242 | 243 | fig = plt.figure() 244 | st = fig.suptitle('Simulated data, |S|=|S$_1$|+|S$_2$|=' + str(n), fontsize=16) 245 | 246 | ax1 = fig.add_subplot(121) 247 | ax1.boxplot(data) 248 | plt.xticks([1, 2], ['Our Alg.', 'Unfair Greedy'], fontsize=12) 249 | plt.ylabel('Cost', fontsize=14) 250 | 251 | ax2 = fig.add_subplot(122) 252 | ax2.boxplot(plot_data[:, 2]) 253 | plt.xticks([1], ['Unfair Greedy'], fontsize=12) 254 | plt.ylabel('|# centers in S$_1$ - # centers in S$_2$|', fontsize=14) 255 | 256 | else: 257 | data = [plot_data[:, ccc] for ccc in np.arange(2)] 258 | 259 | fig = plt.figure() 260 | st = fig.suptitle('Simulated data, |S|=|S$_1$|+...+|S$_{10}$|=' + str(n), fontsize=16) 261 | 262 | ax1 = fig.add_subplot(121) 263 | ax1.boxplot(data) 264 | plt.xticks([1, 2], ['Our Alg.', 'Unfair Greedy'], fontsize=12) 265 | plt.ylabel('Cost', fontsize=14) 266 | 267 | ax2 = fig.add_subplot(122) 268 | ax2.boxplot(plot_data[:, 2]) 269 | plt.xticks([1], ['Unfair Greedy'], fontsize=12) 270 | plt.ylabel('max$_{i,j}$ |# centers in S$_i$ - # centers in S$_j$|', fontsize=13) 271 | 272 | fig.tight_layout() 273 | st.set_y(0.95) 274 | fig.subplots_adjust(top=0.85) 275 | fig.savefig('exp_comparison_greedy_strategy_artificial_data.pdf', bbox_inches='tight') 276 | plt.close() 277 | 278 | 279 | 280 | 281 | 282 | if __name__ == "__main__": 283 | if len(sys.argv)>1: 284 | number_of_runs=int(sys.argv[1]) 285 | else: 286 | number_of_runs=10 287 | 288 | exp_approx_factor_artificial_data(number_of_runs) 289 | exp_comparison_heuristics_artificial_data(number_of_runs) 290 | exp_comparison_greedy_strategy_artificial_data(number_of_runs) 291 | -------------------------------------------------------------------------------- /experiments_matroid_center.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse.csgraph 3 | import matplotlib.pyplot as plt 4 | import sys 5 | import time 6 | from algorithms import * 7 | from algorithm_matroid_center import * 8 | 9 | 10 | ############################################################### 11 | ## Experiment shown in the left plot of Figure 3 12 | ############################################################### 13 | def exp_compare_to_optimal_solution(nr_of_runs): 14 | 15 | print '-------------------------------------------' 16 | print 'exp_compare_to_optimal_solution' 17 | print '-------------------------------------------' 18 | 19 | setting_list=[[25, 2, 2, [2,2]], 20 | [25, 2, 2, [4,2]], 21 | [25, 3, 2, [2,2,2]], 22 | [25, 3, 1, [5, 1, 1]], 23 | [25, 4, 0, [2, 2, 2,2]], 24 | [25, 4, 0, [3, 3, 1, 1]], 25 | [25, 5, 0, [2, 2, 2, 1,1]]] 26 | 27 | 28 | pl_data=np.zeros((nr_of_runs,len(setting_list))) 29 | pl_data_MATROID = np.zeros((nr_of_runs, len(setting_list))) 30 | pl_time = np.zeros((nr_of_runs, len(setting_list))) 31 | pl_time_MATROID = np.zeros((nr_of_runs, len(setting_list))) 32 | settings_as_vec=np.array([]) 33 | 34 | 35 | for tr,sl in enumerate(setting_list): 36 | 37 | n=sl[0] 38 | m=sl[1] 39 | nr_initially_given=sl[2] 40 | req_nr_per_sex=np.array(sl[3]) 41 | 42 | print '' 43 | print 'n='+str(n)+', m='+str(m)+', (k_{S_1},...,k_{S_m})='+str(tuple(req_nr_per_sex))+', |C_0|='+str(nr_initially_given) 44 | 45 | settings_as_vec=np.hstack((settings_as_vec,np.array([n,m,nr_initially_given]),req_nr_per_sex)) 46 | 47 | 48 | for rrr in np.arange(nr_of_runs): 49 | 50 | indi_sexes = 0 51 | while indi_sexes == 0: 52 | sexes = np.random.randint(m, size=n) 53 | 54 | elem_per_sex = np.zeros(m, dtype=int) 55 | for ell in np.arange(m): 56 | elem_per_sex[ell] = np.sum(sexes == ell) 57 | 58 | if np.sum(elem_per_sex >= req_nr_per_sex) == m: 59 | indi_sexes = 1 60 | 61 | initially_given = np.random.choice(n, size=nr_initially_given, replace=False) 62 | 63 | indi_dmat = 0 64 | while indi_dmat == 0: 65 | dmat = np.random.binomial(1, 2 * np.log(n) / n, (n, n)) * np.random.randint(1, high=100 + 1, 66 | size=(n, n)) + 0.0 67 | dmat = np.triu(dmat, 1) 68 | dmat = dmat + dmat.T 69 | scipy.sparse.csgraph.floyd_warshall(dmat, directed=False, overwrite=True) 70 | if not np.any(np.isinf(dmat)): 71 | indi_dmat = 1 72 | 73 | 74 | start = time.time() 75 | centers_approx = fair_k_center_APPROX(dmat, sexes, req_nr_per_sex, initially_given) 76 | cost_approx = np.amax(np.amin(dmat[np.ix_(np.hstack((centers_approx,initially_given)), np.arange(n))], axis=0)) 77 | end = time.time() 78 | pl_time[rrr,tr] = end - start 79 | 80 | start = time.time() 81 | centers_approx_MATROID = MatCenter_binary_search_WithGivenCenters(dmat, sexes, req_nr_per_sex, initially_given) 82 | cost_approx_MATROID = np.amax(np.amin(dmat[np.ix_(np.hstack((centers_approx_MATROID, initially_given)), np.arange(n))], axis=0)) 83 | end = time.time() 84 | pl_time_MATROID[rrr, tr] = end - start 85 | 86 | centers_exact, cl_exact, cost_exact = fair_k_center_exact(dmat, sexes, req_nr_per_sex,initially_given) 87 | 88 | 89 | fac = cost_approx / cost_exact 90 | pl_data[rrr,tr]=fac 91 | fac_MATROID = cost_approx_MATROID / cost_exact 92 | pl_data_MATROID[rrr, tr] = fac_MATROID 93 | print 'Approximation factor Alg. 4='+str(fac)+' ---- Approximation factor M.C.='+str(fac_MATROID) 94 | 95 | 96 | 97 | data=[] 98 | XT=[] 99 | for tr in np.arange(len(setting_list)): 100 | data.append(pl_data[:,tr]) 101 | data.append(pl_data_MATROID[:, tr]) 102 | XT.append('Alg. 4') 103 | XT.append('M.C.') 104 | fig, ax = plt.subplots(figsize=(12,4.5)) 105 | ax.set_title('Simulated data with computable optimal solution, |S|='+str(n),fontsize=16) 106 | ax.boxplot(data) 107 | fig.tight_layout() 108 | plt.xticks([ggg for ggg in (1 + np.arange(2*len(setting_list)))],XT,fontsize=12) 109 | plt.ylabel('Approximation factor',fontsize=14) 110 | ylim = ax.get_ylim() 111 | new_ylim = (0.95,ylim[1]) 112 | ax.set_ylim(new_ylim) 113 | fig.savefig('exp_compare_to_optimal_solution_APPROXFACTOR.pdf',bbox_inches='tight') 114 | plt.close() 115 | 116 | 117 | data = [] 118 | XT = [] 119 | for tr in np.arange(len(setting_list)): 120 | data.append(pl_time[:, tr]) 121 | data.append(pl_time_MATROID[:, tr]) 122 | XT.append('Alg. 4') 123 | XT.append('M.C.') 124 | fig, ax = plt.subplots(figsize=(12, 4.5)) 125 | ax.set_title('Simulated data with computable optimal solution, |S|=' + str(n), fontsize=16) 126 | ax.boxplot(data) 127 | fig.tight_layout() 128 | plt.xticks([ggg for ggg in (1 + np.arange(2 * len(setting_list)))], XT, fontsize=12) 129 | plt.ylabel('Running time [s]', fontsize=14) 130 | fig.savefig('exp_compare_to_optimal_solution_TIME.pdf', bbox_inches='tight') 131 | plt.close() 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | ############################################################### 140 | ## Experiment shown in the right plot of Figure 3 and in 141 | ## Figure 9 in Appendix B 142 | ############################################################### 143 | def exp_compare_to_each_other(nr_of_runs): 144 | 145 | print '-------------------------------------------' 146 | print 'exp_compare_to_each_other' 147 | print '-------------------------------------------' 148 | 149 | setting_list=[[50, 5, 0, [4,4,4,4,4]], 150 | [100, 5, 0, [4,4,4,4,4]], 151 | [150, 5, 0, [4, 4, 4, 4, 4]], 152 | [200, 5, 0, [4, 4, 4, 4, 4]], 153 | [250, 5, 0, [4, 4, 4, 4, 4]]] 154 | 155 | 156 | n_list=np.array([]) 157 | 158 | pl_data=np.zeros((nr_of_runs,len(setting_list))) 159 | pl_data_MATROID = np.zeros((nr_of_runs, len(setting_list))) 160 | pl_time = np.zeros((nr_of_runs, len(setting_list))) 161 | pl_time_MATROID = np.zeros((nr_of_runs, len(setting_list))) 162 | settings_as_vec=np.array([]) 163 | 164 | 165 | for tr,sl in enumerate(setting_list): 166 | 167 | n=sl[0] 168 | n_list=np.hstack((n_list,n)) 169 | m=sl[1] 170 | nr_initially_given=sl[2] 171 | req_nr_per_sex=np.array(sl[3]) 172 | 173 | print '' 174 | print 'n='+str(n) 175 | 176 | settings_as_vec=np.hstack((settings_as_vec,np.array([n,m,nr_initially_given]),req_nr_per_sex)) 177 | 178 | 179 | for rrr in np.arange(nr_of_runs): 180 | 181 | indi_sexes = 0 182 | while indi_sexes==0: 183 | sexes=np.random.randint(m,size=n) 184 | 185 | elem_per_sex = np.zeros(m, dtype=int) 186 | for ell in np.arange(m): 187 | elem_per_sex[ell] = np.sum(sexes == ell) 188 | 189 | if np.sum(elem_per_sex >= req_nr_per_sex) == m: 190 | indi_sexes = 1 191 | 192 | initially_given = np.random.choice(n, size=nr_initially_given,replace=False) 193 | 194 | indi_dmat=0 195 | while indi_dmat==0: 196 | dmat=np.random.binomial(1,2*np.log(n)/n,(n,n))*np.random.randint(1,high=100+1,size=(n,n))+0.0 197 | dmat=np.triu(dmat,1) 198 | dmat=dmat+dmat.T 199 | scipy.sparse.csgraph.floyd_warshall(dmat,directed=False,overwrite=True) 200 | if not np.any(np.isinf(dmat)): 201 | indi_dmat=1 202 | 203 | 204 | start = time.time() 205 | centers_approx = fair_k_center_APPROX(dmat, sexes, req_nr_per_sex, initially_given) 206 | cost_approx = np.amax(np.amin(dmat[np.ix_(np.hstack((centers_approx,initially_given)), np.arange(n))], axis=0)) 207 | end = time.time() 208 | pl_data[rrr, tr] = cost_approx 209 | pl_time[rrr,tr] = end - start 210 | 211 | start = time.time() 212 | centers_approx_MATROID = MatCenter_binary_search_WithGivenCenters(dmat, sexes, req_nr_per_sex, initially_given) 213 | cost_approx_MATROID = np.amax(np.amin(dmat[np.ix_(np.hstack((centers_approx_MATROID, initially_given)), np.arange(n))], axis=0)) 214 | end = time.time() 215 | pl_data_MATROID[rrr, tr] = cost_approx_MATROID 216 | pl_time_MATROID[rrr, tr] = end - start 217 | 218 | print 'Cost Alg. 4='+str(cost_approx)+', Time Alg. 4='+str(pl_time[rrr,tr])+' ---- Cost M.C.='+str(cost_approx_MATROID)+\ 219 | ', Time M.C.='+str(pl_time_MATROID[rrr, tr]) 220 | 221 | 222 | 223 | data=[] 224 | XT=[] 225 | for tr in np.arange(len(setting_list)): 226 | data.append(pl_data[:,tr]) 227 | data.append(pl_data_MATROID[:, tr]) 228 | XT.append('Alg. 4') 229 | XT.append('M.C.') 230 | fig, ax = plt.subplots(figsize=(12,4.5)) 231 | ax.set_title('Simulated data, m=' + str(m) + ', k=' + str(np.sum(req_nr_per_sex)), fontsize=16) 232 | ax.boxplot(data) 233 | fig.tight_layout() 234 | plt.xticks([ggg for ggg in (1 + np.arange(2*len(setting_list)))],XT,fontsize=12) 235 | plt.ylabel('Cost',fontsize=14) 236 | ylim = ax.get_ylim() 237 | new_ylim = (0.95,ylim[1]) 238 | ax.set_ylim(new_ylim) 239 | fig.savefig('exp_compare_to_each_other_COST.pdf',bbox_inches='tight') 240 | plt.close() 241 | 242 | 243 | 244 | plt.figure(figsize=(7.5, 4.5)) 245 | plt.plot(n_list, np.mean(pl_time, 0), label='Alg. 4', marker="x", color='b') 246 | plt.plot(n_list, np.mean(pl_time_MATROID, 0), label='M.C.', marker="x", color='r') 247 | plt.plot(n_list, 248 | (np.mean(pl_time_MATROID, 0)[0] / (((n_list[0]) ** 2) * np.log(n_list[0]))) * (n_list ** 2) * np.log( 249 | n_list), label=r'~ $|S|^2 \cdot \ln(|S|)$', linestyle="--", color='m') 250 | plt.plot(n_list, (np.mean(pl_time_MATROID, 0)[0] / ((n_list[0]) ** 2.5)) * (n_list ** 2.5), 251 | label=r'~ $|S|^{5/2}$', linestyle="--", color='g') 252 | 253 | plt.title('Simulated data, m=' + str(m) + ', k=' + str(np.sum(req_nr_per_sex)), fontsize=16) 254 | plt.legend() 255 | plt.xlabel('|S|', fontsize=14) 256 | plt.ylabel('Running time [s]', fontsize=14) 257 | plt.savefig('exp_compare_to_each_other_TIME.pdf', bbox_inches='tight') 258 | plt.close() 259 | 260 | 261 | 262 | 263 | 264 | 265 | if __name__ == "__main__": 266 | if len(sys.argv)>1: 267 | number_of_runs=int(sys.argv[1]) 268 | else: 269 | number_of_runs=10 270 | 271 | exp_compare_to_optimal_solution(number_of_runs) 272 | exp_compare_to_each_other(number_of_runs) 273 | --------------------------------------------------------------------------------