├── .gitattributes ├── .github └── workflows │ ├── ci.yml │ └── release.yml ├── .gitignore ├── HBAC_scan └── helper_functions.py ├── LICENSE ├── README.md ├── classifiers └── Loan_approval_classifier │ ├── german_dataset.py │ └── helper_functions.py ├── data └── GermanCredit_dataset │ └── german_dataset.py ├── images ├── Header_Github.png └── Quantitative-qualitative.png ├── poetry.lock ├── pyproject.toml ├── tests ├── __init__.py ├── test_bahc.py ├── test_dataset.py └── test_validation.py └── unsupervised_bias_detection ├── __init__.py ├── cluster ├── __init__.py ├── _bahc.py ├── _cluster_node.py ├── _kmeans.py └── _kmodes.py └── utils ├── __init__.py ├── _get_column_dtypes.py ├── dataset.py └── validation.py /.gitattributes: -------------------------------------------------------------------------------- 1 | /Users/jurriaanparie/Downloads/Documents/AI_Trustworthy_Fairness_Training/Medium_blog/FairerML/AI_Audit_Challenge/data/BERT_fake_news/best_model.pth filter=lfs diff=lfs merge=lfs -text 2 | classification_models/BERT_disinformation_classifier/best_model.pth filter=lfs diff=lfs merge=lfs -text 3 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Continuous Integration Workflow 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | pull_request: 7 | 8 | jobs: 9 | main: 10 | runs-on: ubuntu-latest 11 | strategy: 12 | matrix: 13 | python-version: ["3.11"] 14 | steps: 15 | - name: Checkout repository 16 | uses: actions/checkout@v4 17 | 18 | - name: Install poetry 19 | run: | 20 | pipx install poetry 21 | poetry config virtualenvs.path .virtualenvs 22 | 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | cache: poetry 28 | cache-dependency-path: poetry.lock 29 | 30 | - name: Set poetry environment 31 | run: poetry env use ${{ matrix.python-version }} 32 | 33 | - name: Install dependencies 34 | run: poetry install --no-root --no-interaction 35 | 36 | - name: Lint 37 | run: poetry run ruff check unsupervised_bias_detection 38 | 39 | - name: Test 40 | run: poetry run pytest 41 | --color=yes 42 | --full-trace 43 | --showlocals 44 | --verbose 45 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release Workflow 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | main: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Checkout repository 12 | uses: actions/checkout@v4 13 | 14 | - name: Install poetry 15 | run: | 16 | pipx install poetry 17 | # poetry config virtualenvs.path .virtualenvs 18 | 19 | - name: Set up Python 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: "3.11" 23 | 24 | - name: Publish package to PyPI 25 | env: 26 | PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} 27 | run: poetry publish --build --username "__token__" --password $PYPI_TOKEN -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ~$as_scan_tool_report.docx 2 | *.ipynb_checkpoints 3 | *.DS_Store 4 | *__pycache__ 5 | -------------------------------------------------------------------------------- /HBAC_scan/helper_functions.py: 
-------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import pandas as pd 4 | import seaborn as sns 5 | import pingouin as pg 6 | import scipy.stats as stats 7 | 8 | # matplotlib 9 | import matplotlib.pyplot as plt 10 | from sklearn.cluster import KMeans 11 | from matplotlib.lines import Line2D 12 | from matplotlib import collections as mc 13 | 14 | # sklearn 15 | from sklearn.decomposition import PCA 16 | from sklearn.preprocessing import StandardScaler 17 | 18 | 19 | def init_GermanCredit_dataset( 20 | raw_data, 21 | features, 22 | with_errors=True, 23 | just_features=True, 24 | scale_features=True, 25 | with_classes=True, 26 | ): 27 | """Initializing dataset: scaling features, adding new columns which are required for HBAC""" 28 | 29 | new_data = raw_data.copy(deep=True) 30 | 31 | to_scale = new_data.drop( 32 | ["predicted_class", "true_class", "errors", "FP_errors", "FN_errors"], axis=1 33 | ).columns 34 | new_data[to_scale] = StandardScaler().fit_transform(features[to_scale]) 35 | 36 | new_data["clusters"] = 0 37 | new_data["new_clusters"] = -1 38 | return new_data 39 | 40 | 41 | def init_dataset(raw_data, features): 42 | """Initializing dataset: scaling features, adding new columns which are required for HBAC""" 43 | 44 | # copy dataframe 45 | new_data = raw_data.copy(deep=True) 46 | 47 | # only scale features 48 | to_scale = new_data.drop( 49 | ["predicted_class", "true_class", "errors", "FP_errors", "FN_errors"], axis=1 50 | ).columns 51 | new_data[to_scale] = StandardScaler().fit_transform(features[to_scale]) 52 | 53 | # initialize clustering parameters 54 | new_data["clusters"] = 0 55 | new_data["new_clusters"] = -1 56 | 57 | return new_data 58 | 59 | 60 | def bias(results, metric): 61 | """Return accuracy, FP rate or FN rate of dataframe""" 62 | 63 | if metric == "Accuracy": 64 | correct = results.loc[results["errors"] == 0] 65 | acc = len(correct) / len(results) 66 | return acc 67 | if metric == "FP": 68 | FPs = results.loc[ 69 | (results["predicted_class"] == 1) & (results["true_class"] == 0) 70 | ] 71 | Ns = results.loc[(results["true_class"] == 0)] 72 | if Ns.shape[0] != 0: 73 | FP_rate = len(FPs) / len(Ns) 74 | return 1 - FP_rate 75 | else: 76 | return 1 77 | if metric == "FN": 78 | FNs = results.loc[ 79 | (results["predicted_class"] == 0) & (results["true_class"] == 1) 80 | ] 81 | Ps = results.loc[(results["true_class"] == 1)] 82 | if Ps.shape[0] != 0: 83 | FN_rate = len(FNs) / len(Ps) 84 | return 1 - FN_rate 85 | else: 86 | return 1 87 | 88 | 89 | def bias_acc(data, metric, cluster_id, cluster_col): 90 | """Bias := bias metric of the selected cluster - bias metric of the remaining clusters""" 91 | cluster_x = data.loc[data[cluster_col] == cluster_id] 92 | if len(cluster_x) == 0: 93 | print("This is an empty cluster", cluster_id) 94 | remaining_clusters = data.loc[data[cluster_col] != cluster_id] 95 | if len(remaining_clusters) == 0: 96 | print("This cluster is the entire dataset", cluster_id) 97 | return bias(cluster_x, metric) - bias(remaining_clusters, metric) 98 | 99 | 100 | def get_max_bias(fulldata, metric, function=bias_acc): 101 | """Calculates the highest negative bias of the newly introduced clusters""" 102 | max_bias = -999999 103 | for cluster_number in fulldata["new_clusters"].unique(): 104 | current_bias = function(fulldata, metric, cluster_number, "new_clusters") 105 | if current_bias < max_bias: 106 | print("current bias: ", current_bias) 107 | print("max abs bias: ", max_bias) 108 | 
max_bias = current_bias 109 | return max_bias 110 | 111 | 112 | def get_max_bias_cluster(fulldata, metric, function=bias_acc): 113 | """Identifies cluster linked to the highest bias of the newly introduced clusters""" 114 | max_bias = 100 115 | min_bias = -100 116 | best_cluster = -2 117 | for cluster_number in fulldata["clusters"].unique(): 118 | current_bias = function(fulldata, metric, cluster_number, "clusters") 119 | print(f"{cluster_number} has bias {current_bias}") 120 | 121 | # Accuracy 122 | if metric == "Accuracy": 123 | if current_bias < max_bias: 124 | max_bias = current_bias 125 | best_cluster = cluster_number 126 | 127 | # FP/FN 128 | if metric == "FP" or metric == "FN": 129 | if current_bias > min_bias: 130 | min_bias = current_bias 131 | best_cluster = cluster_number 132 | 133 | return best_cluster 134 | 135 | 136 | def get_min_cluster_size(data): 137 | """Size of smallest new cluster""" 138 | min_cluster_size = len(data) 139 | for i in data["new_clusters"].unique(): 140 | # exclude the cluster -1 from being seen as a cluster, since it contains outliers 141 | if i == -1: 142 | continue 143 | size = len(data.loc[data["new_clusters"] == i]) 144 | if size < min_cluster_size: 145 | min_cluster_size = size 146 | return min_cluster_size 147 | 148 | 149 | def get_next_cluster(data, metric): 150 | """Identifies cluster number with the highest variance. The variance is calculated based on the error metric of each cluster. The cluster with the highest variance will be selected as splitting cluster""" 151 | n_cluster = max(data["clusters"]) 152 | highest_variance = -1 153 | cluster_number = 0 154 | 155 | for i in data["clusters"].unique(): 156 | if i == -1: 157 | continue 158 | cluster_i = data.loc[data["clusters"] == i] 159 | if metric == "Accuracy": 160 | variance_cluster = np.var(cluster_i["errors"]) 161 | if metric == "FP": 162 | variance_cluster = np.var(cluster_i["FP_errors"]) 163 | if metric == "FN": 164 | variance_cluster = np.var(cluster_i["FN_errors"]) 165 | 166 | if variance_cluster > highest_variance: 167 | highest_variance = variance_cluster 168 | cluster_number = i 169 | 170 | return cluster_number 171 | 172 | 173 | def calculate_variance(data, metric): 174 | """Determines variance for a dataframe.""" 175 | variance_list_local = [] 176 | for j in data["clusters"].unique(): 177 | average_bias = bias(data, metric) 178 | bias_clus = bias_acc(data, metric, j, "clusters") 179 | variance_list_local.append(bias_clus) 180 | variance = np.var(variance_list_local) 181 | return variance 182 | 183 | 184 | def get_random_cluster(clusters): 185 | """Identifies value of a random cluster""" 186 | result = -1 187 | while result == -1: 188 | result = random.randint(0, len(clusters.unique())) 189 | return result 190 | 191 | 192 | def HBAC_bias_scan( 193 | df, metric, split_cluster_size, acc_cluster_size, clustering_paramaters 194 | ): 195 | iterations_max = 20 196 | x = 0 # initial cluster number 197 | initial_bias = 0 198 | variance_list = [] 199 | average_bias = bias(df, metric) 200 | minimal_splittable_cluster_size = split_cluster_size 201 | minimal_acceptable_cluster_size = acc_cluster_size 202 | print(f"bias {metric} is: ", average_bias) 203 | 204 | for i in range(1, iterations_max): 205 | if i != 1: 206 | 207 | # calculate variance for cluster 208 | variance_list.append(calculate_variance(df, metric)) 209 | 210 | df["new_clusters"] = -1 211 | candidate_cluster = df.loc[df["clusters"] == x] 212 | 213 | if len(candidate_cluster) < minimal_splittable_cluster_size: 214 | x = 
get_random_cluster(df["clusters"]) 215 | continue 216 | 217 | # k-means clustering 218 | kmeans_algo = KMeans(**clustering_paramaters).fit( 219 | candidate_cluster.drop( 220 | [ 221 | "clusters", 222 | "new_clusters", 223 | "predicted_class", 224 | "true_class", 225 | "errors", 226 | "FP_errors", 227 | "FN_errors", 228 | ], 229 | axis=1, 230 | ) 231 | ) 232 | 233 | candidate_cluster["new_clusters"] = pd.DataFrame( 234 | kmeans_algo.predict( 235 | candidate_cluster.drop( 236 | [ 237 | "clusters", 238 | "new_clusters", 239 | "predicted_class", 240 | "true_class", 241 | "errors", 242 | "FP_errors", 243 | "FN_errors", 244 | ], 245 | axis=1, 246 | ) 247 | ), 248 | index=candidate_cluster.index, 249 | ) 250 | df["new_clusters"] = candidate_cluster["new_clusters"].combine_first( 251 | df["new_clusters"] 252 | ) 253 | 254 | # find discriminated clusters 255 | max_bias = get_max_bias(df, metric) 256 | min_new_size = get_min_cluster_size(df) 257 | 258 | if (max_bias <= initial_bias) & ( 259 | min_new_size > minimal_acceptable_cluster_size 260 | ): 261 | # Add new cluster 262 | n_cluster = max(df["clusters"]) 263 | df["clusters"][df["new_clusters"] == 1] = n_cluster + 1 264 | 265 | x = get_next_cluster(df, metric) 266 | initial_bias = max_bias 267 | else: 268 | x = get_random_cluster(df["clusters"]) 269 | 270 | print("done") 271 | return df 272 | 273 | 274 | def stat_df(df, discriminated_cluster, not_discriminated): 275 | 276 | # finding difference 277 | difference = (discriminated_cluster.mean()) - (not_discriminated.mean()) 278 | diff_dict = difference.to_dict() 279 | 280 | # unscaling the discriminated cluster 281 | unscaled_discriminated = df.loc[discriminated_cluster.index, :] 282 | 283 | # unscaled other data 284 | unscaled_remaining = df.drop(discriminated_cluster.index) 285 | 286 | # statistical testing 287 | welch_dict = {} 288 | CI_dict_left = {} 289 | CI_dict_right = {} 290 | 291 | features = [ 292 | col 293 | for col in df.columns.tolist() 294 | if col 295 | not in [ 296 | "tweet_id1", 297 | "scaled_errors", 298 | "predicted_class", 299 | "true_class", 300 | "errors", 301 | "FP_errors", 302 | "FN_errors", 303 | "clusters", 304 | "new_clusters", 305 | ] 306 | ] 307 | 308 | for i in features: 309 | welch_i = stats.ttest_ind( 310 | unscaled_discriminated[i], unscaled_remaining[i], equal_var=False 311 | ) 312 | res = pg.ttest(unscaled_discriminated[i], unscaled_remaining[i], paired=False) 313 | 314 | # attach to dictionary 315 | welch_dict[i] = welch_i.pvalue 316 | CI_dict_left[i] = res["CI95%"][0][0] 317 | CI_dict_right[i] = res["CI95%"][0][1] 318 | 319 | # store results in dataframe 320 | pd.set_option("display.float_format", lambda x: "%.5f" % x) 321 | cluster_analysis_df = pd.DataFrame( 322 | [diff_dict, welch_dict, CI_dict_left, CI_dict_right] 323 | ).T 324 | cluster_analysis_df.columns = ["difference", "p-value", "[0.025", "0.975]"] 325 | cluster_analysis_df = cluster_analysis_df.sort_values("p-value", ascending=[True]) 326 | n_rows = cluster_analysis_df.shape[0] 327 | 328 | # Get errors; (coef - lower bound of conf interval) 329 | cluster_analysis_df["errors"] = ( 330 | cluster_analysis_df["difference"] - cluster_analysis_df["[0.025"] 331 | ) 332 | cluster_analysis_df = cluster_analysis_df.iloc[0:n_rows,] 333 | cluster_analysis_df["num"] = [int(i) for i in np.linspace(n_rows - 1, 0, n_rows)] 334 | 335 | cluster_analysis_df = cluster_analysis_df.reset_index() 336 | 337 | return cluster_analysis_df 338 | 339 | 340 | def CI_plot(df, x_lim, feat_ls): 341 | """ 342 | Takes in results of 
Welch's t-test and returns a plot of 343 | the coefficients with 95% confidence intervals. 344 | """ 345 | n_rows = df.shape[0] 346 | 347 | # line segments 348 | lines_sign = [] 349 | lines_non_sign = [] 350 | index_ls = [] 351 | i = n_rows 352 | for feat in feat_ls: 353 | k = df[df["index"] == feat].index[0] 354 | p_value = df.iloc[k, 2] 355 | if p_value <= 0.05: 356 | sub_ls_sign = [] 357 | sub_ls_sign.append((df.iloc[k, 3], i)) 358 | sub_ls_sign.append((df.iloc[k, 4], i)) 359 | lines_sign.append(sub_ls_sign) 360 | index_ls.append((i, k)) 361 | i -= 1 362 | else: 363 | sub_ls_non_sign = [] 364 | sub_ls_non_sign.append((df.iloc[k, 3], i)) 365 | sub_ls_non_sign.append((df.iloc[k, 4], i)) 366 | lines_non_sign.append(sub_ls_non_sign) 367 | index_ls.append((i, k)) 368 | i -= 1 369 | 370 | fig, ax = plt.subplots(figsize=(10, 7)) 371 | 372 | # Line to define zero on the x-axis 373 | ax.axvline(x=0, linestyle="--", color="black", linewidth=1) 374 | 375 | # line segments significant 376 | lc = mc.LineCollection(lines_sign, colors="steelblue", linewidths=10, alpha=0.75) 377 | ax.add_collection(lc) 378 | ax.autoscale() 379 | 380 | # line segments non-significant 381 | lc = mc.LineCollection( 382 | lines_non_sign, colors="steelblue", linewidths=10, alpha=0.25 383 | ) 384 | ax.add_collection(lc) 385 | ax.autoscale() 386 | 387 | # title and axes 388 | plt.title("Cluster difference 95% confidence interval", fontsize=24) 389 | 390 | # font size axes 391 | ax.tick_params(axis="both", which="major", labelsize=16) 392 | 393 | # x-axis 394 | ax.set_xlabel("Difference in means", fontsize=22) 395 | ax.set_xlim(x_lim) 396 | xlims = ax.get_xlim() 397 | 398 | # annotate x-axis 399 | ax.annotate( 400 | "Cluster mean lower than\nrest of (standardized) dataset", 401 | xy=(xlims[0], -0.1), 402 | xytext=(xlims[0], -0.5), 403 | ha="center", 404 | annotation_clip=False, 405 | fontsize=14, 406 | style="italic", 407 | ) 408 | ax.annotate( 409 | "Cluster mean higher than\nrest of (standardized) dataset", 410 | xy=(xlims[1], -0.1), 411 | xytext=(xlims[1], -0.5), 412 | ha="center", 413 | annotation_clip=False, 414 | fontsize=14, 415 | style="italic", 416 | ) 417 | 418 | # y-axis 419 | columns = feat_ls 420 | ax.set_yticklabels([""] + columns[::-1]) 421 | 422 | # scatter plot 423 | idx_ls = [i for (i, k) in index_ls] 424 | scatter_ls = [df.iloc[k, 1] for (i, k) in index_ls] 425 | ax.scatter( 426 | y=idx_ls, 427 | marker="o", 428 | s=250, 429 | edgecolors="none", 430 | linewidth=2, 431 | x=scatter_ls, 432 | color="steelblue", 433 | ) 434 | 435 | # legend 436 | legend_elements = [ 437 | Line2D([0], [0], color="steelblue", alpha=0.75, lw=10, label="Significant"), 438 | Line2D([0], [0], color="steelblue", alpha=0.25, lw=10, label="Not significant"), 439 | ] 440 | ax.legend(handles=legend_elements, loc="best", fontsize=16) 441 | 442 | return plt.show() 443 | 444 | 445 | def pca_plot(data): 446 | """PCA dimensionality reduction to display identified clusters as scatterplot.""" 447 | 448 | pca_features = data.drop( 449 | [ 450 | "predicted_class", 451 | "true_class", 452 | "errors", 453 | "FP_errors", 454 | "FN_errors", 455 | "clusters", 456 | "new_clusters", 457 | ], 458 | axis=1, 459 | ) 460 | other_features = data[ 461 | [ 462 | "predicted_class", 463 | "true_class", 464 | "errors", 465 | "FP_errors", 466 | "FN_errors", 467 | "clusters", 468 | "new_clusters", 469 | ] 470 | ] 471 | 472 | df = pd.DataFrame(pca_features) 473 | pca = pd.DataFrame(PCA(n_components=2).fit_transform(df), index=df.index) 474 | temp_dataset = 
pca.join(other_features, how="left") 475 | temp_dataset.rename(columns={0: "PCA - 1st"}, inplace=True) 476 | temp_dataset.rename(columns={1: "PCA - 2nd"}, inplace=True) 477 | 478 | scatterplot = sns.scatterplot( 479 | data=temp_dataset, 480 | x="PCA - 1st", 481 | y="PCA - 2nd", 482 | hue="clusters", 483 | size="errors", 484 | sizes=(150, 30), 485 | palette="Set1", 486 | ) 487 | scatterplot.set_title("HBAC bias scan (k-means) on AI classifier") 488 | lgd = scatterplot.legend(loc="center left", bbox_to_anchor=(1.0, 0.5), ncol=1) 489 | plt.show() 490 | 491 | 492 | # plt.savefig('./test.png', bbox_extra_artists=(lgd,), bbox_inches='tight') 493 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | EUROPEAN UNION PUBLIC LICENCE v. 1.2 EUPL © the European Union 2007, 2016 2 | 3 | This European Union Public Licence (the ‘EUPL’) applies to the Work (as defined below) which is provided under the terms of this Licence. Any use of the Work, other than as authorised under this Licence is prohibited (to the extent such use is covered by a right of the copyright holder of the Work). The Work is provided under the terms of this Licence when the Licensor (as defined below) has placed the following notice immediately following the copyright notice for the Work: Licensed under the EUPL or has expressed by any other means his willingness to license under the EUPL. 4 | 5 | 1.Definitions In this Licence, the following terms have the following meaning: — ‘The Licence’:this Licence. — ‘The Original Work’:the work or software distributed or communicated by the Licensor under this Licence, available as Source Code and also as Executable Code as the case may be. — ‘Derivative Works’:the works or software that could be created by the Licensee, based upon the Original Work or modifications thereof. This Licence does not define the extent of modification or dependence on the Original Work required in order to classify a work as a Derivative Work; this extent is determined by copyright law applicable in the country mentioned in Article 15. — ‘The Work’:the Original Work or its Derivative Works. — ‘The Source Code’:the human-readable form of the Work which is the most convenient for people to study and modify. — ‘The Executable Code’:any code which has generally been compiled and which is meant to be interpreted by a computer as a program. — ‘The Licensor’:the natural or legal person that distributes or communicates the Work under the Licence. — ‘Contributor(s)’:any natural or legal person who modifies the Work under the Licence, or otherwise contributes to the creation of a Derivative Work. — ‘The Licensee’ or ‘You’:any natural or legal person who makes any usage of the Work under the terms of the Licence. — ‘Distribution’ or ‘Communication’:any act of selling, giving, lending, renting, distributing, communicating, transmitting, or otherwise making available, online or offline, copies of the Work or providing access to its essential functionalities at the disposal of any other natural or legal person. 
6 | 7 | 2.Scope of the rights granted by the Licence The Licensor hereby grants You a worldwide, royalty-free, non-exclusive, sublicensable licence to do the following, for the duration of copyright vested in the Original Work: — use the Work in any circumstance and for all usage, — reproduce the Work, — modify the Work, and make Derivative Works based upon the Work, — communicate to the public, including the right to make available or display the Work or copies thereof to the public and perform publicly, as the case may be, the Work, — distribute the Work or copies thereof, — lend and rent the Work or copies thereof, — sublicense rights in the Work or copies thereof. Those rights can be exercised on any media, supports and formats, whether now known or later invented, as far as the applicable law permits so. In the countries where moral rights apply, the Licensor waives his right to exercise his moral right to the extent allowed by law in order to make effective the licence of the economic rights here above listed. The Licensor grants to the Licensee royalty-free, non-exclusive usage rights to any patents held by the Licensor, to the extent necessary to make use of the rights granted on the Work under this Licence. 8 | 9 | 3.Communication of the Source Code The Licensor may provide the Work either in its Source Code form, or as Executable Code. If the Work is provided as Executable Code, the Licensor provides in addition a machine-readable copy of the Source Code of the Work along with each copy of the Work that the Licensor distributes or indicates, in a notice following the copyright notice attached to the Work, a repository where the Source Code is easily and freely accessible for as long as the Licensor continues to distribute or communicate the Work. 10 | 11 | 4.Limitations on copyright Nothing in this Licence is intended to deprive the Licensee of the benefits from any exception or limitation to the exclusive rights of the rights owners in the Work, of the exhaustion of those rights or of other applicable limitations thereto. 12 | 13 | 5.Obligations of the Licensee The grant of the rights mentioned above is subject to some restrictions and obligations imposed on the Licensee. Those obligations are the following: 14 | 15 | Attribution right: The Licensee shall keep intact all copyright, patent or trademarks notices and all notices that refer to the Licence and to the disclaimer of warranties. The Licensee must include a copy of such notices and a copy of the Licence with every copy of the Work he/she distributes or communicates. The Licensee must cause any Derivative Work to carry prominent notices stating that the Work has been modified and the date of modification. 16 | 17 | Copyleft clause: If the Licensee distributes or communicates copies of the Original Works or Derivative Works, this Distribution or Communication will be done under the terms of this Licence or of a later version of this Licence unless the Original Work is expressly distributed only under this version of the Licence — for example by communicating ‘EUPL v. 1.2 only’. The Licensee (becoming Licensor) cannot offer or impose any additional terms or conditions on the Work or Derivative Work that alter or restrict the terms of the Licence. 18 | 19 | Compatibility clause: If the Licensee Distributes or Communicates Derivative Works or copies thereof based upon both the Work and another work licensed under a Compatible Licence, this Distribution or Communication can be done under the terms of this Compatible Licence. 
For the sake of this clause, ‘Compatible Licence’ refers to the licences listed in the appendix attached to this Licence. Should the Licensee's obligations under the Compatible Licence conflict with his/her obligations under this Licence, the obligations of the Compatible Licence shall prevail. 20 | 21 | Provision of Source Code: When distributing or communicating copies of the Work, the Licensee will provide a machine-readable copy of the Source Code or indicate a repository where this Source will be easily and freely available for as long as the Licensee continues to distribute or communicate the Work. Legal Protection: This Licence does not grant permission to use the trade names, trademarks, service marks, or names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the copyright notice. 22 | 23 | 6.Chain of Authorship The original Licensor warrants that the copyright in the Original Work granted hereunder is owned by him/her or licensed to him/her and that he/she has the power and authority to grant the Licence. Each Contributor warrants that the copyright in the modifications he/she brings to the Work are owned by him/her or licensed to him/her and that he/she has the power and authority to grant the Licence. Each time You accept the Licence, the original Licensor and subsequent Contributors grant You a licence to their contributions to the Work, under the terms of this Licence. 24 | 25 | 7.Disclaimer of Warranty The Work is a work in progress, which is continuously improved by numerous Contributors. It is not a finished work and may therefore contain defects or ‘bugs’ inherent to this type of development. For the above reason, the Work is provided under the Licence on an ‘as is’ basis and without warranties of any kind concerning the Work, including without limitation merchantability, fitness for a particular purpose, absence of defects or errors, accuracy, non-infringement of intellectual property rights other than copyright as stated in Article 6 of this Licence. This disclaimer of warranty is an essential part of the Licence and a condition for the grant of any rights to the Work. 26 | 27 | 8.Disclaimer of Liability Except in the cases of wilful misconduct or damages directly caused to natural persons, the Licensor will in no event be liable for any direct or indirect, material or moral, damages of any kind, arising out of the Licence or of the use of the Work, including without limitation, damages for loss of goodwill, work stoppage, computer failure or malfunction, loss of data or any commercial damage, even if the Licensor has been advised of the possibility of such damage. However, the Licensor will be liable under statutory product liability laws as far such laws apply to the Work. 28 | 29 | 9.Additional agreements While distributing the Work, You may choose to conclude an additional agreement, defining obligations or services consistent with this Licence. However, if accepting obligations, You may act only on your own behalf and on your sole responsibility, not on behalf of the original Licensor or any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against such Contributor by the fact You have accepted any warranty or additional liability. 
30 | 31 | 10.Acceptance of the Licence The provisions of this Licence can be accepted by clicking on an icon ‘I agree’ placed under the bottom of a window displaying the text of this Licence or by affirming consent in any other similar way, in accordance with the rules of applicable law. Clicking on that icon indicates your clear and irrevocable acceptance of this Licence and all of its terms and conditions. Similarly, you irrevocably accept this Licence and all of its terms and conditions by exercising any rights granted to You by Article 2 of this Licence, such as the use of the Work, the creation by You of a Derivative Work or the Distribution or Communication by You of the Work or copies thereof. 32 | 33 | 11.Information to the public In case of any Distribution or Communication of the Work by means of electronic communication by You (for example, by offering to download the Work from a remote location) the distribution channel or media (for example, a website) must at least provide to the public the information requested by the applicable law regarding the Licensor, the Licence and the way it may be accessible, concluded, stored and reproduced by the Licensee. 34 | 35 | 12.Termination of the Licence The Licence and the rights granted hereunder will terminate automatically upon any breach by the Licensee of the terms of the Licence. Such a termination will not terminate the licences of any person who has received the Work from the Licensee under the Licence, provided such persons remain in full compliance with the Licence. 36 | 37 | 13.Miscellaneous Without prejudice of Article 9 above, the Licence represents the complete agreement between the Parties as to the Work. If any provision of the Licence is invalid or unenforceable under applicable law, this will not affect the validity or enforceability of the Licence as a whole. Such provision will be construed or reformed so as necessary to make it valid and enforceable. The European Commission may publish other linguistic versions or new versions of this Licence or updated versions of the Appendix, so far this is required and reasonable, without reducing the scope of the rights granted by the Licence. New versions of the Licence will be published with a unique version number. All linguistic versions of this Licence, approved by the European Commission, have identical value. Parties can take advantage of the linguistic version of their choice. 38 | 39 | 14.Jurisdiction Without prejudice to specific agreement between parties, — any litigation resulting from the interpretation of this License, arising between the European Union institutions, bodies, offices or agencies, as a Licensor, and any Licensee, will be subject to the jurisdiction of the Court of Justice of the European Union, as laid down in article 272 of the Treaty on the Functioning of the European Union, — any litigation arising between other parties and resulting from the interpretation of this License, will be subject to the exclusive jurisdiction of the competent court where the Licensor resides or conducts its primary business. 40 | 41 | 15.Applicable Law Without prejudice to specific agreement between parties, — this Licence shall be governed by the law of the European Union Member State where the Licensor has his seat, resides or has his registered office, — this licence shall be governed by Belgian law if the Licensor has no seat, residence or registered office inside a European Union Member State. 
42 | 43 | Appendix 44 | 45 | ‘Compatible Licences’ according to Article 5 EUPL are: — GNU General Public License (GPL) v. 2, v. 3 — GNU Affero General Public License (AGPL) v. 3 — Open Software License (OSL) v. 2.1, v. 3.0 — Eclipse Public License (EPL) v. 1.0 — CeCILL v. 2.0, v. 2.1 — Mozilla Public Licence (MPL) v. 2 — GNU Lesser General Public Licence (LGPL) v. 2.1, v. 3 — Creative Commons Attribution-ShareAlike v. 3.0 Unported (CC BY-SA 3.0) for works other than software — European Union Public Licence (EUPL) v. 1.1, v. 1.2 — Québec Free and Open-Source Licence — Reciprocity (LiLiQ-R) or Strong Reciprocity (LiLiQ-R+). 46 | 47 | The European Commission may update this Appendix to later versions of the above licences without producing a new version of the EUPL, as long as they provide the rights granted in Article 2 of this Licence and protect the covered Source Code from exclusive appropriation. All other changes or additions to this Appendix require the production of a new EUPL version. 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![image](https://raw.githubusercontent.com/NGO-Algorithm-Audit/unsupervised-bias-detection/refs/heads/master/images/Header_Github.png) 2 | ## Detecting higher-dimensional forms of proxy bias 3 | 4 | 📄 Applied in real-world audit: [audit report](https://algorithmaudit.eu/algoprudence/cases/aa202402_preventing-prejudice_addendum/) 5 | 6 | ☁️ Web app on [Algorithm Audit website](https://algorithmaudit.eu/technical-tools/bdt/#web-app) 7 | 8 | 🧪 Scientific paper: [Arxiv pre-print](https://arxiv.org/pdf/2502.01713) 9 | 10 | ## Key takeaways – Why unsupervised bias detection? 11 | - **Quantitative-qualitative joint method**: Data-driven bias testing combined with the balanced and context-sensitive judgment of human experts; 12 | - **Normative advice commission**: Expert-led, deliberative assessment to establish unfair treatment; 13 | - **Bias scan tool**: Scalable method based on machine learning to detect algorithmic bias; 14 | - **Unsupervised bias detection**: No user data needed on protected attributes; 15 | - **Detects complex bias**: Identifies unfairly treated groups characterized by mixture of features, detects intersectional bias; 16 | - **Model-agnostic**: Works for all binary AI classifiers; 17 | - **Open-source and not-for-profit**: Easy to use and available for the entire AI auditing community. 18 | 19 | 20 | 21 | 22 | | | | 23 | | --- | --- | 24 | | **Code** | [![!pypi](https://img.shields.io/pypi/v/unsupervised-bias-detection?logo=pypi&color=blue)](https://pypi.org/project/unsupervised-bias-detection/) [![!python-versions](https://img.shields.io/pypi/pyversions/aeon?logo=python)](https://www.python.org/) [![license](https://img.shields.io/badge/license-MIT-blue)](https://github.com/NGO-Algorithm-Audit/unsupervised-bias-detection?tab=MIT-1-ov-file#) | 25 | | **Community** | [![!slack](https://img.shields.io/static/v1?logo=slack&label=Slack&message=chat&color=lightgreen)](https://join.slack.com/t/aa-experthub/shared_invite/zt-2n8aqry8z-lWC6XTbqVmb6S2hpkThaqQ) [![!linkedin](https://img.shields.io/static/v1?logo=linkedin&label=LinkedIn&message=news&color=lightblue)](https://www.linkedin.com/company/algorithm-audit/) | 26 | 27 | 28 | ## How this tool fits in our quantitative-qualitative AI auditing framework? 
29 | The Joint Fairness Assessment Method (JFAM), developed by NGO Algorithm Audit, combines data-driven bias testing with the normative and context-sensitive judgment of human experts to determine fair AI on a case-by-case basis. The data-driven component comprises this unsupervised clustering tool (available as a free-to-use [web app](https://algorithmaudit.eu/technical-tools/bdt/#web-app)) that discovers complex and hidden forms of bias. It thereby tackles the difficult problem of detecting proxy discrimination that stems from unforeseen and higher-dimensional forms of bias, including intersectional forms of discrimination. The results of the bias scan tool serve as a starting point for a deliberative assessment by human experts to evaluate potential discrimination and unfairness in an AI system. 30 | 31 | As an example, we applied our bias detection tool to a BERT-based disinformation classifier and distilled a set of pressing questions about its performance and possible biases. We presented these questions to an independent advice commission composed of four academic experts on fair AI and two civil society organizations working on disinformation detection. The advice commission believes there is a low risk of (higher-dimensional) proxy discrimination by the reviewed disinformation classifier. The commission judged that the differences in treatment identified by the quantitative bias scan can be justified, if certain conditions apply. The full advice can be read in our [algoprudence case repository](https://algorithmaudit.eu/algoprudence/cases/aa202301_bert-based-disinformation-classifier/) (ALGO:AA:2023:01). 32 | 33 | Our joint approach to AI auditing is supported by 20+ actors from the international AI auditing community, including journalists, civil society organizations, NGOs, corporate data scientists and academics. In sum, it combines the power of rigorous, machine learning-informed bias testing with the balanced judgment of human experts to determine fair AI in a concrete way. 34 | 35 | [1] The bias scan tool is based on the k-means Hierarchical Bias-Aware Clustering method as described in Bias-Aware Hierarchical Clustering for detecting the discriminated groups of users in recommendation systems, Misztal-Radecka and Indurkhya, _Information Processing and Management_ (2021). [[link]](https://www.sciencedirect.com/science/article/abs/pii/S0306457321000285) Additional research indicates that k-means HBAC, in comparison to other clustering algorithms, works best to detect bias in real-world datasets. 36 | 37 | [2] The uploaded data is instantly deleted from the server after being processed. 38 | 39 | [3] Real-time Rumor Debunking on Twitter, Liu et al., _Proceedings of the 24th ACM International Conference on Information and Knowledge Management_ (2015). 40 | 41 | ## Bias detection tool manual 42 | 43 | The input is a .csv file of max. 1 GB with feature columns and a performance metric column. Note: only the naming of the columns matters, not their order. The dataframe structure displayed in Table 1 can be processed by the [web app](https://algorithmaudit.eu/technical-tools/bdt/#web-app). 44 | 45 | | feat_1 | feat_2 | ... | feat_n | performance metric | 46 | |--------|--------|-----|--------|------------| 47 | | 10 | 1 | ... | 0.1 | 1 | 48 | | 20 | 2 | ... | 0.2 | 1 | 49 | | 30 | 3 | ... | 0.3 | 0 | 50 | 51 | 52 | *Table 1 – Structure of input data in the bias detection tool* 53 | 54 | Feature values can be numeric or categorical. The numeric performance metric is context-dependent. It can, for instance, represent being 'selected for examination' (yes or no), being 'assigned to a high-risk category' (yes or no), or being a false positive (yes or no). Low scores are treated as negative bias: if being selected for examination is considered harmful, 'selected for examination=Yes' should be coded as 0 and 'selected for examination=No' as 1.
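As an illustration of this input format, the sketch below assembles a dataframe in the shape of Table 1 and writes it to a .csv file that could be uploaded to the web app. It assumes pandas is installed; the column names, values, and file name are purely illustrative.

```python
import pandas as pd

# Hypothetical input in the shape of Table 1: feature columns plus a
# "performance metric" column. Feature values may be numeric or categorical.
input_df = pd.DataFrame(
    {
        "feat_1": [10, 20, 30],
        "feat_2": [1, 2, 3],
        "feat_n": [0.1, 0.2, 0.3],
        # 0 = negative (potentially harmful) outcome, 1 = favourable outcome
        "performance metric": [1, 1, 0],
    }
)

# Save as .csv (max. 1 GB) and upload the file to the web app.
input_df.to_csv("bias_detection_input.csv", index=False)
```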
55 | 56 | ## Example – Hierarchical Bias-Aware Clustering 57 | 58 | Note: the feature values used in this example can easily be exchanged for other numeric targets. This flexibility enables adaptation to detect (higher-dimensional) bias in various AI classifiers. 59 | 60 | ```python 61 | from unsupervised_bias_detection.cluster import BiasAwareHierarchicalKMeans 62 | 63 | X = [[35, 55000, 1], # age, income, number of cars 64 | [40, 45000, 0], 65 | [20, 30000, 0]] 66 | y = [1, 0, 0] # flagged for fraud examination (yes:0, no:1) 67 | hbac = BiasAwareHierarchicalKMeans(n_iter=1, min_cluster_size=1).fit(X, y) 68 | hbac.n_clusters_ 69 | >>> 2 70 | hbac.scores_ 71 | >>> array([ 0.5, -0.5]) 72 | ``` 73 | 74 | ## Schematic overview 75 | ![image](./images/Quantitative-qualitative.png) 76 | 77 | ## Contributing Members 78 | - [Floris Holstege](https://github.com/fholstege) 79 | - [Joel Persson](https://github.com/jopersson) 80 | - [Jurriaan Parie](https://github.com/jfparie) 81 | - [Kirtan Padh](https://github.com/kirtanp) 82 | - [Krsto Proroković](https://github.com/krstopro) 83 | - [Mackenzie Jorgensen](https://github.com/mjorgen1) 84 | 85 | ### 20+ endorsements from various parts of the AI auditing community 86 | #### Journalism 87 | - Gabriel Geiger, Investigative Reporter Algorithms and Automated Decision-Making at Lighthouse Reports 88 | 89 | #### Civil society organisations 90 | - [Maldita](https://maldita.es/maldita-es-journalism-to-not-be-fooled/), an independent journalistic platform focused on the control of disinformation and public discourse through fact-checking and data journalism techniques 91 | - [Demos](https://demos.co.uk/), Britain's leading cross-party think-tank 92 | - [AI Forensics](https://www.aiforensics.org), a European non-profit that investigates influential and opaque algorithms 93 | - [NLAIC](https://nlaic.com), The Netherlands AI Coalition 94 | - [Progressive Café](https://progressiefcafe.nl), public platform of young Dutch intellectuals, represented by Kiza Magendane 95 | - [Dutch AI Ethics Community](https://www.linkedin.com/company/daiec/), represented by Samaa Mohammad 96 | - Simone Maria Parazzoli, OECD Observatory of Public Sector Innovation (OPSI) 97 | 98 | #### Industry 99 | - Selma Muhammad, Trustworthy AI consultant at Deloitte 100 | - Laurens van der Maas, Data Scientist at AWS 101 | - Xiaoming op de Hoek, Data Scientist at Rabobank 102 | - Jan Overgoor, Data Scientist at SPAN 103 | - Dasha Simons, Trustworthy AI consultant at IBM 104 | 105 | #### Academia 106 | - Anne Meuwese, Professor in Public Law & AI at Leiden University 107 | - Hinda Haned, Professor in Responsible Data Science at University of Amsterdam 108 | - Raphaële Xenidis, Associate Professor in EU law at Sciences Po Paris 109 | - Marlies van Eck, Assistant Professor in Administrative Law & AI at Radboud University 110 | - Aileen Nielsen, Fellow Law&Tech at ETH Zürich 111 | - Vahid Niamadpour, PhD-candidate in Linguistics at Leiden University 112 | - Ola Al Khatib, PhD-candidate in the legal regulation of algorithmic decision-making at Utrecht University 113 | 114 | ## Help and Support 115 | 116 | This project is still in its early stages, and
the documentation is a work in progress. In the meantime, feel free to open an [issue](https://github.com/NGO-Algorithm-Audit/unsupervised-bias-detection/issues), and we'll do our best to assist you. 117 | 118 | ## Contributing 119 | 120 | Your contributions are highly encouraged! There are many opportunities for potential projects, so please reach out if you'd like to get involved. Whether it's code, notebooks, examples, or documentation, every contribution is valuable—so don’t hesitate to jump in. To contribute, simply fork the project, make your changes, and submit a pull request. We’ll work with you to address any issues and get your code merged into the main branch. -------------------------------------------------------------------------------- /classifiers/Loan_approval_classifier/german_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from aif360.datasets import StandardDataset 4 | 5 | 6 | default_mappings = { 7 | "label_maps": [{0: "Good Credit", 1: "Bad Credit"}], 8 | "protected_attribute_maps": [ 9 | {1.0: "Male", 0.0: "Female"}, 10 | {1.0: "Old", 0.0: "Young"}, 11 | ], 12 | } 13 | 14 | 15 | def default_preprocessing(df): 16 | """Adds a derived sex attribute based on personal_status.""" 17 | # TODO: ignores the value of privileged_classes for 'sex' 18 | status_map = { 19 | "A91": "male", 20 | "A93": "male", 21 | "A94": "male", 22 | "A92": "female", 23 | "A95": "female", 24 | } 25 | df["sex"] = df["personal_status"].replace(status_map) 26 | 27 | return df 28 | 29 | 30 | class GermanDataset(StandardDataset): 31 | """German credit Dataset. 32 | 33 | See :file:`aif360/data/raw/german/README.md`. 34 | """ 35 | 36 | def __init__( 37 | self, 38 | label_name="credit", 39 | favorable_classes=[0], 40 | protected_attribute_names=[], 41 | privileged_classes=[], 42 | instance_weights_name=None, 43 | categorical_features=[ 44 | "status", 45 | "credit_history", 46 | "purpose", 47 | "savings", 48 | "employment", 49 | "other_debtors", 50 | "property", 51 | "installment_plans", 52 | "housing", 53 | "skill_level", 54 | "telephone", 55 | "foreign_worker", 56 | ], 57 | features_to_keep=[], 58 | features_to_drop=["personal_status"], 59 | na_values=[], 60 | custom_preprocessing=default_preprocessing, 61 | metadata=default_mappings, 62 | ): 63 | """See :obj:`StandardDataset` for a description of the arguments. 64 | 65 | By default, this code converts the 'age' attribute to a binary value 66 | where privileged is `age > 25` and unprivileged is `age <= 25` as 67 | proposed by Kamiran and Calders [1]_. 68 | 69 | References: 70 | .. [1] F. Kamiran and T. Calders, "Classifying without 71 | discriminating," 2nd International Conference on Computer, 72 | Control and Communication, 2009. 73 | 74 | Examples: 75 | In some cases, it may be useful to keep track of a mapping from 76 | `float -> str` for protected attributes and/or labels. If our use 77 | case differs from the default, we can modify the mapping stored in 78 | `metadata`: 79 | 80 | >>> label_map = {1.0: 'Good Credit', 0.0: 'Bad Credit'} 81 | >>> protected_attribute_maps = [{1.0: 'Male', 0.0: 'Female'}] 82 | >>> gd = GermanDataset(protected_attribute_names=['sex'], 83 | ... privileged_classes=[['male']], metadata={'label_map': label_map, 84 | ... 'protected_attribute_maps': protected_attribute_maps}) 85 | 86 | Now this information will stay attached to the dataset and can be 87 | used for more descriptive visualizations. 
88 | """ 89 | 90 | # change path 91 | filepath = "../../data/GermanCredit_dataset/german.data" 92 | 93 | # as given by german.doc 94 | column_names = [ 95 | "status", 96 | "month", 97 | "credit_history", 98 | "purpose", 99 | "credit_amount", 100 | "savings", 101 | "employment", 102 | "investment_as_income_percentage", 103 | "personal_status", 104 | "other_debtors", 105 | "residence_since", 106 | "property", 107 | "age", 108 | "installment_plans", 109 | "housing", 110 | "number_of_credits", 111 | "skill_level", 112 | "people_liable_for", 113 | "telephone", 114 | "foreign_worker", 115 | "credit", 116 | ] 117 | try: 118 | df = pd.read_csv( 119 | filepath, sep=" ", header=None, names=column_names, na_values=na_values 120 | ) 121 | except IOError as err: 122 | print("IOError: {}".format(err)) 123 | print("To use this class, please download the following files:") 124 | print( 125 | "\n\thttps://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data" 126 | ) 127 | print( 128 | "\thttps://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.doc" 129 | ) 130 | print("\nand place them, as-is, in the folder:") 131 | print( 132 | "\n\t{}\n".format( 133 | os.path.abspath( 134 | os.path.join( 135 | os.path.abspath(__file__), 136 | "..", 137 | "..", 138 | "data", 139 | "raw", 140 | "german", 141 | ) 142 | ) 143 | ) 144 | ) 145 | import sys 146 | 147 | sys.exit(1) 148 | 149 | super(GermanDataset, self).__init__( 150 | df=df, 151 | label_name=label_name, 152 | favorable_classes=favorable_classes, 153 | protected_attribute_names=protected_attribute_names, 154 | privileged_classes=privileged_classes, 155 | instance_weights_name=instance_weights_name, 156 | categorical_features=categorical_features, 157 | features_to_keep=features_to_keep, 158 | features_to_drop=features_to_drop, 159 | na_values=na_values, 160 | custom_preprocessing=custom_preprocessing, 161 | metadata=metadata, 162 | ) 163 | -------------------------------------------------------------------------------- /classifiers/Loan_approval_classifier/helper_functions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from aif360.datasets import StandardDataset 4 | 5 | 6 | def default_preprocessing(df): 7 | # default: 1, no default: 0 8 | df["credit"] = df["credit"].replace({1.0: 0, 2.0: 1}) 9 | 10 | # sex 11 | # male: 0, female: 1 12 | status_map = {"A91": 0, "A93": 0, "A94": 0, "A92": 1, "A95": 1} 13 | df["sex"] = df["personal_status"].replace(status_map) 14 | 15 | return df 16 | 17 | 18 | class GermanDataset(StandardDataset): 19 | """German credit Dataset. 20 | See :file:`aif360/data/raw/german/README.md`. 21 | """ 22 | 23 | def __init__( 24 | self, 25 | label_name="credit", 26 | favorable_classes=[1], 27 | protected_attribute_names=["sex", "age"], 28 | privileged_classes=[], 29 | instance_weights_name=None, 30 | categorical_features=[ 31 | "status", 32 | "credit_history", 33 | "purpose", 34 | "savings", 35 | "employment", 36 | "other_debtors", 37 | "property", 38 | "installment_plans", 39 | "housing", 40 | "skill_level", 41 | "telephone", 42 | "foreign_worker", 43 | ], 44 | features_to_keep=[], 45 | features_to_drop=["personal_status"], 46 | na_values=[], 47 | custom_preprocessing=default_preprocessing, 48 | metadata=None, 49 | ): 50 | """See :obj:`StandardDataset` for a description of the arguments. 
51 | By default, this code converts the 'age' attribute to a binary value 52 | where privileged is `age > 25` and unprivileged is `age <= 25` as 53 | proposed by Kamiran and Calders [1]_. 54 | References: 55 | .. [1] F. Kamiran and T. Calders, "Classifying without 56 | discriminating," 2nd International Conference on Computer, 57 | Control and Communication, 2009. 58 | Examples: 59 | In some cases, it may be useful to keep track of a mapping from 60 | `float -> str` for protected attributes and/or labels. If our use 61 | case differs from the default, we can modify the mapping stored in 62 | `metadata`: 63 | >>> label_map = {1.0: 'Good Credit', 0.0: 'Bad Credit'} 64 | >>> protected_attribute_maps = [{1.0: 'Male', 0.0: 'Female'}] 65 | >>> gd = GermanDataset(protected_attribute_names=['sex'], 66 | ... privileged_classes=[['male']], metadata={'label_map': label_map, 67 | ... 'protected_attribute_maps': protected_attribute_maps}) 68 | Now this information will stay attached to the dataset and can be 69 | used for more descriptive visualizations. 70 | """ 71 | 72 | filepath = os.path.join( 73 | os.path.dirname(os.path.abspath(__file__)), 74 | "..", 75 | "..", 76 | "data", 77 | "GermanCredit_dataset", 78 | "german.data", 79 | ) 80 | # as given by german.doc 81 | column_names = [ 82 | "status", 83 | "month", 84 | "credit_history", 85 | "purpose", 86 | "credit_amount", 87 | "savings", 88 | "employment", 89 | "investment_as_income_percentage", 90 | "personal_status", 91 | "other_debtors", 92 | "residence_since", 93 | "property", 94 | "age", 95 | "installment_plans", 96 | "housing", 97 | "number_of_credits", 98 | "skill_level", 99 | "people_liable_for", 100 | "telephone", 101 | "foreign_worker", 102 | "credit", 103 | ] 104 | try: 105 | df = pd.read_csv( 106 | filepath, sep=" ", header=None, names=column_names, na_values=na_values 107 | ) 108 | except IOError as err: 109 | print("IOError: {}".format(err)) 110 | print("To use this class, please download the following files:") 111 | print( 112 | "\n\thttps://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data" 113 | ) 114 | print( 115 | "\thttps://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.doc" 116 | ) 117 | print("\nand place them, as-is, in the folder:") 118 | print( 119 | "\n\t{}\n".format( 120 | os.path.abspath( 121 | os.path.join( 122 | os.path.abspath(__file__), 123 | "..", 124 | "..", 125 | "data", 126 | "raw", 127 | "german", 128 | ) 129 | ) 130 | ) 131 | ) 132 | import sys 133 | 134 | sys.exit(1) 135 | 136 | super(GermanDataset, self).__init__( 137 | df=df, 138 | label_name=label_name, 139 | favorable_classes=favorable_classes, 140 | protected_attribute_names=protected_attribute_names, 141 | privileged_classes=privileged_classes, 142 | instance_weights_name=instance_weights_name, 143 | categorical_features=categorical_features, 144 | features_to_keep=features_to_keep, 145 | features_to_drop=features_to_drop, 146 | na_values=na_values, 147 | custom_preprocessing=custom_preprocessing, 148 | metadata=metadata, 149 | ) 150 | -------------------------------------------------------------------------------- /data/GermanCredit_dataset/german_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from aif360.datasets import StandardDataset 4 | 5 | 6 | default_mappings = { 7 | "label_maps": [{0: "Good Credit", 1: "Bad Credit"}], 8 | "protected_attribute_maps": [ 9 | {1.0: "Male", 0.0: "Female"}, 10 | {1.0: "Old", 0.0: 
"Young"}, 11 | ], 12 | } 13 | 14 | 15 | def default_preprocessing(df): 16 | """Adds a derived sex attribute based on personal_status.""" 17 | # TODO: ignores the value of privileged_classes for 'sex' 18 | status_map = { 19 | "A91": "male", 20 | "A93": "male", 21 | "A94": "male", 22 | "A92": "female", 23 | "A95": "female", 24 | } 25 | df["sex"] = df["personal_status"].replace(status_map) 26 | 27 | return df 28 | 29 | 30 | class GermanDataset(StandardDataset): 31 | """German credit Dataset. 32 | 33 | See :file:`aif360/data/raw/german/README.md`. 34 | """ 35 | 36 | def __init__( 37 | self, 38 | label_name="credit", 39 | favorable_classes=[0], 40 | protected_attribute_names=[], 41 | privileged_classes=[], 42 | instance_weights_name=None, 43 | categorical_features=[ 44 | "status", 45 | "credit_history", 46 | "purpose", 47 | "savings", 48 | "employment", 49 | "other_debtors", 50 | "property", 51 | "installment_plans", 52 | "housing", 53 | "skill_level", 54 | "telephone", 55 | "foreign_worker", 56 | ], 57 | features_to_keep=[], 58 | features_to_drop=["personal_status"], 59 | na_values=[], 60 | custom_preprocessing=default_preprocessing, 61 | metadata=default_mappings, 62 | ): 63 | """See :obj:`StandardDataset` for a description of the arguments. 64 | 65 | By default, this code converts the 'age' attribute to a binary value 66 | where privileged is `age > 25` and unprivileged is `age <= 25` as 67 | proposed by Kamiran and Calders [1]_. 68 | 69 | References: 70 | .. [1] F. Kamiran and T. Calders, "Classifying without 71 | discriminating," 2nd International Conference on Computer, 72 | Control and Communication, 2009. 73 | 74 | Examples: 75 | In some cases, it may be useful to keep track of a mapping from 76 | `float -> str` for protected attributes and/or labels. If our use 77 | case differs from the default, we can modify the mapping stored in 78 | `metadata`: 79 | 80 | >>> label_map = {1.0: 'Good Credit', 0.0: 'Bad Credit'} 81 | >>> protected_attribute_maps = [{1.0: 'Male', 0.0: 'Female'}] 82 | >>> gd = GermanDataset(protected_attribute_names=['sex'], 83 | ... privileged_classes=[['male']], metadata={'label_map': label_map, 84 | ... 'protected_attribute_maps': protected_attribute_maps}) 85 | 86 | Now this information will stay attached to the dataset and can be 87 | used for more descriptive visualizations. 
88 | """ 89 | 90 | # change path 91 | filepath = "./german.data" 92 | 93 | # as given by german.doc 94 | column_names = [ 95 | "status", 96 | "month", 97 | "credit_history", 98 | "purpose", 99 | "credit_amount", 100 | "savings", 101 | "employment", 102 | "investment_as_income_percentage", 103 | "personal_status", 104 | "other_debtors", 105 | "residence_since", 106 | "property", 107 | "age", 108 | "installment_plans", 109 | "housing", 110 | "number_of_credits", 111 | "skill_level", 112 | "people_liable_for", 113 | "telephone", 114 | "foreign_worker", 115 | "credit", 116 | ] 117 | try: 118 | df = pd.read_csv( 119 | filepath, sep=" ", header=None, names=column_names, na_values=na_values 120 | ) 121 | except IOError as err: 122 | print("IOError: {}".format(err)) 123 | print("To use this class, please download the following files:") 124 | print( 125 | "\n\thttps://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data" 126 | ) 127 | print( 128 | "\thttps://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.doc" 129 | ) 130 | print("\nand place them, as-is, in the folder:") 131 | print( 132 | "\n\t{}\n".format( 133 | os.path.abspath( 134 | os.path.join( 135 | os.path.abspath(__file__), 136 | "..", 137 | "..", 138 | "data", 139 | "raw", 140 | "german", 141 | ) 142 | ) 143 | ) 144 | ) 145 | import sys 146 | 147 | sys.exit(1) 148 | 149 | super(GermanDataset, self).__init__( 150 | df=df, 151 | label_name=label_name, 152 | favorable_classes=favorable_classes, 153 | protected_attribute_names=protected_attribute_names, 154 | privileged_classes=privileged_classes, 155 | instance_weights_name=instance_weights_name, 156 | categorical_features=categorical_features, 157 | features_to_keep=features_to_keep, 158 | features_to_drop=features_to_drop, 159 | na_values=na_values, 160 | custom_preprocessing=custom_preprocessing, 161 | metadata=metadata, 162 | ) 163 | -------------------------------------------------------------------------------- /images/Header_Github.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NGO-Algorithm-Audit/unsupervised-bias-detection/486d02f2f28a40a31db3f2bbee12dea07fc70eb1/images/Header_Github.png -------------------------------------------------------------------------------- /images/Quantitative-qualitative.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NGO-Algorithm-Audit/unsupervised-bias-detection/486d02f2f28a40a31db3f2bbee12dea07fc70eb1/images/Quantitative-qualitative.png -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 2 | 3 | [[package]] 4 | name = "colorama" 5 | version = "0.4.6" 6 | description = "Cross-platform colored terminal text." 
7 | optional = false 8 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" 9 | files = [ 10 | {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, 11 | {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, 12 | ] 13 | 14 | [[package]] 15 | name = "fairlearn" 16 | version = "0.10.0" 17 | description = "A Python package to assess and improve fairness of machine learning models." 18 | optional = false 19 | python-versions = ">=3.8" 20 | files = [ 21 | {file = "fairlearn-0.10.0-py3-none-any.whl", hash = "sha256:772224097f8c073168bde44e659d7a2107f96d608063a738df9c985e17dab30f"}, 22 | {file = "fairlearn-0.10.0.tar.gz", hash = "sha256:70e7aefaf9cb16e00462624d58b0517397970dc40d4cbc71e8d40f7c69800f9d"}, 23 | ] 24 | 25 | [package.dependencies] 26 | numpy = ">=1.24.4" 27 | pandas = ">=2.0.3" 28 | scikit-learn = ">=1.2.1" 29 | scipy = ">=1.9.3" 30 | 31 | [[package]] 32 | name = "iniconfig" 33 | version = "2.0.0" 34 | description = "brain-dead simple config-ini parsing" 35 | optional = false 36 | python-versions = ">=3.7" 37 | files = [ 38 | {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, 39 | {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, 40 | ] 41 | 42 | [[package]] 43 | name = "joblib" 44 | version = "1.4.2" 45 | description = "Lightweight pipelining with Python functions" 46 | optional = false 47 | python-versions = ">=3.8" 48 | files = [ 49 | {file = "joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6"}, 50 | {file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"}, 51 | ] 52 | 53 | [[package]] 54 | name = "kmodes" 55 | version = "0.12.2" 56 | description = "Python implementations of the k-modes and k-prototypes clustering algorithms for clustering categorical data." 
57 | optional = false 58 | python-versions = "*" 59 | files = [ 60 | {file = "kmodes-0.12.2-py2.py3-none-any.whl", hash = "sha256:b764f7166dd5fe63826135ed74df796693dc7c25fc2cb8a106e14f3bfb371004"}, 61 | {file = "kmodes-0.12.2.tar.gz", hash = "sha256:d840ac9f4616a668ebacba24a12ec1def87da24a9fd0a0dc2f7499a9b9a6f45b"}, 62 | ] 63 | 64 | [package.dependencies] 65 | joblib = ">=0.11" 66 | numpy = ">=1.10.4" 67 | scikit-learn = ">=0.22.0" 68 | scipy = ">=0.13.3" 69 | 70 | [package.extras] 71 | dev = ["pandas", "pytest", "pytest-cov"] 72 | 73 | [[package]] 74 | name = "numpy" 75 | version = "1.26.4" 76 | description = "Fundamental package for array computing in Python" 77 | optional = false 78 | python-versions = ">=3.9" 79 | files = [ 80 | {file = "numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0"}, 81 | {file = "numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a"}, 82 | {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4"}, 83 | {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f"}, 84 | {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a"}, 85 | {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2"}, 86 | {file = "numpy-1.26.4-cp310-cp310-win32.whl", hash = "sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07"}, 87 | {file = "numpy-1.26.4-cp310-cp310-win_amd64.whl", hash = "sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5"}, 88 | {file = "numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71"}, 89 | {file = "numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef"}, 90 | {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e"}, 91 | {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5"}, 92 | {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a"}, 93 | {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a"}, 94 | {file = "numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20"}, 95 | {file = "numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2"}, 96 | {file = "numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218"}, 97 | {file = "numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b"}, 98 | {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b"}, 99 | {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed"}, 100 | {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a"}, 101 | {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0"}, 102 | {file = "numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110"}, 103 | {file = "numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818"}, 104 | {file = "numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c"}, 105 | {file = "numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be"}, 106 | {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764"}, 107 | {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3"}, 108 | {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd"}, 109 | {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c"}, 110 | {file = "numpy-1.26.4-cp39-cp39-win32.whl", hash = "sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6"}, 111 | {file = "numpy-1.26.4-cp39-cp39-win_amd64.whl", hash = "sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea"}, 112 | {file = "numpy-1.26.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30"}, 113 | {file = "numpy-1.26.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c"}, 114 | {file = "numpy-1.26.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0"}, 115 | {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"}, 116 | ] 117 | 118 | [[package]] 119 | name = "packaging" 120 | version = "24.2" 121 | description = "Core utilities for Python packages" 122 | optional = false 123 | python-versions = ">=3.8" 124 | files = [ 125 | {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, 126 | {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, 127 | ] 128 | 129 | [[package]] 130 | name = "pandas" 131 | version = "2.2.3" 132 | description = "Powerful data structures for data analysis, time series, and statistics" 133 | optional = false 134 | python-versions = ">=3.9" 135 | files = [ 136 | {file = "pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5"}, 137 | {file = "pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348"}, 138 | {file = "pandas-2.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed"}, 139 | {file = "pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57"}, 140 | {file = "pandas-2.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42"}, 141 | {file = "pandas-2.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f"}, 142 | {file = "pandas-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645"}, 143 | {file = "pandas-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039"}, 144 | {file = "pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd"}, 145 | {file = "pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698"}, 146 | {file = "pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc"}, 147 | {file = "pandas-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3"}, 148 | {file = "pandas-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32"}, 149 | {file = "pandas-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5"}, 150 | {file = "pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9"}, 151 | {file = "pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4"}, 152 | {file = "pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3"}, 153 | {file = "pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319"}, 154 | {file = "pandas-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8"}, 155 | {file = "pandas-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a"}, 156 | {file = "pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13"}, 157 | {file = "pandas-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015"}, 158 | {file = "pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28"}, 159 | {file = "pandas-2.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0"}, 160 | {file = 
"pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24"}, 161 | {file = "pandas-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659"}, 162 | {file = "pandas-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb"}, 163 | {file = "pandas-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d"}, 164 | {file = "pandas-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468"}, 165 | {file = "pandas-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18"}, 166 | {file = "pandas-2.2.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2"}, 167 | {file = "pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4"}, 168 | {file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d"}, 169 | {file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a"}, 170 | {file = "pandas-2.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc6b93f9b966093cb0fd62ff1a7e4c09e6d546ad7c1de191767baffc57628f39"}, 171 | {file = "pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5dbca4c1acd72e8eeef4753eeca07de9b1db4f398669d5994086f788a5d7cc30"}, 172 | {file = "pandas-2.2.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8cd6d7cc958a3910f934ea8dbdf17b2364827bb4dafc38ce6eef6bb3d65ff09c"}, 173 | {file = "pandas-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99df71520d25fade9db7c1076ac94eb994f4d2673ef2aa2e86ee039b6746d20c"}, 174 | {file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:31d0ced62d4ea3e231a9f228366919a5ea0b07440d9d4dac345376fd8e1477ea"}, 175 | {file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7eee9e7cea6adf3e3d24e304ac6b8300646e2a5d1cd3a3c2abed9101b0846761"}, 176 | {file = "pandas-2.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:4850ba03528b6dd51d6c5d273c46f183f39a9baf3f0143e566b89450965b105e"}, 177 | {file = "pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667"}, 178 | ] 179 | 180 | [package.dependencies] 181 | numpy = [ 182 | {version = ">=1.23.2", markers = "python_version == \"3.11\""}, 183 | {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, 184 | ] 185 | python-dateutil = ">=2.8.2" 186 | pytz = ">=2020.1" 187 | tzdata = ">=2022.7" 188 | 189 | [package.extras] 190 | all = ["PyQt5 (>=5.15.9)", "SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)", "beautifulsoup4 (>=4.11.2)", "bottleneck (>=1.3.6)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=2022.12.0)", "fsspec (>=2022.11.0)", "gcsfs (>=2022.11.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.9.2)", "matplotlib (>=3.6.3)", "numba (>=0.56.4)", "numexpr (>=2.8.4)", "odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "pandas-gbq 
(>=0.19.0)", "psycopg2 (>=2.9.6)", "pyarrow (>=10.0.1)", "pymysql (>=1.0.2)", "pyreadstat (>=1.2.0)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "qtpy (>=2.3.0)", "s3fs (>=2022.11.0)", "scipy (>=1.10.0)", "tables (>=3.8.0)", "tabulate (>=0.9.0)", "xarray (>=2022.12.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)", "zstandard (>=0.19.0)"] 191 | aws = ["s3fs (>=2022.11.0)"] 192 | clipboard = ["PyQt5 (>=5.15.9)", "qtpy (>=2.3.0)"] 193 | compression = ["zstandard (>=0.19.0)"] 194 | computation = ["scipy (>=1.10.0)", "xarray (>=2022.12.0)"] 195 | consortium-standard = ["dataframe-api-compat (>=0.1.7)"] 196 | excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)"] 197 | feather = ["pyarrow (>=10.0.1)"] 198 | fss = ["fsspec (>=2022.11.0)"] 199 | gcp = ["gcsfs (>=2022.11.0)", "pandas-gbq (>=0.19.0)"] 200 | hdf5 = ["tables (>=3.8.0)"] 201 | html = ["beautifulsoup4 (>=4.11.2)", "html5lib (>=1.1)", "lxml (>=4.9.2)"] 202 | mysql = ["SQLAlchemy (>=2.0.0)", "pymysql (>=1.0.2)"] 203 | output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.9.0)"] 204 | parquet = ["pyarrow (>=10.0.1)"] 205 | performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr (>=2.8.4)"] 206 | plot = ["matplotlib (>=3.6.3)"] 207 | postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"] 208 | pyarrow = ["pyarrow (>=10.0.1)"] 209 | spss = ["pyreadstat (>=1.2.0)"] 210 | sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"] 211 | test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] 212 | xml = ["lxml (>=4.9.2)"] 213 | 214 | [[package]] 215 | name = "pluggy" 216 | version = "1.5.0" 217 | description = "plugin and hook calling mechanisms for python" 218 | optional = false 219 | python-versions = ">=3.8" 220 | files = [ 221 | {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, 222 | {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, 223 | ] 224 | 225 | [package.extras] 226 | dev = ["pre-commit", "tox"] 227 | testing = ["pytest", "pytest-benchmark"] 228 | 229 | [[package]] 230 | name = "pytest" 231 | version = "8.3.5" 232 | description = "pytest: simple powerful testing with Python" 233 | optional = false 234 | python-versions = ">=3.8" 235 | files = [ 236 | {file = "pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820"}, 237 | {file = "pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845"}, 238 | ] 239 | 240 | [package.dependencies] 241 | colorama = {version = "*", markers = "sys_platform == \"win32\""} 242 | iniconfig = "*" 243 | packaging = "*" 244 | pluggy = ">=1.5,<2" 245 | 246 | [package.extras] 247 | dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] 248 | 249 | [[package]] 250 | name = "python-dateutil" 251 | version = "2.9.0.post0" 252 | description = "Extensions to the standard Python datetime module" 253 | optional = false 254 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" 255 | files = [ 256 | {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, 257 | {file = 
"python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, 258 | ] 259 | 260 | [package.dependencies] 261 | six = ">=1.5" 262 | 263 | [[package]] 264 | name = "pytz" 265 | version = "2025.1" 266 | description = "World timezone definitions, modern and historical" 267 | optional = false 268 | python-versions = "*" 269 | files = [ 270 | {file = "pytz-2025.1-py2.py3-none-any.whl", hash = "sha256:89dd22dca55b46eac6eda23b2d72721bf1bdfef212645d81513ef5d03038de57"}, 271 | {file = "pytz-2025.1.tar.gz", hash = "sha256:c2db42be2a2518b28e65f9207c4d05e6ff547d1efa4086469ef855e4ab70178e"}, 272 | ] 273 | 274 | [[package]] 275 | name = "ruff" 276 | version = "0.2.2" 277 | description = "An extremely fast Python linter and code formatter, written in Rust." 278 | optional = false 279 | python-versions = ">=3.7" 280 | files = [ 281 | {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:0a9efb032855ffb3c21f6405751d5e147b0c6b631e3ca3f6b20f917572b97eb6"}, 282 | {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:d450b7fbff85913f866a5384d8912710936e2b96da74541c82c1b458472ddb39"}, 283 | {file = "ruff-0.2.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecd46e3106850a5c26aee114e562c329f9a1fbe9e4821b008c4404f64ff9ce73"}, 284 | {file = "ruff-0.2.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e22676a5b875bd72acd3d11d5fa9075d3a5f53b877fe7b4793e4673499318ba"}, 285 | {file = "ruff-0.2.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1695700d1e25a99d28f7a1636d85bafcc5030bba9d0578c0781ba1790dbcf51c"}, 286 | {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:b0c232af3d0bd8f521806223723456ffebf8e323bd1e4e82b0befb20ba18388e"}, 287 | {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f63d96494eeec2fc70d909393bcd76c69f35334cdbd9e20d089fb3f0640216ca"}, 288 | {file = "ruff-0.2.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a61ea0ff048e06de273b2e45bd72629f470f5da8f71daf09fe481278b175001"}, 289 | {file = "ruff-0.2.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e1439c8f407e4f356470e54cdecdca1bd5439a0673792dbe34a2b0a551a2fe3"}, 290 | {file = "ruff-0.2.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:940de32dc8853eba0f67f7198b3e79bc6ba95c2edbfdfac2144c8235114d6726"}, 291 | {file = "ruff-0.2.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0c126da55c38dd917621552ab430213bdb3273bb10ddb67bc4b761989210eb6e"}, 292 | {file = "ruff-0.2.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:3b65494f7e4bed2e74110dac1f0d17dc8e1f42faaa784e7c58a98e335ec83d7e"}, 293 | {file = "ruff-0.2.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1ec49be4fe6ddac0503833f3ed8930528e26d1e60ad35c2446da372d16651ce9"}, 294 | {file = "ruff-0.2.2-py3-none-win32.whl", hash = "sha256:d920499b576f6c68295bc04e7b17b6544d9d05f196bb3aac4358792ef6f34325"}, 295 | {file = "ruff-0.2.2-py3-none-win_amd64.whl", hash = "sha256:cc9a91ae137d687f43a44c900e5d95e9617cb37d4c989e462980ba27039d239d"}, 296 | {file = "ruff-0.2.2-py3-none-win_arm64.whl", hash = "sha256:c9d15fc41e6054bfc7200478720570078f0b41c9ae4f010bcc16bd6f4d1aacdd"}, 297 | {file = "ruff-0.2.2.tar.gz", hash = "sha256:e62ed7f36b3068a30ba39193a14274cd706bc486fad521276458022f7bccb31d"}, 298 | ] 299 | 300 | [[package]] 301 | name = "scikit-learn" 302 | 
version = "1.6.1" 303 | description = "A set of python modules for machine learning and data mining" 304 | optional = false 305 | python-versions = ">=3.9" 306 | files = [ 307 | {file = "scikit_learn-1.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d056391530ccd1e501056160e3c9673b4da4805eb67eb2bdf4e983e1f9c9204e"}, 308 | {file = "scikit_learn-1.6.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:0c8d036eb937dbb568c6242fa598d551d88fb4399c0344d95c001980ec1c7d36"}, 309 | {file = "scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8634c4bd21a2a813e0a7e3900464e6d593162a29dd35d25bdf0103b3fce60ed5"}, 310 | {file = "scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:775da975a471c4f6f467725dff0ced5c7ac7bda5e9316b260225b48475279a1b"}, 311 | {file = "scikit_learn-1.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:8a600c31592bd7dab31e1c61b9bbd6dea1b3433e67d264d17ce1017dbdce8002"}, 312 | {file = "scikit_learn-1.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:72abc587c75234935e97d09aa4913a82f7b03ee0b74111dcc2881cba3c5a7b33"}, 313 | {file = "scikit_learn-1.6.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:b3b00cdc8f1317b5f33191df1386c0befd16625f49d979fe77a8d44cae82410d"}, 314 | {file = "scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dc4765af3386811c3ca21638f63b9cf5ecf66261cc4815c1db3f1e7dc7b79db2"}, 315 | {file = "scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:25fc636bdaf1cc2f4a124a116312d837148b5e10872147bdaf4887926b8c03d8"}, 316 | {file = "scikit_learn-1.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:fa909b1a36e000a03c382aade0bd2063fd5680ff8b8e501660c0f59f021a6415"}, 317 | {file = "scikit_learn-1.6.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:926f207c804104677af4857b2c609940b743d04c4c35ce0ddc8ff4f053cddc1b"}, 318 | {file = "scikit_learn-1.6.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:2c2cae262064e6a9b77eee1c8e768fc46aa0b8338c6a8297b9b6759720ec0ff2"}, 319 | {file = "scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1061b7c028a8663fb9a1a1baf9317b64a257fcb036dae5c8752b2abef31d136f"}, 320 | {file = "scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e69fab4ebfc9c9b580a7a80111b43d214ab06250f8a7ef590a4edf72464dd86"}, 321 | {file = "scikit_learn-1.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:70b1d7e85b1c96383f872a519b3375f92f14731e279a7b4c6cfd650cf5dffc52"}, 322 | {file = "scikit_learn-1.6.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2ffa1e9e25b3d93990e74a4be2c2fc61ee5af85811562f1288d5d055880c4322"}, 323 | {file = "scikit_learn-1.6.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:dc5cf3d68c5a20ad6d571584c0750ec641cc46aeef1c1507be51300e6003a7e1"}, 324 | {file = "scikit_learn-1.6.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c06beb2e839ecc641366000ca84f3cf6fa9faa1777e29cf0c04be6e4d096a348"}, 325 | {file = "scikit_learn-1.6.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8ca8cb270fee8f1f76fa9bfd5c3507d60c6438bbee5687f81042e2bb98e5a97"}, 326 | {file = "scikit_learn-1.6.1-cp313-cp313-win_amd64.whl", hash = "sha256:7a1c43c8ec9fde528d664d947dc4c0789be4077a3647f232869f41d9bf50e0fb"}, 327 | {file = "scikit_learn-1.6.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = 
"sha256:a17c1dea1d56dcda2fac315712f3651a1fea86565b64b48fa1bc090249cbf236"}, 328 | {file = "scikit_learn-1.6.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:6a7aa5f9908f0f28f4edaa6963c0a6183f1911e63a69aa03782f0d924c830a35"}, 329 | {file = "scikit_learn-1.6.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0650e730afb87402baa88afbf31c07b84c98272622aaba002559b614600ca691"}, 330 | {file = "scikit_learn-1.6.1-cp313-cp313t-win_amd64.whl", hash = "sha256:3f59fe08dc03ea158605170eb52b22a105f238a5d512c4470ddeca71feae8e5f"}, 331 | {file = "scikit_learn-1.6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6849dd3234e87f55dce1db34c89a810b489ead832aaf4d4550b7ea85628be6c1"}, 332 | {file = "scikit_learn-1.6.1-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:e7be3fa5d2eb9be7d77c3734ff1d599151bb523674be9b834e8da6abe132f44e"}, 333 | {file = "scikit_learn-1.6.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:44a17798172df1d3c1065e8fcf9019183f06c87609b49a124ebdf57ae6cb0107"}, 334 | {file = "scikit_learn-1.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8b7a3b86e411e4bce21186e1c180d792f3d99223dcfa3b4f597ecc92fa1a422"}, 335 | {file = "scikit_learn-1.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:7a73d457070e3318e32bdb3aa79a8d990474f19035464dfd8bede2883ab5dc3b"}, 336 | {file = "scikit_learn-1.6.1.tar.gz", hash = "sha256:b4fc2525eca2c69a59260f583c56a7557c6ccdf8deafdba6e060f94c1c59738e"}, 337 | ] 338 | 339 | [package.dependencies] 340 | joblib = ">=1.2.0" 341 | numpy = ">=1.19.5" 342 | scipy = ">=1.6.0" 343 | threadpoolctl = ">=3.1.0" 344 | 345 | [package.extras] 346 | benchmark = ["matplotlib (>=3.3.4)", "memory_profiler (>=0.57.0)", "pandas (>=1.1.5)"] 347 | build = ["cython (>=3.0.10)", "meson-python (>=0.16.0)", "numpy (>=1.19.5)", "scipy (>=1.6.0)"] 348 | docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.3.4)", "memory_profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "polars (>=0.20.30)", "pooch (>=1.6.0)", "pydata-sphinx-theme (>=0.15.3)", "scikit-image (>=0.17.2)", "seaborn (>=0.9.0)", "sphinx (>=7.3.7)", "sphinx-copybutton (>=0.5.2)", "sphinx-design (>=0.5.0)", "sphinx-design (>=0.6.0)", "sphinx-gallery (>=0.17.1)", "sphinx-prompt (>=1.4.0)", "sphinx-remove-toctrees (>=1.0.0.post1)", "sphinxcontrib-sass (>=0.3.4)", "sphinxext-opengraph (>=0.9.1)", "towncrier (>=24.8.0)"] 349 | examples = ["matplotlib (>=3.3.4)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.17.2)", "seaborn (>=0.9.0)"] 350 | install = ["joblib (>=1.2.0)", "numpy (>=1.19.5)", "scipy (>=1.6.0)", "threadpoolctl (>=3.1.0)"] 351 | maintenance = ["conda-lock (==2.5.6)"] 352 | tests = ["black (>=24.3.0)", "matplotlib (>=3.3.4)", "mypy (>=1.9)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "polars (>=0.20.30)", "pooch (>=1.6.0)", "pyamg (>=4.0.0)", "pyarrow (>=12.0.0)", "pytest (>=7.1.2)", "pytest-cov (>=2.9.0)", "ruff (>=0.5.1)", "scikit-image (>=0.17.2)"] 353 | 354 | [[package]] 355 | name = "scipy" 356 | version = "1.15.2" 357 | description = "Fundamental algorithms for scientific computing in Python" 358 | optional = false 359 | python-versions = ">=3.10" 360 | files = [ 361 | {file = "scipy-1.15.2-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:a2ec871edaa863e8213ea5df811cd600734f6400b4af272e1c011e69401218e9"}, 362 | {file = "scipy-1.15.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:6f223753c6ea76983af380787611ae1291e3ceb23917393079dcc746ba60cfb5"}, 363 | {file = 
"scipy-1.15.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:ecf797d2d798cf7c838c6d98321061eb3e72a74710e6c40540f0e8087e3b499e"}, 364 | {file = "scipy-1.15.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:9b18aa747da280664642997e65aab1dd19d0c3d17068a04b3fe34e2559196cb9"}, 365 | {file = "scipy-1.15.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87994da02e73549dfecaed9e09a4f9d58a045a053865679aeb8d6d43747d4df3"}, 366 | {file = "scipy-1.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69ea6e56d00977f355c0f84eba69877b6df084516c602d93a33812aa04d90a3d"}, 367 | {file = "scipy-1.15.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:888307125ea0c4466287191e5606a2c910963405ce9671448ff9c81c53f85f58"}, 368 | {file = "scipy-1.15.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:9412f5e408b397ff5641080ed1e798623dbe1ec0d78e72c9eca8992976fa65aa"}, 369 | {file = "scipy-1.15.2-cp310-cp310-win_amd64.whl", hash = "sha256:b5e025e903b4f166ea03b109bb241355b9c42c279ea694d8864d033727205e65"}, 370 | {file = "scipy-1.15.2-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:92233b2df6938147be6fa8824b8136f29a18f016ecde986666be5f4d686a91a4"}, 371 | {file = "scipy-1.15.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:62ca1ff3eb513e09ed17a5736929429189adf16d2d740f44e53270cc800ecff1"}, 372 | {file = "scipy-1.15.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:4c6676490ad76d1c2894d77f976144b41bd1a4052107902238047fb6a473e971"}, 373 | {file = "scipy-1.15.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:a8bf5cb4a25046ac61d38f8d3c3426ec11ebc350246a4642f2f315fe95bda655"}, 374 | {file = "scipy-1.15.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a8e34cf4c188b6dd004654f88586d78f95639e48a25dfae9c5e34a6dc34547e"}, 375 | {file = "scipy-1.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28a0d2c2075946346e4408b211240764759e0fabaeb08d871639b5f3b1aca8a0"}, 376 | {file = "scipy-1.15.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:42dabaaa798e987c425ed76062794e93a243be8f0f20fff6e7a89f4d61cb3d40"}, 377 | {file = "scipy-1.15.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6f5e296ec63c5da6ba6fa0343ea73fd51b8b3e1a300b0a8cae3ed4b1122c7462"}, 378 | {file = "scipy-1.15.2-cp311-cp311-win_amd64.whl", hash = "sha256:597a0c7008b21c035831c39927406c6181bcf8f60a73f36219b69d010aa04737"}, 379 | {file = "scipy-1.15.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c4697a10da8f8765bb7c83e24a470da5797e37041edfd77fd95ba3811a47c4fd"}, 380 | {file = "scipy-1.15.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:869269b767d5ee7ea6991ed7e22b3ca1f22de73ab9a49c44bad338b725603301"}, 381 | {file = "scipy-1.15.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:bad78d580270a4d32470563ea86c6590b465cb98f83d760ff5b0990cb5518a93"}, 382 | {file = "scipy-1.15.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:b09ae80010f52efddb15551025f9016c910296cf70adbf03ce2a8704f3a5ad20"}, 383 | {file = "scipy-1.15.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a6fd6eac1ce74a9f77a7fc724080d507c5812d61e72bd5e4c489b042455865e"}, 384 | {file = "scipy-1.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2b871df1fe1a3ba85d90e22742b93584f8d2b8e6124f8372ab15c71b73e428b8"}, 385 | {file = "scipy-1.15.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:03205d57a28e18dfd39f0377d5002725bf1f19a46f444108c29bdb246b6c8a11"}, 386 | {file = 
"scipy-1.15.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:601881dfb761311045b03114c5fe718a12634e5608c3b403737ae463c9885d53"}, 387 | {file = "scipy-1.15.2-cp312-cp312-win_amd64.whl", hash = "sha256:e7c68b6a43259ba0aab737237876e5c2c549a031ddb7abc28c7b47f22e202ded"}, 388 | {file = "scipy-1.15.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:01edfac9f0798ad6b46d9c4c9ca0e0ad23dbf0b1eb70e96adb9fa7f525eff0bf"}, 389 | {file = "scipy-1.15.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:08b57a9336b8e79b305a143c3655cc5bdbe6d5ece3378578888d2afbb51c4e37"}, 390 | {file = "scipy-1.15.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:54c462098484e7466362a9f1672d20888f724911a74c22ae35b61f9c5919183d"}, 391 | {file = "scipy-1.15.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:cf72ff559a53a6a6d77bd8eefd12a17995ffa44ad86c77a5df96f533d4e6c6bb"}, 392 | {file = "scipy-1.15.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9de9d1416b3d9e7df9923ab23cd2fe714244af10b763975bea9e4f2e81cebd27"}, 393 | {file = "scipy-1.15.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb530e4794fc8ea76a4a21ccb67dea33e5e0e60f07fc38a49e821e1eae3b71a0"}, 394 | {file = "scipy-1.15.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5ea7ed46d437fc52350b028b1d44e002646e28f3e8ddc714011aaf87330f2f32"}, 395 | {file = "scipy-1.15.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:11e7ad32cf184b74380f43d3c0a706f49358b904fa7d5345f16ddf993609184d"}, 396 | {file = "scipy-1.15.2-cp313-cp313-win_amd64.whl", hash = "sha256:a5080a79dfb9b78b768cebf3c9dcbc7b665c5875793569f48bf0e2b1d7f68f6f"}, 397 | {file = "scipy-1.15.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:447ce30cee6a9d5d1379087c9e474628dab3db4a67484be1b7dc3196bfb2fac9"}, 398 | {file = "scipy-1.15.2-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:c90ebe8aaa4397eaefa8455a8182b164a6cc1d59ad53f79943f266d99f68687f"}, 399 | {file = "scipy-1.15.2-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:def751dd08243934c884a3221156d63e15234a3155cf25978b0a668409d45eb6"}, 400 | {file = "scipy-1.15.2-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:302093e7dfb120e55515936cb55618ee0b895f8bcaf18ff81eca086c17bd80af"}, 401 | {file = "scipy-1.15.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7cd5b77413e1855351cdde594eca99c1f4a588c2d63711388b6a1f1c01f62274"}, 402 | {file = "scipy-1.15.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d0194c37037707b2afa7a2f2a924cf7bac3dc292d51b6a925e5fcb89bc5c776"}, 403 | {file = "scipy-1.15.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:bae43364d600fdc3ac327db99659dcb79e6e7ecd279a75fe1266669d9a652828"}, 404 | {file = "scipy-1.15.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f031846580d9acccd0044efd1a90e6f4df3a6e12b4b6bd694a7bc03a89892b28"}, 405 | {file = "scipy-1.15.2-cp313-cp313t-win_amd64.whl", hash = "sha256:fe8a9eb875d430d81755472c5ba75e84acc980e4a8f6204d402849234d3017db"}, 406 | {file = "scipy-1.15.2.tar.gz", hash = "sha256:cd58a314d92838f7e6f755c8a2167ead4f27e1fd5c1251fd54289569ef3495ec"}, 407 | ] 408 | 409 | [package.dependencies] 410 | numpy = ">=1.23.5,<2.5" 411 | 412 | [package.extras] 413 | dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy (==1.10.0)", "pycodestyle", "pydevtool", "rich-click", "ruff (>=0.0.292)", "types-psutil", "typing_extensions"] 414 | doc = ["intersphinx_registry", "jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.16.5)", "jupytext", 
"matplotlib (>=3.5)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0,<8.0.0)", "sphinx-copybutton", "sphinx-design (>=0.4.0)"] 415 | test = ["Cython", "array-api-strict (>=2.0,<2.1.1)", "asv", "gmpy2", "hypothesis (>=6.30)", "meson", "mpmath", "ninja", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] 416 | 417 | [[package]] 418 | name = "six" 419 | version = "1.17.0" 420 | description = "Python 2 and 3 compatibility utilities" 421 | optional = false 422 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" 423 | files = [ 424 | {file = "six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274"}, 425 | {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, 426 | ] 427 | 428 | [[package]] 429 | name = "threadpoolctl" 430 | version = "3.5.0" 431 | description = "threadpoolctl" 432 | optional = false 433 | python-versions = ">=3.8" 434 | files = [ 435 | {file = "threadpoolctl-3.5.0-py3-none-any.whl", hash = "sha256:56c1e26c150397e58c4926da8eeee87533b1e32bef131bd4bf6a2f45f3185467"}, 436 | {file = "threadpoolctl-3.5.0.tar.gz", hash = "sha256:082433502dd922bf738de0d8bcc4fdcbf0979ff44c42bd40f5af8a282f6fa107"}, 437 | ] 438 | 439 | [[package]] 440 | name = "tzdata" 441 | version = "2025.1" 442 | description = "Provider of IANA time zone data" 443 | optional = false 444 | python-versions = ">=2" 445 | files = [ 446 | {file = "tzdata-2025.1-py2.py3-none-any.whl", hash = "sha256:7e127113816800496f027041c570f50bcd464a020098a3b6b199517772303639"}, 447 | {file = "tzdata-2025.1.tar.gz", hash = "sha256:24894909e88cdb28bd1636c6887801df64cb485bd593f2fd83ef29075a81d694"}, 448 | ] 449 | 450 | [metadata] 451 | lock-version = "2.0" 452 | python-versions = "^3.11" 453 | content-hash = "8deba4f2f65ebce004129edd41a6ab9792fd26cf058aedb9880a24545cb92659" 454 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "unsupervised-bias-detection" 3 | version = "0.2.6" 4 | description = "package for unsupervised bias detection" 5 | authors = ["NGO Algorithm Audit"] 6 | license = "EUPL-1.2 license" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.11" 11 | numpy = "^1.26.4" 12 | scikit-learn = ">=1.6.1" 13 | kmodes = "^0.12.2" 14 | 15 | [tool.poetry.group.dev.dependencies] 16 | ruff = "^0.2.2" 17 | pytest = "^8.0.2" 18 | pandas = "^2.2.2" 19 | fairlearn = "^0.10.0" 20 | 21 | [tool.ruff.lint] 22 | select = ["D"] 23 | 24 | [tool.ruff.lint.pydocstyle] 25 | convention = "numpy" 26 | 27 | [build-system] 28 | requires = ["poetry-core"] 29 | build-backend = "poetry.core.masonry.api" 30 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NGO-Algorithm-Audit/unsupervised-bias-detection/486d02f2f28a40a31db3f2bbee12dea07fc70eb1/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_bahc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from unsupervised_bias_detection.cluster import BiasAwareHierarchicalKMeans 3 | 4 | 5 | def test_shapes(): 6 | # Checks that labels and scores have 
the right shapes 7 | rng = np.random.RandomState(12) 8 | X = rng.rand(20, 10) 9 | y = rng.rand(20) 10 | bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2) 11 | bahc.fit(X, y) 12 | assert len(bahc.labels_) == len(X) 13 | assert len(bahc.scores_) == bahc.n_clusters_ 14 | 15 | 16 | def test_labels(): 17 | # Checks that label values are between 0 and n_clusters 18 | rng = np.random.RandomState(12) 19 | X = rng.rand(20, 10) 20 | y = rng.rand(20) 21 | bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2) 22 | bahc.fit(X, y) 23 | assert np.array_equal(np.unique(bahc.labels_), np.arange(bahc.n_clusters_)) 24 | 25 | 26 | def test_cluster_sizes(): 27 | # Checks that cluster sizes are at least bahc_min_cluster_size 28 | rng = np.random.RandomState(12) 29 | X = rng.rand(20, 10) 30 | y = rng.rand(20) 31 | bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=5) 32 | bahc.fit(X, y) 33 | assert np.all(np.bincount(bahc.labels_) >= bahc.bahc_min_cluster_size) 34 | 35 | 36 | def test_constant_metric(): 37 | # Checks that there is only one cluster with a score of 0 if the metric is constant 38 | rng = np.random.RandomState(12) 39 | X = rng.rand(20, 10) 40 | y = np.full(20, rng.rand()) 41 | bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2) 42 | bahc.fit(X, y) 43 | assert bahc.n_clusters_ == 1 44 | assert bahc.scores_[0] == 0 45 | 46 | 47 | def test_scores(): 48 | # Checks that scores are computed correctly 49 | rng = np.random.RandomState(12) 50 | X = rng.rand(20, 10) 51 | y = rng.rand(20) 52 | bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2) 53 | bahc.fit(X, y) 54 | # TODO: Check this!!! 55 | for i in range(bahc.n_clusters_): 56 | cluster_indices = np.arange(20)[bahc.labels_ == i] 57 | complement_indices = np.arange(20)[bahc.labels_ != i] 58 | score = np.mean(y[complement_indices]) - np.mean(y[cluster_indices]) 59 | assert bahc.scores_[i] == score 60 | 61 | 62 | def test_scores_are_sorted(): 63 | # Checks that scores are sorted in descending order 64 | rng = np.random.RandomState(12) 65 | X = rng.rand(20, 10) 66 | y = rng.rand(20) 67 | bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2) 68 | bahc.fit(X, y) 69 | assert np.all(bahc.scores_[:-1] >= bahc.scores_[1:]) 70 | 71 | 72 | def test_predict(): 73 | # Checks that predict returns the same labels as fit 74 | rng = np.random.RandomState(12) 75 | X = rng.rand(20, 10) 76 | y = rng.rand(20) 77 | bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2) 78 | bahc.fit(X, y) 79 | assert np.array_equal(bahc.predict(X), bahc.labels_) 80 | -------------------------------------------------------------------------------- /tests/test_dataset.py: -------------------------------------------------------------------------------- 1 | """Provides tests for the loading function in unsupervised_bias_detection/utils/dataset.py.""" 2 | import pytest 3 | 4 | from unsupervised_bias_detection.utils.dataset import load_default_dataset 5 | 6 | 7 | def test_loading_dataset_passes(): 8 | """Checks that the default dataset loading function works as expected.""" 9 | data, true_labels = load_default_dataset() 10 | assert data is not None and true_labels is not None 11 | 12 | 13 | @pytest.mark.xfail 14 | def test_unneeded_argument(): 15 | """Checks that no argument is necessary for the function call.""" 16 | assert load_default_dataset(False) is TypeError 17 | 
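The tests above double as minimal usage documentation for the public API. For reference, an end-to-end sketch along the same lines (synthetic data and an arbitrary per-sample metric; not part of the test suite):

import numpy as np
from unsupervised_bias_detection.cluster import BiasAwareHierarchicalKMeans

# Feature matrix and a per-sample bias metric (e.g. an error indicator per row).
rng = np.random.RandomState(0)
X = rng.rand(100, 5)
y = rng.rand(100)

# Split at most 5 times; never produce clusters smaller than 10 samples.
bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=10)
bahc.fit(X, y)

print(bahc.n_clusters_)   # number of clusters found by the algorithm
print(bahc.scores_)       # discrimination scores, sorted in descending order
print(bahc.labels_[:10])  # cluster label per sample; label 0 has the highest score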
-------------------------------------------------------------------------------- /tests/test_validation.py: -------------------------------------------------------------------------------- 1 | """Provides tests for the functions in unsupervised_bias_detection/utils/validation.py.""" 2 | import pandas as pd 3 | import numpy as np 4 | import pytest 5 | 6 | from unsupervised_bias_detection.utils.validation import run_checks 7 | 8 | 9 | def test_always_passes(): 10 | """Test0: all numerical and good (no errors expected).""" 11 | dict0 = { 12 | "x": [[1, 2, 3], [3, 2, 1], [4, 5, 6]], 13 | "preds": [0, 1, 1], 14 | "true_labels": [0, 0, 1], 15 | } 16 | df_test0 = pd.DataFrame(data=dict0) 17 | assert not run_checks(df_test0) is ValueError 18 | 19 | 20 | @pytest.mark.xfail 21 | def test_not_binary_y(): 22 | """Test1: all numerical BUT predictions and labels are not binary.""" 23 | dict1 = { 24 | "x": [[1, 2, 3], [3, 2, 1], [4, 5, 6]], 25 | "preds": [6, 7, 8], 26 | "true_labels": [11, 0, 2], 27 | } 28 | df_test1 = pd.DataFrame(data=dict1) 29 | assert run_checks(df_test1) is ValueError 30 | 31 | 32 | @pytest.mark.xfail 33 | def test_categorical_preds(): 34 | """Test2: all numerical BUT predictions are categorical.""" 35 | dict2 = { 36 | "x": [[1, 2, 3], [3, 2, 1], [4, 5, 6]], 37 | "preds": ["yellow", "yellow", "blue"], 38 | "true_labels": [0, 1, 1], 39 | } 40 | df_test2 = pd.DataFrame(data=dict2) 41 | assert run_checks(df_test2) is ValueError 42 | 43 | 44 | @pytest.mark.xfail 45 | def test_categorical_true_labels(): 46 | """Test3: all numerical BUT true labels are categorical.""" 47 | dict3 = { 48 | "x": [[1, 2, 3], [3, 2, 1], [4, 5, 6]], 49 | "preds": [0, 1, 0], 50 | "true_labels": ["red", "red", "yellow"], 51 | } 52 | df_test3 = pd.DataFrame(data=dict3) 53 | assert run_checks(df_test3) is ValueError 54 | 55 | 56 | @pytest.mark.xfail 57 | def test_multiclass_preds(): 58 | """Test4: all numerical BUT predictions are multi-class.""" 59 | dict4 = { 60 | "x": [[1, 2, 3], [3, 2, 1], [4, 5, 6]], 61 | "preds": [0, 1, 2], 62 | "true_labels": [0, 1, 1], 63 | } 64 | df_test4 = pd.DataFrame(data=dict4) 65 | assert run_checks(df_test4) is ValueError 66 | 67 | 68 | @pytest.mark.xfail 69 | def test_multiclass_true_labels(): 70 | """Test5: all numerical BUT true labels are multi-class.""" 71 | dict5 = { 72 | "x": [[1, 2, 3], [3, 2, 1], [4, 5, 6]], 73 | "preds": [0, 1, 1], 74 | "true_labels": [0, 1, 2], 75 | } 76 | df_test5 = pd.DataFrame(data=dict5) 77 | assert run_checks(df_test5) is ValueError 78 | 79 | 80 | @pytest.mark.xfail 81 | def test_features_nonnumerical(): 82 | """Test6: x includes categorical values.""" 83 | dict6 = { 84 | "x": [[1, "three", 2], ["blue", 100, 0], [0, 0, 0]], 85 | "preds": [0, 1, 1], 86 | "true_labels": [1, 1, 1], 87 | } 88 | df_test6 = pd.DataFrame(data=dict6) 89 | assert run_checks(df_test6) is ValueError 90 | 91 | 92 | @pytest.mark.xfail 93 | def test_two_missing_columns(): 94 | """Test7: only features present, missing predictions and true labels.""" 95 | dict7 = {"x": [[1, 2, 3], [3, 2, 1], [4, 5, 6]]} 96 | df_test7 = pd.DataFrame(data=dict7) 97 | assert run_checks(df_test7) is IndexError 98 | 99 | 100 | @pytest.mark.xfail 101 | def test_missing_true_labels(): 102 | """Test8: true labels column missing.""" 103 | dict8 = {"x": [[1, "three", 2], ["blue", 100, 0], [0, 0, 0]], "preds": [0, 1, 1]} 104 | df_test8 = pd.DataFrame(data=dict8) 105 | assert run_checks(df_test8) is IndexError 106 | 107 | 108 | @pytest.mark.xfail 109 | def test_missing_features(): 110 | """Test9: features 
missing.""" 111 | dict9 = {"preds": [0, 1, 1], "true_labels": [0, 1, 1]} 112 | df_test9 = pd.DataFrame(data=dict9) 113 | assert run_checks(df_test9) is IndexError 114 | 115 | 116 | @pytest.mark.xfail 117 | def test_not_pandas_type(): 118 | """Test10: the data is not of type pandas.""" 119 | array10 = np.array([[1, 2, 3, 0, 1], [4, 5, 6, 0, 0], [7, 8, 9, 1, 1]]) 120 | assert run_checks(array10) is TypeError 121 | -------------------------------------------------------------------------------- /unsupervised_bias_detection/__init__.py: -------------------------------------------------------------------------------- 1 | """unsupervised-bias-detection.""" -------------------------------------------------------------------------------- /unsupervised_bias_detection/cluster/__init__.py: -------------------------------------------------------------------------------- 1 | """The :mod:`unsupervised_bias_detection.cluster` module implements bias-aware clustering algorithms.""" 2 | 3 | from ._bahc import BiasAwareHierarchicalClustering 4 | from ._kmeans import BiasAwareHierarchicalKMeans 5 | from ._kmodes import BiasAwareHierarchicalKModes 6 | 7 | __all__ = [ 8 | "BiasAwareHierarchicalClustering", 9 | "BiasAwareHierarchicalKMeans", 10 | "BiasAwareHierarchicalKModes", 11 | ] 12 | -------------------------------------------------------------------------------- /unsupervised_bias_detection/cluster/_bahc.py: -------------------------------------------------------------------------------- 1 | from ._cluster_node import ClusterNode 2 | from collections import deque 3 | import heapq 4 | from numbers import Integral 5 | import numpy as np 6 | from sklearn.base import BaseEstimator, ClusterMixin 7 | from sklearn.utils._param_validation import Interval 8 | from sklearn.utils.validation import validate_data 9 | from typing import Any, Type 10 | 11 | 12 | class BiasAwareHierarchicalClustering(BaseEstimator, ClusterMixin): 13 | """ 14 | TODO: Add docstring. 15 | 16 | References 17 | ---------- 18 | .. [1] J. Misztal-Radecka, B. Indurkhya, "Bias-Aware Hierarchical Clustering for detecting the discriminated 19 | groups of users in recommendation systems", Information Processing & Management, vol. 58, no. 3, May. 2021. 20 | """ 21 | 22 | _parameter_constraints: dict = { 23 | "bahc_max_iter": [Interval(Integral, 1, None, closed="left")], 24 | "bahc_min_cluster_size": [Interval(Integral, 1, None, closed="left")], 25 | } 26 | 27 | def __init__( 28 | self, 29 | clustering_cls: Type[ClusterMixin], 30 | bahc_max_iter: int, 31 | bahc_min_cluster_size: int, 32 | margin: float = 1e-5, 33 | **clustering_params: Any, 34 | ): 35 | self.clustering_cls = clustering_cls 36 | self.bahc_max_iter = bahc_max_iter 37 | self.bahc_min_cluster_size = bahc_min_cluster_size 38 | self.margin = margin 39 | self.clustering_params = clustering_params 40 | 41 | def fit(self, X, y): 42 | """Compute bias-aware hierarchical clustering. 43 | 44 | Parameters 45 | ---------- 46 | X : array-like of shape (n_samples, n_features) 47 | List of n_features-dimensional data points. Each row 48 | corresponds to a single data point. 49 | y : array-like of shape (n_samples) 50 | Metric values. 51 | 52 | Returns 53 | ------- 54 | self : object 55 | Fitted estimator. 
56 | """ 57 | X, y = validate_data( 58 | self, 59 | X, 60 | y, 61 | reset=False, 62 | accept_large_sparse=False, 63 | order="C", 64 | ) 65 | n_samples, _ = X.shape 66 | # We start with all samples being in a single cluster with label 0 67 | self.n_clusters_ = 1 68 | labels = np.zeros(n_samples, dtype=np.uint32) 69 | leaves = [] 70 | label = 0 71 | std = np.std(y) 72 | # The entire dataset has a discrimination score of zero 73 | score = 0 74 | root = ClusterNode(label, -std, score) 75 | self.cluster_tree_ = root 76 | heap = [root] 77 | for _ in range(self.bahc_max_iter): 78 | if not heap: 79 | # If the heap is empty we stop iterating 80 | break 81 | # Take the cluster with the highest standard deviation of metric y 82 | node = heapq.heappop(heap) 83 | label = node.label 84 | score = node.score 85 | cluster_indices = np.nonzero(labels == label)[0] 86 | X_cluster = X[cluster_indices] 87 | 88 | clustering_model = self.clustering_cls(**self.clustering_params) 89 | cluster_labels = clustering_model.fit_predict(X_cluster) 90 | 91 | if hasattr(clustering_model, "n_clusters_"): 92 | n_children = clustering_model.n_clusters_ 93 | else: 94 | n_children = len(np.unique(cluster_labels)) 95 | 96 | # We first check if all child clusters meet the minimum size requirement 97 | valid_split = True 98 | children_indices = [] 99 | for i in range(n_children): 100 | child_indices = cluster_indices[np.nonzero(cluster_labels == i)[0]] 101 | if len(child_indices) >= self.bahc_min_cluster_size: 102 | children_indices.append(child_indices) 103 | else: 104 | valid_split = False 105 | break 106 | 107 | # If all children clusters are of sufficient size, we check if the score of any child cluster is greater than or equal to the current score 108 | if valid_split: 109 | valid_split = False 110 | child_scores = [] 111 | for child_indices in children_indices: 112 | y_cluster = y[child_indices] 113 | complement_mask = np.ones(n_samples, dtype=bool) 114 | complement_mask[child_indices] = False 115 | y_complement = y[complement_mask] 116 | child_score = np.mean(y_complement) - np.mean(y_cluster) 117 | if child_score >= score + self.margin: 118 | valid_split = True 119 | child_scores.append(child_score) 120 | 121 | # If the split is valid, we create the children nodes and split the current node 122 | # Otherwise, we add the current node to the leaves 123 | if valid_split: 124 | # TODO: Make this nicer! 
125 | # TODO: Maybe explain why we negate std before pushing to heap 126 | first_child_indices = children_indices[0] 127 | first_child_std = np.std(y[first_child_indices]) 128 | first_child_score = child_scores[0] 129 | first_child = ClusterNode(label, -first_child_std, first_child_score) 130 | heapq.heappush(heap, first_child) 131 | labels[first_child_indices] = label 132 | children = [first_child] 133 | for i in range(1, n_children): 134 | child_indices = children_indices[i] 135 | child_std = np.std(y[child_indices]) 136 | child_score = child_scores[i] 137 | child_node = ClusterNode(self.n_clusters_, -child_std, child_score) 138 | heapq.heappush(heap, child_node) 139 | labels[child_indices] = self.n_clusters_ 140 | children.append(child_node) 141 | self.n_clusters_ += 1 142 | node.split(clustering_model, children) 143 | else: 144 | leaves.append(node) 145 | 146 | leaves.extend(heap) 147 | leaf_scores = np.array([leaf.score for leaf in leaves]) 148 | # We sort clusters by decreasing scores 149 | sorted_indices = np.argsort(-leaf_scores) 150 | self.scores_ = leaf_scores[sorted_indices] 151 | leaf_labels = np.array([leaf.label for leaf in leaves]) 152 | leaf_labels = leaf_labels[sorted_indices] 153 | label_mapping = np.zeros(self.n_clusters_, dtype=np.uint32) 154 | label_mapping[leaf_labels] = np.arange(self.n_clusters_, dtype=np.uint32) 155 | self.labels_ = label_mapping[labels] 156 | for leaf in leaves: 157 | leaf.label = label_mapping[leaf.label] 158 | return self 159 | 160 | def predict(self, X): 161 | """Predict the cluster labels for the given data. 162 | 163 | Parameters 164 | ---------- 165 | X : array-like of shape (n_samples, n_features) 166 | """ 167 | # TODO: Assert that fit has been called 168 | # TODO: Assert that X has the same number of features as the data used to fit 169 | # TODO: Assert that clustering_model has predict method 170 | # TODO: Validate X 171 | n_samples, _ = X.shape 172 | labels = np.zeros(n_samples, dtype=np.uint32) 173 | queue = deque([(self.cluster_tree_, np.arange(n_samples))]) 174 | while queue: 175 | node, indices = queue.popleft() 176 | if node.is_leaf: 177 | labels[indices] = node.label 178 | else: 179 | cluster = X[indices] 180 | clustering_model = node.clustering_model 181 | cluster_labels = clustering_model.predict(cluster) 182 | if hasattr(clustering_model, "n_clusters_"): 183 | n_clusters = clustering_model.n_clusters_ 184 | else: 185 | n_clusters = len(np.unique(cluster_labels)) 186 | for i in range(n_clusters): 187 | child_indices = indices[np.nonzero(cluster_labels == i)[0]] 188 | if child_indices.size > 0: 189 | queue.append((node.children[i], child_indices)) 190 | return labels 191 | -------------------------------------------------------------------------------- /unsupervised_bias_detection/cluster/_cluster_node.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from sklearn.base import ClusterMixin 3 | from typing import Self 4 | 5 | class ClusterNode: 6 | def __init__(self, label: int, neg_std: float, score: float): 7 | """ 8 | Initialize a node in the cluster tree. 
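Besides ``label``, a node also stores ``neg_std`` (the negated standard deviation of the metric within the cluster; negated so that Python's min-heap pops the highest-spread cluster first) and ``score`` (the cluster's discrimination score).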
9 | 10 | Parameters 11 | ---------- 12 | label : int 13 | The cluster label for this node (required as all nodes start as leaves) 14 | """ 15 | self.label = label 16 | self.neg_std = neg_std 17 | self.score = score 18 | self.clustering_model = None 19 | self.children = [] 20 | 21 | @property 22 | def is_leaf(self): 23 | return len(self.children) == 0 24 | 25 | def __lt__(self, other: Self): 26 | # TODO: Use score before label 27 | return self.neg_std < other.neg_std or (self.neg_std == other.neg_std and self.label < other.label) 28 | 29 | def split(self, clustering_model: ClusterMixin, children: list[Self]): 30 | """ 31 | Split this node by setting its clustering model and adding children. 32 | 33 | This converts the node to an internal node and removes its label 34 | 35 | Parameters 36 | ---------- 37 | clustering_model : ClusterMixin 38 | The clustering model used to split this node 39 | children : list of ClusterNode 40 | The child nodes resulting from the split 41 | """ 42 | self.label = None 43 | self.clustering_model = clustering_model 44 | self.children = children 45 | 46 | def get_leaves(self) -> list[Self]: 47 | """ 48 | Get all leaf nodes in the subtree rooted at this node. 49 | 50 | Returns 51 | ------- 52 | list of ClusterNode 53 | All leaf nodes in the subtree 54 | """ 55 | if not self.children: 56 | return [self] 57 | 58 | leaves = [] 59 | for child in self.children: 60 | leaves.extend(child.get_leaves()) 61 | return leaves -------------------------------------------------------------------------------- /unsupervised_bias_detection/cluster/_kmeans.py: -------------------------------------------------------------------------------- 1 | from ._bahc import BiasAwareHierarchicalClustering 2 | from sklearn.base import BaseEstimator, ClusterMixin 3 | from sklearn.cluster import KMeans 4 | 5 | 6 | class BiasAwareHierarchicalKMeans(BaseEstimator, ClusterMixin): 7 | """Bias-Aware Hierarchical k-Means Clustering. 8 | 9 | Parameters 10 | ---------- 11 | bahc_max_iter : int 12 | Maximum number of iterations. 13 | bahc_min_cluster_size : int 14 | Minimum size of a cluster. 15 | kmeans_params : dict 16 | k-means parameters 17 | 18 | Attributes 19 | ---------- 20 | n_clusters_ : int 21 | The number of clusters found by the algorithm. 22 | labels_ : ndarray of shape (n_samples,) 23 | Cluster labels for each point. Lower labels correspond to higher discrimination scores. 24 | scores_ : ndarray of shape (n_clusters_,) 25 | Discrimination scores for each cluster. 26 | 27 | References 28 | ---------- 29 | .. [1] J. Misztal-Radecka, B. Indurkhya, "Bias-Aware Hierarchical Clustering for detecting the discriminated 30 | groups of users in recommendation systems", Information Processing & Management, vol. 58, no. 3, May. 2021. 
31 | 
32 |     Examples
33 |     --------
34 |     >>> from unsupervised_bias_detection.cluster import BiasAwareHierarchicalKMeans
35 |     >>> import numpy as np
36 |     >>> X = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]])
37 |     >>> y = np.array([0, 0, 0, 10, 10, 10])
38 |     >>> bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=1, bahc_min_cluster_size=1, random_state=12).fit(X, y)
39 |     >>> bahc.labels_
40 |     array([0, 0, 0, 1, 1, 1], dtype=uint32)
41 |     >>> bahc.scores_
42 |     array([ 10., -10.])
43 |     """
44 | 
45 |     def __init__(
46 |         self,
47 |         bahc_max_iter,
48 |         bahc_min_cluster_size,
49 |         **kmeans_params,
50 |     ):
51 |         if "n_clusters" not in kmeans_params:
52 |             kmeans_params["n_clusters"] = 2
53 | 
54 |         if "n_init" not in kmeans_params:
55 |             kmeans_params["n_init"] = "auto"
56 | 
57 |         self.bahc_max_iter = bahc_max_iter
58 |         self.bahc_min_cluster_size = bahc_min_cluster_size
59 |         self._bahc = BiasAwareHierarchicalClustering(
60 |             KMeans,
61 |             bahc_max_iter,
62 |             bahc_min_cluster_size,
63 |             **kmeans_params,
64 |         )
65 | 
66 |     def fit(self, X, y):
67 |         self._bahc.fit(X, y)
68 |         self.n_clusters_ = self._bahc.n_clusters_
69 |         self.labels_ = self._bahc.labels_
70 |         self.scores_ = self._bahc.scores_
71 |         self.cluster_tree_ = self._bahc.cluster_tree_
72 |         return self
73 | 
74 |     def predict(self, X):
75 |         return self._bahc.predict(X)
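
A minimal usage sketch for the class above, assuming the package layout shown in this repository (the class re-exported from unsupervised_bias_detection.cluster) and purely illustrative parameter values: fit on numerical features X together with a per-sample bias metric y (for example, model errors), then map unseen points onto the discovered clusters with predict.

    import numpy as np
    from unsupervised_bias_detection.cluster import BiasAwareHierarchicalKMeans

    # Features and a per-sample bias metric (higher y = worse outcome for that sample).
    X = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]], dtype=float)
    y = np.array([0, 0, 0, 10, 10, 10], dtype=float)

    bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2, random_state=0)
    bahc.fit(X, y)
    print(bahc.n_clusters_)   # number of clusters found
    print(bahc.scores_)       # discrimination scores, sorted in decreasing order
    print(bahc.predict(np.array([[1.0, 1.0], [9.0, 3.0]])))  # cluster labels for unseen points

The number of clusters and the scores depend on the data and on bahc_max_iter / bahc_min_cluster_size; the values above are placeholders only.
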
--------------------------------------------------------------------------------
/unsupervised_bias_detection/cluster/_kmodes.py:
--------------------------------------------------------------------------------
1 | from ._bahc import BiasAwareHierarchicalClustering
2 | from kmodes.kmodes import KModes
3 | from sklearn.base import BaseEstimator, ClusterMixin
4 | 
5 | 
6 | class BiasAwareHierarchicalKModes(BaseEstimator, ClusterMixin):
7 |     """Bias-Aware Hierarchical k-Modes Clustering.
8 | 
9 |     Parameters
10 |     ----------
11 |     bahc_max_iter : int
12 |         Maximum number of iterations.
13 |     bahc_min_cluster_size : int
14 |         Minimum size of a cluster.
15 |     kmodes_params : dict
16 |         k-modes parameters
17 | 
18 |     Attributes
19 |     ----------
20 |     n_clusters_ : int
21 |         The number of clusters found by the algorithm.
22 |     labels_ : ndarray of shape (n_samples,)
23 |         Cluster labels for each point. Lower labels correspond to higher discrimination scores.
24 |     scores_ : ndarray of shape (n_clusters_,)
25 |         Discrimination scores for each cluster.
26 | 
27 |     References
28 |     ----------
29 |     .. [1] J. Misztal-Radecka, B. Indurkhya, "Bias-Aware Hierarchical Clustering for detecting the discriminated
30 |        groups of users in recommendation systems", Information Processing & Management, vol. 58, no. 3, May. 2021.
31 | 
32 |     Examples
33 |     --------
34 |     >>> from unsupervised_bias_detection.cluster import BiasAwareHierarchicalKModes
35 |     >>> import numpy as np
36 |     >>> X = np.array([[0, 1], [0, 2], [0, 0], [1, 4], [1, 5], [1, 3]])
37 |     >>> y = np.array([0, 0, 0, 10, 10, 10])
38 |     >>> bahc = BiasAwareHierarchicalKModes(bahc_max_iter=1, bahc_min_cluster_size=1, random_state=12).fit(X, y)
39 |     >>> bahc.labels_
40 |     array([0, 0, 0, 1, 1, 1], dtype=uint32)
41 |     >>> bahc.scores_
42 |     array([ 10., -10.])
43 |     """
44 | 
45 |     def __init__(self, bahc_max_iter, bahc_min_cluster_size, **kmodes_params):
46 |         if "n_clusters" not in kmodes_params:
47 |             kmodes_params["n_clusters"] = 2
48 | 
49 |         self.bahc_max_iter = bahc_max_iter
50 |         self.bahc_min_cluster_size = bahc_min_cluster_size
51 |         self._hbac = BiasAwareHierarchicalClustering(
52 |             KModes, bahc_max_iter, bahc_min_cluster_size, **kmodes_params
53 |         )
54 | 
55 |     def fit(self, X, y):
56 |         self._hbac.fit(X, y)
57 |         self.n_clusters_ = self._hbac.n_clusters_
58 |         self.labels_ = self._hbac.labels_
59 |         self.scores_ = self._hbac.scores_
60 |         self.cluster_tree_ = self._hbac.cluster_tree_
61 |         return self
62 | 
63 |     def predict(self, X):
64 |         return self._hbac.predict(X)
--------------------------------------------------------------------------------
/unsupervised_bias_detection/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """The :mod:`unsupervised_bias_detection.utils` module implements utility functions."""
2 | 
3 | from ._get_column_dtypes import get_column_dtypes
4 | 
5 | __all__ = [
6 |     "get_column_dtypes",
7 | ]
--------------------------------------------------------------------------------
/unsupervised_bias_detection/utils/_get_column_dtypes.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | 
4 | 
5 | def get_column_dtypes(data) -> dict:
6 |     """
7 |     Return a dictionary mapping column names to abstract data types that are compatible with the processor.
8 | 
9 |     The mapping is as follows:
10 |     - float64, float32, int64, int32 -> "numerical"
11 |     - bool -> "boolean"
12 |     - datetime64[...] -> "datetime"
13 |     - timedelta64[...] -> "timedelta"
14 |     - All others (e.g., object) -> "categorical"
15 |     """
16 |     def map_dtype(dtype: str) -> str:
17 |         if dtype in ['float64', 'float32', 'int64', 'int32']:
18 |             return "numerical"
19 |         elif dtype == 'bool':
20 |             return "boolean"
21 |         elif 'datetime' in dtype:
22 |             return "datetime"
23 |         elif 'timedelta' in dtype:
24 |             return "timedelta"
25 |         else:
26 |             return "categorical"
27 | 
28 |     if isinstance(data, pd.DataFrame):
29 |         return {col: map_dtype(str(dtype)) for col, dtype in data.dtypes.items()}
30 |     elif isinstance(data, np.ndarray) and data.dtype.names is not None:
31 |         return {name: map_dtype(str(data.dtype.fields[name][0])) for name in data.dtype.names}
32 |     else:
33 |         raise TypeError("Data must be a pandas DataFrame or a structured numpy array.")
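
A short illustration of the dtype mapping implemented above; the column names and values are invented for the example, and only pandas is assumed to be installed.

    import pandas as pd
    from unsupervised_bias_detection.utils import get_column_dtypes

    df = pd.DataFrame(
        {
            "age": [34, 51, 29],                    # int64    -> "numerical"
            "income": [42000.0, 58500.0, 39900.0],  # float64  -> "numerical"
            "approved": [True, False, True],        # bool     -> "boolean"
            "signup": pd.to_datetime(["2021-01-05", "2021-02-11", "2021-03-02"]),  # datetime64 -> "datetime"
            "branch": ["north", "south", "north"],  # object   -> "categorical"
        }
    )
    print(get_column_dtypes(df))
    # {'age': 'numerical', 'income': 'numerical', 'approved': 'boolean',
    #  'signup': 'datetime', 'branch': 'categorical'}
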
--------------------------------------------------------------------------------
/unsupervised_bias_detection/utils/dataset.py:
--------------------------------------------------------------------------------
1 | """Provides a default dataset with a healthcare focus for users to use."""
2 | import fairlearn.datasets as data
3 | 
4 | 
5 | def load_default_dataset():
6 |     """
7 |     Fetch a default healthcare dataset for use in x, y format.
8 | 
9 |     This function loads the diabetes hospital dataset from Microsoft's Fairlearn package and returns x and y as pandas objects.
10 |     The dataset indicates whether a patient will be readmitted to the hospital because of their diabetes.
11 | 
12 |     Parameters
13 |     ----------
14 |     None
15 | 
16 |     Returns
17 |     -------
18 |     diabetes_dataset_x: pandas.core.frame.DataFrame
19 |         The features from the diabetes hospital dataset.
20 |     diabetes_dataset_y: pandas.core.series.Series
21 |         The target label (true label) for the diabetes hospital dataset, readmit_30_days.
22 |         The variable readmit_binary could be used as the target instead.
23 | 
24 |     See Also
25 |     --------
26 |     Dataset details: https://fairlearn.org/main/user_guide/datasets/diabetes_hospital_data.html
27 | 
28 |     Example
29 |     --------
30 |     >>> from unsupervised_bias_detection.utils import load_default_dataset
31 |     >>> x, y = load_default_dataset()
32 |     >>> x
33 |     pandas.dataframe( race gender age ... had_outpatient_days readmitted readmit_binary
34 |     0 Caucasian Female '30 years or younger' ... False NO 0
35 |     1 Caucasian Female '30 years or younger' ... False >30 1
36 |     2 AfricanAmerican Female '30 years or younger' ... True NO 0
37 |     3 Caucasian Male '30-60 years' ... False NO 0
38 |     4 Caucasian Male '30-60 years' ... False NO 0
39 |     ... ... ... ... ... ... ... ...
40 |     101761 AfricanAmerican Male 'Over 60 years' ... False >30 1
41 |     101762 AfricanAmerican Female 'Over 60 years' ... False NO 0
42 |     101763 Caucasian Male 'Over 60 years' ... True NO 0
43 |     101764 Caucasian Female 'Over 60 years' ... False NO 0
44 |     101765 Caucasian Male 'Over 60 years' ... False NO 0)
45 | 
46 |     [101766 rows x 24 columns]
47 |     >>> y
48 |     pandas.series( 0 0
49 |     1 0
50 |     2 0
51 |     3 0
52 |     4 0
53 |     ..
54 |     101761 0
55 |     101762 0
56 |     101763 0
57 |     101764 0
58 |     101765 0)
59 |     Name: readmit_30_days, Length: 101766, dtype: int64
60 |     """
61 |     print(
62 |         "Note: it is up to the user to train a model on the provided data before running the bias detection "
63 |         "tool, either via the Algorithm Audit website demo or via the unsupervised_bias_detection "
64 |         "package."
65 |     )
66 |     diabetes_dataset_x, diabetes_dataset_y = data.fetch_diabetes_hospital(
67 |         return_X_y=True
68 |     )
69 |     return diabetes_dataset_x, diabetes_dataset_y
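
A minimal sketch of how the default dataset could feed a simple model before bias detection; the classifier, the one-hot encoding, and the decision to drop the alternative target columns are illustrative choices, not part of this package.

    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from unsupervised_bias_detection.utils.dataset import load_default_dataset

    x, y = load_default_dataset()
    # One-hot encode categorical columns and drop the alternative target columns
    # (readmitted, readmit_binary) so they do not leak into the features.
    features = pd.get_dummies(x.drop(columns=["readmitted", "readmit_binary"]))
    x_train, x_test, y_train, y_test = train_test_split(
        features, y, test_size=0.2, random_state=0
    )

    model = LogisticRegression(max_iter=1000).fit(x_train, y_train)
    errors = (model.predict(x_test) != y_test).astype(int)  # per-sample errors, usable as a bias metric
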
--------------------------------------------------------------------------------
/unsupervised_bias_detection/utils/validation.py:
--------------------------------------------------------------------------------
1 | """Provides functions for testing dataset properties."""
2 | import pandas as pd
3 | 
4 | 
5 | # TODO: add functionality to complete checks if dealing with a numpy array instead of pandas
6 | 
7 | 
8 | def _data_preprocessing(data):
9 |     """
10 |     Validate that the dataset is a pandas DataFrame and extract the features, predictions, and true labels.
11 | 
12 |     This non-public method checks that the dataset is a pandas DataFrame. It extracts the features, predictions,
13 |     and true labels from the dataset and returns them.
14 | 
15 |     Parameters
16 |     ----------
17 |     data: pandas dataframe
18 | 
19 |     Returns
20 |     -------
21 |     features: pandas.core.series.Series
22 |     predictions: pandas.core.series.Series
23 |     true_labels: pandas.core.series.Series
24 |     """
25 |     if not isinstance(data, pd.DataFrame):
26 |         raise ValueError("Data must be of type pandas.DataFrame.")
27 | 
28 |     column_length = len(data.columns)
29 |     features = data.iloc[:, column_length - 3]
30 |     predictions = data.iloc[:, column_length - 2]
31 |     true_labels = data.iloc[:, column_length - 1]
32 |     return features, predictions, true_labels
33 | 
34 | 
35 | def _check_numerical_x_y(features, predictions, true_labels):
36 |     """
37 |     Test that the x (features) and y (preds/labels) are numerical.
38 | 
39 |     Parameters
40 |     ----------
41 |     features: pandas.core.series.Series
42 |     predictions: pandas.core.series.Series
43 |     true_labels: pandas.core.series.Series
44 | 
45 |     Returns
46 |     -------
47 |     None
48 |     """
49 |     for i in range(len(features)):
50 |         row = features[i]
51 |         pred = str(predictions[i])
52 |         true_lab = str(true_labels[i])
53 |         for x in range(len(row)):
54 |             # numerical x check
55 |             if not str(row[x]).isnumeric():
56 |                 raise ValueError("Features must be numeric.")
57 |         # numerical y check
58 |         if not (pred.isnumeric() and true_lab.isnumeric()):
59 |             raise ValueError("Labels and predictions must be numeric.")
60 |     return
61 | 
62 | 
63 | def _check_binary_class(predictions, true_labels):
64 |     """
65 |     Test that the predictions and true labels are binary in value (0 or 1).
66 | 
67 |     Parameters
68 |     ----------
69 |     predictions: pandas.core.series.Series
70 |     true_labels: pandas.core.series.Series
71 | 
72 |     Returns
73 |     -------
74 |     None
75 |     """
76 |     for i in range(len(predictions)):
77 |         pred = str(predictions[i])
78 |         true_lab = str(true_labels[i])
79 |         if not ((pred == "0" or pred == "1") and (true_lab == "0" or true_lab == "1")):
80 |             raise ValueError(
81 |                 "Labels and predictions should be 0 or 1 for binary classification."
82 |             )
83 |     return
84 | 
85 | 
86 | # Public method that runs the private functions to test 3 properties of the dataset
87 | def run_checks(data):
88 |     """
89 |     Run all the property tests for the dataset by calling the private methods in this file.
90 | 
91 |     Parameters
92 |     ----------
93 |     data: pandas dataframe
94 | 
95 |     Returns
96 |     -------
97 |     None
98 | 
99 |     Example
100 |     --------
101 |     >>> from unsupervised_bias_detection.utils.validation import run_checks
102 |     >>> data_dict = {'x': [[1, 2, 3], [3, 2, 1], [4, 5, 6]], 'preds': [0, 1, 1], 'true_labels': [0, 0, 1]}
103 |     >>> data_df = pd.DataFrame(data=data_dict)
104 |     >>> data_df
105 |     x preds true_labels
106 |     0 [1, 2, 3] 0 0
107 |     1 [3, 2, 1] 1 0
108 |     2 [4, 5, 6] 1 1
109 |     >>> run_checks(data_df)
110 |     """
111 |     print("Beginning testing...")
112 |     features, predictions, true_labels = _data_preprocessing(data)
113 |     _check_numerical_x_y(features, predictions, true_labels)
114 |     _check_binary_class(predictions, true_labels)
115 |     print("No errors, finished testing.")
116 |     return
117 | 
--------------------------------------------------------------------------------
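
A sketch that ties the pieces together end to end; the three-column layout follows the run_checks example above, the toy data is invented, and using per-sample errors as the bias metric y is an illustrative choice.

    import numpy as np
    import pandas as pd
    from unsupervised_bias_detection.cluster import BiasAwareHierarchicalKMeans
    from unsupervised_bias_detection.utils.validation import run_checks

    X = np.array([[1, 2, 3], [3, 2, 1], [4, 5, 6], [6, 5, 4]])
    preds = np.array([0, 1, 1, 0])
    true_labels = np.array([0, 0, 1, 1])

    # Validate the expected layout (feature column, predictions, true labels) before scanning.
    run_checks(pd.DataFrame({"x": list(X), "preds": preds, "true_labels": true_labels}))

    # Cluster on the features, using per-sample errors as the bias metric.
    errors = (preds != true_labels).astype(float)
    bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=3, bahc_min_cluster_size=2, random_state=0)
    bahc.fit(X, errors)
    print(bahc.labels_, bahc.scores_)
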