├── BIAS ├── models │ ├── empty.txt │ ├── targetnames.npy │ ├── opt_cnn_model-100.keras │ ├── opt_cnn_model-200.keras │ ├── opt_cnn_model-30.keras │ ├── opt_cnn_model-50.keras │ ├── opt_cnn_model-500.keras │ └── opt_cnn_model-600.keras ├── __init__.py ├── install.r ├── Create_RF.py ├── uniform_test.py ├── SB_Toolbox.py └── SB_Test_runner.py ├── .gitignore ├── example.py ├── requirements.txt ├── setup.py ├── .github └── workflows │ ├── python-publish.yml │ └── docker-publish.yml ├── LICENSE.md ├── Dockerfile └── README.md /BIAS/models/empty.txt: -------------------------------------------------------------------------------- 1 | Place the Random Forest models in this folder. -------------------------------------------------------------------------------- /BIAS/models/targetnames.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikivanstein/BIAS/HEAD/BIAS/models/targetnames.npy -------------------------------------------------------------------------------- /BIAS/models/opt_cnn_model-100.keras: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikivanstein/BIAS/HEAD/BIAS/models/opt_cnn_model-100.keras -------------------------------------------------------------------------------- /BIAS/models/opt_cnn_model-200.keras: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikivanstein/BIAS/HEAD/BIAS/models/opt_cnn_model-200.keras -------------------------------------------------------------------------------- /BIAS/models/opt_cnn_model-30.keras: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikivanstein/BIAS/HEAD/BIAS/models/opt_cnn_model-30.keras -------------------------------------------------------------------------------- /BIAS/models/opt_cnn_model-50.keras: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikivanstein/BIAS/HEAD/BIAS/models/opt_cnn_model-50.keras -------------------------------------------------------------------------------- /BIAS/models/opt_cnn_model-500.keras: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikivanstein/BIAS/HEAD/BIAS/models/opt_cnn_model-500.keras -------------------------------------------------------------------------------- /BIAS/models/opt_cnn_model-600.keras: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikivanstein/BIAS/HEAD/BIAS/models/opt_cnn_model-600.keras -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | structured_data_classifier 2 | BIAS/__pycache__ 3 | env 4 | BIAS/models/RF 5 | *.pkl 6 | __pycache__ 7 | dist 8 | build 9 | *.egg-info -------------------------------------------------------------------------------- /BIAS/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import rpy2.robjects as robjects 3 | 4 | 5 | # get the value of the environment variable HOME 6 | R_installed = os.getenv("R_PACKAGES_INSTALLED") 7 | 8 | if R_installed != "Yes": 9 | dirname = os.path.dirname(__file__) 10 | robjects.r.source(f"{dirname}/install.r", encoding="utf-8") 11 | os.environ["R_PACKAGES_INSTALLED"] = 
"Yes" 12 | 13 | from .SB_Toolbox import BIAS, f0 14 | 15 | __all__ = ("BIAS", "f0") 16 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | #example of using the BIAS toolbox to test a DE algorithm 2 | 3 | from scipy.optimize import differential_evolution 4 | import numpy as np 5 | from BIAS import BIAS, f0, install_r_packages 6 | 7 | install_r_packages() 8 | 9 | bounds = [(0,1), (0, 1), (0, 1), (0, 1), (0, 1)] 10 | 11 | #do 30 independent runs (5 dimensions) 12 | samples = [] 13 | print("Performing optimization method 50 times of f0.") 14 | for i in np.arange(50): 15 | result = differential_evolution(f0, bounds, maxiter=100) 16 | samples.append(result.x) 17 | 18 | samples = np.array(samples) 19 | 20 | test = BIAS() 21 | # use the classical stastistical approach to detect BIAS 22 | print(test.predict(samples, show_figure=True)) 23 | 24 | #use the trained deep learning model to predict and explain BIAS 25 | y, preds = test.predict_deep(samples) 26 | test.explain(samples, preds, filename="explanation.png") 27 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | backports.zoneinfo==0.2.1;python_version<"3.9" 2 | certifi==2021.10.8 3 | cffi==1.15.0 4 | charset-normalizer==2.0.9 5 | cycler==0.11.0 6 | fonttools==4.28.3 7 | idna==3.3 8 | Jinja2==3.0.3 9 | kiwisolver==1.3.2 10 | MarkupSafe==2.0.1 11 | matplotlib==3.6.2 12 | numpy==1.22.0 13 | packaging==21.3 14 | pandas==1.3.4 15 | patsy==0.5.2 16 | Pillow==8.4.0 17 | pycparser==2.21 18 | pyparsing==3.0.6 19 | python-dateutil==2.8.2 20 | pytz==2021.3 21 | pytz-deprecation-shim==0.1.0.post0 22 | requests==2.26.0 23 | rpy2==3.4.5 24 | setuptools-scm==6.3.2 25 | scikit-learn==1.3.1 26 | scipy==1.7.3 27 | seaborn==0.13.2 28 | six==1.16.0 29 | statsmodels==0.13.1 30 | threadpoolctl==3.0.0 31 | tomli==1.2.2 32 | tzdata==2021.5 33 | tzlocal==4.1 34 | urllib3==1.26.7 35 | wget==3.2 36 | zenodo-get==1.3.4 37 | shap==0.42.0 38 | tensorflow==2.10.0 39 | autokeras==1.0.20 40 | -------------------------------------------------------------------------------- /BIAS/install.r: -------------------------------------------------------------------------------- 1 | 2 | r = getOption("repos") 3 | r["CRAN"] = "http://cran.us.r-project.org" 4 | options(repos = r) 5 | for (x in c("zoo", "lgarch", "Rcpp", "RcppArmadillo", 'nortest', 'data.table', 'goftest')){ 6 | if (!require(x,character.only = TRUE)) 7 | { 8 | install.packages(x,dep=TRUE) 9 | if(!require(x,character.only = TRUE)) stop("Package not found") 10 | } 11 | } 12 | 13 | if (!require("AutoSEARCH", character.only = TRUE)) { 14 | #install AutoSearch 15 | url <- "https://cran.r-project.org/src/contrib/Archive/AutoSEARCH/AutoSEARCH_1.5.tar.gz" 16 | pkgFile <- "AutoSEARCH_1.5.tar.gz" 17 | download.file(url = url, destfile = pkgFile) 18 | 19 | # Install package 20 | install.packages(pkgs=pkgFile, type="source", repos=NULL) 21 | 22 | # Delete package tarball 23 | unlink(pkgFile) 24 | } 25 | 26 | if (!require("PoweR", character.only = TRUE)) { 27 | #install PoweR 28 | url <- "https://cran.r-project.org/src/contrib/Archive/PoweR/PoweR_1.0.7.tar.gz" 29 | pkgFile <- "PoweR_1.0.7.tar.gz" 30 | download.file(url = url, destfile = pkgFile) 31 | 32 | # Install package 33 | install.packages(pkgs=pkgFile, type="source", repos=NULL) 34 | 35 | # Delete package tarball 36 | 
unlink(pkgFile) 37 | } 38 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import setuptools 3 | 4 | with open("README.md", "r") as fh: 5 | long_description = fh.read() 6 | 7 | __version__ = "1.4.1" 8 | gh_ref = os.environ.get("GITHUB_REF") 9 | if gh_ref: 10 | *_, tag = gh_ref.split("/") 11 | __version__ = tag.replace("v", "") 12 | 13 | setuptools.setup( 14 | name='struct-bias', 15 | version=__version__, 16 | author="Diederick Vermetten, Niki van Stein", 17 | author_email="d.l.vermetten@liacs.leidenuniv.nl", 18 | description="BIAS toolbox: Structural bias detection for continuous optimization algorithms", 19 | long_description=long_description, 20 | long_description_content_type="text/markdown", 21 | packages=setuptools.find_packages(), 22 | package_data={ 23 | 'BIAS': ['install.r', 'models/*'], 24 | }, 25 | python_requires='>=3.6', 26 | install_requires=[ 27 | 'numpy', 28 | 'tensorflow', 29 | 'shap', 30 | 'rpy2', 31 | 'scipy', 32 | 'pandas', 33 | 'scikit-learn', 34 | 'matplotlib', 35 | 'seaborn', 36 | 'statsmodels', 37 | 'regex', 38 | 'autokeras' 39 | ], 40 | classifiers=[ 41 | "Programming Language :: Python :: 3", 42 | "License :: OSI Approved :: MIT License", 43 | "Operating System :: OS Independent", 44 | ], 45 | ) 46 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Publish Python 🐍 distribution 📦 to PyPI 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | environment: 23 | name: pypi 24 | url: https://pypi.org/p/struct-bias 25 | permissions: 26 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing 27 | steps: 28 | - uses: actions/checkout@v3 29 | - name: Set up Python 30 | uses: actions/setup-python@v3 31 | with: 32 | python-version: '3.x' 33 | - name: Install dependencies 34 | run: | 35 | python -m pip install --upgrade pip 36 | pip install build 37 | - name: Build package 38 | run: python -m build 39 | - name: Publish package distributions to PyPI 40 | uses: pypa/gh-action-pypi-publish@release/v1 41 | with: 42 | verbose: true 43 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | ## License 2 | 3 | This application is governed by the __BSD 3-Clause license__. 4 | 5 | BSD 3-Clause License 6 | 7 | Copyright (c) 2021, 8 | All rights reserved. 9 | 10 | Redistribution and use in source and binary forms, with or without 11 | modification, are permitted provided that the following conditions are met: 12 | 13 | * Redistributions of source code must retain the above copyright notice, this 14 | list of conditions and the following disclaimer. 
15 | 16 | * Redistributions in binary form must reproduce the above copyright notice, 17 | this list of conditions and the following disclaimer in the documentation 18 | and/or other materials provided with the distribution. 19 | 20 | * Neither the name of the copyright holder nor the names of its 21 | contributors may be used to endorse or promote products derived from 22 | this software without specific prior written permission. 23 | 24 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 25 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 27 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 28 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 30 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 31 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 32 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 33 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 | 35 | ### Remarks ### 36 | 37 | 38 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | # Set the working directory in the container 5 | WORKDIR /app 6 | 7 | # Install system dependencies required for building R and other tools, including X11 libraries 8 | RUN apt-get update && apt-get install -y --no-install-recommends \ 9 | build-essential \ 10 | libcurl4-openssl-dev \ 11 | libssl-dev \ 12 | libxml2-dev \ 13 | libfontconfig1-dev \ 14 | libreadline-dev \ 15 | wget \ 16 | curl \ 17 | zlib1g-dev \ 18 | libbz2-dev \ 19 | liblzma-dev \ 20 | libpcre2-dev \ 21 | libpcre3-dev \ 22 | gfortran \ 23 | libx11-dev \ 24 | libxt-dev \ 25 | x11proto-core-dev \ 26 | libcairo2-dev \ 27 | xvfb \ 28 | && rm -rf /var/lib/apt/lists/* 29 | 30 | # Download and install R 4.1.2 31 | RUN wget https://cran.rstudio.com/src/base/R-4/R-4.1.2.tar.gz && \ 32 | tar zxvf R-4.1.2.tar.gz && \ 33 | cd R-4.1.2 && \ 34 | ./configure --enable-R-shlib --with-blas --with-lapack && \ 35 | make && \ 36 | make install && \ 37 | cd .. 
&& \ 38 | rm -rf R-4.1.2 R-4.1.2.tar.gz 39 | 40 | 41 | # Copy the current directory contents into the container at /app 42 | COPY ./BIAS /app/BIAS 43 | 44 | # Install R packages (add any necessary R packages here) 45 | RUN Rscript /app/BIAS/install.r 46 | 47 | ENV R_PACKAGES_INSTALLED=Yes 48 | 49 | # Copy example files 50 | COPY example.py /app/example.py 51 | COPY requirements.txt /app/requirements.txt 52 | COPY setup.py /app/setup.py 53 | COPY README.md /app/README.md 54 | 55 | # Install Python dependencies specified in requirements.txt 56 | RUN pip install --upgrade pip 57 | #RUN python setup.py install 58 | RUN pip install -r requirements.txt 59 | 60 | RUN apt-get update && apt-get install -y zip unzip 61 | 62 | # Download reference value files 63 | # Download and unzip the files from figshare 64 | RUN wget https://figshare.com/ndownloader/files/30591411 -O bias_data.zip && \ 65 | unzip bias_data.zip -d /app/BIAS/data/ && \ 66 | rm bias_data.zip 67 | 68 | RUN wget https://figshare.com/ndownloader/files/43106839 -O bias_models.zip && \ 69 | mkdir -p /app/BIAS/models/ && \ 70 | unzip bias_models.zip -d /app/BIAS/models/ && \ 71 | rm bias_models.zip 72 | 73 | # Install any additional dependencies for Jupyter notebooks 74 | RUN pip install jupyter 75 | 76 | # Set environment variables for R libraries 77 | ENV R_HOME=/usr/local/lib/R 78 | ENV LD_LIBRARY_PATH=/usr/local/lib/R/lib:/usr/local/lib/R/modules:$LD_LIBRARY_PATH 79 | 80 | # Copy tutorial file (last such that we can update it easily) 81 | COPY Tutorial.ipynb /app/Tutorial.ipynb 82 | 83 | # Expose the port that Jupyter will run on 84 | EXPOSE 8888 85 | 86 | # Add a script to start Jupyter automatically when the container starts 87 | CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root", "--NotebookApp.token=''"] 88 | 89 | # Optional: Add a health check 90 | #HEALTHCHECK --interval=30s CMD curl --fail http://localhost:8888 || exit 1 91 | -------------------------------------------------------------------------------- /.github/workflows/docker-publish.yml: -------------------------------------------------------------------------------- 1 | # 2 | name: Create and publish a Docker image 3 | 4 | # Configures this workflow to run every time a change is pushed to the branch called `master`. 5 | on: 6 | push: 7 | branches: ['master'] 8 | 9 | # Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds. 10 | env: 11 | REGISTRY: ghcr.io 12 | IMAGE_NAME: ${{ github.repository }} 13 | 14 | # There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu. 15 | jobs: 16 | build-and-push-image: 17 | runs-on: ubuntu-latest 18 | # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job. 19 | permissions: 20 | contents: read 21 | packages: write 22 | attestations: write 23 | id-token: write 24 | # 25 | steps: 26 | - name: Checkout repository 27 | uses: actions/checkout@v4 28 | # Uses the `docker/login-action` action to log in to the Container registry registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here. 
29 | - name: Log in to the Container registry 30 | uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 31 | with: 32 | registry: ${{ env.REGISTRY }} 33 | username: ${{ github.actor }} 34 | password: ${{ secrets.GITHUB_TOKEN }} 35 | # This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels. 36 | - name: Extract metadata (tags, labels) for Docker 37 | id: meta 38 | uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 39 | with: 40 | images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} 41 | # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages. 42 | # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository. 43 | # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step. 44 | - name: Build and push Docker image 45 | id: push 46 | uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 47 | with: 48 | context: . 49 | push: true 50 | tags: ${{ steps.meta.outputs.tags }} 51 | labels: ${{ steps.meta.outputs.labels }} 52 | 53 | # This step generates an artifact attestation for the image, which is an unforgeable statement about where and how it was built. It increases supply chain security for people who consume the image. For more information, see "[AUTOTITLE](/actions/security-guides/using-artifact-attestations-to-establish-provenance-for-builds)." 
54 | - name: Generate artifact attestation 55 | uses: actions/attest-build-provenance@v1 56 | with: 57 | subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME}} 58 | subject-digest: ${{ steps.push.outputs.digest }} 59 | push-to-registry: true 60 | 61 | -------------------------------------------------------------------------------- /BIAS/Create_RF.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import pickle 3 | 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import pandas as pd 7 | import seaborn as sbs 8 | from sklearn.ensemble import RandomForestClassifier 9 | from sklearn.metrics import confusion_matrix, f1_score 10 | from sklearn.model_selection import train_test_split 11 | 12 | from zipfile import ZipFile 13 | import requests 14 | from io import BytesIO 15 | import os 16 | 17 | 18 | test_names = [ 19 | "1-spacing", 20 | "2-spacing", 21 | "3-spacing", 22 | "ad", 23 | "ad_transform", 24 | "shapiro", 25 | "jb", 26 | "ddst", 27 | "kurtosis", 28 | "mmpd_min", 29 | "mmpd_max", 30 | "range", 31 | "min", 32 | "max", 33 | "mdd_min", 34 | "mdd_max", 35 | "wasserstein", 36 | "kolmogorov", 37 | "CvM", 38 | "Durbin", 39 | "Kuiper", 40 | "HG1", 41 | "HG2", 42 | "Greenwood", 43 | "QM", 44 | "RC", 45 | "Moran", 46 | "Cressie1", 47 | "Cressie2", 48 | "Vasicek", 49 | "Swartz", 50 | "Morales", 51 | "Pardo", 52 | "Marhuenda", 53 | "Zhang1", 54 | "Zhang2", 55 | ] 56 | 57 | readable_label_dict = { 58 | "gaps": "Gaps", 59 | "cauchy": "Center", 60 | "clusters": "Clusters", 61 | "inv_cauchy": "Bounds", 62 | "inv_norm": "Bounds", 63 | "norm": "Center", 64 | "part_unif": "Clusters", 65 | "shifted_spikes": "Discretization", 66 | "spikes": "Discretization", 67 | "trunc_unif": "Center", 68 | "bound_thing": "Bounds", 69 | } 70 | 71 | 72 | def create_RF_rej( 73 | included_tests=test_names, 74 | plot_feat_importance=False, 75 | use_bias_labels=False, 76 | feature_order=None, 77 | rf_file_name=None, 78 | ): 79 | dirname = os.path.dirname(__file__) 80 | r = requests.get("https://figshare.com/ndownloader/files/30590670") 81 | zipfile = ZipFile(BytesIO(r.content)) 82 | zipfile.extractall(f"{dirname}/models/RFs/") 83 | 84 | r = requests.get("https://figshare.com/ndownloader/files/30591417") 85 | zipfile = ZipFile(BytesIO(r.content)) 86 | zipfile.extractall(f"{dirname}/models/RFs/SB/") 87 | cols_to_get = included_tests + ["scen"] 88 | dt_samples = [] 89 | for sample_size in [30, 50, 100, 600]: 90 | for f in glob.glob(f"{dirname}/models/RFs/SB/S{sample_size}/*.csv"): 91 | dt_temp = pd.read_csv(f) 92 | # print(len(dt_temp)) 93 | if dt_temp["scen"][0] != "unif": 94 | # Remove samples for which no tests reject (non-biased) 95 | try: 96 | dt_rej_temp = pd.read_csv( 97 | f"{dirname}/models/RFs/SB/Rejections/S{sample_size}_A0.01_Cnone_{os.path.basename(f)}", 98 | index_col=0, 99 | ) 100 | 101 | dt_test_only = dt_rej_temp[included_tests] 102 | idxs_save = np.where(dt_test_only.transpose().sum() > 0) 103 | dt_samples.append(dt_rej_temp[cols_to_get].iloc[idxs_save]) 104 | except: 105 | next 106 | dt_samples = pd.concat(dt_samples) 107 | print(dt_samples.columns) 108 | print(included_tests) 109 | X = dt_samples[included_tests] 110 | if use_bias_labels: 111 | Y = [readable_label_dict[x] for x in dt_samples["scen"]] 112 | else: 113 | Y = dt_samples["scen"] 114 | 115 | rf = RandomForestClassifier(oob_score=True, class_weight="balanced") 116 | 117 | rf.fit(X, Y) 118 | 119 | if plot_feat_importance: 120 | plt.figure(figsize=(19, 10)) 121 | if feature_order is None: 
122 | sbs.barplot(x=included_tests, y=rf.feature_importances_) 123 | else: 124 | sbs.barplot( 125 | x=included_tests, y=rf.feature_importances_, order=feature_order 126 | ) 127 | plt.xticks(rotation=90) 128 | plt.tight_layout() 129 | plt.savefig(f"RF_feature_importance.pdf") 130 | plt.show() 131 | 132 | print(rf.oob_score_) 133 | 134 | if rf_file_name is not None: 135 | with open(f"{dirname}/models/RFs/{rf_file_name}.pkl", "wb") as output_file: 136 | pickle.dump(rf, output_file) 137 | return rf 138 | -------------------------------------------------------------------------------- /BIAS/uniform_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.polynomial.legendre import legval 3 | 4 | 5 | def ddst_base_legendre(x, j): 6 | """ 7 | Compute the j-th Legendre polynomial evaluated at x. 8 | 9 | Parameters: 10 | - x (array-like): The sample data. 11 | - j (int): The degree of the polynomial. 12 | 13 | Returns: 14 | - values (numpy.ndarray): The evaluated polynomial values at x. 15 | """ 16 | # Map x from [0, 1] to [-1, 1] 17 | x_mapped = 2 * np.array(x) - 1 18 | # Coefficients for the j-th Legendre polynomial 19 | coefs = np.zeros(j + 1) 20 | coefs[j] = 1 21 | values = legval(x_mapped, coefs) 22 | return values 23 | 24 | 25 | def ddst_phi(x, j, base): 26 | """ 27 | Compute the coefficient for the j-th term using the base function. 28 | 29 | Parameters: 30 | - x (array-like): The sample data. 31 | - j (int): The degree of the polynomial. 32 | - base (function): The orthonormal base function. 33 | 34 | Returns: 35 | - coefficient (float): The computed coefficient. 36 | """ 37 | # Evaluate the base function at x 38 | phi_values = base(x, j) 39 | # Compute the mean value 40 | coefficient = np.mean(phi_values) 41 | return coefficient 42 | 43 | 44 | def ddst_uniform_Nk(x, base=None, Dmax=10): 45 | """ 46 | Compute the cumulative sums for the data-driven smooth test of uniformity. 47 | 48 | Parameters: 49 | - x (array-like): The sample data. 50 | - base (function): The orthonormal base function to use (default is ddst_base_legendre). 51 | - Dmax (int): The maximum degree of the polynomial. 52 | 53 | Returns: 54 | - coord (numpy.ndarray): The cumulative sums of the transformed data. 55 | """ 56 | if base is None: 57 | base = ddst_base_legendre 58 | 59 | n = len(x) 60 | maxN = max(min(Dmax, n - 2, 20), 1) 61 | coord = np.zeros(maxN) 62 | for j in range(1, maxN + 1): 63 | coord[j - 1] = ddst_phi(x, j, base) 64 | coord = np.cumsum(coord**2 * n) 65 | return coord 66 | 67 | 68 | def ddst_IIC(coord, n, c=2.4): 69 | """ 70 | Compute the model selection index l using the Information Criterion. 71 | 72 | Parameters: 73 | - coord (numpy.ndarray): The cumulative sums. 74 | - n (int): Sample size. 75 | - c (float): Calibrating parameter in the penalty in the model selection rule. 76 | 77 | Returns: 78 | - l (int): The selected index (starting from 1). 79 | """ 80 | Dmax = len(coord) 81 | ic = coord - c * np.arange(1, Dmax + 1) 82 | l = np.argmin(ic) + 1 # Add 1 because numpy arrays are 0-indexed 83 | return l 84 | 85 | 86 | def ddst_uniform_test( 87 | x, 88 | base=ddst_base_legendre, 89 | d_n=10, 90 | c=2.4, 91 | nr=100000, 92 | compute_p=True, 93 | alpha=0.05, 94 | compute_cv=True, 95 | **kwargs, 96 | ): 97 | """ 98 | Data Driven Smooth Test for Uniformity. 99 | 100 | Parameters: 101 | - x (array-like): A (non-empty) numeric vector of data. 102 | - base (function): Function returning an orthonormal system (default is ddst_base_legendre). 
103 | - d_n (int): Maximum dimension considered. 104 | - c (float): Calibrating parameter in the penalty in the model selection rule. 105 | - nr (int): Number of runs for p-value and critical value computation. 106 | - compute_p (bool): Whether to compute a p-value. 107 | - alpha (float): Significance level. 108 | - compute_cv (bool): Whether to compute a critical value corresponding to alpha. 109 | - kwargs: Further arguments. 110 | 111 | Returns: 112 | - result (dict): A dictionary containing test results. 113 | """ 114 | # Only Legendre base is implemented yet 115 | base = ddst_base_legendre 116 | method_name = "ddst_base_legendre" 117 | 118 | x = np.asarray(x) 119 | n = len(x) 120 | if n < 5: 121 | raise ValueError("length(x) should be at least 5") 122 | 123 | # Compute coordinates 124 | coord = ddst_uniform_Nk(x, base=base, Dmax=d_n) 125 | # Compute model selection index l 126 | l = ddst_IIC(coord, n, c) 127 | # Test statistic t 128 | t = coord[l - 1] # Adjust for zero-based indexing 129 | # Coordinates differences 130 | coord_diffs = coord - np.concatenate(([0], coord[:-1])) 131 | # Prepare result 132 | result = { 133 | "statistic": t, 134 | "parameter": l, 135 | "coordinates": coord_diffs, 136 | "method": "Data Driven Smooth Test for Uniformity", 137 | } 138 | 139 | # Compute p-value and critical value if required 140 | if compute_p or compute_cv: 141 | tmp = np.zeros(nr) 142 | for i in range(nr): 143 | y = np.random.uniform(0, 1, n) 144 | tmpC = ddst_uniform_Nk(y, base=base, Dmax=d_n) 145 | l_sim = ddst_IIC(tmpC, n, c) 146 | tmp[i] = tmpC[l_sim - 1] # Adjust index for zero-based indexing 147 | if compute_p: 148 | result["p_value"] = np.mean(tmp > t) 149 | if compute_cv: 150 | result["cv"] = np.quantile(tmp, alpha) 151 | 152 | # Construct data name 153 | data_name = f"x, base: {method_name} c: {c} d_n: {d_n}" + ( 154 | f" cv({alpha}) : {result['cv']:.5f}" if compute_cv else "" 155 | ) 156 | result["data_name"] = data_name 157 | 158 | return result 159 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep-BIAS: Bias In Algorithms, Structural 2 | ## A toolbox for detecting structural bias in continuous optimization heuristics. 3 | 4 | With a deep-learning extension to better evaluate the type of bias and gain insights using explainable AI 5 | 6 | 7 | 8 | ## Using the BIAS-Toolbox with Docker (Recommended) 9 | 10 | The BIAS-Toolbox can be used inside a Docker container, eliminating the need to manually install all dependencies and packages. Follow the steps below to run the Docker image, and to start working with the toolbox in a Jupyter notebook environment. We provide the following prebuild container: `ghcr.io/nikivanstein/bias:master` 11 | 12 | ### Prerequisites 13 | 14 | Make sure you have Docker installed on your system. You can install Docker by following the instructions [here](https://docs.docker.com/get-docker/). 15 | 16 | ### Steps to Run the Docker Image 17 | 18 | 1. **Pull the Prebuild Image** 19 | The following command will pull the prebuild image to your system. 20 | 21 | ```bash 22 | docker pull ghcr.io/nikivanstein/bias:master 23 | ``` 24 | 25 | 2. **Run the Prebuild Docker Container**: 26 | The following command will start the container and expose the Jupyter notebook interface on port `8888`: 27 | 28 | ```bash 29 | docker run -p 8888:8888 ghcr.io/nikivanstein/bias:master 30 | ``` 31 | 32 | 3. 
**Access the Jupyter Notebook**: 33 | After starting the container, you should see a message with instructions to access the Jupyter notebook. It will look something like this: 34 | 35 | ``` 36 | To access the notebook, open this file in a browser: 37 | http://127.0.0.1:8888/?token= 38 | ``` 39 | 40 | Open the provided URL in your web browser to start using the BIAS-Toolbox within Jupyter. 41 | 42 | 43 | ### Steps to Build the Dockerfile yourself 44 | 45 | 1. **Clone the Repository**: 46 | If you haven't already cloned the BIAS repository, do so with the following command: 47 | 48 | ```bash 49 | git clone https://github.com/nikivanstein/BIAS.git 50 | cd BIAS 51 | ``` 52 | 53 | 2. **Build the Docker Image**: 54 | The `Dockerfile` included in this repository will install all necessary dependencies (both Python and R), download required data and model files, and set up the environment. 55 | 56 | To build the Docker image, run the following command from the root of the repository (where the `Dockerfile` is located): 57 | 58 | ```bash 59 | docker build -t bias-toolbox . 60 | ``` 61 | 62 | This will create a Docker image named `bias-toolbox`. 63 | 64 | 3. **Run the Docker Container**: 65 | Once the image is built, you can run the container. The following command will start the container and expose the Jupyter notebook interface on port `8888`: 66 | 67 | ```bash 68 | docker run -p 8888:8888 bias-toolbox 69 | ``` 70 | 71 | 4. **Access the Jupyter Notebook**: 72 | After starting the container, you should see a message with instructions to access the Jupyter notebook. It will look something like this: 73 | 74 | ``` 75 | To access the notebook, open this file in a browser: 76 | http://127.0.0.1:8888/?token= 77 | ``` 78 | 79 | Open the provided URL in your web browser to start using the BIAS-Toolbox within Jupyter. 80 | 81 | 82 | ### Stopping the Container 83 | 84 | To stop the running Docker container, press `CTRL+C` in the terminal where the container is running, or find the container's ID with the command: 85 | 86 | ```bash 87 | docker ps 88 | ``` 89 | 90 | Then stop the container with: 91 | 92 | ```bash 93 | docker stop 94 | ``` 95 | 96 | ### Additional Notes 97 | 98 | - The image is configured to use Jupyter Notebook with R and Python integrations. 99 | - R version `4.1.2` is installed and configured along with the necessary R packages as specified in the `install.r` script. 100 | - Python dependencies are handled via the `requirements.txt` file. 101 | 102 | By using Docker, you can avoid issues related to dependency installation and system setup, providing a consistent environment for running the BIAS-Toolbox. 103 | 104 | 105 | ## Setup using Pip 106 | 107 | Another way of using the BIAS-Toolbox is by installing the pip package. 108 | 109 | This package requires an R-installation to be present. 110 | 111 | The package is tested with R 4.1.2 (install from source https://cran.r-project.org/src/base/R-4/R-4.1.2.tar.gz) 112 | 113 | The R packages will be installed automatically upon first importing BIAS. 114 | 115 | Install the BIAS toolbox using pip: 116 | 117 | pip install struct-bias 118 | 119 | This installs the following R packages: 120 | 121 | - PoweR 122 | - AutoSEARCH 123 | - nortest 124 | - data.table 125 | - goftest 126 | - ddst 127 | 128 | 129 | ### Detailed setup using virtual env 130 | 131 | 1. 
Download and install R from https://cran.r-project.org/ use version 4.1.2 132 | Example for Ubuntu based system: 133 | ```sh 134 | sudo wget https://cran.rstudio.com/src/base/R-4/R-4.1.2.tar.gz 135 | tar zxvf R-4.1.2.tar.gz 136 | cd R-4.1.2 137 | ./configure --enable-R-shlib --with-blas --with-lapack 138 | make 139 | sudo make install 140 | ``` 141 | 142 | 2. Download this repository (clone or as zip) 143 | 3. Create a python virtual env `python -m venv env` 144 | 4. Activate the env (in powershell for example: `env/Scripts/Activate.ps1 `) 145 | 5. Install dependencies `pip install -r requirements.txt` 146 | 6. Checkout the `example.py` to start using the BIAS toolbox. 147 | 148 | 149 | ## Example 150 | 151 | ```py 152 | #example of using the BIAS toolbox to test a DE algorithm 153 | 154 | from scipy.optimize import differential_evolution 155 | import numpy as np 156 | from BIAS import BIAS, f0 157 | 158 | bounds = [(0,1), (0, 1), (0, 1), (0, 1), (0, 1)] 159 | 160 | #do 30 independent runs (5 dimensions) 161 | samples = [] 162 | print("Performing optimization method 30 times of f0.") 163 | for i in np.arange(30): 164 | result = differential_evolution(f0, bounds, maxiter=100) 165 | samples.append(result.x) 166 | 167 | samples = np.array(samples) 168 | 169 | test = BIAS() 170 | print(test.predict(samples, show_figure=True)) 171 | 172 | y, preds = test.predict_deep(samples) 173 | test.explain(samples, preds, filename="explanation.png") 174 | ``` 175 | 176 | ## Additional files 177 | 178 | Note: The code for generating the RF used to predict the type of bias is included, but the full RF is not. These can be found on zenodo: https://doi.org/10.6084/m9.figshare.16546041. 179 | The RF models will be downloaded automatically the first time the predict function requires them. 180 | 181 | ### Citation 182 | 183 | If you use the BIAS toolbox in a scientific publication, we would appreciate using the following citations: 184 | 185 | ``` 186 | @ARTICLE{9828803, 187 | author={Vermetten, Diederick and van Stein, Bas and Caraffini, Fabio and Minku, Leandro L. 
and Kononova, Anna V.}, 188 | journal={IEEE Transactions on Evolutionary Computation}, 189 | title={BIAS: A Toolbox for Benchmarking Structural Bias in the Continuous Domain}, 190 | year={2022}, 191 | volume={26}, 192 | number={6}, 193 | pages={1380-1393}, 194 | doi={10.1109/TEVC.2022.3189848} 195 | } 196 | 197 | @software{niki_van_stein_2023_7803623, 198 | author = {Niki van Stein and 199 | Diederick Vermetten}, 200 | title = {Basvanstein/BIAS: v1.1 Deep-BIAS Toolbox}, 201 | month = apr, 202 | year = 2023, 203 | publisher = {Zenodo}, 204 | version = {v1.1}, 205 | doi = {10.5281/zenodo.7803623}, 206 | url = {https://doi.org/10.5281/zenodo.7803623} 207 | } 208 | ``` 209 | -------------------------------------------------------------------------------- /BIAS/SB_Toolbox.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from io import BytesIO 4 | from zipfile import ZipFile 5 | 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | import pandas as pd 9 | import requests 10 | import rpy2.robjects as robjects 11 | import seaborn as sbs 12 | import shap 13 | import tensorflow as tf 14 | from rpy2.robjects.packages import importr 15 | from scipy.stats import percentileofscore 16 | from statsmodels.stats.multitest import multipletests 17 | import autokeras as ak 18 | 19 | from .SB_Test_runner import get_scens_per_dim, get_simulated_data, get_test_dict 20 | 21 | 22 | def f0(x): 23 | """f0 random function, to be used a objective function to test optimization algorithms. 24 | 25 | Args: 26 | x (list): input for the objective function, ignored since the function is random. 27 | 28 | Returns: 29 | float: A uniform random number 30 | """ 31 | return np.random.uniform() 32 | 33 | 34 | def getXAIBackground(n_samples=30, rep=20): 35 | """Get background training samples to approximate Shapley values for the deeplearning approach. 36 | 37 | Args: 38 | n_samples (int, optional): number of samples, should be in [30,50,100,600]. Defaults to 30. 39 | rep (int, optional): number of repetitions per scenario. Defaults to 20. 40 | """ 41 | scenes = get_scens_per_dim() 42 | X = [] 43 | for scene in scenes: 44 | label = scene[0] 45 | kwargs = scene[1] 46 | data = get_simulated_data(label, rep=rep, n_samples=n_samples, kwargs=kwargs) 47 | for r in range(rep): 48 | X.append(np.sort(data[:, r])) 49 | X = np.expand_dims(X, axis=2) 50 | return np.array(X) 51 | 52 | 53 | class BIAS: 54 | def __init__(self): 55 | """BIAS toolbox for predicting bias in black box optimization algorithms. 56 | Predicts both the presence of bias and the bias type. Use f0 as objective function for at least 30 independent optimization runs. 57 | 58 | Args: 59 | install_r (bool): if set to True, try to install the required R packages automatically. 60 | """ 61 | self.p_value_columns = [ 62 | "1-spacing", 63 | "2-spacing", 64 | "3-spacing", 65 | "ad", 66 | "ad_transform", 67 | "shapiro", 68 | "jb", 69 | "ddst", 70 | ] 71 | self.pwr = importr("PoweR") 72 | self.deepmodel = None 73 | 74 | def _load_ref_vals(self, n_samples, alpha=0.01, across=False): 75 | """Helper function to load the reference values needed for calculating the p-values. 76 | 77 | Args: 78 | n_samples (int): the sample size used for the statistical tests. Can only be 79 | in [30,50,100,600] 80 | alpha (float, optional): Can only be in [0.01, 0.05]. Defaults to 0.01. 81 | across (bool, optional): Whether we use across dimension reference vals or not. Defaults to False. 
82 | 83 | Returns: 84 | list, list: two lists of reference values loaded from files. 85 | """ 86 | dirname = os.path.dirname(__file__) 87 | # download reference values if needed from figshare 88 | if not os.path.isfile( 89 | f"{dirname}/data/Crit_vals_across/S{n_samples}_A{alpha}_with_refs.pkl" 90 | ): 91 | print( 92 | "Downloading reference values for statistical tests, this takes a while.." 93 | ) 94 | r = requests.get("https://figshare.com/ndownloader/files/30591411") 95 | zipfile = ZipFile(BytesIO(r.content)) 96 | zipfile.extractall(f"{dirname}/data/") 97 | if across: 98 | with open( 99 | f"{dirname}/data/Crit_vals_across/S{n_samples}_A{alpha}_with_refs.pkl", 100 | "rb", 101 | ) as f: 102 | ref_vals, _ = pickle.load(f) 103 | with open( 104 | f"{dirname}/data/Crit_vals_pwr_across/S{n_samples}_A{alpha}_with_refs.pkl", 105 | "rb", 106 | ) as f: 107 | ref_vals_new, _ = pickle.load(f) 108 | else: 109 | with open( 110 | f"{dirname}/data/Crit_vals/S{n_samples}_A{alpha}_with_refs.pkl", "rb" 111 | ) as f: 112 | _, ref_vals = pickle.load(f) 113 | with open( 114 | f"{dirname}/data/Crit_vals_pwr/S{n_samples}_A{alpha}_with_refs.pkl", 115 | "rb", 116 | ) as f: 117 | _, ref_vals_new = pickle.load(f) 118 | return ref_vals, ref_vals_new 119 | 120 | def _get_test_types(self): 121 | """Helper function for the poweR-based tests. 122 | 123 | Returns: 124 | dict: Dict of test functions from R. 125 | """ 126 | testnames = [ 127 | "kolmogorov", 128 | "CvM", 129 | "AD_pwr", 130 | "Durbin", 131 | "Kuiper", 132 | "HG1", 133 | "HG2", 134 | "Greenwood", 135 | "QM", 136 | "RC", 137 | "Moran", 138 | "Cressie1", 139 | "Cressie2", 140 | "Vasicek", 141 | "Swartz", 142 | "Morales", 143 | "Pardo", 144 | "Marhuenda", 145 | "Zhang1", 146 | "Zhang2", 147 | ] 148 | test_types_new = [ 149 | self.pwr.create_alter(robjects.FloatVector(np.arange(63, 83)))[i][0] 150 | for i in range(20) 151 | ] 152 | return {k: v for k, v in zip(testnames, test_types_new)} 153 | 154 | def transform_to_reject_dt_corr( 155 | self, dt, alpha, n_samples, correction_method="fdr_bh" 156 | ): 157 | """Apply p-value corrections on the dataframe of test statistics. 158 | 159 | Args: 160 | dt (dataframe): The DataFrame containing the calculated test statistics for each dimension. 161 | alpha (float): The threshold for statistical significance. 162 | n_samples (int): The sample size used for the statistical tests. Can only be 163 | in [30,50,100,600] 164 | correction_method (str, optional): Which type of p-value correction to apply. Recommended is 'fdr_bh', 165 | but 'fdr_by' and 'holm' are also supported.. Defaults to 'fdr_bh'. 166 | 167 | Returns: 168 | dataframe: Corrected test statistics. 
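Note: in practice each cell of the returned DataFrame is a boolean rejection flag (one row per dimension, one column per test), obtained by applying `statsmodels.stats.multitest.multipletests` to the per-test p-values, rather than a raw test statistic.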
169 | """ 170 | reference_vals, ref_vals_new = self._load_ref_vals(n_samples) 171 | test_types_new = self._get_test_types() 172 | 173 | dt_rejections = pd.DataFrame() 174 | dt_p_vals_temp = pd.DataFrame() 175 | for colname in self.p_value_columns: 176 | dt_p_vals_temp[colname] = dt[colname] 177 | for k, v in reference_vals.items(): 178 | if "kurt" in k: 179 | temp = [ 180 | percentileofscore(score=x, a=v, kind="mean") / 100 for x in dt[k] 181 | ] 182 | temp = [min(x, 1 - x) for x in temp] # two-sided comparison 183 | elif k in ["min", "wasserstein", "mdd_max", "mdd_min"]: 184 | temp = [ 185 | 1 - percentileofscore(score=x, a=v, kind="mean") / 100 186 | for x in dt[k] 187 | ] 188 | else: 189 | temp = [ 190 | percentileofscore(score=x, a=v, kind="mean") / 100 for x in dt[k] 191 | ] 192 | dt_p_vals_temp[k] = temp 193 | for k, v in ref_vals_new.items(): 194 | if test_types_new[k] == 4: 195 | temp = [ 196 | percentileofscore(score=x, a=v, kind="mean") / 100 for x in dt[k] 197 | ] 198 | else: 199 | temp = [ 200 | 1 - percentileofscore(score=x, a=v, kind="mean") / 100 201 | for x in dt[k] 202 | ] 203 | dt_p_vals_temp[k] = temp 204 | res = np.array( 205 | [ 206 | multipletests(x, alpha=alpha, method=correction_method)[0] 207 | for x in np.array(dt_p_vals_temp) 208 | ] 209 | ).reshape(dt_p_vals_temp.shape) 210 | return pd.DataFrame(res, columns=dt_p_vals_temp.columns) 211 | 212 | def _get_test_names_dict(self): 213 | """Helper function to ensure consistent naming for the used statistical tests 214 | by creating a dictionary 215 | 216 | Returns: 217 | dict: Dict of all test functions. 218 | """ 219 | test_dict_per = get_test_dict(n_samples=100, per_dim=True) 220 | test_names = list(test_dict_per.keys()) 221 | test_names.remove("AD_pwr") 222 | test_names_paper = [ 223 | "1-spacing", 224 | "2-spacing", 225 | "3-spacing", 226 | "range", 227 | "min", 228 | "max", 229 | "AD", 230 | "tAD", 231 | "Shapiro", 232 | "JB", 233 | "LD-min", 234 | "LD-max", 235 | "Kurt", 236 | "MPD-max", 237 | "MPD-min", 238 | "Wasserstein", 239 | "NS", 240 | "KS", 241 | "CvM", 242 | "Durbin", 243 | "Kuiper", 244 | "HG1", 245 | "HG2", 246 | "Greenwood", 247 | "QM", 248 | "RC", 249 | "Moran", 250 | "Cressie1", 251 | "Cressie2", 252 | "Vasicek", 253 | "Swartz", 254 | "Morales", 255 | "Pardo", 256 | "Marhuenda", 257 | "Zhang1", 258 | "Zhang2", 259 | ] 260 | 261 | test_label_dict = {k: v for k, v in zip(test_names, test_names_paper)} 262 | return test_label_dict 263 | 264 | def plot_swarm_with_heatmap(self, data, rejections, filename=None): 265 | """Plotting function to create the swarmplot and rejection heatmap. 266 | 267 | Args: 268 | data (dataframe): The DataFrame containing the final position values. 269 | rejections (dataframe): The DataFrame containing the corresponding test rejections. 270 | filename (string, optional): If not none, the name of the file to store the figure. Defaults to None. 
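Note: this helper is normally invoked from `predict` when `show_figure=True`, with `rejections` being the corrected rejection DataFrame produced by `transform_to_reject_dt_corr`.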
271 | """ 272 | test_label_dict = self._get_test_names_dict() 273 | data_dt = pd.DataFrame(data) 274 | fig, axs = plt.subplots(2, figsize=(19, 14), sharex=True) 275 | ax1 = axs[0] 276 | dt_molt = data_dt.melt() 277 | dt_molt["variable"] = dt_molt["variable"] + 1.5 278 | sbs.swarmplot(data=dt_molt, x="variable", y="value", ax=ax1) 279 | ax1.set_xlim(-0.5, self.DIM - 0.5) 280 | for dim in range(self.DIM): 281 | c0 = ax1.get_children()[dim] 282 | c0.set_offsets([[x + 0.5, y] for x, y in c0.get_offsets()]) 283 | ax1.axvline(dim, color="k", lw=0.6, ls=":") 284 | sbs.heatmap( 285 | np.array(rejections).transpose(), 286 | ax=axs[1], 287 | cbar=False, 288 | yticklabels=[test_label_dict[x] for x in rejections.columns], 289 | linewidths=0.01, 290 | cmap="crest_r", 291 | ) 292 | 293 | ax1.set_xlabel("") 294 | axs[1].set_xlabel("Dimension", fontsize=16) 295 | axs[1].set_xticklabels(range(1, self.DIM + 1), fontsize=14) 296 | axs[1].set_yticklabels(axs[1].get_yticklabels(), fontsize=14) 297 | ax1.set_ylabel("Value", fontsize=16) 298 | ax1.set_ylim(0, 1) 299 | ax1.set_yticklabels([0, 0.2, 0.4, 0.6, 0.8, 1], fontsize=14) 300 | plt.tight_layout() 301 | if filename is not None: 302 | plt.savefig(filename) 303 | plt.show() 304 | 305 | def predict_type(self, dt_rej, print_type=False): 306 | """Predict the type of bias using the rejection data. 307 | 308 | Args: 309 | dt_rej (dataframe): Dataframe containing rejection data. 310 | print_type (bool, optional): Whether to output the type to the standard output or not. Defaults to False. 311 | 312 | Returns: 313 | dict: Dict with the predicted Class and the Class_Probabilities 314 | """ 315 | mean_rej = np.mean(np.array(dt_rej), axis=0) > 0.1 316 | if np.sum(mean_rej) == 0: 317 | if print_type: 318 | print("No clear evidence of bias detected") 319 | return "none" 320 | dirname = os.path.dirname(__file__) 321 | 322 | # download RF models if needed from 323 | if not os.path.isfile(f"{dirname}/models/RFs/rf_few_classes.pkl"): 324 | print("Downloading model files, this takes a while..") 325 | r = requests.get("https://figshare.com/ndownloader/files/43106839") 326 | zipfile = ZipFile(BytesIO(r.content)) 327 | zipfile.extractall(f"{dirname}/models/") 328 | 329 | with open(f"{dirname}/models/RFs/rf_few_classes.pkl", "rb") as input_file: 330 | rf = pickle.load(input_file) 331 | res_class = rf.predict(mean_rej.reshape(1, -1)) 332 | classes = rf.classes_ 333 | prob_classes = rf.predict_proba(mean_rej.reshape(1, -1)) 334 | 335 | with open(f"{dirname}/models/RFs/rf_scens.pkl", "rb") as input_file: 336 | rf = pickle.load(input_file) 337 | res_scen = rf.predict(mean_rej.reshape(1, -1)) 338 | scennames = rf.classes_ 339 | prob_scens = rf.predict_proba(mean_rej.reshape(1, -1)) 340 | 341 | if print_type: 342 | print( 343 | f"Detected bias which seems to be related to {res_class} ({np.max(prob_classes):.2f} probability)." 344 | + f"The rejections seems to be most similar to the {res_scen} scenario ({np.max(prob_scens):.2f} probability)." 345 | + "\nWe strongly advise you to now use the `predict_deep` function to more accurately predict the Structural Bias type." 346 | ) 347 | return { 348 | "Class": res_class[0], 349 | "Class Probabilities": prob_classes, 350 | "Scenario": res_scen[0], 351 | "Scenario Probabilities": prob_scens, 352 | } 353 | 354 | def explain(self, data, preds, filename=None, verbose=False): 355 | """Explain the predictions of the deeplearning model. 356 | You need to call predict_deep first. 
357 | 358 | Args: 359 | data (dataframe): The matrix containing the final position values on F0. Note that these should be scaled 360 | in [0,1], and in the shape (n_samples, dimension), where n_samples is in [30, 50, 100, 600] 361 | preds (array): Predictions of bias type for each dimension. 362 | filename (string): Where to save the figure, if None it will call plt.show() instead. 363 | verbose (bool): Print additional output. 364 | """ 365 | # calculate the shapley values per dim 366 | 367 | fig, axes = plt.subplots( 368 | nrows=data.shape[1], 369 | ncols=2, 370 | figsize=(12, data.shape[1] * 2), 371 | gridspec_kw={"width_ratios": [1, 3]}, 372 | ) 373 | for d in range(data.shape[1]): 374 | x = [np.sort(data[:, d])] 375 | x = np.expand_dims(x, axis=2) 376 | shap_val = self.explainer.shap_values(x) 377 | if verbose: 378 | print(preds[d]) 379 | y = np.argmax(preds[d], axis=1) # prediction of the dimension 380 | shap_vals_pred = shap_val[y[0]][0] 381 | 382 | cmap = sbs.color_palette("coolwarm", as_cmap=True) 383 | norm = plt.Normalize( 384 | vmin=-1 * np.max(np.abs(shap_vals_pred)), 385 | vmax=np.max(np.abs(shap_vals_pred)), 386 | ) # 0 and 1 are the defaults, but you can adapt these to fit other uses 387 | df = pd.DataFrame( 388 | {"x": np.sort(data[:, d]).flatten(), "shap": shap_vals_pred.flatten()} 389 | ) 390 | palette = {h: cmap(norm(h)) for h in df["shap"]} 391 | axes[d, 0].bar(self.targetnames, preds[d][0]) 392 | axes[d, 0].tick_params(axis="x", labelrotation=30) 393 | axes[d, 0].set_title("Prediction probabilities") 394 | axes[d, 0].set_ylim([0, 1]) 395 | 396 | axes[d, 1].set_title(f"Predicted: {self.targetnames[y]}") 397 | sbs.swarmplot( 398 | data=df, 399 | x="x", 400 | hue="shap", 401 | palette=palette, 402 | ax=axes[d, 1], 403 | size=4, 404 | legend=False, 405 | ) 406 | axes[d, 1].set_xlabel("") 407 | axes[d, 1].set_xlim([0, 1]) 408 | 409 | # sbs.move_legend(ax, "upper left", bbox_to_anchor=(1, 1)) 410 | plt.tight_layout() 411 | if filename == None: 412 | plt.show() 413 | else: 414 | plt.savefig(filename) 415 | plt.close() 416 | 417 | def predict_deep(self, data, include_proba=True): 418 | """Predict the BIAS using our neural network. 419 | 420 | Args: 421 | data (dataframe): The matrix containing the final position values on F0. Note that these should be scaled 422 | in [0,1], and in the shape (n_samples, dimension), where n_samples is in [30, 50, 100, 600] 423 | include_proba (boolean, optional): To include the probabilities of each class or only the final label. 424 | 425 | Raises: 426 | ValueError: Unsupported sample size. 
427 | 428 | Returns: 429 | predicted bias type (string), optional probabilities (array) 430 | """ 431 | # load model 432 | n_samples = data.shape[0] 433 | if not n_samples in [30, 50, 100, 600]: 434 | raise ValueError("Sample size is not supported") 435 | if self.deepmodel == None: 436 | dirname = os.path.dirname(__file__) 437 | # download RF models if needed from 438 | self.deepmodel = tf.keras.models.load_model( 439 | f"{dirname}/models/opt_cnn_model-{n_samples}.keras" 440 | ) 441 | self.targetnames = np.load( 442 | f"{dirname}/models/targetnames.npy", allow_pickle=True 443 | ) 444 | # loading explainable background samples and loading the explainer 445 | self.xai_background = getXAIBackground(data.shape[0]) 446 | self.explainer = shap.DeepExplainer(self.deepmodel, self.xai_background) 447 | preds = [] 448 | for d in range(data.shape[1]): 449 | # perform per dimension test 450 | x = np.sort(data[:, d]) 451 | x = np.expand_dims([x], axis=2) 452 | preds.append(self.deepmodel.predict(x, verbose=0)) 453 | 454 | decisions = np.argmax(np.array(preds).reshape(-1, 5), axis=1) > 0 455 | 456 | if np.mean(decisions) <= 0.1: 457 | y = "unif" 458 | else: 459 | pred_mean = np.mean(np.array(preds), axis=0) 460 | y = self.targetnames[np.argmax(pred_mean.flatten()[1:]) + 1] 461 | 462 | if include_proba: 463 | return y, preds 464 | return y 465 | 466 | def predict( 467 | self, 468 | data, 469 | corr_method="fdr_bh", 470 | alpha=0.01, 471 | show_figure=False, 472 | filename=None, 473 | print_type=True, 474 | ): 475 | """The main function used to detect Structural Bias. 476 | 477 | Args: 478 | data (dataframe): The matrix containing the final position values on F0. Note that these should be scaled 479 | in [0,1], and in the shape (n_samples, dimension), where n_samples is in [30, 50, 100, 600] 480 | corr_method (str, optional): Which type of p-value correction to apply. Recommended is 'fdr_bh', 481 | but 'fdr_by' and 'holm' are also supported.. Defaults to 'fdr_bh'. 482 | alpha (float, optional): The threshold for statistical significance. Defaults to 0.01. 483 | show_figure (bool, optional): Whether or not to create a plot of the final positions and the corresponding test rejections. Defaults to False. 484 | filename (string, optional): If not none, the name of the file to store the figure (only when show_figure is True). Defaults to None. 485 | print_type (bool, optional): Wheter or not to print the predicted type of SB. Defaults to True. 486 | 487 | Raises: 488 | ValueError: Unsupported sample size. 489 | 490 | Returns: 491 | dataframe, dict: rejection data, predicted Bias and type. 
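Example (illustrative sketch; `samples` below is a stand-in uniform array with a supported shape, not real optimizer output — in real use it would hold the scaled final positions of 100 independent runs of a 5-dimensional algorithm on f0):

    import numpy as np
    from BIAS import BIAS

    samples = np.random.uniform(size=(100, 5))  # stand-in data, already in [0, 1]
    rejections, prediction = BIAS().predict(samples, show_figure=False)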
492 | """ 493 | self.DIM = data.shape[1] 494 | n_samples = data.shape[0] 495 | if not n_samples in [30, 50, 100, 600]: 496 | raise ValueError("Sample size is not supported") 497 | if print_type: 498 | print( 499 | f"Running SB calculation with {self.DIM}-dimensional data of sample size {n_samples} (alpha = {alpha})" 500 | ) 501 | records = {} 502 | test_battery_per_dim = get_test_dict(n_samples) 503 | for tname, tfunc in test_battery_per_dim.items(): 504 | temp = [] 505 | for r in range(self.DIM): 506 | try: 507 | temp.append(tfunc(data[:, r], alpha=alpha)) 508 | except: 509 | next 510 | records[tname] = temp 511 | dt = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in records.items()])) 512 | dt_rejections = self.transform_to_reject_dt_corr( 513 | dt, alpha, n_samples, corr_method 514 | ) 515 | # Drop duplicate test 516 | dt_rejections = dt_rejections.drop("AD_pwr", axis=1) 517 | 518 | if show_figure: 519 | self.plot_swarm_with_heatmap(data, dt_rejections, filename) 520 | 521 | return dt_rejections, self.predict_type(dt_rejections, print_type) 522 | 523 | def transform_to_reject_dt_across(self, dt, alpha, n_samples): 524 | """Transform rejection data for across dimension tests. 525 | 526 | Args: 527 | dt (dataframe): Rejection dataframe. 528 | alpha (float): Signficance level. 529 | n_samples (int): Number of samples. 530 | 531 | Returns: 532 | dataframe: Transformed rejection data. 533 | """ 534 | crit_vals, crit_vals_new = self._load_ref_vals(n_samples, alpha, True) 535 | test_types_new = self._get_test_types() 536 | 537 | dt_rejections = pd.DataFrame() 538 | for colname in self.p_value_columns: 539 | dt_rejections[colname] = dt[colname] < alpha 540 | 541 | # Ugly solution to distinguish two-sided vs one-sided tests 542 | dt_rejections["kurtosis"] = (crit_vals["kurtosis_low"] > dt["kurtosis"]) | ( 543 | dt["kurtosis"] > crit_vals["kurtosis_high"] 544 | ) 545 | dt_rejections["mmpd"] = (crit_vals["mmpd_low"] > dt["mmpd"]) | ( 546 | dt["mmpd"] > crit_vals["mmpd_high"] 547 | ) 548 | dt_rejections["mi"] = (crit_vals["mi_low"] > dt["mi"]) | ( 549 | dt["mi"] > crit_vals["mi_high"] 550 | ) 551 | dt_rejections["med_ddlud"] = (crit_vals["med_ddlud_low"] > dt["med_ddlud"]) | ( 552 | dt["med_ddlud"] > crit_vals["med_ddlud_high"] 553 | ) 554 | for k, v in crit_vals.items(): 555 | if "kurt" in k or "low" in k or "high" in k: 556 | next 557 | else: 558 | if k in ["max_ddlud"]: 559 | dt_rejections[k] = dt[k] > v 560 | else: 561 | dt_rejections[k] = dt[k] < v 562 | 563 | for k, v in crit_vals_new.items(): 564 | if test_types_new[k] == 4: 565 | dt_rejections[k] = dt[k] < v 566 | else: 567 | dt_rejections[k] = dt[k] > v 568 | return dt_rejections 569 | 570 | def predict_multi_dim(self, data, alpha=0.01, print_type=True): 571 | """Predict Bias using across dimension tests. 572 | 573 | Args: 574 | data (dataframe): dataframe containing end positions. 575 | alpha (float, optional): Signficance level. Defaults to 0.01. 576 | print_type (bool, optional): Whether to output the type or not. Defaults to True. 577 | 578 | Raises: 579 | ValueError: unsupported sample size or dimension. 580 | 581 | Returns: 582 | list: List of failed tests that show potential bias. 
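Example (illustrative sketch; `samples` is a stand-in uniform array — across-dimension testing expects exactly 30 dimensions and a sample size in [30, 50, 100, 600]):

    import numpy as np
    from BIAS import BIAS

    samples = np.random.uniform(size=(100, 30))  # stand-in data, already in [0, 1]
    failed_tests = BIAS().predict_multi_dim(samples, alpha=0.01)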
583 | """ 584 | DIM = data.shape[1] 585 | n_samples = data.shape[0] 586 | if not n_samples in [30, 50, 100, 600]: 587 | raise ValueError("Sample size is not supported") 588 | if DIM != 30: 589 | raise ValueError( 590 | "Only 30-dimensional data is supported for across-dimension testing" 591 | ) 592 | if print_type: 593 | print( 594 | f"Running SB calculation with {DIM}-dimensional data of sample size {n_samples} (alpha = {alpha})" 595 | ) 596 | records = {} 597 | test_battery_across_dim = get_test_dict(n_samples, per_dim=False) 598 | for tname, tfunc in test_battery_across_dim.items(): 599 | try: 600 | records[tname] = tfunc(data) 601 | except: 602 | next 603 | # TODO: fix this function 604 | dt = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in records.items()])) 605 | dt_rejections = self.transform_to_reject_dt_across(dt, alpha, n_samples) 606 | failed_tests = [ 607 | x for x in dt_rejections.columns if np.sum(dt_rejections[x]) > 0 608 | ] 609 | if print_type: 610 | if len(failed_tests == 0): 611 | print("No clear evidence of bias detected") 612 | else: 613 | print( 614 | f"The following tests detected potential structural bias: {failed_tests}" 615 | ) 616 | return failed_tests 617 | -------------------------------------------------------------------------------- /BIAS/SB_Test_runner.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from functools import partial 3 | from multiprocessing import Pool, cpu_count 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import scipy.stats as ss 8 | from sklearn.feature_selection import mutual_info_regression 9 | from sklearn.metrics import ( 10 | adjusted_mutual_info_score, 11 | pairwise_distances, 12 | pairwise_distances_argmin_min, 13 | ) 14 | 15 | import rpy2.robjects as robjects 16 | from rpy2.robjects.packages import importr 17 | from .uniform_test import ddst_uniform_test 18 | 19 | importr("data.table") 20 | importr("goftest") 21 | pwr = importr("PoweR") 22 | 23 | robjects.r( 24 | """ 25 | R_test_ad <- function(x, max_=1) { 26 | return(ad.test(x, "punif", max=max_, min=0)[[2]]) 27 | } 28 | 29 | R_test_norm <- function(x, test='Shapiro') { 30 | qnorm_temp <- qnorm(x) 31 | qnorm_temp[is.infinite(qnorm_temp)] <- 4*sign(qnorm_temp[is.infinite(qnorm_temp)]) 32 | if (test == 'Shapiro') { 33 | return(shapiro.test(qnorm_temp)[[2]]) 34 | } else { 35 | return(AutoSEARCH::jb.test(qnorm_temp)$p.value) 36 | } 37 | } 38 | """ 39 | ) 40 | 41 | 42 | def get_mi(X, type_="med"): 43 | mutuals = [] 44 | for i in range(X.shape[1]): 45 | for j in range(i, X.shape[1]): 46 | if i != j: 47 | mutuals.append( 48 | mutual_info_regression(X[:, i].reshape(-1, 1), X[:, j])[0] 49 | ) 50 | if type_ == "med": 51 | return np.median(mutuals) 52 | return np.max(mutuals) 53 | 54 | 55 | def get_mmpd(X): 56 | pairwisedistances = [] 57 | # print(len(X)) 58 | for i in range(len(X)): 59 | one = X[i] 60 | rest = np.append(X[:i], X[i + 1 :], axis=0) 61 | res, res_dist = pairwise_distances_argmin_min(one.reshape(1, -1), rest) 62 | pairwisedistances.append(res_dist[0]) 63 | return np.max(pairwisedistances) 64 | 65 | 66 | def get_mddlud(X, type_="med"): 67 | baseline_space = np.linspace(0, 1, len(X)) 68 | lindist = [] 69 | for i in range(X.shape[1]): 70 | lindist.append(np.max(np.abs(np.sort(X[:, i]) - baseline_space))) 71 | if type_ == "med": 72 | return np.median(lindist) 73 | return max(lindist) 74 | 75 | 76 | def is_valid(x, centers, gap_size): 77 | for c in centers: 78 | if np.abs(x - c) < gap_size: 79 | return False 80 | 
return True 81 | 82 | 83 | def get_simulated_data(scen, rep=1000, n_samples=100, kwargs={}): 84 | if scen == "unif": 85 | data_arr = np.random.uniform(size=(n_samples, rep)) 86 | elif scen == "trunc_unif": 87 | min_ = kwargs["min"] 88 | max_ = kwargs["max"] 89 | data_arr = np.random.uniform(size=(n_samples, rep), low=min_, high=max_) 90 | elif scen == "spikes": 91 | data_arr = ( 92 | np.random.randint(kwargs["max"] + 1, size=(n_samples, rep)) / kwargs["max"] 93 | ) 94 | elif scen == "shifted_spikes": 95 | possible_vals = [i / kwargs["max"] for i in range(kwargs["max"] + 1)] 96 | translations = np.random.normal(0, kwargs["sigma"], size=(kwargs["max"] + 1)) 97 | possible_vals = [ 98 | i + j if 0 <= i + j <= 1 else i for i, j in zip(possible_vals, translations) 99 | ] 100 | data_arr = np.random.choice(possible_vals, size=(n_samples, rep)) 101 | elif scen == "norm": 102 | nr_req = n_samples * rep 103 | data_temp = np.random.normal(kwargs["mu"], kwargs["sigma"], size=(nr_req * 10)) 104 | data_arr = [x for x in data_temp if 0 < x < 1][:nr_req] 105 | data_arr = np.array(data_arr).reshape((n_samples, rep)) 106 | elif scen == "cauchy": 107 | nr_req = n_samples * rep 108 | data_temp = ss.cauchy.rvs(kwargs["mu"], kwargs["sigma"], size=(nr_req * 10)) 109 | data_arr = [x for x in data_temp if 0 < x < 1][:nr_req] 110 | data_arr = np.array(data_arr).reshape((n_samples, rep)) 111 | elif scen == "inv_cauchy": 112 | nr_req = n_samples * rep 113 | data_temp = ss.cauchy.rvs(kwargs["mu"], kwargs["sigma"], size=(nr_req * 10)) 114 | data_arr = [x for x in data_temp if 0 < x < 1][:nr_req] 115 | data_arr = [ 116 | 1 + kwargs["mu"] - x if x > kwargs["mu"] else kwargs["mu"] - x 117 | for x in data_arr 118 | ] # Not efficient, but works. Maybe improve later 119 | data_arr = np.array(data_arr).reshape((n_samples, rep)) 120 | elif scen == "inv_norm": 121 | nr_req = n_samples * rep 122 | data_temp = np.random.normal(kwargs["mu"], kwargs["sigma"], size=(nr_req * 10)) 123 | data_arr = [x for x in data_temp if 0 < x < 1][:nr_req] 124 | data_arr = [ 125 | 1 + kwargs["mu"] - x if x > kwargs["mu"] else kwargs["mu"] - x 126 | for x in data_arr 127 | ] # Not efficient, but works. 
Maybe improve later 128 | data_arr = np.array(data_arr).reshape((n_samples, rep)) 129 | elif scen == "gaps": 130 | temp = [] 131 | for _ in range(rep): 132 | data_temp = np.random.uniform(size=(n_samples * 10)) 133 | centers = np.random.uniform(size=kwargs["n_centers"]) 134 | temp.append( 135 | [x for x in data_temp if is_valid(x, centers, kwargs["sigma"])][ 136 | :n_samples 137 | ] 138 | ) 139 | data_arr = np.array(temp).transpose() 140 | elif scen == "consistent_gaps": 141 | temp = [] 142 | centers = np.random.uniform(size=kwargs["n_centers"]) 143 | for _ in range(rep): 144 | data_temp = np.random.uniform(size=(n_samples * 10)) 145 | temp.append( 146 | [x for x in data_temp if is_valid(x, centers, kwargs["sigma"])][ 147 | :n_samples 148 | ] 149 | ) 150 | data_arr = np.array(temp).transpose() 151 | elif scen == "clusters": 152 | temp = [] 153 | for _ in range(rep): 154 | centers = np.random.uniform(size=kwargs["n_centers"]) 155 | samples = [ 156 | np.random.normal(loc=x, scale=kwargs["sigma"]) 157 | for x in np.random.choice(centers, size=n_samples * 10) 158 | ] 159 | temp.append([x for x in samples if 0 < x < 1][:n_samples]) 160 | data_arr = np.array(temp).transpose() 161 | elif scen == "consistent_clusters": 162 | temp = [] 163 | centers = np.random.uniform(size=kwargs["n_centers"]) 164 | for _ in range(rep): 165 | samples = [ 166 | np.random.normal(loc=x, scale=kwargs["sigma"]) 167 | for x in np.random.choice(centers, size=n_samples * 10) 168 | ] 169 | temp.append([x for x in samples if 0 < x < 1][:n_samples]) 170 | data_arr = np.array(temp).transpose() 171 | elif scen == "part_unif": 172 | temp = [] 173 | for _ in range(rep): 174 | n_unif = int(np.ceil(kwargs["frac_unif"] * n_samples)) 175 | data_temp = np.random.uniform(size=(n_unif)) 176 | new_points = [ 177 | np.random.normal(loc=x, scale=kwargs["sigma"]) 178 | for x in np.random.choice(data_temp, size=n_samples * 10) 179 | ] 180 | data_new = [x for x in new_points if 0 < x < 1][:n_samples] 181 | # deviations = np.random.normal(size = len(data_temp), scale=kwargs['sigma']) 182 | # new_points = [x + y for x,y in zip(data_temp, deviations) if 0 < x+y < 1] 183 | data_new = np.append(data_temp[:n_unif], data_new[: (n_samples - n_unif)]) 184 | temp.append(data_new) 185 | data_arr = np.array(temp).transpose() 186 | elif scen == "bound_thing": 187 | temp = [] 188 | for _ in range(rep): 189 | n_01 = int(np.ceil((1 - kwargs["frac_between"]) * n_samples)) 190 | data_temp = np.random.uniform(size=(n_samples)) 191 | data_temp[ 192 | np.random.choice(range(n_samples), n_01, replace=False) 193 | ] = np.random.choice( 194 | [0, 1], size=n_01, p=[kwargs["frac_0"], 1 - kwargs["frac_0"]] 195 | ) 196 | # for idx in np.random.randint(0, n_samples, n_01): 197 | # data_temp[idx] = np.random.csv" 198 | temp.append(np.array(data_temp)) 199 | data_arr = np.array(temp).transpose() 200 | return data_arr 201 | 202 | 203 | scenario_dict = { 204 | "unif": None, 205 | "trunc_unif": ["min", "max"], 206 | "spikes": ["max"], 207 | "shifted_spikes": ["max", "sigma"], 208 | "norm": ["sigma", "mu"], 209 | "inv_norm": ["sigma", "mu"], 210 | "cauchy": ["sigma", "mu"], 211 | "inv_cauchy": ["sigma", "mu"], 212 | "gaps": ["n_centers", "sigma"], 213 | "clusters": ["n_centers", "sigma"], 214 | "part_unif": ["frac_unif", "sigma"], 215 | } 216 | 217 | scenario_dict_across = { 218 | "unif": None, 219 | "trunc_unif": ["min", "max"], 220 | "spikes": ["max"], 221 | "shifted_spikes": ["max", "sigma"], 222 | "norm": ["sigma", "mu"], 223 | "inv_norm": ["sigma", "mu"], 224 | 
"cauchy": ["sigma", "mu"], 225 | "inv_cauchy": ["sigma", "mu"], 226 | "gaps": ["n_centers", "sigma"], 227 | "consistent_gaps": ["n_centers", "sigma"], 228 | "clusters": ["n_centers", "sigma"], 229 | "consistent_clusters": ["n_centers", "sigma"], 230 | "part_unif": ["frac_unif", "sigma"], 231 | } 232 | 233 | # Note: this file is set up terribly (since it is derived from my notebook-code). TODO: Figure out a better way to structure this!!! 234 | 235 | 236 | def get_test_dict(n_samples, per_dim=True): 237 | ### Start by setting up the reference values which need to be gotten from simulations ### 238 | 239 | # spacing-values 240 | dist_vals_rand = np.array( 241 | [ 242 | np.diff(np.sort(np.append(np.random.uniform(size=n_samples), [0, 1]))) 243 | for _ in range(1000) 244 | ] 245 | ).reshape(-1) 246 | dist_vals_rand2 = [] 247 | for rep in range(1000): 248 | x = np.sort(np.append(np.random.uniform(size=n_samples), [0, 1])) 249 | dist_vals_rand2.append(x[2:] - x[:-2]) 250 | dist_vals_rand2 = np.array(dist_vals_rand2).reshape(-1) 251 | 252 | dist_vals_rand3 = [] 253 | for rep in range(1000): 254 | x = np.sort(np.append(np.random.uniform(size=n_samples), [0, 1])) 255 | dist_vals_rand3.append(x[3:] - x[:-3]) 256 | dist_vals_rand3 = np.array(dist_vals_rand3).reshape(-1) 257 | 258 | # #Range values 259 | # dists = [np.max(x) - np.min(x) for x in np.random.uniform(size=(10000,n_samples))] 260 | # mins = [np.min(x) for x in np.random.uniform(size=(10000,n_samples))] 261 | # maxs = [np.max(x) for x in np.random.uniform(size=(10000,n_samples))] 262 | 263 | # #linspace baseline 264 | comp_to = np.linspace(0, 1, num=n_samples) 265 | # wassersteins = [np.sum(np.abs(np.sort(x) - comp_to)) for x in np.random.uniform(size=(10000,n_samples))] 266 | # lindist_min = [np.min(np.abs(np.sort(x) - comp_to)) for x in np.random.uniform(size=(10000,n_samples))] 267 | # lindist_max = [np.max(np.abs(np.sort(x) - comp_to)) for x in np.random.uniform(size=(10000,n_samples))] 268 | 269 | # #max pairwise distances 270 | # max_pair_dists = [np.max(np.diff(np.sort(np.random.uniform(size=(n_samples))))) for x in range(10000)] 271 | 272 | ### Define the tests. For now, this is needed here, since it relies on the simulated reference values :( 273 | def test_spacing(x, m=1, alpha=0.01): 274 | x = np.sort(np.append(x, [0, 1])) 275 | if m == 1: 276 | p = ss.ks_2samp(np.diff(x), dist_vals_rand)[1] 277 | elif m == 2: 278 | p = ss.ks_2samp(x[2:] - x[:-2], dist_vals_rand2)[1] 279 | else: 280 | p = ss.ks_2samp(x[3:] - x[:-3], dist_vals_rand3)[1] 281 | return p # < alpha 282 | 283 | def test_range(x, alpha=0.01): 284 | return np.max(x) - np.min(x) # <= np.quantile(dists, alpha) 285 | 286 | def test_edges(x, type_="min", alpha=0.01): 287 | if type_ == "min": 288 | return np.min(x) # >= np.quantile(mins, 1-alpha) 289 | else: 290 | return np.max(x) # <= np.quantile(maxs, alpha) 291 | 292 | def test_ad(x, transform=False, alpha=0.01): 293 | if transform: 294 | x = np.abs(x - 0.5) 295 | return robjects.globalenv["R_test_ad"](robjects.FloatVector(x), 0.5)[ 296 | 0 297 | ] # < alpha 298 | return robjects.globalenv["R_test_ad"](robjects.FloatVector(x))[0] # < alpha 299 | 300 | def test_normal_transformed(x, test="Shapiro", alpha=0.01): 301 | return robjects.globalenv["R_test_norm"](robjects.FloatVector(x), test)[ 302 | 0 303 | ] # < alpha 304 | 305 | # TODO: fix the naming scheme (this is mddlud!) 
306 | def test_lindist_dim(x, type_="min", alpha=0.01): 307 | if type_ == "max": 308 | return np.max( 309 | np.abs(np.sort(x) - comp_to) 310 | ) # >= np.quantile(lindist_max, 1-alpha) 311 | else: 312 | return np.min( 313 | np.abs(np.sort(x) - comp_to) 314 | ) # <= np.quantile(lindist_min, alpha) 315 | 316 | def test_pairwise_dists_dim(x, type_="min", alpha=0.01): 317 | if type_ == "max": 318 | return np.max( 319 | np.diff(np.sort(x)) 320 | ) # >= np.quantile(max_pair_dists, 1-alpha) 321 | else: 322 | return np.max(np.diff(np.sort(x))) # <= np.quantile(max_pair_dists, alpha) 323 | 324 | def test_kurtosis(x, alpha=0.01): 325 | return ss.kurtosis(ss.norm.ppf(x)) 326 | 327 | # return not (np.quantile(kurts,alpha/2) < ss.kurtosis(ss.norm.ppf(x)) < np.quantile(kurts,1-alpha/2)) 328 | 329 | def test_wasserstein(x, alpha=0.01): 330 | # Note: not scaled for sample size (won't matter for result, but need to keep in mind that right baseline needs to be used!) 331 | return np.sum( 332 | np.abs(np.sort(x) - comp_to) 333 | ) # > np.quantile(wassersteins,1-alpha) 334 | 335 | def test_ddst(x, alpha=0.01): 336 | return ddst_uniform_test(x, nr=1000, compute_p=True)["p_value"] # < alpha 337 | 338 | def test_pwr(x, test_nr, alpha=0.01): 339 | return pwr.statcompute(test_nr, robjects.FloatVector(x))[0][0] 340 | 341 | test_battery_per_dim = { 342 | "1-spacing": test_spacing, 343 | "2-spacing": partial(test_spacing, m=2), 344 | "3-spacing": partial(test_spacing, m=3), 345 | "range": test_range, 346 | "min": test_edges, 347 | "max": partial(test_edges, type_="max"), 348 | "ad": test_ad, 349 | "ad_transform": partial(test_ad, transform=True), 350 | "shapiro": test_normal_transformed, 351 | "jb": partial(test_normal_transformed, test="jb"), 352 | "mdd_min": test_lindist_dim, 353 | "mdd_max": partial(test_lindist_dim, type_="max"), 354 | "kurtosis": test_kurtosis, 355 | "mmpd_max": test_pairwise_dists_dim, 356 | "mmpd_min": partial(test_pairwise_dists_dim, type_="max"), 357 | "wasserstein": test_wasserstein, 358 | "ddst": test_ddst, 359 | "kolmogorov": partial(test_pwr, test_nr=63), 360 | "CvM": partial(test_pwr, test_nr=64), 361 | "AD_pwr": partial(test_pwr, test_nr=65), 362 | "Durbin": partial(test_pwr, test_nr=66), 363 | "Kuiper": partial(test_pwr, test_nr=67), 364 | "HG1": partial(test_pwr, test_nr=68), 365 | "HG2": partial(test_pwr, test_nr=69), 366 | "Greenwood": partial(test_pwr, test_nr=70), 367 | "QM": partial(test_pwr, test_nr=71), 368 | "RC": partial(test_pwr, test_nr=72), 369 | "Moran": partial(test_pwr, test_nr=73), 370 | "Cressie1": partial(test_pwr, test_nr=74), 371 | "Cressie2": partial(test_pwr, test_nr=75), 372 | "Vasicek": partial(test_pwr, test_nr=76), 373 | "Swartz": partial(test_pwr, test_nr=77), 374 | "Morales": partial(test_pwr, test_nr=78), 375 | "Pardo": partial(test_pwr, test_nr=79), 376 | "Marhuenda": partial(test_pwr, test_nr=80), 377 | "Zhang1": partial(test_pwr, test_nr=81), 378 | "Zhang2": partial(test_pwr, test_nr=82), 379 | } 380 | 381 | if per_dim: 382 | return test_battery_per_dim 383 | 384 | def test_mi(X, type_="med", alpha=0.01): 385 | mi = get_mi(X, type_) 386 | if type_ == "med": 387 | return mi # > np.quantile(med_mis, 1-alpha) 388 | return mi # > np.quantile(max_mis, 1-alpha) 389 | 390 | def test_mmpd(X, alpha=0.01): 391 | mmpd = get_mmpd(X) 392 | return mmpd # > np.quantile(mmpds, 1-alpha) 393 | 394 | def test_mddlud(X, type_="med", alpha=0.01): 395 | mddlud = get_mddlud(X, type_) 396 | if type_ == "med": 397 | return mddlud # > np.quantile(med_ddluds, 1-alpha) 398 | 
return mddlud # > np.quantile(max_ddluds, 1-alpha) 399 | 400 | def test_spacing_across(X, m=1, alpha=0.01): 401 | # Not very efficient, but works for now 402 | diffs = [] 403 | for dim in range(X.shape[1]): 404 | x = np.sort(np.append(X[:, dim], [0, 1])) 405 | if m == 1: 406 | diffs.append(np.diff(x)) 407 | else: 408 | diffs.append(x[m:] - x[: (-1 * m)]) 409 | diffs = np.array(diffs).reshape(-1) 410 | if m == 1: 411 | p = ss.ks_2samp(diffs, dist_vals_rand)[1] 412 | elif m == 2: 413 | p = ss.ks_2samp(diffs, dist_vals_rand2)[1] 414 | else: 415 | p = ss.ks_2samp(diffs, dist_vals_rand3)[1] 416 | return p # < alpha 417 | 418 | test_battery_across_dim = { 419 | "mi": test_mi, 420 | # 'max_mi' : partial(test_mi, type_='max'), 421 | "mmpd": test_mmpd, 422 | "med_ddlud": test_mddlud, 423 | "max_ddlud": partial(test_mddlud, type_="max"), 424 | } 425 | 426 | def run_test_aggr(x, test, **kwargs): 427 | y = x.reshape(-1) 428 | return test(y, **kwargs) 429 | 430 | test_battery_aggr = {} 431 | for k, v in test_battery_per_dim.items(): 432 | if ( 433 | "mdd" not in k 434 | and "mmpd" not in k 435 | and "spacing" not in k 436 | and "wasser" not in k 437 | ): 438 | test_battery_aggr[k] = partial(run_test_aggr, test=v) 439 | 440 | test_battery_aggr["1-spacing"] = test_spacing_across 441 | test_battery_aggr["2-spacing"] = partial(test_spacing_across, m=2) 442 | test_battery_aggr["3-spacing"] = partial(test_spacing_across, m=3) 443 | test_battery_aggr.pop("range", None) 444 | test_battery_aggr.pop("min", None) 445 | test_battery_aggr.pop("max", None) 446 | 447 | return {**test_battery_across_dim, **test_battery_aggr} 448 | 449 | 450 | def runParallelFunction(runFunction, arguments): 451 | """ 452 | Return the output of runFunction for each set of arguments, 453 | making use of as much parallelization as possible on this system 454 | 455 | :param runFunction: The function that can be executed in parallel 456 | :param arguments: List of tuples, where each tuple are the arguments 457 | to pass to the function 458 | :return: 459 | """ 460 | 461 | arguments = list(arguments) 462 | p = Pool(min(cpu_count(), len(arguments))) 463 | # local_func = partial(func_star, func=runFunction) 464 | results = p.map(runFunction, arguments) 465 | p.close() 466 | return results 467 | 468 | 469 | def run_scenario_across( 470 | scen_list, foldername="", rep=100, dims=30, alpha=0.01, n_samples=100 471 | ): 472 | np.random.seed(42) 473 | records = {} 474 | kwargs = scen_list[1] 475 | scen = scen_list[0] 476 | test_battery_across = get_test_dict(n_samples, False) 477 | if n_samples > 150: 478 | test_battery_across.pop("jb", None) 479 | test_battery_across.pop("shapiro", None) 480 | for r in range(rep): 481 | data_arr = get_simulated_data(scen, dims, n_samples, kwargs) 482 | # print(data_arr.shape) 483 | for tname, tfunc in test_battery_across.items(): 484 | print(tname) 485 | if tname in records: 486 | records[tname].append(tfunc(data_arr, alpha=alpha)) 487 | else: 488 | records[tname] = [tfunc(data_arr, alpha=alpha)] 489 | dt = pd.DataFrame.from_dict(records) 490 | scen_name = f"{foldername}S{scen}" 491 | for k, v in kwargs.items(): 492 | dt[f"{k}_"] = v 493 | scen_name = f"{scen_name}_{k}_{v}" 494 | scen_name = f"{scen_name}.csv" 495 | dt["scen"] = scen 496 | dt["n_samples"] = n_samples 497 | dt["dims"] = dims 498 | dt.to_csv(scen_name) 499 | 500 | 501 | # return dt 502 | 503 | 504 | def run_scenario(scen_list, foldername="", rep=1500, alpha=0.01, n_samples=100): 505 | np.random.seed(42) 506 | kwargs = scen_list[1] 507 | scen = 
scen_list[0] 508 | # print(scen) 509 | data_arr = get_simulated_data(scen, rep, n_samples, kwargs) 510 | records = {} 511 | test_battery_per_dim = get_test_dict(n_samples) 512 | for tname, tfunc in test_battery_per_dim.items(): 513 | print(tname) 514 | temp = [] 515 | for r in range(rep): 516 | temp.append(tfunc(data_arr[:, r], alpha=alpha)) 517 | records[tname] = temp 518 | dt = pd.DataFrame.from_dict(records) 519 | scen_name = f"{foldername}S{scen}" 520 | for k, v in kwargs.items(): 521 | dt[f"{k}_"] = v 522 | scen_name = f"{scen_name}_{k}_{v}" 523 | scen_name = f"{scen_name}.csv" 524 | # print(scen_name) 525 | dt["scen"] = scen 526 | dt["n_samples"] = n_samples 527 | dt.to_csv(scen_name) 528 | 529 | 530 | # return dt 531 | 532 | 533 | def get_scens_per_dim(): 534 | scens = [["unif", {}]] 535 | for temp in [0.025, 0.05, 0.1, 0.2]: 536 | scens.append(["trunc_unif", {"min": temp / 2, "max": 1 - temp / 2}]) 537 | for temp in [0.025, 0.05, 0.1, 0.2]: 538 | scens.append(["trunc_unif", {"min": temp, "max": 1}]) 539 | for max_ in [25, 50, 100, 150, 200, 250]: 540 | scens.append(["spikes", {"max": max_}]) 541 | for max_ in [25, 50, 100, 150, 200, 250]: 542 | for sigma in [0.005, 0.01, 0.02, 0.03, 0.04, 0.05]: 543 | scens.append(["shifted_spikes", {"max": max_, "sigma": sigma}]) 544 | for s in ["norm", "inv_norm", "cauchy", "inv_cauchy"]: 545 | for sigma in [0.1, 0.2, 0.3, 0.4]: 546 | for mu in [0.5, 0.6, 0.7]: 547 | scens.append([s, {"sigma": sigma, "mu": mu}]) 548 | for n_centers in [1, 2, 3, 4, 5]: 549 | for gap_rad in [0.01, 0.02, 0.03, 0.04, 0.05]: 550 | scens.append(["gaps", {"n_centers": n_centers, "sigma": gap_rad}]) 551 | for n_centers in [1, 2, 3, 4, 5]: 552 | for gap_rad in [0.01, 0.025, 0.05, 0.1, 0.2, 0.3]: 553 | scens.append(["clusters", {"n_centers": n_centers, "sigma": gap_rad}]) 554 | for n_unif in [0.1, 0.25, 0.5]: 555 | for sigma in [0.01, 0.02, 0.05, 0.1]: 556 | scens.append(["part_unif", {"frac_unif": n_unif, "sigma": sigma}]) 557 | for f_0 in [0.1, 0.35, 0.45, 0.5]: 558 | for f_between in [0.5, 0.25, 0.1, 0.05, 0.025, 0.01]: 559 | scens.append(["bound_thing", {"frac_between": f_between, "frac_0": f_0}]) 560 | return scens 561 | 562 | 563 | def get_scens_across_dim(): 564 | scens = [["unif", {}]] 565 | for temp in [0.01, 0.025, 0.05, 0.1, 0.2]: 566 | scens.append(["trunc_unif", {"min": temp / 2, "max": 1 - temp / 2}]) 567 | scens.append(["trunc_unif", {"min": temp, "max": 1}]) 568 | for max_ in [25, 50, 100, 150, 200, 250, 500, 1000]: 569 | scens.append(["spikes", {"max": max_}]) 570 | for sigma in [0.005, 0.01, 0.02, 0.03, 0.04, 0.05]: 571 | scens.append(["shifted_spikes", {"max": max_, "sigma": sigma}]) 572 | for sigma in [0.1, 0.2, 0.3, 0.4, 0.5]: 573 | for mu in [0.5, 0.6, 0.7]: 574 | scens.append(["norm", {"sigma": sigma, "mu": mu}]) 575 | scens.append(["inv_norm", {"sigma": sigma, "mu": mu}]) 576 | scens.append(["cauchy", {"sigma": sigma, "mu": mu}]) 577 | scens.append(["inv_cauchy", {"sigma": sigma, "mu": mu}]) 578 | for n_centers in [1, 2, 3, 4, 5]: 579 | for gap_rad in [0.01, 0.02, 0.03, 0.04, 0.05]: 580 | scens.append( 581 | ["consistent_gaps", {"n_centers": n_centers, "sigma": gap_rad}] 582 | ) 583 | scens.append(["gaps", {"n_centers": n_centers, "sigma": gap_rad}]) 584 | for n_centers in [1, 2, 3, 4, 5]: 585 | for gap_rad in [0.01, 0.025, 0.05, 0.1, 0.2, 0.3]: 586 | scens.append( 587 | ["consistent_clusters", {"n_centers": n_centers, "sigma": gap_rad}] 588 | ) 589 | scens.append(["clusters", {"n_centers": n_centers, "sigma": gap_rad}]) 590 | for n_unif in [0.1, 
0.25, 0.5]: 591 | for sigma in [0.01, 0.02, 0.05, 0.1]: 592 | scens.append(["part_unif", {"frac_unif": n_unif, "sigma": sigma}]) 593 | return scens 594 | 595 | 596 | def get_scens_inv(): 597 | # Get only the inv-based scenarios 598 | scens = [] 599 | for sigma in [0.1, 0.2, 0.3, 0.4, 0.5]: 600 | for mu in [0.6, 0.7]: 601 | scens.append(["inv_norm", {"sigma": sigma, "mu": mu}]) 602 | scens.append(["inv_cauchy", {"sigma": sigma, "mu": mu}]) 603 | return scens 604 | 605 | 606 | def get_scens_bound(): 607 | # Get only the added heavy-bound scenario 608 | scens = [] 609 | for f_0 in [0.1, 0.35, 0.45, 0.5]: 610 | for f_between in [0.5, 0.25, 0.1, 0.05, 0.025, 0.01]: 611 | scens.append(["bound_thing", {"frac_between": f_between, "frac_0": f_0}]) 612 | return scens 613 | 614 | 615 | def run_test_cases(n_samples, fname="Datatables", per_dim=True, rep=1500): 616 | if per_dim: 617 | scens = get_scens_per_dim() 618 | foldername = f"{fname}/S{n_samples}/" 619 | partial_run = partial( 620 | run_scenario, foldername=foldername, n_samples=n_samples, rep=rep 621 | ) 622 | runParallelFunction(partial_run, scens) 623 | else: 624 | scens = get_scens_across_dim() 625 | foldername = f"{fname}/S{n_samples}_Across/" 626 | partial_run = partial( 627 | run_scenario_across, foldername=foldername, n_samples=n_samples 628 | ) 629 | runParallelFunction(partial_run, scens) 630 | 631 | 632 | if __name__ == "__main__": 633 | idx_nr = int(sys.argv[1]) 634 | # rep = int(sys.argv[2]) 635 | 636 | # idx_nr decides which experiment is run (division on nodes) 637 | fname = "/var/scratch/dlvermet/SB" 638 | # fname = "Datatables" 639 | run_per = idx_nr < 4 640 | s = [30, 50, 100, 600][idx_nr % 4] 641 | # for s in [30, 50, 100, 600]: 642 | run_test_cases(s, fname, run_per) 643 | # run_test_cases(100) 644 | # alpha = [0.05, 0.01, ] 645 | --------------------------------------------------------------------------------
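
Usage note: besides going through the BIAS class, the per-dimension test battery built by get_test_dict in BIAS/SB_Test_runner.py can be exercised directly. The snippet below is only an illustrative sketch, not part of the package; it assumes the R packages installed by BIAS/install.r are available (importing BIAS.SB_Test_runner loads data.table, goftest and PoweR via rpy2), and the `samples` array is a hypothetical placeholder for 100 final positions of a 5-dimensional optimizer scaled to [0, 1].

    import numpy as np
    from BIAS.SB_Test_runner import get_test_dict

    # placeholder data: 100 final positions of a 5-dimensional algorithm, scaled to [0, 1]
    samples = np.random.uniform(size=(100, 5))

    # per-dimension battery; n_samples must match the number of runs (rows)
    tests = get_test_dict(n_samples=100)
    results = {
        name: [tfunc(samples[:, d], alpha=0.01) for d in range(samples.shape[1])]
        for name, tfunc in tests.items()
    }
    # each entry holds one value per dimension: a p-value for tests such as "ad" or
    # "kolmogorov", or a raw statistic for tests such as "range" and "kurtosis"
    print(results["ad"])

Interpreting the raw values against the simulated reference distributions (as done in transform_to_reject_dt_corr and transform_to_reject_dt_across above) is still needed to turn them into rejection decisions.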