├── BIAS ├── models │ ├── empty.txt │ ├── targetnames.npy │ ├── opt_cnn_model-100.keras │ ├── opt_cnn_model-200.keras │ ├── opt_cnn_model-30.keras │ ├── opt_cnn_model-50.keras │ ├── opt_cnn_model-500.keras │ └── opt_cnn_model-600.keras ├── __init__.py ├── install.r ├── Create_RF.py ├── uniform_test.py ├── SB_Toolbox.py └── SB_Test_runner.py ├── .gitignore ├── example.py ├── requirements.txt ├── setup.py ├── .github └── workflows │ ├── python-publish.yml │ └── docker-publish.yml ├── LICENSE.md ├── Dockerfile └── README.md /BIAS/models/empty.txt: -------------------------------------------------------------------------------- 1 | Place the Random Forest models in this folder. -------------------------------------------------------------------------------- /BIAS/models/targetnames.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikivanstein/BIAS/HEAD/BIAS/models/targetnames.npy -------------------------------------------------------------------------------- /BIAS/models/opt_cnn_model-100.keras: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikivanstein/BIAS/HEAD/BIAS/models/opt_cnn_model-100.keras -------------------------------------------------------------------------------- /BIAS/models/opt_cnn_model-200.keras: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikivanstein/BIAS/HEAD/BIAS/models/opt_cnn_model-200.keras -------------------------------------------------------------------------------- /BIAS/models/opt_cnn_model-30.keras: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikivanstein/BIAS/HEAD/BIAS/models/opt_cnn_model-30.keras -------------------------------------------------------------------------------- /BIAS/models/opt_cnn_model-50.keras: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikivanstein/BIAS/HEAD/BIAS/models/opt_cnn_model-50.keras -------------------------------------------------------------------------------- /BIAS/models/opt_cnn_model-500.keras: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikivanstein/BIAS/HEAD/BIAS/models/opt_cnn_model-500.keras -------------------------------------------------------------------------------- /BIAS/models/opt_cnn_model-600.keras: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikivanstein/BIAS/HEAD/BIAS/models/opt_cnn_model-600.keras -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | structured_data_classifier 2 | BIAS/__pycache__ 3 | env 4 | BIAS/models/RF 5 | *.pkl 6 | __pycache__ 7 | dist 8 | build 9 | *.egg-info -------------------------------------------------------------------------------- /BIAS/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import rpy2.robjects as robjects 3 | 4 | 5 | # get the value of the environment variable HOME 6 | R_installed = os.getenv("R_PACKAGES_INSTALLED") 7 | 8 | if R_installed != "Yes": 9 | dirname = os.path.dirname(__file__) 10 | robjects.r.source(f"{dirname}/install.r", encoding="utf-8") 11 | os.environ["R_PACKAGES_INSTALLED"] = 
"Yes" 12 | 13 | from .SB_Toolbox import BIAS, f0 14 | 15 | __all__ = ("BIAS", "f0") 16 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | #example of using the BIAS toolbox to test a DE algorithm 2 | 3 | from scipy.optimize import differential_evolution 4 | import numpy as np 5 | from BIAS import BIAS, f0, install_r_packages 6 | 7 | install_r_packages() 8 | 9 | bounds = [(0,1), (0, 1), (0, 1), (0, 1), (0, 1)] 10 | 11 | #do 30 independent runs (5 dimensions) 12 | samples = [] 13 | print("Performing optimization method 50 times of f0.") 14 | for i in np.arange(50): 15 | result = differential_evolution(f0, bounds, maxiter=100) 16 | samples.append(result.x) 17 | 18 | samples = np.array(samples) 19 | 20 | test = BIAS() 21 | # use the classical stastistical approach to detect BIAS 22 | print(test.predict(samples, show_figure=True)) 23 | 24 | #use the trained deep learning model to predict and explain BIAS 25 | y, preds = test.predict_deep(samples) 26 | test.explain(samples, preds, filename="explanation.png") 27 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | backports.zoneinfo==0.2.1;python_version<"3.9" 2 | certifi==2021.10.8 3 | cffi==1.15.0 4 | charset-normalizer==2.0.9 5 | cycler==0.11.0 6 | fonttools==4.28.3 7 | idna==3.3 8 | Jinja2==3.0.3 9 | kiwisolver==1.3.2 10 | MarkupSafe==2.0.1 11 | matplotlib==3.6.2 12 | numpy==1.22.0 13 | packaging==21.3 14 | pandas==1.3.4 15 | patsy==0.5.2 16 | Pillow==8.4.0 17 | pycparser==2.21 18 | pyparsing==3.0.6 19 | python-dateutil==2.8.2 20 | pytz==2021.3 21 | pytz-deprecation-shim==0.1.0.post0 22 | requests==2.26.0 23 | rpy2==3.4.5 24 | setuptools-scm==6.3.2 25 | scikit-learn==1.3.1 26 | scipy==1.7.3 27 | seaborn==0.13.2 28 | six==1.16.0 29 | statsmodels==0.13.1 30 | threadpoolctl==3.0.0 31 | tomli==1.2.2 32 | tzdata==2021.5 33 | tzlocal==4.1 34 | urllib3==1.26.7 35 | wget==3.2 36 | zenodo-get==1.3.4 37 | shap==0.42.0 38 | tensorflow==2.10.0 39 | autokeras==1.0.20 40 | -------------------------------------------------------------------------------- /BIAS/install.r: -------------------------------------------------------------------------------- 1 | 2 | r = getOption("repos") 3 | r["CRAN"] = "http://cran.us.r-project.org" 4 | options(repos = r) 5 | for (x in c("zoo", "lgarch", "Rcpp", "RcppArmadillo", 'nortest', 'data.table', 'goftest')){ 6 | if (!require(x,character.only = TRUE)) 7 | { 8 | install.packages(x,dep=TRUE) 9 | if(!require(x,character.only = TRUE)) stop("Package not found") 10 | } 11 | } 12 | 13 | if (!require("AutoSEARCH", character.only = TRUE)) { 14 | #install AutoSearch 15 | url <- "https://cran.r-project.org/src/contrib/Archive/AutoSEARCH/AutoSEARCH_1.5.tar.gz" 16 | pkgFile <- "AutoSEARCH_1.5.tar.gz" 17 | download.file(url = url, destfile = pkgFile) 18 | 19 | # Install package 20 | install.packages(pkgs=pkgFile, type="source", repos=NULL) 21 | 22 | # Delete package tarball 23 | unlink(pkgFile) 24 | } 25 | 26 | if (!require("PoweR", character.only = TRUE)) { 27 | #install PoweR 28 | url <- "https://cran.r-project.org/src/contrib/Archive/PoweR/PoweR_1.0.7.tar.gz" 29 | pkgFile <- "PoweR_1.0.7.tar.gz" 30 | download.file(url = url, destfile = pkgFile) 31 | 32 | # Install package 33 | install.packages(pkgs=pkgFile, type="source", repos=NULL) 34 | 35 | # Delete package tarball 36 | 
unlink(pkgFile) 37 | } 38 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import setuptools 3 | 4 | with open("README.md", "r") as fh: 5 | long_description = fh.read() 6 | 7 | __version__ = "1.4.1" 8 | gh_ref = os.environ.get("GITHUB_REF") 9 | if gh_ref: 10 | *_, tag = gh_ref.split("/") 11 | __version__ = tag.replace("v", "") 12 | 13 | setuptools.setup( 14 | name='struct-bias', 15 | version=__version__, 16 | author="Diederick Vermetten, Niki van Stein", 17 | author_email="d.l.vermetten@liacs.leidenuniv.nl", 18 | description="BIAS toolbox: Structural bias detection for continuous optimization algorithms", 19 | long_description=long_description, 20 | long_description_content_type="text/markdown", 21 | packages=setuptools.find_packages(), 22 | package_data={ 23 | 'BIAS': ['install.r', 'models/*'], 24 | }, 25 | python_requires='>=3.6', 26 | install_requires=[ 27 | 'numpy', 28 | 'tensorflow', 29 | 'shap', 30 | 'rpy2', 31 | 'scipy', 32 | 'pandas', 33 | 'scikit-learn', 34 | 'matplotlib', 35 | 'seaborn', 36 | 'statsmodels', 37 | 'regex', 38 | 'autokeras' 39 | ], 40 | classifiers=[ 41 | "Programming Language :: Python :: 3", 42 | "License :: OSI Approved :: MIT License", 43 | "Operating System :: OS Independent", 44 | ], 45 | ) 46 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Publish Python 🐍 distribution 📦 to PyPI 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | environment: 23 | name: pypi 24 | url: https://pypi.org/p/struct-bias 25 | permissions: 26 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing 27 | steps: 28 | - uses: actions/checkout@v3 29 | - name: Set up Python 30 | uses: actions/setup-python@v3 31 | with: 32 | python-version: '3.x' 33 | - name: Install dependencies 34 | run: | 35 | python -m pip install --upgrade pip 36 | pip install build 37 | - name: Build package 38 | run: python -m build 39 | - name: Publish package distributions to PyPI 40 | uses: pypa/gh-action-pypi-publish@release/v1 41 | with: 42 | verbose: true 43 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | ## License 2 | 3 | This application is governed by the __BSD 3-Clause license__. 4 | 5 | BSD 3-Clause License 6 | 7 | Copyright (c) 2021, 8 | All rights reserved. 9 | 10 | Redistribution and use in source and binary forms, with or without 11 | modification, are permitted provided that the following conditions are met: 12 | 13 | * Redistributions of source code must retain the above copyright notice, this 14 | list of conditions and the following disclaimer. 
15 | 16 | * Redistributions in binary form must reproduce the above copyright notice, 17 | this list of conditions and the following disclaimer in the documentation 18 | and/or other materials provided with the distribution. 19 | 20 | * Neither the name of the copyright holder nor the names of its 21 | contributors may be used to endorse or promote products derived from 22 | this software without specific prior written permission. 23 | 24 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 25 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 27 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 28 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 30 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 31 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 32 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 33 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 | 35 | ### Remarks ### 36 | 37 | 38 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | # Set the working directory in the container 5 | WORKDIR /app 6 | 7 | # Install system dependencies required for building R and other tools, including X11 libraries 8 | RUN apt-get update && apt-get install -y --no-install-recommends \ 9 | build-essential \ 10 | libcurl4-openssl-dev \ 11 | libssl-dev \ 12 | libxml2-dev \ 13 | libfontconfig1-dev \ 14 | libreadline-dev \ 15 | wget \ 16 | curl \ 17 | zlib1g-dev \ 18 | libbz2-dev \ 19 | liblzma-dev \ 20 | libpcre2-dev \ 21 | libpcre3-dev \ 22 | gfortran \ 23 | libx11-dev \ 24 | libxt-dev \ 25 | x11proto-core-dev \ 26 | libcairo2-dev \ 27 | xvfb \ 28 | && rm -rf /var/lib/apt/lists/* 29 | 30 | # Download and install R 4.1.2 31 | RUN wget https://cran.rstudio.com/src/base/R-4/R-4.1.2.tar.gz && \ 32 | tar zxvf R-4.1.2.tar.gz && \ 33 | cd R-4.1.2 && \ 34 | ./configure --enable-R-shlib --with-blas --with-lapack && \ 35 | make && \ 36 | make install && \ 37 | cd .. 
&& \ 38 | rm -rf R-4.1.2 R-4.1.2.tar.gz 39 | 40 | 41 | # Copy the current directory contents into the container at /app 42 | COPY ./BIAS /app/BIAS 43 | 44 | # Install R packages (add any necessary R packages here) 45 | RUN Rscript /app/BIAS/install.r 46 | 47 | ENV R_PACKAGES_INSTALLED=Yes 48 | 49 | # Copy example files 50 | COPY example.py /app/example.py 51 | COPY requirements.txt /app/requirements.txt 52 | COPY setup.py /app/setup.py 53 | COPY README.md /app/README.md 54 | 55 | # Install Python dependencies specified in requirements.txt 56 | RUN pip install --upgrade pip 57 | #RUN python setup.py install 58 | RUN pip install -r requirements.txt 59 | 60 | RUN apt-get update && apt-get install -y zip unzip 61 | 62 | # Download reference value files 63 | # Download and unzip the files from figshare 64 | RUN wget https://figshare.com/ndownloader/files/30591411 -O bias_data.zip && \ 65 | unzip bias_data.zip -d /app/BIAS/data/ && \ 66 | rm bias_data.zip 67 | 68 | RUN wget https://figshare.com/ndownloader/files/43106839 -O bias_models.zip && \ 69 | mkdir -p /app/BIAS/models/ && \ 70 | unzip bias_models.zip -d /app/BIAS/models/ && \ 71 | rm bias_models.zip 72 | 73 | # Install any additional dependencies for Jupyter notebooks 74 | RUN pip install jupyter 75 | 76 | # Set environment variables for R libraries 77 | ENV R_HOME=/usr/local/lib/R 78 | ENV LD_LIBRARY_PATH=/usr/local/lib/R/lib:/usr/local/lib/R/modules:$LD_LIBRARY_PATH 79 | 80 | # Copy tutorial file (last such that we can update it easily) 81 | COPY Tutorial.ipynb /app/Tutorial.ipynb 82 | 83 | # Expose the port that Jupyter will run on 84 | EXPOSE 8888 85 | 86 | # Add a script to start Jupyter automatically when the container starts 87 | CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root", "--NotebookApp.token=''"] 88 | 89 | # Optional: Add a health check 90 | #HEALTHCHECK --interval=30s CMD curl --fail http://localhost:8888 || exit 1 91 | -------------------------------------------------------------------------------- /.github/workflows/docker-publish.yml: -------------------------------------------------------------------------------- 1 | # 2 | name: Create and publish a Docker image 3 | 4 | # Configures this workflow to run every time a change is pushed to the branch called `master`. 5 | on: 6 | push: 7 | branches: ['master'] 8 | 9 | # Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds. 10 | env: 11 | REGISTRY: ghcr.io 12 | IMAGE_NAME: ${{ github.repository }} 13 | 14 | # There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu. 15 | jobs: 16 | build-and-push-image: 17 | runs-on: ubuntu-latest 18 | # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job. 19 | permissions: 20 | contents: read 21 | packages: write 22 | attestations: write 23 | id-token: write 24 | # 25 | steps: 26 | - name: Checkout repository 27 | uses: actions/checkout@v4 28 | # Uses the `docker/login-action` action to log in to the Container registry registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here. 
29 | - name: Log in to the Container registry 30 | uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 31 | with: 32 | registry: ${{ env.REGISTRY }} 33 | username: ${{ github.actor }} 34 | password: ${{ secrets.GITHUB_TOKEN }} 35 | # This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels. 36 | - name: Extract metadata (tags, labels) for Docker 37 | id: meta 38 | uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 39 | with: 40 | images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} 41 | # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages. 42 | # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository. 43 | # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step. 44 | - name: Build and push Docker image 45 | id: push 46 | uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 47 | with: 48 | context: . 49 | push: true 50 | tags: ${{ steps.meta.outputs.tags }} 51 | labels: ${{ steps.meta.outputs.labels }} 52 | 53 | # This step generates an artifact attestation for the image, which is an unforgeable statement about where and how it was built. It increases supply chain security for people who consume the image. For more information, see "[AUTOTITLE](/actions/security-guides/using-artifact-attestations-to-establish-provenance-for-builds)." 
54 | - name: Generate artifact attestation 55 | uses: actions/attest-build-provenance@v1 56 | with: 57 | subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME}} 58 | subject-digest: ${{ steps.push.outputs.digest }} 59 | push-to-registry: true 60 | 61 | -------------------------------------------------------------------------------- /BIAS/Create_RF.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import pickle 3 | 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import pandas as pd 7 | import seaborn as sbs 8 | from sklearn.ensemble import RandomForestClassifier 9 | from sklearn.metrics import confusion_matrix, f1_score 10 | from sklearn.model_selection import train_test_split 11 | 12 | from zipfile import ZipFile 13 | import requests 14 | from io import BytesIO 15 | import os 16 | 17 | 18 | test_names = [ 19 | "1-spacing", 20 | "2-spacing", 21 | "3-spacing", 22 | "ad", 23 | "ad_transform", 24 | "shapiro", 25 | "jb", 26 | "ddst", 27 | "kurtosis", 28 | "mmpd_min", 29 | "mmpd_max", 30 | "range", 31 | "min", 32 | "max", 33 | "mdd_min", 34 | "mdd_max", 35 | "wasserstein", 36 | "kolmogorov", 37 | "CvM", 38 | "Durbin", 39 | "Kuiper", 40 | "HG1", 41 | "HG2", 42 | "Greenwood", 43 | "QM", 44 | "RC", 45 | "Moran", 46 | "Cressie1", 47 | "Cressie2", 48 | "Vasicek", 49 | "Swartz", 50 | "Morales", 51 | "Pardo", 52 | "Marhuenda", 53 | "Zhang1", 54 | "Zhang2", 55 | ] 56 | 57 | readable_label_dict = { 58 | "gaps": "Gaps", 59 | "cauchy": "Center", 60 | "clusters": "Clusters", 61 | "inv_cauchy": "Bounds", 62 | "inv_norm": "Bounds", 63 | "norm": "Center", 64 | "part_unif": "Clusters", 65 | "shifted_spikes": "Discretization", 66 | "spikes": "Discretization", 67 | "trunc_unif": "Center", 68 | "bound_thing": "Bounds", 69 | } 70 | 71 | 72 | def create_RF_rej( 73 | included_tests=test_names, 74 | plot_feat_importance=False, 75 | use_bias_labels=False, 76 | feature_order=None, 77 | rf_file_name=None, 78 | ): 79 | dirname = os.path.dirname(__file__) 80 | r = requests.get("https://figshare.com/ndownloader/files/30590670") 81 | zipfile = ZipFile(BytesIO(r.content)) 82 | zipfile.extractall(f"{dirname}/models/RFs/") 83 | 84 | r = requests.get("https://figshare.com/ndownloader/files/30591417") 85 | zipfile = ZipFile(BytesIO(r.content)) 86 | zipfile.extractall(f"{dirname}/models/RFs/SB/") 87 | cols_to_get = included_tests + ["scen"] 88 | dt_samples = [] 89 | for sample_size in [30, 50, 100, 600]: 90 | for f in glob.glob(f"{dirname}/models/RFs/SB/S{sample_size}/*.csv"): 91 | dt_temp = pd.read_csv(f) 92 | # print(len(dt_temp)) 93 | if dt_temp["scen"][0] != "unif": 94 | # Remove samples for which no tests reject (non-biased) 95 | try: 96 | dt_rej_temp = pd.read_csv( 97 | f"{dirname}/models/RFs/SB/Rejections/S{sample_size}_A0.01_Cnone_{os.path.basename(f)}", 98 | index_col=0, 99 | ) 100 | 101 | dt_test_only = dt_rej_temp[included_tests] 102 | idxs_save = np.where(dt_test_only.transpose().sum() > 0) 103 | dt_samples.append(dt_rej_temp[cols_to_get].iloc[idxs_save]) 104 | except: 105 | next 106 | dt_samples = pd.concat(dt_samples) 107 | print(dt_samples.columns) 108 | print(included_tests) 109 | X = dt_samples[included_tests] 110 | if use_bias_labels: 111 | Y = [readable_label_dict[x] for x in dt_samples["scen"]] 112 | else: 113 | Y = dt_samples["scen"] 114 | 115 | rf = RandomForestClassifier(oob_score=True, class_weight="balanced") 116 | 117 | rf.fit(X, Y) 118 | 119 | if plot_feat_importance: 120 | plt.figure(figsize=(19, 10)) 121 | if feature_order is None: 
122 | sbs.barplot(x=included_tests, y=rf.feature_importances_) 123 | else: 124 | sbs.barplot( 125 | x=included_tests, y=rf.feature_importances_, order=feature_order 126 | ) 127 | plt.xticks(rotation=90) 128 | plt.tight_layout() 129 | plt.savefig(f"RF_feature_importance.pdf") 130 | plt.show() 131 | 132 | print(rf.oob_score_) 133 | 134 | if rf_file_name is not None: 135 | with open(f"{dirname}/models/RFs/{rf_file_name}.pkl", "wb") as output_file: 136 | pickle.dump(rf, output_file) 137 | return rf 138 | -------------------------------------------------------------------------------- /BIAS/uniform_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.polynomial.legendre import legval 3 | 4 | 5 | def ddst_base_legendre(x, j): 6 | """ 7 | Compute the j-th Legendre polynomial evaluated at x. 8 | 9 | Parameters: 10 | - x (array-like): The sample data. 11 | - j (int): The degree of the polynomial. 12 | 13 | Returns: 14 | - values (numpy.ndarray): The evaluated polynomial values at x. 15 | """ 16 | # Map x from [0, 1] to [-1, 1] 17 | x_mapped = 2 * np.array(x) - 1 18 | # Coefficients for the j-th Legendre polynomial 19 | coefs = np.zeros(j + 1) 20 | coefs[j] = 1 21 | values = legval(x_mapped, coefs) 22 | return values 23 | 24 | 25 | def ddst_phi(x, j, base): 26 | """ 27 | Compute the coefficient for the j-th term using the base function. 28 | 29 | Parameters: 30 | - x (array-like): The sample data. 31 | - j (int): The degree of the polynomial. 32 | - base (function): The orthonormal base function. 33 | 34 | Returns: 35 | - coefficient (float): The computed coefficient. 36 | """ 37 | # Evaluate the base function at x 38 | phi_values = base(x, j) 39 | # Compute the mean value 40 | coefficient = np.mean(phi_values) 41 | return coefficient 42 | 43 | 44 | def ddst_uniform_Nk(x, base=None, Dmax=10): 45 | """ 46 | Compute the cumulative sums for the data-driven smooth test of uniformity. 47 | 48 | Parameters: 49 | - x (array-like): The sample data. 50 | - base (function): The orthonormal base function to use (default is ddst_base_legendre). 51 | - Dmax (int): The maximum degree of the polynomial. 52 | 53 | Returns: 54 | - coord (numpy.ndarray): The cumulative sums of the transformed data. 55 | """ 56 | if base is None: 57 | base = ddst_base_legendre 58 | 59 | n = len(x) 60 | maxN = max(min(Dmax, n - 2, 20), 1) 61 | coord = np.zeros(maxN) 62 | for j in range(1, maxN + 1): 63 | coord[j - 1] = ddst_phi(x, j, base) 64 | coord = np.cumsum(coord**2 * n) 65 | return coord 66 | 67 | 68 | def ddst_IIC(coord, n, c=2.4): 69 | """ 70 | Compute the model selection index l using the Information Criterion. 71 | 72 | Parameters: 73 | - coord (numpy.ndarray): The cumulative sums. 74 | - n (int): Sample size. 75 | - c (float): Calibrating parameter in the penalty in the model selection rule. 76 | 77 | Returns: 78 | - l (int): The selected index (starting from 1). 79 | """ 80 | Dmax = len(coord) 81 | ic = coord - c * np.arange(1, Dmax + 1) 82 | l = np.argmin(ic) + 1 # Add 1 because numpy arrays are 0-indexed 83 | return l 84 | 85 | 86 | def ddst_uniform_test( 87 | x, 88 | base=ddst_base_legendre, 89 | d_n=10, 90 | c=2.4, 91 | nr=100000, 92 | compute_p=True, 93 | alpha=0.05, 94 | compute_cv=True, 95 | **kwargs, 96 | ): 97 | """ 98 | Data Driven Smooth Test for Uniformity. 99 | 100 | Parameters: 101 | - x (array-like): A (non-empty) numeric vector of data. 102 | - base (function): Function returning an orthonormal system (default is ddst_base_legendre). 
103 | - d_n (int): Maximum dimension considered. 104 | - c (float): Calibrating parameter in the penalty in the model selection rule. 105 | - nr (int): Number of runs for p-value and critical value computation. 106 | - compute_p (bool): Whether to compute a p-value. 107 | - alpha (float): Significance level. 108 | - compute_cv (bool): Whether to compute a critical value corresponding to alpha. 109 | - kwargs: Further arguments. 110 | 111 | Returns: 112 | - result (dict): A dictionary containing test results. 113 | """ 114 | # Only Legendre base is implemented yet 115 | base = ddst_base_legendre 116 | method_name = "ddst_base_legendre" 117 | 118 | x = np.asarray(x) 119 | n = len(x) 120 | if n < 5: 121 | raise ValueError("length(x) should be at least 5") 122 | 123 | # Compute coordinates 124 | coord = ddst_uniform_Nk(x, base=base, Dmax=d_n) 125 | # Compute model selection index l 126 | l = ddst_IIC(coord, n, c) 127 | # Test statistic t 128 | t = coord[l - 1] # Adjust for zero-based indexing 129 | # Coordinates differences 130 | coord_diffs = coord - np.concatenate(([0], coord[:-1])) 131 | # Prepare result 132 | result = { 133 | "statistic": t, 134 | "parameter": l, 135 | "coordinates": coord_diffs, 136 | "method": "Data Driven Smooth Test for Uniformity", 137 | } 138 | 139 | # Compute p-value and critical value if required 140 | if compute_p or compute_cv: 141 | tmp = np.zeros(nr) 142 | for i in range(nr): 143 | y = np.random.uniform(0, 1, n) 144 | tmpC = ddst_uniform_Nk(y, base=base, Dmax=d_n) 145 | l_sim = ddst_IIC(tmpC, n, c) 146 | tmp[i] = tmpC[l_sim - 1] # Adjust index for zero-based indexing 147 | if compute_p: 148 | result["p_value"] = np.mean(tmp > t) 149 | if compute_cv: 150 | result["cv"] = np.quantile(tmp, alpha) 151 | 152 | # Construct data name 153 | data_name = f"x, base: {method_name} c: {c} d_n: {d_n}" + ( 154 | f" cv({alpha}) : {result['cv']:.5f}" if compute_cv else "" 155 | ) 156 | result["data_name"] = data_name 157 | 158 | return result 159 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep-BIAS: Bias In Algorithms, Structural 2 | ## A toolbox for detecting structural bias in continuous optimization heuristics. 3 | 4 | With a deep-learning extension to better evaluate the type of bias and gain insights using explainable AI 5 | 6 | 7 | 8 | ## Using the BIAS-Toolbox with Docker (Recommended) 9 | 10 | The BIAS-Toolbox can be used inside a Docker container, eliminating the need to manually install all dependencies and packages. Follow the steps below to run the Docker image, and to start working with the toolbox in a Jupyter notebook environment. We provide the following prebuild container: `ghcr.io/nikivanstein/bias:master` 11 | 12 | ### Prerequisites 13 | 14 | Make sure you have Docker installed on your system. You can install Docker by following the instructions [here](https://docs.docker.com/get-docker/). 15 | 16 | ### Steps to Run the Docker Image 17 | 18 | 1. **Pull the Prebuild Image** 19 | The following command will pull the prebuild image to your system. 20 | 21 | ```bash 22 | docker pull ghcr.io/nikivanstein/bias:master 23 | ``` 24 | 25 | 2. **Run the Prebuild Docker Container**: 26 | The following command will start the container and expose the Jupyter notebook interface on port `8888`: 27 | 28 | ```bash 29 | docker run -p 8888:8888 ghcr.io/nikivanstein/bias:master 30 | ``` 31 | 32 | 3. 
**Access the Jupyter Notebook**: 33 | After starting the container, you should see a message with instructions to access the Jupyter notebook. It will look something like this: 34 | 35 | ``` 36 | To access the notebook, open this file in a browser: 37 | http://127.0.0.1:8888/?token= 38 | ``` 39 | 40 | Open the provided URL in your web browser to start using the BIAS-Toolbox within Jupyter. 41 | 42 | 43 | ### Steps to Build the Dockerfile yourself 44 | 45 | 1. **Clone the Repository**: 46 | If you haven't already cloned the BIAS repository, do so with the following command: 47 | 48 | ```bash 49 | git clone https://github.com/nikivanstein/BIAS.git 50 | cd BIAS 51 | ``` 52 | 53 | 2. **Build the Docker Image**: 54 | The `Dockerfile` included in this repository will install all necessary dependencies (both Python and R), download required data and model files, and set up the environment. 55 | 56 | To build the Docker image, run the following command from the root of the repository (where the `Dockerfile` is located): 57 | 58 | ```bash 59 | docker build -t bias-toolbox . 60 | ``` 61 | 62 | This will create a Docker image named `bias-toolbox`. 63 | 64 | 3. **Run the Docker Container**: 65 | Once the image is built, you can run the container. The following command will start the container and expose the Jupyter notebook interface on port `8888`: 66 | 67 | ```bash 68 | docker run -p 8888:8888 bias-toolbox 69 | ``` 70 | 71 | 4. **Access the Jupyter Notebook**: 72 | After starting the container, you should see a message with instructions to access the Jupyter notebook. It will look something like this: 73 | 74 | ``` 75 | To access the notebook, open this file in a browser: 76 | http://127.0.0.1:8888/?token= 77 | ``` 78 | 79 | Open the provided URL in your web browser to start using the BIAS-Toolbox within Jupyter. 80 | 81 | 82 | ### Stopping the Container 83 | 84 | To stop the running Docker container, press `CTRL+C` in the terminal where the container is running, or find the container's ID with the command: 85 | 86 | ```bash 87 | docker ps 88 | ``` 89 | 90 | Then stop the container with: 91 | 92 | ```bash 93 | docker stop 94 | ``` 95 | 96 | ### Additional Notes 97 | 98 | - The image is configured to use Jupyter Notebook with R and Python integrations. 99 | - R version `4.1.2` is installed and configured along with the necessary R packages as specified in the `install.r` script. 100 | - Python dependencies are handled via the `requirements.txt` file. 101 | 102 | By using Docker, you can avoid issues related to dependency installation and system setup, providing a consistent environment for running the BIAS-Toolbox. 103 | 104 | 105 | ## Setup using Pip 106 | 107 | Another way of using the BIAS-Toolbox is by installing the pip package. 108 | 109 | This package requires an R-installation to be present. 110 | 111 | The package is tested with R 4.1.2 (install from source https://cran.r-project.org/src/base/R-4/R-4.1.2.tar.gz) 112 | 113 | The R packages will be installed automatically upon first importing BIAS. 114 | 115 | Install the BIAS toolbox using pip: 116 | 117 | pip install struct-bias 118 | 119 | This installs the following R packages: 120 | 121 | - PoweR 122 | - AutoSEARCH 123 | - nortest 124 | - data.table 125 | - goftest 126 | - ddst 127 | 128 | 129 | ### Detailed setup using virtual env 130 | 131 | 1. 
Download and install R from https://cran.r-project.org/ use version 4.1.2 132 | Example for Ubuntu based system: 133 | ```sh 134 | sudo wget https://cran.rstudio.com/src/base/R-4/R-4.1.2.tar.gz 135 | tar zxvf R-4.1.2.tar.gz 136 | cd R-4.1.2 137 | ./configure --enable-R-shlib --with-blas --with-lapack 138 | make 139 | sudo make install 140 | ``` 141 | 142 | 2. Download this repository (clone or as zip) 143 | 3. Create a python virtual env `python -m venv env` 144 | 4. Activate the env (in powershell for example: `env/Scripts/Activate.ps1 `) 145 | 5. Install dependencies `pip install -r requirements.txt` 146 | 6. Checkout the `example.py` to start using the BIAS toolbox. 147 | 148 | 149 | ## Example 150 | 151 | ```py 152 | #example of using the BIAS toolbox to test a DE algorithm 153 | 154 | from scipy.optimize import differential_evolution 155 | import numpy as np 156 | from BIAS import BIAS, f0 157 | 158 | bounds = [(0,1), (0, 1), (0, 1), (0, 1), (0, 1)] 159 | 160 | #do 30 independent runs (5 dimensions) 161 | samples = [] 162 | print("Performing optimization method 30 times of f0.") 163 | for i in np.arange(30): 164 | result = differential_evolution(f0, bounds, maxiter=100) 165 | samples.append(result.x) 166 | 167 | samples = np.array(samples) 168 | 169 | test = BIAS() 170 | print(test.predict(samples, show_figure=True)) 171 | 172 | y, preds = test.predict_deep(samples) 173 | test.explain(samples, preds, filename="explanation.png") 174 | ``` 175 | 176 | ## Additional files 177 | 178 | Note: The code for generating the RF used to predict the type of bias is included, but the full RF is not. These can be found on zenodo: https://doi.org/10.6084/m9.figshare.16546041. 179 | The RF models will be downloaded automatically the first time the predict function requires them. 180 | 181 | ### Citation 182 | 183 | If you use the BIAS toolbox in a scientific publication, we would appreciate using the following citations: 184 | 185 | ``` 186 | @ARTICLE{9828803, 187 | author={Vermetten, Diederick and van Stein, Bas and Caraffini, Fabio and Minku, Leandro L. 
and Kononova, Anna V.}, 188 | journal={IEEE Transactions on Evolutionary Computation}, 189 | title={BIAS: A Toolbox for Benchmarking Structural Bias in the Continuous Domain}, 190 | year={2022}, 191 | volume={26}, 192 | number={6}, 193 | pages={1380-1393}, 194 | doi={10.1109/TEVC.2022.3189848} 195 | } 196 | 197 | @software{niki_van_stein_2023_7803623, 198 | author = {Niki van Stein and 199 | Diederick Vermetten}, 200 | title = {Basvanstein/BIAS: v1.1 Deep-BIAS Toolbox}, 201 | month = apr, 202 | year = 2023, 203 | publisher = {Zenodo}, 204 | version = {v1.1}, 205 | doi = {10.5281/zenodo.7803623}, 206 | url = {https://doi.org/10.5281/zenodo.7803623} 207 | } 208 | ``` 209 | -------------------------------------------------------------------------------- /BIAS/SB_Toolbox.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from io import BytesIO 4 | from zipfile import ZipFile 5 | 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | import pandas as pd 9 | import requests 10 | import rpy2.robjects as robjects 11 | import seaborn as sbs 12 | import shap 13 | import tensorflow as tf 14 | from rpy2.robjects.packages import importr 15 | from scipy.stats import percentileofscore 16 | from statsmodels.stats.multitest import multipletests 17 | import autokeras as ak 18 | 19 | from .SB_Test_runner import get_scens_per_dim, get_simulated_data, get_test_dict 20 | 21 | 22 | def f0(x): 23 | """f0 random function, to be used a objective function to test optimization algorithms. 24 | 25 | Args: 26 | x (list): input for the objective function, ignored since the function is random. 27 | 28 | Returns: 29 | float: A uniform random number 30 | """ 31 | return np.random.uniform() 32 | 33 | 34 | def getXAIBackground(n_samples=30, rep=20): 35 | """Get background training samples to approximate Shapley values for the deeplearning approach. 36 | 37 | Args: 38 | n_samples (int, optional): number of samples, should be in [30,50,100,600]. Defaults to 30. 39 | rep (int, optional): number of repetitions per scenario. Defaults to 20. 40 | """ 41 | scenes = get_scens_per_dim() 42 | X = [] 43 | for scene in scenes: 44 | label = scene[0] 45 | kwargs = scene[1] 46 | data = get_simulated_data(label, rep=rep, n_samples=n_samples, kwargs=kwargs) 47 | for r in range(rep): 48 | X.append(np.sort(data[:, r])) 49 | X = np.expand_dims(X, axis=2) 50 | return np.array(X) 51 | 52 | 53 | class BIAS: 54 | def __init__(self): 55 | """BIAS toolbox for predicting bias in black box optimization algorithms. 56 | Predicts both the presence of bias and the bias type. Use f0 as objective function for at least 30 independent optimization runs. 57 | 58 | Args: 59 | install_r (bool): if set to True, try to install the required R packages automatically. 60 | """ 61 | self.p_value_columns = [ 62 | "1-spacing", 63 | "2-spacing", 64 | "3-spacing", 65 | "ad", 66 | "ad_transform", 67 | "shapiro", 68 | "jb", 69 | "ddst", 70 | ] 71 | self.pwr = importr("PoweR") 72 | self.deepmodel = None 73 | 74 | def _load_ref_vals(self, n_samples, alpha=0.01, across=False): 75 | """Helper function to load the reference values needed for calculating the p-values. 76 | 77 | Args: 78 | n_samples (int): the sample size used for the statistical tests. Can only be 79 | in [30,50,100,600] 80 | alpha (float, optional): Can only be in [0.01, 0.05]. Defaults to 0.01. 81 | across (bool, optional): Whether we use across dimension reference vals or not. Defaults to False. 
82 | 83 | Returns: 84 | list, list: two lists of reference values loaded from files. 85 | """ 86 | dirname = os.path.dirname(__file__) 87 | # download reference values if needed from figshare 88 | if not os.path.isfile( 89 | f"{dirname}/data/Crit_vals_across/S{n_samples}_A{alpha}_with_refs.pkl" 90 | ): 91 | print( 92 | "Downloading reference values for statistical tests, this takes a while.." 93 | ) 94 | r = requests.get("https://figshare.com/ndownloader/files/30591411") 95 | zipfile = ZipFile(BytesIO(r.content)) 96 | zipfile.extractall(f"{dirname}/data/") 97 | if across: 98 | with open( 99 | f"{dirname}/data/Crit_vals_across/S{n_samples}_A{alpha}_with_refs.pkl", 100 | "rb", 101 | ) as f: 102 | ref_vals, _ = pickle.load(f) 103 | with open( 104 | f"{dirname}/data/Crit_vals_pwr_across/S{n_samples}_A{alpha}_with_refs.pkl", 105 | "rb", 106 | ) as f: 107 | ref_vals_new, _ = pickle.load(f) 108 | else: 109 | with open( 110 | f"{dirname}/data/Crit_vals/S{n_samples}_A{alpha}_with_refs.pkl", "rb" 111 | ) as f: 112 | _, ref_vals = pickle.load(f) 113 | with open( 114 | f"{dirname}/data/Crit_vals_pwr/S{n_samples}_A{alpha}_with_refs.pkl", 115 | "rb", 116 | ) as f: 117 | _, ref_vals_new = pickle.load(f) 118 | return ref_vals, ref_vals_new 119 | 120 | def _get_test_types(self): 121 | """Helper function for the poweR-based tests. 122 | 123 | Returns: 124 | dict: Dict of test functions from R. 125 | """ 126 | testnames = [ 127 | "kolmogorov", 128 | "CvM", 129 | "AD_pwr", 130 | "Durbin", 131 | "Kuiper", 132 | "HG1", 133 | "HG2", 134 | "Greenwood", 135 | "QM", 136 | "RC", 137 | "Moran", 138 | "Cressie1", 139 | "Cressie2", 140 | "Vasicek", 141 | "Swartz", 142 | "Morales", 143 | "Pardo", 144 | "Marhuenda", 145 | "Zhang1", 146 | "Zhang2", 147 | ] 148 | test_types_new = [ 149 | self.pwr.create_alter(robjects.FloatVector(np.arange(63, 83)))[i][0] 150 | for i in range(20) 151 | ] 152 | return {k: v for k, v in zip(testnames, test_types_new)} 153 | 154 | def transform_to_reject_dt_corr( 155 | self, dt, alpha, n_samples, correction_method="fdr_bh" 156 | ): 157 | """Apply p-value corrections on the dataframe of test statistics. 158 | 159 | Args: 160 | dt (dataframe): The DataFrame containing the calculated test statistics for each dimension. 161 | alpha (float): The threshold for statistical significance. 162 | n_samples (int): The sample size used for the statistical tests. Can only be 163 | in [30,50,100,600] 164 | correction_method (str, optional): Which type of p-value correction to apply. Recommended is 'fdr_bh', 165 | but 'fdr_by' and 'holm' are also supported.. Defaults to 'fdr_bh'. 166 | 167 | Returns: 168 | dataframe: Corrected test statistics. 
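Note: in practice each cell of the returned DataFrame is a boolean rejection flag (one row per dimension, one column per test), obtained by applying `statsmodels.stats.multitest.multipletests` to the per-test p-values, rather than a raw test statistic.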
169 | """ 170 | reference_vals, ref_vals_new = self._load_ref_vals(n_samples) 171 | test_types_new = self._get_test_types() 172 | 173 | dt_rejections = pd.DataFrame() 174 | dt_p_vals_temp = pd.DataFrame() 175 | for colname in self.p_value_columns: 176 | dt_p_vals_temp[colname] = dt[colname] 177 | for k, v in reference_vals.items(): 178 | if "kurt" in k: 179 | temp = [ 180 | percentileofscore(score=x, a=v, kind="mean") / 100 for x in dt[k] 181 | ] 182 | temp = [min(x, 1 - x) for x in temp] # two-sided comparison 183 | elif k in ["min", "wasserstein", "mdd_max", "mdd_min"]: 184 | temp = [ 185 | 1 - percentileofscore(score=x, a=v, kind="mean") / 100 186 | for x in dt[k] 187 | ] 188 | else: 189 | temp = [ 190 | percentileofscore(score=x, a=v, kind="mean") / 100 for x in dt[k] 191 | ] 192 | dt_p_vals_temp[k] = temp 193 | for k, v in ref_vals_new.items(): 194 | if test_types_new[k] == 4: 195 | temp = [ 196 | percentileofscore(score=x, a=v, kind="mean") / 100 for x in dt[k] 197 | ] 198 | else: 199 | temp = [ 200 | 1 - percentileofscore(score=x, a=v, kind="mean") / 100 201 | for x in dt[k] 202 | ] 203 | dt_p_vals_temp[k] = temp 204 | res = np.array( 205 | [ 206 | multipletests(x, alpha=alpha, method=correction_method)[0] 207 | for x in np.array(dt_p_vals_temp) 208 | ] 209 | ).reshape(dt_p_vals_temp.shape) 210 | return pd.DataFrame(res, columns=dt_p_vals_temp.columns) 211 | 212 | def _get_test_names_dict(self): 213 | """Helper function to ensure consistent naming for the used statistical tests 214 | by creating a dictionary 215 | 216 | Returns: 217 | dict: Dict of all test functions. 218 | """ 219 | test_dict_per = get_test_dict(n_samples=100, per_dim=True) 220 | test_names = list(test_dict_per.keys()) 221 | test_names.remove("AD_pwr") 222 | test_names_paper = [ 223 | "1-spacing", 224 | "2-spacing", 225 | "3-spacing", 226 | "range", 227 | "min", 228 | "max", 229 | "AD", 230 | "tAD", 231 | "Shapiro", 232 | "JB", 233 | "LD-min", 234 | "LD-max", 235 | "Kurt", 236 | "MPD-max", 237 | "MPD-min", 238 | "Wasserstein", 239 | "NS", 240 | "KS", 241 | "CvM", 242 | "Durbin", 243 | "Kuiper", 244 | "HG1", 245 | "HG2", 246 | "Greenwood", 247 | "QM", 248 | "RC", 249 | "Moran", 250 | "Cressie1", 251 | "Cressie2", 252 | "Vasicek", 253 | "Swartz", 254 | "Morales", 255 | "Pardo", 256 | "Marhuenda", 257 | "Zhang1", 258 | "Zhang2", 259 | ] 260 | 261 | test_label_dict = {k: v for k, v in zip(test_names, test_names_paper)} 262 | return test_label_dict 263 | 264 | def plot_swarm_with_heatmap(self, data, rejections, filename=None): 265 | """Plotting function to create the swarmplot and rejection heatmap. 266 | 267 | Args: 268 | data (dataframe): The DataFrame containing the final position values. 269 | rejections (dataframe): The DataFrame containing the corresponding test rejections. 270 | filename (string, optional): If not none, the name of the file to store the figure. Defaults to None. 
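Note: this helper is normally invoked from `predict` when `show_figure=True`, with `rejections` being the corrected rejection DataFrame produced by `transform_to_reject_dt_corr`.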
271 | """ 272 | test_label_dict = self._get_test_names_dict() 273 | data_dt = pd.DataFrame(data) 274 | fig, axs = plt.subplots(2, figsize=(19, 14), sharex=True) 275 | ax1 = axs[0] 276 | dt_molt = data_dt.melt() 277 | dt_molt["variable"] = dt_molt["variable"] + 1.5 278 | sbs.swarmplot(data=dt_molt, x="variable", y="value", ax=ax1) 279 | ax1.set_xlim(-0.5, self.DIM - 0.5) 280 | for dim in range(self.DIM): 281 | c0 = ax1.get_children()[dim] 282 | c0.set_offsets([[x + 0.5, y] for x, y in c0.get_offsets()]) 283 | ax1.axvline(dim, color="k", lw=0.6, ls=":") 284 | sbs.heatmap( 285 | np.array(rejections).transpose(), 286 | ax=axs[1], 287 | cbar=False, 288 | yticklabels=[test_label_dict[x] for x in rejections.columns], 289 | linewidths=0.01, 290 | cmap="crest_r", 291 | ) 292 | 293 | ax1.set_xlabel("") 294 | axs[1].set_xlabel("Dimension", fontsize=16) 295 | axs[1].set_xticklabels(range(1, self.DIM + 1), fontsize=14) 296 | axs[1].set_yticklabels(axs[1].get_yticklabels(), fontsize=14) 297 | ax1.set_ylabel("Value", fontsize=16) 298 | ax1.set_ylim(0, 1) 299 | ax1.set_yticklabels([0, 0.2, 0.4, 0.6, 0.8, 1], fontsize=14) 300 | plt.tight_layout() 301 | if filename is not None: 302 | plt.savefig(filename) 303 | plt.show() 304 | 305 | def predict_type(self, dt_rej, print_type=False): 306 | """Predict the type of bias using the rejection data. 307 | 308 | Args: 309 | dt_rej (dataframe): Dataframe containing rejection data. 310 | print_type (bool, optional): Whether to output the type to the standard output or not. Defaults to False. 311 | 312 | Returns: 313 | dict: Dict with the predicted Class and the Class_Probabilities 314 | """ 315 | mean_rej = np.mean(np.array(dt_rej), axis=0) > 0.1 316 | if np.sum(mean_rej) == 0: 317 | if print_type: 318 | print("No clear evidence of bias detected") 319 | return "none" 320 | dirname = os.path.dirname(__file__) 321 | 322 | # download RF models if needed from 323 | if not os.path.isfile(f"{dirname}/models/RFs/rf_few_classes.pkl"): 324 | print("Downloading model files, this takes a while..") 325 | r = requests.get("https://figshare.com/ndownloader/files/43106839") 326 | zipfile = ZipFile(BytesIO(r.content)) 327 | zipfile.extractall(f"{dirname}/models/") 328 | 329 | with open(f"{dirname}/models/RFs/rf_few_classes.pkl", "rb") as input_file: 330 | rf = pickle.load(input_file) 331 | res_class = rf.predict(mean_rej.reshape(1, -1)) 332 | classes = rf.classes_ 333 | prob_classes = rf.predict_proba(mean_rej.reshape(1, -1)) 334 | 335 | with open(f"{dirname}/models/RFs/rf_scens.pkl", "rb") as input_file: 336 | rf = pickle.load(input_file) 337 | res_scen = rf.predict(mean_rej.reshape(1, -1)) 338 | scennames = rf.classes_ 339 | prob_scens = rf.predict_proba(mean_rej.reshape(1, -1)) 340 | 341 | if print_type: 342 | print( 343 | f"Detected bias which seems to be related to {res_class} ({np.max(prob_classes):.2f} probability)." 344 | + f"The rejections seems to be most similar to the {res_scen} scenario ({np.max(prob_scens):.2f} probability)." 345 | + "\nWe strongly advise you to now use the `predict_deep` function to more accurately predict the Structural Bias type." 346 | ) 347 | return { 348 | "Class": res_class[0], 349 | "Class Probabilities": prob_classes, 350 | "Scenario": res_scen[0], 351 | "Scenario Probabilities": prob_scens, 352 | } 353 | 354 | def explain(self, data, preds, filename=None, verbose=False): 355 | """Explain the predictions of the deeplearning model. 356 | You need to call predict_deep first. 
357 | 358 | Args: 359 | data (dataframe): The matrix containing the final position values on F0. Note that these should be scaled 360 | in [0,1], and in the shape (n_samples, dimension), where n_samples is in [30, 50, 100, 600] 361 | preds (array): Predictions of bias type for each dimension. 362 | filename (string): Where to save the figure, if None it will call plt.show() instead. 363 | verbose (bool): Print additional output. 364 | """ 365 | # calculate the shapley values per dim 366 | 367 | fig, axes = plt.subplots( 368 | nrows=data.shape[1], 369 | ncols=2, 370 | figsize=(12, data.shape[1] * 2), 371 | gridspec_kw={"width_ratios": [1, 3]}, 372 | ) 373 | for d in range(data.shape[1]): 374 | x = [np.sort(data[:, d])] 375 | x = np.expand_dims(x, axis=2) 376 | shap_val = self.explainer.shap_values(x) 377 | if verbose: 378 | print(preds[d]) 379 | y = np.argmax(preds[d], axis=1) # prediction of the dimension 380 | shap_vals_pred = shap_val[y[0]][0] 381 | 382 | cmap = sbs.color_palette("coolwarm", as_cmap=True) 383 | norm = plt.Normalize( 384 | vmin=-1 * np.max(np.abs(shap_vals_pred)), 385 | vmax=np.max(np.abs(shap_vals_pred)), 386 | ) # 0 and 1 are the defaults, but you can adapt these to fit other uses 387 | df = pd.DataFrame( 388 | {"x": np.sort(data[:, d]).flatten(), "shap": shap_vals_pred.flatten()} 389 | ) 390 | palette = {h: cmap(norm(h)) for h in df["shap"]} 391 | axes[d, 0].bar(self.targetnames, preds[d][0]) 392 | axes[d, 0].tick_params(axis="x", labelrotation=30) 393 | axes[d, 0].set_title("Prediction probabilities") 394 | axes[d, 0].set_ylim([0, 1]) 395 | 396 | axes[d, 1].set_title(f"Predicted: {self.targetnames[y]}") 397 | sbs.swarmplot( 398 | data=df, 399 | x="x", 400 | hue="shap", 401 | palette=palette, 402 | ax=axes[d, 1], 403 | size=4, 404 | legend=False, 405 | ) 406 | axes[d, 1].set_xlabel("") 407 | axes[d, 1].set_xlim([0, 1]) 408 | 409 | # sbs.move_legend(ax, "upper left", bbox_to_anchor=(1, 1)) 410 | plt.tight_layout() 411 | if filename == None: 412 | plt.show() 413 | else: 414 | plt.savefig(filename) 415 | plt.close() 416 | 417 | def predict_deep(self, data, include_proba=True): 418 | """Predict the BIAS using our neural network. 419 | 420 | Args: 421 | data (dataframe): The matrix containing the final position values on F0. Note that these should be scaled 422 | in [0,1], and in the shape (n_samples, dimension), where n_samples is in [30, 50, 100, 600] 423 | include_proba (boolean, optional): To include the probabilities of each class or only the final label. 424 | 425 | Raises: 426 | ValueError: Unsupported sample size. 
427 | 428 | Returns: 429 | predicted bias type (string), optional probabilities (array) 430 | """ 431 | # load model 432 | n_samples = data.shape[0] 433 | if not n_samples in [30, 50, 100, 600]: 434 | raise ValueError("Sample size is not supported") 435 | if self.deepmodel == None: 436 | dirname = os.path.dirname(__file__) 437 | # download RF models if needed from 438 | self.deepmodel = tf.keras.models.load_model( 439 | f"{dirname}/models/opt_cnn_model-{n_samples}.keras" 440 | ) 441 | self.targetnames = np.load( 442 | f"{dirname}/models/targetnames.npy", allow_pickle=True 443 | ) 444 | # loading explainable background samples and loading the explainer 445 | self.xai_background = getXAIBackground(data.shape[0]) 446 | self.explainer = shap.DeepExplainer(self.deepmodel, self.xai_background) 447 | preds = [] 448 | for d in range(data.shape[1]): 449 | # perform per dimension test 450 | x = np.sort(data[:, d]) 451 | x = np.expand_dims([x], axis=2) 452 | preds.append(self.deepmodel.predict(x, verbose=0)) 453 | 454 | decisions = np.argmax(np.array(preds).reshape(-1, 5), axis=1) > 0 455 | 456 | if np.mean(decisions) <= 0.1: 457 | y = "unif" 458 | else: 459 | pred_mean = np.mean(np.array(preds), axis=0) 460 | y = self.targetnames[np.argmax(pred_mean.flatten()[1:]) + 1] 461 | 462 | if include_proba: 463 | return y, preds 464 | return y 465 | 466 | def predict( 467 | self, 468 | data, 469 | corr_method="fdr_bh", 470 | alpha=0.01, 471 | show_figure=False, 472 | filename=None, 473 | print_type=True, 474 | ): 475 | """The main function used to detect Structural Bias. 476 | 477 | Args: 478 | data (dataframe): The matrix containing the final position values on F0. Note that these should be scaled 479 | in [0,1], and in the shape (n_samples, dimension), where n_samples is in [30, 50, 100, 600] 480 | corr_method (str, optional): Which type of p-value correction to apply. Recommended is 'fdr_bh', 481 | but 'fdr_by' and 'holm' are also supported.. Defaults to 'fdr_bh'. 482 | alpha (float, optional): The threshold for statistical significance. Defaults to 0.01. 483 | show_figure (bool, optional): Whether or not to create a plot of the final positions and the corresponding test rejections. Defaults to False. 484 | filename (string, optional): If not none, the name of the file to store the figure (only when show_figure is True). Defaults to None. 485 | print_type (bool, optional): Wheter or not to print the predicted type of SB. Defaults to True. 486 | 487 | Raises: 488 | ValueError: Unsupported sample size. 489 | 490 | Returns: 491 | dataframe, dict: rejection data, predicted Bias and type. 
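Example (illustrative sketch; `samples` below is a stand-in uniform array with a supported shape, not real optimizer output — in real use it would hold the scaled final positions of 100 independent runs of a 5-dimensional algorithm on f0):

    import numpy as np
    from BIAS import BIAS

    samples = np.random.uniform(size=(100, 5))  # stand-in data, already in [0, 1]
    rejections, prediction = BIAS().predict(samples, show_figure=False)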
492 | """ 493 | self.DIM = data.shape[1] 494 | n_samples = data.shape[0] 495 | if not n_samples in [30, 50, 100, 600]: 496 | raise ValueError("Sample size is not supported") 497 | if print_type: 498 | print( 499 | f"Running SB calculation with {self.DIM}-dimensional data of sample size {n_samples} (alpha = {alpha})" 500 | ) 501 | records = {} 502 | test_battery_per_dim = get_test_dict(n_samples) 503 | for tname, tfunc in test_battery_per_dim.items(): 504 | temp = [] 505 | for r in range(self.DIM): 506 | try: 507 | temp.append(tfunc(data[:, r], alpha=alpha)) 508 | except: 509 | next 510 | records[tname] = temp 511 | dt = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in records.items()])) 512 | dt_rejections = self.transform_to_reject_dt_corr( 513 | dt, alpha, n_samples, corr_method 514 | ) 515 | # Drop duplicate test 516 | dt_rejections = dt_rejections.drop("AD_pwr", axis=1) 517 | 518 | if show_figure: 519 | self.plot_swarm_with_heatmap(data, dt_rejections, filename) 520 | 521 | return dt_rejections, self.predict_type(dt_rejections, print_type) 522 | 523 | def transform_to_reject_dt_across(self, dt, alpha, n_samples): 524 | """Transform rejection data for across dimension tests. 525 | 526 | Args: 527 | dt (dataframe): Rejection dataframe. 528 | alpha (float): Signficance level. 529 | n_samples (int): Number of samples. 530 | 531 | Returns: 532 | dataframe: Transformed rejection data. 533 | """ 534 | crit_vals, crit_vals_new = self._load_ref_vals(n_samples, alpha, True) 535 | test_types_new = self._get_test_types() 536 | 537 | dt_rejections = pd.DataFrame() 538 | for colname in self.p_value_columns: 539 | dt_rejections[colname] = dt[colname] < alpha 540 | 541 | # Ugly solution to distinguish two-sided vs one-sided tests 542 | dt_rejections["kurtosis"] = (crit_vals["kurtosis_low"] > dt["kurtosis"]) | ( 543 | dt["kurtosis"] > crit_vals["kurtosis_high"] 544 | ) 545 | dt_rejections["mmpd"] = (crit_vals["mmpd_low"] > dt["mmpd"]) | ( 546 | dt["mmpd"] > crit_vals["mmpd_high"] 547 | ) 548 | dt_rejections["mi"] = (crit_vals["mi_low"] > dt["mi"]) | ( 549 | dt["mi"] > crit_vals["mi_high"] 550 | ) 551 | dt_rejections["med_ddlud"] = (crit_vals["med_ddlud_low"] > dt["med_ddlud"]) | ( 552 | dt["med_ddlud"] > crit_vals["med_ddlud_high"] 553 | ) 554 | for k, v in crit_vals.items(): 555 | if "kurt" in k or "low" in k or "high" in k: 556 | next 557 | else: 558 | if k in ["max_ddlud"]: 559 | dt_rejections[k] = dt[k] > v 560 | else: 561 | dt_rejections[k] = dt[k] < v 562 | 563 | for k, v in crit_vals_new.items(): 564 | if test_types_new[k] == 4: 565 | dt_rejections[k] = dt[k] < v 566 | else: 567 | dt_rejections[k] = dt[k] > v 568 | return dt_rejections 569 | 570 | def predict_multi_dim(self, data, alpha=0.01, print_type=True): 571 | """Predict Bias using across dimension tests. 572 | 573 | Args: 574 | data (dataframe): dataframe containing end positions. 575 | alpha (float, optional): Signficance level. Defaults to 0.01. 576 | print_type (bool, optional): Whether to output the type or not. Defaults to True. 577 | 578 | Raises: 579 | ValueError: unsupported sample size or dimension. 580 | 581 | Returns: 582 | list: List of failed tests that show potential bias. 
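Example (illustrative sketch; `samples` is a stand-in uniform array — across-dimension testing expects exactly 30 dimensions and a sample size in [30, 50, 100, 600]):

    import numpy as np
    from BIAS import BIAS

    samples = np.random.uniform(size=(100, 30))  # stand-in data, already in [0, 1]
    failed_tests = BIAS().predict_multi_dim(samples, alpha=0.01)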
583 | """ 584 | DIM = data.shape[1] 585 | n_samples = data.shape[0] 586 | if not n_samples in [30, 50, 100, 600]: 587 | raise ValueError("Sample size is not supported") 588 | if DIM != 30: 589 | raise ValueError( 590 | "Only 30-dimensional data is supported for across-dimension testing" 591 | ) 592 | if print_type: 593 | print( 594 | f"Running SB calculation with {DIM}-dimensional data of sample size {n_samples} (alpha = {alpha})" 595 | ) 596 | records = {} 597 | test_battery_across_dim = get_test_dict(n_samples, per_dim=False) 598 | for tname, tfunc in test_battery_across_dim.items(): 599 | try: 600 | records[tname] = tfunc(data) 601 | except: 602 | next 603 | # TODO: fix this function 604 | dt = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in records.items()])) 605 | dt_rejections = self.transform_to_reject_dt_across(dt, alpha, n_samples) 606 | failed_tests = [ 607 | x for x in dt_rejections.columns if np.sum(dt_rejections[x]) > 0 608 | ] 609 | if print_type: 610 | if len(failed_tests == 0): 611 | print("No clear evidence of bias detected") 612 | else: 613 | print( 614 | f"The following tests detected potential structural bias: {failed_tests}" 615 | ) 616 | return failed_tests 617 | -------------------------------------------------------------------------------- /BIAS/SB_Test_runner.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from functools import partial 3 | from multiprocessing import Pool, cpu_count 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import scipy.stats as ss 8 | from sklearn.feature_selection import mutual_info_regression 9 | from sklearn.metrics import ( 10 | adjusted_mutual_info_score, 11 | pairwise_distances, 12 | pairwise_distances_argmin_min, 13 | ) 14 | 15 | import rpy2.robjects as robjects 16 | from rpy2.robjects.packages import importr 17 | from .uniform_test import ddst_uniform_test 18 | 19 | importr("data.table") 20 | importr("goftest") 21 | pwr = importr("PoweR") 22 | 23 | robjects.r( 24 | """ 25 | R_test_ad <- function(x, max_=1) { 26 | return(ad.test(x, "punif", max=max_, min=0)[[2]]) 27 | } 28 | 29 | R_test_norm <- function(x, test='Shapiro') { 30 | qnorm_temp <- qnorm(x) 31 | qnorm_temp[is.infinite(qnorm_temp)] <- 4*sign(qnorm_temp[is.infinite(qnorm_temp)]) 32 | if (test == 'Shapiro') { 33 | return(shapiro.test(qnorm_temp)[[2]]) 34 | } else { 35 | return(AutoSEARCH::jb.test(qnorm_temp)$p.value) 36 | } 37 | } 38 | """ 39 | ) 40 | 41 | 42 | def get_mi(X, type_="med"): 43 | mutuals = [] 44 | for i in range(X.shape[1]): 45 | for j in range(i, X.shape[1]): 46 | if i != j: 47 | mutuals.append( 48 | mutual_info_regression(X[:, i].reshape(-1, 1), X[:, j])[0] 49 | ) 50 | if type_ == "med": 51 | return np.median(mutuals) 52 | return np.max(mutuals) 53 | 54 | 55 | def get_mmpd(X): 56 | pairwisedistances = [] 57 | # print(len(X)) 58 | for i in range(len(X)): 59 | one = X[i] 60 | rest = np.append(X[:i], X[i + 1 :], axis=0) 61 | res, res_dist = pairwise_distances_argmin_min(one.reshape(1, -1), rest) 62 | pairwisedistances.append(res_dist[0]) 63 | return np.max(pairwisedistances) 64 | 65 | 66 | def get_mddlud(X, type_="med"): 67 | baseline_space = np.linspace(0, 1, len(X)) 68 | lindist = [] 69 | for i in range(X.shape[1]): 70 | lindist.append(np.max(np.abs(np.sort(X[:, i]) - baseline_space))) 71 | if type_ == "med": 72 | return np.median(lindist) 73 | return max(lindist) 74 | 75 | 76 | def is_valid(x, centers, gap_size): 77 | for c in centers: 78 | if np.abs(x - c) < gap_size: 79 | return False 80 | 
return True 81 | 82 | 83 | def get_simulated_data(scen, rep=1000, n_samples=100, kwargs={}): 84 | if scen == "unif": 85 | data_arr = np.random.uniform(size=(n_samples, rep)) 86 | elif scen == "trunc_unif": 87 | min_ = kwargs["min"] 88 | max_ = kwargs["max"] 89 | data_arr = np.random.uniform(size=(n_samples, rep), low=min_, high=max_) 90 | elif scen == "spikes": 91 | data_arr = ( 92 | np.random.randint(kwargs["max"] + 1, size=(n_samples, rep)) / kwargs["max"] 93 | ) 94 | elif scen == "shifted_spikes": 95 | possible_vals = [i / kwargs["max"] for i in range(kwargs["max"] + 1)] 96 | translations = np.random.normal(0, kwargs["sigma"], size=(kwargs["max"] + 1)) 97 | possible_vals = [ 98 | i + j if 0 <= i + j <= 1 else i for i, j in zip(possible_vals, translations) 99 | ] 100 | data_arr = np.random.choice(possible_vals, size=(n_samples, rep)) 101 | elif scen == "norm": 102 | nr_req = n_samples * rep 103 | data_temp = np.random.normal(kwargs["mu"], kwargs["sigma"], size=(nr_req * 10)) 104 | data_arr = [x for x in data_temp if 0 < x < 1][:nr_req] 105 | data_arr = np.array(data_arr).reshape((n_samples, rep)) 106 | elif scen == "cauchy": 107 | nr_req = n_samples * rep 108 | data_temp = ss.cauchy.rvs(kwargs["mu"], kwargs["sigma"], size=(nr_req * 10)) 109 | data_arr = [x for x in data_temp if 0 < x < 1][:nr_req] 110 | data_arr = np.array(data_arr).reshape((n_samples, rep)) 111 | elif scen == "inv_cauchy": 112 | nr_req = n_samples * rep 113 | data_temp = ss.cauchy.rvs(kwargs["mu"], kwargs["sigma"], size=(nr_req * 10)) 114 | data_arr = [x for x in data_temp if 0 < x < 1][:nr_req] 115 | data_arr = [ 116 | 1 + kwargs["mu"] - x if x > kwargs["mu"] else kwargs["mu"] - x 117 | for x in data_arr 118 | ] # Not efficient, but works. Maybe improve later 119 | data_arr = np.array(data_arr).reshape((n_samples, rep)) 120 | elif scen == "inv_norm": 121 | nr_req = n_samples * rep 122 | data_temp = np.random.normal(kwargs["mu"], kwargs["sigma"], size=(nr_req * 10)) 123 | data_arr = [x for x in data_temp if 0 < x < 1][:nr_req] 124 | data_arr = [ 125 | 1 + kwargs["mu"] - x if x > kwargs["mu"] else kwargs["mu"] - x 126 | for x in data_arr 127 | ] # Not efficient, but works. 
Maybe improve later 128 | data_arr = np.array(data_arr).reshape((n_samples, rep)) 129 | elif scen == "gaps": 130 | temp = [] 131 | for _ in range(rep): 132 | data_temp = np.random.uniform(size=(n_samples * 10)) 133 | centers = np.random.uniform(size=kwargs["n_centers"]) 134 | temp.append( 135 | [x for x in data_temp if is_valid(x, centers, kwargs["sigma"])][ 136 | :n_samples 137 | ] 138 | ) 139 | data_arr = np.array(temp).transpose() 140 | elif scen == "consistent_gaps": 141 | temp = [] 142 | centers = np.random.uniform(size=kwargs["n_centers"]) 143 | for _ in range(rep): 144 | data_temp = np.random.uniform(size=(n_samples * 10)) 145 | temp.append( 146 | [x for x in data_temp if is_valid(x, centers, kwargs["sigma"])][ 147 | :n_samples 148 | ] 149 | ) 150 | data_arr = np.array(temp).transpose() 151 | elif scen == "clusters": 152 | temp = [] 153 | for _ in range(rep): 154 | centers = np.random.uniform(size=kwargs["n_centers"]) 155 | samples = [ 156 | np.random.normal(loc=x, scale=kwargs["sigma"]) 157 | for x in np.random.choice(centers, size=n_samples * 10) 158 | ] 159 | temp.append([x for x in samples if 0 < x < 1][:n_samples]) 160 | data_arr = np.array(temp).transpose() 161 | elif scen == "consistent_clusters": 162 | temp = [] 163 | centers = np.random.uniform(size=kwargs["n_centers"]) 164 | for _ in range(rep): 165 | samples = [ 166 | np.random.normal(loc=x, scale=kwargs["sigma"]) 167 | for x in np.random.choice(centers, size=n_samples * 10) 168 | ] 169 | temp.append([x for x in samples if 0 < x < 1][:n_samples]) 170 | data_arr = np.array(temp).transpose() 171 | elif scen == "part_unif": 172 | temp = [] 173 | for _ in range(rep): 174 | n_unif = int(np.ceil(kwargs["frac_unif"] * n_samples)) 175 | data_temp = np.random.uniform(size=(n_unif)) 176 | new_points = [ 177 | np.random.normal(loc=x, scale=kwargs["sigma"]) 178 | for x in np.random.choice(data_temp, size=n_samples * 10) 179 | ] 180 | data_new = [x for x in new_points if 0 < x < 1][:n_samples] 181 | # deviations = np.random.normal(size = len(data_temp), scale=kwargs['sigma']) 182 | # new_points = [x + y for x,y in zip(data_temp, deviations) if 0 < x+y < 1] 183 | data_new = np.append(data_temp[:n_unif], data_new[: (n_samples - n_unif)]) 184 | temp.append(data_new) 185 | data_arr = np.array(temp).transpose() 186 | elif scen == "bound_thing": 187 | temp = [] 188 | for _ in range(rep): 189 | n_01 = int(np.ceil((1 - kwargs["frac_between"]) * n_samples)) 190 | data_temp = np.random.uniform(size=(n_samples)) 191 | data_temp[ 192 | np.random.choice(range(n_samples), n_01, replace=False) 193 | ] = np.random.choice( 194 | [0, 1], size=n_01, p=[kwargs["frac_0"], 1 - kwargs["frac_0"]] 195 | ) 196 | # for idx in np.random.randint(0, n_samples, n_01): 197 | # data_temp[idx] = np.random.csv" 198 | temp.append(np.array(data_temp)) 199 | data_arr = np.array(temp).transpose() 200 | return data_arr 201 | 202 | 203 | scenario_dict = { 204 | "unif": None, 205 | "trunc_unif": ["min", "max"], 206 | "spikes": ["max"], 207 | "shifted_spikes": ["max", "sigma"], 208 | "norm": ["sigma", "mu"], 209 | "inv_norm": ["sigma", "mu"], 210 | "cauchy": ["sigma", "mu"], 211 | "inv_cauchy": ["sigma", "mu"], 212 | "gaps": ["n_centers", "sigma"], 213 | "clusters": ["n_centers", "sigma"], 214 | "part_unif": ["frac_unif", "sigma"], 215 | } 216 | 217 | scenario_dict_across = { 218 | "unif": None, 219 | "trunc_unif": ["min", "max"], 220 | "spikes": ["max"], 221 | "shifted_spikes": ["max", "sigma"], 222 | "norm": ["sigma", "mu"], 223 | "inv_norm": ["sigma", "mu"], 224 | 
"cauchy": ["sigma", "mu"], 225 | "inv_cauchy": ["sigma", "mu"], 226 | "gaps": ["n_centers", "sigma"], 227 | "consistent_gaps": ["n_centers", "sigma"], 228 | "clusters": ["n_centers", "sigma"], 229 | "consistent_clusters": ["n_centers", "sigma"], 230 | "part_unif": ["frac_unif", "sigma"], 231 | } 232 | 233 | # Note: this file is set up terribly (since it is derived from my notebook-code). TODO: Figure out a better way to structure this!!! 234 | 235 | 236 | def get_test_dict(n_samples, per_dim=True): 237 | ### Start by setting up the reference values which need to be gotten from simulations ### 238 | 239 | # spacing-values 240 | dist_vals_rand = np.array( 241 | [ 242 | np.diff(np.sort(np.append(np.random.uniform(size=n_samples), [0, 1]))) 243 | for _ in range(1000) 244 | ] 245 | ).reshape(-1) 246 | dist_vals_rand2 = [] 247 | for rep in range(1000): 248 | x = np.sort(np.append(np.random.uniform(size=n_samples), [0, 1])) 249 | dist_vals_rand2.append(x[2:] - x[:-2]) 250 | dist_vals_rand2 = np.array(dist_vals_rand2).reshape(-1) 251 | 252 | dist_vals_rand3 = [] 253 | for rep in range(1000): 254 | x = np.sort(np.append(np.random.uniform(size=n_samples), [0, 1])) 255 | dist_vals_rand3.append(x[3:] - x[:-3]) 256 | dist_vals_rand3 = np.array(dist_vals_rand3).reshape(-1) 257 | 258 | # #Range values 259 | # dists = [np.max(x) - np.min(x) for x in np.random.uniform(size=(10000,n_samples))] 260 | # mins = [np.min(x) for x in np.random.uniform(size=(10000,n_samples))] 261 | # maxs = [np.max(x) for x in np.random.uniform(size=(10000,n_samples))] 262 | 263 | # #linspace baseline 264 | comp_to = np.linspace(0, 1, num=n_samples) 265 | # wassersteins = [np.sum(np.abs(np.sort(x) - comp_to)) for x in np.random.uniform(size=(10000,n_samples))] 266 | # lindist_min = [np.min(np.abs(np.sort(x) - comp_to)) for x in np.random.uniform(size=(10000,n_samples))] 267 | # lindist_max = [np.max(np.abs(np.sort(x) - comp_to)) for x in np.random.uniform(size=(10000,n_samples))] 268 | 269 | # #max pairwise distances 270 | # max_pair_dists = [np.max(np.diff(np.sort(np.random.uniform(size=(n_samples))))) for x in range(10000)] 271 | 272 | ### Define the tests. For now, this is needed here, since it relies on the simulated reference values :( 273 | def test_spacing(x, m=1, alpha=0.01): 274 | x = np.sort(np.append(x, [0, 1])) 275 | if m == 1: 276 | p = ss.ks_2samp(np.diff(x), dist_vals_rand)[1] 277 | elif m == 2: 278 | p = ss.ks_2samp(x[2:] - x[:-2], dist_vals_rand2)[1] 279 | else: 280 | p = ss.ks_2samp(x[3:] - x[:-3], dist_vals_rand3)[1] 281 | return p # < alpha 282 | 283 | def test_range(x, alpha=0.01): 284 | return np.max(x) - np.min(x) # <= np.quantile(dists, alpha) 285 | 286 | def test_edges(x, type_="min", alpha=0.01): 287 | if type_ == "min": 288 | return np.min(x) # >= np.quantile(mins, 1-alpha) 289 | else: 290 | return np.max(x) # <= np.quantile(maxs, alpha) 291 | 292 | def test_ad(x, transform=False, alpha=0.01): 293 | if transform: 294 | x = np.abs(x - 0.5) 295 | return robjects.globalenv["R_test_ad"](robjects.FloatVector(x), 0.5)[ 296 | 0 297 | ] # < alpha 298 | return robjects.globalenv["R_test_ad"](robjects.FloatVector(x))[0] # < alpha 299 | 300 | def test_normal_transformed(x, test="Shapiro", alpha=0.01): 301 | return robjects.globalenv["R_test_norm"](robjects.FloatVector(x), test)[ 302 | 0 303 | ] # < alpha 304 | 305 | # TODO: fix the naming scheme (this is mddlud!) 
306 | def test_lindist_dim(x, type_="min", alpha=0.01): 307 | if type_ == "max": 308 | return np.max( 309 | np.abs(np.sort(x) - comp_to) 310 | ) # >= np.quantile(lindist_max, 1-alpha) 311 | else: 312 | return np.min( 313 | np.abs(np.sort(x) - comp_to) 314 | ) # <= np.quantile(lindist_min, alpha) 315 | 316 | def test_pairwise_dists_dim(x, type_="min", alpha=0.01): 317 | if type_ == "max": 318 | return np.max( 319 | np.diff(np.sort(x)) 320 | ) # >= np.quantile(max_pair_dists, 1-alpha) 321 | else: 322 | return np.max(np.diff(np.sort(x))) # <= np.quantile(max_pair_dists, alpha) 323 | 324 | def test_kurtosis(x, alpha=0.01): 325 | return ss.kurtosis(ss.norm.ppf(x)) 326 | 327 | # return not (np.quantile(kurts,alpha/2) < ss.kurtosis(ss.norm.ppf(x)) < np.quantile(kurts,1-alpha/2)) 328 | 329 | def test_wasserstein(x, alpha=0.01): 330 | # Note: not scaled for sample size (won't matter for result, but need to keep in mind that right baseline needs to be used!) 331 | return np.sum( 332 | np.abs(np.sort(x) - comp_to) 333 | ) # > np.quantile(wassersteins,1-alpha) 334 | 335 | def test_ddst(x, alpha=0.01): 336 | return ddst_uniform_test(x, nr=1000, compute_p=True)["p_value"] # < alpha 337 | 338 | def test_pwr(x, test_nr, alpha=0.01): 339 | return pwr.statcompute(test_nr, robjects.FloatVector(x))[0][0] 340 | 341 | test_battery_per_dim = { 342 | "1-spacing": test_spacing, 343 | "2-spacing": partial(test_spacing, m=2), 344 | "3-spacing": partial(test_spacing, m=3), 345 | "range": test_range, 346 | "min": test_edges, 347 | "max": partial(test_edges, type_="max"), 348 | "ad": test_ad, 349 | "ad_transform": partial(test_ad, transform=True), 350 | "shapiro": test_normal_transformed, 351 | "jb": partial(test_normal_transformed, test="jb"), 352 | "mdd_min": test_lindist_dim, 353 | "mdd_max": partial(test_lindist_dim, type_="max"), 354 | "kurtosis": test_kurtosis, 355 | "mmpd_max": test_pairwise_dists_dim, 356 | "mmpd_min": partial(test_pairwise_dists_dim, type_="max"), 357 | "wasserstein": test_wasserstein, 358 | "ddst": test_ddst, 359 | "kolmogorov": partial(test_pwr, test_nr=63), 360 | "CvM": partial(test_pwr, test_nr=64), 361 | "AD_pwr": partial(test_pwr, test_nr=65), 362 | "Durbin": partial(test_pwr, test_nr=66), 363 | "Kuiper": partial(test_pwr, test_nr=67), 364 | "HG1": partial(test_pwr, test_nr=68), 365 | "HG2": partial(test_pwr, test_nr=69), 366 | "Greenwood": partial(test_pwr, test_nr=70), 367 | "QM": partial(test_pwr, test_nr=71), 368 | "RC": partial(test_pwr, test_nr=72), 369 | "Moran": partial(test_pwr, test_nr=73), 370 | "Cressie1": partial(test_pwr, test_nr=74), 371 | "Cressie2": partial(test_pwr, test_nr=75), 372 | "Vasicek": partial(test_pwr, test_nr=76), 373 | "Swartz": partial(test_pwr, test_nr=77), 374 | "Morales": partial(test_pwr, test_nr=78), 375 | "Pardo": partial(test_pwr, test_nr=79), 376 | "Marhuenda": partial(test_pwr, test_nr=80), 377 | "Zhang1": partial(test_pwr, test_nr=81), 378 | "Zhang2": partial(test_pwr, test_nr=82), 379 | } 380 | 381 | if per_dim: 382 | return test_battery_per_dim 383 | 384 | def test_mi(X, type_="med", alpha=0.01): 385 | mi = get_mi(X, type_) 386 | if type_ == "med": 387 | return mi # > np.quantile(med_mis, 1-alpha) 388 | return mi # > np.quantile(max_mis, 1-alpha) 389 | 390 | def test_mmpd(X, alpha=0.01): 391 | mmpd = get_mmpd(X) 392 | return mmpd # > np.quantile(mmpds, 1-alpha) 393 | 394 | def test_mddlud(X, type_="med", alpha=0.01): 395 | mddlud = get_mddlud(X, type_) 396 | if type_ == "med": 397 | return mddlud # > np.quantile(med_ddluds, 1-alpha) 398 | 
return mddlud # > np.quantile(max_ddluds, 1-alpha) 399 | 400 | def test_spacing_across(X, m=1, alpha=0.01): 401 | # Not very efficient, but works for now 402 | diffs = [] 403 | for dim in range(X.shape[1]): 404 | x = np.sort(np.append(X[:, dim], [0, 1])) 405 | if m == 1: 406 | diffs.append(np.diff(x)) 407 | else: 408 | diffs.append(x[m:] - x[: (-1 * m)]) 409 | diffs = np.array(diffs).reshape(-1) 410 | if m == 1: 411 | p = ss.ks_2samp(diffs, dist_vals_rand)[1] 412 | elif m == 2: 413 | p = ss.ks_2samp(diffs, dist_vals_rand2)[1] 414 | else: 415 | p = ss.ks_2samp(diffs, dist_vals_rand3)[1] 416 | return p # < alpha 417 | 418 | test_battery_across_dim = { 419 | "mi": test_mi, 420 | # 'max_mi' : partial(test_mi, type_='max'), 421 | "mmpd": test_mmpd, 422 | "med_ddlud": test_mddlud, 423 | "max_ddlud": partial(test_mddlud, type_="max"), 424 | } 425 | 426 | def run_test_aggr(x, test, **kwargs): 427 | y = x.reshape(-1) 428 | return test(y, **kwargs) 429 | 430 | test_battery_aggr = {} 431 | for k, v in test_battery_per_dim.items(): 432 | if ( 433 | "mdd" not in k 434 | and "mmpd" not in k 435 | and "spacing" not in k 436 | and "wasser" not in k 437 | ): 438 | test_battery_aggr[k] = partial(run_test_aggr, test=v) 439 | 440 | test_battery_aggr["1-spacing"] = test_spacing_across 441 | test_battery_aggr["2-spacing"] = partial(test_spacing_across, m=2) 442 | test_battery_aggr["3-spacing"] = partial(test_spacing_across, m=3) 443 | test_battery_aggr.pop("range", None) 444 | test_battery_aggr.pop("min", None) 445 | test_battery_aggr.pop("max", None) 446 | 447 | return {**test_battery_across_dim, **test_battery_aggr} 448 | 449 | 450 | def runParallelFunction(runFunction, arguments): 451 | """ 452 | Return the output of runFunction for each set of arguments, 453 | making use of as much parallelization as possible on this system 454 | 455 | :param runFunction: The function that can be executed in parallel 456 | :param arguments: List of tuples, where each tuple are the arguments 457 | to pass to the function 458 | :return: 459 | """ 460 | 461 | arguments = list(arguments) 462 | p = Pool(min(cpu_count(), len(arguments))) 463 | # local_func = partial(func_star, func=runFunction) 464 | results = p.map(runFunction, arguments) 465 | p.close() 466 | return results 467 | 468 | 469 | def run_scenario_across( 470 | scen_list, foldername="", rep=100, dims=30, alpha=0.01, n_samples=100 471 | ): 472 | np.random.seed(42) 473 | records = {} 474 | kwargs = scen_list[1] 475 | scen = scen_list[0] 476 | test_battery_across = get_test_dict(n_samples, False) 477 | if n_samples > 150: 478 | test_battery_across.pop("jb", None) 479 | test_battery_across.pop("shapiro", None) 480 | for r in range(rep): 481 | data_arr = get_simulated_data(scen, dims, n_samples, kwargs) 482 | # print(data_arr.shape) 483 | for tname, tfunc in test_battery_across.items(): 484 | print(tname) 485 | if tname in records: 486 | records[tname].append(tfunc(data_arr, alpha=alpha)) 487 | else: 488 | records[tname] = [tfunc(data_arr, alpha=alpha)] 489 | dt = pd.DataFrame.from_dict(records) 490 | scen_name = f"{foldername}S{scen}" 491 | for k, v in kwargs.items(): 492 | dt[f"{k}_"] = v 493 | scen_name = f"{scen_name}_{k}_{v}" 494 | scen_name = f"{scen_name}.csv" 495 | dt["scen"] = scen 496 | dt["n_samples"] = n_samples 497 | dt["dims"] = dims 498 | dt.to_csv(scen_name) 499 | 500 | 501 | # return dt 502 | 503 | 504 | def run_scenario(scen_list, foldername="", rep=1500, alpha=0.01, n_samples=100): 505 | np.random.seed(42) 506 | kwargs = scen_list[1] 507 | scen = 
scen_list[0] 508 | # print(scen) 509 | data_arr = get_simulated_data(scen, rep, n_samples, kwargs) 510 | records = {} 511 | test_battery_per_dim = get_test_dict(n_samples) 512 | for tname, tfunc in test_battery_per_dim.items(): 513 | print(tname) 514 | temp = [] 515 | for r in range(rep): 516 | temp.append(tfunc(data_arr[:, r], alpha=alpha)) 517 | records[tname] = temp 518 | dt = pd.DataFrame.from_dict(records) 519 | scen_name = f"{foldername}S{scen}" 520 | for k, v in kwargs.items(): 521 | dt[f"{k}_"] = v 522 | scen_name = f"{scen_name}_{k}_{v}" 523 | scen_name = f"{scen_name}.csv" 524 | # print(scen_name) 525 | dt["scen"] = scen 526 | dt["n_samples"] = n_samples 527 | dt.to_csv(scen_name) 528 | 529 | 530 | # return dt 531 | 532 | 533 | def get_scens_per_dim(): 534 | scens = [["unif", {}]] 535 | for temp in [0.025, 0.05, 0.1, 0.2]: 536 | scens.append(["trunc_unif", {"min": temp / 2, "max": 1 - temp / 2}]) 537 | for temp in [0.025, 0.05, 0.1, 0.2]: 538 | scens.append(["trunc_unif", {"min": temp, "max": 1}]) 539 | for max_ in [25, 50, 100, 150, 200, 250]: 540 | scens.append(["spikes", {"max": max_}]) 541 | for max_ in [25, 50, 100, 150, 200, 250]: 542 | for sigma in [0.005, 0.01, 0.02, 0.03, 0.04, 0.05]: 543 | scens.append(["shifted_spikes", {"max": max_, "sigma": sigma}]) 544 | for s in ["norm", "inv_norm", "cauchy", "inv_cauchy"]: 545 | for sigma in [0.1, 0.2, 0.3, 0.4]: 546 | for mu in [0.5, 0.6, 0.7]: 547 | scens.append([s, {"sigma": sigma, "mu": mu}]) 548 | for n_centers in [1, 2, 3, 4, 5]: 549 | for gap_rad in [0.01, 0.02, 0.03, 0.04, 0.05]: 550 | scens.append(["gaps", {"n_centers": n_centers, "sigma": gap_rad}]) 551 | for n_centers in [1, 2, 3, 4, 5]: 552 | for gap_rad in [0.01, 0.025, 0.05, 0.1, 0.2, 0.3]: 553 | scens.append(["clusters", {"n_centers": n_centers, "sigma": gap_rad}]) 554 | for n_unif in [0.1, 0.25, 0.5]: 555 | for sigma in [0.01, 0.02, 0.05, 0.1]: 556 | scens.append(["part_unif", {"frac_unif": n_unif, "sigma": sigma}]) 557 | for f_0 in [0.1, 0.35, 0.45, 0.5]: 558 | for f_between in [0.5, 0.25, 0.1, 0.05, 0.025, 0.01]: 559 | scens.append(["bound_thing", {"frac_between": f_between, "frac_0": f_0}]) 560 | return scens 561 | 562 | 563 | def get_scens_across_dim(): 564 | scens = [["unif", {}]] 565 | for temp in [0.01, 0.025, 0.05, 0.1, 0.2]: 566 | scens.append(["trunc_unif", {"min": temp / 2, "max": 1 - temp / 2}]) 567 | scens.append(["trunc_unif", {"min": temp, "max": 1}]) 568 | for max_ in [25, 50, 100, 150, 200, 250, 500, 1000]: 569 | scens.append(["spikes", {"max": max_}]) 570 | for sigma in [0.005, 0.01, 0.02, 0.03, 0.04, 0.05]: 571 | scens.append(["shifted_spikes", {"max": max_, "sigma": sigma}]) 572 | for sigma in [0.1, 0.2, 0.3, 0.4, 0.5]: 573 | for mu in [0.5, 0.6, 0.7]: 574 | scens.append(["norm", {"sigma": sigma, "mu": mu}]) 575 | scens.append(["inv_norm", {"sigma": sigma, "mu": mu}]) 576 | scens.append(["cauchy", {"sigma": sigma, "mu": mu}]) 577 | scens.append(["inv_cauchy", {"sigma": sigma, "mu": mu}]) 578 | for n_centers in [1, 2, 3, 4, 5]: 579 | for gap_rad in [0.01, 0.02, 0.03, 0.04, 0.05]: 580 | scens.append( 581 | ["consistent_gaps", {"n_centers": n_centers, "sigma": gap_rad}] 582 | ) 583 | scens.append(["gaps", {"n_centers": n_centers, "sigma": gap_rad}]) 584 | for n_centers in [1, 2, 3, 4, 5]: 585 | for gap_rad in [0.01, 0.025, 0.05, 0.1, 0.2, 0.3]: 586 | scens.append( 587 | ["consistent_clusters", {"n_centers": n_centers, "sigma": gap_rad}] 588 | ) 589 | scens.append(["clusters", {"n_centers": n_centers, "sigma": gap_rad}]) 590 | for n_unif in [0.1, 
0.25, 0.5]: 591 | for sigma in [0.01, 0.02, 0.05, 0.1]: 592 | scens.append(["part_unif", {"frac_unif": n_unif, "sigma": sigma}]) 593 | return scens 594 | 595 | 596 | def get_scens_inv(): 597 | # Get only the inv-based scenarios 598 | scens = [] 599 | for sigma in [0.1, 0.2, 0.3, 0.4, 0.5]: 600 | for mu in [0.6, 0.7]: 601 | scens.append(["inv_norm", {"sigma": sigma, "mu": mu}]) 602 | scens.append(["inv_cauchy", {"sigma": sigma, "mu": mu}]) 603 | return scens 604 | 605 | 606 | def get_scens_bound(): 607 | # Get only the added heavy-bound scenario 608 | scens = [] 609 | for f_0 in [0.1, 0.35, 0.45, 0.5]: 610 | for f_between in [0.5, 0.25, 0.1, 0.05, 0.025, 0.01]: 611 | scens.append(["bound_thing", {"frac_between": f_between, "frac_0": f_0}]) 612 | return scens 613 | 614 | 615 | def run_test_cases(n_samples, fname="Datatables", per_dim=True, rep=1500): 616 | if per_dim: 617 | scens = get_scens_per_dim() 618 | foldername = f"{fname}/S{n_samples}/" 619 | partial_run = partial( 620 | run_scenario, foldername=foldername, n_samples=n_samples, rep=rep 621 | ) 622 | runParallelFunction(partial_run, scens) 623 | else: 624 | scens = get_scens_across_dim() 625 | foldername = f"{fname}/S{n_samples}_Across/" 626 | partial_run = partial( 627 | run_scenario_across, foldername=foldername, n_samples=n_samples 628 | ) 629 | runParallelFunction(partial_run, scens) 630 | 631 | 632 | if __name__ == "__main__": 633 | idx_nr = int(sys.argv[1]) 634 | # rep = int(sys.argv[2]) 635 | 636 | # idx_nr decides which experiment is run (division on nodes) 637 | fname = "/var/scratch/dlvermet/SB" 638 | # fname = "Datatables" 639 | run_per = idx_nr < 4 640 | s = [30, 50, 100, 600][idx_nr % 4] 641 | # for s in [30, 50, 100, 600]: 642 | run_test_cases(s, fname, run_per) 643 | # run_test_cases(100) 644 | # alpha = [0.05, 0.01, ] 645 | --------------------------------------------------------------------------------
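
Usage note: besides going through the BIAS class, the per-dimension test battery built by get_test_dict in BIAS/SB_Test_runner.py can be exercised directly. The snippet below is only an illustrative sketch, not part of the package; it assumes the R packages installed by BIAS/install.r are available (importing BIAS.SB_Test_runner loads data.table, goftest and PoweR via rpy2), and the `samples` array is a hypothetical placeholder for 100 final positions of a 5-dimensional optimizer scaled to [0, 1].

    import numpy as np
    from BIAS.SB_Test_runner import get_test_dict

    # placeholder data: 100 final positions of a 5-dimensional algorithm, scaled to [0, 1]
    samples = np.random.uniform(size=(100, 5))

    # per-dimension battery; n_samples must match the number of runs (rows)
    tests = get_test_dict(n_samples=100)
    results = {
        name: [tfunc(samples[:, d], alpha=0.01) for d in range(samples.shape[1])]
        for name, tfunc in tests.items()
    }
    # each entry holds one value per dimension: a p-value for tests such as "ad" or
    # "kolmogorov", or a raw statistic for tests such as "range" and "kurtosis"
    print(results["ad"])

Interpreting the raw values against the simulated reference distributions (as done in transform_to_reject_dt_corr and transform_to_reject_dt_across above) is still needed to turn them into rejection decisions.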