├── .editorconfig ├── .flake8 ├── .gitattributes ├── .github └── workflows │ └── python-package.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE.md ├── README.md ├── cnil ├── CNIL_opinion_anonymeter.pdf └── CNIL_opinion_anonymeter_courtesy_translation.pdf ├── notebooks ├── anonymeter_example.ipynb └── datasets │ ├── adults_control.csv │ ├── adults_syn_ctgan.csv │ └── adults_train.csv ├── pyproject.toml ├── src └── anonymeter │ ├── __init__.py │ ├── evaluators │ ├── __init__.py │ ├── inference_evaluator.py │ ├── linkability_evaluator.py │ └── singling_out_evaluator.py │ ├── neighbors │ ├── __init__.py │ └── mixed_types_kneighbors.py │ ├── preprocessing │ ├── __init__.py │ ├── transformations.py │ └── type_detection.py │ └── stats │ ├── __init__.py │ └── confidence.py └── tests ├── __init__.py ├── datasets ├── adults_ori.csv └── adults_syn.csv ├── fixtures.py ├── test_confidence.py ├── test_inference_evaluator.py ├── test_linkability_evaluator.py ├── test_mixed_types_kneigbors.py ├── test_singling_out_evaluator.py ├── test_transformations.py └── test_type_detection.py /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | end_of_line = lf 5 | charset = utf-8 6 | trim_trailing_whitespace = true 7 | insert_final_newline = true 8 | 9 | [*.cfg] 10 | indent_style = space 11 | indent_size = 8 12 | 13 | [*.{yml,yaml,md,markdown}] 14 | indent_style = space 15 | indent_size = 2 16 | 17 | [*.{sh,py,pyi,js,ts,json,xml,css,handlebars}] 18 | indent_style = space 19 | indent_size = 4 20 | 21 | [*.py] 22 | profile = black 23 | 24 | [*.html] 25 | indent_style = space 26 | indent_size = 2 27 | max_line_length=120 28 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | 3 | # References: 4 | # 
https://flake8.readthedocs.io/en/latest/user/configuration.html 5 | # https://flake8.readthedocs.io/en/latest/user/error-codes.html 6 | 7 | ignore = 8 | D100 # Missing docstring in public module 9 | D107 # Missing docstring in __init__ 10 | D401 # First line should be in imperative mood 11 | 12 | # flake8-docstring adds D??? errors to flake8 output. 13 | # Those errors are disabled for the code listed below. 14 | # Format: one file (or files if placeholders used) per line, 15 | # then "colon" and finally the warnings (e.g. D404) or group of 16 | # warnings (e.g. D or D4) to ignore per file. 17 | per-file-ignores = 18 | docs/*:D 19 | setup.py:D 20 | tests/*:D 21 | 22 | exclude = 23 | .git,.vscode,env,.env,.venv,.pytest_cache,.mypy_cache, **/__init__.py, 24 | build,dist,htmlcov,setup.py 25 | 26 | max-line-length = 120 27 | # black enforces an equal amount of whitespace around slice operators, we need to ignore flake8's rule 28 | # https://black.readthedocs.io/en/stable/guides/using_black_with_other_tools.html#configuration 29 | extend-ignore = E203 30 | 31 | max-complexity = 18 32 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.py diff=python 2 | *.png binary 3 | *.jpg binary 4 | *.csv text -whitespace 5 | *.rst text 6 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | # We test only the minimum and the maximum supported versions of python 17 | python-version: ["3.8", "3.11"] 18 | pandas-version: ["1.4", "2.1"] 19 | exclude: 20 | - python-version: "3.8" 21 | 
pandas-version: "2.1" 22 | 23 | steps: 24 | - uses: actions/checkout@v2 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v2 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install pandas~=${{ matrix.pandas-version }} 33 | pip install -e '.[dev]' 34 | - name: Ruff Ruff 35 | run: | 36 | ruff ./src ./tests 37 | - name: mypy check 38 | run: | 39 | mypy ./src ./tests 40 | - name: Test with pytest 41 | run: | 42 | pytest 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.3.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | - id: detect-private-key 8 | - id: check-added-large-files 9 | - id: mixed-line-ending 10 | args: ['--fix', 'lf'] 11 | 12 | - repo: https://github.com/jumanjihouse/pre-commit-hooks 13 | rev: 3.0.0 14 | hooks: 15 | - id: git-check 16 | 17 | - repo: https://github.com/astral-sh/ruff-pre-commit 18 | rev: v0.1.7 19 | hooks: 20 | - id: ruff 21 | - id: ruff-format 22 | 23 | - repo: local 24 | hooks: 25 | - id: mypy 26 | name: mypy 27 | language: system 28 | entry: bash -c 'mypy "$@"' -- 29 | types: [python] 30 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
7 | 8 | ## [Unreleased] 9 | 10 | ## [1.0.0] - 2024-02-02 11 | 12 | ### Changed 13 | 14 | - numba is updated to 0.58 to allow for the newer numpy version 15 | - numpy version range is adapted accordingly to numba's requirements 16 | - python 3.11 is allowed 17 | - pandas version is relaxed to allow for pandas >= 2 18 | * added additional CI pipeline for pandas 2 19 | 20 | ### Fixed 21 | 22 | - singling out evaluators getting stuck on multivariate queries 23 | 24 | ## [0.0.2] - 2023-07-10 25 | 26 | ### Added 27 | 28 | - CNIL mention (#18) 29 | - Customized logging on module level (#19) 30 | 31 | ### Fixed 32 | 33 | - Pre-commit errors (#19) 34 | 35 | 36 | ## [0.0.1] - 2023-04-24 37 | 38 | ### Added 39 | 40 | - Initial release 41 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution Guide 2 | 3 | ## Releasing a new version 4 | 5 | ### Building the new version 6 | 7 | 1. Increment the version in `pyproject.toml`. 8 | 9 | 1. Update the `CHANGELOG.md`. 10 | 11 | - Follow [the guidelines](https://keepachangelog.com/) 12 | - Rename `Unreleased` section to the new version 13 | - Create a new empty `Unreleased` section 14 | 15 | 1. Create a PR with the title `Release [version]`, ask for a review. 16 | 17 | 1. Publish the package to Test PyPi and verify that it's working correctly 18 | (the instructions are below). 19 | 20 | 1. Merge the PR. 21 | 22 | 1. [Create new release](https://github.com/statice/anonymeter/releases/new) 23 | in github 24 | 25 | - specify the new tag which equals to the new `[version]` 26 | - copy paste the new release notes from the `CHANGELOG.md` 27 | 28 | 29 | ### Publishing to Test PyPi 30 | 31 | Don't forget to pull the latest `main`. 
32 | 33 | Install development dependencies: 34 | ```bash 35 | pip install ".[dev]" 36 | ``` 37 | 38 | Build the source and wheel distributions: 39 | ```bash 40 | rm -rf ./dist # clean the build directory if necessary 41 | python -m build 42 | ``` 43 | 44 | Log in to Test PyPi. Create a new account if you don't have one yet 45 | and ask to be added as a collaborator for Anonymeter. 46 | 47 | Get the token from [Test PyPi](https://test.pypi.org/manage/account/#api-tokens) 48 | and save it as suggested to `$HOME/.pypirc`: 49 | ```toml 50 | [testpypi] 51 | username = __token__ 52 | password = YOUR_TOKEN_HERE 53 | ``` 54 | 55 | Upload the artifacts to Test PyPi: 56 | ```bash 57 | twine upload --repository testpypi dist/* 58 | ``` 59 | 60 | Test that the package installs and works properly. For example, 61 | you can create a new virtualenv and try to install the package there. 62 | ```bash 63 | mkdir ~/test-anonymeter # create some test directory 64 | cd ~/test-anonymeter 65 | python -m venv .venv # create new virtual env 66 | source .venv/bin/activate 67 | asdf reshim python # in case you use asdf 68 | pip install --upgrade pip 69 | pip install --index-url https://test.pypi.org/simple anonymeter==NEW_VERSION 70 | ``` 71 | 72 | You can check that anonymeter is working by running it against the original tests. 73 | For example, if you have the Anonymeter repository checked out in `~/code/anonymeter`: 74 | ``` 75 | ln -s ~/code/anonymeter/tests ~/test-anonymeter/tests 76 | pip install pytest 77 | python -m pytest 78 | ``` 79 | 80 | ### Publishing to PyPi 81 | 82 | Once you have tested the package with Test PyPi, you're ready to publish to 83 | the original PyPi. 84 | 85 | Pull the latest `main` and build the package as described above. 86 | 87 | Log in to PyPi. Create a new account if you don't have one yet 88 | and ask to be added as a collaborator for Anonymeter. 
89 | 90 | Get the token from PyPi: https://pypi.org/manage/account/token 91 | and add it as suggested to `$HOME/.pypirc`: 92 | ```toml 93 | [pypi] 94 | username = __token__ 95 | password = YOUR_TOKEN_HERE 96 | ``` 97 | 98 | Upload the artifacts to PyPi: 99 | ```bash 100 | twine upload dist/* 101 | ``` 102 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The Clear BSD License 2 | 3 | Copyright (c) 2022 Anonos IP LLC (IP Owner) 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without modification, 7 | are permitted (subject to the limitations in the disclaimer below) provided that 8 | the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright 14 | notice, this list of conditions and the following disclaimer in the 15 | documentation and/or other materials provided with the distribution. 16 | 17 | * Neither the name of the IP owner nor the names of its 18 | contributors may be used to endorse or promote products derived from this 19 | software without specific prior written permission. 20 | 21 | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY 22 | THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE IP OWNER AND 23 | CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 25 | PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE IP OWNER OR 26 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 27 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 28 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 29 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER 30 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 32 | POSSIBILITY OF SUCH DAMAGE. 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Anonymeter: Unified Framework for Quantifying Privacy Risk in Synthetic Data 2 | 3 | `Anonymeter` is a unified statistical framework to jointly quantify different 4 | types of privacy risks in synthetic tabular datasets. `Anonymeter` is equipped 5 | with attack-based evaluations for the **Singling Out**, **Linkability**, and 6 | **Inference** risks, which are the three key indicators of factual anonymization 7 | according to the [Article 29 Working Party](https://ec.europa.eu/justice/article-29/documentation/opinion-recommendation/files/2014/wp216_en.pdf). 8 | 9 | 10 | > Anonymeter has been positively reviewed by the technical experts from the [Commission Nationale de l’Informatique et des Libertés (CNIL)](https://www.cnil.fr/en/home) which, in their words, _“have not identified any reason suggesting that the proposed set of methods could not allow to effectively evaluate the extent to which the aforementioned three criteria are fulfilled or not in the context of production and use of synthetic datasets”_. The CNIL also expressed the opinion that the results of Anonymeter (i.e. 
the three risk scores) **should be used by the data controller to decide whether the residual risks of re-identification are acceptable or not, and whether the dataset could be considered anonymous**. [Here](/cnil) you can find the full letter with the CNIL opinion on Anonymeter. 11 | 12 | 13 | ## `Anonymeter` in a nutshell 14 | 15 | In `Anonymeter` each privacy risk is derived from a privacy attacker whose task is to use the synthetic dataset 16 | to come up with a set of *guesses* of the form: 17 | - "there is only one person with attributes X, Y, and Z" (singling out) 18 | - "records A and B belong to the same person" (linkability) 19 | - "a person with attributes X and Y also has Z" (inference) 20 | 21 | Each evaluation consists of running three different attacks: 22 | - the "main" privacy attack, in which the attacker uses the synthetic data to guess information on records in the original data. 23 | - the "control" privacy attack, in which the attacker uses the synthetic data to guess information on records in the control dataset. 24 | - the "baseline" attack, which models a naive attacker who ignores the synthetic data and guesses randomly. 25 | 26 | Checking how many of these guesses are correct, the success rates of the different attacks are measured and used to 27 | derive an estimate of the privacy risk. In particular, the "control attack" is used to separate what the attacker 28 | learns from the *utility* of the synthetic data, and what is instead an indication of privacy leaks. 29 | The "baseline attack" instead functions as a sanity check. The "main attack" should outperform random 30 | guessing in order for the results to be trusted. 
31 | 32 | For more details, a thorough 33 | description of the framework and the attack algorithms can be found in the paper 34 | [A Unified Framework for Quantifying Privacy Risk in Synthetic Data](https://petsymposium.org/popets/2023/popets-2023-0055.php), accepted at the 23rd Privacy Enhancing Technologies Symposium ([PETS 2023](https://petsymposium.org/cfp23.php)). 35 | 36 | 37 | 38 | ## Setup and installation 39 | 40 | `Anonymeter` requires Python 3.8.x, 3.9.x, 3.10.x or 3.11.x installed. The simplest way to install `Anonymeter` is from `PyPi`. Simply run 41 | 42 | ``` 43 | pip install anonymeter 44 | ``` 45 | 46 | and you are good to go. 47 | 48 | ### Local installation 49 | 50 | To install `Anonymeter` locally, clone the repository: 51 | 52 | ```shell 53 | git clone git@github.com:statice/anonymeter.git 54 | ``` 55 | 56 | and install the dependencies: 57 | 58 | ```shell 59 | cd anonymeter # if you are not there already 60 | pip install . # Basic dependencies 61 | pip install ".[notebooks]" # Dependencies to run example notebooks 62 | pip install -e ".[notebooks,dev]" # Development setup 63 | ``` 64 | 65 | If you experience issues with the installation, we recommend installing 66 | `anonymeter` in a new clean virtual environment. 67 | 68 | ## Getting started 69 | 70 | Check out the example notebook in the `notebooks` folder to start playing around 71 | with `anonymeter`. To run this notebook you need `jupyter` and some plotting libraries. 72 | These should be installed as part of the `notebooks` dependencies. If you haven't done so, please 73 | install them by executing: 74 | 75 | ```shell 76 | pip install anonymeter[notebooks] 77 | ``` 78 | if you are installing anonymeter from `PyPi`, or: 79 | 80 | ```shell 81 | pip install ".[notebooks]" 82 | ``` 83 | 84 | if you have opted for a local installation. 85 | 86 | ## Basic usage pattern 87 | 88 | For each of the three privacy risks anonymeter provides an `Evaluator` class. 
The high-level classes `SinglingOutEvaluator`, `LinkabilityEvaluator`, and `InferenceEvaluator` are the only things that you need to import from `Anonymeter`. 89 | 90 | Despite the different nature of the privacy risks they evaluate, these classes have the same interface and are used in the same way. To instantiate the evaluator you have to provide three dataframes: the original dataset `ori` which has been used to generate the synthetic data, the synthetic data `syn`, and a `control` dataset containing original records which have not been used to generate the synthetic data. 91 | 92 | Another parameter common to all evaluators is the number of target records to attack (`n_attacks`). A higher number will reduce the statistical uncertainties on the results, at the expense of a longer computation time. 93 | 94 | ```python 95 | evaluator = *Evaluator(ori: pd.DataFrame, 96 | syn: pd.DataFrame, 97 | control: pd.DataFrame, 98 | n_attacks: int) 99 | ``` 100 | 101 | Once instantiated, the evaluation pipeline is executed by calling the `evaluate()` method, and the resulting estimate of the risk can be accessed using the `risk()` method. 102 | 103 | ```python 104 | evaluator.evaluate() 105 | risk = evaluator.risk() 106 | ``` 107 | 108 | ## Configuring logging 109 | 110 | `Anonymeter` uses the standard Python logger named `anonymeter`. 111 | You can configure the logging level and the output destination 112 | using the standard Python logging API (see [here](https://docs.python.org/3/library/logging.html) for more details). 
113 | 114 | For example, to set the logging level to `DEBUG` you can use the following snippet: 115 | 116 | ```python 117 | import logging 118 | 119 | # set the logging level to DEBUG 120 | logging.getLogger("anonymeter").setLevel(logging.DEBUG) 121 | ``` 122 | 123 | And if you want to log to a file, you can use the following snippet: 124 | 125 | ```python 126 | import logging 127 | 128 | # create a file handler 129 | file_handler = logging.FileHandler("anonymeter.log") 130 | 131 | # set the logging level for the file handler 132 | file_handler.setLevel(logging.DEBUG) 133 | 134 | # add the file handler to the logger 135 | logger = logging.getLogger("anonymeter") 136 | logger.addHandler(file_handler) 137 | logger.setLevel(logging.DEBUG) 138 | ``` 139 | 140 | 141 | ## Cite this work 142 | 143 | If you use anonymeter in your work, we would appreciate citations to the following paper: 144 | 145 | "A Unified Framework for Quantifying Privacy Risk in Synthetic Data", M. Giomi *et al*, PoPETS 2023. 146 | This `bibtex` entry can be used to refer to the paper: 147 | 148 | ```text 149 | @misc{anonymeter, 150 | doi = {https://doi.org/10.56553/popets-2023-0055}, 151 | url = {https://petsymposium.org/popets/2023/popets-2023-0055.php}, 152 | journal = {Proceedings of Privacy Enhancing Technologies Symposium}, 153 | year = {2023}, 154 | author = {Giomi, Matteo and Boenisch, Franziska and Wehmeyer, Christoph and Tasnádi, Borbála}, 155 | title = {A Unified Framework for Quantifying Privacy Risk in Synthetic Data}, 156 | } 157 | ``` 158 | 159 | ### License 160 | 161 | Licensed under Clear BSD License, see `LICENSE.md` to see the full license text. Patent-pending code (application US-20230401336-A1). 
162 | 163 | -------------------------------------------------------------------------------- /cnil/CNIL_opinion_anonymeter.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/statice/anonymeter/0188bdf5615601e2f31503ae91a2b310af8d917c/cnil/CNIL_opinion_anonymeter.pdf -------------------------------------------------------------------------------- /cnil/CNIL_opinion_anonymeter_courtesy_translation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/statice/anonymeter/0188bdf5615601e2f31503ae91a2b310af8d917c/cnil/CNIL_opinion_anonymeter_courtesy_translation.pdf -------------------------------------------------------------------------------- /notebooks/anonymeter_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "66b36556-e9af-4cf9-bb9c-25f12d3fc322", 6 | "metadata": {}, 7 | "source": [ 8 | "# Anonymeter example notebook\n", 9 | "\n", 10 | "This example notebook demonstrates the usage of `Anonymeter`, a software to derive GDPR-aligned measures of the privacy of synthetic datasets in an empirical, attack-based fashion.\n", 11 | "\n", 12 | "`Anonymeter` contains privacy evaluators which measure the risks of singling out, linkability, and inference which data donors might incur following the release of a synthetic dataset. These risks are the three key indicators of factual anonymization according to the European General Data Protection Regulation (GDPR). For more details, please refer to [M. Giomi et al. 2022](https://petsymposium.org/popets/2023/popets-2023-0055.php)." 
13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "id": "1062da6f-d56e-48a5-b62f-fe987e7682fe", 18 | "metadata": {}, 19 | "source": [ 20 | "### Basic usage pattern\n", 21 | "\n", 22 | "For each of these privacy risks anonymeter provides an `Evaluator` class. The high-level classes `SinglingOutEvaluator`, `LinkabilityEvaluator`, and `InferenceEvaluator` are the only things that you need to import from `Anonymeter`.\n", 23 | "\n", 24 | "Despite the different nature of the privacy risks they evaluate, these classes have the same interface and are used in the same way. To instantiate the evaluator you have to provide three dataframes: the original dataset `ori` which has been used to generate the synthetic data, the synthetic data `syn`, and a `control` dataset containing original records which have not been used to generate the synthetic data. \n", 25 | "\n", 26 | "Another parameter common to all evaluators is the number of target records to attack (`n_attacks`). A higher number will reduce the statistical uncertainties on the results, at the expense of a longer computation time.\n", 27 | "\n", 28 | "```python\n", 29 | "evaluator = *Evaluator(ori: pd.DataFrame, \n", 30 | " syn: pd.DataFrame, \n", 31 | " control: pd.DataFrame,\n", 32 | " n_attacks: int)\n", 33 | "```\n", 34 | "\n", 35 | "Once instantiated, the evaluation pipeline is executed by calling the `evaluate()` method, and the resulting estimate of the risk can be accessed using the `risk()` method.\n", 36 | "\n", 37 | "```python\n", 38 | "evaluator.evaluate()\n", 39 | "risk = evaluator.risk()\n", 40 | "```" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "cb9d6771-69f3-4e49-b143-d5d0bae3ba3c", 46 | "metadata": {}, 47 | "source": [ 48 | "### A peek under the hood\n", 49 | "\n", 50 | "In `Anonymeter` each privacy risk is derived from a privacy attacker whose task is to use the synthetic dataset to come up with a set of *guesses* of the form:\n", 51 | "- \"there is only one person with attributes X, 
Y, and Z\" (singling out)\n", 52 | "- \"records A and B belong to the same person\" (linkability)\n", 53 | "- \"a person with attributes X and Y also has Z\" (inference)\n", 54 | "\n", 55 | "Each evaluation consists of running three different attacks:\n", 56 | "- the \"main\" privacy attack, in which the attacker uses the synthetic data to guess information on records in the original data.\n", 57 | "- the \"control\" privacy attack, in which the attacker uses the synthetic data to guess information on records in the control dataset. \n", 58 | "- the \"baseline\" attack, which models a naive attacker who ignores the synthetic data and guesses randomly.\n", 59 | "\n", 60 | "Checking how many of these guesses are correct, the success rates of the different attacks are measured and used to derive an estimate of the privacy risk. In particular, the \"control attack\" is used to separate what the attacker learns from the *utility* of the synthetic data, and what is instead an indication of privacy leaks. The \"baseline attack\" instead functions as a sanity check. The \"main attack\" should outperform random guessing in order for the results to be trusted. 
" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 1, 66 | "id": "c64a6fab-1676-4539-b460-5b2fdb456b04", 67 | "metadata": { 68 | "tags": [] 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "import os\n", 73 | "import pandas as pd\n", 74 | "import matplotlib.pyplot as plt\n", 75 | "import seaborn as sns\n", 76 | "\n", 77 | "from anonymeter.evaluators import SinglingOutEvaluator\n", 78 | "from anonymeter.evaluators import LinkabilityEvaluator\n", 79 | "from anonymeter.evaluators import InferenceEvaluator" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "id": "ada19947-b895-4279-aac3-9b87fac2fa6b", 85 | "metadata": {}, 86 | "source": [ 87 | "## Downloading the data\n", 88 | "\n", 89 | "For this example, we will use the famous `Adults` (more details [here](https://archive.ics.uci.edu/ml/datasets/adult)) dataset. This dataset contains aggregated census data, where every row represent a population segment. For the purpose of demonstrating `Anonymeter`, we will use this data as if each row would in fact refer to a real individual. \n", 90 | "\n", 91 | "The synthetic version has been generated by `CTGAN` from [SDV](https://sdv.dev/SDV/user_guides/single_table/ctgan.html), as explained in the paper accompanying this code release. For details on the generation process, e.g. regarding hyperparameters, see Section 6.2.1 of [the accompanying paper](https://petsymposium.org/popets/2023/popets-2023-0055.php))." 
92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 2, 97 | "id": "fc128115-2f0c-43b1-9198-5c5594eae7f3", 98 | "metadata": { 99 | "tags": [] 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "dataset_dir = \"./datasets/\"\n", 104 | "\n", 105 | "ori = pd.read_csv(os.path.join(dataset_dir, \"adults_train.csv\"))\n", 106 | "syn = pd.read_csv(os.path.join(dataset_dir, \"adults_syn_ctgan.csv\"))\n", 107 | "control = pd.read_csv(os.path.join(dataset_dir, \"adults_control.csv\"))" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 3, 113 | "id": "f6abeed8-23ae-4d4a-9cdb-006c0bba109c", 114 | "metadata": { 115 | "tags": [] 116 | }, 117 | "outputs": [ 118 | { 119 | "data": { 120 | "text/html": [ 121 | "
\n", 122 | "\n", 135 | "\n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | "
agetype_employerfnlwgteducationeducation_nummaritaloccupationrelationshipracesexcapital_gaincapital_losshr_per_weekcountryincome
053Self-emp-not-inc13802211th7DivorcedCraft-repairNot-in-familyWhiteMale0060United-States<=50K
131Private344200HS-grad9Married-civ-spouseExec-managerialHusbandWhiteMale0040United-States>50K
228Private242482HS-grad9Never-marriedHandlers-cleanersOwn-childWhiteMale0040United-States<=50K
326Private193165Some-college10Married-civ-spouseTransport-movingHusbandWhiteMale0052United-States>50K
427Private267989Some-college10Married-civ-spouseMachine-op-inspctHusbandWhiteMale0040United-States<=50K
\n", 249 | "
" 250 | ], 251 | "text/plain": [ 252 | " age type_employer fnlwgt education education_num \\\n", 253 | "0 53 Self-emp-not-inc 138022 11th 7 \n", 254 | "1 31 Private 344200 HS-grad 9 \n", 255 | "2 28 Private 242482 HS-grad 9 \n", 256 | "3 26 Private 193165 Some-college 10 \n", 257 | "4 27 Private 267989 Some-college 10 \n", 258 | "\n", 259 | " marital occupation relationship race sex \\\n", 260 | "0 Divorced Craft-repair Not-in-family White Male \n", 261 | "1 Married-civ-spouse Exec-managerial Husband White Male \n", 262 | "2 Never-married Handlers-cleaners Own-child White Male \n", 263 | "3 Married-civ-spouse Transport-moving Husband White Male \n", 264 | "4 Married-civ-spouse Machine-op-inspct Husband White Male \n", 265 | "\n", 266 | " capital_gain capital_loss hr_per_week country income \n", 267 | "0 0 0 60 United-States <=50K \n", 268 | "1 0 0 40 United-States >50K \n", 269 | "2 0 0 40 United-States <=50K \n", 270 | "3 0 0 52 United-States >50K \n", 271 | "4 0 0 40 United-States <=50K " 272 | ] 273 | }, 274 | "execution_count": 3, 275 | "metadata": {}, 276 | "output_type": "execute_result" 277 | } 278 | ], 279 | "source": [ 280 | "ori.head()" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "id": "f1d19013-b7cf-48e3-a068-6c1e5449884e", 286 | "metadata": {}, 287 | "source": [ 288 | "As visible the dataset contains several demographic information, as well as information regarding the education, financial situation, and personal life of some tens of thousands of \"individuals\"." 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "id": "52811434-e3ed-464e-8bbc-eafb1dfe0eb1", 294 | "metadata": { 295 | "tags": [] 296 | }, 297 | "source": [ 298 | "### Measuring the singling out risk\n", 299 | "\n", 300 | "The `SinglingOutEvaluator` try to measure how much the synthetic data can help an attacker finding combination of attributes that single out records in the training data. 
\n", 301 | "\n", 302 | "With the following code we evaluate the robustness of the synthetic data to \"univariate\" singling out attacks, which try to find unique values of some attribute which single out an individual. \n", 303 | "\n", 304 | "\n", 305 | "##### NOTE:\n", 306 | "\n", 307 | "The `SingingOutEvaluator` can sometimes raise a `RuntimeError`. This happens when not enough singling out queries are found. Increasing `n_attacks` will make this condition less frequent and the evaluation more robust, although much slower.\n" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 4, 313 | "id": "43acdda6-19d5-4611-ba4f-498fc7bd2d40", 314 | "metadata": {}, 315 | "outputs": [ 316 | { 317 | "name": "stdout", 318 | "output_type": "stream", 319 | "text": [ 320 | "PrivacyRisk(value=0.013741062122476706, ci=(0.0, 0.034101211562263624))\n" 321 | ] 322 | } 323 | ], 324 | "source": [ 325 | "evaluator = SinglingOutEvaluator(ori=ori, \n", 326 | " syn=syn, \n", 327 | " control=control,\n", 328 | " n_attacks=500)\n", 329 | "\n", 330 | "try:\n", 331 | " evaluator.evaluate(mode='univariate')\n", 332 | " risk = evaluator.risk()\n", 333 | " print(risk)\n", 334 | "\n", 335 | "except RuntimeError as ex: \n", 336 | " print(f\"Singling out evaluation failed with {ex}. Please re-run this cell.\"\n", 337 | " \"For more stable results increase `n_attacks`. Note that this will \"\n", 338 | " \"make the evaluation slower.\")" 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "id": "f753a354-50c1-4209-933a-a51291c03306", 344 | "metadata": {}, 345 | "source": [ 346 | "The risk estimate is accompanied by a confidence interval (at 95% level by default) which accounts for the finite number of attacks performed, 500 in this case. \n", 347 | "\n", 348 | "Using the `queries()` method, we can see what kind of singling out queries (i.e. 
the *guesses*) the attacker has come up with:" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 5, 354 | "id": "4af6f284-81a3-4b84-a39b-b2beb6a21b24", 355 | "metadata": {}, 356 | "outputs": [ 357 | { 358 | "data": { 359 | "text/plain": [ 360 | "['fnlwgt == 34465', 'fnlwgt == 95255', 'fnlwgt == 270228']" 361 | ] 362 | }, 363 | "execution_count": 5, 364 | "metadata": {}, 365 | "output_type": "execute_result" 366 | } 367 | ], 368 | "source": [ 369 | "evaluator.queries()[:3]" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "id": "abc36183-03c4-4612-80cd-c1b3849cb04e", 375 | "metadata": {}, 376 | "source": [ 377 | "As visible, the attack is picking up on the `fnlwgt` column, which has many (~63%) unique integer values and therefore provides a powerful handle for singling out. This should result in a singling out risk which is *compatible* within the confidence level with a few percentage points. The actual results can vary depending on notebook execution. " 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "id": "b0693e9c-0f69-4c4c-b506-4e7bd7190031", 383 | "metadata": {}, 384 | "source": [ 385 | "### Inspecting the results in more details" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "id": "f7db3b47-22e3-4d1f-8495-770799c378cc", 391 | "metadata": {}, 392 | "source": [ 393 | "There are two methods to inspect the results. The high level `risk()` method gives the high level estimation of the privacy risk, and its confidence interval." 
394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 6, 399 | "id": "4ea008ba-05a8-47b2-a316-132fe628cae3", 400 | "metadata": {}, 401 | "outputs": [ 402 | { 403 | "data": { 404 | "text/plain": [ 405 | "PrivacyRisk(value=0.013741062122476706, ci=(0.0, 0.034101211562263624))" 406 | ] 407 | }, 408 | "execution_count": 6, 409 | "metadata": {}, 410 | "output_type": "execute_result" 411 | } 412 | ], 413 | "source": [ 414 | "evaluator.risk(confidence_level=0.95)" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "id": "ba233989-5556-4f89-8037-1c26d78c8127", 420 | "metadata": {}, 421 | "source": [ 422 | "for more information, the `results()` method gives the success rates of the three attacks (the \"main\" one, the baseline one, and the one against control) that enters `Anonymeter` risk calculation." 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": 7, 428 | "id": "367928ba-7a1e-4c20-9dc6-84c490873700", 429 | "metadata": {}, 430 | "outputs": [ 431 | { 432 | "name": "stdout", 433 | "output_type": "stream", 434 | "text": [ 435 | "Successs rate of main attack: SuccessRate(value=0.03556819133600645, error=0.015776677709435333)\n", 436 | "Successs rate of baseline attack: SuccessRate(value=0.0038121702307761206, error=0.00381217023077612)\n", 437 | "Successs rate of control attack: SuccessRate(value=0.022131235900891076, error=0.012313616009370306)\n" 438 | ] 439 | } 440 | ], 441 | "source": [ 442 | "res = evaluator.results()\n", 443 | "\n", 444 | "print(\"Successs rate of main attack:\", res.attack_rate)\n", 445 | "print(\"Successs rate of baseline attack:\", res.baseline_rate)\n", 446 | "print(\"Successs rate of control attack:\", res.control_rate)" 447 | ] 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "id": "8543aad4-897c-4e80-96a7-a47aea1caf22", 452 | "metadata": {}, 453 | "source": [ 454 | "Note that you can obtain the `PrivacyRisk` from the attack results by:" 455 | ] 456 | }, 457 | { 458 | 
"cell_type": "code", 459 | "execution_count": 8, 460 | "id": "d1081269-1830-430f-8305-9f254641de89", 461 | "metadata": {}, 462 | "outputs": [ 463 | { 464 | "data": { 465 | "text/plain": [ 466 | "PrivacyRisk(value=0.013741062122476706, ci=(0.0, 0.034101211562263624))" 467 | ] 468 | }, 469 | "execution_count": 8, 470 | "metadata": {}, 471 | "output_type": "execute_result" 472 | } 473 | ], 474 | "source": [ 475 | "res.risk()" 476 | ] 477 | }, 478 | { 479 | "cell_type": "markdown", 480 | "id": "00f4e993-defe-4c83-a4b3-da2cdf2ca02f", 481 | "metadata": {}, 482 | "source": [ 483 | "### Checking singling out with multivariate predicates\n", 484 | "\n", 485 | "The `SinglingOutEvaluator` can also attack the dataset using predicates which are combining different attributes. These are the so called `multivariate` predicates. \n", 486 | "\n", 487 | "To run the analysis using the `multivariate` singling out attack, the `mode` parameter of `evaluate` needs to be set correctly. The number of attributes used in the attacker queries via the `n_cols` parameter, set to 4 in this example. " 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": 9, 493 | "id": "1a875b5f-4f75-4585-83b5-d0703ac82f90", 494 | "metadata": { 495 | "tags": [] 496 | }, 497 | "outputs": [ 498 | { 499 | "name": "stdout", 500 | "output_type": "stream", 501 | "text": [ 502 | "PrivacyRisk(value=0.02878005056752415, ci=(0.0, 0.1380156613265963))\n" 503 | ] 504 | } 505 | ], 506 | "source": [ 507 | "evaluator = SinglingOutEvaluator(ori=ori, \n", 508 | " syn=syn, \n", 509 | " control=control,\n", 510 | " n_attacks=100, # this attack takes longer\n", 511 | " n_cols=4)\n", 512 | "\n", 513 | "\n", 514 | "try:\n", 515 | " evaluator.evaluate(mode='multivariate')\n", 516 | " risk = evaluator.risk()\n", 517 | " print(risk)\n", 518 | "\n", 519 | "except RuntimeError as ex: \n", 520 | " print(f\"Singling out evaluation failed with {ex}. 
Please re-run this cell.\"\n", 521 | " \"For more stable results increase `n_attacks`. Note that this will \"\n", 522 | " \"make the evaluation slower.\")" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 10, 528 | "id": "786dbb5b-6b52-41c1-8d07-2d9467a3d649", 529 | "metadata": {}, 530 | "outputs": [ 531 | { 532 | "data": { 533 | "text/plain": [ 534 | "[\"education== '9th' & occupation== 'Other-service' & race== 'Amer-Indian-Eskimo' & sex== 'Male'\",\n", 535 | " \"education== '9th' & income== '>50K' & occupation== 'Other-service' & sex== 'Female'\",\n", 536 | " \"age<= 24 & capital_gain<= 0 & country== 'Ireland' & education_num>= 11\"]" 537 | ] 538 | }, 539 | "execution_count": 10, 540 | "metadata": {}, 541 | "output_type": "execute_result" 542 | } 543 | ], 544 | "source": [ 545 | "evaluator.queries()[:3]" 546 | ] 547 | }, 548 | { 549 | "cell_type": "markdown", 550 | "id": "acff772b-3f99-402d-b1b6-01b0e09bd5b1", 551 | "metadata": {}, 552 | "source": [ 553 | "# Measuring the Linkability risk\n", 554 | "\n", 555 | "The `LinkabilityEvaluator` allows one to know how much the synthetic data will help an adversary who tries to link two other datasets based on a subset of attributes. \n", 556 | "\n", 557 | "For example, suppose that the adversary finds dataset A containing, among other fields, information about the profession and education of people, and dataset B containing some demographic and health related information. Can the attacker use the synthetic dataset to link these two datasets?\n", 558 | "\n", 559 | "To run the `LinkabilityEvaluator` one needs to specify which columns of auxiliary information are available to the attacker, and how they are distributed between the two datasets A and B. This is done using the `aux_cols` parameter." 
560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 11, 565 | "id": "dad588b3-b241-4256-ac11-ae73d9206782", 566 | "metadata": {}, 567 | "outputs": [ 568 | { 569 | "data": { 570 | "text/plain": [ 571 | "PrivacyRisk(value=0.0, ci=(0.0, 0.001915371248414205))" 572 | ] 573 | }, 574 | "execution_count": 11, 575 | "metadata": {}, 576 | "output_type": "execute_result" 577 | } 578 | ], 579 | "source": [ 580 | "aux_cols = [\n", 581 | " ['type_employer', 'education', 'hr_per_week', 'capital_loss', 'capital_gain'],\n", 582 | " [ 'race', 'sex', 'fnlwgt', 'age', 'country']\n", 583 | "]\n", 584 | "\n", 585 | "evaluator = LinkabilityEvaluator(ori=ori, \n", 586 | " syn=syn, \n", 587 | " control=control,\n", 588 | " n_attacks=2000,\n", 589 | " aux_cols=aux_cols,\n", 590 | " n_neighbors=10)\n", 591 | "\n", 592 | "evaluator.evaluate(n_jobs=-2) # n_jobs follow joblib convention. -1 = all cores, -2 = all execept one\n", 593 | "evaluator.risk()" 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": 12, 599 | "id": "321da78a-d695-4d7c-8a62-f1a226bfc04a", 600 | "metadata": {}, 601 | "outputs": [ 602 | { 603 | "name": "stdout", 604 | "output_type": "stream", 605 | "text": [ 606 | "Successs rate of main attack: SuccessRate(value=0.003453731022423335, error=0.00238542229400942)\n", 607 | "Successs rate of baseline attack: SuccessRate(value=0.0014575651169858408, error=0.001369297386280373)\n", 608 | "Successs rate of control attack: SuccessRate(value=0.005449896927860829, error=0.0030806162908989933)\n" 609 | ] 610 | } 611 | ], 612 | "source": [ 613 | "res = evaluator.results()\n", 614 | "\n", 615 | "print(\"Successs rate of main attack:\", res.attack_rate)\n", 616 | "print(\"Successs rate of baseline attack:\", res.baseline_rate)\n", 617 | "print(\"Successs rate of control attack:\", res.control_rate)" 618 | ] 619 | }, 620 | { 621 | "cell_type": "markdown", 622 | "id": "f0190af5-a184-46cf-a048-30ecbe82fd51", 623 | "metadata": {}, 624 | 
"source": [ 625 | "As visible, the attack is not very successful and the linkability risk is low. The `n_neighbor` parameter can be used to allow for weaker indirect links to be scored as successes. It will have an impact on the risk estimate. To check the measured risk for different values of `n_neighbor` you don't have to re-run the evaluation. Rather, do:" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": 13, 631 | "id": "c4d3644a-dd36-4807-aa3f-cf244cfd5d55", 632 | "metadata": {}, 633 | "outputs": [ 634 | { 635 | "name": "stdout", 636 | "output_type": "stream", 637 | "text": [ 638 | "PrivacyRisk(value=0.0, ci=(0.0, 0.0007611013198481277))\n" 639 | ] 640 | } 641 | ], 642 | "source": [ 643 | "print(evaluator.risk(n_neighbors=7))" 644 | ] 645 | }, 646 | { 647 | "cell_type": "markdown", 648 | "id": "0429baae-424d-4ebe-b8ec-9205397515ba", 649 | "metadata": {}, 650 | "source": [ 651 | "# Measuring the Inference Risk\n", 652 | "\n", 653 | "Finally, `anonymeter` allows to measure the inference risk. It does so by measuring the success of an attacker that tries to discover the value of some secret attribute for a set of target records on which some auxiliary knowledge is available.\n", 654 | "\n", 655 | "Similar to the case of the `LinkabilityEvaluator`, the main parameter here is `aux_cols` which specify what the attacker knows about its target, i.e. which columns are known to the attacker. By selecting the `secret` column, one can identify which attributes, alone or in combinations, exhibit the largest risks and thereby expose a lot of information on the original data.\n", 656 | "\n", 657 | "In the following snippet we will measure the inference risk for each column individually, using all the other columns as auxiliary information to model a very knowledgeable attacker. 
" 658 | ] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "execution_count": 14, 663 | "id": "6c07054c-7ced-46c3-8a12-14123f6cc965", 664 | "metadata": { 665 | "tags": [] 666 | }, 667 | "outputs": [ 668 | { 669 | "name": "stderr", 670 | "output_type": "stream", 671 | "text": [ 672 | "/home/matteo/work/gits/anonymeter/src/anonymeter/stats/confidence.py:218: UserWarning: Attack is as good or worse as baseline model. Estimated rates: attack = 0.25195286286290325, baseline = 0.255937555828961. Analysis results cannot be trusted.\n", 673 | " self._sanity_check()\n" 674 | ] 675 | } 676 | ], 677 | "source": [ 678 | "columns = ori.columns\n", 679 | "results = []\n", 680 | "\n", 681 | "for secret in columns:\n", 682 | " \n", 683 | " aux_cols = [col for col in columns if col != secret]\n", 684 | " \n", 685 | " evaluator = InferenceEvaluator(ori=ori, \n", 686 | " syn=syn, \n", 687 | " control=control,\n", 688 | " aux_cols=aux_cols,\n", 689 | " secret=secret,\n", 690 | " n_attacks=1000)\n", 691 | " evaluator.evaluate(n_jobs=-2)\n", 692 | " results.append((secret, evaluator.results()))" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": 15, 698 | "id": "0e492eeb-d296-4973-a08e-c5afc0ac36b5", 699 | "metadata": { 700 | "tags": [] 701 | }, 702 | "outputs": [ 703 | { 704 | "data": { 705 | "image/png": "", 706 | "text/plain": [ 707 | "
" 708 | ] 709 | }, 710 | "metadata": {}, 711 | "output_type": "display_data" 712 | } 713 | ], 714 | "source": [ 715 | "fig, ax = plt.subplots()\n", 716 | "\n", 717 | "risks = [res[1].risk().value for res in results]\n", 718 | "columns = [res[0] for res in results]\n", 719 | "\n", 720 | "ax.bar(x=columns, height=risks, alpha=0.5, ecolor='black', capsize=10)\n", 721 | "\n", 722 | "plt.xticks(rotation=45, ha='right')\n", 723 | "ax.set_ylabel(\"Measured inference risk\")\n", 724 | "_ = ax.set_xlabel(\"Secret column\")" 725 | ] 726 | }, 727 | { 728 | "cell_type": "markdown", 729 | "id": "b3e8c81d-7813-4779-8e27-3a633ec20ee7", 730 | "metadata": {}, 731 | "source": [ 732 | "As visible, a few columns in the dataset carry a significant inference risk. This means that an attacker in possession of the synthetic dataset can use it to infer some attribute of records in the original data, *beyond what can be explained by utility*." 733 | ] 734 | } 735 | ], 736 | "metadata": { 737 | "kernelspec": { 738 | "display_name": "Python 3 (ipykernel)", 739 | "language": "python", 740 | "name": "python3" 741 | }, 742 | "language_info": { 743 | "codemirror_mode": { 744 | "name": "ipython", 745 | "version": 3 746 | }, 747 | "file_extension": ".py", 748 | "mimetype": "text/x-python", 749 | "name": "python", 750 | "nbconvert_exporter": "python", 751 | "pygments_lexer": "ipython3", 752 | "version": "3.11.11" 753 | }, 754 | "vscode": { 755 | "interpreter": { 756 | "hash": "237cf5f6b3dcd73bf2688629baee50bd53e43ee0aa8f2bde7060bbd4d3c193da" 757 | } 758 | } 759 | }, 760 | "nbformat": 4, 761 | "nbformat_minor": 5 762 | } 763 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # This file is part of Anonymeter and is released under BSD 3-Clause Clear License. 2 | # Copyright (c) 2022 Anonos IP LLC. 
3 | # See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details. 4 | [build-system] 5 | requires = ["setuptools>=61.0"] 6 | build-backend = "setuptools.build_meta" 7 | 8 | [project] 9 | name = "anonymeter" 10 | version = "1.0.0" 11 | authors = [ 12 | { name="Statice GmbH", email="hello@statice.ai" }, 13 | ] 14 | description = "Measure singling out, linkability, and inference risk for synthetic data." 15 | readme = "README.md" 16 | requires-python = "<3.12, >3.7" # limited by Numba support 17 | license = {file = "LICENSE.md"} 18 | classifiers = [ 19 | "Programming Language :: Python :: 3", 20 | "License :: OSI Approved :: BSD License", 21 | "Operating System :: OS Independent", 22 | ] 23 | 24 | dependencies = [ 25 | "scikit-learn~=1.2", 26 | "numpy >=1.22, <1.27", # limited by Numba support 27 | "pandas>=1.4", 28 | "joblib~=1.2", 29 | "numba~=0.58", 30 | ] 31 | 32 | [project.optional-dependencies] 33 | notebooks = [ 34 | "jupyterlab~=3.4", 35 | "matplotlib~=3.5", 36 | "seaborn~=0.11", 37 | ] 38 | 39 | dev = [ 40 | # Linting and formatting 41 | "ruff~=0.1.14", 42 | "mypy~=1.8.0", 43 | 44 | # Pre-commit checks 45 | "pre-commit~=3.5", 46 | 47 | # Testing 48 | "pytest~=7.4", 49 | 50 | # Building and packaging 51 | "build~=0.10", 52 | "twine~=4.0", 53 | ] 54 | 55 | [project.urls] 56 | "Homepage" = "https://github.com/statice/anonymeter" 57 | "Bug Tracker" = "https://github.com/statice/anonymeter/issues" 58 | "Changelog" = "https://github.com/statice/anonymeter/blob/main/CHANGELOG.md" 59 | 60 | [tool.ruff] 61 | # https://docs.astral.sh/ruff/configuration/ 62 | 63 | line-length = 120 64 | 65 | select = [ 66 | "B", # https://docs.astral.sh/ruff/rules/#flake8-bugbear-b 67 | "C4", # https://docs.astral.sh/ruff/rules/#flake8-comprehensions-c4 68 | "E4", # https://docs.astral.sh/ruff/rules/#error-e 69 | "E7", 70 | "E9", 71 | "NPY", 72 | "F", # https://docs.astral.sh/ruff/rules/#pyflakes-f 73 | "I001", # isort 74 | "W", # 
https://docs.astral.sh/ruff/rules/#pycodestyle-e-w 75 | "YTT", # https://docs.astral.sh/ruff/rules/#flake8-2020-ytt 76 | "PGH", # https://docs.astral.sh/ruff/rules/#pygrep-hooks-pgh 77 | "PIE", # https://docs.astral.sh/ruff/rules/#flake8-pie-pie 78 | "UP", # https://docs.astral.sh/ruff/rules/#pyupgrade-up 79 | "RUF", 80 | ] 81 | 82 | [tool.ruff.format] 83 | quote-style = "double" 84 | indent-style = "space" 85 | 86 | [tool.ruff.isort] 87 | known-first-party = ["anonymeter"] 88 | forced-separate = ["tests"] 89 | 90 | [tool.ruff.lint] 91 | extend-select = ["NPY201"] 92 | preview = true 93 | 94 | [tool.mypy] 95 | ignore_missing_imports = true 96 | follow_imports = "silent" 97 | show_column_numbers = true 98 | check_untyped_defs = true 99 | show_error_context = false 100 | exclude = [ 101 | "docs", 102 | "build", 103 | "dist", 104 | ] 105 | 106 | [tool.pytest.ini_options] 107 | filterwarnings = [ 108 | "ignore::UserWarning", 109 | "ignore::FutureWarning", 110 | "ignore::PendingDeprecationWarning", 111 | ] 112 | testpaths = [ 113 | "tests", 114 | ] 115 | pythonpath = [ 116 | "src", 117 | ] 118 | xfail_strict=true 119 | -------------------------------------------------------------------------------- /src/anonymeter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/statice/anonymeter/0188bdf5615601e2f31503ae91a2b310af8d917c/src/anonymeter/__init__.py -------------------------------------------------------------------------------- /src/anonymeter/evaluators/__init__.py: -------------------------------------------------------------------------------- 1 | # This file is part of Anonymeter and is released under BSD 3-Clause Clear License. 2 | # Copyright (c) 2022 Anonos IP LLC. 3 | # See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details. 
4 | """Tools to evaluate privacy risks along the directives of the Article 29 WGP.""" 5 | from anonymeter.evaluators.inference_evaluator import InferenceEvaluator 6 | from anonymeter.evaluators.linkability_evaluator import LinkabilityEvaluator 7 | from anonymeter.evaluators.singling_out_evaluator import SinglingOutEvaluator 8 | 9 | __all__ = ["InferenceEvaluator", "LinkabilityEvaluator", "SinglingOutEvaluator"] 10 | -------------------------------------------------------------------------------- /src/anonymeter/evaluators/inference_evaluator.py: -------------------------------------------------------------------------------- 1 | # This file is part of Anonymeter and is released under BSD 3-Clause Clear License. 2 | # Copyright (c) 2022 Anonos IP LLC. 3 | # See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details. 4 | """Privacy evaluator that measures the inference risk.""" 5 | 6 | from typing import List, Optional 7 | 8 | import numpy as np 9 | import numpy.typing as npt 10 | import pandas as pd 11 | 12 | from anonymeter.neighbors.mixed_types_kneighbors import MixedTypeKNeighbors 13 | from anonymeter.stats.confidence import EvaluationResults, PrivacyRisk 14 | 15 | 16 | def _run_attack( 17 | target: pd.DataFrame, 18 | syn: pd.DataFrame, 19 | n_attacks: int, 20 | aux_cols: List[str], 21 | secret: str, 22 | n_jobs: int, 23 | naive: bool, 24 | regression: Optional[bool], 25 | ) -> int: 26 | if regression is None: 27 | regression = pd.api.types.is_numeric_dtype(target[secret]) 28 | 29 | targets = target.sample(n_attacks, replace=False) 30 | 31 | if naive: 32 | guesses = syn.sample(n_attacks)[secret] 33 | 34 | else: 35 | nn = MixedTypeKNeighbors(n_jobs=n_jobs, n_neighbors=1).fit(candidates=syn[aux_cols]) 36 | 37 | guesses_idx = nn.kneighbors(queries=targets[aux_cols]) 38 | if isinstance(guesses_idx, tuple): 39 | raise RuntimeError("guesses_idx cannot be a tuple") 40 | 41 | guesses = syn.iloc[guesses_idx.flatten()][secret] 42 | 43 | return 
def evaluate_inference_guesses(
    guesses: pd.Series, secrets: pd.Series, regression: bool, tolerance: float = 0.05
) -> npt.NDArray:
    """Evaluate the success of an inference attack.

    The attack is successful if the attacker managed to make a correct guess.

    In case of regression problems, when the secret is a continuous variable,
    the guess is correct if the relative difference between guess and target
    is smaller than a given tolerance. In the case of categorical target
    variables, the inference is correct if the secrets are guessed exactly.

    A guess is also counted as correct when both the guess and the secret
    are missing values (NaN / None).

    Parameters
    ----------
    guesses : pd.Series
        Attacker guesses for each of the targets.
    secrets : pd.Series
        Array with the true values of the secret for each of the targets.
    regression : bool
        If True the secret is treated as a numerical continuous variable and
        guesses are compared within ``tolerance`` (regression task). If False
        the secret is treated as categorical or discrete and guesses must
        match exactly (classification task).
    tolerance : float, default is 0.05
        Maximum value for the relative difference between target and secret
        for the inference to be considered correct. The difference is taken
        relative to the magnitude of the guess.

    Returns
    -------
    np.array
        Array of boolean values indicating the correctness of each guess.

    """
    guesses_np = guesses.to_numpy()
    secrets_np = secrets.to_numpy()

    if regression:
        # Normalise by the *magnitude* of the guess. Dividing by the signed
        # value would make the ratio negative for negative guesses, so that
        # any negative guess would trivially satisfy ``<= tolerance``.
        # The small constant avoids a division by zero for guesses equal to 0.
        rel_abs_diff = np.abs(guesses_np - secrets_np) / (np.abs(guesses_np) + 1e-12)
        value_match = rel_abs_diff <= tolerance
    else:
        value_match = guesses_np == secrets_np

    # Comparisons involving NaN are always False: treat "both missing" as a match.
    nan_match = np.logical_and(pd.isnull(guesses_np), pd.isnull(secrets_np))

    return np.logical_or(nan_match, value_match)
94 | 95 | The attacker's goal is to use the synthetic dataset to learn about some 96 | (potentially all) attributes of a target record from the original database. 97 | The attacker has a partial knowledge of some attributes of the target 98 | record (the auxiliary information AUX) and uses a similarity score to find 99 | the synthetic record that matches best the AUX. The success of the attack 100 | is compared to the baseline scenario of the trivial attacker, who guesses 101 | at random. 102 | 103 | .. note:: 104 | For a thorough interpretation of the attack result, it is recommended to 105 | set aside a small portion of the original dataset to use as a *control* 106 | dataset for the Inference Attack. These control records should **not** 107 | have been used to generate the synthetic dataset. For good statistical 108 | accuracy on the attack results, 500 to 1000 control records are usually 109 | enough. 110 | 111 | Comparing how successful the attack is when targeting the *training* and 112 | *control* dataset allows for a more sensitive measure of eventual 113 | information leak during the training process. If, using the synthetic 114 | data as a base, the attack is more successful against the original 115 | records in the training set than it is when targeting the control data, 116 | this indicates that specific information about some records have been 117 | transferred to the synthetic dataset. 118 | 119 | Parameters 120 | ---------- 121 | ori : pd.DataFrame 122 | Dataframe with the target records whose secrets the attacker 123 | will try to guess. This is the private dataframe from which 124 | the synthetic one has been derived. 125 | syn : pd.DataFrame 126 | Dataframe with the synthetic records. It is assumed to be 127 | fully available to the attacker. 128 | control : pd.DataFrame (optional) 129 | Independent sample of original records **not** used to 130 | create the synthetic dataset. This is used to evaluate 131 | the excess privacy risk. 
132 | aux_cols : list of str 133 | Features of the records that are given to the attacker as auxiliary 134 | information. 135 | secret : str 136 | Secret attribute of the targets that is unknown to the attacker. 137 | This is what the attacker will try to guess. 138 | regression : bool, optional 139 | Specifies whether the target of the inference attack is quantitative 140 | (regression = True) or categorical (regression = False). If None 141 | (default), the code will try to guess this by checking the type of 142 | the variable. 143 | n_attacks : int, default is 500 144 | Number of attack attempts. 145 | 146 | """ 147 | 148 | def __init__( 149 | self, 150 | ori: pd.DataFrame, 151 | syn: pd.DataFrame, 152 | aux_cols: List[str], 153 | secret: str, 154 | regression: Optional[bool] = None, 155 | n_attacks: int = 500, 156 | control: Optional[pd.DataFrame] = None, 157 | ): 158 | self._ori = ori 159 | self._syn = syn 160 | self._control = control 161 | self._n_attacks = n_attacks 162 | 163 | # check if secret is a string column 164 | if not isinstance(secret, str): 165 | raise ValueError("secret must be a single column name") 166 | 167 | # check if secret is present in the original dataframe 168 | if secret not in ori.columns: 169 | raise ValueError(f"secret column '{secret}' not found in ori dataframe") 170 | 171 | self._secret = secret 172 | self._regression = regression 173 | self._aux_cols = aux_cols 174 | self._evaluated = False 175 | 176 | def _attack(self, target: pd.DataFrame, naive: bool, n_jobs: int) -> int: 177 | return _run_attack( 178 | target=target, 179 | syn=self._syn, 180 | n_attacks=self._n_attacks, 181 | aux_cols=self._aux_cols, 182 | secret=self._secret, 183 | n_jobs=n_jobs, 184 | naive=naive, 185 | regression=self._regression, 186 | ) 187 | 188 | def evaluate(self, n_jobs: int = -2) -> "InferenceEvaluator": 189 | r"""Run the inference attack. 
190 | 191 | Parameters 192 | ---------- 193 | n_jobs : int, default is -2 194 | The number of jobs to run in parallel. 195 | 196 | Returns 197 | ------- 198 | self 199 | The evaluated ``InferenceEvaluator`` object. 200 | 201 | """ 202 | self._n_baseline = self._attack(target=self._ori, naive=True, n_jobs=n_jobs) 203 | self._n_success = self._attack(target=self._ori, naive=False, n_jobs=n_jobs) 204 | self._n_control = ( 205 | None if self._control is None else self._attack(target=self._control, naive=False, n_jobs=n_jobs) 206 | ) 207 | 208 | self._evaluated = True 209 | return self 210 | 211 | def results(self, confidence_level: float = 0.95) -> EvaluationResults: 212 | """Raw evaluation results. 213 | 214 | Parameters 215 | ---------- 216 | confidence_level : float, default is 0.95 217 | Confidence level for the error bound calculation. 218 | 219 | Returns 220 | ------- 221 | EvaluationResults 222 | Object containing the success rates for the various attacks. 223 | 224 | """ 225 | if not self._evaluated: 226 | raise RuntimeError("The inference evaluator wasn't evaluated yet. Please, run `evaluate()` first.") 227 | 228 | return EvaluationResults( 229 | n_attacks=self._n_attacks, 230 | n_success=self._n_success, 231 | n_baseline=self._n_baseline, 232 | n_control=self._n_control, 233 | confidence_level=confidence_level, 234 | ) 235 | 236 | def risk(self, confidence_level: float = 0.95, baseline: bool = False) -> PrivacyRisk: 237 | """Compute the inference risk from the success of the attacker. 238 | 239 | This measures how much an attack on training data outperforms 240 | an attack on control data. An inference risk of 0 means that 241 | the attack had no advantage on the training data (no inference 242 | risk), while a value of 1 means that the attack exploited the 243 | maximally possible advantage. 244 | 245 | Parameters 246 | ---------- 247 | confidence_level : float, default is 0.95 248 | Confidence level for the error bound calculation. 
class LinkabilityIndexes:
    """Utility class to store indexes from linkability attack.

    Parameters
    ----------
    idx_0 : np.ndarray
        Array containing the result of the nearest neighbor search
        between the first original dataset and the synthetic data.
        Rows correspond to original records and the i-th column
        contains the index of the i-th closest synthetic record.
    idx_1 : np.ndarray
        Array containing the result of the nearest neighbor search
        between the second original dataset and the synthetic data.
        Rows correspond to original records and the i-th column
        contains the index of the i-th closest synthetic record.

    """

    def __init__(self, idx_0: npt.NDArray, idx_1: npt.NDArray):
        self._idx_0 = idx_0
        self._idx_1 = idx_1

    def find_links(self, n_neighbors: int) -> Dict[int, Set[int]]:
        """Return synthetic records that link originals in the split datasets.

        Parameters
        ----------
        n_neighbors : int
            Number of neighbors considered for the link search. Values larger
            than the number of stored neighbors are capped (with a warning).

        Returns
        -------
        Dict[int, Set[int]]
            Dictionary mapping the position of each linked original record
            (its row in the index arrays) to the set of indexes of the
            synthetic records that appear among the first ``n_neighbors``
            neighbors of both of its splits. Records without any common
            neighbor are not included.

        Raises
        ------
        ValueError
            If ``n_neighbors`` is smaller than 1.

        """
        # Neighbors are stored along the columns (one row per attacked
        # record), so the number of available neighbors is shape[1].
        max_neighbors = self._idx_0.shape[1]
        if n_neighbors > max_neighbors:
            logger.warning(f"Neighbors too large ({n_neighbors}, using {max_neighbors}) instead.")
            n_neighbors = max_neighbors

        if n_neighbors < 1:
            raise ValueError(f"Invalid neighbors value ({n_neighbors}): must be positive.")

        links: Dict[int, Set[int]] = {}
        for ii, (row0, row1) in enumerate(zip(self._idx_0, self._idx_1)):
            # A link exists when both splits share at least one close synthetic neighbor.
            joined = set(row0[:n_neighbors]) & set(row1[:n_neighbors])
            if joined:
                links[ii] = joined

        return links

    def count_links(self, n_neighbors: int) -> int:
        """Count successfully linked records.

        Parameters
        ----------
        n_neighbors : int
            Number of neighbors considered for the link search.

        Returns
        -------
        int
            Number of target records for which the synthetic dataset
            has provided the attacker with means to link them.

        """
        links = self.find_links(n_neighbors=n_neighbors)
        return _count_links(links)
class LinkabilityEvaluator:
    r"""Measure the linkability risk created by a synthetic dataset.

    The linkability risk is measured from the success of a linkability attack.
    The attack is modeled along the following scenario. The attacker possesses
    two datasets, both of which share some columns with the *original* dataset
    that was used to generate the synthetic data. Those columns will be
    referred to as *auxiliary columns*. The attacker's aim is then to use the
    information contained in the synthetic data to connect these two datasets,
    i.e. to find records that belong to the same individual.

    To model this attack, the original dataset is split vertically into two
    parts. Then we try to reconnect the two parts using the synthetic data
    by looking for the closest neighbors of the split original records in
    the synthetic data. If both splits of an original record have the same
    closest synthetic neighbor, they are linked together. The more original
    records get relinked in this manner the more successful the attack.


    Parameters
    ----------
    ori : pd.DataFrame
        Dataframe containing original data.
    syn : pd.DataFrame
        Dataframe containing synthetic data. It has to have
        the same columns as ``ori``.
    aux_cols : tuple of two lists of strings
        Features of the records that are given to the attacker as auxiliary
        information. Each element of the tuple lists the columns of one of
        the two vertical splits.
    n_attacks : int, default is 500.
        Number of records to attack. If None each record in the original
        dataset will be attacked.
    n_neighbors : int, default is 1
        The number of closest neighbors to include in the analysis. The
        default of 1 means that the linkability attack is considered
        successful only if the two original record splits have the same
        synthetic record as closest neighbor.
    control : pd.DataFrame (optional)
        Independent sample of original records **not** used to create the
        synthetic dataset. This is used to evaluate the excess privacy risk.
    """

    def __init__(
        self,
        ori: pd.DataFrame,
        syn: pd.DataFrame,
        aux_cols: Tuple[List[str], List[str]],
        n_attacks: Optional[int] = 500,
        n_neighbors: int = 1,
        control: Optional[pd.DataFrame] = None,
    ):
        self._ori = ori
        self._syn = syn
        # ``None`` means "attack every original record".
        self._n_attacks = n_attacks if n_attacks is not None else ori.shape[0]
        self._aux_cols = aux_cols
        self._n_neighbors = n_neighbors
        self._control = control
        self._evaluated = False

    def evaluate(self, n_jobs: int = -2) -> "LinkabilityEvaluator":
        """Run the linkability attack.

        Three attacks are run: a random-guessing baseline, the attack on the
        original data and, if a control dataset was given, the same attack
        on the control data.

        Parameters
        ----------
        n_jobs : int, default is -2
            The number of parallel jobs to run for neighbors search.

        Returns
        -------
        self
            The evaluated ``LinkabilityEvaluator`` object.

        """
        self._baseline_links = _random_linkability_attack(
            n_synthetic=self._syn.shape[0], n_attacks=self._n_attacks, n_neighbors=self._n_neighbors
        )

        self._attack_links = _linkability_attack(
            ori=self._ori,
            syn=self._syn,
            n_attacks=self._n_attacks,
            aux_cols=self._aux_cols,
            n_neighbors=self._n_neighbors,
            n_jobs=n_jobs,
        )

        self._control_links = (
            None
            if self._control is None
            else _linkability_attack(
                ori=self._control,
                syn=self._syn,
                n_attacks=self._n_attacks,
                aux_cols=self._aux_cols,
                n_neighbors=self._n_neighbors,
                n_jobs=n_jobs,
            )
        )

        self._evaluated = True
        return self

    def results(self, confidence_level: float = 0.95, n_neighbors: Optional[int] = None) -> "EvaluationResults":
        """Raw evaluation results.

        Parameters
        ----------
        confidence_level : float, default is 0.95
            Confidence level for the error bound calculation.
        n_neighbors : int, default is None
            The number of closest neighbors to include in the analysis.
            If `None` (the default), the number used is the one
            given to the constructor. The value of this parameter must
            be smaller than or equal to what has been used to initialize
            this evaluator.

        Returns
        -------
        EvaluationResults
            Object containing the success rates for the various attacks.

        Raises
        ------
        RuntimeError
            If ``evaluate()`` has not been run yet.
        ValueError
            If ``n_neighbors`` is larger than the value given to the constructor.

        """
        if not self._evaluated:
            raise RuntimeError("The linkability evaluator wasn't evaluated yet. Please, run `evaluate()` first.")

        if n_neighbors is None:
            n_neighbors = self._n_neighbors

        # Only the neighbors computed during ``evaluate()`` are available.
        if n_neighbors > self._n_neighbors:
            raise ValueError(
                f"Cannot compute linkability results for `n_neighbors` "
                f"({n_neighbors}) larger than value used by constructor "
                f"({self._n_neighbors}). Use `n_neighbors <= {self._n_neighbors}`."
            )

        n_control = None if self._control_links is None else self._control_links.count_links(n_neighbors=n_neighbors)

        return EvaluationResults(
            n_attacks=self._n_attacks,
            n_success=self._attack_links.count_links(n_neighbors=n_neighbors),
            n_baseline=self._baseline_links.count_links(n_neighbors=n_neighbors),
            n_control=n_control,
            confidence_level=confidence_level,
        )

    def risk(
        self, confidence_level: float = 0.95, baseline: bool = False, n_neighbors: Optional[int] = None
    ) -> "PrivacyRisk":
        """Compute linkability risk.

        The linkability risk reflects how easy linkability attacks are.
        A linkability risk of 1 means that every single attacked record
        could be successfully linked together. A linkability risk of 0
        means that no links were found at all.

        Parameters
        ----------
        confidence_level : float, default is 0.95
            Confidence level for the error bound calculation.
        baseline : bool, default is False
            If True, return the baseline risk computed from a random guessing
            attack. If False (default) return the risk from the real attack.
        n_neighbors : int, default is None
            The number of closest neighbors to include in the analysis.
            If `None` (the default), the number used is the one
            given to the constructor. The value of this parameter must
            be smaller than or equal to what has been used to initialize
            this evaluator.

        Returns
        -------
        PrivacyRisk
            Estimate of the linkability risk and its confidence interval.

        """
        results = self.results(confidence_level=confidence_level, n_neighbors=n_neighbors)

        return results.risk(baseline=baseline)
4 | """Privacy evaluator that measures the singling out risk.""" 5 | import logging 6 | from typing import Any, Callable, Dict, List, Optional, Set, Tuple 7 | 8 | import numpy as np 9 | import numpy.typing as npt 10 | import pandas as pd 11 | from pandas.api.types import is_bool_dtype, is_numeric_dtype 12 | from scipy.optimize import curve_fit 13 | 14 | from anonymeter.stats.confidence import EvaluationResults, PrivacyRisk 15 | 16 | rng = np.random.default_rng() 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | def _escape_quotes(string: str) -> str: 21 | return string.replace('"', '\\"').replace("'", "\\'") 22 | 23 | 24 | def _query_expression(col: str, val: Any, dtype: np.dtype) -> str: 25 | """Generate type-aware query expression.""" 26 | query: str = "" 27 | 28 | if pd.api.types.is_datetime64_any_dtype(dtype): 29 | f"{col} == '{val}'" 30 | elif isinstance(val, str): 31 | query = f"{col} == '{_escape_quotes(val)}'" 32 | else: 33 | query = f"{col} == {val}" 34 | 35 | return query 36 | 37 | 38 | def _query_from_record(record: pd.Series, dtypes: pd.Series, columns: List[str], medians: Optional[pd.Series]) -> str: 39 | """Construct a query from the attributes in a record.""" 40 | query = [] 41 | 42 | for col in sorted(columns): 43 | if pd.isna(record[col]): 44 | item = ".isna()" 45 | elif is_bool_dtype(dtypes[col]): 46 | item = f"== {record[col]}" 47 | elif is_numeric_dtype(dtypes[col]): 48 | if medians is None: 49 | operator = rng.choice([">=", "<="]) 50 | else: 51 | if record[col] > medians[col]: 52 | operator = ">=" 53 | else: 54 | operator = "<=" 55 | item = f"{operator} {record[col]}" 56 | 57 | elif isinstance(dtypes[col], pd.CategoricalDtype) and is_numeric_dtype(dtypes[col].categories.dtype): 58 | item = f"=={record[col]}" 59 | else: 60 | if isinstance(record[col], str): 61 | item = f"== '{_escape_quotes(record[col])}'" 62 | else: 63 | item = f'== "{record[col]}"' 64 | 65 | query.append(f"{col}{item}") 66 | 67 | return " & ".join(query) 68 | 69 | 70 | 
def _random_operator(data_type: str) -> str: 71 | if data_type == "categorical": 72 | ops = ["==", "!="] 73 | elif data_type == "boolean": 74 | ops = ["", "not "] 75 | elif data_type == "numerical": 76 | ops = ["==", "!=", ">", "<", ">=", "<="] 77 | else: 78 | raise ValueError(f"Unknown `data_type`: {data_type}") 79 | 80 | return rng.choice(ops) 81 | 82 | 83 | def _random_query(unique_values: Dict[str, List[Any]], cols: List[str]): 84 | """Generate a random query using given columns.""" 85 | query = [] 86 | 87 | for col in sorted(cols): 88 | values = unique_values[col] 89 | val = rng.choice(values) 90 | 91 | if pd.isna(val): 92 | expression = f"{_random_operator('boolean')}{col}.isna()" 93 | elif is_bool_dtype(values): 94 | expression = f"{_random_operator('boolean')}{col}" 95 | elif isinstance(values, pd.CategoricalDtype): 96 | expression = f"{col} {_random_operator('categorical')} {val}" 97 | elif is_numeric_dtype(values): 98 | expression = f"{col} {_random_operator('numerical')} {val}" 99 | elif isinstance(val, str): 100 | expression = f"{col} {_random_operator('categorical')} '{_escape_quotes(val)}'" 101 | else: 102 | expression = f"{col} {_random_operator('categorical')} '{val}'" 103 | 104 | query.append(expression) 105 | 106 | return " & ".join(query) 107 | 108 | 109 | def _random_queries(df: pd.DataFrame, n_queries: int, n_cols: int) -> List[str]: 110 | random_columns = [rng.choice(df.columns, size=n_cols, replace=False).tolist() for _ in range(n_queries)] 111 | unique_values = {col: df[col].unique() for col in df.columns} 112 | 113 | queries: List[str] = [_random_query(unique_values=unique_values, cols=cols) for cols in random_columns] 114 | 115 | return queries 116 | 117 | 118 | def safe_query_counts(query: str, df: pd.DataFrame) -> Optional[int]: 119 | """Return number of elements satisfying a given query.""" 120 | try: 121 | return len(df.query(query, engine="python")) 122 | except Exception as ex: 123 | logger.debug(f"Query {query} failed with {ex}.") 
def safe_query_counts(query: str, df: pd.DataFrame) -> Optional[int]:
    """Count the rows of ``df`` matching ``query``.

    The query is evaluated with the pandas ``python`` engine. Any failure is
    logged at DEBUG level and reported by returning ``None`` instead of
    raising.
    """
    try:
        n_matches = len(df.query(query, engine="python"))
    except Exception as ex:
        logger.debug(f"Query {query} failed with {ex}.")
        return None

    return n_matches
162 | ) 163 | 164 | return ((n * w_min + 1) * (1 - w_min) ** n - (n * w_max + 1) * (1 - w_max) ** n) / (n + 1) 165 | 166 | 167 | def _measure_queries_success( 168 | df: pd.DataFrame, queries: List[str], n_repeat: int, n_meas: int 169 | ) -> Tuple[npt.NDArray, npt.NDArray]: 170 | sizes, successes = [], [] 171 | min_rows = min(1000, len(df)) 172 | 173 | for n_rows in np.linspace(min_rows, len(df), n_meas).astype(int): 174 | for _ in range(n_repeat): 175 | successes.append(len(_evaluate_queries(df=df.sample(n_rows, replace=False), queries=queries))) 176 | sizes.append(n_rows) 177 | 178 | return np.array(sizes), np.array(successes) 179 | 180 | 181 | def _model(x, w_eff, norm): 182 | return norm * singling_out_probability_integral(n=x, w_min=0, w_max=w_eff) 183 | 184 | 185 | def _fit_model(sizes: npt.NDArray, successes: npt.NDArray) -> Callable: 186 | # initial guesses 187 | w_eff_guess = 1 / np.max(sizes) 188 | norm_guess = 1 / singling_out_probability_integral(n=np.max(sizes), w_min=0, w_max=w_eff_guess) 189 | 190 | popt, _ = curve_fit(_model, xdata=sizes, ydata=successes, bounds=(0, (1, np.inf)), p0=(w_eff_guess, norm_guess)) 191 | 192 | return lambda x: _model(x, *popt) 193 | 194 | 195 | def fit_correction_term(df: pd.DataFrame, queries: List[str]) -> Callable: 196 | """Fit correction for different size of the control dataset. 197 | 198 | Parameters 199 | ---------- 200 | df : pd.DataFrame 201 | Dataframe on which the queries needs to be evaluated. 202 | queries : list of strings 203 | Singling out queries to evaluate on the data. 204 | 205 | Returns 206 | ------- 207 | callable 208 | Model of how the number of queries that singles out 209 | depends on the size of the dataset. 
class UniqueSinglingOutQueries:
    """Collection of unique queries that single out in a DataFrame."""

    def __init__(self):
        # The set gives fast duplicate detection, the list keeps insertion order.
        self._set: Set[str] = set()
        self._list: List[str] = []

    def check_and_append(self, query: str, df: pd.DataFrame):
        """Add a singling out query to the collection.

        The query is stored only if it is not already in the collection
        and if it matches exactly one record of ``df``. Queries that
        fail to evaluate are ignored.

        Parameters
        ----------
        query : str
            query expression to be added.
        df : pd.DataFrame
            Dataframe on which the queries need to single out.

        """
        if query in self._set:
            return

        if safe_query_counts(query=query, df=df) == 1:
            self._set.add(query)
            self._list.append(query)

    def __len__(self):
        """Number of singling out queries stored."""
        return len(self._list)

    @property
    def queries(self) -> List[str]:
        """Queries that are present in the collection."""
        return self._list
def univariate_singling_out_queries(df: pd.DataFrame, n_queries: int) -> List[str]:
    """Generate singling out queries from rare attributes.

    For every column, candidate queries are built from: a missing value
    occurring exactly once, the smallest and largest value of numerical
    columns, and values that appear exactly once. The candidates are
    shuffled and those singling out in ``df`` are collected until
    ``n_queries`` of them are found.

    Parameters
    ----------
    df: pd.DataFrame
        Input dataframe from which queries will be generated.
    n_queries: int
        Number of queries to generate.

    Returns
    -------
    List[str]
        The singling out queries.

    """
    candidates = []

    for column in sorted(df.columns):
        series = df[column]

        if series.isna().sum() == 1:
            candidates.append(f"{column}.isna()")

        if pd.api.types.is_numeric_dtype(df.dtypes[column]):
            ordered = series.dropna().sort_values()

            if len(ordered) > 0:
                candidates.append(f"{column} <= {ordered.iloc[0]}")
                candidates.append(f"{column} >= {ordered.iloc[-1]}")

        frequencies = series.value_counts()
        singletons = frequencies[frequencies == 1]

        for value in singletons.index:
            candidates.append(_query_expression(col=column, val=value, dtype=df.dtypes[column]))

    rng.shuffle(candidates)

    collected = UniqueSinglingOutQueries()

    for candidate in candidates:
        collected.check_and_append(candidate, df=df)

        if len(collected) == n_queries:
            break

    return collected.queries
def multivariate_singling_out_queries(
    df: pd.DataFrame, n_queries: int, n_cols: int, max_attempts: Optional[int]
) -> List[str]:
    """Generates singling out queries from a combination of attributes.

    At every attempt a random record and ``n_cols`` random columns are
    picked and turned into a query, which is kept if it is new and matches
    exactly one record of ``df``.

    Parameters
    ----------
    df: pd.DataFrame
        Input dataframe from which queries will be generated.
    n_queries: int
        Number of queries to generate.
    n_cols: int
        Number of columns that the attacker uses to create the
        singling out queries.
    max_attempts: int, optional.
        Maximum number of attempts that the attacker can make to generate
        the requested ``n_queries`` singling out queries. This is useful to
        avoid excessively long running calculations. There can be combinations
        of hyperparameters (`n_cols`) and datasets that make the task of
        generating enough singling out queries too hard. This parameter
        caps the total number of query generation attempts, both those that
        are successful and those that are not. If ``max_attempts`` is None,
        no limit will be imposed.


    Returns
    -------
    List[str]
        The singling out queries. Fewer than ``n_queries`` are returned
        when ``max_attempts`` is reached first.

    """
    so_queries = UniqueSinglingOutQueries()
    medians = df.median(numeric_only=True)

    n_attempts = 0

    while len(so_queries) < n_queries:
        if max_attempts is not None and n_attempts >= max_attempts:
            logger.warning(
                f"Reached maximum number of attempts {max_attempts} when generating singling out queries. "
                f"Returning {len(so_queries.queries)} instead of the requested {n_queries}. "
                "To avoid this, increase the number of attempts or set it to ``None`` to disable "
                "the limitation entirely."
            )
            return so_queries.queries

        record = df.iloc[rng.integers(df.shape[0])]
        columns = rng.choice(df.columns, size=n_cols, replace=False).tolist()

        query = _query_from_record(record=record, dtypes=df.dtypes, columns=columns, medians=medians)

        so_queries.check_and_append(query=query, df=df)

        # Failed and successful attempts both count against the cap.
        n_attempts += 1

    return so_queries.queries
371 | ) 372 | 373 | success = counts == 1 374 | return [q for iq, q in enumerate(queries) if success[iq]] 375 | 376 | 377 | def _generate_singling_out_queries( 378 | df: pd.DataFrame, mode: str, n_attacks: int, n_cols: int, max_attempts: Optional[int] 379 | ) -> List[str]: 380 | if mode == "univariate": 381 | queries = univariate_singling_out_queries(df=df, n_queries=n_attacks) 382 | 383 | elif mode == "multivariate": 384 | queries = multivariate_singling_out_queries( 385 | df=df, 386 | n_queries=n_attacks, 387 | n_cols=n_cols, 388 | max_attempts=max_attempts, 389 | ) 390 | 391 | else: 392 | raise RuntimeError(f"Parameter `mode` can be either `univariate` or `multivariate`. Got {mode} instead.") 393 | 394 | if len(queries) < n_attacks: 395 | logger.warning( 396 | f"Attack `{mode}` could generate only {len(queries)} " 397 | f"singling out queries out of the requested {n_attacks}. " 398 | "This can probably lead to an underestimate of the " 399 | "singling out risk." 400 | ) 401 | return queries 402 | 403 | 404 | class SinglingOutEvaluator: 405 | """Privacy evaluator that measures the singling out risk. 406 | 407 | Singling out happens when the attacker can determine that 408 | there is a single individual in the dataset that has certain 409 | attributes (for example "zip_code == XXX and first_name == YYY") 410 | with high enough confidence. According to the Article 29 WGP [2], 411 | singling out is one of the three risks (together with 412 | linkability and inference) that a successful anonymization technique 413 | must protect from. 414 | 415 | See [1] for the definition of some of the concepts used here. 416 | 417 | - [1]: https://arxiv.org/abs/1904.06009 418 | - [2]: https://ec.europa.eu/justice/article-29/documentation/\ 419 | opinion-recommendation/files/2014/wp216_en.pdf 420 | 421 | Parameters 422 | ---------- 423 | ori : pd.DataFrame 424 | Original dataframe on which the success of the singling out attacker 425 | attacker will be evaluated. 
class SinglingOutEvaluator:
    """Privacy evaluator that measures the singling out risk.

    Singling out happens when the attacker can determine that
    there is a single individual in the dataset that has certain
    attributes (for example "zip_code == XXX and first_name == YYY")
    with high enough confidence. According to the Article 29 WGP [2],
    singling out is one of the three risks (together with
    linkability and inference) that a successful anonymization technique
    must protect from.

    See [1] for the definition of some of the concepts used here.

    - [1]: https://arxiv.org/abs/1904.06009
    - [2]: https://ec.europa.eu/justice/article-29/documentation/\
           opinion-recommendation/files/2014/wp216_en.pdf

    Parameters
    ----------
    ori : pd.DataFrame
        Original dataframe on which the success of the singling out
        attacker will be evaluated.
    syn : pd.DataFrame
        Synthetic dataframe used to generate the singling out queries.
    n_attacks : int, default is 500
        Number of singling out attacks to attempt.
    n_cols : int, default is 3
        Number of columns that the attacker uses to create the singling
        out queries.
    control : pd.DataFrame (optional)
        Independent sample of original records **not** used to create the
        synthetic dataset. This is used to evaluate the excess privacy risk.
    max_attempts : int or None, default is 10.000.000
        Maximum number of attempts that the attacker can make to generate
        the requested ``n_attacks`` singling out queries. This is useful to
        avoid excessively long running calculations. There can be combinations
        of hyperparameters (`n_cols`) and datasets that make the task of
        generating enough singling out queries too hard. This parameter
        caps the total number of query generation attempts, both those that
        are successful and those that are not. If ``max_attempts`` is None,
        no limit will be imposed.

    """

    def __init__(
        self,
        ori: pd.DataFrame,
        syn: pd.DataFrame,
        n_attacks: int = 500,
        n_cols: int = 3,
        control: Optional[pd.DataFrame] = None,
        max_attempts: Optional[int] = 10000000,
    ):
        # Duplicated rows are dropped from every dataset.
        self._ori = ori.drop_duplicates()
        self._syn = syn.drop_duplicates()
        self._n_attacks = n_attacks
        self._n_cols = n_cols
        self._control = None if control is None else control.drop_duplicates()
        self._max_attempts = max_attempts
        self._queries: List[str] = []
        # Filled by ``evaluate()`` and returned by ``queries(baseline=True)``.
        self._baseline_queries: List[str] = []
        self._evaluated = False

    def queries(self, baseline: bool = False) -> List[str]:
        """Successful singling out queries.

        Parameters
        ----------
        baseline: bool, default is False.
            If True, return the queries used by the baseline attack (i.e.
            created at random). If False (default) return the queries used
            by the "real" attack.

        Returns
        -------
        List[str]:
            successful singling out queries. The list is empty until
            ``evaluate()`` has been run.

        """
        return self._baseline_queries if baseline else self._queries

    def evaluate(self, mode: str = "multivariate") -> "SinglingOutEvaluator":
        """Run the attack and evaluate the guesses on the original dataset.

        Parameters
        ----------
        mode : str, default is "multivariate"
            Name of the algorithm used to generate the singling out queries.
            Could be either `multivariate` or `univariate`.

        Returns
        -------
        self
            The evaluated singling out evaluator.

        Raises
        ------
        ValueError
            If ``mode`` is neither `multivariate` nor `univariate`.

        """
        if mode == "multivariate":
            n_cols = self._n_cols
        elif mode == "univariate":
            n_cols = 1
        else:
            raise ValueError(f"mode must be either 'multivariate' or 'univariate', got {mode} instead.")

        baseline_queries = _random_queries(df=self._syn, n_queries=self._n_attacks, n_cols=n_cols)
        self._baseline_queries = _evaluate_queries(df=self._ori, queries=baseline_queries)
        self._n_baseline = len(self._baseline_queries)

        queries = _generate_singling_out_queries(
            df=self._syn,
            n_attacks=self._n_attacks,
            n_cols=self._n_cols,
            mode=mode,
            max_attempts=self._max_attempts,
        )
        self._queries = _evaluate_queries(df=self._ori, queries=queries)
        self._n_success = len(self._queries)

        if self._control is None:
            self._n_control = None
        else:
            self._n_control = len(_evaluate_queries(df=self._control, queries=queries))

            # correct the number of success against the control set
            # to account for different dataset sizes.
            if len(self._control) != len(self._ori):
                # fit the model to the data:
                fitted_model = fit_correction_term(df=self._control, queries=queries)

                correction = fitted_model(len(self._ori)) / fitted_model(len(self._control))
                self._n_control *= correction

        self._evaluated = True
        return self

    def results(self, confidence_level: float = 0.95) -> "EvaluationResults":
        """Raw evaluation results.

        Parameters
        ----------
        confidence_level : float, default is 0.95
            Confidence level for the error bound calculation.

        Returns
        -------
        EvaluationResults
            Object containing the success rates for the various attacks.

        Raises
        ------
        RuntimeError
            If ``evaluate()`` has not been run yet.

        """
        if not self._evaluated:
            raise RuntimeError("The singling out evaluator wasn't evaluated yet. Please, run `evaluate()` first.")

        return EvaluationResults(
            n_attacks=self._n_attacks,
            n_success=self._n_success,
            n_baseline=self._n_baseline,
            n_control=self._n_control,
            confidence_level=confidence_level,
        )

    def risk(self, confidence_level: float = 0.95, baseline: bool = False) -> "PrivacyRisk":
        """Estimate the singling out risk.

        The risk is estimated comparing the number of successful singling out
        queries to the desired number of attacks (``n_attacks``).

        Parameters
        ----------
        confidence_level : float
            Confidence level for the reported error on the singling out risk.
        baseline : bool, default is False
            If True, return the baseline risk computed from a random guessing
            attack. If False (default) return the risk from the real attack.

        Returns
        -------
        PrivacyRisk
            Estimate of the singling out risk and its confidence interval.

        """
        results = self.results(confidence_level=confidence_level)
        return results.risk(baseline=baseline)
@jit(nopython=True, nogil=True)
def gower_distance(r0: npt.NDArray, r1: npt.NDArray, cat_cols_index: int) -> float:
    r"""Distance between two records inspired by the Gower distance [1].

    To handle mixed type data, the distance is specialized for numerical (continuous)
    and categorical data. For numerical records, we use the L1 norm,
    computed after the columns have been normalized so that :math:`d(a_i, b_i)\leq 1`
    for every :math:`a_i`, :math:`b_i`. For categorical, :math:`d(a_i, b_i)` is 1,
    if the entries :math:`a_i`, :math:`b_i` differ, else, it is 0.

    Notes
    -----
    To keep the balance between numerical and categorical values, the input records
    have to be properly normalized. Their numerical part need to be scaled so that
    the difference between any two values of a column (from both dataset) is *at most* 1.

    References
    ----------
    [1]. Gower (1971) "A general coefficient of similarity and some of its properties."

    Parameters
    ----------
    r0 : npt.NDArray
        Input array of shape (D,).
    r1 : npt.NDArray
        Input array of shape (D,).
    cat_cols_index : int
        Index delimiting the categorical columns in r0/r1 if present. For example,
        ``r0[:cat_cols_index]`` are the numerical columns, and ``r0[cat_cols_index:]`` are
        the categorical ones. For a fully numerical dataset, use ``cat_cols_index =
        len(r0)``. For a fully categorical one, set ``cat_cols_index`` to 0.

    Returns
    -------
    float
        distance between the records.

    """
    dist = 0.0

    for i in range(len(r0)):
        # Two missing values are counted as a full mismatch.
        if isnan(r0[i]) and isnan(r1[i]):
            dist += 1

        else:
            # Numerical column: absolute difference of the values.
            if i < cat_cols_index:
                dist += fabs(r0[i] - r1[i])

            # Categorical column: 1 if the entries differ, 0 otherwise.
            else:
                if r0[i] != r1[i]:
                    dist += 1
    return dist
90 | n_neighbors : int 91 | Determines the number of closest neighbors per entry to be returned. 92 | 93 | Returns 94 | ------- 95 | idx : npt.NDArray[int64] 96 | Array of shape (Nx, n_neighbors). For each element in ``queries``, 97 | this array contains the indices of the closest neighbors in 98 | ``candidates``. That is, ``candidates[idx[i]]`` are the elements of 99 | ``candidates`` that are closer to ``queries[i]``. 100 | lps : npt.NDArray[float64] 101 | Array of shape (Nx, n_neighbors). This array containing the distances 102 | between the record pairs identified by idx. 103 | 104 | """ 105 | idx = np.zeros((queries.shape[0], n_neighbors), dtype=np.int64) 106 | dists = np.zeros((queries.shape[0], n_neighbors), dtype=np.float64) 107 | 108 | for ix in range(queries.shape[0]): 109 | dist_ix = np.zeros((candidates.shape[0]), dtype=np.float64) 110 | 111 | for iy in range(candidates.shape[0]): 112 | dist_ix[iy] = gower_distance(r0=queries[ix], r1=candidates[iy], cat_cols_index=cat_cols_index) 113 | 114 | close_match_idx = dist_ix.argsort()[:n_neighbors] 115 | idx[ix] = close_match_idx 116 | dists[ix] = dist_ix[close_match_idx] 117 | 118 | return idx, dists 119 | 120 | 121 | class MixedTypeKNeighbors: 122 | """Nearest neighbor algorithm for mixed type data. 123 | 124 | To handle mixed type data, we use a distance function inspired by the Gower similarity. 125 | The distance is specialized for numerical (continuous) and categorical data. For 126 | numerical records, we use the L1 norm, computed after the columns have been 127 | normalized so that :math:`d(a_i, b_i) <= 1` for every :math:`a_i`, :math:`b_i`. 128 | For categorical, :math:`d(a_i, b_i)` is 1, if the entries :math:`a_i`, :math:`b_i` 129 | differ, else, it is 0. 130 | 131 | References 132 | ---------- 133 | [1]. `Gower (1971) "A general coefficient of similarity and some of its properties. 
134 | `_ 135 | 136 | Parameters 137 | ---------- 138 | n_neighbors : int, default is 5 139 | Determines the number of closest neighbors per entry to be returned. 140 | n_jobs : int, default is -2 141 | Number of jobs to use. It follows joblib convention, so that ``n_jobs = -1`` 142 | means all available cores. 143 | 144 | """ 145 | 146 | def __init__(self, n_neighbors: int = 5, n_jobs: int = -2): 147 | self._n_neighbors = n_neighbors 148 | self._n_jobs = n_jobs 149 | 150 | def fit(self, candidates: pd.DataFrame, ctypes: Optional[Dict[str, List[str]]] = None): 151 | """Prepare for nearest neighbor search. 152 | 153 | Parameters 154 | ---------- 155 | candidates : pd.DataFrame 156 | Dataset containing the records one would find the neighbors in. 157 | ctypes : dict, optional. 158 | Dictionary specifying which columns in X should be treated as 159 | continuous and which should be treated as categorical. For example, 160 | ``ctypes = {'num': ['distance'], 'cat': ['color']}`` specify the types 161 | of a two column dataset. 162 | 163 | """ 164 | self._candidates = candidates 165 | self._ctypes = ctypes 166 | return self 167 | 168 | def kneighbors( 169 | self, queries: pd.DataFrame, n_neighbors: Optional[int] = None, return_distance: bool = False 170 | ) -> Union[Tuple[npt.NDArray, npt.NDArray], npt.NDArray]: 171 | """Find the nearest neighbors for a set of query points. 172 | 173 | Note 174 | ---- 175 | The search is performed in a brute-force fashion. For large datasets 176 | or large number of query points, the search for nearest neighbor will 177 | become very slow. 178 | 179 | Parameters 180 | ---------- 181 | queries : pd.DataFrame 182 | Query points for the nearest neighbor searches. 183 | n_neighbors : int, default is None 184 | Number of neighbors required for each sample. 185 | The default is the value passed to the constructor. 186 | return_distance : bool, default is False 187 | Whether or not to return the distances of the neigbors or 188 | just the indexes. 
189 | 190 | Returns 191 | ------- 192 | np.narray of shape (df.shape[0], n_neighbors) 193 | Array with the indexes of the elements of the fit dataset closer to 194 | each element in the query dataset. 195 | np.narray of shape (df.shape[0], n_neighbors) 196 | Array with the distances of the neighbors pairs. This is optional and 197 | it is returned only if ``return_distances`` is ``True`` 198 | 199 | """ 200 | if n_neighbors is None: 201 | n_neighbors = self._n_neighbors 202 | 203 | if n_neighbors > self._candidates.shape[0]: 204 | logger.warning( 205 | f"Parameter ``n_neighbors``={n_neighbors} cannot be " 206 | f"larger than the size of the training data {self._candidates.shape[0]}." 207 | ) 208 | n_neighbors = self._candidates.shape[0] 209 | 210 | if self._ctypes is None: 211 | self._ctypes = detect_consistent_col_types(df1=self._candidates, df2=queries) 212 | candidates, queries = mixed_types_transform( 213 | df1=self._candidates, df2=queries, num_cols=self._ctypes["num"], cat_cols=self._ctypes["cat"] 214 | ) 215 | 216 | cols = self._ctypes["num"] + self._ctypes["cat"] 217 | queries = queries[cols].values 218 | candidates = candidates[cols].values 219 | 220 | with Parallel(n_jobs=self._n_jobs, backend="threading") as executor: 221 | res = executor( 222 | delayed(_nearest_neighbors)( 223 | queries=queries[ii : ii + 1], 224 | candidates=candidates, 225 | cat_cols_index=len(self._ctypes["num"]), 226 | n_neighbors=n_neighbors, 227 | ) 228 | for ii in range(queries.shape[0]) 229 | ) 230 | 231 | indexes_array, distances_array = zip(*res) 232 | indexes, distances = np.vstack(indexes_array), np.vstack(distances_array) 233 | 234 | if return_distance: 235 | return distances, indexes 236 | 237 | return indexes 238 | -------------------------------------------------------------------------------- /src/anonymeter/preprocessing/__init__.py: -------------------------------------------------------------------------------- 
# ------------------------------------------------------------------------------
# /src/anonymeter/preprocessing/transformations.py
# ------------------------------------------------------------------------------
# This file is part of Anonymeter and is released under BSD 3-Clause Clear License.
# Copyright (c) 2022 Anonos IP LLC.
# See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details.
"""Data pre-processing and transformations for the privacy evaluators."""
import logging
from typing import List, Tuple

import pandas as pd

logger = logging.getLogger(__name__)


def _encode_categorical(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Encode dataframes with categorical values keeping labels consistent.

    Each column is label encoded with an encoder fitted on the values of *both*
    dataframes, so that the same value gets the same integer code in the two
    outputs.
    """
    # Imported here so that the purely numerical code paths of this module
    # do not require scikit-learn.
    from sklearn.preprocessing import LabelEncoder

    # Stack the two frames under distinct keys so they can be split again below.
    encoded = pd.concat((df1, df2), keys=["df1", "df2"])

    for col in encoded.columns:
        encoded[col] = LabelEncoder().fit_transform(encoded[col])

    return encoded.loc["df1"], encoded.loc["df2"]


def _scale_numerical(df1: pd.DataFrame, df2: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Scale dataframes with *only* numerical values.

    Every column is divided by its range computed across both dataframes.
    Columns whose range is zero are left unscaled.

    Raises
    ------
    RuntimeError
        If the scaling unexpectedly produces a Series instead of a DataFrame.

    """
    # Column-wise extrema across both dataframes. The row-wise min/max skip NaN,
    # so a column that is empty or entirely NaN in one dataframe falls back to
    # the extremum of the other one instead of turning the range into NaN.
    mins = pd.concat([df1.min(), df2.min()], axis=1).min(axis=1)
    maxs = pd.concat([df1.max(), df2.max()], axis=1).max(axis=1)
    ranges = maxs - mins

    null_range = ranges == 0
    if null_range.any():
        # str() so that non-string column labels (e.g. integers) can be reported
        cnames = ", ".join(str(name) for name in ranges[null_range].index.values)
        logger.debug(
            f"Numerical column(s) {cnames} have a null-range: all elements "
            "have the same value. These column(s) won't be scaled."
        )
        ranges[null_range] = 1

    df1_scaled = df1.apply(lambda x: x / ranges[x.name])
    df2_scaled = df2.apply(lambda x: x / ranges[x.name])
    if isinstance(df1_scaled, pd.Series) or isinstance(df2_scaled, pd.Series):
        raise RuntimeError("Unexpected error: scaling resulted in a Series.")

    return df1_scaled, df2_scaled


def mixed_types_transform(
    df1: pd.DataFrame, df2: pd.DataFrame, num_cols: List[str], cat_cols: List[str]
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Combination of an encoder and a scaler to treat mixed type data.

    Numerical columns are scaled by dividing them by their range across both
    datasets, so that the difference between any two values within a column will
    be smaller than or equal to one:
    x -> x' = x / (max{max(x), max(x_other)} - min{min(x), min(x_other)})

    Categorical columns are label encoded. The encoding is fitted on the values
    of both dataframes together, so that labels are consistent between them.

    Parameters
    ----------
    df1: pd.DataFrame.
        Input DataFrame.
    df2: pd.DataFrame.
        Second input DataFrame.
    num_cols: list[str].
        Names of the numerical columns to be processed.
    cat_cols: list[str].
        Names of the categorical columns to be processed.

    Returns
    -------
    trans_df1: pd.DataFrame.
        Transformed df1.
    trans_df2: pd.DataFrame.
        Transformed df2.

    Raises
    ------
    ValueError
        If the two dataframes have different columns, or if ``num_cols`` and
        ``cat_cols`` together do not match the dataframes' columns.

    """
    if not set(df1.columns) == set(df2.columns):
        raise ValueError(f"Input dataframes have different columns. df1: {df1.columns}, df2: {df2.columns}.")

    if not set(num_cols + cat_cols) == set(df1.columns):
        raise ValueError(
            f"Dataframes columns {df1.columns} do not match "
            "with `num_cols` and `cat_cols`.\n"
            f"num_cols: {num_cols}\n"
            f"cat_cols: {cat_cols}"
        )

    df1_num, df2_num = pd.DataFrame(), pd.DataFrame()
    if len(num_cols) > 0:
        df1_num, df2_num = _scale_numerical(df1[num_cols], df2[num_cols])

    df1_cat, df2_cat = pd.DataFrame(), pd.DataFrame()
    if len(cat_cols) > 0:
        df1_cat, df2_cat = _encode_categorical(df1[cat_cols], df2[cat_cols])

    # Re-assemble the two parts and restore each input's original column order.
    df1_out = pd.concat([df1_num, df1_cat], axis=1)[df1.columns]
    df2_out = pd.concat([df2_num, df2_cat], axis=1)[df2.columns]
    return df1_out, df2_out


# ------------------------------------------------------------------------------
# /src/anonymeter/preprocessing/type_detection.py
# ------------------------------------------------------------------------------
# This file is part of Anonymeter and is released under BSD 3-Clause Clear License.
# Copyright (c) 2022 Anonos IP LLC.
# See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details.
from typing import Dict, List

import pandas as pd


def detect_col_types(df: pd.DataFrame) -> Dict[str, List[str]]:
    """Identify numerical and non-numerical columns in the dataframe.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    Dict[str: List[str]]
        Dictionary with column names separated by types. Key of the dictionary are
        'num' or 'cat' (numerical and non-numerical, that is categorical, resp.).
        Values are sorted lists of column names.

    """
    num_cols: List[str] = list(df.select_dtypes("number").columns.values)
    # set for constant-time membership tests while filtering the other columns
    numerical = set(num_cols)
    cat_cols: List[str] = [cn for cn in df.columns.values if cn not in numerical]

    return {"num": sorted(num_cols), "cat": sorted(cat_cols)}


def detect_consistent_col_types(df1: pd.DataFrame, df2: pd.DataFrame) -> Dict[str, List[str]]:
    """Detect column types for a pair of dataframes and check that they are the same.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Input dataframe
    df2 : pandas.DataFrame
        Input dataframe

    Returns
    -------
    Dict[str: List[str]]
        Dictionary with column names separated by types. Key of the dictionary are
        'num' or 'cat' (numerical and non-numerical, that is categorical, resp.).
        Values are lists of column names.

    Raises
    ------
    RuntimeError
        If the column names or their detected types differ between the dataframes.

    """
    ctypes1 = detect_col_types(df1)

    if ctypes1 != detect_col_types(df2):
        raise RuntimeError("Input dataframes have different column names/types.")

    return ctypes1


# ------------------------------------------------------------------------------
# /src/anonymeter/stats/confidence.py
# ------------------------------------------------------------------------------
# This file is part of Anonymeter and is released under BSD 3-Clause Clear License.
# Copyright (c) 2022 Anonos IP LLC.
# See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details.
"""Functions for estimating rates and errors in privacy attacks."""

import warnings
from math import isinf, sqrt
from typing import NamedTuple, Optional, Tuple

from scipy.stats import norm


class PrivacyRisk(NamedTuple):
    """Measure of a privacy risk.

    Parameters
    ----------
    value : float
        Best estimate of the privacy risk.
    ci : (float, float)
        Confidence interval on the best estimate.

    """

    value: float
    ci: Tuple[float, float]


class SuccessRate(NamedTuple):
    """Estimate of the success rate of a privacy attack.

    Parameters
    ----------
    value : float
        Best estimate of the success rate of the attacker.
    error : float
        Error on the best estimate.

    """

    value: float
    error: float

    def to_risk(self) -> PrivacyRisk:
        """Convert attacker success rate to `PrivacyRisk`."""
        return bind_value(point_estimate=self.value, error_bound=self.error)


def probit(confidence_level: float) -> float:
    """Compute the probit for the given confidence level.

    This is the quantile of the standard normal distribution such that the
    symmetric interval around zero contains ``confidence_level`` of the
    probability mass. It is infinite for ``confidence_level = 1``.
    """
    result = norm.ppf(0.5 * (1.0 + confidence_level))
    if not isinstance(result, float):
        raise RuntimeError("Unexpected error: probit resulted in a non-float value.")
    return result


def success_rate(n_total: int, n_success: int, confidence_level: float) -> SuccessRate:
    """Estimate success rate in a Bernoulli-distributed sample.

    Attack scores follow a Bernoulli distribution (success/failure with rates p/1-p).
    The Wilson score interval is a frequentist-type estimator for success rate and
    confidence which is robust in problematic cases (e.g., when p goes extreme or
    sample size is small). The estimated rate is a weighted average between the
    MLE result and 0.5 which, however, in the sample sizes used in privacy attacks
    does not differ visibly from the MLE outcome.

    Parameters
    ----------
    n_total : int
        Size of the sample.
    n_success : int
        Number of successful trials in the sample.
    confidence_level : float
        Confidence level for the error estimation. Must be in [0, 1].

    Returns
    -------
    SuccessRate
        Point estimate for the success rate (``value``) and error bound of
        the point-estimated rate for the requested confidence level (``error``).

    Raises
    ------
    ValueError
        If ``confidence_level`` is outside of [0, 1].
    ZeroDivisionError
        If ``n_total`` is zero.

    Notes
    -----
    E.B. WILSON
    Probable inference, the law of succession, and statistical inference
    Journal of the American Statistical Association 22, 209-212 (1927)
    DOI 10.1080/01621459.1927.10502953

    """
    if confidence_level > 1 or confidence_level < 0:
        raise ValueError(f"Parameter `confidence_level` must be between 0 and 1. Got {confidence_level} instead.")

    z = probit(confidence_level)
    n_success_var = n_success * (n_total - n_success) / n_total

    if isinf(z):
        # confidence_level == 1: the formulas below would evaluate to inf / inf.
        # Their limit for z -> inf is a rate of 0.5 with an error of 0.5,
        # i.e. the interval covering every possible rate.
        return SuccessRate(value=0.5, error=0.5)

    z_squared = z * z
    denominator = n_total + z_squared

    rate = (n_success + 0.5 * z_squared) / denominator
    error = (z / denominator) * sqrt(n_success_var + 0.25 * z_squared)
    return SuccessRate(value=rate, error=error)


def residual_success(
    attack_rate: SuccessRate,
    control_rate: SuccessRate,
) -> SuccessRate:
    """Compute residual success in a privacy attack.

    Residual success is defined as the excess of training attack
    success over control attack success, normalized w.r.t.
    the margin of improvement (unsuccessful attacks on control).
    It is undefined when the control success rate is exactly 1, since
    the normalization then divides by zero.

    Parameters
    ----------
    attack_rate : SuccessRate
        Success rate on training data.
    control_rate : SuccessRate
        Success rate on control data.

    Returns
    -------
    SuccessRate
        Residual success score without sign correction (i.e., negative
        outcome if control more attack-able than training). The correction
        would yield ``0 ≤ score ≤ 1`` (zero for negative uncorrected score).
        The error estimate is the propagated error bound of the residual
        success rate.

    """
    residual = (attack_rate.value - control_rate.value) / (1.0 - control_rate.value)

    # propagate the error using
    # dF = sqrt[ (dF/dx)^2 dx^2 + (dF/dy)^2 dy^2 + ... ]
    der_wrt_attack = 1 / abs(1 - control_rate.value)
    der_wrt_control = (attack_rate.value - 1) / (1 - control_rate.value) ** 2

    error = sqrt((attack_rate.error * der_wrt_attack) ** 2 + (control_rate.error * der_wrt_control) ** 2)

    return SuccessRate(value=residual, error=error)


def bind_value(point_estimate: float, error_bound: float) -> PrivacyRisk:
    """Force point_estimate and error into fixed bounds.

    Parameters
    ----------
    point_estimate : float
        Point estimate of a rate or risk value.
    error_bound : float
        Symmetric error around the point estimate.


    Returns
    -------
    PrivacyRisk
        The point estimate clipped to the interval [0, 1] (``value``) and the
        (possibly asymmetric) confidence interval, whose limits are also
        clipped to [0, 1] (``ci``).

    """
    bound_point = min(max(point_estimate, 0.0), 1.0)
    bound_lower = min(max(point_estimate - error_bound, 0.0), 1.0)
    bound_upper = min(max(point_estimate + error_bound, 0.0), 1.0)
    return PrivacyRisk(value=bound_point, ci=(bound_lower, bound_upper))


class EvaluationResults:
    """Results of a privacy evaluator.

    This class will compute the attacker's success rates
    and estimate for the corresponding privacy risk.

    Parameters
    ----------
    n_attacks : int
        Total number of attacks performed.
    n_success : int
        Number of successful attacks.
    n_baseline : int
        Number of successful attacks for the
        baseline (i.e. random-guessing) attacker.
    n_control : int, default is None
        Number of successful attacks against the
        control dataset. If this parameter is not None
        the privacy risk will be measured relative to
        the attacker success on the control set.
    confidence_level : float, default is 0.95
        Desired confidence level for the confidence
        intervals on the risk.

    """

    def __init__(
        self,
        n_attacks: int,
        n_success: int,
        n_baseline: int,
        n_control: Optional[int] = None,
        confidence_level: float = 0.95,
    ):
        self.attack_rate = success_rate(n_total=n_attacks, n_success=n_success, confidence_level=confidence_level)

        self.baseline_rate = success_rate(n_total=n_attacks, n_success=n_baseline, confidence_level=confidence_level)

        self.control_rate = (
            None
            if n_control is None
            else success_rate(n_total=n_attacks, n_success=n_control, confidence_level=confidence_level)
        )

        self.n_attacks = n_attacks
        self.n_success = n_success
        self.n_baseline = n_baseline
        self.n_control = n_control

        self._sanity_check()

    def _sanity_check(self):
        """Warn about results that cannot be interpreted as a privacy risk."""
        if self.baseline_rate.value >= self.attack_rate.value:
            warnings.warn(
                "Attack is as good or worse as baseline model. "
                f"Estimated rates: attack = {self.attack_rate.value}, "
                f"baseline = {self.baseline_rate.value}. "
                "Analysis results cannot be trusted.",
                stacklevel=2,
            )

        if self.control_rate is not None and self.control_rate.value == 1:
            warnings.warn("Success of control attack is 100%. Cannot measure residual privacy risk.", stacklevel=2)

    def risk(self, baseline: bool = False) -> PrivacyRisk:
        """Estimate the privacy risk.

        Parameters
        ----------
        baseline : bool, default is False
            If True, return the risk of the baseline attack instead of the
            risk of the main attack.

        Returns
        -------
        PrivacyRisk
            Risk derived from the baseline rate if ``baseline`` is True.
            Otherwise the risk derived from the attack rate, measured relative
            to the control attack (residual success) when a control rate is
            available.

        """
        if baseline:
            return self.baseline_rate.to_risk()

        if self.control_rate is None:
            return self.attack_rate.to_risk()

        return residual_success(attack_rate=self.attack_rate, control_rate=self.control_rate).to_risk()
27,Private,HS-grad,9,Separated,Handlers-cleaners,Own-child,White,Female,0,1594,25,United-States,<=50K 12 | 21,Self-emp-not-inc,Some-college,10,Never-married,Farming-fishing,Own-child,White,Male,0,0,40,United-States,<=50K 13 | 25,Local-gov,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Female,0,0,40,United-States,<=50K 14 | 29,Private,Masters,14,Never-married,Sales,Not-in-family,White,Male,0,0,50,United-States,>50K 15 | 36,Private,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,64,United-States,>50K 16 | 52,Private,5th-6th,3,Widowed,Other-service,Unmarried,White,Female,0,0,40,Mexico,<=50K 17 | 37,Self-emp-inc,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,60,United-States,<=50K 18 | 52,Private,7th-8th,4,Divorced,Machine-op-inspct,Not-in-family,White,Female,0,0,64,United-States,<=50K 19 | 57,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K 20 | 63,Private,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,Scotland,<=50K 21 | 40,Private,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 22 | 34,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,60,United-States,<=50K 23 | 41,Self-emp-inc,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States,>50K 24 | 32,Private,Bachelors,13,Divorced,Exec-managerial,Not-in-family,White,Female,0,0,40,United-States,<=50K 25 | 58,Self-emp-not-inc,Some-college,10,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,45,United-States,<=50K 26 | 31,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,91,United-States,<=50K 27 | 46,Private,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,3103,0,60,United-States,>50K 28 | 26,Private,Some-college,10,Separated,Sales,Unmarried,White,Female,0,0,35,United-States,<=50K 29 | 
53,Private,12th,8,Divorced,Transport-moving,Not-in-family,White,Female,0,0,40,United-States,<=50K 30 | 42,Local-gov,HS-grad,9,Widowed,Transport-moving,Unmarried,White,Female,0,0,40,United-States,<=50K 31 | 44,Private,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,<=50K 32 | 33,Private,Some-college,10,Never-married,Prof-specialty,Not-in-family,White,Female,3674,0,16,United-States,<=50K 33 | 27,Private,HS-grad,9,Divorced,Adm-clerical,Other-relative,White,Female,0,0,40,United-States,<=50K 34 | 55,?,HS-grad,9,Married-civ-spouse,?,Wife,White,Female,0,0,6,United-States,>50K 35 | 29,State-gov,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,40,United-States,<=50K 36 | 19,Private,HS-grad,9,Never-married,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K 37 | 52,Private,HS-grad,9,Widowed,Other-service,Unmarried,Asian-Pac-Islander,Female,0,0,40,India,>50K 38 | 44,Private,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K 39 | 17,Private,11th,7,Never-married,Sales,Own-child,White,Female,0,0,20,United-States,<=50K 40 | 53,Private,Bachelors,13,Never-married,Sales,Unmarried,White,Male,0,1669,50,United-States,<=50K 41 | 38,Private,Some-college,10,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K 42 | 33,Private,Assoc-voc,11,Married-civ-spouse,Tech-support,Husband,Asian-Pac-Islander,Male,0,0,10,United-States,<=50K 43 | 19,Local-gov,Some-college,10,Never-married,Protective-serv,Own-child,White,Male,0,1721,35,United-States,<=50K 44 | 31,Private,5th-6th,3,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,40,Mexico,<=50K 45 | 53,Private,9th,5,Married-civ-spouse,Other-service,Husband,White,Male,0,0,40,Dominican-Republic,<=50K 46 | 22,Private,Some-college,10,Never-married,Adm-clerical,Own-child,Black,Female,0,0,35,United-States,<=50K 47 | 
32,Federal-gov,HS-grad,9,Never-married,Exec-managerial,Unmarried,White,Female,0,1380,40,United-States,<=50K 48 | 50,Self-emp-inc,HS-grad,9,Married-civ-spouse,Sales,Wife,White,Female,0,0,30,United-States,<=50K 49 | 39,Private,Assoc-acdm,12,Separated,Prof-specialty,Not-in-family,White,Female,0,0,30,United-States,<=50K 50 | 41,Local-gov,Some-college,10,Married-civ-spouse,Adm-clerical,Wife,Black,Female,0,0,40,United-States,<=50K 51 | 47,Private,5th-6th,3,Separated,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K 52 | 31,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,5013,0,32,United-States,<=50K 53 | 37,Private,HS-grad,9,Divorced,Craft-repair,Unmarried,White,Female,0,0,48,United-States,<=50K 54 | 68,?,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,0,45,United-States,<=50K 55 | 36,State-gov,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,7298,0,40,United-States,>50K 56 | 36,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Wife,Asian-Pac-Islander,Female,0,0,40,Philippines,>50K 57 | 19,Private,HS-grad,9,Never-married,Farming-fishing,Not-in-family,White,Male,0,0,40,United-States,<=50K 58 | 52,Private,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,<=50K 59 | 60,Private,HS-grad,9,Never-married,Adm-clerical,Not-in-family,Black,Female,0,0,38,United-States,<=50K 60 | 51,Private,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,0,0,47,United-States,>50K 61 | 45,Private,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 62 | 37,Private,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,24,United-States,<=50K 63 | 53,Federal-gov,HS-grad,9,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,United-States,<=50K 64 | 30,Private,Bachelors,13,Never-married,Exec-managerial,Not-in-family,White,Male,0,0,40,United-States,<=50K 65 | 
45,Private,Doctorate,16,Separated,Exec-managerial,Unmarried,White,Male,0,0,40,United-States,>50K 66 | 24,Private,HS-grad,9,Never-married,Other-service,Not-in-family,White,Female,0,0,30,United-States,<=50K 67 | 37,?,Assoc-acdm,12,Married-civ-spouse,?,Husband,White,Male,0,0,32,United-States,<=50K 68 | 58,Private,10th,6,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,50,United-States,>50K 69 | 24,Private,Bachelors,13,Never-married,Sales,Not-in-family,White,Female,0,0,50,United-States,<=50K 70 | 23,Private,HS-grad,9,Never-married,Handlers-cleaners,Not-in-family,White,Male,0,0,30,United-States,<=50K 71 | 21,Private,Some-college,10,Never-married,Other-service,Own-child,White,Male,0,0,40,United-States,<=50K 72 | 39,Private,HS-grad,9,Divorced,Machine-op-inspct,Unmarried,White,Male,0,1726,40,United-States,<=50K 73 | 32,Private,Some-college,10,Never-married,Machine-op-inspct,Other-relative,White,Female,0,2205,40,Holand-Netherlands,<=50K 74 | 40,Private,7th-8th,4,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,60,United-States,<=50K 75 | 26,Private,Bachelors,13,Never-married,Adm-clerical,Own-child,White,Male,0,0,30,United-States,<=50K 76 | 58,?,Some-college,10,Never-married,?,Not-in-family,White,Female,0,0,40,United-States,<=50K 77 | 35,Self-emp-not-inc,Masters,14,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States,>50K 78 | 22,Private,Assoc-voc,11,Never-married,Other-service,Not-in-family,White,Female,0,1762,40,United-States,<=50K 79 | 18,Private,Some-college,10,Never-married,Sales,Own-child,White,Female,0,0,35,United-States,<=50K 80 | 57,Private,HS-grad,9,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,<=50K 81 | 55,Federal-gov,Bachelors,13,Married-spouse-absent,Exec-managerial,Not-in-family,Black,Male,0,0,40,United-States,>50K 82 | 50,Self-emp-not-inc,Some-college,10,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,55,United-States,<=50K 83 | 
25,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,<=50K 84 | 43,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,2377,50,United-States,<=50K 85 | 40,Private,11th,7,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,55,United-States,<=50K 86 | 32,Private,HS-grad,9,Married-civ-spouse,Sales,Wife,White,Female,0,0,40,United-States,<=50K 87 | 23,Private,Assoc-voc,11,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,<=50K 88 | 36,?,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,0,15,United-States,<=50K 89 | 53,Private,Bachelors,13,Never-married,Other-service,Not-in-family,Asian-Pac-Islander,Female,0,0,21,Japan,>50K 90 | 20,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,<=50K 91 | 64,Private,11th,7,Married-civ-spouse,Craft-repair,Husband,White,Male,0,2179,40,United-States,<=50K 92 | 37,Private,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,15024,0,60,United-States,>50K 93 | 40,Private,Bachelors,13,Married-spouse-absent,Sales,Not-in-family,White,Male,0,0,40,United-States,>50K 94 | 23,Local-gov,Some-college,10,Never-married,Adm-clerical,Own-child,White,Female,0,0,20,United-States,<=50K 95 | 43,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Black,Male,7688,0,40,United-States,>50K 96 | 46,Local-gov,Assoc-acdm,12,Divorced,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K 97 | 36,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103,0,45,United-States,>50K 98 | 52,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,1887,40,United-States,>50K 99 | 22,Private,10th,6,Never-married,Craft-repair,Own-child,White,Male,0,0,15,United-States,<=50K 100 | 29,Private,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,40,United-States,<=50K 101 | 
28,Self-emp-not-inc,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,60,United-States,<=50K 102 | -------------------------------------------------------------------------------- /tests/datasets/adults_syn.csv: -------------------------------------------------------------------------------- 1 | age,type_employer,education,education_num,marital,occupation,relationship,race,sex,capital_gain,capital_loss,hr_per_week,country,income 2 | 49,Private,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,48,United-States,<=50K 3 | 57,Private,Bachelors,13,Married-civ-spouse,Sales,Wife,White,Female,0,0,40,United-States,<=50K 4 | 22,Private,10th,6,Never-married,Machine-op-inspct,Not-in-family,White,Male,0,0,30,United-States,<=50K 5 | 60,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,48,United-States,<=50K 6 | 90,Private,11th,7,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K 7 | 72,Self-emp-not-inc,Prof-school,15,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,10,United-States,<=50K 8 | 68,Self-emp-inc,11th,7,Married-civ-spouse,Sales,Husband,White,Male,0,1258,40,United-States,<=50K 9 | 27,?,Some-college,10,Married-civ-spouse,?,Wife,White,Female,0,0,40,United-States,>50K 10 | 18,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Female,0,0,20,United-States,<=50K 11 | 30,Local-gov,9th,5,Divorced,Farming-fishing,Unmarried,White,Female,0,0,40,Mexico,<=50K 12 | 34,Self-emp-not-inc,11th,7,Married-civ-spouse,Sales,Wife,White,Female,0,0,30,United-States,<=50K 13 | 18,?,11th,7,Never-married,?,Own-child,White,Female,0,0,25,United-States,<=50K 14 | 34,Local-gov,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,35,United-States,<=50K 15 | 50,Local-gov,Bachelors,13,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,44,United-States,>50K 16 | 
45,Self-emp-not-inc,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,38,United-States,>50K 17 | 35,Self-emp-inc,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,50,United-States,>50K 18 | 21,Private,Some-college,10,Never-married,Sales,Not-in-family,White,Female,0,0,50,United-States,<=50K 19 | 53,Private,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,1977,40,United-States,>50K 20 | 35,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,40,United-States,<=50K 21 | 20,Private,Some-college,10,Never-married,Sales,Own-child,White,Female,0,0,15,United-States,<=50K 22 | 18,Private,11th,7,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,20,United-States,<=50K 23 | 45,Self-emp-inc,Assoc-voc,11,Divorced,Sales,Unmarried,White,Female,0,0,30,United-States,<=50K 24 | 29,Private,HS-grad,9,Never-married,Other-service,Unmarried,Black,Female,0,0,40,Japan,<=50K 25 | 57,Private,10th,6,Married-civ-spouse,Other-service,Husband,White,Male,0,0,40,United-States,<=50K 26 | 40,Private,Some-college,10,Never-married,Exec-managerial,Not-in-family,White,Male,0,0,50,United-States,>50K 27 | 52,Private,Some-college,10,Divorced,Adm-clerical,Unmarried,White,Female,0,0,45,United-States,<=50K 28 | 50,Private,Some-college,10,Divorced,Craft-repair,Not-in-family,Black,Female,0,0,45,United-States,<=50K 29 | 53,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,Japan,<=50K 30 | 37,Local-gov,Some-college,10,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,44,United-States,<=50K 31 | 56,Private,Bachelors,13,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,<=50K 32 | 62,Private,HS-grad,9,Widowed,Craft-repair,Unmarried,Black,Female,0,0,40,United-States,<=50K 33 | 32,Private,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,2829,0,40,?,<=50K 34 | 
57,Private,HS-grad,9,Widowed,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,<=50K 35 | 45,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,>50K 36 | 42,Private,Masters,14,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,55,United-States,>50K 37 | 53,Private,Masters,14,Divorced,Sales,Not-in-family,White,Female,0,0,40,United-States,<=50K 38 | 28,Private,5th-6th,3,Never-married,Craft-repair,Other-relative,White,Male,0,0,40,Mexico,<=50K 39 | 28,Private,Some-college,10,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,United-States,<=50K 40 | 57,?,Bachelors,13,Married-civ-spouse,?,Husband,White,Male,0,0,40,United-States,>50K 41 | 83,Self-emp-inc,HS-grad,9,Divorced,Sales,Not-in-family,White,Male,0,0,20,United-States,<=50K 42 | 33,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,55,United-States,>50K 43 | 25,Private,Assoc-acdm,12,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K 44 | 29,Private,HS-grad,9,Never-married,Other-service,Not-in-family,White,Male,0,0,40,United-States,<=50K 45 | 43,Private,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,15024,0,50,United-States,>50K 46 | 27,Private,Bachelors,13,Never-married,Exec-managerial,Not-in-family,White,Female,0,0,40,United-States,<=50K 47 | 40,Private,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K 48 | 24,Private,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,0,0,40,United-States,<=50K 49 | 27,Private,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,45,United-States,>50K 50 | 39,Private,HS-grad,9,Married-civ-spouse,Other-service,Husband,Black,Male,0,0,40,United-States,<=50K 51 | 57,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,55,United-States,>50K 52 | 28,Private,Bachelors,13,Never-married,Sales,Not-in-family,White,Male,0,0,40,United-States,<=50K 53 | 
17,Private,11th,7,Never-married,Sales,Own-child,White,Male,0,0,30,United-States,<=50K 54 | 27,Private,Assoc-voc,11,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States,<=50K 55 | 49,Local-gov,Masters,14,Separated,Prof-specialty,Unmarried,White,Female,0,0,50,United-States,<=50K 56 | 64,Private,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,2635,0,40,United-States,<=50K 57 | 21,Private,Preschool,1,Never-married,Farming-fishing,Not-in-family,White,Male,0,0,50,Mexico,<=50K 58 | 34,Self-emp-inc,Bachelors,13,Never-married,Exec-managerial,Not-in-family,White,Male,0,0,50,United-States,<=50K 59 | 49,Self-emp-not-inc,HS-grad,9,Divorced,Transport-moving,Not-in-family,White,Male,0,0,70,United-States,<=50K 60 | 49,Private,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,50,United-States,>50K 61 | 42,Private,HS-grad,9,Divorced,Other-service,Not-in-family,White,Female,0,0,40,United-States,<=50K 62 | 47,Private,Some-college,10,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,<=50K 63 | 27,Private,10th,6,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,40,United-States,<=50K 64 | 44,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Female,0,0,48,United-States,<=50K 65 | 29,Private,Bachelors,13,Never-married,Exec-managerial,Not-in-family,White,Male,0,1590,50,United-States,<=50K 66 | 25,Federal-gov,HS-grad,9,Never-married,Handlers-cleaners,Not-in-family,Amer-Indian-Eskimo,Male,0,0,40,United-States,<=50K 67 | 23,Self-emp-inc,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,<=50K 68 | 67,?,9th,5,Married-civ-spouse,?,Husband,White,Male,0,0,15,United-States,<=50K 69 | 39,Self-emp-not-inc,Bachelors,13,Divorced,Craft-repair,Not-in-family,Black,Male,0,1669,60,?,<=50K 70 | 41,Private,HS-grad,9,Divorced,Adm-clerical,Unmarried,White,Female,0,0,36,United-States,<=50K 71 | 18,Private,12th,8,Never-married,Sales,Own-child,White,Female,0,0,15,United-States,<=50K 72 | 
19,Private,HS-grad,9,Never-married,Handlers-cleaners,Own-child,White,Female,0,0,25,United-States,<=50K 73 | 45,Private,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,Other,Male,4064,0,40,United-States,<=50K 74 | 74,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,1825,12,United-States,>50K 75 | 30,State-gov,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,37,United-States,>50K 76 | 27,State-gov,Bachelors,13,Never-married,Prof-specialty,Not-in-family,Black,Male,0,0,40,United-States,<=50K 77 | 19,Private,Some-college,10,Never-married,Adm-clerical,Own-child,White,Female,0,0,20,United-States,<=50K 78 | 25,Private,Some-college,10,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,30,United-States,<=50K 79 | 25,Private,Assoc-acdm,12,Married-civ-spouse,Adm-clerical,Wife,Asian-Pac-Islander,Female,0,0,37,India,>50K 80 | 31,Self-emp-not-inc,Assoc-voc,11,Married-civ-spouse,Sales,Husband,White,Male,0,0,48,United-States,<=50K 81 | 53,Private,HS-grad,9,Divorced,Sales,Not-in-family,White,Female,0,0,35,United-States,<=50K 82 | 20,Private,Some-college,10,Never-married,Other-service,Not-in-family,White,Female,0,0,40,United-States,<=50K 83 | 26,Private,Some-college,10,Never-married,Sales,Not-in-family,White,Female,0,0,15,United-States,<=50K 84 | 33,Private,Bachelors,13,Never-married,Sales,Not-in-family,White,Female,0,0,40,United-States,<=50K 85 | 21,Private,Some-college,10,Never-married,Other-service,Own-child,White,Female,0,0,25,United-States,<=50K 86 | 34,Private,Bachelors,13,Never-married,Craft-repair,Unmarried,White,Male,0,0,40,United-States,<=50K 87 | 24,Private,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States,<=50K 88 | 22,Private,Bachelors,13,Never-married,Exec-managerial,Not-in-family,White,Female,0,0,40,United-States,<=50K 89 | 20,Private,9th,5,Never-married,Other-service,Unmarried,White,Male,0,0,30,Mexico,<=50K 90 | 
67,Self-emp-not-inc,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States,<=50K 91 | 30,Private,Some-college,10,Never-married,Sales,Not-in-family,White,Female,0,0,40,United-States,<=50K 92 | 43,Private,Assoc-acdm,12,Never-married,Adm-clerical,Not-in-family,Black,Female,0,0,45,United-States,<=50K 93 | 44,Private,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,55,United-States,>50K 94 | 35,Private,Some-college,10,Divorced,Adm-clerical,Unmarried,White,Female,0,0,39,United-States,<=50K 95 | 42,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K 96 | 34,Private,HS-grad,9,Never-married,Machine-op-inspct,Not-in-family,White,Male,4416,0,30,United-States,<=50K 97 | 44,Private,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,Amer-Indian-Eskimo,Male,0,0,40,United-States,<=50K 98 | 52,Private,HS-grad,9,Separated,Priv-house-serv,Not-in-family,White,Female,0,0,50,United-States,<=50K 99 | 38,Private,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,60,United-States,>50K 100 | 34,Private,Bachelors,13,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,47,United-States,>50K 101 | 47,Self-emp-not-inc,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States,<=50K 102 | -------------------------------------------------------------------------------- /tests/fixtures.py: -------------------------------------------------------------------------------- 1 | # This file is part of Anonymeter and is released under BSD 3-Clause Clear License. 2 | # Copyright (c) 2022 Anonos IP LLC. 3 | # See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details. 4 | 5 | 6 | import os 7 | from typing import Optional 8 | 9 | import pandas as pd 10 | 11 | TEST_DIR_PATH = os.path.dirname(os.path.realpath(__file__)) 12 | 13 | 14 | def get_adult(which: str, n_samples: Optional[int] = None) -> pd.DataFrame: 15 | """Fixture for the adult dataset.
16 | 17 | For details see: 18 | https://archive.ics.uci.edu/ml/datasets/adult 19 | 20 | Parameters 21 | ---------- 22 | which : str, in ['ori', 'syn'] 23 | Whether to return the "original" or "synthetic" samples. 24 | n_samples : int 25 | Number of sample records to return. 26 | If `None` - return all samples. 27 | 28 | Returns 29 | ------- 30 | df : pd.DataFrame 31 | Adult dataframe. 32 | """ 33 | if which == "ori": 34 | fname = "adults_ori.csv" 35 | elif which == "syn": 36 | fname = "adults_syn.csv" 37 | else: 38 | raise ValueError(f"Invalid value {which} for parameter `which`. Available are: 'ori' or 'syn'.") 39 | 40 | return pd.read_csv(os.path.join(TEST_DIR_PATH, "datasets", fname), nrows=n_samples) 41 | -------------------------------------------------------------------------------- /tests/test_confidence.py: -------------------------------------------------------------------------------- 1 | # This file is part of Anonymeter and is released under BSD 3-Clause Clear License. 2 | # Copyright (c) 2022 Anonos IP LLC. 3 | # See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details. 
4 | import numpy as np 5 | import pytest 6 | 7 | from anonymeter.stats.confidence import ( 8 | EvaluationResults, 9 | SuccessRate, 10 | bind_value, 11 | probit, 12 | residual_success, 13 | success_rate, 14 | ) 15 | 16 | 17 | def test_probit(): 18 | assert np.round(probit(0.95), decimals=2) == 1.96 19 | 20 | 21 | @pytest.mark.parametrize( 22 | "n_success, expected_risk, expected_error", 23 | [ 24 | (850, 0.849, 0.022), 25 | (0, 0.002, 0.002), 26 | (1000, 0.998, 0.002), 27 | ], 28 | ) 29 | def test_success_rate(n_success, expected_risk, expected_error): 30 | rate, error = success_rate(n_total=1000, n_success=n_success, confidence_level=0.95) 31 | assert np.round(rate, decimals=3) == expected_risk 32 | assert np.round(error, decimals=3) == expected_error 33 | 34 | 35 | @pytest.mark.parametrize( 36 | "attack_rate, control_rate, expected", 37 | [ 38 | (SuccessRate(0.9, 0.0), SuccessRate(0.8, 0.0), SuccessRate(0.5, 0.0)), 39 | (SuccessRate(0.9, 0.02), SuccessRate(0.85, 0.02), SuccessRate(0.333, 0.16)), 40 | ], 41 | ) 42 | def test_residual_success(attack_rate, control_rate, expected): 43 | residual = residual_success(attack_rate=attack_rate, control_rate=control_rate) 44 | np.testing.assert_equal(np.round(residual, decimals=3), expected) 45 | 46 | 47 | @pytest.mark.parametrize( 48 | "point_estimate, error_bound, expected", 49 | [ 50 | (0.1, 0.3, (0.1, 0.0, 0.4)), 51 | (1.1, 0.5, (1.0, 0.6, 1.0)), 52 | (-0.1, 0.2, (0.0, 0.0, 0.1)), 53 | ], 54 | ) 55 | def test_bind_value(point_estimate, error_bound, expected): 56 | risk = bind_value(point_estimate, error_bound) 57 | np.testing.assert_almost_equal(np.array([risk.value, risk.ci[0], risk.ci[1]]), expected) 58 | 59 | 60 | @pytest.mark.parametrize( 61 | "n_attacks, n_success, n_baseline", 62 | [(100, 100, 0), (100, 23, 11), (111, 84, 42), (100, 0, 100)], 63 | ) 64 | def test_evaluation_results_simple(n_attacks, n_success, n_baseline): 65 | results = EvaluationResults( 66 | n_attacks=n_attacks, 67 | n_success=n_success, 68 | 
n_baseline=n_baseline, 69 | n_control=None, 70 | confidence_level=0, 71 | ) 72 | 73 | risk = results.risk() 74 | baseline_risk = results.risk(baseline=True) 75 | 76 | assert results.control_rate is None 77 | assert results.attack_rate.value == n_success / n_attacks 78 | assert results.baseline_rate.value == n_baseline / n_attacks 79 | 80 | assert risk.value == n_success / n_attacks 81 | assert baseline_risk.value == n_baseline / n_attacks 82 | assert risk.ci == (risk.value, risk.value) 83 | assert baseline_risk.ci == (baseline_risk.value, baseline_risk.value) 84 | 85 | 86 | @pytest.mark.parametrize( 87 | "n_attacks, n_success, n_baseline, n_control, confidence_level, expected_rate, expected_baseline", 88 | [ 89 | ( 90 | 100, 91 | 100, 92 | 0, 93 | None, 94 | 0.95, 95 | SuccessRate(value=0.9815032508965071, error=0.01849674910349284), 96 | SuccessRate(value=0.01849674910349284, error=0.01849674910349284), 97 | ), 98 | ( 99 | 100, 100 | 100, 101 | 0, 102 | None, 103 | 0.68, 104 | SuccessRate(value=0.9951036894831882, error=0.004896310516811869), 105 | SuccessRate(value=0.0048963105168118685, error=0.004896310516811869), 106 | ), 107 | ( 108 | 100, 109 | 23, 110 | 11, 111 | None, 112 | 0.95, 113 | SuccessRate(value=0.23998824451588613, error=0.08155558571285167), 114 | SuccessRate(value=0.1244274643007244, error=0.06188550073007873), 115 | ), 116 | ], 117 | ) 118 | def test_evaluation_results_confidence( 119 | n_attacks, 120 | n_success, 121 | n_baseline, 122 | n_control, 123 | confidence_level, 124 | expected_rate, 125 | expected_baseline, 126 | ): 127 | results = EvaluationResults( 128 | n_attacks=n_attacks, 129 | n_success=n_success, 130 | n_baseline=n_baseline, 131 | n_control=n_control, 132 | confidence_level=confidence_level, 133 | ) 134 | np.testing.assert_equal(results.attack_rate, expected_rate) 135 | np.testing.assert_equal(results.baseline_rate, expected_baseline) 136 | np.testing.assert_equal(results.risk(baseline=False), expected_rate.to_risk()) 137 | 
np.testing.assert_equal(results.risk(baseline=True), expected_baseline.to_risk()) 138 | 139 | 140 | def test_evaluation_results_warns_baseline(): 141 | with pytest.warns(UserWarning): 142 | EvaluationResults( 143 | n_attacks=100, 144 | n_success=49, 145 | n_baseline=50, 146 | n_control=None, 147 | confidence_level=0.95, 148 | ) 149 | 150 | 151 | def test_evaluation_results_warns_control(): 152 | with pytest.warns(UserWarning): 153 | EvaluationResults(n_attacks=100, n_success=49, n_baseline=0, n_control=100, confidence_level=0) 154 | 155 | 156 | @pytest.mark.parametrize("confidence_level", [-0.1, 1.2]) 157 | def test_confidence_exception(confidence_level): 158 | with pytest.raises(ValueError): 159 | EvaluationResults( 160 | n_attacks=100, 161 | n_success=49, 162 | n_baseline=0, 163 | n_control=None, 164 | confidence_level=confidence_level, 165 | ) 166 | -------------------------------------------------------------------------------- /tests/test_inference_evaluator.py: -------------------------------------------------------------------------------- 1 | # This file is part of Anonymeter and is released under BSD 3-Clause Clear License. 2 | # Copyright (c) 2022 Anonos IP LLC. 3 | # See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details. 
4 | from typing import Iterable 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import pytest 9 | 10 | from anonymeter.evaluators.inference_evaluator import InferenceEvaluator, evaluate_inference_guesses 11 | 12 | from tests.fixtures import get_adult 13 | 14 | 15 | @pytest.mark.parametrize( 16 | "guesses, secrets, expected", 17 | [ 18 | (("a", "b"), ("a", "b"), (True, True)), 19 | ((np.nan, "b"), (np.nan, "b"), (True, True)), 20 | ((np.nan, np.nan), (np.nan, np.nan), (True, True)), 21 | ((np.nan, "b"), ("a", np.nan), (False, False)), 22 | (("a", "b"), ("a", "c"), (True, False)), 23 | (("b", "b"), ("a", "c"), (False, False)), 24 | ((1, 0), (2, 0), (False, True)), 25 | ], 26 | ) 27 | def test_evaluate_inference_guesses_classification(guesses, secrets, expected): 28 | out = evaluate_inference_guesses(guesses=pd.Series(guesses), secrets=pd.Series(secrets), regression=False) 29 | np.testing.assert_equal(out, expected) 30 | 31 | 32 | @pytest.mark.parametrize( 33 | "guesses, secrets, expected", 34 | [ 35 | ((1.0, 1.0), (1.0, 1.0), (True, True)), 36 | ((1.01, 1.0), (1.0, 1.01), (True, True)), 37 | ((1.0, 1.0), (2.0, 1.01), (False, True)), 38 | ((1.0, 2.0), (2.0, 1.01), (False, False)), 39 | ], 40 | ) 41 | def test_evaluate_inference_guesses_regression(guesses, secrets, expected): 42 | out = evaluate_inference_guesses(guesses=pd.Series(guesses), secrets=pd.Series(secrets), regression=True) 43 | np.testing.assert_equal(out, expected) 44 | 45 | 46 | @pytest.mark.parametrize( 47 | "guesses, secrets, tolerance, expected", 48 | [ 49 | ((1.0, 1.0), (1.05, 1.06), 0.05, (True, False)), 50 | ((1.0, 1.0), (1.05, 1.06), 0.06, (True, True)), 51 | ((1.0, np.nan), (1.05, np.nan), 0.06, (True, True)), 52 | ((np.nan, np.nan), (np.nan, np.nan), 0.06, (True, True)), 53 | ((1, np.nan), (np.nan, 1.06), 0.06, (False, False)), 54 | ((1.0, 1.0), (1.05, 1.06), 0.04, (False, False)), 55 | ((1.0, 1.0), (1.25, 1.26), 0.2, (False, False)), 56 | ((1.0, 1.0), (1.26, 1.25), 0.25, (False, True)), 
57 | ], 58 | ) 59 | def test_evaluate_inference_guesses_regression_tolerance(guesses, secrets, tolerance, expected): 60 | out = evaluate_inference_guesses( 61 | guesses=pd.Series(guesses), secrets=pd.Series(secrets), tolerance=tolerance, regression=True 62 | ) 63 | np.testing.assert_equal(out, expected) 64 | 65 | 66 | @pytest.mark.parametrize( 67 | "ori, syn, expected", 68 | [ 69 | ([["a", "b"], ["c", "d"]], [["a", "b"], ["c", "d"]], 1.0), 70 | ([["a", "b"], ["c", "d"]], [["a", "b"], ["c", "e"]], 0.5), 71 | ([["a", "b"], ["c", "d"]], [["a", "h"], ["c", "g"]], 0.0), 72 | ], 73 | ) 74 | def test_inference_evaluator_rates( 75 | ori: Iterable, 76 | syn: Iterable, 77 | expected: float, 78 | ): 79 | # build dataframes from ori and syn, naming the columns c0 and c1 80 | ori = pd.DataFrame(ori, columns=pd.Index(["c0", "c1"])) 81 | syn = pd.DataFrame(syn, columns=pd.Index(["c0", "c1"])) 82 | evaluator = InferenceEvaluator( 83 | ori=ori, 84 | syn=syn, 85 | control=ori, 86 | aux_cols=["c0"], 87 | secret="c1", 88 | n_attacks=2, 89 | ).evaluate(n_jobs=1) 90 | results = evaluator.results(confidence_level=0) 91 | 92 | np.testing.assert_equal(results.attack_rate, (expected, 0)) 93 | np.testing.assert_equal(results.control_rate, (expected, 0)) 94 | 95 | 96 | @pytest.mark.parametrize( 97 | "aux_cols", 98 | [ 99 | ["type_employer", "capital_loss", "hr_per_week", "age"], 100 | ["education_num", "marital", "capital_loss"], 101 | ["age", "type_employer", "race"], 102 | ], 103 | ) 104 | @pytest.mark.parametrize("secret", ["education", "marital", "capital_gain"]) 105 | def test_inference_evaluator_leaks(aux_cols, secret): 106 | ori = get_adult("ori", n_samples=10) 107 | evaluator = InferenceEvaluator(ori=ori, syn=ori, control=ori, aux_cols=aux_cols, secret=secret, n_attacks=10) 108 | evaluator.evaluate(n_jobs=1) 109 | results = evaluator.results(confidence_level=0) 110 | 111 | np.testing.assert_equal(results.attack_rate, (1, 0)) 112 | np.testing.assert_equal(results.control_rate, (1, 0)) 113 |
114 | 115 | def test_evaluator_not_evaluated(): 116 | df = get_adult("ori", n_samples=10) 117 | evaluator = InferenceEvaluator( 118 | ori=df, 119 | syn=df, 120 | control=df, 121 | aux_cols=["education_num", "marital", "capital_loss"], 122 | secret="age", 123 | ) 124 | with pytest.raises(RuntimeError): 125 | evaluator.risk() 126 | -------------------------------------------------------------------------------- /tests/test_linkability_evaluator.py: -------------------------------------------------------------------------------- 1 | # This file is part of Anonymeter and is released under BSD 3-Clause Clear License. 2 | # Copyright (c) 2022 Anonos IP LLC. 3 | # See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details. 4 | import numpy as np 5 | import pandas as pd 6 | import pytest 7 | 8 | from anonymeter.evaluators.linkability_evaluator import LinkabilityEvaluator, LinkabilityIndexes 9 | 10 | from tests.fixtures import get_adult 11 | 12 | rng = np.random.default_rng(seed=42) 13 | 14 | 15 | @pytest.mark.parametrize("n_attacks", [4, None]) 16 | @pytest.mark.parametrize( 17 | "n_neighbors, confidence_level, expected_risk, expected_ci", 18 | [ 19 | (1, 0, 0.25, (0.25, 0.25)), 20 | (2, 0, 1, (1.0, 1.0)), 21 | (3, 0, 1, (1.0, 1.0)), 22 | (4, 0, 1, (1.0, 1.0)), 23 | (1, 0.95, 0.3725, (0.045587, 0.699358)), 24 | (2, 0.95, 0.7551, (0.5102, 1.0)), 25 | ], 26 | ) 27 | def test_linkability_evaluator(n_neighbors, confidence_level, expected_risk, expected_ci, n_attacks): 28 | ori = pd.DataFrame({"col0": [0, 0, 4, 0], "col1": [0, 1, 9, 4]}) 29 | syn = pd.DataFrame({"col0": [0, 1, 4, 9], "col1": [0, 1, 4, 9]}) 30 | 31 | evaluator = LinkabilityEvaluator( 32 | ori=ori, syn=syn, n_attacks=n_attacks, n_neighbors=n_neighbors, aux_cols=(["col0"], ["col1"]) 33 | ) 34 | evaluator.evaluate(n_jobs=1) 35 | risk, ci = evaluator.risk(confidence_level=confidence_level) 36 | np.testing.assert_allclose(risk, expected_risk, atol=1e-4) 37 | np.testing.assert_allclose(ci, expected_ci, 
atol=1e-4) 38 | 39 | 40 | @pytest.mark.parametrize("n_attacks", [4, None]) 41 | @pytest.mark.parametrize( 42 | "n_neighbors, confidence_level, expected_risk, expected_ci", 43 | [ 44 | (1, 0, 0.25, (0.25, 0.25)), 45 | (2, 0, 1, (1.0, 1.0)), 46 | (3, 0, 1, (1.0, 1.0)), 47 | (4, 0, 1, (1.0, 1.0)), 48 | (1, 0.95, 0.3725, (0.045587, 0.699358)), 49 | (2, 0.95, 0.7551, (0.5102, 1.0)), 50 | ], 51 | ) 52 | def test_linkability_evaluator_neighbors(n_neighbors, confidence_level, expected_risk, expected_ci, n_attacks): 53 | # see comment in the test_linkability_evaluator to understand 54 | # the ground truth on which this test is based. 55 | ori = pd.DataFrame({"col0": [0, 0, 4, 0], "col1": [0, 1, 9, 4]}) 56 | syn = pd.DataFrame({"col0": [0, 1, 4, 9], "col1": [0, 1, 4, 9]}) 57 | 58 | evaluator = LinkabilityEvaluator( 59 | ori=ori, syn=syn, n_attacks=n_attacks, n_neighbors=4, aux_cols=(["col0"], ["col1"]) 60 | ) 61 | evaluator.evaluate(n_jobs=1) 62 | risk, ci = evaluator.risk(confidence_level=confidence_level, n_neighbors=n_neighbors) 63 | np.testing.assert_allclose(risk, expected_risk, atol=1e-4) 64 | np.testing.assert_allclose(ci, expected_ci, atol=1e-4) 65 | 66 | 67 | @pytest.mark.parametrize("n_neighbors, fails", [(1, False), (2, False), (3, False), (4, False), (5, True), (45, True)]) 68 | def test_linkability_evaluator_neighbors_fails(n_neighbors, fails): 69 | ori = pd.DataFrame({"col0": [0, 0, 4, 0], "col1": [0, 1, 9, 4]}) 70 | syn = pd.DataFrame({"col0": [0, 1, 4, 9], "col1": [0, 1, 4, 9]}) 71 | 72 | evaluator = LinkabilityEvaluator(ori=ori, syn=syn, n_attacks=4, n_neighbors=4, aux_cols=(["col0"], ["col1"])) 73 | evaluator.evaluate(n_jobs=1) 74 | 75 | if fails: 76 | with pytest.raises(ValueError): 77 | evaluator.risk(n_neighbors=n_neighbors) 78 | else: 79 | evaluator.risk(n_neighbors=n_neighbors) 80 | 81 | 82 | @pytest.mark.parametrize("n_neighbors, expected_risk", [(1, 0.25), (2, 5 / 6), (3, 1), (4, 1)]) 83 | def test_baseline(n_neighbors, expected_risk): 84 | # note 
that for the baseline attack, it does not really matter 85 | # what's inside the synthetic or the original dataframe. 86 | ori = pd.DataFrame(rng.choice(["a", "b"], size=(400, 2)), columns=["c0", "c1"]) 87 | syn = pd.DataFrame([["a", "a"], ["b", "b"], ["a", "a"], ["a", "a"]], columns=["c0", "c1"]) 88 | evaluator = LinkabilityEvaluator( 89 | ori=ori, 90 | syn=syn, 91 | n_attacks=None, 92 | n_neighbors=n_neighbors, 93 | aux_cols=( 94 | ["c0"], 95 | ["c1"], 96 | ), 97 | ) 98 | evaluator.evaluate(n_jobs=1) 99 | baseline_risk, _ = evaluator.risk(confidence_level=0.95, baseline=True) 100 | np.testing.assert_allclose(baseline_risk, expected_risk, atol=5e-2) 101 | 102 | 103 | @pytest.mark.parametrize( 104 | "n_neighbors, idx_0, idx_1, expected, n_expected", 105 | [ 106 | (1, [[0], [1], [2], [3]], [[4], [5], [6], [7]], {}, 0), 107 | (1, [[0], [1], [2], [3]], [[4], [1], [6], [7]], {1: {1}}, 1), 108 | (1, [[0], [1], [2], [3]], [[4], [1], [6], [7]], {1: {1}}, 1), 109 | (1, [[0], [1], [6], [3]], [[4], [1], [6], [7]], {1: {1}, 2: {6}}, 2), 110 | (1, [[0, 1], [2, 3]], [[1, 0], [3, 2]], {}, 0), 111 | (2, [[0, 1], [2, 3]], [[1, 0], [3, 2]], {0: {0, 1}, 1: {2, 3}}, 2), 112 | ], 113 | ) 114 | def test_find_links(n_neighbors, idx_0, idx_1, expected, n_expected): 115 | indexes = LinkabilityIndexes(idx_0=np.array(idx_0), idx_1=np.array(idx_1)) 116 | links = indexes.find_links(n_neighbors=n_neighbors) 117 | n_links = indexes.count_links(n_neighbors=n_neighbors) 118 | assert links == expected 119 | assert n_links == n_expected 120 | 121 | 122 | @pytest.mark.parametrize("confidence_level", [0.5, 0.68, 0.95, 0.99]) 123 | def test_linkability_risk(confidence_level): 124 | ori = get_adult("ori", n_samples=10) 125 | col_sample = rng.choice(ori.columns, size=4, replace=False) 126 | 127 | evaluator = LinkabilityEvaluator( 128 | ori=ori, 129 | syn=ori, 130 | n_attacks=10, 131 | n_neighbors=5, 132 | aux_cols=( 133 | col_sample[:2].tolist(), 134 | col_sample[2:].tolist(), 135 | ), 136 | ) 137 | 
evaluator.evaluate(n_jobs=1) 138 | _, ci = evaluator.risk(confidence_level=confidence_level) 139 | np.testing.assert_allclose(ci[1], 1.0) 140 | 141 | 142 | def test_evaluator_not_evaluated(): 143 | evaluator = LinkabilityEvaluator( 144 | ori=pd.DataFrame(), 145 | syn=pd.DataFrame(), 146 | aux_cols=([], []), 147 | ) 148 | with pytest.raises(RuntimeError): 149 | evaluator.risk() 150 | -------------------------------------------------------------------------------- /tests/test_mixed_types_kneigbors.py: -------------------------------------------------------------------------------- 1 | # This file is part of Anonymeter and is released under BSD 3-Clause Clear License. 2 | # Copyright (c) 2022 Anonos IP LLC. 3 | # See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details. 4 | import numpy as np 5 | import pandas as pd 6 | import pytest 7 | 8 | from anonymeter.neighbors.mixed_types_kneighbors import MixedTypeKNeighbors, gower_distance 9 | 10 | from tests.fixtures import get_adult 11 | 12 | rng = np.random.default_rng() 13 | 14 | 15 | def test_mixed_type_kNN(): 16 | df = get_adult("ori", n_samples=10) 17 | nn = MixedTypeKNeighbors().fit(df) 18 | shuffled_idx = rng.integers(10, size=10) 19 | dist, ids = nn.kneighbors(df.iloc[shuffled_idx], n_neighbors=1, return_distance=True) 20 | np.testing.assert_equal(ids.flatten(), shuffled_idx) 21 | np.testing.assert_equal(dist, 0) 22 | 23 | 24 | def test_mixed_type_kNN_numerical(): 25 | ori = pd.DataFrame([[0.0, "a"], [0.2, "a"], [0.15, "a"], [0.1, "a"]]) 26 | syn = pd.DataFrame([[0.01, "a"]]) 27 | nn = MixedTypeKNeighbors().fit(ori) 28 | ids = nn.kneighbors(syn, n_neighbors=4, return_distance=False) 29 | np.testing.assert_equal(ids, [[0, 3, 2, 1]]) 30 | 31 | 32 | def test_mixed_type_kNN_numerical_scaling(): 33 | ori = pd.DataFrame([[0.0, "a"], [0.2, "a"], [0.15, "a"], [0.1, "a"]]) 34 | 35 | # this is equal to the min value in the fitted dataframe. 36 | # The distance to the 2nd record in ori will be maximal. 
37 | syn = pd.DataFrame([[0.0, "a"]]) 38 | nn = MixedTypeKNeighbors().fit(ori) 39 | dist, ids = nn.kneighbors(syn, n_neighbors=4, return_distance=True) 40 | np.testing.assert_equal(ids, [[0, 3, 2, 1]]) 41 | np.testing.assert_equal(dist[ids == 1], 1) 42 | 43 | 44 | @pytest.mark.parametrize("n_neighbors, n_queries", [(1, 10), (3, 5)]) 45 | def test_mixed_type_kNN_shape(n_neighbors, n_queries): 46 | df = get_adult("ori", n_samples=10) 47 | nn = MixedTypeKNeighbors(n_neighbors=n_neighbors).fit(df) 48 | ids = nn.kneighbors(df.head(n_queries)) 49 | assert isinstance(ids, np.ndarray) 50 | assert ids.shape == (n_queries, n_neighbors) 51 | 52 | nn = MixedTypeKNeighbors().fit(df) 53 | ids = nn.kneighbors(df.head(n_queries), n_neighbors=n_neighbors) 54 | assert isinstance(ids, np.ndarray) 55 | assert ids.shape == (n_queries, n_neighbors) 56 | 57 | 58 | @pytest.mark.parametrize( 59 | "r0, r1, expected", 60 | [ 61 | ([0, 1, 0, 0], [0, 1, 0, 0], 0), 62 | ([1, 1, 0, 0], [0, 1, 0, 0], 1), 63 | ([1, 1, 1, 0], [0, 1, 0, 0], 2), 64 | ([1, 0, 1, 0], [1, 1, 0, 1], 3), 65 | ([1, 0, 1, 0], [0, 1, 0, 1], 4), 66 | ], 67 | ) 68 | def test_gower_distance(r0, r1, expected): 69 | r0, r1 = np.array(r0), np.array(r1) 70 | dist = gower_distance(r0=r0, r1=r1, cat_cols_index=0) 71 | np.testing.assert_equal(dist, expected) 72 | 73 | # numerical and categorical should behave the same 74 | dist = gower_distance(r0=r0, r1=r1, cat_cols_index=4) 75 | np.testing.assert_equal(dist, expected) 76 | 77 | 78 | def test_gower_distance_numerical(): 79 | r0, r1 = rng.random(size=10), rng.random(size=10) 80 | dist = gower_distance(r0=r0, r1=r1, cat_cols_index=10) 81 | np.testing.assert_almost_equal(dist, np.sum(np.abs(r0 - r1))) 82 | -------------------------------------------------------------------------------- /tests/test_singling_out_evaluator.py: -------------------------------------------------------------------------------- 1 | # This file is part of Anonymeter and is released under BSD 3-Clause Clear 
License. 2 | # Copyright (c) 2022 Anonos IP LLC. 3 | # See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details. 4 | import numpy as np 5 | import pandas as pd 6 | import pytest 7 | from scipy import integrate 8 | 9 | from anonymeter.evaluators.singling_out_evaluator import ( 10 | SinglingOutEvaluator, 11 | UniqueSinglingOutQueries, 12 | multivariate_singling_out_queries, 13 | safe_query_counts, 14 | singling_out_probability_integral, 15 | univariate_singling_out_queries, 16 | ) 17 | 18 | from tests.fixtures import get_adult 19 | 20 | 21 | @pytest.mark.parametrize("mode", ["univariate", "multivariate"]) 22 | def test_so_general(mode): 23 | ori = get_adult("ori", n_samples=10) 24 | syn = get_adult("syn", n_samples=10) 25 | soe = SinglingOutEvaluator(ori=ori, syn=syn, n_attacks=5).evaluate(mode=mode) 26 | 27 | for q in soe.queries(): 28 | assert len(syn.query(q) == 1) 29 | assert len(ori.query(q) == 1) 30 | 31 | 32 | def test_singling_out_queries_unique(): 33 | df = pd.DataFrame({"c1": [1], "c2": [2]}) 34 | 35 | queries = UniqueSinglingOutQueries() 36 | q1, q2 = "c1 == 1", "c2 == 2" 37 | 38 | queries.check_and_append(q1, df=df) 39 | queries.check_and_append(q1, df=df) 40 | assert queries.queries == [q1] 41 | 42 | queries.check_and_append(q2, df=df) 43 | assert queries.queries == [q1, q2] 44 | 45 | 46 | def test_singling_out_queries_same_characters(): 47 | df = pd.DataFrame([{"c": 1.2}, {"c": 2.1}]) 48 | 49 | queries = UniqueSinglingOutQueries() 50 | q1, q2 = "c == 1.2", "c == 2.1" 51 | 52 | queries.check_and_append(q1, df=df) 53 | queries.check_and_append(q1, df=df) 54 | assert queries.queries == [q1] 55 | 56 | queries.check_and_append(q2, df=df) 57 | assert queries.queries == [q1, q2] 58 | 59 | 60 | def test_singling_out_queries(): 61 | df = pd.DataFrame({"c1": [1, 1], "c2": [2, 3]}) 62 | 63 | queries = UniqueSinglingOutQueries() 64 | queries.check_and_append("c1 == 1", df=df) # does not single out 65 | assert len(queries) == 0 66 | 67 | 
queries.check_and_append("c1 == 1 and c2 == 3", df=df) # does single out 68 | assert len(queries) == 1 69 | 70 | 71 | @pytest.mark.parametrize( 72 | "query, result", [("c1 == 0 and c2 == 'a'", 2), ("c3 == 'fuffa'", None), ("c1 == 2 and c2 == 'c'", 1)] 73 | ) 74 | def test_safe_query_counts(query, result): 75 | df = pd.DataFrame({"c1": [0, 0, 2], "c2": ["a", "a", "c"]}) 76 | assert safe_query_counts(query=query, df=df) == result 77 | 78 | 79 | def test_univariate_singling_out_queries(): 80 | df = pd.DataFrame({"col1": ["a", "b", "c", "d"]}) 81 | queries = univariate_singling_out_queries(df=df, n_queries=10) 82 | expected_queries = ["col1 == 'a'", "col1 == 'b'", "col1 == 'c'", "col1 == 'd'"] 83 | assert sorted(queries) == sorted(expected_queries) 84 | 85 | 86 | def test_singling_out_query_generator(): 87 | df = pd.DataFrame({"c0": ["a", "b"], "c1": [1.23, 9.87]}) 88 | queries = multivariate_singling_out_queries(df=df, n_queries=2, n_cols=2, max_attempts=None) 89 | possible_queries = [ 90 | "c1<= 1.23 & c1>= 9.87", 91 | "c1>= 9.87 & c1<= 1.23", 92 | "c0== 'b' & c1<= 1.23", 93 | "c0== 'b' & c1>= 9.87", 94 | "c0== 'b' & c0== 'a'", 95 | "c0== 'a' & c1<= 1.23", 96 | "c0== 'a' & c1>= 9.87", 97 | "c0== 'a' & c0== 'b'", 98 | ] 99 | for query in queries: 100 | assert query in possible_queries 101 | 102 | 103 | @pytest.mark.parametrize("confidence_level", [0.5, 0.68, 0.95, 0.99]) 104 | @pytest.mark.parametrize("mode", ["univariate", "multivariate"]) 105 | def test_singling_out_risk_estimate(confidence_level, mode): 106 | ori = get_adult("ori", 10) 107 | soe = SinglingOutEvaluator(ori=ori, syn=ori, n_attacks=5) 108 | soe.evaluate(mode=mode) 109 | _, ci = soe.risk(confidence_level=confidence_level) 110 | np.testing.assert_allclose(ci[1], 1.0) 111 | 112 | 113 | def test_evaluator_not_evaluated(): 114 | soe = SinglingOutEvaluator(ori=pd.DataFrame(), syn=pd.DataFrame()) 115 | with pytest.raises(RuntimeError): 116 | soe.risk() 117 | 118 | 119 | @pytest.mark.parametrize("n", [100, 
4242, 11235]) 120 | @pytest.mark.parametrize("w_min, w_max", [(0, 1), (1 / 10000, 1 / 1000), (0.0013414, 0.2314)]) 121 | def test_probability_integral(n, w_min, w_max): 122 | def _so_probability(n: int, w: float): 123 | return n * w * ((1 - w) ** (n - 1)) 124 | 125 | desired, _ = integrate.quad(lambda x: _so_probability(w=x, n=n), a=w_min, b=w_max) 126 | integral = singling_out_probability_integral(n=n, w_min=w_min, w_max=w_max) 127 | np.testing.assert_almost_equal(desired, integral) 128 | 129 | 130 | @pytest.mark.parametrize("max_attempts", [1, 2, 3]) 131 | def test_so_evaluator_max_attempts(max_attempts): 132 | ori = get_adult("ori", 10) 133 | soe = SinglingOutEvaluator(ori=ori, syn=ori, n_attacks=10, max_attempts=max_attempts) 134 | soe.evaluate(mode="multivariate") 135 | 136 | assert len(soe.queries()) <= max_attempts 137 | -------------------------------------------------------------------------------- /tests/test_transformations.py: -------------------------------------------------------------------------------- 1 | # This file is part of Anonymeter and is released under BSD 3-Clause Clear License. 2 | # Copyright (c) 2022 Anonos IP LLC. 3 | # See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details. 4 | import numpy as np 5 | import pandas as pd 6 | import pytest 7 | from scipy.spatial.distance import pdist, squareform 8 | 9 | from anonymeter.preprocessing.transformations import mixed_types_transform 10 | 11 | rng = np.random.default_rng() 12 | 13 | 14 | def test_scaling_numerical(): 15 | df_ori = pd.DataFrame({"c": rng.random(5)}) 16 | df_syn = pd.DataFrame({"c": rng.random(5)}) 17 | tdf_ori, tdf_syn = mixed_types_transform(df_ori, df_syn, num_cols=["c"], cat_cols=[]) 18 | # values are scaled so that abs(difference) is between 0 and 1. 
19 | # since this is a square distance matrix, there will be two elements with d=1 20 | vals = pd.concat([tdf_ori, tdf_syn])["c"].values 21 | dm = squareform(pdist(vals[:, np.newaxis], "cityblock")) 22 | assert np.sum(np.isclose(dm, 1)) == 2 23 | assert np.amin(dm) == 0 24 | 25 | 26 | @pytest.mark.parametrize( 27 | "df1, df2, exp1, exp2", 28 | [ 29 | ( 30 | pd.DataFrame({"c": ["a", "b", "c", "d"]}), 31 | pd.DataFrame({"c": ["a", "b", "c", "c"]}), 32 | pd.DataFrame({"c": [0, 1, 2, 3]}), 33 | pd.DataFrame({"c": [0, 1, 2, 2]}), 34 | ), 35 | ( 36 | pd.DataFrame({"c": ["a", "b", "c", None]}), 37 | pd.DataFrame({"c": ["a", "b", "c", "c"]}), 38 | pd.DataFrame({"c": [0, 1, 2, 3]}), 39 | pd.DataFrame({"c": [0, 1, 2, 2]}), 40 | ), 41 | ( 42 | pd.DataFrame({"c": ["a", "b", "c", "d"]}), 43 | pd.DataFrame({"c": ["a", "b", None, "c"]}), 44 | pd.DataFrame({"c": [0, 1, 2, 3]}), 45 | pd.DataFrame({"c": [0, 1, 4, 2]}), 46 | ), 47 | ], 48 | ) 49 | def test_encoding_categorical(df1, df2, exp1, exp2): 50 | enc1, enc2 = mixed_types_transform(df1=df1, df2=df2, cat_cols=["c"], num_cols=[]) 51 | pd.testing.assert_frame_equal(enc1, exp1) 52 | pd.testing.assert_frame_equal(enc2, exp2) 53 | 54 | 55 | @pytest.mark.parametrize( 56 | "df1, df2, exp1, exp2", 57 | [ 58 | ( 59 | pd.DataFrame({"c": ["a", "b", "c"]}), 60 | pd.DataFrame({"c": ["a", "b", "d"]}), 61 | pd.DataFrame({"c": [0, 1, 2]}), 62 | pd.DataFrame({"c": [0, 1, 3]}), 63 | ), 64 | ( 65 | pd.DataFrame({"c": ["a", "b", "c"]}), 66 | pd.DataFrame({"c": ["a", "b", None]}), 67 | pd.DataFrame({"c": [0, 1, 2]}), 68 | pd.DataFrame({"c": [0, 1, 3]}), 69 | ), 70 | ( 71 | pd.DataFrame({"c": [None, "b", "c"]}), 72 | pd.DataFrame({"c": ["a", "b", None]}), 73 | pd.DataFrame({"c": [3, 1, 2]}), 74 | pd.DataFrame({"c": [0, 1, 3]}), 75 | ), 76 | ], 77 | ) 78 | def test_encoding_categorical_new_values(df1, df2, exp1, exp2): 79 | enc1, enc2 = mixed_types_transform(df1=df1, df2=df2, cat_cols=["c"], num_cols=[]) 80 | pd.testing.assert_frame_equal(enc1, exp1) 
81 | pd.testing.assert_frame_equal(enc2, exp2) 82 | -------------------------------------------------------------------------------- /tests/test_type_detection.py: -------------------------------------------------------------------------------- 1 | # This file is part of Anonymeter and is released under BSD 3-Clause Clear License. 2 | # Copyright (c) 2022 Anonos IP LLC. 3 | # See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details. 4 | import numpy as np 5 | import pandas as pd 6 | import pytest 7 | 8 | from anonymeter.preprocessing.type_detection import detect_col_types, detect_consistent_col_types 9 | 10 | rng = np.random.default_rng() 11 | 12 | 13 | @pytest.mark.parametrize( 14 | "df, expected", 15 | [ 16 | (pd.DataFrame({"num": rng.random(5), "cat": list("abcde")}), {"cat": ["cat"], "num": ["num"]}), 17 | (pd.DataFrame({"num1": rng.random(5), "num2": [1, 2, 3, 4, 5]}), {"cat": [], "num": ["num1", "num2"]}), 18 | ( 19 | pd.DataFrame({"num1": rng.random(5), "num2": [1, 2, 3, 4, 5]}).astype("object"), 20 | {"cat": ["num1", "num2"], "num": []}, 21 | ), 22 | ( 23 | pd.DataFrame({"cat1": list("abcde"), "cat2": ["1", "2", "3", "4", "5"]}), 24 | {"cat": ["cat1", "cat2"], "num": []}, 25 | ), 26 | ], 27 | ) 28 | def test_detect_col_types(df, expected): 29 | ctypes = detect_col_types(df=df) 30 | assert ctypes == expected 31 | 32 | 33 | def test_detect_col_types_consistent(): 34 | df1 = pd.DataFrame({"num": rng.random(5), "cat": list("abcde")}) 35 | df2 = pd.DataFrame({"num": rng.random(5), "cat": list("fghil")}) 36 | assert detect_consistent_col_types(df1, df2) == {"cat": ["cat"], "num": ["num"]} 37 | 38 | 39 | def test_detect_col_types_consistent_raises(): 40 | df1 = pd.DataFrame({"num": rng.random(5), "cat": list("abcde")}) 41 | df2 = pd.DataFrame({"num": [str(_) for _ in rng.random(5)], "cat": list("fghil")}) 42 | with pytest.raises(RuntimeError): 43 | detect_consistent_col_types(df1, df2) 44 | 
--------------------------------------------------------------------------------