├── logos
│   ├── ssi_logo_small.png
│   └── openmined_logo_small.png
├── 4-remote-data-science
│   ├── private_ai.png
│   ├── syft_workflow.png
│   ├── syft_ds_workflow.png
│   └── private_ai_courses.png
├── 2-ml-models-attacks
│   ├── lenet_mnist_model.pth
│   ├── mia_reconstruction.png
│   ├── models.py
│   ├── train.py
│   ├── 3-MIA-Reconstruction.ipynb
│   ├── 2-MIA-Training.ipynb
│   ├── dataset.py
│   └── 1-FSGM-Attack.ipynb
├── ppml_requirements.txt
├── environment.yml
├── .gitignore
├── setup.md
├── Get-Ready.ipynb
├── README.md
├── 3-differential-privacy
│   ├── 6-MIA-Reconstruction-OPACUS.ipynb
│   ├── 5-MIA-Training-OPACUS.ipynb
│   ├── 2-approx-differential-privacy.ipynb
│   ├── 1-differential-privacy.ipynb
│   └── 3-properties-differential-privacy.ipynb
├── LICENSE
└── 1-data-anonimisation
    ├── 3-k-anonimity.ipynb
    └── 2-de-identification.ipynb

/logos/ssi_logo_small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leriomaggio/ppml-tutorial/main/logos/ssi_logo_small.png
--------------------------------------------------------------------------------
/logos/openmined_logo_small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leriomaggio/ppml-tutorial/main/logos/openmined_logo_small.png
--------------------------------------------------------------------------------
/4-remote-data-science/private_ai.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leriomaggio/ppml-tutorial/main/4-remote-data-science/private_ai.png
--------------------------------------------------------------------------------
/2-ml-models-attacks/lenet_mnist_model.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leriomaggio/ppml-tutorial/main/2-ml-models-attacks/lenet_mnist_model.pth
--------------------------------------------------------------------------------
/4-remote-data-science/syft_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leriomaggio/ppml-tutorial/main/4-remote-data-science/syft_workflow.png
--------------------------------------------------------------------------------
/2-ml-models-attacks/mia_reconstruction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leriomaggio/ppml-tutorial/main/2-ml-models-attacks/mia_reconstruction.png
--------------------------------------------------------------------------------
/4-remote-data-science/syft_ds_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leriomaggio/ppml-tutorial/main/4-remote-data-science/syft_ds_workflow.png
--------------------------------------------------------------------------------
/4-remote-data-science/private_ai_courses.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leriomaggio/ppml-tutorial/main/4-remote-data-science/private_ai_courses.png
--------------------------------------------------------------------------------
/ppml_requirements.txt:
--------------------------------------------------------------------------------
1 | hagrid<0.3.122
2 | ipykernel>=6.19
3 | ipython>=8.12
4 | jupyter==1.0.0
5 | jupyterlab>=3.6
6 | notebook>=6.5
7 | opacus>=1.4
8 | opendp>=0.9.2
9 | pandas>=1.5
10 | pillow>=9.4
11 | pip>=23.1
12 | pydantic>=1.10
13 | torch>=1.13
14 | scikit-learn>=1.2.2
15 | scipy>=1.10
16 | setuptools>=67.8.0
17 | syft==0.8.6
18 | torchvision>=0.14.1
19 | tqdm>=4.65.0
20 | notexbook-theme==2.0.1
21 | phe>=1.5
22 | matplotlib>=3.7
23 | numpy>=1.24
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: ppml
2 | dependencies:
3 |   - ipykernel>=6.19
4 |   - ipython>=8.12
5 |   - jupyter=1.0.0
6 |   - jupyterlab>=3.6
7 |   - matplotlib>=3.7
8 |   - notebook>=6.5.4
9 |   - numpy>=1.24
10 |   - pandas>=1.5
11 |   - pillow>=9.4
12 |   - pip>=23.1
13 |   - python=3.11
14 |   - pytorch::pytorch>=1.13
15 |   - scikit-learn>=1.2
16 |   - scipy>=1.10
17 |   - setuptools>=67.8.0
18 |   - pytorch::torchvision>=0.14.1
19 |   - tqdm>=4.65.0
20 |   - grpcio>=1.48,<1.52
21 |   - pip:
22 |     - opacus>=1.4
23 |     - opendp>=0.9.2
24 |     - pydantic>=1.10
25 |     - notexbook-theme==2.0.1
26 |     - phe==1.5.0
--------------------------------------------------------------------------------
/2-ml-models-attacks/models.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | 
3 | 
4 | class SoftmaxRegression(nn.Module):
5 |     """Softmax Regression Classifier.
6 | 
7 |     This classifier is a generalization of logistic regression that
8 |     allows the class variable to take more than two values—in our case,
9 |     there are 40 individuals in the dataset, so the classifier needs to
10 |     distinguish between 40 labels.
11 |     Softmax regression is often used as the final layer in deep neural network
12 |     architectures, so on its own this classifier can be seen as a
13 |     neural network with no hidden layers.
14 | 
15 |     Extracted from: https://dl.acm.org/doi/pdf/10.1145/2810103.2813677
16 |     """
17 | 
18 |     def __init__(self, in_features: int = 112 * 92, n_classes: int = 40):
19 |         super(SoftmaxRegression, self).__init__()
20 |         self.regression = nn.Linear(in_features, n_classes)
21 | 
22 |     def forward(self, x):
23 |         x = self.regression(x)
24 |         return nn.LogSoftmax(dim=1)(x)
25 | 
26 | 
27 | class MLP(nn.Module):
28 |     """Multilayer Perceptron Network.
29 | 
30 |     A multilayer perceptron network model with one hidden layer
31 |     of 3000 sigmoid neurons (or units), and a softmax output layer.
32 |     This classifier can be understood as performing
33 |     softmax regression after first applying a non-linear transformation
34 |     to the feature vector.
35 |     The point of this transformation, which corresponds to the hidden layer,
36 |     is to map the feature vector into a lower-dimensional space in which
37 |     the classes are separable by the softmax output layer.
38 | 
39 |     Adapted from: https://dl.acm.org/doi/pdf/10.1145/2810103.2813677
40 |     """
41 | 
42 |     def __init__(self, in_features: int = 112 * 92, n_classes: int = 40):
43 |         super(MLP, self).__init__()
44 |         self.hidden = nn.Linear(in_features, 3000)
45 |         self.prediction = nn.Linear(3000, n_classes)
46 | 
47 |     def forward(self, x):
48 |         x = self.hidden(x)
49 |         x = nn.Sigmoid()(x)
50 |         x = self.prediction(x)
51 |         return nn.LogSoftmax(dim=1)(x)
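52 | 
53 | # Usage sketch (illustrative only, not part of the original module): both
54 | # models expect flattened 112x92 grayscale ORL face images and return
55 | # log-probabilities over the 40 subjects, which is why training pairs them
56 | # with NLLLoss.
57 | #
58 | # >>> import torch
59 | # >>> model = MLP()
60 | # >>> out = model(torch.randn(8, 112 * 92))  # batch of 8 dummy images
61 | # >>> out.shape
62 | # torch.Size([8, 40])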
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # OSX Rubbish
2 | .DS_Store
3 | # any dataset folder is ignored
4 | */data/
5 | data/
6 | # checkpoints/ # re-established
7 | 
8 | # Byte-compiled / optimized / DLL files
9 | __pycache__/
10 | *.py[cod]
11 | *$py.class
12 | 
13 | # C extensions
14 | *.so
15 | 
16 | # Distribution / packaging
17 | .Python
18 | build/
19 | develop-eggs/
20 | dist/
21 | downloads/
22 | eggs/
23 | .eggs/
24 | lib/
25 | lib64/
26 | parts/
27 | sdist/
28 | var/
29 | wheels/
30 | pip-wheel-metadata/
31 | share/python-wheels/
32 | *.egg-info/
33 | .installed.cfg
34 | *.egg
35 | MANIFEST
36 | 
37 | # PyInstaller
38 | # Usually these files are written by a python script from a template
39 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
40 | *.manifest
41 | *.spec
42 | 
43 | # Installer logs
44 | pip-log.txt
45 | pip-delete-this-directory.txt
46 | 
47 | # Unit test / coverage reports
48 | htmlcov/
49 | .tox/
50 | .nox/
51 | .coverage
52 | .coverage.*
53 | .cache
54 | nosetests.xml
55 | coverage.xml
56 | *.cover
57 | *.py,cover
58 | .hypothesis/
59 | .pytest_cache/
60 | 
61 | # Translations
62 | *.mo
63 | *.pot
64 | 
65 | # Django stuff:
66 | *.log
67 | local_settings.py
68 | db.sqlite3
69 | db.sqlite3-journal
70 | 
71 | # Flask stuff:
72 | instance/
73 | .webassets-cache
74 | 
75 | # Scrapy stuff:
76 | .scrapy
77 | 
78 | # Sphinx documentation
79 | docs/_build/
80 | 
81 | # PyBuilder
82 | target/
83 | 
84 | # Jupyter Notebook
85 | .ipynb_checkpoints
86 | 
87 | # IPython
88 | profile_default/
89 | ipython_config.py
90 | 
91 | # pyenv
92 | .python-version
93 | 
94 | # pipenv
95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
98 | # install all needed dependencies.
99 | #Pipfile.lock
100 | 
101 | # PEP 582; used by e.g.
github.com/David-OConnor/pyflow
102 | __pypackages__/
103 | 
104 | # Celery stuff
105 | celerybeat-schedule
106 | celerybeat.pid
107 | 
108 | # SageMath parsed files
109 | *.sage.py
110 | 
111 | # Environments
112 | .env
113 | .venv
114 | env/
115 | venv/
116 | ENV/
117 | env.bak/
118 | venv.bak/
119 | 
120 | # Spyder project settings
121 | .spyderproject
122 | .spyproject
123 | 
124 | # Rope project settings
125 | .ropeproject
126 | 
127 | # mkdocs documentation
128 | /site
129 | 
130 | # mypy
131 | .mypy_cache/
132 | .dmypy.json
133 | dmypy.json
134 | 
135 | # Pyre type checker
136 | .pyre/
--------------------------------------------------------------------------------
/setup.md:
--------------------------------------------------------------------------------
1 | ## Preamble
2 | 
3 | To run the code included in this tutorial, we will leverage a pretty "standard" Python/PyData stack:
4 | `numpy`, `pandas`, `matplotlib`, and `scikit-learn` for all the data science and Machine learning parts,
5 | and `pytorch` (w/ `torchvision`) for the Deep Learning examples.
6 | 
7 | Moreover, a few **extra** / specialised packages will also be featured:
8 | - [PySyft](https://github.com/OpenMined/PySyft): A platform for Remote Data Science
9 | - [Opacus](https://opacus.ai): A library to train PyTorch models with differential privacy
10 | - [PHE](https://pypi.org/project/phe/): A Python 3 library implementing the Paillier Partially Homomorphic Encryption scheme
11 | 
12 | As for the Python version/distribution: any Python 3.10+ version should be fine.
13 | 
14 | The [repository](http://github.com/leriomaggio/ppml-tutorial) contains the files to
15 | recreate the Python environment with all the required packages, whether you are using [**Miniconda**](https://docs.anaconda.com/free/miniconda/index.html) (i.e. [`environment.yml`](http://github.com/leriomaggio/ppml-tutorial/environment.yml)) or
16 | a standard Python distribution (i.e. [`ppml_requirements.txt`](http://github.com/leriomaggio/ppml-tutorial/ppml_requirements.txt)).
17 | 
18 | ## Set up the Environment
19 | 
20 | **Before we start**:
21 | 
22 | All the instructions reported below assume the **Terminal**,
23 | and hence the command-line interface, to run all the commands.
24 | 
25 | Similarly, the instructions to recreate the environment will use
26 | [`pyenv`](https://github.com/pyenv/pyenv) and [`pyenv-virtualenv`](https://github.com/pyenv/pyenv-virtualenv)
27 | to install the Python distribution, and to create the environment.
28 | 
29 | If you haven't already, let's download (or `git clone`) the current repository on your local computer:
30 | 
31 | ```bash
32 | git clone https://github.com/leriomaggio/ppml-tutorial
33 | cd ppml-tutorial
34 | ```
35 | 
36 | **Setup the environment**:
37 | 
38 | The repository contains a `ppml_requirements.txt` file that can be used to automatically recreate the
39 | environment with all the required packages.
40 | 
41 | First, let's install the Python version we want to use. We will be using `Python 3.12`:
42 | 
43 | 
44 | ```bash
45 | pyenv install 3.12
46 | ```
47 | 
48 | Once this is complete, you should have Python 3.12 (and its `pyenv` shims) available on your system.
49 | 
50 | The next step is to point to this version of the interpreter when creating the new virtual environment.
51 | 
52 | ```bash
53 | pyenv virtualenv 3.12 ppml
54 | ```
55 | 
56 | This will create a new virtual environment called `ppml`. We now need to **activate** the environment:
57 | 
58 | ```bash
59 | pyenv activate ppml
60 | ```
61 | 
62 | **Finally**, you can install all the required packages using `pip`:
63 | 
64 | ```bash
65 | pip install -r ppml_requirements.txt
66 | ```
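
**Alternatively (Miniconda)**: if you would rather use `conda`, the [`environment.yml`](http://github.com/leriomaggio/ppml-tutorial/environment.yml) file mentioned in the Preamble should recreate the same environment. This is a sketch of the standard conda workflow (the environment is named `ppml` in that file), not something the original instructions spell out:

```bash
conda env create -f environment.yml
conda activate ppml
```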
67 | 
68 | ### Well Done! 🎉
69 | 
70 | ## Test your Environment
71 | 
72 | If you followed all the steps reported in the previous section to set up your local machine, you should be ready to
73 | proceed with **testing your environment**.
74 | 
75 | To do so, please open the `Get-Ready.ipynb` notebook to check that everything works properly on your end:
76 | 
77 | ```bash
78 | jupyter lab Get-Ready.ipynb
79 | ```
--------------------------------------------------------------------------------
/Get-Ready.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "id": "66212187",
7 | "metadata": {},
8 | "source": [
9 | "# Test your Environment"
10 | ]
11 | },
12 | {
13 | "attachments": {},
14 | "cell_type": "markdown",
15 | "id": "45d9726a",
16 | "metadata": {},
17 | "source": [
18 | "In this notebook you will find the few (and simple) steps required to test the environment needed to execute all the code examples in the tutorial."
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 1,
24 | "id": "066ef2c5",
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "from collections import namedtuple\n",
29 | "\n",
30 | "Package = namedtuple(\"Package\", [\"name\", \"major\", \"minor\"])"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 2,
36 | "id": "fab6146c",
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "PKGS_INFO = {\n",
41 | "    'matplotlib': Package(name=\"matplotlib\", major=3, minor=7),\n",
42 | "    'numpy': Package(\"numpy\", 1, 24),\n",
43 | "    'pandas': Package(\"pandas\", 1, 5),\n",
44 | "    'pytorch': Package(\"torch\", 1, 13),\n",
45 | "    'scikit-learn': Package(\"sklearn\", 1, 2),\n",
46 | "    'scipy': Package(\"scipy\", 1, 10),\n",
47 | "    'torchvision': Package(\"torchvision\", 0, 14),\n",
48 | "    'opacus': Package(\"opacus\", 1, 4),\n",
49 | "    'phe': Package(\"phe\", 1, 5),\n",
50 | "    'syft': Package(\"syft\", 0, 8)\n",
51 | "}"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 3,
57 | "id": "26f02be3",
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "from importlib import import_module\n",
62 | "\n",
63 | "def check_version(name: str, package: Package) -> None:\n",
64 | "    module = import_module(package.name)\n",
65 | "    print(f\"Import {name.title()}: \", end=\" \")\n",
66 | "    version = module.__version__\n",
67 | "    major, minor, *_ = tuple(map(int, version.split(\".\")))\n",
68 | "    check = (major, minor) >= (package.major, package.minor)\n",
69 | "    if not check:\n",
70 | "        print(\"FAIL\")\n",
71 | "        print(f\"In this tutorial we will be using {name} >= {package.major}.{package.minor} - found {version} instead\")\n",
72 | "    else:\n",
73 | "        print(\"OK\")\n",
74 | "\n",
75 | "\n",
76 | "def check_package_versions():\n",
77 | "    for name, package in PKGS_INFO.items():\n",
78 | "        check_version(name=name, package=package)\n"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 4,
84 | "id": "3ddf73ab",
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "name": "stdout",
89 | "output_type": "stream",
90 | "text": [
91 | "Import Matplotlib: OK\n",
OK\n", 93 | "Import Pandas: OK\n", 94 | "Import Pytorch: OK\n", 95 | "Import Scikit-Learn: OK\n", 96 | "Import Scipy: OK\n", 97 | "Import Torchvision: OK\n", 98 | "Import Opacus: OK\n", 99 | "Import Phe: OK\n", 100 | "Import Syft: OK\n" 101 | ] 102 | } 103 | ], 104 | "source": [ 105 | "check_package_versions()" 106 | ] 107 | } 108 | ], 109 | "metadata": { 110 | "kernelspec": { 111 | "display_name": "Python 3 (ipykernel)", 112 | "language": "python", 113 | "name": "python3" 114 | }, 115 | "language_info": { 116 | "codemirror_mode": { 117 | "name": "ipython", 118 | "version": 3 119 | }, 120 | "file_extension": ".py", 121 | "mimetype": "text/x-python", 122 | "name": "python", 123 | "nbconvert_exporter": "python", 124 | "pygments_lexer": "ipython3", 125 | "version": "3.12.3" 126 | } 127 | }, 128 | "nbformat": 4, 129 | "nbformat_minor": 5 130 | } 131 | -------------------------------------------------------------------------------- /2-ml-models-attacks/train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as th 3 | from torch.utils.data import DataLoader 4 | from pathlib import Path 5 | from typing import Tuple 6 | from tqdm.notebook import tqdm 7 | from sklearn.metrics import accuracy_score 8 | 9 | 10 | def train( 11 | model: th.nn.Module, 12 | optimiser: th.optim.Optimizer, 13 | loaders: Tuple[DataLoader, DataLoader], 14 | epochs: int = 100, 15 | model_name: str = None, 16 | verbose: bool = False, 17 | ): 18 | """Simple Training/Validation Loop using the input model, and the pair of data loaders 19 | for training and validation, respectively. 20 | 21 | model: 22 | The target PyTorch model (nn.Module) to train 23 | optimiser: 24 | The model optimiser holding reference to model's parameters 25 | loaders: Tuple[DataLoader] 26 | Pair of Dataloader for training and validation data, respectively. 27 | epochs: int (default 100) 28 | Total number of training epoch 29 | model_name: str (default "") 30 | The name of the trained model - used mainly to name the checkpoint file 31 | that will be saved. If no name will be provided, the default 32 | `model.__class__.__name__.lower()` will be used. 33 | verbose: bool (default False) 34 | Verbosity of the report. If True, the Accuracy of each epoch will be printed. 35 | If not, only validation accuracy will be shown. 36 | """ 37 | if model_name is None or not model_name: 38 | model_name = model.__class__.__name__.lower() 39 | 40 | train_loader, test_loader = loaders 41 | device = th.device("cuda" if th.cuda.is_available() else "mps" if th.backends.mps.is_available() else "cpu") 42 | print(f"Using {device} Device") 43 | # move model to the selected device, in case 44 | model = model.to(device) 45 | # both models uses LogSoftmax already! 
46 |     criterion = th.nn.NLLLoss()
47 | 
48 |     best_validation_accuracy = 0
49 |     checkpoint_folder = Path("./checkpoints")
50 |     checkpoint_folder.mkdir(exist_ok=True)
51 | 
52 |     for epoch in tqdm(range(epochs), desc="Epochs"):
53 |         running_loss_pred, training_acc = _step(
54 |             train_loader, model, optimiser, criterion, device, is_training=True
55 |         )
56 |         if verbose:
57 |             print(f"Prediction: {running_loss_pred}; Training ACC: {training_acc}")
58 | 
59 |         # run validation every 10 epochs
60 |         if (epoch + 1) % 10 == 0:
61 |             _, valid_acc = _step(
62 |                 test_loader, model, optimiser, criterion, device, is_training=False
63 |             )
64 |             if verbose:
65 |                 print(f"Validation ACC: {valid_acc}")
66 |             if best_validation_accuracy < valid_acc:
67 |                 if verbose:
68 |                     print("Saving Best Model Checkpoint")
69 |                 chk_path = checkpoint_folder / f"{model_name}.pt"
70 |                 print(chk_path)
71 |                 th.save(model.state_dict(), chk_path)
72 |                 best_validation_accuracy = valid_acc
73 |                 print(f"Best Validation ACC: {valid_acc}")
74 | 
75 | 
76 | def _step(loader, model, optimiser, criterion, device, is_training: bool):
77 |     samples_count = 0
78 |     running_loss_pred = 0.0
79 |     y_true, y_pred = list(), list()
80 |     for batch in loader:
81 |         images, subject_ids = batch
82 |         images = images.view(-1, 112 * 92).to(device)
83 |         subject_ids = subject_ids.to(device)
84 |         samples_count += len(images)
85 | 
86 |         # zero the gradient
87 |         model.zero_grad()
88 |         optimiser.zero_grad()
89 | 
90 |         with th.set_grad_enabled(is_training):
91 |             out = model(images)
92 |             loss = criterion(out, subject_ids)
93 |             _, preds = th.max(out, 1)
94 | 
95 |             if is_training:
96 |                 loss.backward()
97 |                 optimiser.step()
98 |                 running_loss_pred += loss.item()
99 | 
100 |         y_pred.append(preds.detach().cpu().numpy())
101 |         y_true.append(subject_ids.detach().cpu().numpy())
102 | 
103 |     if is_training:
104 |         running_loss_pred /= samples_count
105 |     y_pred = np.hstack(y_pred)
106 |     y_true = np.hstack(y_true)
107 |     step_acc = accuracy_score(y_true, y_pred)
108 |     return running_loss_pred, step_acc
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PPML: Machine Learning on Data you cannot see
2 | 
3 | Repository for the [tutorial](https://2024.pycon.it/en/submission/ryez) on **Privacy-Preserving Machine Learning** (`PPML`) presented at [PyCon Italia 2024](https://2024.pycon.it)
4 | 
5 | ## Intro
6 | 
7 | Privacy guarantees are **the** most crucial requirement when it comes to analysing sensitive data. These requirements can sometimes be so stringent that they become a real barrier for the entire pipeline. Reasons for this are manifold, and involve the fact that data often cannot be _shared_ nor moved from their silos of residence, let alone analysed in their _raw_ form. As a result, _data anonymisation techniques_ are sometimes used to generate a sanitised version of the original data. However, these techniques alone are not enough to guarantee that privacy will be completely preserved. Moreover, the _memorisation_ effect of Deep learning models could be maliciously exploited to _attack_ the models, and _reconstruct_ sensitive information about samples used in training, even if this information was not originally provided.
8 | 
9 | *Privacy-preserving machine learning* (PPML) methods hold the promise to overcome all those issues, allowing us to train machine learning models with full privacy guarantees.
10 | 
11 | This workshop is organised in **three** main parts. In the first part, we will introduce the main privacy threats to
12 | data and machine learning models (e.g. the _membership inference attack_).
13 | In the second part, we will work our way towards **differential privacy**: what it is, how this method works, and
14 | how differential privacy can be used with Machine learning.
15 | Lastly, we will conclude the tutorial by considering more complex ML scenarios to train Deep learning networks on encrypted data, with specialised _distributed_ settings for **remote analytics**.
16 | 
17 | ### Outline
18 | 
19 | - **Introduction**: Brief Intro to `PPML` and to the workshop (`10 mins`) [SLIDES](https://speakerdeck.com/leriomaggio/ppml-pyconit24)
20 | 
21 | - **Part 1**: Data and ML models Threats (`45 mins`)
22 |   - De-identification
23 |   - K-anonymity and its limitations
24 |   - ML Model vulnerabilities: Adversarial Examples and _inference attacks_
25 | 
26 | - **Part 2**: Short Introduction to Differential Privacy (`45 mins`)
27 | 
28 |   - Intro to Differential Privacy
29 |   - Properties of Differential Privacy
30 |   - DL training with Differential Privacy
31 | 
32 | - **Break** (`5 mins`)
33 | 
34 | - **Part 3**: Primer on Remote Data Science & PySyft (`25 mins`)
35 |   - Intro to Federated Learning
36 |   - DL training on (Homomorphically) Encrypted Data
37 |   - Remote Data Science using PySyft
38 | 
39 | 
40 | ## Get the material
41 | 
42 | Clone the current repository by running the following instructions:
43 | 
44 | ```bash
45 | cd $HOME  # This will make sure you'll be in your HOME folder
46 | git clone https://github.com/leriomaggio/ppml-tutorial.git
47 | ```
48 | 
49 | **Note**: This will create a new folder named `ppml-tutorial`. Move into this folder by typing:
50 | 
51 | ```bash
52 | cd ppml-tutorial
53 | ```
54 | 
55 | Well done! Now you should be in the right location.
56 | Bear with me for another few seconds, and follow the instructions reported below 🙏
57 | 
58 | ## Installation Instructions
59 | 
60 | All the materials in this tutorial (code, and lecture notes) are made available as
61 | Jupyter notebooks.
62 | 
63 | **(1)** There is no specific _hardware requirement_ to execute the code, i.e. running everything
64 | on your laptop should be more than fine 😊.
65 | 
66 | **(2)** As for the _software requirements_, we will be using a pretty standard Python/PyData stack:
67 | `numpy`, `pandas`, `matplotlib`, and `scikit-learn` for all the data science and Machine learning parts,
68 | along with `pytorch` and `torchvision` to work on the Deep Learning examples.
69 | 
70 | Moreover, a few **extra** / specialised packages will also be featured:
71 | - [PySyft](https://github.com/OpenMined/PySyft): A platform for Remote Data Science
72 | - [Opacus](https://opacus.ai): A library to train PyTorch models with differential privacy
73 | - [PHE](https://pypi.org/project/phe/): A Python 3 library implementing the Paillier Partially Homomorphic Encryption scheme
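
To give a quick taste of what the last package enables - arithmetic on encrypted numbers, which is what makes working on (homomorphically) encrypted data possible - here is a minimal sketch based on `phe`'s documented keypair/encrypt/decrypt API (the numbers are made up):

```python
from phe import paillier

# generate a Paillier keypair, encrypt two numbers, and add them
# while they are still encrypted
public_key, private_key = paillier.generate_paillier_keypair()
enc_a, enc_b = public_key.encrypt(3.14), public_key.encrypt(2.0)

enc_sum = enc_a + enc_b  # addition happens directly on the ciphertexts
print(private_key.decrypt(enc_sum))  # ~5.14
```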
74 | 
75 | Please refer to the [`setup.md`](./setup.md) document for step-by-step instructions to set up the environment
76 | on your computer.
77 | 
78 | If you spot any error/mistake, please feel free to reach out directly to [me](mailto:valerio@openmined.org?subject=PPML%20SciPy23%20Issue), or to open an [Issue](http://github.com/leriomaggio/ppml-tutorial/issues)
79 | on the repository.
80 | 
81 | Any feedback will be very much appreciated!
82 | 
83 | Thank you! 🙏
84 | 
85 | ## Colophon
86 | 
87 | **Author**: Valerio Maggio ([`@leriomaggio`](https://twitter.com/leriomaggio)),
88 | Researcher, [SSI Fellow](https://www.software.ac.uk/about/fellows/valerio-maggio),
89 | and Education Lead at OpenMined.
90 | 
91 | All the **Code** material is distributed under the terms of the Apache License. See the [LICENSE](./LICENSE) file for additional details.
92 | 
93 | All the instructional materials in this repository are free to use, and made available under the [Creative Commons Attribution
94 | license](https://creativecommons.org/licenses/by/4.0/). The following is a human-readable summary of (and not a substitute for) the [full legal text of the CC BY 4.0
95 | license](https://creativecommons.org/licenses/by/4.0/legalcode).
96 | 
97 | You are free:
98 | 
99 | * to **Share**---copy and redistribute the material in any medium or format
100 | * to **Adapt**---remix, transform, and build upon the material
101 | 
102 | for any purpose, even commercially.
103 | 
104 | The licensor cannot revoke these freedoms as long as you follow the
105 | license terms.
106 | 
107 | Under the following terms:
108 | 
109 | * **Attribution** --- You must give appropriate credit, provide a link to the
110 |   [LICENSE](https://github.com/leriomaggio/ppml-tutorial/LICENSE) ([`cc-by-human`](https://creativecommons.org/licenses/by/4.0/)),
111 |   and indicate if changes were made.
112 |   You may do so in any reasonable manner, but not in any way that suggests the
113 |   licensor endorses you or your use.
114 | 
115 | * **No additional restrictions** --- You may not apply legal terms or
116 |   technological measures that legally restrict others from doing
117 |   anything the license permits.
118 | 
119 | ### Acknowledgment and funding
120 | 
121 | The material developed in this tutorial has been supported by the [Software Sustainability Institute](https://www.software.ac.uk) (SSI), as part of my
122 | [SSI fellowship](https://www.software.ac.uk/about/fellows/valerio-maggio) on `PETs` (Privacy Enhancing Technologies).
123 | 
124 | Please see this [deck](https://speakerdeck.com/leriomaggio/privacy-enhancing-data-science-ssi-fellowship-2022) to know more about my fellowship plans.
125 | 
126 | A public shout out to all the people at [OpenMined](https://www.openmined.org) for all the encouragement and support in the preparation of this tutorial.
127 | I hope the material in this repository can contribute to raising awareness about all the amazing work on PETs that is being contributed to the Open Source and Python communities.
128 | 
129 | ![OpenMined](./logos/openmined_logo_small.png "OpenMined")
130 | 
131 | ## Contacts
132 | 
133 | For any questions or doubts, feel free to open an [issue](https://github.com/leriomaggio/ppml-tutorial/issues) in the repository, or drop me an email @ `valerio_at_openmined_dot_org`.
134 | -------------------------------------------------------------------------------- /3-differential-privacy/6-MIA-Reconstruction-OPACUS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "94a85538-8e7e-4771-a5e7-d7b18d3b81bd", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Optional: setup NoTexBook theme\n", 11 | "%load_ext notexbook\n", 12 | "\n", 13 | "%texify" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "f5f5b821", 19 | "metadata": {}, 20 | "source": [ 21 | "# Model Inversion Attack" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "68655027", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import torch as th\n", 32 | "import numpy as np\n", 33 | "\n", 34 | "from matplotlib import pyplot as plt\n", 35 | "\n", 36 | "%matplotlib inline" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "id": "04e011e0-7cd6-4832-a72c-be001f23fee8", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# Tweak to reuse the Python modules defined in previous section\n", 47 | "import sys, os\n", 48 | "from pathlib import Path\n", 49 | "\n", 50 | "sys.path.insert(0, os.path.join(os.path.abspath(os.path.curdir), \"..\", \"2-ml-models-attacks\"))" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "id": "45779d5a", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from dataset import ORLFaces\n", 61 | "from torchvision.transforms import ToTensor" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "id": "3d5132ab", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "# NOTE: This is a hack to get around \"User-agent\" limitations when downloading MNIST datasets\n", 72 | "# see, https://github.com/pytorch/vision/issues/3497 for more information\n", 73 | "from six.moves import urllib\n", 74 | "\n", 75 | "opener = urllib.request.build_opener()\n", 76 | "opener.addheaders = [(\"User-agent\", \"Mozilla/5.0\")]\n", 77 | "urllib.request.install_opener(opener)\n", 78 | "\n", 79 | "from pathlib import Path\n", 80 | "import os\n", 81 | "\n", 82 | "DATA_FOLDER = Path(os.path.join(os.path.abspath(os.path.curdir), \"..\")) / \"data\"" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "id": "4db6abf3", 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "orl_faces_train = ORLFaces(\n", 93 | " root=DATA_FOLDER, download=True, split=\"train\", transform=ToTensor()\n", 94 | ")\n", 95 | "orl_faces_test = ORLFaces(root=DATA_FOLDER, download=True, split=\"test\", transform=ToTensor())" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "id": "d0d51644", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "orl_faces_train.data.shape, orl_faces_test.data.shape" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "id": "859989b5", 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "from torch.utils.data import DataLoader\n", 116 | "\n", 117 | "train_loader = DataLoader(\n", 118 | " orl_faces_train, batch_size=32, shuffle=False, drop_last=False\n", 119 | ")" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "id": "c8c16f84", 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "# Reconstruction 
Attack Settings\n", 130 | "# See Paper, Section 5.2 - Reconstruction Attack\n", 131 | "α = 5000\n", 132 | "β = 100\n", 133 | "γ = 0.99\n", 134 | "λ = 0.1" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "id": "39426d25", 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "from models import SoftmaxRegression, MLP" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "id": "34de5f7f", 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "from pathlib import Path\n", 155 | "\n", 156 | "CHECKPOINT_FOLDER = Path(\"./checkpoints/\")\n", 157 | "\n", 158 | "\n", 159 | "def load_weights(model, model_name: str = None) -> th.TensorType:\n", 160 | " if model_name is None or not model_name:\n", 161 | " model_name = model.__class__.__name__.lower()\n", 162 | " w_file = CHECKPOINT_FOLDER / f\"{model_name}.pt\"\n", 163 | " try:\n", 164 | " weights = th.load(open(w_file, \"rb\"))\n", 165 | " except FileNotFoundError:\n", 166 | " print(f\"Model Weights file {w_file} does not exist! Please check.\")\n", 167 | " return None\n", 168 | " return weights" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "id": "d37c65ad", 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "softmax_reg = SoftmaxRegression()\n", 179 | "weights = load_weights(softmax_reg, model_name=\"softmax_reg_opacus_test\")\n", 180 | "\n", 181 | "weights[\"regression.weight\"] = weights[\"_module.regression.weight\"]\n", 182 | "_ = weights.pop(\"_module.regression.weight\")\n", 183 | "\n", 184 | "weights[\"regression.bias\"] = weights[\"_module.regression.bias\"]\n", 185 | "_ = weights.pop(\"_module.regression.bias\")\n", 186 | "\n", 187 | "if weights is not None:\n", 188 | " softmax_reg.load_state_dict(weights)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "id": "81e83c5b", 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "def process(im_flatten):\n", 199 | " max_v = th.max(im_flatten)\n", 200 | " min_v = th.min(im_flatten)\n", 201 | " return (im_flatten - min_v) / (max_v - min_v)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "id": "e52a0ac1", 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "def mi_face(model, target_label):\n", 212 | " aim_tensor = th.zeros(1, 112 * 92)\n", 213 | " aim_tensor.requires_grad = True\n", 214 | "\n", 215 | " lossn_1 = 10\n", 216 | " b = 0\n", 217 | " g = 0\n", 218 | "\n", 219 | " out = model(aim_tensor.detach())\n", 220 | " _, pred = th.max(out, 1)\n", 221 | " print(pred)\n", 222 | " print(f\"original input image {target_label}\")\n", 223 | " plt.imshow(\n", 224 | " np.transpose(aim_tensor.detach().reshape(1, 112, 92).numpy(), (1, 2, 0)),\n", 225 | " cmap=\"Greys\",\n", 226 | " )\n", 227 | " plt.show()\n", 228 | " print(\n", 229 | " f\"original input image predict label {target_label} - predict label: {pred.item()}\"\n", 230 | " )\n", 231 | "\n", 232 | " criterion = th.nn.NLLLoss()\n", 233 | "\n", 234 | " for i in range(α):\n", 235 | " out = model(aim_tensor)\n", 236 | " if aim_tensor.grad is not None:\n", 237 | " aim_tensor.grad.zero_()\n", 238 | " out = out.reshape(1, 40)\n", 239 | " target_class = th.tensor([target_label])\n", 240 | " loss = criterion(out, target_class)\n", 241 | " loss.backward()\n", 242 | " aim_grad = aim_tensor.grad\n", 243 | "\n", 244 | " # SGD Step\n", 245 | " # see 
https://pytorch.org/docs/stable/generated/torch.optim.SGD.html#torch.optim.SGD\n", 246 | " aim_tensor = aim_tensor - (λ * aim_grad)\n", 247 | " aim_tensor = process(aim_tensor)\n", 248 | " aim_tensor = th.clamp(aim_tensor.detach(), 0, 1)\n", 249 | " aim_tensor.requires_grad = True\n", 250 | " if loss >= lossn_1:\n", 251 | " b += 1\n", 252 | " if b > β:\n", 253 | " break\n", 254 | " else:\n", 255 | " b = 0\n", 256 | " lossn_1 = loss\n", 257 | " if loss < γ:\n", 258 | " break\n", 259 | "\n", 260 | " print(f\"Attack completed at {i} iterations\")\n", 261 | " out = model(aim_tensor.detach())\n", 262 | " _, pred = th.max(out, 1)\n", 263 | " print(pred)\n", 264 | " print(f\"inverted image {target_label}\")\n", 265 | " plt.imshow(\n", 266 | " np.transpose(aim_tensor.detach().reshape(1, 112, 92).numpy() * 255, (1, 2, 0)),\n", 267 | " cmap=\"Greys\",\n", 268 | " )\n", 269 | " plt.show()" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "id": "44013f2f", 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "for cl in range(10):\n", 280 | " mi_face(softmax_reg, cl)" 281 | ] 282 | } 283 | ], 284 | "metadata": { 285 | "kernelspec": { 286 | "display_name": "Python 3 (ipykernel)", 287 | "language": "python", 288 | "name": "python3" 289 | }, 290 | "language_info": { 291 | "codemirror_mode": { 292 | "name": "ipython", 293 | "version": 3 294 | }, 295 | "file_extension": ".py", 296 | "mimetype": "text/x-python", 297 | "name": "python", 298 | "nbconvert_exporter": "python", 299 | "pygments_lexer": "ipython3", 300 | "version": "3.12.3" 301 | } 302 | }, 303 | "nbformat": 4, 304 | "nbformat_minor": 5 305 | } 306 | -------------------------------------------------------------------------------- /3-differential-privacy/5-MIA-Training-OPACUS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "0b9e9dde-7628-4d45-a408-afd93dd841ce", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Optional: setup NoTexBook theme\n", 11 | "%load_ext notexbook\n", 12 | "\n", 13 | "%texify" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "dcd69b34", 19 | "metadata": {}, 20 | "source": [ 21 | "# Model Inversion Attack - Model Training" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "id": "885f544c", 27 | "metadata": {}, 28 | "source": [ 29 | "In this notebook we will repeat the same operations done in preparation for the **Model Inversion Attack** (in section 1) \n", 30 | "\n", 31 | "The very **big** difference this time though is that we will be using **Opacus** to train our ML model.\n", 32 | "\n", 33 | "$\\rightarrow$ ‼️ The very **remarkable** thing to notice is **how little** the implementation changes wrt. 
the previous notebook\n",
34 | "(in fact, we will be using the **same** `train` function defined previously)"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "id": "eee64647",
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "import torch as th\n",
45 | "import numpy as np\n",
46 | "\n",
47 | "from matplotlib import pyplot as plt\n",
48 | "\n",
49 | "%matplotlib inline"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "id": "ff722fd0",
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "import warnings\n",
60 | "warnings.simplefilter(\"ignore\")\n",
61 | "\n",
62 | "\n",
63 | "# Tweak to reuse the Python modules defined in previous section\n",
64 | "import sys, os\n",
65 | "from pathlib import Path\n",
66 | "\n",
67 | "sys.path.insert(0, os.path.join(os.path.abspath(os.path.curdir), \"..\", \"2-ml-models-attacks\"))"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "id": "9086c266",
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "from dataset import ORLFaces\n",
78 | "from torchvision.transforms import ToTensor, Grayscale, Compose\n",
79 | "from torch.utils.data import DataLoader"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "id": "bbc48ffb",
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "SEED = 123456\n",
90 | "\n",
91 | "np.random.seed(SEED)\n",
92 | "th.manual_seed(SEED)\n",
93 | "if th.cuda.is_available():\n",
94 | "    th.cuda.manual_seed_all(SEED)\n",
95 | "    th.backends.cudnn.deterministic = True"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "id": "93241bc5",
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "# NOTE: This is a hack to get around \"User-agent\" limitations when downloading MNIST datasets\n",
106 | "# see, https://github.com/pytorch/vision/issues/3497 for more information\n",
107 | "from six.moves import urllib\n",
108 | "\n",
109 | "opener = urllib.request.build_opener()\n",
110 | "opener.addheaders = [(\"User-agent\", \"Mozilla/5.0\")]\n",
111 | "urllib.request.install_opener(opener)\n",
112 | "\n",
113 | "from pathlib import Path\n",
114 | "import os\n",
115 | "\n",
116 | "DATA_FOLDER = Path(os.path.join(os.path.abspath(os.path.curdir), \"..\")) / \"data\""
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "id": "c16625ec",
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "imgs_transform = Compose([Grayscale(num_output_channels=1), ToTensor()])\n",
127 | "\n",
128 | "orl_faces_train = ORLFaces(\n",
129 | "    root=DATA_FOLDER, download=True, split=\"train\", transform=imgs_transform\n",
130 | ")\n",
131 | "orl_faces_test = ORLFaces(\n",
132 | "    root=DATA_FOLDER, download=True, split=\"test\", transform=imgs_transform\n",
133 | ")"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "id": "b9ae6a51",
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "BATCH_SIZE = 32\n",
144 | "\n",
145 | "train_loader = DataLoader(\n",
146 | "    orl_faces_train, batch_size=BATCH_SIZE, shuffle=True, drop_last=False\n",
147 | ")\n",
148 | "test_loader = DataLoader(\n",
149 | "    orl_faces_test, batch_size=BATCH_SIZE, shuffle=False, drop_last=False\n",
150 | ")"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "id": "425c305c",
156 | "metadata": {},
157 | "source": [
158 | "Show some of the training images, for fun"
159 | 
] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "id": "61e794b9", 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "from torchvision.utils import make_grid\n", 169 | "\n", 170 | "\n", 171 | "def imshow(img):\n", 172 | " npimg = img.numpy()\n", 173 | " plt.figure(figsize=(10, 12))\n", 174 | " plt.imshow(np.transpose(npimg, (1, 2, 0)))\n", 175 | " plt.show()" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "id": "6008bf8c", 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "# get some random training images\n", 186 | "images, labels = next(iter(train_loader))" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "id": "8e03a7f6", 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "# show images\n", 197 | "imshow(make_grid(images))\n", 198 | "# print labels\n", 199 | "print(\" \".join(f\"{labels[j]}\" for j in range(BATCH_SIZE)))" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "id": "cdc04e1d", 205 | "metadata": {}, 206 | "source": [ 207 | "## Privacy Parameters and Opacus" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "id": "e0d1c795", 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "from models import SoftmaxRegression, MLP" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "id": "62df081d", 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "softmax_reg = SoftmaxRegression()" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "id": "b5030c0e", 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "from opacus.validators import ModuleValidator\n", 238 | "\n", 239 | "errors = ModuleValidator.validate(softmax_reg, strict=False)\n", 240 | "errors" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "id": "dd545cdb", 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "λ = 0.1 # optimiser learning rate" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "id": "67025880", 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "softmax_reg = SoftmaxRegression()\n", 261 | "softmax_sgd = th.optim.SGD(softmax_reg.parameters(), lr=λ)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "id": "9449cbff", 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "from opacus import PrivacyEngine" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "id": "cb9ff406", 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "from train import train" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "id": "74c6ecf0", 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "MAX_GRAD_NORM = 1.2\n", 292 | "EPSILON = 50\n", 293 | "DELTA = 1e-5\n", 294 | "EPOCHS = 200 # we have increased by 100 the number of epochs of training" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "id": "562a43d0", 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "privacy_engine = PrivacyEngine(accountant=\"gdp\")\n", 305 | "\n", 306 | "softmax_reg, softmax_sgd, train_loader = privacy_engine.make_private_with_epsilon(\n", 307 | " module=softmax_reg,\n", 308 | " optimizer=softmax_sgd,\n", 309 | " 
data_loader=train_loader,\n", 310 | " epochs=EPOCHS,\n", 311 | " target_epsilon=EPSILON,\n", 312 | " target_delta=DELTA,\n", 313 | " max_grad_norm=MAX_GRAD_NORM,\n", 314 | ")\n", 315 | "\n", 316 | "print(f\"Using sigma={softmax_sgd.noise_multiplier} and C={MAX_GRAD_NORM}\")" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "id": "4f9c046a", 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "train(model=softmax_reg, optimiser=softmax_sgd, loaders=(train_loader, test_loader), \n", 327 | " model_name=\"softmax_reg_opacus_test\", verbose=False, epochs=EPOCHS)" 328 | ] 329 | } 330 | ], 331 | "metadata": { 332 | "kernelspec": { 333 | "display_name": "Python 3 (ipykernel)", 334 | "language": "python", 335 | "name": "python3" 336 | }, 337 | "language_info": { 338 | "codemirror_mode": { 339 | "name": "ipython", 340 | "version": 3 341 | }, 342 | "file_extension": ".py", 343 | "mimetype": "text/x-python", 344 | "name": "python", 345 | "nbconvert_exporter": "python", 346 | "pygments_lexer": "ipython3", 347 | "version": "3.12.3" 348 | } 349 | }, 350 | "nbformat": 4, 351 | "nbformat_minor": 5 352 | } 353 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | --------------------------------------------------------------------------------
/3-differential-privacy/2-approx-differential-privacy.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Optional: setup NoTexBook theme\n",
10 | "%load_ext notexbook\n",
11 | "%texify"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "**Adapted from**: [Ch6](https://github.com/uvm-plaid/programming-dp/blob/master/notebooks/ch6.ipynb)"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "# Approximate Differential Privacy\n",
26 | "\n",
27 | "\n",
28 | "Approximate differential privacy, also called $(\epsilon, \delta)$-differential privacy, has the following definition:\n",
29 | "\n",
30 | "\begin{align}\n",
31 | "\mathsf{Pr}[F(x) = S] \leq e^\epsilon \mathsf{Pr}[F(x') = S] + \delta\n",
32 | "\end{align}\n",
33 | "\n",
34 | "The new privacy parameter, $\delta$, represents a \"failure probability\" for the definition. \n",
35 | "\n",
36 | "With probability $1-\delta$, we will get the same guarantee as pure differential privacy; with probability $\delta$, we get no guarantee. \n",
37 | "\n",
38 | "In other words:\n",
39 | "\n",
40 | "- With probability $1-\delta$, $\frac{\mathsf{Pr}[F(x) = S]}{\mathsf{Pr}[F(x') = S]} \leq e^\epsilon$\n",
41 | "- With probability $\delta$, we get no guarantee at all\n",
42 | "\n",
43 | "This definition should seem a little bit scary! \n",
44 | "\n",
45 | "With probability $\delta$, anything at all could happen - including a release of the entire sensitive dataset! \n",
46 | "\n",
47 | "For this reason, we typically require $\delta$ to be very small - usually $\frac{1}{n^2}$ or less, where $n$ is the size of the dataset. \n",
48 | "\n",
49 | "In practice, however, the $(\epsilon, \delta)$-differentially private mechanisms in common use \n",
50 | "don't fail catastrophically - as allowed by the definition - instead, they fail *gracefully*, and don't do terrible things like \n",
51 | "releasing the entire dataset.\n",
52 | "\n",
53 | "\n",
54 | "Catastrophically-failing mechanisms *are* possible, however, and they do satisfy the definition of $(\epsilon, \delta)$-differential privacy."
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "## Properties of Approximate Differential Privacy\n",
62 | "\n",
63 | "Approximate differential privacy has similar properties to pure $\epsilon$-differential privacy. It satisfies **sequential composition**:\n",
64 | "\n",
65 | "- If $F_1(x)$ satisfies $(\epsilon_1, \delta_1)$-differential privacy\n",
66 | "- And $F_2(x)$ satisfies $(\epsilon_2, \delta_2)$-differential privacy\n",
67 | "- Then the mechanism $G(x) = (F_1(x), F_2(x))$ which releases both results satisfies $(\epsilon_1+\epsilon_2, \delta_1 + \delta_2)$-differential privacy\n",
68 | "\n",
69 | "The only difference from the pure $\epsilon$ setting is that we add up the values of $\delta$ as well as the values of $\epsilon$. \n",
70 | "\n",
71 | "Approximate differential privacy also satisfies **post-processing and parallel composition**.\n",
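"\n",
"As a quick numeric sanity check, here is a minimal sketch of the composition bookkeeping (our own addition, not part of the adapted chapter - `compose_sequentially` is a hypothetical helper name):\n",
"\n",
"```python\n",
"def compose_sequentially(budgets):\n",
"    # budgets: a list of (epsilon, delta) pairs, one per mechanism\n",
"    # sequential composition: the totals are simply the sums of both parameters\n",
"    return sum(eps for eps, _ in budgets), sum(delta for _, delta in budgets)\n",
"\n",
"# two releases at (1.0, 1e-5) each compose to (2.0, 2e-05)\n",
"print(compose_sequentially([(1.0, 1e-5), (1.0, 1e-5)]))\n",
"```"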
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": [
78 | "## The Gaussian Mechanism\n",
79 | "\n",
80 | "The Gaussian mechanism is an alternative to the Laplace mechanism, which adds Gaussian noise instead of Laplacian noise. \n",
81 | "\n",
82 | "The Gaussian mechanism does *not* satisfy pure $\epsilon$-differential privacy, but does satisfy $(\epsilon, \delta)$-differential \n",
83 | "privacy.\n",
84 | "\n",
85 | "According to the Gaussian mechanism, for a function $f(x)$ which returns a number, the following definition of $F(x)$ satisfies $(\epsilon, \delta)$-differential privacy:\n",
86 | "\n",
87 | "\begin{align}\n",
88 | "F(x) = f(x) + \mathcal{N}(\sigma^2)\\\n",
89 | "\text{where } \sigma^2 = \frac{2s^2 \log(1.25/\delta)}{\epsilon^2}\n",
90 | "\end{align}\n",
91 | "\n",
92 | "where $s$ is the sensitivity of $f$, and $\mathcal{N}(\sigma^2)$ denotes sampling from the Gaussian (normal) distribution with center 0 and variance $\sigma^2$. Note that here (and elsewhere in these notes), $\log$ denotes the natural logarithm."
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "For real-valued functions $f : D \rightarrow \mathbb{R}$, we can use the Gaussian mechanism in exactly the same way as we do the Laplace mechanism, and it's easy to compare what happens under both mechanisms for a given value of $\epsilon$."
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "metadata": {},
106 | "outputs": [],
107 | "source": [
108 | "import pandas as pd\n",
109 | "import numpy as np\n",
110 | "import matplotlib.pyplot as plt\n",
111 | "plt.style.use('seaborn-v0_8-whitegrid')"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {
118 | "tags": [
119 | "hide-input"
120 | ]
121 | },
122 | "outputs": [],
123 | "source": [
124 | "epsilon = 1\n",
125 | "vals_laplace = [np.random.laplace(loc=0, scale=1/epsilon) for x in range(100000)]\n",
126 | "\n",
127 | "delta = 1e-5  # 10^-5, consistent with the delta quoted in the text below\n",
128 | "sigma = np.sqrt(2 * np.log(1.25 / delta)) * 1 / epsilon\n",
129 | "vals_gauss = [np.random.normal(loc=0, scale=sigma) for x in range(100000)]\n",
130 | "\n",
131 | "plt.hist(vals_laplace, bins=50, label='Laplace')\n",
132 | "plt.hist(vals_gauss, bins=50, alpha=.7, label='Gaussian');\n",
133 | "plt.legend();"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "Here, we graph the empirical probability density function of the Laplace and Gaussian mechanisms for $\epsilon = 1$, with $\delta = 10^{-5}$ for the Gaussian mechanism.\n",
141 | "\n",
142 | "Compared to the Laplace mechanism, the plot for the Gaussian mechanism looks \"squished.\" \n",
143 | "\n",
144 | "Differentially private outputs which are far from the true answer are much more likely using the Gaussian mechanism than they are under the Laplace mechanism (which, by comparison, looks extremely \"pointy\").\n",
145 | "\n",
146 | "So the Gaussian mechanism has two **major drawbacks**:\n",
147 | "\n",
148 | "1. it requires the use of the relaxed $(\epsilon, \delta)$-differential privacy definition, *and* \n",
149 | "2. it's less accurate than the Laplace mechanism. \n",
150 | "\n",
151 | "Why would we want to use it?\n",
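"\n",
"Before answering, here is a minimal sketch of the scalar Gaussian mechanism exactly as defined above (our own helper - `gaussian_mech` is a hypothetical name, not a library function - reusing `np` from the imports):\n",
"\n",
"```python\n",
"def gaussian_mech(true_answer, sensitivity, epsilon, delta):\n",
"    # sigma^2 = 2 s^2 log(1.25/delta) / epsilon^2, per the definition above\n",
"    sigma = np.sqrt(2 * sensitivity**2 * np.log(1.25 / delta)) / epsilon\n",
"    return true_answer + np.random.normal(loc=0, scale=sigma)\n",
"\n",
"# e.g. a counting query (sensitivity 1) whose true answer is 100,\n",
"# released with (1.0, 1e-5)-differential privacy\n",
"gaussian_mech(100, sensitivity=1, epsilon=1.0, delta=1e-5)\n",
"```"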
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "## Vector-Valued Functions and their Sensitivities\n",
159 | "\n",
160 | "So far, we have only considered real-valued functions (i.e. the function's output is always a single real number). \n",
161 | "\n",
162 | "Such functions are of the form $f : D \rightarrow \mathbb{R}$. \n",
163 | "\n",
164 | "Both the Laplace and Gaussian mechanisms, however, can be extended to *vector-valued* functions of the form $f : D \rightarrow \mathbb{R}^k$, which return vectors of real numbers. \n",
165 | "\n",
166 | "We can think of histograms as vector-valued functions, which return a vector whose elements consist of histogram bin counts.\n",
167 | "\n",
168 | "We saw earlier that the *sensitivity* of a function is:\n",
169 | "\n",
170 | "\begin{align}\n",
171 | "GS(f) = \max_{d(x,x') \leq 1} \lvert f(x) - f(x') \rvert\n",
172 | "\end{align}\n",
173 | "\n",
174 | "How do we define sensitivity for vector-valued functions?\n",
175 | "\n",
176 | "Consider the expression $f(x) - f(x')$. \n",
177 | "\n",
178 | "If $f$ is a vector-valued function, then this expression represents the difference between two vectors, which can be computed as the difference between their corresponding elements (the difference of two length-$k$ vectors is thus a new length-$k$ vector). \n",
179 | "\n",
180 | "This new vector is the distance between $f(x)$ and $f(x')$, represented as a vector.\n",
181 | "\n",
182 | "The magnitude of this vector is the sensitivity of $f$. \n",
183 | "\n",
184 | "There are several ways to compute the magnitude of a vector; we'll use two of them: the $L1$ norm and the $L2$ norm.\n",
185 | "\n",
186 | "### L1 and L2 Norms\n",
187 | "\n",
188 | "The $L1$ norm of a vector $V$ of length $k$ is defined as $\lVert V \rVert_1 = \sum_{i=1}^k \lvert V_i \rvert$ (i.e. it's the sum of the absolute values of the vector's elements). In 2-dimensional space, the $L1$ norm of the difference between two vectors yields the \"Manhattan distance\" between them.\n",
189 | "\n",
190 | "The $L2$ norm of a vector $V$ of length $k$ is defined as $\lVert V \rVert_2 = \sqrt{\sum_{i=1}^k V_i^2}$ (i.e. the square root of the sum of the squares). In 2-dimensional space, this is the \"Euclidean distance\", and it's always less than or equal to the $L1$ norm.\n",
191 | "\n",
192 | "### L1 and L2 Sensitivities\n",
193 | "\n",
194 | "The $L1$ sensitivity of a vector-valued function $f$ is:\n",
195 | "\n",
196 | "\begin{align}\n",
197 | "GS(f) = \max_{d(x,x') \leq 1} \lVert f(x) - f(x') \rVert_1\n",
198 | "\end{align}\n",
199 | "\n",
200 | "This is equal to the sum of the *elementwise* sensitivities. For example, if we define a vector-valued function $f$ that returns a length-$k$ vector of 1-sensitive results, then the $L1$ sensitivity of $f$ is $k$.\n",
201 | "\n",
202 | "Similarly, the $L2$ sensitivity of a vector-valued function $f$ is:\n",
203 | "\n",
204 | "\begin{align}\n",
205 | "GS_2(f) = \max_{d(x,x') \leq 1} \lVert f(x) - f(x') \rVert_2\n",
206 | "\end{align}\n",
207 | "\n",
208 | "Using the same example as above, a vector-valued function $f$ returning a length-$k$ vector of 1-sensitive results has $L2$ sensitivity of $\sqrt{k}$. For long vectors, the $L2$ sensitivity will obviously be much lower than the $L1$ sensitivity!
For some applications, like machine learning algorithms (which sometimes return vectors with thousands of elements), $L2$ sensitivity is *significantly* lower than $L1$ sensitivity.\n",
209 | "\n",
210 | "### Choosing Between L1 and L2\n",
211 | "\n",
212 | "As mentioned earlier, both the Laplace and Gaussian mechanisms can be extended to vector-valued functions. \n",
213 | "\n",
214 | "However, there's a key difference between these two extensions: \n",
215 | "\n",
216 | "- the vector-valued Laplace mechanism **requires** the use of $L1$ sensitivity, while the vector-valued Gaussian mechanism allows the use of either $L1$ or $L2$ sensitivity. \n",
217 | "\n",
218 | "This is a **major strength** of the Gaussian mechanism. For applications in which $L2$ sensitivity is much lower than $L1$ sensitivity, the Gaussian mechanism allows adding *much* less noise.\n",
219 | "\n",
220 | "- The **vector-valued Laplace mechanism** releases $f(x) + (Y_1, \dots, Y_k)$, where $Y_i$ are drawn i.i.d. from the Laplace distribution with scale $\frac{s}{\epsilon}$ and $s$ is the $L1$ sensitivity of $f$\n",
221 | "\n",
222 | "- The **vector-valued Gaussian mechanism** releases $f(x) + (Y_1, \dots, Y_k)$, where $Y_i$ are drawn i.i.d. from the Gaussian distribution with $\sigma^2 = \frac{2s^2 \log(1.25/\delta)}{\epsilon^2}$ and $s$ is the $L2$ sensitivity of $f$"
223 | ]
224 | }
225 | ],
226 | "metadata": {
227 | "kernelspec": {
228 | "display_name": "Python 3 (ipykernel)",
229 | "language": "python",
230 | "name": "python3"
231 | },
232 | "language_info": {
233 | "codemirror_mode": {
234 | "name": "ipython",
235 | "version": 3
236 | },
237 | "file_extension": ".py",
238 | "mimetype": "text/x-python",
239 | "name": "python",
240 | "nbconvert_exporter": "python",
241 | "pygments_lexer": "ipython3",
242 | "version": "3.12.3"
243 | }
244 | },
245 | "nbformat": 4,
246 | "nbformat_minor": 4
247 | }
248 |
--------------------------------------------------------------------------------
/2-ml-models-attacks/3-MIA-Reconstruction.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "5f01a093-5560-4e09-a9c3-95c097fdbcb6",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "# Optional: setup NoTexBook theme\n",
11 | "%load_ext notexbook\n",
12 | "\n",
13 | "%texify"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "id": "3ba8845d-8556-402d-a2fc-52d8b4e3dc2b",
19 | "metadata": {},
20 | "source": [
21 | "# Model Inversion Attack"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "id": "1c67e4d2",
27 | "metadata": {},
28 | "source": [
29 | "In this notebook we will perform the **Model Inversion Attack** against two pre-trained ML models, as originally described in the reference paper:\n",
30 | "\n",
31 | "> **Model Inversion Attacks that Exploit Confidence Information and Basic Countermeasures**, by _Fredrikson et al._, 2015 \n",
32 | "[DOI](https://dl.acm.org/doi/pdf/10.1145/2810103.2813677).\n",
33 | "\n",
34 | "The two models are `SoftmaxRegression` and `MLP`.\n",
35 | "\n",
36 | "⚠️ **Note**: All the experimental settings and choices made in this notebook _replicate_ exactly those of the original paper."
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "id": "68655027",
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "import torch as th\n",
47 | "import numpy as np\n",
48 | "\n",
49 | "from matplotlib import pyplot as plt\n",
50 | "\n",
51 | "%matplotlib inline"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "id": "1942bb1a",
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "# NOTE: This is a hack to get around \"User-agent\" limitations when downloading datasets via torchvision\n",
62 | "# see, https://github.com/pytorch/vision/issues/3497 for more information\n",
63 | "from six.moves import urllib\n",
64 | "\n",
65 | "opener = urllib.request.build_opener()\n",
66 | "opener.addheaders = [(\"User-agent\", \"Mozilla/5.0\")]\n",
67 | "urllib.request.install_opener(opener)"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "id": "44bf0bd8",
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "from pathlib import Path\n",
78 | "import os\n",
79 | "\n",
80 | "DATA_FOLDER = Path(os.path.join(os.path.abspath(os.path.curdir), \"..\")) / \"data\""
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "id": "45779d5a",
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "from dataset import ORLFaces\n",
91 | "from torchvision.transforms import ToTensor"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "id": "4db6abf3",
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "orl_faces_train = ORLFaces(root=DATA_FOLDER, download=True, split=\"train\", transform=ToTensor())\n",
102 | "orl_faces_test = ORLFaces(root=DATA_FOLDER, download=True, split=\"test\", transform=ToTensor())"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "id": "d0d51644",
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "orl_faces_train.data.shape, orl_faces_test.data.shape"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "id": "859989b5",
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "from torch.utils.data import DataLoader\n",
123 | "\n",
124 | "train_loader = DataLoader(orl_faces_train, batch_size=32, shuffle=False, drop_last=False)"
125 | ]
126 | },
127 | {
128 | "cell_type": "markdown",
129 | "id": "cd8754dc",
130 | "metadata": {},
131 | "source": [
132 | "## Reconstruction Attack\n",
133 | "\n",
134 | "#### Settings"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "id": "c8c16f84",
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "# Reconstruction Attack Settings\n",
145 | "# See Paper, Section 5.2 - Reconstruction Attack\n",
146 | "α = 5000 # total iterations\n",
147 | "β = 100 # max nr.
of iterations without improvements\n",
148 | "γ = 0.99 # threshold of the cost \n",
149 | "λ = 0.1 # learning rate"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "id": "b7b97e4a",
155 | "metadata": {},
156 | "source": [
157 | "#### Load Pre-trained Models"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "id": "39426d25",
164 | "metadata": {},
165 | "outputs": [],
166 | "source": [
167 | "from models import SoftmaxRegression"
168 | ]
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "id": "7b643a8f",
173 | "metadata": {},
174 | "source": [
175 | "⚠️ If you skipped the **`MIA-Training`** notebook, please download the **pre-trained** weights of the `SoftmaxRegression` model here: [softmax_mia.pt](https://www.dropbox.com/s/t9wglqyj5zr74fq/softmax_mia.pt?dl=1) and save it into the local `checkpoints` folder\n",
176 | "\n"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": null,
182 | "id": "34de5f7f",
183 | "metadata": {},
184 | "outputs": [],
185 | "source": [
186 | "from pathlib import Path \n",
187 | "\n",
188 | "CHECKPOINT_FOLDER = Path(\"./checkpoints/\")\n",
189 | "CHECKPOINT_FOLDER.mkdir(exist_ok=True)\n",
190 | "\n",
191 | "def load_weights(model, model_filename: str = None):\n",
192 | "    if not model_filename:  # covers both None and the empty string\n",
193 | "        model_filename = f\"{model.__class__.__name__.lower()}.pt\"\n",
194 | "    w_file = CHECKPOINT_FOLDER / model_filename\n",
195 | "    try:\n",
196 | "        weights = th.load(w_file)\n",
197 | "    except FileNotFoundError: \n",
198 | "        print(f\"Model Weights file {w_file} does not exist! Please check.\")\n",
199 | "        return None\n",
200 | "    return weights\n"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": null,
206 | "id": "d37c65ad",
207 | "metadata": {},
208 | "outputs": [],
209 | "source": [
210 | "softmax_reg = SoftmaxRegression()\n",
211 | "weights = load_weights(softmax_reg, model_filename=\"softmax_mia.pt\")\n",
212 | "if weights is not None:\n",
213 | "    softmax_reg.load_state_dict(weights)\n",
214 | "    \n",
215 | "softmax_reg"
216 | ]
217 | },
218 | {
219 | "cell_type": "markdown",
220 | "id": "ba0018ae",
221 | "metadata": {},
222 | "source": [
223 | "## MIA Reconstruction Strategy\n",
224 | "\n",
225 | "\n",
226 | "<img src=\"./mia_reconstruction.png\" alt=\"MIA Reconstruction Strategy\" />"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": null,
232 | "id": "81e83c5b",
233 | "metadata": {},
234 | "outputs": [],
235 | "source": [
236 | "def process(im_flatten):\n",
237 | "    max_v = th.max(im_flatten)\n",
238 | "    min_v = th.min(im_flatten)\n",
239 | "    return (im_flatten-min_v) / (max_v - min_v)"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": null,
245 | "id": "e52a0ac1",
246 | "metadata": {},
247 | "outputs": [],
248 | "source": [
249 | "def mi_face(model, target_label):\n",
250 | "    aim_tensor = th.zeros(1, 112*92)\n",
251 | "    aim_tensor.requires_grad = True\n",
252 | "    \n",
253 | "    lossn_1 = 10  # loss at the previous iteration (loss_{n-1})\n",
254 | "    b = 0\n",
255 | "    g = 0\n",
256 | "    \n",
257 | "    out = model(aim_tensor.detach())\n",
258 | "    _, pred = th.max(out, 1)\n",
259 | "    print(pred)\n",
260 | "    print(f'original input image {target_label}')\n",
261 | "    plt.imshow(np.transpose(aim_tensor.detach().reshape(1, 112, 92).numpy(), (1, 2, 0)), cmap=\"Greys\")\n",
262 | "    plt.show()\n",
263 | "    print(f'original input image target label: {target_label} - predicted label: {pred.item()}')\n",
264 | "    \n",
265 | "    criterion = th.nn.NLLLoss()\n",
266 | "    \n",
267 | "    for i in
range(α):\n",
268 | "        out = model(aim_tensor)\n",
269 | "        if aim_tensor.grad is not None:\n",
270 | "            aim_tensor.grad.zero_()\n",
271 | "        out = out.reshape(1, 40)\n",
272 | "        target_class = th.tensor([target_label])\n",
273 | "        loss = criterion(out, target_class)\n",
274 | "        loss.backward()\n",
275 | "        aim_grad = aim_tensor.grad\n",
276 | "        \n",
277 | "        # SGD Step\n",
278 | "        # see https://pytorch.org/docs/stable/generated/torch.optim.SGD.html#torch.optim.SGD\n",
279 | "        aim_tensor = aim_tensor - (λ * aim_grad)\n",
280 | "        aim_tensor = process(aim_tensor)\n",
281 | "        aim_tensor = th.clamp(aim_tensor.detach(), 0, 1)\n",
282 | "        aim_tensor.requires_grad = True\n",
283 | "        if loss >= lossn_1:\n",
284 | "            b += 1\n",
285 | "            if b > β:\n",
286 | "                break\n",
287 | "        else:\n",
288 | "            b = 0\n",
289 | "        lossn_1 = loss.item()  # keep a plain float, detached from the graph\n",
290 | "        if loss < γ:\n",
291 | "            break\n",
292 | "    \n",
293 | "    print(f\"Attack completed after {i} iterations\")\n",
294 | "    out = model(aim_tensor.detach())\n",
295 | "    _, pred = th.max(out, 1)\n",
296 | "    print(pred)\n",
297 | "    print(f'inverted image {target_label}')\n",
298 | "    plt.imshow(np.transpose(aim_tensor.detach().reshape(1, 112, 92).numpy() * 255, (1, 2, 0)), cmap=\"Greys\")\n",
299 | "    plt.show()\n",
300 | "\n",
301 | "    "
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": null,
307 | "id": "44013f2f",
308 | "metadata": {},
309 | "outputs": [],
310 | "source": [
311 | "# Let's try to reconstruct the data for the first 10 classes (i.e. faces)\n",
312 | "for cl in range(10):\n",
313 | "    mi_face(softmax_reg, cl)"
314 | ]
315 | },
316 | {
317 | "cell_type": "markdown",
318 | "id": "2f14d689",
319 | "metadata": {},
320 | "source": [
321 | "### Exercise: \n",
322 | "\n",
323 | "Write the code to try the **model inversion reconstruction** using the `MLP` model"
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": null,
329 | "id": "3ad83641",
330 | "metadata": {},
331 | "outputs": [],
332 | "source": [
333 | "from models import MLP"
334 | ]
335 | },
336 | {
337 | "cell_type": "markdown",
338 | "id": "a3df4a1b",
339 | "metadata": {},
340 | "source": [
341 | "⚠️ Grab the **pre-trained** weights of the `MLP` model here: [mlp_mia.pt](https://www.dropbox.com/s/8ul2lj2eqcykfxm/mlp_mia.pt?dl=1) and save it into the local `checkpoints` folder"
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": null,
347 | "id": "58bab294",
348 | "metadata": {},
349 | "outputs": [],
350 | "source": [
351 | "mlp = MLP()\n",
352 | "weights = load_weights(mlp, model_filename=\"mlp_mia.pt\")\n",
353 | "if weights is not None:\n",
354 | "    mlp.load_state_dict(weights)\n",
355 | "\n",
356 | "mlp"
357 | ]
358 | },
359 | {
360 | "cell_type": "code",
361 | "execution_count": null,
362 | "id": "76662b42",
363 | "metadata": {},
364 | "outputs": [],
365 | "source": [
366 | "# Reconstruction Attack code HERE\n",
367 | "for cl in range(10):\n",
368 | "    mi_face(mlp, cl)"
369 | ]
370 | }
371 | ],
372 | "metadata": {
373 | "kernelspec": {
374 | "display_name": "Python 3 (ipykernel)",
375 | "language": "python",
376 | "name": "python3"
377 | },
378 | "language_info": {
379 | "codemirror_mode": {
380 | "name": "ipython",
381 | "version": 3
382 | },
383 | "file_extension": ".py",
384 | "mimetype": "text/x-python",
385 | "name": "python",
386 | "nbconvert_exporter": "python",
387 | "pygments_lexer": "ipython3",
388 | "version": "3.12.3"
389 | }
390 | },
391 | "nbformat": 4,
392 | "nbformat_minor": 5
393 | }
394 |
--------------------------------------------------------------------------------
/2-ml-models-attacks/2-MIA-Training.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "d4e39d12-6b19-451d-b1b9-2502d6f8e15a",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "# Optional: setup NoTexBook theme\n",
11 | "%load_ext notexbook\n",
12 | "\n",
13 | "%texify"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "id": "dcd69b34",
19 | "metadata": {},
20 | "source": [
21 | "# Model Inversion Attack - Model Training"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "id": "85ed1933",
27 | "metadata": {},
28 | "source": [
29 | "In this notebook we will train **two** (out of three) of the ML models considered in the paper:\n",
30 | "\n",
31 | "> **Model Inversion Attacks that Exploit Confidence Information and Basic Countermeasures**, by _Fredrikson et al._, 2015 \n",
32 | "[DOI](https://dl.acm.org/doi/pdf/10.1145/2810103.2813677).\n",
33 | "\n",
34 | "The two models are `SoftmaxRegression` and `MLP`.\n",
35 | "\n",
36 | "⚠️ **NOTE**: Please feel free to skip this notebook completely (if you don't want to **re-train** the models on your own) and jump directly to the next [MIA Reconstruction](./3-MIA-Reconstruction.ipynb) notebook."
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "id": "eee64647",
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "import torch as th\n",
47 | "import numpy as np\n",
48 | "\n",
49 | "from matplotlib import pyplot as plt\n",
50 | "\n",
51 | "%matplotlib inline"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "id": "3126b393",
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "# NOTE: This is a hack to get around \"User-agent\" limitations when downloading datasets via torchvision\n",
62 | "# see, https://github.com/pytorch/vision/issues/3497 for more information\n",
63 | "from six.moves import urllib\n",
64 | "\n",
65 | "opener = urllib.request.build_opener()\n",
66 | "opener.addheaders = [(\"User-agent\", \"Mozilla/5.0\")]\n",
67 | "urllib.request.install_opener(opener)"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "id": "9086c266",
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "from dataset import ORLFaces\n",
78 | "from torchvision.transforms import ToTensor, Grayscale, Compose\n",
79 | "from torch.utils.data import DataLoader"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "id": "bbc48ffb",
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "# Reproducibility Settings\n",
90 | "\n",
91 | "SEED = 123456\n",
92 | "\n",
93 | "np.random.seed(SEED)\n",
94 | "th.manual_seed(SEED)\n",
95 | "if th.cuda.is_available():\n",
96 | "    th.cuda.manual_seed_all(SEED)\n",
97 | "    th.backends.cudnn.deterministic = True"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "id": "5dc0251e",
103 | "metadata": {},
104 | "source": [
105 | "### The `ORLFaces` Dataset\n",
106 | "\n",
107 | "The original paper considers the **AT&T Database of Faces** (which I have encapsulated and made available as a PyTorch `Dataset`): `ORLFaces`"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": null,
113 | "id": "345e23a7",
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "from pathlib import
Path\n",
118 | "import os\n",
119 | "\n",
120 | "DATA_FOLDER = Path(os.path.join(os.path.abspath(os.path.curdir), \"..\")) / \"data\""
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "id": "2ee5718e",
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "print(DATA_FOLDER)"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": null,
136 | "id": "c16625ec",
137 | "metadata": {},
138 | "outputs": [],
139 | "source": [
140 | "imgs_transform = Compose([Grayscale(num_output_channels=1), ToTensor()])\n",
141 | "\n",
142 | "orl_faces_train = ORLFaces(\n",
143 | "    root=DATA_FOLDER, download=True, split=\"train\", transform=imgs_transform\n",
144 | ")\n",
145 | "orl_faces_test = ORLFaces(\n",
146 | "    root=DATA_FOLDER, download=True, split=\"test\", transform=imgs_transform\n",
147 | ")"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "id": "b9ae6a51",
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "BATCH_SIZE = 32\n",
158 | "\n",
159 | "train_loader = DataLoader(\n",
160 | "    orl_faces_train, batch_size=BATCH_SIZE, shuffle=True, drop_last=False\n",
161 | ")\n",
162 | "test_loader = DataLoader(\n",
163 | "    orl_faces_test, batch_size=BATCH_SIZE, shuffle=False, drop_last=False\n",
164 | ")"
165 | ]
166 | },
167 | {
168 | "cell_type": "markdown",
169 | "id": "bd2b2a27",
170 | "metadata": {},
171 | "source": [
172 | "#### A few notes about the dataset \n",
173 | "\n",
174 | "The `ORLFaces` dataset contains `400` image files corresponding to `40` different subjects (`10` photos each).\n",
175 | "\n",
176 | "\n",
177 | "Images are `112x92` pixels, with `256` grey levels per pixel, and (originally) stored in `PGM` format.\n",
178 | "The photos of the subjects have been taken at different times, varying the lighting, the facial expressions\n",
179 | " (e.g.
open/closed eyes, smiling/serious face), and the facial details.\n",
180 | "\n",
181 | "**Train/Test** partitions have been generated similarly to what was done in the original paper, that is:\n",
182 | "\n",
183 | "(for each subject):\n",
184 | "\n",
185 | "- Randomly pick $7$ (out of $10$) images of the subject and add them to the **training set**\n",
186 | "- Add the remaining $3$ images to the **test set**"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "id": "425c305c",
192 | "metadata": {},
193 | "source": [
194 | "#### Visualise a few Samples in the Dataset\n",
195 | "\n",
196 | "Before we start with the training, let's visualise a few random samples extracted from the dataset"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "id": "61e794b9",
203 | "metadata": {},
204 | "outputs": [],
205 | "source": [
206 | "from torchvision.utils import make_grid\n",
207 | "\n",
208 | "\n",
209 | "def imshow(img):\n",
210 | "    npimg = img.numpy()\n",
211 | "    plt.figure(figsize=(10, 12))\n",
212 | "    plt.imshow(np.transpose(npimg, (1, 2, 0)))\n",
213 | "    plt.show()"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "id": "aa210aaf",
220 | "metadata": {},
221 | "outputs": [],
222 | "source": [
223 | "# get some random training images\n",
224 | "images, labels = next(iter(train_loader))"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "id": "089395c5",
231 | "metadata": {},
232 | "outputs": [],
233 | "source": [
234 | "images.shape"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": null,
240 | "id": "7dc07e43",
241 | "metadata": {},
242 | "outputs": [],
243 | "source": [
244 | "# show images\n",
245 | "imshow(make_grid(images))\n",
246 | "# print labels\n",
247 | "print(\" \".join(f\"{labels[j]}\" for j in range(BATCH_SIZE)))"
248 | ]
249 | },
250 | {
251 | "cell_type": "markdown",
252 | "id": "2513010f",
253 | "metadata": {},
254 | "source": [
255 | "ℹ️ **Note**: Do you see the **exact same faces** displayed here? With the same `SEED` set above, the sampled batch should be reproducible."
256 | ]
257 | },
258 | {
259 | "cell_type": "markdown",
260 | "id": "2b7b9841",
261 | "metadata": {},
262 | "source": [
263 | "## Machine Learning Model Training\n",
264 | "\n",
265 | "In the original paper, the authors consider three separate models as reference examples for the Model Inversion Attack.
\n", 266 | "\n", 267 | "Here to keep things simple, we will only consider two of them: `SoftmaxRegression` and `MLP`" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "id": "e0d1c795", 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "from models import SoftmaxRegression, MLP" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "id": "b14bbfe8", 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "from train import train" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "id": "9a193ca3", 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "λ = 0.1 # optimiser learning rate, as used in the paper" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "id": "7b9d5e71", 303 | "metadata": {}, 304 | "source": [ 305 | "#### Training `SoftmaxRegression`\n", 306 | "\n", 307 | "Note: This should be super-fast even on a laptop (small model, small data)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "id": "62df081d", 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "softmax_reg = SoftmaxRegression()\n", 318 | "softmax_sgd = th.optim.SGD(softmax_reg.parameters(), lr=λ)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "id": "cb471d2d", 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "softmax_reg" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "id": "c4e0f0a2", 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "train(\n", 339 | " model=softmax_reg,\n", 340 | " optimiser=softmax_sgd,\n", 341 | " loaders=(train_loader, test_loader),\n", 342 | " model_name=\"softmax_mia\",\n", 343 | ")" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "id": "d8acf454", 349 | "metadata": {}, 350 | "source": [ 351 | "### Training `MLP`\n", 352 | "\n", 353 | "⚠️ **Note**: This may be a bit slower to train on a laptop (it shouldn't be that much, though!) \n", 354 | "\n", 355 | "If you notice that it is the case, please also feel free to skip this and jump at the end of this notebook." 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "id": "12c1109e", 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "mlp = MLP()\n", 366 | "mlp_sgd = th.optim.SGD(mlp.parameters(), lr=λ)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "id": "53e40c39", 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "mlp" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "id": "123b3abc", 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "train(\n", 387 | " model=mlp,\n", 388 | " optimiser=mlp_sgd,\n", 389 | " loaders=(train_loader, test_loader),\n", 390 | " model_name=\"mlp_mia\",\n", 391 | ")" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "id": "c5646846", 397 | "metadata": {}, 398 | "source": [ 399 | "### Congratulations\n", 400 | "\n", 401 | "**Well done** 🎉\n", 402 | "\n", 403 | "Now that we have our two reference **trained** model, we are ready to setup and launch the _model inversion_ attack to the model. 
\n", 404 | "\n", 405 | "$\\rightarrow$ **MIA Reconstruction**" 406 | ] 407 | } 408 | ], 409 | "metadata": { 410 | "kernelspec": { 411 | "display_name": "Python 3 (ipykernel)", 412 | "language": "python", 413 | "name": "python3" 414 | }, 415 | "language_info": { 416 | "codemirror_mode": { 417 | "name": "ipython", 418 | "version": 3 419 | }, 420 | "file_extension": ".py", 421 | "mimetype": "text/x-python", 422 | "name": "python", 423 | "nbconvert_exporter": "python", 424 | "pygments_lexer": "ipython3", 425 | "version": "3.12.3" 426 | } 427 | }, 428 | "nbformat": 4, 429 | "nbformat_minor": 5 430 | } 431 | -------------------------------------------------------------------------------- /2-ml-models-attacks/dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module provides access to the ORLFaces (ORL Database of Faces) 3 | as encapsulated as a `torchvision.datasets.VisionDataset` class. 4 | 5 | Notes 6 | ----- 7 | The ORLFaces dataset [1]_ contains image files of `40` different subjects (orgainsed one per folder). 8 | Images are `112x92` pixels, with `256` grey levels per pixel, and stored in PGM format. 9 | Folders have names of the form `sID`, where `ID` indicates the subject number (between `1` and `40`). 10 | In each of these directories, there are **ten different** images of that subject, which have names of the 11 | form `Y.pgm`, where `Y` is the image number for that subject (between `1` and `10`) - accounting for 12 | a total of `400` images (10 per 40 subjects). 13 | 14 | Images are randomly partitioned in training and test sets. 15 | 16 | References 17 | ----------- 18 | .. [1] https://www.cl.cam.ac.uk/research/dtg/attarchive/facedatabase.html 19 | """ 20 | 21 | import os 22 | import torch 23 | import numpy as np 24 | import re 25 | from collections import defaultdict 26 | from PIL import Image 27 | from torchvision.datasets import VisionDataset 28 | from torchvision.datasets.utils import download_url, extract_archive 29 | from enum import Enum 30 | from pathlib import Path 31 | from typing import Callable, Optional, Any, List, Tuple 32 | 33 | 34 | class Partition(Enum): 35 | """ 36 | Enumeration of Data Partitions for Machine learning experiments 37 | """ 38 | 39 | train = "training" 40 | test = "test" 41 | 42 | 43 | class ORLFaces(VisionDataset): 44 | """`ORL Faces` (The ORL Database of Faces) 45 | 46 | The Dataset contains a folder with faces of 40 different subjects 47 | taken at different times, varying the lightning, facial expressions 48 | (e.g. open/closed eyes, smiling/serious face), and facial details. 49 | 50 | This dataset is being used in research as facial detection dataset. 51 | 52 | Attributes 53 | ---------- 54 | root : str 55 | Root directory where the local copy of dataset is stored. 56 | split : {"train", "test"} (default: "train") 57 | Target data data_partition. Two data partitions are available, namely 58 | "training", and "test". Training data_partition is considered 59 | by default. Any _validation_ partition could be extracted from the 60 | training dataset. 61 | download : bool, optional (False) 62 | If true, the dataset will be downloaded from the internet and saved in the root 63 | directory. If dataset is already downloaded, it is not downloaded again. 64 | transform : Callable, optional 65 | A function/transform that takes in an image and returns a transformed version 66 | seed: int optinal (123456) 67 | Random seed used to split images in training and testing partitions. 
68 |         The partitions are generated randomly (but consistently given the same random seed).
69 |         Different values of this parameter will affect this generation.
70 |         Note: Data partitions are generated only the first time the dataset is initialised,
71 |         and before the local torch (tensor) files are saved.
72 |         Creating new instances of this dataset with a different seed won't have any effect,
73 |         unless the local partition files are deleted first.
74 |     """
75 |
76 |     RAW_DATA_FOLDER = "orl_faces"
77 |
78 |     resources = [
79 |         (
80 |             "https://www.dropbox.com/s/gxus70grtlt8bpq/orl_faces.tar.gz?dl=1",
81 |             "83134c1ac2309b40441b35d5fa37a3f1",
82 |         )
83 |     ]
84 |
85 |     data_files = {
86 |         Partition.train: "training.pt",
87 |         Partition.test: "test.pt",
88 |     }
89 |
90 |     classes = list(range(1, 41))
91 |
92 |     def __init__(
93 |         self,
94 |         root: str,
95 |         split: str = "train",
96 |         download: bool = False,
97 |         transform: Optional[Callable[[Any], Any]] = None,
98 |         seed: int = 123456,
99 |     ):
100 |         super(ORLFaces, self).__init__(root, transform=transform)
101 |         self._seed = seed
102 |         self.random_gen = np.random.RandomState(self._seed)
103 |         split = split.strip().lower()
104 |         if split not in Partition.__members__.keys():
105 |             raise ValueError(
106 |                 "Data Partition not recognised. "
107 |                 "Accepted values are 'train' and 'test'."
108 |             )
109 |
110 |         if download:
111 |             self.download()
112 |
113 |         if not self._check_exists():
114 |             raise RuntimeError(
115 |                 "Dataset not found." + " You can use download=True to download it"
116 |             )
117 |
118 |         self.split = Partition[split]
119 |         data_file = self.data_files[self.split]
120 |         data_filepath = self.processed_folder / data_file
121 |         self.data, self.targets = torch.load(data_filepath)
122 |
123 |     def __len__(self):
124 |         return len(self.data)
125 |
126 |     def __getitem__(self, index):
127 |         """
128 |
129 |         Parameters
130 |         ----------
131 |         index : int
132 |             Index of the sample
133 |
134 |         Returns
135 |         -------
136 |         tuple
137 |             (Image, Target) where target is the index of the target class.
138 | """ 139 | img, target = self.data[index], int(self.targets[index]) 140 | 141 | # doing this so that it is consistent with all other datasets 142 | # to return a PIL Image 143 | img = Image.fromarray(img.numpy(), mode="L") 144 | 145 | if self.transform is not None: 146 | img = self.transform(img) 147 | 148 | return img, target 149 | 150 | @property 151 | def processed_folder(self): 152 | return Path(self.root) / self.__class__.__name__ / "processed" 153 | 154 | @property 155 | def raw_folder(self): 156 | return Path(self.root) / self.__class__.__name__ / "raw" 157 | 158 | @property 159 | def partition(self): 160 | return self.split 161 | 162 | @property 163 | def class_to_idx(self): 164 | return {_class: i for i, _class in enumerate(self.classes)} 165 | 166 | @property 167 | def idx_to_class(self): 168 | return {v: k for k, v in self.class_to_idx.items()} 169 | 170 | def classes_map(self): 171 | return {i: c for i, c in enumerate(self.classes)} 172 | 173 | def _check_exists(self): 174 | for data_fname in self.data_files.values(): 175 | data_file = self.processed_folder / data_fname 176 | if not data_file.exists(): 177 | return False 178 | return True 179 | 180 | def extra_repr(self): 181 | return "Split: {}".format(self.split.value) 182 | 183 | def _download_and_extract_archive( 184 | self, 185 | url: str, 186 | download_root: str, 187 | filename: Optional[str] = None, 188 | md5: Optional[str] = None, 189 | ): 190 | download_root = os.path.expanduser(download_root) 191 | extract_root = download_root 192 | if not filename: 193 | filename = os.path.basename(url) 194 | 195 | from torchvision.datasets import utils 196 | 197 | utils._get_redirect_url = lambda ulr, max_hops: url 198 | download_url(url, download_root, filename, md5) 199 | archive = os.path.join(download_root, filename) 200 | print("Extracting {} to {}".format(archive, extract_root)) 201 | extract_archive(archive, extract_root, remove_finished=False) 202 | 203 | def download(self): 204 | """Download the ORLFaces data if it doesn't already exist in the processed folder""" 205 | 206 | if self._check_exists(): 207 | return 208 | 209 | os.makedirs(self.raw_folder, exist_ok=True) 210 | os.makedirs(self.processed_folder, exist_ok=True) 211 | 212 | # download files 213 | for url, md5 in self.resources: 214 | filename = url.rpartition("/")[-1].split("?")[0] 215 | self._download_and_extract_archive( 216 | url, download_root=str(self.raw_folder), filename=filename, md5=md5 217 | ) 218 | 219 | print("Processing...", end="") 220 | self._process_partitions() 221 | print("Done!") 222 | 223 | def _process_partitions(self): 224 | raw_data_filepath = self.raw_folder / self.RAW_DATA_FOLDER 225 | partitions = defaultdict(list) 226 | 227 | for subj_folder in os.listdir(raw_data_filepath): 228 | if not subj_folder.startswith("s"): 229 | continue # skip folder 230 | # class is zero-indexed! 
231 |             subj_class = int(subj_folder.replace("s", "").strip()) - 1
232 |             # select training set images, randomly - using the input seed
233 |             training_indices = self.random_gen.choice(
234 |                 np.arange(10), size=7, replace=False
235 |             )
236 |
237 |             # sort image files, so we could use randomly selected indices, quickly
238 |             subject_folder_path = raw_data_filepath / subj_folder
239 |             image_files = filter(
240 |                 lambda f: not f.startswith("."), os.listdir(subject_folder_path)
241 |             )
242 |             subject_images = sorted(image_files, key=lambda f: int(f.split(".")[0]))
243 |             subject_images = map(
244 |                 lambda f: subject_folder_path / f, subject_images
245 |             )  # store full path
246 |             subject_images = np.asarray(
247 |                 list(subject_images)
248 |             )  # convert to array for easy indexing
249 |
250 |             # Add new pair (images, class) to corresponding partitions
251 |             partitions[Partition.train].append(
252 |                 (subject_images[training_indices], subj_class)
253 |             )
254 |             partitions[Partition.test].append(
255 |                 (np.delete(subject_images, training_indices), subj_class)
256 |             )
257 |
258 |         # store partitions locally - to be reloaded later
259 |         for partition, dataset in partitions.items():
260 |             images, labels = self._dataset_as_torch_tensors(dataset)
261 |             data_file = self.processed_folder / self.data_files[partition]
262 |             with open(data_file, "wb") as f:
263 |                 torch.save((images, labels), f)
264 |
265 |     @staticmethod
266 |     def read_pgm(filename, byteorder=">"):
267 |         """Return image data from a raw PGM file as numpy array.
268 |         Format specification: http://netpbm.sourceforge.net/doc/pgm.html
269 |         """
270 |         with open(filename, "rb") as f:
271 |             buffer = f.read()
272 |         try:
273 |             header, width, height, maxval = re.search(
274 |                 rb"(^P5\s(?:\s*#.*[\r\n])*"
275 |                 rb"(\d+)\s(?:\s*#.*[\r\n])*"
276 |                 rb"(\d+)\s(?:\s*#.*[\r\n])*"
277 |                 rb"(\d+)\s(?:\s*#.*[\r\n]\s)*)",
278 |                 buffer,
279 |             ).groups()
280 |         except AttributeError:
281 |             raise ValueError("Not a raw PGM file: '%s'" % filename)
282 |         return np.frombuffer(
283 |             buffer,
284 |             dtype="u1" if int(maxval) < 256 else byteorder + "u2",
285 |             count=int(width) * int(height),
286 |             offset=len(header),
287 |         ).reshape((int(height), int(width)))
288 |
289 |     def _dataset_as_torch_tensors(
290 |         self, dataset: List[Tuple[List[str], int]]
291 |     ) -> Tuple[torch.Tensor, torch.Tensor]:
292 |         """
293 |         Collect all the images per subject and convert them into torch Tensors.
294 |         Labels will also be returned as a tensor, repeated for each corresponding subject.
295 |
296 |         Parameters
297 |         ----------
298 |         dataset : List[Tuple[List[str], int]]
299 |             Set of images and corresponding label for each subject, in the considered partition
300 |             (i.e. train, or test)
301 |         Returns
302 |         -------
303 |         torch.Tensor
304 |             [sample x pixels] tensor representing the whole data partition as
305 |             a torch Tensor.
306 |         torch.Tensor
307 |             [sample] tensor array of corresponding labels (i.e.
the subject ID for each sample)
308 |         """
309 |         images, labels = [], []
310 |         for subject, class_id in dataset:
311 |             subject_images = np.asarray(list(map(self.read_pgm, subject)))
312 |             subject_labels = np.zeros(shape=len(subject))
313 |             subject_labels.fill(class_id)
314 |             images.append(subject_images)
315 |             labels.append(subject_labels)
316 |         images = np.vstack(images)
317 |         labels = np.hstack(labels)
318 |         return torch.from_numpy(images), torch.from_numpy(labels)
319 |
--------------------------------------------------------------------------------
/3-differential-privacy/1-differential-privacy.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Optional: setup NoTexBook theme\n",
10 | "%load_ext notexbook\n",
11 | "\n",
12 | "%texify"
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "**Adapted from**: [Ch3](https://github.com/uvm-plaid/programming-dp/blob/master/notebooks/ch3.ipynb)"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "# Differential Privacy"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "## Definition"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {},
39 | "source": [
40 | "Like $k$-Anonymity, *differential privacy*[3](#fn3) is a **formal notion of privacy** \n",
41 | "(i.e. it's possible to prove that a data release has the property). \n",
42 | "\n",
43 | "Unlike $k$-Anonymity, however, **differential privacy** is a property of *algorithms*, and not a property of *data*. \n",
44 | "\n",
45 | "That is, we don't prove that a *dataset* satisfies differential privacy; we prove that the *algorithm* used to process or release it does.\n",
46 | "\n",
47 | "> **Definition**:\n",
48 | ">\n",
49 | "> A function which satisfies differential privacy is often called a *mechanism*. \n",
50 | "> We say that a *mechanism* $F$ satisfies differential privacy if for all *neighboring datasets* $x$ and $x'$, \n",
51 | "> and all possible outputs $S$,\n",
52 | ">\n",
53 | "\n",
54 | "\begin{equation}\n",
55 | "\frac{\mathsf{Pr}[F(x) = S]}{\mathsf{Pr}[F(x') = S]} \leq e^\epsilon\n",
56 | "\end{equation}"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "**1. Neighbouring Datasets**:\n",
64 | "\n",
65 | "Two datasets are considered **neighbours** if they differ in the data of **a single individual**.\n",
66 | "\n",
67 | "**2. $F$ is a Randomised Function**:\n",
68 | "\n",
69 | "Note that $F$ is typically a *randomised* function, so that the probability distribution describing its outputs is not just a point distribution.\n",
70 | "\n",
71 | "The important implication of this definition is that $F$'s output will be pretty much the same, *with or without* the data of any specific individual.\n",
72 | "\n",
73 | "In other words, the randomness built into $F$ should be \"enough\" so that an observed output from $F$ will not reveal which of $x$ or $x'$ was the input.\n",
74 | "\n",
75 | "Imagine that my data is present in $x$ but not in $x'$.\n",
76 | "\n",
77 | "**3.
The Privacy Budget: $\\epsilon$**:\n", 78 | "\n", 79 | "If an adversary can't determine which of $x$ or $x'$ was the input to $F$, then the adversary can't tell whether or not my data was *present* in the input - let alone the contents of that data.\n", 80 | "\n", 81 | "The $\\epsilon$ parameter in the definition is called the *privacy parameter* or the *privacy budget*.\n", 82 | "\n", 83 | "$\\epsilon$ provides a knob to tune the **amount of privacy** the definition provides." 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "Small values of $\\epsilon$ require $F$ to provide *very* similar outputs when given similar inputs, and therefore provide **higher levels** of privacy.\n", 91 | "\n", 92 | "Large values of $\\epsilon$ allow less similarity in the outputs, and therefore provide **less privacy**.\n", 93 | "\n", 94 | "\n", 95 | "- Small values $\\epsilon \\rightarrow$ High Privacy\n", 96 | "- Large values $\\epsilon \\rightarrow$ Less Privacy\n", 97 | "\n", 98 | "How should we set $\\epsilon$ to prevent bad outcomes in practice? **Nobody knows** (i.e. Open Research Question). \n", 99 | "\n", 100 | "The general consensus is that $\\epsilon$ should be around `1` or smaller, and values of $\\epsilon$ above `10` probably don't do much to protect privacy - but this rule of thumb could turn out to be very conservative. \n", 101 | "\n", 102 | "**[3]**: Dwork, C; _Differential Privacy_ in Proceedings of the 33rd International Conference on Automata, Languages and Programming - Volume Part II, 2006 [link](https://doi.org/10.1007/11787006_1)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "\n", 110 | ">**Learning Objectives**\n", 111 | ">\n", 112 | "> - Define differential privacy\n", 113 | "> - Explain the importance of the privacy parameter $\\epsilon$\n", 114 | "> - Use the Laplace mechanism to enforce differential privacy for counting queries" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "## The Laplace Mechanism" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "Differential privacy is typically used to answer specific queries. Let's consider a query on the census data, *without* differential privacy." 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "tags": [ 136 | "remove-cell" 137 | ] 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "import pandas as pd\n", 142 | "import numpy as np\n", 143 | "import matplotlib.pyplot as plt\n", 144 | "plt.style.use('seaborn-v0_8-whitegrid')" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "DATASET_URL = \"https://raw.githubusercontent.com/uvm-plaid/programming-dp/master/notebooks/adult_with_pii.csv\"\n", 154 | "adult = pd.read_csv(DATASET_URL)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "**Q** \"How many individuals in the dataset are 40 years old or older?\"\n", 162 | "\n", 163 | "This is an example of a **Count Query**." 
164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "adult[adult['Age'] >= 40].shape[0]" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "## Laplace Mechanism" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "The easiest way to achieve differential privacy for this query is to add **random noise to its answer**. \n", 187 | "\n", 188 | "The key challenge is to add enough noise to satisfy the definition of differential privacy, but not so much that the answer becomes too noisy to be useful. \n", 189 | "\n", 190 | "To make this process easier, some basic *mechanisms* have been developed in the field of differential privacy, which describe exactly what kind of - and how much - noise to use. \n", 191 | "\n", 192 | "One of these is called the *Laplace mechanism*[4](#fn4).\n", 193 | "\n", 194 | "> **Definition**\n", 195 | "> \n", 196 | ">According to the Laplace mechanism, for a function $f(x)$ which returns a number, the following definition of $F(x)$ satisfies $\\epsilon$-differential privacy:\n", 197 | ">\n", 198 | ">\\begin{equation}\n", 199 | "F(x) = f(x) + \\textsf{Lap}(\\frac{s}{\\epsilon})\n", 200 | "\\end{equation}\n", 201 | ">\n", 202 | ">where $s$ is the *sensitivity* of $f$, and $\\textsf{Lap}(S)$ denotes sampling from the Laplace distribution with center 0 and scale $S$.\n", 203 | "\n", 204 | "\n", 205 | "**Sensitivity**:\n", 206 | "\n", 207 | "The *sensitivity* of a function $f$ is the amount $f$'s output changes when its input changes by 1. \n", 208 | "\n", 209 | "Sensitivity is a complex topic, and an integral part of designing differentially private algorithms. \n", 210 | "\n", 211 | "Let's just point out that *counting queries* always have a sensitivity of `1`: if a query counts the number of rows in the dataset with a particular property, and then we modify exactly one row of the dataset, then the query's output can change by at most `1`.\n", 212 | "\n", 213 | "Thus we can achieve differential privacy for our example query by using the `Laplace mechanism` with `sensitivity=1` and an $\\epsilon$ of our choosing.\n", 214 | "\n", 215 | "For now, let's pick $\\epsilon = 0.1$. We can sample from the Laplace distribution using Numpy's `random.laplace`.\n", 216 | "\n", 217 | "**[4]**: Dwork, C.; _Calibrating Noise to Sensitivity in Private Data Analysis_ in Proceedings of the Third Conference on Theory of Cryptography, 2006 [link](https://doi.org/10.1007/11681878_14)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "sensitivity = 1\n", 227 | "epsilon = 0.1\n", 228 | "\n", 229 | "adult[adult['Age'] >= 40].shape[0] + np.random.laplace(loc=0, scale=sensitivity/epsilon)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "You can see the effect of the noise by running this code multiple times. Each time, the output changes, but most of the time, the answer is close enough to the true answer (14,235) to be useful." 
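,
"\n",
"To make the pattern explicit, here is a tiny wrapper around the definition above (our own sketch - `laplace_mech` is a hypothetical name, not a library function):\n",
"\n",
"```python\n",
"def laplace_mech(true_answer, sensitivity, epsilon):\n",
"    # F(x) = f(x) + Lap(s / epsilon), per the definition above\n",
"    return true_answer + np.random.laplace(loc=0, scale=sensitivity / epsilon)\n",
"\n",
"# the counting query above, released with epsilon = 0.1\n",
"laplace_mech(adult[adult['Age'] >= 40].shape[0], sensitivity=1, epsilon=0.1)\n",
"```"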
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": null,
242 | "metadata": {},
243 | "outputs": [],
244 | "source": [
245 | "true_count_stat = adult[adult['Age'] >= 40].shape[0]\n",
246 | "Lap = np.random.laplace(loc=0, scale=sensitivity/epsilon, size=30)\n",
247 | "print(f\"True Count Statistic: {true_count_stat}\")\n",
248 | "for i in range(30):\n",
249 | "    print(f\"{i}) {(true_count_stat + Lap[i]):0.2f}\")"
250 | ]
251 | },
252 | {
253 | "cell_type": "markdown",
254 | "metadata": {},
255 | "source": [
256 | "## How Much Noise is Enough?"
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "metadata": {},
262 | "source": [
263 | "How do we know that the Laplace mechanism adds enough noise to prevent the re-identification of individuals in the dataset? \n",
264 | "\n",
265 | "For one thing, we can try to break it!\n",
266 | "\n",
267 | "Let's write down a **malicious counting query**, which is specifically designed to determine whether Karrie Trusslove has an income greater than `$50k`."
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": null,
273 | "metadata": {},
274 | "outputs": [],
275 | "source": [
276 | "karries_row = adult[adult['Name'] == 'Karrie Trusslove']\n",
277 | "karries_row[karries_row['Target'] == '<=50K'].shape[0]"
278 | ]
279 | },
280 | {
281 | "cell_type": "markdown",
282 | "metadata": {},
283 | "source": [
284 | "This result definitely violates Karrie's privacy, since it reveals the value of the income column for Karrie's row.\n",
285 | "\n",
286 | "Since we know how to ensure differential privacy for counting queries with the Laplace mechanism, we can do so for this query:"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": null,
292 | "metadata": {},
293 | "outputs": [],
294 | "source": [
295 | "sensitivity = 1\n",
296 | "epsilon = 0.1\n",
297 | "\n",
298 | "karries_row = adult[adult['Name'] == 'Karrie Trusslove']\n",
299 | "karries_row[karries_row['Target'] == '<=50K'].shape[0] + np.random.laplace(loc=0, scale=sensitivity/epsilon)"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": null,
305 | "metadata": {},
306 | "outputs": [],
307 | "source": [
308 | "query = karries_row[karries_row['Target'] == '<=50K'].shape[0]"
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": null,
314 | "metadata": {},
315 | "outputs": [],
316 | "source": [
317 | "F = lambda: query + np.random.laplace(loc=0, scale=sensitivity/epsilon)  # draw fresh noise on each call\n",
318 | "\n",
319 | "np.mean([F() for _ in range(100)])"
320 | ]
321 | },
322 | {
323 | "cell_type": "markdown",
324 | "metadata": {},
325 | "source": [
326 | "Is this the true answer?\n",
327 | "\n",
328 | "There's still too much noise to be able to reliably tell.\n",
329 | "\n",
330 | "This is how differential privacy is *intended* to work - the approach does not *reject* queries which are determined to be malicious; instead, it adds enough noise that the results of a malicious query will be useless to the adversary.\n",
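"\n",
"One caveat worth making explicit (our own addition, not from the adapted chapter): each fresh noisy answer consumes privacy budget under **sequential composition**, and averaging $k$ answers shrinks the noise by roughly a factor of $\sqrt{k}$ - the two effects go hand in hand:\n",
"\n",
"```python\n",
"# a sketch: the standard deviation of the averaged noise shrinks as 1/sqrt(k)\n",
"k = 100\n",
"answers = query + np.random.laplace(loc=0, scale=sensitivity/epsilon, size=k)\n",
"print(answers.mean())  # close to `query` - but k queries at eps=0.1 cost eps=10 in total\n",
"```"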
331 | ] 332 | } 333 | ], 334 | "metadata": { 335 | "kernelspec": { 336 | "display_name": "Python 3 (ipykernel)", 337 | "language": "python", 338 | "name": "python3" 339 | }, 340 | "language_info": { 341 | "codemirror_mode": { 342 | "name": "ipython", 343 | "version": 3 344 | }, 345 | "file_extension": ".py", 346 | "mimetype": "text/x-python", 347 | "name": "python", 348 | "nbconvert_exporter": "python", 349 | "pygments_lexer": "ipython3", 350 | "version": "3.12.3" 351 | } 352 | }, 353 | "nbformat": 4, 354 | "nbformat_minor": 4 355 | } 356 | -------------------------------------------------------------------------------- /2-ml-models-attacks/1-FSGM-Attack.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext notexbook\n", 10 | "%texify" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# Fast Gradient Sign Attack" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "This notebook showcases how to carry out a **Fast Gradient Sign Method** attack (`FGSM`) on a pretrained model. \n", 25 | "\n", 26 | "**Note** This notebook has been adapted from the [FGSM Tutorial](https://pytorch.org/tutorials/beginner/fgsm_tutorial.html) by _Nathan Inkawhich_ `@inkawhich` available on the official [PyTorch Documentation](https://pytorch.org/docs/stable/index.html)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "(_from the original notebook_)\n", 34 | "> This tutorial will raise your awareness to the security vulnerabilities \n", 35 | "> of ML models, and will give insight into the hot topic of adversarial machine learning. \n", 36 | "> \n", 37 | "> You may be surprised to find that adding **imperceptible perturbations** to an image *can* cause \n", 38 | "> drastically different model performance.\n", 39 | "> `[...]`\n", 40 | ">\n", 41 | "> Specifically we will use one of the first and most popular attack methods, the _Fast Gradient Sign Attack_\n", 42 | "> (`FGSM`), to fool an `MNIST` classifier.\n" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## Threats to Models\n", 50 | "\n", 51 | "There are several kinds of assumptions about the attacker’s knowledge, two of which are: **white-box** and **black-box**. \n", 52 | "\n", 53 | "- A *white-box* attack assumes the attacker has full knowledge and access to the model, including\n", 54 | "architecture, inputs, outputs, and weights. \n", 55 | "- A *black-box* attack assumes the attacker only has access to the inputs and outputs of the model, and knows nothing about the underlying architecture or weights. \n", 56 | "\n", 57 | "There are also several types of goals, including **misclassification** and\n", 58 | "**source/target misclassification**. \n", 59 | "\n", 60 | "A goal of *misclassification* means the adversary only wants the output classification to be wrong but does\n", 61 | "not care what the new classification is. \n", 62 | "\n", 63 | "A *source/target misclassification* means the adversary wants to alter an image that is originally of a specific source class so that it is classified as a specific target class."
64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "## Fast Gradient Sign Attack\n", 71 | "\n", 72 | "The *Fast Gradient Sign Attack (FGSM)* is described by _Goodfellow et al._ in \n", 73 | "[Explaining and Harnessing Adversarial Examples](https://arxiv.org/abs/1412.6572). \n", 74 | "\n", 75 | "The attack is remarkably powerful, and yet intuitive. \n", 76 | "\n", 77 | "It is designed to attack neural networks by leveraging the way they learn: **gradients**. \n", 78 | "\n", 79 | "The idea is simple: \n", 80 | "\n", 81 | "> rather than working to minimize the loss by adjusting the weights based on the backpropagated gradients,\n", 82 | "> the attack **adjusts** the input data to maximize the loss based on the same backpropagated gradients. \n", 83 | "\n", 84 | "In other words, the attack uses the gradient of the loss w.r.t the input data, then adjusts the input data to maximize the loss.\n", 85 | "\n", 86 | "_(from the original paper)_\n", 87 | "\n", 88 | "![fgsm panda attack](https://pytorch.org/tutorials/_images/fgsm_panda_image.png)\n", 89 | "\n", 90 | "**TLDR;** Just perturb the input data with a small change that works in an **adversarial** fashion (w.r.t. the optimisation process), following the **direction of the gradient**: $x_{adv} = x + \\epsilon \\cdot sign(\\nabla_{x} J(\\mathbf{\\theta}, \\mathbf{x}, y))$" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "import torch\n", 100 | "import torch.nn as nn\n", 101 | "import torch.nn.functional as F\n", 102 | "import torch.optim as optim\n", 103 | "\n", 104 | "from torch.utils.data import DataLoader\n", 105 | "from torchvision import datasets, transforms\n", 106 | "import numpy as np\n", 107 | "import matplotlib.pyplot as plt\n", 108 | "\n", 109 | "# NOTE: This is a hack to get around \"User-agent\" limitations when downloading MNIST datasets\n", 110 | "# see, https://github.com/pytorch/vision/issues/3497 for more information\n", 111 | "from six.moves import urllib\n", 112 | "\n", 113 | "opener = urllib.request.build_opener()\n", 114 | "opener.addheaders = [(\"User-agent\", \"Mozilla/5.0\")]\n", 115 | "urllib.request.install_opener(opener)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "from pathlib import Path\n", 125 | "import os \n", 126 | "\n", 127 | "DATA_FOLDER = Path(os.path.join(os.path.abspath(os.path.curdir), \"..\")) / \"data\"" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "print(DATA_FOLDER)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# Reproducibility Settings\n", 146 | "\n", 147 | "import numpy as np\n", 148 | "\n", 149 | "SEED = 123456\n", 150 | "np.random.seed(SEED)\n", 151 | "torch.manual_seed(SEED)\n", 152 | "\n", 153 | "if torch.cuda.is_available():\n", 154 | "    torch.cuda.manual_seed_all(SEED)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "#### `LeNet` Model" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "# LeNet Model definition\n", 171 | "class Net(nn.Module):\n", 172 | "    def __init__(self):\n", 173 | "        super(Net, 
self).__init__()\n", 174 | "        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)\n", 175 | "        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)\n", 176 | "        self.conv2_drop = nn.Dropout2d()\n", 177 | "        self.fc1 = nn.Linear(320, 50)\n", 178 | "        self.fc2 = nn.Linear(50, 10)\n", 179 | "\n", 180 | "    def forward(self, x):\n", 181 | "        x = F.relu(F.max_pool2d(self.conv1(x), 2))\n", 182 | "        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))\n", 183 | "        x = x.view(-1, 320)\n", 184 | "        x = F.relu(self.fc1(x))\n", 185 | "        x = F.dropout(x, training=self.training)\n", 186 | "        x = self.fc2(x)\n", 187 | "        return F.log_softmax(x, dim=1)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "Initialise pre-trained model (and move it to available device)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "if torch.cuda.is_available():\n", 204 | "    dev_name = \"cuda\"\n", 205 | "elif torch.backends.mps.is_available():\n", 206 | "    dev_name = \"mps\"\n", 207 | "else:\n", 208 | "    dev_name = \"cpu\"\n", 209 | "\n", 210 | "device = torch.device(dev_name)\n", 211 | "print(f\"You will be using the {device} device\")" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "torch.__version__" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "# Uncomment this when running on Anaconda Notebooks\n", 230 | "# !wget https://raw.githubusercontent.com/leriomaggio/ppml-tutorial/main/2-ml-models-attacks/lenet_mnist_model.pth" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "PRETRAINED_MODEL_WEIGHTS = \"lenet_mnist_model.pth\"\n", 240 | "\n", 241 | "# Initialize the network\n", 242 | "model = Net().to(device)\n", 243 | "\n", 244 | "# Load the pretrained model\n", 245 | "model.load_state_dict(torch.load(PRETRAINED_MODEL_WEIGHTS, map_location=device))\n", 246 | "\n", 247 | "# Set the model in evaluation mode. 
In this case this is for the Dropout layers\n", 248 | "model.eval()" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "### Download MNIST Dataset" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "# MNIST Test dataset and dataloader declaration\n", 265 | "mnist_test = datasets.MNIST(root=DATA_FOLDER, train=False, download=True, transform=transforms.ToTensor())\n", 266 | "test_loader = torch.utils.data.DataLoader(mnist_test, batch_size=1, shuffle=False)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "### Before the Attack\n", 274 | "\n", 275 | "Before carrying out the attack, let's see how well the model classifies the digits in the test set" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "from sklearn.metrics import accuracy_score\n", 285 | "from tqdm.notebook import tqdm\n", 286 | "\n", 287 | "y_preds, y_true = list(), list()\n", 288 | "with torch.no_grad():  # disable gradient tracking for inference (eval mode alone does not do this)\n", 289 | "    for (image, target) in tqdm(test_loader):\n", 290 | "        image, target = image.to(device), target.to(device)\n", 291 | "        out = model(image)\n", 292 | "        _, preds = torch.max(out, 1)\n", 293 | "        y_preds.append(preds.detach().cpu().numpy())\n", 294 | "        y_true.append(target.detach().cpu().numpy())\n", 295 | "    y_preds = np.hstack(y_preds)\n", 296 | "    y_true = np.hstack(y_true)\n", 297 | "    \n", 298 | "    print(f\"Pre-Trained Model ACC: {accuracy_score(y_true, y_preds)}\")" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "### `FGSM` Attack" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "Now, we can define the function that creates the adversarial examples by\n", 313 | "perturbing the original inputs. 
" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "# FGSM attack code\n", 323 | "def fgsm_attack(image: torch.Tensor, epsilon: float, data_gradient: torch.Tensor) -> torch.Tensor:\n", 324 | "    # Collect the element-wise sign of the data gradient\n", 325 | "    sign_data_grad = data_gradient.sign()\n", 326 | "    # Create the perturbed image by adjusting each pixel of the input image\n", 327 | "    perturbed_image = image + (epsilon * sign_data_grad)\n", 328 | "    # Adding clipping to maintain [0,1] range\n", 329 | "    perturbed_image = torch.clamp(perturbed_image, 0, 1)  # normalise in [0, 1] to make it an actual image\n", 330 | "    # Return the perturbed image\n", 331 | "    return perturbed_image" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "Last but not least: the **test function**" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "def test(model, device, loader, epsilon):\n", 348 | "    # from https://github.com/pytorch/tutorials/blob/master/beginner_source/fgsm_tutorial.py\n", 349 | "    \n", 350 | "    # Accuracy counter\n", 351 | "    correct = 0\n", 352 | "    adv_examples = []\n", 353 | "\n", 354 | "    # Loop over all examples in the test set (use the `loader` parameter, not the global `test_loader`)\n", 355 | "    for data, target in tqdm(loader, desc=f\"Running Attack on Batches with ε={epsilon}\"):\n", 356 | "\n", 357 | "        # Send the data and label to the device\n", 358 | "        data, target = data.to(device), target.to(device)\n", 359 | "\n", 360 | "        # Set requires_grad attribute of tensor. Important for Attack\n", 361 | "        data.requires_grad = True\n", 362 | "\n", 363 | "        # Forward pass the data through the model\n", 364 | "        output = model(data)\n", 365 | "        init_pred = output.max(1, keepdim=True)[1]  # get the index of the max log-probability\n", 366 | "\n", 367 | "        # If the initial prediction is wrong, don't bother attacking, just move on\n", 368 | "        if init_pred.item() != target.item():\n", 369 | "            continue\n", 370 | "\n", 371 | "        # Calculate the loss\n", 372 | "        loss = F.nll_loss(output, target)\n", 373 | "\n", 374 | "        # Zero all existing gradients\n", 375 | "        model.zero_grad()\n", 376 | "\n", 377 | "        # Calculate gradients of model in backward pass\n", 378 | "        loss.backward()\n", 379 | "\n", 380 | "        # Collect datagrad\n", 381 | "        data_grad = data.grad.data\n", 382 | "\n", 383 | "        # Call FGSM Attack\n", 384 | "        perturbed_data = fgsm_attack(data, epsilon, data_grad)\n", 385 | "\n", 386 | "        # Re-classify the perturbed image\n", 387 | "        output = model(perturbed_data)\n", 388 | "\n", 389 | "        # Check for success\n", 390 | "        final_pred = output.max(1, keepdim=True)[1]  # get the index of the max log-probability\n", 391 | "        if final_pred.item() == target.item():\n", 392 | "            correct += 1\n", 393 | "        else:\n", 394 | "            # Save some adv examples for visualization later\n", 395 | "            if len(adv_examples) < 5:\n", 396 | "                adv_ex = perturbed_data.squeeze().detach().cpu().numpy()\n", 397 | "                adv_examples.append((init_pred.item(), final_pred.item(), adv_ex))\n", 398 | "\n", 399 | "    # Calculate final accuracy for this epsilon\n", 400 | "    final_acc = correct / float(len(loader))\n", 401 | "    print(\n", 402 | "        \"Epsilon: {}\\tTest Accuracy = {} / {} = {}\".format(\n", 403 | "            epsilon, correct, len(loader), final_acc\n", 404 | "        )\n", 405 | "    )\n", 406 | "\n", 407 | "    # Return the accuracy and an adversarial example\n", 408 | "    return 
final_acc, adv_examples" 409 | ] 410 | }, 411 | { 412 | "cell_type": "markdown", 413 | "metadata": {}, 414 | "source": [ 415 | "### Run the Attack" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "ε = 0.05" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "acc, adv_examples = test(model, device, test_loader, ε)" 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "Now let's see what the perturbed images look like: " 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "fig, axes = plt.subplots(1, len(adv_examples), figsize=(8, 10))\n", 450 | "plt.xticks([], [])\n", 451 | "plt.yticks([], [])\n", 452 | "for j, (orig_pred, adv_pred, adv_example) in enumerate(adv_examples):\n", 453 | "    if j == 0:\n", 454 | "        axes[j].set_ylabel(f\"ε: {ε}\", fontsize=14)\n", 455 | "    axes[j].set_title(\"{} -> {}\".format(orig_pred, adv_pred))\n", 456 | "    axes[j].imshow(adv_example, cmap=\"gray\")\n", 457 | "plt.tight_layout()\n", 458 | "plt.show()" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": {}, 464 | "source": [ 465 | "### Exercise:\n", 466 | "\n", 467 | "Now the question is: how much does performance degrade as we keep increasing the value of ε?\n", 468 | "\n", 469 | "What we should expect: \n", 470 | "- the bigger ε, the worse the accuracy\n", 471 | "- the bigger ε, the more *perceptible* the perturbation becomes\n", 472 | "    - so that it becomes evident that an attack has been launched" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "EPSILONS = [0.05, .06, .1, .15, .2, .25, .3]\n", 482 | "\n", 483 | "accuracies = [acc]\n", 484 | "adv_examples_map = {0.05: adv_examples}\n", 485 | "\n", 486 | "# Run test for each epsilon\n", 487 | "for ε in EPSILONS[1:]:\n", 488 | "    acc, adv_examples = test(model, device, test_loader, ε)\n", 489 | "    accuracies.append(acc)\n", 490 | "    adv_examples_map[ε] = adv_examples" 491 | ] 492 | }, 493 | { 494 | "cell_type": "markdown", 495 | "metadata": {}, 496 | "source": [ 497 | "### Results\n", 498 | "\n", 499 | "1. Let's print the accuracy values for each corresponding ε value" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "accuracies" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "plt.figure(figsize=(5, 5))\n", 518 | "\n", 519 | "# your code here: plot Accuracies vs EPSILONS\n", 520 | "plt.plot(EPSILONS, accuracies)\n", 521 | "plt.yticks(np.arange(0, 1.1, step=0.1))\n", 522 | "plt.xticks(np.arange(0, 0.35, step=0.05))\n", 523 | "plt.title(\"Accuracy vs Epsilon\")\n", 524 | "plt.xlabel(\"Epsilon\")\n", 525 | "plt.ylabel(\"Accuracy\")\n", 526 | "plt.show()" 527 | ] 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "metadata": {}, 532 | "source": [ 533 | "2. 
Visualise Generated Adversarial Examples" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": null, 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [ 542 | "# Plot several examples of adversarial samples at each epsilon\n", 543 | "\n", 544 | "for ε in EPSILONS:\n", 545 | " fig, axes = plt.subplots(1, len(adv_examples_map[ε]), figsize=(8, 10))\n", 546 | " plt.xticks([], [])\n", 547 | " plt.yticks([], [])\n", 548 | " for j, (orig_pred, adv_pred, adv_example) in enumerate(adv_examples_map[ε]):\n", 549 | " if j == 0:\n", 550 | " axes[j].set_ylabel(f\"ε: {ε}\", fontsize=14)\n", 551 | " axes[j].set_title(\"{} -> {}\".format(orig_pred, adv_pred))\n", 552 | " axes[j].imshow(adv_example, cmap=\"gray\")\n", 553 | " plt.tight_layout()\n", 554 | " plt.show()" 555 | ] 556 | } 557 | ], 558 | "metadata": { 559 | "kernelspec": { 560 | "display_name": "Python 3 (ipykernel)", 561 | "language": "python", 562 | "name": "python3" 563 | }, 564 | "language_info": { 565 | "codemirror_mode": { 566 | "name": "ipython", 567 | "version": 3 568 | }, 569 | "file_extension": ".py", 570 | "mimetype": "text/x-python", 571 | "name": "python", 572 | "nbconvert_exporter": "python", 573 | "pygments_lexer": "ipython3", 574 | "version": "3.12.3" 575 | } 576 | }, 577 | "nbformat": 4, 578 | "nbformat_minor": 4 579 | } 580 | -------------------------------------------------------------------------------- /1-data-anonimisation/3-k-anonimity.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Optional: setup NoTexBook theme\n", 10 | "%load_ext notexbook\n", 11 | "%texify" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "**Adapted from**: [Ch2](https://github.com/uvm-plaid/programming-dp/blob/master/notebooks/ch2.ipynb)" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "# $k$-Anonymity\n", 26 | "\n", 27 | "$k$-Anonymity[2](#fn2) is a *formal privacy definition*. \n", 28 | "\n", 29 | "The definition of $k$-Anonymity is designed to formalize our intuition that a piece of auxiliary information should not narrow down the set of possible records for an individual \"too much.\" \n", 30 | "\n", 31 | "In other terms, $k$-Anonymity is designed to ensure that each individual can _blend into the crowd._\n", 32 | "\n", 33 | "Informally, we say that a dataset is \"$k$-Anonymized\" for a particular $k$ if each individual in the dataset is a member of a group of size at least $k$, such that each member of the group shares the same *quasi-identifiers* (a selected subset of all the dataset's columns) with all other members of the group. \n", 34 | "\n", 35 | "Therefore, the individuals in each group \"blend into\" their group - it's possible to narrow down an individual to membership in a particular group, but not to determine which group member is the target.\n", 36 | "\n", 37 | "> **Definition** (more formally) A dataset $D$ satisfies $k$-Anonymity for a value of $k$ if:\n", 38 | "> \n", 39 | "> - For each row $r_1 \\in D$, there exist at least $k-1$ other rows $r_2 \\dots r_k \\in D$ such that \n", 40 | "> $\\Pi_{qi(D)} r_1 = \\Pi_{qi(D)} r_2, \\ldots, = \\Pi_{qi(D)} r_k$\n", 41 | ">\n", 42 | "> where $qi(D)$ is the quasi-identifiers of $D$, and $\\Pi_{qi(D)} r$ represents the columns of $r$ containing quasi-identifiers (i.e. 
the projection of the quasi-identifiers).\n", 43 | "\n", 44 | "**[2]**: Sweeney, L: _k-ANONYMITY: A MODEL FOR PROTECTING PRIVACY_ on International Journal of Uncertainty, Fuzziness and Knowledge-Based Systems, 2002 [link](https://doi.org/10.1142/S0218488502001648)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | ">**Learning Objectives**\n", 52 | "After reading this chapter, you will understand:\n", 53 | "> - The definition of $k$-Anonymity\n", 54 | "> - How to check for $k$-Anonymity\n", 55 | "> - How to generalize data to enforce $k$-Anonymity\n", 56 | "> - The limitations of $k$-Anonymity" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "## Checking for $k$-Anonymity" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "We'll start with a small dataset, so that we can immediately see by looking at the data whether it satisfies $k$-Anonymity or not. \n", 71 | "\n", 72 | "This dataset contains age plus two test scores; it clearly doesn't satisfy $k$-Anonymity for $k > 1$. \n", 73 | "\n", 74 | "Any dataset trivially satisfies $k$-Anonymity for $k = 1$, since each row can form its own group of size 1." 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "tags": [ 82 | "remove-cell" 83 | ] 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "import pandas as pd\n", 88 | "import numpy as np\n", 89 | "import matplotlib.pyplot as plt\n", 90 | "plt.style.use('seaborn-v0_8-whitegrid')" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": { 97 | "tags": [ 98 | "hide-input" 99 | ] 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "example_data = {\n", 104 | "    'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], \n", 105 | "    'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'], \n", 106 | "    'age': [42, 52, 36, 24, 73], \n", 107 | "    'preTestScore': [4, 24, 31, 2, 3],\n", 108 | "    'postTestScore': [25, 94, 57, 62, 70]}\n", 109 | "df = pd.DataFrame(example_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])\n", 110 | "df" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "To implement a function to check whether a dataframe satisfies $k$-Anonymity, we group the rows by their values for the quasi-identifiers. \n", 118 | "\n", 119 | "Each resulting group then contains exactly the rows that share the same quasi-identifier values. \n", 120 | "\n", 121 | "If the number of rows in any group is less than $k$, the dataframe **does not** satisfy $k$-Anonymity for that value of $k$, and we return `False`. \n", 122 | "\n", 123 | "Note that in this simple definition, we consider *all* columns to contain quasi-identifiers; to limit our check to a subset of all columns, we would need to replace the `df.columns` expression with something else (we will sketch this in a moment)." 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "def is_k_anonymised(df, k):\n", 133 | "    for _, match in df.groupby(df.columns.tolist()).groups.items():\n", 134 | "        if len(match) < k:\n", 135 | "            return False\n", 136 | "    return True" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "As expected, our example dataframe does *not* satisfy $k$-Anonymity for $k = 2$, but it does satisfy the property for $k=1$."
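] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here is that sketch (not in the original notebook; the function name is illustrative): restricting the check to a chosen subset of quasi-identifiers just means grouping by an explicit list of columns instead of `df.columns`." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sketch: check k-Anonymity over an explicit list of quasi-identifier columns\n", "def is_k_anonymised_on(df, quasi_identifiers, k):\n", "    for _, match in df.groupby(quasi_identifiers).groups.items():\n", "        if len(match) < k:\n", "            return False\n", "    return True\n", "\n", "# e.g. treating only 'age' as a quasi-identifier\n", "is_k_anonymised_on(df, ['age'], 2)"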
144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "is_k_anonymised(df, 1)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "is_k_anonymised(df, 2)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "## Generalizing Data to Satisfy $k$-Anonymity" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "The process of modifying a dataset so that it satisfies $k$-Anonymity for a desired $k$ is generally accomplished by *generalizing* the data, that is \"modifying values to be less specific, and therefore more likely to match the values of other individuals in the dataset\". \n", 176 | "\n", 177 | "For example, an `age` which is accurate to a year may be generalized by rounding to the nearest `10` years, or a `ZIP` code might have its rightmost digits replaced by zeros. \n", 178 | "\n", 179 | "For numeric values, this is easy to implement. \n", 180 | "\n", 181 | "We'll use the `apply` method of dataframes, and pass in a dictionary named `depths` which specifies how many digits to replace by `zeros` for each column. \n", 182 | "\n", 183 | "This gives us the flexibility to experiment with **different levels of generalization** for different columns." 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "def generalise(df, depths):\n", 193 | " return df.apply(lambda x: x.apply(lambda y: int(int(y/(10**depths[x.name]))*(10**depths[x.name]))))" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "Now, we can generalize our example dataframe. \n", 201 | "\n", 202 | "First, we'll try generalizing each column by one \"level\" - i.e. rounding to the nearest `10`." 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "# original dataframe as reference\n", 212 | "df" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "For simplicity, let's focus only on numerical fields (i.e. get rid of names):" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "df = df[[\"age\", \"preTestScore\", \"postTestScore\"]]" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "depths = {\n", 238 | " 'age': 1,\n", 239 | " 'preTestScore': 1,\n", 240 | " 'postTestScore': 1\n", 241 | "}\n", 242 | "df2 = generalise(df, depths)\n", 243 | "df2" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "Notice that even after generalization, our example data *still* does not satisfy $k$-Anonymity for $k=2$." 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "is_k_anonymised(df2, 2)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "We can try generalizing more - but then we'll end up removing *all* of the data!" 
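] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To see why (a quick sketch, not in the original notebook): `generalise` truncates each value to a multiple of $10^{depth}$, so at depth `2` every value below `100` collapses to `0`." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sketch: what the generalisation does to a single value at different depths\n", "age = 42\n", "print(int(int(age / 10**1) * 10**1))  # depth=1: truncate to a multiple of 10  -> 40\n", "print(int(int(age / 10**2) * 10**2))  # depth=2: truncate to a multiple of 100 -> 0"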
267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "depths = {\n", 276 | "    'age': 2,\n", 277 | "    'preTestScore': 2,\n", 278 | "    'postTestScore': 2\n", 279 | "}\n", 280 | "generalise(df, depths)" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "This example illustrates one of the key challenges of achieving $k$-Anonymity:\n", 288 | "\n", 289 | ">**Challenge**:\n", 290 | ">\n", 291 | "> Achieving $k$-Anonymity for meaningful values of $k$ often requires removing quite a lot of information from the data\n" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "## Does More Data Improve Generalization?" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "Our example dataset is too small for $k$-Anonymity to work well. \n", 306 | "\n", 307 | "Because there are only `5` individuals in the dataset, building groups of `2` or more individuals who share the same properties is difficult. \n", 308 | "\n", 309 | "The solution to this problem is more data: in a dataset with more individuals, less generalization will typically be needed to satisfy $k$-Anonymity for a desired $k$.\n", 310 | "\n", 311 | "Let's try the same census data we examined for de-identification. \n", 312 | "\n", 313 | "This dataset contains more than `32,000` rows, so it should be easier to achieve $k$-Anonymity." 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": { 320 | "tags": [ 321 | "remove-cell" 322 | ] 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "DATASET_URL = \"https://raw.githubusercontent.com/uvm-plaid/programming-dp/master/notebooks/adult_with_pii.csv\"\n", 327 | "adult_data = pd.read_csv(DATASET_URL)\n", 328 | "adult_data.head()" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "We'll consider (again) the `age` and `educational achievement` of each individual to be the **quasi-identifiers**. \n", 336 | "\n", 337 | "We'll project just those columns, and try to achieve $k$-Anonymity for $k=2$. \n", 338 | "\n", 339 | "The data is already $k$-Anonymous for $k=1$.\n", 340 | "\n", 341 | "For $k=2$, our algorithm finds a failing row quickly and finishes fast." 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "projection_age_edu = adult_data[['Age', 'Education-Num']]\n", 351 | "projection_age_edu.columns = ['age', 'edu']" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "is_k_anonymised(projection_age_edu, k=2)" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "Now, we'll try to generalize to achieve $k$-Anonymity for $k=2$. \n", 368 | "\n", 369 | "We'll start with generalizing both age and educational attainment to the nearest `10`, and we'll consider only the first `1,000` entries."
370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "# outliers are a real problem!\n", 379 | "depths = {\n", 380 | "    'age': 1,\n", 381 | "    'edu': 1\n", 382 | "}\n", 383 | "generalised_projection = generalise(projection_age_edu.head(1000), depths)\n", 384 | "is_k_anonymised(generalised_projection, 2)" 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": [ 391 | "The generalized result still does not satisfy $k$-Anonymity for $k=2$! \n", 392 | "\n", 393 | "The reason is that the dataset contains *outliers* - individuals who are very different from the rest of the population. \n", 394 | "\n", 395 | "These individuals do not fit easily into any group, even after generalization. \n", 396 | "\n", 397 | "Even considering *only* ages, we can see the presence of outliers:" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": { 404 | "tags": [ 405 | "hide-input" 406 | ] 407 | }, 408 | "outputs": [], 409 | "source": [ 410 | "generalised_projection['age'].hist();" 411 | ] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "metadata": {}, 416 | "source": [ 417 | "Achieving the optimal generalization for $k$-Anonymity is very challenging in cases like this. " 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "metadata": {}, 423 | "source": [ 424 | "## Removing Outliers" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": {}, 430 | "source": [ 431 | "Generalizing each row *more* would be overkill for the well-represented individuals with ages in the 20-40 range, and would hurt utility. \n", 432 | "\n", 433 | "However, more generalization is clearly needed for individuals at the upper and lower ends of the age range. \n", 434 | "\n", 435 | "This is the kind of challenge that occurs regularly in practice, and is difficult to solve automatically. \n", 436 | "\n", 437 | "In fact, **optimal generalization** for $k$-Anonymity has been shown to be NP-hard.\n", 438 | "\n", 439 | "> **Challenge**:\n", 440 | ">\n", 441 | ">Outliers make achieving $k$-Anonymity very challenging, even for large datasets. Optimal generalization for $k$-Anonymity is NP-hard." 442 | ] 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "metadata": {}, 447 | "source": [ 448 | "One solution to this problem is simply to **clip** the age of each individual in the dataset to lie within a specific range, eliminating outliers entirely.\n", 449 | "\n", 450 | "This can also hurt utility, since it replaces real ages with fake ones, but it can be better than generalizing each row more.\n", 451 | "\n", 452 | "We can use Pandas' `clip` method to perform this clipping. We clip ages to be `60` or below, and leave educational levels alone (by clipping them to a very large value)."
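] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As an aside (a sketch, not in the original notebook; the variable name is illustrative): the same clipping can also be written per column, which avoids the sentinel \"very large value\" for the education column." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sketch: clip only the 'age' column, leaving 'edu' untouched\n", "projection_clipped_alt = projection_age_edu.assign(age=projection_age_edu['age'].clip(upper=60))\n", "projection_clipped_alt['age'].max()"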
453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": null, 458 | "metadata": {}, 459 | "outputs": [], 460 | "source": [ 461 | "# clipping away outliers\n", 462 | "depths = {\n", 463 | "    'age': 1,\n", 464 | "    'edu': 1\n", 465 | "}\n", 466 | "projection_clipped = projection_age_edu.clip(upper=np.array([60, 10000000000000]), axis='columns')\n", 467 | "generalised_projection_clipped = generalise(projection_clipped.head(500), depths)\n", 468 | "is_k_anonymised(generalised_projection_clipped, 7)" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": null, 474 | "metadata": {}, 475 | "outputs": [], 476 | "source": [ 477 | "is_k_anonymised(generalised_projection_clipped, 2)" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": {}, 483 | "source": [ 484 | "## What if we add in the whole dataset?\n", 485 | "\n", 486 | "We can perform this generalization on all `32,000` rows:" 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": null, 492 | "metadata": {}, 493 | "outputs": [], 494 | "source": [ 495 | "is_k_anonymised(projection_age_edu, k=2)" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": null, 501 | "metadata": {}, 502 | "outputs": [], 503 | "source": [ 504 | "depths = {\n", 505 | "    'age': 1,\n", 506 | "    'edu': 1\n", 507 | "}\n", 508 | "generalised_proj_whole_dataset = generalise(projection_age_edu, depths)\n", 509 | "is_k_anonymised(generalised_proj_whole_dataset, k=2)" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": {}, 516 | "outputs": [], 517 | "source": [ 518 | "# the smallest group after generalisation\n", 519 | "min(map(len, generalised_projection_clipped.groupby(generalised_projection_clipped.columns.tolist()).groups.values()))" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [ 528 | "min(map(len, generalised_proj_whole_dataset.groupby(generalised_proj_whole_dataset.columns.tolist()).groups.values()))" 529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "metadata": {}, 534 | "source": [ 535 | "So in this case, adding more data enlarges the groups: the generalised full dataset satisfies $k$-Anonymity for values of $k$ up to $21$!" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": null, 541 | "metadata": {}, 542 | "outputs": [], 543 | "source": [ 544 | "is_k_anonymised(generalised_proj_whole_dataset, k=22)" 545 | ] 546 | }, 547 | { 548 | "cell_type": "markdown", 549 | "metadata": {}, 550 | "source": [ 551 | "## Summary" 552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "metadata": {}, 557 | "source": [ 558 | "- $k$-Anonymity is a property of data, which ensures that each individual \"blends in\" with a group of at least $k$ individuals.\n", 559 | "- $k$-Anonymity is computationally expensive even to check: the naive algorithm is $O(n^2)$, and faster algorithms take considerable space.\n", 560 | "- $k$-Anonymity can be achieved by modifying a dataset by *generalizing* it, so that particular values become more common and groups are easier to form.\n", 561 | "- Optimal generalization is extremely difficult, and outliers can make it even more challenging. Solving this problem automatically is NP-hard."
562 | ] 563 | }, 564 | { 565 | "cell_type": "markdown", 566 | "metadata": {}, 567 | "source": [ 568 | "## Further Reading" 569 | ] 570 | }, 571 | { 572 | "cell_type": "markdown", 573 | "metadata": {}, 574 | "source": [ 575 | "- [Data Anonymization: Perspectives from a Former Skeptic](https://towardsdatascience.com/data-anonymization-perspectives-from-a-former-skeptic-f35790a2042a)\n", 576 | "- `t-closeness` (**Beyond K-Anonymity**) [Paper](https://www.cs.purdue.edu/homes/ninghui/papers/t_closeness_icde07.pdf)\n", 577 | "- _Anonymising and Sharing Individual Patient Data_ [Paper](https://www.bmj.com/content/bmj/350/bmj.h1139.full.pdf?casa_token=NwqT3F-i9xkAAAAA:U_T2t8ZaB1xWBgDOH7QbgQAuwMXJ6FehY07q_C0AztDejEDxp08awbjyWeOlMLOl14lV-W0z1OVjmw)" 578 | ] 579 | } 580 | ], 581 | "metadata": { 582 | "kernelspec": { 583 | "display_name": "Python 3 (ipykernel)", 584 | "language": "python", 585 | "name": "python3" 586 | }, 587 | "language_info": { 588 | "codemirror_mode": { 589 | "name": "ipython", 590 | "version": 3 591 | }, 592 | "file_extension": ".py", 593 | "mimetype": "text/x-python", 594 | "name": "python", 595 | "nbconvert_exporter": "python", 596 | "pygments_lexer": "ipython3", 597 | "version": "3.12.3" 598 | } 599 | }, 600 | "nbformat": 4, 601 | "nbformat_minor": 4 602 | } 603 | -------------------------------------------------------------------------------- /1-data-anonimisation/2-de-identification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Optional: setup NoTexBook theme\n", 10 | "%load_ext notexbook\n", 11 | "%texify" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "**Adapted from**: [Ch1](https://github.com/uvm-plaid/programming-dp/blob/master/notebooks/ch1.ipynb)" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "tags": [ 25 | "remove-cell" 26 | ] 27 | }, 28 | "source": [ 29 | "# De-Identification\n", 30 | "\n", 31 | "### Dataset\n", 32 | "\n", 33 | "The dataset is based on census data. The personally identifiable information (**PII**) is made up." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import pandas as pd\n", 43 | "import numpy as np\n", 44 | "import matplotlib.pyplot as plt" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "DATASET_URL = \"https://raw.githubusercontent.com/uvm-plaid/programming-dp/master/notebooks/adult_with_pii.csv\"" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "tags": [ 61 | "remove-cell" 62 | ] 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "adult = pd.read_csv(DATASET_URL)\n", 67 | "adult.head()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "# De-identification" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "*De-identification* is the process of removing *identifying information* from a dataset. 
The term *de-identification* is sometimes used as a synonym for other terms like *anonymization* or *pseudonymization*.\n", 82 | "\n", 83 | "> **Learning Objectives**\n", 84 | "> - Define the following concepts:\n", 85 | ">     - De-identification & Re-identification\n", 86 | ">     - Identifying information / personally identifying information\n", 87 | "> - Learn Examples of (Data) Attacks\n", 88 | ">     - Linking & Differencing Attacks\n", 89 | "> - Understand limitations of aggregate statistics" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "### Personally Identifiable Information\n", 97 | "\n", 98 | "Identifying information has no formal definition. It is usually understood to be information which would be used to identify us uniquely in the course of daily life - name, address, phone number, e-mail address, etc. \n", 99 | "\n", 100 | "As we will see later, it's *impossible* to formalize the concept of identifying information, because *all* information is identifying. \n", 101 | "\n", 102 | "The term **personally identifiable information** (`PII`) is often used as a synonym for \"identifying information\"." 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "**Q**: How should we **de-identify** information? \n", 110 | "\n", 111 | "**A**: Easy - we just remove the columns that contain identifying information!" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "adult_de_identified = adult.copy().drop(columns=['Name', 'SSN'])\n", 121 | "adult_de_identified.head(1)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "> 💡 We'll save some of the identifying information for later, when we'll use it as *auxiliary data* to perform a *re-identification* attack." 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "Selected **PII**s in the dataset:" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "adult_pii = adult[['Name', 'SSN', 'DOB', 'Zip']]\n", 145 | "adult_pii.head()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "## Linking Attacks\n", 153 | "\n", 154 | "Imagine we want to determine the income of a friend from our de-identified data. \n", 155 | "\n", 156 | "Names have been removed, but we happen to know some _auxiliary information_ about our friend. \n", 157 | "\n", 158 | "Our friend's name is **Karrie Trusslove**, and we know Karrie's date of birth and zip code." 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "To perform a simple **linking attack**, we look at the _overlapping columns between the dataset_ we're trying to attack, and the auxiliary data we know.\n", 166 | "\n", 167 | "In this case, both datasets have dates of birth and zip codes.\n", 168 | "\n", 169 | "We look for rows in the dataset we're attacking with `dates of birth` and `zip codes` that match Karrie's `date of birth` and `zip code`.\n", 170 | "\n", 171 | "If there is **only one** such row, we've found Karrie's row in the dataset we're attacking. \n", 172 | "\n", 173 | "In databases, this is called a **JOIN** of two tables, and we can do it in Pandas using `merge`."
174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "karries_row = adult_pii[adult_pii['Name'] == 'Karrie Trusslove']\n", 183 | "karries_row" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "auxiliary_info = karries_row[[\"DOB\", \"Zip\"]]\n", 193 | "auxiliary_info.head()" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "\n", 203 | "pd.merge(auxiliary_info, adult_de_identified, left_on=['DOB', 'Zip'], right_on=['DOB', 'Zip'])" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "> 💡 Indeed, there is only one row that matches.\n", 211 | "> We have used **auxiliary data** to re-identify an individual in a de-identified dataset, and we're able to infer that Karrie's income is less than `$50k`." 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "### How Hard is it to Re-Identify Karrie?" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "This scenario is made up, but linking attacks are surprisingly easy to perform in practice.\n", 226 | "\n", 227 | "How easy? It turns out that in many cases, just one data point is sufficient to pinpoint a row!" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "pd.merge(auxiliary_info, adult_de_identified, left_on=['Zip'], right_on=['Zip'])" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "So ZIP code is sufficient **by itself** to allow us to re-identify Karrie. What about date of birth?" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "pd.merge(auxiliary_info, adult_de_identified, left_on=['DOB'], right_on=['DOB'])" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "This time, there are three rows returned - and we don't know which one is the real Karrie. \n", 260 | "\n", 261 | "**But we've still learned a lot about our dataset!**\n", 262 | "\n", 263 | "- We know that there's a 2/3 chance that Karrie's income is less than $50k\n", 264 | "- We can look at the differences between the rows to determine what additional auxiliary information would *help* us to distinguish them (e.g. sex, occupation, marital status)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "#### Is Karrie Special?" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "How hard is it to re-identify others in the dataset? \n", 279 | "\n", 280 | "Is Karrie especially easy or especially difficult to re-identify? 
\n", 281 | "\n", 282 | "A good way to understand the effectiveness of this type of attack is to look at how **selective** certain pieces of data are.\n", 283 | "\n", 284 | "In other words, how good they are at narrowing down the set of potential rows which may belong to the target individual.\n", 285 | "\n", 286 | "For example, is it common for `birth dates` to occur more than once?\n", 287 | "\n", 288 | "We'd like to get an idea of how many dates of birth are likely to be useful in performing an attack, which we can do by looking at how common \"unique\" dates of birth are in the dataset." 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": { 295 | "tags": [ 296 | "hide-input" 297 | ] 298 | }, 299 | "outputs": [], 300 | "source": [ 301 | "adult_pii['DOB'].value_counts().hist()\n", 302 | "\n", 303 | "plt.title(\"How selective is date of birth as PII?\")\n", 304 | "plt.xlabel('Number of Occurrences')\n", 305 | "plt.ylabel('Number of Dates of Birth');" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "The histogram above shows that *the vast majority* of dates of birth occur 1, 2, or 3 times in the dataset, and *no date of birth* occurs more than 8 times. \n", 313 | "\n", 314 | "This means that date of birth is **fairly selective** - it's effective in narrowing down the possible records for an individual." 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "#### Quick Exercise:\n", 322 | "\n", 323 | "Let's try to repeat the experiment with `ZIP` codes:" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "# your code here\n" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "
\n", 340 | "\n", 341 | "**Solution**\n", 342 | "\n", 343 | "```python\n", 344 | "adult_pii['Zip'].value_counts().hist()\n", 345 | "\n", 346 | "plt.title(\"How selective is ZIP code as PII?\")\n", 347 | "plt.xlabel('Number of Occurrences')\n", 348 | "plt.ylabel('Number of ZIP Codes');\n", 349 | "```\n", 350 | "
" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "
\n", 358 | "\n", 359 | "**Considerations**\n", 360 | "\n", 361 | "The results when using ZIP codes happen to be even worse: ZIP code is *very* selective in this dataset. \n", 362 | "\n", 363 | "Nearly all the ZIP codes occur only once."
 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "### How Many People can we Re-Identify?" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "**Q**: In this dataset, how many people can we re-identify uniquely?\n", 378 | "\n", 379 | "We can use our auxiliary information to find out!\n", 380 | "\n", 381 | "First, let's see what happens with just `dates of birth`.\n", 382 | "\n", 383 | "We want to know how many *possible identities* are returned for each data record in the dataset." 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": { 390 | "scrolled": true, 391 | "tags": [ 392 | "hide-input" 393 | ] 394 | }, 395 | "outputs": [], 396 | "source": [ 397 | "attack = pd.merge(adult_pii, adult_de_identified, left_on=['DOB'], right_on=['DOB'])\n", 398 | "attack['Name'].value_counts().hist();\n", 399 | "\n", 400 | "plt.title(\"How many records can we identify with DoB?\")\n", 401 | "plt.xlabel(\"Number of matching records\")\n", 402 | "plt.ylabel(\"Number of identities\")\n", 403 | "plt.show()" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": {}, 409 | "source": [ 410 | "\n", 411 | "The above histogram shows the **number of records with each number of possible identities**.\n", 412 | "\n", 413 | "The results show that we can uniquely identify almost `7,000` of the data records (out of about `32,000`), and an additional `10,000` data records are narrowed down to **only two** possible identities." 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "So it's not possible to re-identify a majority of individuals using *just* date of birth. \n", 421 | "\n", 422 | "What if we collect more information, to narrow things down further? \n", 423 | "\n", 424 | "If we use **both** `date of birth` and `ZIP`, we're able to do much better. \n", 425 | "\n", 426 | "In fact, we're able to uniquely re-identify basically the whole dataset." 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": { 433 | "scrolled": true, 434 | "tags": [ 435 | "hide-input" 436 | ] 437 | }, 438 | "outputs": [], 439 | "source": [ 440 | "attack = pd.merge(adult_pii, adult_de_identified, left_on=['DOB', 'Zip'], right_on=['DOB', 'Zip'])\n", 441 | "\n", 442 | "attack['Name'].value_counts().hist();\n", 443 | "plt.title(\"How many records can we identify with DoB & ZIP?\")\n", 444 | "plt.xlabel(\"Number of matching records\")\n", 445 | "plt.ylabel(\"Number of identities\")\n", 446 | "plt.show()" 447 | ] 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "metadata": {}, 452 | "source": [ 453 | "> 💡 When we use both pieces of information, we can re-identify **essentially everyone**. 
\n", 455 | "\n", 456 | "This is a surprising result, since we generally assume that many people share the same birthday, and many people live in the same ZIP code.\n", 457 | "\n", 458 | "It turns out that the *combination* of these factors is **extremely** selective.\n", 459 | "\n", 460 | "According to Latanya Sweeney's work[1](#fn1), 87% of people in the US can be uniquely re-identified by the combination of date of birth, gender, and ZIP code.\n", 461 | "\n", 462 | "**[1]**: Sweeney, L, _Simple Demographics Often Identify People Uniquely_ [link](https://dataprivacylab.org/projects/identifiability/)" 463 | ] 464 | }, 465 | { 466 | "cell_type": "markdown", 467 | "metadata": {}, 468 | "source": [ 469 | "Let's just check that we've actually re-identified *everyone*, by printing out the number of possible data records for each identity:" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "metadata": { 476 | "tags": [ 477 | "hide-input" 478 | ] 479 | }, 480 | "outputs": [], 481 | "source": [ 482 | "attack['Name'].value_counts(ascending=False).head()" 483 | ] 484 | }, 485 | { 486 | "cell_type": "markdown", 487 | "metadata": {}, 488 | "source": [ 489 | "Looks like we missed two people! In other words, in this dataset, only **two people** share a combination of ZIP code and date of birth." 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "metadata": {}, 495 | "source": [ 496 | "## Aggregation" 497 | ] 498 | }, 499 | { 500 | "cell_type": "markdown", 501 | "metadata": {}, 502 | "source": [ 503 | "Another way to prevent the release of private information is to release only **aggregate** data." 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": null, 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "# Let's take the Age as an example\n", 513 | "adult['Age'].mean()" 514 | ] 515 | }, 516 | { 517 | "cell_type": "markdown", 518 | "metadata": {}, 519 | "source": [ 520 | "### Problem of Small Groups" 521 | ] 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "metadata": {}, 526 | "source": [ 527 | "In many cases, aggregate statistics are broken down into smaller groups. \n", 528 | "\n", 529 | "For example, we might want to know the average age of people with a particular education level." 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": null, 535 | "metadata": {}, 536 | "outputs": [], 537 | "source": [ 538 | "adult[['Education-Num', 'Age']].groupby('Education-Num').mean().head(3)" 539 | ] 540 | }, 541 | { 542 | "cell_type": "markdown", 543 | "metadata": {}, 544 | "source": [ 545 | "Aggregation is supposed to _improve privacy_ because it's hard to identify the contribution of a particular individual to the aggregate statistic. \n", 546 | "\n", 547 | "But what if we aggregate over a group with just *one person* in it? \n", 548 | "\n", 549 | "In that case, the aggregate statistic reveals one person's age *exactly*, and provides no privacy protection at all! \n", 550 | "\n", 551 | "In our dataset, most individuals have a unique `ZIP` code - so if we compute the average age by ZIP code, then most of the \"averages\" actually reveal an individual's exact age."
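] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can quantify that claim with a quick sketch (not in the original notebook): count how many ZIP-code groups contain exactly one person." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sketch: how many ZIP codes belong to exactly one individual?\n", "zip_group_sizes = adult.groupby('Zip').size()\n", "print((zip_group_sizes == 1).sum(), \"of\", len(zip_group_sizes), \"ZIP codes contain a single person\")"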
552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": null, 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [ 560 | "adult[['Zip', 'Age']].groupby('Zip').mean().head()" 561 | ] 562 | }, 563 | { 564 | "cell_type": "markdown", 565 | "metadata": {}, 566 | "source": [ 567 | "The US Census Bureau, for example, releases aggregate statistics at the [*block level*](https://www.census.gov/newsroom/blogs/random-samplings/2011/07/what-are-census-blocks.html). \n", 568 | "\n", 569 | "Some census blocks have large populations, but some have a population of zero! \n", 570 | "\n", 571 | "The situation above, where small groups prevent aggregation from hiding information about individuals, turns out to be quite common.\n", 572 | "\n", 573 | "How big a group is \"big enough\" for aggregate statistics to help? \n", 574 | "\n", 575 | "It's hard to say - it depends on the data and on the attack - so it's challenging to build confidence that aggregate statistics are really privacy-preserving. \n", 576 | "\n", 577 | "However, even very large groups do not make aggregation completely robust against attacks, as we will see next." 578 | ] 579 | }, 580 | { 581 | "cell_type": "markdown", 582 | "metadata": {}, 583 | "source": [ 584 | "### Differencing Attacks" 585 | ] 586 | }, 587 | { 588 | "cell_type": "markdown", 589 | "metadata": {}, 590 | "source": [ 591 | "The problems with aggregation get even worse when you release multiple aggregate statistics over the same data. \n", 592 | "\n", 593 | "For example, consider the following two summation queries over large groups in our dataset (the first over the whole dataset, and the second over all records except one):" 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": null, 599 | "metadata": {}, 600 | "outputs": [], 601 | "source": [ 602 | "adult['Age'].sum()" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": null, 608 | "metadata": {}, 609 | "outputs": [], 610 | "source": [ 611 | "adult[adult['Name'] != 'Karrie Trusslove']['Age'].sum()" 612 | ] 613 | }, 614 | { 615 | "cell_type": "markdown", 616 | "metadata": {}, 617 | "source": [ 618 | "If we know both answers, we can simply take the difference and determine Karrie's age completely! \n", 619 | "\n", 620 | "This kind of attack can proceed even if the aggregate statistics are over *very large groups*." 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": null, 626 | "metadata": { 627 | "scrolled": true 628 | }, 629 | "outputs": [], 630 | "source": [ 631 | "adult['Age'].sum() - adult[adult['Name'] != 'Karrie Trusslove']['Age'].sum()" 632 | ] 633 | }, 634 | { 635 | "cell_type": "markdown", 636 | "metadata": {}, 637 | "source": [ 638 | "#### Take away message\n", 639 | "\n", 640 | "(This is a recurring theme:)\n", 641 | "\n", 642 | "1. Releasing *data* that is useful makes ensuring *privacy* very difficult.\n", 643 | "\n", 644 | "2. Distinguishing between *malicious* and *non-malicious* queries is not possible (in general)." 
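] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Looking ahead to the differential-privacy notebooks (a preview sketch, not part of the original notebook): if each sum is released with Laplace noise calibrated to its sensitivity, the difference no longer pins down Karrie's age. We assume here that ages are bounded above by `125`, so a sum-of-ages query has sensitivity `125`, and we reuse $\\epsilon = 0.1$." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Preview sketch: the same differencing attack against noisy sums (assumed bound: age <= 125)\n", "def noisy_sum(answer, sensitivity=125, epsilon=0.1):\n", "    return answer + np.random.laplace(loc=0, scale=sensitivity / epsilon)\n", "\n", "# the difference is now dominated by noise rather than by Karrie's age\n", "noisy_sum(adult['Age'].sum()) - noisy_sum(adult[adult['Name'] != 'Karrie Trusslove']['Age'].sum())"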
645 | ] 646 | }, 647 | { 648 | "cell_type": "markdown", 649 | "metadata": {}, 650 | "source": [ 651 | "## Summary" 652 | ] 653 | }, 654 | { 655 | "cell_type": "markdown", 656 | "metadata": {}, 657 | "source": [ 658 | "- A *Linking attack* involves combining *auxiliary data* with *de-identified data* to *re-identify* individuals.\n", 659 | "\n", 660 | "- In the simplest case, a linking attack can be performed via a *join* of two tables containing these datasets.\n", 661 | "\n", 662 | "- Simple linking attacks are surprisingly effective:\n", 663 | "    - Just a single data point is sufficient to narrow things down to a few records\n", 664 | "    - The narrowed-down set of records suggests which additional auxiliary data might be useful\n", 665 | "    - Two data points are often good enough to re-identify a huge fraction of the population in a particular dataset\n", 666 | "    - Three data points (gender, ZIP code, date of birth) uniquely identify 87% of people in the US\n", 667 | "\n", 668 | "\n", 669 | "\n", 670 | "- Releasing only **aggregate** statistics is another way to avoid disclosing sensitive information directly.\n", 671 | "\n", 672 | "    - But aggregating over small groups can still leak sensitive data.\n", 673 | "    - A *Differencing attack* recovers information about an individual by combining multiple aggregate statistics (e.g. a sum over the whole dataset and a sum over everyone except the target)." 674 | ] 675 | } 676 | ], 677 | "metadata": { 678 | "kernelspec": { 679 | "display_name": "Python 3 (ipykernel)", 680 | "language": "python", 681 | "name": "python3" 682 | }, 683 | "language_info": { 684 | "codemirror_mode": { 685 | "name": "ipython", 686 | "version": 3 687 | }, 688 | "file_extension": ".py", 689 | "mimetype": "text/x-python", 690 | "name": "python", 691 | "nbconvert_exporter": "python", 692 | "pygments_lexer": "ipython3", 693 | "version": "3.12.3" 694 | } 695 | }, 696 | "nbformat": 4, 697 | "nbformat_minor": 4 698 | } 699 | -------------------------------------------------------------------------------- /3-differential-privacy/3-properties-differential-privacy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "70a9dd8d-ef11-44d7-ac7f-d63171501c7d", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Optional: setup NoTexBook theme\n", 11 | "%load_ext notexbook\n", 12 | "%texify" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "id": "b90ec924", 18 | "metadata": {}, 19 | "source": [ 20 | "**Adapted from**: [Ch4](https://github.com/uvm-plaid/programming-dp/blob/master/notebooks/ch4.ipynb)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "d06e8e58-0d12-482b-8085-ba048e5c6e62", 26 | "metadata": {}, 27 | "source": [ 28 | "# Properties of Differential Privacy" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "8ae6c29f-0bf9-483d-be70-72a8a2088a34", 34 | "metadata": {}, 35 | "source": [ 36 | "In this notebook we will cover three important properties of **differentially private mechanisms** that arise from the definition\n", 37 | " of differential privacy.\n", 38 | "\n", 39 | "We introduce these properties here because they will be used repeatedly \n", 40 | "once we start generalising DP to machine learning algorithms.\n", 41 | "\n", 42 | "\n", 43 | "These three properties are:\n", 44 | "\n", 45 | "1. Sequential composition\n", 46 | "2. Parallel composition\n", 47 | "3. 
Post-processing" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "id": "07b6925b-22f4-4841-8217-e6ef69d0ab88", 53 | "metadata": {}, 54 | "source": [ 55 | "## Sequential Composition\n", 56 | "\n", 57 | "The first major property of differential privacy is *sequential composition*, which **bounds** the total \n", 58 | "privacy cost of releasing multiple results of differentially private mechanisms **on the same input data**. \n", 59 | "\n", 60 | "Formally, the sequential composition theorem for differential privacy says that:\n", 61 | "\n", 62 | "- If $F_1(x)$ satisfies $\epsilon_1$-differential privacy\n", 63 | "- And $F_2(x)$ satisfies $\epsilon_2$-differential privacy\n", 64 | "- Then the mechanism $G(x) = (F_1(x), F_2(x))$ which releases both results satisfies $\epsilon_1+\epsilon_2$-differential privacy\n", 65 | "\n", 66 | "\n", 67 | "Sequential composition is a vital property of differential privacy because it enables the design of algorithms that consult the data more than once. \n", 68 | "\n", 69 | "Sequential composition is also important when multiple separate analyses are performed on a single dataset, since it allows individuals to bound the *total* privacy cost they incur by participating in all of these analyses.\n", 70 | "\n", 71 | "The bound on privacy cost given by sequential composition is an *upper* bound - the actual privacy cost of two particular differentially private releases may be smaller than this, but never larger.\n", 72 | "\n", 73 | "The principle that the $\epsilon$ values \"add up\" makes sense if we examine the distribution of outputs from a mechanism which averages two differentially private results together.\n", 74 | "\n", 75 | "Keep in mind, then, that sequential composition does not give an **exact** figure: the actual total privacy cost can be strictly lower than this upper bound." 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "id": "13c48694-ad91-47b6-9467-0fc10017aea5", 81 | "metadata": {}, 82 | "source": [ 83 | "## Parallel Composition\n", 84 | "\n", 85 | "The second important property of differential privacy is called *parallel composition*. \n", 86 | "\n", 87 | "Parallel composition can be seen as an alternative to sequential composition - a second way to calculate a bound on the total privacy cost of multiple data releases. \n", 88 | "\n", 89 | "Parallel composition is based on the idea of **splitting** your dataset into disjoint chunks and running a \n", 90 | "differentially private mechanism on each chunk separately. \n", 91 | "\n", 92 | "Since the chunks are **disjoint**, each individual's data appears in *exactly* one chunk - so even if there are $k$ chunks in total (and therefore $k$ runs of the mechanism), the mechanism runs exactly once on the data of each *individual*. \n", 93 | "\n", 94 | "Formally,\n", 95 | " - If $F(x)$ satisfies $\epsilon$-differential privacy\n", 96 | " - And we split a dataset $X$ into $k$ disjoint chunks such that $x_1 \cup ... \cup x_k = X$\n", 97 | " - Then the mechanism which releases all of the results $F(x_1), ..., F(x_k)$ satisfies $\epsilon$-differential privacy\n", 98 | "\n", 99 | "Note that this is a much better bound than sequential composition would give. \n", 100 | "\n", 101 | "Since we run $F$ $k$ times, sequential composition would say that this procedure satisfies $k\epsilon$-differential privacy. 
\n", 102 | "\n", 103 | "Parallel composition allows us to say that the total privacy cost is just $\\epsilon$.\n", 104 | "\n", 105 | "The formal definition matches up with our intuition - if each participant in the dataset contributes one row to $X$, then this row will appear in *exactly* one of the chunks $x_1, ..., x_k$. \n", 106 | "\n", 107 | "That means $F$ will only \"see\" this participant's data *one time*, meaning a privacy cost of $\\epsilon$ is appropriate for that individual. Since this property holds for all individuals, the privacy cost is $\\epsilon$ for everyone." 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "id": "67375447", 113 | "metadata": {}, 114 | "source": [ 115 | "## Post-processing\n", 116 | "\n", 117 | "The third property of differential privacy is called *post-processing*. \n", 118 | "\n", 119 | "The idea is simple: it's impossible to **reverse the privacy protection** provided by differential privacy by post-processing the data in some way. \n", 120 | "\n", 121 | "Formally:\n", 122 | "\n", 123 | "- If $F(X)$ satisfies $\\epsilon$-differential privacy\n", 124 | "- Then for any (deterministic or randomized) function $g$, $g(F(X))$ satisfies $\\epsilon$-differential privacy\n", 125 | "\n", 126 | "The post-processing property means that it's always safe to perform arbitrary computations on the output of a differentially private mechanism - there's no danger of reversing the privacy protection the mechanism has provided. \n", 127 | "\n", 128 | "In particular, it's fine to perform post-processing that might reduce the noise or improve the signal in the mechanism's output (e.g. replacing negative results with zeros, for queries that shouldn't return negative results). \n", 129 | "\n", 130 | "The other implication of the **post-processing** property is that differential privacy provides resistance against privacy attacks based on **auxiliary information**. \n", 131 | "\n", 132 | "For example, the function $g$ might contain auxiliary information about elements of the dataset, and attempt to perform a linkage attack using this information. The post-processing property says that such an attack is limited in its effectiveness by the privacy parameter $\\epsilon$, regardless of the auxiliary information contained in $g$." 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "id": "dc67cade-85d7-4494-bb5c-c045b414df84", 138 | "metadata": {}, 139 | "source": [ 140 | "### Histograms\n", 141 | "\n", 142 | "In our context, a *histogram* is an analysis of a dataset which splits the dataset into \"bins\" based on the value of one of the data attributes, and **counts** the number of rows in each bin. \n", 143 | "\n", 144 | "For example, a histogram might count the number of people in the dataset who achieved a particular educational level." 
145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "id": "7b23c875", 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "import pandas as pd\n", 155 | "import numpy as np\n", 156 | "import matplotlib.pyplot as plt\n", 157 | "plt.style.use('seaborn-v0_8-whitegrid')" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "id": "720f3805-ebbf-476b-988f-711cb70ed47c", 164 | "metadata": { 165 | "tags": [ 166 | "hide-input" 167 | ] 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "DATASET_URL = \"https://raw.githubusercontent.com/uvm-plaid/programming-dp/master/notebooks/adult_with_pii.csv\"\n", 172 | "adult = pd.read_csv(DATASET_URL)\n", 173 | "\n", 174 | "adult['Education'].value_counts().to_frame().head(5)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "id": "a15d1d8e-0399-4f78-9f23-aa1f669ae035", 180 | "metadata": {}, 181 | "source": [ 182 | "Histograms are particularly interesting for differential privacy because they automatically satisfy parallel composition. \n", 183 | "\n", 184 | "Each \"bin\" in a histogram is defined by a possible value for a data attribute (for example, `'Education' == 'HS-grad'`). \n", 185 | "\n", 186 | "It's impossible for a single row to have *two* values for an attribute simultaneously, so defining the bins this way *guarantees* that they will be disjoint. \n", 187 | "\n", 188 | "Thus we have satisfied the requirements for parallel composition, and we can use a differentially private mechanism to release *all* \n", 189 | "of the bin counts with a total privacy cost of just $\\epsilon$." 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "id": "55d2c3a3-2cda-4274-9a8d-488e45b8b69e", 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "epsilon = 1\n", 200 | "sensitivity = 1\n", 201 | "\n", 202 | "# This analysis has a total privacy cost of epsilon = 1, even though we release many results!\n", 203 | "f = lambda x: x + np.random.laplace(loc=0, scale=sensitivity/epsilon)\n", 204 | "s = adult['Education'].value_counts().apply(f)\n", 205 | "s.to_frame().head(5)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "id": "705f4831-bb84-4929-b396-4aaed7204490", 211 | "metadata": {}, 212 | "source": [ 213 | "## Optional: Sensitivity and Clipping" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "id": "95ee6158-2850-46e1-8986-1bf96b7ee87e", 219 | "metadata": {}, 220 | "source": [ 221 | "### Sensitivity" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "id": "39346eee-d5bb-4707-b6c6-5ccf9aea56e4", 227 | "metadata": {}, 228 | "source": [ 229 | "When discussing the Laplace mechanism, we mentioned that the amount of **noise** that is necessary to ensure differential privacy for \n", 230 | "a given query depends on the *sensitivity* of the query. \n", 231 | "\n", 232 | "Roughly speaking, the sensitivity of a function reflects the amount the function's output will change when its input changes. 
\n", 233 | "\n", 234 | "Recall that the Laplace mechanism defines a mechanism $F(x)$ as follows:\n", 235 | "\n", 236 | "\\begin{align}\n", 237 | "F(x) = f(x) + \\textsf{Lap}\\left(\\frac{s}{\\epsilon}\\right)\n", 238 | "\\end{align}\n", 239 | "\n", 240 | "where $f(x)$ is a deterministic function (the query), $\\epsilon$ is the privacy parameter, and $s$ is the sensitivity of $f$.\n", 241 | "\n", 242 | "For a function $f : \\mathcal{D} \\rightarrow \\mathbb{R}$ mapping datasets ($\\mathcal{D}$) to real numbers, the *global sensitivity* of $f$ is defined as follows:\n", 243 | "\n", 244 | "\\begin{align}\n", 245 | "GS(f) = \\max_{x, x': d(x,x') <= 1} |f(x) - f(x')|\n", 246 | "\\end{align}\n", 247 | "\n", 248 | "Here, $d(x, x')$ represents the *distance* between two datasets $x$ and $x'$, and we say that two datasets are *neighbors* if their distance is 1 or less. \n", 249 | "\n", 250 | "How this distance is defined has a huge effect on the definition of privacy we obtain.\n", 251 | "\n", 252 | "The definition of global sensitivity says that for *any two* neighboring datasets $x$ and $x'$, the difference between $f(x)$ and $f(x')$ is at most $GS(f)$. \n", 253 | "\n", 254 | "**Global vs Local Sensitivity**:\n", 255 | "\n", 256 | "This measure of sensitivity is called \"global\" because it is independent of the actual dataset being queried (it holds for *any* choice of neighboring $x$ and $x'$). \n", 257 | "\n", 258 | "Another measure of sensitivity, called *local sensitivity*, fixes one of the datasets to be the one being queried." 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "id": "e9ecc7dc-36d6-4c3c-bdbd-f8a162366609", 264 | "metadata": {}, 265 | "source": [ 266 | "### Distance\n", 267 | "\n", 268 | "The distance metric $d(x,x')$ described earlier can be defined in many different ways. \n", 269 | "\n", 270 | "Intuitively, the distance between two datasets should be equal to 1 (i.e. the datasets are neighbors) if they differ in the data of exactly **one individual**. \n", 271 | "\n", 272 | "This idea is easy to formalize in some contexts (e.g. in the US Census, each individual submits a single response containing their data) but extremely challenging in others (e.g. location trajectories, social networks, and time-series data).\n", 273 | "\n", 274 | "A common formal definition for datasets containing rows is to consider the number of rows which differ between the two. \n", 275 | "\n", 276 | "When each individual's data is contained in a single row, this definition often makes sense. \n", 277 | "\n", 278 | "Formally, this definition of distance is encoded as a **symmetric difference** between the two datasets:\n", 279 | "\n", 280 | "\\begin{align}\n", 281 | "d(x, x') = | x - x' \\cup x' - x |\n", 282 | "\\end{align}\n", 283 | "\n", 284 | "This particular definition has several interesting and important implications:\n", 285 | "- If $x'$ is constructed from $x$ by *adding one row*, then $d(x,x') = 1$\n", 286 | "- If $x'$ is constructed from $x$ by *removing one row*, then $d(x,x') = 1$\n", 287 | "- If $x'$ is constructed from $x$ by *modifying one row*, then $d(x,x') = 2$\n", 288 | "\n", 289 | "In other words, adding or removing a row results in a neighboring dataset; *modifying* a row results in a dataset at distance *2*. \n", 290 | "\n", 291 | "This particular definition of distance results in what is typically called *unbounded differential privacy*. 
Many other definitions are possible, including one called **bounded differential privacy** in which modifying a single row in a dataset *does* result in a neighboring dataset. \n" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "id": "d620b327-8fd8-4c71-9110-6595c0e68667", 297 | "metadata": {}, 298 | "source": [ 299 | "#### Calculating Sensitivity\n", 300 | "\n", 301 | "How do we determine the sensitivity of a particular function of interest? For some simple functions on real numbers, the answer is obvious.\n", 302 | "\n", 303 | "- The global sensitivity of $f(x) = x$ is 1, since changing $x$ by 1 changes $f(x)$ by 1\n", 304 | "- The global sensitivity of $f(x) = x+x$ is 2, since changing $x$ by 1 changes $f(x)$ by 2\n", 305 | "- The global sensitivity of $f(x) = 5*x$ is 5, since changing $x$ by 1 changes $f(x)$ by 5\n", 306 | "- The global sensitivity of $f(x) = x*x$ is unbounded, since the change in $f(x)$ depends on the value of $x$\n", 307 | "\n", 308 | "For functions that map datasets to real numbers, we can perform a similar analysis. We will consider the functions which represent common aggregate database queries: counts, sums, and averages." 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "id": "43f401d8-c5e6-409c-9973-6604bac4ef38", 314 | "metadata": {}, 315 | "source": [ 316 | "#### Counting Queries\n", 317 | "\n", 318 | "Counting queries (`COUNT` in SQL) count the number of rows in the dataset which satisfy a specific property. \n", 319 | "\n", 320 | "As a rule of thumb, **counting queries always have a sensitivity of 1**. \n", 321 | "\n", 322 | "This is because adding a row to the dataset can increase the output of the query by at most 1: either the new row has the desired property, and the count increases by 1, or it does not, and the count stays the same (the count may correspondingly decrease when a row is removed)." 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "id": "25dc86d5-b81c-4173-bc05-2625cfc083d3", 328 | "metadata": {}, 329 | "source": [ 330 | "**Example: \"How many people are in the dataset?\"** (sensitivity: 1 - counting rows where the property = True)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "id": "54f12bca-afa9-482d-bf36-9b8b3577a67d", 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "adult.shape[0]" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "id": "f4ddef18-456a-4f87-999b-b7bc4b3129d9", 346 | "metadata": {}, 347 | "source": [ 348 | "**Example: \"How many people have an educational status above 10?\"** (sensitivity: 1 - counting rows with a property)" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "id": "4399cb53-cfd6-4931-bf0c-66a344d67a6f", 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "adult[adult['Education-Num'] > 10].shape[0]" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "id": "635a58cb-4853-4178-8717-61a28688dce3", 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "adult[adult['Name'] == 'Joe Near'].shape[0]" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "id": "9faaef93-ce65-4446-969d-2eb16920789f", 374 | "metadata": {}, 375 | "source": [ 376 | "#### Summation Queries\n", 377 | "\n", 378 | "Summation queries (`SUM` in SQL) sum up the *attribute values* of dataset rows." 
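,
        "\n",
        "\n",
        "(Before moving on to summation, here is an end-to-end sketch of a *differentially private* counting query from the subsection above - assuming the `adult` dataframe and the `numpy` import from earlier in this notebook, with a purely illustrative $\epsilon$:)\n",
        "\n",
        "```python\n",
        "# Counting queries have sensitivity 1, so Laplace noise with scale 1/epsilon suffices.\n",
        "epsilon = 0.1\n",
        "true_count = adult[adult['Education-Num'] > 10].shape[0]\n",
        "dp_count = true_count + np.random.laplace(loc=0, scale=1/epsilon)\n",
        "print(true_count, round(dp_count))\n",
        "```"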
379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "id": "ea93f080-7116-4265-856d-4bd3c30e4838", 384 | "metadata": {}, 385 | "source": [ 386 | "**Example: \"What is the sum of the ages of people with an educational status above 10?\"**" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "id": "6d62a9e8-2024-4844-91f3-404890a7f124", 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "adult[adult['Education-Num'] > 10]['Age'].sum()" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "id": "e37f827f-233a-4038-aae6-d4f1857e8744", 402 | "metadata": {}, 403 | "source": [ 404 | "Sensitivity for these queries is not **as simple as it is for counting queries**. \n", 405 | "\n", 406 | "Adding a new row to the dataset will increase the result of our example query by the *age of the new person*. \n", 407 | "\n", 408 | "That means the sensitivity of the query depends on the **contents** of the row we add.\n", 409 | "\n", 410 | "As a rule of thumb, summation queries have **unbounded sensitivity** when no lower and upper bounds exist on the value of the attribute being summed. \n", 411 | "\n", 412 | "When lower and upper bounds do exist, the sensitivity of a summation query is equal to the **difference between them**. \n", 413 | "\n", 414 | "In the next section, we will see a technique called **clipping** for enforcing bounds when none exist, so that summation queries with unbounded sensitivity can be converted into queries with bounded sensitivity." 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "id": "a8851ab7-acb0-4155-b6a0-480a8bb5312e", 420 | "metadata": {}, 421 | "source": [ 422 | "#### Average Queries\n", 423 | "\n", 424 | "Average queries (`AVG` in SQL) calculate the mean of attribute values in a particular column." 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "id": "bb73ae9a-26cd-4e9e-b303-64f18e404e37", 430 | "metadata": {}, 431 | "source": [ 432 | "**Example: \"What is the average age of people with an educational status above 10?\"**" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": null, 438 | "id": "2fbd47bf-8509-462e-9138-7b3dc57d95c0", 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "adult[adult['Education-Num'] > 10]['Age'].mean()" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "id": "89b79a76-2e13-420a-a3dc-7387f44268a2", 448 | "metadata": {}, 449 | "source": [ 450 | "The easiest way to answer an average query with differential privacy is by re-phrasing it as two queries: a summation query divided by a counting query. For the above example:" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "id": "7cda7c33-ce63-472a-89d3-2f60fec93839", 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [ 460 | "adult[adult['Education-Num'] > 10]['Age'].sum() / adult[adult['Education-Num'] > 10]['Age'].shape[0]" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "id": "9263f787-cb9a-4d82-baa8-57d333abe6a0", 466 | "metadata": {}, 467 | "source": [ 468 | "The sensitivities of both queries can be calculated as described above. \n", 469 | "\n", 470 | "Noisy answers for each can be calculated (e.g. using the Laplace mechanism) and the noisy answers can be divided to obtain a differentially private mean. \n", 471 | "\n", 472 | "The total privacy cost of both queries can be calculated by **sequential composition**." 
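,
        "\n",
        "\n",
        "A sketch of that recipe (illustrative budgets; using a summation sensitivity of 125 assumes ages are clipped to the range $[0, 125]$ - clipping is the topic of the next section):\n",
        "\n",
        "```python\n",
        "# DP mean = noisy sum / noisy count.\n",
        "epsilon_sum, epsilon_count = 0.5, 0.5  # total cost: 0.5 + 0.5 = 1 by sequential composition\n",
        "ages = adult[adult['Education-Num'] > 10]['Age'].clip(lower=0, upper=125)\n",
        "\n",
        "noisy_sum = ages.sum() + np.random.laplace(loc=0, scale=125/epsilon_sum)        # sum sensitivity: 125\n",
        "noisy_count = ages.shape[0] + np.random.laplace(loc=0, scale=1/epsilon_count)  # count sensitivity: 1\n",
        "print(noisy_sum / noisy_count)\n",
        "```"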
473 | ] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "id": "497056eb-09a5-4336-bf25-13387de886dd", 478 | "metadata": {}, 479 | "source": [ 480 | "### Clipping\n", 481 | "\n", 482 | "Queries with unbounded sensitivity cannot be directly answered with differential privacy using the Laplace mechanism. \n", 483 | "\n", 484 | "Fortunately, we can often transform such queries into equivalent queries with *bounded* sensitivity, via a process called **clipping**.\n", 485 | "\n", 486 | "The basic idea behind clipping is to **enforce** upper and lower bounds on attribute values. \n", 487 | "\n", 488 | "> For example, ages above 125 can be \"clipped\" to exactly 125. \n", 489 | "\n", 490 | "After clipping has been performed, we are **guaranteed** that all ages will be 125 or below. \n", 491 | "\n", 492 | "As a result, the sensitivity of a summation query on clipped data is equal to the difference between the upper and lower bounds used in clipping: $upper - lower$. \n", 493 | "\n", 494 | "For example, the following query has a sensitivity of 125:" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": null, 500 | "id": "721081ec-c922-4551-83b5-6317ec12533b", 501 | "metadata": {}, 502 | "outputs": [], 503 | "source": [ 504 | "adult['Age'].clip(lower=0, upper=125).sum()" 505 | ] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "id": "dbdb4402-0310-40f9-91db-ca6af05e3f86", 510 | "metadata": {}, 511 | "source": [ 512 | "The primary challenge in performing clipping is to determine the **upper** and **lower** bounds. \n", 513 | "\n", 514 | "Furthermore, there is a tradeoff between the amount of information lost in clipping and the amount of noise needed to ensure differential privacy. \n", 515 | "\n", 516 | "As a rule of thumb, **try to set the clipping bounds to include 100% of the dataset**, or get as close as possible. This is harder in some domains (e.g. graph queries) than others.\n", 517 | "\n", 518 | "It's tempting to determine the clipping bounds by looking at the data. For example, we can look at the histogram of ages in our dataset to determine an appropriate upper bound:" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "id": "72b94469-12ec-4ab9-a021-ea8fa52a2100", 525 | "metadata": { 526 | "tags": [ 527 | "hide-input" 528 | ] 529 | }, 530 | "outputs": [], 531 | "source": [ 532 | "plt.hist(adult['Age'])\n", 533 | "plt.xlabel('Age')\n", 534 | "plt.ylabel('Number of Records');" 535 | ] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "id": "4289b910-111b-468e-b726-ddf7c0722ecb", 540 | "metadata": {}, 541 | "source": [ 542 | "It's clear from this histogram that nobody in this particular dataset is over 90, so an upper bound of 90 would suffice.\n", 543 | "\n", 544 | "**NOTE**: However, it's important to note that **this approach does not satisfy differential privacy**. \n", 545 | "\n", 546 | "If we pick our clipping bounds by looking at the data, then the bounds themselves might reveal something about the data.\n", 547 | "\n", 548 | "Typically, clipping bounds are decided either by using a property of the dataset that can be known without looking at the data (e.g. 
that the dataset contains ages, which are likely to lie between 0 and 125), or by performing **differentially private queries** to evaluate different choices for the clipping bounds.\n", 549 | "\n", 550 | "**Determining the upper bound with differentially private queries**:\n", 551 | "\n", 552 | "To use the second approach, we typically set the lower bound to 0 and slowly increase the upper bound until the query's output stops changing (meaning we haven't included any new data by increasing the bound). \n", 553 | "\n", 554 | "For example, let's try computing the sum of ages for clipping bounds from 0 to 100, using the Laplace mechanism for each one to ensure differential privacy:" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": null, 560 | "id": "e2fc38a2-4b32-4f7a-afdd-8b01a4ad52b2", 561 | "metadata": { 562 | "tags": [ 563 | "hide-input" 564 | ] 565 | }, 566 | "outputs": [], 567 | "source": [ 568 | "def laplace_mech(v, sensitivity, epsilon):  # add Laplace noise calibrated to sensitivity/epsilon\n", 569 | "    return v + np.random.laplace(loc=0, scale=sensitivity/epsilon)\n", 570 | "\n", 571 | "epsilon_i = .01  # per-query budget: 100 queries -> total epsilon = 1\n", 572 | "plt.plot([laplace_mech(adult['Age'].clip(lower=0, upper=i).sum(), i, epsilon_i) for i in range(100)])\n", 573 | "plt.xlabel('Clipping Bound for Age')\n", 574 | "plt.ylabel('Total Sum');" 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | "id": "eb09538c-800b-4294-b59b-24dacd48aab4", 580 | "metadata": {}, 581 | "source": [ 582 | "The total privacy cost for building this plot is $\epsilon = 1$ by sequential composition, since we do 100 queries each with $\epsilon_i = 0.01$. It's clear that the results level off around a value of `upper = 80`, so this is a good choice for the clipping bound.\n", 583 | "\n", 584 | "One refinement that **can work well when the scale of the data is not known** is to test upper bounds according to a logarithmic scale." 585 | ] 586 | } 587 | ], 588 | "metadata": { 589 | "kernelspec": { 590 | "display_name": "Python 3 (ipykernel)", 591 | "language": "python", 592 | "name": "python3" 593 | }, 594 | "language_info": { 595 | "codemirror_mode": { 596 | "name": "ipython", 597 | "version": 3 598 | }, 599 | "file_extension": ".py", 600 | "mimetype": "text/x-python", 601 | "name": "python", 602 | "nbconvert_exporter": "python", 603 | "pygments_lexer": "ipython3", 604 | "version": "3.12.3" 605 | } 606 | }, 607 | "nbformat": 4, 608 | "nbformat_minor": 5 609 | } 610 | --------------------------------------------------------------------------------