├── logos
│   ├── ssi_logo_small.png
│   └── openmined_logo_small.png
├── 4-remote-data-science
│   ├── private_ai.png
│   ├── syft_workflow.png
│   ├── syft_ds_workflow.png
│   └── private_ai_courses.png
├── 2-ml-models-attacks
│   ├── lenet_mnist_model.pth
│   ├── mia_reconstruction.png
│   ├── models.py
│   ├── train.py
│   ├── 3-MIA-Reconstruction.ipynb
│   ├── 2-MIA-Training.ipynb
│   ├── dataset.py
│   └── 1-FSGM-Attack.ipynb
├── ppml_requirements.txt
├── environment.yml
├── .gitignore
├── setup.md
├── Get-Ready.ipynb
├── README.md
├── 3-differential-privacy
│   ├── 6-MIA-Reconstruction-OPACUS.ipynb
│   ├── 5-MIA-Training-OPACUS.ipynb
│   ├── 2-approx-differential-privacy.ipynb
│   ├── 1-differential-privacy.ipynb
│   └── 3-properties-differential-privacy.ipynb
├── LICENSE
└── 1-data-anonimisation
    ├── 3-k-anonimity.ipynb
    └── 2-de-identification.ipynb

/logos/ssi_logo_small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leriomaggio/ppml-tutorial/main/logos/ssi_logo_small.png
--------------------------------------------------------------------------------
/logos/openmined_logo_small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leriomaggio/ppml-tutorial/main/logos/openmined_logo_small.png
--------------------------------------------------------------------------------
/4-remote-data-science/private_ai.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leriomaggio/ppml-tutorial/main/4-remote-data-science/private_ai.png
--------------------------------------------------------------------------------
/2-ml-models-attacks/lenet_mnist_model.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leriomaggio/ppml-tutorial/main/2-ml-models-attacks/lenet_mnist_model.pth
--------------------------------------------------------------------------------
/4-remote-data-science/syft_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leriomaggio/ppml-tutorial/main/4-remote-data-science/syft_workflow.png
--------------------------------------------------------------------------------
/2-ml-models-attacks/mia_reconstruction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leriomaggio/ppml-tutorial/main/2-ml-models-attacks/mia_reconstruction.png
--------------------------------------------------------------------------------
/4-remote-data-science/syft_ds_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leriomaggio/ppml-tutorial/main/4-remote-data-science/syft_ds_workflow.png
--------------------------------------------------------------------------------
/4-remote-data-science/private_ai_courses.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leriomaggio/ppml-tutorial/main/4-remote-data-science/private_ai_courses.png
--------------------------------------------------------------------------------
/ppml_requirements.txt:
--------------------------------------------------------------------------------
1 | hagrid<0.3.122
2 | ipykernel>=6.19
3 | ipython>=8.12
4 | jupyter==1.0.0
5 | jupyterlab>=3.6
6 | notebook>=6.5
7 | opacus>=1.4
8 | opendp>=0.9.2
9 | pandas>=1.5
10 | pillow>=9.4
11 | pip>=23.1
12 | pydantic>=1.10
13 | torch>=1.13
14 | scikit-learn>=1.2.2
15 | scipy>=1.10
16 | setuptools>=67.8.0
17 | syft==0.8.6
18 | torchvision>=0.14.1
19 | tqdm>=4.65.0
20 | notexbook-theme==2.0.1
21 | phe>=1.5
22 | matplotlib>=3.7
23 | numpy>=1.24
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: ppml
2 | dependencies:
3 |   - ipykernel>=6.19
4 |   - ipython>=8.12
5 |   - jupyter=1.0.0
6 |   - jupyterlab>=3.6
7 |   - matplotlib>=3.7
8 |   - notebook>=6.5.4
9 |   - numpy>=1.24
10 |   - pandas>=1.5
11 |   - pillow>=9.4
12 |   - pip>=23.1
13 |   - python=3.11
14 |   - pytorch::pytorch>=1.13
15 |   - scikit-learn>=1.2
16 |   - scipy>=1.10
17 |   - setuptools>=67.8.0
18 |   - pytorch::torchvision>=0.14.1
19 |   - tqdm>=4.65.0
20 |   - grpcio>=1.48,<1.52
21 |   - pip:
22 |     - opacus>=1.4
23 |     - opendp>=0.9.2
24 |     - pydantic>=1.10
25 |     - notexbook-theme==2.0.1
26 |     - phe==1.5.0
--------------------------------------------------------------------------------
/2-ml-models-attacks/models.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | 
3 | 
4 | class SoftmaxRegression(nn.Module):
5 |     """Softmax Regression Classifier.
6 | 
7 |     This classifier is a generalization of logistic regression that
8 |     allows the class variable to take more than two values—in our case,
9 |     there are 40 individuals in the dataset, so the classifier needs to
10 |     distinguish between 40 labels.
11 |     Softmax regression is often used as the final layer in deep neural network
12 |     architectures, so on its own this classifier can be seen as a
13 |     neural network with no hidden layers.
14 | 
15 |     Extracted from: https://dl.acm.org/doi/pdf/10.1145/2810103.2813677
16 |     """
17 | 
18 |     def __init__(self, in_features: int = 112 * 92, n_classes: int = 40):
19 |         super(SoftmaxRegression, self).__init__()
20 |         self.regression = nn.Linear(in_features, n_classes)
21 | 
22 |     def forward(self, x):
23 |         x = self.regression(x)
24 |         return nn.LogSoftmax(dim=1)(x)
25 | 
26 | 
27 | class MLP(nn.Module):
28 |     """Multilayer Perceptron Network.
29 | 
30 |     A multilayer perceptron network model with one hidden layer
31 |     of 3000 sigmoid neurons (or units), and a softmax output layer.
32 |     This classifier can be understood as performing
33 |     softmax regression after first applying a non-linear transformation
34 |     to the feature vector.
35 |     The point of this transformation, which corresponds to the hidden layer,
36 |     is to map the feature vector into a lower-dimensional space in which
37 |     the classes are separable by the softmax output layer.
38 | 
39 |     Adapted from: https://dl.acm.org/doi/pdf/10.1145/2810103.2813677
40 |     """
41 | 
42 |     def __init__(self, in_features: int = 112 * 92, n_classes: int = 40):
43 |         super(MLP, self).__init__()
44 |         self.hidden = nn.Linear(in_features, 3000)
45 |         self.prediction = nn.Linear(3000, n_classes)
46 | 
47 |     def forward(self, x):
48 |         x = self.hidden(x)
49 |         x = nn.Sigmoid()(x)
50 |         x = self.prediction(x)
51 |         return nn.LogSoftmax(dim=1)(x)
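52 | 
53 | # Usage sketch (illustrative only, not part of the original module): both
54 | # models expect flattened 112x92 grayscale ORL face images and return
55 | # log-probabilities over the 40 subjects, which is why training pairs them
56 | # with NLLLoss.
57 | #
58 | # >>> import torch
59 | # >>> model = MLP()
60 | # >>> out = model(torch.randn(8, 112 * 92))  # batch of 8 dummy images
61 | # >>> out.shape
62 | # torch.Size([8, 40])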
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # OSX Rubbish
2 | .DS_Store
3 | # any dataset folder is ignored
4 | */data/
5 | data/
6 | # checkpoints/ # re-established
7 | 
8 | # Byte-compiled / optimized / DLL files
9 | __pycache__/
10 | *.py[cod]
11 | *$py.class
12 | 
13 | # C extensions
14 | *.so
15 | 
16 | # Distribution / packaging
17 | .Python
18 | build/
19 | develop-eggs/
20 | dist/
21 | downloads/
22 | eggs/
23 | .eggs/
24 | lib/
25 | lib64/
26 | parts/
27 | sdist/
28 | var/
29 | wheels/
30 | pip-wheel-metadata/
31 | share/python-wheels/
32 | *.egg-info/
33 | .installed.cfg
34 | *.egg
35 | MANIFEST
36 | 
37 | # PyInstaller
38 | # Usually these files are written by a python script from a template
39 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
40 | *.manifest
41 | *.spec
42 | 
43 | # Installer logs
44 | pip-log.txt
45 | pip-delete-this-directory.txt
46 | 
47 | # Unit test / coverage reports
48 | htmlcov/
49 | .tox/
50 | .nox/
51 | .coverage
52 | .coverage.*
53 | .cache
54 | nosetests.xml
55 | coverage.xml
56 | *.cover
57 | *.py,cover
58 | .hypothesis/
59 | .pytest_cache/
60 | 
61 | # Translations
62 | *.mo
63 | *.pot
64 | 
65 | # Django stuff:
66 | *.log
67 | local_settings.py
68 | db.sqlite3
69 | db.sqlite3-journal
70 | 
71 | # Flask stuff:
72 | instance/
73 | .webassets-cache
74 | 
75 | # Scrapy stuff:
76 | .scrapy
77 | 
78 | # Sphinx documentation
79 | docs/_build/
80 | 
81 | # PyBuilder
82 | target/
83 | 
84 | # Jupyter Notebook
85 | .ipynb_checkpoints
86 | 
87 | # IPython
88 | profile_default/
89 | ipython_config.py
90 | 
91 | # pyenv
92 | .python-version
93 | 
94 | # pipenv
95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
98 | # install all needed dependencies.
99 | #Pipfile.lock
100 | 
101 | # PEP 582; used by e.g.
github.com/David-OConnor/pyflow
102 | __pypackages__/
103 | 
104 | # Celery stuff
105 | celerybeat-schedule
106 | celerybeat.pid
107 | 
108 | # SageMath parsed files
109 | *.sage.py
110 | 
111 | # Environments
112 | .env
113 | .venv
114 | env/
115 | venv/
116 | ENV/
117 | env.bak/
118 | venv.bak/
119 | 
120 | # Spyder project settings
121 | .spyderproject
122 | .spyproject
123 | 
124 | # Rope project settings
125 | .ropeproject
126 | 
127 | # mkdocs documentation
128 | /site
129 | 
130 | # mypy
131 | .mypy_cache/
132 | .dmypy.json
133 | dmypy.json
134 | 
135 | # Pyre type checker
136 | .pyre/
--------------------------------------------------------------------------------
/setup.md:
--------------------------------------------------------------------------------
1 | ## Preamble
2 | 
3 | To run the code included in this tutorial, we will leverage a pretty "standard" Python/PyData stack:
4 | `numpy`, `pandas`, `matplotlib`, and `scikit-learn` for all the data science and Machine learning parts,
5 | and `pytorch` (w/ `torchvision`) for the Deep Learning examples.
6 | 
7 | Moreover, a few **extra** / specialised packages will also be featured:
8 | - [PySyft](https://github.com/OpenMined/PySyft): A platform for Remote Data Science
9 | - [Opacus](https://opacus.ai): A library to train PyTorch models with differential privacy
10 | - [PHE](https://pypi.org/project/phe/): A Python 3 library implementing the Paillier Partially Homomorphic Encryption scheme
11 | 
12 | As for the Python version/distribution: any Python 3.10+ version should be fine.
13 | 
14 | The [repository](http://github.com/leriomaggio/ppml-tutorial) contains the files to
15 | recreate the Python environment with all the required packages, whether you are using [**Miniconda**](https://docs.anaconda.com/free/miniconda/index.html) (i.e. [`environment.yml`](http://github.com/leriomaggio/ppml-tutorial/environment.yml)) or
16 | a standard Python distribution (i.e. [`ppml_requirements.txt`](http://github.com/leriomaggio/ppml-tutorial/ppml_requirements.txt)).
17 | 
18 | ## Set up the Environment
19 | 
20 | **Before we start**:
21 | 
22 | All the instructions reported below assume the **Terminal**,
23 | and hence the command-line interface, to run all the commands.
24 | 
25 | Similarly, the instructions to recreate the environment will use
26 | [`pyenv`](https://github.com/pyenv/pyenv) and [`pyenv-virtualenv`](https://github.com/pyenv/pyenv-virtualenv)
27 | to install the Python distribution, and to create the environment.
28 | 
29 | If you haven't already, let's download (or `git clone`) the current repository on your local computer:
30 | 
31 | ```bash
32 | git clone https://github.com/leriomaggio/ppml-tutorial
33 | cd ppml-tutorial
34 | ```
35 | 
36 | **Setup the environment**:
37 | 
38 | The repository contains a `ppml_requirements.txt` file that can be used to automatically recreate the
39 | environment with all the required packages.
40 | 
41 | First, let's install the Python version we want to use. We will be using `Python 3.12`:
42 | 
43 | 
44 | ```bash
45 | pyenv install 3.12
46 | ```
47 | 
48 | Once this is complete, you should have Python 3.12 (and its `pyenv` shims) available on your system.
49 | 
50 | The next step is to point to this version of the interpreter when creating the new virtual environment.
51 | 
52 | ```bash
53 | pyenv virtualenv 3.12 ppml
54 | ```
55 | 
56 | This will create a new virtual environment called `ppml`. We now need to **activate** the environment:
57 | 
58 | ```bash
59 | pyenv activate ppml
60 | ```
61 | 
62 | **Finally**, you can install all the required packages using `pip`:
63 | 
64 | ```bash
65 | pip install -r ppml_requirements.txt
66 | ```
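
**Alternatively (Miniconda)**: if you would rather use `conda`, the [`environment.yml`](http://github.com/leriomaggio/ppml-tutorial/environment.yml) file mentioned in the Preamble should recreate the same environment. This is a sketch of the standard conda workflow (the environment is named `ppml` in that file), not something the original instructions spell out:

```bash
conda env create -f environment.yml
conda activate ppml
```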
67 | 
68 | ### Well Done! 🎉
69 | 
70 | ## Test your Environment
71 | 
72 | If you followed all the steps reported in the previous section to set up your local machine, you should be ready to
73 | proceed with **testing your environment**.
74 | 
75 | To do so, please open the `Get-Ready.ipynb` notebook to check that everything works properly on your end:
76 | 
77 | ```bash
78 | jupyter lab Get-Ready.ipynb
79 | ```
--------------------------------------------------------------------------------
/Get-Ready.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "id": "66212187",
7 | "metadata": {},
8 | "source": [
9 | "# Test your Environment"
10 | ]
11 | },
12 | {
13 | "attachments": {},
14 | "cell_type": "markdown",
15 | "id": "45d9726a",
16 | "metadata": {},
17 | "source": [
18 | "In this notebook you will find the few (and simple) steps required to test the environment needed to execute all the code examples in the tutorial."
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 1,
24 | "id": "066ef2c5",
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "from collections import namedtuple\n",
29 | "\n",
30 | "Package = namedtuple(\"Package\", [\"name\", \"major\", \"minor\"])"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 2,
36 | "id": "fab6146c",
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "PKGS_INFO = {\n",
41 | "    'matplotlib': Package(name=\"matplotlib\", major=3, minor=7),\n",
42 | "    'numpy': Package(\"numpy\", 1, 24),\n",
43 | "    'pandas': Package(\"pandas\", 1, 5),\n",
44 | "    'pytorch': Package(\"torch\", 1, 13),\n",
45 | "    'scikit-learn': Package(\"sklearn\", 1, 2),\n",
46 | "    'scipy': Package(\"scipy\", 1, 10),\n",
47 | "    'torchvision': Package(\"torchvision\", 0, 14),\n",
48 | "    'opacus': Package(\"opacus\", 1, 4),\n",
49 | "    'phe': Package(\"phe\", 1, 5),\n",
50 | "    'syft': Package(\"syft\", 0, 8)\n",
51 | "}"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 3,
57 | "id": "26f02be3",
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "from importlib import import_module\n",
62 | "\n",
63 | "def check_version(name: str, package: Package) -> None:\n",
64 | "    module = import_module(package.name)\n",
65 | "    print(f\"Import {name.title()}: \", end=\" \")\n",
66 | "    version = module.__version__\n",
67 | "    major, minor, *_ = tuple(map(int, version.split(\".\")))\n",
68 | "    check = (major, minor) >= (package.major, package.minor)\n",
69 | "    if not check:\n",
70 | "        print(\"FAIL\")\n",
71 | "        print(f\"In this tutorial we will be using {name} >= {package.major}.{package.minor} - found {version} instead\")\n",
72 | "    else:\n",
73 | "        print(\"OK\")\n",
74 | "\n",
75 | "\n",
76 | "def check_package_versions():\n",
77 | "    for name, package in PKGS_INFO.items():\n",
78 | "        check_version(name=name, package=package)\n"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 4,
84 | "id": "3ddf73ab",
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "name": "stdout",
89 | "output_type": "stream",
90 | "text": [
91 | "Import Matplotlib: OK\n",
OK\n", 93 | "Import Pandas: OK\n", 94 | "Import Pytorch: OK\n", 95 | "Import Scikit-Learn: OK\n", 96 | "Import Scipy: OK\n", 97 | "Import Torchvision: OK\n", 98 | "Import Opacus: OK\n", 99 | "Import Phe: OK\n", 100 | "Import Syft: OK\n" 101 | ] 102 | } 103 | ], 104 | "source": [ 105 | "check_package_versions()" 106 | ] 107 | } 108 | ], 109 | "metadata": { 110 | "kernelspec": { 111 | "display_name": "Python 3 (ipykernel)", 112 | "language": "python", 113 | "name": "python3" 114 | }, 115 | "language_info": { 116 | "codemirror_mode": { 117 | "name": "ipython", 118 | "version": 3 119 | }, 120 | "file_extension": ".py", 121 | "mimetype": "text/x-python", 122 | "name": "python", 123 | "nbconvert_exporter": "python", 124 | "pygments_lexer": "ipython3", 125 | "version": "3.12.3" 126 | } 127 | }, 128 | "nbformat": 4, 129 | "nbformat_minor": 5 130 | } 131 | -------------------------------------------------------------------------------- /2-ml-models-attacks/train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as th 3 | from torch.utils.data import DataLoader 4 | from pathlib import Path 5 | from typing import Tuple 6 | from tqdm.notebook import tqdm 7 | from sklearn.metrics import accuracy_score 8 | 9 | 10 | def train( 11 | model: th.nn.Module, 12 | optimiser: th.optim.Optimizer, 13 | loaders: Tuple[DataLoader, DataLoader], 14 | epochs: int = 100, 15 | model_name: str = None, 16 | verbose: bool = False, 17 | ): 18 | """Simple Training/Validation Loop using the input model, and the pair of data loaders 19 | for training and validation, respectively. 20 | 21 | model: 22 | The target PyTorch model (nn.Module) to train 23 | optimiser: 24 | The model optimiser holding reference to model's parameters 25 | loaders: Tuple[DataLoader] 26 | Pair of Dataloader for training and validation data, respectively. 27 | epochs: int (default 100) 28 | Total number of training epoch 29 | model_name: str (default "") 30 | The name of the trained model - used mainly to name the checkpoint file 31 | that will be saved. If no name will be provided, the default 32 | `model.__class__.__name__.lower()` will be used. 33 | verbose: bool (default False) 34 | Verbosity of the report. If True, the Accuracy of each epoch will be printed. 35 | If not, only validation accuracy will be shown. 36 | """ 37 | if model_name is None or not model_name: 38 | model_name = model.__class__.__name__.lower() 39 | 40 | train_loader, test_loader = loaders 41 | device = th.device("cuda" if th.cuda.is_available() else "mps" if th.backends.mps.is_available() else "cpu") 42 | print(f"Using {device} Device") 43 | # move model to the selected device, in case 44 | model = model.to(device) 45 | # both models uses LogSoftmax already! 
46 |     criterion = th.nn.NLLLoss()
47 | 
48 |     best_validation_accuracy = 0
49 |     checkpoint_folder = Path("./checkpoints")
50 |     checkpoint_folder.mkdir(exist_ok=True)
51 | 
52 |     for epoch in tqdm(range(epochs), desc="Epochs"):
53 |         running_loss_pred, training_acc = _step(
54 |             train_loader, model, optimiser, criterion, device, is_training=True
55 |         )
56 |         if verbose:
57 |             print(f"Prediction: {running_loss_pred}; Training ACC: {training_acc}")
58 | 
59 |         # run validation every 10 epochs
60 |         if (epoch + 1) % 10 == 0:
61 |             _, valid_acc = _step(
62 |                 test_loader, model, optimiser, criterion, device, is_training=False
63 |             )
64 |             if verbose:
65 |                 print(f"Validation ACC: {valid_acc}")
66 |             if best_validation_accuracy < valid_acc:
67 |                 if verbose:
68 |                     print("Saving Best Model Checkpoint")
69 |                 chk_path = checkpoint_folder / f"{model_name}.pt"
70 |                 print(chk_path)
71 |                 th.save(model.state_dict(), chk_path)
72 |                 best_validation_accuracy = valid_acc
73 |                 print(f"Best Validation ACC: {valid_acc}")
74 | 
75 | 
76 | def _step(loader, model, optimiser, criterion, device, is_training: bool):
77 |     samples_count = 0
78 |     running_loss_pred = 0.0
79 |     y_true, y_pred = list(), list()
80 |     for batch in loader:
81 |         images, subject_ids = batch
82 |         images = images.view(-1, 112 * 92).to(device)
83 |         subject_ids = subject_ids.to(device)
84 |         samples_count += len(images)
85 | 
86 |         # zero the gradient
87 |         model.zero_grad()
88 |         optimiser.zero_grad()
89 | 
90 |         with th.set_grad_enabled(is_training):
91 |             out = model(images)
92 |             loss = criterion(out, subject_ids)
93 |             _, preds = th.max(out, 1)
94 | 
95 |             if is_training:
96 |                 loss.backward()
97 |                 optimiser.step()
98 |                 running_loss_pred += loss.item()
99 | 
100 |         y_pred.append(preds.detach().cpu().numpy())
101 |         y_true.append(subject_ids.detach().cpu().numpy())
102 | 
103 |     if is_training:
104 |         running_loss_pred /= samples_count
105 |     y_pred = np.hstack(y_pred)
106 |     y_true = np.hstack(y_true)
107 |     step_acc = accuracy_score(y_true, y_pred)
108 |     return running_loss_pred, step_acc
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PPML: Machine Learning on Data you cannot see
2 | 
3 | Repository for the [tutorial](https://2024.pycon.it/en/submission/ryez) on **Privacy-Preserving Machine Learning** (`PPML`) presented at [PyCon Italia 2024](https://2024.pycon.it)
4 | 
5 | ## Intro
6 | 
7 | Privacy guarantees are **the** most crucial requirement when it comes to analysing sensitive data. These requirements can sometimes be so stringent that they become a real barrier for the entire pipeline. Reasons for this are manifold, and involve the fact that data often cannot be _shared_ nor moved from their silos of residence, let alone analysed in their _raw_ form. As a result, _data anonymisation techniques_ are sometimes used to generate a sanitised version of the original data. However, these techniques alone are not enough to guarantee that privacy will be completely preserved. Moreover, the _memorisation_ effect of Deep learning models could be maliciously exploited to _attack_ the models, and _reconstruct_ sensitive information about samples used in training, even if this information was not originally provided.
8 | 
9 | *Privacy-preserving machine learning* (PPML) methods hold the promise to overcome all those issues, allowing us to train machine learning models with full privacy guarantees.
10 | 
11 | This workshop is organised in **three** main parts. In the first part, we will introduce the main privacy threats to
12 | data and machine learning models (e.g. the _membership inference attack_).
13 | In the second part, we will work our way towards **differential privacy**: what it is, how this method works, and
14 | how differential privacy can be used with Machine learning.
15 | Lastly, we will conclude the tutorial by considering more complex ML scenarios to train Deep learning networks on encrypted data, with specialised _distributed_ settings for **remote analytics**.
16 | 
17 | ### Outline
18 | 
19 | - **Introduction**: Brief Intro to `PPML` and to the workshop (`10 mins`) [SLIDES](https://speakerdeck.com/leriomaggio/ppml-pyconit24)
20 | 
21 | - **Part 1**: Data and ML models Threats (`45 mins`)
22 |   - De-identification
23 |   - K-anonymity and its limitations
24 |   - ML Model vulnerabilities: Adversarial Examples and _inference attacks_
25 | 
26 | - **Part 2**: Short Introduction to Differential Privacy (`45 mins`)
27 | 
28 |   - Intro to Differential Privacy
29 |   - Properties of Differential Privacy
30 |   - DL training with Differential Privacy
31 | 
32 | - **Break** (`5 mins`)
33 | 
34 | - **Part 3**: Primer on Remote Data Science & PySyft (`25 mins`)
35 |   - Intro to Federated Learning
36 |   - DL training on (Homomorphically) Encrypted Data
37 |   - Remote Data Science using PySyft
38 | 
39 | 
40 | ## Get the material
41 | 
42 | Clone the current repository by running the following instructions:
43 | 
44 | ```bash
45 | cd $HOME  # This will make sure you'll be in your HOME folder
46 | git clone https://github.com/leriomaggio/ppml-tutorial.git
47 | ```
48 | 
49 | **Note**: This will create a new folder named `ppml-tutorial`. Move into this folder by typing:
50 | 
51 | ```bash
52 | cd ppml-tutorial
53 | ```
54 | 
55 | Well done! Now you should be in the right location.
56 | Bear with me for another few seconds, and follow the instructions reported below 🙏
57 | 
58 | ## Installation Instructions
59 | 
60 | All the materials in this tutorial (code, and lecture notes) are made available as
61 | Jupyter notebooks.
62 | 
63 | **(1)** There is no specific _hardware requirement_ to execute the code, i.e. running everything
64 | on your laptop should be more than fine 😊.
65 | 
66 | **(2)** As for the _software requirements_, we will be using a pretty standard Python/PyData stack:
67 | `numpy`, `pandas`, `matplotlib`, and `scikit-learn` for all the data science and Machine learning parts,
68 | along with `pytorch` and `torchvision` to work on the Deep Learning examples.
69 | 
70 | Moreover, a few **extra** / specialised packages will also be featured:
71 | - [PySyft](https://github.com/OpenMined/PySyft): A platform for Remote Data Science
72 | - [Opacus](https://opacus.ai): A library to train PyTorch models with differential privacy
73 | - [PHE](https://pypi.org/project/phe/): A Python 3 library implementing the Paillier Partially Homomorphic Encryption scheme
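
To give a quick taste of what the last package enables - arithmetic on encrypted numbers, which is what makes working on (homomorphically) encrypted data possible - here is a minimal sketch based on `phe`'s documented keypair/encrypt/decrypt API (the numbers are made up):

```python
from phe import paillier

# generate a Paillier keypair, encrypt two numbers, and add them
# while they are still encrypted
public_key, private_key = paillier.generate_paillier_keypair()
enc_a, enc_b = public_key.encrypt(3.14), public_key.encrypt(2.0)

enc_sum = enc_a + enc_b  # addition happens directly on the ciphertexts
print(private_key.decrypt(enc_sum))  # ~5.14
```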
74 | 
75 | Please refer to the [`setup.md`](./setup.md) document for step-by-step instructions to set up the environment
76 | on your computer.
77 | 
78 | If you spot any error/mistake, please feel free to reach out directly to [me](mailto:valerio@openmined.org?subject=PPML%20SciPy23%20Issue), or to open an [Issue](http://github.com/leriomaggio/ppml-tutorial/issues)
79 | on the repository.
80 | 
81 | Any feedback will be very much appreciated!
82 | 
83 | Thank you! 🙏
84 | 
85 | ## Colophon
86 | 
87 | **Author**: Valerio Maggio ([`@leriomaggio`](https://twitter.com/leriomaggio)),
88 | Researcher, [SSI Fellow](https://www.software.ac.uk/about/fellows/valerio-maggio),
89 | and Education Lead at OpenMined.
90 | 
91 | All the **Code** material is distributed under the terms of the Apache License. See the [LICENSE](./LICENSE) file for additional details.
92 | 
93 | All the instructional materials in this repository are free to use, and made available under the [Creative Commons Attribution
94 | license](https://creativecommons.org/licenses/by/4.0/). The following is a human-readable summary of (and not a substitute for) the [full legal text of the CC BY 4.0
95 | license](https://creativecommons.org/licenses/by/4.0/legalcode).
96 | 
97 | You are free:
98 | 
99 | * to **Share**---copy and redistribute the material in any medium or format
100 | * to **Adapt**---remix, transform, and build upon the material
101 | 
102 | for any purpose, even commercially.
103 | 
104 | The licensor cannot revoke these freedoms as long as you follow the
105 | license terms.
106 | 
107 | Under the following terms:
108 | 
109 | * **Attribution** --- You must give appropriate credit, provide a link to the
110 |   [LICENSE](https://github.com/leriomaggio/ppml-tutorial/LICENSE) ([`cc-by-human`](https://creativecommons.org/licenses/by/4.0/)),
111 |   and indicate if changes were made.
112 |   You may do so in any reasonable manner, but not in any way that suggests the
113 |   licensor endorses you or your use.
114 | 
115 | * **No additional restrictions** --- You may not apply legal terms or
116 |   technological measures that legally restrict others from doing
117 |   anything the license permits.
118 | 
119 | ### Acknowledgment and funding
120 | 
121 | The material developed in this tutorial has been supported by the [Software Sustainability Institute](https://www.software.ac.uk) (SSI), as part of my
122 | [SSI fellowship](https://www.software.ac.uk/about/fellows/valerio-maggio) on `PETs` (Privacy Enhancing Technologies).
123 | 
124 | Please see this [deck](https://speakerdeck.com/leriomaggio/privacy-enhancing-data-science-ssi-fellowship-2022) to know more about my fellowship plans.
125 | 
126 | A public shout out to all the people at [OpenMined](https://www.openmined.org) for all the encouragement and support in the preparation of this tutorial.
127 | I hope the material in this repository can contribute to raising awareness about all the amazing work on PETs that is being contributed to the Open Source and Python communities.
128 | 
129 | ![OpenMined](./logos/openmined_logo_small.png "OpenMined")
130 | 
131 | ## Contacts
132 | 
133 | For any questions or doubts, feel free to open an [issue](https://github.com/leriomaggio/ppml-tutorial/issues) in the repository, or drop me an email @ `valerio_at_openmined_dot_org`.
134 | -------------------------------------------------------------------------------- /3-differential-privacy/6-MIA-Reconstruction-OPACUS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "94a85538-8e7e-4771-a5e7-d7b18d3b81bd", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Optional: setup NoTexBook theme\n", 11 | "%load_ext notexbook\n", 12 | "\n", 13 | "%texify" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "f5f5b821", 19 | "metadata": {}, 20 | "source": [ 21 | "# Model Inversion Attack" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "68655027", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import torch as th\n", 32 | "import numpy as np\n", 33 | "\n", 34 | "from matplotlib import pyplot as plt\n", 35 | "\n", 36 | "%matplotlib inline" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "id": "04e011e0-7cd6-4832-a72c-be001f23fee8", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# Tweak to reuse the Python modules defined in previous section\n", 47 | "import sys, os\n", 48 | "from pathlib import Path\n", 49 | "\n", 50 | "sys.path.insert(0, os.path.join(os.path.abspath(os.path.curdir), \"..\", \"2-ml-models-attacks\"))" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "id": "45779d5a", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from dataset import ORLFaces\n", 61 | "from torchvision.transforms import ToTensor" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "id": "3d5132ab", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "# NOTE: This is a hack to get around \"User-agent\" limitations when downloading MNIST datasets\n", 72 | "# see, https://github.com/pytorch/vision/issues/3497 for more information\n", 73 | "from six.moves import urllib\n", 74 | "\n", 75 | "opener = urllib.request.build_opener()\n", 76 | "opener.addheaders = [(\"User-agent\", \"Mozilla/5.0\")]\n", 77 | "urllib.request.install_opener(opener)\n", 78 | "\n", 79 | "from pathlib import Path\n", 80 | "import os\n", 81 | "\n", 82 | "DATA_FOLDER = Path(os.path.join(os.path.abspath(os.path.curdir), \"..\")) / \"data\"" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "id": "4db6abf3", 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "orl_faces_train = ORLFaces(\n", 93 | " root=DATA_FOLDER, download=True, split=\"train\", transform=ToTensor()\n", 94 | ")\n", 95 | "orl_faces_test = ORLFaces(root=DATA_FOLDER, download=True, split=\"test\", transform=ToTensor())" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "id": "d0d51644", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "orl_faces_train.data.shape, orl_faces_test.data.shape" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "id": "859989b5", 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "from torch.utils.data import DataLoader\n", 116 | "\n", 117 | "train_loader = DataLoader(\n", 118 | " orl_faces_train, batch_size=32, shuffle=False, drop_last=False\n", 119 | ")" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "id": "c8c16f84", 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "# Reconstruction 
Attack Settings\n", 130 | "# See Paper, Section 5.2 - Reconstruction Attack\n", 131 | "α = 5000\n", 132 | "β = 100\n", 133 | "γ = 0.99\n", 134 | "λ = 0.1" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "id": "39426d25", 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "from models import SoftmaxRegression, MLP" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "id": "34de5f7f", 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "from pathlib import Path\n", 155 | "\n", 156 | "CHECKPOINT_FOLDER = Path(\"./checkpoints/\")\n", 157 | "\n", 158 | "\n", 159 | "def load_weights(model, model_name: str = None) -> th.TensorType:\n", 160 | " if model_name is None or not model_name:\n", 161 | " model_name = model.__class__.__name__.lower()\n", 162 | " w_file = CHECKPOINT_FOLDER / f\"{model_name}.pt\"\n", 163 | " try:\n", 164 | " weights = th.load(open(w_file, \"rb\"))\n", 165 | " except FileNotFoundError:\n", 166 | " print(f\"Model Weights file {w_file} does not exist! Please check.\")\n", 167 | " return None\n", 168 | " return weights" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "id": "d37c65ad", 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "softmax_reg = SoftmaxRegression()\n", 179 | "weights = load_weights(softmax_reg, model_name=\"softmax_reg_opacus_test\")\n", 180 | "\n", 181 | "weights[\"regression.weight\"] = weights[\"_module.regression.weight\"]\n", 182 | "_ = weights.pop(\"_module.regression.weight\")\n", 183 | "\n", 184 | "weights[\"regression.bias\"] = weights[\"_module.regression.bias\"]\n", 185 | "_ = weights.pop(\"_module.regression.bias\")\n", 186 | "\n", 187 | "if weights is not None:\n", 188 | " softmax_reg.load_state_dict(weights)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "id": "81e83c5b", 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "def process(im_flatten):\n", 199 | " max_v = th.max(im_flatten)\n", 200 | " min_v = th.min(im_flatten)\n", 201 | " return (im_flatten - min_v) / (max_v - min_v)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "id": "e52a0ac1", 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "def mi_face(model, target_label):\n", 212 | " aim_tensor = th.zeros(1, 112 * 92)\n", 213 | " aim_tensor.requires_grad = True\n", 214 | "\n", 215 | " lossn_1 = 10\n", 216 | " b = 0\n", 217 | " g = 0\n", 218 | "\n", 219 | " out = model(aim_tensor.detach())\n", 220 | " _, pred = th.max(out, 1)\n", 221 | " print(pred)\n", 222 | " print(f\"original input image {target_label}\")\n", 223 | " plt.imshow(\n", 224 | " np.transpose(aim_tensor.detach().reshape(1, 112, 92).numpy(), (1, 2, 0)),\n", 225 | " cmap=\"Greys\",\n", 226 | " )\n", 227 | " plt.show()\n", 228 | " print(\n", 229 | " f\"original input image predict label {target_label} - predict label: {pred.item()}\"\n", 230 | " )\n", 231 | "\n", 232 | " criterion = th.nn.NLLLoss()\n", 233 | "\n", 234 | " for i in range(α):\n", 235 | " out = model(aim_tensor)\n", 236 | " if aim_tensor.grad is not None:\n", 237 | " aim_tensor.grad.zero_()\n", 238 | " out = out.reshape(1, 40)\n", 239 | " target_class = th.tensor([target_label])\n", 240 | " loss = criterion(out, target_class)\n", 241 | " loss.backward()\n", 242 | " aim_grad = aim_tensor.grad\n", 243 | "\n", 244 | " # SGD Step\n", 245 | " # see 
https://pytorch.org/docs/stable/generated/torch.optim.SGD.html#torch.optim.SGD\n", 246 | " aim_tensor = aim_tensor - (λ * aim_grad)\n", 247 | " aim_tensor = process(aim_tensor)\n", 248 | " aim_tensor = th.clamp(aim_tensor.detach(), 0, 1)\n", 249 | " aim_tensor.requires_grad = True\n", 250 | " if loss >= lossn_1:\n", 251 | " b += 1\n", 252 | " if b > β:\n", 253 | " break\n", 254 | " else:\n", 255 | " b = 0\n", 256 | " lossn_1 = loss\n", 257 | " if loss < γ:\n", 258 | " break\n", 259 | "\n", 260 | " print(f\"Attack completed at {i} iterations\")\n", 261 | " out = model(aim_tensor.detach())\n", 262 | " _, pred = th.max(out, 1)\n", 263 | " print(pred)\n", 264 | " print(f\"inverted image {target_label}\")\n", 265 | " plt.imshow(\n", 266 | " np.transpose(aim_tensor.detach().reshape(1, 112, 92).numpy() * 255, (1, 2, 0)),\n", 267 | " cmap=\"Greys\",\n", 268 | " )\n", 269 | " plt.show()" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "id": "44013f2f", 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "for cl in range(10):\n", 280 | " mi_face(softmax_reg, cl)" 281 | ] 282 | } 283 | ], 284 | "metadata": { 285 | "kernelspec": { 286 | "display_name": "Python 3 (ipykernel)", 287 | "language": "python", 288 | "name": "python3" 289 | }, 290 | "language_info": { 291 | "codemirror_mode": { 292 | "name": "ipython", 293 | "version": 3 294 | }, 295 | "file_extension": ".py", 296 | "mimetype": "text/x-python", 297 | "name": "python", 298 | "nbconvert_exporter": "python", 299 | "pygments_lexer": "ipython3", 300 | "version": "3.12.3" 301 | } 302 | }, 303 | "nbformat": 4, 304 | "nbformat_minor": 5 305 | } 306 | -------------------------------------------------------------------------------- /3-differential-privacy/5-MIA-Training-OPACUS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "0b9e9dde-7628-4d45-a408-afd93dd841ce", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Optional: setup NoTexBook theme\n", 11 | "%load_ext notexbook\n", 12 | "\n", 13 | "%texify" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "dcd69b34", 19 | "metadata": {}, 20 | "source": [ 21 | "# Model Inversion Attack - Model Training" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "id": "885f544c", 27 | "metadata": {}, 28 | "source": [ 29 | "In this notebook we will repeat the same operations done in preparation for the **Model Inversion Attack** (in section 1) \n", 30 | "\n", 31 | "The very **big** difference this time though is that we will be using **Opacus** to train our ML model.\n", 32 | "\n", 33 | "$\\rightarrow$ ‼️ The very **remarkable** thing to notice is **how little** the implementation changes wrt. 
the previous notebook\n",
34 | "(in fact, we will be using the **same** `train` function defined previously)"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "id": "eee64647",
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "import torch as th\n",
45 | "import numpy as np\n",
46 | "\n",
47 | "from matplotlib import pyplot as plt\n",
48 | "\n",
49 | "%matplotlib inline"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "id": "ff722fd0",
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "import warnings\n",
60 | "warnings.simplefilter(\"ignore\")\n",
61 | "\n",
62 | "\n",
63 | "# Tweak to reuse the Python modules defined in previous section\n",
64 | "import sys, os\n",
65 | "from pathlib import Path\n",
66 | "\n",
67 | "sys.path.insert(0, os.path.join(os.path.abspath(os.path.curdir), \"..\", \"2-ml-models-attacks\"))"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "id": "9086c266",
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "from dataset import ORLFaces\n",
78 | "from torchvision.transforms import ToTensor, Grayscale, Compose\n",
79 | "from torch.utils.data import DataLoader"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "id": "bbc48ffb",
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "SEED = 123456\n",
90 | "\n",
91 | "np.random.seed(SEED)\n",
92 | "th.manual_seed(SEED)\n",
93 | "if th.cuda.is_available():\n",
94 | "    th.cuda.manual_seed_all(SEED)\n",
95 | "    th.backends.cudnn.deterministic = True"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "id": "93241bc5",
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "# NOTE: This is a hack to get around \"User-agent\" limitations when downloading MNIST datasets\n",
106 | "# see, https://github.com/pytorch/vision/issues/3497 for more information\n",
107 | "from six.moves import urllib\n",
108 | "\n",
109 | "opener = urllib.request.build_opener()\n",
110 | "opener.addheaders = [(\"User-agent\", \"Mozilla/5.0\")]\n",
111 | "urllib.request.install_opener(opener)\n",
112 | "\n",
113 | "from pathlib import Path\n",
114 | "import os\n",
115 | "\n",
116 | "DATA_FOLDER = Path(os.path.join(os.path.abspath(os.path.curdir), \"..\")) / \"data\""
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "id": "c16625ec",
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "imgs_transform = Compose([Grayscale(num_output_channels=1), ToTensor()])\n",
127 | "\n",
128 | "orl_faces_train = ORLFaces(\n",
129 | "    root=DATA_FOLDER, download=True, split=\"train\", transform=imgs_transform\n",
130 | ")\n",
131 | "orl_faces_test = ORLFaces(\n",
132 | "    root=DATA_FOLDER, download=True, split=\"test\", transform=imgs_transform\n",
133 | ")"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "id": "b9ae6a51",
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "BATCH_SIZE = 32\n",
144 | "\n",
145 | "train_loader = DataLoader(\n",
146 | "    orl_faces_train, batch_size=BATCH_SIZE, shuffle=True, drop_last=False\n",
147 | ")\n",
148 | "test_loader = DataLoader(\n",
149 | "    orl_faces_test, batch_size=BATCH_SIZE, shuffle=False, drop_last=False\n",
150 | ")"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "id": "425c305c",
156 | "metadata": {},
157 | "source": [
158 | "Show some of the training images, for fun"
159 | 
] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "id": "61e794b9", 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "from torchvision.utils import make_grid\n", 169 | "\n", 170 | "\n", 171 | "def imshow(img):\n", 172 | " npimg = img.numpy()\n", 173 | " plt.figure(figsize=(10, 12))\n", 174 | " plt.imshow(np.transpose(npimg, (1, 2, 0)))\n", 175 | " plt.show()" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "id": "6008bf8c", 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "# get some random training images\n", 186 | "images, labels = next(iter(train_loader))" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "id": "8e03a7f6", 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "# show images\n", 197 | "imshow(make_grid(images))\n", 198 | "# print labels\n", 199 | "print(\" \".join(f\"{labels[j]}\" for j in range(BATCH_SIZE)))" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "id": "cdc04e1d", 205 | "metadata": {}, 206 | "source": [ 207 | "## Privacy Parameters and Opacus" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "id": "e0d1c795", 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "from models import SoftmaxRegression, MLP" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "id": "62df081d", 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "softmax_reg = SoftmaxRegression()" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "id": "b5030c0e", 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "from opacus.validators import ModuleValidator\n", 238 | "\n", 239 | "errors = ModuleValidator.validate(softmax_reg, strict=False)\n", 240 | "errors" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "id": "dd545cdb", 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "λ = 0.1 # optimiser learning rate" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "id": "67025880", 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "softmax_reg = SoftmaxRegression()\n", 261 | "softmax_sgd = th.optim.SGD(softmax_reg.parameters(), lr=λ)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "id": "9449cbff", 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "from opacus import PrivacyEngine" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "id": "cb9ff406", 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "from train import train" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "id": "74c6ecf0", 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "MAX_GRAD_NORM = 1.2\n", 292 | "EPSILON = 50\n", 293 | "DELTA = 1e-5\n", 294 | "EPOCHS = 200 # we have increased by 100 the number of epochs of training" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "id": "562a43d0", 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "privacy_engine = PrivacyEngine(accountant=\"gdp\")\n", 305 | "\n", 306 | "softmax_reg, softmax_sgd, train_loader = privacy_engine.make_private_with_epsilon(\n", 307 | " module=softmax_reg,\n", 308 | " optimizer=softmax_sgd,\n", 309 | " 
data_loader=train_loader,\n", 310 | " epochs=EPOCHS,\n", 311 | " target_epsilon=EPSILON,\n", 312 | " target_delta=DELTA,\n", 313 | " max_grad_norm=MAX_GRAD_NORM,\n", 314 | ")\n", 315 | "\n", 316 | "print(f\"Using sigma={softmax_sgd.noise_multiplier} and C={MAX_GRAD_NORM}\")" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "id": "4f9c046a", 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "train(model=softmax_reg, optimiser=softmax_sgd, loaders=(train_loader, test_loader), \n", 327 | " model_name=\"softmax_reg_opacus_test\", verbose=False, epochs=EPOCHS)" 328 | ] 329 | } 330 | ], 331 | "metadata": { 332 | "kernelspec": { 333 | "display_name": "Python 3 (ipykernel)", 334 | "language": "python", 335 | "name": "python3" 336 | }, 337 | "language_info": { 338 | "codemirror_mode": { 339 | "name": "ipython", 340 | "version": 3 341 | }, 342 | "file_extension": ".py", 343 | "mimetype": "text/x-python", 344 | "name": "python", 345 | "nbconvert_exporter": "python", 346 | "pygments_lexer": "ipython3", 347 | "version": "3.12.3" 348 | } 349 | }, 350 | "nbformat": 4, 351 | "nbformat_minor": 5 352 | } 353 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | --------------------------------------------------------------------------------
/3-differential-privacy/2-approx-differential-privacy.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Optional: setup NoTexBook theme\n",
10 | "%load_ext notexbook\n",
11 | "%texify"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "**Adapted from**: [Ch6](https://github.com/uvm-plaid/programming-dp/blob/master/notebooks/ch6.ipynb)"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "# Approximate Differential Privacy\n",
26 | "\n",
27 | "\n",
28 | "Approximate differential privacy, also called $(\epsilon, \delta)$-differential privacy, has the following definition:\n",
29 | "\n",
30 | "\begin{align}\n",
31 | "\mathsf{Pr}[F(x) = S] \leq e^\epsilon \mathsf{Pr}[F(x') = S] + \delta\n",
32 | "\end{align}\n",
33 | "\n",
34 | "The new privacy parameter, $\delta$, represents a \"failure probability\" for the definition. \n",
35 | "\n",
36 | "With probability $1-\delta$, we will get the same guarantee as pure differential privacy; with probability $\delta$, we get no guarantee. \n",
37 | "\n",
38 | "In other words:\n",
39 | "\n",
40 | "- With probability $1-\delta$, $\frac{\mathsf{Pr}[F(x) = S]}{\mathsf{Pr}[F(x') = S]} \leq e^\epsilon$\n",
41 | "- With probability $\delta$, we get no guarantee at all\n",
42 | "\n",
43 | "This definition should seem a little bit scary! \n",
44 | "\n",
45 | "With probability $\delta$, anything at all could happen - including a release of the entire sensitive dataset! \n",
46 | "\n",
47 | "For this reason, we typically require $\delta$ to be very small - usually $\frac{1}{n^2}$ or less, where $n$ is the size of the dataset. \n",
48 | "\n",
49 | "In practice, however, the $(\epsilon, \delta)$-differentially private mechanisms in common use \n",
50 | "don't fail catastrophically - as allowed by the definition - instead, they fail *gracefully*, and don't do terrible things like \n",
51 | "releasing the entire dataset.\n",
52 | "\n",
53 | "\n",
54 | "Catastrophically-failing mechanisms *are* possible, however, and they do satisfy the definition of $(\epsilon, \delta)$-differential privacy."
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "## Properties of Approximate Differential Privacy\n",
62 | "\n",
63 | "Approximate differential privacy has similar properties to pure $\epsilon$-differential privacy. It satisfies **sequential composition**:\n",
64 | "\n",
65 | "- If $F_1(x)$ satisfies $(\epsilon_1, \delta_1)$-differential privacy\n",
66 | "- And $F_2(x)$ satisfies $(\epsilon_2, \delta_2)$-differential privacy\n",
67 | "- Then the mechanism $G(x) = (F_1(x), F_2(x))$ which releases both results satisfies $(\epsilon_1+\epsilon_2, \delta_1 + \delta_2)$-differential privacy\n",
68 | "\n",
69 | "The only difference from the pure $\epsilon$ setting is that we add up the values of $\delta$ as well as the values of $\epsilon$. \n",
70 | "\n",
71 | "Approximate differential privacy also satisfies **post-processing and parallel composition**.\n",
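"\n",
"As a quick numeric sanity check, here is a minimal sketch of the composition bookkeeping (our own addition, not part of the adapted chapter - `compose_sequentially` is a hypothetical helper name):\n",
"\n",
"```python\n",
"def compose_sequentially(budgets):\n",
"    # budgets: a list of (epsilon, delta) pairs, one per mechanism\n",
"    # sequential composition: the totals are simply the sums of both parameters\n",
"    return sum(eps for eps, _ in budgets), sum(delta for _, delta in budgets)\n",
"\n",
"# two releases at (1.0, 1e-5) each compose to (2.0, 2e-05)\n",
"print(compose_sequentially([(1.0, 1e-5), (1.0, 1e-5)]))\n",
"```"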
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": [
78 | "## The Gaussian Mechanism\n",
79 | "\n",
80 | "The Gaussian mechanism is an alternative to the Laplace mechanism, which adds Gaussian noise instead of Laplacian noise. \n",
81 | "\n",
82 | "The Gaussian mechanism does *not* satisfy pure $\epsilon$-differential privacy, but does satisfy $(\epsilon, \delta)$-differential \n",
83 | "privacy.\n",
84 | "\n",
85 | "According to the Gaussian mechanism, for a function $f(x)$ which returns a number, the following definition of $F(x)$ satisfies $(\epsilon, \delta)$-differential privacy:\n",
86 | "\n",
87 | "\begin{align}\n",
88 | "F(x) = f(x) + \mathcal{N}(\sigma^2)\\\n",
89 | "\text{where } \sigma^2 = \frac{2s^2 \log(1.25/\delta)}{\epsilon^2}\n",
90 | "\end{align}\n",
91 | "\n",
92 | "where $s$ is the sensitivity of $f$, and $\mathcal{N}(\sigma^2)$ denotes sampling from the Gaussian (normal) distribution with center 0 and variance $\sigma^2$. Note that here (and elsewhere in these notes), $\log$ denotes the natural logarithm."
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "For real-valued functions $f : D \rightarrow \mathbb{R}$, we can use the Gaussian mechanism in exactly the same way as we do the Laplace mechanism, and it's easy to compare what happens under both mechanisms for a given value of $\epsilon$."
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "metadata": {},
106 | "outputs": [],
107 | "source": [
108 | "import pandas as pd\n",
109 | "import numpy as np\n",
110 | "import matplotlib.pyplot as plt\n",
111 | "plt.style.use('seaborn-v0_8-whitegrid')"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {
118 | "tags": [
119 | "hide-input"
120 | ]
121 | },
122 | "outputs": [],
123 | "source": [
124 | "epsilon = 1\n",
125 | "vals_laplace = [np.random.laplace(loc=0, scale=1/epsilon) for x in range(100000)]\n",
126 | "\n",
127 | "delta = 1e-5  # 10^-5, consistent with the delta quoted in the text below\n",
128 | "sigma = np.sqrt(2 * np.log(1.25 / delta)) * 1 / epsilon\n",
129 | "vals_gauss = [np.random.normal(loc=0, scale=sigma) for x in range(100000)]\n",
130 | "\n",
131 | "plt.hist(vals_laplace, bins=50, label='Laplace')\n",
132 | "plt.hist(vals_gauss, bins=50, alpha=.7, label='Gaussian');\n",
133 | "plt.legend();"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "Here, we graph the empirical probability density function of the Laplace and Gaussian mechanisms for $\epsilon = 1$, with $\delta = 10^{-5}$ for the Gaussian mechanism.\n",
141 | "\n",
142 | "Compared to the Laplace mechanism, the plot for the Gaussian mechanism looks \"squished.\" \n",
143 | "\n",
144 | "Differentially private outputs which are far from the true answer are much more likely using the Gaussian mechanism than they are under the Laplace mechanism (which, by comparison, looks extremely \"pointy\").\n",
145 | "\n",
146 | "So the Gaussian mechanism has two **major drawbacks**:\n",
147 | "\n",
148 | "1. it requires the use of the relaxed $(\epsilon, \delta)$-differential privacy definition, *and* \n",
149 | "2. it's less accurate than the Laplace mechanism. \n",
150 | "\n",
151 | "Why would we want to use it?\n",
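"\n",
"Before answering, here is a minimal sketch of the scalar Gaussian mechanism exactly as defined above (our own helper - `gaussian_mech` is a hypothetical name, not a library function - reusing `np` from the imports):\n",
"\n",
"```python\n",
"def gaussian_mech(true_answer, sensitivity, epsilon, delta):\n",
"    # sigma^2 = 2 s^2 log(1.25/delta) / epsilon^2, per the definition above\n",
"    sigma = np.sqrt(2 * sensitivity**2 * np.log(1.25 / delta)) / epsilon\n",
"    return true_answer + np.random.normal(loc=0, scale=sigma)\n",
"\n",
"# e.g. a counting query (sensitivity 1) whose true answer is 100,\n",
"# released with (1.0, 1e-5)-differential privacy\n",
"gaussian_mech(100, sensitivity=1, epsilon=1.0, delta=1e-5)\n",
"```"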
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "## Vector-Valued Functions and their Sensitivities\n",
159 | "\n",
160 | "So far, we have only considered real-valued functions (i.e. the function's output is always a single real number). \n",
161 | "\n",
162 | "Such functions are of the form $f : D \rightarrow \mathbb{R}$. \n",
163 | "\n",
164 | "Both the Laplace and Gaussian mechanisms, however, can be extended to *vector-valued* functions of the form $f : D \rightarrow \mathbb{R}^k$, which return vectors of real numbers. \n",
165 | "\n",
166 | "We can think of histograms as vector-valued functions, which return a vector whose elements consist of histogram bin counts.\n",
167 | "\n",
168 | "We saw earlier that the *sensitivity* of a function is:\n",
169 | "\n",
170 | "\begin{align}\n",
171 | "GS(f) = \max_{d(x,x') \leq 1} \lvert f(x) - f(x') \rvert\n",
172 | "\end{align}\n",
173 | "\n",
174 | "How do we define sensitivity for vector-valued functions?\n",
175 | "\n",
176 | "Consider the expression $f(x) - f(x')$. \n",
177 | "\n",
178 | "If $f$ is a vector-valued function, then this expression represents the difference between two vectors, which can be computed as the difference between their corresponding elements (the difference of two length-$k$ vectors is thus a new length-$k$ vector). \n",
179 | "\n",
180 | "This new vector is the distance between $f(x)$ and $f(x')$, represented as a vector.\n",
181 | "\n",
182 | "The magnitude of this vector is the sensitivity of $f$. \n",
183 | "\n",
184 | "There are several ways to compute the magnitude of a vector; we'll use two of them: the $L1$ norm and the $L2$ norm.\n",
185 | "\n",
186 | "### L1 and L2 Norms\n",
187 | "\n",
188 | "The $L1$ norm of a vector $V$ of length $k$ is defined as $\lVert V \rVert_1 = \sum_{i=1}^k \lvert V_i \rvert$ (i.e. it's the sum of the absolute values of the vector's elements). In 2-dimensional space, the $L1$ norm of the difference between two vectors yields the \"Manhattan distance\" between them.\n",
189 | "\n",
190 | "The $L2$ norm of a vector $V$ of length $k$ is defined as $\lVert V \rVert_2 = \sqrt{\sum_{i=1}^k V_i^2}$ (i.e. the square root of the sum of the squares). In 2-dimensional space, this is the \"Euclidean distance\", and it's always less than or equal to the $L1$ norm.\n",
191 | "\n",
192 | "### L1 and L2 Sensitivities\n",
193 | "\n",
194 | "The $L1$ sensitivity of a vector-valued function $f$ is:\n",
195 | "\n",
196 | "\begin{align}\n",
197 | "GS(f) = \max_{d(x,x') \leq 1} \lVert f(x) - f(x') \rVert_1\n",
198 | "\end{align}\n",
199 | "\n",
200 | "This is equal to the sum of the *elementwise* sensitivities. For example, if we define a vector-valued function $f$ that returns a length-$k$ vector of 1-sensitive results, then the $L1$ sensitivity of $f$ is $k$.\n",
201 | "\n",
202 | "Similarly, the $L2$ sensitivity of a vector-valued function $f$ is:\n",
203 | "\n",
204 | "\begin{align}\n",
205 | "GS_2(f) = \max_{d(x,x') \leq 1} \lVert f(x) - f(x') \rVert_2\n",
206 | "\end{align}\n",
207 | "\n",
208 | "Using the same example as above, a vector-valued function $f$ returning a length-$k$ vector of 1-sensitive results has $L2$ sensitivity of $\sqrt{k}$. For long vectors, the $L2$ sensitivity will obviously be much lower than the $L1$ sensitivity!
For some applications, like machine learning algorithms (which sometimes return vectors with thousands of elements), $L2$ sensitivity is *significantly* lower than $L1$ sensitivity.\n",
209 | "\n",
210 | "### Choosing Between L1 and L2\n",
211 | "\n",
212 | "As mentioned earlier, both the Laplace and Gaussian mechanisms can be extended to vector-valued functions. \n",
213 | "\n",
214 | "However, there's a key difference between these two extensions: \n",
215 | "\n",
216 | "- the vector-valued Laplace mechanism **requires** the use of $L1$ sensitivity, while the vector-valued Gaussian mechanism allows the use of either $L1$ or $L2$ sensitivity. \n",
217 | "\n",
218 | "This is a **major strength** of the Gaussian mechanism. For applications in which $L2$ sensitivity is much lower than $L1$ sensitivity, the Gaussian mechanism allows adding *much* less noise.\n",
219 | "\n",
220 | "- The **vector-valued Laplace mechanism** releases $f(x) + (Y_1, \dots, Y_k)$, where $Y_i$ are drawn i.i.d. from the Laplace distribution with scale $\frac{s}{\epsilon}$ and $s$ is the $L1$ sensitivity of $f$\n",
221 | "\n",
222 | "- The **vector-valued Gaussian mechanism** releases $f(x) + (Y_1, \dots, Y_k)$, where $Y_i$ are drawn i.i.d. from the Gaussian distribution with $\sigma^2 = \frac{2s^2 \log(1.25/\delta)}{\epsilon^2}$ and $s$ is the $L2$ sensitivity of $f$"
223 | ]
224 | }
225 | ],
226 | "metadata": {
227 | "kernelspec": {
228 | "display_name": "Python 3 (ipykernel)",
229 | "language": "python",
230 | "name": "python3"
231 | },
232 | "language_info": {
233 | "codemirror_mode": {
234 | "name": "ipython",
235 | "version": 3
236 | },
237 | "file_extension": ".py",
238 | "mimetype": "text/x-python",
239 | "name": "python",
240 | "nbconvert_exporter": "python",
241 | "pygments_lexer": "ipython3",
242 | "version": "3.12.3"
243 | }
244 | },
245 | "nbformat": 4,
246 | "nbformat_minor": 4
247 | }
248 |
--------------------------------------------------------------------------------
/2-ml-models-attacks/3-MIA-Reconstruction.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "5f01a093-5560-4e09-a9c3-95c097fdbcb6",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "# Optional: setup NoTexBook theme\n",
11 | "%load_ext notexbook\n",
12 | "\n",
13 | "%texify"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "id": "3ba8845d-8556-402d-a2fc-52d8b4e3dc2b",
19 | "metadata": {},
20 | "source": [
21 | "# Model Inversion Attack"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "id": "1c67e4d2",
27 | "metadata": {},
28 | "source": [
29 | "In this notebook we will perform the **Model Inversion Attack** against two pre-trained ML models, as originally described in the reference paper:\n",
30 | "\n",
31 | "> **Model Inversion Attacks that Exploit Confidence Information and Basic Countermeasures**, by _Fredrikson et al._, 2015 \n",
32 | "[DOI](https://dl.acm.org/doi/pdf/10.1145/2810103.2813677).\n",
33 | "\n",
34 | "The two models are `SoftmaxRegression` and `MLP`.\n",
35 | "\n",
36 | "⚠️ **Note**: All the experimental settings and choices made in this notebook _replicate_ exactly those of the original paper."
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "id": "68655027",
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "import torch as th\n",
47 | "import numpy as np\n",
48 | "\n",
49 | "from matplotlib import pyplot as plt\n",
50 | "\n",
51 | "%matplotlib inline"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "id": "1942bb1a",
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "# NOTE: This is a hack to get around \"User-agent\" limitations when downloading datasets via torchvision\n",
62 | "# see, https://github.com/pytorch/vision/issues/3497 for more information\n",
63 | "from six.moves import urllib\n",
64 | "\n",
65 | "opener = urllib.request.build_opener()\n",
66 | "opener.addheaders = [(\"User-agent\", \"Mozilla/5.0\")]\n",
67 | "urllib.request.install_opener(opener)"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "id": "44bf0bd8",
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "from pathlib import Path\n",
78 | "import os\n",
79 | "\n",
80 | "DATA_FOLDER = Path(os.path.join(os.path.abspath(os.path.curdir), \"..\")) / \"data\""
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "id": "45779d5a",
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "from dataset import ORLFaces\n",
91 | "from torchvision.transforms import ToTensor"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "id": "4db6abf3",
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "orl_faces_train = ORLFaces(root=DATA_FOLDER, download=True, split=\"train\", transform=ToTensor())\n",
102 | "orl_faces_test = ORLFaces(root=DATA_FOLDER, download=True, split=\"test\", transform=ToTensor())"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "id": "d0d51644",
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "orl_faces_train.data.shape, orl_faces_test.data.shape"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "id": "859989b5",
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "from torch.utils.data import DataLoader\n",
123 | "\n",
124 | "train_loader = DataLoader(orl_faces_train, batch_size=32, shuffle=False, drop_last=False)"
125 | ]
126 | },
127 | {
128 | "cell_type": "markdown",
129 | "id": "cd8754dc",
130 | "metadata": {},
131 | "source": [
132 | "## Reconstruction Attack\n",
133 | "\n",
134 | "#### Settings"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "id": "c8c16f84",
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "# Reconstruction Attack Settings\n",
145 | "# See Paper, Section 5.2 - Reconstruction Attack\n",
146 | "α = 5000 # total iterations\n",
147 | "β = 100 # max nr.
of iterations without improvements\n",
148 | "γ = 0.99 # threshold of the cost \n",
149 | "λ = 0.1 # learning rate"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "id": "b7b97e4a",
155 | "metadata": {},
156 | "source": [
157 | "#### Load Pre-trained Models"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "id": "39426d25",
164 | "metadata": {},
165 | "outputs": [],
166 | "source": [
167 | "from models import SoftmaxRegression"
168 | ]
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "id": "7b643a8f",
173 | "metadata": {},
174 | "source": [
175 | "⚠️ If you skipped the **`MIA-Training`** notebook, please download the **pre-trained** weights of the `SoftmaxRegression` model here: [softmax_mia.pt](https://www.dropbox.com/s/t9wglqyj5zr74fq/softmax_mia.pt?dl=1) and save it into the local `checkpoints` folder\n",
176 | "\n"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": null,
182 | "id": "34de5f7f",
183 | "metadata": {},
184 | "outputs": [],
185 | "source": [
186 | "from pathlib import Path \n",
187 | "\n",
188 | "CHECKPOINT_FOLDER = Path(\"./checkpoints/\")\n",
189 | "CHECKPOINT_FOLDER.mkdir(exist_ok=True)\n",
190 | "\n",
191 | "def load_weights(model, model_filename: str = None):\n",
192 | "    if not model_filename:  # covers both None and the empty string\n",
193 | "        model_filename = f\"{model.__class__.__name__.lower()}.pt\"\n",
194 | "    w_file = CHECKPOINT_FOLDER / model_filename\n",
195 | "    try:\n",
196 | "        weights = th.load(w_file)\n",
197 | "    except FileNotFoundError: \n",
198 | "        print(f\"Model Weights file {w_file} does not exist! Please check.\")\n",
199 | "        return None\n",
200 | "    return weights\n"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": null,
206 | "id": "d37c65ad",
207 | "metadata": {},
208 | "outputs": [],
209 | "source": [
210 | "softmax_reg = SoftmaxRegression()\n",
211 | "weights = load_weights(softmax_reg, model_filename=\"softmax_mia.pt\")\n",
212 | "if weights is not None:\n",
213 | "    softmax_reg.load_state_dict(weights)\n",
214 | "    \n",
215 | "softmax_reg"
216 | ]
217 | },
218 | {
219 | "cell_type": "markdown",
220 | "id": "ba0018ae",
221 | "metadata": {},
222 | "source": [
223 | "## MIA Reconstruction Strategy\n",
224 | "\n",
225 | "\n",
226 | "<img src=\"./mia_reconstruction.png\" alt=\"MIA Reconstruction Strategy\" />"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": null,
232 | "id": "81e83c5b",
233 | "metadata": {},
234 | "outputs": [],
235 | "source": [
236 | "def process(im_flatten):\n",
237 | "    max_v = th.max(im_flatten)\n",
238 | "    min_v = th.min(im_flatten)\n",
239 | "    return (im_flatten-min_v) / (max_v - min_v)"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": null,
245 | "id": "e52a0ac1",
246 | "metadata": {},
247 | "outputs": [],
248 | "source": [
249 | "def mi_face(model, target_label):\n",
250 | "    aim_tensor = th.zeros(1, 112*92)\n",
251 | "    aim_tensor.requires_grad = True\n",
252 | "    \n",
253 | "    lossn_1 = 10  # loss at the previous iteration (loss_{n-1})\n",
254 | "    b = 0\n",
255 | "    g = 0\n",
256 | "    \n",
257 | "    out = model(aim_tensor.detach())\n",
258 | "    _, pred = th.max(out, 1)\n",
259 | "    print(pred)\n",
260 | "    print(f'original input image {target_label}')\n",
261 | "    plt.imshow(np.transpose(aim_tensor.detach().reshape(1, 112, 92).numpy(), (1, 2, 0)), cmap=\"Greys\")\n",
262 | "    plt.show()\n",
263 | "    print(f'original input image target label: {target_label} - predicted label: {pred.item()}')\n",
264 | "    \n",
265 | "    criterion = th.nn.NLLLoss()\n",
266 | "    \n",
267 | "    for i in
range(α):\n",
268 | "        out = model(aim_tensor)\n",
269 | "        if aim_tensor.grad is not None:\n",
270 | "            aim_tensor.grad.zero_()\n",
271 | "        out = out.reshape(1, 40)\n",
272 | "        target_class = th.tensor([target_label])\n",
273 | "        loss = criterion(out, target_class)\n",
274 | "        loss.backward()\n",
275 | "        aim_grad = aim_tensor.grad\n",
276 | "        \n",
277 | "        # SGD Step\n",
278 | "        # see https://pytorch.org/docs/stable/generated/torch.optim.SGD.html#torch.optim.SGD\n",
279 | "        aim_tensor = aim_tensor - (λ * aim_grad)\n",
280 | "        aim_tensor = process(aim_tensor)\n",
281 | "        aim_tensor = th.clamp(aim_tensor.detach(), 0, 1)\n",
282 | "        aim_tensor.requires_grad = True\n",
283 | "        if loss >= lossn_1:\n",
284 | "            b += 1\n",
285 | "            if b > β:\n",
286 | "                break\n",
287 | "        else:\n",
288 | "            b = 0\n",
289 | "        lossn_1 = loss.item()  # keep a plain float, detached from the graph\n",
290 | "        if loss < γ:\n",
291 | "            break\n",
292 | "    \n",
293 | "    print(f\"Attack completed after {i} iterations\")\n",
294 | "    out = model(aim_tensor.detach())\n",
295 | "    _, pred = th.max(out, 1)\n",
296 | "    print(pred)\n",
297 | "    print(f'inverted image {target_label}')\n",
298 | "    plt.imshow(np.transpose(aim_tensor.detach().reshape(1, 112, 92).numpy() * 255, (1, 2, 0)), cmap=\"Greys\")\n",
299 | "    plt.show()\n",
300 | "\n",
301 | "    "
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": null,
307 | "id": "44013f2f",
308 | "metadata": {},
309 | "outputs": [],
310 | "source": [
311 | "# Let's try to reconstruct the data for the first 10 classes (i.e. faces)\n",
312 | "for cl in range(10):\n",
313 | "    mi_face(softmax_reg, cl)"
314 | ]
315 | },
316 | {
317 | "cell_type": "markdown",
318 | "id": "2f14d689",
319 | "metadata": {},
320 | "source": [
321 | "### Exercise: \n",
322 | "\n",
323 | "Write the code to try the **model inversion reconstruction** using the `MLP` model"
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": null,
329 | "id": "3ad83641",
330 | "metadata": {},
331 | "outputs": [],
332 | "source": [
333 | "from models import MLP"
334 | ]
335 | },
336 | {
337 | "cell_type": "markdown",
338 | "id": "a3df4a1b",
339 | "metadata": {},
340 | "source": [
341 | "⚠️ Grab the **pre-trained** weights of the `MLP` model here: [mlp_mia.pt](https://www.dropbox.com/s/8ul2lj2eqcykfxm/mlp_mia.pt?dl=1) and save it into the local `checkpoints` folder"
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": null,
347 | "id": "58bab294",
348 | "metadata": {},
349 | "outputs": [],
350 | "source": [
351 | "mlp = MLP()\n",
352 | "weights = load_weights(mlp, model_filename=\"mlp_mia.pt\")\n",
353 | "if weights is not None:\n",
354 | "    mlp.load_state_dict(weights)\n",
355 | "\n",
356 | "mlp"
357 | ]
358 | },
359 | {
360 | "cell_type": "code",
361 | "execution_count": null,
362 | "id": "76662b42",
363 | "metadata": {},
364 | "outputs": [],
365 | "source": [
366 | "# Reconstruction Attack code HERE\n",
367 | "for cl in range(10):\n",
368 | "    mi_face(mlp, cl)"
369 | ]
370 | }
371 | ],
372 | "metadata": {
373 | "kernelspec": {
374 | "display_name": "Python 3 (ipykernel)",
375 | "language": "python",
376 | "name": "python3"
377 | },
378 | "language_info": {
379 | "codemirror_mode": {
380 | "name": "ipython",
381 | "version": 3
382 | },
383 | "file_extension": ".py",
384 | "mimetype": "text/x-python",
385 | "name": "python",
386 | "nbconvert_exporter": "python",
387 | "pygments_lexer": "ipython3",
388 | "version": "3.12.3"
389 | }
390 | },
391 | "nbformat": 4,
392 | "nbformat_minor": 5
393 | }
394 |
--------------------------------------------------------------------------------
/2-ml-models-attacks/2-MIA-Training.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "d4e39d12-6b19-451d-b1b9-2502d6f8e15a",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "# Optional: setup NoTexBook theme\n",
11 | "%load_ext notexbook\n",
12 | "\n",
13 | "%texify"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "id": "dcd69b34",
19 | "metadata": {},
20 | "source": [
21 | "# Model Inversion Attack - Model Training"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "id": "85ed1933",
27 | "metadata": {},
28 | "source": [
29 | "In this notebook we will train **two** (out of three) of the ML models considered in the paper:\n",
30 | "\n",
31 | "> **Model Inversion Attacks that Exploit Confidence Information and Basic Countermeasures**, by _Fredrikson et al._, 2015 \n",
32 | "[DOI](https://dl.acm.org/doi/pdf/10.1145/2810103.2813677).\n",
33 | "\n",
34 | "The two models are `SoftmaxRegression` and `MLP`.\n",
35 | "\n",
36 | "⚠️ **NOTE**: Please feel free to skip this notebook completely (if you don't want to **re-train** the models on your own) and jump directly to the next [MIA Reconstruction](./3-MIA-Reconstruction.ipynb) notebook."
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "id": "eee64647",
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "import torch as th\n",
47 | "import numpy as np\n",
48 | "\n",
49 | "from matplotlib import pyplot as plt\n",
50 | "\n",
51 | "%matplotlib inline"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "id": "3126b393",
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "# NOTE: This is a hack to get around \"User-agent\" limitations when downloading datasets via torchvision\n",
62 | "# see, https://github.com/pytorch/vision/issues/3497 for more information\n",
63 | "from six.moves import urllib\n",
64 | "\n",
65 | "opener = urllib.request.build_opener()\n",
66 | "opener.addheaders = [(\"User-agent\", \"Mozilla/5.0\")]\n",
67 | "urllib.request.install_opener(opener)"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "id": "9086c266",
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "from dataset import ORLFaces\n",
78 | "from torchvision.transforms import ToTensor, Grayscale, Compose\n",
79 | "from torch.utils.data import DataLoader"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "id": "bbc48ffb",
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "# Reproducibility Settings\n",
90 | "\n",
91 | "SEED = 123456\n",
92 | "\n",
93 | "np.random.seed(SEED)\n",
94 | "th.manual_seed(SEED)\n",
95 | "if th.cuda.is_available():\n",
96 | "    th.cuda.manual_seed_all(SEED)\n",
97 | "    th.backends.cudnn.deterministic = True"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "id": "5dc0251e",
103 | "metadata": {},
104 | "source": [
105 | "### The `ORLFaces` Dataset\n",
106 | "\n",
107 | "The original paper considers the **AT&T Database of Faces** (which I have encapsulated and made available as a PyTorch `Dataset`): `ORLFaces`"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": null,
113 | "id": "345e23a7",
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "from pathlib import
Path\n",
118 | "import os\n",
119 | "\n",
120 | "DATA_FOLDER = Path(os.path.join(os.path.abspath(os.path.curdir), \"..\")) / \"data\""
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "id": "2ee5718e",
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "print(DATA_FOLDER)"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": null,
136 | "id": "c16625ec",
137 | "metadata": {},
138 | "outputs": [],
139 | "source": [
140 | "imgs_transform = Compose([Grayscale(num_output_channels=1), ToTensor()])\n",
141 | "\n",
142 | "orl_faces_train = ORLFaces(\n",
143 | "    root=DATA_FOLDER, download=True, split=\"train\", transform=imgs_transform\n",
144 | ")\n",
145 | "orl_faces_test = ORLFaces(\n",
146 | "    root=DATA_FOLDER, download=True, split=\"test\", transform=imgs_transform\n",
147 | ")"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "id": "b9ae6a51",
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "BATCH_SIZE = 32\n",
158 | "\n",
159 | "train_loader = DataLoader(\n",
160 | "    orl_faces_train, batch_size=BATCH_SIZE, shuffle=True, drop_last=False\n",
161 | ")\n",
162 | "test_loader = DataLoader(\n",
163 | "    orl_faces_test, batch_size=BATCH_SIZE, shuffle=False, drop_last=False\n",
164 | ")"
165 | ]
166 | },
167 | {
168 | "cell_type": "markdown",
169 | "id": "bd2b2a27",
170 | "metadata": {},
171 | "source": [
172 | "#### A few notes about the dataset \n",
173 | "\n",
174 | "The `ORLFaces` dataset contains `400` image files corresponding to `40` different subjects (`10` photos each).\n",
175 | "\n",
176 | "\n",
177 | "Images are `112x92` pixels, with `256` grey levels per pixel, and (originally) stored in `PGM` format.\n",
178 | "The photos of the subjects have been taken at different times, varying the lighting, the facial expressions\n",
179 | " (e.g.
open/closed eyes, smiling/serious face), and the facial details.\n",
180 | "\n",
181 | "**Train/Test** partitions have been generated similarly to what was done in the original paper, that is:\n",
182 | "\n",
183 | "(for each subject):\n",
184 | "\n",
185 | "- Randomly pick $7$ (out of $10$) images of the subject and add them to the **training set**\n",
186 | "- Add the remaining $3$ images to the **test set**"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "id": "425c305c",
192 | "metadata": {},
193 | "source": [
194 | "#### Visualise a few Samples in the Dataset\n",
195 | "\n",
196 | "Before we start with the training, let's visualise a few random samples extracted from the dataset"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "id": "61e794b9",
203 | "metadata": {},
204 | "outputs": [],
205 | "source": [
206 | "from torchvision.utils import make_grid\n",
207 | "\n",
208 | "\n",
209 | "def imshow(img):\n",
210 | "    npimg = img.numpy()\n",
211 | "    plt.figure(figsize=(10, 12))\n",
212 | "    plt.imshow(np.transpose(npimg, (1, 2, 0)))\n",
213 | "    plt.show()"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "id": "aa210aaf",
220 | "metadata": {},
221 | "outputs": [],
222 | "source": [
223 | "# get some random training images\n",
224 | "images, labels = next(iter(train_loader))"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "id": "089395c5",
231 | "metadata": {},
232 | "outputs": [],
233 | "source": [
234 | "images.shape"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": null,
240 | "id": "7dc07e43",
241 | "metadata": {},
242 | "outputs": [],
243 | "source": [
244 | "# show images\n",
245 | "imshow(make_grid(images))\n",
246 | "# print labels\n",
247 | "print(\" \".join(f\"{labels[j]}\" for j in range(BATCH_SIZE)))"
248 | ]
249 | },
250 | {
251 | "cell_type": "markdown",
252 | "id": "2513010f",
253 | "metadata": {},
254 | "source": [
255 | "ℹ️ **Note**: Do you see the **exact same faces** displayed here? With the same `SEED` set above, the sampled batch should be reproducible."
256 | ]
257 | },
258 | {
259 | "cell_type": "markdown",
260 | "id": "2b7b9841",
261 | "metadata": {},
262 | "source": [
263 | "## Machine Learning Model Training\n",
264 | "\n",
265 | "In the original paper, the authors consider three separate models as reference examples for the Model Inversion Attack.
\n", 266 | "\n", 267 | "Here to keep things simple, we will only consider two of them: `SoftmaxRegression` and `MLP`" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "id": "e0d1c795", 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "from models import SoftmaxRegression, MLP" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "id": "b14bbfe8", 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "from train import train" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "id": "9a193ca3", 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "λ = 0.1 # optimiser learning rate, as used in the paper" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "id": "7b9d5e71", 303 | "metadata": {}, 304 | "source": [ 305 | "#### Training `SoftmaxRegression`\n", 306 | "\n", 307 | "Note: This should be super-fast even on a laptop (small model, small data)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "id": "62df081d", 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "softmax_reg = SoftmaxRegression()\n", 318 | "softmax_sgd = th.optim.SGD(softmax_reg.parameters(), lr=λ)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "id": "cb471d2d", 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "softmax_reg" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "id": "c4e0f0a2", 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "train(\n", 339 | " model=softmax_reg,\n", 340 | " optimiser=softmax_sgd,\n", 341 | " loaders=(train_loader, test_loader),\n", 342 | " model_name=\"softmax_mia\",\n", 343 | ")" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "id": "d8acf454", 349 | "metadata": {}, 350 | "source": [ 351 | "### Training `MLP`\n", 352 | "\n", 353 | "⚠️ **Note**: This may be a bit slower to train on a laptop (it shouldn't be that much, though!) \n", 354 | "\n", 355 | "If you notice that it is the case, please also feel free to skip this and jump at the end of this notebook." 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "id": "12c1109e", 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "mlp = MLP()\n", 366 | "mlp_sgd = th.optim.SGD(mlp.parameters(), lr=λ)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "id": "53e40c39", 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "mlp" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "id": "123b3abc", 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "train(\n", 387 | " model=mlp,\n", 388 | " optimiser=mlp_sgd,\n", 389 | " loaders=(train_loader, test_loader),\n", 390 | " model_name=\"mlp_mia\",\n", 391 | ")" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "id": "c5646846", 397 | "metadata": {}, 398 | "source": [ 399 | "### Congratulations\n", 400 | "\n", 401 | "**Well done** 🎉\n", 402 | "\n", 403 | "Now that we have our two reference **trained** model, we are ready to setup and launch the _model inversion_ attack to the model. 
\n", 404 | "\n", 405 | "$\\rightarrow$ **MIA Reconstruction**" 406 | ] 407 | } 408 | ], 409 | "metadata": { 410 | "kernelspec": { 411 | "display_name": "Python 3 (ipykernel)", 412 | "language": "python", 413 | "name": "python3" 414 | }, 415 | "language_info": { 416 | "codemirror_mode": { 417 | "name": "ipython", 418 | "version": 3 419 | }, 420 | "file_extension": ".py", 421 | "mimetype": "text/x-python", 422 | "name": "python", 423 | "nbconvert_exporter": "python", 424 | "pygments_lexer": "ipython3", 425 | "version": "3.12.3" 426 | } 427 | }, 428 | "nbformat": 4, 429 | "nbformat_minor": 5 430 | } 431 | -------------------------------------------------------------------------------- /2-ml-models-attacks/dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module provides access to the ORLFaces (ORL Database of Faces) 3 | as encapsulated as a `torchvision.datasets.VisionDataset` class. 4 | 5 | Notes 6 | ----- 7 | The ORLFaces dataset [1]_ contains image files of `40` different subjects (orgainsed one per folder). 8 | Images are `112x92` pixels, with `256` grey levels per pixel, and stored in PGM format. 9 | Folders have names of the form `sID`, where `ID` indicates the subject number (between `1` and `40`). 10 | In each of these directories, there are **ten different** images of that subject, which have names of the 11 | form `Y.pgm`, where `Y` is the image number for that subject (between `1` and `10`) - accounting for 12 | a total of `400` images (10 per 40 subjects). 13 | 14 | Images are randomly partitioned in training and test sets. 15 | 16 | References 17 | ----------- 18 | .. [1] https://www.cl.cam.ac.uk/research/dtg/attarchive/facedatabase.html 19 | """ 20 | 21 | import os 22 | import torch 23 | import numpy as np 24 | import re 25 | from collections import defaultdict 26 | from PIL import Image 27 | from torchvision.datasets import VisionDataset 28 | from torchvision.datasets.utils import download_url, extract_archive 29 | from enum import Enum 30 | from pathlib import Path 31 | from typing import Callable, Optional, Any, List, Tuple 32 | 33 | 34 | class Partition(Enum): 35 | """ 36 | Enumeration of Data Partitions for Machine learning experiments 37 | """ 38 | 39 | train = "training" 40 | test = "test" 41 | 42 | 43 | class ORLFaces(VisionDataset): 44 | """`ORL Faces` (The ORL Database of Faces) 45 | 46 | The Dataset contains a folder with faces of 40 different subjects 47 | taken at different times, varying the lightning, facial expressions 48 | (e.g. open/closed eyes, smiling/serious face), and facial details. 49 | 50 | This dataset is being used in research as facial detection dataset. 51 | 52 | Attributes 53 | ---------- 54 | root : str 55 | Root directory where the local copy of dataset is stored. 56 | split : {"train", "test"} (default: "train") 57 | Target data data_partition. Two data partitions are available, namely 58 | "training", and "test". Training data_partition is considered 59 | by default. Any _validation_ partition could be extracted from the 60 | training dataset. 61 | download : bool, optional (False) 62 | If true, the dataset will be downloaded from the internet and saved in the root 63 | directory. If dataset is already downloaded, it is not downloaded again. 64 | transform : Callable, optional 65 | A function/transform that takes in an image and returns a transformed version 66 | seed: int optinal (123456) 67 | Random seed used to split images in training and testing partitions. 
68 |         The partitions are generated randomly (but consistently given the same random seed).
69 |         Different values of this parameter will affect this generation.
70 |         Note: Data partitions are generated only the first time the dataset is initialised,
71 |         and before the local torch (tensor) files are saved.
72 |         Creating new instances of this dataset with a different seed won't have any effect,
73 |         unless the local partition files are deleted first.
74 |     """
75 |
76 |     RAW_DATA_FOLDER = "orl_faces"
77 |
78 |     resources = [
79 |         (
80 |             "https://www.dropbox.com/s/gxus70grtlt8bpq/orl_faces.tar.gz?dl=1",
81 |             "83134c1ac2309b40441b35d5fa37a3f1",
82 |         )
83 |     ]
84 |
85 |     data_files = {
86 |         Partition.train: "training.pt",
87 |         Partition.test: "test.pt",
88 |     }
89 |
90 |     classes = list(range(1, 41))
91 |
92 |     def __init__(
93 |         self,
94 |         root: str,
95 |         split: str = "train",
96 |         download: bool = False,
97 |         transform: Optional[Callable[[Any], Any]] = None,
98 |         seed: int = 123456,
99 |     ):
100 |         super(ORLFaces, self).__init__(root, transform=transform)
101 |         self._seed = seed
102 |         self.random_gen = np.random.RandomState(self._seed)
103 |         split = split.strip().lower()
104 |         if split not in Partition.__members__.keys():
105 |             raise ValueError(
106 |                 "Data Partition not recognised. "
107 |                 "Accepted values are 'train' and 'test'."
108 |             )
109 |
110 |         if download:
111 |             self.download()
112 |
113 |         if not self._check_exists():
114 |             raise RuntimeError(
115 |                 "Dataset not found." + " You can use download=True to download it"
116 |             )
117 |
118 |         self.split = Partition[split]
119 |         data_file = self.data_files[self.split]
120 |         data_filepath = self.processed_folder / data_file
121 |         self.data, self.targets = torch.load(data_filepath)
122 |
123 |     def __len__(self):
124 |         return len(self.data)
125 |
126 |     def __getitem__(self, index):
127 |         """
128 |
129 |         Parameters
130 |         ----------
131 |         index : int
132 |             Index of the sample
133 |
134 |         Returns
135 |         -------
136 |         tuple
137 |             (Image, Target) where target is the index of the target class.
138 | """ 139 | img, target = self.data[index], int(self.targets[index]) 140 | 141 | # doing this so that it is consistent with all other datasets 142 | # to return a PIL Image 143 | img = Image.fromarray(img.numpy(), mode="L") 144 | 145 | if self.transform is not None: 146 | img = self.transform(img) 147 | 148 | return img, target 149 | 150 | @property 151 | def processed_folder(self): 152 | return Path(self.root) / self.__class__.__name__ / "processed" 153 | 154 | @property 155 | def raw_folder(self): 156 | return Path(self.root) / self.__class__.__name__ / "raw" 157 | 158 | @property 159 | def partition(self): 160 | return self.split 161 | 162 | @property 163 | def class_to_idx(self): 164 | return {_class: i for i, _class in enumerate(self.classes)} 165 | 166 | @property 167 | def idx_to_class(self): 168 | return {v: k for k, v in self.class_to_idx.items()} 169 | 170 | def classes_map(self): 171 | return {i: c for i, c in enumerate(self.classes)} 172 | 173 | def _check_exists(self): 174 | for data_fname in self.data_files.values(): 175 | data_file = self.processed_folder / data_fname 176 | if not data_file.exists(): 177 | return False 178 | return True 179 | 180 | def extra_repr(self): 181 | return "Split: {}".format(self.split.value) 182 | 183 | def _download_and_extract_archive( 184 | self, 185 | url: str, 186 | download_root: str, 187 | filename: Optional[str] = None, 188 | md5: Optional[str] = None, 189 | ): 190 | download_root = os.path.expanduser(download_root) 191 | extract_root = download_root 192 | if not filename: 193 | filename = os.path.basename(url) 194 | 195 | from torchvision.datasets import utils 196 | 197 | utils._get_redirect_url = lambda ulr, max_hops: url 198 | download_url(url, download_root, filename, md5) 199 | archive = os.path.join(download_root, filename) 200 | print("Extracting {} to {}".format(archive, extract_root)) 201 | extract_archive(archive, extract_root, remove_finished=False) 202 | 203 | def download(self): 204 | """Download the ORLFaces data if it doesn't already exist in the processed folder""" 205 | 206 | if self._check_exists(): 207 | return 208 | 209 | os.makedirs(self.raw_folder, exist_ok=True) 210 | os.makedirs(self.processed_folder, exist_ok=True) 211 | 212 | # download files 213 | for url, md5 in self.resources: 214 | filename = url.rpartition("/")[-1].split("?")[0] 215 | self._download_and_extract_archive( 216 | url, download_root=str(self.raw_folder), filename=filename, md5=md5 217 | ) 218 | 219 | print("Processing...", end="") 220 | self._process_partitions() 221 | print("Done!") 222 | 223 | def _process_partitions(self): 224 | raw_data_filepath = self.raw_folder / self.RAW_DATA_FOLDER 225 | partitions = defaultdict(list) 226 | 227 | for subj_folder in os.listdir(raw_data_filepath): 228 | if not subj_folder.startswith("s"): 229 | continue # skip folder 230 | # class is zero-indexed! 
231 |             subj_class = int(subj_folder.replace("s", "").strip()) - 1
232 |             # select training set images, randomly - using the input seed
233 |             training_indices = self.random_gen.choice(
234 |                 np.arange(10), size=7, replace=False
235 |             )
236 |
237 |             # sort image files, so we could use randomly selected indices, quickly
238 |             subject_folder_path = raw_data_filepath / subj_folder
239 |             image_files = filter(
240 |                 lambda f: not f.startswith("."), os.listdir(subject_folder_path)
241 |             )
242 |             subject_images = sorted(image_files, key=lambda f: int(f.split(".")[0]))
243 |             subject_images = map(
244 |                 lambda f: subject_folder_path / f, subject_images
245 |             )  # store full path
246 |             subject_images = np.asarray(
247 |                 list(subject_images)
248 |             )  # convert to array for easy indexing
249 |
250 |             # Add new pair (images, class) to corresponding partitions
251 |             partitions[Partition.train].append(
252 |                 (subject_images[training_indices], subj_class)
253 |             )
254 |             partitions[Partition.test].append(
255 |                 (np.delete(subject_images, training_indices), subj_class)
256 |             )
257 |
258 |         # store partitions locally - to be reloaded later
259 |         for partition, dataset in partitions.items():
260 |             images, labels = self._dataset_as_torch_tensors(dataset)
261 |             data_file = self.processed_folder / self.data_files[partition]
262 |             with open(data_file, "wb") as f:
263 |                 torch.save((images, labels), f)
264 |
265 |     @staticmethod
266 |     def read_pgm(filename, byteorder=">"):
267 |         """Return image data from a raw PGM file as numpy array.
268 |         Format specification: http://netpbm.sourceforge.net/doc/pgm.html
269 |         """
270 |         with open(filename, "rb") as f:
271 |             buffer = f.read()
272 |         try:
273 |             header, width, height, maxval = re.search(
274 |                 rb"(^P5\s(?:\s*#.*[\r\n])*"
275 |                 rb"(\d+)\s(?:\s*#.*[\r\n])*"
276 |                 rb"(\d+)\s(?:\s*#.*[\r\n])*"
277 |                 rb"(\d+)\s(?:\s*#.*[\r\n]\s)*)",
278 |                 buffer,
279 |             ).groups()
280 |         except AttributeError:
281 |             raise ValueError("Not a raw PGM file: '%s'" % filename)
282 |         return np.frombuffer(
283 |             buffer,
284 |             dtype="u1" if int(maxval) < 256 else byteorder + "u2",
285 |             count=int(width) * int(height),
286 |             offset=len(header),
287 |         ).reshape((int(height), int(width)))
288 |
289 |     def _dataset_as_torch_tensors(
290 |         self, dataset: List[Tuple[List[str], int]]
291 |     ) -> Tuple[torch.Tensor, torch.Tensor]:
292 |         """
293 |         Collect all the images per subject and convert them into torch Tensors.
294 |         Labels will also be returned as a tensor, repeated for each corresponding subject.
295 |
296 |         Parameters
297 |         ----------
298 |         dataset : List[Tuple[List[str], int]]
299 |             Set of images and corresponding label for each subject, in the considered partition
300 |             (i.e. train, or test)
301 |         Returns
302 |         -------
303 |         torch.Tensor
304 |             [sample x pixels] tensor representing the whole data partition as
305 |             a torch Tensor.
306 |         torch.Tensor
307 |             [sample] tensor array of corresponding labels (i.e.
the subject ID for each sample)
308 |         """
309 |         images, labels = [], []
310 |         for subject, class_id in dataset:
311 |             subject_images = np.asarray(list(map(self.read_pgm, subject)))
312 |             subject_labels = np.zeros(shape=len(subject))
313 |             subject_labels.fill(class_id)
314 |             images.append(subject_images)
315 |             labels.append(subject_labels)
316 |         images = np.vstack(images)
317 |         labels = np.hstack(labels)
318 |         return torch.from_numpy(images), torch.from_numpy(labels)
319 |
--------------------------------------------------------------------------------
/3-differential-privacy/1-differential-privacy.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Optional: setup NoTexBook theme\n",
10 | "%load_ext notexbook\n",
11 | "\n",
12 | "%texify"
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "**Adapted from**: [Ch3](https://github.com/uvm-plaid/programming-dp/blob/master/notebooks/ch3.ipynb)"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "# Differential Privacy"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "## Definition"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {},
39 | "source": [
40 | "Like $k$-Anonymity, *differential privacy*[3](#fn3) is a **formal notion of privacy** \n",
41 | "(i.e. it's possible to prove that a data release has the property). \n",
42 | "\n",
43 | "Unlike $k$-Anonymity, however, **differential privacy** is a property of *algorithms*, and not a property of *data*. \n",
44 | "\n",
45 | "That is, we don't prove that a *dataset* satisfies differential privacy; we prove that the *algorithm* used to process or release it does.\n",
46 | "\n",
47 | "> **Definition**:\n",
48 | ">\n",
49 | "> A function which satisfies differential privacy is often called a *mechanism*. \n",
50 | "> We say that a *mechanism* $F$ satisfies differential privacy if for all *neighboring datasets* $x$ and $x'$, \n",
51 | "> and all possible outputs $S$,\n",
52 | ">\n",
53 | "\n",
54 | "\begin{equation}\n",
55 | "\frac{\mathsf{Pr}[F(x) = S]}{\mathsf{Pr}[F(x') = S]} \leq e^\epsilon\n",
56 | "\end{equation}"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "**1. Neighbouring Datasets**:\n",
64 | "\n",
65 | "Two datasets are considered **neighbours** if they differ in the data of **a single individual**.\n",
66 | "\n",
67 | "**2. $F$ is a Randomised Function**:\n",
68 | "\n",
69 | "Note that $F$ is typically a *randomised* function, so that the probability distribution describing its outputs is not just a point distribution.\n",
70 | "\n",
71 | "The important implication of this definition is that $F$'s output will be pretty much the same, *with or without* the data of any specific individual.\n",
72 | "\n",
73 | "In other words, the randomness built into $F$ should be \"enough\" so that an observed output from $F$ will not reveal which of $x$ or $x'$ was the input.\n",
74 | "\n",
75 | "Imagine that my data is present in $x$ but not in $x'$.\n",
76 | "\n",
77 | "**3.
The Privacy Budget: $\\epsilon$**:\n", 78 | "\n", 79 | "If an adversary can't determine which of $x$ or $x'$ was the input to $F$, then the adversary can't tell whether or not my data was *present* in the input - let alone the contents of that data.\n", 80 | "\n", 81 | "The $\\epsilon$ parameter in the definition is called the *privacy parameter* or the *privacy budget*.\n", 82 | "\n", 83 | "$\\epsilon$ provides a knob to tune the **amount of privacy** the definition provides." 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "Small values of $\\epsilon$ require $F$ to provide *very* similar outputs when given similar inputs, and therefore provide **higher levels** of privacy.\n", 91 | "\n", 92 | "Large values of $\\epsilon$ allow less similarity in the outputs, and therefore provide **less privacy**.\n", 93 | "\n", 94 | "\n", 95 | "- Small values $\\epsilon \\rightarrow$ High Privacy\n", 96 | "- Large values $\\epsilon \\rightarrow$ Less Privacy\n", 97 | "\n", 98 | "How should we set $\\epsilon$ to prevent bad outcomes in practice? **Nobody knows** (i.e. Open Research Question). \n", 99 | "\n", 100 | "The general consensus is that $\\epsilon$ should be around `1` or smaller, and values of $\\epsilon$ above `10` probably don't do much to protect privacy - but this rule of thumb could turn out to be very conservative. \n", 101 | "\n", 102 | "**[3]**: Dwork, C; _Differential Privacy_ in Proceedings of the 33rd International Conference on Automata, Languages and Programming - Volume Part II, 2006 [link](https://doi.org/10.1007/11787006_1)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "\n", 110 | ">**Learning Objectives**\n", 111 | ">\n", 112 | "> - Define differential privacy\n", 113 | "> - Explain the importance of the privacy parameter $\\epsilon$\n", 114 | "> - Use the Laplace mechanism to enforce differential privacy for counting queries" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "## The Laplace Mechanism" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "Differential privacy is typically used to answer specific queries. Let's consider a query on the census data, *without* differential privacy." 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "tags": [ 136 | "remove-cell" 137 | ] 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "import pandas as pd\n", 142 | "import numpy as np\n", 143 | "import matplotlib.pyplot as plt\n", 144 | "plt.style.use('seaborn-v0_8-whitegrid')" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "DATASET_URL = \"https://raw.githubusercontent.com/uvm-plaid/programming-dp/master/notebooks/adult_with_pii.csv\"\n", 154 | "adult = pd.read_csv(DATASET_URL)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "**Q** \"How many individuals in the dataset are 40 years old or older?\"\n", 162 | "\n", 163 | "This is an example of a **Count Query**." 
164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "adult[adult['Age'] >= 40].shape[0]" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "## Laplace Mechanism" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "The easiest way to achieve differential privacy for this query is to add **random noise to its answer**. \n", 187 | "\n", 188 | "The key challenge is to add enough noise to satisfy the definition of differential privacy, but not so much that the answer becomes too noisy to be useful. \n", 189 | "\n", 190 | "To make this process easier, some basic *mechanisms* have been developed in the field of differential privacy, which describe exactly what kind of - and how much - noise to use. \n", 191 | "\n", 192 | "One of these is called the *Laplace mechanism*[4](#fn4).\n", 193 | "\n", 194 | "> **Definition**\n", 195 | "> \n", 196 | ">According to the Laplace mechanism, for a function $f(x)$ which returns a number, the following definition of $F(x)$ satisfies $\\epsilon$-differential privacy:\n", 197 | ">\n", 198 | ">\\begin{equation}\n", 199 | "F(x) = f(x) + \\textsf{Lap}(\\frac{s}{\\epsilon})\n", 200 | "\\end{equation}\n", 201 | ">\n", 202 | ">where $s$ is the *sensitivity* of $f$, and $\\textsf{Lap}(S)$ denotes sampling from the Laplace distribution with center 0 and scale $S$.\n", 203 | "\n", 204 | "\n", 205 | "**Sensitivity**:\n", 206 | "\n", 207 | "The *sensitivity* of a function $f$ is the amount $f$'s output changes when its input changes by 1. \n", 208 | "\n", 209 | "Sensitivity is a complex topic, and an integral part of designing differentially private algorithms. \n", 210 | "\n", 211 | "Let's just point out that *counting queries* always have a sensitivity of `1`: if a query counts the number of rows in the dataset with a particular property, and then we modify exactly one row of the dataset, then the query's output can change by at most `1`.\n", 212 | "\n", 213 | "Thus we can achieve differential privacy for our example query by using the `Laplace mechanism` with `sensitivity=1` and an $\\epsilon$ of our choosing.\n", 214 | "\n", 215 | "For now, let's pick $\\epsilon = 0.1$. We can sample from the Laplace distribution using Numpy's `random.laplace`.\n", 216 | "\n", 217 | "**[4]**: Dwork, C.; _Calibrating Noise to Sensitivity in Private Data Analysis_ in Proceedings of the Third Conference on Theory of Cryptography, 2006 [link](https://doi.org/10.1007/11681878_14)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "sensitivity = 1\n", 227 | "epsilon = 0.1\n", 228 | "\n", 229 | "adult[adult['Age'] >= 40].shape[0] + np.random.laplace(loc=0, scale=sensitivity/epsilon)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "You can see the effect of the noise by running this code multiple times. Each time, the output changes, but most of the time, the answer is close enough to the true answer (14,235) to be useful." 
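,
"\n",
"To make the pattern explicit, here is a tiny wrapper around the definition above (our own sketch - `laplace_mech` is a hypothetical name, not a library function):\n",
"\n",
"```python\n",
"def laplace_mech(true_answer, sensitivity, epsilon):\n",
"    # F(x) = f(x) + Lap(s / epsilon), per the definition above\n",
"    return true_answer + np.random.laplace(loc=0, scale=sensitivity / epsilon)\n",
"\n",
"# the counting query above, released with epsilon = 0.1\n",
"laplace_mech(adult[adult['Age'] >= 40].shape[0], sensitivity=1, epsilon=0.1)\n",
"```"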
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": null,
242 | "metadata": {},
243 | "outputs": [],
244 | "source": [
245 | "true_count_stat = adult[adult['Age'] >= 40].shape[0]\n",
246 | "Lap = np.random.laplace(loc=0, scale=sensitivity/epsilon, size=30)\n",
247 | "print(f\"True Count Statistic: {true_count_stat}\")\n",
248 | "for i in range(30):\n",
249 | "    print(f\"{i}) {(true_count_stat + Lap[i]):0.2f}\")"
250 | ]
251 | },
252 | {
253 | "cell_type": "markdown",
254 | "metadata": {},
255 | "source": [
256 | "## How Much Noise is Enough?"
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "metadata": {},
262 | "source": [
263 | "How do we know that the Laplace mechanism adds enough noise to prevent the re-identification of individuals in the dataset? \n",
264 | "\n",
265 | "For one thing, we can try to break it!\n",
266 | "\n",
267 | "Let's write down a **malicious counting query**, which is specifically designed to determine whether Karrie Trusslove has an income greater than `$50k`."
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": null,
273 | "metadata": {},
274 | "outputs": [],
275 | "source": [
276 | "karries_row = adult[adult['Name'] == 'Karrie Trusslove']\n",
277 | "karries_row[karries_row['Target'] == '<=50K'].shape[0]"
278 | ]
279 | },
280 | {
281 | "cell_type": "markdown",
282 | "metadata": {},
283 | "source": [
284 | "This result definitely violates Karrie's privacy, since it reveals the value of the income column for Karrie's row.\n",
285 | "\n",
286 | "Since we know how to ensure differential privacy for counting queries with the Laplace mechanism, we can do so for this query:"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": null,
292 | "metadata": {},
293 | "outputs": [],
294 | "source": [
295 | "sensitivity = 1\n",
296 | "epsilon = 0.1\n",
297 | "\n",
298 | "karries_row = adult[adult['Name'] == 'Karrie Trusslove']\n",
299 | "karries_row[karries_row['Target'] == '<=50K'].shape[0] + np.random.laplace(loc=0, scale=sensitivity/epsilon)"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": null,
305 | "metadata": {},
306 | "outputs": [],
307 | "source": [
308 | "query = karries_row[karries_row['Target'] == '<=50K'].shape[0]"
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": null,
314 | "metadata": {},
315 | "outputs": [],
316 | "source": [
317 | "F = lambda: query + np.random.laplace(loc=0, scale=sensitivity/epsilon)  # draw fresh noise on each call\n",
318 | "\n",
319 | "np.mean([F() for _ in range(100)])"
320 | ]
321 | },
322 | {
323 | "cell_type": "markdown",
324 | "metadata": {},
325 | "source": [
326 | "Is this the true answer?\n",
327 | "\n",
328 | "There's still too much noise to be able to reliably tell.\n",
329 | "\n",
330 | "This is how differential privacy is *intended* to work - the approach does not *reject* queries which are determined to be malicious; instead, it adds enough noise that the results of a malicious query will be useless to the adversary.\n",
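"\n",
"One caveat worth making explicit (our own addition, not from the adapted chapter): each fresh noisy answer consumes privacy budget under **sequential composition**, and averaging $k$ answers shrinks the noise by roughly a factor of $\sqrt{k}$ - the two effects go hand in hand:\n",
"\n",
"```python\n",
"# a sketch: the standard deviation of the averaged noise shrinks as 1/sqrt(k)\n",
"k = 100\n",
"answers = query + np.random.laplace(loc=0, scale=sensitivity/epsilon, size=k)\n",
"print(answers.mean())  # close to `query` - but k queries at eps=0.1 cost eps=10 in total\n",
"```"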
331 | ] 332 | } 333 | ], 334 | "metadata": { 335 | "kernelspec": { 336 | "display_name": "Python 3 (ipykernel)", 337 | "language": "python", 338 | "name": "python3" 339 | }, 340 | "language_info": { 341 | "codemirror_mode": { 342 | "name": "ipython", 343 | "version": 3 344 | }, 345 | "file_extension": ".py", 346 | "mimetype": "text/x-python", 347 | "name": "python", 348 | "nbconvert_exporter": "python", 349 | "pygments_lexer": "ipython3", 350 | "version": "3.12.3" 351 | } 352 | }, 353 | "nbformat": 4, 354 | "nbformat_minor": 4 355 | } 356 | -------------------------------------------------------------------------------- /2-ml-models-attacks/1-FSGM-Attack.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext notexbook\n", 10 | "%texify" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# Fast Gradient Sign Attack" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "This notebook showcases how to carry out a **Fast Gradient Sign Method** attack (`FGSM`) on a pretrained model. \n", 25 | "\n", 26 | "**Note** This notebook has been adapted from the [FGSM Tutorial](https://pytorch.org/tutorials/beginner/fgsm_tutorial.html) by _Nathan Inkawhich_ `@inkawhich` available on the official [PyTorch Documentation](https://pytorch.org/docs/stable/index.html)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "(_from the original notebook_)\n", 34 | "> This tutorial will raise your awareness to the security vulnerabilities \n", 35 | "> of ML models, and will give insight into the hot topic of adversarial machine learning. \n", 36 | "> \n", 37 | "> You may be surprised to find that adding **imperceptible perturbations** to an image *can* cause \n", 38 | "> drastically different model performance.\n", 39 | "> `[...]`\n", 40 | ">\n", 41 | "> Specifically we will use one of the first and most popular attack methods, the _Fast Gradient Sign Attack_\n", 42 | "> (`FGSM`), to fool an `MNIST` classifier.\n" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## Threats to Models\n", 50 | "\n", 51 | "There are several kinds of assumptions about the attacker’s knowledge, two of which are: **white-box** and **black-box**. \n", 52 | "\n", 53 | "- A *white-box* attack assumes the attacker has full knowledge and access to the model, including\n", 54 | "architecture, inputs, outputs, and weights. \n", 55 | "- A *black-box* attack assumes the attacker only has access to the inputs and outputs of the model, and knows nothing about the underlying architecture or weights. \n", 56 | "\n", 57 | "There are also several types of goals, including **misclassification** and\n", 58 | "**source/target misclassification**. \n", 59 | "\n", 60 | "A goal of *misclassification* means the adversary only wants the output classification to be wrong but does\n", 61 | "not care what the new classification is. \n", 62 | "\n", 63 | "A *source/target misclassification* means the adversary wants to alter an image that is originally of a specific source class so that it is classified as a specific target class."
64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "## Fast Gradient Sign Attack\n", 71 | "\n", 72 | "The *Fast Gradient Sign Attack (FGSM)* is described by _Goodfellow et al._ in \n", 73 | "[Explaining and Harnessing Adversarial Examples](https://arxiv.org/abs/1412.6572). \n", 74 | "\n", 75 | "The attack is remarkably powerful, and yet intuitive. \n", 76 | "\n", 77 | "It is designed to attack neural networks by leveraging the way they learn: **gradients**. \n", 78 | "\n", 79 | "The idea is simple: \n", 80 | "\n", 81 | "> rather than working to minimize the loss by adjusting the weights based on the backpropagated gradients,\n", 82 | "> the attack **adjusts** the input data to maximize the loss based on the same backpropagated gradients. \n", 83 | "\n", 84 | "In other words, the attack uses the gradient of the loss w.r.t the input data, then adjusts the input data to maximize the loss.\n", 85 | "\n", 86 | "_(from the original paper)_\n", 87 | "\n", 88 | "![fgsm panda attack](https://pytorch.org/tutorials/_images/fgsm_panda_image.png)\n", 89 | "\n", 90 | "**TLDR;** Just perturb the input data with a small change that works in an **adversarial** fashion (w.r.t. the optimisation process), following the **direction of the gradient**: $x_{adv} = x + \\epsilon \\cdot sign(\\nabla_{x} J(\\mathbf{\\theta}, \\mathbf{x}, y))$" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "import torch\n", 100 | "import torch.nn as nn\n", 101 | "import torch.nn.functional as F\n", 102 | "import torch.optim as optim\n", 103 | "\n", 104 | "from torch.utils.data import DataLoader\n", 105 | "from torchvision import datasets, transforms\n", 106 | "import numpy as np\n", 107 | "import matplotlib.pyplot as plt\n", 108 | "\n", 109 | "# NOTE: This is a hack to get around \"User-agent\" limitations when downloading MNIST datasets\n", 110 | "# see, https://github.com/pytorch/vision/issues/3497 for more information\n", 111 | "from six.moves import urllib\n", 112 | "\n", 113 | "opener = urllib.request.build_opener()\n", 114 | "opener.addheaders = [(\"User-agent\", \"Mozilla/5.0\")]\n", 115 | "urllib.request.install_opener(opener)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "from pathlib import Path\n", 125 | "import os \n", 126 | "\n", 127 | "DATA_FOLDER = Path(os.path.join(os.path.abspath(os.path.curdir), \"..\")) / \"data\"" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "print(DATA_FOLDER)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# Reproducibility Settings\n", 146 | "\n", 147 | "import numpy as np\n", 148 | "\n", 149 | "SEED = 123456\n", 150 | "np.random.seed(SEED)\n", 151 | "torch.manual_seed(SEED)\n", 152 | "\n", 153 | "if torch.cuda.is_available():\n", 154 | "    torch.cuda.manual_seed_all(SEED)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "#### `LeNet` Model" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "# LeNet Model definition\n", 171 | "class Net(nn.Module):\n", 172 | "    def __init__(self):\n", 173 | "        super(Net, 
self).__init__()\n", 174 | "        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)\n", 175 | "        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)\n", 176 | "        self.conv2_drop = nn.Dropout2d()\n", 177 | "        self.fc1 = nn.Linear(320, 50)\n", 178 | "        self.fc2 = nn.Linear(50, 10)\n", 179 | "\n", 180 | "    def forward(self, x):\n", 181 | "        x = F.relu(F.max_pool2d(self.conv1(x), 2))\n", 182 | "        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))\n", 183 | "        x = x.view(-1, 320)\n", 184 | "        x = F.relu(self.fc1(x))\n", 185 | "        x = F.dropout(x, training=self.training)\n", 186 | "        x = self.fc2(x)\n", 187 | "        return F.log_softmax(x, dim=1)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "Initialise pre-trained model (and move it to available device)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "if torch.cuda.is_available():\n", 204 | "    dev_name = \"cuda\"\n", 205 | "elif torch.backends.mps.is_available():\n", 206 | "    dev_name = \"mps\"\n", 207 | "else:\n", 208 | "    dev_name = \"cpu\"\n", 209 | "\n", 210 | "device = torch.device(dev_name)\n", 211 | "print(f\"You will be using the {device} device\")" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "torch.__version__" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "# Uncomment this when running on Anaconda Notebooks\n", 230 | "# !wget https://raw.githubusercontent.com/leriomaggio/ppml-tutorial/main/2-ml-models-attacks/lenet_mnist_model.pth" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "PRETRAINED_MODEL_WEIGHTS = \"lenet_mnist_model.pth\"\n", 240 | "\n", 241 | "# Initialize the network\n", 242 | "model = Net().to(device)\n", 243 | "\n", 244 | "# Load the pretrained model\n", 245 | "model.load_state_dict(torch.load(PRETRAINED_MODEL_WEIGHTS, map_location=device))\n", 246 | "\n", 247 | "# Set the model in evaluation mode. 
In this case this is for the Dropout layers\n", 248 | "model.eval()" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "### Download MNIST Dataset" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "# MNIST Test dataset and dataloader declaration\n", 265 | "mnist_test = datasets.MNIST(root=DATA_FOLDER, train=False, download=True, transform=transforms.ToTensor())\n", 266 | "test_loader = torch.utils.data.DataLoader(mnist_test, batch_size=1, shuffle=False)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "### Before the Attack\n", 274 | "\n", 275 | "Before carrying out the attack, let's see how well the model classifies the digits in the test set" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "from sklearn.metrics import accuracy_score\n", 285 | "from tqdm.notebook import tqdm\n", 286 | "\n", 287 | "y_preds, y_true = list(), list()\n", 288 | "with torch.no_grad():  # disable gradient tracking for inference (eval mode alone does not do this)\n", 289 | "    for (image, target) in tqdm(test_loader):\n", 290 | "        image, target = image.to(device), target.to(device)\n", 291 | "        out = model(image)\n", 292 | "        _, preds = torch.max(out, 1)\n", 293 | "        y_preds.append(preds.detach().cpu().numpy())\n", 294 | "        y_true.append(target.detach().cpu().numpy())\n", 295 | "    y_preds = np.hstack(y_preds)\n", 296 | "    y_true = np.hstack(y_true)\n", 297 | "    \n", 298 | "    print(f\"Pre-Trained Model ACC: {accuracy_score(y_true, y_preds)}\")" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "### `FGSM` Attack" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "Now, we can define the function that creates the adversarial examples by\n", 313 | "perturbing the original inputs. 
" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "# FGSM attack code\n", 323 | "def fgsm_attack(image: torch.Tensor, epsilon: float, data_gradient: torch.Tensor) -> torch.Tensor:\n", 324 | "    # Collect the element-wise sign of the data gradient\n", 325 | "    sign_data_grad = data_gradient.sign()\n", 326 | "    # Create the perturbed image by adjusting each pixel of the input image\n", 327 | "    perturbed_image = image + (epsilon * sign_data_grad)\n", 328 | "    # Adding clipping to maintain [0,1] range\n", 329 | "    perturbed_image = torch.clamp(perturbed_image, 0, 1)  # normalise in [0, 1] to make it an actual image\n", 330 | "    # Return the perturbed image\n", 331 | "    return perturbed_image" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "Last but not least: the **test function**" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "def test(model, device, loader, epsilon):\n", 348 | "    # from https://github.com/pytorch/tutorials/blob/master/beginner_source/fgsm_tutorial.py\n", 349 | "    \n", 350 | "    # Accuracy counter\n", 351 | "    correct = 0\n", 352 | "    adv_examples = []\n", 353 | "\n", 354 | "    # Loop over all examples in the test set (use the `loader` parameter, not the global `test_loader`)\n", 355 | "    for data, target in tqdm(loader, desc=f\"Running Attack on Batches with ε={epsilon}\"):\n", 356 | "\n", 357 | "        # Send the data and label to the device\n", 358 | "        data, target = data.to(device), target.to(device)\n", 359 | "\n", 360 | "        # Set requires_grad attribute of tensor. Important for Attack\n", 361 | "        data.requires_grad = True\n", 362 | "\n", 363 | "        # Forward pass the data through the model\n", 364 | "        output = model(data)\n", 365 | "        init_pred = output.max(1, keepdim=True)[1]  # get the index of the max log-probability\n", 366 | "\n", 367 | "        # If the initial prediction is wrong, don't bother attacking, just move on\n", 368 | "        if init_pred.item() != target.item():\n", 369 | "            continue\n", 370 | "\n", 371 | "        # Calculate the loss\n", 372 | "        loss = F.nll_loss(output, target)\n", 373 | "\n", 374 | "        # Zero all existing gradients\n", 375 | "        model.zero_grad()\n", 376 | "\n", 377 | "        # Calculate gradients of model in backward pass\n", 378 | "        loss.backward()\n", 379 | "\n", 380 | "        # Collect datagrad\n", 381 | "        data_grad = data.grad.data\n", 382 | "\n", 383 | "        # Call FGSM Attack\n", 384 | "        perturbed_data = fgsm_attack(data, epsilon, data_grad)\n", 385 | "\n", 386 | "        # Re-classify the perturbed image\n", 387 | "        output = model(perturbed_data)\n", 388 | "\n", 389 | "        # Check for success\n", 390 | "        final_pred = output.max(1, keepdim=True)[1]  # get the index of the max log-probability\n", 391 | "        if final_pred.item() == target.item():\n", 392 | "            correct += 1\n", 393 | "        else:\n", 394 | "            # Save some adv examples for visualization later\n", 395 | "            if len(adv_examples) < 5:\n", 396 | "                adv_ex = perturbed_data.squeeze().detach().cpu().numpy()\n", 397 | "                adv_examples.append((init_pred.item(), final_pred.item(), adv_ex))\n", 398 | "\n", 399 | "    # Calculate final accuracy for this epsilon\n", 400 | "    final_acc = correct / float(len(loader))\n", 401 | "    print(\n", 402 | "        \"Epsilon: {}\\tTest Accuracy = {} / {} = {}\".format(\n", 403 | "            epsilon, correct, len(loader), final_acc\n", 404 | "        )\n", 405 | "    )\n", 406 | "\n", 407 | "    # Return the accuracy and an adversarial example\n", 408 | "    return 
final_acc, adv_examples" 409 | ] 410 | }, 411 | { 412 | "cell_type": "markdown", 413 | "metadata": {}, 414 | "source": [ 415 | "### Run the Attack" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "ε = 0.05" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "acc, adv_examples = test(model, device, test_loader, ε)" 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "Now let's see what the perturbed images look like: " 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "fig, axes = plt.subplots(1, len(adv_examples), figsize=(8, 10))\n", 450 | "plt.xticks([], [])\n", 451 | "plt.yticks([], [])\n", 452 | "for j, (orig_pred, adv_pred, adv_example) in enumerate(adv_examples):\n", 453 | "    if j == 0:\n", 454 | "        axes[j].set_ylabel(f\"ε: {ε}\", fontsize=14)\n", 455 | "    axes[j].set_title(\"{} -> {}\".format(orig_pred, adv_pred))\n", 456 | "    axes[j].imshow(adv_example, cmap=\"gray\")\n", 457 | "plt.tight_layout()\n", 458 | "plt.show()" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": {}, 464 | "source": [ 465 | "### Exercise:\n", 466 | "\n", 467 | "Now the question is: how much does performance degrade as we keep increasing the value of ε?\n", 468 | "\n", 469 | "What we should expect: \n", 470 | "- the bigger ε, the worse the accuracy\n", 471 | "- the bigger ε, the more *perceptible* the perturbation becomes\n", 472 | "    - so that it becomes evident that an attack has been launched" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "EPSILONS = [0.05, .06, .1, .15, .2, .25, .3]\n", 482 | "\n", 483 | "accuracies = [acc]\n", 484 | "adv_examples_map = {0.05: adv_examples}\n", 485 | "\n", 486 | "# Run test for each epsilon\n", 487 | "for ε in EPSILONS[1:]:\n", 488 | "    acc, adv_examples = test(model, device, test_loader, ε)\n", 489 | "    accuracies.append(acc)\n", 490 | "    adv_examples_map[ε] = adv_examples" 491 | ] 492 | }, 493 | { 494 | "cell_type": "markdown", 495 | "metadata": {}, 496 | "source": [ 497 | "### Results\n", 498 | "\n", 499 | "1. Let's print the accuracy values for each corresponding ε value" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "accuracies" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "plt.figure(figsize=(5, 5))\n", 518 | "\n", 519 | "# your code here: plot Accuracies vs EPSILONS\n", 520 | "plt.plot(EPSILONS, accuracies)\n", 521 | "plt.yticks(np.arange(0, 1.1, step=0.1))\n", 522 | "plt.xticks(np.arange(0, 0.35, step=0.05))\n", 523 | "plt.title(\"Accuracy vs Epsilon\")\n", 524 | "plt.xlabel(\"Epsilon\")\n", 525 | "plt.ylabel(\"Accuracy\")\n", 526 | "plt.show()" 527 | ] 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "metadata": {}, 532 | "source": [ 533 | "2. 
Visualise Generated Adversarial Examples" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": null, 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [ 542 | "# Plot several examples of adversarial samples at each epsilon\n", 543 | "\n", 544 | "for ε in EPSILONS:\n", 545 | " fig, axes = plt.subplots(1, len(adv_examples_map[ε]), figsize=(8, 10))\n", 546 | " plt.xticks([], [])\n", 547 | " plt.yticks([], [])\n", 548 | " for j, (orig_pred, adv_pred, adv_example) in enumerate(adv_examples_map[ε]):\n", 549 | " if j == 0:\n", 550 | " axes[j].set_ylabel(f\"ε: {ε}\", fontsize=14)\n", 551 | " axes[j].set_title(\"{} -> {}\".format(orig_pred, adv_pred))\n", 552 | " axes[j].imshow(adv_example, cmap=\"gray\")\n", 553 | " plt.tight_layout()\n", 554 | " plt.show()" 555 | ] 556 | } 557 | ], 558 | "metadata": { 559 | "kernelspec": { 560 | "display_name": "Python 3 (ipykernel)", 561 | "language": "python", 562 | "name": "python3" 563 | }, 564 | "language_info": { 565 | "codemirror_mode": { 566 | "name": "ipython", 567 | "version": 3 568 | }, 569 | "file_extension": ".py", 570 | "mimetype": "text/x-python", 571 | "name": "python", 572 | "nbconvert_exporter": "python", 573 | "pygments_lexer": "ipython3", 574 | "version": "3.12.3" 575 | } 576 | }, 577 | "nbformat": 4, 578 | "nbformat_minor": 4 579 | } 580 | -------------------------------------------------------------------------------- /1-data-anonimisation/3-k-anonimity.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Optional: setup NoTexBook theme\n", 10 | "%load_ext notexbook\n", 11 | "%texify" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "**Adapted from**: [Ch2](https://github.com/uvm-plaid/programming-dp/blob/master/notebooks/ch2.ipynb)" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "# $k$-Anonymity\n", 26 | "\n", 27 | "$k$-Anonymity[2](#fn2) is a *formal privacy definition*. \n", 28 | "\n", 29 | "The definition of $k$-Anonymity is designed to formalize our intuition that a piece of auxiliary information should not narrow down the set of possible records for an individual \"too much.\" \n", 30 | "\n", 31 | "In other terms, $k$-Anonymity is designed to ensure that each individual can _blend into the crowd._\n", 32 | "\n", 33 | "Informally, we say that a dataset is \"$k$-Anonymized\" for a particular $k$ if each individual in the dataset is a member of a group of size at least $k$, such that each member of the group shares the same *quasi-identifiers* (a selected subset of all the dataset's columns) with all other members of the group. \n", 34 | "\n", 35 | "Therefore, the individuals in each group \"blend into\" their group - it's possible to narrow down an individual to membership in a particular group, but not to determine which group member is the target.\n", 36 | "\n", 37 | "> **Definition** (more formally) A dataset $D$ satisfies $k$-Anonymity for a value of $k$ if:\n", 38 | "> \n", 39 | "> - For each row $r_1 \\in D$, there exist at least $k-1$ other rows $r_2 \\dots r_k \\in D$ such that \n", 40 | "> $\\Pi_{qi(D)} r_1 = \\Pi_{qi(D)} r_2, \\ldots, = \\Pi_{qi(D)} r_k$\n", 41 | ">\n", 42 | "> where $qi(D)$ is the quasi-identifiers of $D$, and $\\Pi_{qi(D)} r$ represents the columns of $r$ containing quasi-identifiers (i.e. 
the projection of the quasi-identifiers).\n", 43 | "\n", 44 | "**[2]**: Sweeney, L: _k-ANONYMITY: A MODEL FOR PROTECTING PRIVACY_ on International Journal of Uncertainty, Fuzziness and Knowledge-Based Systems, 2002 [link](https://doi.org/10.1142/S0218488502001648)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | ">**Learning Objectives**\n", 52 | "After reading this chapter, you will understand:\n", 53 | "> - The definition of $k$-Anonymity\n", 54 | "> - How to check for $k$-Anonymity\n", 55 | "> - How to generalize data to enforce $k$-Anonymity\n", 56 | "> - The limitations of $k$-Anonymity" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "## Checking for $k$-Anonymity" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "We'll start with a small dataset, so that we can immediately see by looking at the data whether it satisfies $k$-Anonymity or not. \n", 71 | "\n", 72 | "This dataset contains age plus two test scores; it clearly doesn't satisfy $k$-Anonymity for $k > 1$. \n", 73 | "\n", 74 | "Any dataset trivially satisfies $k$-Anonymity for $k = 1$, since each row can form its own group of size 1." 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "tags": [ 82 | "remove-cell" 83 | ] 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "import pandas as pd\n", 88 | "import numpy as np\n", 89 | "import matplotlib.pyplot as plt\n", 90 | "plt.style.use('seaborn-v0_8-whitegrid')" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": { 97 | "tags": [ 98 | "hide-input" 99 | ] 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "example_data = {\n", 104 | "    'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], \n", 105 | "    'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'], \n", 106 | "    'age': [42, 52, 36, 24, 73], \n", 107 | "    'preTestScore': [4, 24, 31, 2, 3],\n", 108 | "    'postTestScore': [25, 94, 57, 62, 70]}\n", 109 | "df = pd.DataFrame(example_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])\n", 110 | "df" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "To implement a function to check whether a dataframe satisfies $k$-Anonymity, we group the rows by their values for the quasi-identifiers. \n", 118 | "\n", 119 | "Each resulting group then contains exactly the rows that share the same quasi-identifier values. \n", 120 | "\n", 121 | "If the number of rows in any group is less than $k$, the dataframe **does not** satisfy $k$-Anonymity for that value of $k$, and we return `False`. \n", 122 | "\n", 123 | "Note that in this simple definition, we consider *all* columns to contain quasi-identifiers; to limit our check to a subset of all columns, we would need to replace the `df.columns` expression with something else (we will sketch this in a moment)." 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "def is_k_anonymised(df, k):\n", 133 | "    for _, match in df.groupby(df.columns.tolist()).groups.items():\n", 134 | "        if len(match) < k:\n", 135 | "            return False\n", 136 | "    return True" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "As expected, our example dataframe does *not* satisfy $k$-Anonymity for $k = 2$, but it does satisfy the property for $k=1$."
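] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here is that sketch (not in the original notebook; the function name is illustrative): restricting the check to a chosen subset of quasi-identifiers just means grouping by an explicit list of columns instead of `df.columns`." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sketch: check k-Anonymity over an explicit list of quasi-identifier columns\n", "def is_k_anonymised_on(df, quasi_identifiers, k):\n", "    for _, match in df.groupby(quasi_identifiers).groups.items():\n", "        if len(match) < k:\n", "            return False\n", "    return True\n", "\n", "# e.g. treating only 'age' as a quasi-identifier\n", "is_k_anonymised_on(df, ['age'], 2)"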
144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "is_k_anonymised(df, 1)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "is_k_anonymised(df, 2)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "## Generalizing Data to Satisfy $k$-Anonymity" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "The process of modifying a dataset so that it satisfies $k$-Anonymity for a desired $k$ is generally accomplished by *generalizing* the data, that is \"modifying values to be less specific, and therefore more likely to match the values of other individuals in the dataset\". \n", 176 | "\n", 177 | "For example, an `age` which is accurate to a year may be generalized by rounding to the nearest `10` years, or a `ZIP` code might have its rightmost digits replaced by zeros. \n", 178 | "\n", 179 | "For numeric values, this is easy to implement. \n", 180 | "\n", 181 | "We'll use the `apply` method of dataframes, and pass in a dictionary named `depths` which specifies how many digits to replace by `zeros` for each column. \n", 182 | "\n", 183 | "This gives us the flexibility to experiment with **different levels of generalization** for different columns." 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "def generalise(df, depths):\n", 193 | " return df.apply(lambda x: x.apply(lambda y: int(int(y/(10**depths[x.name]))*(10**depths[x.name]))))" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "Now, we can generalize our example dataframe. \n", 201 | "\n", 202 | "First, we'll try generalizing each column by one \"level\" - i.e. rounding to the nearest `10`." 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "# original dataframe as reference\n", 212 | "df" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "For simplicity, let's focus only on numerical fields (i.e. get rid of names):" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "df = df[[\"age\", \"preTestScore\", \"postTestScore\"]]" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "depths = {\n", 238 | " 'age': 1,\n", 239 | " 'preTestScore': 1,\n", 240 | " 'postTestScore': 1\n", 241 | "}\n", 242 | "df2 = generalise(df, depths)\n", 243 | "df2" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "Notice that even after generalization, our example data *still* does not satisfy $k$-Anonymity for $k=2$." 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "is_k_anonymised(df2, 2)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "We can try generalizing more - but then we'll end up removing *all* of the data!" 
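] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To see why (a quick sketch, not in the original notebook): `generalise` truncates each value to a multiple of $10^{depth}$, so at depth `2` every value below `100` collapses to `0`." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sketch: what the generalisation does to a single value at different depths\n", "age = 42\n", "print(int(int(age / 10**1) * 10**1))  # depth=1: truncate to a multiple of 10  -> 40\n", "print(int(int(age / 10**2) * 10**2))  # depth=2: truncate to a multiple of 100 -> 0"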
267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "depths = {\n", 276 | "    'age': 2,\n", 277 | "    'preTestScore': 2,\n", 278 | "    'postTestScore': 2\n", 279 | "}\n", 280 | "generalise(df, depths)" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "This example illustrates one of the key challenges of achieving $k$-Anonymity:\n", 288 | "\n", 289 | ">**Challenge**:\n", 290 | ">\n", 291 | "> Achieving $k$-Anonymity for meaningful values of $k$ often requires removing quite a lot of information from the data\n" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "## Does More Data Improve Generalization?" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "Our example dataset is too small for $k$-Anonymity to work well. \n", 306 | "\n", 307 | "Because there are only `5` individuals in the dataset, building groups of `2` or more individuals who share the same properties is difficult. \n", 308 | "\n", 309 | "The solution to this problem is more data: in a dataset with more individuals, less generalization will typically be needed to satisfy $k$-Anonymity for a desired $k$.\n", 310 | "\n", 311 | "Let's try the same census data we examined for de-identification. \n", 312 | "\n", 313 | "This dataset contains more than `32,000` rows, so it should be easier to achieve $k$-Anonymity." 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": { 320 | "tags": [ 321 | "remove-cell" 322 | ] 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "DATASET_URL = \"https://raw.githubusercontent.com/uvm-plaid/programming-dp/master/notebooks/adult_with_pii.csv\"\n", 327 | "adult_data = pd.read_csv(DATASET_URL)\n", 328 | "adult_data.head()" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "We'll consider (again) the `age` and `educational achievement` of each individual to be the **quasi-identifiers**. \n", 336 | "\n", 337 | "We'll project just those columns, and try to achieve $k$-Anonymity for $k=2$. \n", 338 | "\n", 339 | "The data is already $k$-Anonymous for $k=1$.\n", 340 | "\n", 341 | "For $k=2$, our algorithm finds a failing row quickly and finishes fast." 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "projection_age_edu = adult_data[['Age', 'Education-Num']]\n", 351 | "projection_age_edu.columns = ['age', 'edu']" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "is_k_anonymised(projection_age_edu, k=2)" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "Now, we'll try to generalize to achieve $k$-Anonymity for $k=2$. \n", 368 | "\n", 369 | "We'll start with generalizing both age and educational attainment to the nearest `10`, and we'll consider only the first `1,000` entries."
370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "# outliers are a real problem!\n", 379 | "depths = {\n", 380 | "    'age': 1,\n", 381 | "    'edu': 1\n", 382 | "}\n", 383 | "generalised_projection = generalise(projection_age_edu.head(1000), depths)\n", 384 | "is_k_anonymised(generalised_projection, 2)" 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": [ 391 | "The generalized result still does not satisfy $k$-Anonymity for $k=2$! \n", 392 | "\n", 393 | "The reason is that the dataset contains *outliers* - individuals who are very different from the rest of the population. \n", 394 | "\n", 395 | "These individuals do not fit easily into any group, even after generalization. \n", 396 | "\n", 397 | "Even considering *only* ages, we can see the presence of outliers:" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": { 404 | "tags": [ 405 | "hide-input" 406 | ] 407 | }, 408 | "outputs": [], 409 | "source": [ 410 | "generalised_projection['age'].hist();" 411 | ] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "metadata": {}, 416 | "source": [ 417 | "Achieving the optimal generalization for $k$-Anonymity is very challenging in cases like this. " 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "metadata": {}, 423 | "source": [ 424 | "## Removing Outliers" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": {}, 430 | "source": [ 431 | "Generalizing each row *more* would be overkill for the well-represented individuals with ages in the 20-40 range, and would hurt utility. \n", 432 | "\n", 433 | "However, more generalization is clearly needed for individuals at the upper and lower ends of the age range. \n", 434 | "\n", 435 | "This is the kind of challenge that occurs regularly in practice, and is difficult to solve automatically. \n", 436 | "\n", 437 | "In fact, **optimal generalization** for $k$-Anonymity has been shown to be NP-hard.\n", 438 | "\n", 439 | "> **Challenge**:\n", 440 | ">\n", 441 | ">Outliers make achieving $k$-Anonymity very challenging, even for large datasets. Optimal generalization for $k$-Anonymity is NP-hard." 442 | ] 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "metadata": {}, 447 | "source": [ 448 | "One solution to this problem is simply to **clip** the age of each individual in the dataset to lie within a specific range, eliminating outliers entirely.\n", 449 | "\n", 450 | "This can also hurt utility, since it replaces real ages with fake ones, but it can be better than generalizing each row more.\n", 451 | "\n", 452 | "We can use Pandas' `clip` method to perform this clipping. We clip ages to be `60` or below, and leave educational levels alone (by clipping them to a very large value)."
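] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As an aside (a sketch, not in the original notebook; the variable name is illustrative): the same clipping can also be written per column, which avoids the sentinel \"very large value\" for the education column." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sketch: clip only the 'age' column, leaving 'edu' untouched\n", "projection_clipped_alt = projection_age_edu.assign(age=projection_age_edu['age'].clip(upper=60))\n", "projection_clipped_alt['age'].max()"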
453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": null, 458 | "metadata": {}, 459 | "outputs": [], 460 | "source": [ 461 | "# clipping away outliers\n", 462 | "depths = {\n", 463 | "    'age': 1,\n", 464 | "    'edu': 1\n", 465 | "}\n", 466 | "projection_clipped = projection_age_edu.clip(upper=np.array([60, 10000000000000]), axis='columns')\n", 467 | "generalised_projection_clipped = generalise(projection_clipped.head(500), depths)\n", 468 | "is_k_anonymised(generalised_projection_clipped, 7)" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": null, 474 | "metadata": {}, 475 | "outputs": [], 476 | "source": [ 477 | "is_k_anonymised(generalised_projection_clipped, 2)" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": {}, 483 | "source": [ 484 | "## What if we add in the whole dataset?\n", 485 | "\n", 486 | "We can perform this generalization on all `32,000` rows:" 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": null, 492 | "metadata": {}, 493 | "outputs": [], 494 | "source": [ 495 | "is_k_anonymised(projection_age_edu, k=2)" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": null, 501 | "metadata": {}, 502 | "outputs": [], 503 | "source": [ 504 | "depths = {\n", 505 | "    'age': 1,\n", 506 | "    'edu': 1\n", 507 | "}\n", 508 | "generalised_proj_whole_dataset = generalise(projection_age_edu, depths)\n", 509 | "is_k_anonymised(generalised_proj_whole_dataset, k=2)" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": {}, 516 | "outputs": [], 517 | "source": [ 518 | "# the smallest group after generalisation\n", 519 | "min(map(len, generalised_projection_clipped.groupby(generalised_projection_clipped.columns.tolist()).groups.values()))" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [ 528 | "min(map(len, generalised_proj_whole_dataset.groupby(generalised_proj_whole_dataset.columns.tolist()).groups.values()))" 529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "metadata": {}, 534 | "source": [ 535 | "So in this case, adding more data enlarges the groups: the generalised full dataset satisfies $k$-Anonymity for values of $k$ up to $21$!" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": null, 541 | "metadata": {}, 542 | "outputs": [], 543 | "source": [ 544 | "is_k_anonymised(generalised_proj_whole_dataset, k=22)" 545 | ] 546 | }, 547 | { 548 | "cell_type": "markdown", 549 | "metadata": {}, 550 | "source": [ 551 | "## Summary" 552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "metadata": {}, 557 | "source": [ 558 | "- $k$-Anonymity is a property of data, which ensures that each individual \"blends in\" with a group of at least $k$ individuals.\n", 559 | "- $k$-Anonymity is computationally expensive even to check: the naive algorithm is $O(n^2)$, and faster algorithms take considerable space.\n", 560 | "- $k$-Anonymity can be achieved by modifying a dataset by *generalizing* it, so that particular values become more common and groups are easier to form.\n", 561 | "- Optimal generalization is extremely difficult, and outliers can make it even more challenging. Solving this problem automatically is NP-hard."
562 | ] 563 | }, 564 | { 565 | "cell_type": "markdown", 566 | "metadata": {}, 567 | "source": [ 568 | "## Further Reading" 569 | ] 570 | }, 571 | { 572 | "cell_type": "markdown", 573 | "metadata": {}, 574 | "source": [ 575 | "- [Data Anonymization: Perspectives from a Former Skeptic](https://towardsdatascience.com/data-anonymization-perspectives-from-a-former-skeptic-f35790a2042a)\n", 576 | "- `t-closeness` (**Beyond K-Anonymity**) [Paper](https://www.cs.purdue.edu/homes/ninghui/papers/t_closeness_icde07.pdf)\n", 577 | "- _Anonymising and Sharing Individual Patient Data_ [Paper](https://www.bmj.com/content/bmj/350/bmj.h1139.full.pdf?casa_token=NwqT3F-i9xkAAAAA:U_T2t8ZaB1xWBgDOH7QbgQAuwMXJ6FehY07q_C0AztDejEDxp08awbjyWeOlMLOl14lV-W0z1OVjmw)" 578 | ] 579 | } 580 | ], 581 | "metadata": { 582 | "kernelspec": { 583 | "display_name": "Python 3 (ipykernel)", 584 | "language": "python", 585 | "name": "python3" 586 | }, 587 | "language_info": { 588 | "codemirror_mode": { 589 | "name": "ipython", 590 | "version": 3 591 | }, 592 | "file_extension": ".py", 593 | "mimetype": "text/x-python", 594 | "name": "python", 595 | "nbconvert_exporter": "python", 596 | "pygments_lexer": "ipython3", 597 | "version": "3.12.3" 598 | } 599 | }, 600 | "nbformat": 4, 601 | "nbformat_minor": 4 602 | } 603 | -------------------------------------------------------------------------------- /1-data-anonimisation/2-de-identification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Optional: setup NoTexBook theme\n", 10 | "%load_ext notexbook\n", 11 | "%texify" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "**Adapted from**: [Ch1](https://github.com/uvm-plaid/programming-dp/blob/master/notebooks/ch1.ipynb)" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "tags": [ 25 | "remove-cell" 26 | ] 27 | }, 28 | "source": [ 29 | "# De-Identification\n", 30 | "\n", 31 | "### Dataset\n", 32 | "\n", 33 | "The dataset is based on census data. The personally identifiable information (**PII**) is made up." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import pandas as pd\n", 43 | "import numpy as np\n", 44 | "import matplotlib.pyplot as plt" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "DATASET_URL = \"https://raw.githubusercontent.com/uvm-plaid/programming-dp/master/notebooks/adult_with_pii.csv\"" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "tags": [ 61 | "remove-cell" 62 | ] 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "adult = pd.read_csv(DATASET_URL)\n", 67 | "adult.head()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "# De-identification" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "*De-identification* is the process of removing *identifying information* from a dataset. 
The term *de-identification* is sometimes used as a synonym for other terms like *anonymization* or *pseudonymization*.\n", 82 | "\n", 83 | "> **Learning Objectives**\n", 84 | "> - Define the following concepts:\n", 85 | ">     - De-identification & Re-identification\n", 86 | ">     - Identifying information / personally identifying information\n", 87 | "> - Learn Examples of (Data) Attacks\n", 88 | ">     - Linking & Differencing Attacks\n", 89 | "> - Understand limitations of aggregate statistics" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "### Personally Identifiable Information\n", 97 | "\n", 98 | "Identifying information has no formal definition. It is usually understood to be information which would be used to identify us uniquely in the course of daily life - name, address, phone number, e-mail address, etc. \n", 99 | "\n", 100 | "As we will see later, it's *impossible* to formalize the concept of identifying information, because *all* information is identifying. \n", 101 | "\n", 102 | "The term **personally identifiable information** (`PII`) is often used as a synonym for \"identifying information\"." 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "**Q**: How should we **de-identify** information? \n", 110 | "\n", 111 | "**A**: Easy - we just remove the columns that contain identifying information!" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "adult_de_identified = adult.copy().drop(columns=['Name', 'SSN'])\n", 121 | "adult_de_identified.head(1)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "> 💡 We'll save some of the identifying information for later, when we'll use it as *auxiliary data* to perform a *re-identification* attack." 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "Selected **PII**s in the dataset:" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "adult_pii = adult[['Name', 'SSN', 'DOB', 'Zip']]\n", 145 | "adult_pii.head()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "## Linking Attacks\n", 153 | "\n", 154 | "Imagine we want to determine the income of a friend from our de-identified data. \n", 155 | "\n", 156 | "Names have been removed, but we happen to know some _auxiliary information_ about our friend. \n", 157 | "\n", 158 | "Our friend's name is **Karrie Trusslove**, and we know Karrie's date of birth and zip code." 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "To perform a simple **linking attack**, we look at the _overlapping columns between the dataset_ we're trying to attack, and the auxiliary data we know.\n", 166 | "\n", 167 | "In this case, both datasets have dates of birth and zip codes.\n", 168 | "\n", 169 | "We look for rows in the dataset we're attacking with `dates of birth` and `zip codes` that match Karrie's `date of birth` and `zip code`.\n", 170 | "\n", 171 | "If there is **only one** such row, we've found Karrie's row in the dataset we're attacking. \n", 172 | "\n", 173 | "In databases, this is called a **JOIN** of two tables, and we can do it in Pandas using `merge`."
174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "karries_row = adult_pii[adult_pii['Name'] == 'Karrie Trusslove']\n", 183 | "karries_row" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "auxiliary_info = karries_row[[\"DOB\", \"Zip\"]]\n", 193 | "auxiliary_info.head()" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "\n", 203 | "pd.merge(auxiliary_info, adult_de_identified, left_on=['DOB', 'Zip'], right_on=['DOB', 'Zip'])" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "> 💡 Indeed, there is only one row that matches.\n", 211 | "> We have used **auxiliary data** to re-identify an individual in a de-identified dataset, and we're able to infer that Karrie's income is less than `$50k`." 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "### How Hard is it to Re-Identify Karrie?" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "This scenario is made up, but linking attacks are surprisingly easy to perform in practice.\n", 226 | "\n", 227 | "How easy? It turns out that in many cases, just one data point is sufficient to pinpoint a row!" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "pd.merge(auxiliary_info, adult_de_identified, left_on=['Zip'], right_on=['Zip'])" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "So ZIP code is sufficient **by itself** to allow us to re-identify Karrie. What about date of birth?" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "pd.merge(auxiliary_info, adult_de_identified, left_on=['DOB'], right_on=['DOB'])" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "This time, there are three rows returned - and we don't know which one is the real Karrie. \n", 260 | "\n", 261 | "**But we've still learned a lot about our dataset!**\n", 262 | "\n", 263 | "- We know that there's a 2/3 chance that Karrie's income is less than $50k\n", 264 | "- We can look at the differences between the rows to determine what additional auxiliary information would *help* us to distinguish them (e.g. sex, occupation, marital status)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "#### Is Karrie Special?" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "How hard is it to re-identify others in the dataset? \n", 279 | "\n", 280 | "Is Karrie especially easy or especially difficult to re-identify? 
\n", 281 | "\n", 282 | "A good way to understand the effectiveness of this type of attack is to look at how **selective** certain pieces of data are.\n", 283 | "\n", 284 | "In other words, how good they are at narrowing down the set of potential rows which may belong to the target individual.\n", 285 | "\n", 286 | "For example, is it common for `birth dates` to occur more than once?\n", 287 | "\n", 288 | "We'd like to get an idea of how many dates of birth are likely to be useful in performing an attack, which we can do by looking at how common \"unique\" dates of birth are in the dataset." 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": { 295 | "tags": [ 296 | "hide-input" 297 | ] 298 | }, 299 | "outputs": [], 300 | "source": [ 301 | "adult_pii['DOB'].value_counts().hist()\n", 302 | "\n", 303 | "plt.title(\"How selective is date of birth as PII?\")\n", 304 | "plt.xlabel('Number of Occurrences')\n", 305 | "plt.ylabel('Number of Dates of Birth');" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "The histogram above shows that *the vast majority* of dates of birth occur 1, 2, or 3 times in the dataset, and *no date of birth* occurs more than 8 times. \n", 313 | "\n", 314 | "This means that date of birth is **fairly selective** - it's effective in narrowing down the possible records for an individual." 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "#### Quick Exercise:\n", 322 | "\n", 323 | "Let's try to repeat the experiment with `ZIP` codes:" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "# your code here\n" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "
\n", 340 | "\n", 341 | "**Solution**\n", 342 | "\n", 343 | "```python\n", 344 | "adult_pii['Zip'].value_counts().hist()\n", 345 | "\n", 346 | "plt.title(\"How selective is ZIP code as PII?\")\n", 347 | "plt.xlabel('Number of Occurrences')\n", 348 | "plt.ylabel('Number of ZIP Codes');\n", 349 | "```\n", 350 | "
" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "
\n", 358 | "\n", 359 | "**Considerations**\n", 360 | "\n", 361 | "The results when using ZIP codes happen to be even worse: ZIP code is *very* selective in this dataset. \n", 362 | "\n", 363 | "Nearly all the ZIP codes occur only once."
 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "### How Many People can we Re-Identify?" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "**Q**: In this dataset, how many people can we re-identify uniquely?\n", 378 | "\n", 379 | "We can use our auxiliary information to find out!\n", 380 | "\n", 381 | "First, let's see what happens with just `dates of birth`.\n", 382 | "\n", 383 | "We want to know how many *possible identities* are returned for each data record in the dataset." 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": { 390 | "scrolled": true, 391 | "tags": [ 392 | "hide-input" 393 | ] 394 | }, 395 | "outputs": [], 396 | "source": [ 397 | "attack = pd.merge(adult_pii, adult_de_identified, left_on=['DOB'], right_on=['DOB'])\n", 398 | "attack['Name'].value_counts().hist();\n", 399 | "\n", 400 | "plt.title(\"How many records can we identify with DoB?\")\n", 401 | "plt.xlabel(\"Number of matching records\")\n", 402 | "plt.ylabel(\"Number of identities\")\n", 403 | "plt.show()" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": {}, 409 | "source": [ 410 | "\n", 411 | "The above histogram shows the **number of records with each number of possible identities**.\n", 412 | "\n", 413 | "The results show that we can uniquely identify almost `7,000` of the data records (out of about `32,000`), and an additional `10,000` data records are narrowed down to **only two** possible identities." 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "So it's not possible to re-identify a majority of individuals using *just* date of birth. \n", 421 | "\n", 422 | "What if we collect more information, to narrow things down further? \n", 423 | "\n", 424 | "If we use **both** `date of birth` and `ZIP`, we're able to do much better. \n", 425 | "\n", 426 | "In fact, we're able to uniquely re-identify basically the whole dataset." 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": { 433 | "scrolled": true, 434 | "tags": [ 435 | "hide-input" 436 | ] 437 | }, 438 | "outputs": [], 439 | "source": [ 440 | "attack = pd.merge(adult_pii, adult_de_identified, left_on=['DOB', 'Zip'], right_on=['DOB', 'Zip'])\n", 441 | "\n", 442 | "attack['Name'].value_counts().hist();\n", 443 | "plt.title(\"How many records can we identify with DoB & ZIP?\")\n", 444 | "plt.xlabel(\"Number of matching records\")\n", 445 | "plt.ylabel(\"Number of identities\")\n", 446 | "plt.show()" 447 | ] 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "metadata": {}, 452 | "source": [ 453 | "> 💡 When we use both pieces of information, we can re-identify **essentially everyone**. 
\n", 455 | "\n", 456 | "This is a surprising result, since we generally assume that many people share the same birthday, and many people live in the same ZIP code.\n", 457 | "\n", 458 | "It turns out that the *combination* of these factors is **extremely** selective.\n", 459 | "\n", 460 | "According to Latanya Sweeney's work[1](#fn1), 87% of people in the US can be uniquely re-identified by the combination of date of birth, gender, and ZIP code.\n", 461 | "\n", 462 | "**[1]**: Sweeney, L, _Simple Demographics Often Identify People Uniquely_ [link](https://dataprivacylab.org/projects/identifiability/)" 463 | ] 464 | }, 465 | { 466 | "cell_type": "markdown", 467 | "metadata": {}, 468 | "source": [ 469 | "Let's just check that we've actually re-identified *everyone*, by printing out the number of possible data records for each identity:" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "metadata": { 476 | "tags": [ 477 | "hide-input" 478 | ] 479 | }, 480 | "outputs": [], 481 | "source": [ 482 | "attack['Name'].value_counts(ascending=False).head()" 483 | ] 484 | }, 485 | { 486 | "cell_type": "markdown", 487 | "metadata": {}, 488 | "source": [ 489 | "Looks like we missed two people! In other words, in this dataset, only **two people** share a combination of ZIP code and date of birth." 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "metadata": {}, 495 | "source": [ 496 | "## Aggregation" 497 | ] 498 | }, 499 | { 500 | "cell_type": "markdown", 501 | "metadata": {}, 502 | "source": [ 503 | "Another way to prevent the release of private information is to release only **aggregate** data." 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": null, 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "# Let's take the Age as an example\n", 513 | "adult['Age'].mean()" 514 | ] 515 | }, 516 | { 517 | "cell_type": "markdown", 518 | "metadata": {}, 519 | "source": [ 520 | "### Problem of Small Groups" 521 | ] 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "metadata": {}, 526 | "source": [ 527 | "In many cases, aggregate statistics are broken down into smaller groups. \n", 528 | "\n", 529 | "For example, we might want to know the average age of people with a particular education level." 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": null, 535 | "metadata": {}, 536 | "outputs": [], 537 | "source": [ 538 | "adult[['Education-Num', 'Age']].groupby('Education-Num').mean().head(3)" 539 | ] 540 | }, 541 | { 542 | "cell_type": "markdown", 543 | "metadata": {}, 544 | "source": [ 545 | "Aggregation is supposed to _improve privacy_ because it's hard to identify the contribution of a particular individual to the aggregate statistic. \n", 546 | "\n", 547 | "But what if we aggregate over a group with just *one person* in it? \n", 548 | "\n", 549 | "In that case, the aggregate statistic reveals one person's age *exactly*, and provides no privacy protection at all! \n", 550 | "\n", 551 | "In our dataset, most individuals have a unique `ZIP` code - so if we compute the average age by ZIP code, then most of the \"averages\" actually reveal an individual's exact age."
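] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can quantify that claim with a quick sketch (not in the original notebook): count how many ZIP-code groups contain exactly one person." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sketch: how many ZIP codes belong to exactly one individual?\n", "zip_group_sizes = adult.groupby('Zip').size()\n", "print((zip_group_sizes == 1).sum(), \"of\", len(zip_group_sizes), \"ZIP codes contain a single person\")"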
552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": null, 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [ 560 | "adult[['Zip', 'Age']].groupby('Zip').mean().head()" 561 | ] 562 | }, 563 | { 564 | "cell_type": "markdown", 565 | "metadata": {}, 566 | "source": [ 567 | "The US Census Bureau, for example, releases aggregate statistics at the [*block level*](https://www.census.gov/newsroom/blogs/random-samplings/2011/07/what-are-census-blocks.html). \n", 568 | "\n", 569 | "Some census blocks have large populations, but some have a population of zero! \n", 570 | "\n", 571 | "The situation above, where small groups prevent aggregation from hiding information about individuals, turns out to be quite common.\n", 572 | "\n", 573 | "How big a group is \"big enough\" for aggregate statistics to help? \n", 574 | "\n", 575 | "It's hard to say - it depends on the data and on the attack - so it's challenging to build confidence that aggregate statistics are really privacy-preserving. \n", 576 | "\n", 577 | "However, even very large groups do not make aggregation completely robust against attacks, as we will see next." 578 | ] 579 | }, 580 | { 581 | "cell_type": "markdown", 582 | "metadata": {}, 583 | "source": [ 584 | "### Differencing Attacks" 585 | ] 586 | }, 587 | { 588 | "cell_type": "markdown", 589 | "metadata": {}, 590 | "source": [ 591 | "The problems with aggregation get even worse when you release multiple aggregate statistics over the same data. \n", 592 | "\n", 593 | "For example, consider the following two summation queries over large groups in our dataset (the first over the whole dataset, and the second over all records except one):" 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": null, 599 | "metadata": {}, 600 | "outputs": [], 601 | "source": [ 602 | "adult['Age'].sum()" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": null, 608 | "metadata": {}, 609 | "outputs": [], 610 | "source": [ 611 | "adult[adult['Name'] != 'Karrie Trusslove']['Age'].sum()" 612 | ] 613 | }, 614 | { 615 | "cell_type": "markdown", 616 | "metadata": {}, 617 | "source": [ 618 | "If we know both answers, we can simply take the difference and determine Karrie's age completely! \n", 619 | "\n", 620 | "This kind of attack can proceed even if the aggregate statistics are over *very large groups*." 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": null, 626 | "metadata": { 627 | "scrolled": true 628 | }, 629 | "outputs": [], 630 | "source": [ 631 | "adult['Age'].sum() - adult[adult['Name'] != 'Karrie Trusslove']['Age'].sum()" 632 | ] 633 | }, 634 | { 635 | "cell_type": "markdown", 636 | "metadata": {}, 637 | "source": [ 638 | "#### Take away message\n", 639 | "\n", 640 | "(This is a recurring theme:)\n", 641 | "\n", 642 | "1. Releasing *data* that is useful makes ensuring *privacy* very difficult.\n", 643 | "\n", 644 | "2. Distinguishing between *malicious* and *non-malicious* queries is not possible (in general)." 
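] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Looking ahead to the differential-privacy notebooks (a preview sketch, not part of the original notebook): if each sum is released with Laplace noise calibrated to its sensitivity, the difference no longer pins down Karrie's age. We assume here that ages are bounded above by `125`, so a sum-of-ages query has sensitivity `125`, and we reuse $\\epsilon = 0.1$." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Preview sketch: the same differencing attack against noisy sums (assumed bound: age <= 125)\n", "def noisy_sum(answer, sensitivity=125, epsilon=0.1):\n", "    return answer + np.random.laplace(loc=0, scale=sensitivity / epsilon)\n", "\n", "# the difference is now dominated by noise rather than by Karrie's age\n", "noisy_sum(adult['Age'].sum()) - noisy_sum(adult[adult['Name'] != 'Karrie Trusslove']['Age'].sum())"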
645 | ] 646 | }, 647 | { 648 | "cell_type": "markdown", 649 | "metadata": {}, 650 | "source": [ 651 | "## Summary" 652 | ] 653 | }, 654 | { 655 | "cell_type": "markdown", 656 | "metadata": {}, 657 | "source": [ 658 | "- A *Linking attack* involves combining *auxiliary data* with *de-identified data* to *re-identify* individuals.\n", 659 | "\n", 660 | "- In the simplest case, a linking attack can be performed via a *join* of two tables containing these datasets.\n", 661 | "\n", 662 | "- Simple linking attacks are surprisingly effective:\n", 663 | "    - Just a single data point is sufficient to narrow things down to a few records\n", 664 | "    - The narrowed-down set of records suggests which additional auxiliary data might be useful\n", 665 | "    - Two data points are often good enough to re-identify a huge fraction of the population in a particular dataset\n", 666 | "    - Three data points (gender, ZIP code, date of birth) uniquely identify 87% of people in the US\n", 667 | "\n", 668 | "\n", 669 | "\n", 670 | "- Releasing only **aggregate** statistics is another way to avoid disclosing sensitive information directly.\n", 671 | "\n", 672 | "    - But aggregating over small groups can still leak sensitive data.\n", 673 | "    - A *Differencing attack* recovers information about an individual by combining multiple aggregate statistics (e.g. a sum over the whole dataset and a sum over everyone except the target)." 674 | ] 675 | } 676 | ], 677 | "metadata": { 678 | "kernelspec": { 679 | "display_name": "Python 3 (ipykernel)", 680 | "language": "python", 681 | "name": "python3" 682 | }, 683 | "language_info": { 684 | "codemirror_mode": { 685 | "name": "ipython", 686 | "version": 3 687 | }, 688 | "file_extension": ".py", 689 | "mimetype": "text/x-python", 690 | "name": "python", 691 | "nbconvert_exporter": "python", 692 | "pygments_lexer": "ipython3", 693 | "version": "3.12.3" 694 | } 695 | }, 696 | "nbformat": 4, 697 | "nbformat_minor": 4 698 | } 699 | -------------------------------------------------------------------------------- /3-differential-privacy/3-properties-differential-privacy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "70a9dd8d-ef11-44d7-ac7f-d63171501c7d", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Optional: setup NoTexBook theme\n", 11 | "%load_ext notexbook\n", 12 | "%texify" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "id": "b90ec924", 18 | "metadata": {}, 19 | "source": [ 20 | "**Adapted from**: [Ch4](https://github.com/uvm-plaid/programming-dp/blob/master/notebooks/ch4.ipynb)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "d06e8e58-0d12-482b-8085-ba048e5c6e62", 26 | "metadata": {}, 27 | "source": [ 28 | "# Properties of Differential Privacy" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "8ae6c29f-0bf9-483d-be70-72a8a2088a34", 34 | "metadata": {}, 35 | "source": [ 36 | "In this notebook we will cover three important properties of **differentially private mechanisms** that arise from the definition\n", 37 | " of differential privacy.\n", 38 | "\n", 39 | "We introduce these properties here because they will be used repeatedly \n", 40 | "once we start generalising DP to machine learning algorithms.\n", 41 | "\n", 42 | "\n", 43 | "These three properties are:\n", 44 | "\n", 45 | "1. Sequential composition\n", 46 | "2. Parallel composition\n", 47 | "3. 
Post-processing" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "id": "07b6925b-22f4-4841-8217-e6ef69d0ab88", 53 | "metadata": {}, 54 | "source": [ 55 | "## Sequential Composition\n", 56 | "\n", 57 | "The first major property of differential privacy is *sequential composition*, which **bounds** the total \n", 58 | "privacy cost of releasing multiple results of differentially private mechanisms **on the same input data**. \n", 59 | "\n", 60 | "Formally, the sequential composition theorem for differential privacy says that:\n", 61 | "\n", 62 | "- If $F_1(x)$ satisfies $\epsilon_1$-differential privacy\n", 63 | "- And $F_2(x)$ satisfies $\epsilon_2$-differential privacy\n", 64 | "- Then the mechanism $G(x) = (F_1(x), F_2(x))$ which releases both results satisfies $\epsilon_1+\epsilon_2$-differential privacy\n", 65 | "\n", 66 | "\n", 67 | "Sequential composition is a vital property of differential privacy because it enables the design of algorithms that consult the data more than once. \n", 68 | "\n", 69 | "Sequential composition is also important when multiple separate analyses are performed on a single dataset, since it allows individuals to bound the *total* privacy cost they incur by participating in all of these analyses.\n", 70 | "\n", 71 | "The bound on privacy cost given by sequential composition is an *upper* bound - the actual privacy cost of two particular differentially private releases may be smaller than this, but never larger.\n", 72 | "\n", 73 | "The principle that the $\epsilon$ values \"add up\" makes sense if we examine the distribution of outputs from a mechanism which averages two differentially private results together.\n", 74 | "\n", 75 | "Keep in mind, then, that sequential composition does not give an **exact** figure: the actual total privacy cost can be strictly lower than this upper bound." 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "id": "13c48694-ad91-47b6-9467-0fc10017aea5", 81 | "metadata": {}, 82 | "source": [ 83 | "## Parallel Composition\n", 84 | "\n", 85 | "The second important property of differential privacy is called *parallel composition*. \n", 86 | "\n", 87 | "Parallel composition can be seen as an alternative to sequential composition - a second way to calculate a bound on the total privacy cost of multiple data releases. \n", 88 | "\n", 89 | "Parallel composition is based on the idea of **splitting** your dataset into disjoint chunks and running a \n", 90 | "differentially private mechanism on each chunk separately. \n", 91 | "\n", 92 | "Since the chunks are **disjoint**, each individual's data appears in *exactly* one chunk - so even if there are $k$ chunks in total (and therefore $k$ runs of the mechanism), the mechanism runs exactly once on the data of each *individual*. \n", 93 | "\n", 94 | "Formally,\n", 95 | " - If $F(x)$ satisfies $\epsilon$-differential privacy\n", 96 | " - And we split a dataset $X$ into $k$ disjoint chunks such that $x_1 \cup ... \cup x_k = X$\n", 97 | " - Then the mechanism which releases all of the results $F(x_1), ..., F(x_k)$ satisfies $\epsilon$-differential privacy\n", 98 | "\n", 99 | "Note that this is a much better bound than sequential composition would give. \n", 100 | "\n", 101 | "Since we run $F$ $k$ times, sequential composition would say that this procedure satisfies $k\epsilon$-differential privacy. 
\n", 102 | "\n", 103 | "Parallel composition allows us to say that the total privacy cost is just $\\epsilon$.\n", 104 | "\n", 105 | "The formal definition matches up with our intuition - if each participant in the dataset contributes one row to $X$, then this row will appear in *exactly* one of the chunks $x_1, ..., x_k$. \n", 106 | "\n", 107 | "That means $F$ will only \"see\" this participant's data *one time*, meaning a privacy cost of $\\epsilon$ is appropriate for that individual. Since this property holds for all individuals, the privacy cost is $\\epsilon$ for everyone." 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "id": "67375447", 113 | "metadata": {}, 114 | "source": [ 115 | "## Post-processing\n", 116 | "\n", 117 | "The third property of differential privacy is called *post-processing*. \n", 118 | "\n", 119 | "The idea is simple: it's impossible to **reverse the privacy protection** provided by differential privacy by post-processing the data in some way. \n", 120 | "\n", 121 | "Formally:\n", 122 | "\n", 123 | "- If $F(X)$ satisfies $\\epsilon$-differential privacy\n", 124 | "- Then for any (deterministic or randomized) function $g$, $g(F(X))$ satisfies $\\epsilon$-differential privacy\n", 125 | "\n", 126 | "The post-processing property means that it's always safe to perform arbitrary computations on the output of a differentially private mechanism - there's no danger of reversing the privacy protection the mechanism has provided. \n", 127 | "\n", 128 | "In particular, it's fine to perform post-processing that might reduce the noise or improve the signal in the mechanism's output (e.g. replacing negative results with zeros, for queries that shouldn't return negative results). \n", 129 | "\n", 130 | "The other implication of the **post-processing** property is that differential privacy provides resistance against privacy attacks based on **auxiliary information**. \n", 131 | "\n", 132 | "For example, the function $g$ might contain auxiliary information about elements of the dataset, and attempt to perform a linkage attack using this information. The post-processing property says that such an attack is limited in its effectiveness by the privacy parameter $\\epsilon$, regardless of the auxiliary information contained in $g$." 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "id": "dc67cade-85d7-4494-bb5c-c045b414df84", 138 | "metadata": {}, 139 | "source": [ 140 | "### Histograms\n", 141 | "\n", 142 | "In our context, a *histogram* is an analysis of a dataset which splits the dataset into \"bins\" based on the value of one of the data attributes, and **counts** the number of rows in each bin. \n", 143 | "\n", 144 | "For example, a histogram might count the number of people in the dataset who achieved a particular educational level." 
145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "id": "7b23c875", 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "import pandas as pd\n", 155 | "import numpy as np\n", 156 | "import matplotlib.pyplot as plt\n", 157 | "plt.style.use('seaborn-v0_8-whitegrid')" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "id": "720f3805-ebbf-476b-988f-711cb70ed47c", 164 | "metadata": { 165 | "tags": [ 166 | "hide-input" 167 | ] 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "DATASET_URL = \"https://raw.githubusercontent.com/uvm-plaid/programming-dp/master/notebooks/adult_with_pii.csv\"\n", 172 | "adult = pd.read_csv(DATASET_URL)\n", 173 | "\n", 174 | "adult['Education'].value_counts().to_frame().head(5)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "id": "a15d1d8e-0399-4f78-9f23-aa1f669ae035", 180 | "metadata": {}, 181 | "source": [ 182 | "Histograms are particularly interesting for differential privacy because they automatically satisfy parallel composition. \n", 183 | "\n", 184 | "Each \"bin\" in a histogram is defined by a possible value for a data attribute (for example, `'Education' == 'HS-grad'`). \n", 185 | "\n", 186 | "It's impossible for a single row to have *two* values for an attribute simultaneously, so defining the bins this way *guarantees* that they will be disjoint. \n", 187 | "\n", 188 | "Thus we have satisfied the requirements for parallel composition, and we can use a differentially private mechanism to release *all* \n", 189 | "of the bin counts with a total privacy cost of just $\\epsilon$." 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "id": "55d2c3a3-2cda-4274-9a8d-488e45b8b69e", 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "epsilon = 1\n", 200 | "sensitivity = 1\n", 201 | "\n", 202 | "# This analysis has a total privacy cost of epsilon = 1, even though we release many results!\n", 203 | "f = lambda x: x + np.random.laplace(loc=0, scale=sensitivity/epsilon)\n", 204 | "s = adult['Education'].value_counts().apply(f)\n", 205 | "s.to_frame().head(5)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "id": "705f4831-bb84-4929-b396-4aaed7204490", 211 | "metadata": {}, 212 | "source": [ 213 | "## Optional: Sensitivity and Clipping" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "id": "95ee6158-2850-46e1-8986-1bf96b7ee87e", 219 | "metadata": {}, 220 | "source": [ 221 | "### Sensitivity" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "id": "39346eee-d5bb-4707-b6c6-5ccf9aea56e4", 227 | "metadata": {}, 228 | "source": [ 229 | "When discussing the Laplace mechanism, we mentioned that the amount of **noise** that is necessary to ensure differential privacy for \n", 230 | "a given query depends on the *sensitivity* of the query. \n", 231 | "\n", 232 | "Roughly speaking, the sensitivity of a function reflects the amount the function's output will change when its input changes. 
\n", 233 | "\n", 234 | "Recall that the Laplace mechanism defines a mechanism $F(x)$ as follows:\n", 235 | "\n", 236 | "\\begin{align}\n", 237 | "F(x) = f(x) + \\textsf{Lap}\\left(\\frac{s}{\\epsilon}\\right)\n", 238 | "\\end{align}\n", 239 | "\n", 240 | "where $f(x)$ is a deterministic function (the query), $\\epsilon$ is the privacy parameter, and $s$ is the sensitivity of $f$.\n", 241 | "\n", 242 | "For a function $f : \\mathcal{D} \\rightarrow \\mathbb{R}$ mapping datasets ($\\mathcal{D}$) to real numbers, the *global sensitivity* of $f$ is defined as follows:\n", 243 | "\n", 244 | "\\begin{align}\n", 245 | "GS(f) = \\max_{x, x': d(x,x') <= 1} |f(x) - f(x')|\n", 246 | "\\end{align}\n", 247 | "\n", 248 | "Here, $d(x, x')$ represents the *distance* between two datasets $x$ and $x'$, and we say that two datasets are *neighbors* if their distance is 1 or less. \n", 249 | "\n", 250 | "How this distance is defined has a huge effect on the definition of privacy we obtain.\n", 251 | "\n", 252 | "The definition of global sensitivity says that for *any two* neighboring datasets $x$ and $x'$, the difference between $f(x)$ and $f(x')$ is at most $GS(f)$. \n", 253 | "\n", 254 | "**Global vs Local Sensitivity**:\n", 255 | "\n", 256 | "This measure of sensitivity is called \"global\" because it is independent of the actual dataset being queried (it holds for *any* choice of neighboring $x$ and $x'$). \n", 257 | "\n", 258 | "Another measure of sensitivity, called *local sensitivity*, fixes one of the datasets to be the one being queried." 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "id": "e9ecc7dc-36d6-4c3c-bdbd-f8a162366609", 264 | "metadata": {}, 265 | "source": [ 266 | "### Distance\n", 267 | "\n", 268 | "The distance metric $d(x,x')$ described earlier can be defined in many different ways. \n", 269 | "\n", 270 | "Intuitively, the distance between two datasets should be equal to 1 (i.e. the datasets are neighbors) if they differ in the data of exactly **one individual**. \n", 271 | "\n", 272 | "This idea is easy to formalize in some contexts (e.g. in the US Census, each individual submits a single response containing their data) but extremely challenging in others (e.g. location trajectories, social networks, and time-series data).\n", 273 | "\n", 274 | "A common formal definition for datasets containing rows is to consider the number of rows which differ between the two. \n", 275 | "\n", 276 | "When each individual's data is contained in a single row, this definition often makes sense. \n", 277 | "\n", 278 | "Formally, this definition of distance is encoded as a **symmetric difference** between the two datasets:\n", 279 | "\n", 280 | "\\begin{align}\n", 281 | "d(x, x') = | x - x' \\cup x' - x |\n", 282 | "\\end{align}\n", 283 | "\n", 284 | "This particular definition has several interesting and important implications:\n", 285 | "- If $x'$ is constructed from $x$ by *adding one row*, then $d(x,x') = 1$\n", 286 | "- If $x'$ is constructed from $x$ by *removing one row*, then $d(x,x') = 1$\n", 287 | "- If $x'$ is constructed from $x$ by *modifying one row*, then $d(x,x') = 2$\n", 288 | "\n", 289 | "In other words, adding or removing a row results in a neighboring dataset; *modifying* a row results in a dataset at distance *2*. \n", 290 | "\n", 291 | "This particular definition of distance results in what is typically called *unbounded differential privacy*. 
Many other definitions are possible, including one called **bounded differential privacy** in which modifying a single row in a dataset *does* result in a neighboring dataset. \n" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "id": "d620b327-8fd8-4c71-9110-6595c0e68667", 297 | "metadata": {}, 298 | "source": [ 299 | "#### Calculating Sensitivity\n", 300 | "\n", 301 | "How do we determine the sensitivity of a particular function of interest? For some simple functions on real numbers, the answer is obvious.\n", 302 | "\n", 303 | "- The global sensitivity of $f(x) = x$ is 1, since changing $x$ by 1 changes $f(x)$ by 1\n", 304 | "- The global sensitivity of $f(x) = x+x$ is 2, since changing $x$ by 1 changes $f(x)$ by 2\n", 305 | "- The global sensitivity of $f(x) = 5*x$ is 5, since changing $x$ by 1 changes $f(x)$ by 5\n", 306 | "- The global sensitivity of $f(x) = x*x$ is unbounded, since the change in $f(x)$ depends on the value of $x$\n", 307 | "\n", 308 | "For functions that map datasets to real numbers, we can perform a similar analysis. We will consider the functions which represent common aggregate database queries: counts, sums, and averages." 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "id": "43f401d8-c5e6-409c-9973-6604bac4ef38", 314 | "metadata": {}, 315 | "source": [ 316 | "#### Counting Queries\n", 317 | "\n", 318 | "Counting queries (`COUNT` in SQL) count the number of rows in the dataset which satisfy a specific property. \n", 319 | "\n", 320 | "As a rule of thumb, **counting queries always have a sensitivity of 1**. \n", 321 | "\n", 322 | "This is because adding a row to the dataset can increase the output of the query by at most 1: either the new row has the desired property, and the count increases by 1, or it does not, and the count stays the same (the count may correspondingly decrease when a row is removed)." 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "id": "25dc86d5-b81c-4173-bc05-2625cfc083d3", 328 | "metadata": {}, 329 | "source": [ 330 | "**Example: \"How many people are in the dataset?\"** (sensitivity: 1 - counting rows where the property = True)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "id": "54f12bca-afa9-482d-bf36-9b8b3577a67d", 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "adult.shape[0]" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "id": "f4ddef18-456a-4f87-999b-b7bc4b3129d9", 346 | "metadata": {}, 347 | "source": [ 348 | "**Example: \"How many people have an educational status above 10?\"** (sensitivity: 1 - counting rows with a property)" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "id": "4399cb53-cfd6-4931-bf0c-66a344d67a6f", 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "adult[adult['Education-Num'] > 10].shape[0]" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "id": "635a58cb-4853-4178-8717-61a28688dce3", 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "adult[adult['Name'] == 'Joe Near'].shape[0]" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "id": "9faaef93-ce65-4446-969d-2eb16920789f", 374 | "metadata": {}, 375 | "source": [ 376 | "#### Summation Queries\n", 377 | "\n", 378 | "Summation queries (`SUM` in SQL) sum up the *attribute values* of dataset rows." 
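,
        "\n",
        "\n",
        "(Before moving on to summation, here is an end-to-end sketch of a *differentially private* counting query from the subsection above - assuming the `adult` dataframe and the `numpy` import from earlier in this notebook, with a purely illustrative $\epsilon$:)\n",
        "\n",
        "```python\n",
        "# Counting queries have sensitivity 1, so Laplace noise with scale 1/epsilon suffices.\n",
        "epsilon = 0.1\n",
        "true_count = adult[adult['Education-Num'] > 10].shape[0]\n",
        "dp_count = true_count + np.random.laplace(loc=0, scale=1/epsilon)\n",
        "print(true_count, round(dp_count))\n",
        "```"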
379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "id": "ea93f080-7116-4265-856d-4bd3c30e4838", 384 | "metadata": {}, 385 | "source": [ 386 | "**Example: \"What is the sum of the ages of people with an educational status above 10?\"**" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "id": "6d62a9e8-2024-4844-91f3-404890a7f124", 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "adult[adult['Education-Num'] > 10]['Age'].sum()" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "id": "e37f827f-233a-4038-aae6-d4f1857e8744", 402 | "metadata": {}, 403 | "source": [ 404 | "Sensitivity for these queries is not **as simple as it is for counting queries**. \n", 405 | "\n", 406 | "Adding a new row to the dataset will increase the result of our example query by the *age of the new person*. \n", 407 | "\n", 408 | "That means the sensitivity of the query depends on the **contents** of the row we add.\n", 409 | "\n", 410 | "As a rule of thumb, summation queries have **unbounded sensitivity** when no lower and upper bounds exist on the value of the attribute being summed. \n", 411 | "\n", 412 | "When lower and upper bounds do exist, the sensitivity of a summation query is equal to the **difference between them**. \n", 413 | "\n", 414 | "In the next section, we will see a technique called **clipping** for enforcing bounds when none exist, so that summation queries with unbounded sensitivity can be converted into queries with bounded sensitivity." 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "id": "a8851ab7-acb0-4155-b6a0-480a8bb5312e", 420 | "metadata": {}, 421 | "source": [ 422 | "#### Average Queries\n", 423 | "\n", 424 | "Average queries (`AVG` in SQL) calculate the mean of attribute values in a particular column." 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "id": "bb73ae9a-26cd-4e9e-b303-64f18e404e37", 430 | "metadata": {}, 431 | "source": [ 432 | "**Example: \"What is the average age of people with an educational status above 10?\"**" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": null, 438 | "id": "2fbd47bf-8509-462e-9138-7b3dc57d95c0", 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "adult[adult['Education-Num'] > 10]['Age'].mean()" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "id": "89b79a76-2e13-420a-a3dc-7387f44268a2", 448 | "metadata": {}, 449 | "source": [ 450 | "The easiest way to answer an average query with differential privacy is by re-phrasing it as two queries: a summation query divided by a counting query. For the above example:" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "id": "7cda7c33-ce63-472a-89d3-2f60fec93839", 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [ 460 | "adult[adult['Education-Num'] > 10]['Age'].sum() / adult[adult['Education-Num'] > 10]['Age'].shape[0]" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "id": "9263f787-cb9a-4d82-baa8-57d333abe6a0", 466 | "metadata": {}, 467 | "source": [ 468 | "The sensitivities of both queries can be calculated as described above. \n", 469 | "\n", 470 | "Noisy answers for each can be calculated (e.g. using the Laplace mechanism) and the noisy answers can be divided to obtain a differentially private mean. \n", 471 | "\n", 472 | "The total privacy cost of both queries can be calculated by **sequential composition**." 
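,
        "\n",
        "\n",
        "A sketch of that recipe (illustrative budgets; using a summation sensitivity of 125 assumes ages are clipped to the range $[0, 125]$ - clipping is the topic of the next section):\n",
        "\n",
        "```python\n",
        "# DP mean = noisy sum / noisy count.\n",
        "epsilon_sum, epsilon_count = 0.5, 0.5  # total cost: 0.5 + 0.5 = 1 by sequential composition\n",
        "ages = adult[adult['Education-Num'] > 10]['Age'].clip(lower=0, upper=125)\n",
        "\n",
        "noisy_sum = ages.sum() + np.random.laplace(loc=0, scale=125/epsilon_sum)        # sum sensitivity: 125\n",
        "noisy_count = ages.shape[0] + np.random.laplace(loc=0, scale=1/epsilon_count)  # count sensitivity: 1\n",
        "print(noisy_sum / noisy_count)\n",
        "```"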
473 | ] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "id": "497056eb-09a5-4336-bf25-13387de886dd", 478 | "metadata": {}, 479 | "source": [ 480 | "### Clipping\n", 481 | "\n", 482 | "Queries with unbounded sensitivity cannot be directly answered with differential privacy using the Laplace mechanism. \n", 483 | "\n", 484 | "Fortunately, we can often transform such queries into equivalent queries with *bounded* sensitivity, via a process called **clipping**.\n", 485 | "\n", 486 | "The basic idea behind clipping is to **enforce** upper and lower bounds on attribute values. \n", 487 | "\n", 488 | "> For example, ages above 125 can be \"clipped\" to exactly 125. \n", 489 | "\n", 490 | "After clipping has been performed, we are **guaranteed** that all ages will be 125 or below. \n", 491 | "\n", 492 | "As a result, the sensitivity of a summation query on clipped data is equal to the difference between the upper and lower bounds used in clipping: $upper - lower$. \n", 493 | "\n", 494 | "For example, the following query has a sensitivity of 125:" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": null, 500 | "id": "721081ec-c922-4551-83b5-6317ec12533b", 501 | "metadata": {}, 502 | "outputs": [], 503 | "source": [ 504 | "adult['Age'].clip(lower=0, upper=125).sum()" 505 | ] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "id": "dbdb4402-0310-40f9-91db-ca6af05e3f86", 510 | "metadata": {}, 511 | "source": [ 512 | "The primary challenge in performing clipping is to determine the **upper** and **lower** bounds. \n", 513 | "\n", 514 | "Furthermore, there is a tradeoff between the amount of information lost in clipping and the amount of noise needed to ensure differential privacy. \n", 515 | "\n", 516 | "As a rule of thumb, **try to set the clipping bounds to include 100% of the dataset**, or get as close as possible. This is harder in some domains (e.g. graph queries) than others.\n", 517 | "\n", 518 | "It's tempting to determine the clipping bounds by looking at the data. For example, we can look at the histogram of ages in our dataset to determine an appropriate upper bound:" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "id": "72b94469-12ec-4ab9-a021-ea8fa52a2100", 525 | "metadata": { 526 | "tags": [ 527 | "hide-input" 528 | ] 529 | }, 530 | "outputs": [], 531 | "source": [ 532 | "plt.hist(adult['Age'])\n", 533 | "plt.xlabel('Age')\n", 534 | "plt.ylabel('Number of Records');" 535 | ] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "id": "4289b910-111b-468e-b726-ddf7c0722ecb", 540 | "metadata": {}, 541 | "source": [ 542 | "It's clear from this histogram that nobody in this particular dataset is over 90, so an upper bound of 90 would suffice.\n", 543 | "\n", 544 | "**NOTE**: However, it's important to note that **this approach does not satisfy differential privacy**. \n", 545 | "\n", 546 | "If we pick our clipping bounds by looking at the data, then the bounds themselves might reveal something about the data.\n", 547 | "\n", 548 | "Typically, clipping bounds are decided either by using a property of the dataset that can be known without looking at the data (e.g. 
that the dataset contains ages, which are likely to lie between 0 and 125), or by performing **differentially private queries** to evaluate different choices for the clipping bounds.\n", 549 | "\n", 550 | "**Determining the upper bound with differentially private queries**:\n", 551 | "\n", 552 | "To use the second approach, we typically set the lower bound to 0 and slowly increase the upper bound until the query's output stops changing (meaning we haven't included any new data by increasing the bound). \n", 553 | "\n", 554 | "For example, let's try computing the sum of ages for clipping bounds from 0 to 100, using the Laplace mechanism for each one to ensure differential privacy:" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": null, 560 | "id": "e2fc38a2-4b32-4f7a-afdd-8b01a4ad52b2", 561 | "metadata": { 562 | "tags": [ 563 | "hide-input" 564 | ] 565 | }, 566 | "outputs": [], 567 | "source": [ 568 | "def laplace_mech(v, sensitivity, epsilon):  # add Laplace noise calibrated to sensitivity/epsilon\n", 569 | "    return v + np.random.laplace(loc=0, scale=sensitivity/epsilon)\n", 570 | "\n", 571 | "epsilon_i = .01  # per-query budget: 100 queries -> total epsilon = 1\n", 572 | "plt.plot([laplace_mech(adult['Age'].clip(lower=0, upper=i).sum(), i, epsilon_i) for i in range(100)])\n", 573 | "plt.xlabel('Clipping Bound for Age')\n", 574 | "plt.ylabel('Total Sum');" 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | "id": "eb09538c-800b-4294-b59b-24dacd48aab4", 580 | "metadata": {}, 581 | "source": [ 582 | "The total privacy cost for building this plot is $\epsilon = 1$ by sequential composition, since we do 100 queries each with $\epsilon_i = 0.01$. It's clear that the results level off around a value of `upper = 80`, so this is a good choice for the clipping bound.\n", 583 | "\n", 584 | "One refinement that **can work well when the scale of the data is not known** is to test upper bounds according to a logarithmic scale." 585 | ] 586 | } 587 | ], 588 | "metadata": { 589 | "kernelspec": { 590 | "display_name": "Python 3 (ipykernel)", 591 | "language": "python", 592 | "name": "python3" 593 | }, 594 | "language_info": { 595 | "codemirror_mode": { 596 | "name": "ipython", 597 | "version": 3 598 | }, 599 | "file_extension": ".py", 600 | "mimetype": "text/x-python", 601 | "name": "python", 602 | "nbconvert_exporter": "python", 603 | "pygments_lexer": "ipython3", 604 | "version": "3.12.3" 605 | } 606 | }, 607 | "nbformat": 4, 608 | "nbformat_minor": 5 609 | } 610 | --------------------------------------------------------------------------------