├── MANIFEST.in ├── test ├── embedder │ ├── __init__.py │ ├── test_bloom_filters.py │ ├── strategies.py │ ├── test_embedder.py │ └── test_features.py ├── test_encryption.py ├── app │ ├── conftest.py │ ├── test_utils.py │ └── test_file_selector.py ├── matching │ └── test_perform.py └── test_config.py ├── docs ├── _static │ ├── app-home-screenshot.png │ └── 02-client-screenshot.png ├── assets │ └── pprl_cloud_diagram.png └── tutorials │ ├── index.qmd │ ├── example-febrl.qmd │ ├── run-through.qmd │ ├── example-verknupfung.qmd │ └── in-the-cloud.qmd ├── src └── pprl │ ├── app │ ├── static │ │ └── ons_files │ │ │ └── favicon.ico │ ├── templates │ │ ├── download-results.html │ │ ├── check-results.html │ │ ├── home.html │ │ ├── choose-data.html │ │ ├── process-data.html │ │ └── base.html │ ├── utils.py │ └── __init__.py │ ├── matching │ ├── __init__.py │ ├── local.py │ ├── perform.py │ └── cloud.py │ ├── __init__.py │ ├── embedder │ ├── __init__.py │ ├── bloom_filters.py │ └── features.py │ ├── config.py │ └── encryption.py ├── .env.example ├── scripts ├── 07-tear-down-author.sh ├── 06-tear-down-operator.sh ├── 08-tear-down-party.sh ├── 05-run-workload.sh ├── 03-setup-workload-author.sh ├── 04-authorise-workload.sh ├── 02-setup-workload-operator.sh ├── 01-setup-party-resources.sh ├── server.py └── common.sh ├── Dockerfile ├── .gitignore ├── index.qmd ├── .github ├── ISSUE_TEMPLATE │ ├── feature-idea.md │ └── bug_report.md └── workflows │ ├── ci.yml │ └── docs.yml ├── LICENSE ├── pyproject.toml ├── .pre-commit-config.yaml ├── .secrets.baseline ├── _quarto.yml └── README.md /MANIFEST.in: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/embedder/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the embedder subpackage.""" 2 | -------------------------------------------------------------------------------- /docs/_static/app-home-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencecampus/pprl_toolkit/HEAD/docs/_static/app-home-screenshot.png -------------------------------------------------------------------------------- /docs/assets/pprl_cloud_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencecampus/pprl_toolkit/HEAD/docs/assets/pprl_cloud_diagram.png -------------------------------------------------------------------------------- /docs/_static/02-client-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencecampus/pprl_toolkit/HEAD/docs/_static/02-client-screenshot.png -------------------------------------------------------------------------------- /src/pprl/app/static/ons_files/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencecampus/pprl_toolkit/HEAD/src/pprl/app/static/ons_files/favicon.ico -------------------------------------------------------------------------------- /src/pprl/matching/__init__.py: -------------------------------------------------------------------------------- 1 | """Functions for performing the matching locally or in the cloud.""" 2 | 3 | from .perform import perform_matching 4 | 5 | __all__ = ["perform_matching"] 6 | 
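Taken together, the `__init__` modules in this listing expose a small public surface: `Embedder` and `EmbeddedDataFrame` for building Bloom filter embeddings, and `perform_matching` for linking two embedded datasets. The sketch below is illustrative only; it mirrors the loading and matching calls that appear in `src/pprl/matching/local.py` and `scripts/server.py` later in this listing, and the file names and locations are assumptions rather than fixed parts of the package.

```python
import pandas as pd

from pprl.embedder.embedder import Embedder
from pprl.matching import perform_matching

# Load a previously fitted embedder, as load_embedder() does in matching/local.py.
embedder = Embedder.from_pickle(path="data/interim/embedder.pkl")  # assumed location

# Each party's pre-processed records, as written out by the client app (assumed names).
data_1 = pd.read_json("data/interim/pprl-party-1-data.json")
data_2 = pd.read_json("data/interim/pprl-party-2-data.json")

# Match the two embedded datasets and write one output file per party,
# mirroring the local branch of scripts/server.py.
output_1, output_2 = perform_matching(data_1, data_2, embedder)
output_1.to_json("data/interim/pprl-party-1-output.json")
output_2.to_json("data/interim/pprl-party-2-output.json")
```

In the cloud workflow, the same `perform_matching` call runs inside the confidential VM rather than locally.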
-------------------------------------------------------------------------------- /src/pprl/__init__.py: -------------------------------------------------------------------------------- 1 | """Privacy-preserving record linkage via Bloom filter embeddings.""" 2 | 3 | from .embedder import EmbeddedDataFrame, Embedder 4 | 5 | __all__ = ["EmbeddedDataFrame", "Embedder"] 6 | -------------------------------------------------------------------------------- /src/pprl/embedder/__init__.py: -------------------------------------------------------------------------------- 1 | """Tools for generating our Bloom filter embeddings and matchings.""" 2 | 3 | from .embedder import EmbeddedDataFrame, Embedder 4 | 5 | __all__ = ["EmbeddedDataFrame", "Embedder"] 6 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | PARTY_1_PROJECT=pprl-party-1 2 | PARTY_1_KEY_VERSION=1 3 | 4 | PARTY_2_PROJECT=pprl-party-2 5 | PARTY_2_KEY_VERSION=1 6 | 7 | WORKLOAD_AUTHOR_PROJECT=pprl-party-1 8 | WORKLOAD_AUTHOR_PROJECT_REGION=europe-west2 9 | 10 | WORKLOAD_OPERATOR_PROJECT=pprl-party-2 11 | WORKLOAD_OPERATOR_PROJECT_ZONE=europe-west2-c 12 | -------------------------------------------------------------------------------- /scripts/07-tear-down-author.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Tears down all billable resources for the workload author. 4 | 5 | echo "Loading functions and environment variables..." 6 | source common.sh 7 | 8 | set_gcp_project $WORKLOAD_AUTHOR_PROJECT 9 | 10 | delete_artifact_repository $ARTIFACT_REPOSITORY $WORKLOAD_AUTHOR_PROJECT_REGION 11 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM --platform=linux/amd64 python:3.11-slim-bookworm 2 | 3 | ENV PYTHONUNBUFFERED=True 4 | ENV PRODUCTION=1 5 | 6 | COPY pyproject.toml . 7 | ADD src/pprl src/pprl 8 | RUN python -m pip install --upgrade pip 9 | RUN python -m pip install --no-cache-dir . 10 | 11 | COPY .env . 12 | COPY scripts/server.py . 13 | 14 | CMD [ "python", "server.py" ] 15 | -------------------------------------------------------------------------------- /src/pprl/app/templates/download-results.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block head %} 4 | {% endblock %} 5 | 6 | {% block body %} 7 |

Download your results

8 |
9 | Press the button to download your results. 10 |

11 |
12 | 13 |
14 | {% endblock %} 15 | -------------------------------------------------------------------------------- /docs/tutorials/index.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Tutorials 3 | listing: 4 | type: table 5 | contents: 6 | - "*.qmd" 7 | fields: [title, description, reading-time] 8 | sort-ui: false 9 | filter-ui: false 10 | --- 11 | 12 | These tutorials walk you through some of the essential workflows for pprl. 13 | The purpose of these documents is for you to learn how to use the pprl 14 | package for your own linkage projects. 15 | 16 |
17 | -------------------------------------------------------------------------------- /src/pprl/app/templates/check-results.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block head %} 4 | {% endblock %} 5 | 6 | {% block body %} 7 |

Check for your results

8 |
9 | 10 | {% if message %} 11 |

{{message}}


12 | {% endif %} 13 | 14 | Press the button to check for your results. 15 |

16 | 17 |
18 | 19 |
20 | {% endblock %}
21 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | !.gitkeep
3 | 
4 | # data
5 | data/*
6 | secrets/*
7 | *.json
8 | 
9 | # logs
10 | log/*
11 | 
12 | 
13 | 
14 | # environment
15 | .env
16 | 
17 | # documentation
18 | /.quarto/
19 | /_site/
20 | 
21 | # tests
22 | .tox/
23 | .coverage
24 | .mypy_cache/
25 | .pytest_cache/
26 | .hypothesis/
27 | .ruff_cache/
28 | 
29 | 
30 | # system
31 | .DS_Store
32 | .vscode/
33 | 
34 | # cache
35 | */__pycache__/
36 | */**/__pycache__/
37 | 
38 | # build
39 | build/
40 | dist/
41 | *.egg-info/
42 | *.egg
43 | 
--------------------------------------------------------------------------------
/scripts/06-tear-down-operator.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Tears down all billable resources for the workload operator.
4 | 
5 | echo "Loading functions and environment variables..."
6 | source common.sh
7 | 
8 | set_gcp_project $WORKLOAD_OPERATOR_PROJECT
9 | 
10 | echo "Deleting workload virtual machine..."
11 | gcloud compute instances delete \
12 |     projects/$WORKLOAD_OPERATOR_PROJECT/zones/$WORKLOAD_OPERATOR_PROJECT_ZONE/instances/pprl-cvm
13 | 
14 | delete_storage_bucket $ATTESTATION_BUCKET
15 | 
16 | delete_service_account $WORKLOAD_SERVICE_ACCOUNT_EMAIL
17 | 
--------------------------------------------------------------------------------
/scripts/08-tear-down-party.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Tears down all billable resources for the data-owning party.
4 | 
5 | echo "Loading functions and environment variables..."
6 | source common.sh
7 | 
8 | export PROJECT_NAME=${1}
9 | export PROJECT_KEY_VERSION=${2}
10 | if [ ! $PROJECT_KEY_VERSION ]; then
11 |     export PROJECT_KEY_VERSION=1
12 | fi
13 | 
14 | 
15 | set_gcp_project $PROJECT_NAME
16 | 
17 | delete_storage_bucket $PROJECT_NAME-bucket
18 | 
19 | destroy_kms_key_version \
20 |     $PROJECT_NAME-akek $PROJECT_NAME-akek-kr $PROJECT_LOCATION $PROJECT_KEY_VERSION
21 | 
22 | delete_workload_identity_pool $PROJECT_NAME-wip $PROJECT_LOCATION
23 | 
24 | delete_service_account $PROJECT_NAME-sa@$PROJECT_NAME.iam.gserviceaccount.com
25 | 
--------------------------------------------------------------------------------
/scripts/05-run-workload.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Sets the workload running on GCP.
4 | 
5 | echo "Loading functions and environment variables..."
6 | source common.sh
7 | 
8 | set_gcp_project $WORKLOAD_OPERATOR_PROJECT
9 | 
10 | echo "Setting up confidential VM..."
11 | gcloud compute instances create pprl-cvm \
12 |     --confidential-compute \
13 |     --shielded-secure-boot \
14 |     --maintenance-policy=TERMINATE \
15 |     --scopes=cloud-platform \
16 |     --zone=$WORKLOAD_OPERATOR_PROJECT_ZONE \
17 |     --image-project=confidential-space-images \
18 |     --image-family=confidential-space \
19 |     --service-account=$WORKLOAD_SERVICE_ACCOUNT_EMAIL \
20 |     --metadata "^~^tee-image-reference=$WORKLOAD_IMAGE_REFERENCE:$WORKLOAD_IMAGE_TAG~tee-restart-policy=Never"
21 | 
--------------------------------------------------------------------------------
/index.qmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: Welcome to the **pprl** documentation!
3 | toc: false 4 | sidebar: false 5 | about: 6 | template: marquee 7 | links: 8 | - icon: github 9 | href: https://github.com/datasciencecampus/pprl 10 | text: GitHub 11 | --- 12 | 13 | ## What is this and why does it exist? 14 | 15 | This package, **pprl**, implements a method for performing 16 | Privacy Preserving Record Linkage. This linkage can be done 17 | locally or through Google Cloud Platform. 18 | 19 | ## Where do I go now? 20 | 21 | If you're looking to get stuck in with pprl, head over to our 22 | [tutorials](docs/tutorials/index.qmd). 23 | 24 | For more focused, technical details of how this all works, see our 25 | [API reference](docs/reference/index.qmd). 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-idea.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature idea 3 | about: Suggest an idea for this project 4 | title: "[FEATURE] : " 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | Please be aware that, as pprl is an experimental package, ONS cannot promise to implement feature ideas. 11 | 12 | ### Does your feature idea solve a problem? 13 | If this applies to your idea, please provide a clear and concise description of what the problem is. 14 | 15 | ### Describe the solution you'd like 16 | A clear and concise description of what you want to happen. 17 | 18 | ### Describe alternatives you've considered 19 | A clear and concise description of any alternative solutions or features you've considered. 20 | 21 | ### Additional context 22 | Add any other context or screenshots about the feature request here. 23 | -------------------------------------------------------------------------------- /src/pprl/app/templates/home.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block body %} 4 |

Welcome to the PPRL application

5 | 6 |

7 | This application is for data-owning parties to process and upload their data 8 | to a Google Cloud Platform (GCP) bucket. Once both parties have uploaded 9 | their data, the operator can run the workload to link your datasets in a 10 | secure environment. 11 | 12 | Keep this app open and you will be able to download your results at the end. 13 |

14 |
15 |

16 | To begin, please select which party you are. 17 |

18 |
19 |
20 | 25 |
26 | 27 |
28 | {% endblock %} 29 | -------------------------------------------------------------------------------- /src/pprl/app/templates/choose-data.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block body %} 4 |

Choose a dataset

5 | 6 | Use one of the FEBRL datasets from the RecordLinkage package or upload 7 | your own dataset. If you choose the latter, your file must be a CSV. 8 |
9 | {{message}} 10 |
11 |
12 |

13 | 14 | 15 |

16 |

17 | 18 | 19 |

20 |

21 | 22 | 23 |

24 |

25 |
26 | 27 |

28 |
29 | {% endblock %} 30 | -------------------------------------------------------------------------------- /test/test_encryption.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the `encryption` module.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from pprl import encryption 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "input_df", 11 | [ 12 | ( 13 | pd.DataFrame( 14 | dict( 15 | ints=[1, 4, 5, 1273873], 16 | bools=[True, False, False, True], 17 | strings=["a", "bchc", "12djd", "]p8s|"], 18 | ) 19 | ) 20 | ), 21 | (pd.DataFrame(dict())), 22 | (pd.DataFrame(dict(mixed=[1, True, "my_string", 1.43434]))), 23 | ], 24 | ) 25 | def test_encrypt_decrypt_data(input_df): 26 | """Make sure the dataframe is unchanged by the encryption process. 27 | 28 | We ignore the index here. 29 | """ 30 | 31 | payload, data_enc_key = encryption.encrypt_data(input_df) 32 | decrypted_dataframe = encryption.decrypt_data(payload, data_enc_key) 33 | assert input_df.equals(decrypted_dataframe.reset_index(drop=True)) 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG] : " 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | Please be aware that, as pprl is an experimental package, ONS cannot promise to resolve bugs. 11 | 12 | ### Describe the bug 13 | A clear and concise description of what the bug is. 14 | 15 | ### How to reproduce the bug 16 | Steps to reproduce the behaviour: 17 | 1. Go to '...' 18 | 2. Click on '....' 19 | 3. Scroll down to '....' 20 | 4. See error 21 | 22 | ### Expected behaviour 23 | A clear and concise description of what you expected to happen. 24 | 25 | ### Evidence (tracebacks and screenshots) 26 | If applicable, please add any tracebacks or screenshots to help explain your problem. 27 | 28 | ### System information 29 | Please provide the following information about your environment: 30 | 31 | - OS: [e.g. macOS] 32 | - Browser (when using the client-side app or GCP): [e.g. Chrome, Safari] 33 | - pprl version: [e.g. 0.0.1] 34 | 35 | ### Additional context 36 | Add any other context about the problem here. 
37 | -------------------------------------------------------------------------------- /test/app/conftest.py: -------------------------------------------------------------------------------- 1 | """Test configuration.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from pprl.app import app 7 | 8 | 9 | @pytest.fixture() 10 | def client(): 11 | """Create a test client.""" 12 | return app.test_client() 13 | 14 | 15 | @pytest.fixture() 16 | def csv_client(): 17 | """Create a test client with a CSV attached.""" 18 | app.config["unprocessed_dataframe"] = pd.DataFrame(dict(column1=[], column2=[])) 19 | app.config["filename"] = "my_file.csv" 20 | return app.test_client() 21 | 22 | 23 | @pytest.fixture() 24 | def no_party_client(): 25 | """Create a test client with a CSV but no party number.""" 26 | app.config["unprocessed_dataframe"] = pd.DataFrame(dict(column1=[], column2=[])) 27 | app.config["party_number"] = None 28 | return app.test_client() 29 | 30 | 31 | @pytest.fixture() 32 | def party1_client(): 33 | """Create a test client with a CSV and party number 1.""" 34 | app.config["unprocessed_dataframe"] = pd.DataFrame(dict(column1=[], column2=[])) 35 | app.config["party_number"] = 1 36 | return app.test_client() 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Crown copyright Office for National Statistics 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/pprl/matching/local.py: -------------------------------------------------------------------------------- 1 | """Functions for performing matching locally.""" 2 | 3 | import os 4 | 5 | from pprl import config 6 | from pprl.embedder.embedder import Embedder 7 | 8 | 9 | def build_local_file_paths(party: str) -> tuple[str, str]: 10 | """ 11 | Construct the paths for the input and output datasets for a party. 12 | 13 | Parameters 14 | ---------- 15 | party : str 16 | Name of the party. 17 | 18 | Returns 19 | ------- 20 | inpath : str 21 | Location of the party data. 22 | outpath : str 23 | Location to put the party results. 
24 | """ 25 | 26 | stem = config.DIR_DATA_INTERIM 27 | inpath = os.path.join(stem, f"{party}-data.json") 28 | outpath = os.path.join(stem, f"{party}-output.json") 29 | 30 | return inpath, outpath 31 | 32 | 33 | def load_embedder() -> Embedder: 34 | """ 35 | Load an embedder from a pickle in the local data directory. 36 | 37 | Returns 38 | ------- 39 | embedder : Embedder 40 | Reformed embedder instance. 41 | """ 42 | 43 | path = os.path.join(config.DIR_DATA_INTERIM, "embedder.pkl") 44 | embedder = Embedder.from_pickle(path=path) 45 | 46 | return embedder 47 | -------------------------------------------------------------------------------- /scripts/03-setup-workload-author.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Creates artifact repository and workload image, then uploads the image. 4 | 5 | echo "Loading functions and environment variables..." 6 | source common.sh 7 | 8 | set_gcp_project $WORKLOAD_AUTHOR_PROJECT 9 | 10 | echo "Enabling APIs for workload author on $WORKLOAD_AUTHOR_PROJECT..." 11 | gcloud services enable artifactregistry.googleapis.com 12 | 13 | create_artifact_repository $ARTIFACT_REPOSITORY $WORKLOAD_AUTHOR_REGION 14 | 15 | gcloud auth configure-docker $WORKLOAD_AUTHOR_PROJECT_REGION-docker.pkg.dev 16 | 17 | echo "Building the workload Docker image..." 18 | cd .. 19 | docker build . -t $WORKLOAD_IMAGE_REFERENCE 20 | cd scripts 21 | 22 | echo "Pushing the workload Docker image to artifact registry $ARTIFACT_REPOSITORY..." 23 | docker push $WORKLOAD_IMAGE_REFERENCE:$WORKLOAD_IMAGE_TAG 24 | 25 | echo "Granting roles/artifactregistry.reader role to workload service account $WORKLOAD_SERVICE_ACCOUNT..." 26 | gcloud artifacts repositories add-iam-policy-binding $ARTIFACT_REPOSITORY \ 27 | --project=$WORKLOAD_AUTHOR_PROJECT \ 28 | --role=roles/artifactregistry.reader \ 29 | --location=$WORKLOAD_AUTHOR_PROJECT_REGION \ 30 | --member="serviceAccount:$WORKLOAD_SERVICE_ACCOUNT_EMAIL" 31 | -------------------------------------------------------------------------------- /test/embedder/test_bloom_filters.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the bloom_filters module.""" 2 | 3 | from hypothesis import given 4 | from hypothesis import strategies as st 5 | 6 | from pprl.embedder.bloom_filters import BloomFilterEncoder 7 | 8 | 9 | @given( 10 | st.lists(st.integers() | st.floats() | st.text(min_size=1), min_size=1, max_size=40), 11 | st.integers(min_value=2, max_value=100), 12 | st.integers(min_value=1, max_value=5), 13 | st.integers(min_value=0, max_value=50), 14 | st.text(), 15 | ) 16 | def test_bloom_filter_vector_collision_fraction(feature, size, num_hashes, offset, salt): 17 | """Test BloomFilterEncoder.bloom_filter_vector_collision_fraction. 18 | 19 | Tests the following properties for vec_idx_deduped: list[0 < int < size]. 20 | Tests the following properties for collision_fraction: >= 0, <= 1. 
21 | """ 22 | bfencoder = BloomFilterEncoder(size=size, num_hashes=num_hashes, offset=offset, salt=salt) 23 | vec_idx_deduped, collision_fraction = bfencoder.bloom_filter_vector_collision_fraction(feature) 24 | 25 | assert all(isinstance(element, int) for element in vec_idx_deduped) 26 | assert all(element <= (size + offset - 1) for element in vec_idx_deduped) 27 | assert all(element >= offset for element in vec_idx_deduped) 28 | 29 | assert collision_fraction <= 1 30 | assert collision_fraction >= 0 31 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - main 8 | - "dev*" 9 | 10 | jobs: 11 | build: 12 | 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | matrix: 16 | os: [ubuntu-latest, windows-latest] 17 | python-version: ["3.10", "3.11"] 18 | 19 | steps: 20 | - name: Checkout repository 21 | uses: actions/checkout@v3 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v4 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | cache: "pip" 27 | - name: Update pip and install test dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install ".[test]" 31 | - name: Run tests 32 | run: | 33 | python -m pytest test 34 | - name: Run doctests 35 | if: | 36 | matrix.python-version == '3.11' && 37 | matrix.os == 'ubuntu-latest' 38 | run: | 39 | python -m doctest README.md 40 | - name: Install and run linters 41 | if: | 42 | matrix.python-version == '3.11' && 43 | matrix.os == 'ubuntu-latest' 44 | run: | 45 | python -m pip install ".[lint]" 46 | python -m ruff check src test 47 | python -m ruff format --check src test 48 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Publish documentation 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: main 7 | 8 | jobs: 9 | build-deploy: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | contents: write 13 | pages: write 14 | steps: 15 | - name: Check out repository 16 | uses: actions/checkout@v3 17 | - name: Set up Quarto 18 | uses: quarto-dev/quarto-actions/setup@v2 19 | - name: Install Python and dependencies 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: "3.11" 23 | cache: "pip" 24 | - name: Build API reference 25 | run: | 26 | python -m pip install ".[docs]" 27 | python -m quartodoc build 28 | - name: Render and publish 29 | uses: quarto-dev/quarto-actions/publish@v2 30 | with: 31 | target: gh-pages 32 | env: 33 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 34 | PARTY_1_PROJECT: pprl-party-1 35 | PARTY_1_KEY_VERSION: 1 36 | PARTY_2_PROJECT: pprl-party-2 37 | PARTY_2_KEY_VERSION: 1 38 | WORKLOAD_AUTHOR_PROJECT: pprl-party-1 39 | WORKLOAD_AUTHOR_PROJECT_REGION: europe-west2 40 | WORKLOAD_OPERATOR_PROJECT: pprl-party-2 41 | WORKLOAD_OPERATOR_PROJECT_ZONE: europe-west2-c 42 | -------------------------------------------------------------------------------- /test/matching/test_perform.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the server utilities module.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from pprl.matching import perform 7 | 8 | 9 | # Adding row index to check for bug to do with use of .loc instead of .iloc 10 | 
@pytest.mark.parametrize( 11 | "df1,df2,match,colname,expected", 12 | [ 13 | ( 14 | pd.DataFrame(dict(x=list("abcd")), index=["ann", "oying", "ind", "ex"]), 15 | pd.DataFrame(dict(y=list("abcd")), index=["an", "oth", "ero", "ne"]), 16 | ([0, 2], [0, 2]), 17 | "private_index", 18 | ["aa", "cc"], 19 | ), 20 | ], 21 | ) 22 | def test_add_private_index(df1, df2, match, colname, expected): 23 | """Test adding a private index works with move to `.iloc`.""" 24 | out1, out2 = perform.add_private_index(df1=df1, df2=df2, match=match, colname=colname) 25 | result = out1.merge(out2, on=colname).loc[:, ["x", "y"]].agg("".join, axis=1).to_list() 26 | 27 | assert result == expected 28 | 29 | 30 | @pytest.mark.parametrize( 31 | "df1,df2,match", 32 | [ 33 | ( 34 | pd.DataFrame(dict(x=list("abcd")), index=["ann", "oying", "ind", "ex"]), 35 | pd.DataFrame(dict(y=list("abcd")), index=["an", "oth", "ero", "ne"]), 36 | ([0, 2], [0, 2]), 37 | ), 38 | ], 39 | ) 40 | def test_add_private_index_complete(df1, df2, match): 41 | """Check that the private indexes are all integers (no missing).""" 42 | out1, out2 = perform.add_private_index(df1=df1, df2=df2, match=match) 43 | 44 | assert all(isinstance(i, int) for i in out1.private_index) 45 | assert all(isinstance(i, int) for i in out2.private_index) 46 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=62", "wheel", "cython"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "pprl" 7 | version = "0.1.0" 8 | authors = [ 9 | {name = "Mat Weldon"}, 10 | {name = "Samuel Stock"}, 11 | {name = "Kevin Fasusi"}, 12 | {name = "Henry Wilde"}, 13 | {name = "Data Science Campus", email = "datacampus@ons.gov.uk"}, 14 | ] 15 | description = "Privacy-preserving record linkage via Bloom filter embedding" 16 | readme = "README.md" 17 | requires-python = ">=3.10" 18 | license = {text = "MIT License"} 19 | dependencies = [ 20 | "dill", 21 | "flask", 22 | "numpy", 23 | "pandas==2.0.2", 24 | "python-dotenv", 25 | "requests==2.30.0", 26 | "metaphone", 27 | "cryptography", 28 | "google-cloud-storage", 29 | "google-cloud-logging", 30 | "google-cloud-kms", 31 | "scipy", 32 | "recordlinkage", 33 | ] 34 | 35 | [project.urls] 36 | homepage = "https://github.com/datasciencecampus/pprl_toolkit" 37 | 38 | [project.optional-dependencies] 39 | lint = ["ruff==0.3.0", "mypy"] 40 | test = [ 41 | "hypothesis", 42 | "pytest", 43 | "pytest-randomly", 44 | "pytest-sugar", 45 | "pytest-cov" 46 | ] 47 | docs = [ 48 | "ipykernel", 49 | "nbclient>=0.9.0", 50 | "nbformat>=5.9.2", 51 | "quartodoc>=0.6.6", 52 | ] 53 | dev = [ 54 | "pre-commit==3.1.0", 55 | "pprl[lint,docs,test]" 56 | ] 57 | 58 | [tool.ruff] 59 | line-length = 99 60 | exclude = ["notebooks/*"] 61 | extend-include = ["*.ipynb"] 62 | 63 | [tool.ruff.lint] 64 | extend-select = ["D", "I", "W"] 65 | ignore = ["D105", "D107", "D202", "D413"] 66 | 67 | [tool.ruff.lint.isort] 68 | known-first-party = ["pprl"] 69 | 70 | [tool.ruff.lint.pydocstyle] 71 | convention = "numpy" 72 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | # Run 'pre-commit autoupdate' to update hook versions 4 | repos: 5 | - repo: 
https://github.com/kynan/nbstripout 6 | rev: 0.7.1 7 | hooks: 8 | - id: nbstripout 9 | name: nbstripout - Strip outputs from notebooks (auto-fixes) 10 | args: 11 | - --extra-keys 12 | - "metadata.colab metadata.kernelspec cell.metadata.colab cell.metadata.executionInfo cell.metadata.id cell.metadata.outputId" 13 | - repo: https://github.com/pre-commit/pre-commit-hooks 14 | rev: v4.5.0 15 | hooks: 16 | - id: check-added-large-files 17 | name: Check for files larger than 5 MB 18 | args: [ "--maxkb=5120" ] 19 | - id: end-of-file-fixer 20 | name: Check for a blank line at the end of scripts (auto-fixes) 21 | exclude: '\.Rd' 22 | - id: trailing-whitespace 23 | name: Check for trailing whitespaces (auto-fixes) 24 | - repo: https://github.com/astral-sh/ruff-pre-commit 25 | # Ruff version. 26 | rev: v0.3.0 27 | hooks: 28 | # Run the linter. 29 | - id: ruff 30 | args: [ --fix ] 31 | # Run the formatter. 32 | - id: ruff-format 33 | - repo: https://github.com/Yelp/detect-secrets 34 | rev: v1.4.0 35 | hooks: 36 | - id: detect-secrets 37 | name: detect-secrets - Detect secrets in staged code 38 | args: [ "--baseline", ".secrets.baseline", '--exclude-files', '.*\.(ipynb|qmd)$', ] 39 | exclude: .*/tests/.*|^\.cruft\.json$ 40 | - id: detect-secrets 41 | name: 'detect-secrets-jupyter' 42 | args: ['--exclude-files', '.*[^i][^p][^y][^n][^b]$', '--exclude-lines', '"(hash|id|image/\w+)":.*', ] 43 | -------------------------------------------------------------------------------- /src/pprl/config.py: -------------------------------------------------------------------------------- 1 | """Functions for handling PPRL configuration.""" 2 | 3 | import inspect 4 | import os 5 | from pathlib import Path 6 | 7 | import dotenv 8 | 9 | import pprl 10 | 11 | 12 | def _find_directory(kind: str, what: str | None = None) -> Path: 13 | """ 14 | Find a directory in the root of the pprl installation. 15 | 16 | Parameters 17 | ---------- 18 | kind : str 19 | The category of directory to find. Typically `data` or `log`. 20 | what : str, optional 21 | The name of the directory in `kind` to find. If not specified, 22 | then `kind` is treated as the name of the directory. 23 | 24 | Returns 25 | ------- 26 | where : pathlib.Path 27 | Path object to the directory. 28 | """ 29 | 30 | where = Path(inspect.getfile(pprl)).parent.parent.parent / kind 31 | 32 | if what is not None: 33 | where /= what 34 | 35 | return where 36 | 37 | 38 | def load_environment(path: None | str = None) -> dict[str, None | str]: 39 | """ 40 | Load the configuration file as a dictionary. 41 | 42 | Parameters 43 | ---------- 44 | path : str, optional 45 | Location of the configuration file to load. If not specified, 46 | try to load the configuration file from the root of the pprl 47 | installation called `.env`. 48 | 49 | Returns 50 | ------- 51 | config : collections.OrderedDict 52 | Mapping of the key-value pairs in the configuration file. 
53 | """ 54 | 55 | if path is None: 56 | path = os.path.join(PPRL_ROOT, ".env") 57 | 58 | return dotenv.dotenv_values(path) 59 | 60 | 61 | PPRL_ROOT = _find_directory("") 62 | DIR_DATA_RAW = _find_directory("data", "raw") 63 | DIR_DATA_INTERIM = _find_directory("data", "interim") 64 | DIR_DATA_PROCESSED = _find_directory("data", "processed") 65 | DIR_LOGS = _find_directory("log") 66 | -------------------------------------------------------------------------------- /test/test_config.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the `config` module.""" 2 | 3 | import os 4 | import string 5 | from pathlib import Path 6 | from unittest import mock 7 | 8 | from hypothesis import given 9 | from hypothesis import strategies as st 10 | 11 | import pprl 12 | from pprl import config 13 | 14 | st_text = st.text(alphabet=string.ascii_lowercase, min_size=1) 15 | 16 | 17 | @given(st_text, st.one_of((st.just(None), st_text))) 18 | def test_find_directory(kind, what): 19 | """Test that a directory can be found correctly.""" 20 | 21 | root = Path("/path/to/a/test/module") 22 | with mock.patch("pprl.config.inspect.getfile") as get: 23 | get.return_value = root / "where" / "stuff" / "lives" 24 | directory = config._find_directory(kind, what) 25 | 26 | assert isinstance(directory, Path) 27 | 28 | if what is None: 29 | assert directory.stem == kind 30 | assert directory.parent == root 31 | else: 32 | assert directory.stem == what 33 | assert directory.parent == root / kind 34 | 35 | get.assert_called_once_with(pprl) 36 | 37 | 38 | @given(st_text) 39 | def test_load_environment_with_filename(filename): 40 | """Test the config loader works with a file name.""" 41 | 42 | with ( 43 | mock.patch("pprl.config.dotenv.dotenv_values") as values, 44 | mock.patch("pprl.config.os.path.join") as join, 45 | ): 46 | values.return_value = "foo" 47 | result = config.load_environment(filename) 48 | 49 | assert result == "foo" 50 | 51 | values.assert_called_once_with(filename) 52 | join.assert_not_called() 53 | 54 | 55 | def test_load_environment_default(): 56 | """Test the config loader works without a file name.""" 57 | 58 | with mock.patch("pprl.config.dotenv.dotenv_values") as values: 59 | values.return_value = "foo" 60 | result = config.load_environment() 61 | 62 | assert result == "foo" 63 | 64 | values.assert_called_once_with(os.path.join(config.PPRL_ROOT, ".env")) 65 | -------------------------------------------------------------------------------- /src/pprl/app/templates/process-data.html: -------------------------------------------------------------------------------- 1 | 8 | 9 | {% extends "base.html" %} 10 | 11 | {% block body %} 12 | 13 |
14 |

Select column types

15 | 16 | Please choose how each column should be treated from the dropdown menus 17 | below. See our feature 19 | documentation for details. 20 |

21 |
22 | {% for column in columns%} 23 |

24 | 25 |
32 |

33 | {% endfor %} 34 | 35 |
36 |

Choose a salt (optional)

37 |

38 | A cryptographic salt is a string that can be appended to every token in your data 39 | before it is hashed into the Bloom filter embedding and uploaded to GCP. 40 | The salt must be the same for both parties, otherwise their embeddings will not match.

42 |
43 |

44 | 45 | 46 |

47 | 48 |
49 |

Where next?

50 |

51 | You can either continue with the linkage by uploading the processed data to 52 | Google Cloud Platform, or download your processed data locally. The local 53 | download contains the Bloom filter embedding for your chosen dataset. 54 |

55 |
56 | 57 |   58 | 59 |
60 | {% endblock %} 61 | 62 | 63 | form { display: table; } 64 | p { display: table-row; } 65 | label { display: table-cell; } 66 | input { display: table-cell; } 67 | -------------------------------------------------------------------------------- /.secrets.baseline: -------------------------------------------------------------------------------- 1 | { 2 | "version": "1.0.3", 3 | "plugins_used": [ 4 | { 5 | "name": "ArtifactoryDetector" 6 | }, 7 | { 8 | "name": "AWSKeyDetector" 9 | }, 10 | { 11 | "name": "AzureStorageKeyDetector" 12 | }, 13 | { 14 | "name": "Base64HighEntropyString", 15 | "limit": 4.5 16 | }, 17 | { 18 | "name": "BasicAuthDetector" 19 | }, 20 | { 21 | "name": "CloudantDetector" 22 | }, 23 | { 24 | "name": "HexHighEntropyString", 25 | "limit": 3.0 26 | }, 27 | { 28 | "name": "IbmCloudIamDetector" 29 | }, 30 | { 31 | "name": "IbmCosHmacDetector" 32 | }, 33 | { 34 | "name": "JwtTokenDetector" 35 | }, 36 | { 37 | "name": "KeywordDetector", 38 | "keyword_exclude": "" 39 | }, 40 | { 41 | "name": "MailchimpDetector" 42 | }, 43 | { 44 | "name": "NpmDetector" 45 | }, 46 | { 47 | "name": "PrivateKeyDetector" 48 | }, 49 | { 50 | "name": "SlackDetector" 51 | }, 52 | { 53 | "name": "SoftlayerDetector" 54 | }, 55 | { 56 | "name": "SquareOAuthDetector" 57 | }, 58 | { 59 | "name": "StripeDetector" 60 | }, 61 | { 62 | "name": "TwilioKeyDetector" 63 | } 64 | ], 65 | "filters_used": [ 66 | { 67 | "path": "detect_secrets.filters.allowlist.is_line_allowlisted" 68 | }, 69 | { 70 | "path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies", 71 | "min_level": 2 72 | }, 73 | { 74 | "path": "detect_secrets.filters.heuristic.is_indirect_reference" 75 | }, 76 | { 77 | "path": "detect_secrets.filters.heuristic.is_likely_id_string" 78 | }, 79 | { 80 | "path": "detect_secrets.filters.heuristic.is_potential_uuid" 81 | }, 82 | { 83 | "path": "detect_secrets.filters.heuristic.is_prefixed_with_dollar_sign" 84 | }, 85 | { 86 | "path": "detect_secrets.filters.heuristic.is_sequential_string" 87 | }, 88 | { 89 | "path": "detect_secrets.filters.heuristic.is_templated_secret" 90 | } 91 | ], 92 | "results": {}, 93 | "generated_at": "2021-06-14T10:43:14Z" 94 | } 95 | -------------------------------------------------------------------------------- /scripts/04-authorise-workload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Authorises the workload to use the identity pool. 4 | 5 | echo "Loading functions and environment variables..." 6 | source common.sh 7 | 8 | export PROJECT_NAME=${1} 9 | export PROJECT_WORKLOAD_IDENTITY_POOL=$PROJECT_NAME-wip 10 | export PROJECT_WIP_PROVIDER=$PROJECT_WORKLOAD_IDENTITY_POOL-provider 11 | export PROJECT_NUMBER=$(gcloud projects describe $PROJECT_NAME --format="value(projectNumber)") 12 | export PROJECT_SERVICE_ACCOUNT_EMAIL=$PROJECT_NAME-sa@$PROJECT_NAME.iam.gserviceaccount.com 13 | 14 | export OPERATION=${2} 15 | if [ ! $OPERATION ]; then 16 | export OPERATION=create 17 | fi 18 | 19 | set_gcp_project $PROJECT_NAME 20 | 21 | echo "Creating provider for $PROJECT_WORKLOAD_IDENTITY_POOL authorising $WORKLOAD_IMAGE_REFERENCE..." 
22 | gcloud iam workload-identity-pools providers ${OPERATION}-oidc $PROJECT_WIP_PROVIDER \ 23 | --location=$PROJECT_LOCATION \ 24 | --workload-identity-pool="$PROJECT_WORKLOAD_IDENTITY_POOL" \ 25 | --issuer-uri="https://confidentialcomputing.googleapis.com/" \ 26 | --allowed-audiences="https://sts.googleapis.com" \ 27 | --attribute-mapping="google.subject='assertion.sub'" \ 28 | --attribute-condition="assertion.swname == 'CONFIDENTIAL_SPACE' && 29 | 'STABLE' in assertion.submods.confidential_space.support_attributes && 30 | assertion.submods.container.image_reference == '$WORKLOAD_IMAGE_REFERENCE:$WORKLOAD_IMAGE_TAG' && 31 | '$WORKLOAD_SERVICE_ACCOUNT_EMAIL' in assertion.google_service_accounts" 32 | 33 | echo "Creating attestation credentials file for $WORKLOAD_SERVICE_ACCOUNT..." 34 | gcloud iam workload-identity-pools create-cred-config \ 35 | projects/$PROJECT_NUMBER/locations/$PROJECT_LOCATION/workloadIdentityPools/$PROJECT_WORKLOAD_IDENTITY_POOL/providers/$PROJECT_WIP_PROVIDER \ 36 | --service-account=$PROJECT_SERVICE_ACCOUNT_EMAIL \ 37 | --credential-source-file="/run/container_launcher/attestation_verifier_claims_token" \ 38 | --output-file=../secrets/$PROJECT_NAME-attestation-credentials.json 39 | 40 | echo "Copying attestation credentials for $PROJECT_NAME to $ATTESTATION_BUCKET..." 41 | if ! gsutil cp ../secrets/$PROJECT_NAME-attestation-credentials.json gs://$ATTESTATION_BUCKET/; then 42 | err "Failed to upload the attestation credentials for $PROJECT_NAME to $ATTESTATION_BUCKET." 43 | fi 44 | -------------------------------------------------------------------------------- /_quarto.yml: -------------------------------------------------------------------------------- 1 | project: 2 | type: website 3 | 4 | website: 5 | title: "**pprl**" 6 | navbar: 7 | left: 8 | - href: index.qmd 9 | text: About 10 | - href: docs/tutorials/index.qmd 11 | text: Tutorials 12 | - href: docs/reference/index.qmd 13 | text: API Reference 14 | right: 15 | - icon: github 16 | menu: 17 | - text: Source code 18 | url: https://github.com/datasciencecampus/pprl_toolkit 19 | - text: Open an issue 20 | url: https://github.com/datasciencecampus/pprl_toolkit/issues 21 | sidebar: 22 | style: docked 23 | search: true 24 | contents: 25 | - text: About 26 | href: index.qmd 27 | - auto: "*.qmd" 28 | reader-mode: true 29 | page-footer: 30 | left: > 31 | All content is available under the 32 | [Open Government Licence V3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/), 33 | except where otherwise stated. 34 | center: > 35 | Built using [Quarto](https://quarto.org/). 36 | 37 | format: 38 | html: 39 | mainfont: Arial 40 | theme: 41 | light: flatly 42 | dark: darkly 43 | lang: en-GB 44 | 45 | metadata-files: 46 | - docs/_sidebar.yml 47 | 48 | quartodoc: 49 | title: API reference 50 | package: pprl 51 | dir: docs/reference 52 | sidebar: docs/_sidebar.yml 53 | sections: 54 | - title: Embeddings 55 | desc: > 56 | Tools for generating a Bloom filter embedding and its underlying 57 | features. 58 | package: pprl.embedder 59 | contents: 60 | - bloom_filters 61 | - embedder 62 | - features 63 | - title: Encryption 64 | desc: Functions for handling the data and key encryption processes. 65 | contents: 66 | - encryption 67 | - title: Configuration 68 | desc: Functions for working out and handling linkage configuration. 69 | contents: 70 | - config 71 | - title: Client-side app 72 | desc: > 73 | Functions for the Flask application where users upload, process, and 74 | download their data. 
75 | package: pprl.app 76 | contents: 77 | - utils 78 | - title: Server functions 79 | desc: > 80 | Functions for the matching workload server. Used in `scripts/server.py` 81 | package: pprl.matching 82 | contents: 83 | - cloud 84 | - local 85 | - perform 86 | -------------------------------------------------------------------------------- /scripts/02-setup-workload-operator.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Creates a service account for the workload operator. 4 | 5 | echo "Loading functions and environment variables..." 6 | source common.sh 7 | 8 | set_gcp_project $WORKLOAD_OPERATOR_PROJECT 9 | 10 | echo "Enabling APIs for workload operator on $WORKLOAD_OPERATOR_PROJECT..." 11 | gcloud services enable \ 12 | compute.googleapis.com \ 13 | confidentialcomputing.googleapis.com \ 14 | logging.googleapis.com 15 | 16 | echo "Creating attestion bucket for $WORKLOAD_OPERATOR_PROJECT..." 17 | create_storage_bucket $ATTESTATION_BUCKET 18 | 19 | echo "Granting parties the rights to access $ATTESTATION_BUCKET..." 20 | grant_attestation_bucket_rights $PARTY_1_PROJECT_EMAIL $ATTESTATION_BUCKET 21 | grant_attestation_bucket_rights $PARTY_2_PROJECT_EMAIL $ATTESTATION_BUCKET 22 | 23 | echo "Creating workload service account $WORKLOAD_SERVICE_ACCOUNT under $WORKLOAD_OPERATOR_PROJECT..." 24 | create_service_account $WORKLOAD_SERVICE_ACCOUNT 25 | 26 | echo "Granting roles/storage.admin role for $ATTESTATION_BUCKET to service account $WORKLOAD_SERVICE_ACCOUNT..." 27 | if ! gcloud storage buckets add-iam-policy-binding gs://$ATTESTATION_BUCKET \ 28 | --member=serviceAccount:$WORKLOAD_SERVICE_ACCOUNT_EMAIL \ 29 | --role=roles/storage.admin; then 30 | err "Failed to grant roles/storage.admin role for $ATTESTATION_BUCKET to service account $WORKLOAD_SERVICE_ACCOUNT." 31 | fi 32 | 33 | echo "Granting roles/iam.serviceAccountUser role to workload operator..." 34 | if ! gcloud iam service-accounts add-iam-policy-binding $WORKLOAD_SERVICE_ACCOUNT_EMAIL \ 35 | --member="user:$(gcloud config get-value account)" \ 36 | --role="roles/iam.serviceAccountUser"; then 37 | err "Failed to grant role to workload operator $WORKLOAD_OPERATOR_USER under $WORKLOAD_OPERATOR_PROJECT." 38 | fi 39 | 40 | echo "Granting roles/confidentialcomputing.workloadUser to service account $WORKLOAD_SERVICE_ACCOUNT..." 41 | if ! gcloud projects add-iam-policy-binding $WORKLOAD_OPERATOR_PROJECT \ 42 | --member="serviceAccount:$WORKLOAD_SERVICE_ACCOUNT_EMAIL" \ 43 | --role="roles/confidentialcomputing.workloadUser"; then 44 | err "Failed to grant roles/confidentialcomputing.workloadUser to service-account $WORKLOAD_SERVICE_ACCOUNT." 45 | fi 46 | 47 | echo "Granting roles/logging.logWriter to service account $WORKLOAD_SERVICE_ACCOUNT..." 48 | if ! gcloud projects add-iam-policy-binding $WORKLOAD_OPERATOR_PROJECT \ 49 | --member="serviceAccount:$WORKLOAD_SERVICE_ACCOUNT_EMAIL" \ 50 | --role="roles/logging.logWriter"; then 51 | err "Failed to grant roles/logging.logWriter to service account $WORKLOAD_SERVICE_ACCOUNT." 52 | fi 53 | -------------------------------------------------------------------------------- /scripts/01-setup-party-resources.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Sets up the cloud resources for a data-owning party. 4 | 5 | echo "Loading functions and environment variables..." 
6 | source common.sh 7 | 8 | export PROJECT_NAME=$1 9 | export PROJECT_BUCKET=$PROJECT_NAME-bucket 10 | export PROJECT_KEYRING=$PROJECT_NAME-akek-kr 11 | export PROJECT_KEY=$PROJECT_NAME-akek 12 | export PROJECT_SERVICE_ACCOUNT=$PROJECT_NAME-sa 13 | export PROJECT_SERVICE_ACCOUNT_DOMAIN=$PROJECT_NAME.iam.gserviceaccount.com 14 | export PROJECT_SERVICE_ACCOUNT_EMAIL=$PROJECT_SERVICE_ACCOUNT@$PROJECT_SERVICE_ACCOUNT_DOMAIN 15 | export PROJECT_WORKLOAD_IDENTITY_POOL=$PROJECT_NAME-wip 16 | export PROJECT_NUMBER=$(gcloud projects describe $PROJECT_NAME --format="value(projectNumber)") 17 | 18 | set_gcp_project $PROJECT_NAME 19 | 20 | echo "Enabling APIs for data owners on $PROJECT_NAME..." 21 | gcloud services enable cloudkms.googleapis.com iamcredentials.googleapis.com 22 | 23 | echo "Creating bucket for $PROJECT_NAME..." 24 | create_storage_bucket $PROJECT_BUCKET 25 | 26 | echo "Creating keyring for $PROJECT_NAME..." 27 | create_kms_keyring $PROJECT_KEYRING $PROJECT_LOCATION 28 | 29 | echo "Creating key encryption key on $PROJECT_KEYRING..." 30 | create_kms_encryption_key $PROJECT_KEY $PROJECT_KEYRING global 31 | 32 | echo "Creating service account for $PROJECT_NAME..." 33 | create_service_account $PROJECT_SERVICE_ACCOUNT 34 | 35 | echo "Granting roles/storage.admin role to $PROJECT_SERVICE_ACCOUNT on $PROJECT_BUCKET..." 36 | gcloud storage buckets add-iam-policy-binding gs://$PROJECT_BUCKET \ 37 | --member=serviceAccount:$PROJECT_SERVICE_ACCOUNT_EMAIL \ 38 | --role=roles/storage.admin 39 | 40 | echo "Granting KMS roles to the service account $PROJECT_SERVICE_ACCOUNT..." 41 | gcloud kms keys add-iam-policy-binding \ 42 | $PROJECT_KEY \ 43 | --keyring=$PROJECT_KEYRING \ 44 | --location=$PROJECT_LOCATION \ 45 | --member=serviceAccount:$PROJECT_SERVICE_ACCOUNT_EMAIL \ 46 | --role=roles/cloudkms.publicKeyViewer 47 | gcloud kms keys add-iam-policy-binding \ 48 | $PROJECT_KEY \ 49 | --keyring=$PROJECT_KEYRING \ 50 | --location=$PROJECT_LOCATION \ 51 | --member=serviceAccount:$PROJECT_SERVICE_ACCOUNT_EMAIL \ 52 | --role=roles/cloudkms.cryptoKeyDecrypter 53 | 54 | echo "Creating workload identity pool for $PROJECT_NAME..." 55 | create_workload_identity_pool $PROJECT_WORKLOAD_IDENTITY_POOL $PROJECT_LOCATION 56 | 57 | echo "Attaching service account $PROJECT_SERVICE_ACCOUNT to workload identity pool $PROJECT_WORKLOAD_IDENTITY_POOL..." 58 | gcloud iam service-accounts add-iam-policy-binding $PROJECT_SERVICE_ACCOUNT_EMAIL \ 59 | --member="principalSet://iam.googleapis.com/projects/$PROJECT_NUMBER/locations/$PROJECT_LOCATION/workloadIdentityPools/$PROJECT_WORKLOAD_IDENTITY_POOL/*" \ 60 | --role=roles/iam.workloadIdentityUser 61 | -------------------------------------------------------------------------------- /scripts/server.py: -------------------------------------------------------------------------------- 1 | """Script for running linkage on a server or locally.""" 2 | 3 | import logging 4 | import os 5 | 6 | import google.cloud.logging 7 | import pandas as pd 8 | 9 | from pprl import config 10 | from pprl.matching import cloud, local, perform_matching 11 | 12 | 13 | def load_environment_variables(path: None | str = None) -> tuple[str, str, str, str, str]: 14 | """ 15 | Load the environment and pull out the core pieces. 16 | 17 | Parameters 18 | ---------- 19 | path : str, optional 20 | Path to environment file. If running locally, no need to provide 21 | anything. 22 | 23 | Returns 24 | ------- 25 | operator : str 26 | Name of the workload operator. 27 | party_1 : str 28 | Name of the first party. 
29 | party_2 : str 30 | Name of the second party. 31 | location : str 32 | Location of the workload identity pools and keyrings. 33 | version_1 : str 34 | Version of the key encryption key for the first party. 35 | version_2 : str 36 | Version of the key encryption key for the second party. 37 | """ 38 | 39 | environ = config.load_environment(path) 40 | 41 | operator = environ.get("WORKLOAD_OPERATOR_PROJECT") 42 | party_1 = environ.get("PARTY_1_PROJECT") 43 | party_2 = environ.get("PARTY_2_PROJECT") 44 | location = environ.get("PROJECT_LOCATION", "global") 45 | version_1 = environ.get("PARTY_1_KEY_VERSION", 1) 46 | version_2 = environ.get("PARTY_2_KEY_VERSION", 1) 47 | 48 | return operator, party_1, party_2, location, version_1, version_2 49 | 50 | 51 | def main(): 52 | """Perform the matching process and save the results.""" 53 | 54 | if int(os.getenv("PRODUCTION", 0)) == 1: 55 | logger = google.cloud.logging.Client() 56 | logger.setup_logging() 57 | logging.info("Logging set up.") 58 | 59 | operator, party_1, party_2, location, version_1, version_2 = load_environment_variables( 60 | ".env" 61 | ) 62 | parties = (party_1, party_2) 63 | 64 | logging.info("Downloading embedder...") 65 | embedder = cloud.download_embedder(parties, operator) 66 | 67 | logging.info("Preparing assets...") 68 | data_1, dek_1 = prepare_party_assets(party_1, operator, location, version_1) 69 | data_2, dek_2 = prepare_party_assets(party_2, operator, location, version_2) 70 | 71 | logging.info("Performing matching...") 72 | outputs = perform_matching(data_1, data_2, embedder) 73 | 74 | logging.info("Uploading results...") 75 | for party, output, dek in zip(parties, outputs, (dek_1, dek_2)): 76 | logging.info(f"Uploading results for {party}...") 77 | cloud.upload_party_results(output, dek, party, operator) 78 | 79 | else: 80 | logging.basicConfig(encoding="utf-8", level=logging.INFO) 81 | 82 | logging.info("Setting up environment and file paths...") 83 | operator, party_1, party_2, *_ = load_environment_variables() 84 | inpath_1, outpath_1 = build_local_file_paths(party_1) 85 | inpath_2, outpath_2 = build_local_file_paths(party_2) 86 | 87 | logging.info("Loading files...") 88 | embedder = load_embedder() 89 | data_1 = pd.read_json(inpath_1) 90 | data_2 = pd.read_json(inpath_2) 91 | 92 | logging.info("Performing matching...") 93 | output_1, output_2 = perform_matching(data_1, data_2, embedder) 94 | 95 | logging.info("Saving results...") 96 | output_1.to_json(outpath_1) 97 | output_2.to_json(outpath_2) 98 | 99 | logging.info("Done!") 100 | 101 | 102 | if __name__ == "__main__": 103 | main() 104 | -------------------------------------------------------------------------------- /src/pprl/embedder/bloom_filters.py: -------------------------------------------------------------------------------- 1 | """Module for the Bloom filter encoder.""" 2 | 3 | import hashlib 4 | 5 | 6 | class BloomFilterEncoder: 7 | """Encoder of tokens and features via hashing and a Bloom filter. 8 | 9 | The process for creating a cryptographically secure Bloom filter 10 | encoding of a set of tokens is as follows: 11 | 12 | 1. Compute the hash digest for your tokens 13 | 2. Convert the digest bytes into integers 14 | 3. Map the integer to a bloom filter vector (modulo the length of the vector) 15 | 16 | Parameters 17 | ---------- 18 | size: int 19 | Size of the Bloom filter. Defaults to 1024 20 | num_hashes: int 21 | Number of hashes to perform. Defaults to two. 22 | offset: int 23 | Offset for Bloom filter indices to allow for masking. 
Defaults 24 | to zero. 25 | salt: str, optional 26 | Cryptographic salt appended to tokens prior to hashing. 27 | 28 | Attributes 29 | ---------- 30 | hash_function: func 31 | Hashing function (`hashlib.sha256`). 32 | """ 33 | 34 | def __init__( 35 | self, size: int = 1024, num_hashes: int = 2, offset: int = 0, salt: str | None = None 36 | ) -> None: 37 | self.size = size 38 | self.num_hashes = num_hashes 39 | self.offset = offset 40 | self.salt = salt or "" 41 | 42 | self.hash_function = hashlib.sha256 43 | 44 | def bloom_filter_vector_collision_fraction( 45 | self, feature: list[str] 46 | ) -> tuple[list[int], float]: 47 | """Convert a feature vector and return its collision fraction. 48 | 49 | The index vector uses an optional offset for masking. 50 | 51 | Parameters 52 | ---------- 53 | feature: list 54 | List of features to be processed. 55 | 56 | Returns 57 | ------- 58 | vector_idxs: list 59 | Index values used to create the Bloom filter vector. 60 | collision_fraction: float 61 | Proportion of repeated indices. 62 | 63 | Examples 64 | -------- 65 | >>> bfe = BloomFilterEncoder() 66 | >>> bfe.bloom_filter_vector_collision_fraction(["a","b","c"]) 67 | ([334, 1013, 192, 381, 18, 720], 0.0) 68 | """ 69 | vec_idx: list = [] 70 | 71 | for gram in feature: 72 | for i in range(self.num_hashes): 73 | utf_string_with_salt = (str(gram) + str(i) + str(self.salt)).encode("UTF-8") 74 | digest = self.hash_function(utf_string_with_salt).digest() 75 | digest_as_int = (int.from_bytes(digest, "little") % self.size) + self.offset 76 | vec_idx.append(digest_as_int) 77 | 78 | vec_idx_deduped = [*set(vec_idx)] 79 | collision_fraction = 1 - len(vec_idx_deduped) / len(vec_idx) 80 | 81 | return vec_idx_deduped, collision_fraction 82 | 83 | def bloom_filter_vector(self, feature: list[str]) -> list[int]: 84 | """Convert a feature vector into indices for a Bloom vector. 85 | 86 | The index vector uses an optional offset for masking. 87 | 88 | Parameters 89 | ---------- 90 | feature: list 91 | List of features to be converted. 92 | 93 | Returns 94 | ------- 95 | vector_idxs: list 96 | Index values used to create the Bloom filter vector. 
97 | 98 | Examples 99 | -------- 100 | >>> bfe = BloomFilterEncoder() 101 | >>> bfe.bloom_filter_vector(["a","b","c"]) 102 | [334, 1013, 192, 381, 18, 720] 103 | """ 104 | vec_idx_deduped, _ = self.bloom_filter_vector_collision_fraction(feature) 105 | 106 | return vec_idx_deduped 107 | -------------------------------------------------------------------------------- /test/app/test_utils.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the Flask app utility functions.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from pprl.app import utils 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "input_filename, expected_output", 11 | [ 12 | ("file1.csv", True), 13 | ("D:path/folder1/folder2/myfile.csv", True), 14 | ("file.CsV", True), 15 | ("file.txt", False), 16 | ("D:path/folder1/file.TxT", False), 17 | ("file1csv", False), 18 | ("other.py", False), 19 | (".csv", False), 20 | ], 21 | ) 22 | def test_check_is_csv(input_filename, expected_output): 23 | """Check the CSV checker works as it should.""" 24 | assert utils.check_is_csv(input_filename) is expected_output 25 | 26 | 27 | @pytest.mark.parametrize( 28 | "form, expected_drop_columns, expected_other_columns, expected_colspec", 29 | [ 30 | ( 31 | { 32 | "salt": "my_salt", 33 | "upload": "Upload to GCP", 34 | "download": "Download file locally", 35 | "column4": "drop", 36 | "column5": "Name", 37 | "column6": "keep", 38 | }, 39 | ["column4"], 40 | ["column6"], 41 | {"column5": "name"}, 42 | ), 43 | ({}, [], [], {}), 44 | ], 45 | ) 46 | def test_assign_columns(form, expected_drop_columns, expected_other_columns, expected_colspec): 47 | """Test to make sure the correct columns are assigned correctly.""" 48 | 49 | feature_funcs = { 50 | "Name": "name", 51 | "Date": "dob", 52 | "Sex": "sex", 53 | "Miscellaneous": "misc_features", 54 | "Shingled": "misc_shingled_features", 55 | } 56 | 57 | drop_columns, other_columns, colspec = utils.assign_columns(form, feature_funcs) 58 | assert drop_columns == expected_drop_columns 59 | assert other_columns == expected_other_columns 60 | assert colspec == expected_colspec 61 | 62 | 63 | def test_convert_dataframe_to_bf(): 64 | """Test convert_dataframe_to_bf. 65 | 66 | Tests the following properties: Returns Pandas DataFrame, dataframe length, 67 | column names. 68 | """ 69 | 70 | dataframe_values = dict( 71 | id_column=["1", "2", 3], 72 | name_column=["name1", "name2", "name3"], 73 | dob_column=["01/08/1996", "Mar 2000", 2005], 74 | sex_column=["M", "F", "Other"], 75 | house_number=[6, 1, 7], 76 | postcode=["P12 7UP", "LW12, 6PL", "H12 9I6"], 77 | other_column=[1, 8, 9], 78 | ) 79 | input_dataframe = pd.DataFrame(dataframe_values) 80 | 81 | colspec = dict( 82 | name_column="name", 83 | dob_column="dob", 84 | sex_column="sex", 85 | house_number="misc_features", 86 | postcode="misc_shingled_features", 87 | ) 88 | 89 | other_columns = ["id_column"] 90 | 91 | output_dataframe = utils.convert_dataframe_to_bf( 92 | input_dataframe, colspec, other_columns, salt="my_salt" 93 | ) 94 | 95 | assert isinstance(output_dataframe, pd.DataFrame) 96 | assert len(output_dataframe) == 3 97 | assert set(output_dataframe.columns) == set( 98 | ["id_column", "bf_indices", "bf_norms", "thresholds"] 99 | ) 100 | 101 | 102 | def test_convert_dataframe_to_bf_other_columns_none(): 103 | """Test convert_dataframe_to_bf. 104 | 105 | Tests when the other_columns keyword arguement is set to None. 
106 | """ 107 | 108 | dataframe_values = dict( 109 | id_column=["1", "2", 3], 110 | name_column=["name1", "name2", "name3"], 111 | ) 112 | input_dataframe = pd.DataFrame(dataframe_values) 113 | 114 | colspec = dict( 115 | name_column="name", 116 | ) 117 | 118 | output_dataframe = utils.convert_dataframe_to_bf(input_dataframe, colspec, salt="my_salt") 119 | 120 | assert isinstance(output_dataframe, pd.DataFrame) 121 | assert len(output_dataframe) == 3 122 | assert set(output_dataframe.columns) == set(["bf_indices", "bf_norms", "thresholds"]) 123 | -------------------------------------------------------------------------------- /test/embedder/strategies.py: -------------------------------------------------------------------------------- 1 | """Hypothesis strategies for our embedder subpackage tests.""" 2 | 3 | import re 4 | import string 5 | from datetime import datetime 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from dateutil.relativedelta import relativedelta 10 | from hypothesis import strategies as st 11 | from scipy.linalg import qr 12 | 13 | ALPHABET = string.ascii_letters + string.punctuation 14 | 15 | NAMES = ( 16 | "Fred Hogan O'Malley", 17 | "Angelina Guidone", 18 | "Zbyněk Liška", 19 | "Jolana Pešková", 20 | "Diane Elizabeth Davey-Hurst", 21 | "Vanessa Comencini", 22 | "Benito Montalcini", 23 | "Bettina Nitto", 24 | "Sandro Rubbia", 25 | "Alexandr Čech", 26 | "Adéla Strnadová", 27 | "Manuel Boaga", 28 | "Jamie Philip Smith", 29 | "Jordan Francis", 30 | "Melina Cantimori", 31 | "Maria Giulia Cattaneo", 32 | "Karel Strnad", 33 | "Silvie Čechová", 34 | "Markéta Sedláková", 35 | "Lucy Barrett-O'Reilly", 36 | "Tereza Kat'ya Blažková", 37 | ) 38 | 39 | 40 | @st.composite 41 | def st_mutated_names(draw, names=NAMES, mutagens=",-_+ ."): 42 | """Generate a name and its mutated form.""" 43 | 44 | name = draw(st.sampled_from(names)) 45 | mutated = "".join(draw(st.text(alphabet=" ", max_size=2))) 46 | for char in name: 47 | if char == " ": 48 | mutated += draw(st.text(alphabet=mutagens, min_size=1, max_size=3)) 49 | else: 50 | mutated += char 51 | 52 | return name, mutated 53 | 54 | 55 | @st.composite 56 | def st_tokenized_names(draw, names=NAMES): 57 | """Generate a properly tokenized name.""" 58 | 59 | name = draw(st.sampled_from(names)) 60 | tokens = [f"_{word}_" for word in re.split(r"[\s-]", name)] 61 | 62 | return tokens 63 | 64 | 65 | @st.composite 66 | def st_names_series(draw, names=NAMES): 67 | """Generate a series of names.""" 68 | 69 | names = draw(st.lists(st.sampled_from(names), min_size=1, max_size=100)) 70 | 71 | return pd.Series(names) 72 | 73 | 74 | @st.composite 75 | def st_sexes_series(draw, options=("Male", "Female", "Non-binary", None)): 76 | """Generate a series of sexes.""" 77 | 78 | sexes = draw(st.lists(st.sampled_from(options), min_size=1, max_size=100)) 79 | 80 | return pd.Series(sexes) 81 | 82 | 83 | @st.composite 84 | def st_dobs_and_order_params(draw, years_range=100): 85 | """Generate a series of date strings and their order parameters.""" 86 | 87 | dayfirst = draw(st.booleans()) 88 | yearfirst = not dayfirst 89 | format_ = "%Y-%m-%d" if yearfirst else "%d/%m/%Y" 90 | 91 | max_value = datetime.today().date() 92 | min_value = max_value - relativedelta(years=years_range) 93 | st_dates = st.dates(min_value, max_value).map(lambda date: date.strftime(format_)) 94 | dobs = draw(st.lists(st.one_of((st.just(None), st_dates)), min_size=1, max_size=10)) 95 | 96 | return pd.Series(dobs), dayfirst, yearfirst, format_ 97 | 98 | 99 | @st.composite 100 | def 
st_default_dobs(draw): 101 | """Generate a default DOB list.""" 102 | 103 | date = draw(st.dates()) 104 | 105 | return date.strftime("day<%d>_month<%m>_year<%Y>").split("_") 106 | 107 | 108 | @st.composite 109 | def st_fields_series(draw): 110 | """Generate a series of miscellaneous fields.""" 111 | 112 | options = ( 113 | st.text(alphabet=ALPHABET), 114 | st.integers(0, 10), 115 | st.lists(st.integers(0, 10), min_size=1, max_size=2), 116 | st.just(None), 117 | ) 118 | fields = draw(st.lists(st.one_of(options), min_size=1, max_size=100)) 119 | 120 | return pd.Series(fields) 121 | 122 | 123 | @st.composite 124 | def st_strings_series(draw): 125 | """Generate a series of strings.""" 126 | 127 | options = ( 128 | st.just(None), 129 | st.text(alphabet=ALPHABET, min_size=10, max_size=20), 130 | ) 131 | strings = draw(st.lists(st.one_of(options), min_size=1, max_size=100)) 132 | 133 | return pd.Series(strings) 134 | 135 | 136 | @st.composite 137 | def st_posdef_matrices(draw, bf_size=10): 138 | """Generate a square positive definite matrix.""" 139 | 140 | rseed = draw(st.integers(2**10, 2**14)) 141 | rng = np.random.default_rng(rseed) 142 | H = rng.normal(scale=2, size=(bf_size, bf_size)) 143 | diag_values = rng.exponential(size=bf_size) 144 | Q, _ = qr(H) 145 | 146 | return Q.T @ np.diag(diag_values) @ Q 147 | 148 | 149 | @st.composite 150 | def st_bf_indices(draw, bf_size): 151 | """Generate a list of unique indices.""" 152 | bf_indices = draw(st.lists(st.integers(min_value=0, max_value=bf_size - 1), max_size=bf_size)) 153 | return list(set(bf_indices)) 154 | 155 | 156 | @st.composite 157 | def st_matrix_and_indices(draw): 158 | """Generate a pos-def matrix and indices in the same size.""" 159 | bf_size = draw(st.integers(2, 2**10)) 160 | mat = draw(st_posdef_matrices(bf_size)) 161 | bf_indices = draw(st_bf_indices(bf_size)) 162 | return mat, bf_indices 163 | -------------------------------------------------------------------------------- /test/app/test_file_selector.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the file selector part of the app.""" 2 | 3 | import io 4 | 5 | import pytest 6 | 7 | 8 | @pytest.mark.skip(reason="Test client not working in CI build") 9 | def test_file_selector(client): 10 | """Tests to make sure the upload file page is returned correctly.""" 11 | 12 | response = client.get("/") 13 | assert b"
Upload File" in response.data 14 | 15 | 16 | @pytest.mark.skip(reason="Test client not working in CI build") 17 | def test_upload_file_text(client): 18 | """Check the user is informed if they upload the wrong file type.""" 19 | 20 | response = client.post( 21 | "/upload", 22 | data={"file": (io.BytesIO(b"some_text"), "test.txt")}, 23 | content_type="multipart/form-data", 24 | ) 25 | assert b"Upload a csv file." in response.data 26 | 27 | 28 | @pytest.mark.skip(reason="Test client not working in CI build") 29 | def test_upload_file_csv(client): 30 | """Check the column selector comes up after uploading a CSV.""" 31 | 32 | response = client.post( 33 | "/upload", 34 | data={"file": (io.BytesIO(b"some_text"), "test.csv")}, 35 | content_type="multipart/form-data", 36 | ) 37 | assert b"Choose Salt" in response.data 38 | 39 | 40 | @pytest.mark.skip(reason="Test client not working in CI build") 41 | def test_upload_file_csv_columns(client): 42 | """Check the form format in the column selector page.""" 43 | 44 | response = client.post( 45 | "/upload", 46 | data={"file": (io.BytesIO(b"column1,column2,mycolumn3"), "test.csv")}, 47 | content_type="multipart/form-data", 48 | ) 49 | assert b'' in response.data 50 | assert b'' in response.data 51 | assert b'' in response.data 52 | assert b'