├── MANIFEST.in ├── test ├── embedder │ ├── __init__.py │ ├── test_bloom_filters.py │ ├── strategies.py │ ├── test_embedder.py │ └── test_features.py ├── test_encryption.py ├── app │ ├── conftest.py │ ├── test_utils.py │ └── test_file_selector.py ├── matching │ └── test_perform.py └── test_config.py ├── docs ├── _static │ ├── app-home-screenshot.png │ └── 02-client-screenshot.png ├── assets │ └── pprl_cloud_diagram.png └── tutorials │ ├── index.qmd │ ├── example-febrl.qmd │ ├── run-through.qmd │ ├── example-verknupfung.qmd │ └── in-the-cloud.qmd ├── src └── pprl │ ├── app │ ├── static │ │ └── ons_files │ │ │ └── favicon.ico │ ├── templates │ │ ├── download-results.html │ │ ├── check-results.html │ │ ├── home.html │ │ ├── choose-data.html │ │ ├── process-data.html │ │ └── base.html │ ├── utils.py │ └── __init__.py │ ├── matching │ ├── __init__.py │ ├── local.py │ ├── perform.py │ └── cloud.py │ ├── __init__.py │ ├── embedder │ ├── __init__.py │ ├── bloom_filters.py │ └── features.py │ ├── config.py │ └── encryption.py ├── .env.example ├── scripts ├── 07-tear-down-author.sh ├── 06-tear-down-operator.sh ├── 08-tear-down-party.sh ├── 05-run-workload.sh ├── 03-setup-workload-author.sh ├── 04-authorise-workload.sh ├── 02-setup-workload-operator.sh ├── 01-setup-party-resources.sh ├── server.py └── common.sh ├── Dockerfile ├── .gitignore ├── index.qmd ├── .github ├── ISSUE_TEMPLATE │ ├── feature-idea.md │ └── bug_report.md └── workflows │ ├── ci.yml │ └── docs.yml ├── LICENSE ├── pyproject.toml ├── .pre-commit-config.yaml ├── .secrets.baseline ├── _quarto.yml └── README.md /MANIFEST.in: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/embedder/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the embedder subpackage.""" 2 | -------------------------------------------------------------------------------- /docs/_static/app-home-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencecampus/pprl_toolkit/HEAD/docs/_static/app-home-screenshot.png -------------------------------------------------------------------------------- /docs/assets/pprl_cloud_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencecampus/pprl_toolkit/HEAD/docs/assets/pprl_cloud_diagram.png -------------------------------------------------------------------------------- /docs/_static/02-client-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencecampus/pprl_toolkit/HEAD/docs/_static/02-client-screenshot.png -------------------------------------------------------------------------------- /src/pprl/app/static/ons_files/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencecampus/pprl_toolkit/HEAD/src/pprl/app/static/ons_files/favicon.ico -------------------------------------------------------------------------------- /src/pprl/matching/__init__.py: -------------------------------------------------------------------------------- 1 | """Functions for performing the matching locally or in the cloud.""" 2 | 3 | from .perform import perform_matching 4 | 5 | __all__ = ["perform_matching"] 6 | 
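Taken together, the `__init__` modules in this listing expose a small public surface: `Embedder` and `EmbeddedDataFrame` for building Bloom filter embeddings, and `perform_matching` for linking two embedded datasets. The sketch below is illustrative only; it mirrors the loading and matching calls that appear in `src/pprl/matching/local.py` and `scripts/server.py` later in this listing, and the file names and locations are assumptions rather than fixed parts of the package.

```python
import pandas as pd

from pprl.embedder.embedder import Embedder
from pprl.matching import perform_matching

# Load a previously fitted embedder, as load_embedder() does in matching/local.py.
embedder = Embedder.from_pickle(path="data/interim/embedder.pkl")  # assumed location

# Each party's pre-processed records, as written out by the client app (assumed names).
data_1 = pd.read_json("data/interim/pprl-party-1-data.json")
data_2 = pd.read_json("data/interim/pprl-party-2-data.json")

# Match the two embedded datasets and write one output file per party,
# mirroring the local branch of scripts/server.py.
output_1, output_2 = perform_matching(data_1, data_2, embedder)
output_1.to_json("data/interim/pprl-party-1-output.json")
output_2.to_json("data/interim/pprl-party-2-output.json")
```

In the cloud workflow, the same `perform_matching` call runs inside the confidential VM rather than locally.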
-------------------------------------------------------------------------------- /src/pprl/__init__.py: -------------------------------------------------------------------------------- 1 | """Privacy-preserving record linkage via Bloom filter embeddings.""" 2 | 3 | from .embedder import EmbeddedDataFrame, Embedder 4 | 5 | __all__ = ["EmbeddedDataFrame", "Embedder"] 6 | -------------------------------------------------------------------------------- /src/pprl/embedder/__init__.py: -------------------------------------------------------------------------------- 1 | """Tools for generating our Bloom filter embeddings and matchings.""" 2 | 3 | from .embedder import EmbeddedDataFrame, Embedder 4 | 5 | __all__ = ["EmbeddedDataFrame", "Embedder"] 6 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | PARTY_1_PROJECT=pprl-party-1 2 | PARTY_1_KEY_VERSION=1 3 | 4 | PARTY_2_PROJECT=pprl-party-2 5 | PARTY_2_KEY_VERSION=1 6 | 7 | WORKLOAD_AUTHOR_PROJECT=pprl-party-1 8 | WORKLOAD_AUTHOR_PROJECT_REGION=europe-west2 9 | 10 | WORKLOAD_OPERATOR_PROJECT=pprl-party-2 11 | WORKLOAD_OPERATOR_PROJECT_ZONE=europe-west2-c 12 | -------------------------------------------------------------------------------- /scripts/07-tear-down-author.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Tears down all billable resources for the workload author. 4 | 5 | echo "Loading functions and environment variables..." 6 | source common.sh 7 | 8 | set_gcp_project $WORKLOAD_AUTHOR_PROJECT 9 | 10 | delete_artifact_repository $ARTIFACT_REPOSITORY $WORKLOAD_AUTHOR_PROJECT_REGION 11 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM --platform=linux/amd64 python:3.11-slim-bookworm 2 | 3 | ENV PYTHONUNBUFFERED=True 4 | ENV PRODUCTION=1 5 | 6 | COPY pyproject.toml . 7 | ADD src/pprl src/pprl 8 | RUN python -m pip install --upgrade pip 9 | RUN python -m pip install --no-cache-dir . 10 | 11 | COPY .env . 12 | COPY scripts/server.py . 13 | 14 | CMD [ "python", "server.py" ] 15 | -------------------------------------------------------------------------------- /src/pprl/app/templates/download-results.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block head %} 4 | {% endblock %} 5 | 6 | {% block body %} 7 |

Download your results

8 |
9 | Press the button to download your results. 10 |

11 |
12 | 13 |
14 | {% endblock %} 15 | -------------------------------------------------------------------------------- /docs/tutorials/index.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Tutorials 3 | listing: 4 | type: table 5 | contents: 6 | - "*.qmd" 7 | fields: [title, description, reading-time] 8 | sort-ui: false 9 | filter-ui: false 10 | --- 11 | 12 | These tutorials walk you through some of the essential workflows for pprl. 13 | The purpose of these documents is for you to learn how to use the pprl 14 | package for your own linkage projects. 15 | 16 |
17 | -------------------------------------------------------------------------------- /src/pprl/app/templates/check-results.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block head %} 4 | {% endblock %} 5 | 6 | {% block body %} 7 |

Check for your results

8 |
9 | 10 | {% if message %} 11 |

{{message}}


12 | {% endif %} 13 | 14 | Press the button to check for your results. 15 |

16 | 17 |
18 | 19 |
20 | {% endblock %}
21 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | !.gitkeep
3 | 
4 | # data
5 | data/*
6 | secrets/*
7 | *.json
8 | 
9 | # logs
10 | log/*
11 | 
12 | 
13 | 
14 | # environment
15 | .env
16 | 
17 | # documentation
18 | /.quarto/
19 | /_site/
20 | 
21 | # tests
22 | .tox/
23 | .coverage
24 | .mypy_cache/
25 | .pytest_cache/
26 | .hypothesis/
27 | .ruff_cache/
28 | 
29 | 
30 | # system
31 | .DS_Store
32 | .vscode/
33 | 
34 | # cache
35 | */__pycache__/
36 | */**/__pycache__/
37 | 
38 | # build
39 | build/
40 | dist/
41 | *.egg-info/
42 | *.egg
43 | 
--------------------------------------------------------------------------------
/scripts/06-tear-down-operator.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Tears down all billable resources for the workload operator.
4 | 
5 | echo "Loading functions and environment variables..."
6 | source common.sh
7 | 
8 | set_gcp_project $WORKLOAD_OPERATOR_PROJECT
9 | 
10 | echo "Deleting workload virtual machine..."
11 | gcloud compute instances delete \
12 |     projects/$WORKLOAD_OPERATOR_PROJECT/zones/$WORKLOAD_OPERATOR_PROJECT_ZONE/instances/pprl-cvm
13 | 
14 | delete_storage_bucket $ATTESTATION_BUCKET
15 | 
16 | delete_service_account $WORKLOAD_SERVICE_ACCOUNT_EMAIL
17 | 
--------------------------------------------------------------------------------
/scripts/08-tear-down-party.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Tears down all billable resources for the data-owning party.
4 | 
5 | echo "Loading functions and environment variables..."
6 | source common.sh
7 | 
8 | export PROJECT_NAME=${1}
9 | export PROJECT_KEY_VERSION=${2}
10 | if [ ! $PROJECT_KEY_VERSION ]; then
11 |     export PROJECT_KEY_VERSION=1
12 | fi
13 | 
14 | 
15 | set_gcp_project $PROJECT_NAME
16 | 
17 | delete_storage_bucket $PROJECT_NAME-bucket
18 | 
19 | destroy_kms_key_version \
20 |     $PROJECT_NAME-akek $PROJECT_NAME-akek-kr $PROJECT_LOCATION $PROJECT_KEY_VERSION
21 | 
22 | delete_workload_identity_pool $PROJECT_NAME-wip $PROJECT_LOCATION
23 | 
24 | delete_service_account $PROJECT_NAME-sa@$PROJECT_NAME.iam.gserviceaccount.com
25 | 
--------------------------------------------------------------------------------
/scripts/05-run-workload.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Sets the workload running on GCP.
4 | 
5 | echo "Loading functions and environment variables..."
6 | source common.sh
7 | 
8 | set_gcp_project $WORKLOAD_OPERATOR_PROJECT
9 | 
10 | echo "Setting up confidential VM..."
11 | gcloud compute instances create pprl-cvm \
12 |     --confidential-compute \
13 |     --shielded-secure-boot \
14 |     --maintenance-policy=TERMINATE \
15 |     --scopes=cloud-platform \
16 |     --zone=$WORKLOAD_OPERATOR_PROJECT_ZONE \
17 |     --image-project=confidential-space-images \
18 |     --image-family=confidential-space \
19 |     --service-account=$WORKLOAD_SERVICE_ACCOUNT_EMAIL \
20 |     --metadata "^~^tee-image-reference=$WORKLOAD_IMAGE_REFERENCE:$WORKLOAD_IMAGE_TAG~tee-restart-policy=Never"
21 | 
--------------------------------------------------------------------------------
/index.qmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: Welcome to the **pprl** documentation!
3 | toc: false 4 | sidebar: false 5 | about: 6 | template: marquee 7 | links: 8 | - icon: github 9 | href: https://github.com/datasciencecampus/pprl 10 | text: GitHub 11 | --- 12 | 13 | ## What is this and why does it exist? 14 | 15 | This package, **pprl**, implements a method for performing 16 | Privacy Preserving Record Linkage. This linkage can be done 17 | locally or through Google Cloud Platform. 18 | 19 | ## Where do I go now? 20 | 21 | If you're looking to get stuck in with pprl, head over to our 22 | [tutorials](docs/tutorials/index.qmd). 23 | 24 | For more focused, technical details of how this all works, see our 25 | [API reference](docs/reference/index.qmd). 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-idea.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature idea 3 | about: Suggest an idea for this project 4 | title: "[FEATURE] : " 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | Please be aware that, as pprl is an experimental package, ONS cannot promise to implement feature ideas. 11 | 12 | ### Does your feature idea solve a problem? 13 | If this applies to your idea, please provide a clear and concise description of what the problem is. 14 | 15 | ### Describe the solution you'd like 16 | A clear and concise description of what you want to happen. 17 | 18 | ### Describe alternatives you've considered 19 | A clear and concise description of any alternative solutions or features you've considered. 20 | 21 | ### Additional context 22 | Add any other context or screenshots about the feature request here. 23 | -------------------------------------------------------------------------------- /src/pprl/app/templates/home.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block body %} 4 |

Welcome to the PPRL application

5 | 6 |

7 | This application is for data-owning parties to process and upload their data 8 | to a Google Cloud Platform (GCP) bucket. Once both parties have uploaded 9 | their data, the operator can run the workload to link your datasets in a 10 | secure environment. 11 | 12 | Keep this app open and you will be able to download your results at the end. 13 |

14 |
15 |

16 | To begin, please select which party you are. 17 |

18 |
19 |
20 | 25 |
26 | 27 |
28 | {% endblock %} 29 | -------------------------------------------------------------------------------- /src/pprl/app/templates/choose-data.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block body %} 4 |

Choose a dataset

5 | 6 | Use one of the FEBRL datasets from the RecordLinkage package or upload 7 | your own dataset. If you choose the latter, your file must be a CSV. 8 |
9 | {{message}} 10 |
11 |
12 |

13 | 14 | 15 |

16 |

17 | 18 | 19 |

20 |

21 | 22 | 23 |

24 |

25 |
26 | 27 |

28 |
29 | {% endblock %} 30 | -------------------------------------------------------------------------------- /test/test_encryption.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the `encryption` module.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from pprl import encryption 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "input_df", 11 | [ 12 | ( 13 | pd.DataFrame( 14 | dict( 15 | ints=[1, 4, 5, 1273873], 16 | bools=[True, False, False, True], 17 | strings=["a", "bchc", "12djd", "]p8s|"], 18 | ) 19 | ) 20 | ), 21 | (pd.DataFrame(dict())), 22 | (pd.DataFrame(dict(mixed=[1, True, "my_string", 1.43434]))), 23 | ], 24 | ) 25 | def test_encrypt_decrypt_data(input_df): 26 | """Make sure the dataframe is unchanged by the encryption process. 27 | 28 | We ignore the index here. 29 | """ 30 | 31 | payload, data_enc_key = encryption.encrypt_data(input_df) 32 | decrypted_dataframe = encryption.decrypt_data(payload, data_enc_key) 33 | assert input_df.equals(decrypted_dataframe.reset_index(drop=True)) 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG] : " 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | Please be aware that, as pprl is an experimental package, ONS cannot promise to resolve bugs. 11 | 12 | ### Describe the bug 13 | A clear and concise description of what the bug is. 14 | 15 | ### How to reproduce the bug 16 | Steps to reproduce the behaviour: 17 | 1. Go to '...' 18 | 2. Click on '....' 19 | 3. Scroll down to '....' 20 | 4. See error 21 | 22 | ### Expected behaviour 23 | A clear and concise description of what you expected to happen. 24 | 25 | ### Evidence (tracebacks and screenshots) 26 | If applicable, please add any tracebacks or screenshots to help explain your problem. 27 | 28 | ### System information 29 | Please provide the following information about your environment: 30 | 31 | - OS: [e.g. macOS] 32 | - Browser (when using the client-side app or GCP): [e.g. Chrome, Safari] 33 | - pprl version: [e.g. 0.0.1] 34 | 35 | ### Additional context 36 | Add any other context about the problem here. 
37 | -------------------------------------------------------------------------------- /test/app/conftest.py: -------------------------------------------------------------------------------- 1 | """Test configuration.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from pprl.app import app 7 | 8 | 9 | @pytest.fixture() 10 | def client(): 11 | """Create a test client.""" 12 | return app.test_client() 13 | 14 | 15 | @pytest.fixture() 16 | def csv_client(): 17 | """Create a test client with a CSV attached.""" 18 | app.config["unprocessed_dataframe"] = pd.DataFrame(dict(column1=[], column2=[])) 19 | app.config["filename"] = "my_file.csv" 20 | return app.test_client() 21 | 22 | 23 | @pytest.fixture() 24 | def no_party_client(): 25 | """Create a test client with a CSV but no party number.""" 26 | app.config["unprocessed_dataframe"] = pd.DataFrame(dict(column1=[], column2=[])) 27 | app.config["party_number"] = None 28 | return app.test_client() 29 | 30 | 31 | @pytest.fixture() 32 | def party1_client(): 33 | """Create a test client with a CSV and party number 1.""" 34 | app.config["unprocessed_dataframe"] = pd.DataFrame(dict(column1=[], column2=[])) 35 | app.config["party_number"] = 1 36 | return app.test_client() 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Crown copyright Office for National Statistics 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/pprl/matching/local.py: -------------------------------------------------------------------------------- 1 | """Functions for performing matching locally.""" 2 | 3 | import os 4 | 5 | from pprl import config 6 | from pprl.embedder.embedder import Embedder 7 | 8 | 9 | def build_local_file_paths(party: str) -> tuple[str, str]: 10 | """ 11 | Construct the paths for the input and output datasets for a party. 12 | 13 | Parameters 14 | ---------- 15 | party : str 16 | Name of the party. 17 | 18 | Returns 19 | ------- 20 | inpath : str 21 | Location of the party data. 22 | outpath : str 23 | Location to put the party results. 
24 | """ 25 | 26 | stem = config.DIR_DATA_INTERIM 27 | inpath = os.path.join(stem, f"{party}-data.json") 28 | outpath = os.path.join(stem, f"{party}-output.json") 29 | 30 | return inpath, outpath 31 | 32 | 33 | def load_embedder() -> Embedder: 34 | """ 35 | Load an embedder from a pickle in the local data directory. 36 | 37 | Returns 38 | ------- 39 | embedder : Embedder 40 | Reformed embedder instance. 41 | """ 42 | 43 | path = os.path.join(config.DIR_DATA_INTERIM, "embedder.pkl") 44 | embedder = Embedder.from_pickle(path=path) 45 | 46 | return embedder 47 | -------------------------------------------------------------------------------- /scripts/03-setup-workload-author.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Creates artifact repository and workload image, then uploads the image. 4 | 5 | echo "Loading functions and environment variables..." 6 | source common.sh 7 | 8 | set_gcp_project $WORKLOAD_AUTHOR_PROJECT 9 | 10 | echo "Enabling APIs for workload author on $WORKLOAD_AUTHOR_PROJECT..." 11 | gcloud services enable artifactregistry.googleapis.com 12 | 13 | create_artifact_repository $ARTIFACT_REPOSITORY $WORKLOAD_AUTHOR_REGION 14 | 15 | gcloud auth configure-docker $WORKLOAD_AUTHOR_PROJECT_REGION-docker.pkg.dev 16 | 17 | echo "Building the workload Docker image..." 18 | cd .. 19 | docker build . -t $WORKLOAD_IMAGE_REFERENCE 20 | cd scripts 21 | 22 | echo "Pushing the workload Docker image to artifact registry $ARTIFACT_REPOSITORY..." 23 | docker push $WORKLOAD_IMAGE_REFERENCE:$WORKLOAD_IMAGE_TAG 24 | 25 | echo "Granting roles/artifactregistry.reader role to workload service account $WORKLOAD_SERVICE_ACCOUNT..." 26 | gcloud artifacts repositories add-iam-policy-binding $ARTIFACT_REPOSITORY \ 27 | --project=$WORKLOAD_AUTHOR_PROJECT \ 28 | --role=roles/artifactregistry.reader \ 29 | --location=$WORKLOAD_AUTHOR_PROJECT_REGION \ 30 | --member="serviceAccount:$WORKLOAD_SERVICE_ACCOUNT_EMAIL" 31 | -------------------------------------------------------------------------------- /test/embedder/test_bloom_filters.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the bloom_filters module.""" 2 | 3 | from hypothesis import given 4 | from hypothesis import strategies as st 5 | 6 | from pprl.embedder.bloom_filters import BloomFilterEncoder 7 | 8 | 9 | @given( 10 | st.lists(st.integers() | st.floats() | st.text(min_size=1), min_size=1, max_size=40), 11 | st.integers(min_value=2, max_value=100), 12 | st.integers(min_value=1, max_value=5), 13 | st.integers(min_value=0, max_value=50), 14 | st.text(), 15 | ) 16 | def test_bloom_filter_vector_collision_fraction(feature, size, num_hashes, offset, salt): 17 | """Test BloomFilterEncoder.bloom_filter_vector_collision_fraction. 18 | 19 | Tests the following properties for vec_idx_deduped: list[0 < int < size]. 20 | Tests the following properties for collision_fraction: >= 0, <= 1. 
21 | """ 22 | bfencoder = BloomFilterEncoder(size=size, num_hashes=num_hashes, offset=offset, salt=salt) 23 | vec_idx_deduped, collision_fraction = bfencoder.bloom_filter_vector_collision_fraction(feature) 24 | 25 | assert all(isinstance(element, int) for element in vec_idx_deduped) 26 | assert all(element <= (size + offset - 1) for element in vec_idx_deduped) 27 | assert all(element >= offset for element in vec_idx_deduped) 28 | 29 | assert collision_fraction <= 1 30 | assert collision_fraction >= 0 31 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - main 8 | - "dev*" 9 | 10 | jobs: 11 | build: 12 | 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | matrix: 16 | os: [ubuntu-latest, windows-latest] 17 | python-version: ["3.10", "3.11"] 18 | 19 | steps: 20 | - name: Checkout repository 21 | uses: actions/checkout@v3 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v4 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | cache: "pip" 27 | - name: Update pip and install test dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install ".[test]" 31 | - name: Run tests 32 | run: | 33 | python -m pytest test 34 | - name: Run doctests 35 | if: | 36 | matrix.python-version == '3.11' && 37 | matrix.os == 'ubuntu-latest' 38 | run: | 39 | python -m doctest README.md 40 | - name: Install and run linters 41 | if: | 42 | matrix.python-version == '3.11' && 43 | matrix.os == 'ubuntu-latest' 44 | run: | 45 | python -m pip install ".[lint]" 46 | python -m ruff check src test 47 | python -m ruff format --check src test 48 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Publish documentation 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: main 7 | 8 | jobs: 9 | build-deploy: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | contents: write 13 | pages: write 14 | steps: 15 | - name: Check out repository 16 | uses: actions/checkout@v3 17 | - name: Set up Quarto 18 | uses: quarto-dev/quarto-actions/setup@v2 19 | - name: Install Python and dependencies 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: "3.11" 23 | cache: "pip" 24 | - name: Build API reference 25 | run: | 26 | python -m pip install ".[docs]" 27 | python -m quartodoc build 28 | - name: Render and publish 29 | uses: quarto-dev/quarto-actions/publish@v2 30 | with: 31 | target: gh-pages 32 | env: 33 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 34 | PARTY_1_PROJECT: pprl-party-1 35 | PARTY_1_KEY_VERSION: 1 36 | PARTY_2_PROJECT: pprl-party-2 37 | PARTY_2_KEY_VERSION: 1 38 | WORKLOAD_AUTHOR_PROJECT: pprl-party-1 39 | WORKLOAD_AUTHOR_PROJECT_REGION: europe-west2 40 | WORKLOAD_OPERATOR_PROJECT: pprl-party-2 41 | WORKLOAD_OPERATOR_PROJECT_ZONE: europe-west2-c 42 | -------------------------------------------------------------------------------- /test/matching/test_perform.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the server utilities module.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from pprl.matching import perform 7 | 8 | 9 | # Adding row index to check for bug to do with use of .loc instead of .iloc 10 | 
@pytest.mark.parametrize( 11 | "df1,df2,match,colname,expected", 12 | [ 13 | ( 14 | pd.DataFrame(dict(x=list("abcd")), index=["ann", "oying", "ind", "ex"]), 15 | pd.DataFrame(dict(y=list("abcd")), index=["an", "oth", "ero", "ne"]), 16 | ([0, 2], [0, 2]), 17 | "private_index", 18 | ["aa", "cc"], 19 | ), 20 | ], 21 | ) 22 | def test_add_private_index(df1, df2, match, colname, expected): 23 | """Test adding a private index works with move to `.iloc`.""" 24 | out1, out2 = perform.add_private_index(df1=df1, df2=df2, match=match, colname=colname) 25 | result = out1.merge(out2, on=colname).loc[:, ["x", "y"]].agg("".join, axis=1).to_list() 26 | 27 | assert result == expected 28 | 29 | 30 | @pytest.mark.parametrize( 31 | "df1,df2,match", 32 | [ 33 | ( 34 | pd.DataFrame(dict(x=list("abcd")), index=["ann", "oying", "ind", "ex"]), 35 | pd.DataFrame(dict(y=list("abcd")), index=["an", "oth", "ero", "ne"]), 36 | ([0, 2], [0, 2]), 37 | ), 38 | ], 39 | ) 40 | def test_add_private_index_complete(df1, df2, match): 41 | """Check that the private indexes are all integers (no missing).""" 42 | out1, out2 = perform.add_private_index(df1=df1, df2=df2, match=match) 43 | 44 | assert all(isinstance(i, int) for i in out1.private_index) 45 | assert all(isinstance(i, int) for i in out2.private_index) 46 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=62", "wheel", "cython"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "pprl" 7 | version = "0.1.0" 8 | authors = [ 9 | {name = "Mat Weldon"}, 10 | {name = "Samuel Stock"}, 11 | {name = "Kevin Fasusi"}, 12 | {name = "Henry Wilde"}, 13 | {name = "Data Science Campus", email = "datacampus@ons.gov.uk"}, 14 | ] 15 | description = "Privacy-preserving record linkage via Bloom filter embedding" 16 | readme = "README.md" 17 | requires-python = ">=3.10" 18 | license = {text = "MIT License"} 19 | dependencies = [ 20 | "dill", 21 | "flask", 22 | "numpy", 23 | "pandas==2.0.2", 24 | "python-dotenv", 25 | "requests==2.30.0", 26 | "metaphone", 27 | "cryptography", 28 | "google-cloud-storage", 29 | "google-cloud-logging", 30 | "google-cloud-kms", 31 | "scipy", 32 | "recordlinkage", 33 | ] 34 | 35 | [project.urls] 36 | homepage = "https://github.com/datasciencecampus/pprl_toolkit" 37 | 38 | [project.optional-dependencies] 39 | lint = ["ruff==0.3.0", "mypy"] 40 | test = [ 41 | "hypothesis", 42 | "pytest", 43 | "pytest-randomly", 44 | "pytest-sugar", 45 | "pytest-cov" 46 | ] 47 | docs = [ 48 | "ipykernel", 49 | "nbclient>=0.9.0", 50 | "nbformat>=5.9.2", 51 | "quartodoc>=0.6.6", 52 | ] 53 | dev = [ 54 | "pre-commit==3.1.0", 55 | "pprl[lint,docs,test]" 56 | ] 57 | 58 | [tool.ruff] 59 | line-length = 99 60 | exclude = ["notebooks/*"] 61 | extend-include = ["*.ipynb"] 62 | 63 | [tool.ruff.lint] 64 | extend-select = ["D", "I", "W"] 65 | ignore = ["D105", "D107", "D202", "D413"] 66 | 67 | [tool.ruff.lint.isort] 68 | known-first-party = ["pprl"] 69 | 70 | [tool.ruff.lint.pydocstyle] 71 | convention = "numpy" 72 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | # Run 'pre-commit autoupdate' to update hook versions 4 | repos: 5 | - repo: 
https://github.com/kynan/nbstripout 6 | rev: 0.7.1 7 | hooks: 8 | - id: nbstripout 9 | name: nbstripout - Strip outputs from notebooks (auto-fixes) 10 | args: 11 | - --extra-keys 12 | - "metadata.colab metadata.kernelspec cell.metadata.colab cell.metadata.executionInfo cell.metadata.id cell.metadata.outputId" 13 | - repo: https://github.com/pre-commit/pre-commit-hooks 14 | rev: v4.5.0 15 | hooks: 16 | - id: check-added-large-files 17 | name: Check for files larger than 5 MB 18 | args: [ "--maxkb=5120" ] 19 | - id: end-of-file-fixer 20 | name: Check for a blank line at the end of scripts (auto-fixes) 21 | exclude: '\.Rd' 22 | - id: trailing-whitespace 23 | name: Check for trailing whitespaces (auto-fixes) 24 | - repo: https://github.com/astral-sh/ruff-pre-commit 25 | # Ruff version. 26 | rev: v0.3.0 27 | hooks: 28 | # Run the linter. 29 | - id: ruff 30 | args: [ --fix ] 31 | # Run the formatter. 32 | - id: ruff-format 33 | - repo: https://github.com/Yelp/detect-secrets 34 | rev: v1.4.0 35 | hooks: 36 | - id: detect-secrets 37 | name: detect-secrets - Detect secrets in staged code 38 | args: [ "--baseline", ".secrets.baseline", '--exclude-files', '.*\.(ipynb|qmd)$', ] 39 | exclude: .*/tests/.*|^\.cruft\.json$ 40 | - id: detect-secrets 41 | name: 'detect-secrets-jupyter' 42 | args: ['--exclude-files', '.*[^i][^p][^y][^n][^b]$', '--exclude-lines', '"(hash|id|image/\w+)":.*', ] 43 | -------------------------------------------------------------------------------- /src/pprl/config.py: -------------------------------------------------------------------------------- 1 | """Functions for handling PPRL configuration.""" 2 | 3 | import inspect 4 | import os 5 | from pathlib import Path 6 | 7 | import dotenv 8 | 9 | import pprl 10 | 11 | 12 | def _find_directory(kind: str, what: str | None = None) -> Path: 13 | """ 14 | Find a directory in the root of the pprl installation. 15 | 16 | Parameters 17 | ---------- 18 | kind : str 19 | The category of directory to find. Typically `data` or `log`. 20 | what : str, optional 21 | The name of the directory in `kind` to find. If not specified, 22 | then `kind` is treated as the name of the directory. 23 | 24 | Returns 25 | ------- 26 | where : pathlib.Path 27 | Path object to the directory. 28 | """ 29 | 30 | where = Path(inspect.getfile(pprl)).parent.parent.parent / kind 31 | 32 | if what is not None: 33 | where /= what 34 | 35 | return where 36 | 37 | 38 | def load_environment(path: None | str = None) -> dict[str, None | str]: 39 | """ 40 | Load the configuration file as a dictionary. 41 | 42 | Parameters 43 | ---------- 44 | path : str, optional 45 | Location of the configuration file to load. If not specified, 46 | try to load the configuration file from the root of the pprl 47 | installation called `.env`. 48 | 49 | Returns 50 | ------- 51 | config : collections.OrderedDict 52 | Mapping of the key-value pairs in the configuration file. 
53 | """ 54 | 55 | if path is None: 56 | path = os.path.join(PPRL_ROOT, ".env") 57 | 58 | return dotenv.dotenv_values(path) 59 | 60 | 61 | PPRL_ROOT = _find_directory("") 62 | DIR_DATA_RAW = _find_directory("data", "raw") 63 | DIR_DATA_INTERIM = _find_directory("data", "interim") 64 | DIR_DATA_PROCESSED = _find_directory("data", "processed") 65 | DIR_LOGS = _find_directory("log") 66 | -------------------------------------------------------------------------------- /test/test_config.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the `config` module.""" 2 | 3 | import os 4 | import string 5 | from pathlib import Path 6 | from unittest import mock 7 | 8 | from hypothesis import given 9 | from hypothesis import strategies as st 10 | 11 | import pprl 12 | from pprl import config 13 | 14 | st_text = st.text(alphabet=string.ascii_lowercase, min_size=1) 15 | 16 | 17 | @given(st_text, st.one_of((st.just(None), st_text))) 18 | def test_find_directory(kind, what): 19 | """Test that a directory can be found correctly.""" 20 | 21 | root = Path("/path/to/a/test/module") 22 | with mock.patch("pprl.config.inspect.getfile") as get: 23 | get.return_value = root / "where" / "stuff" / "lives" 24 | directory = config._find_directory(kind, what) 25 | 26 | assert isinstance(directory, Path) 27 | 28 | if what is None: 29 | assert directory.stem == kind 30 | assert directory.parent == root 31 | else: 32 | assert directory.stem == what 33 | assert directory.parent == root / kind 34 | 35 | get.assert_called_once_with(pprl) 36 | 37 | 38 | @given(st_text) 39 | def test_load_environment_with_filename(filename): 40 | """Test the config loader works with a file name.""" 41 | 42 | with ( 43 | mock.patch("pprl.config.dotenv.dotenv_values") as values, 44 | mock.patch("pprl.config.os.path.join") as join, 45 | ): 46 | values.return_value = "foo" 47 | result = config.load_environment(filename) 48 | 49 | assert result == "foo" 50 | 51 | values.assert_called_once_with(filename) 52 | join.assert_not_called() 53 | 54 | 55 | def test_load_environment_default(): 56 | """Test the config loader works without a file name.""" 57 | 58 | with mock.patch("pprl.config.dotenv.dotenv_values") as values: 59 | values.return_value = "foo" 60 | result = config.load_environment() 61 | 62 | assert result == "foo" 63 | 64 | values.assert_called_once_with(os.path.join(config.PPRL_ROOT, ".env")) 65 | -------------------------------------------------------------------------------- /src/pprl/app/templates/process-data.html: -------------------------------------------------------------------------------- 1 | 8 | 9 | {% extends "base.html" %} 10 | 11 | {% block body %} 12 | 13 |
14 |

Select column types

15 | 16 | Please choose how each column should be treated from the dropdown menus 17 | below. See our feature 19 | documentation for details. 20 |

21 |
22 | {% for column in columns%} 23 |

24 | 25 |
32 |

33 | {% endfor %} 34 | 35 |
36 |

Choose a salt (optional)

37 |

38 | A cryptographic salt is a string that can be appended to every token in your data 39 | before it is hashed into the Bloom filter embedding and uploaded to GCP. 40 | The salt must be the same for both parties, otherwise their embeddings will not match.

42 |
43 |

44 | 45 | 46 |

47 | 48 |
49 |

Where next?

50 |

51 | You can either continue with the linkage by uploading the processed data to 52 | Google Cloud Platform, or download your processed data locally. The local 53 | download contains the Bloom filter embedding for your chosen dataset. 54 |

55 |
56 | 57 |   58 | 59 |
60 | {% endblock %} 61 | 62 | 63 | form { display: table; } 64 | p { display: table-row; } 65 | label { display: table-cell; } 66 | input { display: table-cell; } 67 | -------------------------------------------------------------------------------- /.secrets.baseline: -------------------------------------------------------------------------------- 1 | { 2 | "version": "1.0.3", 3 | "plugins_used": [ 4 | { 5 | "name": "ArtifactoryDetector" 6 | }, 7 | { 8 | "name": "AWSKeyDetector" 9 | }, 10 | { 11 | "name": "AzureStorageKeyDetector" 12 | }, 13 | { 14 | "name": "Base64HighEntropyString", 15 | "limit": 4.5 16 | }, 17 | { 18 | "name": "BasicAuthDetector" 19 | }, 20 | { 21 | "name": "CloudantDetector" 22 | }, 23 | { 24 | "name": "HexHighEntropyString", 25 | "limit": 3.0 26 | }, 27 | { 28 | "name": "IbmCloudIamDetector" 29 | }, 30 | { 31 | "name": "IbmCosHmacDetector" 32 | }, 33 | { 34 | "name": "JwtTokenDetector" 35 | }, 36 | { 37 | "name": "KeywordDetector", 38 | "keyword_exclude": "" 39 | }, 40 | { 41 | "name": "MailchimpDetector" 42 | }, 43 | { 44 | "name": "NpmDetector" 45 | }, 46 | { 47 | "name": "PrivateKeyDetector" 48 | }, 49 | { 50 | "name": "SlackDetector" 51 | }, 52 | { 53 | "name": "SoftlayerDetector" 54 | }, 55 | { 56 | "name": "SquareOAuthDetector" 57 | }, 58 | { 59 | "name": "StripeDetector" 60 | }, 61 | { 62 | "name": "TwilioKeyDetector" 63 | } 64 | ], 65 | "filters_used": [ 66 | { 67 | "path": "detect_secrets.filters.allowlist.is_line_allowlisted" 68 | }, 69 | { 70 | "path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies", 71 | "min_level": 2 72 | }, 73 | { 74 | "path": "detect_secrets.filters.heuristic.is_indirect_reference" 75 | }, 76 | { 77 | "path": "detect_secrets.filters.heuristic.is_likely_id_string" 78 | }, 79 | { 80 | "path": "detect_secrets.filters.heuristic.is_potential_uuid" 81 | }, 82 | { 83 | "path": "detect_secrets.filters.heuristic.is_prefixed_with_dollar_sign" 84 | }, 85 | { 86 | "path": "detect_secrets.filters.heuristic.is_sequential_string" 87 | }, 88 | { 89 | "path": "detect_secrets.filters.heuristic.is_templated_secret" 90 | } 91 | ], 92 | "results": {}, 93 | "generated_at": "2021-06-14T10:43:14Z" 94 | } 95 | -------------------------------------------------------------------------------- /scripts/04-authorise-workload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Authorises the workload to use the identity pool. 4 | 5 | echo "Loading functions and environment variables..." 6 | source common.sh 7 | 8 | export PROJECT_NAME=${1} 9 | export PROJECT_WORKLOAD_IDENTITY_POOL=$PROJECT_NAME-wip 10 | export PROJECT_WIP_PROVIDER=$PROJECT_WORKLOAD_IDENTITY_POOL-provider 11 | export PROJECT_NUMBER=$(gcloud projects describe $PROJECT_NAME --format="value(projectNumber)") 12 | export PROJECT_SERVICE_ACCOUNT_EMAIL=$PROJECT_NAME-sa@$PROJECT_NAME.iam.gserviceaccount.com 13 | 14 | export OPERATION=${2} 15 | if [ ! $OPERATION ]; then 16 | export OPERATION=create 17 | fi 18 | 19 | set_gcp_project $PROJECT_NAME 20 | 21 | echo "Creating provider for $PROJECT_WORKLOAD_IDENTITY_POOL authorising $WORKLOAD_IMAGE_REFERENCE..." 
22 | gcloud iam workload-identity-pools providers ${OPERATION}-oidc $PROJECT_WIP_PROVIDER \ 23 | --location=$PROJECT_LOCATION \ 24 | --workload-identity-pool="$PROJECT_WORKLOAD_IDENTITY_POOL" \ 25 | --issuer-uri="https://confidentialcomputing.googleapis.com/" \ 26 | --allowed-audiences="https://sts.googleapis.com" \ 27 | --attribute-mapping="google.subject='assertion.sub'" \ 28 | --attribute-condition="assertion.swname == 'CONFIDENTIAL_SPACE' && 29 | 'STABLE' in assertion.submods.confidential_space.support_attributes && 30 | assertion.submods.container.image_reference == '$WORKLOAD_IMAGE_REFERENCE:$WORKLOAD_IMAGE_TAG' && 31 | '$WORKLOAD_SERVICE_ACCOUNT_EMAIL' in assertion.google_service_accounts" 32 | 33 | echo "Creating attestation credentials file for $WORKLOAD_SERVICE_ACCOUNT..." 34 | gcloud iam workload-identity-pools create-cred-config \ 35 | projects/$PROJECT_NUMBER/locations/$PROJECT_LOCATION/workloadIdentityPools/$PROJECT_WORKLOAD_IDENTITY_POOL/providers/$PROJECT_WIP_PROVIDER \ 36 | --service-account=$PROJECT_SERVICE_ACCOUNT_EMAIL \ 37 | --credential-source-file="/run/container_launcher/attestation_verifier_claims_token" \ 38 | --output-file=../secrets/$PROJECT_NAME-attestation-credentials.json 39 | 40 | echo "Copying attestation credentials for $PROJECT_NAME to $ATTESTATION_BUCKET..." 41 | if ! gsutil cp ../secrets/$PROJECT_NAME-attestation-credentials.json gs://$ATTESTATION_BUCKET/; then 42 | err "Failed to upload the attestation credentials for $PROJECT_NAME to $ATTESTATION_BUCKET." 43 | fi 44 | -------------------------------------------------------------------------------- /_quarto.yml: -------------------------------------------------------------------------------- 1 | project: 2 | type: website 3 | 4 | website: 5 | title: "**pprl**" 6 | navbar: 7 | left: 8 | - href: index.qmd 9 | text: About 10 | - href: docs/tutorials/index.qmd 11 | text: Tutorials 12 | - href: docs/reference/index.qmd 13 | text: API Reference 14 | right: 15 | - icon: github 16 | menu: 17 | - text: Source code 18 | url: https://github.com/datasciencecampus/pprl_toolkit 19 | - text: Open an issue 20 | url: https://github.com/datasciencecampus/pprl_toolkit/issues 21 | sidebar: 22 | style: docked 23 | search: true 24 | contents: 25 | - text: About 26 | href: index.qmd 27 | - auto: "*.qmd" 28 | reader-mode: true 29 | page-footer: 30 | left: > 31 | All content is available under the 32 | [Open Government Licence V3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/), 33 | except where otherwise stated. 34 | center: > 35 | Built using [Quarto](https://quarto.org/). 36 | 37 | format: 38 | html: 39 | mainfont: Arial 40 | theme: 41 | light: flatly 42 | dark: darkly 43 | lang: en-GB 44 | 45 | metadata-files: 46 | - docs/_sidebar.yml 47 | 48 | quartodoc: 49 | title: API reference 50 | package: pprl 51 | dir: docs/reference 52 | sidebar: docs/_sidebar.yml 53 | sections: 54 | - title: Embeddings 55 | desc: > 56 | Tools for generating a Bloom filter embedding and its underlying 57 | features. 58 | package: pprl.embedder 59 | contents: 60 | - bloom_filters 61 | - embedder 62 | - features 63 | - title: Encryption 64 | desc: Functions for handling the data and key encryption processes. 65 | contents: 66 | - encryption 67 | - title: Configuration 68 | desc: Functions for working out and handling linkage configuration. 69 | contents: 70 | - config 71 | - title: Client-side app 72 | desc: > 73 | Functions for the Flask application where users upload, process, and 74 | download their data. 
75 | package: pprl.app 76 | contents: 77 | - utils 78 | - title: Server functions 79 | desc: > 80 | Functions for the matching workload server. Used in `scripts/server.py` 81 | package: pprl.matching 82 | contents: 83 | - cloud 84 | - local 85 | - perform 86 | -------------------------------------------------------------------------------- /scripts/02-setup-workload-operator.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Creates a service account for the workload operator. 4 | 5 | echo "Loading functions and environment variables..." 6 | source common.sh 7 | 8 | set_gcp_project $WORKLOAD_OPERATOR_PROJECT 9 | 10 | echo "Enabling APIs for workload operator on $WORKLOAD_OPERATOR_PROJECT..." 11 | gcloud services enable \ 12 | compute.googleapis.com \ 13 | confidentialcomputing.googleapis.com \ 14 | logging.googleapis.com 15 | 16 | echo "Creating attestion bucket for $WORKLOAD_OPERATOR_PROJECT..." 17 | create_storage_bucket $ATTESTATION_BUCKET 18 | 19 | echo "Granting parties the rights to access $ATTESTATION_BUCKET..." 20 | grant_attestation_bucket_rights $PARTY_1_PROJECT_EMAIL $ATTESTATION_BUCKET 21 | grant_attestation_bucket_rights $PARTY_2_PROJECT_EMAIL $ATTESTATION_BUCKET 22 | 23 | echo "Creating workload service account $WORKLOAD_SERVICE_ACCOUNT under $WORKLOAD_OPERATOR_PROJECT..." 24 | create_service_account $WORKLOAD_SERVICE_ACCOUNT 25 | 26 | echo "Granting roles/storage.admin role for $ATTESTATION_BUCKET to service account $WORKLOAD_SERVICE_ACCOUNT..." 27 | if ! gcloud storage buckets add-iam-policy-binding gs://$ATTESTATION_BUCKET \ 28 | --member=serviceAccount:$WORKLOAD_SERVICE_ACCOUNT_EMAIL \ 29 | --role=roles/storage.admin; then 30 | err "Failed to grant roles/storage.admin role for $ATTESTATION_BUCKET to service account $WORKLOAD_SERVICE_ACCOUNT." 31 | fi 32 | 33 | echo "Granting roles/iam.serviceAccountUser role to workload operator..." 34 | if ! gcloud iam service-accounts add-iam-policy-binding $WORKLOAD_SERVICE_ACCOUNT_EMAIL \ 35 | --member="user:$(gcloud config get-value account)" \ 36 | --role="roles/iam.serviceAccountUser"; then 37 | err "Failed to grant role to workload operator $WORKLOAD_OPERATOR_USER under $WORKLOAD_OPERATOR_PROJECT." 38 | fi 39 | 40 | echo "Granting roles/confidentialcomputing.workloadUser to service account $WORKLOAD_SERVICE_ACCOUNT..." 41 | if ! gcloud projects add-iam-policy-binding $WORKLOAD_OPERATOR_PROJECT \ 42 | --member="serviceAccount:$WORKLOAD_SERVICE_ACCOUNT_EMAIL" \ 43 | --role="roles/confidentialcomputing.workloadUser"; then 44 | err "Failed to grant roles/confidentialcomputing.workloadUser to service-account $WORKLOAD_SERVICE_ACCOUNT." 45 | fi 46 | 47 | echo "Granting roles/logging.logWriter to service account $WORKLOAD_SERVICE_ACCOUNT..." 48 | if ! gcloud projects add-iam-policy-binding $WORKLOAD_OPERATOR_PROJECT \ 49 | --member="serviceAccount:$WORKLOAD_SERVICE_ACCOUNT_EMAIL" \ 50 | --role="roles/logging.logWriter"; then 51 | err "Failed to grant roles/logging.logWriter to service account $WORKLOAD_SERVICE_ACCOUNT." 52 | fi 53 | -------------------------------------------------------------------------------- /scripts/01-setup-party-resources.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Sets up the cloud resources for a data-owning party. 4 | 5 | echo "Loading functions and environment variables..." 
6 | source common.sh 7 | 8 | export PROJECT_NAME=$1 9 | export PROJECT_BUCKET=$PROJECT_NAME-bucket 10 | export PROJECT_KEYRING=$PROJECT_NAME-akek-kr 11 | export PROJECT_KEY=$PROJECT_NAME-akek 12 | export PROJECT_SERVICE_ACCOUNT=$PROJECT_NAME-sa 13 | export PROJECT_SERVICE_ACCOUNT_DOMAIN=$PROJECT_NAME.iam.gserviceaccount.com 14 | export PROJECT_SERVICE_ACCOUNT_EMAIL=$PROJECT_SERVICE_ACCOUNT@$PROJECT_SERVICE_ACCOUNT_DOMAIN 15 | export PROJECT_WORKLOAD_IDENTITY_POOL=$PROJECT_NAME-wip 16 | export PROJECT_NUMBER=$(gcloud projects describe $PROJECT_NAME --format="value(projectNumber)") 17 | 18 | set_gcp_project $PROJECT_NAME 19 | 20 | echo "Enabling APIs for data owners on $PROJECT_NAME..." 21 | gcloud services enable cloudkms.googleapis.com iamcredentials.googleapis.com 22 | 23 | echo "Creating bucket for $PROJECT_NAME..." 24 | create_storage_bucket $PROJECT_BUCKET 25 | 26 | echo "Creating keyring for $PROJECT_NAME..." 27 | create_kms_keyring $PROJECT_KEYRING $PROJECT_LOCATION 28 | 29 | echo "Creating key encryption key on $PROJECT_KEYRING..." 30 | create_kms_encryption_key $PROJECT_KEY $PROJECT_KEYRING global 31 | 32 | echo "Creating service account for $PROJECT_NAME..." 33 | create_service_account $PROJECT_SERVICE_ACCOUNT 34 | 35 | echo "Granting roles/storage.admin role to $PROJECT_SERVICE_ACCOUNT on $PROJECT_BUCKET..." 36 | gcloud storage buckets add-iam-policy-binding gs://$PROJECT_BUCKET \ 37 | --member=serviceAccount:$PROJECT_SERVICE_ACCOUNT_EMAIL \ 38 | --role=roles/storage.admin 39 | 40 | echo "Granting KMS roles to the service account $PROJECT_SERVICE_ACCOUNT..." 41 | gcloud kms keys add-iam-policy-binding \ 42 | $PROJECT_KEY \ 43 | --keyring=$PROJECT_KEYRING \ 44 | --location=$PROJECT_LOCATION \ 45 | --member=serviceAccount:$PROJECT_SERVICE_ACCOUNT_EMAIL \ 46 | --role=roles/cloudkms.publicKeyViewer 47 | gcloud kms keys add-iam-policy-binding \ 48 | $PROJECT_KEY \ 49 | --keyring=$PROJECT_KEYRING \ 50 | --location=$PROJECT_LOCATION \ 51 | --member=serviceAccount:$PROJECT_SERVICE_ACCOUNT_EMAIL \ 52 | --role=roles/cloudkms.cryptoKeyDecrypter 53 | 54 | echo "Creating workload identity pool for $PROJECT_NAME..." 55 | create_workload_identity_pool $PROJECT_WORKLOAD_IDENTITY_POOL $PROJECT_LOCATION 56 | 57 | echo "Attaching service account $PROJECT_SERVICE_ACCOUNT to workload identity pool $PROJECT_WORKLOAD_IDENTITY_POOL..." 58 | gcloud iam service-accounts add-iam-policy-binding $PROJECT_SERVICE_ACCOUNT_EMAIL \ 59 | --member="principalSet://iam.googleapis.com/projects/$PROJECT_NUMBER/locations/$PROJECT_LOCATION/workloadIdentityPools/$PROJECT_WORKLOAD_IDENTITY_POOL/*" \ 60 | --role=roles/iam.workloadIdentityUser 61 | -------------------------------------------------------------------------------- /scripts/server.py: -------------------------------------------------------------------------------- 1 | """Script for running linkage on a server or locally.""" 2 | 3 | import logging 4 | import os 5 | 6 | import google.cloud.logging 7 | import pandas as pd 8 | 9 | from pprl import config 10 | from pprl.matching import cloud, local, perform_matching 11 | 12 | 13 | def load_environment_variables(path: None | str = None) -> tuple[str, str, str, str, str]: 14 | """ 15 | Load the environment and pull out the core pieces. 16 | 17 | Parameters 18 | ---------- 19 | path : str, optional 20 | Path to environment file. If running locally, no need to provide 21 | anything. 22 | 23 | Returns 24 | ------- 25 | operator : str 26 | Name of the workload operator. 27 | party_1 : str 28 | Name of the first party. 
29 | party_2 : str 30 | Name of the second party. 31 | location : str 32 | Location of the workload identity pools and keyrings. 33 | version_1 : str 34 | Version of the key encryption key for the first party. 35 | version_2 : str 36 | Version of the key encryption key for the second party. 37 | """ 38 | 39 | environ = config.load_environment(path) 40 | 41 | operator = environ.get("WORKLOAD_OPERATOR_PROJECT") 42 | party_1 = environ.get("PARTY_1_PROJECT") 43 | party_2 = environ.get("PARTY_2_PROJECT") 44 | location = environ.get("PROJECT_LOCATION", "global") 45 | version_1 = environ.get("PARTY_1_KEY_VERSION", 1) 46 | version_2 = environ.get("PARTY_2_KEY_VERSION", 1) 47 | 48 | return operator, party_1, party_2, location, version_1, version_2 49 | 50 | 51 | def main(): 52 | """Perform the matching process and save the results.""" 53 | 54 | if int(os.getenv("PRODUCTION", 0)) == 1: 55 | logger = google.cloud.logging.Client() 56 | logger.setup_logging() 57 | logging.info("Logging set up.") 58 | 59 | operator, party_1, party_2, location, version_1, version_2 = load_environment_variables( 60 | ".env" 61 | ) 62 | parties = (party_1, party_2) 63 | 64 | logging.info("Downloading embedder...") 65 | embedder = cloud.download_embedder(parties, operator) 66 | 67 | logging.info("Preparing assets...") 68 | data_1, dek_1 = prepare_party_assets(party_1, operator, location, version_1) 69 | data_2, dek_2 = prepare_party_assets(party_2, operator, location, version_2) 70 | 71 | logging.info("Performing matching...") 72 | outputs = perform_matching(data_1, data_2, embedder) 73 | 74 | logging.info("Uploading results...") 75 | for party, output, dek in zip(parties, outputs, (dek_1, dek_2)): 76 | logging.info(f"Uploading results for {party}...") 77 | cloud.upload_party_results(output, dek, party, operator) 78 | 79 | else: 80 | logging.basicConfig(encoding="utf-8", level=logging.INFO) 81 | 82 | logging.info("Setting up environment and file paths...") 83 | operator, party_1, party_2, *_ = load_environment_variables() 84 | inpath_1, outpath_1 = build_local_file_paths(party_1) 85 | inpath_2, outpath_2 = build_local_file_paths(party_2) 86 | 87 | logging.info("Loading files...") 88 | embedder = load_embedder() 89 | data_1 = pd.read_json(inpath_1) 90 | data_2 = pd.read_json(inpath_2) 91 | 92 | logging.info("Performing matching...") 93 | output_1, output_2 = perform_matching(data_1, data_2, embedder) 94 | 95 | logging.info("Saving results...") 96 | output_1.to_json(outpath_1) 97 | output_2.to_json(outpath_2) 98 | 99 | logging.info("Done!") 100 | 101 | 102 | if __name__ == "__main__": 103 | main() 104 | -------------------------------------------------------------------------------- /src/pprl/embedder/bloom_filters.py: -------------------------------------------------------------------------------- 1 | """Module for the Bloom filter encoder.""" 2 | 3 | import hashlib 4 | 5 | 6 | class BloomFilterEncoder: 7 | """Encoder of tokens and features via hashing and a Bloom filter. 8 | 9 | The process for creating a cryptographically secure Bloom filter 10 | encoding of a set of tokens is as follows: 11 | 12 | 1. Compute the hash digest for your tokens 13 | 2. Convert the digest bytes into integers 14 | 3. Map the integer to a bloom filter vector (modulo the length of the vector) 15 | 16 | Parameters 17 | ---------- 18 | size: int 19 | Size of the Bloom filter. Defaults to 1024 20 | num_hashes: int 21 | Number of hashes to perform. Defaults to two. 22 | offset: int 23 | Offset for Bloom filter indices to allow for masking. 
Defaults 24 | to zero. 25 | salt: str, optional 26 | Cryptographic salt appended to tokens prior to hashing. 27 | 28 | Attributes 29 | ---------- 30 | hash_function: func 31 | Hashing function (`hashlib.sha256`). 32 | """ 33 | 34 | def __init__( 35 | self, size: int = 1024, num_hashes: int = 2, offset: int = 0, salt: str | None = None 36 | ) -> None: 37 | self.size = size 38 | self.num_hashes = num_hashes 39 | self.offset = offset 40 | self.salt = salt or "" 41 | 42 | self.hash_function = hashlib.sha256 43 | 44 | def bloom_filter_vector_collision_fraction( 45 | self, feature: list[str] 46 | ) -> tuple[list[int], float]: 47 | """Convert a feature vector and return its collision fraction. 48 | 49 | The index vector uses an optional offset for masking. 50 | 51 | Parameters 52 | ---------- 53 | feature: list 54 | List of features to be processed. 55 | 56 | Returns 57 | ------- 58 | vector_idxs: list 59 | Index values used to create the Bloom filter vector. 60 | collision_fraction: float 61 | Proportion of repeated indices. 62 | 63 | Examples 64 | -------- 65 | >>> bfe = BloomFilterEncoder() 66 | >>> bfe.bloom_filter_vector_collision_fraction(["a","b","c"]) 67 | ([334, 1013, 192, 381, 18, 720], 0.0) 68 | """ 69 | vec_idx: list = [] 70 | 71 | for gram in feature: 72 | for i in range(self.num_hashes): 73 | utf_string_with_salt = (str(gram) + str(i) + str(self.salt)).encode("UTF-8") 74 | digest = self.hash_function(utf_string_with_salt).digest() 75 | digest_as_int = (int.from_bytes(digest, "little") % self.size) + self.offset 76 | vec_idx.append(digest_as_int) 77 | 78 | vec_idx_deduped = [*set(vec_idx)] 79 | collision_fraction = 1 - len(vec_idx_deduped) / len(vec_idx) 80 | 81 | return vec_idx_deduped, collision_fraction 82 | 83 | def bloom_filter_vector(self, feature: list[str]) -> list[int]: 84 | """Convert a feature vector into indices for a Bloom vector. 85 | 86 | The index vector uses an optional offset for masking. 87 | 88 | Parameters 89 | ---------- 90 | feature: list 91 | List of features to be converted. 92 | 93 | Returns 94 | ------- 95 | vector_idxs: list 96 | Index values used to create the Bloom filter vector. 
97 | 98 | Examples 99 | -------- 100 | >>> bfe = BloomFilterEncoder() 101 | >>> bfe.bloom_filter_vector(["a","b","c"]) 102 | [334, 1013, 192, 381, 18, 720] 103 | """ 104 | vec_idx_deduped, _ = self.bloom_filter_vector_collision_fraction(feature) 105 | 106 | return vec_idx_deduped 107 | -------------------------------------------------------------------------------- /test/app/test_utils.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the Flask app utility functions.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from pprl.app import utils 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "input_filename, expected_output", 11 | [ 12 | ("file1.csv", True), 13 | ("D:path/folder1/folder2/myfile.csv", True), 14 | ("file.CsV", True), 15 | ("file.txt", False), 16 | ("D:path/folder1/file.TxT", False), 17 | ("file1csv", False), 18 | ("other.py", False), 19 | (".csv", False), 20 | ], 21 | ) 22 | def test_check_is_csv(input_filename, expected_output): 23 | """Check the CSV checker works as it should.""" 24 | assert utils.check_is_csv(input_filename) is expected_output 25 | 26 | 27 | @pytest.mark.parametrize( 28 | "form, expected_drop_columns, expected_other_columns, expected_colspec", 29 | [ 30 | ( 31 | { 32 | "salt": "my_salt", 33 | "upload": "Upload to GCP", 34 | "download": "Download file locally", 35 | "column4": "drop", 36 | "column5": "Name", 37 | "column6": "keep", 38 | }, 39 | ["column4"], 40 | ["column6"], 41 | {"column5": "name"}, 42 | ), 43 | ({}, [], [], {}), 44 | ], 45 | ) 46 | def test_assign_columns(form, expected_drop_columns, expected_other_columns, expected_colspec): 47 | """Test to make sure the correct columns are assigned correctly.""" 48 | 49 | feature_funcs = { 50 | "Name": "name", 51 | "Date": "dob", 52 | "Sex": "sex", 53 | "Miscellaneous": "misc_features", 54 | "Shingled": "misc_shingled_features", 55 | } 56 | 57 | drop_columns, other_columns, colspec = utils.assign_columns(form, feature_funcs) 58 | assert drop_columns == expected_drop_columns 59 | assert other_columns == expected_other_columns 60 | assert colspec == expected_colspec 61 | 62 | 63 | def test_convert_dataframe_to_bf(): 64 | """Test convert_dataframe_to_bf. 65 | 66 | Tests the following properties: Returns Pandas DataFrame, dataframe length, 67 | column names. 68 | """ 69 | 70 | dataframe_values = dict( 71 | id_column=["1", "2", 3], 72 | name_column=["name1", "name2", "name3"], 73 | dob_column=["01/08/1996", "Mar 2000", 2005], 74 | sex_column=["M", "F", "Other"], 75 | house_number=[6, 1, 7], 76 | postcode=["P12 7UP", "LW12, 6PL", "H12 9I6"], 77 | other_column=[1, 8, 9], 78 | ) 79 | input_dataframe = pd.DataFrame(dataframe_values) 80 | 81 | colspec = dict( 82 | name_column="name", 83 | dob_column="dob", 84 | sex_column="sex", 85 | house_number="misc_features", 86 | postcode="misc_shingled_features", 87 | ) 88 | 89 | other_columns = ["id_column"] 90 | 91 | output_dataframe = utils.convert_dataframe_to_bf( 92 | input_dataframe, colspec, other_columns, salt="my_salt" 93 | ) 94 | 95 | assert isinstance(output_dataframe, pd.DataFrame) 96 | assert len(output_dataframe) == 3 97 | assert set(output_dataframe.columns) == set( 98 | ["id_column", "bf_indices", "bf_norms", "thresholds"] 99 | ) 100 | 101 | 102 | def test_convert_dataframe_to_bf_other_columns_none(): 103 | """Test convert_dataframe_to_bf. 104 | 105 | Tests when the other_columns keyword arguement is set to None. 
106 | """ 107 | 108 | dataframe_values = dict( 109 | id_column=["1", "2", 3], 110 | name_column=["name1", "name2", "name3"], 111 | ) 112 | input_dataframe = pd.DataFrame(dataframe_values) 113 | 114 | colspec = dict( 115 | name_column="name", 116 | ) 117 | 118 | output_dataframe = utils.convert_dataframe_to_bf(input_dataframe, colspec, salt="my_salt") 119 | 120 | assert isinstance(output_dataframe, pd.DataFrame) 121 | assert len(output_dataframe) == 3 122 | assert set(output_dataframe.columns) == set(["bf_indices", "bf_norms", "thresholds"]) 123 | -------------------------------------------------------------------------------- /test/embedder/strategies.py: -------------------------------------------------------------------------------- 1 | """Hypothesis strategies for our embedder subpackage tests.""" 2 | 3 | import re 4 | import string 5 | from datetime import datetime 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from dateutil.relativedelta import relativedelta 10 | from hypothesis import strategies as st 11 | from scipy.linalg import qr 12 | 13 | ALPHABET = string.ascii_letters + string.punctuation 14 | 15 | NAMES = ( 16 | "Fred Hogan O'Malley", 17 | "Angelina Guidone", 18 | "Zbyněk Liška", 19 | "Jolana Pešková", 20 | "Diane Elizabeth Davey-Hurst", 21 | "Vanessa Comencini", 22 | "Benito Montalcini", 23 | "Bettina Nitto", 24 | "Sandro Rubbia", 25 | "Alexandr Čech", 26 | "Adéla Strnadová", 27 | "Manuel Boaga", 28 | "Jamie Philip Smith", 29 | "Jordan Francis", 30 | "Melina Cantimori", 31 | "Maria Giulia Cattaneo", 32 | "Karel Strnad", 33 | "Silvie Čechová", 34 | "Markéta Sedláková", 35 | "Lucy Barrett-O'Reilly", 36 | "Tereza Kat'ya Blažková", 37 | ) 38 | 39 | 40 | @st.composite 41 | def st_mutated_names(draw, names=NAMES, mutagens=",-_+ ."): 42 | """Generate a name and its mutated form.""" 43 | 44 | name = draw(st.sampled_from(names)) 45 | mutated = "".join(draw(st.text(alphabet=" ", max_size=2))) 46 | for char in name: 47 | if char == " ": 48 | mutated += draw(st.text(alphabet=mutagens, min_size=1, max_size=3)) 49 | else: 50 | mutated += char 51 | 52 | return name, mutated 53 | 54 | 55 | @st.composite 56 | def st_tokenized_names(draw, names=NAMES): 57 | """Generate a properly tokenized name.""" 58 | 59 | name = draw(st.sampled_from(names)) 60 | tokens = [f"_{word}_" for word in re.split(r"[\s-]", name)] 61 | 62 | return tokens 63 | 64 | 65 | @st.composite 66 | def st_names_series(draw, names=NAMES): 67 | """Generate a series of names.""" 68 | 69 | names = draw(st.lists(st.sampled_from(names), min_size=1, max_size=100)) 70 | 71 | return pd.Series(names) 72 | 73 | 74 | @st.composite 75 | def st_sexes_series(draw, options=("Male", "Female", "Non-binary", None)): 76 | """Generate a series of sexes.""" 77 | 78 | sexes = draw(st.lists(st.sampled_from(options), min_size=1, max_size=100)) 79 | 80 | return pd.Series(sexes) 81 | 82 | 83 | @st.composite 84 | def st_dobs_and_order_params(draw, years_range=100): 85 | """Generate a series of date strings and their order parameters.""" 86 | 87 | dayfirst = draw(st.booleans()) 88 | yearfirst = not dayfirst 89 | format_ = "%Y-%m-%d" if yearfirst else "%d/%m/%Y" 90 | 91 | max_value = datetime.today().date() 92 | min_value = max_value - relativedelta(years=years_range) 93 | st_dates = st.dates(min_value, max_value).map(lambda date: date.strftime(format_)) 94 | dobs = draw(st.lists(st.one_of((st.just(None), st_dates)), min_size=1, max_size=10)) 95 | 96 | return pd.Series(dobs), dayfirst, yearfirst, format_ 97 | 98 | 99 | @st.composite 100 | def 
st_default_dobs(draw): 101 | """Generate a default DOB list.""" 102 | 103 | date = draw(st.dates()) 104 | 105 | return date.strftime("day<%d>_month<%m>_year<%Y>").split("_") 106 | 107 | 108 | @st.composite 109 | def st_fields_series(draw): 110 | """Generate a series of miscellaneous fields.""" 111 | 112 | options = ( 113 | st.text(alphabet=ALPHABET), 114 | st.integers(0, 10), 115 | st.lists(st.integers(0, 10), min_size=1, max_size=2), 116 | st.just(None), 117 | ) 118 | fields = draw(st.lists(st.one_of(options), min_size=1, max_size=100)) 119 | 120 | return pd.Series(fields) 121 | 122 | 123 | @st.composite 124 | def st_strings_series(draw): 125 | """Generate a series of strings.""" 126 | 127 | options = ( 128 | st.just(None), 129 | st.text(alphabet=ALPHABET, min_size=10, max_size=20), 130 | ) 131 | strings = draw(st.lists(st.one_of(options), min_size=1, max_size=100)) 132 | 133 | return pd.Series(strings) 134 | 135 | 136 | @st.composite 137 | def st_posdef_matrices(draw, bf_size=10): 138 | """Generate a square positive definite matrix.""" 139 | 140 | rseed = draw(st.integers(2**10, 2**14)) 141 | rng = np.random.default_rng(rseed) 142 | H = rng.normal(scale=2, size=(bf_size, bf_size)) 143 | diag_values = rng.exponential(size=bf_size) 144 | Q, _ = qr(H) 145 | 146 | return Q.T @ np.diag(diag_values) @ Q 147 | 148 | 149 | @st.composite 150 | def st_bf_indices(draw, bf_size): 151 | """Generate a list of unique indices.""" 152 | bf_indices = draw(st.lists(st.integers(min_value=0, max_value=bf_size - 1), max_size=bf_size)) 153 | return list(set(bf_indices)) 154 | 155 | 156 | @st.composite 157 | def st_matrix_and_indices(draw): 158 | """Generate a pos-def matrix and indices in the same size.""" 159 | bf_size = draw(st.integers(2, 2**10)) 160 | mat = draw(st_posdef_matrices(bf_size)) 161 | bf_indices = draw(st_bf_indices(bf_size)) 162 | return mat, bf_indices 163 | -------------------------------------------------------------------------------- /test/app/test_file_selector.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the file selector part of the app.""" 2 | 3 | import io 4 | 5 | import pytest 6 | 7 | 8 | @pytest.mark.skip(reason="Test client not working in CI build") 9 | def test_file_selector(client): 10 | """Tests to make sure the upload file page is returned correctly.""" 11 | 12 | response = client.get("/") 13 | assert b"
Upload File" in response.data 14 | 15 | 16 | @pytest.mark.skip(reason="Test client not working in CI build") 17 | def test_upload_file_text(client): 18 | """Check the user is informed if they upload the wrong file type.""" 19 | 20 | response = client.post( 21 | "/upload", 22 | data={"file": (io.BytesIO(b"some_text"), "test.txt")}, 23 | content_type="multipart/form-data", 24 | ) 25 | assert b"Upload a csv file." in response.data 26 | 27 | 28 | @pytest.mark.skip(reason="Test client not working in CI build") 29 | def test_upload_file_csv(client): 30 | """Check the column selector comes up after uploading a CSV.""" 31 | 32 | response = client.post( 33 | "/upload", 34 | data={"file": (io.BytesIO(b"some_text"), "test.csv")}, 35 | content_type="multipart/form-data", 36 | ) 37 | assert b"Choose Salt" in response.data 38 | 39 | 40 | @pytest.mark.skip(reason="Test client not working in CI build") 41 | def test_upload_file_csv_columns(client): 42 | """Check the form format in the column selector page.""" 43 | 44 | response = client.post( 45 | "/upload", 46 | data={"file": (io.BytesIO(b"column1,column2,mycolumn3"), "test.csv")}, 47 | content_type="multipart/form-data", 48 | ) 49 | assert b'' in response.data 50 | assert b'' in response.data 51 | assert b'' in response.data 52 | assert b'