├── MANIFEST.in ├── test ├── embedder │ ├── __init__.py │ ├── test_bloom_filters.py │ ├── strategies.py │ ├── test_embedder.py │ └── test_features.py ├── test_encryption.py ├── app │ ├── conftest.py │ ├── test_utils.py │ └── test_file_selector.py ├── matching │ └── test_perform.py └── test_config.py ├── docs ├── _static │ ├── app-home-screenshot.png │ └── 02-client-screenshot.png ├── assets │ └── pprl_cloud_diagram.png └── tutorials │ ├── index.qmd │ ├── example-febrl.qmd │ ├── run-through.qmd │ ├── example-verknupfung.qmd │ └── in-the-cloud.qmd ├── src └── pprl │ ├── app │ ├── static │ │ └── ons_files │ │ │ └── favicon.ico │ ├── templates │ │ ├── download-results.html │ │ ├── check-results.html │ │ ├── home.html │ │ ├── choose-data.html │ │ ├── process-data.html │ │ └── base.html │ ├── utils.py │ └── __init__.py │ ├── matching │ ├── __init__.py │ ├── local.py │ ├── perform.py │ └── cloud.py │ ├── __init__.py │ ├── embedder │ ├── __init__.py │ ├── bloom_filters.py │ └── features.py │ ├── config.py │ └── encryption.py ├── .env.example ├── scripts ├── 07-tear-down-author.sh ├── 06-tear-down-operator.sh ├── 08-tear-down-party.sh ├── 05-run-workload.sh ├── 03-setup-workload-author.sh ├── 04-authorise-workload.sh ├── 02-setup-workload-operator.sh ├── 01-setup-party-resources.sh ├── server.py └── common.sh ├── Dockerfile ├── .gitignore ├── index.qmd ├── .github ├── ISSUE_TEMPLATE │ ├── feature-idea.md │ └── bug_report.md └── workflows │ ├── ci.yml │ └── docs.yml ├── LICENSE ├── pyproject.toml ├── .pre-commit-config.yaml ├── .secrets.baseline ├── _quarto.yml └── README.md /MANIFEST.in: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/embedder/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the embedder subpackage.""" 2 | 
-------------------------------------------------------------------------------- /docs/_static/app-home-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencecampus/pprl_toolkit/HEAD/docs/_static/app-home-screenshot.png -------------------------------------------------------------------------------- /docs/assets/pprl_cloud_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencecampus/pprl_toolkit/HEAD/docs/assets/pprl_cloud_diagram.png -------------------------------------------------------------------------------- /docs/_static/02-client-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencecampus/pprl_toolkit/HEAD/docs/_static/02-client-screenshot.png -------------------------------------------------------------------------------- /src/pprl/app/static/ons_files/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencecampus/pprl_toolkit/HEAD/src/pprl/app/static/ons_files/favicon.ico -------------------------------------------------------------------------------- /src/pprl/matching/__init__.py: -------------------------------------------------------------------------------- 1 | """Functions for performing the matching locally or in the cloud.""" 2 | 3 | from .perform import perform_matching 4 | 5 | __all__ = ["perform_matching"] 6 | -------------------------------------------------------------------------------- /src/pprl/__init__.py: -------------------------------------------------------------------------------- 1 | """Privacy-preserving record linkage via Bloom filter embeddings.""" 2 | 3 | from .embedder import EmbeddedDataFrame, Embedder 4 | 5 | __all__ = ["EmbeddedDataFrame", "Embedder"] 6 | 
-------------------------------------------------------------------------------- /src/pprl/embedder/__init__.py: -------------------------------------------------------------------------------- 1 | """Tools for generating our Bloom filter embeddings and matchings.""" 2 | 3 | from .embedder import EmbeddedDataFrame, Embedder 4 | 5 | __all__ = ["EmbeddedDataFrame", "Embedder"] 6 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | PARTY_1_PROJECT=pprl-party-1 2 | PARTY_1_KEY_VERSION=1 3 | 4 | PARTY_2_PROJECT=pprl-party-2 5 | PARTY_2_KEY_VERSION=1 6 | 7 | WORKLOAD_AUTHOR_PROJECT=pprl-party-1 8 | WORKLOAD_AUTHOR_PROJECT_REGION=europe-west2 9 | 10 | WORKLOAD_OPERATOR_PROJECT=pprl-party-2 11 | WORKLOAD_OPERATOR_PROJECT_ZONE=europe-west2-c 12 | -------------------------------------------------------------------------------- /scripts/07-tear-down-author.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Tears down all billable resources for the workload author. 4 | 5 | echo "Loading functions and environment variables..." 6 | source common.sh 7 | 8 | set_gcp_project $WORKLOAD_AUTHOR_PROJECT 9 | 10 | delete_artifact_repository $ARTIFACT_REPOSITORY $WORKLOAD_AUTHOR_PROJECT_REGION 11 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM --platform=linux/amd64 python:3.11-slim-bookworm 2 | 3 | ENV PYTHONUNBUFFERED=True 4 | ENV PRODUCTION=1 5 | 6 | COPY pyproject.toml . 7 | ADD src/pprl src/pprl 8 | RUN python -m pip install --upgrade pip 9 | RUN python -m pip install --no-cache-dir . 10 | 11 | COPY .env . 12 | COPY scripts/server.py . 
13 | 14 | CMD [ "python", "server.py" ] 15 | -------------------------------------------------------------------------------- /src/pprl/app/templates/download-results.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block head %} 4 | {% endblock %} 5 | 6 | {% block body %} 7 |
7 | This application is for data-owning parties to process and upload their data 8 | to a Google Cloud Platform (GCP) bucket. Once both parties have uploaded 9 | their data, the operator can run the workload to link the two datasets in a 10 | secure environment. 11 | 12 | Keep this app open and you will be able to download your results at the end. 13 |
14 |16 | To begin, please select which party you are. 17 |
18 |