├── .circleci └── config.yml ├── .dockerignore ├── .gitignore ├── .gitmodules ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── LICENSE ├── Makefile ├── NOTICE ├── README.md ├── bin ├── authenticate ├── cleanup ├── configure-mc ├── configure-spark-conf ├── dataproc ├── generate ├── insert └── process ├── config ├── content.json ├── spark │ ├── log4j.properties │ └── spark-defaults.conf.template ├── telemetry_origin_data_inc.json └── test-small.json ├── deployment ├── testing-v3 │ ├── .gitignore │ ├── LISTING.md │ ├── README.md │ ├── compose │ │ ├── admin │ │ │ ├── .env.template │ │ │ └── docker-compose.yml │ │ ├── server-a │ │ │ ├── .env.template │ │ │ └── docker-compose.yml │ │ └── server-b │ │ │ ├── .env.template │ │ │ └── docker-compose.yml │ ├── content.json │ ├── scripts │ │ ├── cleanup │ │ ├── generate-dotenv │ │ ├── generate-service-account-keys │ │ ├── integrate │ │ └── list-bucket │ └── terraform │ │ ├── .terraform.lock.hcl │ │ ├── main.tf │ │ └── modules │ │ ├── bucket-permissions │ │ └── main.tf │ │ └── bucket │ │ └── main.tf ├── testing-v4-gcloud-self │ ├── .gitignore │ ├── README.md │ ├── compose │ │ ├── ingest │ │ │ ├── .env.template │ │ │ └── docker-compose.yml │ │ ├── server-a │ │ │ ├── .env.template │ │ │ └── docker-compose.yml │ │ └── server-b │ │ │ ├── .env.template │ │ │ ├── bootstrap.sh │ │ │ ├── docker-compose.yml │ │ │ └── minio-config.json │ ├── content.json │ ├── scripts │ │ ├── build │ │ ├── cleanup │ │ ├── copy-minio-configuration │ │ ├── down │ │ ├── generate-dotenv │ │ ├── generate-minio-configuration │ │ ├── generate-service-account-keys │ │ └── integrate │ └── terraform │ │ ├── .terraform.lock.hcl │ │ ├── main.tf │ │ └── modules │ │ ├── bucket-permissions │ │ └── main.tf │ │ └── bucket │ │ └── main.tf └── testing-v4 │ ├── .gitignore │ ├── LISTING.md │ ├── README.md │ ├── compose │ ├── ingest │ │ ├── .env.template │ │ └── docker-compose.yml │ ├── server-a │ │ ├── .env.template │ │ └── docker-compose.yml │ └── server-b │ │ ├── .env.template │ │ └── docker-compose.yml │ ├── content.json │ ├── scripts │ ├── build │ ├── cleanup │ ├── generate-dotenv │ ├── generate-service-account-keys │ ├── integrate │ └── list-bucket │ └── terraform │ ├── .terraform.lock.hcl │ ├── main.tf │ └── modules │ ├── bucket-permissions │ └── main.tf │ └── bucket │ └── main.tf ├── docker-compose.yml ├── docs ├── README.md ├── airflow.md ├── cli-help.md ├── guide.md ├── images │ └── airflow-dag.png └── link │ └── CODE_OF_CONDUCT.md ├── examples ├── README.md ├── asyncio │ ├── README.md │ ├── dag.png │ └── main.py ├── batched-processing │ ├── Dockerfile │ ├── Makefile │ ├── README.md │ ├── docker-compose.yml │ ├── policy │ │ ├── server-a.json │ │ └── server-b.json │ └── scripts │ │ ├── bootstrap.sh │ │ ├── check-aggregates.sh │ │ ├── client.sh │ │ ├── integration.sh │ │ └── server.sh ├── benchmarks │ ├── README.md │ ├── client_encoding_time.png │ ├── encrypted_sizes.png │ ├── main.py │ ├── requirements.in │ └── requirements.txt ├── browser-validation │ ├── Dockerfile │ ├── Makefile │ ├── README.md │ ├── docker-compose.yml │ ├── generate.py │ └── main.py ├── docker-asyncio │ ├── Dockerfile │ ├── Makefile │ ├── README.md │ ├── client.py │ ├── docker-compose.yml │ └── server.py ├── python-wrapper │ ├── README.md │ └── main.py └── swig-wrapper │ ├── README.md │ └── main.py ├── google-cloud-sdk.repo ├── mkdocs.yml ├── notebooks ├── 2020-08-25-benchmarking-exploration.ipynb ├── 2020-08-25-cpu-time-by-n-data.csv ├── 2020-08-25-cpu-time-by-n-rows.csv └── 2020-11-05-benchmarking-results.ipynb ├── 
prio_processor ├── __init__.py ├── origin │ ├── __init__.py │ ├── commands.py │ ├── indexing.py │ ├── origins.py │ └── staging.py ├── prio │ ├── __init__.py │ ├── commands.py │ ├── options.py │ ├── types.py │ └── wrapper.py └── spark │ ├── __init__.py │ ├── commands.py │ └── udf.py ├── requirements-dev.in ├── requirements-dev.txt ├── requirements.txt ├── scripts ├── copy-spark-config ├── create-folder ├── download-mapping ├── print-cli-help ├── test-cli-integration ├── test-cli-integration-dataproc └── test-cli-integration-spark ├── setup.py └── tests ├── __init__.py ├── conftest.py ├── resources ├── cli │ ├── client │ │ └── data.ndjson │ ├── config.json │ ├── server_a │ │ ├── intermediate │ │ │ ├── external │ │ │ │ ├── aggregate │ │ │ │ │ └── data.ndjson │ │ │ │ ├── verify1 │ │ │ │ │ └── data.ndjson │ │ │ │ └── verify2 │ │ │ │ │ └── data.ndjson │ │ │ └── internal │ │ │ │ ├── aggregate │ │ │ │ └── data.ndjson │ │ │ │ ├── verify1 │ │ │ │ └── data.ndjson │ │ │ │ └── verify2 │ │ │ │ └── data.ndjson │ │ ├── processed │ │ │ └── data.ndjson │ │ └── raw │ │ │ └── data.ndjson │ ├── server_a_keys.json │ ├── server_b │ │ ├── intermediate │ │ │ ├── external │ │ │ │ ├── aggregate │ │ │ │ │ └── data.ndjson │ │ │ │ ├── verify1 │ │ │ │ │ └── data.ndjson │ │ │ │ └── verify2 │ │ │ │ │ └── data.ndjson │ │ │ └── internal │ │ │ │ ├── aggregate │ │ │ │ └── data.ndjson │ │ │ │ ├── verify1 │ │ │ │ └── data.ndjson │ │ │ │ └── verify2 │ │ │ │ └── data.ndjson │ │ ├── processed │ │ │ └── data.ndjson │ │ └── raw │ │ │ └── data.ndjson │ ├── server_b_keys.json │ └── shared_seed.json └── fx-69.0a1.json ├── test_origin_indexing.py ├── test_origin_origins.py ├── test_origin_staging.py ├── test_prio_commands.py ├── test_prio_commands_end_to_end.py ├── test_prio_wrapper_client.py ├── test_prio_wrapper_serialize.py ├── test_spark_commands.py └── test_spark_udf.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | # https://github.com/mozilla-services/Dockerflow/blob/master/.circleci/config.yml 6 | # DOCKERHUB_REPO - docker hub repo, format: / 7 | # DOCKER_USER - login info for docker hub 8 | # DOCKER_PASS 9 | 10 | version: 2 11 | jobs: 12 | build: 13 | docker: 14 | - image: docker:stable-git 15 | steps: 16 | - checkout 17 | - setup_remote_docker 18 | - run: 19 | name: Create a version.json 20 | command: | 21 | # create a version.json per https://github.com/mozilla-services/Dockerflow/blob/master/docs/version_object.md 22 | printf '{"commit":"%s","version":"%s","source":"https://github.com/%s/%s","build":"%s"}\n' \ 23 | "$CIRCLE_SHA1" \ 24 | "$CIRCLE_TAG" \ 25 | "$CIRCLE_PROJECT_USERNAME" \ 26 | "$CIRCLE_PROJECT_REPONAME" \ 27 | "$CIRCLE_BUILD_URL" > version.json 28 | - run: 29 | name: Build development image 30 | command: | 31 | docker build -t prio:latest . 
32 | - run: 33 | name: Save image into cache 34 | command: | 35 | docker save -o /tmp/latest.tar "prio:latest" 36 | - save_cache: 37 | key: v1-{{ .Branch }}-{{ epoch }} 38 | paths: 39 | - /tmp/latest.tar 40 | 41 | test: 42 | docker: 43 | - image: docker:stable-git 44 | steps: 45 | - setup_remote_docker 46 | - restore_cache: 47 | key: v1-{{ .Branch }} 48 | - run: 49 | name: Restore cache 50 | command: | 51 | docker load -i /tmp/latest.tar 52 | - run: 53 | name: Run the default tests 54 | command: docker run prio:latest 55 | 56 | test-batch-example: 57 | machine: true 58 | working_directory: ~/prio-processor/examples/batched-processing 59 | steps: 60 | - checkout: 61 | path: ~/prio-processor 62 | - restore_cache: 63 | key: v1-{{.Branch}} 64 | - run: 65 | name: Restore Docker image cache 66 | command: docker load -i /tmp/latest.tar 67 | - run: 68 | name: Build the compose container 69 | command: | 70 | # examples expect a prio:dev image 71 | docker tag prio:latest prio:dev 72 | docker-compose build 73 | - run: 74 | name: Test batched-processing integration with MinIO 75 | command: make test 76 | 77 | test-cli-integration-spark: 78 | docker: 79 | - image: docker:stable-git 80 | steps: 81 | - setup_remote_docker 82 | - restore_cache: 83 | key: v1-{{ .Branch }} 84 | - run: 85 | name: Restore cache 86 | command: | 87 | docker load -i /tmp/latest.tar 88 | - run: 89 | name: Run the default tests 90 | command: docker run prio:latest scripts/test-cli-integration-spark 91 | 92 | deploy: 93 | docker: 94 | - image: docker:stable-git 95 | steps: 96 | - checkout 97 | - setup_remote_docker 98 | - run: 99 | name: Create a version.json 100 | command: | 101 | # create a version.json per https://github.com/mozilla-services/Dockerflow/blob/master/docs/version_object.md 102 | printf '{"commit":"%s","version":"%s","source":"https://github.com/%s/%s","build":"%s"}\n' \ 103 | "$CIRCLE_SHA1" \ 104 | "$CIRCLE_TAG" \ 105 | "$CIRCLE_PROJECT_USERNAME" \ 106 | "$CIRCLE_PROJECT_REPONAME" \ 107 | "$CIRCLE_BUILD_URL" > version.json 108 | - restore_cache: 109 | key: v1-{{.Branch}} 110 | - run: 111 | name: Restore Docker image cache 112 | command: docker load -i /tmp/latest.tar 113 | - run: 114 | name: Rerun sanity checks before deploy 115 | command: docker run prio:latest 116 | - run: 117 | name: Deploy to Dockerhub 118 | command: | 119 | echo $DOCKER_PASS | docker login -u $DOCKER_USER --password-stdin 120 | # deploy main 121 | if [ "${CIRCLE_BRANCH}" == "main" ]; then 122 | docker tag prio:latest ${DOCKERHUB_REPO}:latest 123 | docker push ${DOCKERHUB_REPO}:latest 124 | elif [ ! -z "${CIRCLE_TAG}" ]; then 125 | # deploy a release tag... 
126 | echo "${DOCKERHUB_REPO}:${CIRCLE_TAG}" 127 | docker tag prio:latest "${DOCKERHUB_REPO}:${CIRCLE_TAG}" 128 | docker images 129 | docker push "${DOCKERHUB_REPO}:${CIRCLE_TAG}" 130 | fi 131 | 132 | workflows: 133 | version: 2 134 | build-test-deploy: 135 | jobs: 136 | - build: 137 | filters: 138 | tags: 139 | only: /.*/ 140 | - test: 141 | requires: 142 | - build 143 | filters: 144 | tags: 145 | only: /.*/ 146 | - test-cli-integration-spark: 147 | requires: 148 | - build 149 | filters: 150 | tags: 151 | only: /.*/ 152 | - test-batch-example: 153 | requires: 154 | - build 155 | filters: 156 | tags: 157 | only: /.*/ 158 | - deploy: 159 | requires: 160 | - build 161 | - test 162 | - test-cli-integration-spark 163 | filters: 164 | tags: 165 | only: /.*/ 166 | branches: 167 | only: main 168 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | **/*.pyc 2 | **/*.pyo 3 | **/__pycache__ 4 | *.so 5 | *.egg-info 6 | MANIFEST 7 | 8 | **/examples/*/Pipfile.lock 9 | 10 | .coverage 11 | .pytest_cache 12 | .vscode 13 | .tox 14 | .config/ 15 | .gsutil/ 16 | .ipynb_checkpoints/ 17 | .parallel 18 | .vscode 19 | .env 20 | 21 | data/ 22 | working/ 23 | **/build/ 24 | **/dist/ 25 | **/working/ 26 | **/venv/ 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/*.pyc 2 | **/*.pyo 3 | **/__pycache__ 4 | **/.ipynb_checkpoints 5 | *.so 6 | *.egg-info 7 | MANIFEST 8 | 9 | **/examples/*/Pipfile.lock 10 | 11 | .coverage 12 | .pytest_cache 13 | .vscode 14 | .env 15 | .tox 16 | 17 | build/ 18 | data/ 19 | dist/ 20 | working/ 21 | venv/ 22 | .mc 23 | .ash_history 24 | 25 | .terraform/ 26 | .bash_history 27 | 28 | # we only want to keep the template and generate the actual config at runtime, 29 | # don't accidentally check values that may have sensitive values into 30 | # source-control 31 | config/spark/spark-defaults.conf 32 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/prio-processor/5bbb33ecd7a45480ffd5a47c677bfa60660166fe/.gitmodules -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Community Participation Guidelines 2 | 3 | This repository is governed by Mozilla's code of conduct and etiquette 4 | guidelines. For more details, please read the [Mozilla Community Participation 5 | Guidelines](https://www.mozilla.org/about/governance/policies/participation/). 6 | 7 | ## How to Report 8 | 9 | For more information on how to report violations of the Community Participation 10 | Guidelines, please read our '[How to 11 | Report](https://www.mozilla.org/about/governance/policies/participation/reporting/)' 12 | page. 
13 | 14 | 21 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM centos:7 2 | LABEL maintainer="amiyaguchi@mozilla.com" 3 | 4 | ENV LANG en_US.utf8 5 | 6 | COPY ./google-cloud-sdk.repo /etc/yum.repos.d/ 7 | RUN yum install -y epel-release \ 8 | && yum install -y \ 9 | nss \ 10 | nspr \ 11 | msgpack \ 12 | python36 \ 13 | java-1.8.0-openjdk \ 14 | google-cloud-sdk \ 15 | rsync \ 16 | jq \ 17 | parallel \ 18 | which \ 19 | tree \ 20 | wget \ 21 | && yum clean all \ 22 | && rm -rf /var/cache/yum 23 | 24 | RUN gcloud config set disable_usage_reporting true 25 | 26 | RUN groupadd --gid 10001 app && \ 27 | useradd -g app --uid 10001 --shell /usr/sbin/nologin --create-home \ 28 | --home-dir /app app 29 | 30 | WORKDIR /app 31 | COPY requirements.txt requirements-dev.txt ./ 32 | 33 | ENV PATH="$PATH:~/.local/bin" 34 | RUN python3 -m ensurepip && \ 35 | pip3 install --upgrade pip wheel && \ 36 | pip3 install -r requirements.txt -r requirements-dev.txt 37 | 38 | ENV SPARK_HOME=/usr/local/lib/python3.6/site-packages/pyspark 39 | ENV PYSPARK_PYTHON=python3 40 | 41 | # Install libraries for interacting with cloud storage. We utilize the s3a 42 | # adaptor for cross-cloud compatibility, but use of the gcs connector may be 43 | # more performant when running directly in GCP. 44 | # https://cloud.google.com/dataproc/docs/concepts/connectors/cloud-storage 45 | RUN gsutil cp gs://hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar "${SPARK_HOME}/jars" 46 | RUN wget --directory-prefix $SPARK_HOME/jars/ https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.2.0/hadoop-aws-3.2.0.jar 47 | RUN wget --directory-prefix $SPARK_HOME/jars/ https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.375/aws-java-sdk-bundle-1.11.375.jar 48 | 49 | # Use the MinIO client for cross platform behavior, even with self-hosting 50 | RUN wget --directory-prefix /usr/local/bin https://dl.min.io/client/mc/release/linux-amd64/mc 51 | RUN chmod +x /usr/local/bin/mc 52 | 53 | ADD . /app 54 | 55 | # Symlink the spark config into SPARK_HOME so it can be updated via volume mounts 56 | RUN ln -s /app/config/spark ${SPARK_HOME}/conf 57 | 58 | # build the binary egg for distribution on Spark clusters 59 | RUN python3 setup.py bdist_egg && pip3 install -e . 60 | RUN chown -R app:app /app 61 | 62 | USER app 63 | CMD pytest -v tests && \ 64 | scripts/test-cli-integration && \ 65 | prio --help && \ 66 | prio-processor --help 67 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build clean test 2 | 3 | build: 4 | docker-compose build 5 | 6 | clean: 7 | docker-compose down 8 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Configuration under config/spark is derived from the Apache Spark project 2 | (https://github.com/apache/spark/tree/v3.0.1/conf). 
3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # prio-processor 2 | 3 | [![CircleCI](https://circleci.com/gh/mozilla/prio-processor.svg?style=svg)](https://circleci.com/gh/mozilla/prio-processor) 4 | 5 | Prio is a system for aggregating data in a privacy-preserving way. This 6 | repository includes a command-line tool for batch processing in Prio's 7 | multi-server architecture. 8 | 9 | For more information about Prio, see [this blog 10 | post](https://hacks.mozilla.org/2018/10/testing-privacy-preserving-telemetry-with-prio/). 11 | 12 | ## Docker 13 | 14 | This project contains a pre-configured build and test environment via Docker. 15 | 16 | ```bash 17 | make 18 | 19 | # or run directly through docker-compose 20 | docker-compose build 21 | ``` 22 | 23 | You can mount your working directory and shell into the container for 24 | development work. 25 | 26 | ```bash 27 | docker-compose run -v $PWD:/app prio_processor bash 28 | ``` 29 | 30 | ## Adding new dependencies 31 | 32 | To add new Python dependencies to the container, use `pip-tools` to manage the 33 | `requirements.txt`. 34 | 35 | ```bash 36 | pip install pip-tools 37 | 38 | # generate the installation requirements from setup.py 39 | pip-compile 40 | 41 | # generate dev requirements 42 | pip-compile requirements-dev.in 43 | ``` 44 | 45 | Any new system dependencies should be added to the `Dockerfile` at the root of 46 | the repository. These will be available at runtime. 47 | 48 | ## Deployment Configuration 49 | 50 | See the `deployment` directory for examples of configuration that can be used to 51 | aid deployment. These may also be run as integration tests to determine whether 52 | resources are configured properly. These will typically assume Google Cloud 53 | Platform (GCP) as a resource provider. 54 | 55 | See the [guide](docs/guide.md) for more details. 56 | -------------------------------------------------------------------------------- /bin/authenticate: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Authenticate against Google Cloud services via a service account if credentials exist, 4 | # otherwise log the assumption that the container is running on GCE. 5 | 6 | # ensure the variable is set, even if it's empty 7 | : "${GOOGLE_APPLICATION_CREDENTIALS:=}" 8 | 9 | if [[ -n "${GOOGLE_APPLICATION_CREDENTIALS}" ]]; then 10 | gcloud auth activate-service-account --key-file "${GOOGLE_APPLICATION_CREDENTIALS}" 11 | else 12 | # https://cloud.google.com/kubernetes-engine/docs/tutorials/authenticating-to-cloud-platform 13 | echo "No JSON credentials provided, using default scopes and project" 14 | fi 15 | -------------------------------------------------------------------------------- /bin/cleanup: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This Source Code Form is subject to the terms of the Mozilla Public 3 | # License, v. 2.0. If a copy of the MPL was not distributed with this 4 | # file, You can obtain one at https://mozilla.org/MPL/2.0/. 5 | 6 | set -euo pipefail 7 | set -x 8 | 9 | : "${BUCKET_INTERNAL_INGEST?}" 10 | : "${BUCKET_INTERNAL_PRIVATE?}" 11 | : "${BUCKET_INTERNAL_SHARED?}" 12 | 13 | echo "Running cleanup..." 
14 | 15 | "${BASH_SOURCE%/*}/configure-mc" 16 | mc stat "internal/${BUCKET_INTERNAL_INGEST}" 17 | mc stat "internal/${BUCKET_INTERNAL_PRIVATE}" 18 | mc stat "internal/${BUCKET_INTERNAL_SHARED}" 19 | (mc rm --recursive --force "internal/${BUCKET_INTERNAL_INGEST}" || echo "nothing to delete") 20 | (mc rm --recursive --force "internal/${BUCKET_INTERNAL_PRIVATE}" || echo "nothing to delete") 21 | (mc rm --recursive --force "internal/${BUCKET_INTERNAL_SHARED}" || echo "nothing to delete") 22 | -------------------------------------------------------------------------------- /bin/configure-mc: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Configure the MinIO command-line tool against the environment variables 3 | 4 | set -e 5 | # do *not* print commands here since they utilize sensitive environment variables 6 | set +x 7 | 8 | : "${BUCKET_INTERNAL_ACCESS_KEY?}" 9 | : "${BUCKET_INTERNAL_SECRET_KEY?}" 10 | : "${BUCKET_INTERNAL_ENDPOINT?}" 11 | : "${BUCKET_EXTERNAL_SECRET_KEY?}" 12 | : "${BUCKET_EXTERNAL_ACCESS_KEY?}" 13 | : "${BUCKET_EXTERNAL_ENDPOINT?}" 14 | 15 | mc alias set internal \ 16 | $BUCKET_INTERNAL_ENDPOINT \ 17 | $BUCKET_INTERNAL_ACCESS_KEY \ 18 | $BUCKET_INTERNAL_SECRET_KEY \ 19 | --api S3v4 20 | 21 | mc alias set external \ 22 | $BUCKET_EXTERNAL_ENDPOINT \ 23 | $BUCKET_EXTERNAL_ACCESS_KEY \ 24 | $BUCKET_EXTERNAL_SECRET_KEY \ 25 | --api S3v4 26 | -------------------------------------------------------------------------------- /bin/configure-spark-conf: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Configure the spark defaults for use with the s3a adapter. Other settings may 4 | # be configured here too. 5 | 6 | set -e 7 | set +x 8 | 9 | : "${BUCKET_INTERNAL_ACCESS_KEY?}" 10 | : "${BUCKET_INTERNAL_SECRET_KEY?}" 11 | : "${BUCKET_INTERNAL_ENDPOINT?}" 12 | 13 | # work from the parent directory 14 | cd "$(dirname "$0")/.." 15 | 16 | # note that this directory may be mounted, so we've added this file to the 17 | # .gitignore 18 | output=config/spark/spark-defaults.conf 19 | cp config/spark/spark-defaults.conf.template $output 20 | 21 | # append our configuration 22 | cat << EOF >> $output 23 | 24 | spark.hadoop.fs.s3a.access.key $BUCKET_INTERNAL_ACCESS_KEY 25 | spark.hadoop.fs.s3a.secret.key $BUCKET_INTERNAL_SECRET_KEY 26 | spark.hadoop.fs.s3a.endpoint $BUCKET_INTERNAL_ENDPOINT 27 | EOF 28 | -------------------------------------------------------------------------------- /bin/dataproc: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # A testing script for verifying the spark-bigquery connector with the existing 4 | # mozaggregator code. This requires `gcloud` to be configured to point at a 5 | # sandbox project for reading data from `payload_bytes_decoded`. 
6 | 7 | set -e 8 | 9 | REGION=${REGION:-us-west1} 10 | MACHINE_TYPE=${MACHINE_TYPE:-"n1-standard-4"} 11 | NUM_WORKERS=${NUM_WORKERS:-0} 12 | MODULE="prio_processor" 13 | SUBMODULE=${SUBMODULE:-"spark"} 14 | 15 | function bootstrap() { 16 | local bucket=${1?"bucket must be provided"} 17 | 18 | # create the initialization script and runner 19 | mkdir -p bootstrap 20 | 21 | # create the package artifacts 22 | rm -rf dist build 23 | python3 setup.py bdist_egg 24 | cp dist/${MODULE}*.egg bootstrap/${MODULE}.egg 25 | cp requirements.txt bootstrap/ 26 | tee bootstrap/install-python-requirements.sh >/dev/null </dev/null </dev/null < /dev/null ; then 58 | echo "creating dataset: ${dataset}" 59 | bq mk "${dataset}" 60 | fi 61 | 62 | bq load \ 63 | --source_format=NEWLINE_DELIMITED_JSON \ 64 | --autodetect \ 65 | --replace="${BQ_REPLACE}" \ 66 | "${dataset}.${table}" \ 67 | "${input}" 68 | 69 | bq query "select count(*) from ${dataset}.${table}" 70 | } 71 | 72 | function main() { 73 | data_in=$(mktemp -d -t data-XXX) 74 | data_out=$(mktemp -d -t data-XXX) 75 | 76 | "${BASH_SOURCE%/*}/authenticate" 77 | 78 | prefix=${BUCKET_PREFIX}/${PUBLIC_KEY_HEX_EXTERNAL}/${APP_NAME}/${SUBMISSION_DATE} 79 | gsutil -m cp -r "gs://${BUCKET_INTERNAL_PRIVATE}/${prefix}/processed/publish" "${data_in}" 80 | index "${data_in}" "${data_out}" 81 | 82 | insert "${data_out}"/*.json "${DATASET}" "${TABLE}" 83 | } 84 | 85 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then 86 | main "$@" 87 | fi 88 | -------------------------------------------------------------------------------- /config/content.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "batch_id": "content.blocking_blocked-0", 4 | "n_data": 2046 5 | }, 6 | { 7 | "batch_id": "content.blocking_blocked-1", 8 | "n_data": 441 9 | }, 10 | { 11 | "batch_id": "content.blocking_blocked_TESTONLY-0", 12 | "n_data": 2046 13 | }, 14 | { 15 | "batch_id": "content.blocking_blocked_TESTONLY-1", 16 | "n_data": 441 17 | }, 18 | { 19 | "batch_id": "content.blocking_opener_after_user_interaction_exempt-0", 20 | "n_data": 2046 21 | }, 22 | { 23 | "batch_id": "content.blocking_opener_after_user_interaction_exempt-1", 24 | "n_data": 441 25 | }, 26 | { 27 | "batch_id": "content.blocking_opener_after_user_interaction_exempt_TESTONLY-0", 28 | "n_data": 2046 29 | }, 30 | { 31 | "batch_id": "content.blocking_opener_after_user_interaction_exempt_TESTONLY-1", 32 | "n_data": 441 33 | }, 34 | { 35 | "batch_id": "content.blocking_opener_exempt-0", 36 | "n_data": 2046 37 | }, 38 | { 39 | "batch_id": "content.blocking_opener_exempt-1", 40 | "n_data": 441 41 | }, 42 | { 43 | "batch_id": "content.blocking_opener_exempt_TESTONLY-0", 44 | "n_data": 2046 45 | }, 46 | { 47 | "batch_id": "content.blocking_opener_exempt_TESTONLY-1", 48 | "n_data": 441 49 | }, 50 | { 51 | "batch_id": "content.blocking_storage_access_api_exempt-0", 52 | "n_data": 2046 53 | }, 54 | { 55 | "batch_id": "content.blocking_storage_access_api_exempt-1", 56 | "n_data": 441 57 | }, 58 | { 59 | "batch_id": "content.blocking_storage_access_api_exempt_TESTONLY-0", 60 | "n_data": 2046 61 | }, 62 | { 63 | "batch_id": "content.blocking_storage_access_api_exempt_TESTONLY-1", 64 | "n_data": 441 65 | } 66 | ] 67 | -------------------------------------------------------------------------------- /config/spark/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor 
license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the console, but reduce verbosity 19 | log4j.rootCategory=WARN, console 20 | log4j.appender.console=org.apache.log4j.ConsoleAppender 21 | log4j.appender.console.target=System.err 22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 24 | 25 | # Set the default spark-shell log level to WARN. When running the spark-shell, the 26 | # log level for this class is used to overwrite the root logger's log level, so that 27 | # the user can have different defaults for the shell and regular Spark apps. 28 | log4j.logger.org.apache.spark.repl.Main=WARN 29 | 30 | # Settings to quiet third party logs that are too verbose 31 | log4j.logger.org.sparkproject.jetty=WARN 32 | log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR 33 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 34 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 35 | log4j.logger.org.apache.parquet=ERROR 36 | log4j.logger.parquet=ERROR 37 | 38 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 39 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 40 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR -------------------------------------------------------------------------------- /config/spark/spark-defaults.conf.template: -------------------------------------------------------------------------------- 1 | # Default parameters for spark while using minio. We'll also assume that the 2 | # performance is generally acceptable for GCS too using the same connector. 
# see https://docs.min.io/docs/disaggregated-spark-and-hadoop-hive-with-minio.html 4 | 5 | # Ensure these values are set before running spark 6 | # spark.hadoop.fs.s3a.access.key 7 | # spark.hadoop.fs.s3a.secret.key 8 | # spark.hadoop.fs.s3a.endpoint http://minio:9000 9 | 10 | spark.hadoop.fs.s3a.path.style.access true 11 | spark.hadoop.fs.s3a.block.size 512M 12 | spark.hadoop.fs.s3a.buffer.dir ${hadoop.tmp.dir}/s3a 13 | spark.hadoop.fs.s3a.committer.magic.enabled false 14 | spark.hadoop.fs.s3a.committer.name directory 15 | spark.hadoop.fs.s3a.committer.staging.abort.pending.uploads true 16 | spark.hadoop.fs.s3a.committer.staging.conflict-mode append 17 | spark.hadoop.fs.s3a.committer.staging.tmp.path /tmp/staging 18 | spark.hadoop.fs.s3a.committer.staging.unique-filenames true 19 | # number of threads writing to MinIO 20 | spark.hadoop.fs.s3a.committer.threads 2048 21 | spark.hadoop.fs.s3a.connection.establish.timeout 5000 22 | # maximum number of concurrent connections 23 | spark.hadoop.fs.s3a.connection.maximum 8192 24 | spark.hadoop.fs.s3a.connection.ssl.enabled false 25 | spark.hadoop.fs.s3a.connection.timeout 200000 26 | # number of parallel uploads 27 | spark.hadoop.fs.s3a.fast.upload.active.blocks 2048 28 | # use disk as the buffer for uploads 29 | spark.hadoop.fs.s3a.fast.upload.buffer disk 30 | spark.hadoop.fs.s3a.fast.upload true 31 | # maximum number of parallel tasks 32 | spark.hadoop.fs.s3a.max.total.tasks 2048 33 | # socket buffering hints 34 | spark.hadoop.fs.s3a.socket.recv.buffer 65536 35 | spark.hadoop.fs.s3a.socket.send.buffer 65536 36 | # maximum number of threads for S3A 37 | spark.hadoop.fs.s3a.threads.max 2048 38 | 39 | # add the progress bar to update the console for Airflow timeouts (relevant 40 | # when running with the KubernetesPodOperator in Airflow: 41 | # https://github.com/mozilla/telemetry-airflow/issues/844). 42 | spark.ui.showConsoleProgress true 43 | -------------------------------------------------------------------------------- /config/test-small.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "batch_id": "test-0", 4 | "n_data": 10 5 | }, 6 | { 7 | "batch_id": "test-1", 8 | "n_data": 20 9 | }, 10 | { 11 | "batch_id": "test-2", 12 | "n_data": 3 13 | } 14 | ] 15 | -------------------------------------------------------------------------------- /deployment/testing-v3/.gitignore: -------------------------------------------------------------------------------- 1 | .service-account-keys/ -------------------------------------------------------------------------------- /deployment/testing-v3/README.md: -------------------------------------------------------------------------------- 1 | # Testing configuration for v3 containers 2 | 3 | This directory contains terraform configuration to bring up the relevant resources for 4 | an integration test of the prio-processor v3.x containers. 5 | 6 | To create a new project that uses the same configuration, change the terraform 7 | backend appropriately. Here, the state is placed into a storage bucket that has 8 | been created beforehand. Ensure the project has also been created. Then: 9 | 10 | ```bash 11 | cd terraform 12 | 13 | # if you're choosing a different project or changing any modules 14 | terraform init 15 | 16 | # apply any changes 17 | terraform apply 18 | ``` 19 | 20 | To configure the tests: 21 | 22 | ```bash 23 | # There is a maximum of 10 keys per service account. This script doesn't 24 | # handle key rotations, so disable old keys as necessary. 
25 | scripts/generate-service-account-keys 26 | 27 | # generate new keys (or alternatively copy .env.template files to their .env locations) 28 | scripts/generate-dotenv 29 | ``` 30 | 31 | The above commands only need to be run once. To run the tests: 32 | 33 | ```bash 34 | # run the integration script 35 | scripts/integrate 36 | 37 | # clean up the buckets 38 | scripts/cleanup 39 | ``` 40 | -------------------------------------------------------------------------------- /deployment/testing-v3/compose/admin/.env.template: -------------------------------------------------------------------------------- 1 | # This configuration is generated by scripts/generate-dotenv. Do not check in 2 | # manually edited values into source control. 3 | 4 | APP_NAME=test-app 5 | DATA_CONFIG=/app/config/content.json 6 | ORIGIN_CONFIG=/app/config/telemetry_origin_data_inc.json 7 | BUCKET_PREFIX=test-app/v1 8 | 9 | # relative to the docker-compose file 10 | GOOGLE_APPLICATION_CREDENTIALS="../../.service-account-keys/service-account-admin-private-key.json" 11 | 12 | PUBLIC_KEY_HEX_INTERNAL=1723D28D922FD045A3EBECE5FA9BBD67DF28B60B5666203DA06E0CE296D7DF11 13 | BUCKET_INTERNAL_PRIVATE=gs://a-private-a82843a795cf9ef5 14 | 15 | PUBLIC_KEY_HEX_EXTERNAL=91D65A37411C70A7E86070659EACEEED5C01CF57656AE922BD456AD79EAE9E3B 16 | BUCKET_EXTERNAL_PRIVATE=gs://b-private-a82843a795cf9ef5 17 | 18 | DATASET=telemetry 19 | TABLE=content_blocking 20 | BQ_REPLACE=true 21 | CLOUDSDK_CORE_PROJECT=amiyaguchi-prio-processor-v3 22 | -------------------------------------------------------------------------------- /deployment/testing-v3/compose/admin/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.4" 2 | 3 | services: 4 | app: 5 | image: mozilla/prio-processor:v3.1.1 6 | command: "true" 7 | volumes: 8 | - ${GOOGLE_APPLICATION_CREDENTIALS?"missing credentials"}:/tmp/.credentials 9 | - ../../content.json:/app/config/content.json 10 | environment: 11 | - GOOGLE_APPLICATION_CREDENTIALS=/tmp/.credentials 12 | - APP_NAME 13 | - DATA_CONFIG 14 | - ORIGIN_CONFIG 15 | - PUBLIC_KEY_HEX_INTERNAL 16 | - PUBLIC_KEY_HEX_EXTERNAL 17 | - BUCKET_INTERNAL_PRIVATE 18 | - BUCKET_EXTERNAL_PRIVATE 19 | - BUCKET_PREFIX 20 | - DATASET 21 | - TABLE 22 | - BQ_REPLACE 23 | - CLOUDSDK_CORE_PROJECT 24 | - SUBMISSION_DATE 25 | -------------------------------------------------------------------------------- /deployment/testing-v3/compose/server-a/.env.template: -------------------------------------------------------------------------------- 1 | # This configuration is generated by scripts/generate-dotenv. Do not check in 2 | # manually edited values into source control. 
3 | 4 | APP_NAME=test-app 5 | DATA_CONFIG=/app/config/content.json 6 | BUCKET_PREFIX=test-app/v1 7 | SERVER_ID=A 8 | SHARED_SECRET=FxuW0JdQWtZruGijAsaKCw== 9 | 10 | # relative to the docker-compose file 11 | GOOGLE_APPLICATION_CREDENTIALS="../../.service-account-keys/service-account-a-private-key.json" 12 | 13 | PRIVATE_KEY_HEX=181DC4D11ECF21F08EFA21DE79CF602C89FF6B96AB2A1BBD1EBB5FFF4AC51259 14 | PUBLIC_KEY_HEX_INTERNAL=1723D28D922FD045A3EBECE5FA9BBD67DF28B60B5666203DA06E0CE296D7DF11 15 | BUCKET_INTERNAL_PRIVATE=gs://a-private-a82843a795cf9ef5 16 | BUCKET_INTERNAL_SHARED=gs://a-shared-a82843a795cf9ef5 17 | 18 | PUBLIC_KEY_HEX_EXTERNAL=91D65A37411C70A7E86070659EACEEED5C01CF57656AE922BD456AD79EAE9E3B 19 | BUCKET_EXTERNAL_SHARED=gs://b-shared-a82843a795cf9ef5 20 | -------------------------------------------------------------------------------- /deployment/testing-v3/compose/server-a/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.4" 2 | 3 | services: 4 | app: 5 | image: mozilla/prio-processor:v3.1.1 6 | working_dir: /app 7 | command: bin/process 8 | volumes: 9 | - ${GOOGLE_APPLICATION_CREDENTIALS?"missing credentials"}:/tmp/.credentials 10 | - ../../content.json:/app/config/content.json 11 | environment: 12 | - GOOGLE_APPLICATION_CREDENTIALS=/tmp/.credentials 13 | - APP_NAME 14 | - DATA_CONFIG 15 | - SERVER_ID 16 | - SHARED_SECRET 17 | - PRIVATE_KEY_HEX 18 | - PUBLIC_KEY_HEX_INTERNAL 19 | - PUBLIC_KEY_HEX_EXTERNAL 20 | - BUCKET_INTERNAL_PRIVATE 21 | - BUCKET_INTERNAL_SHARED 22 | - BUCKET_EXTERNAL_SHARED 23 | - BUCKET_PREFIX 24 | - SUBMISSION_DATE 25 | -------------------------------------------------------------------------------- /deployment/testing-v3/compose/server-b/.env.template: -------------------------------------------------------------------------------- 1 | # This configuration is generated by scripts/generate-dotenv. Do not check in 2 | # manually edited values into source control. 
3 | 4 | APP_NAME=test-app 5 | DATA_CONFIG=/app/config/content.json 6 | BUCKET_PREFIX=test-app/v1 7 | SERVER_ID=B 8 | SHARED_SECRET=FxuW0JdQWtZruGijAsaKCw== 9 | 10 | # relative to the docker-compose file 11 | GOOGLE_APPLICATION_CREDENTIALS="../../.service-account-keys/service-account-b-private-key.json" 12 | 13 | PRIVATE_KEY_HEX=6C0213B9319234BC81E166E8592739B0311D86EE1BE8391E2F773F930C1991C3 14 | PUBLIC_KEY_HEX_INTERNAL=91D65A37411C70A7E86070659EACEEED5C01CF57656AE922BD456AD79EAE9E3B 15 | BUCKET_INTERNAL_PRIVATE=gs://b-private-a82843a795cf9ef5 16 | BUCKET_INTERNAL_SHARED=gs://b-shared-a82843a795cf9ef5 17 | 18 | PUBLIC_KEY_HEX_EXTERNAL=1723D28D922FD045A3EBECE5FA9BBD67DF28B60B5666203DA06E0CE296D7DF11 19 | BUCKET_EXTERNAL_SHARED=gs://a-shared-a82843a795cf9ef5 20 | -------------------------------------------------------------------------------- /deployment/testing-v3/compose/server-b/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.4" 2 | 3 | services: 4 | app: 5 | image: mozilla/prio-processor:v3.1.1 6 | working_dir: /app 7 | command: bin/process 8 | volumes: 9 | - ${GOOGLE_APPLICATION_CREDENTIALS?"missing credentials"}:/tmp/.credentials 10 | - ../../content.json:/app/config/content.json 11 | environment: 12 | - GOOGLE_APPLICATION_CREDENTIALS=/tmp/.credentials 13 | - APP_NAME 14 | - DATA_CONFIG 15 | - SERVER_ID 16 | - SHARED_SECRET 17 | - PRIVATE_KEY_HEX 18 | - PUBLIC_KEY_HEX_INTERNAL 19 | - PUBLIC_KEY_HEX_EXTERNAL 20 | - BUCKET_INTERNAL_PRIVATE 21 | - BUCKET_INTERNAL_SHARED 22 | - BUCKET_EXTERNAL_SHARED 23 | - BUCKET_PREFIX 24 | - SUBMISSION_DATE 25 | -------------------------------------------------------------------------------- /deployment/testing-v3/content.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "batch_id": "content.blocking_blocked_TESTONLY-0", 4 | "n_data": 2046 5 | }, 6 | { 7 | "batch_id": "content.blocking_blocked_TESTONLY-1", 8 | "n_data": 441 9 | } 10 | ] 11 | -------------------------------------------------------------------------------- /deployment/testing-v3/scripts/cleanup: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Delegate cleanup of buckets to the appropriate service account 4 | 5 | set -euo pipefail 6 | cd "$(dirname "$0")/.." 
7 | 8 | pushd compose/server-a 9 | docker-compose run --rm app bin/cleanup 10 | popd 11 | 12 | pushd compose/server-b 13 | docker-compose run --rm app bin/cleanup 14 | popd 15 | -------------------------------------------------------------------------------- /deployment/testing-v3/scripts/generate-dotenv: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Generate dotenv files for each of the compose configurations 3 | 4 | set -e 5 | 6 | PROJECT=${PROJECT:-amiyaguchi-prio-processor-v3} 7 | TAG=${TAG:-mozilla/prio-processor:v3.0.0} 8 | 9 | if [[ $(gcloud config get-value project) != "$PROJECT" ]]; then 10 | echo "project is not set correctly; run 'gcloud config set project $PROJECT'" 11 | exit 1 12 | fi 13 | 14 | function get-key { 15 | local json=$1 16 | local key=$2 17 | echo "$json" | jq -r ".$key" 18 | } 19 | 20 | # reuse results from a single gsutil call 21 | _results=$(gsutil ls) 22 | function get-bucket { 23 | local pattern=$1 24 | path=$(echo "$_results" | grep "$pattern") 25 | # strip any trailing slashes 26 | echo ${path%/} 27 | } 28 | 29 | # work from the parent directory 30 | cd "$(dirname "$0")/.." 31 | 32 | keys_a=$(docker run -it "$TAG" prio keygen) 33 | keys_b=$(docker run -it "$TAG" prio keygen) 34 | seed=$(docker run -it "$TAG" prio shared-seed) 35 | 36 | # list out all the variables we might need... 37 | app_name="test-app" 38 | bucket_prefix="$app_name/v1" 39 | data_config="/app/config/content.json" 40 | origin_config="/app/config/telemetry_origin_data_inc.json" 41 | 42 | cat << EOF > compose/admin/.env.template 43 | # This configuration is generated by scripts/generate-dotenv. Do not check in 44 | # manually edited values into source control. 45 | 46 | APP_NAME=$app_name 47 | DATA_CONFIG=$data_config 48 | ORIGIN_CONFIG=$origin_config 49 | BUCKET_PREFIX=$bucket_prefix 50 | 51 | # relative to the docker-compose file 52 | GOOGLE_APPLICATION_CREDENTIALS="../../.service-account-keys/service-account-admin-private-key.json" 53 | 54 | PUBLIC_KEY_HEX_INTERNAL=$(get-key "$keys_a" public_key) 55 | BUCKET_INTERNAL_PRIVATE=$(get-bucket a-private) 56 | 57 | PUBLIC_KEY_HEX_EXTERNAL=$(get-key "$keys_b" public_key) 58 | BUCKET_EXTERNAL_PRIVATE=$(get-bucket b-private) 59 | 60 | DATASET=telemetry 61 | TABLE=content_blocking 62 | BQ_REPLACE=true 63 | CLOUDSDK_CORE_PROJECT=$PROJECT 64 | EOF 65 | cp compose/admin/.env.template compose/admin/.env 66 | 67 | function server-env { 68 | local server_id=$1 69 | local internal_key=$2 70 | local external_key=$3 71 | local other_id; 72 | other_id=$(if [[ $server_id == a ]]; then echo b; else echo a; fi) 73 | cat << EOF > "compose/server-$server_id/.env.template" 74 | # This configuration is generated by scripts/generate-dotenv. Do not check in 75 | # manually edited values into source control. 
76 | 77 | APP_NAME=$app_name 78 | DATA_CONFIG=$data_config 79 | BUCKET_PREFIX=$bucket_prefix 80 | SERVER_ID=$(echo "$server_id" | tr '[:lower:]' '[:upper:]') 81 | SHARED_SECRET=$(get-key "$seed" shared_seed) 82 | 83 | # relative to the docker-compose file 84 | GOOGLE_APPLICATION_CREDENTIALS="../../.service-account-keys/service-account-${server_id}-private-key.json" 85 | 86 | PRIVATE_KEY_HEX=$(get-key "$internal_key" private_key) 87 | PUBLIC_KEY_HEX_INTERNAL=$(get-key "$internal_key" public_key) 88 | BUCKET_INTERNAL_PRIVATE=$(get-bucket "${server_id}-private") 89 | BUCKET_INTERNAL_SHARED=$(get-bucket "${server_id}-shared") 90 | 91 | PUBLIC_KEY_HEX_EXTERNAL=$(get-key "$external_key" public_key) 92 | BUCKET_EXTERNAL_SHARED=$(get-bucket "${other_id}-shared") 93 | EOF 94 | cp "compose/server-$server_id/.env.template" "compose/server-$server_id/.env" 95 | } 96 | 97 | server-env a "$keys_a" "$keys_b" 98 | server-env b "$keys_b" "$keys_a" 99 | 100 | -------------------------------------------------------------------------------- /deployment/testing-v3/scripts/generate-service-account-keys: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | PROJECT=${PROJECT:-amiyaguchi-prio-processor-v3} 6 | 7 | if [[ $(gcloud config get-value project) != "$PROJECT" ]]; then 8 | echo "project is not set correctly; run 'gcloud config set project $PROJECT'" 9 | exit 1 10 | fi 11 | 12 | # work from the parent directory 13 | cd "$(dirname "$0")/.." 14 | output=.service-account-keys 15 | mkdir -p $output 16 | 17 | function create_service_account { 18 | local project=$1 19 | local output=$2 20 | local name=$3 21 | gcloud iam service-accounts keys create "$output/$name-private-key.json" \ 22 | --iam-account "$name@$project.iam.gserviceaccount.com" 23 | 24 | } 25 | 26 | create_service_account "$PROJECT" "$output" service-account-admin 27 | create_service_account "$PROJECT" "$output" service-account-a 28 | create_service_account "$PROJECT" "$output" service-account-b 29 | -------------------------------------------------------------------------------- /deployment/testing-v3/scripts/integrate: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script controls the docker-compose workflow for integration testing. The 4 | # containers are defined in the docker-compose.yml, but are orchestrated through 5 | # this script for verification. 6 | 7 | set -euo pipefail 8 | 9 | cd "$(dirname "$0")/.." 10 | 11 | # Copy data into the appropriate buckets 12 | pushd compose/admin 13 | docker-compose run --rm app bin/generate 14 | popd 15 | 16 | # Start server A 17 | pushd compose/server-a 18 | docker-compose run --rm app bin/process & 19 | server_a_pid=$! 20 | popd 21 | 22 | # offset the start times by a short amount for proper authentication against GCP 23 | sleep 2 24 | 25 | # Start server B 26 | pushd compose/server-b 27 | docker-compose run --rm app bin/process & 28 | server_b_pid=$! 29 | popd 30 | 31 | # Return the exit code of the backgrounded docker-compose container. Since 32 | # `wait` is a blocking function, a failure in server B will not be detected 33 | # until timeout in server A. 
34 | wait $server_a_pid 35 | wait $server_b_pid 36 | 37 | pushd compose/admin 38 | docker-compose run --rm app bin/insert 39 | popd 40 | -------------------------------------------------------------------------------- /deployment/testing-v3/scripts/list-bucket: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | : << EOF 3 | To use this script, run the following command: 4 | 5 | scripts/list-bucket > LISTING.md 6 | EOF 7 | 8 | set -e 9 | 10 | PROJECT=${PROJECT:-amiyaguchi-prio-processor-v3} 11 | 12 | if [[ $(gcloud config get-value project) != "$PROJECT" ]]; then 13 | echo "project is not set correctly; run 'gcloud config set project $PROJECT'" 14 | exit 1 15 | fi 16 | 17 | function sort_recursive_listing { 18 | local bucket=$1 19 | # remove lines that end with /:, empty lines, or the summary line 20 | gsutil ls -lr "$bucket" | grep -v :$ | grep -v ^$ | grep -v ^TOTAL | sort -k2 21 | } 22 | 23 | cat << EOF 24 | # Directory listing 25 | 26 | This listing was generated from \`scripts/list-bucket\`. It is a list of all 27 | objects stored across the two servers. 28 | 29 | ## Server A buckets 30 | 31 | EOF 32 | 33 | buckets=$(gsutil ls | sort) 34 | for bucket in $(echo "$buckets" | grep a- ); do 35 | cat << EOF 36 | ### \`$bucket\` 37 | 38 | \`\`\` 39 | $(sort_recursive_listing "$bucket") 40 | \`\`\` 41 | 42 | EOF 43 | done 44 | 45 | echo "## Server B buckets" 46 | echo "" 47 | 48 | for bucket in $(echo "$buckets" | grep b-); do 49 | cat << EOF 50 | ### \`$bucket\` 51 | 52 | \`\`\` 53 | $(sort_recursive_listing "$bucket") 54 | \`\`\` 55 | 56 | EOF 57 | done 58 | -------------------------------------------------------------------------------- /deployment/testing-v3/terraform/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates. 
3 | 4 | provider "registry.terraform.io/hashicorp/google" { 5 | version = "3.65.0" 6 | hashes = [ 7 | "h1:ZvXCeUYoex3aOLlZYqv08WZ3hcPaf5p/gEa/DeMrkfs=", 8 | "zh:402b8ba03f19558f7d0e2a453a9b82747882fb3519ce686ce26a9afd4593d05e", 9 | "zh:523a306c2906c213b630d1c2f1e48698769bfffe360b68388d935d0bd171c55c", 10 | "zh:76af4170f5a524ff353e60dd68d728c55dcbd9f6c5f60648e28e4f8f9ca8e958", 11 | "zh:7d00a44769d26144f42b413c82272e31ae9b63153532b9a135a8f69a6608b9a6", 12 | "zh:7f5d0ab79d213809726663f7603004c173694602bd22f2419c445d6897729ca2", 13 | "zh:a1c23e3d280a5053bae9102ad55df1315585395f8656ddf83928978c7e6cd307", 14 | "zh:a81d0af5ef58c193197f81dc3059f8b22c7dde0575bb3198a0360aff7f9ca476", 15 | "zh:b5b79fa8f9e49d2d26badfded64a1e460cdb11b152168e578443cf92df679bca", 16 | "zh:ec4f88d1fd8990511b86205709c1a76ac3a444d0088a810c82a4f5db37ca4afe", 17 | "zh:f15390a40dc6e9c5b5285bc2b6a8c54b6030ae9cc04cc4a31ecf9b14145c467b", 18 | "zh:fb1a150464d822aa9182cd46a0b7bc2c279ff9400017b4bb3238256224ab41b6", 19 | ] 20 | } 21 | 22 | provider "registry.terraform.io/hashicorp/random" { 23 | version = "3.1.0" 24 | hashes = [ 25 | "h1:rKYu5ZUbXwrLG1w81k7H3nce/Ys6yAxXhWcbtk36HjY=", 26 | "zh:2bbb3339f0643b5daa07480ef4397bd23a79963cc364cdfbb4e86354cb7725bc", 27 | "zh:3cd456047805bf639fbf2c761b1848880ea703a054f76db51852008b11008626", 28 | "zh:4f251b0eda5bb5e3dc26ea4400dba200018213654b69b4a5f96abee815b4f5ff", 29 | "zh:7011332745ea061e517fe1319bd6c75054a314155cb2c1199a5b01fe1889a7e2", 30 | "zh:738ed82858317ccc246691c8b85995bc125ac3b4143043219bd0437adc56c992", 31 | "zh:7dbe52fac7bb21227acd7529b487511c91f4107db9cc4414f50d04ffc3cab427", 32 | "zh:a3a9251fb15f93e4cfc1789800fc2d7414bbc18944ad4c5c98f466e6477c42bc", 33 | "zh:a543ec1a3a8c20635cf374110bd2f87c07374cf2c50617eee2c669b3ceeeaa9f", 34 | "zh:d9ab41d556a48bd7059f0810cf020500635bfc696c9fc3adab5ea8915c1d886b", 35 | "zh:d9e13427a7d011dbd654e591b0337e6074eef8c3b9bb11b2e39eaaf257044fd7", 36 | "zh:f7605bd1437752114baf601bdf6931debe6dc6bfe3006eb7e9bb9080931dca8a", 37 | ] 38 | } 39 | -------------------------------------------------------------------------------- /deployment/testing-v3/terraform/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | // When forking this configuration, set the configuration appropriately. A 3 | // remote backend is a good choice since it can be shared across a team. 
4 | backend "gcs" { 5 | bucket = "amiyaguchi-prio-processor-v3" 6 | prefix = "tf-state" 7 | } 8 | } 9 | 10 | variable "project" { 11 | type = string 12 | default = "amiyaguchi-prio-processor-v3" 13 | } 14 | 15 | variable "region" { 16 | type = string 17 | default = "us-central-1" 18 | } 19 | 20 | provider "google" { 21 | project = var.project 22 | region = var.region 23 | } 24 | 25 | // Choose a different bucket name if the project changes 26 | resource "random_id" "project" { 27 | keepers = { 28 | project = var.project 29 | } 30 | byte_length = 8 31 | } 32 | 33 | module "bucket-a" { 34 | source = "./modules/bucket" 35 | server_id = "a" 36 | suffix = random_id.project.hex 37 | } 38 | 39 | module "bucket-b" { 40 | source = "./modules/bucket" 41 | server_id = "b" 42 | suffix = random_id.project.hex 43 | } 44 | 45 | // Create the service accounts for the tests 46 | resource "google_service_account" "admin" { 47 | account_id = "service-account-admin" 48 | display_name = "Service account for the administrator" 49 | } 50 | 51 | resource "google_service_account" "a" { 52 | account_id = "service-account-a" 53 | display_name = "Service account for server A" 54 | } 55 | 56 | resource "google_service_account" "b" { 57 | account_id = "service-account-b" 58 | display_name = "Service account for server B" 59 | } 60 | 61 | // Assign service account permissions to each bucket. There are quite a few rules, 62 | // so we break this out into a module. 63 | 64 | module "bucket-permissions-a" { 65 | source = "./modules/bucket-permissions" 66 | bucket_private = module.bucket-a.private 67 | bucket_shared = module.bucket-a.shared 68 | service_account_internal = google_service_account.a.email 69 | service_account_external = google_service_account.b.email 70 | service_account_admin = google_service_account.admin.email 71 | } 72 | 73 | module "bucket-permissions-b" { 74 | source = "./modules/bucket-permissions" 75 | bucket_private = module.bucket-b.private 76 | bucket_shared = module.bucket-b.shared 77 | service_account_internal = google_service_account.b.email 78 | service_account_external = google_service_account.a.email 79 | service_account_admin = google_service_account.admin.email 80 | 81 | } 82 | 83 | // testing whether origin telemetry inserts into BigQuery correctly 84 | resource "google_project_service" "bigquery" { 85 | service = "bigquery.googleapis.com" 86 | } 87 | 88 | resource "google_bigquery_dataset" "telemetry" { 89 | dataset_id = "telemetry" 90 | location = "US" 91 | } 92 | 93 | // Grant access to the admin service account 94 | resource "google_project_iam_member" "bigquery-admin" { 95 | role = "roles/bigquery.admin" 96 | member = "serviceAccount:${google_service_account.admin.email}" 97 | } 98 | -------------------------------------------------------------------------------- /deployment/testing-v3/terraform/modules/bucket-permissions/main.tf: -------------------------------------------------------------------------------- 1 | variable "bucket_private" { 2 | type = string 3 | description = "The private bucket for the current processor" 4 | } 5 | 6 | variable "bucket_shared" { 7 | type = string 8 | description = "The shared bucket for both processors" 9 | } 10 | 11 | variable "service_account_internal" { 12 | type = string 13 | description = "The service account for the current processor" 14 | } 15 | 16 | variable "service_account_external" { 17 | type = string 18 | description = "The service account for the co-processor" 19 | } 20 | 21 | variable "service_account_admin" { 22 | type = string 23 | 
description = "The service account for the admin" 24 | } 25 | 26 | // The admin account needs to be able to write to the internal bucket. See 27 | // issue #102 for possible simplification that doesn't require editor access. 28 | resource "google_storage_bucket_iam_binding" "private" { 29 | bucket = var.bucket_private 30 | role = "roles/storage.objectAdmin" 31 | members = [ 32 | "serviceAccount:${var.service_account_internal}", 33 | "serviceAccount:${var.service_account_admin}" 34 | ] 35 | } 36 | 37 | resource "google_storage_bucket_iam_binding" "shared" { 38 | bucket = var.bucket_shared 39 | role = "roles/storage.objectAdmin" 40 | members = [ 41 | "serviceAccount:${var.service_account_internal}", 42 | "serviceAccount:${var.service_account_external}" 43 | ] 44 | } 45 | 46 | -------------------------------------------------------------------------------- /deployment/testing-v3/terraform/modules/bucket/main.tf: -------------------------------------------------------------------------------- 1 | variable "server_id" { 2 | type = string 3 | description = "The identifier for the server" 4 | } 5 | 6 | variable "suffix" { 7 | type = string 8 | description = "A shared suffix used for the bucket" 9 | } 10 | 11 | // Create all of the storage resources necessary for the tests. We choose to 12 | // delete files older than 7 days since these are testing resources. 13 | resource "google_storage_bucket" "private" { 14 | name = "${var.server_id}-private-${var.suffix}" 15 | uniform_bucket_level_access = true 16 | lifecycle_rule { 17 | condition { 18 | age = 7 19 | } 20 | action { 21 | type = "Delete" 22 | } 23 | } 24 | } 25 | 26 | resource "google_storage_bucket" "shared" { 27 | name = "${var.server_id}-shared-${var.suffix}" 28 | uniform_bucket_level_access = true 29 | lifecycle_rule { 30 | condition { 31 | age = 7 32 | } 33 | action { 34 | type = "Delete" 35 | } 36 | } 37 | } 38 | 39 | output "private" { 40 | value = google_storage_bucket.private.name 41 | } 42 | 43 | output "shared" { 44 | value = google_storage_bucket.shared.name 45 | } 46 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/.gitignore: -------------------------------------------------------------------------------- 1 | .service-account-keys/ 2 | .secrets/ 3 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/compose/ingest/.env.template: -------------------------------------------------------------------------------- 1 | # This configuration is generated by scripts/generate-dotenv. Do not check in 2 | # manually edited values into source control. 3 | COMPOSE_PROJECT_NAME="testing-v4-gcloud-self-ingest" 4 | 5 | APP_NAME=test-app 6 | DATA_CONFIG=/app/config/content.json 7 | ORIGIN_CONFIG=/app/config/telemetry_origin_data_inc.json 8 | BUCKET_PREFIX=test-app/v1 9 | 10 | # relative to the docker-compose file 11 | GOOGLE_APPLICATION_CREDENTIALS="../../.secrets/service-account-ingest-private-key.json" 12 | 13 | PUBLIC_KEY_HEX_INTERNAL=A79C2D3DFE6AB347640DB2393A6EA4AF6A80294E2DF09EA2311E7B609D1AAC7C 14 | BUCKET_INTERNAL_INGEST=a-ingest-411db3b9503395de 15 | # The keys for the internal gateway don't particularly matter since it generally 16 | # shouldn't be accessible over the public internet. 
17 | BUCKET_INTERNAL_ACCESS_KEY=ingest-access-key 18 | BUCKET_INTERNAL_SECRET_KEY=36e6be74ac2513770134ddd9f020b3f99b2c6d6151b4a61c6cd1cdcd51cad726 19 | BUCKET_INTERNAL_ENDPOINT=http://gcs-gateway-ingest:9000 20 | 21 | PUBLIC_KEY_HEX_EXTERNAL=BFAB37C8E174142F5B06B32FCC7212D63326889E06FBCC687622E9CE77A76937 22 | BUCKET_EXTERNAL_INGEST=b-ingest-032fc7c2cca96ddb 23 | BUCKET_EXTERNAL_ACCESS_KEY=ingest-032fc7c2cca96ddb 24 | BUCKET_EXTERNAL_SECRET_KEY=78aa4bb15f7571492472112dacb39a9760c5f4e46aba146813c6b5722478d81f 25 | BUCKET_EXTERNAL_ENDPOINT=http://minio-b:9000 26 | 27 | # The ingest also gets access to the private internal bucket, because ingest and 28 | # server A are operated by the same entity in the origin telemetry setup 29 | BUCKET_INTERNAL_PRIVATE=a-private-411db3b9503395de 30 | DATASET=telemetry 31 | TABLE=content_blocking 32 | BQ_REPLACE=true 33 | CLOUDSDK_CORE_PROJECT=amiyaguchi-prio-processor-v4-1 34 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/compose/ingest/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | 3 | networks: 4 | testing-v4-gcloud-self-b_default: 5 | external: true 6 | 7 | services: 8 | # https://docs.min.io/docs/minio-gateway-for-gcs.html 9 | gcs-gateway-ingest: 10 | image: minio/minio:RELEASE.2021-06-17T00-10-46Z 11 | command: gateway gcs ${CLOUDSDK_CORE_PROJECT} 12 | volumes: 13 | - ${GOOGLE_APPLICATION_CREDENTIALS?"missing credentials"}:/tmp/.credentials 14 | ports: 15 | - 9001:9000 16 | environment: 17 | - GOOGLE_APPLICATION_CREDENTIALS=/tmp/.credentials 18 | - MINIO_ROOT_USER=$BUCKET_INTERNAL_ACCESS_KEY 19 | - MINIO_ROOT_PASSWORD=$BUCKET_INTERNAL_SECRET_KEY 20 | 21 | app: 22 | build: ../../../.. 23 | command: "true" 24 | volumes: 25 | - ${GOOGLE_APPLICATION_CREDENTIALS?"missing credentials"}:/tmp/.credentials 26 | - ../../content.json:/app/config/content.json 27 | - ../../../../bin:/app/bin 28 | networks: 29 | - default 30 | - testing-v4-gcloud-self-b_default 31 | depends_on: 32 | - gcs-gateway-ingest 33 | environment: 34 | - GOOGLE_APPLICATION_CREDENTIALS=/tmp/.credentials 35 | - APP_NAME 36 | - DATA_CONFIG 37 | - ORIGIN_CONFIG 38 | - PUBLIC_KEY_HEX_INTERNAL 39 | - PUBLIC_KEY_HEX_EXTERNAL 40 | - BUCKET_INTERNAL_ACCESS_KEY 41 | - BUCKET_INTERNAL_SECRET_KEY 42 | - BUCKET_INTERNAL_ENDPOINT 43 | - BUCKET_EXTERNAL_ACCESS_KEY 44 | - BUCKET_EXTERNAL_SECRET_KEY 45 | - BUCKET_EXTERNAL_ENDPOINT 46 | - BUCKET_INTERNAL_INGEST 47 | - BUCKET_EXTERNAL_INGEST 48 | - BUCKET_INTERNAL_PRIVATE 49 | - BUCKET_PREFIX 50 | - DATASET 51 | - TABLE 52 | - BQ_REPLACE 53 | - CLOUDSDK_CORE_PROJECT 54 | - SUBMISSION_DATE 55 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/compose/server-a/.env.template: -------------------------------------------------------------------------------- 1 | # This configuration is generated by scripts/generate-dotenv. Do not check in 2 | # manually edited values into source control. 
3 | COMPOSE_PROJECT_NAME="testing-v4-gcloud-self-a" 4 | 5 | APP_NAME=test-app 6 | DATA_CONFIG=/app/config/content.json 7 | BUCKET_PREFIX=test-app/v1 8 | SERVER_ID=A 9 | SHARED_SECRET=xut4T8StPN83xiK2QAj/oQ== 10 | 11 | # Used for the MinIO GCS gateway 12 | GOOGLE_APPLICATION_CREDENTIALS="../../.secrets/service-account-a-private-key.json" 13 | 14 | PRIVATE_KEY_HEX=488C9D8A141332F3F4FD11695E563803A11526C4FB6C464C20922842FEEF2A2C 15 | PUBLIC_KEY_HEX_INTERNAL=A79C2D3DFE6AB347640DB2393A6EA4AF6A80294E2DF09EA2311E7B609D1AAC7C 16 | BUCKET_INTERNAL_INGEST=a-ingest-411db3b9503395de 17 | BUCKET_INTERNAL_PRIVATE=a-private-411db3b9503395de 18 | BUCKET_INTERNAL_SHARED=a-shared-411db3b9503395de 19 | # The keys for the internal gateway don't particularly matter since it generally 20 | # shouldn't be accessible over the public internet. 21 | BUCKET_INTERNAL_ACCESS_KEY=a-access-key 22 | BUCKET_INTERNAL_SECRET_KEY=2d87853b05a86022873277c3311c4b48c6717111bde72c09fc84f3816893ffaf 23 | BUCKET_INTERNAL_ENDPOINT=http://gcs-gateway-a:9000 24 | 25 | PUBLIC_KEY_HEX_EXTERNAL=BFAB37C8E174142F5B06B32FCC7212D63326889E06FBCC687622E9CE77A76937 26 | BUCKET_EXTERNAL_SHARED=b-shared-032fc7c2cca96ddb 27 | BUCKET_EXTERNAL_ACCESS_KEY=a-032fc7c2cca96ddb 28 | BUCKET_EXTERNAL_SECRET_KEY=d3fe697dd6d4acc606d9a637e339ea5f04afeef55db14d8c4ad5321b31ae405f 29 | BUCKET_EXTERNAL_ENDPOINT=http://minio-b:9000 30 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/compose/server-a/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | 3 | # Only used when testing locally. When applied to an external host for server b, 4 | # use the public ip address of the node and remove this network dependency. 5 | networks: 6 | testing-v4-gcloud-self-b_default: 7 | external: true 8 | 9 | services: 10 | # https://docs.min.io/docs/minio-gateway-for-gcs.html 11 | gcs-gateway-a: 12 | image: minio/minio:RELEASE.2021-06-17T00-10-46Z 13 | command: gateway gcs 14 | volumes: 15 | - ${GOOGLE_APPLICATION_CREDENTIALS?"missing credentials"}:/tmp/.credentials 16 | ports: 17 | - 9002:9000 18 | environment: 19 | - GOOGLE_APPLICATION_CREDENTIALS=/tmp/.credentials 20 | - MINIO_ROOT_USER=$BUCKET_INTERNAL_ACCESS_KEY 21 | - MINIO_ROOT_PASSWORD=$BUCKET_INTERNAL_SECRET_KEY 22 | 23 | app: 24 | build: ../../../.. 
25 | working_dir: /app 26 | command: bin/process 27 | volumes: 28 | - ../../content.json:/app/config/content.json 29 | - ../../../../bin:/app/bin 30 | networks: 31 | - default 32 | - testing-v4-gcloud-self-b_default 33 | depends_on: 34 | - gcs-gateway-a 35 | environment: 36 | - APP_NAME 37 | - DATA_CONFIG 38 | - SERVER_ID 39 | - SHARED_SECRET 40 | - PRIVATE_KEY_HEX 41 | - PUBLIC_KEY_HEX_INTERNAL 42 | - PUBLIC_KEY_HEX_EXTERNAL 43 | - BUCKET_INTERNAL_ACCESS_KEY 44 | - BUCKET_INTERNAL_SECRET_KEY 45 | - BUCKET_INTERNAL_ENDPOINT 46 | - BUCKET_EXTERNAL_ACCESS_KEY 47 | - BUCKET_EXTERNAL_SECRET_KEY 48 | - BUCKET_EXTERNAL_ENDPOINT 49 | - BUCKET_INTERNAL_INGEST 50 | - BUCKET_INTERNAL_PRIVATE 51 | - BUCKET_INTERNAL_SHARED 52 | - BUCKET_EXTERNAL_SHARED 53 | - BUCKET_PREFIX 54 | - SUBMISSION_DATE 55 | - RETRY_LIMIT 56 | - RETRY_DELAY 57 | - RETRY_BACKOFF_EXPONENT 58 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/compose/server-b/.env.template: -------------------------------------------------------------------------------- 1 | # This configuration is generated by scripts/generate-dotenv. Do not check in 2 | # manually edited values into source control. 3 | COMPOSE_PROJECT_NAME="testing-v4-gcloud-self-b" 4 | 5 | APP_NAME=test-app 6 | DATA_CONFIG=/app/config/content.json 7 | BUCKET_PREFIX=test-app/v1 8 | SERVER_ID=B 9 | SHARED_SECRET=xut4T8StPN83xiK2QAj/oQ== 10 | 11 | # Used for the MinIO GCS gateway 12 | GOOGLE_APPLICATION_CREDENTIALS="../../.secrets/service-account-b-private-key.json" 13 | 14 | PRIVATE_KEY_HEX=C366087126F212F484E156E49B8A2783C4DEBF58A1B168C1ECD8A7DE0D5EB591 15 | PUBLIC_KEY_HEX_INTERNAL=BFAB37C8E174142F5B06B32FCC7212D63326889E06FBCC687622E9CE77A76937 16 | BUCKET_INTERNAL_INGEST=b-ingest-032fc7c2cca96ddb 17 | BUCKET_INTERNAL_PRIVATE=b-private-032fc7c2cca96ddb 18 | BUCKET_INTERNAL_SHARED=b-shared-032fc7c2cca96ddb 19 | BUCKET_INTERNAL_ACCESS_KEY=b-032fc7c2cca96ddb 20 | BUCKET_INTERNAL_SECRET_KEY=a6b4094711a66498c561a8763b97d79087a989ab173ce84963c2247e7039fd74 21 | BUCKET_INTERNAL_ENDPOINT=http://minio-b:9000 22 | 23 | # access to the external bucket is mediated by the gcs gateway, use the same 24 | # internal keys for gcs-gateway as for access to the normal minio instance. 
25 | PUBLIC_KEY_HEX_EXTERNAL=A79C2D3DFE6AB347640DB2393A6EA4AF6A80294E2DF09EA2311E7B609D1AAC7C 26 | BUCKET_EXTERNAL_SHARED=a-shared-411db3b9503395de 27 | BUCKET_EXTERNAL_ACCESS_KEY=b-032fc7c2cca96ddb 28 | BUCKET_EXTERNAL_SECRET_KEY=a6b4094711a66498c561a8763b97d79087a989ab173ce84963c2247e7039fd74 29 | BUCKET_EXTERNAL_ENDPOINT=http://gcs-gateway-b:9000 30 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/compose/server-b/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euo pipefail 4 | set -x 5 | 6 | TARGET="minio" 7 | 8 | function get_key { 9 | local key=$1 10 | jq -r ".$key" minio-config.json 11 | } 12 | 13 | mc config host add $TARGET \ 14 | $BUCKET_INTERNAL_ENDPOINT \ 15 | $BUCKET_INTERNAL_ACCESS_KEY \ 16 | $BUCKET_INTERNAL_SECRET_KEY 17 | 18 | for type in private shared ingest; do 19 | bucket="$(get_key "buckets.$type")" 20 | mc mb $TARGET/$bucket 21 | done 22 | 23 | # the internal user is the admin, and doesn't need a policy applied 24 | for type in external ingest; do 25 | policy="$(get_key "policy.$type")" 26 | access_key="$(get_key "keys.$type.access_key")" 27 | secret_key="$(get_key "keys.$type.secret_key")" 28 | 29 | # dump policy to tmp directory 30 | policy_dir="/tmp/$type.json" 31 | echo "$policy" > "$policy_dir" 32 | 33 | # mc admin policy add TARGET POLICYNAME POLICYFILE 34 | mc admin policy add $TARGET $type $policy_dir 35 | 36 | # mc admin user add TARGET ACCESSKEY SECRETKEY 37 | mc admin user add $TARGET $access_key $secret_key 38 | 39 | # mc admin policy set TARGET POLICYNAME user=ACCESSKEY 40 | mc admin policy set $TARGET $type user=$access_key 41 | done 42 | 43 | echo "done setting up policies" 44 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/compose/server-b/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | 3 | services: 4 | minio-b: 5 | image: minio/minio:RELEASE.2021-06-17T00-10-46Z 6 | command: server /data 7 | ports: 8 | - 9004:9000 9 | environment: 10 | - MINIO_ACCESS_KEY=$BUCKET_INTERNAL_ACCESS_KEY 11 | - MINIO_SECRET_KEY=$BUCKET_INTERNAL_SECRET_KEY 12 | 13 | # This is run to set up policies on the buckets 14 | minio-bootstrap: 15 | build: ../../../.. 16 | depends_on: 17 | - minio-b 18 | working_dir: /root 19 | command: bash bootstrap.sh 20 | volumes: 21 | - .:/root/ 22 | environment: 23 | - BUCKET_INTERNAL_ACCESS_KEY 24 | - BUCKET_INTERNAL_SECRET_KEY 25 | - BUCKET_INTERNAL_ENDPOINT 26 | 27 | # https://docs.min.io/docs/minio-gateway-for-gcs.html 28 | gcs-gateway-b: 29 | image: minio/minio:RELEASE.2021-06-17T00-10-46Z 30 | command: gateway gcs 31 | volumes: 32 | - ${GOOGLE_APPLICATION_CREDENTIALS?"missing credentials"}:/tmp/.credentials 33 | ports: 34 | - 9003:9000 35 | environment: 36 | - GOOGLE_APPLICATION_CREDENTIALS=/tmp/.credentials 37 | - MINIO_ROOT_USER=$BUCKET_INTERNAL_ACCESS_KEY 38 | - MINIO_ROOT_PASSWORD=$BUCKET_INTERNAL_SECRET_KEY 39 | 40 | app: 41 | build: ../../../.. 
42 | working_dir: /app 43 | command: bin/process 44 | volumes: 45 | - ../../content.json:/app/config/content.json 46 | - ../../../../bin:/app/bin 47 | depends_on: 48 | - gcs-gateway-b 49 | - minio-b 50 | - minio-bootstrap 51 | environment: 52 | - APP_NAME 53 | - DATA_CONFIG 54 | - SERVER_ID 55 | - SHARED_SECRET 56 | - PRIVATE_KEY_HEX 57 | - PUBLIC_KEY_HEX_INTERNAL 58 | - PUBLIC_KEY_HEX_EXTERNAL 59 | - BUCKET_INTERNAL_ACCESS_KEY 60 | - BUCKET_INTERNAL_SECRET_KEY 61 | - BUCKET_INTERNAL_ENDPOINT 62 | - BUCKET_EXTERNAL_ACCESS_KEY 63 | - BUCKET_EXTERNAL_SECRET_KEY 64 | - BUCKET_EXTERNAL_ENDPOINT 65 | - BUCKET_INTERNAL_INGEST 66 | - BUCKET_INTERNAL_PRIVATE 67 | - BUCKET_INTERNAL_SHARED 68 | - BUCKET_EXTERNAL_SHARED 69 | - BUCKET_PREFIX 70 | - SUBMISSION_DATE 71 | - RETRY_LIMIT 72 | - RETRY_DELAY 73 | - RETRY_BACKOFF_EXPONENT 74 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/compose/server-b/minio-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "buckets": { 3 | "private": "b-private-032fc7c2cca96ddb", 4 | "shared": "b-shared-032fc7c2cca96ddb", 5 | "ingest": "b-ingest-032fc7c2cca96ddb" 6 | }, 7 | "policy": { 8 | "internal": { 9 | "Version": "2012-10-17", 10 | "Statement": [ 11 | { 12 | "Action": [ 13 | "s3:*" 14 | ], 15 | "Effect": "Allow", 16 | "Resource": [ 17 | "arn:aws:s3:::b-private-032fc7c2cca96ddb/*", 18 | "arn:aws:s3:::b-shared-032fc7c2cca96ddb/*", 19 | "arn:aws:s3:::b-ingest-032fc7c2cca96ddb/*" 20 | ], 21 | "Sid": "" 22 | } 23 | ] 24 | }, 25 | "external": { 26 | "Version": "2012-10-17", 27 | "Statement": [ 28 | { 29 | "Action": [ 30 | "s3:*" 31 | ], 32 | "Effect": "Allow", 33 | "Resource": [ 34 | "arn:aws:s3:::b-shared-032fc7c2cca96ddb/*" 35 | ], 36 | "Sid": "" 37 | } 38 | ] 39 | }, 40 | "ingest": { 41 | "Version": "2012-10-17", 42 | "Statement": [ 43 | { 44 | "Action": [ 45 | "s3:*" 46 | ], 47 | "Effect": "Allow", 48 | "Resource": [ 49 | "arn:aws:s3:::b-ingest-032fc7c2cca96ddb/*" 50 | ], 51 | "Sid": "" 52 | } 53 | ] 54 | } 55 | }, 56 | "keys": { 57 | "internal": { 58 | "access_key": "b-032fc7c2cca96ddb", 59 | "secret_key": "a6b4094711a66498c561a8763b97d79087a989ab173ce84963c2247e7039fd74" 60 | }, 61 | "external": { 62 | "access_key": "a-032fc7c2cca96ddb", 63 | "secret_key": "d3fe697dd6d4acc606d9a637e339ea5f04afeef55db14d8c4ad5321b31ae405f" 64 | }, 65 | "ingest": { 66 | "access_key": "ingest-032fc7c2cca96ddb", 67 | "secret_key": "78aa4bb15f7571492472112dacb39a9760c5f4e46aba146813c6b5722478d81f" 68 | } 69 | } 70 | } -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/content.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "batch_id": "content.blocking_blocked_TESTONLY-0", 4 | "n_data": 2046 5 | }, 6 | { 7 | "batch_id": "content.blocking_blocked_TESTONLY-1", 8 | "n_data": 441 9 | } 10 | ] 11 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/scripts/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Build each docker-compose service by changing directories 4 | 5 | set -euo pipefail 6 | cd "$(dirname "$0")/.." 
7 | 8 | pushd compose/ingest 9 | docker-compose build 10 | popd 11 | 12 | pushd compose/server-a 13 | docker-compose build 14 | popd 15 | 16 | pushd compose/server-b 17 | docker-compose build 18 | popd 19 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/scripts/cleanup: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run cleanup scripts for each server 4 | 5 | set -euo pipefail 6 | cd "$(dirname "$0")/.." 7 | 8 | pushd compose/server-b 9 | docker-compose run --rm app bin/cleanup 10 | # keep the container around since the network depends on server b 11 | 12 | pushd ../../compose/server-a 13 | docker-compose run --rm app bin/cleanup 14 | docker-compose down 15 | popd 16 | 17 | docker-compose down 18 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/scripts/copy-minio-configuration: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Make a copy of the minio configuration in server b directory 3 | 4 | set -ex 5 | cp .secrets/minio-config.json compose/server-b 6 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/scripts/down: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # shut down docker containers 4 | 5 | set -euo pipefail 6 | cd "$(dirname "$0")/.." 7 | 8 | pushd compose/ingest 9 | docker-compose down 10 | popd 11 | 12 | pushd compose/server-a 13 | docker-compose down 14 | popd 15 | 16 | pushd compose/server-b 17 | docker-compose down 18 | popd 19 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/scripts/generate-minio-configuration: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Generate MinIO configuration for server B.""" 3 | from pathlib import Path 4 | import secrets 5 | import json 6 | 7 | ROOT = Path(__file__).parent.parent 8 | 9 | 10 | def policy(buckets): 11 | return { 12 | "Version": "2012-10-17", 13 | "Statement": [ 14 | { 15 | "Action": ["s3:*"], 16 | "Effect": "Allow", 17 | "Resource": [f"arn:aws:s3:::{bucket}/*" for bucket in buckets], 18 | "Sid": "", 19 | } 20 | ], 21 | } 22 | 23 | 24 | def keypair(name): 25 | return {"access_key": name, "secret_key": secrets.token_hex(32)} 26 | 27 | 28 | # create the relevant buckets 29 | def main(): 30 | salt = secrets.token_hex(8) 31 | server_id = "b" 32 | other_id = "a" 33 | 34 | # generate the buckets 35 | private = f"{server_id}-private-{salt}" 36 | shared = f"{server_id}-shared-{salt}" 37 | ingest = f"{server_id}-ingest-{salt}" 38 | 39 | payload = { 40 | "buckets": {"private": private, "shared": shared, "ingest": ingest}, 41 | "policy": { 42 | "internal": policy([private, shared, ingest]), 43 | "external": policy([shared]), 44 | "ingest": policy([ingest]), 45 | }, 46 | "keys": { 47 | "internal": keypair(f"{server_id}-{salt}"), 48 | "external": keypair(f"{other_id}-{salt}"), 49 | "ingest": keypair(f"ingest-{salt}"), 50 | }, 51 | } 52 | 53 | dotsecrets = ROOT / ".secrets" 54 | dotsecrets.mkdir(parents=True, exist_ok=True) 55 | configfile = dotsecrets / "minio-config.json" 56 | if configfile.exists(): 57 | raise FileExistsError(f"{configfile} already exists!") 58 | configfile.write_text(json.dumps(payload, indent=2)) 59 | 60 | 61 
| if __name__ == "__main__": 62 | main() 63 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/scripts/generate-service-account-keys: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | PROJECT=${PROJECT:-amiyaguchi-prio-processor-v4-1} 6 | 7 | if [[ $(gcloud config get-value project) != "$PROJECT" ]]; then 8 | echo "project is not set correctly; run 'gcloud config set project $PROJECT'" 9 | exit 1 10 | fi 11 | 12 | # work from the parent directory 13 | cd "$(dirname "$0")/.." 14 | output=.secrets 15 | mkdir -p $output 16 | 17 | function create_service_account { 18 | local project=$1 19 | local output=$2 20 | local name=$3 21 | gcloud iam service-accounts keys create "$output/$name-private-key.json" \ 22 | --iam-account "$name@$project.iam.gserviceaccount.com" 23 | } 24 | 25 | create_service_account "$PROJECT" "$output" service-account-ingest 26 | create_service_account "$PROJECT" "$output" service-account-a 27 | create_service_account "$PROJECT" "$output" service-account-b 28 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/scripts/integrate: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script controls the docker-compose workflow for integration testing. The 4 | # containers are defined in the docker-compose.yml, but are orchestrated through 5 | # this script for verification. 6 | 7 | set -euo pipefail 8 | 9 | cd "$(dirname "$0")/.." 10 | 11 | # Start server B with minio server 12 | pushd compose/server-b 13 | docker-compose run --rm app bin/process & 14 | server_b_pid=$! 15 | popd 16 | 17 | # wait for the network to come online 18 | sleep 5 19 | 20 | # Copy data into the appropriate buckets 21 | pushd compose/ingest 22 | docker-compose run --rm app bin/generate 23 | docker-compose down 24 | popd 25 | 26 | # Start server A 27 | pushd compose/server-a 28 | docker-compose run --rm app bin/process & 29 | server_a_pid=$! 30 | popd 31 | 32 | # Return the exit code of the backgrounded docker-compose container. Since 33 | # `wait` is a blocking function, a failure in server B will not be detected 34 | # until timeout in server A. 35 | wait $server_a_pid 36 | wait $server_b_pid 37 | 38 | # clean up the containers 39 | pushd compose/server-a 40 | docker-compose down 41 | popd 42 | 43 | pushd compose/ingest 44 | docker-compose run --rm app bin/insert 45 | docker-compose down 46 | popd 47 | 48 | pushd compose/server-b 49 | docker-compose down 50 | popd 51 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/terraform/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates. 
3 | 4 | provider "registry.terraform.io/hashicorp/google" { 5 | version = "3.65.0" 6 | hashes = [ 7 | "h1:ZvXCeUYoex3aOLlZYqv08WZ3hcPaf5p/gEa/DeMrkfs=", 8 | "zh:402b8ba03f19558f7d0e2a453a9b82747882fb3519ce686ce26a9afd4593d05e", 9 | "zh:523a306c2906c213b630d1c2f1e48698769bfffe360b68388d935d0bd171c55c", 10 | "zh:76af4170f5a524ff353e60dd68d728c55dcbd9f6c5f60648e28e4f8f9ca8e958", 11 | "zh:7d00a44769d26144f42b413c82272e31ae9b63153532b9a135a8f69a6608b9a6", 12 | "zh:7f5d0ab79d213809726663f7603004c173694602bd22f2419c445d6897729ca2", 13 | "zh:a1c23e3d280a5053bae9102ad55df1315585395f8656ddf83928978c7e6cd307", 14 | "zh:a81d0af5ef58c193197f81dc3059f8b22c7dde0575bb3198a0360aff7f9ca476", 15 | "zh:b5b79fa8f9e49d2d26badfded64a1e460cdb11b152168e578443cf92df679bca", 16 | "zh:ec4f88d1fd8990511b86205709c1a76ac3a444d0088a810c82a4f5db37ca4afe", 17 | "zh:f15390a40dc6e9c5b5285bc2b6a8c54b6030ae9cc04cc4a31ecf9b14145c467b", 18 | "zh:fb1a150464d822aa9182cd46a0b7bc2c279ff9400017b4bb3238256224ab41b6", 19 | ] 20 | } 21 | 22 | provider "registry.terraform.io/hashicorp/random" { 23 | version = "3.1.0" 24 | hashes = [ 25 | "h1:rKYu5ZUbXwrLG1w81k7H3nce/Ys6yAxXhWcbtk36HjY=", 26 | "zh:2bbb3339f0643b5daa07480ef4397bd23a79963cc364cdfbb4e86354cb7725bc", 27 | "zh:3cd456047805bf639fbf2c761b1848880ea703a054f76db51852008b11008626", 28 | "zh:4f251b0eda5bb5e3dc26ea4400dba200018213654b69b4a5f96abee815b4f5ff", 29 | "zh:7011332745ea061e517fe1319bd6c75054a314155cb2c1199a5b01fe1889a7e2", 30 | "zh:738ed82858317ccc246691c8b85995bc125ac3b4143043219bd0437adc56c992", 31 | "zh:7dbe52fac7bb21227acd7529b487511c91f4107db9cc4414f50d04ffc3cab427", 32 | "zh:a3a9251fb15f93e4cfc1789800fc2d7414bbc18944ad4c5c98f466e6477c42bc", 33 | "zh:a543ec1a3a8c20635cf374110bd2f87c07374cf2c50617eee2c669b3ceeeaa9f", 34 | "zh:d9ab41d556a48bd7059f0810cf020500635bfc696c9fc3adab5ea8915c1d886b", 35 | "zh:d9e13427a7d011dbd654e591b0337e6074eef8c3b9bb11b2e39eaaf257044fd7", 36 | "zh:f7605bd1437752114baf601bdf6931debe6dc6bfe3006eb7e9bb9080931dca8a", 37 | ] 38 | } 39 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/terraform/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | // When forking this configuration, set the configuration appropriately. A 3 | // remote backend is a good choice since it can be shared across a team. 
4 | backend "gcs" { 5 | bucket = "amiyaguchi-prio-processor-v4-1" 6 | prefix = "tf-state" 7 | } 8 | } 9 | 10 | variable "project" { 11 | type = string 12 | default = "amiyaguchi-prio-processor-v4-1" 13 | } 14 | 15 | variable "region" { 16 | type = string 17 | default = "us-central-1" 18 | } 19 | 20 | provider "google" { 21 | project = var.project 22 | region = var.region 23 | } 24 | 25 | // Choose a different bucket name if the project changes 26 | resource "random_id" "project" { 27 | keepers = { 28 | project = var.project 29 | } 30 | byte_length = 8 31 | } 32 | 33 | module "bucket-a" { 34 | source = "./modules/bucket" 35 | server_id = "a" 36 | suffix = random_id.project.hex 37 | } 38 | 39 | // Create the service accounts for the tests 40 | resource "google_service_account" "ingest" { 41 | account_id = "service-account-ingest" 42 | display_name = "Service account for the ingestion service" 43 | } 44 | 45 | resource "google_service_account" "a" { 46 | account_id = "service-account-a" 47 | display_name = "Service account for server A" 48 | } 49 | 50 | resource "google_service_account" "b" { 51 | account_id = "service-account-b" 52 | display_name = "Service account for server B" 53 | } 54 | 55 | // Assign service account permissions to each bucket. There are quite a few rules, 56 | // so we break this out into a module. 57 | module "bucket-permissions-a" { 58 | source = "./modules/bucket-permissions" 59 | bucket_private = module.bucket-a.private 60 | bucket_shared = module.bucket-a.shared 61 | bucket_ingest = module.bucket-a.ingest 62 | service_account_internal = google_service_account.a.email 63 | service_account_external = google_service_account.b.email 64 | service_account_ingest = google_service_account.ingest.email 65 | } 66 | 67 | // testing whether origin telemetry inserts into BigQuery correctly 68 | 69 | // The ingest container will be used for coordination, and gets access to 70 | // server A's private bucket because they are operated by the same entity. 
71 | resource "google_storage_bucket_iam_member" "ingest_internal_private" { 72 | bucket = module.bucket-a.private 73 | role = "roles/storage.objectViewer" 74 | member = "serviceAccount:${google_service_account.ingest.email}" 75 | } 76 | 77 | resource "google_project_service" "bigquery" { 78 | service = "bigquery.googleapis.com" 79 | } 80 | 81 | resource "google_bigquery_dataset" "telemetry" { 82 | dataset_id = "telemetry" 83 | location = "US" 84 | } 85 | 86 | // Grant access to the admin service account 87 | resource "google_project_iam_member" "bigquery-admin" { 88 | role = "roles/bigquery.admin" 89 | member = "serviceAccount:${google_service_account.ingest.email}" 90 | } 91 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/terraform/modules/bucket-permissions/main.tf: -------------------------------------------------------------------------------- 1 | variable "bucket_private" { 2 | type = string 3 | description = "The private bucket for the current processor" 4 | } 5 | 6 | variable "bucket_shared" { 7 | type = string 8 | description = "The shared bucket for both processors" 9 | } 10 | 11 | variable "bucket_ingest" { 12 | type = string 13 | description = "The bucket shared with the ingestion service" 14 | } 15 | 16 | variable "service_account_internal" { 17 | type = string 18 | description = "The service account for the current processor" 19 | } 20 | 21 | variable "service_account_external" { 22 | type = string 23 | description = "The service account for the co-processor" 24 | } 25 | 26 | variable "service_account_ingest" { 27 | type = string 28 | description = "The service account for the ingestor" 29 | } 30 | 31 | resource "google_storage_bucket_iam_binding" "private" { 32 | bucket = var.bucket_private 33 | role = "roles/storage.admin" 34 | members = ["serviceAccount:${var.service_account_internal}"] 35 | } 36 | 37 | resource "google_storage_bucket_iam_binding" "shared" { 38 | bucket = var.bucket_shared 39 | role = "roles/storage.admin" 40 | members = [ 41 | "serviceAccount:${var.service_account_internal}", 42 | "serviceAccount:${var.service_account_external}" 43 | ] 44 | } 45 | 46 | resource "google_storage_bucket_iam_binding" "ingest" { 47 | bucket = var.bucket_ingest 48 | role = "roles/storage.admin" 49 | members = [ 50 | "serviceAccount:${var.service_account_internal}", 51 | "serviceAccount:${var.service_account_ingest}" 52 | ] 53 | } 54 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/terraform/modules/bucket/main.tf: -------------------------------------------------------------------------------- 1 | variable "server_id" { 2 | type = string 3 | description = "The identifier for the server" 4 | } 5 | 6 | variable "suffix" { 7 | type = string 8 | description = "A shared suffix used for the bucket" 9 | } 10 | 11 | // Create all of the storage resources necessary for the tests. We choose to 12 | // delete files older than 7 days since these are testing resources. 
13 | 14 | resource "google_storage_bucket" "ingest" { 15 | name = "${var.server_id}-ingest-${var.suffix}" 16 | uniform_bucket_level_access = true 17 | lifecycle_rule { 18 | condition { 19 | age = 7 20 | } 21 | action { 22 | type = "Delete" 23 | } 24 | } 25 | } 26 | 27 | resource "google_storage_bucket" "private" { 28 | name = "${var.server_id}-private-${var.suffix}" 29 | uniform_bucket_level_access = true 30 | lifecycle_rule { 31 | condition { 32 | age = 7 33 | } 34 | action { 35 | type = "Delete" 36 | } 37 | } 38 | } 39 | 40 | resource "google_storage_bucket" "shared" { 41 | name = "${var.server_id}-shared-${var.suffix}" 42 | uniform_bucket_level_access = true 43 | lifecycle_rule { 44 | condition { 45 | age = 7 46 | } 47 | action { 48 | type = "Delete" 49 | } 50 | } 51 | } 52 | 53 | output "ingest" { 54 | value = google_storage_bucket.ingest.name 55 | } 56 | 57 | output "private" { 58 | value = google_storage_bucket.private.name 59 | } 60 | 61 | output "shared" { 62 | value = google_storage_bucket.shared.name 63 | } 64 | -------------------------------------------------------------------------------- /deployment/testing-v4/.gitignore: -------------------------------------------------------------------------------- 1 | .service-account-keys/ 2 | .secrets/ 3 | -------------------------------------------------------------------------------- /deployment/testing-v4/README.md: -------------------------------------------------------------------------------- 1 | # Testing configuration for v4 containers 2 | 3 | This directory contains terraform configuration to bring relevant resources for 4 | an integration test of the prio-processor v4.x containers. 5 | 6 | To create a new project that uses the same configuration, change the terraform 7 | backend appropriately. Here, the state is placed into a storage bucket that has 8 | been created beforehand. Ensure the project has also been created. Then: 9 | 10 | ```bash 11 | cd terraform 12 | 13 | # if you're choosing a different project or change any modules 14 | terraform init 15 | 16 | # apply any changes 17 | terraform apply 18 | ``` 19 | 20 | To configure the tests: 21 | 22 | ```bash 23 | # There is a maximum of 10 keys per service account. This script doesn't 24 | # handle key rotations, so disable old keys as necessary. 25 | scripts/generate-service-account-keys 26 | 27 | # generate new keys (or alternatively copy .env.template files to their .env locations) 28 | scripts/generate-dotenv 29 | ``` 30 | 31 | The above commands only need to be run once. To run the tests: 32 | 33 | ```bash 34 | # run the integration script 35 | scripts/integrate 36 | 37 | # clean up the buckets 38 | scripts/cleanup 39 | ``` 40 | 41 | In order to be agnostic to the storage provider, MinIO and `mc` are used for 42 | transferring data between the different parties. A GCS gateway is provisioned 43 | for each container that is associated with a service account. Each MinIO 44 | container has an HTTP entrypoint for browsing files that can be found on the 45 | following locations: 46 | 47 | - http://localhost:9001 for the ingestion server 48 | - http://localhost:9002 for server a 49 | - http://localhost:9003 for server b 50 | -------------------------------------------------------------------------------- /deployment/testing-v4/compose/ingest/.env.template: -------------------------------------------------------------------------------- 1 | # This configuration is generated by scripts/generate-dotenv. Do not check in 2 | # manually edited values into source control. 
3 | 4 | APP_NAME=test-app 5 | DATA_CONFIG=/app/config/content.json 6 | ORIGIN_CONFIG=/app/config/telemetry_origin_data_inc.json 7 | BUCKET_PREFIX=test-app/v1 8 | 9 | # relative to the docker-compose file 10 | GOOGLE_APPLICATION_CREDENTIALS="../../.secrets/service-account-ingest-private-key.json" 11 | 12 | PUBLIC_KEY_HEX_INTERNAL=16A6203457348A5B957E02D71CAD726D8C7C7A25C4DBFA73DDF2F0748C893166 13 | BUCKET_INTERNAL_INGEST=a-ingest-d70d758a4b28a791 14 | BUCKET_INTERNAL_ACCESS_KEY=server-ingest-access-key 15 | BUCKET_INTERNAL_SECRET_KEY=server-ingest-secret-key 16 | BUCKET_INTERNAL_ENDPOINT=http://minio:9000 17 | 18 | PUBLIC_KEY_HEX_EXTERNAL=702E7941DE5F024B02F6CB5AE7176413EDC90F008368579021D115A45F95326C 19 | BUCKET_EXTERNAL_INGEST=b-ingest-d70d758a4b28a791 20 | BUCKET_EXTERNAL_ACCESS_KEY=server-ingest-access-key 21 | BUCKET_EXTERNAL_SECRET_KEY=server-ingest-secret-key 22 | BUCKET_EXTERNAL_ENDPOINT=http://minio:9000 23 | 24 | # The ingest also gets access to the private internal bucket, because ingest and 25 | # server A are operated by the same entity in the origin telemetry setup 26 | BUCKET_INTERNAL_PRIVATE=a-private-d70d758a4b28a791 27 | DATASET=telemetry 28 | TABLE=content_blocking 29 | BQ_REPLACE=true 30 | CLOUDSDK_CORE_PROJECT=amiyaguchi-prio-processor-v4 31 | -------------------------------------------------------------------------------- /deployment/testing-v4/compose/ingest/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | 3 | services: 4 | # https://docs.min.io/docs/minio-gateway-for-gcs.html 5 | minio: 6 | image: minio/minio:latest 7 | command: gateway gcs ${CLOUDSDK_CORE_PROJECT} 8 | volumes: 9 | - ${GOOGLE_APPLICATION_CREDENTIALS?"missing credentials"}:/tmp/.credentials 10 | ports: 11 | - 9001:9000 12 | environment: 13 | - GOOGLE_APPLICATION_CREDENTIALS=/tmp/.credentials 14 | - MINIO_ROOT_USER=$BUCKET_INTERNAL_ACCESS_KEY 15 | - MINIO_ROOT_PASSWORD=$BUCKET_INTERNAL_SECRET_KEY 16 | 17 | app: 18 | build: ../../../.. 19 | command: "true" 20 | volumes: 21 | - ${GOOGLE_APPLICATION_CREDENTIALS?"missing credentials"}:/tmp/.credentials 22 | - ../../content.json:/app/config/content.json 23 | - ../../../../bin:/app/bin 24 | depends_on: 25 | - minio 26 | environment: 27 | - GOOGLE_APPLICATION_CREDENTIALS=/tmp/.credentials 28 | - APP_NAME 29 | - DATA_CONFIG 30 | - ORIGIN_CONFIG 31 | - PUBLIC_KEY_HEX_INTERNAL 32 | - PUBLIC_KEY_HEX_EXTERNAL 33 | - BUCKET_INTERNAL_ACCESS_KEY 34 | - BUCKET_INTERNAL_SECRET_KEY 35 | - BUCKET_INTERNAL_ENDPOINT 36 | - BUCKET_EXTERNAL_ACCESS_KEY 37 | - BUCKET_EXTERNAL_SECRET_KEY 38 | - BUCKET_EXTERNAL_ENDPOINT 39 | - BUCKET_INTERNAL_INGEST 40 | - BUCKET_EXTERNAL_INGEST 41 | # for submitting results to bigquery, granted access via GCP primitives 42 | # instead of HMAC keys 43 | - BUCKET_INTERNAL_PRIVATE 44 | - BUCKET_PREFIX 45 | - DATASET 46 | - TABLE 47 | - BQ_REPLACE 48 | - CLOUDSDK_CORE_PROJECT 49 | - SUBMISSION_DATE 50 | -------------------------------------------------------------------------------- /deployment/testing-v4/compose/server-a/.env.template: -------------------------------------------------------------------------------- 1 | # This configuration is generated by scripts/generate-dotenv. Do not check in 2 | # manually edited values into source control. 
3 | 4 | APP_NAME=test-app 5 | DATA_CONFIG=/app/config/content.json 6 | BUCKET_PREFIX=test-app/v1 7 | SERVER_ID=A 8 | SHARED_SECRET=g8EbbygYtecFDnpzkRyPjw== 9 | 10 | # Used for the MinIO GCS gateway 11 | GOOGLE_APPLICATION_CREDENTIALS="../../.secrets/service-account-a-private-key.json" 12 | 13 | PRIVATE_KEY_HEX=3D4967F5FC58272E826F329F6DE930987265EF032136192C5B9B1EB1A6D15337 14 | PUBLIC_KEY_HEX_INTERNAL=16A6203457348A5B957E02D71CAD726D8C7C7A25C4DBFA73DDF2F0748C893166 15 | BUCKET_INTERNAL_INGEST=a-ingest-d70d758a4b28a791 16 | BUCKET_INTERNAL_PRIVATE=a-private-d70d758a4b28a791 17 | BUCKET_INTERNAL_SHARED=a-shared-d70d758a4b28a791 18 | BUCKET_INTERNAL_ACCESS_KEY=server-a-access-key 19 | BUCKET_INTERNAL_SECRET_KEY=server-a-secret-key 20 | BUCKET_INTERNAL_ENDPOINT=http://minio:9000 21 | 22 | PUBLIC_KEY_HEX_EXTERNAL=702E7941DE5F024B02F6CB5AE7176413EDC90F008368579021D115A45F95326C 23 | BUCKET_EXTERNAL_SHARED=b-shared-d70d758a4b28a791 24 | # NOTE: the keys are shared since permissions are configured at the service 25 | # account level in the gateway 26 | BUCKET_EXTERNAL_ACCESS_KEY=server-a-access-key 27 | BUCKET_EXTERNAL_SECRET_KEY=server-a-secret-key 28 | BUCKET_EXTERNAL_ENDPOINT=http://minio:9000 29 | -------------------------------------------------------------------------------- /deployment/testing-v4/compose/server-a/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | 3 | services: 4 | # https://docs.min.io/docs/minio-gateway-for-gcs.html 5 | minio: 6 | image: minio/minio:latest 7 | command: gateway gcs 8 | volumes: 9 | - ${GOOGLE_APPLICATION_CREDENTIALS?"missing credentials"}:/tmp/.credentials 10 | ports: 11 | - 9002:9000 12 | environment: 13 | - GOOGLE_APPLICATION_CREDENTIALS=/tmp/.credentials 14 | - MINIO_ROOT_USER=$BUCKET_INTERNAL_ACCESS_KEY 15 | - MINIO_ROOT_PASSWORD=$BUCKET_INTERNAL_SECRET_KEY 16 | 17 | app: 18 | build: ../../../.. 19 | working_dir: /app 20 | command: bin/process 21 | volumes: 22 | - ../../content.json:/app/config/content.json 23 | - ../../../../bin:/app/bin 24 | depends_on: 25 | - minio 26 | environment: 27 | - APP_NAME 28 | - DATA_CONFIG 29 | - SERVER_ID 30 | - SHARED_SECRET 31 | - PRIVATE_KEY_HEX 32 | - PUBLIC_KEY_HEX_INTERNAL 33 | - PUBLIC_KEY_HEX_EXTERNAL 34 | - BUCKET_INTERNAL_ACCESS_KEY 35 | - BUCKET_INTERNAL_SECRET_KEY 36 | - BUCKET_INTERNAL_ENDPOINT 37 | - BUCKET_EXTERNAL_ACCESS_KEY 38 | - BUCKET_EXTERNAL_SECRET_KEY 39 | - BUCKET_EXTERNAL_ENDPOINT 40 | - BUCKET_INTERNAL_INGEST 41 | - BUCKET_INTERNAL_PRIVATE 42 | - BUCKET_INTERNAL_SHARED 43 | - BUCKET_EXTERNAL_SHARED 44 | - BUCKET_PREFIX 45 | - SUBMISSION_DATE 46 | -------------------------------------------------------------------------------- /deployment/testing-v4/compose/server-b/.env.template: -------------------------------------------------------------------------------- 1 | # This configuration is generated by scripts/generate-dotenv. Do not check in 2 | # manually edited values into source control. 
3 | 4 | APP_NAME=test-app 5 | DATA_CONFIG=/app/config/content.json 6 | BUCKET_PREFIX=test-app/v1 7 | SERVER_ID=B 8 | SHARED_SECRET=g8EbbygYtecFDnpzkRyPjw== 9 | 10 | # Used for the MinIO GCS gateway 11 | GOOGLE_APPLICATION_CREDENTIALS="../../.secrets/service-account-b-private-key.json" 12 | 13 | PRIVATE_KEY_HEX=0D400D1055E1C20D93EC90EC5F5BD5DCB08B483B02035C6E16E86BF842D70A7A 14 | PUBLIC_KEY_HEX_INTERNAL=702E7941DE5F024B02F6CB5AE7176413EDC90F008368579021D115A45F95326C 15 | BUCKET_INTERNAL_INGEST=b-ingest-d70d758a4b28a791 16 | BUCKET_INTERNAL_PRIVATE=b-private-d70d758a4b28a791 17 | BUCKET_INTERNAL_SHARED=b-shared-d70d758a4b28a791 18 | BUCKET_INTERNAL_ACCESS_KEY=server-b-access-key 19 | BUCKET_INTERNAL_SECRET_KEY=server-b-secret-key 20 | BUCKET_INTERNAL_ENDPOINT=http://minio:9000 21 | 22 | PUBLIC_KEY_HEX_EXTERNAL=16A6203457348A5B957E02D71CAD726D8C7C7A25C4DBFA73DDF2F0748C893166 23 | BUCKET_EXTERNAL_SHARED=a-shared-d70d758a4b28a791 24 | # NOTE: the keys are shared since permissions are configured at the service 25 | # account level in the gateway 26 | BUCKET_EXTERNAL_ACCESS_KEY=server-b-access-key 27 | BUCKET_EXTERNAL_SECRET_KEY=server-b-secret-key 28 | BUCKET_EXTERNAL_ENDPOINT=http://minio:9000 29 | -------------------------------------------------------------------------------- /deployment/testing-v4/compose/server-b/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | 3 | services: 4 | # https://docs.min.io/docs/minio-gateway-for-gcs.html 5 | minio: 6 | image: minio/minio:latest 7 | command: gateway gcs 8 | volumes: 9 | - ${GOOGLE_APPLICATION_CREDENTIALS?"missing credentials"}:/tmp/.credentials 10 | ports: 11 | - 9003:9000 12 | environment: 13 | - GOOGLE_APPLICATION_CREDENTIALS=/tmp/.credentials 14 | - MINIO_ROOT_USER=$BUCKET_INTERNAL_ACCESS_KEY 15 | - MINIO_ROOT_PASSWORD=$BUCKET_INTERNAL_SECRET_KEY 16 | 17 | app: 18 | build: ../../../.. 19 | working_dir: /app 20 | command: bin/process 21 | volumes: 22 | - ../../content.json:/app/config/content.json 23 | - ../../../../bin:/app/bin 24 | depends_on: 25 | - minio 26 | environment: 27 | - APP_NAME 28 | - DATA_CONFIG 29 | - SERVER_ID 30 | - SHARED_SECRET 31 | - PRIVATE_KEY_HEX 32 | - PUBLIC_KEY_HEX_INTERNAL 33 | - PUBLIC_KEY_HEX_EXTERNAL 34 | - BUCKET_INTERNAL_ACCESS_KEY 35 | - BUCKET_INTERNAL_SECRET_KEY 36 | - BUCKET_INTERNAL_ENDPOINT 37 | - BUCKET_EXTERNAL_ACCESS_KEY 38 | - BUCKET_EXTERNAL_SECRET_KEY 39 | - BUCKET_EXTERNAL_ENDPOINT 40 | - BUCKET_INTERNAL_INGEST 41 | - BUCKET_INTERNAL_PRIVATE 42 | - BUCKET_INTERNAL_SHARED 43 | - BUCKET_EXTERNAL_SHARED 44 | - BUCKET_PREFIX 45 | - SUBMISSION_DATE 46 | -------------------------------------------------------------------------------- /deployment/testing-v4/content.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "batch_id": "content.blocking_blocked_TESTONLY-0", 4 | "n_data": 2046 5 | }, 6 | { 7 | "batch_id": "content.blocking_blocked_TESTONLY-1", 8 | "n_data": 441 9 | } 10 | ] 11 | -------------------------------------------------------------------------------- /deployment/testing-v4/scripts/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Delegate docker-compose build to each service 4 | 5 | set -euo pipefail 6 | cd "$(dirname "$0")/.." 
7 | 8 | pushd compose/ingest 9 | docker-compose build 10 | popd 11 | 12 | pushd compose/server-a 13 | docker-compose build 14 | popd 15 | 16 | pushd compose/server-b 17 | docker-compose build 18 | popd 19 | -------------------------------------------------------------------------------- /deployment/testing-v4/scripts/cleanup: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Delegate cleanup of buckets to the appropriate service account 4 | 5 | set -euo pipefail 6 | cd "$(dirname "$0")/.." 7 | 8 | pushd compose/server-a 9 | docker-compose run --rm app bin/cleanup 10 | docker-compose down 11 | popd 12 | 13 | pushd compose/server-b 14 | docker-compose run --rm app bin/cleanup 15 | docker-compose down 16 | popd 17 | -------------------------------------------------------------------------------- /deployment/testing-v4/scripts/generate-dotenv: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Generate dotenv files for each of the compose configurations 3 | 4 | set -e 5 | 6 | PROJECT=${PROJECT:-amiyaguchi-prio-processor-v4} 7 | TAG=${TAG:-mozilla/prio-processor:v3.0.0} 8 | 9 | if [[ $(gcloud config get-value project) != "$PROJECT" ]]; then 10 | echo "project is not set correctly; run 'gcloud config set project $PROJECT'" 11 | exit 1 12 | fi 13 | 14 | function get-key { 15 | local json=$1 16 | local key=$2 17 | echo "$json" | jq -r ".$key" 18 | } 19 | 20 | # reuse results from a single gsutil call 21 | _results=$(gsutil ls) 22 | function get-bucket { 23 | local pattern=$1 24 | path=$(echo "$_results" | grep "$pattern") 25 | # strip any trailing slashes 26 | trim="${path%/}" 27 | # trim gs:// prefix 28 | trim="${trim#gs://}" 29 | echo $trim 30 | } 31 | 32 | function upper { 33 | local text=$1 34 | echo "$text" | tr '[:lower:]' '[:upper:]' 35 | } 36 | 37 | 38 | function minio { 39 | # either a, b, or ingest 40 | local server_id=$1 41 | # either internal or external 42 | local type=$2 43 | # NOTE: use a better keypair than this... also, due to the nature of this 44 | # test, the internal/external keys are the same, using the gateway's service 45 | # account for authorization to the buckets. 46 | cat << EOF 47 | BUCKET_$(upper "$type")_ACCESS_KEY=server-$server_id-access-key 48 | BUCKET_$(upper "$type")_SECRET_KEY=server-$server_id-secret-key 49 | BUCKET_$(upper "$type")_ENDPOINT=http://minio:9000 50 | EOF 51 | } 52 | 53 | # work from the parent directory 54 | cd "$(dirname "$0")/.." 55 | 56 | keys_a=$(docker run -it "$TAG" prio keygen) 57 | keys_b=$(docker run -it "$TAG" prio keygen) 58 | seed=$(docker run -it "$TAG" prio shared-seed) 59 | 60 | # list out all the variables we might need... 61 | app_name="test-app" 62 | bucket_prefix="$app_name/v1" 63 | data_config="/app/config/content.json" 64 | origin_config="/app/config/telemetry_origin_data_inc.json" 65 | 66 | function ingest-env { 67 | local is_template=$1 68 | local output; 69 | output=$(if [[ $is_template == true ]]; then echo .env.template; else echo .env; fi) 70 | cat << EOF > "compose/ingest/$output" 71 | # This configuration is generated by scripts/generate-dotenv. Do not check in 72 | # manually edited values into source control. 
73 | 74 | APP_NAME=$app_name 75 | DATA_CONFIG=$data_config 76 | ORIGIN_CONFIG=$origin_config 77 | BUCKET_PREFIX=$bucket_prefix 78 | 79 | # relative to the docker-compose file 80 | GOOGLE_APPLICATION_CREDENTIALS="../../.secrets/service-account-ingest-private-key.json" 81 | 82 | PUBLIC_KEY_HEX_INTERNAL=$(get-key "$keys_a" public_key) 83 | BUCKET_INTERNAL_INGEST=$(get-bucket a-ingest) 84 | $(minio ingest internal) 85 | 86 | PUBLIC_KEY_HEX_EXTERNAL=$(get-key "$keys_b" public_key) 87 | BUCKET_EXTERNAL_INGEST=$(get-bucket b-ingest) 88 | $(minio ingest external) 89 | 90 | # The ingest also gets access to the private internal bucket, because ingest and 91 | # server A are operated by the same entity in the origin telemetry setup 92 | BUCKET_INTERNAL_PRIVATE=$(get-bucket a-private) 93 | DATASET=telemetry 94 | TABLE=content_blocking 95 | BQ_REPLACE=true 96 | CLOUDSDK_CORE_PROJECT=$PROJECT 97 | EOF 98 | } 99 | 100 | function server-env { 101 | local server_id=$1 102 | local internal_key=$2 103 | local external_key=$3 104 | local is_template=$4 105 | 106 | local other_id; 107 | other_id=$(if [[ $server_id == a ]]; then echo b; else echo a; fi) 108 | local output; 109 | output=$(if [[ $is_template == true ]]; then echo .env.template; else echo .env; fi) 110 | cat << EOF > "compose/server-$server_id/$output" 111 | # This configuration is generated by scripts/generate-dotenv. Do not check in 112 | # manually edited values into source control. 113 | 114 | APP_NAME=$app_name 115 | DATA_CONFIG=$data_config 116 | BUCKET_PREFIX=$bucket_prefix 117 | SERVER_ID=$(upper "$server_id") 118 | SHARED_SECRET=$(get-key "$seed" shared_seed) 119 | 120 | # Used for the MinIO GCS gateway 121 | GOOGLE_APPLICATION_CREDENTIALS="../../.secrets/service-account-$server_id-private-key.json" 122 | 123 | PRIVATE_KEY_HEX=$(get-key "$internal_key" private_key) 124 | PUBLIC_KEY_HEX_INTERNAL=$(get-key "$internal_key" public_key) 125 | BUCKET_INTERNAL_INGEST=$(get-bucket "${server_id}-ingest") 126 | BUCKET_INTERNAL_PRIVATE=$(get-bucket "${server_id}-private") 127 | BUCKET_INTERNAL_SHARED=$(get-bucket "${server_id}-shared") 128 | $(minio "$server_id" internal) 129 | 130 | PUBLIC_KEY_HEX_EXTERNAL=$(get-key "$external_key" public_key) 131 | BUCKET_EXTERNAL_SHARED=$(get-bucket "${other_id}-shared") 132 | # NOTE: the keys are shared since permissions are configured at the service 133 | # account level in the gateway 134 | $(minio "$server_id" external) 135 | EOF 136 | } 137 | 138 | ingest-env true 139 | ingest-env false 140 | server-env a "$keys_a" "$keys_b" true 141 | server-env a "$keys_a" "$keys_b" false 142 | server-env b "$keys_b" "$keys_a" true 143 | server-env b "$keys_b" "$keys_a" false 144 | -------------------------------------------------------------------------------- /deployment/testing-v4/scripts/generate-service-account-keys: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | PROJECT=${PROJECT:-amiyaguchi-prio-processor-v4} 6 | 7 | if [[ $(gcloud config get-value project) != "$PROJECT" ]]; then 8 | echo "project is not set correctly; run 'gcloud config set project $PROJECT'" 9 | exit 1 10 | fi 11 | 12 | # work from the parent directory 13 | cd "$(dirname "$0")/.." 
14 | output=.secrets 15 | mkdir -p $output 16 | 17 | function create_service_account { 18 | local project=$1 19 | local output=$2 20 | local name=$3 21 | gcloud iam service-accounts keys create "$output/$name-private-key.json" \ 22 | --iam-account "$name@$project.iam.gserviceaccount.com" 23 | } 24 | 25 | create_service_account "$PROJECT" "$output" service-account-ingest 26 | create_service_account "$PROJECT" "$output" service-account-a 27 | create_service_account "$PROJECT" "$output" service-account-b 28 | -------------------------------------------------------------------------------- /deployment/testing-v4/scripts/integrate: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script controls the docker-compose workflow for integration testing. The 4 | # containers are defined in the docker-compose.yml, but are orchestrated through 5 | # this script for verification. 6 | 7 | set -euo pipefail 8 | 9 | cd "$(dirname "$0")/.." 10 | 11 | # Copy data into the appropriate buckets 12 | pushd compose/ingest 13 | docker-compose run --rm app bin/generate 14 | docker-compose down 15 | popd 16 | 17 | # Start server A 18 | pushd compose/server-a 19 | docker-compose run --rm app bin/process & 20 | server_a_pid=$! 21 | popd 22 | 23 | # offset the start times by a short amount for proper authentication against GCP 24 | sleep 2 25 | 26 | # Start server B 27 | pushd compose/server-b 28 | docker-compose run --rm app bin/process & 29 | server_b_pid=$! 30 | popd 31 | 32 | # Return the exit code of the backgrounded docker-compose container. Since 33 | # `wait` is a blocking function, a failure in server B will not be detected 34 | # until timeout in server A. 35 | wait $server_a_pid 36 | wait $server_b_pid 37 | 38 | # clean up the containers 39 | pushd compose/server-a 40 | docker-compose down 41 | popd 42 | pushd compose/server-b 43 | docker-compose down 44 | popd 45 | 46 | pushd compose/ingest 47 | docker-compose run --rm app bin/insert 48 | docker-compose down 49 | popd 50 | -------------------------------------------------------------------------------- /deployment/testing-v4/scripts/list-bucket: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | : << EOF 3 | To use this script, run the following command: 4 | 5 | scripts/list-bucket > LISTING.md 6 | EOF 7 | 8 | set -e 9 | 10 | PROJECT=${PROJECT:-amiyaguchi-prio-processor-v4} 11 | 12 | if [[ $(gcloud config get-value project) != "$PROJECT" ]]; then 13 | echo "project is not set correctly; run 'gcloud config set project $PROJECT'" 14 | exit 1 15 | fi 16 | 17 | function sort_recursive_listing { 18 | local bucket=$1 19 | # remove lines that end with /:, empty lines, or the summary line 20 | # then remove extra spacing, sort by date, and take the name of the path 21 | gsutil ls -lr "$bucket" | \ 22 | grep -v :$ | grep -v ^$ | grep -v ^TOTAL | \ 23 | tr -s " " | sort -k2 | cut -d " " -f4 | \ 24 | tree --fromfile 25 | } 26 | 27 | cat << EOF 28 | # Directory listing 29 | 30 | This listing was generated from \`scripts/list-bucket\`. It is a list of all 31 | objects stored across the the two servers. 
32 | 33 | ## Server A buckets 34 | 35 | EOF 36 | 37 | buckets=$(gsutil ls | sort) 38 | for bucket in $(echo "$buckets" | grep a- ); do 39 | cat << EOF 40 | ### \`$bucket\` 41 | 42 | \`\`\` 43 | $(sort_recursive_listing "$bucket") 44 | \`\`\` 45 | 46 | EOF 47 | done 48 | 49 | echo "## Server B buckets" 50 | echo "" 51 | 52 | for bucket in $(echo "$buckets" | grep b-); do 53 | cat << EOF 54 | ### \`$bucket\` 55 | 56 | \`\`\` 57 | $(sort_recursive_listing "$bucket") 58 | \`\`\` 59 | 60 | EOF 61 | done 62 | -------------------------------------------------------------------------------- /deployment/testing-v4/terraform/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates. 3 | 4 | provider "registry.terraform.io/hashicorp/google" { 5 | version = "3.65.0" 6 | hashes = [ 7 | "h1:ZvXCeUYoex3aOLlZYqv08WZ3hcPaf5p/gEa/DeMrkfs=", 8 | "zh:402b8ba03f19558f7d0e2a453a9b82747882fb3519ce686ce26a9afd4593d05e", 9 | "zh:523a306c2906c213b630d1c2f1e48698769bfffe360b68388d935d0bd171c55c", 10 | "zh:76af4170f5a524ff353e60dd68d728c55dcbd9f6c5f60648e28e4f8f9ca8e958", 11 | "zh:7d00a44769d26144f42b413c82272e31ae9b63153532b9a135a8f69a6608b9a6", 12 | "zh:7f5d0ab79d213809726663f7603004c173694602bd22f2419c445d6897729ca2", 13 | "zh:a1c23e3d280a5053bae9102ad55df1315585395f8656ddf83928978c7e6cd307", 14 | "zh:a81d0af5ef58c193197f81dc3059f8b22c7dde0575bb3198a0360aff7f9ca476", 15 | "zh:b5b79fa8f9e49d2d26badfded64a1e460cdb11b152168e578443cf92df679bca", 16 | "zh:ec4f88d1fd8990511b86205709c1a76ac3a444d0088a810c82a4f5db37ca4afe", 17 | "zh:f15390a40dc6e9c5b5285bc2b6a8c54b6030ae9cc04cc4a31ecf9b14145c467b", 18 | "zh:fb1a150464d822aa9182cd46a0b7bc2c279ff9400017b4bb3238256224ab41b6", 19 | ] 20 | } 21 | 22 | provider "registry.terraform.io/hashicorp/random" { 23 | version = "3.1.0" 24 | hashes = [ 25 | "h1:rKYu5ZUbXwrLG1w81k7H3nce/Ys6yAxXhWcbtk36HjY=", 26 | "zh:2bbb3339f0643b5daa07480ef4397bd23a79963cc364cdfbb4e86354cb7725bc", 27 | "zh:3cd456047805bf639fbf2c761b1848880ea703a054f76db51852008b11008626", 28 | "zh:4f251b0eda5bb5e3dc26ea4400dba200018213654b69b4a5f96abee815b4f5ff", 29 | "zh:7011332745ea061e517fe1319bd6c75054a314155cb2c1199a5b01fe1889a7e2", 30 | "zh:738ed82858317ccc246691c8b85995bc125ac3b4143043219bd0437adc56c992", 31 | "zh:7dbe52fac7bb21227acd7529b487511c91f4107db9cc4414f50d04ffc3cab427", 32 | "zh:a3a9251fb15f93e4cfc1789800fc2d7414bbc18944ad4c5c98f466e6477c42bc", 33 | "zh:a543ec1a3a8c20635cf374110bd2f87c07374cf2c50617eee2c669b3ceeeaa9f", 34 | "zh:d9ab41d556a48bd7059f0810cf020500635bfc696c9fc3adab5ea8915c1d886b", 35 | "zh:d9e13427a7d011dbd654e591b0337e6074eef8c3b9bb11b2e39eaaf257044fd7", 36 | "zh:f7605bd1437752114baf601bdf6931debe6dc6bfe3006eb7e9bb9080931dca8a", 37 | ] 38 | } 39 | -------------------------------------------------------------------------------- /deployment/testing-v4/terraform/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | // When forking this configuration, set the configuration appropriately. A 3 | // remote backend is a good choice since it can be shared across a team. 
4 | backend "gcs" { 5 | bucket = "amiyaguchi-prio-processor-v4" 6 | prefix = "tf-state" 7 | } 8 | } 9 | 10 | variable "project" { 11 | type = string 12 | default = "amiyaguchi-prio-processor-v4" 13 | } 14 | 15 | variable "region" { 16 | type = string 17 | default = "us-central-1" 18 | } 19 | 20 | provider "google" { 21 | project = var.project 22 | region = var.region 23 | } 24 | 25 | // Choose a different bucket name if the project changes 26 | resource "random_id" "project" { 27 | keepers = { 28 | project = var.project 29 | } 30 | byte_length = 8 31 | } 32 | 33 | module "bucket-a" { 34 | source = "./modules/bucket" 35 | server_id = "a" 36 | suffix = random_id.project.hex 37 | } 38 | 39 | module "bucket-b" { 40 | source = "./modules/bucket" 41 | server_id = "b" 42 | suffix = random_id.project.hex 43 | } 44 | 45 | 46 | // Create the service accounts for the tests 47 | resource "google_service_account" "ingest" { 48 | account_id = "service-account-ingest" 49 | display_name = "Service account for the ingestion service" 50 | } 51 | 52 | resource "google_service_account" "a" { 53 | account_id = "service-account-a" 54 | display_name = "Service account for server A" 55 | } 56 | 57 | resource "google_service_account" "b" { 58 | account_id = "service-account-b" 59 | display_name = "Service account for server B" 60 | } 61 | 62 | // Assign service account permissions to each bucket. There are quite a few rules, 63 | // so we break this out into a module. 64 | module "bucket-permissions-a" { 65 | source = "./modules/bucket-permissions" 66 | bucket_private = module.bucket-a.private 67 | bucket_shared = module.bucket-a.shared 68 | bucket_ingest = module.bucket-a.ingest 69 | service_account_internal = google_service_account.a.email 70 | service_account_external = google_service_account.b.email 71 | service_account_ingest = google_service_account.ingest.email 72 | } 73 | 74 | module "bucket-permissions-b" { 75 | source = "./modules/bucket-permissions" 76 | bucket_private = module.bucket-b.private 77 | bucket_shared = module.bucket-b.shared 78 | bucket_ingest = module.bucket-b.ingest 79 | service_account_internal = google_service_account.b.email 80 | service_account_external = google_service_account.a.email 81 | service_account_ingest = google_service_account.ingest.email 82 | } 83 | 84 | // testing whether origin telemetry inserts into BigQuery correctly 85 | 86 | // The ingest container will be used for coordination, and gets access to 87 | // server A's private bucket because they are operated by the same entity. 
88 | resource "google_storage_bucket_iam_member" "ingest_internal_private" { 89 | bucket = module.bucket-a.private 90 | role = "roles/storage.objectViewer" 91 | member = "serviceAccount:${google_service_account.ingest.email}" 92 | } 93 | 94 | resource "google_project_service" "bigquery" { 95 | service = "bigquery.googleapis.com" 96 | } 97 | 98 | resource "google_bigquery_dataset" "telemetry" { 99 | dataset_id = "telemetry" 100 | location = "US" 101 | } 102 | 103 | // Grant access to the admin service account 104 | resource "google_project_iam_member" "bigquery-admin" { 105 | role = "roles/bigquery.admin" 106 | member = "serviceAccount:${google_service_account.ingest.email}" 107 | } 108 | -------------------------------------------------------------------------------- /deployment/testing-v4/terraform/modules/bucket-permissions/main.tf: -------------------------------------------------------------------------------- 1 | variable "bucket_private" { 2 | type = string 3 | description = "The private bucket for the current processor" 4 | } 5 | 6 | variable "bucket_shared" { 7 | type = string 8 | description = "The shared bucket for both processors" 9 | } 10 | 11 | variable "bucket_ingest" { 12 | type = string 13 | description = "The bucket shared with the ingestion service" 14 | } 15 | 16 | variable "service_account_internal" { 17 | type = string 18 | description = "The service account for the current processor" 19 | } 20 | 21 | variable "service_account_external" { 22 | type = string 23 | description = "The service account for the co-processor" 24 | } 25 | 26 | variable "service_account_ingest" { 27 | type = string 28 | description = "The service account for the ingestor" 29 | } 30 | 31 | resource "google_storage_bucket_iam_binding" "private" { 32 | bucket = var.bucket_private 33 | role = "roles/storage.admin" 34 | members = ["serviceAccount:${var.service_account_internal}"] 35 | } 36 | 37 | resource "google_storage_bucket_iam_binding" "shared" { 38 | bucket = var.bucket_shared 39 | role = "roles/storage.admin" 40 | members = [ 41 | "serviceAccount:${var.service_account_internal}", 42 | "serviceAccount:${var.service_account_external}" 43 | ] 44 | } 45 | 46 | resource "google_storage_bucket_iam_binding" "ingest" { 47 | bucket = var.bucket_ingest 48 | role = "roles/storage.admin" 49 | members = [ 50 | "serviceAccount:${var.service_account_internal}", 51 | "serviceAccount:${var.service_account_ingest}" 52 | ] 53 | } 54 | -------------------------------------------------------------------------------- /deployment/testing-v4/terraform/modules/bucket/main.tf: -------------------------------------------------------------------------------- 1 | variable "server_id" { 2 | type = string 3 | description = "The identifier for the server" 4 | } 5 | 6 | variable "suffix" { 7 | type = string 8 | description = "A shared suffix used for the bucket" 9 | } 10 | 11 | // Create all of the storage resources necessary for the tests. We choose to 12 | // delete files older than 7 days since these are testing resources. 
13 | 14 | resource "google_storage_bucket" "ingest" { 15 | name = "${var.server_id}-ingest-${var.suffix}" 16 | uniform_bucket_level_access = true 17 | lifecycle_rule { 18 | condition { 19 | age = 7 20 | } 21 | action { 22 | type = "Delete" 23 | } 24 | } 25 | } 26 | 27 | resource "google_storage_bucket" "private" { 28 | name = "${var.server_id}-private-${var.suffix}" 29 | uniform_bucket_level_access = true 30 | lifecycle_rule { 31 | condition { 32 | age = 7 33 | } 34 | action { 35 | type = "Delete" 36 | } 37 | } 38 | } 39 | 40 | resource "google_storage_bucket" "shared" { 41 | name = "${var.server_id}-shared-${var.suffix}" 42 | uniform_bucket_level_access = true 43 | lifecycle_rule { 44 | condition { 45 | age = 7 46 | } 47 | action { 48 | type = "Delete" 49 | } 50 | } 51 | } 52 | 53 | output "ingest" { 54 | value = google_storage_bucket.ingest.name 55 | } 56 | 57 | output "private" { 58 | value = google_storage_bucket.private.name 59 | } 60 | 61 | output "shared" { 62 | value = google_storage_bucket.shared.name 63 | } 64 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.4" 2 | 3 | services: 4 | # This service runs the tests and shuts down. This can also be used as an 5 | # entrypoint into the container by running `docker-compose run prio_processor 6 | # bash`. 7 | prio_processor: 8 | build: 9 | context: . 10 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | ../README.md -------------------------------------------------------------------------------- /docs/airflow.md: -------------------------------------------------------------------------------- 1 | # Running prio-processor on Airflow 2 | 3 | A prio-processor job is implemented in 4 | [mozilla/telemetry-airflow](https://github.com/mozilla/telemetry-airflow/), the 5 | repository that powers many of the scheduled queries and jobs within the data 6 | organization at Mozilla. This section describes how the DAG (directed 7 | acyclic graph of tasks) is organized within the repository, and the Google Cloud 8 | Platform services that it utilizes. 9 | 10 | ## DAG overview 11 | 12 | The `prio-processor` DAG is split up between work that is done by an `admin` 13 | entity and a server `a` entity. 14 | 15 | ![airflow-dag](./images/airflow-dag.png) 16 | 17 | The `admin` project handles the following jobs: 18 | 19 | - `bootstrap` - Copy the built python egg and entrypoint scripts into a cloud 20 | storage bucket for use in Dataproc (hosted Spark). Check `bin/dataproc` for 21 | more details. 22 | - `staging` - Read data from BigQuery and write out batch-id partitioned ndjson 23 | files into a bucket owned by the `admin` project. 24 | - `transfer_*` - Copy data from the `admin` project into the storage buckets of 25 | server a and server b. 26 | 27 | Server a then runs the `processor_a` job. This is run on an ephemeral 28 | Kubernetes cluster [using the 29 | `GKEPodOperator`](https://airflow.apache.org/docs/apache-airflow/1.10.15/_api/airflow/contrib/operators/gcp_container_operator/index.html). 30 | The pod operator will fetch the relevant container image, and then configure the 31 | job using environment variables that include secrets inside of the Airflow 32 | cluster.
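To make the pod operator step concrete, the following is a minimal sketch of what a `processor_a`-style task could look like. This is not the DAG that lives in telemetry-airflow (see [the real definition][prio-dag]); the project, zone, cluster name, image tag, entrypoint, and environment variable values are all illustrative assumptions.

```python
# Illustrative sketch only -- not the DAG from mozilla/telemetry-airflow.
# Project, zone, cluster name, image tag, and env var values are placeholders.
from datetime import datetime

from airflow import DAG
from airflow.contrib.operators.gcp_container_operator import GKEPodOperator

with DAG(
    dag_id="prio_processor_sketch",
    start_date=datetime(2021, 1, 1),
    schedule_interval="@weekly",
) as dag:
    processor_a = GKEPodOperator(
        task_id="processor_a",
        project_id="moz-fx-prio-a-nonprod-bf65",  # a server A project (see Development below)
        location="us-west1-b",                    # placeholder zone for the ephemeral cluster
        cluster_name="gke-prio-a",                # placeholder cluster name
        name="processor-a",
        namespace="default",
        image="mozilla/prio-processor:latest",    # placeholder image tag
        arguments=["bin/process"],                # placeholder entrypoint script
        # The job is configured through environment variables; secret values
        # would come from Airflow variables/connections rather than literals.
        env_vars={
            "SERVER_ID": "A",
            "BUCKET_INTERNAL": "a-private-example",
            "BUCKET_EXTERNAL": "b-shared-example",
        },
    )
```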
33 | 34 | Finally, the `admin` project will read data from server a's private bucket where 35 | the final aggregate results are stored. This is written into a BigQuery table 36 | that lives within the admin project. 37 | 38 | ## Infrastructure overview 39 | 40 | The infrastructure is managed in the [mozilla-services/cloudops-infra][cloudops] 41 | repository. It is split into two domains: prod and nonprod. The prod projects 42 | are used to run code inside of workflow.telemetry.mozilla.org, and contain the 43 | secret keys for Origin Telemetry aggregates that are running on Prio v1. The 44 | nonprod projects are used in the local development workflow for running jobs 45 | inside of the telemetry-airflow repository. 46 | 47 | In each realm, the projects are broken up as follows: 48 | 49 | - `admin` - This contains resources that are designed to interop with the rest 50 | of the Mozilla data platform. Service accounts in this project have the 51 | ability to read from the BigQuery datasets in `moz-fx-data-shared-prod`. 52 | Having access to the main BigQuery project is necessary to preprocess the raw 53 | data into a format that is acceptable for `prio-processor` containers. This 54 | project utilizes Dataproc for preprocessing data, and also runs various scripts 55 | using the `prio-processor` image on an ephemeral GKE cluster. 56 | - `server-a` - This project contains resources necessary to run a container on 57 | GCP. In particular, service accounts provisioned for Airflow have the ability 58 | to create and delete GKE clusters. These clusters are spun up on-demand, which 59 | allows data engineers to change the node pool specifications quickly. There 60 | are various cloud storage buckets that are configured to support multi-party 61 | communication. 62 | - `server-b` - A copy of server-a, but designed to be dropped for use with an 63 | external partner. 64 | 65 | ## Development 66 | 67 | As of 2021-09-20, there are 6 projects involved with the processing of Firefox nightly data. 68 | 69 | - moz-fx-prio-admin-prod-098j 70 | - moz-fx-prio-admin-nonprod-8uy7 71 | - moz-fx-prio-a-prod-kju7 72 | - moz-fx-prio-a-nonprod-bf65 73 | - moz-fx-prio-b-prod-a67n 74 | - moz-fx-prio-b-nonprod-h77y 75 | 76 | The `prod` variant is used to run the job in the production telemetry-airflow 77 | environment. The `nonprod` projects are used in development, which is the 78 | docker-compose workflow that runs on a local host. In addition to the general 79 | README instructions on the telemetry-airflow repository, run the following 80 | script [located in this gist][update-creds]. For this, you will need to have 81 | access to each individual project and the ability to generate service account 82 | credentials that you can store locally on disk. 83 | 84 | Enable the `prio_processor` DAG and clear tasks to begin processing data. An 85 | ephemeral Kubernetes cluster is responsible for running the containerized 86 | application. Parameters for the job are passed in via the environment.
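To give a concrete sense of what "passed in via the environment" looks like, here is a minimal sketch of environment-driven configuration. The variable names are borrowed from the docker-compose examples elsewhere in this repository; the entrypoint scripts under `bin/` remain the authoritative list of what the container actually requires.

```python
# Minimal sketch of reading job parameters from the environment. The variable
# names are borrowed from the docker-compose examples in this repository; the
# bin/ entrypoint scripts define the real required set.
import os

REQUIRED = [
    "SERVER_ID",
    "PRIVATE_KEY_HEX",
    "PUBLIC_KEY_HEX_INTERNAL",
    "PUBLIC_KEY_HEX_EXTERNAL",
    "BUCKET_INTERNAL",
    "BUCKET_EXTERNAL",
]

missing = [name for name in REQUIRED if not os.environ.get(name)]
if missing:
    raise SystemExit(f"missing environment variables: {', '.join(missing)}")

config = {name: os.environ[name] for name in REQUIRED}
print(f"running as server {config['SERVER_ID']}")
```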
87 | 88 | [prio-dag]: https://github.com/mozilla/telemetry-airflow/blob/915a78e1e936acbb89ec9d3d35e64ce77adc6781/dags/prio_processor.py 89 | [prio-utils]: https://github.com/mozilla/telemetry-airflow/tree/915a78e1e936acbb89ec9d3d35e64ce77adc6781/dags/prio 90 | [update-creds]: https://gist.github.com/acmiyaguchi/a1652f3d56f589e773a9c270bd7f1e6a 91 | [cloudops]: https://github.com/mozilla-services/cloudops-infra/tree/900cafb27cd42fb950d3249e152d3c72541ff424/projects/prio 92 | -------------------------------------------------------------------------------- /docs/images/airflow-dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/prio-processor/5bbb33ecd7a45480ffd5a47c677bfa60660166fe/docs/images/airflow-dag.png -------------------------------------------------------------------------------- /docs/link/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ../../CODE_OF_CONDUCT.md -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples of the Python `prio` package 2 | 3 | There are various examples included in the repository that demonstrate small 4 | applications that can be built using this set of tools. 5 | 6 | * `swig-wrapper` - A simple application demonstrating the full Prio pipeline. 7 | * `python-wrapper` - A usage of the object-oriented Python wrapper. 8 | * `benchmarks` - Various benchmarks in a Jupyter notebook. 9 | * `browser-validation` - The validation code used to verify existing Firefox 10 | measurements for this [blog 11 | post](https://hacks.mozilla.org/2018/10/testing-privacy-preserving-telemetry-with-prio/). 12 | * `asyncio` - An asynchronous pipeline. 13 | * `docker-asyncio` - An asynchronous pipeline using a publish-subscribe 14 | architecture. 15 | * `batched-processing` - A batched-object processing system using docker and minio. 16 | -------------------------------------------------------------------------------- /examples/asyncio/README.md: -------------------------------------------------------------------------------- 1 | # asyncio Example Usage 2 | 3 | This example demonstrates usage of the python wrapper using asyncio. 4 | 5 | ## Running the example 6 | 7 | ```bash 8 | docker run -v $(pwd):/app -it prio:dev python3 main.py 9 | ``` 10 | 11 | Results in: 12 | 13 | ```bash 14 | INFO:root:Starting asyncio prio pipeline. 
15 | INFO:root:Client 0: Generate shares 16 | INFO:root:Client 1: Generate shares 17 | INFO:root:Client 2: Generate shares 18 | INFO:root:Client 3: Generate shares 19 | INFO:root:Server 1, PID 0: Generate verify packet 1 20 | INFO:root:Server 0, PID 0: Generate verify packet 1 21 | INFO:root:Server 0, PID 1: Generate verify packet 1 22 | INFO:root:Server 0, PID 2: Generate verify packet 1 23 | INFO:root:Server 1, PID 1: Generate verify packet 1 24 | INFO:root:Server 0, PID 3: Generate verify packet 1 25 | INFO:root:Server 0, PID 0: Generate verify packet 2 26 | INFO:root:Server 1, PID 2: Generate verify packet 1 27 | INFO:root:Server 0, PID 1: Generate verify packet 2 28 | INFO:root:Server 0, PID 2: Generate verify packet 2 29 | INFO:root:Server 1, PID 3: Generate verify packet 1 30 | INFO:root:Server 0, PID 3: Generate verify packet 2 31 | INFO:root:Server 1, PID 0: Generate verify packet 2 32 | INFO:root:Server 0, PID 0: Aggregate data 33 | INFO:root:Server 1, PID 1: Generate verify packet 2 34 | INFO:root:Server 0, PID 1: Aggregate data 35 | INFO:root:Server 1, PID 2: Generate verify packet 2 36 | INFO:root:Server 0, PID 2: Aggregate data 37 | INFO:root:Server 1, PID 3: Generate verify packet 2 38 | INFO:root:Server 0, PID 3: Aggregate data 39 | INFO:root:Server 1, PID 0: Aggregate data 40 | INFO:root:Server 1, PID 1: Aggregate data 41 | INFO:root:Server 1, PID 2: Aggregate data 42 | INFO:root:Server 1, PID 3: Aggregate data 43 | INFO:root:Done! 44 | ``` 45 | 46 | ## Dataflow diagram 47 | 48 | ![Dataflow DAG](./dag.png) 49 | 50 | ```mermaid 51 | graph TD 52 | 53 | client -->|shares| create_p1A(create_verify1) 54 | subgraph Server A 55 | subgraph Verifier 56 | create_p1A --> |p1A| create_p2A(create_verify2) 57 | create_p2A --> |p2A| isvalid_A{isValid} 58 | end 59 | isvalid_A --> aggregate_A(Aggregate) 60 | end 61 | 62 | create_p1A --> |p1A| create_p2B 63 | create_p2A --> |p2A| isvalid_B 64 | 65 | 66 | client -->|shares| create_p1B(create_verify1) 67 | subgraph Server B 68 | subgraph Verifier 69 | create_p1B --> |p1B| create_p2B(create_verify2) 70 | create_p2B --> |p2B| isvalid_B{isValid} 71 | end 72 | isvalid_B --> aggregate_B(Aggregate) 73 | end 74 | 75 | create_p1B --> |p1B| create_p2A 76 | create_p2B --> |p2B| isvalid_A 77 | 78 | 79 | aggregate_A --> Total 80 | aggregate_B --> Total 81 | ``` 82 | -------------------------------------------------------------------------------- /examples/asyncio/dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/prio-processor/5bbb33ecd7a45480ffd5a47c677bfa60660166fe/examples/asyncio/dag.png -------------------------------------------------------------------------------- /examples/asyncio/main.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | import asyncio 6 | import logging 7 | import random 8 | import sys 9 | from collections import namedtuple 10 | 11 | from prio_processor.prio import wrapper as prio 12 | from prio.libprio import Prio_init, Prio_clear 13 | 14 | logging.basicConfig() 15 | logger = logging.getLogger() 16 | logger.setLevel(logging.INFO) 17 | 18 | PACKET_DATA = 0 19 | PACKET_VERIFY_1 = 1 20 | PACKET_VERIFY_2 = 2 21 | 22 | Packet = namedtuple("Packet", ["id", "type", "data"]) 23 | 24 | 25 | async def server_consume(server, read_queue, write_queue): 26 | # maintain state of the server's shares in the verifier, along with the 27 | # generated verification packets 28 | cache = {} 29 | 30 | while True: 31 | # add random jitter to simulate io 32 | await asyncio.sleep(random.random()) 33 | 34 | packet = await read_queue.get() 35 | pid = packet.id 36 | v, p1, p2 = cache.get(pid, (None, None, None)) 37 | 38 | def log(line): 39 | logger.info("Server {}, PID {}: {}".format(server.server_id, pid, line)) 40 | 41 | # out of order packet execution is dealt with by pushing data back 42 | # into the queue 43 | 44 | if packet.type == PACKET_DATA: 45 | log("Generate verify packet 1") 46 | v = server.create_verifier(packet.data) 47 | p1 = v.create_verify1() 48 | await write_queue.put(Packet(id=pid, type=PACKET_VERIFY_1, data=p1)) 49 | elif packet.type == PACKET_VERIFY_1: 50 | if not p1: 51 | await read_queue.put(packet) 52 | read_queue.task_done() 53 | continue 54 | log("Generate verify packet 2") 55 | p2 = v.create_verify2(p1, packet.data) 56 | await write_queue.put(Packet(id=pid, type=PACKET_VERIFY_2, data=p2)) 57 | elif packet.type == PACKET_VERIFY_2: 58 | if not p2: 59 | await read_queue.put(packet) 60 | read_queue.task_done() 61 | continue 62 | if v.is_valid(p2, packet.data): 63 | log("Aggregate data") 64 | server.aggregate(v) 65 | else: 66 | log("Invalid data") 67 | del cache[pid] 68 | 69 | read_queue.task_done() 70 | cache[pid] = (v, p1, p2) 71 | 72 | 73 | async def client_produce(client, data_items, queue_a, queue_b, n_clients): 74 | for i in range(n_clients): 75 | logger.info("Client {}: Generate shares".format(i)) 76 | for_server_a, for_server_b = client.encode(data_items) 77 | await queue_a.put(Packet(id=i, type=PACKET_DATA, data=for_server_a)) 78 | await queue_b.put(Packet(id=i, type=PACKET_DATA, data=for_server_b)) 79 | 80 | 81 | async def main(): 82 | Prio_init() 83 | n_clients = 4 84 | n_data = 133 85 | server_secret = prio.PRGSeed() 86 | skA, pkA = prio.create_keypair() 87 | skB, pkB = prio.create_keypair() 88 | 89 | cfg = prio.Config(n_data, pkA, pkB, b"test_batch") 90 | sA = prio.Server(cfg, prio.PRIO_SERVER_A, skA, server_secret) 91 | sB = prio.Server(cfg, prio.PRIO_SERVER_B, skB, server_secret) 92 | 93 | data_items = bytes([(i % 3 == 1) or (i % 5 == 1) for i in range(n_data)]) 94 | 95 | logger.info("Starting asyncio prio pipeline.") 96 | client = prio.Client(cfg) 97 | queue_a = asyncio.Queue() 98 | queue_b = asyncio.Queue() 99 | 100 | await client_produce(client, data_items, queue_a, queue_b, n_clients) 101 | 102 | consumers = asyncio.ensure_future( 103 | asyncio.gather( 104 | server_consume(sA, queue_a, queue_b), server_consume(sB, queue_b, queue_a) 105 | ) 106 | ) 107 | 108 | await asyncio.gather(queue_a.join(), queue_b.join()) 109 | 110 | t_a = sA.total_shares() 111 | t_b = sB.total_shares() 112 | 113 | output = prio.total_share_final(cfg, t_a, t_b) 114 | 115 | expected = [item * n_clients for item in list(data_items)] 116 | assert list(output) == expected 117 | 118 | consumers.cancel() 119 | 
Prio_clear() 120 | logger.info("Done!") 121 | 122 | 123 | if __name__ == "__main__": 124 | loop = asyncio.get_event_loop() 125 | loop.run_until_complete(main()) 126 | loop.close() 127 | -------------------------------------------------------------------------------- /examples/batched-processing/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM prio:dev 2 | 3 | RUN curl https://dl.minio.io/client/mc/release/linux-amd64/mc -o mc 4 | RUN chmod +x mc 5 | ENV PATH="/app/:${PATH}" 6 | 7 | CMD bash 8 | -------------------------------------------------------------------------------- /examples/batched-processing/Makefile: -------------------------------------------------------------------------------- 1 | build: 2 | docker build --target development -t prio:dev ../.. 3 | docker-compose build 4 | 5 | test: 6 | scripts/integration.sh 7 | 8 | clean: 9 | docker-compose down 10 | -------------------------------------------------------------------------------- /examples/batched-processing/README.md: -------------------------------------------------------------------------------- 1 | # Batched Processing 2 | 3 | This example is an example of a minimal two-server aggregation scheme that 4 | fulfils the privacy guarantees of the Prio system. 5 | 6 | ## Quickstart 7 | 8 | ```bash 9 | # Build the containers 10 | make build 11 | 12 | # Run the test 13 | make test 14 | ``` 15 | 16 | ## Resources 17 | 18 | ### Generated Keys 19 | ```json 20 | # Server A 21 | { 22 | "private_key": "19DDC146FB8EE4A0B762A7DAE7E96033F87C9528DBBF8CA899CCD1DB8CD74984", 23 | "public_key": "445C126981113E5684D517826E508F5731A1B35485BACCD63DAA8120DD11DA78" 24 | } 25 | 26 | # Server B 27 | { 28 | "private_key": "E3AA3CC952C8553E46E699646A9DC3CBA7E3D4C7F0779D58574ABF945E259202", 29 | "public_key": "01D5D4F179ED233140CF97F79594F0190528268A99A6CDF57EF0E1569E673642" 30 | } 31 | ``` 32 | 33 | ## Misc 34 | 35 | * Generating an [s3 policy file](https://docs.aws.amazon.com/AmazonS3/latest/dev/example-policies-s3.html) 36 | * [MinIO multi-user quickstart guide](https://docs.min.io/docs/minio-multi-user-quickstart-guide.html) 37 | -------------------------------------------------------------------------------- /examples/batched-processing/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | minio: 5 | # https://docs.min.io/docs/minio-docker-quickstart-guide 6 | # https://docs.min.io/docs/minio-multi-user-quickstart-guide.html 7 | image: minio/minio:latest 8 | command: server /data 9 | ports: 10 | - 9000:9000 11 | environment: 12 | - MINIO_ACCESS_KEY=admin 13 | - MINIO_SECRET_KEY=password 14 | 15 | bootstrap: 16 | image: minio/mc:latest 17 | depends_on: 18 | - minio 19 | working_dir: /root 20 | entrypoint: sh 21 | command: scripts/bootstrap.sh 22 | volumes: 23 | - .:/root/ 24 | 25 | server_a: 26 | build: . 
27 | working_dir: /app/examples/batched-processing 28 | command: "true" 29 | volumes: 30 | - .:/app/examples/batched-processing 31 | depends_on: 32 | - minio 33 | - bootstrap 34 | environment: 35 | - N_DATA=3 36 | - BATCH_ID=test 37 | - SERVER_ID=A 38 | - SHARED_SECRET=m/AqDal/ZSA9597GwMM+VA== 39 | - PRIVATE_KEY_HEX=19DDC146FB8EE4A0B762A7DAE7E96033F87C9528DBBF8CA899CCD1DB8CD74984 40 | - PUBLIC_KEY_HEX_INTERNAL=445C126981113E5684D517826E508F5731A1B35485BACCD63DAA8120DD11DA78 41 | - PUBLIC_KEY_HEX_EXTERNAL=01D5D4F179ED233140CF97F79594F0190528268A99A6CDF57EF0E1569E673642 42 | - MINIO_ACCESS_KEY=server-a 43 | - MINIO_SECRET_KEY=password 44 | - BUCKET_INTERNAL=server-a 45 | - BUCKET_EXTERNAL=server-b 46 | 47 | server_b: 48 | build: . 49 | working_dir: /app/examples/batched-processing 50 | command: "true" 51 | volumes: 52 | - .:/app/examples/batched-processing 53 | depends_on: 54 | - minio 55 | - bootstrap 56 | environment: 57 | - N_DATA=3 58 | - BATCH_ID=test 59 | - SERVER_ID=B 60 | - SHARED_SECRET=m/AqDal/ZSA9597GwMM+VA== 61 | - PRIVATE_KEY_HEX=E3AA3CC952C8553E46E699646A9DC3CBA7E3D4C7F0779D58574ABF945E259202 62 | - PUBLIC_KEY_HEX_INTERNAL=01D5D4F179ED233140CF97F79594F0190528268A99A6CDF57EF0E1569E673642 63 | - PUBLIC_KEY_HEX_EXTERNAL=445C126981113E5684D517826E508F5731A1B35485BACCD63DAA8120DD11DA78 64 | - MINIO_ACCESS_KEY=server-b 65 | - MINIO_SECRET_KEY=password 66 | - BUCKET_INTERNAL=server-b 67 | - BUCKET_EXTERNAL=server-a 68 | 69 | client: 70 | build: . 71 | working_dir: /app/examples/batched-processing 72 | command: "true" 73 | volumes: 74 | - .:/app/examples/batched-processing 75 | depends_on: 76 | - server_a 77 | - server_b 78 | environment: 79 | - N_DATA=3 80 | - BATCH_ID=test 81 | # Server A and B respectively 82 | - PUBLIC_KEY_HEX_INTERNAL=445C126981113E5684D517826E508F5731A1B35485BACCD63DAA8120DD11DA78 83 | - PUBLIC_KEY_HEX_EXTERNAL=01D5D4F179ED233140CF97F79594F0190528268A99A6CDF57EF0E1569E673642 84 | - MINIO_ACCESS_KEY=admin 85 | - MINIO_SECRET_KEY=password 86 | - BUCKET_SERVER_A=server-a 87 | - BUCKET_SERVER_B=server-b 88 | -------------------------------------------------------------------------------- /examples/batched-processing/policy/server-a.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Action": [ 6 | "s3:*" 7 | ], 8 | "Effect": "Allow", 9 | "Resource": [ 10 | "arn:aws:s3:::server-a/*", 11 | "arn:aws:s3:::server-b/intermediate/external/*" 12 | ], 13 | "Sid": "" 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /examples/batched-processing/policy/server-b.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Action": [ 6 | "s3:*" 7 | ], 8 | "Effect": "Allow", 9 | "Resource": [ 10 | "arn:aws:s3:::server-b/*", 11 | "arn:aws:s3:::server-a/intermediate/external/*" 12 | ], 13 | "Sid": "" 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /examples/batched-processing/scripts/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -euo pipefail 4 | set -x 5 | 6 | TARGET="minio" 7 | 8 | mc config host add $TARGET http://minio:9000 admin password 9 | mc mb $TARGET/server-a 10 | mc mb $TARGET/server-b 11 | 12 | # mc admin policy add TARGET POLICYNAME POLICYFILE 13 | mc admin policy add $TARGET server-a 
policy/server-a.json 14 | mc admin policy add $TARGET server-b policy/server-b.json 15 | 16 | # mc admin user add TARGET ACCESSKEY SECRETKEY 17 | mc admin user add $TARGET server-a password 18 | mc admin user add $TARGET server-b password 19 | 20 | # mc admin policy set TARGET POLICYNAME user=ACCESSKEY 21 | mc admin policy set $TARGET server-a user=server-a 22 | mc admin policy set $TARGET server-b user=server-b 23 | -------------------------------------------------------------------------------- /examples/batched-processing/scripts/check-aggregates.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check that aggregates that are computed via client.sh and server.sh are 4 | # correct and published to the correct location. This script should be run 5 | # inside of the client container. 6 | 7 | set -euo pipefail 8 | set -x 9 | 10 | : ${MINIO_ACCESS_KEY?} 11 | : ${MINIO_SECRET_KEY?} 12 | : ${BUCKET_SERVER_A?} 13 | : ${BUCKET_SERVER_B?} 14 | 15 | TARGET="minio" 16 | mc config host add $TARGET http://minio:9000 ${MINIO_ACCESS_KEY} ${MINIO_SECRET_KEY} 17 | 18 | function get_payload() { 19 | local path=$1 20 | mc cat "${path}" | jq -c '.payload' 21 | } 22 | 23 | [[ $(get_payload $TARGET/$BUCKET_SERVER_A/processed/part-0.ndjson) == "[3,2,1]" ]] 24 | [[ $(get_payload $TARGET/$BUCKET_SERVER_B/processed/part-0.ndjson) == "[3,2,1]" ]] 25 | 26 | [[ $(get_payload $TARGET/$BUCKET_SERVER_A/processed/part-1.ndjson) == "[4,2,4]" ]] 27 | [[ $(get_payload $TARGET/$BUCKET_SERVER_B/processed/part-1.ndjson) == "[4,2,4]" ]] 28 | 29 | [[ $(get_payload $TARGET/$BUCKET_SERVER_A/processed/part-2.ndjson) == "[7,3,1]" ]] 30 | [[ $(get_payload $TARGET/$BUCKET_SERVER_B/processed/part-2.ndjson) == "[7,3,1]" ]] 31 | -------------------------------------------------------------------------------- /examples/batched-processing/scripts/client.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eou pipefail 4 | set -x 5 | 6 | # Parameters that are read through the environment 7 | : ${N_DATA?} 8 | : ${BATCH_ID?} 9 | : ${PUBLIC_KEY_HEX_INTERNAL?} 10 | : ${PUBLIC_KEY_HEX_EXTERNAL?} 11 | 12 | : ${MINIO_ACCESS_KEY?} 13 | : ${MINIO_SECRET_KEY?} 14 | : ${BUCKET_SERVER_A?} 15 | : ${BUCKET_SERVER_B?} 16 | 17 | TARGET="minio" 18 | mc config host add $TARGET http://minio:9000 ${MINIO_ACCESS_KEY} ${MINIO_SECRET_KEY} 19 | 20 | # The bucket name is used for the local file directory and for the remote minio 21 | # bucket. 22 | cd /tmp 23 | output_a=$BUCKET_SERVER_A/raw 24 | output_b=$BUCKET_SERVER_B/raw 25 | mkdir -p $output_a 26 | mkdir -p $output_b 27 | 28 | jq -c '{payload: .}' <<EOF >part-0.ndjson 29 | [1, 0, 0] 30 | [1, 1, 0] 31 | [1, 1, 1] 32 | EOF 33 | 34 | jq -c '{payload: .}' <<EOF >part-1.ndjson 35 | [1, 0, 1] 36 | [1, 1, 1] 37 | [1, 0, 1] 38 | [1, 1, 1] 39 | EOF 40 | 41 | jq -c '{payload: .}' <<EOF >part-2.ndjson 42 | [1, 0, 0] 43 | [1, 0, 0] 44 | [1, 0, 0] 45 | [1, 0, 0] 46 | [1, 1, 0] 47 | [1, 1, 0] 48 | [1, 1, 1] 49 | EOF 50 | 51 | for filename in $(find . -name "*.ndjson"); do 52 | prio encode-shares \ 53 | --input $filename \ 54 | --output-A $output_a \ 55 | --output-B $output_b 56 | 57 | jq -c '.' $output_a/$filename 58 | jq -c '.'
$output_b/$filename 59 | done 60 | 61 | mc cp --recursive $output_a/ $TARGET/$output_a/ 62 | mc cp --recursive $output_b/ $TARGET/$output_b/ 63 | 64 | touch _SUCCESS 65 | mc cp _SUCCESS $TARGET/$output_a/_SUCCESS 66 | mc cp _SUCCESS $TARGET/$output_b/_SUCCESS 67 | -------------------------------------------------------------------------------- /examples/batched-processing/scripts/integration.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script controls the docker-compose workflow for integration testing. The 4 | # containers are defined in the docker-compose.yml, but are orchestrated through 5 | # this script for verification. 6 | # 7 | # See: https://stackoverflow.com/questions/40907954/terminate-docker-compose-when-test-container-finishes 8 | 9 | set -euo pipefail 10 | 11 | docker-compose up -d 12 | 13 | # Add a cleanup handler for the exit signal 14 | function cleanup { 15 | docker-compose down 16 | } 17 | trap cleanup EXIT 18 | 19 | # Start server A 20 | docker-compose run server_a scripts/server.sh & 21 | server_a_pid=$! 22 | 23 | # Start server B 24 | docker-compose run server_b scripts/server.sh & 25 | server_b_pid=$! 26 | 27 | # Copy data into the appropriate buckets 28 | docker-compose run client scripts/client.sh 29 | 30 | # Return the exit code of the backgrounded docker-compose container. Since 31 | # `wait` is a blocking function, a failure in server B will not be detected 32 | # until timeout in server A. 33 | wait $server_a_pid 34 | wait $server_b_pid 35 | 36 | docker-compose run client scripts/check-aggregates.sh 37 | -------------------------------------------------------------------------------- /examples/benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # libprio benchmarks 2 | 3 | Information about payload sizes and client encoding time as a function of input 4 | size. 
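For a single data point, the measurement amounts to encoding an input vector of a given length and recording the size of the two encrypted shares; a minimal sketch using the same wrapper API as the benchmark script `main.py` (the input length of 128 is arbitrary):

```python
# Minimal sketch: report the encrypted share sizes for one input length.
# Mirrors the approach in main.py; the input length of 128 is arbitrary.
from prio import PrioContext
from prio_processor.prio import wrapper as prio

with PrioContext():
    _, pubkey = prio.create_keypair()
    n = 128
    cfg = prio.Config(n, pubkey, pubkey, b"bench")
    share_a, share_b = prio.Client(cfg).encode(bytes([1] * n))
    print(f"n={n}: share A={len(share_a)} bytes, share B={len(share_b)} bytes")
```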
5 | 6 | ## Reproducing results 7 | 8 | ```bash 9 | pip install -r requirements.txt 10 | python3 main.py 11 | ``` 12 | 13 | ![encrypted sizes](./encrypted_sizes.png) 14 | ![client encoding time](./client_encoding_time.png) -------------------------------------------------------------------------------- /examples/benchmarks/client_encoding_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/prio-processor/5bbb33ecd7a45480ffd5a47c677bfa60660166fe/examples/benchmarks/client_encoding_time.png -------------------------------------------------------------------------------- /examples/benchmarks/encrypted_sizes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/prio-processor/5bbb33ecd7a45480ffd5a47c677bfa60660166fe/examples/benchmarks/encrypted_sizes.png -------------------------------------------------------------------------------- /examples/benchmarks/main.py: -------------------------------------------------------------------------------- 1 | import timeit 2 | 3 | import matplotlib 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | from prio_processor.prio import wrapper as prio 7 | from prio import PrioContext 8 | from tqdm import tqdm 9 | 10 | 11 | @PrioContext() 12 | def bench_encrypted_sizes(path): 13 | _, pubkey = prio.create_keypair() 14 | 15 | def size(n): 16 | cfg = prio.Config(n, pubkey, pubkey, b"test") 17 | a, b = prio.Client(cfg).encode(bytes([1] * k)) 18 | return [k, len(a), len(b)] 19 | 20 | sizes = [] 21 | for k in tqdm(range(0, 10000, 100)): 22 | try: 23 | sizes.append(size(k)) 24 | except: 25 | print(f"Prio excepted at {k} items") 26 | break 27 | 28 | fig, ax = plt.subplots() 29 | ax.set_xscale("log", basex=2) 30 | ax.set_yscale("log", basey=2) 31 | ax.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter()) 32 | ax.yaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter()) 33 | plt.title("Prio measurement size vs payload size") 34 | plt.xlabel("measurement size (bits)") 35 | plt.ylabel("payload size (bytes)") 36 | plt.plot(*np.array(sizes).T[:2]) 37 | plt.savefig(path) 38 | 39 | 40 | @PrioContext() 41 | def bench_client_encoding(path): 42 | runs = 10 ** 2 43 | timings = [] 44 | for k in tqdm([8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]): 45 | _, pubkey = prio.create_keypair() 46 | cfg = prio.Config(k, pubkey, pubkey, b"test_batch") 47 | client = prio.Client(cfg) 48 | data = bytes([1] * k) 49 | timing = timeit.timeit("client.encode(data)", number=runs, globals=locals()) 50 | timings.append([k, timing]) 51 | 52 | data = np.array(timings) 53 | y = data[:, 1] / runs 54 | x = data[:, 0] 55 | 56 | fig, ax = plt.subplots() 57 | plt.title(f"measurement size vs encoding time (n={runs})") 58 | plt.xlabel("measurement size (bits)") 59 | plt.ylabel("encoding time (seconds)") 60 | ax.set_xscale("log", basex=2) 61 | ax.set_yscale("log", basey=2) 62 | ax.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter()) 63 | ax.yaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter()) 64 | plt.plot(x, y) 65 | plt.savefig(path) 66 | 67 | 68 | def main(): 69 | print("running benchmark for encrypted sizes") 70 | bench_encrypted_sizes("encrypted_sizes.png") 71 | print("running benchmark for client encoding time") 72 | bench_client_encoding("client_encoding_time.png") 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- 
/examples/benchmarks/requirements.in: -------------------------------------------------------------------------------- 1 | matplotlib 2 | numpy 3 | tqdm 4 | -e file:../.. 5 | -------------------------------------------------------------------------------- /examples/benchmarks/requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile 6 | # 7 | -e file:../.. 8 | attrs==19.3.0 # via jsonschema 9 | cachetools==4.1.1 # via google-auth 10 | certifi==2020.6.20 # via requests 11 | chardet==3.0.4 # via requests 12 | click==7.0 13 | cycler==0.10.0 # via matplotlib 14 | decorator==4.4.2 # via gcsfs 15 | gcsfs==0.2.3 16 | google-auth-oauthlib==0.4.1 # via gcsfs 17 | google-auth==1.20.0 # via gcsfs, google-auth-oauthlib 18 | idna==2.10 # via requests 19 | importlib-metadata==1.7.0 # via jsonschema 20 | jsonschema==3.2.0 21 | kiwisolver==1.1.0 # via matplotlib 22 | matplotlib==3.1.3 23 | numpy==1.18.1 24 | oauthlib==3.1.0 # via requests-oauthlib 25 | prio==1.1 26 | py4j==0.10.9 # via pyspark 27 | pyasn1-modules==0.2.8 # via google-auth 28 | pyasn1==0.4.8 # via pyasn1-modules, rsa 29 | pyparsing==2.4.6 # via matplotlib 30 | pyrsistent==0.16.0 # via jsonschema 31 | pyspark==3.0.0 32 | python-dateutil==2.8.1 # via matplotlib 33 | requests-oauthlib==1.3.0 # via google-auth-oauthlib 34 | requests==2.24.0 # via gcsfs, requests-oauthlib 35 | rsa==4.6 # via google-auth 36 | six==1.14.0 # via cycler, google-auth, jsonschema, pyrsistent, python-dateutil 37 | tqdm==4.43.0 38 | urllib3==1.25.10 # via requests 39 | zipp==3.1.0 # via importlib-metadata 40 | 41 | # The following packages are considered to be unsafe in a requirements file: 42 | # setuptools 43 | -------------------------------------------------------------------------------- /examples/browser-validation/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM prio:dev 2 | 3 | # install wait-for for docker-compose services 4 | RUN curl -o /usr/local/bin/wait-for-it https://raw.githubusercontent.com/vishnubob/wait-for-it/master/wait-for-it.sh 5 | RUN chmod +x /usr/local/bin/wait-for-it 6 | 7 | RUN pip3 install \ 8 | s3fs \ 9 | pyarrow \ 10 | click \ 11 | pandas \ 12 | 13 | CMD bash 14 | -------------------------------------------------------------------------------- /examples/browser-validation/Makefile: -------------------------------------------------------------------------------- 1 | build: 2 | docker build --target development -t prio:dev ../.. 3 | 4 | run: 5 | docker-compose run app 6 | 7 | test: 8 | docker-compose run bash -c "python3 generate.py --path test.batch.json | bash" 9 | -------------------------------------------------------------------------------- /examples/browser-validation/README.md: -------------------------------------------------------------------------------- 1 | # Validate Browser Data 2 | 3 | This example validates the results from a pilot experiment. The API has changed 4 | and the original data source does not exist in this form any longer due to a 5 | transition to GCP. This code is deprecated, but may be run in it's most recent 6 | working state in the v1.6.1 tag of the container. 7 | 8 | ## Usage 9 | 10 | Setup 11 | 12 | ```bash 13 | make run 14 | make test 15 | ``` 16 | 17 | To test against generated data, run the `generate.py` script. 
18 | 19 | ``` 20 | $ python generate.py --path test.batch.json 21 | ``` 22 | 23 | This will generate the corresponding command for validation. 24 | Verify the output of this command before running it. 25 | 26 | ``` 27 | $ python generate.py --path test.batch.json | bash 28 | ``` 29 | 30 | To run against a real browser ping, you can run a command in the following form: 31 | 32 | ``` 33 | $ python main.py \ 34 | --pings sample.batch.json \ 35 | --pubkey-A \ 36 | --pvtkey-A \ 37 | --pubkey-B \ 38 | --pvtkey-B 39 | ``` 40 | 41 | The `--pings` argument generally takes a set of json documents; one per line and delimited by a new line. 42 | 43 | The ping should be compacted before being presented to the program. 44 | 45 | ``` 46 | # use `jq -c` to compact a json document 47 | $ cat my-ping.json | jq -c . > my-ping.batch.json 48 | ``` 49 | 50 | To run against the parquet dataset, make sure you have AWS credentials with access to the appropriate bucket. To verify that everything is set up correctly: 51 | 52 | ``` 53 | $ aws s3 ls s3://net-mozaws-prod-us-west-2-pipeline-analysis/amiyaguchi/prio/v1 54 | ``` 55 | 56 | Then run the following command: 57 | 58 | ``` 59 | $ python main.py \ 60 | --date 20181007 \ 61 | --pubkey-A \ 62 | --pvtkey-A \ 63 | --pubkey-B \ 64 | --pvtkey-B 65 | ``` 66 | 67 | ### Docker 68 | 69 | This image may also be run via docker. Pass the appropriate environment variables as follows: 70 | 71 | ```bash 72 | $ make build 73 | 74 | $ make test 75 | 76 | $ AWS_ACCESS_KEY_ID= \ 77 | AWS_SECRET_ACCESS_KEY= \ 78 | PRIO_DATE= \ 79 | PRIO_PUBKEY_A= \ 80 | PRIO_PVTKEY_A= \ 81 | PRIO_PUBKEY_B= \ 82 | PRIO_PVTKEY_B= \ 83 | make run 84 | ``` 85 | -------------------------------------------------------------------------------- /examples/browser-validation/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.4' 2 | 3 | services: 4 | app: 5 | build: . 
6 | volumes: 7 | - .:/app/examples/docker-asyncio 8 | command: python3 main.py 9 | environment: 10 | - AWS_ACCESS_KEY_ID 11 | - AWS_SECRET_ACCESS_KEY 12 | - PRIO_DATE 13 | - PRIO_PINGS 14 | - PRIO_PUBKEY_A 15 | - PRIO_PVTKEY_A 16 | - PRIO_PUBKEY_B 17 | - PRIO_PVTKEY_B 18 | -------------------------------------------------------------------------------- /examples/browser-validation/generate.py: -------------------------------------------------------------------------------- 1 | import json 2 | from itertools import product 3 | 4 | import click 5 | from prio import prio 6 | 7 | 8 | # cardinality of the input vector 9 | N_DATA = 3 10 | 11 | 12 | def construct(build_id, user_default, newtab, pdf, data_a, data_b): 13 | ping = { 14 | "environment": {"build": {"buildId": build_id}}, 15 | "payload": { 16 | "histograms": { 17 | "BROWSER_IS_USER_DEFAULT": {"sum": user_default}, 18 | "NEWTAB_PAGE_ENABLED": {"sum": newtab}, 19 | "PDF_VIEWER_USED": {"sum": pdf}, 20 | }, 21 | "prio": { 22 | "a": {k: int(v) for k, v in enumerate(data_a)}, 23 | "b": {k: int(v) for k, v in enumerate(data_b)}, 24 | }, 25 | }, 26 | } 27 | return ping 28 | 29 | 30 | def generate(build_id, client): 31 | data = [] 32 | for vector in product([0, 1], [0, 1], [0, 1]): 33 | args = list(vector) + client.encode(bytes(vector)) 34 | ping = construct(build_id, *args) 35 | data.append(ping) 36 | return data 37 | 38 | 39 | def write(fp, data): 40 | fp.write("\n".join(map(json.dumps, data))) 41 | 42 | 43 | @click.command() 44 | @click.option("--path", type=click.Path(exists=False), required=True) 45 | @click.option("--batch-id", type=str, default="test-batch") 46 | def main(path, batch_id): 47 | # create the encryption keys 48 | skA, pkA = prio.create_keypair() 49 | skB, pkB = prio.create_keypair() 50 | 51 | # create the client 52 | cfg = prio.Config(N_DATA, pkA, pkB, bytes(batch_id, "utf-8")) 53 | client = prio.Client(cfg) 54 | 55 | # generate test data 56 | data = generate(batch_id, client) 57 | with open(path, "w") as f: 58 | write(f, data) 59 | 60 | # print a command to use 61 | def clean(s): 62 | return s[:-1].decode("utf-8") 63 | 64 | args = { 65 | "--pings": path, 66 | "--pubkey-A": clean(pkA.export_hex()), 67 | "--pvtkey-A": clean(skA.export_hex()), 68 | "--pubkey-B": clean(pkB.export_hex()), 69 | "--pvtkey-B": clean(skB.export_hex()), 70 | } 71 | argstr = " \\".join([f"\n\t{k} {v}" for k, v in args.items()]) 72 | print(f"python main.py \\{argstr}") 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /examples/docker-asyncio/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM prio:dev 2 | 3 | # install wait-for for docker-compose services 4 | RUN curl -o /usr/local/bin/wait-for-it https://raw.githubusercontent.com/vishnubob/wait-for-it/master/wait-for-it.sh 5 | RUN chmod +x /usr/local/bin/wait-for-it 6 | 7 | RUN pip3 install \ 8 | aioamqp \ 9 | click \ 10 | aio-pika 11 | 12 | CMD bash 13 | -------------------------------------------------------------------------------- /examples/docker-asyncio/Makefile: -------------------------------------------------------------------------------- 1 | build: 2 | docker build --target development -t prio:dev ../.. 
3 | docker-compose build 4 | 5 | run: 6 | docker-compose up 7 | 8 | clean: 9 | docker-compose down -------------------------------------------------------------------------------- /examples/docker-asyncio/client.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import aio_pika 3 | import logging 4 | import click 5 | 6 | from prio_processor.prio import wrapper as prio 7 | from prio import PrioContext 8 | 9 | logging.basicConfig() 10 | logger = logging.getLogger() 11 | logger.setLevel(logging.INFO) 12 | 13 | 14 | async def run_client(pubkey_a, pubkey_b, n_clients, n_fields, batch_id): 15 | connection = await aio_pika.connect_robust("amqp://guest:guest@rabbitmq:5672/") 16 | channel = await connection.channel() 17 | await channel.declare_queue("prio.0") 18 | await channel.declare_queue("prio.1") 19 | 20 | # delay for server setup 21 | await asyncio.sleep(3) 22 | 23 | pkA = prio.PublicKey().import_hex(pubkey_a) 24 | pkB = prio.PublicKey().import_hex(pubkey_b) 25 | 26 | config = prio.Config(n_fields, pkA, pkB, batch_id) 27 | client = prio.Client(config) 28 | 29 | data_items = bytes([(i % 3 == 1) or (i % 5 == 1) for i in range(n_fields)]) 30 | 31 | for i in range(n_clients): 32 | 33 | logger.info("Client {}: Generated shares".format(i)) 34 | for_server_a, for_server_b = client.encode(data_items) 35 | 36 | await channel.default_exchange.publish( 37 | aio_pika.Message(body=for_server_a, message_id=str(i), type="data"), 38 | routing_key="prio.0", 39 | ) 40 | await channel.default_exchange.publish( 41 | aio_pika.Message(body=for_server_b, message_id=str(i), type="data"), 42 | routing_key="prio.1", 43 | ) 44 | await connection.close() 45 | logger.info("Client done!") 46 | 47 | 48 | @click.command() 49 | @click.option("--pubkey-A", type=str) 50 | @click.option("--pubkey-B", type=str) 51 | @click.option("--n-clients", type=int, default=10) 52 | @click.option("--n-fields", type=int, required=True) 53 | @click.option("--batch-id", type=str, default="test_batch") 54 | @PrioContext() 55 | def main(pubkey_a, pubkey_b, n_clients, n_fields, batch_id): 56 | loop = asyncio.get_event_loop() 57 | loop.run_until_complete( 58 | run_client( 59 | bytes(pubkey_a, "utf-8"), 60 | bytes(pubkey_b, "utf-8"), 61 | n_clients, 62 | n_fields, 63 | bytes(batch_id, "utf-8"), 64 | ) 65 | ) 66 | loop.close() 67 | 68 | 69 | if __name__ == "__main__": 70 | main() 71 | -------------------------------------------------------------------------------- /examples/docker-asyncio/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.4' 2 | 3 | x-app: &app 4 | build: . 
5 | volumes: 6 | - .:/app/examples/docker-asyncio 7 | 8 | services: 9 | client: 10 | <<: *app 11 | depends_on: 12 | - rabbitmq 13 | - server_a 14 | - server_b 15 | command: > 16 | bash -c "cd examples/docker-asyncio && 17 | wait-for-it rabbitmq:5672 -- python3 client.py \ 18 | --pubkey-A F63F2FB9B823B7B672684A526AC467DCFC110D4BB242F6DF0D3EA9F09CE14B51 \ 19 | --pubkey-B 15DC84D87C73A36120E0389D4ABCD433EDC5147DC71A4093E2A5952968D51F07 \ 20 | --n-clients 10 \ 21 | --n-fields 133" 22 | 23 | server_a: 24 | <<: *app 25 | depends_on: 26 | - rabbitmq 27 | command: > 28 | bash -c "cd examples/docker-asyncio && 29 | wait-for-it rabbitmq:5672 -- python3 server.py \ 30 | --pubkey F63F2FB9B823B7B672684A526AC467DCFC110D4BB242F6DF0D3EA9F09CE14B51 \ 31 | --pvtkey 7A0AA608C08CB74A86409F5026865435B2F17F40B20636CEFD2656585097FBE0 \ 32 | --pubkey-other 15DC84D87C73A36120E0389D4ABCD433EDC5147DC71A4093E2A5952968D51F07 \ 33 | --server-id a \ 34 | --n-fields 133" 35 | 36 | server_b: 37 | <<: *app 38 | depends_on: 39 | - rabbitmq 40 | command: > 41 | bash -c "cd examples/docker-asyncio && 42 | wait-for-it rabbitmq:5672 -- python3 server.py \ 43 | --pubkey 15DC84D87C73A36120E0389D4ABCD433EDC5147DC71A4093E2A5952968D51F07 \ 44 | --pvtkey 50C7329DE18DE3087A0DE963D5585A4DB7A156C7A29FA854760373B053D86919 \ 45 | --pubkey-other F63F2FB9B823B7B672684A526AC467DCFC110D4BB242F6DF0D3EA9F09CE14B51 \ 46 | --server-id b \ 47 | --n-fields 133" 48 | 49 | rabbitmq: 50 | image: rabbitmq:latest 51 | ports: 52 | - 5672:5672 53 | -------------------------------------------------------------------------------- /examples/docker-asyncio/server.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import aio_pika 3 | import logging 4 | import os 5 | import sys 6 | import pickle 7 | from functools import partial 8 | 9 | import click 10 | from prio_processor.prio import wrapper as prio 11 | from prio import PrioContext 12 | 13 | logging.basicConfig() 14 | logger = logging.getLogger() 15 | logger.setLevel(logging.INFO) 16 | 17 | DEFAULT_SHARED_SEED = b"vY\xc1\t\x93\xfb\xc6\x97*\x07j\xd63i+\x86" 18 | 19 | 20 | def get_other_server(server_id): 21 | mapping = { 22 | prio.PRIO_SERVER_A: prio.PRIO_SERVER_B, 23 | prio.PRIO_SERVER_B: prio.PRIO_SERVER_A, 24 | } 25 | return mapping[server_id] 26 | 27 | 28 | async def run_server( 29 | pubkey, pvtkey, pubkey_other, server_id, n_fields, batch_id, shared_seed 30 | ): 31 | connection = await aio_pika.connect_robust("amqp://guest:guest@rabbitmq:5672/") 32 | channel = await connection.channel() 33 | queue = await channel.declare_queue(f"prio.{server_id}") 34 | 35 | pk = prio.PublicKey().import_hex(pubkey) 36 | sk = prio.PrivateKey().import_hex(pvtkey, pubkey) 37 | pk_other = prio.PublicKey().import_hex(pubkey_other) 38 | 39 | seed = prio.PRGSeed() 40 | seed.instance = shared_seed 41 | 42 | config = prio.Config(n_fields, pk, pk_other, batch_id) 43 | server = prio.Server(config, server_id, sk, seed) 44 | 45 | cache = {} 46 | 47 | async for message in queue: 48 | with message.process(): 49 | pid = message.message_id 50 | v, p1, p2 = cache.get(pid, (None, None, None)) 51 | 52 | def log(line): 53 | logger.info("Message {}: {}".format(pid, line)) 54 | 55 | ptype = message.type 56 | routing_key = "prio.{}".format(get_other_server(server_id)) 57 | 58 | if (ptype == "verify1" and not p1) or (ptype == "verify2" and not p2): 59 | log("Re-queuing message!") 60 | await channel.default_exchange.publish( 61 | aio_pika.Message( 62 | body=message.body, 63 | 
message_id=message.message_id, 64 | type=message.type, 65 | ), 66 | routing_key="prio.{}".format(server_id), 67 | ) 68 | elif ptype == "data": 69 | log("Generating verify packet 1") 70 | v = server.create_verifier(message.body) 71 | p1 = v.create_verify1() 72 | await channel.default_exchange.publish( 73 | aio_pika.Message( 74 | body=pickle.dumps(p1), 75 | message_id=message.message_id, 76 | type="verify1", 77 | ), 78 | routing_key=routing_key, 79 | ) 80 | elif ptype == "verify1": 81 | log("Generating verify packet 2") 82 | p2 = v.create_verify2(p1, pickle.loads(message.body)) 83 | await channel.default_exchange.publish( 84 | aio_pika.Message( 85 | body=pickle.dumps(p2), 86 | message_id=message.message_id, 87 | type="verify2", 88 | ), 89 | routing_key=routing_key, 90 | ) 91 | elif ptype == "verify2": 92 | if v.is_valid(p2, pickle.loads(message.body)): 93 | log("Aggregate data") 94 | server.aggregate(v) 95 | else: 96 | log("Invalid data") 97 | del cache[pid] 98 | else: 99 | log("Bad message type {}".format(ptype)) 100 | 101 | cache[pid] = (v, p1, p2) 102 | 103 | 104 | @click.command() 105 | @click.option("--pubkey", type=str) 106 | @click.option("--pvtkey", type=str) 107 | @click.option("--pubkey-other", type=str) 108 | @click.option("--server-id", type=click.Choice(["a", "b"]), required=True) 109 | @click.option("--n-fields", type=int, required=True) 110 | @click.option("--batch-id", type=str, default="test_batch") 111 | @PrioContext() 112 | def main(pubkey, pvtkey, pubkey_other, server_id, n_fields, batch_id): 113 | loop = asyncio.get_event_loop() 114 | server_id = prio.PRIO_SERVER_A if server_id == "a" else prio.PRIO_SERVER_B 115 | loop.run_until_complete( 116 | run_server( 117 | bytes(pubkey, "utf-8"), 118 | bytes(pvtkey, "utf-8"), 119 | bytes(pubkey_other, "utf-8"), 120 | server_id, 121 | n_fields, 122 | bytes(batch_id, "utf-8"), 123 | DEFAULT_SHARED_SEED, 124 | ) 125 | ) 126 | loop.run_forever() 127 | 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /examples/python-wrapper/README.md: -------------------------------------------------------------------------------- 1 | # Pythonic Wrapper Example Usage 2 | 3 | This example demonstrates usage of the python wrapper around the swig libprio functions. 4 | 5 | ## Running the example 6 | 7 | ```bash 8 | docker run -v $(pwd):/app -it prio:dev python3 main.py 9 | ``` 10 | 11 | Results in: 12 | 13 | ```bash 14 | 15 | [0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0] 16 | ``` 17 | -------------------------------------------------------------------------------- /examples/python-wrapper/main.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | from prio_processor.prio import wrapper as prio 6 | from prio import PrioContext 7 | import sys 8 | 9 | with PrioContext(): 10 | skA, pkA = prio.create_keypair() 11 | skB, pkB = prio.create_keypair() 12 | 13 | n_data = 133 14 | batch_id = b"test_batch" 15 | cfg = prio.Config(n_data, pkA, pkB, batch_id) 16 | 17 | server_secret = prio.PRGSeed() 18 | 19 | sA = prio.Server(cfg, prio.PRIO_SERVER_A, skA, server_secret) 20 | sB = prio.Server(cfg, prio.PRIO_SERVER_B, skB, server_secret) 21 | 22 | client = prio.Client(cfg) 23 | 24 | data_items = bytes([(i % 3 == 1) or (i % 5 == 1) for i in range(n_data)]) 25 | for_server_a, for_server_b = client.encode(data_items) 26 | 27 | # Setup verification 28 | vA = sA.create_verifier(for_server_a) 29 | vB = sB.create_verifier(for_server_b) 30 | 31 | # Produce a packet1 and send to the other party 32 | p1A = vA.create_verify1() 33 | p1B = vB.create_verify1() 34 | 35 | # Produce packet2 and send to the other party 36 | p2A = vA.create_verify2(p1A, p1B) 37 | p2B = vB.create_verify2(p1A, p1B) 38 | 39 | # Check validity of the request 40 | if not vA.is_valid(p2A, p2B): 41 | print("data for server A is not valid!") 42 | sys.exit(1) 43 | if not vB.is_valid(p2A, p2B): 44 | print("data for server A is not valid!") 45 | sys.exit(1) 46 | 47 | sA.aggregate(vA) 48 | sB.aggregate(vB) 49 | 50 | # Collect from many clients and share data 51 | tA = sA.total_shares() 52 | tB = sB.total_shares() 53 | 54 | output = prio.total_share_final(cfg, tA, tB) 55 | 56 | # check the output 57 | assert list(data_items) == list(output) 58 | print(f"{list(output)}") 59 | -------------------------------------------------------------------------------- /examples/swig-wrapper/README.md: -------------------------------------------------------------------------------- 1 | # SWIG-Wrapper Example Usage 2 | 3 | This example demonstrates usage of the generated wrapper around the libprio 4 | functions. The wrapper is no longer maintained within this repository and has 5 | moved to [the python wrapper of 6 | libprio](https://github.com/mozilla/libprio/tree/master/python). It is used 7 | heavily within the prio-processor, however. 8 | 9 | ## Running the example 10 | 11 | With docker, run from the current directory. 12 | 13 | ```bash 14 | docker run -v $(pwd):/app -it prio:dev python3 main.py 15 | ``` 16 | 17 | Result: 18 | 19 | ```bash 20 | 21 | [0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0] 22 | ``` 23 | -------------------------------------------------------------------------------- /examples/swig-wrapper/main.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | from prio.libprio import * 6 | from array import array 7 | 8 | Prio_init() 9 | skA, pkA = Keypair_new() 10 | skB, pkB = Keypair_new() 11 | 12 | n_data = 133 13 | batch_id = b"test_batch" 14 | cfg = PrioConfig_new(n_data, pkA, pkB, batch_id) 15 | 16 | server_secret = PrioPRGSeed_randomize() 17 | 18 | sA = PrioServer_new(cfg, PRIO_SERVER_A, skA, server_secret) 19 | sB = PrioServer_new(cfg, PRIO_SERVER_B, skB, server_secret) 20 | 21 | vA = PrioVerifier_new(sA) 22 | vB = PrioVerifier_new(sB) 23 | 24 | tA = PrioTotalShare_new() 25 | tB = PrioTotalShare_new() 26 | 27 | p1A = PrioPacketVerify1_new() 28 | p1B = PrioPacketVerify1_new() 29 | p2A = PrioPacketVerify2_new() 30 | p2B = PrioPacketVerify2_new() 31 | 32 | data_items = bytes([(i % 3 == 1) or (i % 5 == 1) for i in range(n_data)]) 33 | for_server_a, for_server_b = PrioClient_encode(cfg, data_items) 34 | 35 | # Setup verification 36 | PrioVerifier_set_data(vA, for_server_a) 37 | PrioVerifier_set_data(vB, for_server_b) 38 | 39 | # Produce a packet1 and send to the other party 40 | PrioPacketVerify1_set_data(p1A, vA) 41 | PrioPacketVerify1_set_data(p1B, vB) 42 | 43 | # Produce packet2 and send to the other party 44 | PrioPacketVerify2_set_data(p2A, vA, p1A, p1B) 45 | PrioPacketVerify2_set_data(p2B, vB, p1A, p1B) 46 | 47 | # Check validity of the request 48 | PrioVerifier_isValid(vA, p2A, p2B) 49 | PrioVerifier_isValid(vB, p2A, p2B) 50 | 51 | PrioServer_aggregate(sA, vA) 52 | PrioServer_aggregate(sB, vB) 53 | 54 | # Collect from many clients and share data 55 | PrioTotalShare_set_data(tA, sA) 56 | PrioTotalShare_set_data(tB, sB) 57 | 58 | output = PrioTotalShare_final(cfg, tA, tB) 59 | output = array("L", output) 60 | 61 | # check the output 62 | assert list(data_items) == list(output), "results do not match" 63 | print(f"{list(output)}") 64 | Prio_clear() 65 | -------------------------------------------------------------------------------- /google-cloud-sdk.repo: -------------------------------------------------------------------------------- 1 | [google-cloud-sdk] 2 | name=Google Cloud SDK 3 | baseurl=https://packages.cloud.google.com/yum/repos/cloud-sdk-el7-x86_64 4 | enabled=1 5 | gpgcheck=1 6 | repo_gpgcheck=1 7 | gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg 8 | https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg 9 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: prio-processor 2 | nav: 3 | - Home: README.md 4 | - Guide: guide.md 5 | - Airflow: airflow.md 6 | - Command Line Reference: cli-help.md 7 | - Code of Conduct: link/CODE_OF_CONDUCT.md 8 | theme: readthedocs 9 | -------------------------------------------------------------------------------- /notebooks/2020-08-25-cpu-time-by-n-data.csv: -------------------------------------------------------------------------------- 1 | server_id,sequence_id,n_data,2500,5000,7500,10000 2 | a,0,32,12.5,20.7,32.8,43.1 3 | a,0,64,17.0,38.0,54.7,74.4 4 | a,0,128,37.5,74.7,141.0,221.2 5 | a,1,32,12.5,20.8,30.5,39.5 6 | a,1,64,17.4,34.9,54.1,74.0 7 | a,1,128,38.9,76.3,142.0,224.5 8 | a,2,32,19.0,24.9,32.4,37.0 9 | a,2,64,23.9,36.3,47.4,132.7 10 | a,2,128,36.0,131.9,215.5,248.8 11 | a,3,32,3.6,3.4,3.6,3.6 12 | a,3,64,3.7,3.6,3.6,3.7 13 | a,3,128,3.6,3.7,3.6,3.9 14 | b,0,32,8.6,12.6,18.9,22.2 15 | b,0,64,13.5,23.7,32.6,41.3 16 | b,0,128,25.0,44.1,67.8,91.8 17 | b,1,32,8.4,12.6,17.5,22.3 18 | b,1,64,13.2,22.5,32.2,41.8 19 | 
b,1,128,25.3,43.8,67.0,91.4 20 | b,2,32,17.1,21.4,26.3,31.7 21 | b,2,64,21.8,31.6,39.7,48.6 22 | b,2,128,33.8,52.8,76.0,100.2 23 | b,3,32,3.5,3.5,3.5,3.5 24 | b,3,64,3.6,3.8,3.5,3.6 25 | b,3,128,3.7,3.8,3.4,3.5 26 | -------------------------------------------------------------------------------- /notebooks/2020-08-25-cpu-time-by-n-rows.csv: -------------------------------------------------------------------------------- 1 | server_id,sequence_id,n_rows,32,64,128 2 | a,0,2500,12.5,17.0,37.5 3 | a,0,5000,20.7,38.0,74.7 4 | a,0,7500,32.8,54.7,141.0 5 | a,0,10000,43.1,74.4,221.2 6 | a,1,2500,12.5,17.4,38.9 7 | a,1,5000,20.8,34.9,76.3 8 | a,1,7500,30.5,54.1,142.0 9 | a,1,10000,39.5,74.0,224.5 10 | a,2,2500,19.0,23.9,36.0 11 | a,2,5000,24.9,36.3,131.9 12 | a,2,7500,32.4,47.4,215.5 13 | a,2,10000,37.0,132.7,248.8 14 | a,3,2500,3.6,3.7,3.6 15 | a,3,5000,3.4,3.6,3.7 16 | a,3,7500,3.6,3.6,3.6 17 | a,3,10000,3.6,3.7,3.9 18 | b,0,2500,8.6,13.5,25.0 19 | b,0,5000,12.6,23.7,44.1 20 | b,0,7500,18.9,32.6,67.8 21 | b,0,10000,22.2,41.3,91.8 22 | b,1,2500,8.4,13.2,25.3 23 | b,1,5000,12.6,22.5,43.8 24 | b,1,7500,17.5,32.2,67.0 25 | b,1,10000,22.3,41.8,91.4 26 | b,2,2500,17.1,21.8,33.8 27 | b,2,5000,21.4,31.6,52.8 28 | b,2,7500,26.3,39.7,76.0 29 | b,2,10000,31.7,48.6,100.2 30 | b,3,2500,3.5,3.6,3.7 31 | b,3,5000,3.5,3.8,3.8 32 | b,3,7500,3.5,3.5,3.4 33 | b,3,10000,3.5,3.6,3.5 34 | -------------------------------------------------------------------------------- /prio_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/prio-processor/5bbb33ecd7a45480ffd5a47c677bfa60660166fe/prio_processor/__init__.py -------------------------------------------------------------------------------- /prio_processor/origin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/prio-processor/5bbb33ecd7a45480ffd5a47c677bfa60660166fe/prio_processor/origin/__init__.py -------------------------------------------------------------------------------- /prio_processor/origin/commands.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import click 3 | from . 
import staging, origins, indexing 4 | 5 | logging.basicConfig(level=logging.INFO) 6 | 7 | 8 | @click.group() 9 | def entry_point(): 10 | pass 11 | 12 | 13 | entry_point.add_command(staging.run, "staging") 14 | entry_point.add_command(origins.run, "fetch-origins") 15 | entry_point.add_command(indexing.run, "index") 16 | 17 | if __name__ == "__main__": 18 | entry_point() 19 | -------------------------------------------------------------------------------- /prio_processor/origin/indexing.py: -------------------------------------------------------------------------------- 1 | """Map Prio-aggregated data to their corresponding origins.""" 2 | import json 3 | 4 | import click 5 | from jsonschema import validate 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql.functions import explode, udf 8 | from pyspark.sql.types import ( 9 | ArrayType, 10 | IntegerType, 11 | StringType, 12 | StructField, 13 | StructType, 14 | ) 15 | 16 | 17 | def validate_origins(origins): 18 | schema = { 19 | "type": "array", 20 | "items": { 21 | "type": "object", 22 | "properties": { 23 | "name": {"type": "string"}, 24 | "hash": {"type": "string"}, 25 | "index": {"type": "integer", "minimum": 0}, 26 | }, 27 | }, 28 | } 29 | validate(instance=origins, schema=schema) 30 | 31 | 32 | def extract(spark, input): 33 | return spark.read.json(input) 34 | 35 | 36 | def transform(aggregates, config, origins): 37 | @udf( 38 | ArrayType( 39 | StructType( 40 | [ 41 | StructField("batch_id", StringType(), False), 42 | StructField("origin", StringType(), False), 43 | StructField("hash", StringType(), False), 44 | StructField("index", IntegerType(), False), 45 | StructField("aggregate", IntegerType(), False), 46 | ] 47 | ) 48 | ) 49 | ) 50 | def _apply_structure(batch_id, payload): 51 | """Create a user-defined function that maps partitioned batch-ids into 52 | list of structures containing the aggregate value and its metadata.""" 53 | 54 | # assumption: hyphens are used to define a partition of origins 55 | if batch_id not in [d["batch_id"] for d in config]: 56 | return [] 57 | 58 | # currently all batch-ids contain a single hyphen with 2 parts 59 | split = batch_id.split("-") 60 | assert len(split) == 2, "currently only supports batch-ids in 2 parts" 61 | batch_id = split[0] 62 | part_num = int(split[1]) 63 | 64 | # the offset is relative to the origins list 65 | if part_num == 0: 66 | offset = 0 67 | elif part_num == 1: 68 | # pick up where the last part left off 69 | d = [d for d in config if d["batch_id"] == f"{batch_id}-0"][0] 70 | offset = d["n_data"] 71 | else: 72 | # Hard-fail, this code path should not occur if the config file is 73 | # being properly maintained. 
74 | raise NotImplementedError("batch-id is split into more than 2 parts") 75 | 76 | result = [] 77 | for origin, aggregate in zip(origins[offset:], payload): 78 | row = (batch_id, origin["name"], origin["hash"], origin["index"], aggregate) 79 | result.append(row) 80 | return result 81 | 82 | return aggregates.withColumn( 83 | "indexed", explode(_apply_structure("batch_id", "payload")) 84 | ).select("id", "timestamp", "indexed.*") 85 | 86 | 87 | def load(df, output): 88 | df.repartition(1).write.mode("overwrite").json(output) 89 | 90 | 91 | @click.command() 92 | @click.option( 93 | "--input", type=str, required=True, help="location of the prio aggregated-data" 94 | ) 95 | @click.option( 96 | "--output", type=str, required=True, help="location of the resulting indexed data" 97 | ) 98 | @click.option( 99 | "--config", 100 | type=str, 101 | required=True, 102 | help="location of the whitelist of batch-ids and their sizes", 103 | ) 104 | @click.option( 105 | "--origins", type=str, required=True, help="JSON document with origins data" 106 | ) 107 | def run(input, output, config, origins): 108 | """Take the resulting Prio aggregates and map the indices to their original origins.""" 109 | spark = SparkSession.builder.getOrCreate() 110 | extracted = extract(spark, input) 111 | 112 | with open(config) as f: 113 | config_data = json.load(f) 114 | with open(origins) as f: 115 | origin_data = json.load(f) 116 | 117 | validate_origins(origin_data) 118 | 119 | transformed = transform(extracted, config_data, origin_data) 120 | load(transformed, output) 121 | transformed.show(truncate=False) 122 | 123 | 124 | if __name__ == "__main__": 125 | run() 126 | -------------------------------------------------------------------------------- /prio_processor/origin/origins.py: -------------------------------------------------------------------------------- 1 | import json 2 | import urllib.request 3 | from collections import namedtuple 4 | 5 | import click 6 | 7 | TELEMETRY_ORIGIN_DATA = "https://hg.mozilla.org/mozilla-central/raw-file/tip/toolkit/components/telemetry/core/TelemetryOriginData.inc" 8 | ORIGIN = namedtuple("Origin", ["name", "hash"]) 9 | 10 | 11 | def ignore(line): 12 | return not (line.startswith(b"//") or not line.strip()) 13 | 14 | 15 | def transform(index, origin): 16 | return {"name": origin.name, "hash": origin.hash, "index": index} 17 | 18 | 19 | @click.command() 20 | @click.option("--url", type=str, default=TELEMETRY_ORIGIN_DATA) 21 | @click.option("--output", type=click.File("w"), default="-") 22 | def run(url, output): 23 | """Fetch data about origins being collected by Firefox telemetry via Prio.""" 24 | resp = urllib.request.urlopen(url) 25 | parsed = map(eval, filter(ignore, resp.readlines())) 26 | data = [transform(idx, origin) for idx, origin in enumerate(parsed)] 27 | 28 | # in-band metadata about origin telemetry 29 | # https://searchfox.org/mozilla-central/rev/325c1a707819602feff736f129cb36055ba6d94f/toolkit/components/telemetry/core/TelemetryOrigin.cpp#145-149 30 | data.append( 31 | {"name": "__UNKNOWN__", "hash": "__UNKNOWN__", "index": data[-1]["index"] + 1} 32 | ) 33 | output.write(json.dumps(data, indent=2)) 34 | 35 | 36 | if __name__ == "__main__": 37 | run() 38 | -------------------------------------------------------------------------------- /prio_processor/prio/__init__.py: -------------------------------------------------------------------------------- 1 | import click 2 | from .commands import ( 3 | shared_seed, 4 | keygen, 5 | encode_shares, 6 | verify1, 7 | verify2, 8 | 
aggregate, 9 | publish, 10 | ) 11 | 12 | 13 | @click.group() 14 | def main(args=None): 15 | """Command line utility for prio.""" 16 | pass 17 | 18 | 19 | main.add_command(shared_seed) 20 | main.add_command(keygen) 21 | 22 | main.add_command(encode_shares) 23 | main.add_command(verify1) 24 | main.add_command(verify2) 25 | main.add_command(aggregate) 26 | main.add_command(publish) 27 | -------------------------------------------------------------------------------- /prio_processor/prio/options.py: -------------------------------------------------------------------------------- 1 | import click 2 | from .types import BYTE_STRING 3 | 4 | 5 | def apply_options(func, options): 6 | for option in options: 7 | func = option(func) 8 | return func 9 | 10 | 11 | def public_key(func): 12 | options = [ 13 | click.option( 14 | "--public-key-hex-internal", 15 | envvar="PUBLIC_KEY_HEX_INTERNAL", 16 | required=True, 17 | type=BYTE_STRING, 18 | help="The public key of the processing server as a hex string.", 19 | ), 20 | click.option( 21 | "--public-key-hex-external", 22 | envvar="PUBLIC_KEY_HEX_EXTERNAL", 23 | required=True, 24 | type=BYTE_STRING, 25 | help="The public key of the co-processing server as a hex string.", 26 | ), 27 | ] 28 | return apply_options(func, options) 29 | 30 | 31 | def server_config(func): 32 | options = [ 33 | click.option( 34 | "--server-id", 35 | envvar="SERVER_ID", 36 | required=True, 37 | type=click.Choice(["A", "B"]), 38 | help="The identifier for match.", 39 | ), 40 | click.option( 41 | "--private-key-hex", 42 | envvar="PRIVATE_KEY_HEX", 43 | required=True, 44 | type=BYTE_STRING, 45 | help="The private key of the processing server as a hex string.", 46 | ), 47 | click.option( 48 | "--shared-secret", 49 | envvar="SHARED_SECRET", 50 | required=True, 51 | type=BYTE_STRING, 52 | help="The shared server secret encoded in base64.", 53 | ), 54 | ] 55 | return apply_options(func, options) 56 | 57 | 58 | def output_1(func): 59 | options = [ 60 | click.option( 61 | "--output", 62 | envvar="OUTPUT", 63 | required=True, 64 | type=click.Path(file_okay=False), 65 | help="The path to the output directory.", 66 | ) 67 | ] 68 | return apply_options(func, options) 69 | 70 | 71 | def output_2(func): 72 | options = [ 73 | click.option( 74 | "--output-A", 75 | envvar="OUTPUT_A", 76 | required=True, 77 | type=click.Path(file_okay=False), 78 | help="The path to the input directory of server A.", 79 | ), 80 | click.option( 81 | "--output-B", 82 | envvar="OUTPUT_B", 83 | required=True, 84 | type=click.Path(file_okay=False), 85 | help="The path to the input directory of server B.", 86 | ), 87 | ] 88 | return apply_options(func, options) 89 | 90 | 91 | def input_1(func): 92 | options = [ 93 | click.option( 94 | "--input", 95 | envvar="INPUT", 96 | required=True, 97 | help="File containing shares from clients.", 98 | ) 99 | ] 100 | return apply_options(func, options) 101 | 102 | 103 | def input_2(func): 104 | options = [ 105 | click.option( 106 | "--input-internal", 107 | envvar="INPUT_INTERNAL", 108 | required=True, 109 | help="File containing data generated by the processing server.", 110 | ), 111 | click.option( 112 | "--input-external", 113 | envvar="INPUT_EXTERNAL", 114 | required=True, 115 | help="File containing data generated by the co-processing server.", 116 | ), 117 | ] 118 | return apply_options(func, options) 119 | 120 | 121 | def data_config(func): 122 | options = [ 123 | click.option( 124 | "--batch-id", 125 | envvar="BATCH_ID", 126 | required=True, 127 | type=BYTE_STRING, 128 | help="A 
shared batch identifier used as a validity check.", 129 | ), 130 | click.option( 131 | "--n-data", 132 | envvar="N_DATA", 133 | required=True, 134 | type=click.INT, 135 | help="The size of the input bit-vector.", 136 | ), 137 | ] 138 | return apply_options(func, options) 139 | -------------------------------------------------------------------------------- /prio_processor/prio/types.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | 4 | class ByteStringType(click.ParamType): 5 | name = "byte-string" 6 | 7 | def convert(self, value, param, ctx): 8 | try: 9 | return bytes(value, "utf-8") 10 | except: 11 | self.fail("{} cannot be encoded into a bytestring".format(value)) 12 | 13 | 14 | BYTE_STRING = ByteStringType() 15 | -------------------------------------------------------------------------------- /prio_processor/spark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/prio-processor/5bbb33ecd7a45480ffd5a47c677bfa60660166fe/prio_processor/spark/__init__.py -------------------------------------------------------------------------------- /requirements-dev.in: -------------------------------------------------------------------------------- 1 | -c requirements.txt 2 | pytest 3 | mkdocs 4 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile requirements-dev.in 6 | # 7 | attrs==19.3.0 8 | # via 9 | # -c requirements.txt 10 | # pytest 11 | click==7.1.2 12 | # via 13 | # -c requirements.txt 14 | # mkdocs 15 | # nltk 16 | future==0.18.2 17 | # via lunr 18 | iniconfig==1.0.1 19 | # via pytest 20 | jinja2==2.11.3 21 | # via mkdocs 22 | joblib==0.17.0 23 | # via nltk 24 | livereload==2.6.3 25 | # via mkdocs 26 | lunr[languages]==0.5.8 27 | # via mkdocs 28 | markdown==3.3.2 29 | # via mkdocs 30 | markupsafe==1.1.1 31 | # via jinja2 32 | mkdocs==1.1.2 33 | # via -r requirements-dev.in 34 | more-itertools==8.4.0 35 | # via pytest 36 | nltk==3.5 37 | # via lunr 38 | packaging==20.4 39 | # via pytest 40 | pluggy==0.13.1 41 | # via pytest 42 | py==1.10.0 43 | # via pytest 44 | pyparsing==2.4.7 45 | # via packaging 46 | pytest==6.0.1 47 | # via -r requirements-dev.in 48 | pyyaml==5.4 49 | # via mkdocs 50 | regex==2020.10.23 51 | # via nltk 52 | six==1.15.0 53 | # via 54 | # -c requirements.txt 55 | # livereload 56 | # lunr 57 | # packaging 58 | toml==0.10.1 59 | # via pytest 60 | tornado==6.0.4 61 | # via 62 | # livereload 63 | # mkdocs 64 | tqdm==4.50.2 65 | # via nltk 66 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile 6 | # 7 | attrs==19.3.0 8 | # via jsonschema 9 | click==7.1.2 10 | # via prio_processor (setup.py) 11 | jsonschema==3.2.0 12 | # via prio_processor (setup.py) 13 | numpy==1.19.1 14 | # via 15 | # pandas 16 | # pyarrow 17 | pandas==1.1.0 18 | # via 19 | # prio_processor (setup.py) 20 | # pyspark 21 | prio==1.1 22 | # via prio_processor (setup.py) 23 | py4j==0.10.9 24 | # via pyspark 25 | pyarrow==1.0.0 26 | # via pyspark 27 | pyrsistent==0.16.0 28 | # via jsonschema 29 | pyspark[sql]==3.1.1 
30 | # via prio_processor (setup.py) 31 | python-dateutil==2.8.1 32 | # via pandas 33 | pytz==2020.1 34 | # via pandas 35 | six==1.15.0 36 | # via 37 | # jsonschema 38 | # pyrsistent 39 | # python-dateutil 40 | 41 | # The following packages are considered to be unsafe in a requirements file: 42 | # setuptools 43 | -------------------------------------------------------------------------------- /scripts/copy-spark-config: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install the Spark configuration from this repository into the Spark home 4 | # directory of the active pyspark installation. It's strongly encouraged that 5 | # Spark is installed via pip in a virtual environment if this script is used 6 | # on a local machine. 7 | set -e 8 | 9 | # Find the directory of spark from the active python packages 10 | SPARK_HOME=$(python -c "import pyspark; print(pyspark.__path__[0])") 11 | cp -r config/spark "${SPARK_HOME}/conf" 12 | -------------------------------------------------------------------------------- /scripts/create-folder: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | 5 | dirname = os.path.dirname(os.path.dirname(__file__)) 6 | workdir = os.path.join(dirname, "working") 7 | 8 | paths = [ 9 | "raw/", 10 | "intermediate/internal/verify1", 11 | "intermediate/external/verify1", 12 | "intermediate/internal/verify2", 13 | "intermediate/external/verify2", 14 | "intermediate/internal/aggregate", 15 | "intermediate/external/aggregate", 16 | "processed/", 17 | ] 18 | 19 | for server in ["server_a", "server_b"]: 20 | for path in paths: 21 | p = os.path.join(workdir, server, path) 22 | os.makedirs(p, exist_ok=True) 23 | 24 | os.makedirs(os.path.join(workdir, "client"), exist_ok=True) 25 | -------------------------------------------------------------------------------- /scripts/download-mapping: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import urllib.request 4 | from collections import namedtuple 5 | import json 6 | 7 | ORIGIN = namedtuple("Origin", ["name", "hash"]) 8 | 9 | url = "https://hg.mozilla.org/mozilla-central/raw-file/tip/toolkit/components/telemetry/core/TelemetryOriginData.inc" 10 | resp = urllib.request.urlopen(url) 11 | 12 | 13 | def ignore(line): 14 | return not (line.startswith(b"//") or not line.strip()) 15 | 16 | 17 | data = map(eval, filter(ignore, resp.readlines())) 18 | 19 | origins = [datum.name for datum in data] 20 | 21 | print(json.dumps(origins)) 22 | -------------------------------------------------------------------------------- /scripts/print-cli-help: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Generate a markdown page to serve as markdown documentation and a diffing 4 | # mechanism between revisions of the CLI. 5 | # 6 | # Usage: 7 | # pip install . 
8 | # ./scripts/print-cli-help > docs/cli-help.md 9 | # 10 | 11 | set -euo pipefail 12 | 13 | function md_fence() { 14 | echo '```bash' 15 | echo "${1}" 16 | echo '```' 17 | } 18 | 19 | function command_help() { 20 | local cmd=$1 21 | echo "## ${cmd} help" 22 | echo "" 23 | md_fence "$($cmd --help)" 24 | 25 | commands=$($cmd --help | sed "1,/Commands:/d" | grep "^ \w" | awk '{print $1}') 26 | for command in ${commands}; do 27 | echo "" 28 | echo "### \`$cmd ${command}\`" 29 | echo "" 30 | md_fence "$($cmd "${command}" --help)" 31 | done 32 | } 33 | 34 | cat <= 1.1", 24 | "pandas", 25 | ], 26 | packages=find_packages(), 27 | ) 28 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/prio-processor/5bbb33ecd7a45480ffd5a47c677bfa60660166fe/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pyspark.sql import SparkSession 3 | 4 | 5 | @pytest.fixture(scope="session") 6 | def spark(): 7 | spark = SparkSession.builder.getOrCreate() 8 | spark.conf.set("spark.sql.session.timeZone", "UTC") 9 | yield spark 10 | spark.stop() 11 | -------------------------------------------------------------------------------- /tests/resources/cli/client/data.ndjson: -------------------------------------------------------------------------------- 1 | {"payload":[1,0,0,0,0]} 2 | {"payload":[1,1,0,0,0]} 3 | {"payload":[1,1,1,0,0]} 4 | {"payload":[1,1,1,1,0]} 5 | {"payload":[1,1,1,1,1]} 6 | -------------------------------------------------------------------------------- /tests/resources/cli/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_data": 5, 3 | "batch_id": "test" 4 | } 5 | -------------------------------------------------------------------------------- /tests/resources/cli/server_a/intermediate/external/aggregate/data.ndjson: -------------------------------------------------------------------------------- 1 | {"payload": "AZWrGOtHqPpMYsilUNWrUHHRIPRqMf3C69irUb2TwH9ZuJyv9emrLW156KXDwXGECByrCzCBU11jCIZxlYo=", "error": 0, "total": 5} -------------------------------------------------------------------------------- /tests/resources/cli/server_a/intermediate/external/verify1/data.ndjson: -------------------------------------------------------------------------------- 1 | {"id": "d991b404-339a-4782-8a00-5e663e64de34", "payload": "q3Gq1Z9fhPrky6G/q1lcoep+YuTc9XY4"} 2 | {"id": "0b6dcd73-6187-464a-a391-a700b2d35c46", "payload": "qysxFvmP/OQPfiLeqwNoitb2VhumnUPV"} 3 | {"id": "8b5e8453-3bd8-4779-b450-8281486d3729", "payload": "q3WPmeTtM3tzhRmhq32/mXkuXHzX0vgW"} 4 | {"id": "02c4a0f4-1920-495c-b2a4-5ac5f51555eb", "payload": "q38w7qT0uJCmUQspqzDzu2hC8Ui2QQjA"} 5 | {"id": "02e8ca60-39e2-4662-a84d-21621be82709", "payload": "q1e5dk6X0dwV/RNFqxGDScXW3vfmOXt2"} 6 | -------------------------------------------------------------------------------- /tests/resources/cli/server_a/intermediate/external/verify2/data.ndjson: -------------------------------------------------------------------------------- 1 | {"id": "d991b404-339a-4782-8a00-5e663e64de34", "payload": "qxcUiHwSIONlRevL"} 2 | {"id": "0b6dcd73-6187-464a-a391-a700b2d35c46", "payload": "q2vZI9UqN+ZVCb+4"} 3 | {"id": "8b5e8453-3bd8-4779-b450-8281486d3729", "payload": 
"qx3PyQDbzeCLb7i1"} 4 | {"id": "02c4a0f4-1920-495c-b2a4-5ac5f51555eb", "payload": "qxKUbFgwKJeRQiNE"} 5 | {"id": "02e8ca60-39e2-4662-a84d-21621be82709", "payload": "q1oqtTCt1wKDQ/cv"} 6 | -------------------------------------------------------------------------------- /tests/resources/cli/server_a/intermediate/internal/aggregate/data.ndjson: -------------------------------------------------------------------------------- 1 | {"payload": "AJWrZxS4VwWznTdirzGrL44u3wuVzgJFFC2rLkJsP4CmR2NYChurUpKGF1o8Po6D9+erdM9+rKKc93mWang=", "error": 0, "total": 5} -------------------------------------------------------------------------------- /tests/resources/cli/server_a/intermediate/internal/verify1/data.ndjson: -------------------------------------------------------------------------------- 1 | {"id": "d991b404-339a-4782-8a00-5e663e64de34", "payload": "q2efCH9Lx6BGCgnkqx++ThBNUdJnogXh"} 2 | {"id": "0b6dcd73-6187-464a-a391-a700b2d35c46", "payload": "qx9bgZ2V4sRcXOy8q2+0vgQxd3NJaozj"} 3 | {"id": "8b5e8453-3bd8-4779-b450-8281486d3729", "payload": "qzC4bEwyAJyYnNKQq13E6aZ46ptcR2vk"} 4 | {"id": "02c4a0f4-1920-495c-b2a4-5ac5f51555eb", "payload": "qyyVOWz/SLccQL3HqzkxRXViiwRRTTtO"} 5 | {"id": "02e8ca60-39e2-4662-a84d-21621be82709", "payload": "qwxXx19ZZDLNaSN2qzTBlIEvB65xChBP"} 6 | -------------------------------------------------------------------------------- /tests/resources/cli/server_a/intermediate/internal/verify2/data.ndjson: -------------------------------------------------------------------------------- 1 | {"id": "d991b404-339a-4782-8a00-5e663e64de34", "payload": "q2jrd4Pt3xyawhQ2"} 2 | {"id": "0b6dcd73-6187-464a-a391-a700b2d35c46", "payload": "qxQm3CrVyBmq/kBJ"} 3 | {"id": "8b5e8453-3bd8-4779-b450-8281486d3729", "payload": "q2IwNv8kMh90mEdM"} 4 | {"id": "02c4a0f4-1920-495c-b2a4-5ac5f51555eb", "payload": "q21rk6fP12huxdy9"} 5 | {"id": "02e8ca60-39e2-4662-a84d-21621be82709", "payload": "qyXVSs9SKP18xAjS"} 6 | -------------------------------------------------------------------------------- /tests/resources/cli/server_a/processed/data.ndjson: -------------------------------------------------------------------------------- 1 | {"id": "0f12dff4-d2a4-4d3c-b499-ca6da4c78cbe", "timestamp": "2020-08-03T20:06:46.920410", "payload": [5, 4, 3, 2, 1]} -------------------------------------------------------------------------------- /tests/resources/cli/server_a/raw/data.ndjson: -------------------------------------------------------------------------------- 1 | {"id": "d991b404-339a-4782-8a00-5e663e64de34", "payload": "2un4rKGCFhw2/CAntbW+JGYicF1Pm2v/qWTSQbOTCV52DaR7u1bQ15my/UE0rUrMaTPu27wpYIErxcxPPIhw1gwcYQUtA9nAZzH7ToHW1HOjVyDc0f6SrGQ8A59hIn2GI8uZLz2QA1hnHVsvNi8SLFN5IvqL7tj2F0El/EywteAkdHeTGuTO3SA9XvGNtI/40ZQoG/T80eyNYeg2Wj7vQQ1Sky1nSCOaEZwG/Hw+gmOp/CdTJlw5gVpwvUbEe+ymeV9f8qv7AsMGe6o3meL65y14FGWBERtxBiNMmGGZpGg/p/802JBtx9wDuIgjH1MK29Qa/mLHyA/VQDpnkLji2UdnCQRwUO71YsxskwTvaH9DdIl3XJvgmMKgYZG8YYbFsLD6fEZdrFw="} 2 | {"id": "0b6dcd73-6187-464a-a391-a700b2d35c46", "payload": "ORQ7tFgZ4+Ch+9gSbbwKYSl+t+z/mC2VGs+I+wiWm0T7fpOrKzLXzsq7HT48QI8fpw4cb9PVWKsNbmnmz1rJuo3PCCF8vx1e2hfuxPiYJCNq1fOdksEA9hAhMvSmpKbcfhSwanVIo2sGHOZzEHR7Fv4lLdchuISz9mc7keMta4V0R9s8JnLzJ5VI8DPhwaQb8LL4eGoGep+vWsev1U2SjAIQB+T/ZXOAxkhBopbalfGUq9Fxo/mirf6xMu90i9n5yi4I4wOjC0XeA7dm6TR8QWthCzuD9z/pnuDs6K6zKZqTp5DVcI0DHoBoP/jPaEir2KM8TtRRt3LObQusYosw/wW8ca/YWdqMquuiqNSLdKA8iz+GwPN7gAhcpuGve/6W7MYyu5J5uaY="} 3 | {"id": "8b5e8453-3bd8-4779-b450-8281486d3729", "payload": 
"ImLU5IMq/HClYUPcl/nHvS3Gr/LJZQGfFx/geq6UpRBB+RhEsXF7+C4dngojc2mjUP8+bguV41oG6fNrdmyxyI1EIODfiThbvTKLVej8K07X9YaXFOLerzJudQTUqRoCRhEuPuFpYQvyN0+na0vQQxsAWoc+K+qRndAo6DvLuO0AdcwmL3Y/+jeOwlVjxVUeLhVP3zd4IXYDxXb8ubf6eiH+P51JZomp+C9xD31nEpY6YXlyhLfZubnDQcUbKeSVgi2pqAehHireJN7rZFSepcpoyk8Zy3428IArF3mABDoaLVnxZ2Jj8sIYympDvSTiTSMnalWGve4/CBGbbCCI3MIj9JtgHfyZnAoR1hv1YqfrSxCVpAoHotw7kjaz+4ZXB7X/0cDDafs="} 4 | {"id": "02c4a0f4-1920-495c-b2a4-5ac5f51555eb", "payload": "p0m16Er/atF83mexCMQLrRp4kDsrVKo3NXdKCiBV53VFlUacFfU7Jk+SxBmAlk695vp6BWOrsfvAJ7pcNey+tQkXlADiDG3x8MrV/tDfrsv+ptaH3p7yW2pc4BVrOArih6OJkAyl4ssIyws2zEooeexNaRWGpditaWUsxOdQ2vShz4U2q4URYaq5iOIWqZrPatw48HeKxQ5xuA3oGUvsM9j1JiW7GlU2fGk9khNR3B89SCxYWs5nacq3MNJYZXCQz/XBn9zFlYSjTSuiy2bQRfC7UNwOTSaUZx+h/sQf86lOq+c0OVSygqFEIVv2nsAVNH8H9TSgnb1Z83C9YviBvzcYju9AFX+433nE6Bfa0QT09PVK+IrG+N2yR0+OnN4fP43+kjxsdJ8="} 5 | {"id": "02e8ca60-39e2-4662-a84d-21621be82709", "payload": "DUwgrSoMyJRlUs/2oKih7114jMgsbUqTzVjAlPo1XxQTuFs77y4wTNm8RAc+yt3oNjeZbbKeDwL8ui9tf5SdHdC/WUuT29LptSj5d9Xd/G7BjtbYZvwKl1GvhmV1E5s50pk7M0r7CPIGUq7X8iwUkBFU5vkrcCYpwBE+v1HEZO4pUGd0lxWjdDDjiLP+jUqpWpyYkaaVv8+wFRrUyIIMoNW80IPLeI5/bkxGnfJUR7T6Dq/PnyqJY/eYHiyBEZPrdBFFA4e5sckGSIaIxqDXfbu2HoU6OMhG6XL/iYTfWd8HO7Wzj7iCJ2Jqq96JZvaye3ecI8EM9Te2G81hYxpZXisX4K8e22DL/u5D8LIusi4KmS2tJE4f8BqaAPNYcEe/PGzmOjNXYq8="} 6 | -------------------------------------------------------------------------------- /tests/resources/cli/server_a_keys.json: -------------------------------------------------------------------------------- 1 | { 2 | "private_key": "BD6BB9BEB089DBA6B6A75B4455A615D577699F973FAD9E327A33D9528B5C7F64", 3 | "public_key": "E74E9CDD78258D9EEFAFA0CA2C08733F95AB7C4297DEEA3C3A63AC2053C45127" 4 | } 5 | -------------------------------------------------------------------------------- /tests/resources/cli/server_b/intermediate/external/aggregate/data.ndjson: -------------------------------------------------------------------------------- 1 | {"payload": "AJWrZxS4VwWznTdirzGrL44u3wuVzgJFFC2rLkJsP4CmR2NYChurUpKGF1o8Po6D9+erdM9+rKKc93mWang=", "error": 0, "total": 5} -------------------------------------------------------------------------------- /tests/resources/cli/server_b/intermediate/external/verify1/data.ndjson: -------------------------------------------------------------------------------- 1 | {"id": "d991b404-339a-4782-8a00-5e663e64de34", "payload": "q2efCH9Lx6BGCgnkqx++ThBNUdJnogXh"} 2 | {"id": "0b6dcd73-6187-464a-a391-a700b2d35c46", "payload": "qx9bgZ2V4sRcXOy8q2+0vgQxd3NJaozj"} 3 | {"id": "8b5e8453-3bd8-4779-b450-8281486d3729", "payload": "qzC4bEwyAJyYnNKQq13E6aZ46ptcR2vk"} 4 | {"id": "02c4a0f4-1920-495c-b2a4-5ac5f51555eb", "payload": "qyyVOWz/SLccQL3HqzkxRXViiwRRTTtO"} 5 | {"id": "02e8ca60-39e2-4662-a84d-21621be82709", "payload": "qwxXx19ZZDLNaSN2qzTBlIEvB65xChBP"} 6 | -------------------------------------------------------------------------------- /tests/resources/cli/server_b/intermediate/external/verify2/data.ndjson: -------------------------------------------------------------------------------- 1 | {"id": "d991b404-339a-4782-8a00-5e663e64de34", "payload": "q2jrd4Pt3xyawhQ2"} 2 | {"id": "0b6dcd73-6187-464a-a391-a700b2d35c46", "payload": "qxQm3CrVyBmq/kBJ"} 3 | {"id": "8b5e8453-3bd8-4779-b450-8281486d3729", "payload": "q2IwNv8kMh90mEdM"} 4 | {"id": "02c4a0f4-1920-495c-b2a4-5ac5f51555eb", "payload": "q21rk6fP12huxdy9"} 5 | {"id": "02e8ca60-39e2-4662-a84d-21621be82709", "payload": "qyXVSs9SKP18xAjS"} 6 | -------------------------------------------------------------------------------- 
/tests/resources/cli/server_b/intermediate/internal/aggregate/data.ndjson: -------------------------------------------------------------------------------- 1 | {"payload": "AZWrGOtHqPpMYsilUNWrUHHRIPRqMf3C69irUb2TwH9ZuJyv9emrLW156KXDwXGECByrCzCBU11jCIZxlYo=", "error": 0, "total": 5} -------------------------------------------------------------------------------- /tests/resources/cli/server_b/intermediate/internal/verify1/data.ndjson: -------------------------------------------------------------------------------- 1 | {"id": "d991b404-339a-4782-8a00-5e663e64de34", "payload": "q3Gq1Z9fhPrky6G/q1lcoep+YuTc9XY4"} 2 | {"id": "0b6dcd73-6187-464a-a391-a700b2d35c46", "payload": "qysxFvmP/OQPfiLeqwNoitb2VhumnUPV"} 3 | {"id": "8b5e8453-3bd8-4779-b450-8281486d3729", "payload": "q3WPmeTtM3tzhRmhq32/mXkuXHzX0vgW"} 4 | {"id": "02c4a0f4-1920-495c-b2a4-5ac5f51555eb", "payload": "q38w7qT0uJCmUQspqzDzu2hC8Ui2QQjA"} 5 | {"id": "02e8ca60-39e2-4662-a84d-21621be82709", "payload": "q1e5dk6X0dwV/RNFqxGDScXW3vfmOXt2"} 6 | -------------------------------------------------------------------------------- /tests/resources/cli/server_b/intermediate/internal/verify2/data.ndjson: -------------------------------------------------------------------------------- 1 | {"id": "d991b404-339a-4782-8a00-5e663e64de34", "payload": "qxcUiHwSIONlRevL"} 2 | {"id": "0b6dcd73-6187-464a-a391-a700b2d35c46", "payload": "q2vZI9UqN+ZVCb+4"} 3 | {"id": "8b5e8453-3bd8-4779-b450-8281486d3729", "payload": "qx3PyQDbzeCLb7i1"} 4 | {"id": "02c4a0f4-1920-495c-b2a4-5ac5f51555eb", "payload": "qxKUbFgwKJeRQiNE"} 5 | {"id": "02e8ca60-39e2-4662-a84d-21621be82709", "payload": "q1oqtTCt1wKDQ/cv"} 6 | -------------------------------------------------------------------------------- /tests/resources/cli/server_b/processed/data.ndjson: -------------------------------------------------------------------------------- 1 | {"id": "6fe8d132-e86b-4fb3-8bfb-00c7019137bb", "timestamp": "2020-08-03T20:06:47.196802", "payload": [5, 4, 3, 2, 1]} -------------------------------------------------------------------------------- /tests/resources/cli/server_b/raw/data.ndjson: -------------------------------------------------------------------------------- 1 | {"id": "d991b404-339a-4782-8a00-5e663e64de34", "payload": "dE35idVP3iHF/9comKagS1F9ePc1ReuHxsz12Ib74GwYNm5fA4Bj0ENmeCXzKVbhC4WVdxYp8wy+e32y1JjmY8UwYQaKZq7RPYdDupoMio0JZJ3gEsgefzqKHQXFy5tn7aqrlGp2MHo4VVDbVzzTa4f0osLvIXxsvmW2SjwY925eoIvX6H1zrs/EnMJXKJOEWbCKzEwrREoQ+W0="} 2 | {"id": "0b6dcd73-6187-464a-a391-a700b2d35c46", "payload": "F+BdUW5gM66hBd4Rr4iUl0unEhH0jwcc+6HzvNdNOS5JxsdXqZLhf2WeKQJrEoSakQpm5Qvpkrs6r2jSVMQ8ARGIXcb+/exOnCq/YwHxta7VsKwE5ZUqA1ULicnKPR0hN9z2gAR+CVqSYcFK7YZ6miFoQOopub6WslyFd+J05cg9S3ceNBL+bNfHRAbQkE6DRP5xnHjoXiR+66c="} 3 | {"id": "8b5e8453-3bd8-4779-b450-8281486d3729", "payload": "DXpSB+s2g/jAm1JnmFbiHXfQzs6LZC2WWneW6ueUwTvKY9nST6rwkcR8qF+7hzp5iN4FQA+RCGwTZIOUU7xAwNH0RA5T/KoSJU2sD2KBWBYykb+mfIHHEKt8Ufp1bQA8HhVU5PejEoYOirwPyl8JbvnF/6F0qvU00P4518Z2xugDaCbAU3LBNm98mmgMfpibGO70kp6w5NdBTYQ="} 4 | {"id": "02c4a0f4-1920-495c-b2a4-5ac5f51555eb", "payload": "WFIVqOgc1IuMwBG2xOdcWu1HcWcpZVoTKN/igsAGBj0ZZ95cl88O45qyny4eriZhhUq6I9IickOHIS2aTYRAyX4XHe5I7N42rwDnSriHAaNhdqTne/zgsvCtWgYSwzSPhJ1D80cLmfxD1ErBxToZomEr7YZUuZAOPJEBR6hw4qvHBFT+GaMvPQ5C71Irblj1AfCMpsDpKCigYVI="} 5 | {"id": "02e8ca60-39e2-4662-a84d-21621be82709", "payload": 
"ePka0xALUTfFXmQ4vAmoaxNGWNFE4Jo8Q2Vu6DY9Tmu6MjixXEXXGLZylJ7Gd133DzIbf3yjpKUqtORkIqpGvxtihmLO36P393lgCkxBM4gU/VcAr1IPhy9qjHTgzLgnI364NzPaE94/TqKLibmtsuD1tmXoGhr9hhcb+LRFZWOUjL5ndCfqx16/Fv2pBAqPsZmMcAxms5FDyzI="} 6 | -------------------------------------------------------------------------------- /tests/resources/cli/server_b_keys.json: -------------------------------------------------------------------------------- 1 | { 2 | "private_key": "766B14C6899560BD0B136043AA4817AFA0D5ECD0E17BE47896AD9F5F72C1862C", 3 | "public_key": "78D9E153651EFD04C07B95492F0485B743AA77013D8FC317DCAE33BACDC32D0A" 4 | } 5 | -------------------------------------------------------------------------------- /tests/resources/cli/shared_seed.json: -------------------------------------------------------------------------------- 1 | { 2 | "shared_seed": "nedME1QT1TS+7asOVOBqnA==" 3 | } 4 | -------------------------------------------------------------------------------- /tests/test_origin_indexing.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | from pathlib import Path 4 | from uuid import uuid4 5 | 6 | import pytest 7 | from click.testing import CliRunner 8 | from prio_processor.origin import indexing 9 | from pyspark.sql import Row 10 | 11 | 12 | @pytest.fixture() 13 | def config_path(): 14 | return Path(__file__).parent.parent / "config" 15 | 16 | 17 | @pytest.fixture() 18 | def origins_dict(config_path): 19 | path = config_path / "telemetry_origin_data_inc.json" 20 | with open(path) as f: 21 | return json.load(f) 22 | 23 | 24 | @pytest.fixture() 25 | def config(config_path): 26 | path = config_path / "content.json" 27 | with open(path) as f: 28 | return json.load(f) 29 | 30 | 31 | def test_origins_dict(origins_dict): 32 | indexing.validate_origins(origins_dict) 33 | assert sorted(origins_dict[0].keys()) == sorted(["name", "hash", "index"]) 34 | assert len(origins_dict) == origins_dict[-1]["index"] + 1 35 | 36 | 37 | def test_config(config): 38 | batch_id = "content.blocking_blocked-{index}" 39 | (part_0, part_1) = [ 40 | [d["n_data"] for d in config if d["batch_id"] == batch_id.format(index=i)][0] 41 | for i in (0, 1) 42 | ] 43 | assert part_0 == 2046 44 | assert part_1 == 441 45 | 46 | 47 | @pytest.fixture() 48 | def prio_aggregated_data(tmp_path, spark, config): 49 | """ 50 | ├── _SUCCESS 51 | ├── batch_id=content.blocking_blocked-0 52 | │ └── part-00000-45945db7-4b6d-4eef-9e6f-76f98a3aefd4.c000.json 53 | ├── batch_id=content.blocking_blocked-1 54 | │ └── part-00001-45945db7-4b6d-4eef-9e6f-76f98a3aefd4.c000.json 55 | ... 
56 | └── batch_id=content.blocking_storage_access_api_exempt_TESTONLY-1 57 | └── part-00011-45945db7-4b6d-4eef-9e6f-76f98a3aefd4.c000.json 58 | """ 59 | output = str(tmp_path / "data") 60 | rows = [] 61 | for d in config: 62 | batch_id = d["batch_id"] 63 | n_data = d["n_data"] 64 | # write data in such a way where each aggregate value matches to the 65 | # index value 66 | if int(batch_id.split("-")[1]) == 1: 67 | offset = 2046 68 | else: 69 | offset = 0 70 | datum = [offset + i for i in range(n_data)] 71 | row = Row( 72 | batch_id=batch_id, 73 | id=str(uuid4()), 74 | timestamp=datetime.utcnow().isoformat(), 75 | payload=datum, 76 | ) 77 | rows.append(row) 78 | df = spark.createDataFrame(rows) 79 | df.write.partitionBy("batch_id").json(output) 80 | return output 81 | 82 | 83 | def test_prio_aggregated_data_fixture(spark, prio_aggregated_data, config): 84 | df = spark.read.json(prio_aggregated_data) 85 | assert df.count() == len(config) 86 | 87 | 88 | def test_indexing_transform_unit(spark): 89 | whitelist = [ 90 | {"batch_id": "test-0", "n_data": 3}, 91 | {"batch_id": "test-1", "n_data": 2}, 92 | ] 93 | origins = [] 94 | for i, ch in enumerate("abcde"): 95 | origins.append({"name": ch, "hash": ch, "index": i}) 96 | 97 | def build_row(batch_id, payload): 98 | return Row( 99 | batch_id=batch_id, 100 | id=str(uuid4()), 101 | timestamp=datetime.utcnow().isoformat(), 102 | payload=payload, 103 | ) 104 | 105 | data = [build_row("test-0", [0, 1, 2]), build_row("test-1", [3, 4])] 106 | df = spark.createDataFrame(data) 107 | transformed = indexing.transform(df, whitelist, origins) 108 | assert transformed.count() == 5 109 | assert transformed.where("index <> aggregate").count() == 0 110 | 111 | with pytest.raises(Exception): 112 | whitelist["test-3"] = 1 113 | # `origins` doesn't need to be modified because transform should throw before then 114 | data.append(build_row("test-3", [5])) 115 | indexing.transform(spark.createDataFrame(data), whitelist, origins).count() 116 | 117 | 118 | def test_indexing_transform(spark, prio_aggregated_data, config, origins_dict): 119 | df = spark.read.json(prio_aggregated_data) 120 | transformed = indexing.transform(df, config, origins_dict) 121 | 122 | merged_batches = {} 123 | for d in config: 124 | batch_id = d["batch_id"] 125 | n_data = d["n_data"] 126 | key = batch_id.split("-")[0] 127 | merged_batches[key] = merged_batches.get(key, 0) + n_data 128 | 129 | assert transformed.select("batch_id").distinct().count() == len(merged_batches) 130 | assert transformed.count() == sum(merged_batches.values()) 131 | assert transformed.where("index <> aggregate").count() == 0 132 | 133 | 134 | def test_indexing_cli(spark, tmp_path, prio_aggregated_data, config_path): 135 | output = str(tmp_path / "output") 136 | runner = CliRunner() 137 | result = runner.invoke( 138 | indexing.run, 139 | [ 140 | "--input", 141 | prio_aggregated_data, 142 | "--output", 143 | output, 144 | "--config", 145 | str(config_path / "content.json"), 146 | "--origins", 147 | str(config_path / "telemetry_origin_data_inc.json"), 148 | ], 149 | catch_exceptions=False, 150 | ) 151 | assert result.exit_code == 0 152 | 153 | df = spark.read.json(output) 154 | assert df.count() > 0 155 | assert df.where("index <> aggregate").count() == 0 156 | -------------------------------------------------------------------------------- /tests/test_origin_origins.py: -------------------------------------------------------------------------------- 1 | import json 2 | import urllib.request 3 | from io import BytesIO 4 | 5 | 
import pytest 6 | from click.testing import CliRunner 7 | from prio_processor.origin import origins 8 | 9 | # First five origins from mozilla-central 10 | # https://hg.mozilla.org/mozilla-central/raw-file/tip/toolkit/components/telemetry/core/TelemetryOriginData.inc 11 | TELEMETRY_ORIGIN_DATA_INC = """ 12 | // dummy origin, this is used to be a counter of page loads. 13 | ORIGIN("PAGELOAD", "PAGELOAD") 14 | 15 | ORIGIN("advertstream.com", "lzPiT1FuoHNMKQ1Hw8AaTi68TokOB24ciFBqmCk62ek=") 16 | ORIGIN("kitaramedia.com", "r+U9PL3uMrjCKe8/T8goY9MHPA+6JckC3R+/1R9TQKA=") 17 | ORIGIN("questionmarket.com", "3KCO/qN+KmApmfH3RaXAmdR65Z/TRfrr6pds7aDKn1c=") 18 | ORIGIN("3lift.com", "33Xrix7c41Jc9q3InjMWHq+yKVoa/u2IB511kr4X+Ro=") 19 | """ 20 | 21 | 22 | @pytest.fixture() 23 | def mock_request(monkeypatch): 24 | def _mocked(*args, **kwargs): 25 | return BytesIO(TELEMETRY_ORIGIN_DATA_INC.encode()) 26 | 27 | monkeypatch.setattr(urllib.request, "urlopen", _mocked) 28 | 29 | 30 | def test_origins_cli(mock_request, tmp_path): 31 | output = str(tmp_path / "output") 32 | runner = CliRunner() 33 | result = runner.invoke( 34 | origins.run, ["--output", str(output)], catch_exceptions=False 35 | ) 36 | assert result.exit_code == 0 37 | 38 | with open(output) as f: 39 | data = json.load(f) 40 | 41 | assert len(data) == 6 42 | assert data[0] == {"name": "PAGELOAD", "hash": "PAGELOAD", "index": 0} 43 | assert data[-2] == { 44 | "name": "3lift.com", 45 | "hash": "33Xrix7c41Jc9q3InjMWHq+yKVoa/u2IB511kr4X+Ro=", 46 | "index": 4, 47 | } 48 | assert data[-1] == {"name": "__UNKNOWN__", "hash": "__UNKNOWN__", "index": 5} 49 | -------------------------------------------------------------------------------- /tests/test_prio_wrapper_client.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | import pytest 6 | from prio_processor.prio import wrapper as prio 7 | from prio import PrioContext, libprio 8 | 9 | 10 | @PrioContext() 11 | @pytest.mark.parametrize("n_clients", [1, 2, 10]) 12 | def test_client_agg(n_clients): 13 | seed = prio.PRGSeed() 14 | 15 | skA, pkA = prio.create_keypair() 16 | skB, pkB = prio.create_keypair() 17 | 18 | # the config is shared across all actors 19 | config = prio.Config(133, pkA, pkB, b"test_batch") 20 | 21 | sA = prio.Server(config, prio.PRIO_SERVER_A, skA, seed) 22 | sB = prio.Server(config, prio.PRIO_SERVER_B, skB, seed) 23 | 24 | client = prio.Client(config) 25 | 26 | n_data = config.num_data_fields() 27 | data_items = bytes([(i % 3 == 1) or (i % 5 == 1) for i in range(n_data)]) 28 | 29 | for i in range(n_clients): 30 | for_server_a, for_server_b = client.encode(data_items) 31 | 32 | # Setup verification 33 | vA = sA.create_verifier(for_server_a) 34 | vB = sB.create_verifier(for_server_b) 35 | 36 | # Produce a packet1 and send to the other party 37 | p1A = vA.create_verify1() 38 | p1B = vB.create_verify1() 39 | 40 | # Produce packet2 and send to the other party 41 | p2A = vA.create_verify2(p1A, p1B) 42 | p2B = vB.create_verify2(p1A, p1B) 43 | 44 | assert vA.is_valid(p2A, p2B) 45 | assert vB.is_valid(p2A, p2B) 46 | 47 | sA.aggregate(vA) 48 | sB.aggregate(vB) 49 | 50 | t_a = sA.total_shares() 51 | t_b = sB.total_shares() 52 | 53 | output = prio.total_share_final(config, t_a, t_b) 54 | 55 | expected = [item * n_clients for item in list(data_items)] 56 | assert list(output) == expected 57 | 58 | 59 | @PrioContext() 60 | def test_publickey_export(): 61 | raw_bytes = bytes((3 * x + 7) % 0xFF for x in range(libprio.CURVE25519_KEY_LEN)) 62 | pubkey = prio.PublicKey().import_bin(raw_bytes) 63 | raw_bytes2 = pubkey.export_bin() 64 | 65 | assert raw_bytes == raw_bytes2 66 | 67 | 68 | @PrioContext() 69 | @pytest.mark.parametrize( 70 | "hex_bytes", 71 | [ 72 | b"102030405060708090A0B0C0D0E0F00000FFEEDDCCBBAA998877665544332211", 73 | b"102030405060708090a0B0C0D0E0F00000FfeEddcCbBaa998877665544332211", 74 | ], 75 | ) 76 | def test_publickey_import_hex(hex_bytes): 77 | expect = bytes( 78 | [ 79 | 0x10, 80 | 0x20, 81 | 0x30, 82 | 0x40, 83 | 0x50, 84 | 0x60, 85 | 0x70, 86 | 0x80, 87 | 0x90, 88 | 0xA0, 89 | 0xB0, 90 | 0xC0, 91 | 0xD0, 92 | 0xE0, 93 | 0xF0, 94 | 0x00, 95 | 0x00, 96 | 0xFF, 97 | 0xEE, 98 | 0xDD, 99 | 0xCC, 100 | 0xBB, 101 | 0xAA, 102 | 0x99, 103 | 0x88, 104 | 0x77, 105 | 0x66, 106 | 0x55, 107 | 0x44, 108 | 0x33, 109 | 0x22, 110 | 0x11, 111 | ] 112 | ) 113 | 114 | pubkey = prio.PublicKey().import_hex(hex_bytes) 115 | raw_bytes = pubkey.export_bin() 116 | 117 | assert raw_bytes == expect 118 | 119 | 120 | @PrioContext() 121 | def test_publickey_import_hex_bad_length_raises_exception(): 122 | hex_bytes = b"102030405060708090A" 123 | pubkey = prio.PublicKey() 124 | with pytest.raises(RuntimeError): 125 | pubkey.import_hex(hex_bytes) 126 | 127 | 128 | @PrioContext() 129 | def test_publickey_export_hex(): 130 | # the output includes the null-byte 131 | expect = b"102030405060708090A0B0C0D0E0F00000FFEEDDCCBBAA998877665544332211" 132 | raw_bytes = bytes( 133 | [ 134 | 0x10, 135 | 0x20, 136 | 0x30, 137 | 0x40, 138 | 0x50, 139 | 0x60, 140 | 0x70, 141 | 0x80, 142 | 0x90, 143 | 0xA0, 144 | 0xB0, 145 | 0xC0, 146 | 0xD0, 147 | 0xE0, 148 | 0xF0, 149 | 0x00, 150 | 0x00, 151 | 0xFF, 152 | 0xEE, 153 | 0xDD, 154 | 0xCC, 155 | 0xBB, 156 | 0xAA, 157 | 0x99, 158 | 0x88, 159 | 0x77, 160 | 0x66, 161 | 0x55, 162 | 0x44, 163 | 0x33, 164 | 0x22, 165 | 0x11, 166 | ] 167 
| ) 168 | pubkey = prio.PublicKey().import_bin(raw_bytes) 169 | hex_bytes = pubkey.export_hex() 170 | assert bytes(hex_bytes) == expect 171 | 172 | 173 | @PrioContext() 174 | def test_publickey_export_missing_key(): 175 | pubkey = prio.PublicKey() 176 | assert pubkey.export_bin() is None 177 | assert pubkey.export_hex() is None 178 | 179 | 180 | @PrioContext() 181 | def test_privatekey(): 182 | pvtkey, pubkey = prio.create_keypair() 183 | pvtdata = pvtkey.export_bin() 184 | pubdata = pubkey.export_bin() 185 | new_pvtkey = prio.PrivateKey().import_bin(pvtdata, pubdata) 186 | assert pvtdata == new_pvtkey.export_bin() 187 | -------------------------------------------------------------------------------- /tests/test_prio_wrapper_serialize.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | import pickle 6 | import pytest 7 | from prio_processor.prio import wrapper as prio 8 | from prio import libprio 9 | 10 | 11 | @pytest.fixture(autouse=True) 12 | def init(): 13 | # Note: PrioContext breaks with the fixtures 14 | libprio.Prio_init() 15 | yield 16 | libprio.Prio_clear() 17 | 18 | 19 | @pytest.fixture 20 | def seed(): 21 | return prio.PRGSeed() 22 | 23 | 24 | @pytest.fixture 25 | def serverA_keypair(): 26 | return prio.create_keypair() 27 | 28 | 29 | @pytest.fixture 30 | def serverB_keypair(): 31 | return prio.create_keypair() 32 | 33 | 34 | @pytest.fixture 35 | def config(serverA_keypair, serverB_keypair): 36 | _, pkA = serverA_keypair 37 | _, pkB = serverB_keypair 38 | return prio.Config(133, pkA, pkB, b"test_batch") 39 | 40 | 41 | @pytest.fixture 42 | def serverA(seed, config, serverA_keypair): 43 | sk, _ = serverA_keypair 44 | return prio.Server(config, prio.PRIO_SERVER_A, sk, seed) 45 | 46 | 47 | @pytest.fixture 48 | def serverB(seed, config, serverB_keypair): 49 | sk, _ = serverB_keypair 50 | return prio.Server(config, prio.PRIO_SERVER_B, sk, seed) 51 | 52 | 53 | @pytest.fixture 54 | def client(config): 55 | return prio.Client(config) 56 | 57 | 58 | @pytest.mark.skip 59 | def test_serialize_verifier(config, client, serverA, serverB): 60 | n_data = config.num_data_fields() 61 | data_items = bytes([(i % 3 == 1) or (i % 5 == 1) for i in range(n_data)]) 62 | 63 | for_server_a, for_server_b = client.encode(data_items) 64 | 65 | vA = pickle.loads(pickle.dumps(serverA.create_verifier(for_server_a))) 66 | vB = serverB.create_verifier(for_server_b) 67 | 68 | p1A = vA.create_verify1() 69 | p1B = vB.create_verify1() 70 | 71 | p2A = vA.create_verify2(p1A, p1B) 72 | p2B = vB.create_verify2(p1A, p1B) 73 | 74 | assert vA.is_valid(p2A, p2B) 75 | assert vB.is_valid(p2A, p2B) 76 | 77 | 78 | def test_serialize_verify1(config, client, serverA, serverB): 79 | n_data = config.num_data_fields() 80 | data_items = bytes([(i % 3 == 1) or (i % 5 == 1) for i in range(n_data)]) 81 | 82 | for_server_a, for_server_b = client.encode(data_items) 83 | 84 | vA = serverA.create_verifier(for_server_a) 85 | vB = serverB.create_verifier(for_server_b) 86 | 87 | p1A = pickle.loads(pickle.dumps(vA.create_verify1())) 88 | p1B = vB.create_verify1() 89 | 90 | p2A = vA.create_verify2(p1A, p1B) 91 | p2B = vB.create_verify2(p1A, p1B) 92 | 93 | assert vA.is_valid(p2A, p2B) 94 | assert vB.is_valid(p2A, p2B) 95 | 96 | 97 | def test_serialize_verify2(config, client, serverA, serverB): 98 | 
n_data = config.num_data_fields() 99 | data_items = bytes([(i % 3 == 1) or (i % 5 == 1) for i in range(n_data)]) 100 | 101 | for_server_a, for_server_b = client.encode(data_items) 102 | 103 | vA = serverA.create_verifier(for_server_a) 104 | vB = serverB.create_verifier(for_server_b) 105 | 106 | p1A = vA.create_verify1() 107 | p1B = vB.create_verify1() 108 | 109 | p2A = pickle.loads(pickle.dumps(vA.create_verify2(p1A, p1B))) 110 | p2B = vB.create_verify2(p1A, p1B) 111 | 112 | assert vA.is_valid(p2A, p2B) 113 | assert vB.is_valid(p2A, p2B) 114 | 115 | 116 | def test_serialize_total_shares(config, client, serverA, serverB): 117 | n_data = config.num_data_fields() 118 | data_items = bytes([(i % 3 == 1) or (i % 5 == 1) for i in range(n_data)]) 119 | 120 | for_server_a, for_server_b = client.encode(data_items) 121 | 122 | vA = serverA.create_verifier(for_server_a) 123 | vB = serverB.create_verifier(for_server_b) 124 | 125 | p1A = vA.create_verify1() 126 | p1B = vB.create_verify1() 127 | 128 | p2A = vA.create_verify2(p1A, p1B) 129 | p2B = vB.create_verify2(p1A, p1B) 130 | 131 | assert vA.is_valid(p2A, p2B) 132 | assert vB.is_valid(p2A, p2B) 133 | 134 | serverA.aggregate(vA) 135 | serverB.aggregate(vB) 136 | 137 | t_a = pickle.loads(pickle.dumps(serverA.total_shares())) 138 | t_b = serverB.total_shares() 139 | output = prio.total_share_final(config, t_a, t_b) 140 | assert list(output) == list(data_items) 141 | --------------------------------------------------------------------------------
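A minimal usage sketch (not a file in this repository) showing how the origin CLI pieces exercised by the tests above fit together. The subcommand names (`fetch-origins`, `index`) and their options are taken from `prio_processor/origin/commands.py` and `prio_processor/origin/indexing.py` as listed in this section; the `working/` directories and `origins.json` path are hypothetical placeholders, and `python3 -m` is used only because the installed console-script name does not appear in this listing.

```bash
# Fetch the origin mapping used by Firefox telemetry and write it as JSON
# (the --url option defaults to the mozilla-central TelemetryOriginData.inc).
python3 -m prio_processor.origin.commands fetch-origins --output origins.json

# Map Prio aggregates back to origin names. Input/output paths are placeholders;
# the config whitelist path mirrors the one used by tests/test_origin_indexing.py.
python3 -m prio_processor.origin.commands index \
    --input working/aggregates \
    --output working/indexed \
    --config config/content.json \
    --origins origins.json
```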