├── .circleci └── config.yml ├── .dockerignore ├── .gitignore ├── .gitmodules ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── LICENSE ├── Makefile ├── NOTICE ├── README.md ├── bin ├── authenticate ├── cleanup ├── configure-mc ├── configure-spark-conf ├── dataproc ├── generate ├── insert └── process ├── config ├── content.json ├── spark │ ├── log4j.properties │ └── spark-defaults.conf.template ├── telemetry_origin_data_inc.json └── test-small.json ├── deployment ├── testing-v3 │ ├── .gitignore │ ├── LISTING.md │ ├── README.md │ ├── compose │ │ ├── admin │ │ │ ├── .env.template │ │ │ └── docker-compose.yml │ │ ├── server-a │ │ │ ├── .env.template │ │ │ └── docker-compose.yml │ │ └── server-b │ │ │ ├── .env.template │ │ │ └── docker-compose.yml │ ├── content.json │ ├── scripts │ │ ├── cleanup │ │ ├── generate-dotenv │ │ ├── generate-service-account-keys │ │ ├── integrate │ │ └── list-bucket │ └── terraform │ │ ├── .terraform.lock.hcl │ │ ├── main.tf │ │ └── modules │ │ ├── bucket-permissions │ │ └── main.tf │ │ └── bucket │ │ └── main.tf ├── testing-v4-gcloud-self │ ├── .gitignore │ ├── README.md │ ├── compose │ │ ├── ingest │ │ │ ├── .env.template │ │ │ └── docker-compose.yml │ │ ├── server-a │ │ │ ├── .env.template │ │ │ └── docker-compose.yml │ │ └── server-b │ │ │ ├── .env.template │ │ │ ├── bootstrap.sh │ │ │ ├── docker-compose.yml │ │ │ └── minio-config.json │ ├── content.json │ ├── scripts │ │ ├── build │ │ ├── cleanup │ │ ├── copy-minio-configuration │ │ ├── down │ │ ├── generate-dotenv │ │ ├── generate-minio-configuration │ │ ├── generate-service-account-keys │ │ └── integrate │ └── terraform │ │ ├── .terraform.lock.hcl │ │ ├── main.tf │ │ └── modules │ │ ├── bucket-permissions │ │ └── main.tf │ │ └── bucket │ │ └── main.tf └── testing-v4 │ ├── .gitignore │ ├── LISTING.md │ ├── README.md │ ├── compose │ ├── ingest │ │ ├── .env.template │ │ └── docker-compose.yml │ ├── server-a │ │ ├── .env.template │ │ └── docker-compose.yml │ └── server-b │ │ ├── .env.template │ │ └── docker-compose.yml │ ├── content.json │ ├── scripts │ ├── build │ ├── cleanup │ ├── generate-dotenv │ ├── generate-service-account-keys │ ├── integrate │ └── list-bucket │ └── terraform │ ├── .terraform.lock.hcl │ ├── main.tf │ └── modules │ ├── bucket-permissions │ └── main.tf │ └── bucket │ └── main.tf ├── docker-compose.yml ├── docs ├── README.md ├── airflow.md ├── cli-help.md ├── guide.md ├── images │ └── airflow-dag.png └── link │ └── CODE_OF_CONDUCT.md ├── examples ├── README.md ├── asyncio │ ├── README.md │ ├── dag.png │ └── main.py ├── batched-processing │ ├── Dockerfile │ ├── Makefile │ ├── README.md │ ├── docker-compose.yml │ ├── policy │ │ ├── server-a.json │ │ └── server-b.json │ └── scripts │ │ ├── bootstrap.sh │ │ ├── check-aggregates.sh │ │ ├── client.sh │ │ ├── integration.sh │ │ └── server.sh ├── benchmarks │ ├── README.md │ ├── client_encoding_time.png │ ├── encrypted_sizes.png │ ├── main.py │ ├── requirements.in │ └── requirements.txt ├── browser-validation │ ├── Dockerfile │ ├── Makefile │ ├── README.md │ ├── docker-compose.yml │ ├── generate.py │ └── main.py ├── docker-asyncio │ ├── Dockerfile │ ├── Makefile │ ├── README.md │ ├── client.py │ ├── docker-compose.yml │ └── server.py ├── python-wrapper │ ├── README.md │ └── main.py └── swig-wrapper │ ├── README.md │ └── main.py ├── google-cloud-sdk.repo ├── mkdocs.yml ├── notebooks ├── 2020-08-25-benchmarking-exploration.ipynb ├── 2020-08-25-cpu-time-by-n-data.csv ├── 2020-08-25-cpu-time-by-n-rows.csv └── 2020-11-05-benchmarking-results.ipynb ├── 
prio_processor ├── __init__.py ├── origin │ ├── __init__.py │ ├── commands.py │ ├── indexing.py │ ├── origins.py │ └── staging.py ├── prio │ ├── __init__.py │ ├── commands.py │ ├── options.py │ ├── types.py │ └── wrapper.py └── spark │ ├── __init__.py │ ├── commands.py │ └── udf.py ├── requirements-dev.in ├── requirements-dev.txt ├── requirements.txt ├── scripts ├── copy-spark-config ├── create-folder ├── download-mapping ├── print-cli-help ├── test-cli-integration ├── test-cli-integration-dataproc └── test-cli-integration-spark ├── setup.py └── tests ├── __init__.py ├── conftest.py ├── resources ├── cli │ ├── client │ │ └── data.ndjson │ ├── config.json │ ├── server_a │ │ ├── intermediate │ │ │ ├── external │ │ │ │ ├── aggregate │ │ │ │ │ └── data.ndjson │ │ │ │ ├── verify1 │ │ │ │ │ └── data.ndjson │ │ │ │ └── verify2 │ │ │ │ │ └── data.ndjson │ │ │ └── internal │ │ │ │ ├── aggregate │ │ │ │ └── data.ndjson │ │ │ │ ├── verify1 │ │ │ │ └── data.ndjson │ │ │ │ └── verify2 │ │ │ │ └── data.ndjson │ │ ├── processed │ │ │ └── data.ndjson │ │ └── raw │ │ │ └── data.ndjson │ ├── server_a_keys.json │ ├── server_b │ │ ├── intermediate │ │ │ ├── external │ │ │ │ ├── aggregate │ │ │ │ │ └── data.ndjson │ │ │ │ ├── verify1 │ │ │ │ │ └── data.ndjson │ │ │ │ └── verify2 │ │ │ │ │ └── data.ndjson │ │ │ └── internal │ │ │ │ ├── aggregate │ │ │ │ └── data.ndjson │ │ │ │ ├── verify1 │ │ │ │ └── data.ndjson │ │ │ │ └── verify2 │ │ │ │ └── data.ndjson │ │ ├── processed │ │ │ └── data.ndjson │ │ └── raw │ │ │ └── data.ndjson │ ├── server_b_keys.json │ └── shared_seed.json └── fx-69.0a1.json ├── test_origin_indexing.py ├── test_origin_origins.py ├── test_origin_staging.py ├── test_prio_commands.py ├── test_prio_commands_end_to_end.py ├── test_prio_wrapper_client.py ├── test_prio_wrapper_serialize.py ├── test_spark_commands.py └── test_spark_udf.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | # https://github.com/mozilla-services/Dockerflow/blob/master/.circleci/config.yml 6 | # DOCKERHUB_REPO - docker hub repo, format: / 7 | # DOCKER_USER - login info for docker hub 8 | # DOCKER_PASS 9 | 10 | version: 2 11 | jobs: 12 | build: 13 | docker: 14 | - image: docker:stable-git 15 | steps: 16 | - checkout 17 | - setup_remote_docker 18 | - run: 19 | name: Create a version.json 20 | command: | 21 | # create a version.json per https://github.com/mozilla-services/Dockerflow/blob/master/docs/version_object.md 22 | printf '{"commit":"%s","version":"%s","source":"https://github.com/%s/%s","build":"%s"}\n' \ 23 | "$CIRCLE_SHA1" \ 24 | "$CIRCLE_TAG" \ 25 | "$CIRCLE_PROJECT_USERNAME" \ 26 | "$CIRCLE_PROJECT_REPONAME" \ 27 | "$CIRCLE_BUILD_URL" > version.json 28 | - run: 29 | name: Build development image 30 | command: | 31 | docker build -t prio:latest . 
32 | - run: 33 | name: Save image into cache 34 | command: | 35 | docker save -o /tmp/latest.tar "prio:latest" 36 | - save_cache: 37 | key: v1-{{ .Branch }}-{{ epoch }} 38 | paths: 39 | - /tmp/latest.tar 40 | 41 | test: 42 | docker: 43 | - image: docker:stable-git 44 | steps: 45 | - setup_remote_docker 46 | - restore_cache: 47 | key: v1-{{ .Branch }} 48 | - run: 49 | name: Restore cache 50 | command: | 51 | docker load -i /tmp/latest.tar 52 | - run: 53 | name: Run the default tests 54 | command: docker run prio:latest 55 | 56 | test-batch-example: 57 | machine: true 58 | working_directory: ~/prio-processor/examples/batched-processing 59 | steps: 60 | - checkout: 61 | path: ~/prio-processor 62 | - restore_cache: 63 | key: v1-{{.Branch}} 64 | - run: 65 | name: Restore Docker image cache 66 | command: docker load -i /tmp/latest.tar 67 | - run: 68 | name: Build the compose container 69 | command: | 70 | # examples expect a prio:dev image 71 | docker tag prio:latest prio:dev 72 | docker-compose build 73 | - run: 74 | name: Test batched-processing integration with MinIO 75 | command: make test 76 | 77 | test-cli-integration-spark: 78 | docker: 79 | - image: docker:stable-git 80 | steps: 81 | - setup_remote_docker 82 | - restore_cache: 83 | key: v1-{{ .Branch }} 84 | - run: 85 | name: Restore cache 86 | command: | 87 | docker load -i /tmp/latest.tar 88 | - run: 89 | name: Run the default tests 90 | command: docker run prio:latest scripts/test-cli-integration-spark 91 | 92 | deploy: 93 | docker: 94 | - image: docker:stable-git 95 | steps: 96 | - checkout 97 | - setup_remote_docker 98 | - run: 99 | name: Create a version.json 100 | command: | 101 | # create a version.json per https://github.com/mozilla-services/Dockerflow/blob/master/docs/version_object.md 102 | printf '{"commit":"%s","version":"%s","source":"https://github.com/%s/%s","build":"%s"}\n' \ 103 | "$CIRCLE_SHA1" \ 104 | "$CIRCLE_TAG" \ 105 | "$CIRCLE_PROJECT_USERNAME" \ 106 | "$CIRCLE_PROJECT_REPONAME" \ 107 | "$CIRCLE_BUILD_URL" > version.json 108 | - restore_cache: 109 | key: v1-{{.Branch}} 110 | - run: 111 | name: Restore Docker image cache 112 | command: docker load -i /tmp/latest.tar 113 | - run: 114 | name: Rerun sanity checks before deploy 115 | command: docker run prio:latest 116 | - run: 117 | name: Deploy to Dockerhub 118 | command: | 119 | echo $DOCKER_PASS | docker login -u $DOCKER_USER --password-stdin 120 | # deploy main 121 | if [ "${CIRCLE_BRANCH}" == "main" ]; then 122 | docker tag prio:latest ${DOCKERHUB_REPO}:latest 123 | docker push ${DOCKERHUB_REPO}:latest 124 | elif [ ! -z "${CIRCLE_TAG}" ]; then 125 | # deploy a release tag... 
126 | echo "${DOCKERHUB_REPO}:${CIRCLE_TAG}" 127 | docker tag prio:latest "${DOCKERHUB_REPO}:${CIRCLE_TAG}" 128 | docker images 129 | docker push "${DOCKERHUB_REPO}:${CIRCLE_TAG}" 130 | fi 131 | 132 | workflows: 133 | version: 2 134 | build-test-deploy: 135 | jobs: 136 | - build: 137 | filters: 138 | tags: 139 | only: /.*/ 140 | - test: 141 | requires: 142 | - build 143 | filters: 144 | tags: 145 | only: /.*/ 146 | - test-cli-integration-spark: 147 | requires: 148 | - build 149 | filters: 150 | tags: 151 | only: /.*/ 152 | - test-batch-example: 153 | requires: 154 | - build 155 | filters: 156 | tags: 157 | only: /.*/ 158 | - deploy: 159 | requires: 160 | - build 161 | - test 162 | - test-cli-integration-spark 163 | filters: 164 | tags: 165 | only: /.*/ 166 | branches: 167 | only: main 168 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | **/*.pyc 2 | **/*.pyo 3 | **/__pycache__ 4 | *.so 5 | *.egg-info 6 | MANIFEST 7 | 8 | **/examples/*/Pipfile.lock 9 | 10 | .coverage 11 | .pytest_cache 12 | .vscode 13 | .tox 14 | .config/ 15 | .gsutil/ 16 | .ipynb_checkpoints/ 17 | .parallel 18 | .vscode 19 | .env 20 | 21 | data/ 22 | working/ 23 | **/build/ 24 | **/dist/ 25 | **/working/ 26 | **/venv/ 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/*.pyc 2 | **/*.pyo 3 | **/__pycache__ 4 | **/.ipynb_checkpoints 5 | *.so 6 | *.egg-info 7 | MANIFEST 8 | 9 | **/examples/*/Pipfile.lock 10 | 11 | .coverage 12 | .pytest_cache 13 | .vscode 14 | .env 15 | .tox 16 | 17 | build/ 18 | data/ 19 | dist/ 20 | working/ 21 | venv/ 22 | .mc 23 | .ash_history 24 | 25 | .terraform/ 26 | .bash_history 27 | 28 | # we only want to keep the template and generate the actual config at runtime, 29 | # don't accidentally check values that may have sensitive values into 30 | # source-control 31 | config/spark/spark-defaults.conf 32 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/prio-processor/5bbb33ecd7a45480ffd5a47c677bfa60660166fe/.gitmodules -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Community Participation Guidelines 2 | 3 | This repository is governed by Mozilla's code of conduct and etiquette 4 | guidelines. For more details, please read the [Mozilla Community Participation 5 | Guidelines](https://www.mozilla.org/about/governance/policies/participation/). 6 | 7 | ## How to Report 8 | 9 | For more information on how to report violations of the Community Participation 10 | Guidelines, please read our '[How to 11 | Report](https://www.mozilla.org/about/governance/policies/participation/reporting/)' 12 | page. 
13 | 14 | 21 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM centos:7 2 | LABEL maintainer="amiyaguchi@mozilla.com" 3 | 4 | ENV LANG en_US.utf8 5 | 6 | COPY ./google-cloud-sdk.repo /etc/yum.repos.d/ 7 | RUN yum install -y epel-release \ 8 | && yum install -y \ 9 | nss \ 10 | nspr \ 11 | msgpack \ 12 | python36 \ 13 | java-1.8.0-openjdk \ 14 | google-cloud-sdk \ 15 | rsync \ 16 | jq \ 17 | parallel \ 18 | which \ 19 | tree \ 20 | wget \ 21 | && yum clean all \ 22 | && rm -rf /var/cache/yum 23 | 24 | RUN gcloud config set disable_usage_reporting true 25 | 26 | RUN groupadd --gid 10001 app && \ 27 | useradd -g app --uid 10001 --shell /usr/sbin/nologin --create-home \ 28 | --home-dir /app app 29 | 30 | WORKDIR /app 31 | COPY requirements.txt requirements-dev.txt ./ 32 | 33 | ENV PATH="$PATH:~/.local/bin" 34 | RUN python3 -m ensurepip && \ 35 | pip3 install --upgrade pip wheel && \ 36 | pip3 install -r requirements.txt -r requirements-dev.txt 37 | 38 | ENV SPARK_HOME=/usr/local/lib/python3.6/site-packages/pyspark 39 | ENV PYSPARK_PYTHON=python3 40 | 41 | # Install libraries for interacting with cloud storage. We utilize the s3a 42 | # adaptor for cross-cloud compatibility, but use of the gcs connector may be 43 | # more performant when running directly in GCP. 44 | # https://cloud.google.com/dataproc/docs/concepts/connectors/cloud-storage 45 | RUN gsutil cp gs://hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar "${SPARK_HOME}/jars" 46 | RUN wget --directory-prefix $SPARK_HOME/jars/ https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.2.0/hadoop-aws-3.2.0.jar 47 | RUN wget --directory-prefix $SPARK_HOME/jars/ https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.375/aws-java-sdk-bundle-1.11.375.jar 48 | 49 | # Use the MinIO client for cross platform behavior, even with self-hosting 50 | RUN wget --directory-prefix /usr/local/bin https://dl.min.io/client/mc/release/linux-amd64/mc 51 | RUN chmod +x /usr/local/bin/mc 52 | 53 | ADD . /app 54 | 55 | # Symlink the spark config into SPARK_HOME so it can be updated via volume mounts 56 | RUN ln -s /app/config/spark ${SPARK_HOME}/conf 57 | 58 | # build the binary egg for distribution on Spark clusters 59 | RUN python3 setup.py bdist_egg && pip3 install -e . 60 | RUN chown -R app:app /app 61 | 62 | USER app 63 | CMD pytest -v tests && \ 64 | scripts/test-cli-integration && \ 65 | prio --help && \ 66 | prio-processor --help 67 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build clean test 2 | 3 | build: 4 | docker-compose build 5 | 6 | clean: 7 | docker-compose down 8 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Configuration under config/spark is derived from the Apache Spark project 2 | (https://github.com/apache/spark/tree/v3.0.1/conf). 
3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # prio-processor 2 | 3 | [![CircleCI](https://circleci.com/gh/mozilla/prio-processor.svg?style=svg)](https://circleci.com/gh/mozilla/prio-processor) 4 | 5 | Prio is a system for aggregating data in a privacy-preserving way. This 6 | repository includes a command-line tool for batch processing in Prio's 7 | multi-server architecture. 8 | 9 | For more information about Prio, see [this blog 10 | post](https://hacks.mozilla.org/2018/10/testing-privacy-preserving-telemetry-with-prio/). 11 | 12 | ## Docker 13 | 14 | This project contains a pre-configured build and test environment via Docker. 15 | 16 | ```bash 17 | make 18 | 19 | # or run directly through docker-compose 20 | docker-compose build 21 | ``` 22 | 23 | You can mount your working directory and shell into the container for 24 | development work. 25 | 26 | ```bash 27 | docker-compose run -v $PWD:/app prio_processor bash 28 | ``` 29 | 30 | ## Adding new dependencies 31 | 32 | To add new Python dependencies to the container, use `pip-tools` to manage the 33 | `requirements.txt`. 34 | 35 | ```bash 36 | pip install pip-tools 37 | 38 | # generate the installation requirements from setup.py 39 | pip-compile 40 | 41 | # generate dev requirements 42 | pip-compile requirements-dev.in 43 | ``` 44 | 45 | Any new system dependencies should be added to the `Dockerfile` at the root of 46 | the repository. These will be available at runtime. 47 | 48 | ## Deployment Configuration 49 | 50 | See the `deployment` directory for examples of configuration that can be used to 51 | aid deployment. These may also be run as integration tests to determine whether 52 | resources are configured properly. These will typically assume Google Cloud 53 | Platform (GCP) as a resource provider. 54 | 55 | See the [guide](docs/guide.md) for more details. 56 | -------------------------------------------------------------------------------- /bin/authenticate: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Authenticate against Google Cloud services via a service account if credentials exist, 4 | # otherwise log the assumption that the container is running on GCE. 5 | 6 | # ensure the variable is set, even if it's empty 7 | : "${GOOGLE_APPLICATION_CREDENTIALS:=}" 8 | 9 | if [[ -n "${GOOGLE_APPLICATION_CREDENTIALS}" ]]; then 10 | gcloud auth activate-service-account --key-file "${GOOGLE_APPLICATION_CREDENTIALS}" 11 | else 12 | # https://cloud.google.com/kubernetes-engine/docs/tutorials/authenticating-to-cloud-platform 13 | echo "No JSON credentials provided, using default scopes and project" 14 | fi 15 | -------------------------------------------------------------------------------- /bin/cleanup: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This Source Code Form is subject to the terms of the Mozilla Public 3 | # License, v. 2.0. If a copy of the MPL was not distributed with this 4 | # file, You can obtain one at https://mozilla.org/MPL/2.0/. 5 | 6 | set -euo pipefail 7 | set -x 8 | 9 | : "${BUCKET_INTERNAL_INGEST?}" 10 | : "${BUCKET_INTERNAL_PRIVATE?}" 11 | : "${BUCKET_INTERNAL_SHARED?}" 12 | 13 | echo "Running cleanup..." 
14 | 15 | "${BASH_SOURCE%/*}/configure-mc" 16 | mc stat "internal/${BUCKET_INTERNAL_INGEST}" 17 | mc stat "internal/${BUCKET_INTERNAL_PRIVATE}" 18 | mc stat "internal/${BUCKET_INTERNAL_SHARED}" 19 | (mc rm --recursive --force "internal/${BUCKET_INTERNAL_INGEST}" || echo "nothing to delete") 20 | (mc rm --recursive --force "internal/${BUCKET_INTERNAL_PRIVATE}" || echo "nothing to delete") 21 | (mc rm --recursive --force "internal/${BUCKET_INTERNAL_SHARED}" || echo "nothing to delete") 22 | -------------------------------------------------------------------------------- /bin/configure-mc: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Configure the MinIO command-line tool against the environment variables 3 | 4 | set -e 5 | # do *not* print commands here since they utilize sensitive environment variables 6 | set +x 7 | 8 | : "${BUCKET_INTERNAL_ACCESS_KEY?}" 9 | : "${BUCKET_INTERNAL_SECRET_KEY?}" 10 | : "${BUCKET_INTERNAL_ENDPOINT?}" 11 | : "${BUCKET_EXTERNAL_SECRET_KEY?}" 12 | : "${BUCKET_EXTERNAL_ACCESS_KEY?}" 13 | : "${BUCKET_EXTERNAL_ENDPOINT?}" 14 | 15 | mc alias set internal \ 16 | $BUCKET_INTERNAL_ENDPOINT \ 17 | $BUCKET_INTERNAL_ACCESS_KEY \ 18 | $BUCKET_INTERNAL_SECRET_KEY \ 19 | --api S3v4 20 | 21 | mc alias set external \ 22 | $BUCKET_EXTERNAL_ENDPOINT \ 23 | $BUCKET_EXTERNAL_ACCESS_KEY \ 24 | $BUCKET_EXTERNAL_SECRET_KEY \ 25 | --api S3v4 26 | -------------------------------------------------------------------------------- /bin/configure-spark-conf: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Configure the spark defaults for use with the s3a adapter. Other settings may 4 | # be configured here too. 5 | 6 | set -e 7 | set +x 8 | 9 | : "${BUCKET_INTERNAL_ACCESS_KEY?}" 10 | : "${BUCKET_INTERNAL_SECRET_KEY?}" 11 | : "${BUCKET_INTERNAL_ENDPOINT?}" 12 | 13 | # work from the parent directory 14 | cd "$(dirname "$0")/.." 15 | 16 | # note that this directory may be mounted, so we've added this file to the 17 | # .gitignore 18 | output=config/spark/spark-defaults.conf 19 | cp config/spark/spark-defaults.conf.template $output 20 | 21 | # append our configuration 22 | cat << EOF >> $output 23 | 24 | spark.hadoop.fs.s3a.access.key $BUCKET_INTERNAL_ACCESS_KEY 25 | spark.hadoop.fs.s3a.secret.key $BUCKET_INTERNAL_SECRET_KEY 26 | spark.hadoop.fs.s3a.endpoint $BUCKET_INTERNAL_ENDPOINT 27 | EOF 28 | -------------------------------------------------------------------------------- /bin/dataproc: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # A testing script for verifying the spark-bigquery connector with the existing 4 | # mozaggregator code. This requires `gcloud` to be configured to point at a 5 | # sandbox project for reading data from `payload_bytes_decoded`. 
6 | 7 | set -e 8 | 9 | REGION=${REGION:-us-west1} 10 | MACHINE_TYPE=${MACHINE_TYPE:-"n1-standard-4"} 11 | NUM_WORKERS=${NUM_WORKERS:-0} 12 | MODULE="prio_processor" 13 | SUBMODULE=${SUBMODULE:-"spark"} 14 | 15 | function bootstrap() { 16 | local bucket=${1?"bucket must be provided"} 17 | 18 | # create the initialization script and runner 19 | mkdir -p bootstrap 20 | 21 | # create the package artifacts 22 | rm -rf dist build 23 | python3 setup.py bdist_egg 24 | cp dist/${MODULE}*.egg bootstrap/${MODULE}.egg 25 | cp requirements.txt bootstrap/ 26 | tee bootstrap/install-python-requirements.sh >/dev/null </dev/null </dev/null < /dev/null ; then 58 | echo "creating dataset: ${dataset}" 59 | bq mk "${dataset}" 60 | fi 61 | 62 | bq load \ 63 | --source_format=NEWLINE_DELIMITED_JSON \ 64 | --autodetect \ 65 | --replace="${BQ_REPLACE}" \ 66 | "${dataset}.${table}" \ 67 | "${input}" 68 | 69 | bq query "select count(*) from ${dataset}.${table}" 70 | } 71 | 72 | function main() { 73 | data_in=$(mktemp -d -t data-XXX) 74 | data_out=$(mktemp -d -t data-XXX) 75 | 76 | "${BASH_SOURCE%/*}/authenticate" 77 | 78 | prefix=${BUCKET_PREFIX}/${PUBLIC_KEY_HEX_EXTERNAL}/${APP_NAME}/${SUBMISSION_DATE} 79 | gsutil -m cp -r "gs://${BUCKET_INTERNAL_PRIVATE}/${prefix}/processed/publish" "${data_in}" 80 | index "${data_in}" "${data_out}" 81 | 82 | insert "${data_out}"/*.json "${DATASET}" "${TABLE}" 83 | } 84 | 85 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then 86 | main "$@" 87 | fi 88 | -------------------------------------------------------------------------------- /config/content.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "batch_id": "content.blocking_blocked-0", 4 | "n_data": 2046 5 | }, 6 | { 7 | "batch_id": "content.blocking_blocked-1", 8 | "n_data": 441 9 | }, 10 | { 11 | "batch_id": "content.blocking_blocked_TESTONLY-0", 12 | "n_data": 2046 13 | }, 14 | { 15 | "batch_id": "content.blocking_blocked_TESTONLY-1", 16 | "n_data": 441 17 | }, 18 | { 19 | "batch_id": "content.blocking_opener_after_user_interaction_exempt-0", 20 | "n_data": 2046 21 | }, 22 | { 23 | "batch_id": "content.blocking_opener_after_user_interaction_exempt-1", 24 | "n_data": 441 25 | }, 26 | { 27 | "batch_id": "content.blocking_opener_after_user_interaction_exempt_TESTONLY-0", 28 | "n_data": 2046 29 | }, 30 | { 31 | "batch_id": "content.blocking_opener_after_user_interaction_exempt_TESTONLY-1", 32 | "n_data": 441 33 | }, 34 | { 35 | "batch_id": "content.blocking_opener_exempt-0", 36 | "n_data": 2046 37 | }, 38 | { 39 | "batch_id": "content.blocking_opener_exempt-1", 40 | "n_data": 441 41 | }, 42 | { 43 | "batch_id": "content.blocking_opener_exempt_TESTONLY-0", 44 | "n_data": 2046 45 | }, 46 | { 47 | "batch_id": "content.blocking_opener_exempt_TESTONLY-1", 48 | "n_data": 441 49 | }, 50 | { 51 | "batch_id": "content.blocking_storage_access_api_exempt-0", 52 | "n_data": 2046 53 | }, 54 | { 55 | "batch_id": "content.blocking_storage_access_api_exempt-1", 56 | "n_data": 441 57 | }, 58 | { 59 | "batch_id": "content.blocking_storage_access_api_exempt_TESTONLY-0", 60 | "n_data": 2046 61 | }, 62 | { 63 | "batch_id": "content.blocking_storage_access_api_exempt_TESTONLY-1", 64 | "n_data": 441 65 | } 66 | ] 67 | -------------------------------------------------------------------------------- /config/spark/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor 
license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the console, but reduce verbosity 19 | log4j.rootCategory=WARN, console 20 | log4j.appender.console=org.apache.log4j.ConsoleAppender 21 | log4j.appender.console.target=System.err 22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 24 | 25 | # Set the default spark-shell log level to WARN. When running the spark-shell, the 26 | # log level for this class is used to overwrite the root logger's log level, so that 27 | # the user can have different defaults for the shell and regular Spark apps. 28 | log4j.logger.org.apache.spark.repl.Main=WARN 29 | 30 | # Settings to quiet third party logs that are too verbose 31 | log4j.logger.org.sparkproject.jetty=WARN 32 | log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR 33 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 34 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 35 | log4j.logger.org.apache.parquet=ERROR 36 | log4j.logger.parquet=ERROR 37 | 38 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 39 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 40 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR -------------------------------------------------------------------------------- /config/spark/spark-defaults.conf.template: -------------------------------------------------------------------------------- 1 | # Default parameters for spark while using minio. We'll also assume that the 2 | # performance is generally acceptable for GCS too using the same connector. 
# see https://docs.min.io/docs/disaggregated-spark-and-hadoop-hive-with-minio.html 4 | 5 | # Ensure these values are set before running spark 6 | # spark.hadoop.fs.s3a.access.key 7 | # spark.hadoop.fs.s3a.secret.key 8 | # spark.hadoop.fs.s3a.endpoint http://minio:9000 9 | 10 | spark.hadoop.fs.s3a.path.style.access true 11 | spark.hadoop.fs.s3a.block.size 512M 12 | spark.hadoop.fs.s3a.buffer.dir ${hadoop.tmp.dir}/s3a 13 | spark.hadoop.fs.s3a.committer.magic.enabled false 14 | spark.hadoop.fs.s3a.committer.name directory 15 | spark.hadoop.fs.s3a.committer.staging.abort.pending.uploads true 16 | spark.hadoop.fs.s3a.committer.staging.conflict-mode append 17 | spark.hadoop.fs.s3a.committer.staging.tmp.path /tmp/staging 18 | spark.hadoop.fs.s3a.committer.staging.unique-filenames true 19 | # number of threads writing to MinIO 20 | spark.hadoop.fs.s3a.committer.threads 2048 21 | spark.hadoop.fs.s3a.connection.establish.timeout 5000 22 | # maximum number of concurrent connections 23 | spark.hadoop.fs.s3a.connection.maximum 8192 24 | spark.hadoop.fs.s3a.connection.ssl.enabled false 25 | spark.hadoop.fs.s3a.connection.timeout 200000 26 | # number of parallel uploads 27 | spark.hadoop.fs.s3a.fast.upload.active.blocks 2048 28 | # use disk as the buffer for uploads 29 | spark.hadoop.fs.s3a.fast.upload.buffer disk 30 | spark.hadoop.fs.s3a.fast.upload true 31 | # maximum number of parallel tasks 32 | spark.hadoop.fs.s3a.max.total.tasks 2048 33 | # socket buffering hints 34 | spark.hadoop.fs.s3a.socket.recv.buffer 65536 35 | spark.hadoop.fs.s3a.socket.send.buffer 65536 36 | # maximum number of threads for S3A 37 | spark.hadoop.fs.s3a.threads.max 2048 38 | 39 | # add the progress bar to update the console for Airflow timeouts (relevant 40 | # when running with the KubernetesPodOperator in Airflow: 41 | # https://github.com/mozilla/telemetry-airflow/issues/844). 42 | spark.ui.showConsoleProgress true 43 | -------------------------------------------------------------------------------- /config/test-small.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "batch_id": "test-0", 4 | "n_data": 10 5 | }, 6 | { 7 | "batch_id": "test-1", 8 | "n_data": 20 9 | }, 10 | { 11 | "batch_id": "test-2", 12 | "n_data": 3 13 | } 14 | ] 15 | -------------------------------------------------------------------------------- /deployment/testing-v3/.gitignore: -------------------------------------------------------------------------------- 1 | .service-account-keys/ -------------------------------------------------------------------------------- /deployment/testing-v3/README.md: -------------------------------------------------------------------------------- 1 | # Testing configuration for v3 containers 2 | 3 | This directory contains terraform configuration to bring up the relevant resources for 4 | an integration test of the prio-processor v3.x containers. 5 | 6 | To create a new project that uses the same configuration, change the terraform 7 | backend appropriately. Here, the state is placed into a storage bucket that has 8 | been created beforehand. Ensure the project has also been created. Then: 9 | 10 | ```bash 11 | cd terraform 12 | 13 | # if you're choosing a different project or changing any modules 14 | terraform init 15 | 16 | # apply any changes 17 | terraform apply 18 | ``` 19 | 20 | To configure the tests: 21 | 22 | ```bash 23 | # There is a maximum of 10 keys per service account. This script doesn't 24 | # handle key rotations, so disable old keys as necessary. 
25 | scripts/generate-service-account-keys 26 | 27 | # generate new keys (or alternatively copy .env.template files to their .env locations) 28 | scripts/generate-dotenv 29 | ``` 30 | 31 | The above commands only need to be run once. To run the tests: 32 | 33 | ```bash 34 | # run the integration script 35 | scripts/integrate 36 | 37 | # clean up the buckets 38 | scripts/cleanup 39 | ``` 40 | -------------------------------------------------------------------------------- /deployment/testing-v3/compose/admin/.env.template: -------------------------------------------------------------------------------- 1 | # This configuration is generated by scripts/generate-dotenv. Do not check in 2 | # manually edited values into source control. 3 | 4 | APP_NAME=test-app 5 | DATA_CONFIG=/app/config/content.json 6 | ORIGIN_CONFIG=/app/config/telemetry_origin_data_inc.json 7 | BUCKET_PREFIX=test-app/v1 8 | 9 | # relative to the docker-compose file 10 | GOOGLE_APPLICATION_CREDENTIALS="../../.service-account-keys/service-account-admin-private-key.json" 11 | 12 | PUBLIC_KEY_HEX_INTERNAL=1723D28D922FD045A3EBECE5FA9BBD67DF28B60B5666203DA06E0CE296D7DF11 13 | BUCKET_INTERNAL_PRIVATE=gs://a-private-a82843a795cf9ef5 14 | 15 | PUBLIC_KEY_HEX_EXTERNAL=91D65A37411C70A7E86070659EACEEED5C01CF57656AE922BD456AD79EAE9E3B 16 | BUCKET_EXTERNAL_PRIVATE=gs://b-private-a82843a795cf9ef5 17 | 18 | DATASET=telemetry 19 | TABLE=content_blocking 20 | BQ_REPLACE=true 21 | CLOUDSDK_CORE_PROJECT=amiyaguchi-prio-processor-v3 22 | -------------------------------------------------------------------------------- /deployment/testing-v3/compose/admin/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.4" 2 | 3 | services: 4 | app: 5 | image: mozilla/prio-processor:v3.1.1 6 | command: "true" 7 | volumes: 8 | - ${GOOGLE_APPLICATION_CREDENTIALS?"missing credentials"}:/tmp/.credentials 9 | - ../../content.json:/app/config/content.json 10 | environment: 11 | - GOOGLE_APPLICATION_CREDENTIALS=/tmp/.credentials 12 | - APP_NAME 13 | - DATA_CONFIG 14 | - ORIGIN_CONFIG 15 | - PUBLIC_KEY_HEX_INTERNAL 16 | - PUBLIC_KEY_HEX_EXTERNAL 17 | - BUCKET_INTERNAL_PRIVATE 18 | - BUCKET_EXTERNAL_PRIVATE 19 | - BUCKET_PREFIX 20 | - DATASET 21 | - TABLE 22 | - BQ_REPLACE 23 | - CLOUDSDK_CORE_PROJECT 24 | - SUBMISSION_DATE 25 | -------------------------------------------------------------------------------- /deployment/testing-v3/compose/server-a/.env.template: -------------------------------------------------------------------------------- 1 | # This configuration is generated by scripts/generate-dotenv. Do not check in 2 | # manually edited values into source control. 
3 | 4 | APP_NAME=test-app 5 | DATA_CONFIG=/app/config/content.json 6 | BUCKET_PREFIX=test-app/v1 7 | SERVER_ID=A 8 | SHARED_SECRET=FxuW0JdQWtZruGijAsaKCw== 9 | 10 | # relative to the docker-compose file 11 | GOOGLE_APPLICATION_CREDENTIALS="../../.service-account-keys/service-account-a-private-key.json" 12 | 13 | PRIVATE_KEY_HEX=181DC4D11ECF21F08EFA21DE79CF602C89FF6B96AB2A1BBD1EBB5FFF4AC51259 14 | PUBLIC_KEY_HEX_INTERNAL=1723D28D922FD045A3EBECE5FA9BBD67DF28B60B5666203DA06E0CE296D7DF11 15 | BUCKET_INTERNAL_PRIVATE=gs://a-private-a82843a795cf9ef5 16 | BUCKET_INTERNAL_SHARED=gs://a-shared-a82843a795cf9ef5 17 | 18 | PUBLIC_KEY_HEX_EXTERNAL=91D65A37411C70A7E86070659EACEEED5C01CF57656AE922BD456AD79EAE9E3B 19 | BUCKET_EXTERNAL_SHARED=gs://b-shared-a82843a795cf9ef5 20 | -------------------------------------------------------------------------------- /deployment/testing-v3/compose/server-a/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.4" 2 | 3 | services: 4 | app: 5 | image: mozilla/prio-processor:v3.1.1 6 | working_dir: /app 7 | command: bin/process 8 | volumes: 9 | - ${GOOGLE_APPLICATION_CREDENTIALS?"missing credentials"}:/tmp/.credentials 10 | - ../../content.json:/app/config/content.json 11 | environment: 12 | - GOOGLE_APPLICATION_CREDENTIALS=/tmp/.credentials 13 | - APP_NAME 14 | - DATA_CONFIG 15 | - SERVER_ID 16 | - SHARED_SECRET 17 | - PRIVATE_KEY_HEX 18 | - PUBLIC_KEY_HEX_INTERNAL 19 | - PUBLIC_KEY_HEX_EXTERNAL 20 | - BUCKET_INTERNAL_PRIVATE 21 | - BUCKET_INTERNAL_SHARED 22 | - BUCKET_EXTERNAL_SHARED 23 | - BUCKET_PREFIX 24 | - SUBMISSION_DATE 25 | -------------------------------------------------------------------------------- /deployment/testing-v3/compose/server-b/.env.template: -------------------------------------------------------------------------------- 1 | # This configuration is generated by scripts/generate-dotenv. Do not check in 2 | # manually edited values into source control. 
3 | 4 | APP_NAME=test-app 5 | DATA_CONFIG=/app/config/content.json 6 | BUCKET_PREFIX=test-app/v1 7 | SERVER_ID=B 8 | SHARED_SECRET=FxuW0JdQWtZruGijAsaKCw== 9 | 10 | # relative to the docker-compose file 11 | GOOGLE_APPLICATION_CREDENTIALS="../../.service-account-keys/service-account-b-private-key.json" 12 | 13 | PRIVATE_KEY_HEX=6C0213B9319234BC81E166E8592739B0311D86EE1BE8391E2F773F930C1991C3 14 | PUBLIC_KEY_HEX_INTERNAL=91D65A37411C70A7E86070659EACEEED5C01CF57656AE922BD456AD79EAE9E3B 15 | BUCKET_INTERNAL_PRIVATE=gs://b-private-a82843a795cf9ef5 16 | BUCKET_INTERNAL_SHARED=gs://b-shared-a82843a795cf9ef5 17 | 18 | PUBLIC_KEY_HEX_EXTERNAL=1723D28D922FD045A3EBECE5FA9BBD67DF28B60B5666203DA06E0CE296D7DF11 19 | BUCKET_EXTERNAL_SHARED=gs://a-shared-a82843a795cf9ef5 20 | -------------------------------------------------------------------------------- /deployment/testing-v3/compose/server-b/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.4" 2 | 3 | services: 4 | app: 5 | image: mozilla/prio-processor:v3.1.1 6 | working_dir: /app 7 | command: bin/process 8 | volumes: 9 | - ${GOOGLE_APPLICATION_CREDENTIALS?"missing credentials"}:/tmp/.credentials 10 | - ../../content.json:/app/config/content.json 11 | environment: 12 | - GOOGLE_APPLICATION_CREDENTIALS=/tmp/.credentials 13 | - APP_NAME 14 | - DATA_CONFIG 15 | - SERVER_ID 16 | - SHARED_SECRET 17 | - PRIVATE_KEY_HEX 18 | - PUBLIC_KEY_HEX_INTERNAL 19 | - PUBLIC_KEY_HEX_EXTERNAL 20 | - BUCKET_INTERNAL_PRIVATE 21 | - BUCKET_INTERNAL_SHARED 22 | - BUCKET_EXTERNAL_SHARED 23 | - BUCKET_PREFIX 24 | - SUBMISSION_DATE 25 | -------------------------------------------------------------------------------- /deployment/testing-v3/content.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "batch_id": "content.blocking_blocked_TESTONLY-0", 4 | "n_data": 2046 5 | }, 6 | { 7 | "batch_id": "content.blocking_blocked_TESTONLY-1", 8 | "n_data": 441 9 | } 10 | ] 11 | -------------------------------------------------------------------------------- /deployment/testing-v3/scripts/cleanup: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Delegate cleanup of buckets to the appropriate service account 4 | 5 | set -euo pipefail 6 | cd "$(dirname "$0")/.." 
7 | 8 | pushd compose/server-a 9 | docker-compose run --rm app bin/cleanup 10 | popd 11 | 12 | pushd compose/server-b 13 | docker-compose run --rm app bin/cleanup 14 | popd 15 | -------------------------------------------------------------------------------- /deployment/testing-v3/scripts/generate-dotenv: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Generate dotenv files for each of the compose configurations 3 | 4 | set -e 5 | 6 | PROJECT=${PROJECT:-amiyaguchi-prio-processor-v3} 7 | TAG=${TAG:-mozilla/prio-processor:v3.0.0} 8 | 9 | if [[ $(gcloud config get-value project) != "$PROJECT" ]]; then 10 | echo "project is not set correctly; run 'gcloud config set project $PROJECT'" 11 | exit 1 12 | fi 13 | 14 | function get-key { 15 | local json=$1 16 | local key=$2 17 | echo "$json" | jq -r ".$key" 18 | } 19 | 20 | # reuse results from a single gsutil call 21 | _results=$(gsutil ls) 22 | function get-bucket { 23 | local pattern=$1 24 | path=$(echo "$_results" | grep "$pattern") 25 | # strip any trailing slashes 26 | echo ${path%/} 27 | } 28 | 29 | # work from the parent directory 30 | cd "$(dirname "$0")/.." 31 | 32 | keys_a=$(docker run -it "$TAG" prio keygen) 33 | keys_b=$(docker run -it "$TAG" prio keygen) 34 | seed=$(docker run -it "$TAG" prio shared-seed) 35 | 36 | # list out all the variables we might need... 37 | app_name="test-app" 38 | bucket_prefix="$app_name/v1" 39 | data_config="/app/config/content.json" 40 | origin_config="/app/config/telemetry_origin_data_inc.json" 41 | 42 | cat << EOF > compose/admin/.env.template 43 | # This configuration is generated by scripts/generate-dotenv. Do not check in 44 | # manually edited values into source control. 45 | 46 | APP_NAME=$app_name 47 | DATA_CONFIG=$data_config 48 | ORIGIN_CONFIG=$origin_config 49 | BUCKET_PREFIX=$bucket_prefix 50 | 51 | # relative to the docker-compose file 52 | GOOGLE_APPLICATION_CREDENTIALS="../../.service-account-keys/service-account-admin-private-key.json" 53 | 54 | PUBLIC_KEY_HEX_INTERNAL=$(get-key "$keys_a" public_key) 55 | BUCKET_INTERNAL_PRIVATE=$(get-bucket a-private) 56 | 57 | PUBLIC_KEY_HEX_EXTERNAL=$(get-key "$keys_b" public_key) 58 | BUCKET_EXTERNAL_PRIVATE=$(get-bucket b-private) 59 | 60 | DATASET=telemetry 61 | TABLE=content_blocking 62 | BQ_REPLACE=true 63 | CLOUDSDK_CORE_PROJECT=$PROJECT 64 | EOF 65 | cp compose/admin/.env.template compose/admin/.env 66 | 67 | function server-env { 68 | local server_id=$1 69 | local internal_key=$2 70 | local external_key=$3 71 | local other_id; 72 | other_id=$(if [[ $server_id == a ]]; then echo b; else echo a; fi) 73 | cat << EOF > "compose/server-$server_id/.env.template" 74 | # This configuration is generated by scripts/generate-dotenv. Do not check in 75 | # manually edited values into source control. 
76 | 77 | APP_NAME=$app_name 78 | DATA_CONFIG=$data_config 79 | BUCKET_PREFIX=$bucket_prefix 80 | SERVER_ID=$(echo "$server_id" | tr '[:lower:]' '[:upper:]') 81 | SHARED_SECRET=$(get-key "$seed" shared_seed) 82 | 83 | # relative to the docker-compose file 84 | GOOGLE_APPLICATION_CREDENTIALS="../../.service-account-keys/service-account-${server_id}-private-key.json" 85 | 86 | PRIVATE_KEY_HEX=$(get-key "$internal_key" private_key) 87 | PUBLIC_KEY_HEX_INTERNAL=$(get-key "$internal_key" public_key) 88 | BUCKET_INTERNAL_PRIVATE=$(get-bucket "${server_id}-private") 89 | BUCKET_INTERNAL_SHARED=$(get-bucket "${server_id}-shared") 90 | 91 | PUBLIC_KEY_HEX_EXTERNAL=$(get-key "$external_key" public_key) 92 | BUCKET_EXTERNAL_SHARED=$(get-bucket "${other_id}-shared") 93 | EOF 94 | cp "compose/server-$server_id/.env.template" "compose/server-$server_id/.env" 95 | } 96 | 97 | server-env a "$keys_a" "$keys_b" 98 | server-env b "$keys_b" "$keys_a" 99 | 100 | -------------------------------------------------------------------------------- /deployment/testing-v3/scripts/generate-service-account-keys: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | PROJECT=${PROJECT:-amiyaguchi-prio-processor-v3} 6 | 7 | if [[ $(gcloud config get-value project) != "$PROJECT" ]]; then 8 | echo "project is not set correctly; run 'gcloud config set project $PROJECT'" 9 | exit 1 10 | fi 11 | 12 | # work from the parent directory 13 | cd "$(dirname "$0")/.." 14 | output=.service-account-keys 15 | mkdir -p $output 16 | 17 | function create_service_account { 18 | local project=$1 19 | local output=$2 20 | local name=$3 21 | gcloud iam service-accounts keys create "$output/$name-private-key.json" \ 22 | --iam-account "$name@$project.iam.gserviceaccount.com" 23 | 24 | } 25 | 26 | create_service_account "$PROJECT" "$output" service-account-admin 27 | create_service_account "$PROJECT" "$output" service-account-a 28 | create_service_account "$PROJECT" "$output" service-account-b 29 | -------------------------------------------------------------------------------- /deployment/testing-v3/scripts/integrate: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script controls the docker-compose workflow for integration testing. The 4 | # containers are defined in the docker-compose.yml, but are orchestrated through 5 | # this script for verification. 6 | 7 | set -euo pipefail 8 | 9 | cd "$(dirname "$0")/.." 10 | 11 | # Copy data into the appropriate buckets 12 | pushd compose/admin 13 | docker-compose run --rm app bin/generate 14 | popd 15 | 16 | # Start server A 17 | pushd compose/server-a 18 | docker-compose run --rm app bin/process & 19 | server_a_pid=$! 20 | popd 21 | 22 | # offset the start times by a short amount for proper authentication against GCP 23 | sleep 2 24 | 25 | # Start server B 26 | pushd compose/server-b 27 | docker-compose run --rm app bin/process & 28 | server_b_pid=$! 29 | popd 30 | 31 | # Return the exit code of the backgrounded docker-compose container. Since 32 | # `wait` is a blocking function, a failure in server B will not be detected 33 | # until timeout in server A. 
34 | wait $server_a_pid 35 | wait $server_b_pid 36 | 37 | pushd compose/admin 38 | docker-compose run --rm app bin/insert 39 | popd 40 | -------------------------------------------------------------------------------- /deployment/testing-v3/scripts/list-bucket: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | : << EOF 3 | To use this script, run the following command: 4 | 5 | scripts/list-bucket > LISTING.md 6 | EOF 7 | 8 | set -e 9 | 10 | PROJECT=${PROJECT:-amiyaguchi-prio-processor-v3} 11 | 12 | if [[ $(gcloud config get-value project) != "$PROJECT" ]]; then 13 | echo "project is not set correctly; run 'gcloud config set project $PROJECT'" 14 | exit 1 15 | fi 16 | 17 | function sort_recursive_listing { 18 | local bucket=$1 19 | # remove lines that end with /:, empty lines, or the summary line 20 | gsutil ls -lr "$bucket" | grep -v :$ | grep -v ^$ | grep -v ^TOTAL | sort -k2 21 | } 22 | 23 | cat << EOF 24 | # Directory listing 25 | 26 | This listing was generated from \`scripts/list-bucket\`. It is a list of all 27 | objects stored across the two servers. 28 | 29 | ## Server A buckets 30 | 31 | EOF 32 | 33 | buckets=$(gsutil ls | sort) 34 | for bucket in $(echo "$buckets" | grep a- ); do 35 | cat << EOF 36 | ### \`$bucket\` 37 | 38 | \`\`\` 39 | $(sort_recursive_listing "$bucket") 40 | \`\`\` 41 | 42 | EOF 43 | done 44 | 45 | echo "## Server B buckets" 46 | echo "" 47 | 48 | for bucket in $(echo "$buckets" | grep b-); do 49 | cat << EOF 50 | ### \`$bucket\` 51 | 52 | \`\`\` 53 | $(sort_recursive_listing "$bucket") 54 | \`\`\` 55 | 56 | EOF 57 | done 58 | -------------------------------------------------------------------------------- /deployment/testing-v3/terraform/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates. 
3 | 4 | provider "registry.terraform.io/hashicorp/google" { 5 | version = "3.65.0" 6 | hashes = [ 7 | "h1:ZvXCeUYoex3aOLlZYqv08WZ3hcPaf5p/gEa/DeMrkfs=", 8 | "zh:402b8ba03f19558f7d0e2a453a9b82747882fb3519ce686ce26a9afd4593d05e", 9 | "zh:523a306c2906c213b630d1c2f1e48698769bfffe360b68388d935d0bd171c55c", 10 | "zh:76af4170f5a524ff353e60dd68d728c55dcbd9f6c5f60648e28e4f8f9ca8e958", 11 | "zh:7d00a44769d26144f42b413c82272e31ae9b63153532b9a135a8f69a6608b9a6", 12 | "zh:7f5d0ab79d213809726663f7603004c173694602bd22f2419c445d6897729ca2", 13 | "zh:a1c23e3d280a5053bae9102ad55df1315585395f8656ddf83928978c7e6cd307", 14 | "zh:a81d0af5ef58c193197f81dc3059f8b22c7dde0575bb3198a0360aff7f9ca476", 15 | "zh:b5b79fa8f9e49d2d26badfded64a1e460cdb11b152168e578443cf92df679bca", 16 | "zh:ec4f88d1fd8990511b86205709c1a76ac3a444d0088a810c82a4f5db37ca4afe", 17 | "zh:f15390a40dc6e9c5b5285bc2b6a8c54b6030ae9cc04cc4a31ecf9b14145c467b", 18 | "zh:fb1a150464d822aa9182cd46a0b7bc2c279ff9400017b4bb3238256224ab41b6", 19 | ] 20 | } 21 | 22 | provider "registry.terraform.io/hashicorp/random" { 23 | version = "3.1.0" 24 | hashes = [ 25 | "h1:rKYu5ZUbXwrLG1w81k7H3nce/Ys6yAxXhWcbtk36HjY=", 26 | "zh:2bbb3339f0643b5daa07480ef4397bd23a79963cc364cdfbb4e86354cb7725bc", 27 | "zh:3cd456047805bf639fbf2c761b1848880ea703a054f76db51852008b11008626", 28 | "zh:4f251b0eda5bb5e3dc26ea4400dba200018213654b69b4a5f96abee815b4f5ff", 29 | "zh:7011332745ea061e517fe1319bd6c75054a314155cb2c1199a5b01fe1889a7e2", 30 | "zh:738ed82858317ccc246691c8b85995bc125ac3b4143043219bd0437adc56c992", 31 | "zh:7dbe52fac7bb21227acd7529b487511c91f4107db9cc4414f50d04ffc3cab427", 32 | "zh:a3a9251fb15f93e4cfc1789800fc2d7414bbc18944ad4c5c98f466e6477c42bc", 33 | "zh:a543ec1a3a8c20635cf374110bd2f87c07374cf2c50617eee2c669b3ceeeaa9f", 34 | "zh:d9ab41d556a48bd7059f0810cf020500635bfc696c9fc3adab5ea8915c1d886b", 35 | "zh:d9e13427a7d011dbd654e591b0337e6074eef8c3b9bb11b2e39eaaf257044fd7", 36 | "zh:f7605bd1437752114baf601bdf6931debe6dc6bfe3006eb7e9bb9080931dca8a", 37 | ] 38 | } 39 | -------------------------------------------------------------------------------- /deployment/testing-v3/terraform/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | // When forking this configuration, set the configuration appropriately. A 3 | // remote backend is a good choice since it can be shared across a team. 
4 | backend "gcs" { 5 | bucket = "amiyaguchi-prio-processor-v3" 6 | prefix = "tf-state" 7 | } 8 | } 9 | 10 | variable "project" { 11 | type = string 12 | default = "amiyaguchi-prio-processor-v3" 13 | } 14 | 15 | variable "region" { 16 | type = string 17 | default = "us-central-1" 18 | } 19 | 20 | provider "google" { 21 | project = var.project 22 | region = var.region 23 | } 24 | 25 | // Choose a different bucket name if the project changes 26 | resource "random_id" "project" { 27 | keepers = { 28 | project = var.project 29 | } 30 | byte_length = 8 31 | } 32 | 33 | module "bucket-a" { 34 | source = "./modules/bucket" 35 | server_id = "a" 36 | suffix = random_id.project.hex 37 | } 38 | 39 | module "bucket-b" { 40 | source = "./modules/bucket" 41 | server_id = "b" 42 | suffix = random_id.project.hex 43 | } 44 | 45 | // Create the service accounts for the tests 46 | resource "google_service_account" "admin" { 47 | account_id = "service-account-admin" 48 | display_name = "Service account for the administrator" 49 | } 50 | 51 | resource "google_service_account" "a" { 52 | account_id = "service-account-a" 53 | display_name = "Service account for server A" 54 | } 55 | 56 | resource "google_service_account" "b" { 57 | account_id = "service-account-b" 58 | display_name = "Service account for server B" 59 | } 60 | 61 | // Assign service account permissions to each bucket. There are quite a few rules, 62 | // so we break this out into a module. 63 | 64 | module "bucket-permissions-a" { 65 | source = "./modules/bucket-permissions" 66 | bucket_private = module.bucket-a.private 67 | bucket_shared = module.bucket-a.shared 68 | service_account_internal = google_service_account.a.email 69 | service_account_external = google_service_account.b.email 70 | service_account_admin = google_service_account.admin.email 71 | } 72 | 73 | module "bucket-permissions-b" { 74 | source = "./modules/bucket-permissions" 75 | bucket_private = module.bucket-b.private 76 | bucket_shared = module.bucket-b.shared 77 | service_account_internal = google_service_account.b.email 78 | service_account_external = google_service_account.a.email 79 | service_account_admin = google_service_account.admin.email 80 | 81 | } 82 | 83 | // testing whether origin telemetry inserts into BigQuery correctly 84 | resource "google_project_service" "bigquery" { 85 | service = "bigquery.googleapis.com" 86 | } 87 | 88 | resource "google_bigquery_dataset" "telemetry" { 89 | dataset_id = "telemetry" 90 | location = "US" 91 | } 92 | 93 | // Grant access to the admin service account 94 | resource "google_project_iam_member" "bigquery-admin" { 95 | role = "roles/bigquery.admin" 96 | member = "serviceAccount:${google_service_account.admin.email}" 97 | } 98 | -------------------------------------------------------------------------------- /deployment/testing-v3/terraform/modules/bucket-permissions/main.tf: -------------------------------------------------------------------------------- 1 | variable "bucket_private" { 2 | type = string 3 | description = "The private bucket for the current processor" 4 | } 5 | 6 | variable "bucket_shared" { 7 | type = string 8 | description = "The shared bucket for both processors" 9 | } 10 | 11 | variable "service_account_internal" { 12 | type = string 13 | description = "The service account for the current processor" 14 | } 15 | 16 | variable "service_account_external" { 17 | type = string 18 | description = "The service account for the co-processor" 19 | } 20 | 21 | variable "service_account_admin" { 22 | type = string 23 | 
description = "The service account for the admin" 24 | } 25 | 26 | // The admin account needs to be able to write to the internal bucket. See 27 | // issue #102 for possible simplification that doesn't require editor access. 28 | resource "google_storage_bucket_iam_binding" "private" { 29 | bucket = var.bucket_private 30 | role = "roles/storage.objectAdmin" 31 | members = [ 32 | "serviceAccount:${var.service_account_internal}", 33 | "serviceAccount:${var.service_account_admin}" 34 | ] 35 | } 36 | 37 | resource "google_storage_bucket_iam_binding" "shared" { 38 | bucket = var.bucket_shared 39 | role = "roles/storage.objectAdmin" 40 | members = [ 41 | "serviceAccount:${var.service_account_internal}", 42 | "serviceAccount:${var.service_account_external}" 43 | ] 44 | } 45 | 46 | -------------------------------------------------------------------------------- /deployment/testing-v3/terraform/modules/bucket/main.tf: -------------------------------------------------------------------------------- 1 | variable "server_id" { 2 | type = string 3 | description = "The identifier for the server" 4 | } 5 | 6 | variable "suffix" { 7 | type = string 8 | description = "A shared suffix used for the bucket" 9 | } 10 | 11 | // Create all of the storage resources necessary for the tests. We choose to 12 | // delete files older than 7 days since these are testing resources. 13 | resource "google_storage_bucket" "private" { 14 | name = "${var.server_id}-private-${var.suffix}" 15 | uniform_bucket_level_access = true 16 | lifecycle_rule { 17 | condition { 18 | age = 7 19 | } 20 | action { 21 | type = "Delete" 22 | } 23 | } 24 | } 25 | 26 | resource "google_storage_bucket" "shared" { 27 | name = "${var.server_id}-shared-${var.suffix}" 28 | uniform_bucket_level_access = true 29 | lifecycle_rule { 30 | condition { 31 | age = 7 32 | } 33 | action { 34 | type = "Delete" 35 | } 36 | } 37 | } 38 | 39 | output "private" { 40 | value = google_storage_bucket.private.name 41 | } 42 | 43 | output "shared" { 44 | value = google_storage_bucket.shared.name 45 | } 46 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/.gitignore: -------------------------------------------------------------------------------- 1 | .service-account-keys/ 2 | .secrets/ 3 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/compose/ingest/.env.template: -------------------------------------------------------------------------------- 1 | # This configuration is generated by scripts/generate-dotenv. Do not check in 2 | # manually edited values into source control. 3 | COMPOSE_PROJECT_NAME="testing-v4-gcloud-self-ingest" 4 | 5 | APP_NAME=test-app 6 | DATA_CONFIG=/app/config/content.json 7 | ORIGIN_CONFIG=/app/config/telemetry_origin_data_inc.json 8 | BUCKET_PREFIX=test-app/v1 9 | 10 | # relative to the docker-compose file 11 | GOOGLE_APPLICATION_CREDENTIALS="../../.secrets/service-account-ingest-private-key.json" 12 | 13 | PUBLIC_KEY_HEX_INTERNAL=A79C2D3DFE6AB347640DB2393A6EA4AF6A80294E2DF09EA2311E7B609D1AAC7C 14 | BUCKET_INTERNAL_INGEST=a-ingest-411db3b9503395de 15 | # The keys for the internal gateway don't particularly matter since it generally 16 | # shouldn't be accessible over the public internet. 
17 | BUCKET_INTERNAL_ACCESS_KEY=ingest-access-key 18 | BUCKET_INTERNAL_SECRET_KEY=36e6be74ac2513770134ddd9f020b3f99b2c6d6151b4a61c6cd1cdcd51cad726 19 | BUCKET_INTERNAL_ENDPOINT=http://gcs-gateway-ingest:9000 20 | 21 | PUBLIC_KEY_HEX_EXTERNAL=BFAB37C8E174142F5B06B32FCC7212D63326889E06FBCC687622E9CE77A76937 22 | BUCKET_EXTERNAL_INGEST=b-ingest-032fc7c2cca96ddb 23 | BUCKET_EXTERNAL_ACCESS_KEY=ingest-032fc7c2cca96ddb 24 | BUCKET_EXTERNAL_SECRET_KEY=78aa4bb15f7571492472112dacb39a9760c5f4e46aba146813c6b5722478d81f 25 | BUCKET_EXTERNAL_ENDPOINT=http://minio-b:9000 26 | 27 | # The ingest also gets access to the private internal bucket, because ingest and 28 | # server A are operated by the same entity in the origin telemetry setup 29 | BUCKET_INTERNAL_PRIVATE=a-private-411db3b9503395de 30 | DATASET=telemetry 31 | TABLE=content_blocking 32 | BQ_REPLACE=true 33 | CLOUDSDK_CORE_PROJECT=amiyaguchi-prio-processor-v4-1 34 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/compose/ingest/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | 3 | networks: 4 | testing-v4-gcloud-self-b_default: 5 | external: true 6 | 7 | services: 8 | # https://docs.min.io/docs/minio-gateway-for-gcs.html 9 | gcs-gateway-ingest: 10 | image: minio/minio:RELEASE.2021-06-17T00-10-46Z 11 | command: gateway gcs ${CLOUDSDK_CORE_PROJECT} 12 | volumes: 13 | - ${GOOGLE_APPLICATION_CREDENTIALS?"missing credentials"}:/tmp/.credentials 14 | ports: 15 | - 9001:9000 16 | environment: 17 | - GOOGLE_APPLICATION_CREDENTIALS=/tmp/.credentials 18 | - MINIO_ROOT_USER=$BUCKET_INTERNAL_ACCESS_KEY 19 | - MINIO_ROOT_PASSWORD=$BUCKET_INTERNAL_SECRET_KEY 20 | 21 | app: 22 | build: ../../../.. 23 | command: "true" 24 | volumes: 25 | - ${GOOGLE_APPLICATION_CREDENTIALS?"missing credentials"}:/tmp/.credentials 26 | - ../../content.json:/app/config/content.json 27 | - ../../../../bin:/app/bin 28 | networks: 29 | - default 30 | - testing-v4-gcloud-self-b_default 31 | depends_on: 32 | - gcs-gateway-ingest 33 | environment: 34 | - GOOGLE_APPLICATION_CREDENTIALS=/tmp/.credentials 35 | - APP_NAME 36 | - DATA_CONFIG 37 | - ORIGIN_CONFIG 38 | - PUBLIC_KEY_HEX_INTERNAL 39 | - PUBLIC_KEY_HEX_EXTERNAL 40 | - BUCKET_INTERNAL_ACCESS_KEY 41 | - BUCKET_INTERNAL_SECRET_KEY 42 | - BUCKET_INTERNAL_ENDPOINT 43 | - BUCKET_EXTERNAL_ACCESS_KEY 44 | - BUCKET_EXTERNAL_SECRET_KEY 45 | - BUCKET_EXTERNAL_ENDPOINT 46 | - BUCKET_INTERNAL_INGEST 47 | - BUCKET_EXTERNAL_INGEST 48 | - BUCKET_INTERNAL_PRIVATE 49 | - BUCKET_PREFIX 50 | - DATASET 51 | - TABLE 52 | - BQ_REPLACE 53 | - CLOUDSDK_CORE_PROJECT 54 | - SUBMISSION_DATE 55 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/compose/server-a/.env.template: -------------------------------------------------------------------------------- 1 | # This configuration is generated by scripts/generate-dotenv. Do not check in 2 | # manually edited values into source control. 
3 | COMPOSE_PROJECT_NAME="testing-v4-gcloud-self-a" 4 | 5 | APP_NAME=test-app 6 | DATA_CONFIG=/app/config/content.json 7 | BUCKET_PREFIX=test-app/v1 8 | SERVER_ID=A 9 | SHARED_SECRET=xut4T8StPN83xiK2QAj/oQ== 10 | 11 | # Used for the MinIO GCS gateway 12 | GOOGLE_APPLICATION_CREDENTIALS="../../.secrets/service-account-a-private-key.json" 13 | 14 | PRIVATE_KEY_HEX=488C9D8A141332F3F4FD11695E563803A11526C4FB6C464C20922842FEEF2A2C 15 | PUBLIC_KEY_HEX_INTERNAL=A79C2D3DFE6AB347640DB2393A6EA4AF6A80294E2DF09EA2311E7B609D1AAC7C 16 | BUCKET_INTERNAL_INGEST=a-ingest-411db3b9503395de 17 | BUCKET_INTERNAL_PRIVATE=a-private-411db3b9503395de 18 | BUCKET_INTERNAL_SHARED=a-shared-411db3b9503395de 19 | # The keys for the internal gateway don't particularly matter since it generally 20 | # shouldn't be accessible over the public internet. 21 | BUCKET_INTERNAL_ACCESS_KEY=a-access-key 22 | BUCKET_INTERNAL_SECRET_KEY=2d87853b05a86022873277c3311c4b48c6717111bde72c09fc84f3816893ffaf 23 | BUCKET_INTERNAL_ENDPOINT=http://gcs-gateway-a:9000 24 | 25 | PUBLIC_KEY_HEX_EXTERNAL=BFAB37C8E174142F5B06B32FCC7212D63326889E06FBCC687622E9CE77A76937 26 | BUCKET_EXTERNAL_SHARED=b-shared-032fc7c2cca96ddb 27 | BUCKET_EXTERNAL_ACCESS_KEY=a-032fc7c2cca96ddb 28 | BUCKET_EXTERNAL_SECRET_KEY=d3fe697dd6d4acc606d9a637e339ea5f04afeef55db14d8c4ad5321b31ae405f 29 | BUCKET_EXTERNAL_ENDPOINT=http://minio-b:9000 30 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/compose/server-a/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | 3 | # Only used when testing locally. When applied to an external host for server b, 4 | # use the public ip address of the node and remove this network dependency. 5 | networks: 6 | testing-v4-gcloud-self-b_default: 7 | external: true 8 | 9 | services: 10 | # https://docs.min.io/docs/minio-gateway-for-gcs.html 11 | gcs-gateway-a: 12 | image: minio/minio:RELEASE.2021-06-17T00-10-46Z 13 | command: gateway gcs 14 | volumes: 15 | - ${GOOGLE_APPLICATION_CREDENTIALS?"missing credentials"}:/tmp/.credentials 16 | ports: 17 | - 9002:9000 18 | environment: 19 | - GOOGLE_APPLICATION_CREDENTIALS=/tmp/.credentials 20 | - MINIO_ROOT_USER=$BUCKET_INTERNAL_ACCESS_KEY 21 | - MINIO_ROOT_PASSWORD=$BUCKET_INTERNAL_SECRET_KEY 22 | 23 | app: 24 | build: ../../../.. 
25 | working_dir: /app 26 | command: bin/process 27 | volumes: 28 | - ../../content.json:/app/config/content.json 29 | - ../../../../bin:/app/bin 30 | networks: 31 | - default 32 | - testing-v4-gcloud-self-b_default 33 | depends_on: 34 | - gcs-gateway-a 35 | environment: 36 | - APP_NAME 37 | - DATA_CONFIG 38 | - SERVER_ID 39 | - SHARED_SECRET 40 | - PRIVATE_KEY_HEX 41 | - PUBLIC_KEY_HEX_INTERNAL 42 | - PUBLIC_KEY_HEX_EXTERNAL 43 | - BUCKET_INTERNAL_ACCESS_KEY 44 | - BUCKET_INTERNAL_SECRET_KEY 45 | - BUCKET_INTERNAL_ENDPOINT 46 | - BUCKET_EXTERNAL_ACCESS_KEY 47 | - BUCKET_EXTERNAL_SECRET_KEY 48 | - BUCKET_EXTERNAL_ENDPOINT 49 | - BUCKET_INTERNAL_INGEST 50 | - BUCKET_INTERNAL_PRIVATE 51 | - BUCKET_INTERNAL_SHARED 52 | - BUCKET_EXTERNAL_SHARED 53 | - BUCKET_PREFIX 54 | - SUBMISSION_DATE 55 | - RETRY_LIMIT 56 | - RETRY_DELAY 57 | - RETRY_BACKOFF_EXPONENT 58 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/compose/server-b/.env.template: -------------------------------------------------------------------------------- 1 | # This configuration is generated by scripts/generate-dotenv. Do not check in 2 | # manually edited values into source control. 3 | COMPOSE_PROJECT_NAME="testing-v4-gcloud-self-b" 4 | 5 | APP_NAME=test-app 6 | DATA_CONFIG=/app/config/content.json 7 | BUCKET_PREFIX=test-app/v1 8 | SERVER_ID=B 9 | SHARED_SECRET=xut4T8StPN83xiK2QAj/oQ== 10 | 11 | # Used for the MinIO GCS gateway 12 | GOOGLE_APPLICATION_CREDENTIALS="../../.secrets/service-account-b-private-key.json" 13 | 14 | PRIVATE_KEY_HEX=C366087126F212F484E156E49B8A2783C4DEBF58A1B168C1ECD8A7DE0D5EB591 15 | PUBLIC_KEY_HEX_INTERNAL=BFAB37C8E174142F5B06B32FCC7212D63326889E06FBCC687622E9CE77A76937 16 | BUCKET_INTERNAL_INGEST=b-ingest-032fc7c2cca96ddb 17 | BUCKET_INTERNAL_PRIVATE=b-private-032fc7c2cca96ddb 18 | BUCKET_INTERNAL_SHARED=b-shared-032fc7c2cca96ddb 19 | BUCKET_INTERNAL_ACCESS_KEY=b-032fc7c2cca96ddb 20 | BUCKET_INTERNAL_SECRET_KEY=a6b4094711a66498c561a8763b97d79087a989ab173ce84963c2247e7039fd74 21 | BUCKET_INTERNAL_ENDPOINT=http://minio-b:9000 22 | 23 | # access to the external bucket is mediated by the gcs gateway, use the same 24 | # internal keys for gcs-gateway as for access to the normal minio instance. 
25 | PUBLIC_KEY_HEX_EXTERNAL=A79C2D3DFE6AB347640DB2393A6EA4AF6A80294E2DF09EA2311E7B609D1AAC7C 26 | BUCKET_EXTERNAL_SHARED=a-shared-411db3b9503395de 27 | BUCKET_EXTERNAL_ACCESS_KEY=b-032fc7c2cca96ddb 28 | BUCKET_EXTERNAL_SECRET_KEY=a6b4094711a66498c561a8763b97d79087a989ab173ce84963c2247e7039fd74 29 | BUCKET_EXTERNAL_ENDPOINT=http://gcs-gateway-b:9000 30 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/compose/server-b/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euo pipefail 4 | set -x 5 | 6 | TARGET="minio" 7 | 8 | function get_key { 9 | local key=$1 10 | jq -r ".$key" minio-config.json 11 | } 12 | 13 | mc config host add $TARGET \ 14 | $BUCKET_INTERNAL_ENDPOINT \ 15 | $BUCKET_INTERNAL_ACCESS_KEY \ 16 | $BUCKET_INTERNAL_SECRET_KEY 17 | 18 | for type in private shared ingest; do 19 | bucket="$(get_key "buckets.$type")" 20 | mc mb $TARGET/$bucket 21 | done 22 | 23 | # the internal user is the admin, and doesn't need a policy applied 24 | for type in external ingest; do 25 | policy="$(get_key "policy.$type")" 26 | access_key="$(get_key "keys.$type.access_key")" 27 | secret_key="$(get_key "keys.$type.secret_key")" 28 | 29 | # dump policy to tmp directory 30 | policy_dir="/tmp/$type.json" 31 | echo "$policy" > "$policy_dir" 32 | 33 | # mc admin policy add TARGET POLICYNAME POLICYFILE 34 | mc admin policy add $TARGET $type $policy_dir 35 | 36 | # mc admin user add TARGET ACCESSKEY SECRETKEY 37 | mc admin user add $TARGET $access_key $secret_key 38 | 39 | # mc admin policy set TARGET POLICYNAME user=ACCESSKEY 40 | mc admin policy set $TARGET $type user=$access_key 41 | done 42 | 43 | echo "done setting up policies" 44 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/compose/server-b/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | 3 | services: 4 | minio-b: 5 | image: minio/minio:RELEASE.2021-06-17T00-10-46Z 6 | command: server /data 7 | ports: 8 | - 9004:9000 9 | environment: 10 | - MINIO_ACCESS_KEY=$BUCKET_INTERNAL_ACCESS_KEY 11 | - MINIO_SECRET_KEY=$BUCKET_INTERNAL_SECRET_KEY 12 | 13 | # This is run to set up policies on the buckets 14 | minio-bootstrap: 15 | build: ../../../.. 16 | depends_on: 17 | - minio-b 18 | working_dir: /root 19 | command: bash bootstrap.sh 20 | volumes: 21 | - .:/root/ 22 | environment: 23 | - BUCKET_INTERNAL_ACCESS_KEY 24 | - BUCKET_INTERNAL_SECRET_KEY 25 | - BUCKET_INTERNAL_ENDPOINT 26 | 27 | # https://docs.min.io/docs/minio-gateway-for-gcs.html 28 | gcs-gateway-b: 29 | image: minio/minio:RELEASE.2021-06-17T00-10-46Z 30 | command: gateway gcs 31 | volumes: 32 | - ${GOOGLE_APPLICATION_CREDENTIALS?"missing credentials"}:/tmp/.credentials 33 | ports: 34 | - 9003:9000 35 | environment: 36 | - GOOGLE_APPLICATION_CREDENTIALS=/tmp/.credentials 37 | - MINIO_ROOT_USER=$BUCKET_INTERNAL_ACCESS_KEY 38 | - MINIO_ROOT_PASSWORD=$BUCKET_INTERNAL_SECRET_KEY 39 | 40 | app: 41 | build: ../../../.. 
42 | working_dir: /app 43 | command: bin/process 44 | volumes: 45 | - ../../content.json:/app/config/content.json 46 | - ../../../../bin:/app/bin 47 | depends_on: 48 | - gcs-gateway-b 49 | - minio-b 50 | - minio-bootstrap 51 | environment: 52 | - APP_NAME 53 | - DATA_CONFIG 54 | - SERVER_ID 55 | - SHARED_SECRET 56 | - PRIVATE_KEY_HEX 57 | - PUBLIC_KEY_HEX_INTERNAL 58 | - PUBLIC_KEY_HEX_EXTERNAL 59 | - BUCKET_INTERNAL_ACCESS_KEY 60 | - BUCKET_INTERNAL_SECRET_KEY 61 | - BUCKET_INTERNAL_ENDPOINT 62 | - BUCKET_EXTERNAL_ACCESS_KEY 63 | - BUCKET_EXTERNAL_SECRET_KEY 64 | - BUCKET_EXTERNAL_ENDPOINT 65 | - BUCKET_INTERNAL_INGEST 66 | - BUCKET_INTERNAL_PRIVATE 67 | - BUCKET_INTERNAL_SHARED 68 | - BUCKET_EXTERNAL_SHARED 69 | - BUCKET_PREFIX 70 | - SUBMISSION_DATE 71 | - RETRY_LIMIT 72 | - RETRY_DELAY 73 | - RETRY_BACKOFF_EXPONENT 74 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/compose/server-b/minio-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "buckets": { 3 | "private": "b-private-032fc7c2cca96ddb", 4 | "shared": "b-shared-032fc7c2cca96ddb", 5 | "ingest": "b-ingest-032fc7c2cca96ddb" 6 | }, 7 | "policy": { 8 | "internal": { 9 | "Version": "2012-10-17", 10 | "Statement": [ 11 | { 12 | "Action": [ 13 | "s3:*" 14 | ], 15 | "Effect": "Allow", 16 | "Resource": [ 17 | "arn:aws:s3:::b-private-032fc7c2cca96ddb/*", 18 | "arn:aws:s3:::b-shared-032fc7c2cca96ddb/*", 19 | "arn:aws:s3:::b-ingest-032fc7c2cca96ddb/*" 20 | ], 21 | "Sid": "" 22 | } 23 | ] 24 | }, 25 | "external": { 26 | "Version": "2012-10-17", 27 | "Statement": [ 28 | { 29 | "Action": [ 30 | "s3:*" 31 | ], 32 | "Effect": "Allow", 33 | "Resource": [ 34 | "arn:aws:s3:::b-shared-032fc7c2cca96ddb/*" 35 | ], 36 | "Sid": "" 37 | } 38 | ] 39 | }, 40 | "ingest": { 41 | "Version": "2012-10-17", 42 | "Statement": [ 43 | { 44 | "Action": [ 45 | "s3:*" 46 | ], 47 | "Effect": "Allow", 48 | "Resource": [ 49 | "arn:aws:s3:::b-ingest-032fc7c2cca96ddb/*" 50 | ], 51 | "Sid": "" 52 | } 53 | ] 54 | } 55 | }, 56 | "keys": { 57 | "internal": { 58 | "access_key": "b-032fc7c2cca96ddb", 59 | "secret_key": "a6b4094711a66498c561a8763b97d79087a989ab173ce84963c2247e7039fd74" 60 | }, 61 | "external": { 62 | "access_key": "a-032fc7c2cca96ddb", 63 | "secret_key": "d3fe697dd6d4acc606d9a637e339ea5f04afeef55db14d8c4ad5321b31ae405f" 64 | }, 65 | "ingest": { 66 | "access_key": "ingest-032fc7c2cca96ddb", 67 | "secret_key": "78aa4bb15f7571492472112dacb39a9760c5f4e46aba146813c6b5722478d81f" 68 | } 69 | } 70 | } -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/content.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "batch_id": "content.blocking_blocked_TESTONLY-0", 4 | "n_data": 2046 5 | }, 6 | { 7 | "batch_id": "content.blocking_blocked_TESTONLY-1", 8 | "n_data": 441 9 | } 10 | ] 11 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/scripts/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Build each docker-compose service by changing directories 4 | 5 | set -euo pipefail 6 | cd "$(dirname "$0")/.." 
7 | 8 | pushd compose/ingest 9 | docker-compose build 10 | popd 11 | 12 | pushd compose/server-a 13 | docker-compose build 14 | popd 15 | 16 | pushd compose/server-b 17 | docker-compose build 18 | popd 19 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/scripts/cleanup: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run cleanup scripts for each server 4 | 5 | set -euo pipefail 6 | cd "$(dirname "$0")/.." 7 | 8 | pushd compose/server-b 9 | docker-compose run --rm app bin/cleanup 10 | # keep the container around since the network depends on server b 11 | 12 | pushd ../../compose/server-a 13 | docker-compose run --rm app bin/cleanup 14 | docker-compose down 15 | popd 16 | 17 | docker-compose down 18 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/scripts/copy-minio-configuration: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Make a copy of the minio configuration in server b directory 3 | 4 | set -ex 5 | cp .secrets/minio-config.json compose/server-b 6 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/scripts/down: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # shut down docker containers 4 | 5 | set -euo pipefail 6 | cd "$(dirname "$0")/.." 7 | 8 | pushd compose/ingest 9 | docker-compose down 10 | popd 11 | 12 | pushd compose/server-a 13 | docker-compose down 14 | popd 15 | 16 | pushd compose/server-b 17 | docker-compose down 18 | popd 19 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/scripts/generate-minio-configuration: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Generate MinIO configuration for server B.""" 3 | from pathlib import Path 4 | import secrets 5 | import json 6 | 7 | ROOT = Path(__file__).parent.parent 8 | 9 | 10 | def policy(buckets): 11 | return { 12 | "Version": "2012-10-17", 13 | "Statement": [ 14 | { 15 | "Action": ["s3:*"], 16 | "Effect": "Allow", 17 | "Resource": [f"arn:aws:s3:::{bucket}/*" for bucket in buckets], 18 | "Sid": "", 19 | } 20 | ], 21 | } 22 | 23 | 24 | def keypair(name): 25 | return {"access_key": name, "secret_key": secrets.token_hex(32)} 26 | 27 | 28 | # create the relevant buckets 29 | def main(): 30 | salt = secrets.token_hex(8) 31 | server_id = "b" 32 | other_id = "a" 33 | 34 | # generate the buckets 35 | private = f"{server_id}-private-{salt}" 36 | shared = f"{server_id}-shared-{salt}" 37 | ingest = f"{server_id}-ingest-{salt}" 38 | 39 | payload = { 40 | "buckets": {"private": private, "shared": shared, "ingest": ingest}, 41 | "policy": { 42 | "internal": policy([private, shared, ingest]), 43 | "external": policy([shared]), 44 | "ingest": policy([ingest]), 45 | }, 46 | "keys": { 47 | "internal": keypair(f"{server_id}-{salt}"), 48 | "external": keypair(f"{other_id}-{salt}"), 49 | "ingest": keypair(f"ingest-{salt}"), 50 | }, 51 | } 52 | 53 | dotsecrets = ROOT / ".secrets" 54 | dotsecrets.mkdir(parents=True, exist_ok=True) 55 | configfile = dotsecrets / "minio-config.json" 56 | if configfile.exists(): 57 | raise FileExistsError(f"{configfile} already exists!") 58 | configfile.write_text(json.dumps(payload, indent=2)) 59 | 60 | 61 
| if __name__ == "__main__": 62 | main() 63 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/scripts/generate-service-account-keys: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | PROJECT=${PROJECT:-amiyaguchi-prio-processor-v4-1} 6 | 7 | if [[ $(gcloud config get-value project) != "$PROJECT" ]]; then 8 | echo "project is not set correctly; run 'gcloud config set project $PROJECT'" 9 | exit 1 10 | fi 11 | 12 | # work from the parent directory 13 | cd "$(dirname "$0")/.." 14 | output=.secrets 15 | mkdir -p $output 16 | 17 | function create_service_account { 18 | local project=$1 19 | local output=$2 20 | local name=$3 21 | gcloud iam service-accounts keys create "$output/$name-private-key.json" \ 22 | --iam-account "$name@$project.iam.gserviceaccount.com" 23 | } 24 | 25 | create_service_account "$PROJECT" "$output" service-account-ingest 26 | create_service_account "$PROJECT" "$output" service-account-a 27 | create_service_account "$PROJECT" "$output" service-account-b 28 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/scripts/integrate: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script controls the docker-compose workflow for integration testing. The 4 | # containers are defined in the docker-compose.yml, but are orchestrated through 5 | # this script for verification. 6 | 7 | set -euo pipefail 8 | 9 | cd "$(dirname "$0")/.." 10 | 11 | # Start server B with minio server 12 | pushd compose/server-b 13 | docker-compose run --rm app bin/process & 14 | server_b_pid=$! 15 | popd 16 | 17 | # wait for the network to come online 18 | sleep 5 19 | 20 | # Copy data into the appropriate buckets 21 | pushd compose/ingest 22 | docker-compose run --rm app bin/generate 23 | docker-compose down 24 | popd 25 | 26 | # Start server A 27 | pushd compose/server-a 28 | docker-compose run --rm app bin/process & 29 | server_a_pid=$! 30 | popd 31 | 32 | # Return the exit code of the backgrounded docker-compose container. Since 33 | # `wait` is a blocking function, a failure in server B will not be detected 34 | # until timeout in server A. 35 | wait $server_a_pid 36 | wait $server_b_pid 37 | 38 | # clean up the containers 39 | pushd compose/server-a 40 | docker-compose down 41 | popd 42 | 43 | pushd compose/ingest 44 | docker-compose run --rm app bin/insert 45 | docker-compose down 46 | popd 47 | 48 | pushd compose/server-b 49 | docker-compose down 50 | popd 51 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/terraform/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates. 
3 | 4 | provider "registry.terraform.io/hashicorp/google" { 5 | version = "3.65.0" 6 | hashes = [ 7 | "h1:ZvXCeUYoex3aOLlZYqv08WZ3hcPaf5p/gEa/DeMrkfs=", 8 | "zh:402b8ba03f19558f7d0e2a453a9b82747882fb3519ce686ce26a9afd4593d05e", 9 | "zh:523a306c2906c213b630d1c2f1e48698769bfffe360b68388d935d0bd171c55c", 10 | "zh:76af4170f5a524ff353e60dd68d728c55dcbd9f6c5f60648e28e4f8f9ca8e958", 11 | "zh:7d00a44769d26144f42b413c82272e31ae9b63153532b9a135a8f69a6608b9a6", 12 | "zh:7f5d0ab79d213809726663f7603004c173694602bd22f2419c445d6897729ca2", 13 | "zh:a1c23e3d280a5053bae9102ad55df1315585395f8656ddf83928978c7e6cd307", 14 | "zh:a81d0af5ef58c193197f81dc3059f8b22c7dde0575bb3198a0360aff7f9ca476", 15 | "zh:b5b79fa8f9e49d2d26badfded64a1e460cdb11b152168e578443cf92df679bca", 16 | "zh:ec4f88d1fd8990511b86205709c1a76ac3a444d0088a810c82a4f5db37ca4afe", 17 | "zh:f15390a40dc6e9c5b5285bc2b6a8c54b6030ae9cc04cc4a31ecf9b14145c467b", 18 | "zh:fb1a150464d822aa9182cd46a0b7bc2c279ff9400017b4bb3238256224ab41b6", 19 | ] 20 | } 21 | 22 | provider "registry.terraform.io/hashicorp/random" { 23 | version = "3.1.0" 24 | hashes = [ 25 | "h1:rKYu5ZUbXwrLG1w81k7H3nce/Ys6yAxXhWcbtk36HjY=", 26 | "zh:2bbb3339f0643b5daa07480ef4397bd23a79963cc364cdfbb4e86354cb7725bc", 27 | "zh:3cd456047805bf639fbf2c761b1848880ea703a054f76db51852008b11008626", 28 | "zh:4f251b0eda5bb5e3dc26ea4400dba200018213654b69b4a5f96abee815b4f5ff", 29 | "zh:7011332745ea061e517fe1319bd6c75054a314155cb2c1199a5b01fe1889a7e2", 30 | "zh:738ed82858317ccc246691c8b85995bc125ac3b4143043219bd0437adc56c992", 31 | "zh:7dbe52fac7bb21227acd7529b487511c91f4107db9cc4414f50d04ffc3cab427", 32 | "zh:a3a9251fb15f93e4cfc1789800fc2d7414bbc18944ad4c5c98f466e6477c42bc", 33 | "zh:a543ec1a3a8c20635cf374110bd2f87c07374cf2c50617eee2c669b3ceeeaa9f", 34 | "zh:d9ab41d556a48bd7059f0810cf020500635bfc696c9fc3adab5ea8915c1d886b", 35 | "zh:d9e13427a7d011dbd654e591b0337e6074eef8c3b9bb11b2e39eaaf257044fd7", 36 | "zh:f7605bd1437752114baf601bdf6931debe6dc6bfe3006eb7e9bb9080931dca8a", 37 | ] 38 | } 39 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/terraform/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | // When forking this configuration, set the configuration appropriately. A 3 | // remote backend is a good choice since it can be shared across a team. 
4 | backend "gcs" { 5 | bucket = "amiyaguchi-prio-processor-v4-1" 6 | prefix = "tf-state" 7 | } 8 | } 9 | 10 | variable "project" { 11 | type = string 12 | default = "amiyaguchi-prio-processor-v4-1" 13 | } 14 | 15 | variable "region" { 16 | type = string 17 | default = "us-central-1" 18 | } 19 | 20 | provider "google" { 21 | project = var.project 22 | region = var.region 23 | } 24 | 25 | // Choose a different bucket name if the project changes 26 | resource "random_id" "project" { 27 | keepers = { 28 | project = var.project 29 | } 30 | byte_length = 8 31 | } 32 | 33 | module "bucket-a" { 34 | source = "./modules/bucket" 35 | server_id = "a" 36 | suffix = random_id.project.hex 37 | } 38 | 39 | // Create the service accounts for the tests 40 | resource "google_service_account" "ingest" { 41 | account_id = "service-account-ingest" 42 | display_name = "Service account for the ingestion service" 43 | } 44 | 45 | resource "google_service_account" "a" { 46 | account_id = "service-account-a" 47 | display_name = "Service account for server A" 48 | } 49 | 50 | resource "google_service_account" "b" { 51 | account_id = "service-account-b" 52 | display_name = "Service account for server B" 53 | } 54 | 55 | // Assign service account permissions to each bucket. There are quite a few rules, 56 | // so we break this out into a module. 57 | module "bucket-permissions-a" { 58 | source = "./modules/bucket-permissions" 59 | bucket_private = module.bucket-a.private 60 | bucket_shared = module.bucket-a.shared 61 | bucket_ingest = module.bucket-a.ingest 62 | service_account_internal = google_service_account.a.email 63 | service_account_external = google_service_account.b.email 64 | service_account_ingest = google_service_account.ingest.email 65 | } 66 | 67 | // testing whether origin telemetry inserts into BigQuery correctly 68 | 69 | // The ingest container will be used for coordination, and gets access to 70 | // server A's private bucket because they are operated by the same entity. 
71 | resource "google_storage_bucket_iam_member" "ingest_internal_private" { 72 | bucket = module.bucket-a.private 73 | role = "roles/storage.objectViewer" 74 | member = "serviceAccount:${google_service_account.ingest.email}" 75 | } 76 | 77 | resource "google_project_service" "bigquery" { 78 | service = "bigquery.googleapis.com" 79 | } 80 | 81 | resource "google_bigquery_dataset" "telemetry" { 82 | dataset_id = "telemetry" 83 | location = "US" 84 | } 85 | 86 | // Grant access to the admin service account 87 | resource "google_project_iam_member" "bigquery-admin" { 88 | role = "roles/bigquery.admin" 89 | member = "serviceAccount:${google_service_account.ingest.email}" 90 | } 91 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/terraform/modules/bucket-permissions/main.tf: -------------------------------------------------------------------------------- 1 | variable "bucket_private" { 2 | type = string 3 | description = "The private bucket for the current processor" 4 | } 5 | 6 | variable "bucket_shared" { 7 | type = string 8 | description = "The shared bucket for both processors" 9 | } 10 | 11 | variable "bucket_ingest" { 12 | type = string 13 | description = "The bucket shared with the ingestion service" 14 | } 15 | 16 | variable "service_account_internal" { 17 | type = string 18 | description = "The service account for the current processor" 19 | } 20 | 21 | variable "service_account_external" { 22 | type = string 23 | description = "The service account for the co-processor" 24 | } 25 | 26 | variable "service_account_ingest" { 27 | type = string 28 | description = "The service account for the ingestor" 29 | } 30 | 31 | resource "google_storage_bucket_iam_binding" "private" { 32 | bucket = var.bucket_private 33 | role = "roles/storage.admin" 34 | members = ["serviceAccount:${var.service_account_internal}"] 35 | } 36 | 37 | resource "google_storage_bucket_iam_binding" "shared" { 38 | bucket = var.bucket_shared 39 | role = "roles/storage.admin" 40 | members = [ 41 | "serviceAccount:${var.service_account_internal}", 42 | "serviceAccount:${var.service_account_external}" 43 | ] 44 | } 45 | 46 | resource "google_storage_bucket_iam_binding" "ingest" { 47 | bucket = var.bucket_ingest 48 | role = "roles/storage.admin" 49 | members = [ 50 | "serviceAccount:${var.service_account_internal}", 51 | "serviceAccount:${var.service_account_ingest}" 52 | ] 53 | } 54 | -------------------------------------------------------------------------------- /deployment/testing-v4-gcloud-self/terraform/modules/bucket/main.tf: -------------------------------------------------------------------------------- 1 | variable "server_id" { 2 | type = string 3 | description = "The identifier for the server" 4 | } 5 | 6 | variable "suffix" { 7 | type = string 8 | description = "A shared suffix used for the bucket" 9 | } 10 | 11 | // Create all of the storage resources necessary for the tests. We choose to 12 | // delete files older than 7 days since these are testing resources. 
13 | 14 | resource "google_storage_bucket" "ingest" { 15 | name = "${var.server_id}-ingest-${var.suffix}" 16 | uniform_bucket_level_access = true 17 | lifecycle_rule { 18 | condition { 19 | age = 7 20 | } 21 | action { 22 | type = "Delete" 23 | } 24 | } 25 | } 26 | 27 | resource "google_storage_bucket" "private" { 28 | name = "${var.server_id}-private-${var.suffix}" 29 | uniform_bucket_level_access = true 30 | lifecycle_rule { 31 | condition { 32 | age = 7 33 | } 34 | action { 35 | type = "Delete" 36 | } 37 | } 38 | } 39 | 40 | resource "google_storage_bucket" "shared" { 41 | name = "${var.server_id}-shared-${var.suffix}" 42 | uniform_bucket_level_access = true 43 | lifecycle_rule { 44 | condition { 45 | age = 7 46 | } 47 | action { 48 | type = "Delete" 49 | } 50 | } 51 | } 52 | 53 | output "ingest" { 54 | value = google_storage_bucket.ingest.name 55 | } 56 | 57 | output "private" { 58 | value = google_storage_bucket.private.name 59 | } 60 | 61 | output "shared" { 62 | value = google_storage_bucket.shared.name 63 | } 64 | -------------------------------------------------------------------------------- /deployment/testing-v4/.gitignore: -------------------------------------------------------------------------------- 1 | .service-account-keys/ 2 | .secrets/ 3 | -------------------------------------------------------------------------------- /deployment/testing-v4/README.md: -------------------------------------------------------------------------------- 1 | # Testing configuration for v4 containers 2 | 3 | This directory contains terraform configuration to bring relevant resources for 4 | an integration test of the prio-processor v4.x containers. 5 | 6 | To create a new project that uses the same configuration, change the terraform 7 | backend appropriately. Here, the state is placed into a storage bucket that has 8 | been created beforehand. Ensure the project has also been created. Then: 9 | 10 | ```bash 11 | cd terraform 12 | 13 | # if you're choosing a different project or change any modules 14 | terraform init 15 | 16 | # apply any changes 17 | terraform apply 18 | ``` 19 | 20 | To configure the tests: 21 | 22 | ```bash 23 | # There is a maximum of 10 keys per service account. This script doesn't 24 | # handle key rotations, so disable old keys as necessary. 25 | scripts/generate-service-account-keys 26 | 27 | # generate new keys (or alternatively copy .env.template files to their .env locations) 28 | scripts/generate-dotenv 29 | ``` 30 | 31 | The above commands only need to be run once. To run the tests: 32 | 33 | ```bash 34 | # run the integration script 35 | scripts/integrate 36 | 37 | # clean up the buckets 38 | scripts/cleanup 39 | ``` 40 | 41 | In order to be agnostic to the storage provider, MinIO and `mc` are used for 42 | transferring data between the different parties. A GCS gateway is provisioned 43 | for each container that is associated with a service account. Each MinIO 44 | container has an HTTP entrypoint for browsing files that can be found on the 45 | following locations: 46 | 47 | - http://localhost:9001 for the ingestion server 48 | - http://localhost:9002 for server a 49 | - http://localhost:9003 for server b 50 | -------------------------------------------------------------------------------- /deployment/testing-v4/compose/ingest/.env.template: -------------------------------------------------------------------------------- 1 | # This configuration is generated by scripts/generate-dotenv. Do not check in 2 | # manually edited values into source control. 
3 | 4 | APP_NAME=test-app 5 | DATA_CONFIG=/app/config/content.json 6 | ORIGIN_CONFIG=/app/config/telemetry_origin_data_inc.json 7 | BUCKET_PREFIX=test-app/v1 8 | 9 | # relative to the docker-compose file 10 | GOOGLE_APPLICATION_CREDENTIALS="../../.secrets/service-account-ingest-private-key.json" 11 | 12 | PUBLIC_KEY_HEX_INTERNAL=16A6203457348A5B957E02D71CAD726D8C7C7A25C4DBFA73DDF2F0748C893166 13 | BUCKET_INTERNAL_INGEST=a-ingest-d70d758a4b28a791 14 | BUCKET_INTERNAL_ACCESS_KEY=server-ingest-access-key 15 | BUCKET_INTERNAL_SECRET_KEY=server-ingest-secret-key 16 | BUCKET_INTERNAL_ENDPOINT=http://minio:9000 17 | 18 | PUBLIC_KEY_HEX_EXTERNAL=702E7941DE5F024B02F6CB5AE7176413EDC90F008368579021D115A45F95326C 19 | BUCKET_EXTERNAL_INGEST=b-ingest-d70d758a4b28a791 20 | BUCKET_EXTERNAL_ACCESS_KEY=server-ingest-access-key 21 | BUCKET_EXTERNAL_SECRET_KEY=server-ingest-secret-key 22 | BUCKET_EXTERNAL_ENDPOINT=http://minio:9000 23 | 24 | # The ingest also gets access to the private internal bucket, because ingest and 25 | # server A are operated by the same entity in the origin telemetry setup 26 | BUCKET_INTERNAL_PRIVATE=a-private-d70d758a4b28a791 27 | DATASET=telemetry 28 | TABLE=content_blocking 29 | BQ_REPLACE=true 30 | CLOUDSDK_CORE_PROJECT=amiyaguchi-prio-processor-v4 31 | -------------------------------------------------------------------------------- /deployment/testing-v4/compose/ingest/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | 3 | services: 4 | # https://docs.min.io/docs/minio-gateway-for-gcs.html 5 | minio: 6 | image: minio/minio:latest 7 | command: gateway gcs ${CLOUDSDK_CORE_PROJECT} 8 | volumes: 9 | - ${GOOGLE_APPLICATION_CREDENTIALS?"missing credentials"}:/tmp/.credentials 10 | ports: 11 | - 9001:9000 12 | environment: 13 | - GOOGLE_APPLICATION_CREDENTIALS=/tmp/.credentials 14 | - MINIO_ROOT_USER=$BUCKET_INTERNAL_ACCESS_KEY 15 | - MINIO_ROOT_PASSWORD=$BUCKET_INTERNAL_SECRET_KEY 16 | 17 | app: 18 | build: ../../../.. 19 | command: "true" 20 | volumes: 21 | - ${GOOGLE_APPLICATION_CREDENTIALS?"missing credentials"}:/tmp/.credentials 22 | - ../../content.json:/app/config/content.json 23 | - ../../../../bin:/app/bin 24 | depends_on: 25 | - minio 26 | environment: 27 | - GOOGLE_APPLICATION_CREDENTIALS=/tmp/.credentials 28 | - APP_NAME 29 | - DATA_CONFIG 30 | - ORIGIN_CONFIG 31 | - PUBLIC_KEY_HEX_INTERNAL 32 | - PUBLIC_KEY_HEX_EXTERNAL 33 | - BUCKET_INTERNAL_ACCESS_KEY 34 | - BUCKET_INTERNAL_SECRET_KEY 35 | - BUCKET_INTERNAL_ENDPOINT 36 | - BUCKET_EXTERNAL_ACCESS_KEY 37 | - BUCKET_EXTERNAL_SECRET_KEY 38 | - BUCKET_EXTERNAL_ENDPOINT 39 | - BUCKET_INTERNAL_INGEST 40 | - BUCKET_EXTERNAL_INGEST 41 | # for submitting results to bigquery, granted access via GCP primitives 42 | # instead of HMAC keys 43 | - BUCKET_INTERNAL_PRIVATE 44 | - BUCKET_PREFIX 45 | - DATASET 46 | - TABLE 47 | - BQ_REPLACE 48 | - CLOUDSDK_CORE_PROJECT 49 | - SUBMISSION_DATE 50 | -------------------------------------------------------------------------------- /deployment/testing-v4/compose/server-a/.env.template: -------------------------------------------------------------------------------- 1 | # This configuration is generated by scripts/generate-dotenv. Do not check in 2 | # manually edited values into source control. 
3 | 4 | APP_NAME=test-app 5 | DATA_CONFIG=/app/config/content.json 6 | BUCKET_PREFIX=test-app/v1 7 | SERVER_ID=A 8 | SHARED_SECRET=g8EbbygYtecFDnpzkRyPjw== 9 | 10 | # Used for the MinIO GCS gateway 11 | GOOGLE_APPLICATION_CREDENTIALS="../../.secrets/service-account-a-private-key.json" 12 | 13 | PRIVATE_KEY_HEX=3D4967F5FC58272E826F329F6DE930987265EF032136192C5B9B1EB1A6D15337 14 | PUBLIC_KEY_HEX_INTERNAL=16A6203457348A5B957E02D71CAD726D8C7C7A25C4DBFA73DDF2F0748C893166 15 | BUCKET_INTERNAL_INGEST=a-ingest-d70d758a4b28a791 16 | BUCKET_INTERNAL_PRIVATE=a-private-d70d758a4b28a791 17 | BUCKET_INTERNAL_SHARED=a-shared-d70d758a4b28a791 18 | BUCKET_INTERNAL_ACCESS_KEY=server-a-access-key 19 | BUCKET_INTERNAL_SECRET_KEY=server-a-secret-key 20 | BUCKET_INTERNAL_ENDPOINT=http://minio:9000 21 | 22 | PUBLIC_KEY_HEX_EXTERNAL=702E7941DE5F024B02F6CB5AE7176413EDC90F008368579021D115A45F95326C 23 | BUCKET_EXTERNAL_SHARED=b-shared-d70d758a4b28a791 24 | # NOTE: the keys are shared since permissions are configured at the service 25 | # account level in the gateway 26 | BUCKET_EXTERNAL_ACCESS_KEY=server-a-access-key 27 | BUCKET_EXTERNAL_SECRET_KEY=server-a-secret-key 28 | BUCKET_EXTERNAL_ENDPOINT=http://minio:9000 29 | -------------------------------------------------------------------------------- /deployment/testing-v4/compose/server-a/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | 3 | services: 4 | # https://docs.min.io/docs/minio-gateway-for-gcs.html 5 | minio: 6 | image: minio/minio:latest 7 | command: gateway gcs 8 | volumes: 9 | - ${GOOGLE_APPLICATION_CREDENTIALS?"missing credentials"}:/tmp/.credentials 10 | ports: 11 | - 9002:9000 12 | environment: 13 | - GOOGLE_APPLICATION_CREDENTIALS=/tmp/.credentials 14 | - MINIO_ROOT_USER=$BUCKET_INTERNAL_ACCESS_KEY 15 | - MINIO_ROOT_PASSWORD=$BUCKET_INTERNAL_SECRET_KEY 16 | 17 | app: 18 | build: ../../../.. 19 | working_dir: /app 20 | command: bin/process 21 | volumes: 22 | - ../../content.json:/app/config/content.json 23 | - ../../../../bin:/app/bin 24 | depends_on: 25 | - minio 26 | environment: 27 | - APP_NAME 28 | - DATA_CONFIG 29 | - SERVER_ID 30 | - SHARED_SECRET 31 | - PRIVATE_KEY_HEX 32 | - PUBLIC_KEY_HEX_INTERNAL 33 | - PUBLIC_KEY_HEX_EXTERNAL 34 | - BUCKET_INTERNAL_ACCESS_KEY 35 | - BUCKET_INTERNAL_SECRET_KEY 36 | - BUCKET_INTERNAL_ENDPOINT 37 | - BUCKET_EXTERNAL_ACCESS_KEY 38 | - BUCKET_EXTERNAL_SECRET_KEY 39 | - BUCKET_EXTERNAL_ENDPOINT 40 | - BUCKET_INTERNAL_INGEST 41 | - BUCKET_INTERNAL_PRIVATE 42 | - BUCKET_INTERNAL_SHARED 43 | - BUCKET_EXTERNAL_SHARED 44 | - BUCKET_PREFIX 45 | - SUBMISSION_DATE 46 | -------------------------------------------------------------------------------- /deployment/testing-v4/compose/server-b/.env.template: -------------------------------------------------------------------------------- 1 | # This configuration is generated by scripts/generate-dotenv. Do not check in 2 | # manually edited values into source control. 
3 | 4 | APP_NAME=test-app 5 | DATA_CONFIG=/app/config/content.json 6 | BUCKET_PREFIX=test-app/v1 7 | SERVER_ID=B 8 | SHARED_SECRET=g8EbbygYtecFDnpzkRyPjw== 9 | 10 | # Used for the MinIO GCS gateway 11 | GOOGLE_APPLICATION_CREDENTIALS="../../.secrets/service-account-b-private-key.json" 12 | 13 | PRIVATE_KEY_HEX=0D400D1055E1C20D93EC90EC5F5BD5DCB08B483B02035C6E16E86BF842D70A7A 14 | PUBLIC_KEY_HEX_INTERNAL=702E7941DE5F024B02F6CB5AE7176413EDC90F008368579021D115A45F95326C 15 | BUCKET_INTERNAL_INGEST=b-ingest-d70d758a4b28a791 16 | BUCKET_INTERNAL_PRIVATE=b-private-d70d758a4b28a791 17 | BUCKET_INTERNAL_SHARED=b-shared-d70d758a4b28a791 18 | BUCKET_INTERNAL_ACCESS_KEY=server-b-access-key 19 | BUCKET_INTERNAL_SECRET_KEY=server-b-secret-key 20 | BUCKET_INTERNAL_ENDPOINT=http://minio:9000 21 | 22 | PUBLIC_KEY_HEX_EXTERNAL=16A6203457348A5B957E02D71CAD726D8C7C7A25C4DBFA73DDF2F0748C893166 23 | BUCKET_EXTERNAL_SHARED=a-shared-d70d758a4b28a791 24 | # NOTE: the keys are shared since permissions are configured at the service 25 | # account level in the gateway 26 | BUCKET_EXTERNAL_ACCESS_KEY=server-b-access-key 27 | BUCKET_EXTERNAL_SECRET_KEY=server-b-secret-key 28 | BUCKET_EXTERNAL_ENDPOINT=http://minio:9000 29 | -------------------------------------------------------------------------------- /deployment/testing-v4/compose/server-b/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | 3 | services: 4 | # https://docs.min.io/docs/minio-gateway-for-gcs.html 5 | minio: 6 | image: minio/minio:latest 7 | command: gateway gcs 8 | volumes: 9 | - ${GOOGLE_APPLICATION_CREDENTIALS?"missing credentials"}:/tmp/.credentials 10 | ports: 11 | - 9003:9000 12 | environment: 13 | - GOOGLE_APPLICATION_CREDENTIALS=/tmp/.credentials 14 | - MINIO_ROOT_USER=$BUCKET_INTERNAL_ACCESS_KEY 15 | - MINIO_ROOT_PASSWORD=$BUCKET_INTERNAL_SECRET_KEY 16 | 17 | app: 18 | build: ../../../.. 19 | working_dir: /app 20 | command: bin/process 21 | volumes: 22 | - ../../content.json:/app/config/content.json 23 | - ../../../../bin:/app/bin 24 | depends_on: 25 | - minio 26 | environment: 27 | - APP_NAME 28 | - DATA_CONFIG 29 | - SERVER_ID 30 | - SHARED_SECRET 31 | - PRIVATE_KEY_HEX 32 | - PUBLIC_KEY_HEX_INTERNAL 33 | - PUBLIC_KEY_HEX_EXTERNAL 34 | - BUCKET_INTERNAL_ACCESS_KEY 35 | - BUCKET_INTERNAL_SECRET_KEY 36 | - BUCKET_INTERNAL_ENDPOINT 37 | - BUCKET_EXTERNAL_ACCESS_KEY 38 | - BUCKET_EXTERNAL_SECRET_KEY 39 | - BUCKET_EXTERNAL_ENDPOINT 40 | - BUCKET_INTERNAL_INGEST 41 | - BUCKET_INTERNAL_PRIVATE 42 | - BUCKET_INTERNAL_SHARED 43 | - BUCKET_EXTERNAL_SHARED 44 | - BUCKET_PREFIX 45 | - SUBMISSION_DATE 46 | -------------------------------------------------------------------------------- /deployment/testing-v4/content.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "batch_id": "content.blocking_blocked_TESTONLY-0", 4 | "n_data": 2046 5 | }, 6 | { 7 | "batch_id": "content.blocking_blocked_TESTONLY-1", 8 | "n_data": 441 9 | } 10 | ] 11 | -------------------------------------------------------------------------------- /deployment/testing-v4/scripts/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Delegate docker-compose build to each service 4 | 5 | set -euo pipefail 6 | cd "$(dirname "$0")/.." 
7 | 8 | pushd compose/ingest 9 | docker-compose build 10 | popd 11 | 12 | pushd compose/server-a 13 | docker-compose build 14 | popd 15 | 16 | pushd compose/server-b 17 | docker-compose build 18 | popd 19 | -------------------------------------------------------------------------------- /deployment/testing-v4/scripts/cleanup: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Delegate cleanup of buckets to the appropriate service account 4 | 5 | set -euo pipefail 6 | cd "$(dirname "$0")/.." 7 | 8 | pushd compose/server-a 9 | docker-compose run --rm app bin/cleanup 10 | docker-compose down 11 | popd 12 | 13 | pushd compose/server-b 14 | docker-compose run --rm app bin/cleanup 15 | docker-compose down 16 | popd 17 | -------------------------------------------------------------------------------- /deployment/testing-v4/scripts/generate-dotenv: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Generate dotenv files for each of the compose configurations 3 | 4 | set -e 5 | 6 | PROJECT=${PROJECT:-amiyaguchi-prio-processor-v4} 7 | TAG=${TAG:-mozilla/prio-processor:v3.0.0} 8 | 9 | if [[ $(gcloud config get-value project) != "$PROJECT" ]]; then 10 | echo "project is not set correctly; run 'gcloud config set project $PROJECT'" 11 | exit 1 12 | fi 13 | 14 | function get-key { 15 | local json=$1 16 | local key=$2 17 | echo "$json" | jq -r ".$key" 18 | } 19 | 20 | # reuse results from a single gsutil call 21 | _results=$(gsutil ls) 22 | function get-bucket { 23 | local pattern=$1 24 | path=$(echo "$_results" | grep "$pattern") 25 | # strip any trailing slashes 26 | trim="${path%/}" 27 | # trim gs:// prefix 28 | trim="${trim#gs://}" 29 | echo $trim 30 | } 31 | 32 | function upper { 33 | local text=$1 34 | echo "$text" | tr '[:lower:]' '[:upper:]' 35 | } 36 | 37 | 38 | function minio { 39 | # either a, b, or ingest 40 | local server_id=$1 41 | # either internal or external 42 | local type=$2 43 | # NOTE: use a better keypair than this... also, due to the nature of this 44 | # test, the internal/external keys are the same, using the gateway's service 45 | # account for authorization to the buckets. 46 | cat << EOF 47 | BUCKET_$(upper "$type")_ACCESS_KEY=server-$server_id-access-key 48 | BUCKET_$(upper "$type")_SECRET_KEY=server-$server_id-secret-key 49 | BUCKET_$(upper "$type")_ENDPOINT=http://minio:9000 50 | EOF 51 | } 52 | 53 | # work from the parent directory 54 | cd "$(dirname "$0")/.." 55 | 56 | keys_a=$(docker run -it "$TAG" prio keygen) 57 | keys_b=$(docker run -it "$TAG" prio keygen) 58 | seed=$(docker run -it "$TAG" prio shared-seed) 59 | 60 | # list out all the variables we might need... 61 | app_name="test-app" 62 | bucket_prefix="$app_name/v1" 63 | data_config="/app/config/content.json" 64 | origin_config="/app/config/telemetry_origin_data_inc.json" 65 | 66 | function ingest-env { 67 | local is_template=$1 68 | local output; 69 | output=$(if [[ $is_template == true ]]; then echo .env.template; else echo .env; fi) 70 | cat << EOF > "compose/ingest/$output" 71 | # This configuration is generated by scripts/generate-dotenv. Do not check in 72 | # manually edited values into source control. 
73 | 74 | APP_NAME=$app_name 75 | DATA_CONFIG=$data_config 76 | ORIGIN_CONFIG=$origin_config 77 | BUCKET_PREFIX=$bucket_prefix 78 | 79 | # relative to the docker-compose file 80 | GOOGLE_APPLICATION_CREDENTIALS="../../.secrets/service-account-ingest-private-key.json" 81 | 82 | PUBLIC_KEY_HEX_INTERNAL=$(get-key "$keys_a" public_key) 83 | BUCKET_INTERNAL_INGEST=$(get-bucket a-ingest) 84 | $(minio ingest internal) 85 | 86 | PUBLIC_KEY_HEX_EXTERNAL=$(get-key "$keys_b" public_key) 87 | BUCKET_EXTERNAL_INGEST=$(get-bucket b-ingest) 88 | $(minio ingest external) 89 | 90 | # The ingest also gets access to the private internal bucket, because ingest and 91 | # server A are operated by the same entity in the origin telemetry setup 92 | BUCKET_INTERNAL_PRIVATE=$(get-bucket a-private) 93 | DATASET=telemetry 94 | TABLE=content_blocking 95 | BQ_REPLACE=true 96 | CLOUDSDK_CORE_PROJECT=$PROJECT 97 | EOF 98 | } 99 | 100 | function server-env { 101 | local server_id=$1 102 | local internal_key=$2 103 | local external_key=$3 104 | local is_template=$4 105 | 106 | local other_id; 107 | other_id=$(if [[ $server_id == a ]]; then echo b; else echo a; fi) 108 | local output; 109 | output=$(if [[ $is_template == true ]]; then echo .env.template; else echo .env; fi) 110 | cat << EOF > "compose/server-$server_id/$output" 111 | # This configuration is generated by scripts/generate-dotenv. Do not check in 112 | # manually edited values into source control. 113 | 114 | APP_NAME=$app_name 115 | DATA_CONFIG=$data_config 116 | BUCKET_PREFIX=$bucket_prefix 117 | SERVER_ID=$(upper "$server_id") 118 | SHARED_SECRET=$(get-key "$seed" shared_seed) 119 | 120 | # Used for the MinIO GCS gateway 121 | GOOGLE_APPLICATION_CREDENTIALS="../../.secrets/service-account-$server_id-private-key.json" 122 | 123 | PRIVATE_KEY_HEX=$(get-key "$internal_key" private_key) 124 | PUBLIC_KEY_HEX_INTERNAL=$(get-key "$internal_key" public_key) 125 | BUCKET_INTERNAL_INGEST=$(get-bucket "${server_id}-ingest") 126 | BUCKET_INTERNAL_PRIVATE=$(get-bucket "${server_id}-private") 127 | BUCKET_INTERNAL_SHARED=$(get-bucket "${server_id}-shared") 128 | $(minio "$server_id" internal) 129 | 130 | PUBLIC_KEY_HEX_EXTERNAL=$(get-key "$external_key" public_key) 131 | BUCKET_EXTERNAL_SHARED=$(get-bucket "${other_id}-shared") 132 | # NOTE: the keys are shared since permissions are configured at the service 133 | # account level in the gateway 134 | $(minio "$server_id" external) 135 | EOF 136 | } 137 | 138 | ingest-env true 139 | ingest-env false 140 | server-env a "$keys_a" "$keys_b" true 141 | server-env a "$keys_a" "$keys_b" false 142 | server-env b "$keys_b" "$keys_a" true 143 | server-env b "$keys_b" "$keys_a" false 144 | -------------------------------------------------------------------------------- /deployment/testing-v4/scripts/generate-service-account-keys: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | PROJECT=${PROJECT:-amiyaguchi-prio-processor-v4} 6 | 7 | if [[ $(gcloud config get-value project) != "$PROJECT" ]]; then 8 | echo "project is not set correctly; run 'gcloud config set project $PROJECT'" 9 | exit 1 10 | fi 11 | 12 | # work from the parent directory 13 | cd "$(dirname "$0")/.." 
14 | output=.secrets 15 | mkdir -p $output 16 | 17 | function create_service_account { 18 | local project=$1 19 | local output=$2 20 | local name=$3 21 | gcloud iam service-accounts keys create "$output/$name-private-key.json" \ 22 | --iam-account "$name@$project.iam.gserviceaccount.com" 23 | } 24 | 25 | create_service_account "$PROJECT" "$output" service-account-ingest 26 | create_service_account "$PROJECT" "$output" service-account-a 27 | create_service_account "$PROJECT" "$output" service-account-b 28 | -------------------------------------------------------------------------------- /deployment/testing-v4/scripts/integrate: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script controls the docker-compose workflow for integration testing. The 4 | # containers are defined in the docker-compose.yml, but are orchestrated through 5 | # this script for verification. 6 | 7 | set -euo pipefail 8 | 9 | cd "$(dirname "$0")/.." 10 | 11 | # Copy data into the appropriate buckets 12 | pushd compose/ingest 13 | docker-compose run --rm app bin/generate 14 | docker-compose down 15 | popd 16 | 17 | # Start server A 18 | pushd compose/server-a 19 | docker-compose run --rm app bin/process & 20 | server_a_pid=$! 21 | popd 22 | 23 | # offset the start times by a short amount for proper authentication against GCP 24 | sleep 2 25 | 26 | # Start server B 27 | pushd compose/server-b 28 | docker-compose run --rm app bin/process & 29 | server_b_pid=$! 30 | popd 31 | 32 | # Return the exit code of the backgrounded docker-compose container. Since 33 | # `wait` is a blocking function, a failure in server B will not be detected 34 | # until timeout in server A. 35 | wait $server_a_pid 36 | wait $server_b_pid 37 | 38 | # clean up the containers 39 | pushd compose/server-a 40 | docker-compose down 41 | popd 42 | pushd compose/server-b 43 | docker-compose down 44 | popd 45 | 46 | pushd compose/ingest 47 | docker-compose run --rm app bin/insert 48 | docker-compose down 49 | popd 50 | -------------------------------------------------------------------------------- /deployment/testing-v4/scripts/list-bucket: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | : << EOF 3 | To use this script, run the following command: 4 | 5 | scripts/list-bucket > LISTING.md 6 | EOF 7 | 8 | set -e 9 | 10 | PROJECT=${PROJECT:-amiyaguchi-prio-processor-v4} 11 | 12 | if [[ $(gcloud config get-value project) != "$PROJECT" ]]; then 13 | echo "project is not set correctly; run 'gcloud config set project $PROJECT'" 14 | exit 1 15 | fi 16 | 17 | function sort_recursive_listing { 18 | local bucket=$1 19 | # remove lines that end with /:, empty lines, or the summary line 20 | # then remove extra spacing, sort by date, and take the name of the path 21 | gsutil ls -lr "$bucket" | \ 22 | grep -v :$ | grep -v ^$ | grep -v ^TOTAL | \ 23 | tr -s " " | sort -k2 | cut -d " " -f4 | \ 24 | tree --fromfile 25 | } 26 | 27 | cat << EOF 28 | # Directory listing 29 | 30 | This listing was generated from \`scripts/list-bucket\`. It is a list of all 31 | objects stored across the the two servers. 
32 | 33 | ## Server A buckets 34 | 35 | EOF 36 | 37 | buckets=$(gsutil ls | sort) 38 | for bucket in $(echo "$buckets" | grep a- ); do 39 | cat << EOF 40 | ### \`$bucket\` 41 | 42 | \`\`\` 43 | $(sort_recursive_listing "$bucket") 44 | \`\`\` 45 | 46 | EOF 47 | done 48 | 49 | echo "## Server B buckets" 50 | echo "" 51 | 52 | for bucket in $(echo "$buckets" | grep b-); do 53 | cat << EOF 54 | ### \`$bucket\` 55 | 56 | \`\`\` 57 | $(sort_recursive_listing "$bucket") 58 | \`\`\` 59 | 60 | EOF 61 | done 62 | -------------------------------------------------------------------------------- /deployment/testing-v4/terraform/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates. 3 | 4 | provider "registry.terraform.io/hashicorp/google" { 5 | version = "3.65.0" 6 | hashes = [ 7 | "h1:ZvXCeUYoex3aOLlZYqv08WZ3hcPaf5p/gEa/DeMrkfs=", 8 | "zh:402b8ba03f19558f7d0e2a453a9b82747882fb3519ce686ce26a9afd4593d05e", 9 | "zh:523a306c2906c213b630d1c2f1e48698769bfffe360b68388d935d0bd171c55c", 10 | "zh:76af4170f5a524ff353e60dd68d728c55dcbd9f6c5f60648e28e4f8f9ca8e958", 11 | "zh:7d00a44769d26144f42b413c82272e31ae9b63153532b9a135a8f69a6608b9a6", 12 | "zh:7f5d0ab79d213809726663f7603004c173694602bd22f2419c445d6897729ca2", 13 | "zh:a1c23e3d280a5053bae9102ad55df1315585395f8656ddf83928978c7e6cd307", 14 | "zh:a81d0af5ef58c193197f81dc3059f8b22c7dde0575bb3198a0360aff7f9ca476", 15 | "zh:b5b79fa8f9e49d2d26badfded64a1e460cdb11b152168e578443cf92df679bca", 16 | "zh:ec4f88d1fd8990511b86205709c1a76ac3a444d0088a810c82a4f5db37ca4afe", 17 | "zh:f15390a40dc6e9c5b5285bc2b6a8c54b6030ae9cc04cc4a31ecf9b14145c467b", 18 | "zh:fb1a150464d822aa9182cd46a0b7bc2c279ff9400017b4bb3238256224ab41b6", 19 | ] 20 | } 21 | 22 | provider "registry.terraform.io/hashicorp/random" { 23 | version = "3.1.0" 24 | hashes = [ 25 | "h1:rKYu5ZUbXwrLG1w81k7H3nce/Ys6yAxXhWcbtk36HjY=", 26 | "zh:2bbb3339f0643b5daa07480ef4397bd23a79963cc364cdfbb4e86354cb7725bc", 27 | "zh:3cd456047805bf639fbf2c761b1848880ea703a054f76db51852008b11008626", 28 | "zh:4f251b0eda5bb5e3dc26ea4400dba200018213654b69b4a5f96abee815b4f5ff", 29 | "zh:7011332745ea061e517fe1319bd6c75054a314155cb2c1199a5b01fe1889a7e2", 30 | "zh:738ed82858317ccc246691c8b85995bc125ac3b4143043219bd0437adc56c992", 31 | "zh:7dbe52fac7bb21227acd7529b487511c91f4107db9cc4414f50d04ffc3cab427", 32 | "zh:a3a9251fb15f93e4cfc1789800fc2d7414bbc18944ad4c5c98f466e6477c42bc", 33 | "zh:a543ec1a3a8c20635cf374110bd2f87c07374cf2c50617eee2c669b3ceeeaa9f", 34 | "zh:d9ab41d556a48bd7059f0810cf020500635bfc696c9fc3adab5ea8915c1d886b", 35 | "zh:d9e13427a7d011dbd654e591b0337e6074eef8c3b9bb11b2e39eaaf257044fd7", 36 | "zh:f7605bd1437752114baf601bdf6931debe6dc6bfe3006eb7e9bb9080931dca8a", 37 | ] 38 | } 39 | -------------------------------------------------------------------------------- /deployment/testing-v4/terraform/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | // When forking this configuration, set the configuration appropriately. A 3 | // remote backend is a good choice since it can be shared across a team. 
4 | backend "gcs" { 5 | bucket = "amiyaguchi-prio-processor-v4" 6 | prefix = "tf-state" 7 | } 8 | } 9 | 10 | variable "project" { 11 | type = string 12 | default = "amiyaguchi-prio-processor-v4" 13 | } 14 | 15 | variable "region" { 16 | type = string 17 | default = "us-central-1" 18 | } 19 | 20 | provider "google" { 21 | project = var.project 22 | region = var.region 23 | } 24 | 25 | // Choose a different bucket name if the project changes 26 | resource "random_id" "project" { 27 | keepers = { 28 | project = var.project 29 | } 30 | byte_length = 8 31 | } 32 | 33 | module "bucket-a" { 34 | source = "./modules/bucket" 35 | server_id = "a" 36 | suffix = random_id.project.hex 37 | } 38 | 39 | module "bucket-b" { 40 | source = "./modules/bucket" 41 | server_id = "b" 42 | suffix = random_id.project.hex 43 | } 44 | 45 | 46 | // Create the service accounts for the tests 47 | resource "google_service_account" "ingest" { 48 | account_id = "service-account-ingest" 49 | display_name = "Service account for the ingestion service" 50 | } 51 | 52 | resource "google_service_account" "a" { 53 | account_id = "service-account-a" 54 | display_name = "Service account for server A" 55 | } 56 | 57 | resource "google_service_account" "b" { 58 | account_id = "service-account-b" 59 | display_name = "Service account for server B" 60 | } 61 | 62 | // Assign service account permissions to each bucket. There are quite a few rules, 63 | // so we break this out into a module. 64 | module "bucket-permissions-a" { 65 | source = "./modules/bucket-permissions" 66 | bucket_private = module.bucket-a.private 67 | bucket_shared = module.bucket-a.shared 68 | bucket_ingest = module.bucket-a.ingest 69 | service_account_internal = google_service_account.a.email 70 | service_account_external = google_service_account.b.email 71 | service_account_ingest = google_service_account.ingest.email 72 | } 73 | 74 | module "bucket-permissions-b" { 75 | source = "./modules/bucket-permissions" 76 | bucket_private = module.bucket-b.private 77 | bucket_shared = module.bucket-b.shared 78 | bucket_ingest = module.bucket-b.ingest 79 | service_account_internal = google_service_account.b.email 80 | service_account_external = google_service_account.a.email 81 | service_account_ingest = google_service_account.ingest.email 82 | } 83 | 84 | // testing whether origin telemetry inserts into BigQuery correctly 85 | 86 | // The ingest container will be used for coordination, and gets access to 87 | // server A's private bucket because they are operated by the same entity. 
88 | resource "google_storage_bucket_iam_member" "ingest_internal_private" { 89 | bucket = module.bucket-a.private 90 | role = "roles/storage.objectViewer" 91 | member = "serviceAccount:${google_service_account.ingest.email}" 92 | } 93 | 94 | resource "google_project_service" "bigquery" { 95 | service = "bigquery.googleapis.com" 96 | } 97 | 98 | resource "google_bigquery_dataset" "telemetry" { 99 | dataset_id = "telemetry" 100 | location = "US" 101 | } 102 | 103 | // Grant access to the admin service account 104 | resource "google_project_iam_member" "bigquery-admin" { 105 | role = "roles/bigquery.admin" 106 | member = "serviceAccount:${google_service_account.ingest.email}" 107 | } 108 | -------------------------------------------------------------------------------- /deployment/testing-v4/terraform/modules/bucket-permissions/main.tf: -------------------------------------------------------------------------------- 1 | variable "bucket_private" { 2 | type = string 3 | description = "The private bucket for the current processor" 4 | } 5 | 6 | variable "bucket_shared" { 7 | type = string 8 | description = "The shared bucket for both processors" 9 | } 10 | 11 | variable "bucket_ingest" { 12 | type = string 13 | description = "The bucket shared with the ingestion service" 14 | } 15 | 16 | variable "service_account_internal" { 17 | type = string 18 | description = "The service account for the current processor" 19 | } 20 | 21 | variable "service_account_external" { 22 | type = string 23 | description = "The service account for the co-processor" 24 | } 25 | 26 | variable "service_account_ingest" { 27 | type = string 28 | description = "The service account for the ingestor" 29 | } 30 | 31 | resource "google_storage_bucket_iam_binding" "private" { 32 | bucket = var.bucket_private 33 | role = "roles/storage.admin" 34 | members = ["serviceAccount:${var.service_account_internal}"] 35 | } 36 | 37 | resource "google_storage_bucket_iam_binding" "shared" { 38 | bucket = var.bucket_shared 39 | role = "roles/storage.admin" 40 | members = [ 41 | "serviceAccount:${var.service_account_internal}", 42 | "serviceAccount:${var.service_account_external}" 43 | ] 44 | } 45 | 46 | resource "google_storage_bucket_iam_binding" "ingest" { 47 | bucket = var.bucket_ingest 48 | role = "roles/storage.admin" 49 | members = [ 50 | "serviceAccount:${var.service_account_internal}", 51 | "serviceAccount:${var.service_account_ingest}" 52 | ] 53 | } 54 | -------------------------------------------------------------------------------- /deployment/testing-v4/terraform/modules/bucket/main.tf: -------------------------------------------------------------------------------- 1 | variable "server_id" { 2 | type = string 3 | description = "The identifier for the server" 4 | } 5 | 6 | variable "suffix" { 7 | type = string 8 | description = "A shared suffix used for the bucket" 9 | } 10 | 11 | // Create all of the storage resources necessary for the tests. We choose to 12 | // delete files older than 7 days since these are testing resources. 
13 | 14 | resource "google_storage_bucket" "ingest" { 15 | name = "${var.server_id}-ingest-${var.suffix}" 16 | uniform_bucket_level_access = true 17 | lifecycle_rule { 18 | condition { 19 | age = 7 20 | } 21 | action { 22 | type = "Delete" 23 | } 24 | } 25 | } 26 | 27 | resource "google_storage_bucket" "private" { 28 | name = "${var.server_id}-private-${var.suffix}" 29 | uniform_bucket_level_access = true 30 | lifecycle_rule { 31 | condition { 32 | age = 7 33 | } 34 | action { 35 | type = "Delete" 36 | } 37 | } 38 | } 39 | 40 | resource "google_storage_bucket" "shared" { 41 | name = "${var.server_id}-shared-${var.suffix}" 42 | uniform_bucket_level_access = true 43 | lifecycle_rule { 44 | condition { 45 | age = 7 46 | } 47 | action { 48 | type = "Delete" 49 | } 50 | } 51 | } 52 | 53 | output "ingest" { 54 | value = google_storage_bucket.ingest.name 55 | } 56 | 57 | output "private" { 58 | value = google_storage_bucket.private.name 59 | } 60 | 61 | output "shared" { 62 | value = google_storage_bucket.shared.name 63 | } 64 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.4" 2 | 3 | services: 4 | # This service runs the tests and shuts down. This can also be used as an 5 | # entrypoint into the container by running `docker-compose run prio_processor 6 | # bash`. 7 | prio_processor: 8 | build: 9 | context: . 10 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | ../README.md -------------------------------------------------------------------------------- /docs/airflow.md: -------------------------------------------------------------------------------- 1 | # Running prio-processor on Airflow 2 | 3 | A prio-processor job is implemented in 4 | [mozilla/telemetry-airflow](https://github.com/mozilla/telemetry-airflow/), the 5 | repository that powers many of the scheduled queries and jobs within the data 6 | organization at Mozilla. This section describes how the DAG (directed 7 | acyclic graph of tasks) is organized within the repository, and the Google Cloud 8 | Platform services that it utilizes. 9 | 10 | ## DAG overview 11 | 12 | The `prio-processor` DAG is split up between work that is done by an `admin` 13 | entity and a server `a` entity. 14 | 15 | ![airflow-dag](./images/airflow-dag.png) 16 | 17 | The `admin` project handles the following jobs: 18 | 19 | - `bootstrap` - Copy the built python egg and entrypoint scripts into a cloud 20 | storage bucket for use in Dataproc (hosted Spark). Check `bin/dataproc` for 21 | more details. 22 | - `staging` - Read data from BigQuery and write out batch-id partitioned ndjson 23 | files into a bucket owned by the `admin` project. 24 | - `transfer_*` - Copy data from the `admin` project into the storage buckets of 25 | server a and server b. 26 | 27 | Server a then runs the `processor_a` job. This is run on an ephemeral 28 | Kubernetes cluster [using the 29 | `GKEPodOperator`](https://airflow.apache.org/docs/apache-airflow/1.10.15/_api/airflow/contrib/operators/gcp_container_operator/index.html). 30 | The pod operator will fetch the relevant container image, and then configure the 31 | job using environment variables that include secrets inside of the Airflow 32 | cluster.
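To make the pod operator step concrete, the following is a minimal sketch of what a `processor_a`-style task could look like. This is not the DAG that lives in telemetry-airflow (see [the real definition][prio-dag]); the project, zone, cluster name, image tag, entrypoint, and environment variable values are all illustrative assumptions.

```python
# Illustrative sketch only -- not the DAG from mozilla/telemetry-airflow.
# Project, zone, cluster name, image tag, and env var values are placeholders.
from datetime import datetime

from airflow import DAG
from airflow.contrib.operators.gcp_container_operator import GKEPodOperator

with DAG(
    dag_id="prio_processor_sketch",
    start_date=datetime(2021, 1, 1),
    schedule_interval="@weekly",
) as dag:
    processor_a = GKEPodOperator(
        task_id="processor_a",
        project_id="moz-fx-prio-a-nonprod-bf65",  # a server A project (see Development below)
        location="us-west1-b",                    # placeholder zone for the ephemeral cluster
        cluster_name="gke-prio-a",                # placeholder cluster name
        name="processor-a",
        namespace="default",
        image="mozilla/prio-processor:latest",    # placeholder image tag
        arguments=["bin/process"],                # placeholder entrypoint script
        # The job is configured through environment variables; secret values
        # would come from Airflow variables/connections rather than literals.
        env_vars={
            "SERVER_ID": "A",
            "BUCKET_INTERNAL": "a-private-example",
            "BUCKET_EXTERNAL": "b-shared-example",
        },
    )
```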
33 | 34 | Finally, the `admin` project will read data from server a's private bucket where 35 | the final aggregate results are stored. This is written into a BigQuery table 36 | that lives within the admin project. 37 | 38 | ## Infrastructure overview 39 | 40 | The infrastructure is managed in the [mozilla-services/cloudops-infra][cloudops] 41 | repository. It is split into two domains: prod and nonprod. The prod projects 42 | are used to run code inside of workflow.telemetry.mozilla.org, and contain the 43 | secret keys for Origin Telemetry aggregates that are running on Prio v1. The 44 | nonprod projects are used in the local development workflow for running jobs 45 | inside of the telemetry-airflow repository. 46 | 47 | In each realm, the projects are broken up as follows: 48 | 49 | - `admin` - This contains resources that are designed to interop with the rest 50 | of the Mozilla data platform. Service accounts in this project have the 51 | ability to read from the BigQuery datasets in `moz-fx-data-shared-prod`. 52 | Having access to the main BigQuery project is necessary to preprocess the raw 53 | data into a format that is acceptable for `prio-processor` containers. This 54 | project utilizes Dataproc for preprocessing data, and also runs various scripts 55 | using the `prio-processor` image on an ephemeral GKE cluster. 56 | - `server-a` - This project contains resources necessary to run a container on 57 | GCP. In particular, service accounts provisioned for Airflow have the ability 58 | to create and delete GKE clusters. These clusters are spun up on-demand, which 59 | allows data engineers to change the node pool specifications quickly. There 60 | are various cloud storage buckets that are configured to support multi-party 61 | communication. 62 | - `server-b` - A copy of server-a, but designed to be dropped for use with an 63 | external partner. 64 | 65 | ## Development 66 | 67 | As of 2021-09-20, there are 6 projects involved with the processing of Firefox nightly data. 68 | 69 | - moz-fx-prio-admin-prod-098j 70 | - moz-fx-prio-admin-nonprod-8uy7 71 | - moz-fx-prio-a-prod-kju7 72 | - moz-fx-prio-a-nonprod-bf65 73 | - moz-fx-prio-b-prod-a67n 74 | - moz-fx-prio-b-nonprod-h77y 75 | 76 | The `prod` variant is used to run the job in the production telemetry-airflow 77 | environment. The `nonprod` projects are used in development, which is the 78 | docker-compose workflow that runs on a local host. In addition to the general 79 | README instructions on the telemetry-airflow repository, run the following 80 | script [located in this gist][update-creds]. For this, you will need to have 81 | access to each individual project and the ability to generate service account 82 | credentials that you can store locally on disk. 83 | 84 | Enable the `prio_processor` DAG and clear tasks to begin processing data. An 85 | ephemeral Kubernetes cluster is responsible for running the containerized 86 | application. Parameters for the job are passed in via the environment.
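To give a concrete sense of what "passed in via the environment" looks like, here is a minimal sketch of environment-driven configuration. The variable names are borrowed from the docker-compose examples elsewhere in this repository; the entrypoint scripts under `bin/` remain the authoritative list of what the container actually requires.

```python
# Minimal sketch of reading job parameters from the environment. The variable
# names are borrowed from the docker-compose examples in this repository; the
# bin/ entrypoint scripts define the real required set.
import os

REQUIRED = [
    "SERVER_ID",
    "PRIVATE_KEY_HEX",
    "PUBLIC_KEY_HEX_INTERNAL",
    "PUBLIC_KEY_HEX_EXTERNAL",
    "BUCKET_INTERNAL",
    "BUCKET_EXTERNAL",
]

missing = [name for name in REQUIRED if not os.environ.get(name)]
if missing:
    raise SystemExit(f"missing environment variables: {', '.join(missing)}")

config = {name: os.environ[name] for name in REQUIRED}
print(f"running as server {config['SERVER_ID']}")
```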
87 | 88 | [prio-dag]: https://github.com/mozilla/telemetry-airflow/blob/915a78e1e936acbb89ec9d3d35e64ce77adc6781/dags/prio_processor.py 89 | [prio-utils]: https://github.com/mozilla/telemetry-airflow/tree/915a78e1e936acbb89ec9d3d35e64ce77adc6781/dags/prio 90 | [update-creds]: https://gist.github.com/acmiyaguchi/a1652f3d56f589e773a9c270bd7f1e6a 91 | [cloudops]: https://github.com/mozilla-services/cloudops-infra/tree/900cafb27cd42fb950d3249e152d3c72541ff424/projects/prio 92 | -------------------------------------------------------------------------------- /docs/images/airflow-dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/prio-processor/5bbb33ecd7a45480ffd5a47c677bfa60660166fe/docs/images/airflow-dag.png -------------------------------------------------------------------------------- /docs/link/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ../../CODE_OF_CONDUCT.md -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples of the Python `prio` package 2 | 3 | There are various examples included in the repository that demonstrate small 4 | applications that can be built using this set of tools. 5 | 6 | * `swig-wrapper` - A simple application demonstrating the full Prio pipeline. 7 | * `python-wrapper` - A usage of the object-oriented Python wrapper. 8 | * `benchmarks` - Various benchmarks in a Jupyter notebook. 9 | * `browser-validation` - The validation code used to verify existing Firefox 10 | measurements for this [blog 11 | post](https://hacks.mozilla.org/2018/10/testing-privacy-preserving-telemetry-with-prio/). 12 | * `asyncio` - An asynchronous pipeline. 13 | * `docker-asyncio` - An asynchronous pipeline using a publish-subscribe 14 | architecture. 15 | * `batched-processing` - A batched-object processing system using docker and minio. 16 | -------------------------------------------------------------------------------- /examples/asyncio/README.md: -------------------------------------------------------------------------------- 1 | # asyncio Example Usage 2 | 3 | This example demonstrates usage of the python wrapper using asyncio. 4 | 5 | ## Running the example 6 | 7 | ```bash 8 | docker run -v $(pwd):/app -it prio:dev python3 main.py 9 | ``` 10 | 11 | Results in: 12 | 13 | ```bash 14 | INFO:root:Starting asyncio prio pipeline. 
15 | INFO:root:Client 0: Generate shares 16 | INFO:root:Client 1: Generate shares 17 | INFO:root:Client 2: Generate shares 18 | INFO:root:Client 3: Generate shares 19 | INFO:root:Server 1, PID 0: Generate verify packet 1 20 | INFO:root:Server 0, PID 0: Generate verify packet 1 21 | INFO:root:Server 0, PID 1: Generate verify packet 1 22 | INFO:root:Server 0, PID 2: Generate verify packet 1 23 | INFO:root:Server 1, PID 1: Generate verify packet 1 24 | INFO:root:Server 0, PID 3: Generate verify packet 1 25 | INFO:root:Server 0, PID 0: Generate verify packet 2 26 | INFO:root:Server 1, PID 2: Generate verify packet 1 27 | INFO:root:Server 0, PID 1: Generate verify packet 2 28 | INFO:root:Server 0, PID 2: Generate verify packet 2 29 | INFO:root:Server 1, PID 3: Generate verify packet 1 30 | INFO:root:Server 0, PID 3: Generate verify packet 2 31 | INFO:root:Server 1, PID 0: Generate verify packet 2 32 | INFO:root:Server 0, PID 0: Aggregate data 33 | INFO:root:Server 1, PID 1: Generate verify packet 2 34 | INFO:root:Server 0, PID 1: Aggregate data 35 | INFO:root:Server 1, PID 2: Generate verify packet 2 36 | INFO:root:Server 0, PID 2: Aggregate data 37 | INFO:root:Server 1, PID 3: Generate verify packet 2 38 | INFO:root:Server 0, PID 3: Aggregate data 39 | INFO:root:Server 1, PID 0: Aggregate data 40 | INFO:root:Server 1, PID 1: Aggregate data 41 | INFO:root:Server 1, PID 2: Aggregate data 42 | INFO:root:Server 1, PID 3: Aggregate data 43 | INFO:root:Done! 44 | ``` 45 | 46 | ## Dataflow diagram 47 | 48 | ![Dataflow DAG](./dag.png) 49 | 50 | ```mermaid 51 | graph TD 52 | 53 | client -->|shares| create_p1A(create_verify1) 54 | subgraph Server A 55 | subgraph Verifier 56 | create_p1A --> |p1A| create_p2A(create_verify2) 57 | create_p2A --> |p2A| isvalid_A{isValid} 58 | end 59 | isvalid_A --> aggregate_A(Aggregate) 60 | end 61 | 62 | create_p1A --> |p1A| create_p2B 63 | create_p2A --> |p2A| isvalid_B 64 | 65 | 66 | client -->|shares| create_p1B(create_verify1) 67 | subgraph Server B 68 | subgraph Verifier 69 | create_p1B --> |p1B| create_p2B(create_verify2) 70 | create_p2B --> |p2B| isvalid_B{isValid} 71 | end 72 | isvalid_B --> aggregate_B(Aggregate) 73 | end 74 | 75 | create_p1B --> |p1B| create_p2A 76 | create_p2B --> |p2B| isvalid_A 77 | 78 | 79 | aggregate_A --> Total 80 | aggregate_B --> Total 81 | ``` 82 | -------------------------------------------------------------------------------- /examples/asyncio/dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/prio-processor/5bbb33ecd7a45480ffd5a47c677bfa60660166fe/examples/asyncio/dag.png -------------------------------------------------------------------------------- /examples/asyncio/main.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | import asyncio 6 | import logging 7 | import random 8 | import sys 9 | from collections import namedtuple 10 | 11 | from prio_processor.prio import wrapper as prio 12 | from prio.libprio import Prio_init, Prio_clear 13 | 14 | logging.basicConfig() 15 | logger = logging.getLogger() 16 | logger.setLevel(logging.INFO) 17 | 18 | PACKET_DATA = 0 19 | PACKET_VERIFY_1 = 1 20 | PACKET_VERIFY_2 = 2 21 | 22 | Packet = namedtuple("Packet", ["id", "type", "data"]) 23 | 24 | 25 | async def server_consume(server, read_queue, write_queue): 26 | # maintain state of the server's shares in the verifier, along with the 27 | # generated verification packets 28 | cache = {} 29 | 30 | while True: 31 | # add random jitter to simulate io 32 | await asyncio.sleep(random.random()) 33 | 34 | packet = await read_queue.get() 35 | pid = packet.id 36 | v, p1, p2 = cache.get(pid, (None, None, None)) 37 | 38 | def log(line): 39 | logger.info("Server {}, PID {}: {}".format(server.server_id, pid, line)) 40 | 41 | # out of order packet execution is dealt with by pushing data back 42 | # into the queue 43 | 44 | if packet.type == PACKET_DATA: 45 | log("Generate verify packet 1") 46 | v = server.create_verifier(packet.data) 47 | p1 = v.create_verify1() 48 | await write_queue.put(Packet(id=pid, type=PACKET_VERIFY_1, data=p1)) 49 | elif packet.type == PACKET_VERIFY_1: 50 | if not p1: 51 | await read_queue.put(packet) 52 | read_queue.task_done() 53 | continue 54 | log("Generate verify packet 2") 55 | p2 = v.create_verify2(p1, packet.data) 56 | await write_queue.put(Packet(id=pid, type=PACKET_VERIFY_2, data=p2)) 57 | elif packet.type == PACKET_VERIFY_2: 58 | if not p2: 59 | await read_queue.put(packet) 60 | read_queue.task_done() 61 | continue 62 | if v.is_valid(p2, packet.data): 63 | log("Aggregate data") 64 | server.aggregate(v) 65 | else: 66 | log("Invalid data") 67 | del cache[pid] 68 | 69 | read_queue.task_done() 70 | cache[pid] = (v, p1, p2) 71 | 72 | 73 | async def client_produce(client, data_items, queue_a, queue_b, n_clients): 74 | for i in range(n_clients): 75 | logger.info("Client {}: Generate shares".format(i)) 76 | for_server_a, for_server_b = client.encode(data_items) 77 | await queue_a.put(Packet(id=i, type=PACKET_DATA, data=for_server_a)) 78 | await queue_b.put(Packet(id=i, type=PACKET_DATA, data=for_server_b)) 79 | 80 | 81 | async def main(): 82 | Prio_init() 83 | n_clients = 4 84 | n_data = 133 85 | server_secret = prio.PRGSeed() 86 | skA, pkA = prio.create_keypair() 87 | skB, pkB = prio.create_keypair() 88 | 89 | cfg = prio.Config(n_data, pkA, pkB, b"test_batch") 90 | sA = prio.Server(cfg, prio.PRIO_SERVER_A, skA, server_secret) 91 | sB = prio.Server(cfg, prio.PRIO_SERVER_B, skB, server_secret) 92 | 93 | data_items = bytes([(i % 3 == 1) or (i % 5 == 1) for i in range(n_data)]) 94 | 95 | logger.info("Starting asyncio prio pipeline.") 96 | client = prio.Client(cfg) 97 | queue_a = asyncio.Queue() 98 | queue_b = asyncio.Queue() 99 | 100 | await client_produce(client, data_items, queue_a, queue_b, n_clients) 101 | 102 | consumers = asyncio.ensure_future( 103 | asyncio.gather( 104 | server_consume(sA, queue_a, queue_b), server_consume(sB, queue_b, queue_a) 105 | ) 106 | ) 107 | 108 | await asyncio.gather(queue_a.join(), queue_b.join()) 109 | 110 | t_a = sA.total_shares() 111 | t_b = sB.total_shares() 112 | 113 | output = prio.total_share_final(cfg, t_a, t_b) 114 | 115 | expected = [item * n_clients for item in list(data_items)] 116 | assert list(output) == expected 117 | 118 | consumers.cancel() 119 | 
Prio_clear() 120 | logger.info("Done!") 121 | 122 | 123 | if __name__ == "__main__": 124 | loop = asyncio.get_event_loop() 125 | loop.run_until_complete(main()) 126 | loop.close() 127 | -------------------------------------------------------------------------------- /examples/batched-processing/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM prio:dev 2 | 3 | RUN curl https://dl.minio.io/client/mc/release/linux-amd64/mc -o mc 4 | RUN chmod +x mc 5 | ENV PATH="/app/:${PATH}" 6 | 7 | CMD bash 8 | -------------------------------------------------------------------------------- /examples/batched-processing/Makefile: -------------------------------------------------------------------------------- 1 | build: 2 | docker build --target development -t prio:dev ../.. 3 | docker-compose build 4 | 5 | test: 6 | scripts/integration.sh 7 | 8 | clean: 9 | docker-compose down 10 | -------------------------------------------------------------------------------- /examples/batched-processing/README.md: -------------------------------------------------------------------------------- 1 | # Batched Processing 2 | 3 | This example is an example of a minimal two-server aggregation scheme that 4 | fulfils the privacy guarantees of the Prio system. 5 | 6 | ## Quickstart 7 | 8 | ```bash 9 | # Build the containers 10 | make build 11 | 12 | # Run the test 13 | make test 14 | ``` 15 | 16 | ## Resources 17 | 18 | ### Generated Keys 19 | ```json 20 | # Server A 21 | { 22 | "private_key": "19DDC146FB8EE4A0B762A7DAE7E96033F87C9528DBBF8CA899CCD1DB8CD74984", 23 | "public_key": "445C126981113E5684D517826E508F5731A1B35485BACCD63DAA8120DD11DA78" 24 | } 25 | 26 | # Server B 27 | { 28 | "private_key": "E3AA3CC952C8553E46E699646A9DC3CBA7E3D4C7F0779D58574ABF945E259202", 29 | "public_key": "01D5D4F179ED233140CF97F79594F0190528268A99A6CDF57EF0E1569E673642" 30 | } 31 | ``` 32 | 33 | ## Misc 34 | 35 | * Generating an [s3 policy file](https://docs.aws.amazon.com/AmazonS3/latest/dev/example-policies-s3.html) 36 | * [MinIO multi-user quickstart guide](https://docs.min.io/docs/minio-multi-user-quickstart-guide.html) 37 | -------------------------------------------------------------------------------- /examples/batched-processing/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | minio: 5 | # https://docs.min.io/docs/minio-docker-quickstart-guide 6 | # https://docs.min.io/docs/minio-multi-user-quickstart-guide.html 7 | image: minio/minio:latest 8 | command: server /data 9 | ports: 10 | - 9000:9000 11 | environment: 12 | - MINIO_ACCESS_KEY=admin 13 | - MINIO_SECRET_KEY=password 14 | 15 | bootstrap: 16 | image: minio/mc:latest 17 | depends_on: 18 | - minio 19 | working_dir: /root 20 | entrypoint: sh 21 | command: scripts/bootstrap.sh 22 | volumes: 23 | - .:/root/ 24 | 25 | server_a: 26 | build: . 
27 | working_dir: /app/examples/batched-processing 28 | command: "true" 29 | volumes: 30 | - .:/app/examples/batched-processing 31 | depends_on: 32 | - minio 33 | - bootstrap 34 | environment: 35 | - N_DATA=3 36 | - BATCH_ID=test 37 | - SERVER_ID=A 38 | - SHARED_SECRET=m/AqDal/ZSA9597GwMM+VA== 39 | - PRIVATE_KEY_HEX=19DDC146FB8EE4A0B762A7DAE7E96033F87C9528DBBF8CA899CCD1DB8CD74984 40 | - PUBLIC_KEY_HEX_INTERNAL=445C126981113E5684D517826E508F5731A1B35485BACCD63DAA8120DD11DA78 41 | - PUBLIC_KEY_HEX_EXTERNAL=01D5D4F179ED233140CF97F79594F0190528268A99A6CDF57EF0E1569E673642 42 | - MINIO_ACCESS_KEY=server-a 43 | - MINIO_SECRET_KEY=password 44 | - BUCKET_INTERNAL=server-a 45 | - BUCKET_EXTERNAL=server-b 46 | 47 | server_b: 48 | build: . 49 | working_dir: /app/examples/batched-processing 50 | command: "true" 51 | volumes: 52 | - .:/app/examples/batched-processing 53 | depends_on: 54 | - minio 55 | - bootstrap 56 | environment: 57 | - N_DATA=3 58 | - BATCH_ID=test 59 | - SERVER_ID=B 60 | - SHARED_SECRET=m/AqDal/ZSA9597GwMM+VA== 61 | - PRIVATE_KEY_HEX=E3AA3CC952C8553E46E699646A9DC3CBA7E3D4C7F0779D58574ABF945E259202 62 | - PUBLIC_KEY_HEX_INTERNAL=01D5D4F179ED233140CF97F79594F0190528268A99A6CDF57EF0E1569E673642 63 | - PUBLIC_KEY_HEX_EXTERNAL=445C126981113E5684D517826E508F5731A1B35485BACCD63DAA8120DD11DA78 64 | - MINIO_ACCESS_KEY=server-b 65 | - MINIO_SECRET_KEY=password 66 | - BUCKET_INTERNAL=server-b 67 | - BUCKET_EXTERNAL=server-a 68 | 69 | client: 70 | build: . 71 | working_dir: /app/examples/batched-processing 72 | command: "true" 73 | volumes: 74 | - .:/app/examples/batched-processing 75 | depends_on: 76 | - server_a 77 | - server_b 78 | environment: 79 | - N_DATA=3 80 | - BATCH_ID=test 81 | # Server A and B respectively 82 | - PUBLIC_KEY_HEX_INTERNAL=445C126981113E5684D517826E508F5731A1B35485BACCD63DAA8120DD11DA78 83 | - PUBLIC_KEY_HEX_EXTERNAL=01D5D4F179ED233140CF97F79594F0190528268A99A6CDF57EF0E1569E673642 84 | - MINIO_ACCESS_KEY=admin 85 | - MINIO_SECRET_KEY=password 86 | - BUCKET_SERVER_A=server-a 87 | - BUCKET_SERVER_B=server-b 88 | -------------------------------------------------------------------------------- /examples/batched-processing/policy/server-a.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Action": [ 6 | "s3:*" 7 | ], 8 | "Effect": "Allow", 9 | "Resource": [ 10 | "arn:aws:s3:::server-a/*", 11 | "arn:aws:s3:::server-b/intermediate/external/*" 12 | ], 13 | "Sid": "" 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /examples/batched-processing/policy/server-b.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Action": [ 6 | "s3:*" 7 | ], 8 | "Effect": "Allow", 9 | "Resource": [ 10 | "arn:aws:s3:::server-b/*", 11 | "arn:aws:s3:::server-a/intermediate/external/*" 12 | ], 13 | "Sid": "" 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /examples/batched-processing/scripts/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -euo pipefail 4 | set -x 5 | 6 | TARGET="minio" 7 | 8 | mc config host add $TARGET http://minio:9000 admin password 9 | mc mb $TARGET/server-a 10 | mc mb $TARGET/server-b 11 | 12 | # mc admin policy add TARGET POLICYNAME POLICYFILE 13 | mc admin policy add $TARGET server-a 
policy/server-a.json 14 | mc admin policy add $TARGET server-b policy/server-b.json 15 | 16 | # mc admin user add TARGET ACCESSKEY SECRETKEY 17 | mc admin user add $TARGET server-a password 18 | mc admin user add $TARGET server-b password 19 | 20 | # mc admin policy set TARGET POLICYNAME user=ACCESSKEY 21 | mc admin policy set $TARGET server-a user=server-a 22 | mc admin policy set $TARGET server-b user=server-b 23 | -------------------------------------------------------------------------------- /examples/batched-processing/scripts/check-aggregates.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check that aggregates that are computed via client.sh and server.sh are 4 | # correct and published to the correct location. This script should be run 5 | # inside of the client container. 6 | 7 | set -euo pipefail 8 | set -x 9 | 10 | : ${MINIO_ACCESS_KEY?} 11 | : ${MINIO_SECRET_KEY?} 12 | : ${BUCKET_SERVER_A?} 13 | : ${BUCKET_SERVER_B?} 14 | 15 | TARGET="minio" 16 | mc config host add $TARGET http://minio:9000 ${MINIO_ACCESS_KEY} ${MINIO_SECRET_KEY} 17 | 18 | function get_payload() { 19 | local path=$1 20 | mc cat "${path}" | jq -c '.payload' 21 | } 22 | 23 | [[ $(get_payload $TARGET/$BUCKET_SERVER_A/processed/part-0.ndjson) == "[3,2,1]" ]] 24 | [[ $(get_payload $TARGET/$BUCKET_SERVER_B/processed/part-0.ndjson) == "[3,2,1]" ]] 25 | 26 | [[ $(get_payload $TARGET/$BUCKET_SERVER_A/processed/part-1.ndjson) == "[4,2,4]" ]] 27 | [[ $(get_payload $TARGET/$BUCKET_SERVER_B/processed/part-1.ndjson) == "[4,2,4]" ]] 28 | 29 | [[ $(get_payload $TARGET/$BUCKET_SERVER_A/processed/part-2.ndjson) == "[7,3,1]" ]] 30 | [[ $(get_payload $TARGET/$BUCKET_SERVER_B/processed/part-2.ndjson) == "[7,3,1]" ]] 31 | -------------------------------------------------------------------------------- /examples/batched-processing/scripts/client.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eou pipefail 4 | set -x 5 | 6 | # Parameters that are read through the environment 7 | : ${N_DATA?} 8 | : ${BATCH_ID?} 9 | : ${PUBLIC_KEY_HEX_INTERNAL?} 10 | : ${PUBLIC_KEY_HEX_EXTERNAL?} 11 | 12 | : ${MINIO_ACCESS_KEY?} 13 | : ${MINIO_SECRET_KEY?} 14 | : ${BUCKET_SERVER_A?} 15 | : ${BUCKET_SERVER_B?} 16 | 17 | TARGET="minio" 18 | mc config host add $TARGET http://minio:9000 ${MINIO_ACCESS_KEY} ${MINIO_SECRET_KEY} 19 | 20 | # The bucket name is used for the local file directory and for the remote minio 21 | # bucket. 22 | cd /tmp 23 | output_a=$BUCKET_SERVER_A/raw 24 | output_b=$BUCKET_SERVER_B/raw 25 | mkdir -p $output_a 26 | mkdir -p $output_b 27 | 28 | jq -c '{payload: .}' <<EOF >part-0.ndjson 29 | [1, 0, 0] 30 | [1, 1, 0] 31 | [1, 1, 1] 32 | EOF 33 | 34 | jq -c '{payload: .}' <<EOF >part-1.ndjson 35 | [1, 0, 1] 36 | [1, 1, 1] 37 | [1, 0, 1] 38 | [1, 1, 1] 39 | EOF 40 | 41 | jq -c '{payload: .}' <<EOF >part-2.ndjson 42 | [1, 0, 0] 43 | [1, 0, 0] 44 | [1, 0, 0] 45 | [1, 0, 0] 46 | [1, 1, 0] 47 | [1, 1, 0] 48 | [1, 1, 1] 49 | EOF 50 | 51 | for filename in $(find . -name "*.ndjson"); do 52 | prio encode-shares \ 53 | --input $filename \ 54 | --output-A $output_a \ 55 | --output-B $output_b 56 | 57 | jq -c '.' $output_a/$filename 58 | jq -c '.'
$output_b/$filename 59 | done 60 | 61 | mc cp --recursive $output_a/ $TARGET/$output_a/ 62 | mc cp --recursive $output_b/ $TARGET/$output_b/ 63 | 64 | touch _SUCCESS 65 | mc cp _SUCCESS $TARGET/$output_a/_SUCCESS 66 | mc cp _SUCCESS $TARGET/$output_b/_SUCCESS 67 | -------------------------------------------------------------------------------- /examples/batched-processing/scripts/integration.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script controls the docker-compose workflow for integration testing. The 4 | # containers are defined in the docker-compose.yml, but are orchestrated through 5 | # this script for verification. 6 | # 7 | # See: https://stackoverflow.com/questions/40907954/terminate-docker-compose-when-test-container-finishes 8 | 9 | set -euo pipefail 10 | 11 | docker-compose up -d 12 | 13 | # Add a cleanup handler for the exit signal 14 | function cleanup { 15 | docker-compose down 16 | } 17 | trap cleanup EXIT 18 | 19 | # Start server A 20 | docker-compose run server_a scripts/server.sh & 21 | server_a_pid=$! 22 | 23 | # Start server B 24 | docker-compose run server_b scripts/server.sh & 25 | server_b_pid=$! 26 | 27 | # Copy data into the appropriate buckets 28 | docker-compose run client scripts/client.sh 29 | 30 | # Return the exit code of the backgrounded docker-compose container. Since 31 | # `wait` is a blocking function, a failure in server B will not be detected 32 | # until timeout in server A. 33 | wait $server_a_pid 34 | wait $server_b_pid 35 | 36 | docker-compose run client scripts/check-aggregates.sh 37 | -------------------------------------------------------------------------------- /examples/benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # libprio benchmarks 2 | 3 | Information about payload sizes and client encoding time as a function of input 4 | size. 
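For a single data point, the measurement amounts to encoding an input vector of a given length and recording the size of the two encrypted shares; a minimal sketch using the same wrapper API as the benchmark script `main.py` (the input length of 128 is arbitrary):

```python
# Minimal sketch: report the encrypted share sizes for one input length.
# Mirrors the approach in main.py; the input length of 128 is arbitrary.
from prio import PrioContext
from prio_processor.prio import wrapper as prio

with PrioContext():
    _, pubkey = prio.create_keypair()
    n = 128
    cfg = prio.Config(n, pubkey, pubkey, b"bench")
    share_a, share_b = prio.Client(cfg).encode(bytes([1] * n))
    print(f"n={n}: share A={len(share_a)} bytes, share B={len(share_b)} bytes")
```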
5 | 6 | ## Reproducing results 7 | 8 | ```bash 9 | pip install -r requirements.txt 10 | python3 main.py 11 | ``` 12 | 13 | ![encrypted sizes](./encrypted_sizes.png) 14 | ![client encoding time](./client_encoding_time.png) -------------------------------------------------------------------------------- /examples/benchmarks/client_encoding_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/prio-processor/5bbb33ecd7a45480ffd5a47c677bfa60660166fe/examples/benchmarks/client_encoding_time.png -------------------------------------------------------------------------------- /examples/benchmarks/encrypted_sizes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/prio-processor/5bbb33ecd7a45480ffd5a47c677bfa60660166fe/examples/benchmarks/encrypted_sizes.png -------------------------------------------------------------------------------- /examples/benchmarks/main.py: -------------------------------------------------------------------------------- 1 | import timeit 2 | 3 | import matplotlib 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | from prio_processor.prio import wrapper as prio 7 | from prio import PrioContext 8 | from tqdm import tqdm 9 | 10 | 11 | @PrioContext() 12 | def bench_encrypted_sizes(path): 13 | _, pubkey = prio.create_keypair() 14 | 15 | def size(n): 16 | cfg = prio.Config(n, pubkey, pubkey, b"test") 17 | a, b = prio.Client(cfg).encode(bytes([1] * k)) 18 | return [k, len(a), len(b)] 19 | 20 | sizes = [] 21 | for k in tqdm(range(0, 10000, 100)): 22 | try: 23 | sizes.append(size(k)) 24 | except: 25 | print(f"Prio excepted at {k} items") 26 | break 27 | 28 | fig, ax = plt.subplots() 29 | ax.set_xscale("log", basex=2) 30 | ax.set_yscale("log", basey=2) 31 | ax.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter()) 32 | ax.yaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter()) 33 | plt.title("Prio measurement size vs payload size") 34 | plt.xlabel("measurement size (bits)") 35 | plt.ylabel("payload size (bytes)") 36 | plt.plot(*np.array(sizes).T[:2]) 37 | plt.savefig(path) 38 | 39 | 40 | @PrioContext() 41 | def bench_client_encoding(path): 42 | runs = 10 ** 2 43 | timings = [] 44 | for k in tqdm([8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]): 45 | _, pubkey = prio.create_keypair() 46 | cfg = prio.Config(k, pubkey, pubkey, b"test_batch") 47 | client = prio.Client(cfg) 48 | data = bytes([1] * k) 49 | timing = timeit.timeit("client.encode(data)", number=runs, globals=locals()) 50 | timings.append([k, timing]) 51 | 52 | data = np.array(timings) 53 | y = data[:, 1] / runs 54 | x = data[:, 0] 55 | 56 | fig, ax = plt.subplots() 57 | plt.title(f"measurement size vs encoding time (n={runs})") 58 | plt.xlabel("measurement size (bits)") 59 | plt.ylabel("encoding time (seconds)") 60 | ax.set_xscale("log", basex=2) 61 | ax.set_yscale("log", basey=2) 62 | ax.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter()) 63 | ax.yaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter()) 64 | plt.plot(x, y) 65 | plt.savefig(path) 66 | 67 | 68 | def main(): 69 | print("running benchmark for encrypted sizes") 70 | bench_encrypted_sizes("encrypted_sizes.png") 71 | print("running benchmark for client encoding time") 72 | bench_client_encoding("client_encoding_time.png") 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- 
/examples/benchmarks/requirements.in: -------------------------------------------------------------------------------- 1 | matplotlib 2 | numpy 3 | tqdm 4 | -e file:../.. 5 | -------------------------------------------------------------------------------- /examples/benchmarks/requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile 6 | # 7 | -e file:../.. 8 | attrs==19.3.0 # via jsonschema 9 | cachetools==4.1.1 # via google-auth 10 | certifi==2020.6.20 # via requests 11 | chardet==3.0.4 # via requests 12 | click==7.0 13 | cycler==0.10.0 # via matplotlib 14 | decorator==4.4.2 # via gcsfs 15 | gcsfs==0.2.3 16 | google-auth-oauthlib==0.4.1 # via gcsfs 17 | google-auth==1.20.0 # via gcsfs, google-auth-oauthlib 18 | idna==2.10 # via requests 19 | importlib-metadata==1.7.0 # via jsonschema 20 | jsonschema==3.2.0 21 | kiwisolver==1.1.0 # via matplotlib 22 | matplotlib==3.1.3 23 | numpy==1.18.1 24 | oauthlib==3.1.0 # via requests-oauthlib 25 | prio==1.1 26 | py4j==0.10.9 # via pyspark 27 | pyasn1-modules==0.2.8 # via google-auth 28 | pyasn1==0.4.8 # via pyasn1-modules, rsa 29 | pyparsing==2.4.6 # via matplotlib 30 | pyrsistent==0.16.0 # via jsonschema 31 | pyspark==3.0.0 32 | python-dateutil==2.8.1 # via matplotlib 33 | requests-oauthlib==1.3.0 # via google-auth-oauthlib 34 | requests==2.24.0 # via gcsfs, requests-oauthlib 35 | rsa==4.6 # via google-auth 36 | six==1.14.0 # via cycler, google-auth, jsonschema, pyrsistent, python-dateutil 37 | tqdm==4.43.0 38 | urllib3==1.25.10 # via requests 39 | zipp==3.1.0 # via importlib-metadata 40 | 41 | # The following packages are considered to be unsafe in a requirements file: 42 | # setuptools 43 | -------------------------------------------------------------------------------- /examples/browser-validation/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM prio:dev 2 | 3 | # install wait-for for docker-compose services 4 | RUN curl -o /usr/local/bin/wait-for-it https://raw.githubusercontent.com/vishnubob/wait-for-it/master/wait-for-it.sh 5 | RUN chmod +x /usr/local/bin/wait-for-it 6 | 7 | RUN pip3 install \ 8 | s3fs \ 9 | pyarrow \ 10 | click \ 11 | pandas \ 12 | 13 | CMD bash 14 | -------------------------------------------------------------------------------- /examples/browser-validation/Makefile: -------------------------------------------------------------------------------- 1 | build: 2 | docker build --target development -t prio:dev ../.. 3 | 4 | run: 5 | docker-compose run app 6 | 7 | test: 8 | docker-compose run bash -c "python3 generate.py --path test.batch.json | bash" 9 | -------------------------------------------------------------------------------- /examples/browser-validation/README.md: -------------------------------------------------------------------------------- 1 | # Validate Browser Data 2 | 3 | This example validates the results from a pilot experiment. The API has changed 4 | and the original data source does not exist in this form any longer due to a 5 | transition to GCP. This code is deprecated, but may be run in it's most recent 6 | working state in the v1.6.1 tag of the container. 7 | 8 | ## Usage 9 | 10 | Setup 11 | 12 | ```bash 13 | make run 14 | make test 15 | ``` 16 | 17 | To test against generated data, run the `generate.py` script. 
18 | 19 | ``` 20 | $ python generate.py --path test.batch.json 21 | ``` 22 | 23 | This will generate the corresponding command for validation. 24 | Verify the output of this command before running it. 25 | 26 | ``` 27 | $ python generate.py --path test.batch.json | bash 28 | ``` 29 | 30 | To run against a real browser ping, you can run a command in the following form: 31 | 32 | ``` 33 | $ python main.py \ 34 | --pings sample.batch.json \ 35 | --pubkey-A \ 36 | --pvtkey-A \ 37 | --pubkey-B \ 38 | --pvtkey-B 39 | ``` 40 | 41 | The `--pings` argument generally takes a set of json documents; one per line and delimited by a new line. 42 | 43 | The ping should be compacted before being presented to the program. 44 | 45 | ``` 46 | # use `jq -c` to compact a json document 47 | $ cat my-ping.json | jq -c . > my-ping.batch.json 48 | ``` 49 | 50 | To run against the parquet dataset, make sure you have AWS credentials with access to the appropriate bucket. To verify that everything is set up correctly: 51 | 52 | ``` 53 | $ aws s3 ls s3://net-mozaws-prod-us-west-2-pipeline-analysis/amiyaguchi/prio/v1 54 | ``` 55 | 56 | Then run the following command: 57 | 58 | ``` 59 | $ python main.py \ 60 | --date 20181007 \ 61 | --pubkey-A \ 62 | --pvtkey-A \ 63 | --pubkey-B \ 64 | --pvtkey-B 65 | ``` 66 | 67 | ### Docker 68 | 69 | This image may also be run via docker. Pass the appropriate environment variables as follows: 70 | 71 | ```bash 72 | $ make build 73 | 74 | $ make test 75 | 76 | $ AWS_ACCESS_KEY_ID= \ 77 | AWS_SECRET_ACCESS_KEY= \ 78 | PRIO_DATE= \ 79 | PRIO_PUBKEY_A= \ 80 | PRIO_PVTKEY_A= \ 81 | PRIO_PUBKEY_B= \ 82 | PRIO_PVTKEY_B= \ 83 | make run 84 | ``` 85 | -------------------------------------------------------------------------------- /examples/browser-validation/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.4' 2 | 3 | services: 4 | app: 5 | build: . 
6 | volumes: 7 | - .:/app/examples/docker-asyncio 8 | command: python3 main.py 9 | environment: 10 | - AWS_ACCESS_KEY_ID 11 | - AWS_SECRET_ACCESS_KEY 12 | - PRIO_DATE 13 | - PRIO_PINGS 14 | - PRIO_PUBKEY_A 15 | - PRIO_PVTKEY_A 16 | - PRIO_PUBKEY_B 17 | - PRIO_PVTKEY_B 18 | -------------------------------------------------------------------------------- /examples/browser-validation/generate.py: -------------------------------------------------------------------------------- 1 | import json 2 | from itertools import product 3 | 4 | import click 5 | from prio import prio 6 | 7 | 8 | # cardinality of the input vector 9 | N_DATA = 3 10 | 11 | 12 | def construct(build_id, user_default, newtab, pdf, data_a, data_b): 13 | ping = { 14 | "environment": {"build": {"buildId": build_id}}, 15 | "payload": { 16 | "histograms": { 17 | "BROWSER_IS_USER_DEFAULT": {"sum": user_default}, 18 | "NEWTAB_PAGE_ENABLED": {"sum": newtab}, 19 | "PDF_VIEWER_USED": {"sum": pdf}, 20 | }, 21 | "prio": { 22 | "a": {k: int(v) for k, v in enumerate(data_a)}, 23 | "b": {k: int(v) for k, v in enumerate(data_b)}, 24 | }, 25 | }, 26 | } 27 | return ping 28 | 29 | 30 | def generate(build_id, client): 31 | data = [] 32 | for vector in product([0, 1], [0, 1], [0, 1]): 33 | args = list(vector) + client.encode(bytes(vector)) 34 | ping = construct(build_id, *args) 35 | data.append(ping) 36 | return data 37 | 38 | 39 | def write(fp, data): 40 | fp.write("\n".join(map(json.dumps, data))) 41 | 42 | 43 | @click.command() 44 | @click.option("--path", type=click.Path(exists=False), required=True) 45 | @click.option("--batch-id", type=str, default="test-batch") 46 | def main(path, batch_id): 47 | # create the encryption keys 48 | skA, pkA = prio.create_keypair() 49 | skB, pkB = prio.create_keypair() 50 | 51 | # create the client 52 | cfg = prio.Config(N_DATA, pkA, pkB, bytes(batch_id, "utf-8")) 53 | client = prio.Client(cfg) 54 | 55 | # generate test data 56 | data = generate(batch_id, client) 57 | with open(path, "w") as f: 58 | write(f, data) 59 | 60 | # print a command to use 61 | def clean(s): 62 | return s[:-1].decode("utf-8") 63 | 64 | args = { 65 | "--pings": path, 66 | "--pubkey-A": clean(pkA.export_hex()), 67 | "--pvtkey-A": clean(skA.export_hex()), 68 | "--pubkey-B": clean(pkB.export_hex()), 69 | "--pvtkey-B": clean(skB.export_hex()), 70 | } 71 | argstr = " \\".join([f"\n\t{k} {v}" for k, v in args.items()]) 72 | print(f"python main.py \\{argstr}") 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /examples/docker-asyncio/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM prio:dev 2 | 3 | # install wait-for for docker-compose services 4 | RUN curl -o /usr/local/bin/wait-for-it https://raw.githubusercontent.com/vishnubob/wait-for-it/master/wait-for-it.sh 5 | RUN chmod +x /usr/local/bin/wait-for-it 6 | 7 | RUN pip3 install \ 8 | aioamqp \ 9 | click \ 10 | aio-pika 11 | 12 | CMD bash 13 | -------------------------------------------------------------------------------- /examples/docker-asyncio/Makefile: -------------------------------------------------------------------------------- 1 | build: 2 | docker build --target development -t prio:dev ../.. 
3 | docker-compose build 4 | 5 | run: 6 | docker-compose up 7 | 8 | clean: 9 | docker-compose down -------------------------------------------------------------------------------- /examples/docker-asyncio/client.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import aio_pika 3 | import logging 4 | import click 5 | 6 | from prio_processor.prio import wrapper as prio 7 | from prio import PrioContext 8 | 9 | logging.basicConfig() 10 | logger = logging.getLogger() 11 | logger.setLevel(logging.INFO) 12 | 13 | 14 | async def run_client(pubkey_a, pubkey_b, n_clients, n_fields, batch_id): 15 | connection = await aio_pika.connect_robust("amqp://guest:guest@rabbitmq:5672/") 16 | channel = await connection.channel() 17 | await channel.declare_queue("prio.0") 18 | await channel.declare_queue("prio.1") 19 | 20 | # delay for server setup 21 | await asyncio.sleep(3) 22 | 23 | pkA = prio.PublicKey().import_hex(pubkey_a) 24 | pkB = prio.PublicKey().import_hex(pubkey_b) 25 | 26 | config = prio.Config(n_fields, pkA, pkB, batch_id) 27 | client = prio.Client(config) 28 | 29 | data_items = bytes([(i % 3 == 1) or (i % 5 == 1) for i in range(n_fields)]) 30 | 31 | for i in range(n_clients): 32 | 33 | logger.info("Client {}: Generated shares".format(i)) 34 | for_server_a, for_server_b = client.encode(data_items) 35 | 36 | await channel.default_exchange.publish( 37 | aio_pika.Message(body=for_server_a, message_id=str(i), type="data"), 38 | routing_key="prio.0", 39 | ) 40 | await channel.default_exchange.publish( 41 | aio_pika.Message(body=for_server_b, message_id=str(i), type="data"), 42 | routing_key="prio.1", 43 | ) 44 | await connection.close() 45 | logger.info("Client done!") 46 | 47 | 48 | @click.command() 49 | @click.option("--pubkey-A", type=str) 50 | @click.option("--pubkey-B", type=str) 51 | @click.option("--n-clients", type=int, default=10) 52 | @click.option("--n-fields", type=int, required=True) 53 | @click.option("--batch-id", type=str, default="test_batch") 54 | @PrioContext() 55 | def main(pubkey_a, pubkey_b, n_clients, n_fields, batch_id): 56 | loop = asyncio.get_event_loop() 57 | loop.run_until_complete( 58 | run_client( 59 | bytes(pubkey_a, "utf-8"), 60 | bytes(pubkey_b, "utf-8"), 61 | n_clients, 62 | n_fields, 63 | bytes(batch_id, "utf-8"), 64 | ) 65 | ) 66 | loop.close() 67 | 68 | 69 | if __name__ == "__main__": 70 | main() 71 | -------------------------------------------------------------------------------- /examples/docker-asyncio/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.4' 2 | 3 | x-app: &app 4 | build: . 
5 | volumes: 6 | - .:/app/examples/docker-asyncio 7 | 8 | services: 9 | client: 10 | <<: *app 11 | depends_on: 12 | - rabbitmq 13 | - server_a 14 | - server_b 15 | command: > 16 | bash -c "cd examples/docker-asyncio && 17 | wait-for-it rabbitmq:5672 -- python3 client.py \ 18 | --pubkey-A F63F2FB9B823B7B672684A526AC467DCFC110D4BB242F6DF0D3EA9F09CE14B51 \ 19 | --pubkey-B 15DC84D87C73A36120E0389D4ABCD433EDC5147DC71A4093E2A5952968D51F07 \ 20 | --n-clients 10 \ 21 | --n-fields 133" 22 | 23 | server_a: 24 | <<: *app 25 | depends_on: 26 | - rabbitmq 27 | command: > 28 | bash -c "cd examples/docker-asyncio && 29 | wait-for-it rabbitmq:5672 -- python3 server.py \ 30 | --pubkey F63F2FB9B823B7B672684A526AC467DCFC110D4BB242F6DF0D3EA9F09CE14B51 \ 31 | --pvtkey 7A0AA608C08CB74A86409F5026865435B2F17F40B20636CEFD2656585097FBE0 \ 32 | --pubkey-other 15DC84D87C73A36120E0389D4ABCD433EDC5147DC71A4093E2A5952968D51F07 \ 33 | --server-id a \ 34 | --n-fields 133" 35 | 36 | server_b: 37 | <<: *app 38 | depends_on: 39 | - rabbitmq 40 | command: > 41 | bash -c "cd examples/docker-asyncio && 42 | wait-for-it rabbitmq:5672 -- python3 server.py \ 43 | --pubkey 15DC84D87C73A36120E0389D4ABCD433EDC5147DC71A4093E2A5952968D51F07 \ 44 | --pvtkey 50C7329DE18DE3087A0DE963D5585A4DB7A156C7A29FA854760373B053D86919 \ 45 | --pubkey-other F63F2FB9B823B7B672684A526AC467DCFC110D4BB242F6DF0D3EA9F09CE14B51 \ 46 | --server-id b \ 47 | --n-fields 133" 48 | 49 | rabbitmq: 50 | image: rabbitmq:latest 51 | ports: 52 | - 5672:5672 53 | -------------------------------------------------------------------------------- /examples/docker-asyncio/server.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import aio_pika 3 | import logging 4 | import os 5 | import sys 6 | import pickle 7 | from functools import partial 8 | 9 | import click 10 | from prio_processor.prio import wrapper as prio 11 | from prio import PrioContext 12 | 13 | logging.basicConfig() 14 | logger = logging.getLogger() 15 | logger.setLevel(logging.INFO) 16 | 17 | DEFAULT_SHARED_SEED = b"vY\xc1\t\x93\xfb\xc6\x97*\x07j\xd63i+\x86" 18 | 19 | 20 | def get_other_server(server_id): 21 | mapping = { 22 | prio.PRIO_SERVER_A: prio.PRIO_SERVER_B, 23 | prio.PRIO_SERVER_B: prio.PRIO_SERVER_A, 24 | } 25 | return mapping[server_id] 26 | 27 | 28 | async def run_server( 29 | pubkey, pvtkey, pubkey_other, server_id, n_fields, batch_id, shared_seed 30 | ): 31 | connection = await aio_pika.connect_robust("amqp://guest:guest@rabbitmq:5672/") 32 | channel = await connection.channel() 33 | queue = await channel.declare_queue(f"prio.{server_id}") 34 | 35 | pk = prio.PublicKey().import_hex(pubkey) 36 | sk = prio.PrivateKey().import_hex(pvtkey, pubkey) 37 | pk_other = prio.PublicKey().import_hex(pubkey_other) 38 | 39 | seed = prio.PRGSeed() 40 | seed.instance = shared_seed 41 | 42 | config = prio.Config(n_fields, pk, pk_other, batch_id) 43 | server = prio.Server(config, server_id, sk, seed) 44 | 45 | cache = {} 46 | 47 | async for message in queue: 48 | with message.process(): 49 | pid = message.message_id 50 | v, p1, p2 = cache.get(pid, (None, None, None)) 51 | 52 | def log(line): 53 | logger.info("Message {}: {}".format(pid, line)) 54 | 55 | ptype = message.type 56 | routing_key = "prio.{}".format(get_other_server(server_id)) 57 | 58 | if (ptype == "verify1" and not p1) or (ptype == "verify2" and not p2): 59 | log("Re-queuing message!") 60 | await channel.default_exchange.publish( 61 | aio_pika.Message( 62 | body=message.body, 63 | 
message_id=message.message_id, 64 | type=message.type, 65 | ), 66 | routing_key="prio.{}".format(server_id), 67 | ) 68 | elif ptype == "data": 69 | log("Generating verify packet 1") 70 | v = server.create_verifier(message.body) 71 | p1 = v.create_verify1() 72 | await channel.default_exchange.publish( 73 | aio_pika.Message( 74 | body=pickle.dumps(p1), 75 | message_id=message.message_id, 76 | type="verify1", 77 | ), 78 | routing_key=routing_key, 79 | ) 80 | elif ptype == "verify1": 81 | log("Generating verify packet 2") 82 | p2 = v.create_verify2(p1, pickle.loads(message.body)) 83 | await channel.default_exchange.publish( 84 | aio_pika.Message( 85 | body=pickle.dumps(p2), 86 | message_id=message.message_id, 87 | type="verify2", 88 | ), 89 | routing_key=routing_key, 90 | ) 91 | elif ptype == "verify2": 92 | if v.is_valid(p2, pickle.loads(message.body)): 93 | log("Aggregate data") 94 | server.aggregate(v) 95 | else: 96 | log("Invalid data") 97 | del cache[pid] 98 | else: 99 | log("Bad message type {}".format(ptype)) 100 | 101 | cache[pid] = (v, p1, p2) 102 | 103 | 104 | @click.command() 105 | @click.option("--pubkey", type=str) 106 | @click.option("--pvtkey", type=str) 107 | @click.option("--pubkey-other", type=str) 108 | @click.option("--server-id", type=click.Choice(["a", "b"]), required=True) 109 | @click.option("--n-fields", type=int, required=True) 110 | @click.option("--batch-id", type=str, default="test_batch") 111 | @PrioContext() 112 | def main(pubkey, pvtkey, pubkey_other, server_id, n_fields, batch_id): 113 | loop = asyncio.get_event_loop() 114 | server_id = prio.PRIO_SERVER_A if server_id == "a" else prio.PRIO_SERVER_B 115 | loop.run_until_complete( 116 | run_server( 117 | bytes(pubkey, "utf-8"), 118 | bytes(pvtkey, "utf-8"), 119 | bytes(pubkey_other, "utf-8"), 120 | server_id, 121 | n_fields, 122 | bytes(batch_id, "utf-8"), 123 | DEFAULT_SHARED_SEED, 124 | ) 125 | ) 126 | loop.run_forever() 127 | 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /examples/python-wrapper/README.md: -------------------------------------------------------------------------------- 1 | # Pythonic Wrapper Example Usage 2 | 3 | This example demonstrates usage of the python wrapper around the swig libprio functions. 4 | 5 | ## Running the example 6 | 7 | ```bash 8 | docker run -v $(pwd):/app -it prio:dev python3 main.py 9 | ``` 10 | 11 | Results in: 12 | 13 | ```bash 14 | 15 | [0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0] 16 | ``` 17 | -------------------------------------------------------------------------------- /examples/python-wrapper/main.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | from prio_processor.prio import wrapper as prio 6 | from prio import PrioContext 7 | import sys 8 | 9 | with PrioContext(): 10 | skA, pkA = prio.create_keypair() 11 | skB, pkB = prio.create_keypair() 12 | 13 | n_data = 133 14 | batch_id = b"test_batch" 15 | cfg = prio.Config(n_data, pkA, pkB, batch_id) 16 | 17 | server_secret = prio.PRGSeed() 18 | 19 | sA = prio.Server(cfg, prio.PRIO_SERVER_A, skA, server_secret) 20 | sB = prio.Server(cfg, prio.PRIO_SERVER_B, skB, server_secret) 21 | 22 | client = prio.Client(cfg) 23 | 24 | data_items = bytes([(i % 3 == 1) or (i % 5 == 1) for i in range(n_data)]) 25 | for_server_a, for_server_b = client.encode(data_items) 26 | 27 | # Setup verification 28 | vA = sA.create_verifier(for_server_a) 29 | vB = sB.create_verifier(for_server_b) 30 | 31 | # Produce a packet1 and send to the other party 32 | p1A = vA.create_verify1() 33 | p1B = vB.create_verify1() 34 | 35 | # Produce packet2 and send to the other party 36 | p2A = vA.create_verify2(p1A, p1B) 37 | p2B = vB.create_verify2(p1A, p1B) 38 | 39 | # Check validity of the request 40 | if not vA.is_valid(p2A, p2B): 41 | print("data for server A is not valid!") 42 | sys.exit(1) 43 | if not vB.is_valid(p2A, p2B): 44 | print("data for server A is not valid!") 45 | sys.exit(1) 46 | 47 | sA.aggregate(vA) 48 | sB.aggregate(vB) 49 | 50 | # Collect from many clients and share data 51 | tA = sA.total_shares() 52 | tB = sB.total_shares() 53 | 54 | output = prio.total_share_final(cfg, tA, tB) 55 | 56 | # check the output 57 | assert list(data_items) == list(output) 58 | print(f"{list(output)}") 59 | -------------------------------------------------------------------------------- /examples/swig-wrapper/README.md: -------------------------------------------------------------------------------- 1 | # SWIG-Wrapper Example Usage 2 | 3 | This example demonstrates usage of the generated wrapper around the libprio 4 | functions. The wrapper is no longer maintained within this repository and has 5 | moved to [the python wrapper of 6 | libprio](https://github.com/mozilla/libprio/tree/master/python). It is used 7 | heavily within the prio-processor, however. 8 | 9 | ## Running the example 10 | 11 | With docker, run from the current directory. 12 | 13 | ```bash 14 | docker run -v $(pwd):/app -it prio:dev python3 main.py 15 | ``` 16 | 17 | Result: 18 | 19 | ```bash 20 | 21 | [0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0] 22 | ``` 23 | -------------------------------------------------------------------------------- /examples/swig-wrapper/main.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | from prio.libprio import * 6 | from array import array 7 | 8 | Prio_init() 9 | skA, pkA = Keypair_new() 10 | skB, pkB = Keypair_new() 11 | 12 | n_data = 133 13 | batch_id = b"test_batch" 14 | cfg = PrioConfig_new(n_data, pkA, pkB, batch_id) 15 | 16 | server_secret = PrioPRGSeed_randomize() 17 | 18 | sA = PrioServer_new(cfg, PRIO_SERVER_A, skA, server_secret) 19 | sB = PrioServer_new(cfg, PRIO_SERVER_B, skB, server_secret) 20 | 21 | vA = PrioVerifier_new(sA) 22 | vB = PrioVerifier_new(sB) 23 | 24 | tA = PrioTotalShare_new() 25 | tB = PrioTotalShare_new() 26 | 27 | p1A = PrioPacketVerify1_new() 28 | p1B = PrioPacketVerify1_new() 29 | p2A = PrioPacketVerify2_new() 30 | p2B = PrioPacketVerify2_new() 31 | 32 | data_items = bytes([(i % 3 == 1) or (i % 5 == 1) for i in range(n_data)]) 33 | for_server_a, for_server_b = PrioClient_encode(cfg, data_items) 34 | 35 | # Setup verification 36 | PrioVerifier_set_data(vA, for_server_a) 37 | PrioVerifier_set_data(vB, for_server_b) 38 | 39 | # Produce a packet1 and send to the other party 40 | PrioPacketVerify1_set_data(p1A, vA) 41 | PrioPacketVerify1_set_data(p1B, vB) 42 | 43 | # Produce packet2 and send to the other party 44 | PrioPacketVerify2_set_data(p2A, vA, p1A, p1B) 45 | PrioPacketVerify2_set_data(p2B, vB, p1A, p1B) 46 | 47 | # Check validity of the request 48 | PrioVerifier_isValid(vA, p2A, p2B) 49 | PrioVerifier_isValid(vB, p2A, p2B) 50 | 51 | PrioServer_aggregate(sA, vA) 52 | PrioServer_aggregate(sB, vB) 53 | 54 | # Collect from many clients and share data 55 | PrioTotalShare_set_data(tA, sA) 56 | PrioTotalShare_set_data(tB, sB) 57 | 58 | output = PrioTotalShare_final(cfg, tA, tB) 59 | output = array("L", output) 60 | 61 | # check the output 62 | assert list(data_items) == list(output), "results do not match" 63 | print(f"{list(output)}") 64 | Prio_clear() 65 | -------------------------------------------------------------------------------- /google-cloud-sdk.repo: -------------------------------------------------------------------------------- 1 | [google-cloud-sdk] 2 | name=Google Cloud SDK 3 | baseurl=https://packages.cloud.google.com/yum/repos/cloud-sdk-el7-x86_64 4 | enabled=1 5 | gpgcheck=1 6 | repo_gpgcheck=1 7 | gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg 8 | https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg 9 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: prio-processor 2 | nav: 3 | - Home: README.md 4 | - Guide: guide.md 5 | - Airflow: airflow.md 6 | - Command Line Reference: cli-help.md 7 | - Code of Conduct: link/CODE_OF_CONDUCT.md 8 | theme: readthedocs 9 | -------------------------------------------------------------------------------- /notebooks/2020-08-25-cpu-time-by-n-data.csv: -------------------------------------------------------------------------------- 1 | server_id,sequence_id,n_data,2500,5000,7500,10000 2 | a,0,32,12.5,20.7,32.8,43.1 3 | a,0,64,17.0,38.0,54.7,74.4 4 | a,0,128,37.5,74.7,141.0,221.2 5 | a,1,32,12.5,20.8,30.5,39.5 6 | a,1,64,17.4,34.9,54.1,74.0 7 | a,1,128,38.9,76.3,142.0,224.5 8 | a,2,32,19.0,24.9,32.4,37.0 9 | a,2,64,23.9,36.3,47.4,132.7 10 | a,2,128,36.0,131.9,215.5,248.8 11 | a,3,32,3.6,3.4,3.6,3.6 12 | a,3,64,3.7,3.6,3.6,3.7 13 | a,3,128,3.6,3.7,3.6,3.9 14 | b,0,32,8.6,12.6,18.9,22.2 15 | b,0,64,13.5,23.7,32.6,41.3 16 | b,0,128,25.0,44.1,67.8,91.8 17 | b,1,32,8.4,12.6,17.5,22.3 18 | b,1,64,13.2,22.5,32.2,41.8 19 | 
b,1,128,25.3,43.8,67.0,91.4 20 | b,2,32,17.1,21.4,26.3,31.7 21 | b,2,64,21.8,31.6,39.7,48.6 22 | b,2,128,33.8,52.8,76.0,100.2 23 | b,3,32,3.5,3.5,3.5,3.5 24 | b,3,64,3.6,3.8,3.5,3.6 25 | b,3,128,3.7,3.8,3.4,3.5 26 | -------------------------------------------------------------------------------- /notebooks/2020-08-25-cpu-time-by-n-rows.csv: -------------------------------------------------------------------------------- 1 | server_id,sequence_id,n_rows,32,64,128 2 | a,0,2500,12.5,17.0,37.5 3 | a,0,5000,20.7,38.0,74.7 4 | a,0,7500,32.8,54.7,141.0 5 | a,0,10000,43.1,74.4,221.2 6 | a,1,2500,12.5,17.4,38.9 7 | a,1,5000,20.8,34.9,76.3 8 | a,1,7500,30.5,54.1,142.0 9 | a,1,10000,39.5,74.0,224.5 10 | a,2,2500,19.0,23.9,36.0 11 | a,2,5000,24.9,36.3,131.9 12 | a,2,7500,32.4,47.4,215.5 13 | a,2,10000,37.0,132.7,248.8 14 | a,3,2500,3.6,3.7,3.6 15 | a,3,5000,3.4,3.6,3.7 16 | a,3,7500,3.6,3.6,3.6 17 | a,3,10000,3.6,3.7,3.9 18 | b,0,2500,8.6,13.5,25.0 19 | b,0,5000,12.6,23.7,44.1 20 | b,0,7500,18.9,32.6,67.8 21 | b,0,10000,22.2,41.3,91.8 22 | b,1,2500,8.4,13.2,25.3 23 | b,1,5000,12.6,22.5,43.8 24 | b,1,7500,17.5,32.2,67.0 25 | b,1,10000,22.3,41.8,91.4 26 | b,2,2500,17.1,21.8,33.8 27 | b,2,5000,21.4,31.6,52.8 28 | b,2,7500,26.3,39.7,76.0 29 | b,2,10000,31.7,48.6,100.2 30 | b,3,2500,3.5,3.6,3.7 31 | b,3,5000,3.5,3.8,3.8 32 | b,3,7500,3.5,3.5,3.4 33 | b,3,10000,3.5,3.6,3.5 34 | -------------------------------------------------------------------------------- /prio_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/prio-processor/5bbb33ecd7a45480ffd5a47c677bfa60660166fe/prio_processor/__init__.py -------------------------------------------------------------------------------- /prio_processor/origin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/prio-processor/5bbb33ecd7a45480ffd5a47c677bfa60660166fe/prio_processor/origin/__init__.py -------------------------------------------------------------------------------- /prio_processor/origin/commands.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import click 3 | from . 
import staging, origins, indexing 4 | 5 | logging.basicConfig(level=logging.INFO) 6 | 7 | 8 | @click.group() 9 | def entry_point(): 10 | pass 11 | 12 | 13 | entry_point.add_command(staging.run, "staging") 14 | entry_point.add_command(origins.run, "fetch-origins") 15 | entry_point.add_command(indexing.run, "index") 16 | 17 | if __name__ == "__main__": 18 | entry_point() 19 | -------------------------------------------------------------------------------- /prio_processor/origin/indexing.py: -------------------------------------------------------------------------------- 1 | """Map Prio-aggregated data to their corresponding origins.""" 2 | import json 3 | 4 | import click 5 | from jsonschema import validate 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql.functions import explode, udf 8 | from pyspark.sql.types import ( 9 | ArrayType, 10 | IntegerType, 11 | StringType, 12 | StructField, 13 | StructType, 14 | ) 15 | 16 | 17 | def validate_origins(origins): 18 | schema = { 19 | "type": "array", 20 | "items": { 21 | "type": "object", 22 | "properties": { 23 | "name": {"type": "string"}, 24 | "hash": {"type": "string"}, 25 | "index": {"type": "integer", "minimum": 0}, 26 | }, 27 | }, 28 | } 29 | validate(instance=origins, schema=schema) 30 | 31 | 32 | def extract(spark, input): 33 | return spark.read.json(input) 34 | 35 | 36 | def transform(aggregates, config, origins): 37 | @udf( 38 | ArrayType( 39 | StructType( 40 | [ 41 | StructField("batch_id", StringType(), False), 42 | StructField("origin", StringType(), False), 43 | StructField("hash", StringType(), False), 44 | StructField("index", IntegerType(), False), 45 | StructField("aggregate", IntegerType(), False), 46 | ] 47 | ) 48 | ) 49 | ) 50 | def _apply_structure(batch_id, payload): 51 | """Create a user-defined function that maps partitioned batch-ids into 52 | list of structures containing the aggregate value and its metadata.""" 53 | 54 | # assumption: hyphens are used to define a partition of origins 55 | if batch_id not in [d["batch_id"] for d in config]: 56 | return [] 57 | 58 | # currently all batch-ids contain a single hyphen with 2 parts 59 | split = batch_id.split("-") 60 | assert len(split) == 2, "currently only supports batch-ids in 2 parts" 61 | batch_id = split[0] 62 | part_num = int(split[1]) 63 | 64 | # the offset is relative to the origins list 65 | if part_num == 0: 66 | offset = 0 67 | elif part_num == 1: 68 | # pick up where the last part left off 69 | d = [d for d in config if d["batch_id"] == f"{batch_id}-0"][0] 70 | offset = d["n_data"] 71 | else: 72 | # Hard-fail, this code path should not occur if the config file is 73 | # being properly maintained. 
74 | raise NotImplementedError("batch-id is split into more than 2 parts") 75 | 76 | result = [] 77 | for origin, aggregate in zip(origins[offset:], payload): 78 | row = (batch_id, origin["name"], origin["hash"], origin["index"], aggregate) 79 | result.append(row) 80 | return result 81 | 82 | return aggregates.withColumn( 83 | "indexed", explode(_apply_structure("batch_id", "payload")) 84 | ).select("id", "timestamp", "indexed.*") 85 | 86 | 87 | def load(df, output): 88 | df.repartition(1).write.mode("overwrite").json(output) 89 | 90 | 91 | @click.command() 92 | @click.option( 93 | "--input", type=str, required=True, help="location of the prio aggregated-data" 94 | ) 95 | @click.option( 96 | "--output", type=str, required=True, help="location of the resulting indexed data" 97 | ) 98 | @click.option( 99 | "--config", 100 | type=str, 101 | required=True, 102 | help="location of the whitelist of batch-ids and their sizes", 103 | ) 104 | @click.option( 105 | "--origins", type=str, required=True, help="JSON document with origins data" 106 | ) 107 | def run(input, output, config, origins): 108 | """Take the resulting Prio aggregates and map the indices to their original origins.""" 109 | spark = SparkSession.builder.getOrCreate() 110 | extracted = extract(spark, input) 111 | 112 | with open(config) as f: 113 | config_data = json.load(f) 114 | with open(origins) as f: 115 | origin_data = json.load(f) 116 | 117 | validate_origins(origin_data) 118 | 119 | transformed = transform(extracted, config_data, origin_data) 120 | load(transformed, output) 121 | transformed.show(truncate=False) 122 | 123 | 124 | if __name__ == "__main__": 125 | run() 126 | -------------------------------------------------------------------------------- /prio_processor/origin/origins.py: -------------------------------------------------------------------------------- 1 | import json 2 | import urllib.request 3 | from collections import namedtuple 4 | 5 | import click 6 | 7 | TELEMETRY_ORIGIN_DATA = "https://hg.mozilla.org/mozilla-central/raw-file/tip/toolkit/components/telemetry/core/TelemetryOriginData.inc" 8 | ORIGIN = namedtuple("Origin", ["name", "hash"]) 9 | 10 | 11 | def ignore(line): 12 | return not (line.startswith(b"//") or not line.strip()) 13 | 14 | 15 | def transform(index, origin): 16 | return {"name": origin.name, "hash": origin.hash, "index": index} 17 | 18 | 19 | @click.command() 20 | @click.option("--url", type=str, default=TELEMETRY_ORIGIN_DATA) 21 | @click.option("--output", type=click.File("w"), default="-") 22 | def run(url, output): 23 | """Fetch data about origins being collected by Firefox telemetry via Prio.""" 24 | resp = urllib.request.urlopen(url) 25 | parsed = map(eval, filter(ignore, resp.readlines())) 26 | data = [transform(idx, origin) for idx, origin in enumerate(parsed)] 27 | 28 | # in-band metadata about origin telemetry 29 | # https://searchfox.org/mozilla-central/rev/325c1a707819602feff736f129cb36055ba6d94f/toolkit/components/telemetry/core/TelemetryOrigin.cpp#145-149 30 | data.append( 31 | {"name": "__UNKNOWN__", "hash": "__UNKNOWN__", "index": data[-1]["index"] + 1} 32 | ) 33 | output.write(json.dumps(data, indent=2)) 34 | 35 | 36 | if __name__ == "__main__": 37 | run() 38 | -------------------------------------------------------------------------------- /prio_processor/prio/__init__.py: -------------------------------------------------------------------------------- 1 | import click 2 | from .commands import ( 3 | shared_seed, 4 | keygen, 5 | encode_shares, 6 | verify1, 7 | verify2, 8 | 
aggregate, 9 | publish, 10 | ) 11 | 12 | 13 | @click.group() 14 | def main(args=None): 15 | """Command line utility for prio.""" 16 | pass 17 | 18 | 19 | main.add_command(shared_seed) 20 | main.add_command(keygen) 21 | 22 | main.add_command(encode_shares) 23 | main.add_command(verify1) 24 | main.add_command(verify2) 25 | main.add_command(aggregate) 26 | main.add_command(publish) 27 | -------------------------------------------------------------------------------- /prio_processor/prio/options.py: -------------------------------------------------------------------------------- 1 | import click 2 | from .types import BYTE_STRING 3 | 4 | 5 | def apply_options(func, options): 6 | for option in options: 7 | func = option(func) 8 | return func 9 | 10 | 11 | def public_key(func): 12 | options = [ 13 | click.option( 14 | "--public-key-hex-internal", 15 | envvar="PUBLIC_KEY_HEX_INTERNAL", 16 | required=True, 17 | type=BYTE_STRING, 18 | help="The public key of the processing server as a hex string.", 19 | ), 20 | click.option( 21 | "--public-key-hex-external", 22 | envvar="PUBLIC_KEY_HEX_EXTERNAL", 23 | required=True, 24 | type=BYTE_STRING, 25 | help="The public key of the co-processing server as a hex string.", 26 | ), 27 | ] 28 | return apply_options(func, options) 29 | 30 | 31 | def server_config(func): 32 | options = [ 33 | click.option( 34 | "--server-id", 35 | envvar="SERVER_ID", 36 | required=True, 37 | type=click.Choice(["A", "B"]), 38 | help="The identifier for match.", 39 | ), 40 | click.option( 41 | "--private-key-hex", 42 | envvar="PRIVATE_KEY_HEX", 43 | required=True, 44 | type=BYTE_STRING, 45 | help="The private key of the processing server as a hex string.", 46 | ), 47 | click.option( 48 | "--shared-secret", 49 | envvar="SHARED_SECRET", 50 | required=True, 51 | type=BYTE_STRING, 52 | help="The shared server secret encoded in base64.", 53 | ), 54 | ] 55 | return apply_options(func, options) 56 | 57 | 58 | def output_1(func): 59 | options = [ 60 | click.option( 61 | "--output", 62 | envvar="OUTPUT", 63 | required=True, 64 | type=click.Path(file_okay=False), 65 | help="The path to the output directory.", 66 | ) 67 | ] 68 | return apply_options(func, options) 69 | 70 | 71 | def output_2(func): 72 | options = [ 73 | click.option( 74 | "--output-A", 75 | envvar="OUTPUT_A", 76 | required=True, 77 | type=click.Path(file_okay=False), 78 | help="The path to the input directory of server A.", 79 | ), 80 | click.option( 81 | "--output-B", 82 | envvar="OUTPUT_B", 83 | required=True, 84 | type=click.Path(file_okay=False), 85 | help="The path to the input directory of server B.", 86 | ), 87 | ] 88 | return apply_options(func, options) 89 | 90 | 91 | def input_1(func): 92 | options = [ 93 | click.option( 94 | "--input", 95 | envvar="INPUT", 96 | required=True, 97 | help="File containing shares from clients.", 98 | ) 99 | ] 100 | return apply_options(func, options) 101 | 102 | 103 | def input_2(func): 104 | options = [ 105 | click.option( 106 | "--input-internal", 107 | envvar="INPUT_INTERNAL", 108 | required=True, 109 | help="File containing data generated by the processing server.", 110 | ), 111 | click.option( 112 | "--input-external", 113 | envvar="INPUT_EXTERNAL", 114 | required=True, 115 | help="File containing data generated by the co-processing server.", 116 | ), 117 | ] 118 | return apply_options(func, options) 119 | 120 | 121 | def data_config(func): 122 | options = [ 123 | click.option( 124 | "--batch-id", 125 | envvar="BATCH_ID", 126 | required=True, 127 | type=BYTE_STRING, 128 | help="A 
shared batch identifier used as a validity check.", 129 | ), 130 | click.option( 131 | "--n-data", 132 | envvar="N_DATA", 133 | required=True, 134 | type=click.INT, 135 | help="The size of the input bit-vector.", 136 | ), 137 | ] 138 | return apply_options(func, options) 139 | -------------------------------------------------------------------------------- /prio_processor/prio/types.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | 4 | class ByteStringType(click.ParamType): 5 | name = "byte-string" 6 | 7 | def convert(self, value, param, ctx): 8 | try: 9 | return bytes(value, "utf-8") 10 | except: 11 | self.fail("{} cannot be encoded into a bytestring".format(value)) 12 | 13 | 14 | BYTE_STRING = ByteStringType() 15 | -------------------------------------------------------------------------------- /prio_processor/spark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/prio-processor/5bbb33ecd7a45480ffd5a47c677bfa60660166fe/prio_processor/spark/__init__.py -------------------------------------------------------------------------------- /requirements-dev.in: -------------------------------------------------------------------------------- 1 | -c requirements.txt 2 | pytest 3 | mkdocs 4 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile requirements-dev.in 6 | # 7 | attrs==19.3.0 8 | # via 9 | # -c requirements.txt 10 | # pytest 11 | click==7.1.2 12 | # via 13 | # -c requirements.txt 14 | # mkdocs 15 | # nltk 16 | future==0.18.2 17 | # via lunr 18 | iniconfig==1.0.1 19 | # via pytest 20 | jinja2==2.11.3 21 | # via mkdocs 22 | joblib==0.17.0 23 | # via nltk 24 | livereload==2.6.3 25 | # via mkdocs 26 | lunr[languages]==0.5.8 27 | # via mkdocs 28 | markdown==3.3.2 29 | # via mkdocs 30 | markupsafe==1.1.1 31 | # via jinja2 32 | mkdocs==1.1.2 33 | # via -r requirements-dev.in 34 | more-itertools==8.4.0 35 | # via pytest 36 | nltk==3.5 37 | # via lunr 38 | packaging==20.4 39 | # via pytest 40 | pluggy==0.13.1 41 | # via pytest 42 | py==1.10.0 43 | # via pytest 44 | pyparsing==2.4.7 45 | # via packaging 46 | pytest==6.0.1 47 | # via -r requirements-dev.in 48 | pyyaml==5.4 49 | # via mkdocs 50 | regex==2020.10.23 51 | # via nltk 52 | six==1.15.0 53 | # via 54 | # -c requirements.txt 55 | # livereload 56 | # lunr 57 | # packaging 58 | toml==0.10.1 59 | # via pytest 60 | tornado==6.0.4 61 | # via 62 | # livereload 63 | # mkdocs 64 | tqdm==4.50.2 65 | # via nltk 66 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile 6 | # 7 | attrs==19.3.0 8 | # via jsonschema 9 | click==7.1.2 10 | # via prio_processor (setup.py) 11 | jsonschema==3.2.0 12 | # via prio_processor (setup.py) 13 | numpy==1.19.1 14 | # via 15 | # pandas 16 | # pyarrow 17 | pandas==1.1.0 18 | # via 19 | # prio_processor (setup.py) 20 | # pyspark 21 | prio==1.1 22 | # via prio_processor (setup.py) 23 | py4j==0.10.9 24 | # via pyspark 25 | pyarrow==1.0.0 26 | # via pyspark 27 | pyrsistent==0.16.0 28 | # via jsonschema 29 | pyspark[sql]==3.1.1 
30 | # via prio_processor (setup.py) 31 | python-dateutil==2.8.1 32 | # via pandas 33 | pytz==2020.1 34 | # via pandas 35 | six==1.15.0 36 | # via 37 | # jsonschema 38 | # pyrsistent 39 | # python-dateutil 40 | 41 | # The following packages are considered to be unsafe in a requirements file: 42 | # setuptools 43 | -------------------------------------------------------------------------------- /scripts/copy-spark-config: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install the Spark configuration from this repository into the Spark home 4 | # directory of the active pyspark installation. It's strongly encouraged that 5 | # Spark is installed via pip in a virtual environment if this script is used 6 | # on a local machine. 7 | set -e 8 | 9 | # Find the directory of spark from the active python packages 10 | SPARK_HOME=$(python -c "import pyspark; print(pyspark.__path__[0])") 11 | cp -r config/spark "${SPARK_HOME}/conf" 12 | -------------------------------------------------------------------------------- /scripts/create-folder: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | 5 | dirname = os.path.dirname(os.path.dirname(__file__)) 6 | workdir = os.path.join(dirname, "working") 7 | 8 | paths = [ 9 | "raw/", 10 | "intermediate/internal/verify1", 11 | "intermediate/external/verify1", 12 | "intermediate/internal/verify2", 13 | "intermediate/external/verify2", 14 | "intermediate/internal/aggregate", 15 | "intermediate/external/aggregate", 16 | "processed/", 17 | ] 18 | 19 | for server in ["server_a", "server_b"]: 20 | for path in paths: 21 | p = os.path.join(workdir, server, path) 22 | os.makedirs(p, exist_ok=True) 23 | 24 | os.makedirs(os.path.join(workdir, "client"), exist_ok=True) 25 | -------------------------------------------------------------------------------- /scripts/download-mapping: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import urllib.request 4 | from collections import namedtuple 5 | import json 6 | 7 | ORIGIN = namedtuple("Origin", ["name", "hash"]) 8 | 9 | url = "https://hg.mozilla.org/mozilla-central/raw-file/tip/toolkit/components/telemetry/core/TelemetryOriginData.inc" 10 | resp = urllib.request.urlopen(url) 11 | 12 | 13 | def ignore(line): 14 | return not (line.startswith(b"//") or not line.strip()) 15 | 16 | 17 | data = map(eval, filter(ignore, resp.readlines())) 18 | 19 | origins = [datum.name for datum in data] 20 | 21 | print(json.dumps(origins)) 22 | -------------------------------------------------------------------------------- /scripts/print-cli-help: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Generate a markdown page to serve as markdown documentation and a diffing 4 | # mechanism between revisions of the CLI. 5 | # 6 | # Usage: 7 | # pip install . 
8 | # ./scripts/print-cli-help > docs/cli-help.md 9 | # 10 | 11 | set -euo pipefail 12 | 13 | function md_fence() { 14 | echo '```bash' 15 | echo "${1}" 16 | echo '```' 17 | } 18 | 19 | function command_help() { 20 | local cmd=$1 21 | echo "## ${cmd} help" 22 | echo "" 23 | md_fence "$($cmd --help)" 24 | 25 | commands=$($cmd --help | sed "1,/Commands:/d" | grep "^ \w" | awk '{print $1}') 26 | for command in ${commands}; do 27 | echo "" 28 | echo "### \`$cmd ${command}\`" 29 | echo "" 30 | md_fence "$($cmd "${command}" --help)" 31 | done 32 | } 33 | 34 | cat <= 1.1", 24 | "pandas", 25 | ], 26 | packages=find_packages(), 27 | ) 28 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/prio-processor/5bbb33ecd7a45480ffd5a47c677bfa60660166fe/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pyspark.sql import SparkSession 3 | 4 | 5 | @pytest.fixture(scope="session") 6 | def spark(): 7 | spark = SparkSession.builder.getOrCreate() 8 | spark.conf.set("spark.sql.session.timeZone", "UTC") 9 | yield spark 10 | spark.stop() 11 | -------------------------------------------------------------------------------- /tests/resources/cli/client/data.ndjson: -------------------------------------------------------------------------------- 1 | {"payload":[1,0,0,0,0]} 2 | {"payload":[1,1,0,0,0]} 3 | {"payload":[1,1,1,0,0]} 4 | {"payload":[1,1,1,1,0]} 5 | {"payload":[1,1,1,1,1]} 6 | -------------------------------------------------------------------------------- /tests/resources/cli/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_data": 5, 3 | "batch_id": "test" 4 | } 5 | -------------------------------------------------------------------------------- /tests/resources/cli/server_a/intermediate/external/aggregate/data.ndjson: -------------------------------------------------------------------------------- 1 | {"payload": "AZWrGOtHqPpMYsilUNWrUHHRIPRqMf3C69irUb2TwH9ZuJyv9emrLW156KXDwXGECByrCzCBU11jCIZxlYo=", "error": 0, "total": 5} -------------------------------------------------------------------------------- /tests/resources/cli/server_a/intermediate/external/verify1/data.ndjson: -------------------------------------------------------------------------------- 1 | {"id": "d991b404-339a-4782-8a00-5e663e64de34", "payload": "q3Gq1Z9fhPrky6G/q1lcoep+YuTc9XY4"} 2 | {"id": "0b6dcd73-6187-464a-a391-a700b2d35c46", "payload": "qysxFvmP/OQPfiLeqwNoitb2VhumnUPV"} 3 | {"id": "8b5e8453-3bd8-4779-b450-8281486d3729", "payload": "q3WPmeTtM3tzhRmhq32/mXkuXHzX0vgW"} 4 | {"id": "02c4a0f4-1920-495c-b2a4-5ac5f51555eb", "payload": "q38w7qT0uJCmUQspqzDzu2hC8Ui2QQjA"} 5 | {"id": "02e8ca60-39e2-4662-a84d-21621be82709", "payload": "q1e5dk6X0dwV/RNFqxGDScXW3vfmOXt2"} 6 | -------------------------------------------------------------------------------- /tests/resources/cli/server_a/intermediate/external/verify2/data.ndjson: -------------------------------------------------------------------------------- 1 | {"id": "d991b404-339a-4782-8a00-5e663e64de34", "payload": "qxcUiHwSIONlRevL"} 2 | {"id": "0b6dcd73-6187-464a-a391-a700b2d35c46", "payload": "q2vZI9UqN+ZVCb+4"} 3 | {"id": "8b5e8453-3bd8-4779-b450-8281486d3729", "payload": 
"qx3PyQDbzeCLb7i1"} 4 | {"id": "02c4a0f4-1920-495c-b2a4-5ac5f51555eb", "payload": "qxKUbFgwKJeRQiNE"} 5 | {"id": "02e8ca60-39e2-4662-a84d-21621be82709", "payload": "q1oqtTCt1wKDQ/cv"} 6 | -------------------------------------------------------------------------------- /tests/resources/cli/server_a/intermediate/internal/aggregate/data.ndjson: -------------------------------------------------------------------------------- 1 | {"payload": "AJWrZxS4VwWznTdirzGrL44u3wuVzgJFFC2rLkJsP4CmR2NYChurUpKGF1o8Po6D9+erdM9+rKKc93mWang=", "error": 0, "total": 5} -------------------------------------------------------------------------------- /tests/resources/cli/server_a/intermediate/internal/verify1/data.ndjson: -------------------------------------------------------------------------------- 1 | {"id": "d991b404-339a-4782-8a00-5e663e64de34", "payload": "q2efCH9Lx6BGCgnkqx++ThBNUdJnogXh"} 2 | {"id": "0b6dcd73-6187-464a-a391-a700b2d35c46", "payload": "qx9bgZ2V4sRcXOy8q2+0vgQxd3NJaozj"} 3 | {"id": "8b5e8453-3bd8-4779-b450-8281486d3729", "payload": "qzC4bEwyAJyYnNKQq13E6aZ46ptcR2vk"} 4 | {"id": "02c4a0f4-1920-495c-b2a4-5ac5f51555eb", "payload": "qyyVOWz/SLccQL3HqzkxRXViiwRRTTtO"} 5 | {"id": "02e8ca60-39e2-4662-a84d-21621be82709", "payload": "qwxXx19ZZDLNaSN2qzTBlIEvB65xChBP"} 6 | -------------------------------------------------------------------------------- /tests/resources/cli/server_a/intermediate/internal/verify2/data.ndjson: -------------------------------------------------------------------------------- 1 | {"id": "d991b404-339a-4782-8a00-5e663e64de34", "payload": "q2jrd4Pt3xyawhQ2"} 2 | {"id": "0b6dcd73-6187-464a-a391-a700b2d35c46", "payload": "qxQm3CrVyBmq/kBJ"} 3 | {"id": "8b5e8453-3bd8-4779-b450-8281486d3729", "payload": "q2IwNv8kMh90mEdM"} 4 | {"id": "02c4a0f4-1920-495c-b2a4-5ac5f51555eb", "payload": "q21rk6fP12huxdy9"} 5 | {"id": "02e8ca60-39e2-4662-a84d-21621be82709", "payload": "qyXVSs9SKP18xAjS"} 6 | -------------------------------------------------------------------------------- /tests/resources/cli/server_a/processed/data.ndjson: -------------------------------------------------------------------------------- 1 | {"id": "0f12dff4-d2a4-4d3c-b499-ca6da4c78cbe", "timestamp": "2020-08-03T20:06:46.920410", "payload": [5, 4, 3, 2, 1]} -------------------------------------------------------------------------------- /tests/resources/cli/server_a/raw/data.ndjson: -------------------------------------------------------------------------------- 1 | {"id": "d991b404-339a-4782-8a00-5e663e64de34", "payload": "2un4rKGCFhw2/CAntbW+JGYicF1Pm2v/qWTSQbOTCV52DaR7u1bQ15my/UE0rUrMaTPu27wpYIErxcxPPIhw1gwcYQUtA9nAZzH7ToHW1HOjVyDc0f6SrGQ8A59hIn2GI8uZLz2QA1hnHVsvNi8SLFN5IvqL7tj2F0El/EywteAkdHeTGuTO3SA9XvGNtI/40ZQoG/T80eyNYeg2Wj7vQQ1Sky1nSCOaEZwG/Hw+gmOp/CdTJlw5gVpwvUbEe+ymeV9f8qv7AsMGe6o3meL65y14FGWBERtxBiNMmGGZpGg/p/802JBtx9wDuIgjH1MK29Qa/mLHyA/VQDpnkLji2UdnCQRwUO71YsxskwTvaH9DdIl3XJvgmMKgYZG8YYbFsLD6fEZdrFw="} 2 | {"id": "0b6dcd73-6187-464a-a391-a700b2d35c46", "payload": "ORQ7tFgZ4+Ch+9gSbbwKYSl+t+z/mC2VGs+I+wiWm0T7fpOrKzLXzsq7HT48QI8fpw4cb9PVWKsNbmnmz1rJuo3PCCF8vx1e2hfuxPiYJCNq1fOdksEA9hAhMvSmpKbcfhSwanVIo2sGHOZzEHR7Fv4lLdchuISz9mc7keMta4V0R9s8JnLzJ5VI8DPhwaQb8LL4eGoGep+vWsev1U2SjAIQB+T/ZXOAxkhBopbalfGUq9Fxo/mirf6xMu90i9n5yi4I4wOjC0XeA7dm6TR8QWthCzuD9z/pnuDs6K6zKZqTp5DVcI0DHoBoP/jPaEir2KM8TtRRt3LObQusYosw/wW8ca/YWdqMquuiqNSLdKA8iz+GwPN7gAhcpuGve/6W7MYyu5J5uaY="} 3 | {"id": "8b5e8453-3bd8-4779-b450-8281486d3729", "payload": 
"ImLU5IMq/HClYUPcl/nHvS3Gr/LJZQGfFx/geq6UpRBB+RhEsXF7+C4dngojc2mjUP8+bguV41oG6fNrdmyxyI1EIODfiThbvTKLVej8K07X9YaXFOLerzJudQTUqRoCRhEuPuFpYQvyN0+na0vQQxsAWoc+K+qRndAo6DvLuO0AdcwmL3Y/+jeOwlVjxVUeLhVP3zd4IXYDxXb8ubf6eiH+P51JZomp+C9xD31nEpY6YXlyhLfZubnDQcUbKeSVgi2pqAehHireJN7rZFSepcpoyk8Zy3428IArF3mABDoaLVnxZ2Jj8sIYympDvSTiTSMnalWGve4/CBGbbCCI3MIj9JtgHfyZnAoR1hv1YqfrSxCVpAoHotw7kjaz+4ZXB7X/0cDDafs="} 4 | {"id": "02c4a0f4-1920-495c-b2a4-5ac5f51555eb", "payload": "p0m16Er/atF83mexCMQLrRp4kDsrVKo3NXdKCiBV53VFlUacFfU7Jk+SxBmAlk695vp6BWOrsfvAJ7pcNey+tQkXlADiDG3x8MrV/tDfrsv+ptaH3p7yW2pc4BVrOArih6OJkAyl4ssIyws2zEooeexNaRWGpditaWUsxOdQ2vShz4U2q4URYaq5iOIWqZrPatw48HeKxQ5xuA3oGUvsM9j1JiW7GlU2fGk9khNR3B89SCxYWs5nacq3MNJYZXCQz/XBn9zFlYSjTSuiy2bQRfC7UNwOTSaUZx+h/sQf86lOq+c0OVSygqFEIVv2nsAVNH8H9TSgnb1Z83C9YviBvzcYju9AFX+433nE6Bfa0QT09PVK+IrG+N2yR0+OnN4fP43+kjxsdJ8="} 5 | {"id": "02e8ca60-39e2-4662-a84d-21621be82709", "payload": "DUwgrSoMyJRlUs/2oKih7114jMgsbUqTzVjAlPo1XxQTuFs77y4wTNm8RAc+yt3oNjeZbbKeDwL8ui9tf5SdHdC/WUuT29LptSj5d9Xd/G7BjtbYZvwKl1GvhmV1E5s50pk7M0r7CPIGUq7X8iwUkBFU5vkrcCYpwBE+v1HEZO4pUGd0lxWjdDDjiLP+jUqpWpyYkaaVv8+wFRrUyIIMoNW80IPLeI5/bkxGnfJUR7T6Dq/PnyqJY/eYHiyBEZPrdBFFA4e5sckGSIaIxqDXfbu2HoU6OMhG6XL/iYTfWd8HO7Wzj7iCJ2Jqq96JZvaye3ecI8EM9Te2G81hYxpZXisX4K8e22DL/u5D8LIusi4KmS2tJE4f8BqaAPNYcEe/PGzmOjNXYq8="} 6 | -------------------------------------------------------------------------------- /tests/resources/cli/server_a_keys.json: -------------------------------------------------------------------------------- 1 | { 2 | "private_key": "BD6BB9BEB089DBA6B6A75B4455A615D577699F973FAD9E327A33D9528B5C7F64", 3 | "public_key": "E74E9CDD78258D9EEFAFA0CA2C08733F95AB7C4297DEEA3C3A63AC2053C45127" 4 | } 5 | -------------------------------------------------------------------------------- /tests/resources/cli/server_b/intermediate/external/aggregate/data.ndjson: -------------------------------------------------------------------------------- 1 | {"payload": "AJWrZxS4VwWznTdirzGrL44u3wuVzgJFFC2rLkJsP4CmR2NYChurUpKGF1o8Po6D9+erdM9+rKKc93mWang=", "error": 0, "total": 5} -------------------------------------------------------------------------------- /tests/resources/cli/server_b/intermediate/external/verify1/data.ndjson: -------------------------------------------------------------------------------- 1 | {"id": "d991b404-339a-4782-8a00-5e663e64de34", "payload": "q2efCH9Lx6BGCgnkqx++ThBNUdJnogXh"} 2 | {"id": "0b6dcd73-6187-464a-a391-a700b2d35c46", "payload": "qx9bgZ2V4sRcXOy8q2+0vgQxd3NJaozj"} 3 | {"id": "8b5e8453-3bd8-4779-b450-8281486d3729", "payload": "qzC4bEwyAJyYnNKQq13E6aZ46ptcR2vk"} 4 | {"id": "02c4a0f4-1920-495c-b2a4-5ac5f51555eb", "payload": "qyyVOWz/SLccQL3HqzkxRXViiwRRTTtO"} 5 | {"id": "02e8ca60-39e2-4662-a84d-21621be82709", "payload": "qwxXx19ZZDLNaSN2qzTBlIEvB65xChBP"} 6 | -------------------------------------------------------------------------------- /tests/resources/cli/server_b/intermediate/external/verify2/data.ndjson: -------------------------------------------------------------------------------- 1 | {"id": "d991b404-339a-4782-8a00-5e663e64de34", "payload": "q2jrd4Pt3xyawhQ2"} 2 | {"id": "0b6dcd73-6187-464a-a391-a700b2d35c46", "payload": "qxQm3CrVyBmq/kBJ"} 3 | {"id": "8b5e8453-3bd8-4779-b450-8281486d3729", "payload": "q2IwNv8kMh90mEdM"} 4 | {"id": "02c4a0f4-1920-495c-b2a4-5ac5f51555eb", "payload": "q21rk6fP12huxdy9"} 5 | {"id": "02e8ca60-39e2-4662-a84d-21621be82709", "payload": "qyXVSs9SKP18xAjS"} 6 | -------------------------------------------------------------------------------- 
/tests/resources/cli/server_b/intermediate/internal/aggregate/data.ndjson: -------------------------------------------------------------------------------- 1 | {"payload": "AZWrGOtHqPpMYsilUNWrUHHRIPRqMf3C69irUb2TwH9ZuJyv9emrLW156KXDwXGECByrCzCBU11jCIZxlYo=", "error": 0, "total": 5} -------------------------------------------------------------------------------- /tests/resources/cli/server_b/intermediate/internal/verify1/data.ndjson: -------------------------------------------------------------------------------- 1 | {"id": "d991b404-339a-4782-8a00-5e663e64de34", "payload": "q3Gq1Z9fhPrky6G/q1lcoep+YuTc9XY4"} 2 | {"id": "0b6dcd73-6187-464a-a391-a700b2d35c46", "payload": "qysxFvmP/OQPfiLeqwNoitb2VhumnUPV"} 3 | {"id": "8b5e8453-3bd8-4779-b450-8281486d3729", "payload": "q3WPmeTtM3tzhRmhq32/mXkuXHzX0vgW"} 4 | {"id": "02c4a0f4-1920-495c-b2a4-5ac5f51555eb", "payload": "q38w7qT0uJCmUQspqzDzu2hC8Ui2QQjA"} 5 | {"id": "02e8ca60-39e2-4662-a84d-21621be82709", "payload": "q1e5dk6X0dwV/RNFqxGDScXW3vfmOXt2"} 6 | -------------------------------------------------------------------------------- /tests/resources/cli/server_b/intermediate/internal/verify2/data.ndjson: -------------------------------------------------------------------------------- 1 | {"id": "d991b404-339a-4782-8a00-5e663e64de34", "payload": "qxcUiHwSIONlRevL"} 2 | {"id": "0b6dcd73-6187-464a-a391-a700b2d35c46", "payload": "q2vZI9UqN+ZVCb+4"} 3 | {"id": "8b5e8453-3bd8-4779-b450-8281486d3729", "payload": "qx3PyQDbzeCLb7i1"} 4 | {"id": "02c4a0f4-1920-495c-b2a4-5ac5f51555eb", "payload": "qxKUbFgwKJeRQiNE"} 5 | {"id": "02e8ca60-39e2-4662-a84d-21621be82709", "payload": "q1oqtTCt1wKDQ/cv"} 6 | -------------------------------------------------------------------------------- /tests/resources/cli/server_b/processed/data.ndjson: -------------------------------------------------------------------------------- 1 | {"id": "6fe8d132-e86b-4fb3-8bfb-00c7019137bb", "timestamp": "2020-08-03T20:06:47.196802", "payload": [5, 4, 3, 2, 1]} -------------------------------------------------------------------------------- /tests/resources/cli/server_b/raw/data.ndjson: -------------------------------------------------------------------------------- 1 | {"id": "d991b404-339a-4782-8a00-5e663e64de34", "payload": "dE35idVP3iHF/9comKagS1F9ePc1ReuHxsz12Ib74GwYNm5fA4Bj0ENmeCXzKVbhC4WVdxYp8wy+e32y1JjmY8UwYQaKZq7RPYdDupoMio0JZJ3gEsgefzqKHQXFy5tn7aqrlGp2MHo4VVDbVzzTa4f0osLvIXxsvmW2SjwY925eoIvX6H1zrs/EnMJXKJOEWbCKzEwrREoQ+W0="} 2 | {"id": "0b6dcd73-6187-464a-a391-a700b2d35c46", "payload": "F+BdUW5gM66hBd4Rr4iUl0unEhH0jwcc+6HzvNdNOS5JxsdXqZLhf2WeKQJrEoSakQpm5Qvpkrs6r2jSVMQ8ARGIXcb+/exOnCq/YwHxta7VsKwE5ZUqA1ULicnKPR0hN9z2gAR+CVqSYcFK7YZ6miFoQOopub6WslyFd+J05cg9S3ceNBL+bNfHRAbQkE6DRP5xnHjoXiR+66c="} 3 | {"id": "8b5e8453-3bd8-4779-b450-8281486d3729", "payload": "DXpSB+s2g/jAm1JnmFbiHXfQzs6LZC2WWneW6ueUwTvKY9nST6rwkcR8qF+7hzp5iN4FQA+RCGwTZIOUU7xAwNH0RA5T/KoSJU2sD2KBWBYykb+mfIHHEKt8Ufp1bQA8HhVU5PejEoYOirwPyl8JbvnF/6F0qvU00P4518Z2xugDaCbAU3LBNm98mmgMfpibGO70kp6w5NdBTYQ="} 4 | {"id": "02c4a0f4-1920-495c-b2a4-5ac5f51555eb", "payload": "WFIVqOgc1IuMwBG2xOdcWu1HcWcpZVoTKN/igsAGBj0ZZ95cl88O45qyny4eriZhhUq6I9IickOHIS2aTYRAyX4XHe5I7N42rwDnSriHAaNhdqTne/zgsvCtWgYSwzSPhJ1D80cLmfxD1ErBxToZomEr7YZUuZAOPJEBR6hw4qvHBFT+GaMvPQ5C71Irblj1AfCMpsDpKCigYVI="} 5 | {"id": "02e8ca60-39e2-4662-a84d-21621be82709", "payload": 
"ePka0xALUTfFXmQ4vAmoaxNGWNFE4Jo8Q2Vu6DY9Tmu6MjixXEXXGLZylJ7Gd133DzIbf3yjpKUqtORkIqpGvxtihmLO36P393lgCkxBM4gU/VcAr1IPhy9qjHTgzLgnI364NzPaE94/TqKLibmtsuD1tmXoGhr9hhcb+LRFZWOUjL5ndCfqx16/Fv2pBAqPsZmMcAxms5FDyzI="} 6 | -------------------------------------------------------------------------------- /tests/resources/cli/server_b_keys.json: -------------------------------------------------------------------------------- 1 | { 2 | "private_key": "766B14C6899560BD0B136043AA4817AFA0D5ECD0E17BE47896AD9F5F72C1862C", 3 | "public_key": "78D9E153651EFD04C07B95492F0485B743AA77013D8FC317DCAE33BACDC32D0A" 4 | } 5 | -------------------------------------------------------------------------------- /tests/resources/cli/shared_seed.json: -------------------------------------------------------------------------------- 1 | { 2 | "shared_seed": "nedME1QT1TS+7asOVOBqnA==" 3 | } 4 | -------------------------------------------------------------------------------- /tests/test_origin_indexing.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | from pathlib import Path 4 | from uuid import uuid4 5 | 6 | import pytest 7 | from click.testing import CliRunner 8 | from prio_processor.origin import indexing 9 | from pyspark.sql import Row 10 | 11 | 12 | @pytest.fixture() 13 | def config_path(): 14 | return Path(__file__).parent.parent / "config" 15 | 16 | 17 | @pytest.fixture() 18 | def origins_dict(config_path): 19 | path = config_path / "telemetry_origin_data_inc.json" 20 | with open(path) as f: 21 | return json.load(f) 22 | 23 | 24 | @pytest.fixture() 25 | def config(config_path): 26 | path = config_path / "content.json" 27 | with open(path) as f: 28 | return json.load(f) 29 | 30 | 31 | def test_origins_dict(origins_dict): 32 | indexing.validate_origins(origins_dict) 33 | assert sorted(origins_dict[0].keys()) == sorted(["name", "hash", "index"]) 34 | assert len(origins_dict) == origins_dict[-1]["index"] + 1 35 | 36 | 37 | def test_config(config): 38 | batch_id = "content.blocking_blocked-{index}" 39 | (part_0, part_1) = [ 40 | [d["n_data"] for d in config if d["batch_id"] == batch_id.format(index=i)][0] 41 | for i in (0, 1) 42 | ] 43 | assert part_0 == 2046 44 | assert part_1 == 441 45 | 46 | 47 | @pytest.fixture() 48 | def prio_aggregated_data(tmp_path, spark, config): 49 | """ 50 | ├── _SUCCESS 51 | ├── batch_id=content.blocking_blocked-0 52 | │ └── part-00000-45945db7-4b6d-4eef-9e6f-76f98a3aefd4.c000.json 53 | ├── batch_id=content.blocking_blocked-1 54 | │ └── part-00001-45945db7-4b6d-4eef-9e6f-76f98a3aefd4.c000.json 55 | ... 
56 | └── batch_id=content.blocking_storage_access_api_exempt_TESTONLY-1 57 | └── part-00011-45945db7-4b6d-4eef-9e6f-76f98a3aefd4.c000.json 58 | """ 59 | output = str(tmp_path / "data") 60 | rows = [] 61 | for d in config: 62 | batch_id = d["batch_id"] 63 | n_data = d["n_data"] 64 | # write data in such a way where each aggregate value matches to the 65 | # index value 66 | if int(batch_id.split("-")[1]) == 1: 67 | offset = 2046 68 | else: 69 | offset = 0 70 | datum = [offset + i for i in range(n_data)] 71 | row = Row( 72 | batch_id=batch_id, 73 | id=str(uuid4()), 74 | timestamp=datetime.utcnow().isoformat(), 75 | payload=datum, 76 | ) 77 | rows.append(row) 78 | df = spark.createDataFrame(rows) 79 | df.write.partitionBy("batch_id").json(output) 80 | return output 81 | 82 | 83 | def test_prio_aggregated_data_fixture(spark, prio_aggregated_data, config): 84 | df = spark.read.json(prio_aggregated_data) 85 | assert df.count() == len(config) 86 | 87 | 88 | def test_indexing_transform_unit(spark): 89 | whitelist = [ 90 | {"batch_id": "test-0", "n_data": 3}, 91 | {"batch_id": "test-1", "n_data": 2}, 92 | ] 93 | origins = [] 94 | for i, ch in enumerate("abcde"): 95 | origins.append({"name": ch, "hash": ch, "index": i}) 96 | 97 | def build_row(batch_id, payload): 98 | return Row( 99 | batch_id=batch_id, 100 | id=str(uuid4()), 101 | timestamp=datetime.utcnow().isoformat(), 102 | payload=payload, 103 | ) 104 | 105 | data = [build_row("test-0", [0, 1, 2]), build_row("test-1", [3, 4])] 106 | df = spark.createDataFrame(data) 107 | transformed = indexing.transform(df, whitelist, origins) 108 | assert transformed.count() == 5 109 | assert transformed.where("index <> aggregate").count() == 0 110 | 111 | with pytest.raises(Exception): 112 | whitelist["test-3"] = 1 113 | # `origins` doesn't need to be modified because transform should throw before then 114 | data.append(build_row("test-3", [5])) 115 | indexing.transform(spark.createDataFrame(data), whitelist, origins).count() 116 | 117 | 118 | def test_indexing_transform(spark, prio_aggregated_data, config, origins_dict): 119 | df = spark.read.json(prio_aggregated_data) 120 | transformed = indexing.transform(df, config, origins_dict) 121 | 122 | merged_batches = {} 123 | for d in config: 124 | batch_id = d["batch_id"] 125 | n_data = d["n_data"] 126 | key = batch_id.split("-")[0] 127 | merged_batches[key] = merged_batches.get(key, 0) + n_data 128 | 129 | assert transformed.select("batch_id").distinct().count() == len(merged_batches) 130 | assert transformed.count() == sum(merged_batches.values()) 131 | assert transformed.where("index <> aggregate").count() == 0 132 | 133 | 134 | def test_indexing_cli(spark, tmp_path, prio_aggregated_data, config_path): 135 | output = str(tmp_path / "output") 136 | runner = CliRunner() 137 | result = runner.invoke( 138 | indexing.run, 139 | [ 140 | "--input", 141 | prio_aggregated_data, 142 | "--output", 143 | output, 144 | "--config", 145 | str(config_path / "content.json"), 146 | "--origins", 147 | str(config_path / "telemetry_origin_data_inc.json"), 148 | ], 149 | catch_exceptions=False, 150 | ) 151 | assert result.exit_code == 0 152 | 153 | df = spark.read.json(output) 154 | assert df.count() > 0 155 | assert df.where("index <> aggregate").count() == 0 156 | -------------------------------------------------------------------------------- /tests/test_origin_origins.py: -------------------------------------------------------------------------------- 1 | import json 2 | import urllib.request 3 | from io import BytesIO 4 | 5 | 
import pytest 6 | from click.testing import CliRunner 7 | from prio_processor.origin import origins 8 | 9 | # First five origins from mozilla-central 10 | # https://hg.mozilla.org/mozilla-central/raw-file/tip/toolkit/components/telemetry/core/TelemetryOriginData.inc 11 | TELEMETRY_ORIGIN_DATA_INC = """ 12 | // dummy origin, this is used to be a counter of page loads. 13 | ORIGIN("PAGELOAD", "PAGELOAD") 14 | 15 | ORIGIN("advertstream.com", "lzPiT1FuoHNMKQ1Hw8AaTi68TokOB24ciFBqmCk62ek=") 16 | ORIGIN("kitaramedia.com", "r+U9PL3uMrjCKe8/T8goY9MHPA+6JckC3R+/1R9TQKA=") 17 | ORIGIN("questionmarket.com", "3KCO/qN+KmApmfH3RaXAmdR65Z/TRfrr6pds7aDKn1c=") 18 | ORIGIN("3lift.com", "33Xrix7c41Jc9q3InjMWHq+yKVoa/u2IB511kr4X+Ro=") 19 | """ 20 | 21 | 22 | @pytest.fixture() 23 | def mock_request(monkeypatch): 24 | def _mocked(*args, **kwargs): 25 | return BytesIO(TELEMETRY_ORIGIN_DATA_INC.encode()) 26 | 27 | monkeypatch.setattr(urllib.request, "urlopen", _mocked) 28 | 29 | 30 | def test_origins_cli(mock_request, tmp_path): 31 | output = str(tmp_path / "output") 32 | runner = CliRunner() 33 | result = runner.invoke( 34 | origins.run, ["--output", str(output)], catch_exceptions=False 35 | ) 36 | assert result.exit_code == 0 37 | 38 | with open(output) as f: 39 | data = json.load(f) 40 | 41 | assert len(data) == 6 42 | assert data[0] == {"name": "PAGELOAD", "hash": "PAGELOAD", "index": 0} 43 | assert data[-2] == { 44 | "name": "3lift.com", 45 | "hash": "33Xrix7c41Jc9q3InjMWHq+yKVoa/u2IB511kr4X+Ro=", 46 | "index": 4, 47 | } 48 | assert data[-1] == {"name": "__UNKNOWN__", "hash": "__UNKNOWN__", "index": 5} 49 | -------------------------------------------------------------------------------- /tests/test_prio_wrapper_client.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | import pytest 6 | from prio_processor.prio import wrapper as prio 7 | from prio import PrioContext, libprio 8 | 9 | 10 | @PrioContext() 11 | @pytest.mark.parametrize("n_clients", [1, 2, 10]) 12 | def test_client_agg(n_clients): 13 | seed = prio.PRGSeed() 14 | 15 | skA, pkA = prio.create_keypair() 16 | skB, pkB = prio.create_keypair() 17 | 18 | # the config is shared across all actors 19 | config = prio.Config(133, pkA, pkB, b"test_batch") 20 | 21 | sA = prio.Server(config, prio.PRIO_SERVER_A, skA, seed) 22 | sB = prio.Server(config, prio.PRIO_SERVER_B, skB, seed) 23 | 24 | client = prio.Client(config) 25 | 26 | n_data = config.num_data_fields() 27 | data_items = bytes([(i % 3 == 1) or (i % 5 == 1) for i in range(n_data)]) 28 | 29 | for i in range(n_clients): 30 | for_server_a, for_server_b = client.encode(data_items) 31 | 32 | # Setup verification 33 | vA = sA.create_verifier(for_server_a) 34 | vB = sB.create_verifier(for_server_b) 35 | 36 | # Produce a packet1 and send to the other party 37 | p1A = vA.create_verify1() 38 | p1B = vB.create_verify1() 39 | 40 | # Produce packet2 and send to the other party 41 | p2A = vA.create_verify2(p1A, p1B) 42 | p2B = vB.create_verify2(p1A, p1B) 43 | 44 | assert vA.is_valid(p2A, p2B) 45 | assert vB.is_valid(p2A, p2B) 46 | 47 | sA.aggregate(vA) 48 | sB.aggregate(vB) 49 | 50 | t_a = sA.total_shares() 51 | t_b = sB.total_shares() 52 | 53 | output = prio.total_share_final(config, t_a, t_b) 54 | 55 | expected = [item * n_clients for item in list(data_items)] 56 | assert list(output) == expected 57 | 58 | 59 | @PrioContext() 60 | def test_publickey_export(): 61 | raw_bytes = bytes((3 * x + 7) % 0xFF for x in range(libprio.CURVE25519_KEY_LEN)) 62 | pubkey = prio.PublicKey().import_bin(raw_bytes) 63 | raw_bytes2 = pubkey.export_bin() 64 | 65 | assert raw_bytes == raw_bytes2 66 | 67 | 68 | @PrioContext() 69 | @pytest.mark.parametrize( 70 | "hex_bytes", 71 | [ 72 | b"102030405060708090A0B0C0D0E0F00000FFEEDDCCBBAA998877665544332211", 73 | b"102030405060708090a0B0C0D0E0F00000FfeEddcCbBaa998877665544332211", 74 | ], 75 | ) 76 | def test_publickey_import_hex(hex_bytes): 77 | expect = bytes( 78 | [ 79 | 0x10, 80 | 0x20, 81 | 0x30, 82 | 0x40, 83 | 0x50, 84 | 0x60, 85 | 0x70, 86 | 0x80, 87 | 0x90, 88 | 0xA0, 89 | 0xB0, 90 | 0xC0, 91 | 0xD0, 92 | 0xE0, 93 | 0xF0, 94 | 0x00, 95 | 0x00, 96 | 0xFF, 97 | 0xEE, 98 | 0xDD, 99 | 0xCC, 100 | 0xBB, 101 | 0xAA, 102 | 0x99, 103 | 0x88, 104 | 0x77, 105 | 0x66, 106 | 0x55, 107 | 0x44, 108 | 0x33, 109 | 0x22, 110 | 0x11, 111 | ] 112 | ) 113 | 114 | pubkey = prio.PublicKey().import_hex(hex_bytes) 115 | raw_bytes = pubkey.export_bin() 116 | 117 | assert raw_bytes == expect 118 | 119 | 120 | @PrioContext() 121 | def test_publickey_import_hex_bad_length_raises_exception(): 122 | hex_bytes = b"102030405060708090A" 123 | pubkey = prio.PublicKey() 124 | with pytest.raises(RuntimeError): 125 | pubkey.import_hex(hex_bytes) 126 | 127 | 128 | @PrioContext() 129 | def test_publickey_export_hex(): 130 | # the output includes the null-byte 131 | expect = b"102030405060708090A0B0C0D0E0F00000FFEEDDCCBBAA998877665544332211" 132 | raw_bytes = bytes( 133 | [ 134 | 0x10, 135 | 0x20, 136 | 0x30, 137 | 0x40, 138 | 0x50, 139 | 0x60, 140 | 0x70, 141 | 0x80, 142 | 0x90, 143 | 0xA0, 144 | 0xB0, 145 | 0xC0, 146 | 0xD0, 147 | 0xE0, 148 | 0xF0, 149 | 0x00, 150 | 0x00, 151 | 0xFF, 152 | 0xEE, 153 | 0xDD, 154 | 0xCC, 155 | 0xBB, 156 | 0xAA, 157 | 0x99, 158 | 0x88, 159 | 0x77, 160 | 0x66, 161 | 0x55, 162 | 0x44, 163 | 0x33, 164 | 0x22, 165 | 0x11, 166 | ] 167 
| ) 168 | pubkey = prio.PublicKey().import_bin(raw_bytes) 169 | hex_bytes = pubkey.export_hex() 170 | assert bytes(hex_bytes) == expect 171 | 172 | 173 | @PrioContext() 174 | def test_publickey_export_missing_key(): 175 | pubkey = prio.PublicKey() 176 | assert pubkey.export_bin() is None 177 | assert pubkey.export_hex() is None 178 | 179 | 180 | @PrioContext() 181 | def test_privatekey(): 182 | pvtkey, pubkey = prio.create_keypair() 183 | pvtdata = pvtkey.export_bin() 184 | pubdata = pubkey.export_bin() 185 | new_pvtkey = prio.PrivateKey().import_bin(pvtdata, pubdata) 186 | assert pvtdata == new_pvtkey.export_bin() 187 | -------------------------------------------------------------------------------- /tests/test_prio_wrapper_serialize.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | import pickle 6 | import pytest 7 | from prio_processor.prio import wrapper as prio 8 | from prio import libprio 9 | 10 | 11 | @pytest.fixture(autouse=True) 12 | def init(): 13 | # Note: PrioContext breaks with the fixtures 14 | libprio.Prio_init() 15 | yield 16 | libprio.Prio_clear() 17 | 18 | 19 | @pytest.fixture 20 | def seed(): 21 | return prio.PRGSeed() 22 | 23 | 24 | @pytest.fixture 25 | def serverA_keypair(): 26 | return prio.create_keypair() 27 | 28 | 29 | @pytest.fixture 30 | def serverB_keypair(): 31 | return prio.create_keypair() 32 | 33 | 34 | @pytest.fixture 35 | def config(serverA_keypair, serverB_keypair): 36 | _, pkA = serverA_keypair 37 | _, pkB = serverB_keypair 38 | return prio.Config(133, pkA, pkB, b"test_batch") 39 | 40 | 41 | @pytest.fixture 42 | def serverA(seed, config, serverA_keypair): 43 | sk, _ = serverA_keypair 44 | return prio.Server(config, prio.PRIO_SERVER_A, sk, seed) 45 | 46 | 47 | @pytest.fixture 48 | def serverB(seed, config, serverB_keypair): 49 | sk, _ = serverB_keypair 50 | return prio.Server(config, prio.PRIO_SERVER_B, sk, seed) 51 | 52 | 53 | @pytest.fixture 54 | def client(config): 55 | return prio.Client(config) 56 | 57 | 58 | @pytest.mark.skip 59 | def test_serialize_verifier(config, client, serverA, serverB): 60 | n_data = config.num_data_fields() 61 | data_items = bytes([(i % 3 == 1) or (i % 5 == 1) for i in range(n_data)]) 62 | 63 | for_server_a, for_server_b = client.encode(data_items) 64 | 65 | vA = pickle.loads(pickle.dumps(serverA.create_verifier(for_server_a))) 66 | vB = serverB.create_verifier(for_server_b) 67 | 68 | p1A = vA.create_verify1() 69 | p1B = vB.create_verify1() 70 | 71 | p2A = vA.create_verify2(p1A, p1B) 72 | p2B = vB.create_verify2(p1A, p1B) 73 | 74 | assert vA.is_valid(p2A, p2B) 75 | assert vB.is_valid(p2A, p2B) 76 | 77 | 78 | def test_serialize_verify1(config, client, serverA, serverB): 79 | n_data = config.num_data_fields() 80 | data_items = bytes([(i % 3 == 1) or (i % 5 == 1) for i in range(n_data)]) 81 | 82 | for_server_a, for_server_b = client.encode(data_items) 83 | 84 | vA = serverA.create_verifier(for_server_a) 85 | vB = serverB.create_verifier(for_server_b) 86 | 87 | p1A = pickle.loads(pickle.dumps(vA.create_verify1())) 88 | p1B = vB.create_verify1() 89 | 90 | p2A = vA.create_verify2(p1A, p1B) 91 | p2B = vB.create_verify2(p1A, p1B) 92 | 93 | assert vA.is_valid(p2A, p2B) 94 | assert vB.is_valid(p2A, p2B) 95 | 96 | 97 | def test_serialize_verify2(config, client, serverA, serverB): 98 | 
n_data = config.num_data_fields() 99 | data_items = bytes([(i % 3 == 1) or (i % 5 == 1) for i in range(n_data)]) 100 | 101 | for_server_a, for_server_b = client.encode(data_items) 102 | 103 | vA = serverA.create_verifier(for_server_a) 104 | vB = serverB.create_verifier(for_server_b) 105 | 106 | p1A = vA.create_verify1() 107 | p1B = vB.create_verify1() 108 | 109 | p2A = pickle.loads(pickle.dumps(vA.create_verify2(p1A, p1B))) 110 | p2B = vB.create_verify2(p1A, p1B) 111 | 112 | assert vA.is_valid(p2A, p2B) 113 | assert vB.is_valid(p2A, p2B) 114 | 115 | 116 | def test_serialize_total_shares(config, client, serverA, serverB): 117 | n_data = config.num_data_fields() 118 | data_items = bytes([(i % 3 == 1) or (i % 5 == 1) for i in range(n_data)]) 119 | 120 | for_server_a, for_server_b = client.encode(data_items) 121 | 122 | vA = serverA.create_verifier(for_server_a) 123 | vB = serverB.create_verifier(for_server_b) 124 | 125 | p1A = vA.create_verify1() 126 | p1B = vB.create_verify1() 127 | 128 | p2A = vA.create_verify2(p1A, p1B) 129 | p2B = vB.create_verify2(p1A, p1B) 130 | 131 | assert vA.is_valid(p2A, p2B) 132 | assert vB.is_valid(p2A, p2B) 133 | 134 | serverA.aggregate(vA) 135 | serverB.aggregate(vB) 136 | 137 | t_a = pickle.loads(pickle.dumps(serverA.total_shares())) 138 | t_b = serverB.total_shares() 139 | output = prio.total_share_final(config, t_a, t_b) 140 | assert list(output) == list(data_items) 141 | --------------------------------------------------------------------------------
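A minimal usage sketch (not a file in this repository) showing how the origin CLI pieces exercised by the tests above fit together. The subcommand names (`fetch-origins`, `index`) and their options are taken from `prio_processor/origin/commands.py` and `prio_processor/origin/indexing.py` as listed in this section; the `working/` directories and `origins.json` path are hypothetical placeholders, and `python3 -m` is used only because the installed console-script name does not appear in this listing.

```bash
# Fetch the origin mapping used by Firefox telemetry and write it as JSON
# (the --url option defaults to the mozilla-central TelemetryOriginData.inc).
python3 -m prio_processor.origin.commands fetch-origins --output origins.json

# Map Prio aggregates back to origin names. Input/output paths are placeholders;
# the config whitelist path mirrors the one used by tests/test_origin_indexing.py.
python3 -m prio_processor.origin.commands index \
    --input working/aggregates \
    --output working/indexed \
    --config config/content.json \
    --origins origins.json
```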