├── .circleci └── config.yml ├── .coveragerc ├── .flake8 ├── .gitignore ├── .pyup.yml ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── Dockerfile.dev ├── Makefile ├── README.md ├── bin ├── README.md ├── dataproc.sh ├── export-avro.sh ├── run └── wait-for-it.sh ├── docker-compose.yml ├── mozaggregator ├── __init__.py ├── aggregator.py ├── bigquery.py ├── cli.py ├── config.py ├── db.py ├── mobile.py ├── parquet.py ├── service.py ├── sql.py └── trim_db.py ├── queries └── drop_non_quantum.sql ├── requirements ├── all.txt ├── build.txt └── tests.txt ├── script └── validation │ ├── README.md │ ├── entrypoint.sh │ ├── fetch_credentials.sh │ ├── fetch_stats.py │ ├── results.png │ ├── validate.py │ ├── validate_data_ref.py │ └── validate_data_test.py ├── setup.py └── tests ├── conftest.py ├── dataset.py ├── decoded.1.bq ├── mobile_dataset.py ├── test_aggregator.py ├── test_db.py ├── test_fixtures.py ├── test_mobile.py ├── test_parquet.py ├── test_service.py ├── test_trim_db.py └── utils.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # These environment variables must be set in CircleCI UI 2 | # 3 | # DOCKERHUB_REPO - docker hub repo, format: / 4 | # DOCKER_EMAIL - login info for docker hub 5 | # DOCKER_USER 6 | # DOCKER_PASS 7 | # 8 | 9 | version: 2 10 | jobs: 11 | build: 12 | docker: 13 | - image: docker:18.02.0-ce 14 | working_directory: ~/mozilla/python_mozaggregator 15 | steps: 16 | - checkout 17 | - setup_remote_docker 18 | - run: | 19 | printf '{"commit":"%s","version":"%s","source":"https://github.com/%s/%s","build":"%s"}\n' "$CIRCLE_SHA1" "$CIRCLE_TAG" "$CIRCLE_PROJECT_USERNAME" "$CIRCLE_PROJECT_REPONAME" "$CIRCLE_BUILD_URL" > version.json 20 | - run: docker build -t app:build . 21 | 22 | test: 23 | machine: true 24 | working_directory: ~/mozilla/python_mozaggregator 25 | steps: 26 | - checkout 27 | - run: 28 | name: Build 29 | command: make build 30 | - run: docker info 31 | - run: docker --version 32 | - run: make test 33 | 34 | deploy: 35 | docker: 36 | - image: docker:18.02.0-ce 37 | working_directory: ~/mozilla/python_mozaggregator 38 | steps: 39 | - checkout 40 | - setup_remote_docker 41 | - run: | 42 | printf '{"commit":"%s","version":"%s","source":"https://github.com/%s/%s","build":"%s"}\n' "$CIRCLE_SHA1" "$CIRCLE_TAG" "$CIRCLE_PROJECT_USERNAME" "$CIRCLE_PROJECT_REPONAME" "$CIRCLE_BUILD_URL" > version.json 43 | - run: docker build -t app:build . 44 | - run: 45 | name: Deploy to Dockerhub 46 | command: | 47 | # Deploy main 48 | if [ "${CIRCLE_BRANCH}" == "main" ]; then 49 | docker login -u $DOCKER_USER -p $DOCKER_PASS 50 | docker tag app:build ${DOCKERHUB_REPO}:latest 51 | docker push ${DOCKERHUB_REPO}:latest 52 | elif [ ! -z "${CIRCLE_TAG}" ]; then 53 | # Deploy a release tag... 
54 | docker login -u $DOCKER_USER -p $DOCKER_PASS 55 | echo "${DOCKERHUB_REPO}:${CIRCLE_TAG}" 56 | docker tag app:build "${DOCKERHUB_REPO}:${CIRCLE_TAG}" 57 | docker images 58 | docker push "${DOCKERHUB_REPO}:${CIRCLE_TAG}" 59 | fi 60 | 61 | 62 | workflows: 63 | version: 2 64 | build-test-deploy: 65 | jobs: 66 | - build: 67 | filters: 68 | tags: 69 | only: /.*/ 70 | 71 | - test: 72 | filters: 73 | tags: 74 | only: /.*/ 75 | 76 | - deploy: 77 | requires: 78 | - test 79 | filters: 80 | tags: 81 | only: /.*/ 82 | branches: 83 | only: main 84 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = mozaggregator 3 | branch = True 4 | data_file = /tmp/.coverage 5 | 6 | [report] 7 | omit = 8 | tests/* 9 | 10 | [xml] 11 | output = /tmp/coverage.xml 12 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # E501: This project is ok with long lines. 3 | ignore = E501 4 | exclude = setup.py, ansible/ 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .vagrant/ 3 | *.egg-info/ -------------------------------------------------------------------------------- /.pyup.yml: -------------------------------------------------------------------------------- 1 | update: all 2 | search: False 3 | requirements: 4 | - requirements/build.txt 5 | - requirements/tests.txt 6 | schedule: "every week on monday" 7 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Community Participation Guidelines 2 | 3 | This repository is governed by Mozilla's code of conduct and etiquette guidelines. 4 | For more details, please read the 5 | [Mozilla Community Participation Guidelines](https://www.mozilla.org/about/governance/policies/participation/). 6 | 7 | ## How to Report 8 | For more information on how to report violations of the Community Participation Guidelines, please read our '[How to Report](https://www.mozilla.org/about/governance/policies/participation/reporting/)' page. 9 | 10 | 16 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-stretch 2 | 3 | ENV PYTHONUNBUFFERED=1 \ 4 | POSTGRES_USER=root \ 5 | POSTGRES_DB=telemetry \ 6 | PORT=5000 7 | 8 | EXPOSE $PORT 9 | 10 | # Install Java 11 | RUN apt-get update -y && \ 12 | apt-get install -y --no-install-recommends openjdk-8-jdk 13 | 14 | # Install Cloud SDK: https://cloud.google.com/sdk/docs/quickstart-debian-ubuntu 15 | RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | \ 16 | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ 17 | curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | \ 18 | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \ 19 | apt-get update -y && \ 20 | apt-get install google-cloud-sdk -y 21 | 22 | RUN apt-get update -y && \ 23 | apt-get install -y --no-install-recommends \ 24 | # production only libs on next line. 
25 | gcc awscli net-tools \ 26 | libsnappy-dev liblzma-dev g++ curl libpq-dev bzip2 libffi-dev \ 27 | libblas-dev liblapack-dev wget ca-certificates openssl libssl-dev \ 28 | postgresql && \ 29 | apt-get autoremove -y && \ 30 | apt-get clean && \ 31 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 32 | 33 | # add a non-privileged user for installing and running the application 34 | RUN mkdir /app && \ 35 | chown 10001:10001 /app && \ 36 | groupadd --gid 10001 app && \ 37 | useradd --no-create-home --uid 10001 --gid 10001 --home-dir /app app 38 | 39 | # Install Python dependencies 40 | COPY requirements/*.txt /tmp/requirements/ 41 | 42 | # Switch to /tmp to install dependencies outside home dir 43 | WORKDIR /tmp 44 | RUN pip install --upgrade pip && \ 45 | pip install --no-cache-dir -r requirements/build.txt 46 | 47 | ENV PYSPARK_PYTHON=python \ 48 | SPARK_HOME=/usr/local/lib/python3.7/site-packages/pyspark 49 | 50 | RUN wget --directory-prefix $SPARK_HOME/jars/ https://storage.googleapis.com/spark-lib/bigquery/spark-bigquery-latest.jar 51 | RUN wget --directory-prefix $SPARK_HOME/jars/ https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop2-latest.jar 52 | RUN wget --directory-prefix $SPARK_HOME/jars/ https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.7.3/hadoop-aws-2.7.3.jar 53 | RUN wget --directory-prefix $SPARK_HOME/jars/ https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar 54 | RUN wget --directory-prefix $SPARK_HOME/jars/ https://repo1.maven.org/maven2/org/apache/spark/spark-avro_2.11/2.4.3/spark-avro_2.11-2.4.3.jar 55 | 56 | # Switch back to home directory 57 | WORKDIR /app 58 | 59 | COPY . /app 60 | 61 | RUN chown -R 10001:10001 /app 62 | 63 | USER 10001 64 | 65 | ENTRYPOINT ["/usr/local/bin/gunicorn"] 66 | 67 | CMD ["mozaggregator.service:app", "-k", "gevent", "--bind", "0.0.0.0:5000"] 68 | -------------------------------------------------------------------------------- /Dockerfile.dev: -------------------------------------------------------------------------------- 1 | FROM python:3.7-stretch 2 | 3 | ENV PYTHONUNBUFFERED=1 \ 4 | PYTHONDONTWRITEBYTECODE=1 \ 5 | DEVELOPMENT=1 \ 6 | POSTGRES_HOST=db \ 7 | POSTGRES_USER=postgres \ 8 | PORT=5000 \ 9 | DB_PORT=5432 \ 10 | # No boto. See https://github.com/travis-ci/travis-ci/issues/7940 11 | BOTO_CONFIG=/dev/null \ 12 | DB_TEST_URL="dbname=postgres user=postgres host=db" \ 13 | PYTHONPATH=$PYTHONPATH:. 
14 | 15 | EXPOSE $PORT 16 | 17 | # Install Java 18 | RUN apt-get update -y && \ 19 | apt-get install -y --no-install-recommends openjdk-8-jdk 20 | 21 | # Install Cloud SDK: https://cloud.google.com/sdk/docs/quickstart-debian-ubuntu 22 | RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | \ 23 | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ 24 | curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | \ 25 | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \ 26 | apt-get update -y && \ 27 | apt-get install google-cloud-sdk -y 28 | 29 | RUN apt-get update -y && \ 30 | apt-get install -y --no-install-recommends \ 31 | gcc awscli net-tools \ 32 | libsnappy-dev liblzma-dev g++ curl libpq-dev bzip2 libffi-dev \ 33 | libblas-dev liblapack-dev wget ca-certificates openssl libssl-dev \ 34 | postgresql && \ 35 | apt-get autoremove -y && \ 36 | apt-get clean && \ 37 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 38 | 39 | # add a non-privileged user for installing and running the application 40 | RUN mkdir /app && \ 41 | chown 10001:10001 /app && \ 42 | groupadd --gid 10001 app && \ 43 | useradd --no-create-home --uid 10001 --gid 10001 --home-dir /app app 44 | 45 | # Install Python dependencies 46 | COPY requirements/*.txt /tmp/requirements/ 47 | 48 | # Switch to /tmp to install dependencies outside home dir 49 | WORKDIR /tmp 50 | RUN pip install --upgrade pip && \ 51 | pip install --no-cache-dir -r requirements/all.txt 52 | 53 | ENV PYSPARK_PYTHON=python \ 54 | SPARK_HOME=/usr/local/lib/python3.7/site-packages/pyspark 55 | 56 | RUN wget --directory-prefix $SPARK_HOME/jars/ https://storage.googleapis.com/spark-lib/bigquery/spark-bigquery-latest.jar 57 | RUN wget --directory-prefix $SPARK_HOME/jars/ https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop2-latest.jar 58 | RUN wget --directory-prefix $SPARK_HOME/jars/ https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.7.3/hadoop-aws-2.7.3.jar 59 | RUN wget --directory-prefix $SPARK_HOME/jars/ https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar 60 | # although scala 2.11 has been deprecated since spark 2.4.1, it is still the default scala version in this dockerfile 61 | RUN wget --directory-prefix $SPARK_HOME/jars/ https://repo1.maven.org/maven2/org/apache/spark/spark-avro_2.11/2.4.3/spark-avro_2.11-2.4.3.jar 62 | 63 | # Switch back to home directory 64 | WORKDIR /app 65 | COPY . 
/app 66 | RUN chown -R 10001:10001 /app 67 | 68 | USER 10001 69 | 70 | ENTRYPOINT ["/bin/bash", "/app/bin/run"] 71 | 72 | CMD ["serve"] 73 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build clean test shell stop up 2 | 3 | help: 4 | @echo "Welcome to the Python Mozaggregator\n" 5 | @echo "The list of commands for local development:\n" 6 | @echo " build Builds the docker images for the docker-compose setup" 7 | @echo " clean Stops and removes all docker containers" 8 | @echo " shell Opens a Bash shell" 9 | @echo " stop Stops the docker containers" 10 | @echo " test Runs the Python test suite" 11 | @echo " up Runs the whole stack, served at http://localhost:5000/" 12 | 13 | build: 14 | docker-compose build 15 | 16 | clean: stop 17 | docker-compose rm -f 18 | 19 | shell: 20 | docker-compose run --service-ports web bash 21 | 22 | stop: 23 | docker-compose stop 24 | 25 | test: 26 | docker-compose run web test 27 | 28 | up: 29 | docker-compose up 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # python_mozaggregator 2 | 3 | Aggregator job for Telemetry. See this [blog](http://robertovitillo.com/2015/07/02/telemetry-metrics-roll-ups/) post for details. 4 | 5 | [![CircleCI](https://circleci.com/gh/mozilla/python_mozaggregator/tree/main.svg?style=svg)](https://circleci.com/gh/mozilla/python_mozaggregator/tree/main) 6 | 7 | ## Development and deployment 8 | 9 | To clean, build, and run all containers: 10 | ``` 11 | make up 12 | ``` 13 | 14 | To build containers and ssh into web container: 15 | ``` 16 | make shell 17 | ``` 18 | 19 | To manually ssh into running web container (e.g. after a 'make up'): 20 | ``` 21 | docker ps 22 | docker exec -it /bin/bash 23 | ``` 24 | 25 | To build and run tests inside dev container: 26 | ``` 27 | make test 28 | ``` 29 | 30 | To manually run tests on running web container: 31 | ``` 32 | make shell 33 | ./bin/run test 34 | ``` 35 | 36 | ## Deployment 37 | The following Env vars need to be set up in hiera-sops POSTGRES_HOST, POSTGRES_RO_HOST, POSTGRES_PASS 38 | There are jenkins pipeline jobs to deploy this. See cloudops-deployment/projects/mozaggregator for details 39 | 40 | ## Enabling and Disabling Metrics 41 | To completely disable viewing of a metric, add it to the `METRICS_BLACKLIST`. No matter how this is deployed, users will 42 | never be able to see that metric. Any regex can be matched against with the blacklist, for example to disable all metrics 43 | prefixed with "USER_DATA", put in `r"USER_DATA.*"`. 44 | 45 | ### Release Metrics 46 | By default, release metrics are not allowed by the service. To enable a specific release metric, add it to `PUBLIC_RELEASE_METRICS`. 47 | It will then be viewable publicly. 48 | 49 | To enable all release metrics (except those in `METRICS_BLACKLIST`), set the envvar `ALLOW_ALL_RELEASE_METRICS` to "True". 50 | 51 | ## API 52 | Aggregates are made available through a HTTP API. There are two kinds of aggregates: per submission date (date a ping is received by the server) and per build-id (date the submitting product was built). 53 | 54 | To access the aggregates use the ```aggregates_by/build_id/``` and ```aggregates_by/submission_date/``` prefix respectively. 55 | 56 | In the URLs below, replace `SERVICE` with the origin of this service's instance. 
The official service is `https://aggregates.telemetry.mozilla.org`. 57 | 58 | The following examples are based on build-id aggregates. Replace `build_id` with `submission_date` to use aggregates per submission date instead. 59 | 60 | ##### Get available channels: 61 | ```bash 62 | curl -X GET https://SERVICE/aggregates_by/build_id/channels/ 63 | ["nightly","beta","release"] 64 | ``` 65 | 66 | ##### Get a list of options for the available dimensions on a given channel and version: 67 | ```bash 68 | curl -X GET "https://SERVICE/filters/?channel=nightly&version=42" 69 | {"metric":["A11Y_CONSUMERS","A11Y_IATABLE_USAGE_FLAG",...], 70 | "application":["Fennec","Firefox"], 71 | ...} 72 | ``` 73 | 74 | ##### Get a list of available build-ids for a given channel: 75 | ```bash 76 | curl -X GET "https://SERVICE/aggregates_by/build_id/channels/nightly/dates/" 77 | [{"date":"20150630","version":"42"}, {"date":"20150629","version":"42"}] 78 | ``` 79 | 80 | ##### Given a set of build-ids, retrieve for each of build-id the aggregated histogram that complies with the requested filters: 81 | ```bash 82 | curl -X GET "https://SERVICE/aggregates_by/build_id/channels/nightly/?version=41&dates=20150615,20150616&metric=GC_MS&os=Windows_NT" 83 | {"buckets":[0, ..., 10000], 84 | "data":[{"date":"20150615", 85 | "count":239459, 86 | "sum": 412346123, 87 | "histogram":[309, ..., 5047], 88 | "label":""}, 89 | {"date":"20150616", 90 | "count":233688, 91 | "sum": 402241121, 92 | "histogram":[306, ..., 7875], 93 | "label":""}], 94 | "kind":"exponential", 95 | "description":"Time spent running JS GC (ms)"} 96 | ``` 97 | 98 | The available filters are: 99 | - `metric`, e.g. JS_TELEMETRY_ADDON_EXCEPTIONS 100 | - `application`, e.g. Firefox 101 | - `architecture`, e.g. x86 102 | - `os`, e.g. Windows_NT 103 | - `osVersion`, e.g. 6.1 104 | - `label`, e.g Adblock-Plus 105 | - `child`, e.g. true, meaningful only if e10s is enabled 106 | 107 | A reply has the following attributes: 108 | - `buckets`, which represents the bucket labels of the histogram 109 | - `kind`, the kind of histogram (e.g. exponential) 110 | - `data`, which is an array of metric objects with the following attributes: 111 | - `date`: a build-id 112 | - `count`: number of metrics aggregated 113 | - `sum`: sum of accumulated values 114 | - `histogram`: bucket values 115 | - `description`: histogram description 116 | - `label`: for keyed histograms, the key the entry belongs to, or otherwise a blank string 117 | 118 | Keyed histograms have the same format as unkeyed histograms, but there can possibly be multiple metric objects with the same date, each with a different key (`label`). 119 | -------------------------------------------------------------------------------- /bin/README.md: -------------------------------------------------------------------------------- 1 | # `python_mozaggregator/bin` 2 | 3 | This folder contains script to execute the Docker containers. It also contains 4 | scripts for managing workflows. 5 | 6 | ## `wait-for-it` 7 | 8 | The "wait-for-it" shell script comes from https://github.com/vishnubob/wait-for-it 9 | 10 | To update this file execute the following command: 11 | 12 | ```bash 13 | wget https://raw.githubusercontent.com/vishnubob/wait-for-it/master/wait-for-it.sh -O bin/wait-for-it.sh 14 | ``` 15 | 16 | ## `dataproc` testing harness 17 | 18 | This script is used to spin up a DataProc cluster with the necessary jars for 19 | connecting to BigQuery. Clusters will automatically delete themselves after 10 20 | minutes of idle. 
Any command defined in `python_mozaggregator` can be run using 21 | this harness. 22 | 23 | ```bash 24 | NUM_WORKERS=5 bin/dataproc.sh \ 25 | mobile \ 26 | --output gs://amiyaguchi-dev/mozaggregator/mobile_test/nonprod/20191101/ \ 27 | --num-partitions 200 \ 28 | --date 20191101 \ 29 | --source avro \ 30 | --avro-prefix gs://amiyaguchi-dev/avro-mozaggregator/moz-fx-data-shar-nonprod-efed 31 | ``` 32 | 33 | ## `export-avro` 34 | 35 | This script can be run locally or through the docker image. You will need to 36 | have access to a sandbox account with access to the 37 | `moz-fx-data-shar-nonprod-efed` or `moz-fx-data-shared-prod` projects. 38 | 39 | ```bash 40 | export PROJECT_ID=... 41 | 42 | bin/export-avro.sh \ 43 | moz-fx-data-shar-nonprod-efed \ 44 | ${PROJECT_ID}:avro_export \ 45 | gs://${PROJECT_ID}/avro-mozaggregator \ 46 | "main_v4" \ 47 | "'nightly', 'beta'" \ 48 | 2019-12-15 49 | ``` 50 | 51 | This can also be run through the docker image: 52 | 53 | ```bash 54 | export PROJECT_ID=... 55 | export GOOGLE_APPLICATION_CREDENTIALS=... 56 | 57 | docker run \ 58 | --entrypoint bash \ 59 | -v $GOOGLE_APPLICATION_CREDENTIALS:/tmp/credentials \ 60 | -e GOOGLE_APPLICATION_CREDENTIALS=/tmp/credentials \ 61 | -it mozilla/python_mozaggregator:latest \ 62 | gcloud auth activate-service-account --key-file /tmp/credentials && \ 63 | bin/export-avro.sh \ 64 | moz-fx-data-shar-nonprod-efed \ 65 | ${PROJECT_ID}:avro_export \ 66 | gs://${PROJECT_ID}/avro-mozaggregator \ 67 | "main_v4" \ 68 | "'nightly', 'beta'" \ 69 | 2019-12-15 70 | ``` 71 | 72 | The production settings for pre-release aggregates are as follows: 73 | 74 | ```bash 75 | "main_v4" "'nightly', 'beta'" 76 | "mobile_metrics_v1" "" 77 | ``` 78 | -------------------------------------------------------------------------------- /bin/dataproc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # A testing script for verifying the spark-bigquery connector with the existing 4 | # mozaggregator code. This requires `gcloud` to be configured to point at a 5 | # sandbox project for reading data from `payload_bytes_decoded`. 6 | 7 | set -e 8 | 9 | REGION=us-west1 10 | MODULE="python_mozaggregator" 11 | NUM_WORKERS=${NUM_WORKERS:-1} 12 | 13 | 14 | function bootstrap() { 15 | local bucket=$1 16 | 17 | # create the package artifacts 18 | rm -rf dist build 19 | python3 setup.py bdist_egg 20 | gsutil cp "dist/${MODULE}*.egg" "gs://${bucket}/bootstrap/${MODULE}.egg" 21 | 22 | # create the initialization script and runner 23 | mkdir -p bootstrap 24 | cd bootstrap 25 | echo "apt install --yes python-dev" > install-python-dev.sh 26 | tee mozaggregator-runner.py >/dev/null << EOF 27 | # This runner has been auto-generated from mozilla/python_mozaggregator/bin/dataproc.sh. 28 | # Any changes made to the runner file will be over-written on subsequent runs. 29 | from mozaggregator import cli 30 | 31 | try: 32 | cli.entry_point(auto_envvar_prefix="MOZETL") 33 | except SystemExit: 34 | # avoid calling sys.exit() in databricks 35 | # http://click.palletsprojects.com/en/7.x/api/?highlight=auto_envvar_prefix#click.BaseCommand.main 36 | pass 37 | EOF 38 | cd .. 
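    # Upload the generated runner and init script; create_cluster and submit below
    # reference these objects under gs://${bucket}/bootstrap/.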
39 | gsutil cp bootstrap/* "gs://${bucket}/bootstrap/" 40 | } 41 | 42 | 43 | function delete_cluster() { 44 | local cluster_id=$1 45 | gcloud dataproc clusters delete ${cluster_id} --region=${REGION} 46 | } 47 | 48 | 49 | function create_cluster() { 50 | local cluster_id=$1 51 | local bucket=$2 52 | requirements=$(tr "\n" " " < requirements/build.txt) 53 | 54 | function cleanup { 55 | delete_cluster ${cluster_id} 56 | } 57 | trap cleanup EXIT 58 | 59 | gcloud beta dataproc clusters create ${cluster_id} \ 60 | --image-version 1.4 \ 61 | --enable-component-gateway \ 62 | --worker-machine-type=n1-standard-8 \ 63 | --num-preemptible-workers ${NUM_WORKERS} \ 64 | --properties ^#^spark:spark.jars=gs://spark-lib/bigquery/spark-bigquery-latest.jar#spark:spark.hadoop.fs.s3a.access.key=${AWS_ACCESS_KEY_ID}#spark:spark.hadoop.fs.s3a.secret.key=${AWS_SECRET_ACCESS_KEY}#spark:spark.jars.packages=org.apache.spark:spark-avro_2.11:2.4.4#spark:spark.python.profile=true \ 65 | --metadata "PIP_PACKAGES=${requirements}" \ 66 | --initialization-actions \ 67 | gs://${bucket}/bootstrap/install-python-dev.sh,gs://dataproc-initialization-actions/python/pip-install.sh \ 68 | --region=${REGION} \ 69 | --max-idle 10m 70 | } 71 | 72 | 73 | function submit() { 74 | cluster_id=$1 75 | bucket=$2 76 | # pass the rest of the parameters from the main function 77 | shift 2 78 | gcloud dataproc jobs submit pyspark \ 79 | gs://${bucket}/bootstrap/mozaggregator-runner.py \ 80 | --cluster ${cluster_id} \ 81 | --region ${REGION} \ 82 | --py-files=gs://${bucket}/bootstrap/${MODULE}.egg \ 83 | -- "$@" 84 | } 85 | 86 | 87 | function main() { 88 | cd "$(dirname "$0")/.." 89 | bucket=$(gcloud config get-value project) 90 | cluster_id="test-mozaggregator-${RANDOM}" 91 | bootstrap $bucket 92 | create_cluster $cluster_id $bucket 93 | submit $cluster_id $bucket "$@" 94 | } 95 | 96 | 97 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then 98 | main "$@" 99 | fi 100 | -------------------------------------------------------------------------------- /bin/export-avro.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # A testing script for verifying the avro exports work with the existing 4 | # mozaggregator code. This requires `gcloud` to be configured to point at a 5 | # sandbox project for reading data from `payload_bytes_decoded`. There is a 6 | # 10 TB export limit per day, so be conservative with usage. 7 | 8 | set -eou pipefail 9 | 10 | # system agnostic way of obtaining yesterday's date, macOS' date utility doesnt provide -d 11 | function default_date() { 12 | python3 - <" 6 | exit 1 7 | } 8 | 9 | [ $# -lt 1 ] && usage 10 | 11 | # Only wait for backend services in development. 12 | # http://stackoverflow.com/a/13864829 13 | [ ! -z ${DEVELOPMENT+check} ] && ./bin/wait-for-it.sh db:${DB_PORT:-5432} --timeout=0 --strict 14 | 15 | case $1 in 16 | serve) 17 | exec python mozaggregator/service.py 0.0.0.0:${PORT} 18 | ;; 19 | test) 20 | pytest --cov=mozaggregator tests || exit 1 21 | 22 | if [[ ! -z ${CI+check} ]]; then 23 | echo "TODO: Set up codecov." 
24 | # bash <(curl -s https://codecov.io/bash) -s /tmp 25 | else 26 | coverage report -m 27 | fi 28 | ;; 29 | *) 30 | exec "$@" 31 | ;; 32 | esac 33 | -------------------------------------------------------------------------------- /bin/wait-for-it.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Use this script to test if a given TCP host/port are available 3 | 4 | cmdname=$(basename $0) 5 | 6 | echoerr() { if [[ $QUIET -ne 1 ]]; then echo "$@" 1>&2; fi } 7 | 8 | usage() 9 | { 10 | cat << USAGE >&2 11 | Usage: 12 | $cmdname host:port [-s] [-t timeout] [-- command args] 13 | -h HOST | --host=HOST Host or IP under test 14 | -p PORT | --port=PORT TCP port under test 15 | Alternatively, you specify the host and port as host:port 16 | -s | --strict Only execute subcommand if the test succeeds 17 | -q | --quiet Don't output any status messages 18 | -t TIMEOUT | --timeout=TIMEOUT 19 | Timeout in seconds, zero for no timeout 20 | -- COMMAND ARGS Execute command with args after the test finishes 21 | USAGE 22 | exit 1 23 | } 24 | 25 | wait_for() 26 | { 27 | if [[ $TIMEOUT -gt 0 ]]; then 28 | echoerr "$cmdname: waiting $TIMEOUT seconds for $HOST:$PORT" 29 | else 30 | echoerr "$cmdname: waiting for $HOST:$PORT without a timeout" 31 | fi 32 | start_ts=$(date +%s) 33 | while : 34 | do 35 | if [[ $ISBUSY -eq 1 ]]; then 36 | nc -z $HOST $PORT 37 | result=$? 38 | else 39 | (echo > /dev/tcp/$HOST/$PORT) >/dev/null 2>&1 40 | result=$? 41 | fi 42 | if [[ $result -eq 0 ]]; then 43 | end_ts=$(date +%s) 44 | echoerr "$cmdname: $HOST:$PORT is available after $((end_ts - start_ts)) seconds" 45 | break 46 | fi 47 | sleep 1 48 | done 49 | return $result 50 | } 51 | 52 | wait_for_wrapper() 53 | { 54 | # In order to support SIGINT during timeout: http://unix.stackexchange.com/a/57692 55 | if [[ $QUIET -eq 1 ]]; then 56 | timeout $BUSYTIMEFLAG $TIMEOUT $0 --quiet --child --host=$HOST --port=$PORT --timeout=$TIMEOUT & 57 | else 58 | timeout $BUSYTIMEFLAG $TIMEOUT $0 --child --host=$HOST --port=$PORT --timeout=$TIMEOUT & 59 | fi 60 | PID=$! 61 | trap "kill -INT -$PID" INT 62 | wait $PID 63 | RESULT=$? 64 | if [[ $RESULT -ne 0 ]]; then 65 | echoerr "$cmdname: timeout occurred after waiting $TIMEOUT seconds for $HOST:$PORT" 66 | fi 67 | return $RESULT 68 | } 69 | 70 | # process arguments 71 | while [[ $# -gt 0 ]] 72 | do 73 | case "$1" in 74 | *:* ) 75 | hostport=(${1//:/ }) 76 | HOST=${hostport[0]} 77 | PORT=${hostport[1]} 78 | shift 1 79 | ;; 80 | --child) 81 | CHILD=1 82 | shift 1 83 | ;; 84 | -q | --quiet) 85 | QUIET=1 86 | shift 1 87 | ;; 88 | -s | --strict) 89 | STRICT=1 90 | shift 1 91 | ;; 92 | -h) 93 | HOST="$2" 94 | if [[ $HOST == "" ]]; then break; fi 95 | shift 2 96 | ;; 97 | --host=*) 98 | HOST="${1#*=}" 99 | shift 1 100 | ;; 101 | -p) 102 | PORT="$2" 103 | if [[ $PORT == "" ]]; then break; fi 104 | shift 2 105 | ;; 106 | --port=*) 107 | PORT="${1#*=}" 108 | shift 1 109 | ;; 110 | -t) 111 | TIMEOUT="$2" 112 | if [[ $TIMEOUT == "" ]]; then break; fi 113 | shift 2 114 | ;; 115 | --timeout=*) 116 | TIMEOUT="${1#*=}" 117 | shift 1 118 | ;; 119 | --) 120 | shift 121 | CLI=("$@") 122 | break 123 | ;; 124 | --help) 125 | usage 126 | ;; 127 | *) 128 | echoerr "Unknown argument: $1" 129 | usage 130 | ;; 131 | esac 132 | done 133 | 134 | if [[ "$HOST" == "" || "$PORT" == "" ]]; then 135 | echoerr "Error: you need to provide a host and port to test." 
136 | usage 137 | fi 138 | 139 | TIMEOUT=${TIMEOUT:-15} 140 | STRICT=${STRICT:-0} 141 | CHILD=${CHILD:-0} 142 | QUIET=${QUIET:-0} 143 | 144 | # check to see if timeout is from busybox? 145 | # check to see if timeout is from busybox? 146 | TIMEOUT_PATH=$(realpath $(which timeout)) 147 | if [[ $TIMEOUT_PATH =~ "busybox" ]]; then 148 | ISBUSY=1 149 | BUSYTIMEFLAG="-t" 150 | else 151 | ISBUSY=0 152 | BUSYTIMEFLAG="" 153 | fi 154 | 155 | if [[ $CHILD -gt 0 ]]; then 156 | wait_for 157 | RESULT=$? 158 | exit $RESULT 159 | else 160 | if [[ $TIMEOUT -gt 0 ]]; then 161 | wait_for_wrapper 162 | RESULT=$? 163 | else 164 | wait_for 165 | RESULT=$? 166 | fi 167 | fi 168 | 169 | if [[ $CLI != "" ]]; then 170 | if [[ $RESULT -ne 0 && $STRICT -eq 1 ]]; then 171 | echoerr "$cmdname: strict mode, refusing to execute subprocess" 172 | exit $RESULT 173 | fi 174 | exec "${CLI[@]}" 175 | else 176 | exit $RESULT 177 | fi 178 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2.1' 2 | 3 | services: 4 | db: 5 | image: postgres:9.4 6 | ports: 7 | - "5432:5432" 8 | environment: 9 | - POSTGRES_HOST_AUTH_METHOD=trust 10 | 11 | web: 12 | build: 13 | context: . 14 | dockerfile: Dockerfile.dev 15 | ports: 16 | - "5000:5000" 17 | depends_on: 18 | - db 19 | command: serve 20 | volumes: 21 | - ./:/app/ 22 | - ${GOOGLE_APPLICATION_CREDENTIALS:-./setup.py}:/app/.credentials 23 | environment: 24 | - GOOGLE_APPLICATION_CREDENTIALS=/app/.credentials 25 | - PROJECT_ID 26 | - TMP_AVRO_PATH 27 | - AWS_ACCESS_KEY_ID 28 | - AWS_SECRET_ACCESS_KEY 29 | -------------------------------------------------------------------------------- /mozaggregator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/python_mozaggregator/6c0119bfd0b535346c37cb3f707d998039d3e24b/mozaggregator/__init__.py -------------------------------------------------------------------------------- /mozaggregator/aggregator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
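#
# This module aggregates histogram and scalar measurements from "main" pings:
# _map_ping_to_dimensions keys each ping on (submission_date, channel, version,
# build_id, application, architecture, os, os_version), and _aggregate_metrics
# reduces those keys into per-build-id and per-submission-date aggregates.
# Each aggregated metric is stored under a (metric_name, label, process_type)
# key as {"sum": ..., "count": ..., "histogram": {bucket: count, ...}}.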
7 | 8 | from collections import defaultdict 9 | 10 | from moztelemetry.dataset import Dataset 11 | from moztelemetry.histogram import cached_exponential_buckets 12 | 13 | from mozaggregator.bigquery import BigQueryDataset 14 | 15 | 16 | # Simple measurement, count histogram, and numeric scalars labels & prefixes 17 | SIMPLE_MEASURES_LABELS = cached_exponential_buckets(1, 30000, 50) 18 | COUNT_HISTOGRAM_LABELS = [ 19 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 23, 25, 27, 29, 31, 34, 20 | 37, 40, 43, 46, 50, 54, 58, 63, 68, 74, 80, 86, 93, 101, 109, 118, 128, 138, 149, 161, 174, 188, 21 | 203, 219, 237, 256, 277, 299, 323, 349, 377, 408, 441, 477, 516, 558, 603, 652, 705, 762, 824, 22 | 891, 963, 1041, 1125, 1216, 1315, 1422, 1537, 1662, 1797, 1943, 2101, 2271, 2455, 2654, 2869, 23 | 3102, 3354, 3626, 3920, 4238, 4582, 4954, 5356, 5791, 6261, 6769, 7318, 7912, 8554, 9249, 10000, 24 | ] 25 | NUMERIC_SCALARS_LABELS = COUNT_HISTOGRAM_LABELS 26 | 27 | SIMPLE_MEASURES_PREFIX = 'SIMPLE_MEASURES' 28 | COUNT_HISTOGRAM_PREFIX = '[[COUNT]]' 29 | NUMERIC_SCALARS_PREFIX = 'SCALARS' 30 | 31 | SCALAR_MEASURE_MAP = { 32 | SIMPLE_MEASURES_PREFIX: SIMPLE_MEASURES_LABELS, 33 | COUNT_HISTOGRAM_PREFIX: COUNT_HISTOGRAM_LABELS, 34 | NUMERIC_SCALARS_PREFIX: NUMERIC_SCALARS_LABELS 35 | } 36 | 37 | PROCESS_TYPES = {"parent", "content", "gpu"} 38 | 39 | def aggregate_metrics( 40 | sc, 41 | channels, 42 | submission_date, 43 | main_ping_fraction=1, 44 | num_reducers=10000, 45 | source="moztelemetry", 46 | project_id=None, 47 | dataset_id=None, 48 | avro_prefix=None, 49 | ): 50 | """ Returns the build-id and submission date aggregates for a given submission date. 51 | 52 | :param sc: A SparkContext instance 53 | :param channel: Either the name of a channel or a list/tuple of names 54 | :param submission-date: The submission date for which the data will be aggregated 55 | :param fraction: An approximative fraction of submissions to consider for aggregation 56 | """ 57 | if not isinstance(channels, (tuple, list)): 58 | channels = [channels] 59 | 60 | if source == "bigquery" and project_id and dataset_id: 61 | dataset = BigQueryDataset() 62 | pings = dataset.load( 63 | project_id, 64 | dataset_id, 65 | "main", 66 | submission_date, 67 | channels, 68 | "normalized_app_name <> 'Fennec'" 69 | ) 70 | elif source == "avro" and avro_prefix: 71 | dataset = BigQueryDataset() 72 | pings = dataset.load_avro( 73 | avro_prefix, 74 | "main", 75 | submission_date, 76 | channels, 77 | "normalized_app_name <> 'Fennec'" 78 | ) 79 | else: 80 | pings = Dataset.from_source('telemetry') \ 81 | .where(appUpdateChannel=lambda x: x in channels, 82 | submissionDate=submission_date, 83 | docType='main', 84 | sourceVersion='4', 85 | appName=lambda x: x != 'Fennec') \ 86 | .records(sc, sample=main_ping_fraction) 87 | return _aggregate_metrics(pings, num_reducers) 88 | 89 | 90 | def _aggregate_metrics(pings, num_reducers=10000): 91 | trimmed = pings.filter(_sample_clients).map(_map_ping_to_dimensions).filter(lambda x: x) 92 | build_id_aggregates = trimmed.aggregateByKey(defaultdict(dict), _aggregate_ping, _aggregate_aggregates, num_reducers) 93 | submission_date_aggregates = build_id_aggregates.map(_map_build_id_key_to_submission_date_key).reduceByKey(_aggregate_aggregates) 94 | return build_id_aggregates, submission_date_aggregates 95 | 96 | 97 | def _map_build_id_key_to_submission_date_key(aggregate): 98 | return tuple(aggregate[0][:3] + aggregate[0][4:]), aggregate[1] 99 | 100 | 101 | def _sample_clients(ping): 102 
| try: 103 | sample_id = ping.get("meta", {}).get("sampleId") 104 | 105 | if not isinstance(sample_id, (int, float)): 106 | return False 107 | 108 | # Here "aurora" is actually the dev edition. 109 | if ping.get("application", {}).get("channel") not in ("nightly", "aurora", "beta", "release"): 110 | return False 111 | 112 | return sample_id < 100 113 | except: # noqa 114 | return False 115 | 116 | 117 | def _extract_histograms(state, payload, process_type="parent"): 118 | if not isinstance(payload, dict): 119 | return 120 | 121 | histograms = payload.get("histograms", {}) 122 | _extract_main_histograms(state, histograms, process_type) 123 | 124 | keyed_histograms = payload.get("keyedHistograms", {}) 125 | if not isinstance(keyed_histograms, dict): 126 | return 127 | 128 | for name, histograms in keyed_histograms.items(): 129 | # See Bug 1275010 and 1275019 130 | if name in ["MESSAGE_MANAGER_MESSAGE_SIZE", 131 | "VIDEO_DETAILED_DROPPED_FRAMES_PROPORTION"]: 132 | continue 133 | _extract_keyed_histograms(state, name, histograms, process_type) 134 | 135 | 136 | def _extract_histogram(state, histogram, histogram_name, label, process_type): 137 | if not isinstance(histogram, dict): 138 | return 139 | 140 | values = histogram.get("values") 141 | if not isinstance(values, dict): 142 | return 143 | 144 | sum = histogram.get("sum") 145 | if not isinstance(sum, int) or sum < 0: 146 | return 147 | 148 | histogram_type = histogram.get("histogram_type") 149 | if not isinstance(histogram_type, int): 150 | return 151 | 152 | if histogram_type == 4: # Count histogram 153 | return _extract_scalar_value( 154 | state, '_'.join((COUNT_HISTOGRAM_PREFIX, histogram_name)), label, 155 | sum, COUNT_HISTOGRAM_LABELS, process_type=process_type) 156 | 157 | # Note that some dimensions don't vary within a single submissions 158 | # (e.g. channel) while some do (e.g. process type). 159 | # The latter should appear within the key of a single metric. 160 | accessor = (histogram_name, label, process_type) 161 | aggregated_histogram = state[accessor]["histogram"] = state[accessor].get("histogram", {}) 162 | 163 | state[accessor]["sum"] = state[accessor].get("sum", 0) + sum 164 | state[accessor]["count"] = state[accessor].get("count", 0) + 1 165 | for k, v in values.items(): 166 | try: 167 | int(k) 168 | except ValueError: 169 | # We have seen some histograms with non-integer bucket keys. 
170 | continue 171 | 172 | v = v if isinstance(v, int) else 0 173 | aggregated_histogram[k] = aggregated_histogram.get(k, 0) + v 174 | 175 | 176 | def _extract_main_histograms(state, histograms, process_type): 177 | if not isinstance(histograms, dict): 178 | return 179 | 180 | for histogram_name, histogram in histograms.items(): 181 | _extract_histogram(state, histogram, histogram_name, "", process_type) 182 | 183 | 184 | def _extract_keyed_histograms(state, histogram_name, histograms, process_type): 185 | if not isinstance(histograms, dict): 186 | return 187 | 188 | for key, histogram in histograms.items(): 189 | _extract_histogram(state, histogram, histogram_name, key, process_type) 190 | 191 | 192 | def _extract_simple_measures(state, simple, process_type="parent"): 193 | if not isinstance(simple, dict): 194 | return 195 | 196 | for name, value in simple.items(): 197 | if isinstance(value, dict): 198 | for sub_name, sub_value in value.items(): 199 | if isinstance(sub_value, (int, float)): 200 | _extract_scalar_value( 201 | state, 202 | "_".join((SIMPLE_MEASURES_PREFIX, name.upper(), sub_name.upper())), 203 | "", sub_value, SIMPLE_MEASURES_LABELS, process_type) 204 | elif isinstance(value, (int, float)): 205 | _extract_scalar_value( 206 | state, "_".join((SIMPLE_MEASURES_PREFIX, name.upper())), 207 | "", value, SIMPLE_MEASURES_LABELS, process_type) 208 | 209 | 210 | def _extract_scalars(state, process_payloads): 211 | for process in PROCESS_TYPES: 212 | _extract_numeric_scalars(state, process_payloads.get(process, {}).get("scalars", {}), process) 213 | _extract_keyed_numeric_scalars(state, process_payloads.get(process, {}).get("keyedScalars", {}), process) 214 | 215 | 216 | def _extract_numeric_scalars(state, scalar_dict, process): 217 | if not isinstance(scalar_dict, dict): 218 | return 219 | 220 | for name, value in scalar_dict.items(): 221 | if not isinstance(value, (int, float)): 222 | continue 223 | 224 | if name.startswith("browser.engagement.navigation"): 225 | continue 226 | 227 | scalar_name = "_".join((NUMERIC_SCALARS_PREFIX, name.upper())) 228 | _extract_scalar_value(state, scalar_name, "", value, NUMERIC_SCALARS_LABELS, process) 229 | 230 | 231 | def _extract_keyed_numeric_scalars(state, scalar_dict, process): 232 | if not isinstance(scalar_dict, dict): 233 | return 234 | 235 | for name, value in scalar_dict.items(): 236 | if not isinstance(value, dict): 237 | continue 238 | 239 | if name.startswith("browser.engagement.navigation"): 240 | continue 241 | 242 | scalar_name = "_".join((NUMERIC_SCALARS_PREFIX, name.upper())) 243 | for sub_name, sub_value in value.items(): 244 | if not isinstance(sub_value, (int, float)): 245 | continue 246 | 247 | _extract_scalar_value(state, scalar_name, sub_name.upper(), sub_value, NUMERIC_SCALARS_LABELS, process) 248 | 249 | 250 | def _extract_scalar_value(state, name, label, value, bucket_labels, process_type="parent"): 251 | if value < 0: # Afaik we are collecting only positive values 252 | return 253 | 254 | accessor = (name, label, process_type) 255 | aggregated_histogram = state[accessor]["histogram"] = state[accessor].get("histogram", {}) 256 | state[accessor]["sum"] = state[accessor].get("sum", 0) + value 257 | state[accessor]["count"] = state[accessor].get("count", 0) + 1 258 | 259 | insert_bucket = bucket_labels[0] # Initialized to underflow bucket 260 | for bucket in reversed(bucket_labels): 261 | if value >= bucket: 262 | insert_bucket = bucket 263 | break 264 | 265 | aggregated_histogram[str(insert_bucket)] = 
aggregated_histogram.get(str(insert_bucket), 0) + 1 266 | 267 | 268 | def _extract_child_payloads(state, child_payloads): 269 | if not isinstance(child_payloads, (list, tuple)): 270 | return 271 | 272 | for child in child_payloads: 273 | _extract_histograms(state, child, "content") 274 | _extract_simple_measures(state, child.get("simpleMeasurements", {}), "content") 275 | 276 | 277 | def _aggregate_ping(state, ping): 278 | if not isinstance(ping, dict): 279 | return 280 | 281 | _extract_scalars(state, ping.get("payload", {}).get("processes", {})) 282 | _extract_histograms(state, ping.get("payload", {})) 283 | _extract_simple_measures(state, ping.get("payload", {}).get("simpleMeasurements", {})) 284 | _extract_child_payloads(state, ping.get("payload", {}).get("childPayloads", {})) 285 | _extract_histograms(state, ping.get("payload", {}).get("processes", {}).get("content", {}), "content") 286 | _extract_histograms(state, ping.get("payload", {}).get("processes", {}).get("gpu", {}), "gpu") 287 | return state 288 | 289 | 290 | def _aggregate_aggregates(agg1, agg2): 291 | for metric, payload in agg2.items(): 292 | if metric not in agg1: 293 | agg1[metric] = payload 294 | continue 295 | 296 | agg1[metric]["count"] += payload["count"] 297 | agg1[metric]["sum"] += payload["sum"] 298 | 299 | for k, v in payload["histogram"].items(): 300 | agg1[metric]["histogram"][k] = agg1[metric]["histogram"].get(k, 0) + v 301 | 302 | return agg1 303 | 304 | 305 | def _trim_payload(payload): 306 | return {k: v for k, v in payload.items() 307 | if k in ["histograms", "keyedHistograms", "simpleMeasurements", "processes"]} 308 | 309 | 310 | def _map_ping_to_dimensions(ping): 311 | try: 312 | submission_date = ping["meta"]["submissionDate"] 313 | channel = ping["application"]["channel"] 314 | version = ping["application"]["version"].split('.')[0] 315 | build_id = ping["application"]["buildId"][:8] 316 | application = ping["application"]["name"] 317 | architecture = ping["application"]["architecture"] 318 | os = ping["environment"]["system"]["os"]["name"] 319 | os_version = ping["environment"]["system"]["os"]["version"] 320 | 321 | if os == "Linux": 322 | os_version = str(os_version)[:3] 323 | 324 | try: 325 | int(build_id) 326 | except ValueError: 327 | return None 328 | 329 | subset = {} 330 | subset["payload"] = _trim_payload(ping["payload"]) 331 | subset["payload"]["childPayloads"] = [_trim_payload(c) for c in ping["payload"].get("childPayloads", [])] 332 | 333 | # Note that some dimensions don't vary within a single submissions 334 | # (e.g. channel) while some do (e.g. process type). 335 | # Dimensions that don't vary should appear in the submission key, while 336 | # the ones that do vary should appear within the key of a single metric. 
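        # For illustration (hypothetical values), the returned key has the shape
        #   ("20150615", "nightly", "42", "20150610", "Firefox", "x86", "Windows_NT", "6.1"),
        # while `subset` carries the trimmed payload and trimmed childPayloads built above.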
337 | return ((submission_date, channel, version, build_id, application, architecture, os, os_version), subset) 338 | except: # noqa 339 | return None 340 | -------------------------------------------------------------------------------- /mozaggregator/bigquery.py: -------------------------------------------------------------------------------- 1 | import json 2 | import gzip 3 | 4 | from datetime import datetime, timedelta 5 | 6 | from pyspark.sql import Row, SparkSession 7 | 8 | 9 | class BigQueryDataset: 10 | def __init__(self): 11 | self.spark = SparkSession.builder.getOrCreate() 12 | 13 | @staticmethod 14 | def _date_add_days(date_ds, days): 15 | dt = datetime.strptime(date_ds, "%Y%m%d") 16 | return datetime.strftime(dt + timedelta(days), "%Y-%m-%d") 17 | 18 | @staticmethod 19 | def _extract_payload(row): 20 | """ 21 | The schema for the `payload_bytes_decoded` table is listed for reference. 22 | 23 | root 24 | |-- client_id: string (nullable = true) 25 | |-- document_id: string (nullable = true) 26 | |-- metadata: struct (nullable = true) 27 | | |-- document_namespace: string (nullable = true) 28 | | |-- document_type: string (nullable = true) 29 | | |-- document_version: string (nullable = true) 30 | | |-- geo: struct (nullable = true) 31 | | | |-- city: string (nullable = true) 32 | | | |-- country: string (nullable = true) 33 | | | |-- db_version: string (nullable = true) 34 | | | |-- subdivision1: string (nullable = true) 35 | | | |-- subdivision2: string (nullable = true) 36 | | |-- header: struct (nullable = true) 37 | | | |-- date: string (nullable = true) 38 | | | |-- dnt: string (nullable = true) 39 | | | |-- x_debug_id: string (nullable = true) 40 | | | |-- x_pingsender_version: string (nullable = true) 41 | | |-- uri: struct (nullable = true) 42 | | | |-- app_build_id: string (nullable = true) 43 | | | |-- app_name: string (nullable = true) 44 | | | |-- app_update_channel: string (nullable = true) 45 | | | |-- app_version: string (nullable = true) 46 | | |-- user_agent: struct (nullable = true) 47 | | | |-- browser: string (nullable = true) 48 | | | |-- os: string (nullable = true) 49 | | | |-- version: string (nullable = true) 50 | |-- normalized_app_name: string (nullable = true) 51 | |-- normalized_channel: string (nullable = true) 52 | |-- normalized_country_code: string (nullable = true) 53 | |-- normalized_os: string (nullable = true) 54 | |-- normalized_os_version: string (nullable = true) 55 | |-- payload: binary (nullable = true) 56 | |-- sample_id: long (nullable = true) 57 | |-- submission_timestamp: timestamp (nullable = true) 58 | """ 59 | # Data is stored in payload_bytes_decoded as gzip. 
60 | data = json.loads(gzip.decompress(row.payload).decode("utf-8")) 61 | # add `meta` fields for backwards compatibility 62 | data["meta"] = { 63 | "submissionDate": datetime.strftime(row.submission_timestamp, "%Y%m%d"), 64 | "sampleId": row.sample_id, 65 | # following 4 fields necessary for mobile_aggregates 66 | "normalizedChannel": row.normalized_channel, 67 | "appVersion": row.metadata.uri.app_version, 68 | "appBuildId": row.metadata.uri.app_build_id, 69 | "appName": row.metadata.uri.app_name, 70 | } 71 | return data 72 | 73 | def load( 74 | self, 75 | project_id, 76 | dataset_id, 77 | doc_type, 78 | submission_date, 79 | channels=None, 80 | filter_clause=None, 81 | fraction=1, 82 | doc_version="v4", 83 | ): 84 | 85 | start = self._date_add_days(submission_date, 0) 86 | end = self._date_add_days(submission_date, 1) 87 | 88 | date_clause = ( 89 | f"submission_timestamp >= '{start}' AND submission_timestamp < '{end}'" 90 | ) 91 | filters = [date_clause] 92 | if channels: 93 | # build up a clause like "(normalized_channel = 'nightly' OR normalized_channel = 'beta')" 94 | clauses = [f"normalized_channel = '{channel}'" for channel in channels] 95 | joined = f"({' OR '.join(clauses)})" 96 | filters.append(joined) 97 | if filter_clause: 98 | filters.append(filter_clause) 99 | 100 | df = ( 101 | self.spark.read.format("bigquery") 102 | # Assumes the namespace is telemetry 103 | .option( 104 | "table", 105 | f"{project_id}.{dataset_id}.telemetry_telemetry__{doc_type}_{doc_version}", 106 | ) 107 | .option("filter", " AND ".join(filters)) 108 | .load() 109 | ) 110 | 111 | # Size of the RDD sample is not deterministic 112 | return df.rdd.map(self._extract_payload).sample(False, fraction) 113 | 114 | def load_avro( 115 | self, 116 | prefix, 117 | doc_type, 118 | submission_date, 119 | channels=None, 120 | filter_clause=None, 121 | doc_version="v4", 122 | ): 123 | filters = [] 124 | if channels: 125 | # build up a clause like "(normalized_channel = 'nightly' OR normalized_channel = 'beta')" 126 | clauses = ' OR '.join([f"normalized_channel = '{channel}'" for channel in channels]) 127 | joined = f"({clauses})" 128 | filters.append(joined) 129 | if filter_clause: 130 | filters.append(filter_clause) 131 | 132 | df = self.spark.read.format("avro").load( 133 | f"{prefix}/{submission_date}/{doc_type}_{doc_version}" 134 | ) 135 | if filters: 136 | df.where(" AND ".join(filters)) 137 | 138 | return df.rdd.map(self._extract_payload) 139 | -------------------------------------------------------------------------------- /mozaggregator/cli.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime, timedelta 3 | from os import environ 4 | 5 | import click 6 | from pyspark.sql import SparkSession 7 | 8 | from mozaggregator import aggregator, db, parquet, mobile, trim_db 9 | 10 | DS_NODASH_YESTERDAY = datetime.strftime(datetime.utcnow() - timedelta(1), "%Y%m%d") 11 | 12 | 13 | @click.group() 14 | def entry_point(): 15 | pass 16 | 17 | 18 | @click.command() 19 | @click.option("--date", type=str, default=DS_NODASH_YESTERDAY) 20 | @click.option("--channels", type=str, default="nightly") 21 | @click.option( 22 | "--credentials-protocol", type=click.Choice(["file", "s3", "gcs"]), default="s3" 23 | ) 24 | @click.option("--credentials-bucket", type=str, required=False) 25 | @click.option("--credentials-prefix", type=str, required=False) 26 | @click.option("--postgres-db", type=str, required=False) 27 | @click.option("--postgres-user", type=str, 
required=False) 28 | @click.option("--postgres-pass", type=str, required=False) 29 | @click.option("--postgres-host", type=str, required=False) 30 | @click.option("--postgres-ro-host", type=str, required=False) 31 | @click.option("--num-partitions", type=int, default=10000) 32 | @click.option( 33 | "--source", 34 | type=click.Choice(["bigquery", "moztelemetry", "avro"]), 35 | default="moztelemetry", 36 | ) 37 | @click.option( 38 | "--project-id", envvar="PROJECT_ID", type=str, default="moz-fx-data-shared-prod" 39 | ) 40 | @click.option("--dataset-id", type=str, default="payload_bytes_decoded") 41 | @click.option("--avro-prefix", type=str) 42 | def run_aggregator( 43 | date, 44 | channels, 45 | credentials_protocol, 46 | credentials_bucket, 47 | credentials_prefix, 48 | postgres_db, 49 | postgres_user, 50 | postgres_pass, 51 | postgres_host, 52 | postgres_ro_host, 53 | num_partitions, 54 | source, 55 | project_id, 56 | dataset_id, 57 | avro_prefix, 58 | ): 59 | spark = SparkSession.builder.getOrCreate() 60 | 61 | # Mozaggregator expects a series of POSTGRES_* variables in order to connect 62 | # to a db instance; we may pull them into the environment now by reading an 63 | # object from a file system. 64 | def create_path(protocol, bucket, prefix): 65 | mapping = {"file": "file", "s3": "s3a", "gcs": "gs"} 66 | return f"{mapping[protocol]}://{bucket}/{prefix}" 67 | 68 | # priority of reading credentials is options > credentials file > environment 69 | option_credentials = { 70 | "POSTGRES_DB": postgres_db, 71 | "POSTGRES_USER": postgres_user, 72 | "POSTGRES_PASS": postgres_pass, 73 | "POSTGRES_HOST": postgres_host, 74 | "POSTGRES_RO_HOST": postgres_ro_host, 75 | } 76 | if all(option_credentials.values()): 77 | print("reading credentials from options") 78 | environ.update(option_credentials) 79 | elif credentials_bucket and credentials_prefix: 80 | path = create_path(credentials_protocol, credentials_bucket, credentials_prefix) 81 | print(f"reading credentials from {path}") 82 | creds = spark.read.json(path, multiLine=True).first().asDict() 83 | environ.update(creds) 84 | else: 85 | print(f"assuming credentials from the environment") 86 | 87 | # Attempt a database connection now so we can fail fast if credentials are broken. 88 | db._preparedb() 89 | 90 | channels = [channel.strip() for channel in channels.split(",")] 91 | print(f"Running job for {date}") 92 | aggregates = aggregator.aggregate_metrics( 93 | spark.sparkContext, 94 | channels, 95 | date, 96 | num_reducers=num_partitions, 97 | source=source, 98 | project_id=project_id, 99 | dataset_id=dataset_id, 100 | avro_prefix=avro_prefix, 101 | ) 102 | aggregates[0].cache() 103 | aggregates[1].cache() 104 | print(f"Number of build-id aggregates: {aggregates[0].count()}") 105 | print(f"Number of submission date aggregates: {aggregates[1].count()}") 106 | 107 | # Store the results in Postgres. 
108 | db.submit_aggregates(aggregates) 109 | 110 | 111 | @click.command() 112 | @click.option("--date", type=str, default=DS_NODASH_YESTERDAY) 113 | @click.option("--channels", type=str, default="nightly") 114 | @click.option("--output", type=str, default="s3://telemetry-parquet/aggregates_poc/v1") 115 | @click.option("--num-partitions", type=int, default=10000) 116 | @click.option( 117 | "--source", 118 | type=click.Choice(["bigquery", "moztelemetry", "avro"]), 119 | default="moztelemetry", 120 | ) 121 | @click.option( 122 | "--project-id", envvar="PROJECT_ID", type=str, default="moz-fx-data-shared-prod" 123 | ) 124 | @click.option("--dataset-id", type=str, default="payload_bytes_decoded") 125 | @click.option("--avro-prefix", type=str) 126 | def run_parquet( 127 | date, channels, output, num_partitions, source, project_id, dataset_id, avro_prefix 128 | ): 129 | spark = SparkSession.builder.getOrCreate() 130 | channels = [channel.strip() for channel in channels.split(",")] 131 | 132 | print(f"Running job for {date}") 133 | aggregates = parquet.aggregate_metrics( 134 | spark.sparkContext, 135 | channels, 136 | date, 137 | num_reducers=num_partitions, 138 | source=source, 139 | project_id=project_id, 140 | dataset_id=dataset_id, 141 | avro_prefix=avro_prefix, 142 | ) 143 | print(f"Number of build-id aggregates: {aggregates[0].count()}") 144 | print(f"Number of submission date aggregates: {aggregates[1].count()}") 145 | 146 | parquet.write_aggregates(spark, aggregates, output, "append") 147 | 148 | 149 | @click.command() 150 | @click.option("--date", type=str, default=DS_NODASH_YESTERDAY) 151 | @click.option( 152 | "--output", 153 | type=str, 154 | default="s3://{}/{}/{}".format( 155 | mobile.PATH_BUCKET, mobile.PATH_PREFIX, mobile.PATH_VERSION 156 | ), 157 | ) 158 | @click.option("--num-partitions", type=int, default=10000) 159 | @click.option( 160 | "--source", 161 | type=click.Choice(["bigquery", "moztelemetry", "avro"]), 162 | default="moztelemetry", 163 | ) 164 | @click.option( 165 | "--project-id", envvar="PROJECT_ID", type=str, default="moz-fx-data-shared-prod" 166 | ) 167 | @click.option("--dataset-id", type=str, default="payload_bytes_decoded") 168 | @click.option("--avro-prefix", type=str) 169 | def run_mobile( 170 | date, output, num_partitions, source, project_id, dataset_id, avro_prefix 171 | ): 172 | spark = SparkSession.builder.getOrCreate() 173 | spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic") 174 | 175 | print(f"Running job for {date}") 176 | agg_metrics = mobile.aggregate_metrics( 177 | spark.sparkContext, 178 | date, 179 | num_partitions=num_partitions, 180 | source=source, 181 | project_id=project_id, 182 | dataset_id=dataset_id, 183 | avro_prefix=avro_prefix, 184 | ) 185 | aggs = mobile.get_aggregates_dataframe(spark, agg_metrics) 186 | mobile.write_parquet(aggs, output) 187 | 188 | 189 | entry_point.add_command(run_aggregator, "aggregator") 190 | entry_point.add_command(run_mobile, "mobile") 191 | entry_point.add_command(run_parquet, "parquet") 192 | entry_point.add_command(trim_db.main, "trim-database") 193 | 194 | if __name__ == "__main__": 195 | entry_point() 196 | -------------------------------------------------------------------------------- /mozaggregator/config.py: -------------------------------------------------------------------------------- 1 | REGION = "us-west-2" 2 | BUCKET = "telemetry-spark-emr-2" 3 | TIMEOUT = 24 * 60 * 60 4 | MINCONN = 4 5 | MAXCONN = 64 6 | CACHETYPE = "simple" 7 | USE_PRODUCTION_DB = True 8 | 
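# Note: when USE_PRODUCTION_DB is False, db.get_db_connection_string() falls back
# to config.DBNAME, config.DBUSER, config.DBPASS, and config.DBHOST, which are not
# defined in this file and would need to be supplied separately.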
-------------------------------------------------------------------------------- /mozaggregator/db.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | import logging 9 | import os 10 | import string 11 | from collections import defaultdict 12 | from io import StringIO 13 | 14 | import pandas as pd 15 | import psycopg2 16 | import ujson as json 17 | from moztelemetry.histogram import Histogram 18 | 19 | from mozaggregator import sql, config 20 | from mozaggregator.aggregator import SCALAR_MEASURE_MAP 21 | 22 | # Use latest revision, we don't really care about histograms that have 23 | # been removed. This only works though if histogram definitions are 24 | # immutable, which has been the case so far. 25 | _histogram_revision_map = { 26 | "nightly": "https://hg.mozilla.org/mozilla-central/rev/tip", 27 | # Aurora channel (aka Dev Edition) is a repacked beta. 28 | "aurora": "https://hg.mozilla.org/releases/mozilla-beta/rev/tip", 29 | "beta": "https://hg.mozilla.org/releases/mozilla-beta/rev/tip", 30 | "release": "https://hg.mozilla.org/releases/mozilla-release/rev/tip" 31 | } 32 | # NOTE: Using `histogram_revision_map.get(...)` will still return `None`. 33 | # Use dict subscripts when mapping URLs with this dictionary. 34 | histogram_revision_map = defaultdict(lambda: _histogram_revision_map['nightly']) 35 | histogram_revision_map.update(_histogram_revision_map) 36 | 37 | _metric_printable = set(string.ascii_uppercase + string.ascii_lowercase + string.digits + "_-[].") 38 | 39 | db_pass = "POSTGRES_PASS" 40 | db_user = "POSTGRES_USER" 41 | db_host = "POSTGRES_HOST" 42 | db_ro_host = "POSTGRES_RO_HOST" 43 | db_name = "POSTGRES_DB" 44 | 45 | # Configure logging via py4j; see https://www.py4j.org/faq.html 46 | logger = logging.getLogger("py4j") 47 | logger.setLevel(logging.INFO) 48 | logger.addHandler(logging.StreamHandler()) 49 | 50 | 51 | class NoticeLoggingCursor(psycopg2.extensions.cursor): 52 | """ 53 | Cursor subclass that emits Postgres NOTICE messages (db-level logs) to application logs. 54 | 55 | Introduced for bug 1474590. 56 | 57 | See: 58 | http://initd.org/psycopg/docs/advanced.html#subclassing-cursor 59 | https://github.com/zzzeek/sqlalchemy/blob/2f03ec08b5a1c633133c0a38d82b05eb83708f69/lib/sqlalchemy/dialects/postgresql/psycopg2.py#L482-L488 60 | """ 61 | 62 | def execute(self, sql, args=None): 63 | psycopg2.extensions.cursor.execute(self, sql, args) 64 | for notice in self.connection.notices: 65 | level = self.parse_level(notice) 66 | # NOTICE messages have a newline character at the end 67 | logger.log(level, notice.rstrip()) 68 | self.connection.notices[:] = [] 69 | 70 | @staticmethod 71 | def parse_level(notice): 72 | """ 73 | Return a python log level based on the PostgreSQL log level in notice. 
74 | 75 | https://www.postgresql.org/docs/9.4/static/runtime-config-logging.html#RUNTIME-CONFIG-SEVERITY-LEVELS 76 | https://docs.python.org/2/library/logging.html#logging-levels 77 | """ 78 | prefix, _, _ = notice.partition(':') 79 | if prefix in ['PANIC', 'FATAL']: 80 | return logging.CRITICAL 81 | if prefix in ['ERROR']: 82 | return logging.ERROR 83 | if prefix in ['WARNING']: 84 | return logging.WARNING 85 | if prefix in ['INFO', 'NOTICE']: 86 | return logging.INFO 87 | return logging.DEBUG 88 | 89 | 90 | def get_db_connection_string(read_only=False): 91 | if os.getenv("DB_TEST_URL"): 92 | return os.getenv("DB_TEST_URL") 93 | elif config.USE_PRODUCTION_DB: 94 | if (os.getenv(db_pass) and 95 | os.getenv(db_host) and 96 | os.getenv(db_ro_host) and 97 | os.getenv(db_user) and 98 | os.getenv(db_name)): 99 | 100 | rds_pass = os.getenv(db_pass) 101 | rds_host = os.getenv(db_host) 102 | rds_ro_host = os.getenv(db_ro_host) 103 | rds_user = os.getenv(db_user) 104 | rds_db = os.getenv(db_name) 105 | else: 106 | print("One or more POSTGRES env vars not set.") 107 | exit(1) 108 | 109 | rds_endpoint = rds_ro_host if read_only else rds_host 110 | return "dbname={} user={} password={} host={}".format(rds_db, rds_user, rds_pass, rds_endpoint) 111 | else: 112 | return "dbname={} user={} password={} host={}".format(config.DBNAME, config.DBUSER, config.DBPASS, config.DBHOST) 113 | 114 | 115 | def _create_connection(autocommit=True, connection_string_override=None): 116 | if connection_string_override: 117 | conn = psycopg2.connect(connection_string_override) 118 | else: 119 | conn = psycopg2.connect(get_db_connection_string(False)) 120 | 121 | if autocommit: 122 | conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) 123 | 124 | return conn 125 | 126 | 127 | def submit_aggregates(aggregates, dry_run=False): 128 | _preparedb() 129 | 130 | connection_string = get_db_connection_string(False) 131 | 132 | build_id_count = ( 133 | aggregates[0].map(lambda x: (x[0][:4], _aggregate_to_sql(x))) 134 | .filter(lambda x: x[1]) 135 | .reduceByKey(lambda x, y: x + y) 136 | .map(lambda x: _upsert_build_id_aggregates(x[0], x[1], connection_string, dry_run=dry_run)) 137 | .count()) 138 | 139 | submission_date_count = ( 140 | aggregates[1].map(lambda x: (x[0][:3], _aggregate_to_sql(x))) 141 | .filter(lambda x: x[1]) 142 | .reduceByKey(lambda x, y: x + y) 143 | .map(lambda x: _upsert_submission_date_aggregates(x[0], x[1], connection_string, dry_run=dry_run)) 144 | .count()) 145 | 146 | # TODO: Auto-vacuuming might be sufficient. Re-enable if needed. 
147 | # _vacuumdb() 148 | return build_id_count, submission_date_count 149 | 150 | 151 | def _preparedb(): 152 | conn = _create_connection() 153 | cursor = conn.cursor(cursor_factory=NoticeLoggingCursor) 154 | cursor.execute(sql.query) 155 | 156 | 157 | def _get_complete_histogram(channel, metric, values): 158 | revision = histogram_revision_map[channel] 159 | 160 | for prefix, labels in SCALAR_MEASURE_MAP.items(): 161 | if metric.startswith(prefix): 162 | histogram = pd.Series({int(k): v for k, v in values.items()}, index=labels).fillna(0).values 163 | break 164 | else: 165 | histogram = Histogram(metric, {"values": values}, revision=revision).get_value(autocast=False).values 166 | 167 | return list(histogram) 168 | 169 | 170 | def _aggregate_to_sql(aggregate): 171 | result = StringIO() 172 | key, metrics = aggregate 173 | submission_date, channel, version, application, architecture, os, os_version = key[:3] + key[-4:] 174 | dimensions = { 175 | "application": application, 176 | "architecture": architecture, 177 | "os": os, 178 | "osVersion": os_version, 179 | } 180 | 181 | for metric, payload in metrics.items(): 182 | metric, label, process_type = metric 183 | 184 | if not set(metric).issubset(_metric_printable): 185 | continue # Ignore metrics with non printable characters... 186 | 187 | if any(("\u0000" in x for x in [metric, label, application, architecture, os, os_version])): 188 | continue # Ignore dimensions with null character 189 | 190 | try: 191 | # Make sure values fit within a pgsql bigint 192 | # TODO: we should probably log this event 193 | if payload["sum"] > (1 << 63) - 1: 194 | continue 195 | 196 | histogram = _get_complete_histogram(channel, metric, payload["histogram"]) + [payload["sum"], payload["count"]] 197 | histogram = [str(int(x)) for x in histogram] 198 | except KeyError: 199 | # Should eventually log errors 200 | continue 201 | 202 | dimensions["metric"] = metric 203 | dimensions["label"] = label 204 | # Have to special-case content and parent here to maintain backwards compatibility. 205 | dimensions["child"] = {"content": "true", 206 | "parent": "false"}.get(process_type, process_type) 207 | 208 | json_dimensions = json.dumps(dimensions) 209 | # json.dumps takes care of properly escaping the text but a SQL command 210 | # will first be interpreted as a string literal before being executed. 211 | # This doubles the number of backslashes we need. 
212 | json_dimensions = json_dimensions.replace("\\", "\\\\") 213 | 214 | result.write("{}\t{}\n".format(json_dimensions, "{" + ",".join(histogram) + "}")) 215 | return result.getvalue() 216 | 217 | 218 | def _upsert_build_id_aggregates(key, stage_table, connection_string, dry_run=False): 219 | conn = _create_connection(autocommit=False, connection_string_override=connection_string) 220 | cursor = conn.cursor(cursor_factory=NoticeLoggingCursor) 221 | submission_date, channel, version, build_id = key 222 | 223 | # Aggregates with different submisssion_dates write to the same tables, we need a lock 224 | cursor.execute("select lock_transaction(%s, %s, %s, %s)", ("build_id", channel, version, build_id)) 225 | 226 | cursor.execute("select was_processed(%s, %s, %s, %s, %s)", ("build_id", channel, version, build_id, submission_date)) 227 | if cursor.fetchone()[0]: 228 | # This aggregate has already been processed 229 | conn.rollback() 230 | return 231 | 232 | cursor.execute("select create_temporary_table(%s, %s, %s, %s)", ("build_id", channel, version, build_id)) 233 | stage_table_name = cursor.fetchone()[0] 234 | 235 | cursor.copy_from(StringIO(stage_table), stage_table_name, columns=("dimensions", "histogram")) 236 | cursor.execute("select merge_table(%s, %s, %s, %s, %s)", ("build_id", channel, version, build_id, stage_table_name)) 237 | 238 | if dry_run: 239 | conn.rollback() 240 | else: 241 | conn.commit() 242 | 243 | cursor.close() 244 | conn.close() 245 | 246 | 247 | def _upsert_submission_date_aggregates(key, stage_table, connection_string, dry_run=False): 248 | conn = _create_connection(autocommit=False, connection_string_override=connection_string) 249 | cursor = conn.cursor(cursor_factory=NoticeLoggingCursor) 250 | submission_date, channel, version = key 251 | 252 | cursor.execute("select was_processed(%s, %s, %s, %s, %s)", ("submission_date", channel, version, submission_date, submission_date)) 253 | if cursor.fetchone()[0]: 254 | # This aggregate has already been processed 255 | conn.rollback() 256 | return 257 | 258 | cursor.execute("select create_temporary_table(%s, %s, %s, %s)", ("submission_date", channel, version, submission_date)) 259 | stage_table_name = cursor.fetchone()[0] 260 | 261 | cursor.copy_from(StringIO(stage_table), stage_table_name, columns=("dimensions", "histogram")) 262 | cursor.execute("select merge_table(%s, %s, %s, %s, %s)", ("submission_date", channel, version, submission_date, stage_table_name)) 263 | 264 | if dry_run: 265 | conn.rollback() 266 | else: 267 | conn.commit() 268 | 269 | cursor.close() 270 | conn.close() 271 | 272 | 273 | def _vacuumdb(): 274 | conn = _create_connection() 275 | conn.set_isolation_level(0) 276 | cursor = conn.cursor(cursor_factory=NoticeLoggingCursor) 277 | cursor.execute("vacuum") 278 | cursor.close() 279 | conn.close() 280 | 281 | 282 | def clear_db(): 283 | # For tests to clear the database between runs. 284 | conn = _create_connection() 285 | cursor = conn.cursor(cursor_factory=NoticeLoggingCursor) 286 | cursor.execute("select tablename from pg_tables where schemaname='public'") 287 | tables = [r[0] for r in cursor.fetchall()] 288 | for table in tables: 289 | # Note: Intentionally not using parameters here so the table name isn't quoted. 
290 | cursor.execute("DROP TABLE IF EXISTS %s CASCADE" % table) 291 | conn.commit() 292 | cursor.close() 293 | conn.close() 294 | -------------------------------------------------------------------------------- /mozaggregator/mobile.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from os import environ 3 | 4 | import pandas as pd 5 | from pyspark.sql.types import ( 6 | LongType, MapType, StringType, StructField, StructType) 7 | 8 | from mozaggregator.aggregator import ( 9 | SCALAR_MEASURE_MAP, _aggregate_aggregates, _extract_numeric_scalars, 10 | _extract_keyed_numeric_scalars, _extract_main_histograms, 11 | _extract_keyed_histograms) 12 | from mozaggregator.db import histogram_revision_map 13 | from moztelemetry.dataset import Dataset 14 | from moztelemetry.histogram import Histogram 15 | from mozaggregator.bigquery import BigQueryDataset 16 | 17 | import warnings 18 | warnings.filterwarnings("always") 19 | 20 | PATH_BUCKET = environ.get('bucket', 'telemetry-parquet') 21 | PATH_PREFIX = 'mobile_metrics_aggregates' 22 | PATH_VERSION = 'v3' 23 | 24 | SCHEMA = StructType([ 25 | StructField('submission_date', StringType(), False), 26 | StructField('channel', StringType(), False), 27 | StructField('version', StringType(), False), 28 | StructField('build_id', StringType(), True), 29 | StructField('application', StringType(), False), 30 | StructField('architecture', StringType(), False), 31 | StructField('os', StringType(), False), 32 | StructField('os_version', StringType(), False), 33 | StructField('metric', StringType(), False), 34 | StructField('key', StringType(), True), 35 | StructField('process', StringType(), False), 36 | StructField('count', LongType(), False), 37 | StructField('sum', LongType(), False), 38 | StructField('histogram', MapType(StringType(), LongType(), False), False), 39 | ]) 40 | 41 | 42 | def get_aggregates_dataframe(spark, aggregates): 43 | build_id_agg = aggregates.flatMap(lambda row: _explode(row)) 44 | return spark.createDataFrame(build_id_agg, SCHEMA) 45 | 46 | 47 | def write_parquet(df, path, num_partitions=1): 48 | (df.repartitionByRange(num_partitions, "submission_date", "metric", "channel", "version") 49 | .write 50 | .partitionBy("submission_date") 51 | .parquet(path, mode="overwrite")) 52 | 53 | 54 | def _explode(row): 55 | dimensions, metrics = row 56 | 57 | for k, v in metrics.items(): 58 | try: 59 | histogram = _get_complete_histogram(dimensions[1], k[0], v['histogram']) 60 | except KeyError: 61 | continue 62 | yield dimensions + k + (v['count'], v['sum'], histogram) 63 | 64 | 65 | def _get_complete_histogram(channel, metric, values): 66 | revision = histogram_revision_map[channel] 67 | 68 | for prefix, labels in SCALAR_MEASURE_MAP.items(): 69 | if metric.startswith(prefix): 70 | histogram = pd.Series({int(k): v for k, v in values.items()}, 71 | index=labels).fillna(0) 72 | break 73 | else: 74 | histogram = Histogram(metric, {"values": values}, 75 | revision=revision).get_value(autocast=False) 76 | 77 | return {str(k): int(v) for k, v in histogram.to_dict().items()} 78 | 79 | 80 | def _extract_process_scalars(state, metrics, process): 81 | scalars = metrics.get("scalars", {}) 82 | keyed_scalars = metrics.get("keyedScalars", {}) 83 | 84 | if not isinstance(scalars, dict) or not isinstance(keyed_scalars, dict): 85 | raise("Scalar is not a scalar!") 86 | 87 | _extract_numeric_scalars(state, scalars, process) 88 | _extract_keyed_numeric_scalars(state, keyed_scalars, process) 89 | 
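# --- Illustrative sketch, not part of the original module --------------------
# Each aggregate row consumed by _explode above is a (dimensions, metrics)
# pair; the values below are invented for illustration only.
_EXAMPLE_AGGREGATE_ROW = (
    ("20200301", "nightly", "68", "20200301095959", "org.mozilla.fenix",
     "arm64-v8a", "Android", "10"),
    {("GC_MS", None, "content"): {"count": 2, "sum": 42,
                                  "histogram": {"0": 1, "1": 1}}},
)
# _explode flattens such a row into 14-element tuples matching SCHEMA: the 8
# dimension fields, then metric/key/process, then count/sum/histogram.
# ------------------------------------------------------------------------------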
90 | 91 | def _extract_process_histograms(state, metrics, process): 92 | histograms = metrics.get("histograms", {}) 93 | keyedHistograms = metrics.get("keyedHistograms", {}) 94 | 95 | if not isinstance(histograms, dict) or not isinstance(keyedHistograms, dict): 96 | raise Exception("Histogram is not a histogram!") 97 | 98 | _extract_main_histograms(state, histograms, process) 99 | for name, histogram in keyedHistograms.items(): 100 | _extract_keyed_histograms(state, name, histogram, process) 101 | 102 | 103 | def _aggregate_ping(state, metrics): 104 | if not isinstance(metrics, dict): 105 | raise Exception( 106 | "When is a ping not a ping? (%s)" 107 | % type(metrics) 108 | ) 109 | 110 | for process in metrics.keys(): 111 | process_metrics = metrics.get(process, {}) 112 | _extract_process_histograms(state, process_metrics, process) 113 | _extract_process_scalars(state, process_metrics, process) 114 | return state 115 | 116 | 117 | def _aggregate_metrics(pings, num_partitions): 118 | trimmed = ( 119 | pings.map(_map_ping_to_dimensions) 120 | .filter(lambda x: x)) 121 | 122 | return trimmed.aggregateByKey( 123 | defaultdict(dict), _aggregate_ping, _aggregate_aggregates, 124 | num_partitions) 125 | 126 | 127 | def _map_ping_to_dimensions(ping): 128 | try: 129 | submission_date = ping["meta"]["submissionDate"] 130 | channel = ping["meta"]["normalizedChannel"] 131 | version = ping["meta"]["appVersion"] 132 | build_id = ping["meta"]["appBuildId"] 133 | application = ping["meta"]["appName"] 134 | architecture = ping["arch"] 135 | os = ping["os"] 136 | os_version = ping["osversion"] 137 | 138 | # TODO: Validate build_id string against the whitelist from build hub. 139 | 140 | # Note that some dimensions don't vary within a single submission 141 | # (e.g. channel) while some do (e.g. process type). 142 | # Dimensions that don't vary should appear in the submission key, while 143 | # the ones that do vary should appear within the key of a single metric. 144 | return ( 145 | (submission_date, channel, version, build_id, application, 146 | architecture, os, os_version), 147 | ping.get("metrics", {}) 148 | ) 149 | except KeyError: 150 | raise 151 | 152 | 153 | def aggregate_metrics( 154 | sc, 155 | begin, 156 | end=None, 157 | num_partitions=10000, 158 | source="moztelemetry", 159 | project_id=None, 160 | dataset_id=None, 161 | avro_prefix=None, 162 | ): 163 | """ 164 | Returns the build-id and submission date aggregates for a given submission date. 165 | 166 | :param sc: A SparkContext instance 167 | :param begin: A string for the beginning date, in form "YYYYMMDD" 168 | :param end: An optional string for the end date, in form "YYYYMMDD". If 169 | not provided, metrics will only be aggregrated for the date provided 170 | with `begin`. 171 | :param num_partitions: An optional value to be passed to `aggregateByKey`. 
172 | 173 | """ 174 | if end is None: 175 | end = begin 176 | 177 | if source == "bigquery" and project_id and dataset_id: 178 | if end != begin: 179 | raise NotImplementedError( 180 | "processing multiple days of data is not supported for BigQuery source" 181 | ) 182 | dataset = BigQueryDataset() 183 | pings = dataset.load(project_id, dataset_id, "mobile_metrics", begin, doc_version="v1") 184 | elif source == "avro" and avro_prefix: 185 | dataset = BigQueryDataset() 186 | pings = dataset.load_avro( 187 | avro_prefix, 188 | "mobile_metrics", 189 | begin, 190 | doc_version="v1", 191 | ) 192 | else: 193 | pings = (Dataset.from_source('telemetry') 194 | .where(docType='mobile_metrics', 195 | submissionDate=lambda x: begin <= x <= end) 196 | .records(sc)) 197 | return _aggregate_metrics(pings, num_partitions) 198 | -------------------------------------------------------------------------------- /mozaggregator/parquet.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from collections import defaultdict 3 | from os import environ 4 | 5 | import pandas as pd 6 | from pyspark.sql.types import ( 7 | LongType, MapType, StringType, StructField, StructType) 8 | 9 | from mozaggregator.aggregator import ( 10 | SCALAR_MEASURE_MAP, _aggregate_aggregates, _aggregate_ping, 11 | _sample_clients, _trim_payload) 12 | from mozaggregator.db import histogram_revision_map 13 | from moztelemetry.dataset import Dataset 14 | from moztelemetry.histogram import Histogram 15 | 16 | from mozaggregator.bigquery import BigQueryDataset 17 | 18 | 19 | PATH_BUCKET = environ.get('bucket', 'telemetry-parquet') 20 | PATH_PREFIX = 'aggregates_poc' 21 | PATH_VERSION = 'v1' 22 | 23 | DEFAULT_PATH = 's3://{bucket}/{prefix}/{version}'.format( 24 | bucket=PATH_BUCKET, prefix=PATH_PREFIX, version=PATH_VERSION 25 | ) 26 | 27 | SCHEMA = StructType([ 28 | StructField('period', StringType(), False), 29 | StructField('aggregate_type', StringType(), False), 30 | StructField('submission_date', StringType(), False), 31 | StructField('channel', StringType(), False), 32 | StructField('version', StringType(), False), 33 | StructField('build_id', StringType(), True), 34 | StructField('application', StringType(), False), 35 | StructField('architecture', StringType(), False), 36 | StructField('os', StringType(), False), 37 | StructField('os_version', StringType(), False), 38 | StructField('metric', StringType(), False), 39 | StructField('key', StringType(), True), 40 | StructField('process_type', StringType(), False), 41 | StructField('count', LongType(), False), 42 | StructField('sum', LongType(), False), 43 | StructField('histogram', MapType(StringType(), LongType(), False), False), 44 | ]) 45 | BUILD_ID_CUTOFF_UNKNOWN = 45 46 | BUILD_ID_CUTOFFS = { 47 | 'release': 84, 48 | 'esr': 84, 49 | 'beta': 30, 50 | 'aurora': 30, 51 | 'nightly': 10, 52 | } 53 | 54 | 55 | def write_aggregates(sc, aggregates, path=DEFAULT_PATH, mode="append"): 56 | build_id_agg = aggregates[0].flatMap(lambda row: _explode(row, 'build_id')) 57 | submission_date_agg = aggregates[1].flatMap(lambda row: _explode(row, 'submission_date')) 58 | df = sc.createDataFrame(build_id_agg, SCHEMA) 59 | df = df.union(sc.createDataFrame(submission_date_agg, SCHEMA)) 60 | 61 | (df.repartition('metric', 'aggregate_type', 'period') 62 | .sortWithinPartitions(['channel', 'version', 'submission_date']) 63 | .write 64 | .partitionBy('metric', 'aggregate_type', 'period') 65 | .parquet(path, mode=mode)) 66 | 67 | 68 | def _explode(row, 
aggregate_type): 69 | dimensions, metrics = row 70 | 71 | period = _period(dimensions[3] if aggregate_type == 'build_id' else dimensions[0]) 72 | 73 | for k, v in metrics.items(): 74 | try: 75 | histogram = _get_complete_histogram(dimensions[1], k[0], v['histogram']) 76 | except KeyError: 77 | continue 78 | yield (period, aggregate_type,) + dimensions + k + (v['count'], v['sum'], histogram) 79 | 80 | 81 | def _period(date_str): 82 | """ 83 | Returns a period string given a string of "YYYYMMDD". 84 | 85 | Note: Make sure the return value is sortable as expected as a string, as queries 86 | against this will likely use `BETWEEN` or other comparisons. 87 | 88 | """ 89 | return date_str[:6] 90 | 91 | 92 | def _get_complete_histogram(channel, metric, values): 93 | revision = histogram_revision_map[channel] 94 | 95 | for prefix, labels in SCALAR_MEASURE_MAP.items(): 96 | if metric.startswith(prefix): 97 | histogram = pd.Series({int(k): v for k, v in values.items()}, 98 | index=labels).fillna(0) 99 | break 100 | else: 101 | histogram = Histogram(metric, {"values": values}, 102 | revision=revision).get_value(autocast=False) 103 | 104 | return {str(k): int(v) for k, v in histogram.to_dict().items()} 105 | 106 | 107 | def _aggregate_metrics(pings, num_reducers=10000): 108 | trimmed = ( 109 | pings.filter(_sample_clients) 110 | .map(_map_ping_to_dimensions) 111 | .filter(lambda x: x)) 112 | build_id_aggregates = ( 113 | trimmed.aggregateByKey(defaultdict(dict), _aggregate_ping, 114 | _aggregate_aggregates, num_reducers)) 115 | submission_date_aggregates = ( 116 | build_id_aggregates.map(_map_build_id_key_to_submission_date_key) 117 | .reduceByKey(_aggregate_aggregates)) 118 | return build_id_aggregates, submission_date_aggregates 119 | 120 | 121 | def _map_build_id_key_to_submission_date_key(aggregate): 122 | # This skips the build_id column and replaces it with `None`. 123 | return tuple(aggregate[0][:3] + (None,) + aggregate[0][4:]), aggregate[1] 124 | 125 | 126 | def _map_ping_to_dimensions(ping): 127 | try: 128 | submission_date = ping["meta"]["submissionDate"] 129 | channel = ping["application"]["channel"] 130 | version = ping["application"]["version"].split('.')[0] 131 | build_id = ping["application"]["buildId"] 132 | application = ping["application"]["name"] 133 | architecture = ping["application"]["architecture"] 134 | os = ping["environment"]["system"]["os"]["name"] 135 | os_version = ping["environment"]["system"]["os"]["version"] 136 | 137 | if os == "Linux": 138 | os_version = str(os_version)[:3] 139 | 140 | try: 141 | build_id_as_date = datetime.datetime.strptime(build_id, '%Y%m%d%H%M%S') 142 | except ValueError: 143 | return None 144 | 145 | # Remove pings with build_id older than the specified cutoff days. 146 | cutoff = ( 147 | datetime.date.today() - 148 | datetime.timedelta(days=BUILD_ID_CUTOFFS.get(channel, BUILD_ID_CUTOFF_UNKNOWN))) 149 | if build_id_as_date.date() <= cutoff: 150 | return None 151 | 152 | # TODO: Validate build_id string against the whitelist from build hub. 153 | 154 | subset = {} 155 | subset["payload"] = _trim_payload(ping["payload"]) 156 | 157 | # Note that some dimensions don't vary within a single submission 158 | # (e.g. channel) while some do (e.g. process type). 159 | # Dimensions that don't vary should appear in the submission key, while 160 | # the ones that do vary should appear within the key of a single metric. 
161 | return ( 162 | (submission_date, channel, version, build_id, application, 163 | architecture, os, os_version), 164 | subset 165 | ) 166 | except KeyError: 167 | return None 168 | 169 | 170 | def aggregate_metrics(sc, channels, submission_date, main_ping_fraction=1, 171 | num_reducers=10000, source="moztelemetry", 172 | project_id=None, dataset_id=None, 173 | avro_prefix=None): 174 | """ 175 | Returns the build-id and submission date aggregates for a given submission date. 176 | 177 | :param sc: A SparkContext instance 178 | :param channel: Either the name of a channel or a list/tuple of names 179 | :param submission_date: The submission date for which the data will be aggregated 180 | :param main_ping_fraction: An approximative fraction of submissions to consider for aggregation 181 | """ 182 | if not isinstance(channels, (tuple, list)): 183 | channels = [channels] 184 | 185 | if source == "bigquery" and project_id and dataset_id: 186 | dataset = BigQueryDataset() 187 | pings = dataset.load( 188 | project_id, 189 | dataset_id, 190 | "main", 191 | submission_date, 192 | channels, 193 | "normalized_app_name <> 'Fennec'" 194 | ) 195 | elif source == "avro" and avro_prefix: 196 | dataset = BigQueryDataset() 197 | pings = dataset.load_avro( 198 | avro_prefix, 199 | "main", 200 | submission_date, 201 | channels, 202 | "normalized_app_name <> 'Fennec'" 203 | ) 204 | else: 205 | channels = set(channels) 206 | source = 'telemetry' 207 | where = { 208 | 'appUpdateChannel': lambda x: x in channels, 209 | 'submissionDate': submission_date, 210 | 'sourceVersion': '4', 211 | } 212 | pings = (Dataset.from_source(source) 213 | .where(docType='main', 214 | appName=lambda x: x != 'Fennec', 215 | **where) 216 | .records(sc, sample=main_ping_fraction)) 217 | 218 | return _aggregate_metrics(pings, num_reducers) 219 | -------------------------------------------------------------------------------- /mozaggregator/service.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from copy import deepcopy 4 | from functools import wraps 5 | from urllib.parse import urlencode 6 | from urllib.request import urlopen 7 | from expiringdict import ExpiringDict 8 | 9 | import re 10 | import ujson as json 11 | from dockerflow.flask import Dockerflow 12 | from flask import Flask, Response, abort, jsonify, request, _request_ctx_stack 13 | from flask_cache import Cache 14 | from flask_cors import CORS 15 | from flask_sslify import SSLify 16 | from joblib import Parallel, delayed 17 | from moztelemetry.histogram import Histogram 18 | from moztelemetry.scalar import MissingScalarError, Scalar 19 | from psycogreen.gevent import patch_psycopg 20 | from psycopg2.pool import ThreadedConnectionPool 21 | from werkzeug.exceptions import MethodNotAllowed 22 | from jose import jwt 23 | from jose.jwt import JWTError 24 | 25 | from mozaggregator.aggregator import ( 26 | COUNT_HISTOGRAM_LABELS, COUNT_HISTOGRAM_PREFIX, NUMERIC_SCALARS_PREFIX, SCALAR_MEASURE_MAP) 27 | from mozaggregator.db import get_db_connection_string, histogram_revision_map, _preparedb 28 | 29 | pool = None 30 | db_connection_string = get_db_connection_string(read_only=True) 31 | app = Flask(__name__) 32 | dockerflow = Dockerflow(app, version_path='/app') 33 | 34 | app.config.from_pyfile('config.py') 35 | 36 | CORS(app, resources=r'/*', allow_headers=['Authorization', 'Content-Type']) 37 | cache = Cache(app, config={'CACHE_TYPE': app.config["CACHETYPE"]}) 38 | sslify = SSLify(app, permanent=True, 
skips=['__version__', '__heartbeat__', '__lbheartbeat__', 'status']) 39 | 40 | patch_psycopg() 41 | cache.clear() 42 | 43 | # For caching - change this if after backfilling submission_date data 44 | SUBMISSION_DATE_ETAG = 'submission_date_v1' 45 | CLIENT_CACHE_SLACK_SECONDS = 3600 46 | 47 | # If we get a query string not in this set we throw a 405. 48 | ALLOWED_DIMENSIONS = ('application', 'architecture', 'child', 'dates', 'label', 49 | 'metric', 'os', 'osVersion', 'version') 50 | 51 | # Disallowed metrics for serving - matches regex 52 | METRICS_BLACKLIST_RE = [re.compile(x) for x in [ 53 | r"SEARCH_COUNTS", 54 | r"SCALARS_BROWSER\.SEARCH\..+", 55 | ]] 56 | 57 | NON_AUTH_METRICS_BLACKLIST = [ 58 | "SCALARS_TELEMETRY.EVENT_COUNTS", 59 | "SCALARS_TELEMETRY.DYNAMIC_EVENT_COUNTS", 60 | ] 61 | 62 | # Allowed public release metrics 63 | RELEASE_CHANNEL = "release" 64 | ALLOW_ALL_RELEASE_METRICS = os.environ.get("ALLOW_ALL_RELEASE_METRICS", "False") == "True" 65 | PUBLIC_RELEASE_METRICS = {"SCALARS_TELEMETRY.TEST.KEYED_UNSIGNED_INT"} 66 | 67 | # Auth0 Integration 68 | AUTH0_DOMAIN = "auth.mozilla.auth0.com" 69 | AUTH0_API_AUDIENCE = "https://aggregates.telemetry.mozilla.org/" 70 | AUTH0_ALGORITHMS = ["RS256"] 71 | AUTH0_REQUIRED_SCOPE = "read:aggregates" 72 | auth0_cache = ExpiringDict(max_len=1000, max_age_seconds=15 * 60) 73 | 74 | # CSP Headers 75 | DEFAULT_CSP_POLICY = "frame-ancestors 'none'; default-src 'self'" 76 | DEFAULT_X_FRAME_POLICY = "DENY" 77 | 78 | 79 | # Error handler 80 | class AuthError(Exception): 81 | def __init__(self, error, status_code): 82 | self.error = error 83 | self.status_code = status_code 84 | 85 | 86 | @app.errorhandler(AuthError) 87 | def handle_auth_error(ex): 88 | response = jsonify(ex.error) 89 | response.status_code = ex.status_code 90 | return response 91 | 92 | 93 | def get_token_auth_header(): 94 | """Obtains the Access Token from the Authorization Header 95 | """ 96 | auth = request.headers.get("Authorization", None) 97 | if not auth: 98 | raise AuthError({"code": "authorization_header_missing", 99 | "description": 100 | "Authorization header is expected"}, 403) 101 | 102 | parts = auth.split() 103 | 104 | if parts[0] != "Bearer": 105 | raise AuthError({"code": "invalid_header", 106 | "description": 107 | "Authorization header must start with" 108 | " Bearer"}, 403) 109 | elif len(parts) == 1: 110 | raise AuthError({"code": "invalid_header", 111 | "description": "Token not found"}, 403) 112 | elif len(parts) > 2: 113 | raise AuthError({"code": "invalid_header", 114 | "description": 115 | "Authorization header must be" 116 | " Bearer token"}, 403) 117 | 118 | token = parts[1] 119 | return token 120 | 121 | 122 | def check_auth(): 123 | """Determines if the Access Token is valid 124 | """ 125 | domain_base = "https://" + AUTH0_DOMAIN + "/" 126 | token = get_token_auth_header() 127 | 128 | if token in auth0_cache: 129 | return auth0_cache[token] 130 | 131 | # check token validity 132 | jsonurl = urlopen(domain_base + ".well-known/jwks.json") 133 | jwks = json.loads(jsonurl.read()) 134 | 135 | try: 136 | unverified_header = jwt.get_unverified_header(token) 137 | except JWTError: 138 | raise AuthError({"code": "improper_token", 139 | "description": "Token cannot be validated"}, 403) 140 | 141 | rsa_key = {} 142 | for key in jwks["keys"]: 143 | if key["kid"] == unverified_header["kid"]: 144 | rsa_key = { 145 | "kty": key["kty"], 146 | "kid": key["kid"], 147 | "use": key["use"], 148 | "n": key["n"], 149 | "e": key["e"] 150 | } 151 | break 152 | else: 153 | raise 
AuthError({"code": "invalid_header", 154 | "description": "Unable to find appropriate key"}, 403) 155 | 156 | try: 157 | payload = jwt.decode( 158 | token, 159 | rsa_key, 160 | algorithms=AUTH0_ALGORITHMS, 161 | audience=AUTH0_API_AUDIENCE, 162 | issuer=domain_base 163 | ) 164 | except jwt.ExpiredSignatureError: 165 | raise AuthError({"code": "token_expired", 166 | "description": "Token is expired"}, 403) 167 | except jwt.JWTClaimsError: 168 | raise AuthError({"code": "invalid_claims", 169 | "description": 170 | "Incorrect claims," 171 | "please check the audience and issuer"}, 403) 172 | except Exception: 173 | raise AuthError({"code": "invalid_header", 174 | "description": 175 | "Unable to parse authentication" 176 | " token."}, 403) 177 | 178 | # check scope 179 | unverified_claims = jwt.get_unverified_claims(token) 180 | if unverified_claims.get("scope"): 181 | token_scopes = unverified_claims["scope"].split() 182 | for token_scope in token_scopes: 183 | if token_scope == AUTH0_REQUIRED_SCOPE: 184 | _request_ctx_stack.top.current_user = payload 185 | auth0_cache[token] = True 186 | return True 187 | 188 | raise AuthError({"code": "access_denied", 189 | "description": "Access not allowed"}, 403) 190 | 191 | 192 | def is_authed(): 193 | try: 194 | return check_auth() 195 | except AuthError: 196 | return False 197 | 198 | 199 | def get_time_left_in_cache(): 200 | assert app.config["CACHETYPE"] == "simple", "Only simple caches can be used with get_time_left_in_cache" 201 | 202 | # our cache (a flask cache), contains a cache (werkzeug SimpleCache), which contains a _cache (dict) 203 | # see https://github.com/pallets/werkzeug/blob/master/werkzeug/contrib/cache.py#L307 204 | expires_ts, _ = cache.cache._cache.get((request.url, False), (0, "")) 205 | 206 | # get seconds until expiry 207 | expires = int(expires_ts - time.time()) 208 | 209 | if expires <= 0: 210 | return 0 211 | 212 | # add some slack 213 | expires += CLIENT_CACHE_SLACK_SECONDS 214 | return expires 215 | 216 | 217 | def add_cache_header(add_etag=False): 218 | def decorator_func(f): 219 | @wraps(f) 220 | def decorated_request(*args, **kwargs): 221 | response = f(*args, **kwargs) 222 | 223 | prefix = kwargs.get('prefix') 224 | if prefix == 'submission_date' and add_etag: 225 | response.cache_control.max_age = app.config["TIMEOUT"] 226 | response.set_etag(SUBMISSION_DATE_ETAG) 227 | else: 228 | response.cache_control.max_age = get_time_left_in_cache() 229 | 230 | return response 231 | return decorated_request 232 | return decorator_func 233 | 234 | 235 | def check_etag(f): 236 | @wraps(f) 237 | def decorated_request(*args, **kwargs): 238 | etag = request.headers.get('If-None-Match') 239 | prefix = kwargs.get('prefix') 240 | if prefix == 'submission_date' and etag == SUBMISSION_DATE_ETAG: 241 | return Response(status=304) 242 | return f(*args, **kwargs) 243 | return decorated_request 244 | 245 | 246 | def cache_request(f): 247 | @wraps(f) 248 | def decorated_request(*args, **kwargs): 249 | authed = is_authed() 250 | cache_key = (request.url, authed) 251 | rv = cache.get(cache_key) 252 | 253 | if rv is None: 254 | rv = f(*args, **kwargs) 255 | cache.set(cache_key, rv, timeout=app.config["TIMEOUT"]) 256 | return rv 257 | else: 258 | return rv 259 | return decorated_request 260 | 261 | 262 | def create_pool(): 263 | global pool 264 | if pool is None: 265 | _preparedb() 266 | pool = ThreadedConnectionPool( 267 | app.config["MINCONN"], 268 | app.config["MAXCONN"], 269 | dsn=db_connection_string) 270 | return pool 271 | 272 | 273 | def 
execute_query(query, params=tuple()): 274 | pool = create_pool() 275 | db = pool.getconn() 276 | 277 | try: 278 | cursor = db.cursor() 279 | cursor.execute(query, params) 280 | return cursor.fetchall() 281 | except: # noqa 282 | abort(404) 283 | finally: 284 | pool.putconn(db) 285 | 286 | 287 | @app.after_request 288 | def apply_headers(response): 289 | response.headers["X-Frame-Options"] = DEFAULT_X_FRAME_POLICY 290 | response.headers["Content-Security-Policy"] = DEFAULT_CSP_POLICY 291 | response.headers["X-Content-Security-Policy"] = DEFAULT_CSP_POLICY 292 | return response 293 | 294 | 295 | @app.route('/status') 296 | def status(): 297 | return "OK" 298 | 299 | 300 | @app.route('/authed') 301 | def authed(): 302 | check_auth() 303 | return "Authenticated" 304 | 305 | 306 | @app.route('/clear-cache') 307 | def clear_cache(): 308 | check_auth() 309 | cache.clear() 310 | return "Cache Cleared" 311 | 312 | 313 | @app.route('/aggregates_by//channels/') 314 | @add_cache_header() 315 | @cache_request 316 | def get_channels(prefix): 317 | channels = execute_query("select * from list_channels(%s)", (prefix, )) 318 | channels = [channel[0] for channel in channels] 319 | 320 | if not is_authed(): 321 | channels = [c for c in channels if c != RELEASE_CHANNEL] 322 | 323 | return Response(json.dumps(channels), mimetype="application/json") 324 | 325 | 326 | @app.route('/aggregates_by//channels//dates/') 327 | @add_cache_header() 328 | @cache_request 329 | def get_dates(prefix, channel): 330 | if channel == RELEASE_CHANNEL: 331 | check_auth() 332 | result = execute_query("select * from list_buildids(%s, %s)", (prefix, channel)) 333 | pretty_result = [{"version": r[0], "date": r[1]} for r in result] 334 | return Response(json.dumps(pretty_result), mimetype="application/json") 335 | 336 | 337 | def matches_blacklist(string): 338 | return any((regex.match(string) for regex in METRICS_BLACKLIST_RE)) 339 | 340 | 341 | def get_filter_options(authed, channel, version, filters, filter): 342 | try: 343 | options = execute_query("select * from get_filter_options(%s, %s, %s)", (channel, version, filter)) 344 | if not options or (len(options) == 1 and options[0][0] is None): 345 | return 346 | 347 | pretty_opts = [] 348 | for option in options: 349 | option = option[0] 350 | if filter == "metric": 351 | if option.startswith(COUNT_HISTOGRAM_PREFIX): 352 | option = option[len(COUNT_HISTOGRAM_PREFIX) + 1:] 353 | if option in NON_AUTH_METRICS_BLACKLIST and authed: 354 | pretty_opts.append(option) 355 | elif not matches_blacklist(option) and option not in NON_AUTH_METRICS_BLACKLIST: 356 | pretty_opts.append(option) 357 | else: 358 | pretty_opts.append(option) 359 | 360 | if filter == "child": 361 | # In the db, child is true, false, and other things. 362 | # We want to have content = true and parent = false. 
363 | pretty_opts = ["content" if x == "true" 364 | else "parent" if x == "false" 365 | else x for x in pretty_opts] 366 | 367 | filters[filter] = pretty_opts 368 | except: # noqa 369 | pass 370 | 371 | 372 | @app.route('/filters/', methods=['GET']) 373 | @add_cache_header() 374 | @cache_request 375 | def get_filters_options(): 376 | channel = request.args.get("channel") 377 | version = request.args.get("version") 378 | 379 | if not channel or not version: 380 | abort(404) 381 | 382 | if channel == RELEASE_CHANNEL: 383 | authed = check_auth() 384 | else: 385 | authed = is_authed() 386 | 387 | filters = {} 388 | dimensions = ["metric", "application", "architecture", "os", "child"] 389 | 390 | Parallel(n_jobs=len(dimensions), backend="threading")( 391 | delayed(get_filter_options)(authed, channel, version, filters, f) 392 | for f in dimensions 393 | ) 394 | 395 | if not filters: 396 | abort(404) 397 | 398 | return Response(json.dumps(filters), mimetype="application/json") 399 | 400 | 401 | def _get_description(channel, prefix, metric): 402 | if prefix != NUMERIC_SCALARS_PREFIX: 403 | return '' 404 | 405 | metric = metric.replace(prefix + '_', '').lower() 406 | return Scalar(metric, 0, channel=channel).definition.description 407 | 408 | 409 | def _allow_metric(channel, metric): 410 | if matches_blacklist(metric): 411 | return False 412 | elif channel == RELEASE_CHANNEL: 413 | if ALLOW_ALL_RELEASE_METRICS: 414 | return True 415 | elif metric in PUBLIC_RELEASE_METRICS: 416 | return True 417 | else: 418 | return check_auth() 419 | elif channel != RELEASE_CHANNEL: 420 | if metric in NON_AUTH_METRICS_BLACKLIST: 421 | return check_auth() 422 | else: 423 | return True 424 | 425 | 426 | @app.route('/aggregates_by//channels//', methods=['GET']) 427 | @add_cache_header(True) 428 | @check_etag 429 | @cache_request 430 | def get_dates_metrics(prefix, channel): 431 | mapping = {"true": True, "false": False} 432 | dimensions = {k: mapping.get(v, v) for k, v in request.args.items()} 433 | 434 | extra_dimensions = dimensions.keys() - ALLOWED_DIMENSIONS 435 | if extra_dimensions: 436 | # We received an unsupported query string to filter by, return 405. 437 | valid_url = '{}?{}'.format( 438 | request.path, 439 | urlencode({k: v for k, v in dimensions.items() if k in ALLOWED_DIMENSIONS})) 440 | raise MethodNotAllowed(valid_methods=[valid_url]) 441 | 442 | if 'child' in dimensions: 443 | # Process types in the db are true/false, not content/process 444 | new_process_map = {"content": True, "parent": False} 445 | dimensions['child'] = new_process_map.get(dimensions['child'], dimensions['child']) 446 | 447 | # Get dates 448 | dates = dimensions.pop('dates', '').split(',') 449 | version = dimensions.pop('version', None) 450 | metric = dimensions.get('metric') 451 | 452 | if not dates or not version or not metric: 453 | abort(404, description="Missing date or version or metric. 
All three are required.") 454 | 455 | if not _allow_metric(channel, metric): 456 | abort(404, description="This metric is not allowed to be served.") 457 | 458 | # Get bucket labels 459 | for _prefix, _labels in SCALAR_MEASURE_MAP.items(): 460 | if metric.startswith(_prefix) and _prefix != COUNT_HISTOGRAM_PREFIX: 461 | labels = _labels 462 | kind = "exponential" 463 | try: 464 | description = _get_description(channel, _prefix, metric) 465 | except MissingScalarError: 466 | abort(404, description="Cannot find this scalar definition.") 467 | break 468 | else: 469 | revision = histogram_revision_map[channel] 470 | try: 471 | definition = Histogram(metric, {"values": {}}, revision=revision) 472 | except KeyError: 473 | # Couldn't find the histogram definition 474 | abort(404, description="Cannot find this histogram definition.") 475 | 476 | kind = definition.kind 477 | description = definition.definition.description() 478 | 479 | if kind == "count": 480 | labels = COUNT_HISTOGRAM_LABELS 481 | dimensions["metric"] = "{}_{}".format(COUNT_HISTOGRAM_PREFIX, metric) 482 | elif kind == "flag": 483 | labels = [0, 1] 484 | else: 485 | labels = list(definition.get_value().keys()) 486 | 487 | altered_dimensions = deepcopy(dimensions) 488 | if 'child' in dimensions: 489 | # Bug 1339139 - when adding gpu processes, child process went from True/False to "true"/"false"/"gpu" 490 | reverse_map = {True: 'true', False: 'false'} 491 | altered_dimensions['child'] = reverse_map.get(altered_dimensions['child'], altered_dimensions['child']) 492 | 493 | # Fetch metrics 494 | if metric.startswith("USE_COUNTER2_"): 495 | # Bug 1412382 - Use Counters need to be composed from reported True 496 | # values and False values supplied by *CONTENT_DOCUMENTS_DESTROYED. 497 | denominator = "TOP_LEVEL_CONTENT_DOCUMENTS_DESTROYED" 498 | if metric.endswith("_DOCUMENT"): 499 | denominator = "CONTENT_DOCUMENTS_DESTROYED" 500 | denominator = "{}_{}".format(COUNT_HISTOGRAM_PREFIX, denominator) 501 | denominator_dimensions = deepcopy(dimensions) 502 | denominator_dimensions["metric"] = denominator 503 | denominator_new_dimensions = deepcopy(altered_dimensions) 504 | denominator_new_dimensions["metric"] = denominator 505 | result = execute_query( 506 | "select * from batched_get_use_counter(%s, %s, %s, %s, %s, %s, %s, %s)", ( 507 | prefix, channel, version, dates, json.dumps(denominator_dimensions), 508 | json.dumps(denominator_new_dimensions), json.dumps(dimensions), json.dumps(altered_dimensions))) 509 | else: 510 | result = execute_query( 511 | "select * from batched_get_metric(%s, %s, %s, %s, %s, %s)", ( 512 | prefix, channel, version, dates, json.dumps(dimensions), json.dumps(altered_dimensions))) 513 | 514 | if not result: 515 | abort(404, description="No data found for this metric.") 516 | 517 | pretty_result = {"data": [], "buckets": labels, "kind": kind, "description": description} 518 | for row in result: 519 | date = row[0] 520 | label = row[1] 521 | histogram = row[2][:-2] 522 | sum = row[2][-2] 523 | count = row[2][-1] 524 | pretty_result["data"].append({"date": date, "label": label, "histogram": histogram, "count": count, "sum": sum}) 525 | 526 | return Response(json.dumps(pretty_result), mimetype="application/json") 527 | 528 | 529 | if __name__ == "__main__": 530 | app.run("0.0.0.0", debug=True, threaded=True) 531 | -------------------------------------------------------------------------------- /mozaggregator/sql.py: -------------------------------------------------------------------------------- 1 | query = r""" 2 | 
create or replace function aggregate_table_name(prefix text, channel text, version text, date text) returns text as $$ 3 | begin 4 | return format('%s_%s_%s_%s', prefix, channel, version, date); 5 | end 6 | $$ language plpgsql strict immutable; 7 | 8 | create or replace function cast_array_to_bigint_safe(input numeric[]) returns bigint[] as $$ 9 | declare 10 | output numeric[]; 11 | begin 12 | output := input; 13 | if (select min(x) from unnest(output) as x) < -9223372036854775808 then 14 | RAISE WARNING 'Truncating negative value(s) too large for bigint in array: %', output; 15 | output := (select ARRAY(select GREATEST(x, -9223372036854775808) from unnest(output) as x)); 16 | end if; 17 | if (select max(x) from unnest(output) as x) > 9223372036854775807 then 18 | RAISE WARNING 'Truncating positive value(s) too large for bigint in array: %', output; 19 | output := (select ARRAY(select LEAST(x, 9223372036854775807) from unnest(output) as x)); 20 | end if; 21 | return output; 22 | end 23 | $$ language plpgsql strict immutable; 24 | 25 | create or replace function aggregate_arrays(acc bigint[], x bigint[]) returns bigint[] as $$ 26 | begin 27 | return cast_array_to_bigint_safe( 28 | (select array( 29 | select sum(elem) 30 | from (values (1, acc), (2, x)) as t(idx, arr) 31 | , unnest(t.arr) with ordinality x(elem, rn) 32 | group by rn 33 | order by rn))); 34 | end 35 | $$ language plpgsql strict immutable; 36 | 37 | create or replace function aggregate_histogram_arrays(acc bigint[], x bigint[]) returns bigint[] as $$ 38 | begin 39 | return (select ( 40 | aggregate_arrays(x[1 : GREATEST(array_length(x, 1) - 2, 1)], 41 | acc[1 : GREATEST(array_length(acc, 1) - 2, 1)]) 42 | || 43 | aggregate_arrays(x[GREATEST(array_length(x, 1) - 1, 1) : GREATEST(array_length(x, 1), 1)], 44 | acc[GREATEST(array_length(acc, 1) - 1, 1) : GREATEST(array_length(acc, 1), 1)]) 45 | )); 46 | end 47 | $$ language plpgsql strict immutable; 48 | 49 | drop aggregate if exists aggregate_histograms(bigint[]); 50 | create aggregate aggregate_histograms (bigint[]) ( 51 | sfunc = aggregate_histogram_arrays, stype = bigint[], initcond = '{}' 52 | ); 53 | 54 | 55 | create or replace function merge_table(prefix text, channel text, version text, date text, stage_table regclass) returns void as $$ 56 | declare 57 | tablename text; 58 | table_exists bool; 59 | begin 60 | tablename := aggregate_table_name(prefix, channel, version, date); 61 | -- Check if table exists and if not create one 62 | table_exists := (select exists (select 1 from information_schema.tables where table_schema = 'public' and table_name = tablename)); 63 | 64 | if not table_exists then 65 | execute format('create table %s as table %s', tablename, stage_table); 66 | execute format('create index on %s using GIN (dimensions jsonb_path_ops)', tablename); 67 | perform update_filter_options(channel, version, stage_table); 68 | return; 69 | end if; 70 | 71 | -- Update existing tuples and delete matching rows from the staging table 72 | execute 'with merge as (update ' || tablename || ' as dest 73 | set histogram = aggregate_histogram_arrays(dest.histogram, src.histogram) 74 | from ' || stage_table || ' as src 75 | where dest.dimensions = src.dimensions 76 | returning dest.*) 77 | delete from ' || stage_table || ' as stage 78 | using merge 79 | where stage.dimensions = merge.dimensions'; 80 | 81 | -- Insert new tuples 82 | execute 'insert into ' || tablename || ' (dimensions, histogram) 83 | select dimensions, histogram from ' || stage_table; 84 | perform 
update_filter_options(channel, version, stage_table); 85 | end 86 | $$ language plpgsql strict; 87 | 88 | 89 | create or replace function lock_transaction(prefix text, channel text, version text, date text) returns bigint as $$ 90 | declare 91 | table_name text; 92 | lock bigint; 93 | begin 94 | table_name := aggregate_table_name(prefix, channel, version, date); 95 | lock := (select h_bigint(table_name)); 96 | execute 'select pg_advisory_xact_lock($1)' using lock; 97 | return lock; 98 | end 99 | $$ language plpgsql strict; 100 | 101 | 102 | create or replace function h_bigint(text) returns bigint as $$ 103 | select ('x'||substr(md5($1),1,16))::bit(64)::bigint; 104 | $$ language sql; 105 | 106 | 107 | create or replace function create_temporary_table(prefix text, channel text, version text, date text) returns text as $$ 108 | declare 109 | tablename text; 110 | begin 111 | tablename := aggregate_table_name('staging_' || prefix, channel, version, date); 112 | execute 'create temporary table ' || tablename || ' (dimensions jsonb, histogram bigint[]) on commit drop'; 113 | return tablename; 114 | end 115 | $$ language plpgsql strict; 116 | 117 | 118 | create or replace function was_processed(prefix text, channel text, version text, date text, submission_date text) returns boolean as $$ 119 | declare 120 | table_name text; 121 | was_processed boolean; 122 | begin 123 | table_name := aggregate_table_name(prefix, channel, version, date); 124 | select exists(select 1 125 | from table_update_dates as t 126 | where t.tablename = table_name and submission_date = any(t.submission_dates)) 127 | into was_processed; 128 | 129 | if (was_processed) then 130 | return was_processed; 131 | end if; 132 | 133 | with upsert as (update table_update_dates 134 | set submission_dates = submission_dates || submission_date 135 | where tablename = table_name 136 | returning *) 137 | insert into table_update_dates 138 | select * from (values (table_name, array[submission_date])) as t 139 | where not exists(select 1 from upsert); 140 | 141 | return was_processed; 142 | end 143 | $$ language plpgsql strict; 144 | 145 | 146 | -- We have to explicitly drop the old get_metric function or else the new one will not be backwards compatible 147 | drop function if exists get_metric(text, text, text, text, jsonb); 148 | 149 | -- The default value for new_dimensions has to be something that will never match 150 | create or replace function get_metric(prefix text, channel text, version text, date text, dimensions jsonb, new_dimensions jsonb DEFAULT '{"metric":"METRIC???"}') returns table(label text, histogram bigint[]) as $$ 151 | declare 152 | tablename text; 153 | begin 154 | if not dimensions ? 'metric' then 155 | raise exception 'Missing metric field!'; 156 | end if; 157 | 158 | if not new_dimensions ? 
'metric' then 159 | raise exception 'Missing metric field!'; 160 | end if; 161 | 162 | tablename := aggregate_table_name(prefix, channel, version, date); 163 | 164 | return query execute 165 | E'select dimensions->>\'label\', aggregate_histograms(histogram) 166 | from ' || tablename || E' 167 | where dimensions @> $1 168 | or dimensions @> $2 169 | group by dimensions->>\'label\'' 170 | using dimensions, new_dimensions; 171 | end 172 | $$ language plpgsql strict stable; 173 | 174 | 175 | drop type if exists metric_type; 176 | create type metric_type AS (label text, histogram bigint[]); 177 | 178 | drop function if exists batched_get_metric(text, text, text, text[], jsonb); 179 | create or replace function batched_get_metric(prefix text, channel text, version text, dates text[], dimensions jsonb, new_dimensions jsonb DEFAULT '{"metric":"METRIC???"}') returns table(date text, label text, histogram bigint[]) as $$ 180 | begin 181 | return query select t.date, (get_metric(prefix, channel, version, t.date, dimensions, new_dimensions)::text::metric_type).* 182 | from (select unnest(dates)) as t(date); 183 | end 184 | $$ language plpgsql strict; 185 | 186 | create or replace function batched_get_use_counter(prefix text, channel text, version text, dates text[], denominator_dimensions jsonb, denominator_new_dimensions jsonb, dimensions jsonb, new_dimensions jsonb DEFAULT '{"metric":"METRIC???"}') returns table(date text, label text, histogram bigint[]) as $$ 187 | begin 188 | return query 189 | select t2.date, 190 | coalesce(t1.label, ''), 191 | case 192 | when t1.histogram is null then ARRAY[ 193 | t2.histogram[array_length(t2.histogram, 1) - 1], 194 | 0, 195 | 0, 196 | 0, 197 | t2.histogram[array_length(t2.histogram, 1)]] 198 | when t2.histogram is null then t1.histogram 199 | else ARRAY[ 200 | t2.histogram[array_length(t2.histogram, 1) - 1] - t1.histogram[2] - t1.histogram[3], 201 | t1.histogram[2], 202 | t1.histogram[3], 203 | t1.histogram[4], 204 | t1.histogram[5]] 205 | end 206 | from batched_get_metric(prefix, channel, version, dates, dimensions, new_dimensions) t1 207 | full outer join batched_get_metric(prefix, channel, version, dates, denominator_dimensions, denominator_new_dimensions) t2 208 | on t1.date = t2.date; 209 | end 210 | $$ language plpgsql strict; 211 | 212 | create or replace function list_buildids(prefix text, channel text) returns table(version text, buildid text) as $$ 213 | begin 214 | return query execute 215 | E'select t.matches[2], t.matches[3] from 216 | (select regexp_matches(table_name::text, $3) 217 | from information_schema.tables 218 | where table_schema=\'public\' and table_type=\'BASE TABLE\' and table_name like $1 || $2 219 | order by table_name desc) as t (matches)' 220 | using prefix, '_' || channel || '%', '^' || prefix || '_([^_]+)_([0-9]+)_([0-9]+)$'; 221 | end 222 | $$ language plpgsql strict; 223 | 224 | 225 | create or replace function list_channels(prefix text) returns table(channel text) as $$ 226 | begin 227 | return query execute 228 | E'select distinct t.matches[1] from 229 | (select regexp_matches(table_name::text, $1 || \'_([^_]+)_([0-9]+)_([0-9]+)\') 230 | from information_schema.tables 231 | where table_schema=\'public\' and table_type=\'BASE TABLE\' 232 | order by table_name desc) as t (matches)' 233 | using prefix; 234 | end 235 | $$ language plpgsql strict; 236 | 237 | 238 | create or replace function get_dimension_values(filter text, table_name regclass) returns table(option text) as $$ 239 | declare 240 | begin 241 | -- TODO: os & 
osVersion should be merged into a single dimension... 242 | if (filter = 'osVersion') then 243 | return query execute 244 | E'select concat(t.os, \',\', t.version) 245 | from (select distinct dimensions->>\'os\', dimensions->>\'osVersion\' 246 | from ' || table_name || E') as t(os, version)'; 247 | else 248 | return query execute 249 | E'select distinct dimensions->>\'' || filter || E'\' from ' || table_name; 250 | end if; 251 | end 252 | $$ language plpgsql strict stable; 253 | 254 | 255 | create or replace function update_filter_options(channel text, version text, stage_table regclass) returns void as $$ 256 | declare 257 | table_match text; 258 | dimension_sample jsonb; 259 | dimension text; 260 | begin 261 | table_match := aggregate_table_name('*', channel, version, '*'); 262 | 263 | execute 'select dimensions 264 | from ' || stage_table || ' 265 | limit 1' 266 | into dimension_sample; 267 | 268 | perform lock_transaction('*', channel, version, '*'); 269 | 270 | for dimension in select jsonb_object_keys(dimension_sample) 271 | loop 272 | if (dimension = 'label' or dimension = 'os') then 273 | continue; 274 | end if; 275 | 276 | execute E'with curr as (select value 277 | from filter_options 278 | where table_match = $1 and filter = $2), 279 | new as (select get_dimension_values($2, $3)), 280 | diff as (select * from new except select * from curr) 281 | insert into filter_options (table_match, filter, value) 282 | select $1, $2, t.value 283 | from diff as t(value)' 284 | using table_match, dimension, stage_table; 285 | end loop; 286 | end 287 | $$ language plpgsql strict; 288 | 289 | 290 | create or replace function get_filter_options(channel text, version text, dimension text) returns table(option text) as $$ 291 | declare 292 | match_table text; 293 | begin 294 | match_table := aggregate_table_name('*', channel, version, '*'); 295 | 296 | if dimension = 'os' then 297 | dimension := 'osVersion'; 298 | end if; 299 | 300 | return query 301 | select value 302 | from filter_options 303 | where table_match = match_table and filter = dimension; 304 | end 305 | $$ language plpgsql strict; 306 | 307 | 308 | create or replace function create_tables() returns void as $$ 309 | declare 310 | table_exists boolean; 311 | begin 312 | table_exists := (select exists (select 1 from information_schema.tables where table_schema = 'public' and table_name = 'table_update_dates')); 313 | if (not table_exists) then 314 | create table table_update_dates (tablename text primary key, submission_dates text[]); 315 | create index on table_update_dates (tablename); 316 | end if; 317 | 318 | table_exists := (select exists (select 1 from information_schema.tables where table_schema = 'public' and table_name = 'filter_options')); 319 | if (not table_exists) then 320 | create table filter_options (id serial primary key, table_match text not null, filter text not null, value text not null); 321 | create index on filter_options (table_match); 322 | end if; 323 | end 324 | $$ language plpgsql strict; 325 | 326 | 327 | select create_tables(); 328 | """ 329 | -------------------------------------------------------------------------------- /mozaggregator/trim_db.py: -------------------------------------------------------------------------------- 1 | import click 2 | import psycopg2 3 | 4 | from typing import Set, List, Tuple 5 | from datetime import datetime, timedelta 6 | 7 | DS_NODASH = "%Y%m%d" 8 | 9 | 10 | def extract_ds_nodash(tablename): 11 | return tablename.split("_")[-1] 12 | 13 | 14 | def retention_date_range(base: 
str, period: int = 365, buffer: int = 7) -> Set[str]: 15 | """Create a set of dates between [base-period, base]. The date format is ds_nodash.""" 16 | base = datetime.strptime(base, DS_NODASH) 17 | num_days = period + buffer 18 | dates = set( 19 | [ 20 | datetime.strftime(base - timedelta(period) + timedelta(x), DS_NODASH) 21 | for x in range(num_days) 22 | ] 23 | ) 24 | return dates 25 | 26 | 27 | def create_connection(dbname, user, password, host): 28 | conn_str = f"dbname={dbname} user={user} password={password} host={host}" 29 | conn = psycopg2.connect(conn_str) 30 | return conn 31 | 32 | 33 | def display_summary(action: str, table_set: Set[str], tables_to_show: int = 10): 34 | tables = list(sorted(table_set, key=extract_ds_nodash)) 35 | print(f"To {action} {len(tables)} tables...") 36 | print("-" * 40) 37 | if len(tables) > tables_to_show: 38 | show = tables[: tables_to_show // 2] + ["..."] + tables[-tables_to_show // 2 :] 39 | else: 40 | show = tables 41 | print("\n".join(show)) 42 | print("=" * 40) 43 | 44 | 45 | def partition_set_by_filter( 46 | full_set: Set[str], retain_suffix_set: Set[str] 47 | ) -> Tuple[Set[str], Set[str]]: 48 | retain_set = { 49 | table for table in full_set if extract_ds_nodash(table) in retain_suffix_set 50 | } 51 | trim_set = full_set - retain_set 52 | return retain_set, trim_set 53 | 54 | 55 | def query_submission_date( 56 | cursor, retain_suffix_set: Set[str] 57 | ) -> Tuple[Set[str], Set[str]]: 58 | submission_date_query = """ 59 | select tablename 60 | from pg_catalog.pg_tables 61 | where schemaname='public' and tablename like 'submission_date%'; 62 | """ 63 | cursor.execute(submission_date_query) 64 | submission_retain, submission_trim = partition_set_by_filter( 65 | {row[0] for row in cursor.fetchall()}, retain_suffix_set 66 | ) 67 | display_summary("retain", submission_retain) 68 | display_summary("trim", submission_trim) 69 | return submission_retain, submission_trim 70 | 71 | 72 | def query_build_id(cursor, retain_suffix_set: Set[str]) -> Tuple[Set[str], Set[str]]: 73 | build_id_query = """ 74 | select tablename 75 | from pg_catalog.pg_tables 76 | where schemaname='public' and tablename like 'build_id%'; 77 | """ 78 | cursor.execute(build_id_query) 79 | build_id_retain, build_id_trim = partition_set_by_filter( 80 | {row[0] for row in cursor.fetchall()}, retain_suffix_set 81 | ) 82 | display_summary("retain", build_id_retain) 83 | display_summary("trim", build_id_trim) 84 | return build_id_retain, build_id_trim 85 | 86 | 87 | def trim_tables(conn, trim_set: Set[str], batch_size=100): 88 | cursor = conn.cursor() 89 | trim_list = list(trim_set) 90 | num_batches = (len(trim_list) // batch_size) + 1 91 | for i in range(num_batches): 92 | trim_subset = trim_list[i * batch_size : (i + 1) * batch_size] 93 | if not trim_subset: 94 | continue 95 | print(f"dropping {i+1} out of {num_batches} batches in groups of {batch_size}") 96 | tables = ", ".join(trim_subset) 97 | query = f"drop table {tables};" 98 | cursor.execute(query) 99 | conn.commit() 100 | 101 | 102 | @click.command() 103 | @click.option( 104 | "--base-date", type=str, default=datetime.strftime(datetime.today(), DS_NODASH) 105 | ) 106 | @click.option("--retention-period", type=int, default=365 * 2) 107 | @click.option("--dry-run/--no-dry-run", default=True) 108 | @click.option("--postgres-db", type=str, envvar="POSTGRES_DB", default="telemetry") 109 | @click.option("--postgres-user", type=str, envvar="POSTGRES_USER", default="root") 110 | @click.option("--postgres-pass", type=str, 
envvar="POSTGRES_PASS", required=True) 111 | @click.option("--postgres-host", type=str, envvar="POSTGRES_HOST", required=True) 112 | def main( 113 | base_date, 114 | retention_period, 115 | dry_run, 116 | postgres_db, 117 | postgres_user, 118 | postgres_pass, 119 | postgres_host, 120 | ): 121 | conn = create_connection(postgres_db, postgres_user, postgres_pass, postgres_host) 122 | cursor = conn.cursor() 123 | 124 | retain_suffix_set = retention_date_range(base_date, retention_period) 125 | submission_retain, submission_trim = query_submission_date( 126 | cursor, retain_suffix_set 127 | ) 128 | build_id_retain, build_id_trim = query_build_id(cursor, retain_suffix_set) 129 | 130 | if not dry_run: 131 | print("Dropping tables...") 132 | trim_tables(conn, submission_trim | build_id_trim) 133 | else: 134 | print("Dry run enabled, not dropping tables...") 135 | conn.close() 136 | 137 | 138 | if __name__ == "__main__": 139 | main() 140 | -------------------------------------------------------------------------------- /queries/drop_non_quantum.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE FUNCTION try_cast_int(p_in TEXT, p_default INT default null) 2 | RETURNS INT 3 | LANGUAGE plpgsql; 4 | AS $$ 5 | BEGIN 6 | BEGIN 7 | RETURN $1::INT; 8 | EXCEPTION 9 | WHEN others THEN 10 | RETURN p_default; 11 | END; 12 | END; 13 | $$; 14 | 15 | CREATE OR REPLACE FUNCTION reverse(TEXT) 16 | RETURNS TEXT 17 | AS $$ 18 | SELECT array_to_string(ARRAY( 19 | SELECT SUBSTRING($1, s.i,1) FROM generate_series(LENGTH($1), 1, -1) AS s(i) 20 | ), ''); 21 | $$ LANGUAGE SQL IMMUTABLE STRICT; 22 | 23 | CREATE OR REPLACE FUNCTION drop_tables_before_date(IN _schema TEXT, IN _min_date INT) 24 | RETURNS void 25 | LANGUAGE plpgsql 26 | AS 27 | $$ 28 | DECLARE 29 | row record; 30 | BEGIN 31 | FOR row IN 32 | SELECT 33 | table_schema, 34 | table_name 35 | FROM 36 | information_schema.tables 37 | WHERE 38 | table_type = 'BASE TABLE' 39 | AND 40 | table_schema = _schema 41 | AND 42 | try_cast_int(reverse(split_part(reverse(table_name), '_', 1)), _min_date-1) < _min_date 43 | LOOP 44 | EXECUTE 'DROP TABLE ' || quote_ident(row.table_schema) || '.' || quote_ident(row.table_name); 45 | RAISE INFO 'Dropped table: %', quote_ident(row.table_schema) || '.' 
|| quote_ident(row.table_name); 46 | END LOOP; 47 | END; 48 | $$; 49 | 50 | SELECT drop_tables_before_date('public', '20180101'); 51 | -------------------------------------------------------------------------------- /requirements/all.txt: -------------------------------------------------------------------------------- 1 | -r build.txt 2 | -r tests.txt 3 | -------------------------------------------------------------------------------- /requirements/build.txt: -------------------------------------------------------------------------------- 1 | Flask-Cache==0.13.1 2 | Flask-Cors==3.0.6 3 | Flask-SSLify==0.1.5 4 | Flask==1.0 5 | Jinja2==2.10.1 6 | Werkzeug==0.15.3 7 | blinker==1.4 8 | boto3==1.9.37 9 | botocore==1.12.37 10 | certifi==2018.10.15 11 | click==7.0 12 | dockerflow==2018.4.0 13 | gevent==1.3.7 14 | greenlet==0.4.15 15 | gunicorn==19.9.0 16 | idna==2.7 17 | itsdangerous==1.1.0 18 | joblib==0.12.5 19 | numpy==1.16.1 20 | pandas==0.23.4 21 | protobuf==3.13.0 22 | psycogreen==1.0 23 | psycopg2-binary==2.7.5 24 | pyspark==2.4.4 25 | python-dateutil==2.7.5 26 | python-jose-cryptodome==1.3.2 27 | # using git reference as python_moztelemetry 28 | # has been deleted from pypi repository 29 | # TODO: investigate python_moztelemetry usage 30 | # and remove this dependency if possible. 31 | git+https://github.com/mozilla/python_moztelemetry.git@v0.10.4#egg=python-moztelemetry 32 | python-snappy==0.5.3 33 | ujson==1.35 34 | urllib3==1.24.2 35 | requests==2.22.0 36 | -------------------------------------------------------------------------------- /requirements/tests.txt: -------------------------------------------------------------------------------- 1 | configparser==3.5.0 2 | coverage==4.5.1 3 | enum34==1.1.6 4 | flake8==3.6.0 5 | mccabe==0.6.1 6 | pycodestyle==2.4.0 7 | pyflakes==2.0.0 8 | testfixtures==6.3.0 9 | pytest 10 | pytest-cov 11 | google-cloud-bigquery 12 | google-cloud-storage 13 | google-resumable-media 14 | -------------------------------------------------------------------------------- /script/validation/README.md: -------------------------------------------------------------------------------- 1 | # Validation against copies of the mozaggregator database 2 | 3 | These scripts run validation to determine whether two databases contain the same 4 | aggregate data for a day. This requires boto3 to be configured correctly on the 5 | host machine. 6 | 7 | ```bash 8 | script/validation/entrypoint.sh 9 | ``` 10 | 11 | ## Results 12 | 13 | Error is the percentage difference between three statistics: 14 | * Number of aggregate rows for the reference date 15 | * Sum of all histogram sums for the reference date 16 | * Sum of all histogram counts for the reference date 17 | 18 | ```python 19 | submission_date test ref err 20 | 0 20200229 2936771 2936771 0.0 21 | 1 20200301 2826848 2826848 0.0 22 | 23 | sum_test sum_ref sum_err 24 | 0 2626701384304806794 2626701384304806794 0 25 | 1 2015714821244959696 2015714821244959696 0 26 | 27 | count_test count_ref count_err 28 | 0 4192687908 4192687908 0 29 | 1 3461041798 3461041798 0 30 | ``` 31 | 32 | The plot shows the `GC_MS` histogram using the reference date and prior date 33 | across both databases. 
34 | 35 | ![results](results.png) 36 | -------------------------------------------------------------------------------- /script/validation/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | # relative to the current directory 6 | cd "$(dirname "$0")" 7 | 8 | set +x 9 | source fetch_credentials.sh 10 | set -x 11 | 12 | HOST_REF=${1?missing first host} 13 | HOST_TEST=${2?missing second host} 14 | DATE=${3:-20200301} 15 | 16 | python3 fetch_stats.py \ 17 | validate_data_ref.py \ 18 | --host $HOST_REF \ 19 | --date $DATE 20 | 21 | python3 fetch_stats.py \ 22 | validate_data_test.py \ 23 | --host $HOST_TEST \ 24 | --date $DATE 25 | 26 | black . 27 | 28 | python3 validate.py 29 | -------------------------------------------------------------------------------- /script/validation/fetch_credentials.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | creds="$(aws s3 cp s3://telemetry-spark-emr-2/aggregator_database_envvars.json -)" 4 | 5 | function dev_creds() { 6 | key=$1 7 | echo "$creds" | jq -r ".${key}" 8 | } 9 | 10 | export POSTGRES_DB="$(dev_creds POSTGRES_DB)" 11 | export POSTGRES_USER="$(dev_creds POSTGRES_USER)" 12 | export POSTGRES_PASS="$(dev_creds POSTGRES_PASS)" 13 | export POSTGRES_HOST="$(dev_creds POSTGRES_RO_HOST)" 14 | 15 | 16 | # useful command when sourcing this script 17 | : << EOF 18 | PGPASSWORD=$POSTGRES_PASS psql \ 19 | --host=$POSTGRES_RO_HOST \ 20 | --username=$POSTGRES_USER \ 21 | --dbname=$POSTGRES_DB 22 | EOF -------------------------------------------------------------------------------- /script/validation/fetch_stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # pip3 install psycopg2-binary 4 | 5 | from argparse import ArgumentParser 6 | import os 7 | import psycopg2 8 | from datetime import datetime, timedelta 9 | 10 | names = ["POSTGRES_DB", "POSTGRES_USER", "POSTGRES_PASS"] 11 | 12 | parser = ArgumentParser() 13 | parser.add_argument("output") 14 | parser.add_argument("--host", default=os.getenv("POSTGRES_HOST")) 15 | parser.add_argument("--date", default="20201119") 16 | args = parser.parse_args() 17 | 18 | 19 | def yesterday(ds): 20 | fmt = "%Y%m%d" 21 | return (datetime.strptime(ds, fmt) - timedelta(1)).strftime(fmt) 22 | 23 | 24 | conn_str = f"dbname={{}} user={{}} password={{}} host={args.host}".format( 25 | *[os.getenv(name) for name in names] 26 | ) 27 | 28 | conn = psycopg2.connect(conn_str) 29 | cursor = conn.cursor() 30 | 31 | cursor.execute( 32 | f""" 33 | select * 34 | from pg_catalog.pg_tables 35 | where schemaname='public' 36 | and tablename like 'submission_date%{args.date[:4]}%'; 37 | """ 38 | ) 39 | rows = cursor.fetchall() 40 | reference_period = [yesterday(args.date), args.date] 41 | table_names = [x[1] for x in rows if x[1].split("_")[-1] in reference_period] 42 | 43 | subquery = " UNION ALL ".join( 44 | f"select '{table.split('_')[-1]}' as submission_date, dimensions->>'metric' as metric, histogram from {table}" 45 | for table in table_names 46 | ) 47 | 48 | # get the number of rows for each submission_date (aggregates over each dimension) 49 | cursor.execute( 50 | f""" 51 | select 52 | submission_date, 53 | count(*) 54 | from ({subquery}) as unioned 55 | group by submission_date 56 | """ 57 | ) 58 | counts = list(cursor.fetchall()) 59 | 60 | # calculate the sum and counts for all histograms in a date 61 | cursor.execute( 62 | 
f""" 63 | select 64 | submission_date, 65 | -- sum and count are inserted into the last two elements of a histogram 66 | sum(histogram[array_upper(histogram, 1)-1]) as sum_sum, 67 | sum(histogram[array_upper(histogram, 1)]) as sum_count 68 | from ({subquery}) as unioned 69 | group by submission_date 70 | """ 71 | ) 72 | sums = list(cursor.fetchall()) 73 | 74 | # Calculate the aggregates for GC_MS for each date 75 | cursor.execute( 76 | f""" 77 | select 78 | submission_date, 79 | -- sum and count are inserted into the last two elements of a histogram 80 | aggregate_histograms(histogram) 81 | from ({subquery}) as unioned 82 | where metric = 'GC_MS' 83 | group by submission_date 84 | """ 85 | ) 86 | gc_ms = list(cursor.fetchall()) 87 | 88 | with open(args.output, "w") as fp: 89 | fp.write("from decimal import Decimal\n") 90 | fp.write(f"counts = {counts}\n") 91 | fp.write(f"sums = {sums}\n") 92 | fp.write(f"gc_ms = {gc_ms}\n") 93 | 94 | 95 | # Other queries 96 | """ 97 | -- number of submissions in 2020 98 | select tablename 99 | from pg_catalog.pg_tables 100 | where schemaname='public' 101 | and tablename like 'submission_date%2020%'; 102 | order by tablename desc 103 | limit 100; 104 | 105 | -- number of submissions 106 | select count(*) 107 | from pg_catalog.pg_tables 108 | where schemaname='public' 109 | and tablename like 'submission_date%'; 110 | 111 | -- number of builds 112 | select count(*) 113 | from pg_catalog.pg_tables 114 | where schemaname='public' 115 | and tablename like 'build_id%'; 116 | """ 117 | -------------------------------------------------------------------------------- /script/validation/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/python_mozaggregator/6c0119bfd0b535346c37cb3f707d998039d3e24b/script/validation/results.png -------------------------------------------------------------------------------- /script/validation/validate.py: -------------------------------------------------------------------------------- 1 | from decimal import Decimal 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | import validate_data_ref 7 | import validate_data_test 8 | 9 | 10 | df_count = pd.DataFrame( 11 | validate_data_test.counts, columns=["submission_date", "test"] 12 | ).merge(pd.DataFrame(validate_data_ref.counts, columns=["submission_date", "ref"])) 13 | 14 | df_sum = pd.DataFrame( 15 | validate_data_test.sums, columns=["submission_date", "sum_test", "count_test"] 16 | ).merge( 17 | pd.DataFrame( 18 | validate_data_ref.sums, columns=["submission_date", "sum_ref", "count_ref"] 19 | ) 20 | ) 21 | 22 | df_gc_ms = pd.DataFrame( 23 | validate_data_test.gc_ms, columns=["submission_date", "test"] 24 | ).merge(pd.DataFrame(validate_data_ref.gc_ms, columns=["submission_date", "ref"])) 25 | # hack, because something changed 26 | df_gc_ms.test = df_gc_ms.test.apply(np.array) 27 | df_gc_ms.ref = df_gc_ms.ref.apply(np.array) 28 | 29 | 30 | df_count["err"] = abs(df_count.test - df_count.ref) / df_count.ref * 100 31 | df_sum["sum_err"] = abs(df_sum.sum_test - df_sum.sum_ref) / df_sum.sum_ref * 100 32 | df_sum["count_err"] = abs(df_sum.count_test - df_sum.count_ref) / df_sum.count_ref * 100 33 | df_gc_ms["err"] = abs(df_gc_ms.test - df_gc_ms.ref) / df_gc_ms.ref * 100 34 | 35 | print(df_count) 36 | print(df_sum[["sum_test", "sum_ref", "sum_err"]]) 37 | print(df_sum[["count_test", "count_ref", "count_err"]]) 38 | 39 | 40 | x = 
np.arange(len(df_gc_ms.test.values[0][:-2])) 41 | 42 | plt.subplot(221) 43 | plt.title(f"{df_gc_ms.submission_date[0]}: test vs ref") 44 | plt.plot(x, df_gc_ms.test.values[0][:-2]) 45 | plt.plot(x, df_gc_ms.ref.values[0][:-2]) 46 | 47 | plt.subplot(222) 48 | plt.title(f"{df_gc_ms.submission_date[1]}: test vs ref") 49 | plt.plot(x, df_gc_ms.test.values[1][:-2]) 50 | plt.plot(x, df_gc_ms.ref.values[1][:-2]) 51 | 52 | plt.subplot(223) 53 | plt.title(f"test: {df_gc_ms.submission_date[0]} vs {df_gc_ms.submission_date[1]}") 54 | plt.plot(x, df_gc_ms.test.values[0][:-2]) 55 | plt.plot(x, df_gc_ms.test.values[1][:-2]) 56 | 57 | plt.subplot(224) 58 | plt.title(f"ref: {df_gc_ms.submission_date[0]} vs {df_gc_ms.submission_date[1]}") 59 | plt.plot(x, df_gc_ms.ref.values[0][:-2]) 60 | plt.plot(x, df_gc_ms.ref.values[1][:-2]) 61 | plt.savefig("results.png") 62 | -------------------------------------------------------------------------------- /script/validation/validate_data_ref.py: -------------------------------------------------------------------------------- 1 | from decimal import Decimal 2 | 3 | counts = [("20200229", 2936771), ("20200301", 2826848)] 4 | sums = [ 5 | ("20200229", Decimal("2626701384304806794"), Decimal("4192687908")), 6 | ("20200301", Decimal("2015714821244959696"), Decimal("3461041798")), 7 | ] 8 | gc_ms = [ 9 | ( 10 | "20200229", 11 | [ 12 | 6658499, 13 | 8354994, 14 | 6143765, 15 | 5932372, 16 | 5296495, 17 | 5125504, 18 | 5192139, 19 | 5236794, 20 | 10548921, 21 | 10307775, 22 | 10185395, 23 | 16650529, 24 | 14306376, 25 | 18234460, 26 | 22270035, 27 | 23378856, 28 | 26089481, 29 | 35395399, 30 | 36401066, 31 | 41290415, 32 | 41776387, 33 | 40651039, 34 | 40532326, 35 | 39627254, 36 | 39836392, 37 | 38728037, 38 | 37188229, 39 | 32062201, 40 | 27690311, 41 | 24098905, 42 | 21688575, 43 | 16531594, 44 | 12738588, 45 | 9867065, 46 | 7520683, 47 | 5559968, 48 | 4098988, 49 | 3010292, 50 | 2223359, 51 | 1606433, 52 | 1162388, 53 | 845775, 54 | 597783, 55 | 438837, 56 | 337075, 57 | 277828, 58 | 220436, 59 | 162407, 60 | 132715, 61 | 503191, 62 | 278928610753, 63 | 8595358, 64 | ], 65 | ), 66 | ( 67 | "20200301", 68 | [ 69 | 2925670, 70 | 6715787, 71 | 4707073, 72 | 4192814, 73 | 3969322, 74 | 3857732, 75 | 3944128, 76 | 3974515, 77 | 7949886, 78 | 7761962, 79 | 7691705, 80 | 12183705, 81 | 10774769, 82 | 14083327, 83 | 17557472, 84 | 18663309, 85 | 21399559, 86 | 29188776, 87 | 30312972, 88 | 34389898, 89 | 35229665, 90 | 35067090, 91 | 35213484, 92 | 34867597, 93 | 35353800, 94 | 34489141, 95 | 33724172, 96 | 29573912, 97 | 25685282, 98 | 22627592, 99 | 19646673, 100 | 15277624, 101 | 12149394, 102 | 9370835, 103 | 7241278, 104 | 5204618, 105 | 4015992, 106 | 2937125, 107 | 2051030, 108 | 1453853, 109 | 1036862, 110 | 752442, 111 | 551556, 112 | 449630, 113 | 326982, 114 | 276883, 115 | 184940, 116 | 144580, 117 | 121030, 118 | 455103, 119 | 330249296203, 120 | 7049297, 121 | ], 122 | ), 123 | ] 124 | -------------------------------------------------------------------------------- /script/validation/validate_data_test.py: -------------------------------------------------------------------------------- 1 | from decimal import Decimal 2 | 3 | counts = [("20200229", 2936771), ("20200301", 2826848)] 4 | sums = [ 5 | ("20200229", Decimal("2626701384304806794"), Decimal("4192687908")), 6 | ("20200301", Decimal("2015714821244959696"), Decimal("3461041798")), 7 | ] 8 | gc_ms = [ 9 | ( 10 | "20200229", 11 | [ 12 | 6658499, 13 | 8354994, 14 | 6143765, 15 | 5932372, 16 | 5296495, 17 | 
5125504, 18 | 5192139, 19 | 5236794, 20 | 10548921, 21 | 10307775, 22 | 10185395, 23 | 16650529, 24 | 14306376, 25 | 18234460, 26 | 22270035, 27 | 23378856, 28 | 26089481, 29 | 35395399, 30 | 36401066, 31 | 41290415, 32 | 41776387, 33 | 40651039, 34 | 40532326, 35 | 39627254, 36 | 39836392, 37 | 38728037, 38 | 37188229, 39 | 32062201, 40 | 27690311, 41 | 24098905, 42 | 21688575, 43 | 16531594, 44 | 12738588, 45 | 9867065, 46 | 7520683, 47 | 5559968, 48 | 4098988, 49 | 3010292, 50 | 2223359, 51 | 1606433, 52 | 1162388, 53 | 845775, 54 | 597783, 55 | 438837, 56 | 337075, 57 | 277828, 58 | 220436, 59 | 162407, 60 | 132715, 61 | 503191, 62 | 278928610753, 63 | 8595358, 64 | ], 65 | ), 66 | ( 67 | "20200301", 68 | [ 69 | 2925670, 70 | 6715787, 71 | 4707073, 72 | 4192814, 73 | 3969322, 74 | 3857732, 75 | 3944128, 76 | 3974515, 77 | 7949886, 78 | 7761962, 79 | 7691705, 80 | 12183705, 81 | 10774769, 82 | 14083327, 83 | 17557472, 84 | 18663309, 85 | 21399559, 86 | 29188776, 87 | 30312972, 88 | 34389898, 89 | 35229665, 90 | 35067090, 91 | 35213484, 92 | 34867597, 93 | 35353800, 94 | 34489141, 95 | 33724172, 96 | 29573912, 97 | 25685282, 98 | 22627592, 99 | 19646673, 100 | 15277624, 101 | 12149394, 102 | 9370835, 103 | 7241278, 104 | 5204618, 105 | 4015992, 106 | 2937125, 107 | 2051030, 108 | 1453853, 109 | 1036862, 110 | 752442, 111 | 551556, 112 | 449630, 113 | 326982, 114 | 276883, 115 | 184940, 116 | 144580, 117 | 121030, 118 | 455103, 119 | 330249296203, 120 | 7049297, 121 | ], 122 | ), 123 | ] 124 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | 9 | from setuptools import setup 10 | 11 | setup(name='python_mozaggregator', 12 | version='0.3.0.4', 13 | author='Roberto Agostino Vitillo', 14 | author_email='rvitillo@mozilla.com', 15 | description='Telemetry aggregation job', 16 | url='https://github.com/vitillo/python_mozaggregator', 17 | packages=['mozaggregator'], 18 | package_dir={'mozaggregator': 'mozaggregator'}, 19 | install_requires=[ 20 | 'Flask', 21 | 'Flask-Cache', 22 | 'Flask-Cors', 23 | 'Flask-SSLify', 24 | 'boto3', 25 | 'click', 26 | 'dockerflow', 27 | 'gevent', 28 | 'gunicorn', 29 | 'joblib', 30 | 'pandas', 31 | 'psycogreen', 32 | 'psycopg2-binary', 33 | 'pyspark', 34 | 'python-jose-cryptodome', 35 | # using git reference as python_moztelemetry 36 | # has been deleted from pypi repository 37 | # TODO: investigate python_moztelemetry usage 38 | # and remove this dependency if possible. 
39 | 'python-moztelemetry @ git+https://github.com/mozilla/python_moztelemetry.git@v0.10.4#egg=python-moztelemetry', 40 | # 'git+https://github.com/mozilla/python_moztelemetry.git@v0.10.4#egg=python-moztelemetry', 41 | 'ujson', 42 | ] 43 | ) 44 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from google.cloud import bigquery, storage 4 | from pyspark.sql import SparkSession 5 | 6 | import pytest 7 | from dataset import generate_pings 8 | from mobile_dataset import generate_mobile_pings 9 | from utils import ( 10 | runif_avro_testing_enabled, 11 | runif_bigquery_testing_enabled, 12 | format_payload_bytes_decoded, 13 | format_payload_bytes_decoded_mobile, 14 | ) 15 | 16 | 17 | @pytest.fixture() 18 | def spark(): 19 | spark = SparkSession.builder.getOrCreate() 20 | spark.conf.set("spark.sql.session.timeZone", "UTC") 21 | yield spark 22 | spark.stop() 23 | 24 | 25 | @pytest.fixture() 26 | def sc(spark): 27 | return spark.sparkContext 28 | 29 | 30 | @runif_bigquery_testing_enabled 31 | @pytest.fixture 32 | def bq_testing_table(): 33 | bq_client = bigquery.Client() 34 | 35 | project_id = os.environ["PROJECT_ID"] 36 | dataset_id = f"{project_id}.pytest_mozaggregator_test" 37 | bq_client.delete_dataset(dataset_id, delete_contents=True, not_found_ok=True) 38 | bq_client.create_dataset(dataset_id) 39 | 40 | schema = bq_client.schema_from_json( 41 | os.path.join(os.path.dirname(__file__), "decoded.1.bq") 42 | ) 43 | # use load_table instead of insert_rows to avoid eventual consistency guarantees 44 | df = [format_payload_bytes_decoded(ping) for ping in generate_pings()] 45 | mobile_df = [ 46 | format_payload_bytes_decoded_mobile(ping) for ping in generate_mobile_pings() 47 | ] 48 | 49 | # result set to be yielded are (table_name, fully-qualified path) pairs 50 | results = [] 51 | # create the relevant tables 52 | for table_name, df in [ 53 | ("main_v4", df), 54 | ("mobile_metrics_v1", mobile_df), 55 | ]: 56 | table_id = f"{dataset_id}.telemetry_telemetry__{table_name}" 57 | table = bigquery.table.Table(table_id, schema) 58 | table.time_partitioning = bigquery.TimePartitioning( 59 | type_=bigquery.TimePartitioningType.DAY, field="submission_timestamp" 60 | ) 61 | bq_client.create_table(table) 62 | bq_client.load_table_from_json( 63 | df, table, job_config=bigquery.job.LoadJobConfig(schema=schema) 64 | ).result() 65 | 66 | results.append((table_name, table_id)) 67 | 68 | yield results 69 | 70 | bq_client.delete_dataset(dataset_id, delete_contents=True, not_found_ok=True) 71 | 72 | 73 | @runif_avro_testing_enabled 74 | @pytest.fixture 75 | def avro_testing_files(bq_testing_table): 76 | bq_client = bigquery.Client() 77 | parent_path = os.path.join(os.environ["TMP_AVRO_PATH"], "mozaggregator_test_avro") 78 | 79 | for table_name, table_id in bq_testing_table: 80 | job = bq_client.query( 81 | f"SELECT distinct cast(extract(date from submission_timestamp) as string) as ds FROM `{table_id}`" 82 | ) 83 | for row in job.result(): 84 | ds_nodash = row.ds.replace("-", "") 85 | path = f"{parent_path}/{ds_nodash}/{table_name}/*.avro" 86 | bq_client.extract_table( 87 | f"{table_id}${ds_nodash}", 88 | path, 89 | job_config=bigquery.job.ExtractJobConfig(destination_format="AVRO"), 90 | ).result() 91 | 92 | yield parent_path 93 | 94 | storage_client = storage.Client() 95 | parts = parent_path.strip("gs://").split("/") 96 | bucket = parts[0] 97 | prefix = 
"/".join(parts[1:parts.index("mozaggregator_test_avro")+1]) 98 | bucket = storage_client.get_bucket(bucket) 99 | for blob in bucket.list_blobs(prefix=prefix): 100 | print(f"deleting {blob.name}") 101 | blob.delete() 102 | -------------------------------------------------------------------------------- /tests/dataset.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import uuid 3 | from itertools import chain, product, repeat 4 | 5 | from mozaggregator.aggregator import PROCESS_TYPES 6 | 7 | 8 | NUM_CHILDREN_PER_PING = 3 9 | NUM_AGGREGATED_CHILD_PINGS = 2 10 | NUM_PINGS_PER_DIMENSIONS = 3 11 | assert(NUM_AGGREGATED_CHILD_PINGS <= NUM_PINGS_PER_DIMENSIONS) 12 | NUM_PROCESS_TYPES = len(PROCESS_TYPES) 13 | SCALAR_VALUE = 42 14 | SIMPLE_SCALAR_BUCKET = 35 15 | COUNT_SCALAR_BUCKET = 40 16 | NUMERIC_SCALAR_BUCKET = 40 17 | 18 | DATE_FMT = '%Y%m%d' 19 | DATETIME_FMT = '%Y%m%d%H%M%S' 20 | 21 | TODAY = datetime.date.today() 22 | BUILD_ID_1 = TODAY - datetime.timedelta(days=1) 23 | BUILD_ID_2 = TODAY - datetime.timedelta(days=2) 24 | SUBMISSION_DATE_1 = TODAY 25 | SUBMISSION_DATE_2 = TODAY - datetime.timedelta(days=2) 26 | 27 | ping_dimensions = { 28 | "submission_date": [SUBMISSION_DATE_2.strftime(DATE_FMT), 29 | SUBMISSION_DATE_1.strftime(DATE_FMT)], 30 | "channel": ["nightly", "beta", "release"], 31 | "version": ["40.0b1", "41"], 32 | "build_id": [BUILD_ID_2.strftime(DATETIME_FMT), 33 | BUILD_ID_1.strftime(DATETIME_FMT)], 34 | "application": ["Firefox", "Fennec"], 35 | "arch": ["x86", "x86-64"], 36 | "os": ["Linux", "Windows_NT"], 37 | "os_version": ["6.1", "3.1.12"], 38 | } 39 | 40 | histograms_template = { 41 | "EVENTLOOP_UI_ACTIVITY_EXP_MS": {"bucket_count": 20, 42 | "histogram_type": 0, 43 | "log_sum": 0, 44 | "log_sum_squares": 0, 45 | "range": [50, 60000], 46 | "sum": 9362, 47 | "values": {"0": 0, 48 | "110": 16, 49 | "1170": 0, 50 | "163": 8, 51 | "242": 5, 52 | "359": 2, 53 | "50": 18, 54 | "74": 16, 55 | "789": 1}}, 56 | "UPDATE_PING_COUNT_EXTERNAL": {"bucket_count": 3, 57 | "histogram_type": 4, 58 | "range": [1, 2], 59 | "sum": SCALAR_VALUE, 60 | "values": {"0": SCALAR_VALUE, "1": 0}}, 61 | "USE_COUNTER2_PROPERTY_FILL_PAGE": {'bucket_count': 3, 62 | 'histogram_type': 2, 63 | 'range': [1, 2], 64 | 'sum': 2, 65 | 'values': {'0': 0, '1': 2, '2': 0}}, 66 | "USE_COUNTER2_ISNULL_PAGE": None, 67 | "USE_COUNTER2_PROPERTY_FILL_DOCUMENT": {'bucket_count': 3, 68 | 'histogram_type': 2, 69 | 'range': [1, 2], 70 | 'sum': 1, 71 | 'values': {'0': 0, '1': 1, '2': 0}}, 72 | "CONTENT_DOCUMENTS_DESTROYED": {"bucket_count": 3, 73 | "histogram_type": 4, 74 | "range": [1, 2], 75 | "sum": 17, 76 | "values": {"0": 17, "1": 0}}, 77 | "TOP_LEVEL_CONTENT_DOCUMENTS_DESTROYED": {"bucket_count": 3, 78 | "histogram_type": 4, 79 | "range": [1, 2], 80 | "sum": 19, 81 | "values": {"0": 19, "1": 0}}, 82 | "USE_COUNTER_PROPERTY_FILL_DOCUMENT": {'bucket_count': 3, 83 | 'histogram_type': 2, 84 | 'range': [1, 2], 85 | 'sum': 1, 86 | 'values': {'0': 0, '1': 1}}, 87 | "TELEMETRY_TEST_CATEGORICAL": {"bucket_count": 4, 88 | "histogram_type": 5, 89 | "range": [1, 2], 90 | "sum": 3, 91 | "values": {"0": 1, "1": 1, "2": 1, "3": 0}}, 92 | "GC_MAX_PAUSE_MS_2": {"bucket_count": 50, 93 | "histogram_type": 1, 94 | "range": [1, 1000], 95 | "sum": 554, 96 | "values": {"0": 0, "1": 4, "22": 2, "43": 1, "63": 1, "272": 1, "292": 0}} 97 | } 98 | 99 | keyed_histograms_template = { 100 | "DEVTOOLS_PERFTOOLS_RECORDING_FEATURES_USED": { 101 | "withMarkers": { 102 | "range": [1, 2], 103 | 
"bucket_count": 3, 104 | "histogram_type": 2, 105 | "values": { 106 | "0": 0, 107 | "1": 1, 108 | "2": 0 109 | }, 110 | "sum": 1 111 | } 112 | }, 113 | } 114 | 115 | ignored_keyed_histograms_template = { 116 | "MESSAGE_MANAGER_MESSAGE_SIZE": {"foo": {"bucket_count": 20, 117 | "histogram_type": 0, 118 | "sum": 0, 119 | "values": {"0": 0}}}, 120 | "VIDEO_DETAILED_DROPPED_FRAMES_PROPORTION": {"foo": {"bucket_count": 20, 121 | "histogram_type": 0, 122 | "sum": 0, 123 | "values": {"0": 0}}}, 124 | "SEARCH_COUNTS": {"ddg.urlbar": {"range": [1, 2], 125 | "bucket_count": 3, 126 | "histogram_type": 4, 127 | "values": {"0": 1, "1": 0}, 128 | "sum": 1}}, 129 | } 130 | 131 | 132 | simple_measurements_template = { 133 | "uptime": SCALAR_VALUE, 134 | "addonManager": { 135 | "XPIDB_parseDB_MS": SCALAR_VALUE 136 | } 137 | } 138 | 139 | scalars_template = { 140 | "browser.engagement.total_uri_count": SCALAR_VALUE, 141 | "browser.engagement.tab_open_event_count": SCALAR_VALUE 142 | } 143 | 144 | ignored_scalars_template = { 145 | "browser.engagement.navigation": SCALAR_VALUE, 146 | "browser.engagement.navigation.test": SCALAR_VALUE, 147 | "telemetry.test.string_kind": "IGNORED_STRING" 148 | } 149 | 150 | keyed_scalars_template = { 151 | "telemetry.test.keyed_release_optout": { 152 | "search_enter": SCALAR_VALUE 153 | }, 154 | "telemetry.test.keyed_unsigned_int": { 155 | "first": SCALAR_VALUE, 156 | "second": SCALAR_VALUE 157 | } 158 | } 159 | 160 | ignored_keyed_scalars_template = { 161 | "browser.engagement.navigation.searchbar": { 162 | "first": SCALAR_VALUE, 163 | "second": SCALAR_VALUE 164 | }, 165 | "fake.keyed.string": { 166 | "first": "IGNORE_ME" 167 | } 168 | } 169 | 170 | private_keyed_scalars_template = { 171 | "telemetry.event_counts": { 172 | "some#event#happened": SCALAR_VALUE 173 | }, 174 | "telemetry.dynamic_event_counts": { 175 | "some#dynamic#event": SCALAR_VALUE 176 | } 177 | } 178 | 179 | 180 | def generate_pings(): 181 | for dimensions in [ 182 | dict(x) for x in product( 183 | *[list(zip(repeat(k), v)) for k, v in ping_dimensions.items()] 184 | ) 185 | ]: 186 | for i in range(NUM_PINGS_PER_DIMENSIONS): 187 | yield generate_payload(dimensions, i < NUM_AGGREGATED_CHILD_PINGS) 188 | 189 | 190 | def generate_payload(dimensions, aggregated_child_histograms): 191 | meta = { 192 | "submissionDate": dimensions["submission_date"], 193 | "sampleId": 42, 194 | } 195 | application = { 196 | "channel": dimensions["channel"], 197 | "version": dimensions["version"], 198 | "buildId": dimensions["build_id"], 199 | "name": dimensions["application"], 200 | "architecture": dimensions["arch"], 201 | } 202 | 203 | child_payloads = [{"simpleMeasurements": simple_measurements_template} 204 | for i in range(NUM_CHILDREN_PER_PING)] 205 | 206 | scalars = {**scalars_template, **ignored_scalars_template} 207 | keyed_scalars = { 208 | **keyed_scalars_template, 209 | **ignored_keyed_scalars_template, 210 | **private_keyed_scalars_template, 211 | } 212 | 213 | processes_payload = { 214 | "parent": { 215 | "scalars": scalars, 216 | "keyedScalars": keyed_scalars 217 | } 218 | } 219 | 220 | if aggregated_child_histograms: 221 | processes_payload["content"] = { 222 | "histograms": histograms_template, 223 | "keyedHistograms": keyed_histograms_template, 224 | "scalars": scalars, 225 | "keyedScalars": keyed_scalars 226 | } 227 | processes_payload["gpu"] = { 228 | "histograms": histograms_template, 229 | "keyedHistograms": keyed_histograms_template, 230 | "scalars": scalars, 231 | "keyedScalars": keyed_scalars 232 | } 233 | 
else: 234 | for i in range(NUM_CHILDREN_PER_PING): 235 | child_payloads[i]["histograms"] = histograms_template 236 | child_payloads[i]["keyedHistograms"] = keyed_histograms_template 237 | 238 | payload = { 239 | "simpleMeasurements": simple_measurements_template, 240 | "histograms": histograms_template, 241 | "keyedHistograms": { 242 | **keyed_histograms_template, 243 | **ignored_keyed_histograms_template 244 | }, 245 | "childPayloads": child_payloads, 246 | "processes": processes_payload, 247 | } 248 | 249 | environment = { 250 | "system": {"os": {"name": dimensions["os"], 251 | "version": dimensions["os_version"]}}, 252 | "settings": {"telemetryEnabled": False, 253 | "e10sEnabled": dimensions.get("e10s", True)} 254 | } 255 | 256 | return { 257 | "clientId": str(uuid.uuid4()), 258 | "meta": meta, 259 | "application": application, 260 | "payload": payload, 261 | "environment": environment, 262 | } 263 | 264 | 265 | def expected_count(process_type, scalar=False): 266 | if process_type == "parent": 267 | return NUM_PINGS_PER_DIMENSIONS 268 | elif process_type == "gpu": 269 | return NUM_AGGREGATED_CHILD_PINGS 270 | elif process_type == "content" and not scalar: 271 | return (NUM_PINGS_PER_DIMENSIONS - NUM_AGGREGATED_CHILD_PINGS) * NUM_CHILDREN_PER_PING + NUM_AGGREGATED_CHILD_PINGS 272 | elif process_type == "content" and scalar: 273 | return NUM_AGGREGATED_CHILD_PINGS 274 | else: 275 | return -1 276 | -------------------------------------------------------------------------------- /tests/decoded.1.bq: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "NULLABLE", 4 | "name": "client_id", 5 | "type": "STRING" 6 | }, 7 | { 8 | "mode": "NULLABLE", 9 | "name": "document_id", 10 | "type": "STRING" 11 | }, 12 | { 13 | "fields": [ 14 | { 15 | "mode": "NULLABLE", 16 | "name": "document_namespace", 17 | "type": "STRING" 18 | }, 19 | { 20 | "mode": "NULLABLE", 21 | "name": "document_type", 22 | "type": "STRING" 23 | }, 24 | { 25 | "mode": "NULLABLE", 26 | "name": "document_version", 27 | "type": "STRING" 28 | }, 29 | { 30 | "fields": [ 31 | { 32 | "mode": "NULLABLE", 33 | "name": "city", 34 | "type": "STRING" 35 | }, 36 | { 37 | "mode": "NULLABLE", 38 | "name": "country", 39 | "type": "STRING" 40 | }, 41 | { 42 | "mode": "NULLABLE", 43 | "name": "db_version", 44 | "type": "STRING" 45 | }, 46 | { 47 | "mode": "NULLABLE", 48 | "name": "subdivision1", 49 | "type": "STRING" 50 | }, 51 | { 52 | "mode": "NULLABLE", 53 | "name": "subdivision2", 54 | "type": "STRING" 55 | } 56 | ], 57 | "mode": "NULLABLE", 58 | "name": "geo", 59 | "type": "RECORD" 60 | }, 61 | { 62 | "fields": [ 63 | { 64 | "mode": "NULLABLE", 65 | "name": "date", 66 | "type": "STRING" 67 | }, 68 | { 69 | "mode": "NULLABLE", 70 | "name": "dnt", 71 | "type": "STRING" 72 | }, 73 | { 74 | "mode": "NULLABLE", 75 | "name": "x_debug_id", 76 | "type": "STRING" 77 | }, 78 | { 79 | "mode": "NULLABLE", 80 | "name": "x_pingsender_version", 81 | "type": "STRING" 82 | } 83 | ], 84 | "mode": "NULLABLE", 85 | "name": "header", 86 | "type": "RECORD" 87 | }, 88 | { 89 | "fields": [ 90 | { 91 | "mode": "NULLABLE", 92 | "name": "app_build_id", 93 | "type": "STRING" 94 | }, 95 | { 96 | "mode": "NULLABLE", 97 | "name": "app_name", 98 | "type": "STRING" 99 | }, 100 | { 101 | "mode": "NULLABLE", 102 | "name": "app_update_channel", 103 | "type": "STRING" 104 | }, 105 | { 106 | "mode": "NULLABLE", 107 | "name": "app_version", 108 | "type": "STRING" 109 | } 110 | ], 111 | "mode": "NULLABLE", 112 | "name": "uri", 113 
| "type": "RECORD" 114 | }, 115 | { 116 | "fields": [ 117 | { 118 | "mode": "NULLABLE", 119 | "name": "browser", 120 | "type": "STRING" 121 | }, 122 | { 123 | "mode": "NULLABLE", 124 | "name": "os", 125 | "type": "STRING" 126 | }, 127 | { 128 | "mode": "NULLABLE", 129 | "name": "version", 130 | "type": "STRING" 131 | } 132 | ], 133 | "mode": "NULLABLE", 134 | "name": "user_agent", 135 | "type": "RECORD" 136 | } 137 | ], 138 | "mode": "NULLABLE", 139 | "name": "metadata", 140 | "type": "RECORD" 141 | }, 142 | { 143 | "mode": "NULLABLE", 144 | "name": "normalized_app_name", 145 | "type": "STRING" 146 | }, 147 | { 148 | "mode": "NULLABLE", 149 | "name": "normalized_channel", 150 | "type": "STRING" 151 | }, 152 | { 153 | "mode": "NULLABLE", 154 | "name": "normalized_country_code", 155 | "type": "STRING" 156 | }, 157 | { 158 | "mode": "NULLABLE", 159 | "name": "normalized_os", 160 | "type": "STRING" 161 | }, 162 | { 163 | "mode": "NULLABLE", 164 | "name": "normalized_os_version", 165 | "type": "STRING" 166 | }, 167 | { 168 | "mode": "NULLABLE", 169 | "name": "payload", 170 | "type": "BYTES" 171 | }, 172 | { 173 | "mode": "NULLABLE", 174 | "name": "sample_id", 175 | "type": "INT64" 176 | }, 177 | { 178 | "mode": "NULLABLE", 179 | "name": "submission_timestamp", 180 | "type": "TIMESTAMP" 181 | } 182 | ] 183 | -------------------------------------------------------------------------------- /tests/mobile_dataset.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import uuid 3 | from itertools import product, repeat 4 | 5 | 6 | NUM_PINGS_PER_DIMENSIONS = 3 7 | 8 | DATE_FMT = '%Y-%m-%d' 9 | DATETIME_FMT = '%Y%m%d%H%M%S' 10 | 11 | NOW = datetime.datetime.now() 12 | SUBMISSION_DATE_1 = NOW 13 | SUBMISSION_DATE_2 = NOW - datetime.timedelta(days=2) 14 | 15 | ping_dimensions = { 16 | "arch": ["arm64-v7a", "arm64-v8a"], 17 | "osversion": [26, 27], 18 | "normalizedChannel": ["release", "nightly"], 19 | } 20 | 21 | meta_template = { 22 | 'Type': 'telemetry', 23 | 'appBuildId': '323001230-GeckoView', 24 | 'appName': 'Focus', 25 | 'appUpdateChannel': 'nightly', 26 | 'appVersion': '8.0', 27 | 'docType': 'mobile-metrics', 28 | 'normalizedAppName': 'Focus', 29 | 'normalizedOs': 'Android', 30 | 'sampleId': '34.0', 31 | 'sourceName': 'telemetry', 32 | 'sourceVersion': '1', 33 | 'submissionDate': SUBMISSION_DATE_1.strftime('%Y%m%d'), 34 | } 35 | 36 | histograms_template = { 37 | 'USE_COUNTER2_PROPERTY_FILL_PAGE': { 38 | 'bucket_count': 3, 39 | 'histogram_type': 2, 40 | 'range': [1, 2], 41 | 'sum': 96, 42 | 'values': { 43 | '0': 0, 44 | '1': 96, 45 | '2': 0 46 | } 47 | }, 48 | 'GC_MAX_PAUSE_MS_2': { 49 | 'bucket_count': 50, 50 | 'histogram_type': 0, 51 | 'range': [1, 10000], 52 | 'sum': 18587, 53 | 'values': { 54 | '0': 0, 55 | '1': 13, 56 | '2': 7, 57 | '3': 9, 58 | '4': 13, 59 | '5': 24, 60 | '6': 19, 61 | '7': 26, 62 | '8': 76, 63 | '10': 93, 64 | '12': 55, 65 | '14': 73, 66 | '17': 74, 67 | '20': 56, 68 | '24': 54, 69 | '29': 48, 70 | '34': 48, 71 | '40': 38, 72 | '48': 60, 73 | '57': 18, 74 | '68': 8, 75 | '81': 6, 76 | '96': 2, 77 | '114': 3, 78 | '135': 0 79 | } 80 | }, 81 | } 82 | 83 | keyed_histograms_template = { 84 | 'NETWORK_HTTP_REDIRECT_TO_SCHEME': { 85 | 'http': { 86 | 'bucket_count': 51, 87 | 'histogram_type': 5, 88 | 'range': [1, 50], 89 | 'sum': 2, 90 | 'values': { 91 | '0': 34, 92 | '1': 2, 93 | '2': 0 94 | } 95 | }, 96 | 'https': { 97 | 'bucket_count': 51, 98 | 'histogram_type': 5, 99 | 'range': [1, 50], 100 | 'sum': 55, 101 | 'values': { 
102 | '0': 89, 103 | '1': 55, 104 | '2': 0 105 | } 106 | } 107 | } 108 | } 109 | 110 | keyed_scalars_template = { 111 | 'telemetry.accumulate_clamped_values': { 112 | 'HTTP_CACHE_IO_QUEUE_2_EVICT': 18 113 | } 114 | } 115 | 116 | scalars_template = { 117 | 'media.page_count': 176, 118 | 'media.page_had_media_count': 2, 119 | 'telemetry.persistence_timer_hit_count': 230, 120 | } 121 | 122 | 123 | def generate_mobile_pings(): 124 | 125 | for dimension in [ 126 | dict(x) 127 | for x in product(*[list(zip(repeat(k), v)) 128 | for k, v in ping_dimensions.items()]) 129 | ]: 130 | for i in range(NUM_PINGS_PER_DIMENSIONS): 131 | yield generate_payload(dimension) 132 | 133 | 134 | def generate_payload(dimension): 135 | 136 | metrics = { 137 | 'content': { 138 | 'histograms': histograms_template, 139 | 'keyedHistograms': keyed_histograms_template, 140 | 'keyedScalars': keyed_scalars_template, 141 | 'scalars': scalars_template, 142 | }, 143 | 'dynamic': { 144 | 'histograms': histograms_template, 145 | 'keyedHistograms': keyed_histograms_template, 146 | }, 147 | 'extension': { 148 | 'histograms': histograms_template, 149 | 'keyedHistograms': keyed_histograms_template, 150 | }, 151 | 'gpu': { 152 | 'histograms': histograms_template, 153 | 'keyedHistograms': keyed_histograms_template, 154 | }, 155 | 'parent': { 156 | 'histograms': histograms_template, 157 | 'keyedHistograms': keyed_histograms_template, 158 | 'keyedScalars': keyed_scalars_template, 159 | 'scalars': scalars_template, 160 | }, 161 | } 162 | 163 | meta = meta_template.copy() 164 | meta['normalizedChannel'] = dimension['normalizedChannel'] 165 | 166 | return { 167 | 'arch': dimension['arch'], 168 | 'clientId': str(uuid.uuid4()), 169 | 'createdDate': SUBMISSION_DATE_1.strftime(DATE_FMT), 170 | 'createdTimestamp': SUBMISSION_DATE_1.strftime("%s"), 171 | 'device': 'Google-Pixel 2', 172 | 'locale': 'en-US', 173 | 'meta': meta, 174 | 'metrics': metrics, 175 | 'os': 'Android', 176 | 'osversion': dimension['osversion'], 177 | 'processStartTimestamp': SUBMISSION_DATE_1.strftime("%s"), 178 | 'profileDate': 17747, 179 | 'seq': 123, 180 | 'tz': 120, 181 | 'v': 1, 182 | } 183 | -------------------------------------------------------------------------------- /tests/test_aggregator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections import defaultdict 3 | 4 | import pandas as pd 5 | import pytest 6 | import pyspark 7 | 8 | import dataset as d 9 | from mozaggregator.aggregator import (COUNT_HISTOGRAM_PREFIX, 10 | NUMERIC_SCALARS_PREFIX, PROCESS_TYPES, 11 | SIMPLE_MEASURES_PREFIX, 12 | _aggregate_metrics) 13 | 14 | 15 | @pytest.fixture() 16 | def aggregates(sc): 17 | logger = logging.getLogger("py4j") 18 | logger.setLevel(logging.ERROR) 19 | 20 | raw_pings = list(d.generate_pings()) 21 | return _aggregate_metrics(sc.parallelize(raw_pings), num_reducers=10) 22 | 23 | 24 | @pytest.fixture() 25 | def build_id_aggregates(aggregates): 26 | # Note: most tests are based on the build-id aggregates as the aggregation 27 | # code is the same for both scenarios. 
28 | return aggregates[0].collect() 29 | 30 | 31 | @pytest.fixture() 32 | def submission_date_aggregates(aggregates): 33 | return aggregates[1].collect() 34 | 35 | 36 | def test_count(build_id_aggregates, submission_date_aggregates): 37 | pings = list(d.generate_pings()) 38 | num_build_ids = len(d.ping_dimensions["build_id"]) 39 | assert(len(pings) / d.NUM_PINGS_PER_DIMENSIONS == len(build_id_aggregates)) 40 | assert(len(pings) / d.NUM_PINGS_PER_DIMENSIONS / num_build_ids == len(submission_date_aggregates)) 41 | 42 | 43 | def test_keys(build_id_aggregates, submission_date_aggregates): 44 | for aggregate in build_id_aggregates: 45 | submission_date, channel, version, build_id, app, arch, os, os_version = aggregate[0] 46 | 47 | assert(submission_date in d.ping_dimensions["submission_date"]) 48 | assert(channel in d.ping_dimensions["channel"]) 49 | assert(version in [x.split(".")[0] for x in d.ping_dimensions["version"]]) 50 | assert(build_id in [x[:8] for x in d.ping_dimensions["build_id"]]) 51 | assert(app in d.ping_dimensions["application"]) 52 | assert(arch in d.ping_dimensions["arch"]) 53 | assert(os in d.ping_dimensions["os"]) 54 | if os == "Linux": 55 | assert(os_version in [x[:3] for x in d.ping_dimensions["os_version"]]) 56 | else: 57 | assert(os_version in d.ping_dimensions["os_version"]) 58 | 59 | for aggregate in submission_date_aggregates: 60 | submission_date, channel, version, app, arch, os, os_version = aggregate[0] 61 | 62 | assert(submission_date in d.ping_dimensions["submission_date"]) 63 | assert(channel in d.ping_dimensions["channel"]) 64 | assert(version in [x.split(".")[0] for x in d.ping_dimensions["version"]]) 65 | assert(app in d.ping_dimensions["application"]) 66 | assert(arch in d.ping_dimensions["arch"]) 67 | assert(os in d.ping_dimensions["os"]) 68 | if os == "Linux": 69 | assert(os_version in [x[:3] for x in d.ping_dimensions["os_version"]]) 70 | else: 71 | assert(os_version in d.ping_dimensions["os_version"]) 72 | 73 | 74 | def test_simple_measurements(build_id_aggregates): 75 | metric_count = defaultdict(lambda: defaultdict(int)) 76 | 77 | for aggregate in build_id_aggregates: 78 | for key, value in aggregate[1].items(): 79 | metric, label, process_type = key 80 | 81 | if metric.startswith(SIMPLE_MEASURES_PREFIX): 82 | metric_count[metric][process_type] += 1 83 | assert(label == "") 84 | # Simple measurements are still in childPayloads. 85 | # d.expected_count() is correct only for child dimensions in processes.content. 
86 | assert(value["count"] == d.NUM_PINGS_PER_DIMENSIONS * (d.NUM_CHILDREN_PER_PING if process_type != "parent" else 1)) 87 | assert(value["sum"] == value["count"] * d.SCALAR_VALUE) 88 | assert(value["histogram"][str(d.SIMPLE_SCALAR_BUCKET)] == value["count"]) 89 | 90 | assert len(metric_count) == len(d.simple_measurements_template) 91 | for process_counts in metric_count.values(): 92 | assert(len(process_counts) == 2) # 1 for parent, 1 for childPayloads 93 | for v in process_counts.values(): 94 | assert(v == len(build_id_aggregates)) 95 | 96 | 97 | def test_numerical_scalars(build_id_aggregates): 98 | metric_count = defaultdict(lambda: defaultdict(int)) 99 | scalar_metrics = set([k.upper() for k in d.scalars_template.keys()]) 100 | keyed_scalar_metrics = set([k.upper() for k in d.keyed_scalars_template.keys()]) 101 | keyed_scalar_metrics |= set([k.upper() for k in d.private_keyed_scalars_template.keys()]) 102 | 103 | for aggregate in build_id_aggregates: 104 | for key, value in aggregate[1].items(): 105 | metric, label, process_type = key 106 | 107 | if metric.startswith(NUMERIC_SCALARS_PREFIX): 108 | orig_name = metric.replace(NUMERIC_SCALARS_PREFIX + "_", "") 109 | assert(orig_name in scalar_metrics | keyed_scalar_metrics) 110 | 111 | if orig_name in scalar_metrics: 112 | assert(label == "") 113 | else: 114 | assert(label != "") 115 | metric = "{}_{}".format(metric, label) 116 | 117 | metric_count[metric][process_type] += 1 118 | assert value["count"] == d.expected_count(process_type, True), ( 119 | "Expected {}, Got {}, Process {}".format(d.expected_count(process_type, True), value["count"], process_type)) 120 | assert(value["sum"] == value["count"] * d.SCALAR_VALUE) 121 | assert(value["histogram"][str(d.NUMERIC_SCALAR_BUCKET)] == value["count"]) 122 | 123 | keyed_scalars_template_len = len([key for m, dic in d.keyed_scalars_template.items() for key in dic]) 124 | keyed_scalars_template_len += len([key for m, dic in d.private_keyed_scalars_template.items() for key in dic]) 125 | assert len(metric_count) == len(d.scalars_template) + keyed_scalars_template_len 126 | for metric, process_counts in metric_count.items(): 127 | assert(set(process_counts.keys()) == PROCESS_TYPES) 128 | for v in list(process_counts.values()): 129 | assert(v == len(build_id_aggregates)) 130 | 131 | 132 | def test_classic_histograms(build_id_aggregates): 133 | metric_count = defaultdict(lambda: defaultdict(int)) 134 | histograms = {k: v for k, v in d.histograms_template.items() 135 | if v is not None and v.get("histogram_type", -1) != 4 and not k.startswith("USE_COUNTER2_")} 136 | 137 | for aggregate in build_id_aggregates: 138 | for key, value in aggregate[1].items(): 139 | metric, label, process_type = key 140 | histogram = histograms.get(metric, None) 141 | 142 | if histogram: 143 | metric_count[metric][process_type] += 1 144 | assert(label == "") 145 | assert(value["count"] == d.expected_count(process_type)) 146 | assert(value["sum"] == value["count"] * histogram["sum"]) 147 | assert(set(histogram["values"].keys()) == set(value["histogram"].keys())) 148 | assert((pd.Series(histogram["values"]) * value["count"] == pd.Series(value["histogram"])).all()) 149 | 150 | assert(len(metric_count) == len(histograms)) 151 | for process_counts in list(metric_count.values()): 152 | assert(set(process_counts.keys()) == PROCESS_TYPES) 153 | for v in list(process_counts.values()): 154 | assert(v == len(build_id_aggregates)) 155 | 156 | 157 | def test_count_histograms(build_id_aggregates): 158 | metric_count = 
defaultdict(lambda: defaultdict(int)) 159 | histograms = {"{}_{}".format(COUNT_HISTOGRAM_PREFIX, k): v for k, v in d.histograms_template.items() 160 | if v is not None and v.get("histogram_type", -1) == 4 and not k.endswith("CONTENT_DOCUMENTS_DESTROYED")} 161 | 162 | for aggregate in build_id_aggregates: 163 | for key, value in aggregate[1].items(): 164 | metric, label, process_type = key 165 | histogram = histograms.get(metric, None) 166 | 167 | if histogram: 168 | metric_count[metric][process_type] += 1 169 | assert(label == "") 170 | assert(value["count"] == d.expected_count(process_type)) 171 | assert(value["sum"] == value["count"] * histogram["sum"]) 172 | assert(value["histogram"][str(d.COUNT_SCALAR_BUCKET)] == value["count"]) 173 | 174 | assert len(metric_count) == len(histograms) 175 | for process_counts in metric_count.values(): 176 | assert(set(process_counts.keys()) == PROCESS_TYPES) 177 | for v in process_counts.values(): 178 | assert(v == len(build_id_aggregates)) 179 | 180 | 181 | def test_keyed_histograms(build_id_aggregates): 182 | metric_count = defaultdict(lambda: defaultdict(int)) 183 | 184 | for aggregate in build_id_aggregates: 185 | for key, value in aggregate[1].items(): 186 | metric, label, process_type = key 187 | 188 | if metric in d.keyed_histograms_template.keys(): 189 | metric_label = f"{metric}_{label}" 190 | if metric_label not in metric_count: 191 | metric_count[metric_label] = defaultdict(int) 192 | metric_count[metric_label][process_type] += 1 193 | assert(label != "") 194 | assert(value["count"] == d.expected_count(process_type)) 195 | assert(value["sum"] == value["count"] * d.keyed_histograms_template[metric][label]["sum"]) 196 | 197 | histogram_template = d.keyed_histograms_template[metric][label]["values"] 198 | assert(set(histogram_template.keys()) == set(value["histogram"].keys())) 199 | assert((pd.Series(histogram_template) * value["count"] == pd.Series(value["histogram"])).all()) 200 | 201 | assert(metric not in d.ignored_keyed_histograms_template.keys()) 202 | 203 | assert(len(metric_count) == len(d.keyed_histograms_template)) # Assume one label per keyed histogram 204 | for process_counts in metric_count.values(): 205 | assert(set(process_counts.keys()) == PROCESS_TYPES) 206 | for v in process_counts.values(): 207 | assert(v == len(build_id_aggregates)) 208 | -------------------------------------------------------------------------------- /tests/test_db.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | 5 | from click.testing import CliRunner 6 | 7 | import pytest 8 | from dataset import (DATE_FMT, SUBMISSION_DATE_1, generate_pings, 9 | ping_dimensions) 10 | from mozaggregator.aggregator import _aggregate_metrics 11 | from mozaggregator.cli import run_aggregator 12 | from mozaggregator.db import (NoticeLoggingCursor, _create_connection, 13 | submit_aggregates, clear_db) 14 | from testfixtures import LogCapture 15 | from utils import runif_bigquery_testing_enabled, runif_avro_testing_enabled 16 | 17 | SERVICE_URI = "http://localhost:5000" 18 | 19 | 20 | logger = logging.getLogger("py4j") 21 | logger.setLevel(logging.INFO) 22 | 23 | 24 | @pytest.fixture() 25 | def aggregates(sc): 26 | raw_pings = list(generate_pings()) 27 | aggregates = _aggregate_metrics(sc.parallelize(raw_pings), num_reducers=10) 28 | submit_aggregates(aggregates) 29 | return aggregates 30 | 31 | 32 | @pytest.fixture(autouse=True) 33 | def clear_state(): 34 | clear_db() 35 | 36 | 37 | def 
test_connection(): 38 | db = _create_connection() 39 | assert(db) 40 | 41 | 42 | def test_submit(aggregates): 43 | # Multiple submissions should not alter the aggregates in the db 44 | build_id_count, submission_date_count = submit_aggregates(aggregates) 45 | 46 | n_submission_dates = len(ping_dimensions["submission_date"]) 47 | n_channels = len(ping_dimensions["channel"]) 48 | n_versions = len(ping_dimensions["version"]) 49 | n_build_ids = len(ping_dimensions["build_id"]) 50 | assert(build_id_count == n_submission_dates * n_channels * n_versions * n_build_ids) 51 | assert(submission_date_count == n_submission_dates * n_channels * n_versions) 52 | 53 | 54 | def test_null_label_character_submit(sc): 55 | metric_info = ("SIMPLE_MEASURES_NULL_METRIC_LABEL", "\u0001\u0000\u0000\u0000\u7000\ub82c", False) 56 | payload = {"sum": 4, "count": 2, "histogram": {2: 2}} 57 | key = ('20161111', 'nightly', '52', '20161111', '', 'Firefox', 'arch', 'Windows', '2.4.21') 58 | aggregate = (key, {metric_info: payload}) 59 | 60 | aggregates = [sc.parallelize([aggregate]), sc.parallelize([aggregate])] 61 | build_id_count, submission_date_count = submit_aggregates(aggregates) 62 | 63 | assert build_id_count == 0, "Build id count should be 0, was {}".format(build_id_count) 64 | assert submission_date_count == 0, "submission date count should be 0, was {}".format(build_id_count) 65 | 66 | 67 | def test_null_arch_character_submit(sc): 68 | metric_info = ("SIMPLE_MEASURES_NULL_ARCHITECTURE", "", False) 69 | payload = {"sum": 4, "count": 2, "histogram": {2: 2}} 70 | key = ('20161111', 'nightly', '52', '20161111', '', "Firefox", "\x00", 'Windows', '2.4.21') 71 | aggregate = (key, {metric_info: payload}) 72 | 73 | aggregates = [sc.parallelize([aggregate]), sc.parallelize([aggregate])] 74 | build_id_count, submission_date_count = submit_aggregates(aggregates) 75 | 76 | assert build_id_count == 0, "Build id count should be 0, was {}".format(build_id_count) 77 | assert submission_date_count == 0, "submission date count should be 0, was {}".format(build_id_count) 78 | 79 | def assert_new_db_functions_backwards_compatible(): 80 | conn = _create_connection() 81 | cursor = conn.cursor() 82 | 83 | old_query = 'SELECT * FROM batched_get_metric(%s, %s, %s, %s, %s)' 84 | cursor.execute(old_query, ( 85 | 'submission_date', 'nightly', '41', [SUBMISSION_DATE_1.strftime(DATE_FMT)], 86 | json.dumps({'metric': 'GC_MAX_PAUSE_MS_2', 'child': 'true'}))) 87 | 88 | # Just 1 result since this is 1 date and not a keyed histogram 89 | result = cursor.fetchall() 90 | assert len(result) == 1, result 91 | 92 | new_query = 'SELECT * FROM batched_get_metric(%s, %s, %s, %s, %s, %s)' 93 | cursor.execute(new_query, ( 94 | 'submission_date', 'nightly', '41', [SUBMISSION_DATE_1.strftime(DATE_FMT)], 95 | json.dumps({'metric': 'GC_MAX_PAUSE_MS_2', 'child': 'true'}), 96 | json.dumps({'metric': 'DEVTOOLS_PERFTOOLS_RECORDING_FEATURES_USED'}))) 97 | 98 | # 1 for the non-keyed histogram, 1 for the 1 key of the keyed histogram 99 | # Note we don't actually use batched_get_metric for multiple metrics, 100 | # but this behavior is expected 101 | assert len(cursor.fetchall()) == 2 102 | 103 | 104 | def test_new_db_functions_backwards_compatible(aggregates): 105 | assert_new_db_functions_backwards_compatible() 106 | 107 | def test_aggregate_histograms(): 108 | conn = _create_connection() 109 | cursor = conn.cursor() 110 | cursor.execute(""" 111 | SELECT aggregate_histograms(t.histos) AS aggregates 112 | FROM (VALUES (ARRAY[1,1,1,1]), (ARRAY[1,1,1,1,1])) AS 
t(histos) 113 | """) 114 | res = cursor.fetchall() 115 | assert res == [([2, 2, 1, 2, 2],)] 116 | 117 | 118 | def test_cast_array_to_bigint(): 119 | conn = _create_connection() 120 | cursor = conn.cursor() 121 | cursor.execute("SELECT cast_array_to_bigint_safe(ARRAY[-9223372036854775809, 9223372036854775808, 12]);") 122 | res = cursor.fetchall() 123 | assert res == [([-9223372036854775808, 9223372036854775807, 12],)] 124 | 125 | 126 | def test_notice_logging_cursor(): 127 | conn = _create_connection() 128 | cursor = conn.cursor(cursor_factory=NoticeLoggingCursor) 129 | expected = ('py4j', 130 | 'WARNING', 131 | 'WARNING: Truncating positive value(s) too large for bigint in array: {9223372036854775808}') 132 | with LogCapture("py4j") as lc: 133 | cursor.execute("SELECT cast_array_to_bigint_safe(ARRAY[9223372036854775808]);") 134 | lc.check(expected) 135 | 136 | 137 | @pytest.fixture 138 | def mock_dataset(monkeypatch, spark): 139 | class Dataset: 140 | @staticmethod 141 | def from_source(*args, **kwargs): 142 | return Dataset() 143 | 144 | def where(self, *args, **kwargs): 145 | return self 146 | 147 | def records(self, *args, **kwargs): 148 | return spark.sparkContext.parallelize(generate_pings()) 149 | 150 | monkeypatch.setattr("mozaggregator.aggregator.Dataset", Dataset) 151 | 152 | 153 | def test_aggregation_cli(tmp_path, mock_dataset): 154 | test_creds = str(tmp_path / "creds") 155 | # generally points to the production credentials 156 | creds = {"DB_TEST_URL": "dbname=postgres user=postgres host=db"} 157 | with open(test_creds, "w") as f: 158 | json.dump(creds, f) 159 | 160 | result = CliRunner().invoke( 161 | run_aggregator, 162 | [ 163 | "--date", 164 | SUBMISSION_DATE_1.strftime('%Y%m%d'), 165 | "--channels", 166 | "nightly,beta", 167 | "--credentials-protocol", 168 | "file", 169 | "--credentials-bucket", 170 | "/", 171 | "--credentials-prefix", 172 | test_creds, 173 | "--num-partitions", 174 | 10, 175 | ], 176 | catch_exceptions=False, 177 | ) 178 | 179 | assert result.exit_code == 0, result.output 180 | assert_new_db_functions_backwards_compatible() 181 | 182 | 183 | def test_aggregation_cli_no_credentials_file(mock_dataset): 184 | result = CliRunner().invoke( 185 | run_aggregator, 186 | [ 187 | "--date", 188 | SUBMISSION_DATE_1.strftime('%Y%m%d'), 189 | "--channels", 190 | "nightly,beta", 191 | "--num-partitions", 192 | 10, 193 | ], 194 | env={ 195 | "DB_TEST_URL": "", 196 | "POSTGRES_DB": "postgres", 197 | "POSTGRES_USER": "postgres", 198 | "POSTGRES_PASS": "pass", 199 | "POSTGRES_HOST": "db", 200 | "POSTGRES_RO_HOST": "db", 201 | }, 202 | catch_exceptions=False, 203 | ) 204 | 205 | assert result.exit_code == 0, result.output 206 | assert_new_db_functions_backwards_compatible() 207 | 208 | 209 | def test_aggregation_cli_credentials_option(mock_dataset): 210 | empty_env = { 211 | "DB_TEST_URL": "", 212 | "POSTGRES_DB": "", 213 | "POSTGRES_USER": "", 214 | "POSTGRES_PASS": "", 215 | "POSTGRES_HOST": "", 216 | "POSTGRES_RO_HOST": "," 217 | } 218 | options = [ 219 | "--postgres-db", 220 | "postgres", 221 | "--postgres-user", 222 | "postgres", 223 | "--postgres-pass", 224 | "pass", 225 | "--postgres-host", 226 | "db", 227 | "--postgres-ro-host", 228 | "db" 229 | ] 230 | result = CliRunner().invoke( 231 | run_aggregator, 232 | [ 233 | "--date", 234 | SUBMISSION_DATE_1.strftime('%Y%m%d'), 235 | "--channels", 236 | "nightly,beta", 237 | "--num-partitions", 238 | 10, 239 | ] + options, 240 | env=empty_env, 241 | catch_exceptions=False, 242 | ) 243 | 244 | assert result.exit_code == 0, 
result.output 245 | assert_new_db_functions_backwards_compatible() 246 | 247 | # now test that missing an option will exit with non-zero 248 | result = CliRunner().invoke( 249 | run_aggregator, 250 | [ 251 | "--date", 252 | SUBMISSION_DATE_1.strftime('%Y%m%d'), 253 | "--channels", 254 | "nightly,beta", 255 | "--num-partitions", 256 | 10, 257 | ] + options[:2], # missing ro_host 258 | env=empty_env, 259 | catch_exceptions=False, 260 | ) 261 | assert result.exit_code == 1 262 | 263 | 264 | @runif_bigquery_testing_enabled 265 | def test_aggregation_cli_bigquery(tmp_path, bq_testing_table): 266 | test_creds = str(tmp_path / "creds") 267 | # generally points to the production credentials 268 | creds = {"DB_TEST_URL": "dbname=postgres user=postgres host=db"} 269 | with open(test_creds, "w") as f: 270 | json.dump(creds, f) 271 | 272 | result = CliRunner().invoke( 273 | run_aggregator, 274 | [ 275 | "--date", 276 | SUBMISSION_DATE_1.strftime('%Y%m%d'), 277 | "--channels", 278 | "nightly,beta", 279 | "--credentials-protocol", 280 | "file", 281 | "--credentials-bucket", 282 | "/", 283 | "--credentials-prefix", 284 | test_creds, 285 | "--num-partitions", 286 | 10, 287 | "--source", 288 | "bigquery", 289 | "--project-id", 290 | os.environ["PROJECT_ID"], 291 | "--dataset-id", 292 | "pytest_mozaggregator_test" 293 | ], 294 | catch_exceptions=False, 295 | ) 296 | 297 | assert result.exit_code == 0, result.output 298 | assert_new_db_functions_backwards_compatible() 299 | 300 | 301 | @runif_avro_testing_enabled 302 | def test_aggregation_cli_avro(tmp_path, avro_testing_files): 303 | test_creds = str(tmp_path / "creds") 304 | # generally points to the production credentials 305 | creds = {"DB_TEST_URL": "dbname=postgres user=postgres host=db"} 306 | with open(test_creds, "w") as f: 307 | json.dump(creds, f) 308 | 309 | result = CliRunner().invoke( 310 | run_aggregator, 311 | [ 312 | "--date", 313 | SUBMISSION_DATE_1.strftime('%Y%m%d'), 314 | "--channels", 315 | "nightly,beta", 316 | "--credentials-protocol", 317 | "file", 318 | "--credentials-bucket", 319 | "/", 320 | "--credentials-prefix", 321 | test_creds, 322 | "--num-partitions", 323 | 10, 324 | "--source", 325 | "avro", 326 | "--avro-prefix", 327 | avro_testing_files 328 | ], 329 | catch_exceptions=False, 330 | ) 331 | 332 | assert result.exit_code == 0, result.output 333 | assert_new_db_functions_backwards_compatible() -------------------------------------------------------------------------------- /tests/test_fixtures.py: -------------------------------------------------------------------------------- 1 | from google.cloud import bigquery, storage 2 | from utils import runif_avro_testing_enabled 3 | 4 | 5 | @runif_avro_testing_enabled 6 | def test_avro_matches_bigquery_resource(spark, bq_testing_table, avro_testing_files): 7 | """Test that testing resources for exporting into avro and loading into spark 8 | matches results from bigquery.""" 9 | 10 | bq_client = bigquery.Client() 11 | 12 | for table_name, table_id in bq_testing_table: 13 | df = spark.read.format("avro").load( 14 | f"{avro_testing_files}/*/{table_name}/*.avro" 15 | ) 16 | avro_counts = df.count() 17 | 18 | job = bq_client.query(f"SELECT count(*) as row_count FROM `{table_id}`") 19 | bq_counts = list(job.result())[0].row_count 20 | 21 | assert avro_counts > 0 22 | assert avro_counts == bq_counts 23 | -------------------------------------------------------------------------------- /tests/test_mobile.py: -------------------------------------------------------------------------------- 1 | 
import logging 2 | import os 3 | 4 | import pyspark 5 | import pytest 6 | from click.testing import CliRunner 7 | 8 | import mobile_dataset as d 9 | from mozaggregator.cli import run_mobile 10 | from mozaggregator.mobile import _aggregate_metrics, get_aggregates_dataframe 11 | from utils import runif_bigquery_testing_enabled, runif_avro_testing_enabled 12 | 13 | @pytest.fixture() 14 | def aggregates_rdd(sc): 15 | logger = logging.getLogger("py4j") 16 | logger.setLevel(logging.ERROR) 17 | 18 | raw_pings = list(d.generate_mobile_pings()) 19 | return _aggregate_metrics(sc.parallelize(raw_pings), num_partitions=10) 20 | 21 | 22 | @pytest.fixture() 23 | def aggregates(aggregates_rdd): 24 | return aggregates_rdd.collect() 25 | 26 | 27 | def test_count(aggregates): 28 | pings = list(d.generate_mobile_pings()) 29 | assert(len(pings) / d.NUM_PINGS_PER_DIMENSIONS == len(aggregates)) 30 | 31 | 32 | def test_keys(aggregates): 33 | for aggregate in aggregates: 34 | (submission_date, channel, version, build_id, application, 35 | architecture, os, os_version) = aggregate[0] 36 | assert(submission_date == d.meta_template["submissionDate"]) 37 | assert(channel in d.ping_dimensions["normalizedChannel"]) 38 | assert(version == d.meta_template["appVersion"]) 39 | assert(build_id == d.meta_template["appBuildId"]) 40 | assert(application == d.meta_template["appName"]) 41 | assert(architecture in d.ping_dimensions["arch"]) 42 | assert(os == d.meta_template["normalizedOs"]) 43 | assert(os_version in d.ping_dimensions["osversion"]) 44 | 45 | 46 | def test_histograms(aggregates): 47 | n = d.NUM_PINGS_PER_DIMENSIONS 48 | for aggregate in aggregates: 49 | for metric_data in aggregate[1].items(): 50 | metric_name, metric_key, process = metric_data[0] 51 | # A regular histogram. 52 | if metric_name in d.histograms_template.keys(): 53 | tpl = d.histograms_template[metric_name] 54 | assert(metric_data[1]['count'] == n) 55 | assert(metric_data[1]['sum'] == tpl['sum'] * n) 56 | for k, v in tpl['values'].items(): 57 | assert(metric_data[1]['histogram'][k] == v * n) 58 | # A keyed histogram. 59 | elif metric_name in d.keyed_histograms_template.keys(): 60 | tpl = d.keyed_histograms_template[metric_name] 61 | assert(metric_data[1]['count'] == n) 62 | assert(metric_data[1]['sum'] == tpl[metric_key]['sum'] * n) 63 | for k, v in tpl[metric_key]['values'].items(): 64 | assert(metric_data[1]['histogram'][k] == v * n) 65 | 66 | 67 | def test_scalars(aggregates): 68 | n = d.NUM_PINGS_PER_DIMENSIONS 69 | for aggregate in aggregates: 70 | for metric_data in aggregate[1].items(): 71 | metric_name, metric_key, process = metric_data[0] 72 | metric_name = metric_name.split('_')[1].lower() 73 | # A regular scalar. 74 | if metric_name in d.scalars_template.keys(): 75 | value = d.scalars_template[metric_name] 76 | # A keyed scalar. 
77 | elif metric_name in d.keyed_scalars_template.keys(): 78 | value = d.keyed_scalars_template[metric_name][metric_key] 79 | else: 80 | continue 81 | assert(metric_data[1]['count'] == n) 82 | assert(metric_data[1]['sum'] == value * n) 83 | assert(metric_data[1]['histogram'] == {str(value): n}) 84 | 85 | 86 | def test_mobile_aggregation_cli(tmp_path, monkeypatch, spark, aggregates_rdd): 87 | output = str(tmp_path / "output") 88 | 89 | class Dataset: 90 | @staticmethod 91 | def from_source(*args, **kwargs): 92 | return Dataset() 93 | 94 | def where(self, *args, **kwargs): 95 | return self 96 | 97 | def records(self, *args, **kwargs): 98 | return spark.sparkContext.parallelize(d.generate_mobile_pings()) 99 | 100 | monkeypatch.setattr("mozaggregator.mobile.Dataset", Dataset) 101 | 102 | result = CliRunner().invoke( 103 | run_mobile, 104 | [ 105 | "--date", 106 | # this date is ignored because we are monkeypatching the dataset 107 | "20190901", 108 | "--output", 109 | output, 110 | "--num-partitions", 111 | 10, 112 | ], 113 | catch_exceptions=False, 114 | ) 115 | 116 | assert result.exit_code == 0 117 | 118 | expect = get_aggregates_dataframe(spark, aggregates_rdd) 119 | actual = spark.read.parquet(output) 120 | 121 | assert expect.count() > 0 and actual.count() > 0 122 | assert expect.count() == actual.count() 123 | 124 | 125 | @runif_bigquery_testing_enabled 126 | def test_mobile_aggregation_cli_bigquery(tmp_path, spark, aggregates_rdd, bq_testing_table): 127 | output = str(tmp_path / "output") 128 | 129 | result = CliRunner().invoke( 130 | run_mobile, 131 | [ 132 | "--date", 133 | d.SUBMISSION_DATE_1.strftime('%Y%m%d'), 134 | "--output", 135 | output, 136 | "--num-partitions", 137 | 10, 138 | "--source", 139 | "bigquery", 140 | "--project-id", 141 | os.environ["PROJECT_ID"], 142 | "--dataset-id", 143 | "pytest_mozaggregator_test" 144 | ], 145 | catch_exceptions=False, 146 | ) 147 | assert len({f"submission_date={d.SUBMISSION_DATE_1.strftime('%Y%m%d')}"} - set(os.listdir(output))) == 0 148 | 149 | expect = get_aggregates_dataframe(spark, aggregates_rdd) 150 | actual = spark.read.parquet(output) 151 | 152 | assert expect.count() > 0 and actual.count() > 0 153 | assert expect.count() == actual.count() 154 | 155 | 156 | @runif_avro_testing_enabled 157 | def test_mobile_aggregation_cli_avro(tmp_path, spark, aggregates_rdd, avro_testing_files): 158 | output = str(tmp_path / "output") 159 | 160 | result = CliRunner().invoke( 161 | run_mobile, 162 | [ 163 | "--date", 164 | d.SUBMISSION_DATE_1.strftime('%Y%m%d'), 165 | "--output", 166 | output, 167 | "--num-partitions", 168 | 10, 169 | "--source", 170 | "avro", 171 | "--avro-prefix", 172 | avro_testing_files, 173 | ], 174 | catch_exceptions=False, 175 | ) 176 | 177 | assert result.exit_code == 0 178 | 179 | expect = get_aggregates_dataframe(spark, aggregates_rdd) 180 | actual = spark.read.parquet(output) 181 | 182 | assert expect.count() > 0 and actual.count() > 0 183 | assert expect.count() == actual.count() 184 | -------------------------------------------------------------------------------- /tests/test_parquet.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | import pyspark 6 | from click.testing import CliRunner 7 | 8 | import dataset as d 9 | import pytest 10 | from mozaggregator.cli import run_parquet 11 | from mozaggregator.parquet import _aggregate_metrics 12 | from utils import runif_bigquery_testing_enabled, runif_avro_testing_enabled 13 | 14 | 
15 | class testParquetAggregation(unittest.TestCase): 16 | 17 | def setUp(self): 18 | logger = logging.getLogger("py4j") 19 | logger.setLevel(logging.ERROR) 20 | 21 | self.sc = pyspark.SparkContext(master="local[*]") 22 | raw_pings = list(d.generate_pings()) 23 | build_id_aggs, submission_date_aggs = ( 24 | _aggregate_metrics(self.sc.parallelize(raw_pings), num_reducers=10)) 25 | self.build_id_aggs = build_id_aggs.collect() 26 | self.submission_date_aggs = submission_date_aggs.collect() 27 | 28 | def tearDown(self): 29 | self.sc.stop() 30 | 31 | def test_count(self): 32 | pings = list(d.generate_pings()) 33 | num_build_ids = len(d.ping_dimensions["build_id"]) 34 | self.assertEqual(len(pings) / d.NUM_PINGS_PER_DIMENSIONS, 35 | len(self.build_id_aggs)) 36 | self.assertEqual( 37 | len(pings) / d.NUM_PINGS_PER_DIMENSIONS / num_build_ids, 38 | len(self.submission_date_aggs)) 39 | 40 | 41 | @pytest.fixture() 42 | def raw_pings(): 43 | return list(d.generate_pings()) 44 | 45 | 46 | def test_parquet_aggregation_cli(tmp_path, monkeypatch, spark, raw_pings): 47 | output = str(tmp_path / "output") 48 | 49 | class Dataset: 50 | @staticmethod 51 | def from_source(*args, **kwargs): 52 | return Dataset() 53 | 54 | def where(self, *args, **kwargs): 55 | return self 56 | 57 | def records(self, *args, **kwargs): 58 | return spark.sparkContext.parallelize(raw_pings) 59 | 60 | monkeypatch.setattr("mozaggregator.parquet.Dataset", Dataset) 61 | 62 | result = CliRunner().invoke( 63 | run_parquet, 64 | [ 65 | "--date", 66 | "20190901", 67 | "--channels", 68 | "nightly,beta", 69 | "--output", 70 | output, 71 | "--num-partitions", 72 | 10, 73 | ], 74 | catch_exceptions=False, 75 | ) 76 | 77 | assert result.exit_code == 0 78 | 79 | df = spark.read.parquet(output) 80 | # 31104 is the empirical count from the generated pings 81 | assert df.count() > len(raw_pings) 82 | 83 | 84 | @runif_bigquery_testing_enabled 85 | def test_parquet_aggregation_cli_bigquery(tmp_path, spark, raw_pings, bq_testing_table): 86 | output = str(tmp_path / "output") 87 | 88 | result = CliRunner().invoke( 89 | run_parquet, 90 | [ 91 | "--date", 92 | d.SUBMISSION_DATE_1.strftime("%Y%m%d"), 93 | "--channels", 94 | "nightly,beta", 95 | "--output", 96 | output, 97 | "--num-partitions", 98 | 10, 99 | "--source", 100 | "bigquery", 101 | "--project-id", 102 | os.environ["PROJECT_ID"], 103 | "--dataset-id", 104 | "pytest_mozaggregator_test", 105 | ], 106 | catch_exceptions=False, 107 | ) 108 | 109 | assert result.exit_code == 0 110 | 111 | df = spark.read.parquet(output) 112 | assert df.count() > len(raw_pings) 113 | 114 | 115 | @runif_avro_testing_enabled 116 | def test_parquet_aggregation_cli_avro(tmp_path, spark, raw_pings, avro_testing_files): 117 | output = str(tmp_path / "output") 118 | 119 | result = CliRunner().invoke( 120 | run_parquet, 121 | [ 122 | "--date", 123 | d.SUBMISSION_DATE_1.strftime("%Y%m%d"), 124 | "--channels", 125 | "nightly,beta", 126 | "--output", 127 | output, 128 | "--num-partitions", 129 | 10, 130 | "--source", 131 | "avro", 132 | "--avro-prefix", 133 | avro_testing_files, 134 | ], 135 | catch_exceptions=False, 136 | ) 137 | 138 | assert result.exit_code == 0 139 | 140 | df = spark.read.parquet(output) 141 | assert df.count() > len(raw_pings) 142 | -------------------------------------------------------------------------------- /tests/test_trim_db.py: -------------------------------------------------------------------------------- 1 | from click.testing import CliRunner 2 | 3 | import pytest 4 | from dataset import ( 5 | 
BUILD_ID_1, 6 | BUILD_ID_2, 7 | DATE_FMT, 8 | SUBMISSION_DATE_1, 9 | SUBMISSION_DATE_2, 10 | generate_pings, 11 | ) 12 | from mozaggregator import trim_db 13 | from mozaggregator.aggregator import _aggregate_metrics 14 | from mozaggregator.db import clear_db, submit_aggregates 15 | 16 | 17 | @pytest.fixture() 18 | def aggregates(sc): 19 | raw_pings = list(generate_pings()) 20 | aggregates = _aggregate_metrics(sc.parallelize(raw_pings), num_reducers=10) 21 | submit_aggregates(aggregates) 22 | return aggregates 23 | 24 | 25 | @pytest.fixture(autouse=True) 26 | def clear_state(): 27 | clear_db() 28 | 29 | 30 | @pytest.fixture() 31 | def conn(aggregates): 32 | conn = trim_db.create_connection("postgres", "postgres", "pass", "db") 33 | yield conn 34 | conn.close() 35 | 36 | 37 | def test_extract_ds_nodash(): 38 | cases = [ 39 | "build_id_beta_39_", 40 | "build_id_beta_1_1", 41 | "build_id_aurora_0_1000001", 42 | "build_id_nightly_72_20191119", 43 | ] 44 | expect = ["", "1", "1000001", "20191119"] 45 | actual = [trim_db.extract_ds_nodash(case) for case in cases] 46 | assert actual == expect 47 | 48 | 49 | def test_retention_date_range(): 50 | base = "20191110" 51 | period = 3 52 | buffer = 0 53 | expect = {"20191107", "20191108", "20191109"} 54 | assert trim_db.retention_date_range(base, period, buffer) == expect 55 | 56 | expect.add("20191110") 57 | assert trim_db.retention_date_range(base, period, buffer + 1) == expect 58 | 59 | 60 | def test_query_submission_date(conn): 61 | cursor = conn.cursor() 62 | dates = [SUBMISSION_DATE_1.strftime(DATE_FMT), SUBMISSION_DATE_2.strftime(DATE_FMT)] 63 | retain, trim = trim_db.query_submission_date(cursor, set(dates)) 64 | assert len(trim) == 0 65 | assert len(retain) > 2 # each tablename includes the dimensions 66 | assert {trim_db.extract_ds_nodash(table) for table in retain} == set(dates) 67 | assert all(["submission_date" in table for table in retain]) 68 | 69 | retain, trim = trim_db.query_submission_date(cursor, set(dates[:1])) 70 | assert {trim_db.extract_ds_nodash(table) for table in retain} == set(dates[:1]) 71 | assert {trim_db.extract_ds_nodash(table) for table in trim} == set(dates[1:]) 72 | 73 | 74 | def test_query_build_id(conn): 75 | cursor = conn.cursor() 76 | builds = [BUILD_ID_1.strftime(DATE_FMT), BUILD_ID_2.strftime(DATE_FMT)] 77 | retain, trim = trim_db.query_build_id(cursor, set(builds)) 78 | assert len(trim) == 0 79 | assert len(retain) > 2 # each tablename includes the dimensions 80 | assert {trim_db.extract_ds_nodash(table) for table in retain} == set(builds) 81 | assert all(["build_id" in table for table in retain]) 82 | 83 | retain, trim = trim_db.query_build_id(cursor, set(builds[:1])) 84 | assert {trim_db.extract_ds_nodash(table) for table in retain} == set(builds[:1]) 85 | assert {trim_db.extract_ds_nodash(table) for table in trim} == set(builds[1:]) 86 | 87 | 88 | def test_trim_tables(conn): 89 | cursor = conn.cursor() 90 | list_tables = "select tablename from pg_catalog.pg_tables where schemaname='public'" 91 | cursor.execute(list_tables) 92 | full = {row[0] for row in cursor.fetchall()} 93 | 94 | dates = [SUBMISSION_DATE_1.strftime(DATE_FMT), SUBMISSION_DATE_2.strftime(DATE_FMT)] 95 | retain, trim = trim_db.query_submission_date(cursor, set(dates[:1])) 96 | 97 | expect = full - trim 98 | trim_db.trim_tables(conn, trim) 99 | conn.commit() 100 | cursor.execute(list_tables) 101 | actual = {row[0] for row in cursor.fetchall()} 102 | 103 | assert expect == actual 104 | 105 | 106 | def test_trim_db_cli(conn): 107 | cursor = 
conn.cursor() 108 | list_tables = "select tablename from pg_catalog.pg_tables where schemaname='public'" 109 | cursor.execute(list_tables) 110 | before = {row[0] for row in cursor.fetchall()} 111 | 112 | result = CliRunner().invoke( 113 | trim_db.main, 114 | [ 115 | "--base-date", 116 | SUBMISSION_DATE_1.strftime(DATE_FMT), 117 | "--retention-period", 118 | 1, 119 | "--dry-run", 120 | "--postgres-db", 121 | "postgres", 122 | "--postgres-user", 123 | "postgres", 124 | "--postgres-pass", 125 | "pass", 126 | "--postgres-host", 127 | "db", 128 | ], 129 | catch_exceptions=False, 130 | ) 131 | assert result.exit_code == 0, result.output 132 | cursor.execute(list_tables) 133 | after = {row[0] for row in cursor.fetchall()} 134 | assert before == after 135 | 136 | result = CliRunner().invoke( 137 | trim_db.main, 138 | [ 139 | "--base-date", 140 | SUBMISSION_DATE_1.strftime(DATE_FMT), 141 | "--retention-period", 142 | 1, 143 | "--no-dry-run", 144 | "--postgres-db", 145 | "postgres", 146 | "--postgres-user", 147 | "postgres", 148 | "--postgres-pass", 149 | "pass", 150 | "--postgres-host", 151 | "db", 152 | ], 153 | catch_exceptions=False, 154 | ) 155 | assert result.exit_code == 0, result.output 156 | 157 | cursor.execute(list_tables) 158 | after = {row[0] for row in cursor.fetchall()} 159 | 160 | assert before != after 161 | assert all([SUBMISSION_DATE_2.strftime(DATE_FMT) in x for x in before - after]), ( 162 | before - after 163 | ) 164 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | import os 4 | import gzip 5 | from datetime import datetime 6 | 7 | 8 | def format_payload_bytes_decoded(ping): 9 | # fields are created in tests/dataset.py 10 | return { 11 | "normalized_app_name": ping["application"]["name"], 12 | "normalized_channel": ping["application"]["channel"], 13 | "normalized_os": ping["environment"]["system"]["os"]["name"], 14 | "metadata": { 15 | "uri": { 16 | "app_version": ping["application"]["version"], 17 | "app_build_id": ping["application"]["buildId"], 18 | "app_name": ping["application"]["name"], 19 | } 20 | }, 21 | "sample_id": ping["meta"]["sampleId"], 22 | "submission_timestamp": datetime.strptime( 23 | ping["meta"]["submissionDate"], "%Y%m%d" 24 | ).strftime("%Y-%m-%d %H:%M:%S"), 25 | "payload": base64.b64encode(gzip.compress(json.dumps(ping).encode())).decode(), 26 | } 27 | 28 | 29 | def format_payload_bytes_decoded_mobile(ping): 30 | """Format the mobile payload. This requires less metadata than the 31 | normal dataset because there is little to no filtering being done in the job. 32 | 33 | Fields are created in tests/mobile_dataset.py. 34 | """ 35 | return { 36 | "submission_timestamp": datetime.strptime( 37 | ping["meta"]["submissionDate"], "%Y%m%d" 38 | ).strftime("%Y-%m-%d %H:%M:%S"), 39 | "normalized_channel": ping["meta"]["normalizedChannel"], 40 | "metadata": { 41 | "uri": { 42 | "app_version": str(ping["meta"]["appVersion"]), 43 | "app_build_id": ping["meta"]["appBuildId"], 44 | "app_name": ping["meta"]["appName"], 45 | } 46 | }, 47 | "payload": base64.b64encode(gzip.compress(json.dumps(ping).encode())).decode(), 48 | } 49 | 50 | 51 | def runif_bigquery_testing_enabled(func): 52 | """A decorator that skips the test if the current environment is not set up for BigQuery testing. 
53 | 54 | @runif_bigquery_testing_enabled 55 | def test_my_function_that_uses_bigquery_spark_connector(table_fixture): 56 | ... 57 | """ 58 | # importing this at module scope will break test discoverability 59 | import pytest 60 | 61 | bigquery_testing_enabled = os.environ.get( 62 | "GOOGLE_APPLICATION_CREDENTIALS" 63 | ) and os.environ.get("PROJECT_ID") 64 | return pytest.mark.skipif( 65 | not bigquery_testing_enabled, 66 | reason="requires valid gcp credentials and project id", 67 | )(func) 68 | 69 | 70 | def runif_avro_testing_enabled(func): 71 | """A decorator that skips the test if the current environment is not set up for Avro testing. 72 | 73 | @runif_avro_testing_enabled 74 | def test_my_function_that_uses_gcs_connector(table_fixture): 75 | ... 76 | """ 77 | # importing this at module scope will break test discoverability 78 | import pytest 79 | 80 | avro_testing_enabled = ( 81 | os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") 82 | and os.environ.get("PROJECT_ID") 83 | and os.environ.get("TMP_AVRO_PATH") 84 | ) 85 | if os.environ.get("TMP_AVRO_PATH"): 86 | assert os.environ["TMP_AVRO_PATH"].startswith( 87 | "gs://" 88 | ), "temporary avro path must start with gs://" 89 | return pytest.mark.skipif( 90 | not avro_testing_enabled, 91 | reason="requires valid gcp credentials, project id, and temporary avro path", 92 | )(func) 93 | --------------------------------------------------------------------------------