├── .circleci └── config.yml ├── .dockerignore ├── .github └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── bin ├── add_gcp_creds ├── start_gke └── stop_gke ├── config └── airflow_local_settings.py ├── constraints.txt ├── dags ├── .airflowignore ├── __init__.py ├── adm_export.py ├── app_store_analytics.py ├── backfill.py ├── bhr_collection.py ├── bqetl_artifact_deployment.py ├── bqetl_backfill.py ├── bqetl_backfill_complete.py ├── bqetl_backfill_initiate.py ├── broken_site_report_ml.py ├── catalyst.py ├── clean_gke_pods.py ├── contextual_services_import.py ├── copy_deduplicate.py ├── crash_symbolication.py ├── dap_collector.py ├── dap_collector_ppa_dev.py ├── dap_collector_ppa_prod.py ├── dbt_daily.py ├── eam_slack_channels.py ├── eam_workday_everfi_integration.py ├── eam_workday_netsuite.py ├── experiment_auto_sizing.py ├── experiments_live.py ├── extensions.py ├── firefox_public_data_report.py ├── fxci_metric_export.py ├── fxci_pulse_export.py ├── ga4_site_metrics_summary_backfill.py ├── glam.py ├── glam_fenix.py ├── glam_fenix_release.py ├── glam_fog.py ├── glam_fog_release.py ├── glam_glean_imports.py ├── graphics_telemetry.py ├── jetstream.py ├── kpi_forecasting.py ├── looker.py ├── looker_usage_analysis.py ├── ltv.py ├── mad_server.py ├── merino_jobs.py ├── microsoft_store.py ├── operational_monitoring.py ├── operational_monitoring_backfill.py ├── partybal.py ├── play_store_export.py ├── probe_scraper.py ├── publish_bqetl_static.py ├── search_alert.py ├── search_forecasting.py ├── shredder.py ├── shredder_backfill.py ├── socorro_import.py ├── update_orphaning_dashboard_etl.py └── webcompat_kb.py ├── dataproc_bootstrap ├── README.md ├── airflow_gcp.sh ├── dataproc_init.sh ├── fx_usage_init.sh └── python-requirements.txt ├── docker-compose.yml ├── jobs ├── addon_recommender.sh ├── bugzilla_dataset.sh ├── ltv_daily.py ├── moz_dataproc_runner.py ├── mozaggregator_runner.py ├── pip-install.sh ├── socorro_import_crash_data.py ├── telemetry_batch_view.py ├── txp_pulse.sh └── update_orphaning_dashboard_etl.py ├── operators ├── __init__.py └── gcp_container_operator.py ├── plugins ├── mozmenu.py ├── timetable.py └── version_endpoint.py ├── pyproject.toml ├── requirements-dev.in ├── requirements-dev.txt ├── requirements-override.txt ├── requirements.in ├── requirements.txt ├── resources ├── dev_connections.json ├── dev_variables.json └── dev_webserver_config.py ├── tests ├── __init__.py ├── conftest.py ├── dags │ ├── __init__.py │ └── test_dag_validity.py ├── plugins │ ├── __init__.py │ ├── test_timetable.py │ └── test_version_endpoint.py └── utils │ ├── __init__.py │ ├── test_backfill.py │ └── test_tags.py └── utils ├── __init__.py ├── backfill.py ├── callbacks.py ├── constants.py ├── dataproc.py ├── gcp.py ├── glam_subdags ├── __init__.py ├── general.py ├── generate_query.py └── histograms.py ├── patched ├── __init__.py └── dataproc_hook.py ├── slack.py └── tags.py /.dockerignore: -------------------------------------------------------------------------------- 1 | # Git 2 | .git 3 | .gitignore 4 | 5 | # CI 6 | .circleci/ 7 | 8 | # Docker 9 | docker-compose.yml 10 | 11 | # cache 12 | __pycache__/ 13 | .pytest_cache/ 14 | 15 | # Airflow stuff 16 | logs/ 17 | 18 | # Virtual environment 19 | .env/ 20 | .venv/ 21 | venv/ 22 | 23 | # Airflow dev resources 24 | resources/ 25 | 26 | # IDE 27 | .idea 28 | .vscode/ 29 | -------------------------------------------------------------------------------- 
/.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | 7 | 8 | ## Related Tickets & Documents 9 | * DENG-XXXX 10 | * DSRE-XXXX 11 | 12 | 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.retry 3 | *undo-tree~ 4 | *.un~ 5 | venv/ 6 | .venv/ 7 | .env 8 | 9 | logs 10 | unittests.cfg 11 | airflow-webserver.pid 12 | airflow-worker.pid 13 | .config 14 | .viminfo 15 | .credentials 16 | .bash_history 17 | .mysql_history 18 | 19 | /dags/bigquery-etl-dags 20 | /dags/bigquery-etl-dags/* 21 | 22 | *~ 23 | 24 | .cache 25 | 26 | # IDE 27 | .idea 28 | .vscode/ -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Community Participation Guidelines 2 | 3 | This repository is governed by Mozilla's code of conduct and etiquette guidelines. 4 | For more details, please read the 5 | [Mozilla Community Participation Guidelines](https://www.mozilla.org/about/governance/policies/participation/). 6 | 7 | ## How to Report 8 | For more information on how to report violations of the Community Participation Guidelines, please read our '[How to Report](https://www.mozilla.org/about/governance/policies/participation/reporting/)' page. 9 | 10 | 16 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:slim-2.10.5-python3.11 2 | 3 | ARG PROJECT_DIR="/opt/airflow" 4 | 5 | ENV PYTHONUNBUFFERED=1 6 | ENV PYTHONPATH="$PYTHONPATH:$PROJECT_DIR" 7 | ENV AIRFLOW_HOME=$PROJECT_DIR 8 | 9 | USER root 10 | 11 | RUN apt-get update \ 12 | && apt-get install -y --no-install-recommends build-essential 13 | 14 | # Legacy docker image dependencies to be reviewed 15 | RUN apt-get install -y --no-install-recommends \ 16 | lsb-release gnupg curl && \ 17 | CLOUD_SDK_REPO="cloud-sdk-$(lsb_release -c -s)" && \ 18 | echo "deb https://packages.cloud.google.com/apt $CLOUD_SDK_REPO main" | tee -a /etc/apt/sources.list.d/google-cloud-cli.list && \ 19 | curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - && \ 20 | apt-get update -y && apt-get install google-cloud-cli -y && apt-get install google-cloud-cli-gke-gcloud-auth-plugin && \ 21 | apt-get remove -y lsb-release gnupg 22 | 23 | RUN apt-get autoremove -yqq --purge && \ 24 | apt-get clean && \ 25 | rm -rf /var/lib/apt/lists/* 26 | 27 | USER airflow 28 | 29 | COPY requirements.txt / 30 | RUN pip install --no-cache-dir -r /requirements.txt 31 | COPY requirements-override.txt / 32 | RUN pip install --no-cache-dir -r /requirements-override.txt --upgrade 33 | 34 | WORKDIR $PROJECT_DIR 35 | 36 | COPY . . 
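# Note: dependencies are installed in two passes on purpose -- requirements.txt first,
# then requirements-override.txt applied on top with --upgrade -- the same two-step the
# Makefile's pip-install-local target uses for a local virtualenv, so the image and a
# local environment resolve the same pins.
# Local builds go through docker-compose rather than a bare docker build (a sketch of
# the supported flow; see the Makefile targets below for the full commands):
#   make build   # docker-compose build
#   make up      # seeds .env, starts the stack, imports dev variables/connections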
37 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build clean clean-gke fixes gke help pip-compile pip-install-local stop test up 2 | 3 | 4 | help: 5 | @echo "Welcome to the Telemetry Airflow\n" 6 | @echo "The list of commands for local development:\n" 7 | @echo " build Builds the docker images for the docker-compose setup" 8 | @echo " clean Stops and removes all docker containers" 9 | @echo " fixes Applies Black and Ruff fixes to Python files" 10 | @echo " pip-compile Compile dependencies from 'requirements.in' into 'requirements.txt'" 11 | @echo " pip-install-local Install pip project requirements to your local environment" 12 | @echo " test Runs pytest" 13 | @echo " up Runs the whole stack, served under http://localhost:8080/" 14 | @echo " gke Create a sandbox gke cluster for testing" 15 | @echo " clean-gke Delete the sandbox gke cluster" 16 | @echo " stop Stops the docker containers" 17 | 18 | build: 19 | docker-compose build 20 | 21 | pip-compile: 22 | pip-compile --strip-extras --no-annotate requirements.in 23 | pip-compile --strip-extras --no-annotate requirements-dev.in 24 | 25 | fixes: 26 | ruff check . --fix 27 | ruff format . 28 | 29 | clean: stop 30 | docker-compose down --volumes 31 | docker-compose rm -f 32 | rm -rf logs/* 33 | if [ -f airflow-worker.pid ]; then rm airflow-worker.pid; fi 34 | 35 | pip-install-local: 36 | pip install -r requirements.txt -r requirements-dev.txt 37 | pip install -r requirements-override.txt --upgrade 38 | 39 | stop: 40 | docker-compose down 41 | docker-compose stop 42 | 43 | up: 44 | grep -qF 'AIRFLOW_UID=' .env || echo "AIRFLOW_UID=$$(id -u)" >> .env 45 | grep -qF 'FERNET_KEY=' .env || echo "FERNET_KEY=$$(python3 -c "from cryptography.fernet import Fernet; fernet_key = Fernet.generate_key(); print(fernet_key.decode())")" >> .env 46 | mkdir -p logs 47 | docker-compose up --wait 48 | docker-compose exec airflow-webserver airflow variables import dev_variables.json 49 | docker-compose exec airflow-webserver airflow connections import dev_connections.json 50 | 51 | gke: 52 | bin/start_gke 53 | 54 | clean-gke: 55 | bin/stop_gke 56 | 57 | test: 58 | python -m pytest tests/ 59 | -------------------------------------------------------------------------------- /bin/add_gcp_creds: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eou pipefail 4 | 5 | keyfile_path=${1?"Must specify keyfile path"} 6 | 7 | connection=${2:-"google_cloud_airflow_gke"} 8 | 9 | # Wait for full display until after checks 10 | set -x 11 | 12 | function format_gcp() { 13 | KEYFILE="$1" python3 - </dev/null 2>&1; then 18 | echo "cluster $CLUSTERNAME exists" 19 | else 20 | echo "cluster $CLUSTERNAME doesn't exist. creating..." 21 | gcloud container clusters create $CLUSTERNAME \ 22 | --enable-stackdriver-kubernetes \ 23 | -m n1-standard-4 \ 24 | --release-channel="stable" \ 25 | --enable-master-authorized-networks \ 26 | --master-authorized-networks="$MY_IP/32" \ 27 | --region us-west1 \ 28 | --num-nodes=1 \ 29 | --scopes="cloud-platform" \ 30 | --service-account="data-gke-sandbox-runner@moz-fx-data-gke-sandbox.iam.gserviceaccount.com" \ 31 | --project moz-fx-data-gke-sandbox 32 | 33 | fi 34 | 35 | echo "fetching secret..." 
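# The block below is what "fetching secret..." refers to: it reads the sandbox cluster
# credentials JSON from Secret Manager (secret gke-sandbox-creds in
# moz-fx-data-gke-sandbox) and registers it as the google_cloud_gke_sandbox connection
# inside the locally running Airflow webserver container, so GKEPodOperator tasks in a
# local `make up` stack can reach the sandbox cluster. It assumes the docker-compose
# stack is already running; otherwise the `docker ps --filter name=web -q` lookup comes
# back empty and the script only prints an error.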
36 | JSON_CREDS=$(gcloud secrets versions access latest --secret="gke-sandbox-creds" --project moz-fx-data-gke-sandbox) 37 | 38 | # Upload secret to local wtmo 39 | GCP_CONN_ID="google_cloud_gke_sandbox" 40 | 41 | CONTAINER_ID=$(docker ps --filter name=web -q) 42 | if [ -z "$CONTAINER_ID" ]; then 43 | echo "ERROR: Airflow container is likely not running (or docker). Run 'make up' to start airflow containers" 44 | else 45 | echo "Web container id is $CONTAINER_ID. Adding gcp connection..." 46 | docker exec $CONTAINER_ID airflow connections delete $GCP_CONN_ID 47 | 48 | docker exec $CONTAINER_ID airflow connections add $GCP_CONN_ID \ 49 | --conn-type google_cloud_platform \ 50 | --conn-extra "$JSON_CREDS" 51 | fi 52 | 53 | echo "visit https://go.corp.mozilla.com/wtmodev for more info" 54 | -------------------------------------------------------------------------------- /bin/stop_gke: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This is to be used by the Makefile for a stop gke target. 4 | 5 | set -eo pipefail 6 | 7 | USERNAME=$(gcloud config get-value account | awk -F"@" '{print $1}') 8 | CLUSTERNAME=$USERNAME-gke-sandbox 9 | 10 | if gcloud container clusters describe $CLUSTERNAME --region us-west1 --project moz-fx-data-gke-sandbox >/dev/null 2>&1; then 11 | gcloud container clusters delete $CLUSTERNAME --region us-west1 --quiet --project moz-fx-data-gke-sandbox 12 | else 13 | echo "cluster $CLUSTERNAME does not exist" 14 | fi 15 | -------------------------------------------------------------------------------- /config/airflow_local_settings.py: -------------------------------------------------------------------------------- 1 | STATE_COLORS = { 2 | "queued": "gray", 3 | "running": "lime", 4 | "success": "#0000FF", # Rather than "green". 5 | "restarting": "violet", 6 | "failed": "red", 7 | "up_for_retry": "gold", 8 | "up_for_reschedule": "turquoise", 9 | "upstream_failed": "orange", 10 | "skipped": "pink", # Rather than "hotpink". 11 | "deferred": "mediumpurple", 12 | "removed": "lightgrey", 13 | "scheduled": "tan", 14 | } 15 | -------------------------------------------------------------------------------- /dags/.airflowignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-airflow/1d80ddd23c83f50787e46fa43e0929d9a13185de/dags/.airflowignore -------------------------------------------------------------------------------- /dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-airflow/1d80ddd23c83f50787e46fa43e0929d9a13185de/dags/__init__.py -------------------------------------------------------------------------------- /dags/adm_export.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from airflow import DAG 4 | from airflow.hooks.base import BaseHook 5 | from airflow.providers.cncf.kubernetes.secret import Secret 6 | from airflow.sensors.external_task import ExternalTaskSensor 7 | 8 | from operators.gcp_container_operator import GKEPodOperator 9 | from utils.constants import ALLOWED_STATES, FAILED_STATES 10 | from utils.tags import Tag 11 | 12 | DOCS = """\ 13 | Daily data exports of contextual services data aggregates to adMarketplace. 14 | This is a complementary approach to the near real-time sharing that is implemented 15 | in gcp-ingestion. 
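Each run uploads a single gzipped CSV of aggregates for the run date over SFTP; for example, the run for `submission_date=2022-03-04` uploads `files/Aggregated-Query-Data-03042022.csv.gz` (see the templated `DST_PATH` below).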
16 | 17 | Relies on the [`bq2stfp` container defined in `docker-etl`](https://github.com/mozilla/docker-etl/tree/main/jobs/bq2sftp) 18 | and credentials stored in the `adm_sftp` connection. 19 | 20 | For more context, see https://bugzilla.mozilla.org/show_bug.cgi?id=1729524 21 | """ 22 | 23 | default_args = { 24 | "owner": "wstuckey@mozilla.com", 25 | "start_date": datetime.datetime(2019, 7, 25), 26 | "email": ["telemetry-alerts@mozilla.com", "wstuckey@mozilla.com"], 27 | "email_on_failure": True, 28 | "email_on_retry": True, 29 | "depends_on_past": False, 30 | # If a task fails, retry it once after waiting at least 5 minutes 31 | "retries": 1, 32 | "retry_delay": datetime.timedelta(minutes=5), 33 | } 34 | 35 | dag_name = "adm_export" 36 | tags = [Tag.ImpactTier.tier_3] 37 | 38 | adm_sftp_secret = Secret( 39 | deploy_type="env", 40 | deploy_target="SFTP_PASSWORD", 41 | secret="airflow-gke-secrets", 42 | key="adm_export_secret__sftp_password", 43 | ) 44 | 45 | with DAG( 46 | dag_name, 47 | schedule_interval="0 5 * * *", 48 | doc_md=DOCS, 49 | default_args=default_args, 50 | tags=tags, 51 | ) as dag: 52 | conn = BaseHook.get_connection("adm_sftp") 53 | 54 | adm_daily_aggregates_to_sftp = GKEPodOperator( 55 | task_id="adm_daily_aggregates_to_sftp", 56 | name="adm_daily_aggregates_to_sftp", 57 | # See https://github.com/mozilla/docker-etl/pull/28 58 | image="gcr.io/moz-fx-data-airflow-prod-88e0/bq2sftp_docker_etl:latest", 59 | project_id="moz-fx-data-airflow-gke-prod", 60 | gcp_conn_id="google_cloud_airflow_gke", 61 | cluster_name="workloads-prod-v1", 62 | location="us-west1", 63 | env_vars={ 64 | "SFTP_USERNAME": conn.login, 65 | "SFTP_HOST": conn.host, 66 | "SFTP_PORT": str(conn.port), 67 | "KNOWN_HOSTS": conn.extra_dejson["known_hosts"], 68 | "SRC_TABLE": "moz-fx-data-shared-prod.search_terms_derived.adm_daily_aggregates_v1", 69 | # The run for submission_date=2022-03-04 will be named: 70 | # Aggregated-Query-Data-03042022.csv.gz 71 | "DST_PATH": 'files/Aggregated-Query-Data-{{ macros.ds_format(ds, "%Y-%m-%d", "%m%d%Y") }}.csv.gz', 72 | "SUBMISSION_DATE": "{{ ds }}", 73 | }, 74 | secrets=[adm_sftp_secret], 75 | email=[ 76 | "telemetry-alerts@mozilla.com", 77 | ], 78 | ) 79 | 80 | wait_for_clients_daily_export = ExternalTaskSensor( 81 | task_id="wait_for_adm_daily_aggregates", 82 | external_dag_id="bqetl_search_terms_daily", 83 | external_task_id="search_terms_derived__adm_daily_aggregates__v1", 84 | execution_delta=datetime.timedelta(hours=2), 85 | mode="reschedule", 86 | allowed_states=ALLOWED_STATES, 87 | failed_states=FAILED_STATES, 88 | pool="DATA_ENG_EXTERNALTASKSENSOR", 89 | email_on_retry=False, 90 | ) 91 | 92 | wait_for_clients_daily_export >> adm_daily_aggregates_to_sftp 93 | -------------------------------------------------------------------------------- /dags/app_store_analytics.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | 5 | from operators.gcp_container_operator import GKEPodOperator 6 | from utils.gcp import bigquery_etl_query 7 | from utils.tags import Tag 8 | 9 | default_args = { 10 | "owner": "telemetry-alerts@mozilla.com", 11 | "depends_on_past": False, 12 | "start_date": datetime(2020, 6, 23), 13 | "email_on_failure": True, 14 | "email_on_retry": True, 15 | "retries": 1, 16 | "retry_delay": timedelta(minutes=30), 17 | "email": [ 18 | "telemetry-alerts@mozilla.com", 19 | ], 20 | } 21 | 22 | PROJECT_ID = "moz-fx-data-marketing-prod" 23 | EXPORT_DATASET_ID 
= "apple_app_store_exported" 24 | DERIVED_DATASET_ID = "apple_app_store" 25 | 26 | APPS = [ 27 | ("989804926", "Firefox"), 28 | ("1489407738", "VPN"), 29 | ("1295998056", "WebXRViewer"), 30 | ("1314000270", "Lockwise"), 31 | ("1073435754", "Klar"), 32 | ("1055677337", "Focus"), 33 | ] 34 | 35 | DERIVED_TABLES = [ 36 | "metrics_by_app_referrer", 37 | "metrics_by_app_version", 38 | "metrics_by_campaign", 39 | "metrics_by_platform", 40 | "metrics_by_platform_version", 41 | "metrics_by_region", 42 | "metrics_by_source", 43 | "metrics_by_storefront", 44 | "metrics_by_web_referrer", 45 | "metrics_total", 46 | ] 47 | 48 | tags = [Tag.ImpactTier.tier_1] 49 | 50 | with DAG( 51 | "app_store_analytics", 52 | default_args=default_args, 53 | max_active_runs=1, 54 | schedule_interval="@daily", 55 | tags=tags, 56 | ) as dag: 57 | export_date = "macros.ds_add(ds, -2)" # previous day data is incomplete 58 | tasks = [] 59 | 60 | # App exports are scheduled sequentially to avoid hit api rate limit 61 | for i, (app_id, app_name) in enumerate(APPS): 62 | commands = [ 63 | "yarn", 64 | "--silent", # silent to hide arguments from logs 65 | "export", 66 | "--username={{ var.value.app_store_connect_username }}", 67 | "--password={{ var.value.app_store_connect_password }}", 68 | f"--app-id={app_id}", 69 | f"--app-name={app_name}", 70 | f"--start-date={{{{ {export_date} }}}}", 71 | f"--project={PROJECT_ID}", 72 | f"--dataset={EXPORT_DATASET_ID}", 73 | ] 74 | 75 | # First task will clear the day partition so that the only data in the table partition 76 | # is the data written by the current dag run and does not include unrecognized apps 77 | if i == 0: 78 | commands.append("--overwrite") 79 | 80 | app_store_analytics = GKEPodOperator( 81 | task_id=f"app_store_analytics_{app_name}", 82 | arguments=commands, 83 | image="gcr.io/moz-fx-data-airflow-prod-88e0/app-store-analytics-export:latest", 84 | gcp_conn_id="google_cloud_airflow_gke", 85 | dag=dag, 86 | ) 87 | 88 | if i > 0: 89 | app_store_analytics.set_upstream(tasks[i - 1]) 90 | 91 | tasks.append(app_store_analytics) 92 | 93 | # derived tables combine all metrics per dimension 94 | for derived_table in DERIVED_TABLES: 95 | combined_metrics_query = bigquery_etl_query( 96 | task_id=f"{derived_table}_query", 97 | project_id=PROJECT_ID, 98 | dataset_id=DERIVED_DATASET_ID, 99 | sql_file_path=f"sql/moz-fx-data-marketing-prod/{DERIVED_DATASET_ID}/{derived_table}/query.sql", 100 | # Override default date partition because data has multiple day lag 101 | destination_table=( 102 | f"{derived_table}${{{{ macros.ds_format({export_date}, '%Y-%m-%d', '%Y%m%d') }}}}" 103 | ), 104 | date_partition_parameter=None, 105 | parameters=[f"submission_date:DATE:{{{{ {export_date} }}}}"], 106 | dag=dag, 107 | ) 108 | 109 | combined_metrics_query.set_upstream(tasks[-1]) 110 | -------------------------------------------------------------------------------- /dags/backfill.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from enum import Enum 3 | 4 | from airflow.decorators import dag 5 | from airflow.models import DagModel 6 | from airflow.models.param import Param 7 | from airflow.operators.bash import BashOperator 8 | from airflow.operators.empty import EmptyOperator 9 | from airflow.operators.python import BranchPythonOperator, PythonOperator 10 | from airflow.utils.trigger_rule import TriggerRule 11 | 12 | from utils.backfill import BackfillParams 13 | from utils.tags import Tag 14 | 15 | 16 | class TaskId(Enum): 17 | dry_run = 
"dry_run" 18 | real_deal = "real_deal" 19 | clear_tasks = "clear_tasks" 20 | do_not_clear_tasks = "do_not_clear_tasks" 21 | 22 | 23 | def dry_run_branch_callable(params: dict) -> str: 24 | backfill_params = BackfillParams(**params) 25 | return TaskId.dry_run.value if backfill_params.dry_run else TaskId.real_deal.value 26 | 27 | 28 | def clear_branch_callable(params: dict) -> str: 29 | backfill_params = BackfillParams(**params) 30 | return ( 31 | TaskId.clear_tasks.value 32 | if backfill_params.clear 33 | else TaskId.do_not_clear_tasks.value 34 | ) 35 | 36 | 37 | def param_validation(params: dict) -> bool: 38 | backfill_params = BackfillParams(**params) 39 | backfill_params.validate_date_range() 40 | validate_dag_exists(dag_name=backfill_params.dag_name) 41 | backfill_params.validate_regex_pattern() 42 | return True 43 | 44 | 45 | def validate_dag_exists(dag_name: str) -> None: 46 | dag_instance = DagModel.get_dagmodel(dag_name) 47 | if dag_instance is None: 48 | raise ValueError(f"`dag_name`={dag_name} does not exist") 49 | 50 | 51 | def generate_bash_command(params: dict) -> str: 52 | backfill_params = BackfillParams(**params) 53 | return " ".join(backfill_params.generate_backfill_command()) 54 | 55 | 56 | doc_md = """ 57 | # Backfill DAG 58 | 59 | #### Use with caution 60 | 61 | #### Some tips/notes: 62 | 63 | * Always use dry run first. Especially when using task regex 64 | * Date formats are 2020-03-01 or 2020-03-01T00:00:00 65 | * Dry run for clearing tasks will show you the list of tasks that will be cleared 66 | * Dry run for backfilling will not show the list, but is useful in testing for input errors 67 | 68 | """ 69 | 70 | 71 | @dag( 72 | dag_id="backfill", 73 | schedule_interval=None, 74 | doc_md=doc_md, 75 | catchup=False, 76 | start_date=datetime.datetime(2022, 11, 1), 77 | dagrun_timeout=datetime.timedelta(days=1), 78 | tags=[Tag.ImpactTier.tier_3, Tag.Triage.record_only], 79 | render_template_as_native_obj=True, 80 | params={ 81 | "dag_name": Param("dag_name", type="string"), 82 | "start_date": Param( 83 | (datetime.datetime.today() - datetime.timedelta(days=10)).isoformat(), 84 | type="string", 85 | format="date-time", 86 | ), 87 | "end_date": Param( 88 | datetime.datetime.today().isoformat(), type="string", format="date-time" 89 | ), 90 | "clear": Param(False, type="boolean"), 91 | "dry_run": Param(True, type="boolean"), 92 | "task_regex": Param(None, type=["string", "null"]), 93 | }, 94 | ) 95 | def backfill_dag(): 96 | param_validation_task = PythonOperator( 97 | task_id="param_validation", 98 | python_callable=param_validation, 99 | op_kwargs={"params": "{{ dag_run.conf }}"}, 100 | ) 101 | 102 | dry_run_branch_task = BranchPythonOperator( 103 | task_id="dry_run_parameter", 104 | python_callable=dry_run_branch_callable, 105 | op_kwargs={"params": "{{ dag_run.conf }}"}, 106 | trigger_rule=TriggerRule.ONE_SUCCESS, 107 | ) 108 | 109 | dry_run_task = EmptyOperator(task_id=TaskId.dry_run.value) 110 | real_deal_task = EmptyOperator(task_id=TaskId.real_deal.value) 111 | 112 | clear_branch_task = BranchPythonOperator( 113 | task_id="clear_parameter", 114 | python_callable=clear_branch_callable, 115 | op_kwargs={"params": "{{ dag_run.conf }}"}, 116 | trigger_rule=TriggerRule.ONE_SUCCESS, 117 | ) 118 | 119 | clear_tasks_task = EmptyOperator(task_id=TaskId.clear_tasks.value) 120 | do_not_clear_tasks_task = EmptyOperator(task_id=TaskId.do_not_clear_tasks.value) 121 | 122 | generate_backfill_command_task = PythonOperator( 123 | task_id="generate_backfill_command", 124 | 
python_callable=generate_bash_command, 125 | op_kwargs={"params": "{{ dag_run.conf }}"}, 126 | trigger_rule=TriggerRule.ONE_SUCCESS, 127 | ) 128 | 129 | backfill_task = BashOperator( 130 | task_id="execute_backfill", 131 | bash_command="{{ ti.xcom_pull(task_ids='generate_backfill_command') }}", 132 | ) 133 | 134 | ( 135 | param_validation_task 136 | >> dry_run_branch_task 137 | >> [dry_run_task, real_deal_task] 138 | >> clear_branch_task 139 | >> [clear_tasks_task, do_not_clear_tasks_task] 140 | >> generate_backfill_command_task 141 | >> backfill_task 142 | ) 143 | 144 | 145 | dag = backfill_dag() 146 | -------------------------------------------------------------------------------- /dags/bhr_collection.py: -------------------------------------------------------------------------------- 1 | """ 2 | A processing job on top of BHR (Background Hang Reporter) pings. 3 | 4 | More information about the pings: https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/data/backgroundhangmonitor-ping.html 5 | 6 | BHR is related to the Background Hang Monitor in Firefox Desktop. 7 | See: [bug 1675103](https://bugzilla.mozilla.org/show_bug.cgi?id=1675103) 8 | 9 | The [job source](https://github.com/mozilla/python_mozetl/blob/main/mozetl/bhr_collection) 10 | is maintained in the mozetl repository. 11 | 12 | * Migrated from Databricks and now running as a scheduled Dataproc task. * 13 | 14 | The resulting aggregations are used by the following service: 15 | https://fqueze.github.io/hang-stats/#date=[DATE]&row=0 16 | """ 17 | 18 | import datetime 19 | 20 | from airflow import DAG 21 | from airflow.operators.subdag import SubDagOperator 22 | from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook 23 | from airflow.sensors.external_task import ExternalTaskSensor 24 | 25 | from utils.constants import ALLOWED_STATES, FAILED_STATES 26 | from utils.dataproc import get_dataproc_parameters, moz_dataproc_pyspark_runner 27 | from utils.tags import Tag 28 | 29 | default_args = { 30 | "owner": "bewu@mozilla.com", 31 | "depends_on_past": False, 32 | "start_date": datetime.datetime(2020, 11, 26), 33 | "email": [ 34 | "telemetry-alerts@mozilla.com", 35 | "kik@mozilla.com", 36 | "dothayer@mozilla.com", 37 | "bewu@mozilla.com", 38 | ], 39 | "email_on_failure": True, 40 | "email_on_retry": True, 41 | "retries": 1, 42 | "retry_delay": datetime.timedelta(minutes=30), 43 | } 44 | 45 | tags = [Tag.ImpactTier.tier_1] 46 | 47 | with DAG( 48 | "bhr_collection", 49 | default_args=default_args, 50 | schedule_interval="0 5 * * *", 51 | doc_md=__doc__, 52 | tags=tags, 53 | ) as dag: 54 | wait_for_bhr_ping = ExternalTaskSensor( 55 | task_id="wait_for_copy_deduplicate", 56 | external_dag_id="copy_deduplicate", 57 | external_task_id="copy_deduplicate_all", 58 | execution_delta=datetime.timedelta(hours=4), 59 | check_existence=True, 60 | mode="reschedule", 61 | allowed_states=ALLOWED_STATES, 62 | failed_states=FAILED_STATES, 63 | pool="DATA_ENG_EXTERNALTASKSENSOR", 64 | email_on_retry=False, 65 | dag=dag, 66 | ) 67 | 68 | params = get_dataproc_parameters("google_cloud_airflow_dataproc") 69 | 70 | shared_runner_args = { 71 | "parent_dag_name": dag.dag_id, 72 | "image_version": "1.5-debian10", 73 | "default_args": default_args, 74 | "python_driver_code": "https://raw.githubusercontent.com/mozilla/python_mozetl/main/mozetl/bhr_collection/bhr_collection.py", 75 | "init_actions_uris": [ 76 | "gs://dataproc-initialization-actions/python/pip-install.sh" 77 | ], 78 | "additional_metadata": { 79 | "PIP_PACKAGES": 
"boto3==1.16.20 click==7.1.2 google-cloud-storage==2.7.0" 80 | }, 81 | "additional_properties": { 82 | "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar", 83 | "spark:spark.driver.memory": "30g", 84 | "spark:spark.executor.memory": "20g", 85 | }, 86 | "idle_delete_ttl": 14400, 87 | # supported machine types depends on dataproc image version: 88 | # https://cloud.google.com/dataproc/docs/concepts/compute/supported-machine-types 89 | "master_machine_type": "n2-highmem-8", 90 | "worker_machine_type": "n2-highmem-4", 91 | "gcp_conn_id": params.conn_id, 92 | "service_account": params.client_email, 93 | "storage_bucket": params.storage_bucket, 94 | } 95 | 96 | bhr_collection = SubDagOperator( 97 | task_id="bhr_collection", 98 | dag=dag, 99 | subdag=moz_dataproc_pyspark_runner( 100 | dag_name="bhr_collection", 101 | cluster_name="bhr-collection-main-{{ ds }}", 102 | job_name="bhr-collection-main", 103 | **shared_runner_args, 104 | num_workers=6, 105 | py_args=[ 106 | "--date", 107 | "{{ ds }}", 108 | "--sample-size", 109 | "0.5", 110 | "--use_gcs", 111 | "--thread-filter", 112 | "Gecko", 113 | "--output-tag", 114 | "main", 115 | ], 116 | ), 117 | ) 118 | 119 | bhr_collection_child = SubDagOperator( 120 | task_id="bhr_collection_child", 121 | dag=dag, 122 | subdag=moz_dataproc_pyspark_runner( 123 | dag_name="bhr_collection_child", 124 | cluster_name="bhr-collection-child-{{ ds }}", 125 | job_name="bhr-collection-child", 126 | **shared_runner_args, 127 | num_workers=12, 128 | py_args=[ 129 | "--date", 130 | "{{ ds }}", 131 | "--sample-size", 132 | "0.08", # there are usually 12-15x more hangs in the child process than main 133 | "--use_gcs", 134 | "--thread-filter", 135 | "Gecko_Child", 136 | "--output-tag", 137 | "child", 138 | ], 139 | ), 140 | ) 141 | 142 | wait_for_bhr_ping >> [ 143 | bhr_collection, 144 | bhr_collection_child, 145 | ] 146 | -------------------------------------------------------------------------------- /dags/bqetl_backfill_complete.py: -------------------------------------------------------------------------------- 1 | """DAG for completing registered bigquery-etl backfills.""" 2 | 3 | from datetime import datetime 4 | 5 | from airflow import DAG 6 | from airflow.decorators import task, task_group 7 | from airflow.providers.slack.operators.slack import SlackAPIPostOperator 8 | 9 | from operators.gcp_container_operator import GKEPodOperator 10 | from utils.tags import Tag 11 | 12 | AUTOMATION_SLACK_CHANNEL = "#dataops-alerts" 13 | SLACK_CONNECTION_ID = "overwatch_slack" 14 | DOCKER_IMAGE = "gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest" 15 | 16 | tags = [Tag.ImpactTier.tier_3] 17 | 18 | default_args = { 19 | "email": [ 20 | "ascholtz@mozilla.com", 21 | "bewu@mozilla.com", 22 | "wichan@mozilla.com", 23 | ] 24 | } 25 | 26 | with DAG( 27 | "bqetl_backfill_complete", 28 | doc_md=__doc__, 29 | tags=tags, 30 | schedule_interval="@hourly", 31 | start_date=datetime(2024, 1, 1), 32 | catchup=False, 33 | default_args=default_args, 34 | ) as dag: 35 | detect_backfills = GKEPodOperator( 36 | task_id="detect_backfills", 37 | name="detect_backfills", 38 | cmds=["sh", "-cx"], 39 | arguments=[ 40 | "script/bqetl backfill scheduled --status=Complete --json_path=/airflow/xcom/return.json --ignore-old-entries", 41 | ], 42 | image=DOCKER_IMAGE, 43 | do_xcom_push=True, 44 | ) 45 | 46 | @task_group 47 | def complete_backfill(backfill): 48 | @task 49 | def prepare_slack_complete_message(entry): 50 | watcher_text = " ".join( 51 | f"<@{watcher.split('@')[0]}>" for 
watcher in entry["watchers"] 52 | ) 53 | return ( 54 | f"{watcher_text} :hourglass_flowing_sand: Completing backfill of `{entry['qualified_table_name']}` has started - currently swapping backfill data into production. " 55 | f"A snapshot of the current production data will be kept as a backup for 30 days. " 56 | f"You will receive another notification once the completing step is done." 57 | ) 58 | 59 | notify_initiate = SlackAPIPostOperator( 60 | task_id="slack_notify_initate", 61 | username="Backfill", 62 | slack_conn_id=SLACK_CONNECTION_ID, 63 | text=prepare_slack_complete_message(backfill), 64 | channel=AUTOMATION_SLACK_CHANNEL, 65 | ) 66 | 67 | @task 68 | def prepare_pod_parameters(entry): 69 | return [f"script/bqetl backfill complete { entry['qualified_table_name'] }"] 70 | 71 | process_backfill = GKEPodOperator( 72 | task_id="process_backfill", 73 | name="process_backfill", 74 | cmds=["sh", "-cx"], 75 | arguments=prepare_pod_parameters(backfill), 76 | image=DOCKER_IMAGE, 77 | reattach_on_restart=True, 78 | ) 79 | 80 | @task 81 | def prepare_slack_processing_complete_parameters(entry): 82 | watcher_text = " ".join( 83 | f"<@{watcher.split('@')[0]}>" for watcher in entry["watchers"] 84 | ) 85 | 86 | return f"{watcher_text} :white_check_mark: Backfill is complete for `{entry['qualified_table_name']}`. Production data has been updated." 87 | 88 | notify_processing_complete = SlackAPIPostOperator( 89 | task_id="slack_notify_processing_complete", 90 | username="Backfill", 91 | slack_conn_id=SLACK_CONNECTION_ID, 92 | text=prepare_slack_processing_complete_parameters(backfill), 93 | channel=AUTOMATION_SLACK_CHANNEL, 94 | ) 95 | 96 | notify_initiate >> process_backfill >> notify_processing_complete 97 | 98 | backfill_groups = complete_backfill.expand(backfill=detect_backfills.output) 99 | -------------------------------------------------------------------------------- /dags/bqetl_backfill_initiate.py: -------------------------------------------------------------------------------- 1 | """DAG for initiating registered bigquery-etl backfills.""" 2 | 3 | from datetime import datetime 4 | 5 | from airflow import DAG 6 | from airflow.decorators import task, task_group 7 | from airflow.providers.slack.operators.slack import SlackAPIPostOperator 8 | 9 | from operators.gcp_container_operator import GKEPodOperator 10 | from utils.tags import Tag 11 | 12 | AUTOMATION_SLACK_CHANNEL = "#dataops-alerts" 13 | SLACK_CONNECTION_ID = "overwatch_slack" 14 | DOCKER_IMAGE = "gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest" 15 | 16 | tags = [Tag.ImpactTier.tier_3] 17 | 18 | default_args = { 19 | "email": [ 20 | "ascholtz@mozilla.com", 21 | "bewu@mozilla.com", 22 | "wichan@mozilla.com", 23 | ] 24 | } 25 | 26 | with DAG( 27 | "bqetl_backfill_initiate", 28 | doc_md=__doc__, 29 | tags=tags, 30 | schedule_interval="@hourly", 31 | start_date=datetime(2024, 1, 1), 32 | catchup=False, 33 | default_args=default_args, 34 | ) as dag: 35 | detect_backfills = GKEPodOperator( 36 | task_id="detect_backfills", 37 | name="detect_backfills", 38 | cmds=["sh", "-cx"], 39 | arguments=[ 40 | "script/bqetl backfill scheduled --status=Initiate --json_path=/airflow/xcom/return.json --ignore-old-entries" 41 | ], 42 | image=DOCKER_IMAGE, 43 | do_xcom_push=True, 44 | ) 45 | 46 | @task_group 47 | def initiate_backfill(backfill): 48 | @task 49 | def prepare_slack_initiate_message(entry): 50 | watcher_text = " ".join( 51 | f"<@{watcher.split('@')[0]}>" for watcher in entry["watchers"] 52 | ) 53 | return f"{watcher_text} 
:hourglass_flowing_sand: Initiating backfill scheduled for `{entry['qualified_table_name']}`. You will receive another notification once the backfill is done." 54 | 55 | notify_initiate = SlackAPIPostOperator( 56 | task_id="slack_notify_initate", 57 | username="Backfill", 58 | slack_conn_id=SLACK_CONNECTION_ID, 59 | text=prepare_slack_initiate_message(backfill), 60 | channel=AUTOMATION_SLACK_CHANNEL, 61 | ) 62 | 63 | @task 64 | def prepare_pod_parameters(entry): 65 | return [f"script/bqetl backfill initiate { entry['qualified_table_name'] }"] 66 | 67 | process_backfill = GKEPodOperator( 68 | task_id="process_backfill", 69 | name="process_backfill", 70 | cmds=["sh", "-cx"], 71 | arguments=prepare_pod_parameters(backfill), 72 | image=DOCKER_IMAGE, 73 | reattach_on_restart=True, 74 | ) 75 | 76 | @task 77 | def prepare_slack_processing_complete_parameters(entry): 78 | project, dataset, table = entry["qualified_table_name"].split(".") 79 | backfill_table_id = ( 80 | f"{dataset}__{table}_{entry['entry_date'].replace('-', '_')}" 81 | ) 82 | staging_location = ( 83 | f"{project}.backfills_staging_derived.{backfill_table_id}" 84 | ) 85 | watcher_text = " ".join( 86 | f"<@{watcher.split('@')[0]}>" for watcher in entry["watchers"] 87 | ) 88 | 89 | return ( 90 | f"{watcher_text} :white_check_mark: Backfill processing is done. Staging location: `{staging_location}`. " 91 | "Please validate that your data has changed as you expect and complete your backfill by updating the Backfill entry's status to Complete in the bigquery-etl repository. " 92 | "Note that the staging table will expire in 30 days, so the backfill must be completed within 30 days." 93 | ) 94 | 95 | notify_processing_complete = SlackAPIPostOperator( 96 | task_id="slack_notify_processing_complete", 97 | username="Backfill", 98 | slack_conn_id=SLACK_CONNECTION_ID, 99 | text=prepare_slack_processing_complete_parameters(backfill), 100 | channel=AUTOMATION_SLACK_CHANNEL, 101 | ) 102 | 103 | notify_initiate >> process_backfill >> notify_processing_complete 104 | 105 | backfill_groups = initiate_backfill.expand(backfill=detect_backfills.output) 106 | -------------------------------------------------------------------------------- /dags/broken_site_report_ml.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from airflow import DAG 4 | 5 | from operators.gcp_container_operator import GKEPodOperator 6 | from utils.tags import Tag 7 | 8 | DOCS = """ 9 | ### ML classification of broken site reports 10 | 11 | #### Description 12 | 13 | Runs a Docker image that does the following: 14 | 15 | 1. Translates incoming broken sites reports to English with ML.TRANSLATE. 16 | 2. Classifies translated reports as valid/invalid using [bugbug](https://github.com/mozilla/bugbug). 17 | 3. Stores translation and classification results in BQ. 18 | 19 | The container is defined in 20 | [docker-etl](https://github.com/mozilla/docker-etl/tree/main/jobs/broken-site-report-ml) 21 | 22 | *Triage notes* 23 | 24 | As long as the most recent DAG run is successful this job doesn't need to be triaged. 
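For local debugging, the job's entry point can be run directly from the same container image the DAG uses (a sketch only; GCP credential wiring is omitted and is needed for BigQuery and ML.TRANSLATE access):

```
docker run --rm \
  gcr.io/moz-fx-data-airflow-prod-88e0/broken-site-report-ml_docker_etl:latest \
  python broken_site_report_ml/main.py \
    --bq_project_id moz-fx-dev-dschubert-wckb \
    --bq_dataset_id webcompat_user_reports
```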
25 | 26 | #### Owner 27 | 28 | kberezina@mozilla.com 29 | """ 30 | 31 | default_args = { 32 | "owner": "kberezina@mozilla.com", 33 | "email": ["kberezina@mozilla.com", "webcompat-internal@mozilla.org"], 34 | "depends_on_past": False, 35 | "start_date": datetime(2023, 12, 21), 36 | "email_on_failure": True, 37 | } 38 | 39 | 40 | tags = [ 41 | Tag.ImpactTier.tier_2, 42 | ] 43 | 44 | every_fifteen_minutes = "*/15 * * * *" 45 | 46 | with DAG( 47 | "broken_site_report_ml", 48 | default_args=default_args, 49 | max_active_runs=1, 50 | doc_md=DOCS, 51 | schedule_interval=every_fifteen_minutes, 52 | tags=tags, 53 | catchup=False, 54 | ) as dag: 55 | broken_site_report_ml = GKEPodOperator( 56 | task_id="broken_site_report_ml", 57 | arguments=[ 58 | "python", 59 | "broken_site_report_ml/main.py", 60 | "--bq_project_id", 61 | "moz-fx-dev-dschubert-wckb", 62 | "--bq_dataset_id", 63 | "webcompat_user_reports", 64 | ], 65 | image="gcr.io/moz-fx-data-airflow-prod-88e0/broken-site-report-ml_docker_etl:latest", 66 | dag=dag, 67 | ) 68 | -------------------------------------------------------------------------------- /dags/catalyst.py: -------------------------------------------------------------------------------- 1 | """ 2 | DAG to schedule generation of performance reports for recently completed nimbus experiments. 3 | 4 | See the [catalyst repository](https://github.com/mozilla/catalyst). 5 | 6 | *Triage notes* 7 | 8 | This app will perform some bigquery queries, and generate statistical reports based on that data which are 9 | then published to https://protosaur.dev/perf-reports/index.html. 10 | 11 | Generally, there should be minimal triage necessary for failures unless it's related to infrastructure issues. 12 | Any failures related to the app execution itself will be taken care of directly by the performance team. 
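The DAG runs at 04:00 UTC and only starts once the upstream datasets it reads are in place: each `ExternalTaskSensor` below uses an `execution_delta` equal to the gap between this DAG's schedule and the upstream DAG's, so, for example, `execution_delta=timedelta(hours=3)` makes the `copy_deduplicate` sensors wait on that DAG's 01:00 UTC run for the same day.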
13 | 14 | """ 15 | 16 | from datetime import datetime, timedelta 17 | 18 | from airflow import DAG 19 | from airflow.sensors.external_task import ExternalTaskSensor 20 | 21 | from operators.gcp_container_operator import GKEPodOperator 22 | from utils.constants import ALLOWED_STATES, FAILED_STATES 23 | from utils.tags import Tag 24 | 25 | default_args = { 26 | "owner": "dpalmeiro@mozilla.com", 27 | "email": [ 28 | "dpalmeiro@mozilla.com", 29 | ], 30 | "depends_on_past": False, 31 | "start_date": datetime(2025, 5, 5), 32 | "email_on_failure": True, 33 | "email_on_retry": True, 34 | "retries": 1, 35 | "retry_delay": timedelta(minutes=30), 36 | } 37 | 38 | tags = [Tag.ImpactTier.tier_2] 39 | 40 | with DAG( 41 | "catalyst", 42 | default_args=default_args, 43 | schedule_interval="0 4 * * *", 44 | doc_md=__doc__, 45 | tags=tags, 46 | ) as dag: 47 | # Built from repo https://github.com/mozilla/catalyst 48 | catalyst_image = "gcr.io/moz-fx-data-experiments/catalyst:latest" 49 | 50 | catalyst_run = GKEPodOperator( 51 | task_id="catalyst_run", 52 | name="catalyst_run", 53 | image=catalyst_image, 54 | email=default_args["email"], 55 | dag=dag, 56 | ) 57 | 58 | wait_for_clients_daily_export = ExternalTaskSensor( 59 | task_id="wait_for_clients_daily", 60 | external_dag_id="bqetl_main_summary", 61 | external_task_id="telemetry_derived__clients_daily__v6", 62 | execution_delta=timedelta(hours=2), 63 | mode="reschedule", 64 | allowed_states=ALLOWED_STATES, 65 | failed_states=FAILED_STATES, 66 | pool="DATA_ENG_EXTERNALTASKSENSOR", 67 | email_on_retry=False, 68 | dag=dag, 69 | ) 70 | 71 | wait_for_search_clients_daily = ExternalTaskSensor( 72 | task_id="wait_for_search_clients_daily", 73 | external_dag_id="bqetl_search", 74 | external_task_id="search_derived__search_clients_daily__v8", 75 | execution_delta=timedelta(hours=1), 76 | mode="reschedule", 77 | allowed_states=ALLOWED_STATES, 78 | failed_states=FAILED_STATES, 79 | pool="DATA_ENG_EXTERNALTASKSENSOR", 80 | email_on_retry=False, 81 | dag=dag, 82 | ) 83 | 84 | wait_for_bq_events = ExternalTaskSensor( 85 | task_id="wait_for_bq_main_events", 86 | external_dag_id="copy_deduplicate", 87 | external_task_id="bq_main_events", 88 | execution_delta=timedelta(hours=3), 89 | mode="reschedule", 90 | allowed_states=ALLOWED_STATES, 91 | failed_states=FAILED_STATES, 92 | pool="DATA_ENG_EXTERNALTASKSENSOR", 93 | email_on_retry=False, 94 | dag=dag, 95 | ) 96 | 97 | wait_for_copy_deduplicate_events = ExternalTaskSensor( 98 | task_id="wait_for_event_events", 99 | external_dag_id="copy_deduplicate", 100 | external_task_id="event_events", 101 | execution_delta=timedelta(hours=3), 102 | mode="reschedule", 103 | allowed_states=ALLOWED_STATES, 104 | failed_states=FAILED_STATES, 105 | pool="DATA_ENG_EXTERNALTASKSENSOR", 106 | email_on_retry=False, 107 | dag=dag, 108 | ) 109 | 110 | catalyst_run.set_upstream( 111 | [ 112 | wait_for_clients_daily_export, 113 | wait_for_search_clients_daily, 114 | wait_for_bq_events, 115 | wait_for_copy_deduplicate_events, 116 | ] 117 | ) 118 | -------------------------------------------------------------------------------- /dags/clean_gke_pods.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | 5 | from operators.gcp_container_operator import GKEPodOperator 6 | from utils.tags import Tag 7 | 8 | docs = """ 9 | ### Clean GKE Pods 10 | 11 | Failures can be ignored during Airflow Triage. This job is idempotent. 
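To see the kind of pods the job removes, the completed pods on the shared cluster can be listed by hand (a sketch; it assumes gcloud access to the cluster project and does not apply the job's retention-days filter):

```
gcloud container clusters get-credentials workloads-prod-v1 \
    --region us-west1 --project moz-fx-data-airflow-gke-prod
kubectl get pods --all-namespaces --field-selector=status.phase=Succeeded
```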
12 | 13 | Built from cloudops-infra repo, projects/airflow/pod-clean 14 | 15 | #### Purpose 16 | 17 | This DAG executes a GKEPodOperator to clean out old completed pods 18 | on the shared workloads-prod-v1 gke cluster. We need to do this periodically 19 | because GCP has a 1500 object limit quota. 20 | 21 | #### Owner 22 | 23 | hwoo@mozilla.com 24 | """ 25 | 26 | 27 | default_args = { 28 | "owner": "hwoo@mozilla.com", 29 | "depends_on_past": False, 30 | "start_date": datetime(2019, 12, 26), 31 | "email_on_failure": True, 32 | "email_on_retry": True, 33 | "retries": 2, 34 | "retry_delay": timedelta(minutes=30), 35 | } 36 | 37 | tags = [ 38 | Tag.ImpactTier.tier_3, 39 | Tag.Triage.no_triage, 40 | ] 41 | 42 | dag = DAG( 43 | "clean-gke-pods", 44 | default_args=default_args, 45 | schedule_interval="@daily", 46 | doc_md=docs, 47 | tags=tags, 48 | ) 49 | 50 | # docker_image = 'us-west1-docker.pkg.dev/moz-fx-data-airflow-prod-88e0/data-science-artifacts/gke-pod-clean:1.3' 51 | docker_image = "gcr.io/moz-fx-data-airflow-prod-88e0/gke-pod-clean:1.4" 52 | gke_cluster_name = "workloads-prod-v1" 53 | gke_location = "us-west1" 54 | project_id = "moz-fx-data-airflow-gke-prod" 55 | 56 | docker_args = [ 57 | "--project", 58 | project_id, 59 | "--gke-cluster", 60 | gke_cluster_name, 61 | "--region", 62 | gke_location, 63 | "--retention-days", 64 | "4", 65 | ] 66 | 67 | clean_gke_pods = GKEPodOperator( 68 | task_id="clean-gke-pods", 69 | name="clean-gke-pods", 70 | image=docker_image, 71 | arguments=docker_args, 72 | dag=dag, 73 | ) 74 | -------------------------------------------------------------------------------- /dags/contextual_services_import.py: -------------------------------------------------------------------------------- 1 | """ 2 | Runs a Docker image that imports Quicksuggest suggestions from Remote Settings to BigQuery. 3 | 4 | See the [`quicksuggest2bq`](https://github.com/mozilla/docker-etl/tree/main/jobs/quicksuggest2bq) 5 | docker image defined in `docker-etl`. 
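Suggestions are loaded into `moz-fx-data-shared-prod.search_terms_derived.remotesettings_suggestions_v1` (the project and table passed to the job below). A quick spot check after a run (a sketch; assumes the `bq` CLI and read access to the table):

```
bq query --use_legacy_sql=false \
  'SELECT COUNT(*) FROM `moz-fx-data-shared-prod.search_terms_derived.remotesettings_suggestions_v1`'
```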
6 | """ 7 | 8 | from datetime import datetime, timedelta 9 | 10 | from airflow import DAG 11 | 12 | from operators.gcp_container_operator import GKEPodOperator 13 | from utils.tags import Tag 14 | 15 | default_args = { 16 | "owner": "wstuckey@mozilla.com", 17 | "depends_on_past": False, 18 | "start_date": datetime(2021, 11, 18), 19 | "email_on_failure": True, 20 | "email_on_retry": True, 21 | "retries": 2, 22 | "retry_delay": timedelta(minutes=30), 23 | } 24 | 25 | project_id = "moz-fx-data-shared-prod" 26 | table_id = "search_terms_derived.remotesettings_suggestions_v1" 27 | 28 | tags = [Tag.ImpactTier.tier_1] 29 | 30 | with DAG( 31 | "contextual_services_import", 32 | default_args=default_args, 33 | doc_md=__doc__, 34 | schedule_interval="@daily", 35 | tags=tags, 36 | ) as dag: 37 | quicksuggest2bq = GKEPodOperator( 38 | task_id="quicksuggest2bq", 39 | arguments=[ 40 | "python", 41 | "quicksuggest2bq/main.py", 42 | "--destination-project", 43 | project_id, 44 | "--destination-table-id", 45 | table_id, 46 | ], 47 | image="gcr.io/moz-fx-data-airflow-prod-88e0/quicksuggest2bq_docker_etl:latest", 48 | gcp_conn_id="google_cloud_airflow_gke", 49 | dag=dag, 50 | email=[ 51 | "wstuckey@mozilla.com", 52 | "ctroy@mozilla.com", 53 | ], 54 | ) 55 | -------------------------------------------------------------------------------- /dags/crash_symbolication.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generates "Weekly report of modules with missing symbols in crash reports" and sends it to the Stability list. 3 | 4 | Generates correlations data for top crashers. 5 | 6 | Uses crash report data imported from Socorro. 7 | """ 8 | import datetime 9 | 10 | from airflow import DAG 11 | from airflow.operators.subdag import SubDagOperator 12 | from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook 13 | from airflow.sensors.external_task import ExternalTaskSensor 14 | 15 | from utils.constants import ALLOWED_STATES, FAILED_STATES 16 | from utils.dataproc import get_dataproc_parameters, moz_dataproc_pyspark_runner 17 | from utils.tags import Tag 18 | 19 | default_args = { 20 | "owner": "srose@mozilla.com", 21 | "depends_on_past": False, 22 | "start_date": datetime.datetime(2020, 11, 26), 23 | "email": [ 24 | "mcastelluccio@mozilla.com", 25 | "srose@mozilla.com", 26 | "telemetry-alerts@mozilla.com", 27 | ], 28 | "email_on_failure": True, 29 | "email_on_retry": True, 30 | "retries": 2, 31 | "retry_delay": datetime.timedelta(minutes=30), 32 | } 33 | 34 | PIP_PACKAGES = [ 35 | "boto3==1.16.20", 36 | "scipy==1.5.4", 37 | "google-cloud-storage==2.7.0", 38 | ] 39 | 40 | tags = [Tag.ImpactTier.tier_3] 41 | 42 | with DAG( 43 | "crash_symbolication", 44 | default_args=default_args, 45 | # dag runs daily but tasks only run on certain days 46 | schedule_interval="0 5 * * *", 47 | tags=tags, 48 | doc_md=__doc__, 49 | ) as dag: 50 | # modules_with_missing_symbols sends results as email 51 | ses_aws_conn_id = "aws_data_iam_ses" 52 | ses_access_key, ses_secret_key, _ = AwsBaseHook( 53 | aws_conn_id=ses_aws_conn_id, client_type="s3" 54 | ).get_credentials() 55 | 56 | wait_for_socorro_import = ExternalTaskSensor( 57 | task_id="wait_for_socorro_import", 58 | external_dag_id="socorro_import", 59 | external_task_id="bigquery_load", 60 | check_existence=True, 61 | execution_delta=datetime.timedelta(hours=5), 62 | mode="reschedule", 63 | allowed_states=ALLOWED_STATES, 64 | failed_states=FAILED_STATES, 65 | pool="DATA_ENG_EXTERNALTASKSENSOR", 66 | email_on_retry=False, 67 
| ) 68 | 69 | params = get_dataproc_parameters("google_cloud_airflow_dataproc") 70 | 71 | modules_with_missing_symbols = SubDagOperator( 72 | task_id="modules_with_missing_symbols", 73 | subdag=moz_dataproc_pyspark_runner( 74 | parent_dag_name=dag.dag_id, 75 | image_version="1.5-debian10", 76 | dag_name="modules_with_missing_symbols", 77 | default_args=default_args, 78 | cluster_name="modules-with-missing-symbols-{{ ds }}", 79 | job_name="modules-with-missing-symbols", 80 | python_driver_code="https://raw.githubusercontent.com/mozilla/python_mozetl/main/mozetl/symbolication/modules_with_missing_symbols.py", 81 | init_actions_uris=[ 82 | "gs://dataproc-initialization-actions/python/pip-install.sh" 83 | ], 84 | additional_metadata={"PIP_PACKAGES": " ".join(PIP_PACKAGES)}, 85 | additional_properties={ 86 | "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar", 87 | "spark-env:AWS_ACCESS_KEY_ID": ses_access_key, 88 | "spark-env:AWS_SECRET_ACCESS_KEY": ses_secret_key, 89 | }, 90 | py_args=["--run-on-days", "0", "--date", "{{ ds }}"], # run monday 91 | idle_delete_ttl=14400, 92 | num_workers=2, 93 | worker_machine_type="n1-standard-4", 94 | gcp_conn_id=params.conn_id, 95 | service_account=params.client_email, 96 | storage_bucket=params.storage_bucket, 97 | ), 98 | ) 99 | 100 | top_signatures_correlations = SubDagOperator( 101 | task_id="top_signatures_correlations", 102 | subdag=moz_dataproc_pyspark_runner( 103 | parent_dag_name=dag.dag_id, 104 | image_version="1.5-debian10", 105 | dag_name="top_signatures_correlations", 106 | default_args=default_args, 107 | cluster_name="top-signatures-correlations-{{ ds }}", 108 | job_name="top-signatures-correlations", 109 | python_driver_code="https://raw.githubusercontent.com/mozilla/python_mozetl/main/mozetl/symbolication/top_signatures_correlations.py", 110 | init_actions_uris=[ 111 | "gs://dataproc-initialization-actions/python/pip-install.sh" 112 | ], 113 | additional_metadata={"PIP_PACKAGES": " ".join(PIP_PACKAGES)}, 114 | additional_properties={ 115 | "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar", 116 | }, 117 | py_args=[ 118 | # run monday, wednesday, and friday 119 | "--run-on-days", 120 | "0", 121 | "2", 122 | "4", 123 | "--date", 124 | "{{ ds }}", 125 | ], 126 | idle_delete_ttl=14400, 127 | num_workers=2, 128 | worker_machine_type="n1-standard-8", 129 | gcp_conn_id=params.conn_id, 130 | service_account=params.client_email, 131 | storage_bucket=params.storage_bucket, 132 | ), 133 | ) 134 | 135 | wait_for_socorro_import >> modules_with_missing_symbols 136 | wait_for_socorro_import >> top_signatures_correlations 137 | -------------------------------------------------------------------------------- /dags/dap_collector.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | 5 | from operators.gcp_container_operator import GKEPodOperator 6 | from utils.tags import Tag 7 | 8 | DOCS = """ 9 | ### DAP Collector 10 | 11 | #### Description 12 | 13 | Runs a Docker image that collects data from a DAP (Distributed Aggregation Protocol) leader and stores it in BigQuery. 
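Aggregates are written to `moz-fx-data-shared-prod.dap_collector_derived.aggregates_v1` (the project and table passed to the collector below). On a local `make up` stack, the Airflow variables listed below can be seeded through the webserver container (a sketch; the placeholder values are illustrative -- the real ones are secrets that come from the DAP deployment):

```
docker-compose exec airflow-webserver airflow variables set dap_auth_token '<token>'
docker-compose exec airflow-webserver airflow variables set dap_hpke_private_key '<key>'
docker-compose exec airflow-webserver airflow variables set dap_task_config_url '<url>'
```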
14 | 15 | The container is defined in 16 | [docker-etl](https://github.com/mozilla/docker-etl/tree/main/jobs/dap-collector) 17 | 18 | For more information on Privacy Preserving Measurement in Firefox see 19 | https://bugzilla.mozilla.org/show_bug.cgi?id=1775035 20 | 21 | This DAG requires following variables to be defined in Airflow: 22 | * dap_auth_token 23 | * dap_hpke_private_key 24 | * dap_task_config_url 25 | 26 | This job is under active development, occasional failures are expected. 27 | 28 | #### Owner 29 | 30 | sfriedberger@mozilla.com 31 | """ 32 | 33 | default_args = { 34 | "owner": "sfriedberger@mozilla.com", 35 | "email": ["akomarzewski@mozilla.com", "sfriedberger@mozilla.com"], 36 | "depends_on_past": False, 37 | "start_date": datetime(2023, 3, 8), 38 | "email_on_failure": True, 39 | "email_on_retry": True, 40 | "retries": 1, 41 | "retry_delay": timedelta(hours=2), 42 | } 43 | 44 | project_id = "moz-fx-data-shared-prod" 45 | table_id = "dap_collector_derived.aggregates_v1" 46 | 47 | tags = [ 48 | Tag.ImpactTier.tier_3, 49 | Tag.Triage.no_triage, 50 | ] 51 | 52 | with DAG( 53 | "dap_collector", 54 | default_args=default_args, 55 | doc_md=DOCS, 56 | schedule_interval="@daily", 57 | tags=tags, 58 | ) as dag: 59 | dap_collector = GKEPodOperator( 60 | task_id="dap_collector", 61 | arguments=[ 62 | "python", 63 | "dap_collector/main.py", 64 | "--date={{ ds }}", 65 | "--auth-token={{ var.value.dap_auth_token }}", 66 | "--hpke-private-key={{ var.value.dap_hpke_private_key }}", 67 | "--task-config-url={{ var.value.dap_task_config_url }}", 68 | "--project", 69 | project_id, 70 | "--table-id", 71 | table_id, 72 | ], 73 | image="gcr.io/moz-fx-data-airflow-prod-88e0/dap-collector_docker_etl:latest", 74 | gcp_conn_id="google_cloud_airflow_gke", 75 | ) 76 | -------------------------------------------------------------------------------- /dags/dap_collector_ppa_dev.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from airflow import DAG 4 | 5 | from operators.gcp_container_operator import GKEPodOperator 6 | from utils.tags import Tag 7 | 8 | DOCS = """ 9 | ### PPA Dev DAP Collector 10 | 11 | #### Description 12 | 13 | Runs a Docker image that collects PPA Dev Environment data from a DAP (Distributed Aggregation Protocol) leader and stores it in BigQuery. 14 | 15 | The container is defined in 16 | [docker-etl](https://github.com/mozilla/docker-etl/tree/main/jobs/dap-collector-ppa-dev) 17 | 18 | This DAG requires following variables to be defined in Airflow: 19 | * dap_ppa_dev_auth_token 20 | * dap_ppa_dev_hpke_private_key 21 | * dap_ppa_dev_task_config_url 22 | * dap_ppa_dev_ad_config_url 23 | 24 | This job is under active development, occasional failures are expected. 
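The collection date passed to the job is `{{ data_interval_end.at(0) | ts }}`: the end of the run's data interval, truncated to midnight and rendered as an ISO-8601 timestamp. As a worked example (assuming standard Airflow data-interval semantics), with the `15 0 * * *` schedule the run whose interval starts 2024-04-30 00:15 UTC ends at 2024-05-01 00:15 UTC, so the job is called with `--date=2024-05-01T00:00:00+00:00`.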
25 | 26 | #### Owner 27 | 28 | bbirdsong@mozilla.com 29 | """ 30 | 31 | default_args = { 32 | "owner": "bbirdsong@mozilla.com", 33 | "email": ["ads-eng@mozilla.com", "bbirdsong@mozilla.com"], 34 | "depends_on_past": False, 35 | "start_date": datetime(2024, 4, 30), 36 | "email_on_failure": True, 37 | "email_on_retry": False, 38 | "retries": 0, 39 | } 40 | 41 | project_id = "moz-fx-ads-nonprod" 42 | ad_table_id = "ppa_dev.measurements" 43 | report_table_id = "ppa_dev.reports" 44 | 45 | tags = [ 46 | Tag.ImpactTier.tier_3, 47 | Tag.Triage.no_triage, 48 | ] 49 | 50 | 51 | with DAG( 52 | "dap_collector_ppa_dev", 53 | default_args=default_args, 54 | doc_md=DOCS, 55 | schedule_interval="15 0 * * *", 56 | tags=tags, 57 | catchup=False, 58 | ) as dag: 59 | dap_collector = GKEPodOperator( 60 | task_id="dap_collector_ppa_dev", 61 | arguments=[ 62 | "python", 63 | "dap_collector_ppa_dev/main.py", 64 | "--date={{ data_interval_end.at(0) | ts }}", 65 | "--auth-token={{ var.value.dap_ppa_dev_auth_token }}", 66 | "--hpke-private-key={{ var.value.dap_ppa_dev_hpke_private_key }}", 67 | "--task-config-url={{ var.value.dap_ppa_dev_task_config_url }}", 68 | "--ad-config-url={{ var.value.dap_ppa_dev_ad_config_url }}", 69 | "--project", 70 | project_id, 71 | "--ad-table-id", 72 | ad_table_id, 73 | "--report-table-id", 74 | report_table_id, 75 | ], 76 | image="gcr.io/moz-fx-data-airflow-prod-88e0/dap-collector-ppa-dev_docker_etl:latest", 77 | gcp_conn_id="google_cloud_airflow_gke", 78 | ) 79 | -------------------------------------------------------------------------------- /dags/dap_collector_ppa_prod.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from airflow import DAG 4 | from airflow.providers.cncf.kubernetes.secret import Secret 5 | 6 | from operators.gcp_container_operator import GKEPodOperator 7 | from utils.tags import Tag 8 | 9 | DOCS = """ 10 | ### PPA Prod DAP Collector 11 | 12 | #### Description 13 | 14 | Runs a Docker image that collects PPA Prod Environment data from a DAP (Distributed Aggregation Protocol) leader and stores it in BigQuery. 15 | 16 | The container is defined in 17 | [docker-etl](https://github.com/mozilla/docker-etl/tree/main/jobs/dap-collector-ppa-prod) 18 | 19 | This DAG requires following variables to be defined in Airflow: 20 | * dap_ppa_prod_auth_token 21 | * dap_ppa_prod_hpke_private_key 22 | * dap_ppa_prod_task_config_url 23 | * dap_ppa_prod_ad_config_url 24 | 25 | This job is under active development, occasional failures are expected. 
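Unlike the dev collector, the prod job does not read the auth token or HPKE private key from Airflow variables: they are injected as the `AUTH_TOKEN` and `HPKE_PRIVATE_KEY` environment variables from the `airflow-gke-secrets` Kubernetes secret (keys `DAP_PPA_PROD_AUTH_TOKEN` and `DAP_PPA_PROD_HPKE_PRIVATE_KEY`), so only the task-config and ad-config URLs come from Airflow variables.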
26 | 27 | #### Owner 28 | 29 | bbirdsong@mozilla.com 30 | """ 31 | 32 | default_args = { 33 | "owner": "bbirdsong@mozilla.com", 34 | "email": ["ads-eng@mozilla.com", "bbirdsong@mozilla.com"], 35 | "depends_on_past": False, 36 | "start_date": datetime(2024, 6, 26), 37 | "email_on_failure": True, 38 | "email_on_retry": False, 39 | "retries": 0, 40 | } 41 | 42 | project_id = "moz-fx-ads-prod" 43 | ad_table_id = "ppa.measurements" 44 | report_table_id = "ppa.reports" 45 | 46 | tags = [ 47 | Tag.ImpactTier.tier_3, 48 | Tag.Triage.no_triage, 49 | ] 50 | 51 | hpke_private_key = Secret( 52 | deploy_type="env", 53 | deploy_target="HPKE_PRIVATE_KEY", 54 | secret="airflow-gke-secrets", 55 | key="DAP_PPA_PROD_HPKE_PRIVATE_KEY", 56 | ) 57 | 58 | auth_token = Secret( 59 | deploy_type="env", 60 | deploy_target="AUTH_TOKEN", 61 | secret="airflow-gke-secrets", 62 | key="DAP_PPA_PROD_AUTH_TOKEN", 63 | ) 64 | 65 | with DAG( 66 | "dap_collector_ppa_prod", 67 | default_args=default_args, 68 | doc_md=DOCS, 69 | schedule_interval="15 0 * * *", 70 | tags=tags, 71 | catchup=False, 72 | ) as dag: 73 | dap_collector = GKEPodOperator( 74 | task_id="dap_collector_ppa_prod", 75 | arguments=[ 76 | "python", 77 | "dap_collector_ppa_prod/main.py", 78 | "--date={{ data_interval_end.at(0) | ts }}", 79 | "--task-config-url={{ var.value.dap_ppa_prod_task_config_url }}", 80 | "--ad-config-url={{ var.value.dap_ppa_prod_ad_config_url }}", 81 | "--project", 82 | project_id, 83 | "--ad-table-id", 84 | ad_table_id, 85 | "--report-table-id", 86 | report_table_id, 87 | ], 88 | image="gcr.io/moz-fx-data-airflow-prod-88e0/dap-collector-ppa-prod_docker_etl:latest", 89 | gcp_conn_id="google_cloud_airflow_gke", 90 | secrets=[ 91 | hpke_private_key, 92 | auth_token, 93 | ], 94 | ) 95 | -------------------------------------------------------------------------------- /dags/dbt_daily.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | from airflow.providers.dbt.cloud.operators.dbt import DbtCloudRunJobOperator 6 | from airflow.sensors.external_task import ExternalTaskSensor 7 | 8 | from utils.constants import ALLOWED_STATES, FAILED_STATES 9 | from utils.tags import Tag 10 | 11 | DOCS = """\ 12 | # DBT Daily 13 | 14 | This triggers jobs configured in dbt Cloud to run daily scheduled models that depend 15 | on other Airflow jobs. 16 | 17 | *Triage notes* 18 | 19 | DBT accounts are limited at the moment, so it might not be possible to get more visibility 20 | into failing jobs at the moment. 
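If the trigger task fails early, one hypothetical sanity check is to confirm that the `dbt_cloud` connection and the `dbt_account_id` variable referenced by this DAG exist in the Airflow environment:

```python
from airflow.hooks.base import BaseHook
from airflow.models import Variable

# Both lookups raise if the referenced connection/variable is missing.
conn = BaseHook.get_connection("dbt_cloud")
account_id = Variable.get("dbt_account_id")
print(conn.conn_type, account_id)
```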
21 | """ 22 | 23 | default_args = { 24 | "owner": "ascholtz@mozilla.com", 25 | "depends_on_past": False, 26 | "start_date": datetime(2024, 7, 31), 27 | "email_on_failure": True, 28 | "email_on_retry": True, 29 | "retries": 2, 30 | "retry_delay": timedelta(minutes=30), 31 | "dbt_cloud_conn_id": "dbt_cloud", 32 | "account_id": "{{ var.value.dbt_account_id }}" 33 | } 34 | 35 | tags = [ 36 | Tag.ImpactTier.tier_3, 37 | Tag.Triage.no_triage, 38 | ] 39 | 40 | 41 | with DAG( 42 | "dbt_daily", 43 | doc_md=DOCS, 44 | max_active_runs=1, 45 | default_args=default_args, 46 | schedule_interval="0 4 * * 0", 47 | tags=tags, 48 | ) as dag: 49 | wait_for_copy_deduplicate = ExternalTaskSensor( 50 | task_id="wait_for_copy_deduplicate", 51 | external_dag_id="copy_deduplicate", 52 | external_task_id="copy_deduplicate_all", 53 | execution_delta=timedelta(hours=3), 54 | mode="reschedule", 55 | allowed_states=ALLOWED_STATES, 56 | failed_states=FAILED_STATES, 57 | pool="DATA_ENG_EXTERNALTASKSENSOR", 58 | email_on_retry=False, 59 | dag=dag, 60 | ) 61 | 62 | # runs dbt jobs tagged with "refresh_daily" and "scheduled_in_airflow" 63 | trigger_dbt_daily_cloud_run_job = DbtCloudRunJobOperator( 64 | task_id="trigger_dbt_daily_cloud_run_job", 65 | job_id=684764, 66 | check_interval=10, 67 | timeout=300, 68 | ) 69 | 70 | wait_for_copy_deduplicate >> trigger_dbt_daily_cloud_run_job 71 | -------------------------------------------------------------------------------- /dags/eam_slack_channels.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from airflow import DAG 4 | from airflow.providers.cncf.kubernetes.secret import Secret 5 | 6 | from operators.gcp_container_operator import GKEPodOperator 7 | from utils.tags import Tag 8 | 9 | DOCS = """ 10 | ### Slack Channels integration 11 | Runs a script in docker image that 12 | - will archive unused channels 13 | - delete old archived channels 14 | 15 | [docker-etl](https://github.com/mozilla/docker-etl/tree/main/jobs/eam-integrations) 16 | 17 | This DAG requires the creation of an Airflow Jira connection. 
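A minimal sketch of what that connection could look like; the host, credentials, and connection type below are placeholders, but the DAG code expects the connection id `eam_jira_connection_id`:

```python
from airflow.models import Connection
from airflow.settings import Session

# Registers the Jira connection in the Airflow metadata database.
session = Session()
session.add(
    Connection(
        conn_id="eam_jira_connection_id",
        conn_type="jira",
        host="https://example.atlassian.net",
        login="service-account@example.com",
        password="api-token-placeholder",
    )
)
session.commit()
```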
18 | 19 | #### Owner 20 | jmoscon@mozilla.com 21 | 22 | """ 23 | 24 | 25 | def get_airflow_log_link(context): 26 | import urllib.parse 27 | 28 | dag_run_id = context["dag_run"].run_id 29 | task_id = context["task_instance"].task_id 30 | base_url = "http://workflow.telemetry.mozilla.org/dags/" 31 | base_url += "eam-slack-channels-integration/grid?tab=logs&dag_run_id=" 32 | return base_url + f"{urllib.parse.quote(dag_run_id)}&task_id={task_id}" 33 | 34 | 35 | def create_jira_ticket(context): 36 | import json 37 | import logging 38 | 39 | import requests 40 | from airflow.providers.atlassian.jira.hooks.jira import JiraHook 41 | from requests.auth import HTTPBasicAuth 42 | 43 | logger = logging.getLogger(__name__) 44 | logger.info("Creating Jira ticket ...") 45 | 46 | conn_id = "eam_jira_connection_id" 47 | conn = JiraHook( 48 | jira_conn_id=conn_id, 49 | ).get_connection(conn_id) 50 | log_url = get_airflow_log_link(context) 51 | 52 | jira_domain = "mozilla-hub-sandbox-721.atlassian.net" 53 | url = f"https://{jira_domain}/rest/api/3/issue" 54 | headers = {"Accept": "application/json", "Content-Type": "application/json"} 55 | auth = HTTPBasicAuth(conn.login, conn.password) 56 | summary = "Slack Channels Integration - Airflow Task Issue Exception" 57 | paragraph_text = "Detailed error logging can be found in the link: " 58 | project_key = "ASP" 59 | issue_type_id = "10020" # Issue Type = Bug 60 | assignee_id = "712020:b999000a-67b1-45ff-8b40-42a5ceeee75b" # Julio 61 | payload = json.dumps( 62 | { 63 | "fields": { 64 | "assignee": {"id": assignee_id}, 65 | "project": {"key": project_key}, 66 | "summary": summary, 67 | "description": { 68 | "type": "doc", 69 | "version": 1, 70 | "content": [ 71 | { 72 | "type": "paragraph", 73 | "content": [ 74 | { 75 | "type": "text", 76 | "text": paragraph_text, 77 | }, 78 | { 79 | "type": "text", 80 | "text": "Mozilla-Telemetry log.", 81 | "marks": [ 82 | { 83 | "type": "link", 84 | "attrs": {"href": f"{log_url}"}, 85 | } 86 | ], 87 | }, 88 | ], 89 | } 90 | ], 91 | }, 92 | "issuetype": {"id": issue_type_id}, 93 | } 94 | } 95 | ) 96 | 97 | response = requests.post(url, headers=headers, auth=auth, data=payload) 98 | logger.info(f"response.text={response.text}") 99 | if response.status_code == 201: 100 | logger.info("Issue created successfully.") 101 | return response.json() 102 | else: 103 | logger.info( 104 | f"Failed to create issue. 
Status code:" 105 | f"{response.status_code}, Response: {response.text}" 106 | ) 107 | return None 108 | 109 | 110 | default_args = { 111 | "owner": "jmoscon@mozilla.com", 112 | "emails": ["jmoscon@mozilla.com"], 113 | "start_date": datetime.datetime(2024, 1, 1), 114 | "retries": 3, 115 | # wait 5 min before retry 116 | "retry_delay": datetime.timedelta(minutes=5), 117 | "on_failure_callback": create_jira_ticket, 118 | } 119 | tags = [Tag.ImpactTier.tier_3] 120 | 121 | 122 | SLACK_CHANNEL_TOKEN = Secret( 123 | deploy_type="env", 124 | deploy_target="SLACK_CHANNEL_TOKEN", 125 | secret="airflow-gke-secrets", 126 | key="SLACK_CHANNEL_TOKEN", 127 | ) 128 | 129 | with DAG( 130 | "eam-slack-channels-integration", 131 | default_args=default_args, 132 | doc_md=DOCS, 133 | tags=tags, 134 | # 10 PM standard time (PST, UTC-8) every day 135 | schedule_interval="0 6 * * *", 136 | ) as dag: 137 | slack_channels_dag = GKEPodOperator( 138 | task_id="eam_slack_channels", 139 | arguments=[ 140 | "python", 141 | "scripts/slack_channels_integration.py", 142 | "--level", 143 | "info", 144 | ], 145 | image="gcr.io/moz-fx-data-airflow-prod-88e0/" 146 | + "eam-integrations_docker_etl:latest", 147 | gcp_conn_id="google_cloud_airflow_gke", 148 | secrets=[ 149 | SLACK_CHANNEL_TOKEN, 150 | ], 151 | ) 152 | -------------------------------------------------------------------------------- /dags/experiment_auto_sizing.py: -------------------------------------------------------------------------------- 1 | """ 2 | Powers the [auto-sizing](https://github.com/mozilla/auto-sizing) tool 3 | for computing experiment sizing information for various configurations. 4 | 5 | *Triage notes* 6 | TBD 7 | """ # noqa: D205 8 | 9 | from datetime import datetime, timedelta 10 | 11 | from airflow import DAG 12 | from airflow.sensors.external_task import ExternalTaskSensor 13 | 14 | from operators.gcp_container_operator import GKEPodOperator 15 | from utils.constants import ALLOWED_STATES, FAILED_STATES 16 | from utils.tags import Tag 17 | 18 | default_args = { 19 | "owner": "mwilliams@mozilla.com", 20 | "email": ["mwilliams@mozilla.com", "ascholtz@mozilla.com", "mbowerman@mozilla.com"], 21 | "depends_on_past": False, 22 | "start_date": datetime(2023, 4, 15), 23 | "email_on_failure": True, 24 | "email_on_retry": True, 25 | "retries": 2, 26 | "retry_delay": timedelta(minutes=30), 27 | } 28 | 29 | tags = [Tag.ImpactTier.tier_1] 30 | 31 | with DAG( 32 | "experiment_auto_sizing", 33 | default_args=default_args, 34 | schedule_interval="0 6 * * 0", # 6am every Sunday, after Jetstream 35 | doc_md=__doc__, 36 | tags=tags, 37 | ) as dag: 38 | # Built from repo https://github.com/mozilla/auto-sizing 39 | auto_sizing_image = "gcr.io/moz-fx-data-experiments/auto_sizing:latest" 40 | 41 | auto_sizing_run = GKEPodOperator( 42 | task_id="auto_sizing_run", 43 | name="auto_sizing_run", 44 | image=auto_sizing_image, 45 | email=default_args["email"], 46 | arguments=[ 47 | "--log-to-bigquery", 48 | "run-argo", 49 | "--bucket=mozanalysis", 50 | "--dataset-id=auto_sizing", 51 | # the Airflow cluster doesn't have Compute Engine API access so pass in IP 52 | # and certificate in order for the pod to connect to the Kubernetes cluster 53 | # running Jetstream/auto-sizing 54 | "--cluster-ip={{ var.value.jetstream_cluster_ip }}", 55 | "--cluster-cert={{ var.value.jetstream_cluster_cert }}", 56 | ], 57 | dag=dag, 58 | ) 59 | 60 | wait_for_jetstream = ExternalTaskSensor( 61 | task_id="wait_for_jetstream", 62 | external_dag_id="jetstream", 63 | 
external_task_id="jetstream_run_config_changed", 64 | execution_delta=timedelta(hours=2), 65 | mode="reschedule", 66 | allowed_states=ALLOWED_STATES, 67 | failed_states=FAILED_STATES, 68 | pool="DATA_ENG_EXTERNALTASKSENSOR", 69 | email_on_retry=False, 70 | dag=dag, 71 | ) 72 | 73 | auto_sizing_run.set_upstream(wait_for_jetstream) 74 | -------------------------------------------------------------------------------- /dags/experiments_live.py: -------------------------------------------------------------------------------- 1 | """ 2 | See [experiments-monitoring-data-export in the docker-etl repository](https://github.com/mozilla/docker-etl/tree/main/jobs/experiments-monitoring-data-export). 3 | 4 | This DAG exports views related to experiment monitoring to GCS as JSON 5 | every 5 minutes to power the Experimenter console. 6 | """ 7 | 8 | from datetime import datetime 9 | 10 | from airflow import DAG 11 | 12 | from operators.gcp_container_operator import GKEPodOperator 13 | from utils.tags import Tag 14 | 15 | default_args = { 16 | "owner": "ascholtz@mozilla.com", 17 | "depends_on_past": False, 18 | "start_date": datetime(2021, 1, 8), 19 | "email_on_failure": True, 20 | "email_on_retry": True, 21 | } 22 | 23 | tags = [Tag.ImpactTier.tier_2] 24 | 25 | # We rely on max_active_runs=1 at the DAG level to manage the dependency on past runs. 26 | with DAG( 27 | "experiments_live", 28 | default_args=default_args, 29 | max_active_tasks=4, 30 | max_active_runs=1, 31 | schedule_interval="*/5 * * * *", 32 | doc_md=__doc__, 33 | tags=tags, 34 | ) as dag: 35 | # list of datasets to export data to GCS 36 | experiment_datasets = [ 37 | "moz-fx-data-shared-prod.telemetry_derived.experiment_enrollment_other_events_overall_v1", 38 | "moz-fx-data-shared-prod.telemetry_derived.experiment_enrollment_cumulative_population_estimate_v1", 39 | "moz-fx-data-shared-prod.telemetry_derived.experiment_enrollment_overall_v1", 40 | "moz-fx-data-shared-prod.telemetry_derived.experiment_unenrollment_overall_v1", 41 | "moz-fx-data-shared-prod.telemetry_derived.experiment_cumulative_ad_clicks_v1", 42 | "moz-fx-data-shared-prod.telemetry_derived.experiment_cumulative_search_count_v1", 43 | "moz-fx-data-shared-prod.telemetry_derived.experiment_cumulative_search_with_ads_count_v1", 44 | "moz-fx-data-shared-prod.telemetry.experiment_enrollment_daily_active_population", 45 | ] 46 | 47 | experiment_enrollment_export = GKEPodOperator( 48 | task_id="experiment_enrollment_export", 49 | arguments=[ 50 | "python", 51 | "experiments_monitoring_data_export/export.py", 52 | "--datasets", 53 | *experiment_datasets, 54 | ], 55 | image="gcr.io/moz-fx-data-airflow-prod-88e0/experiments-monitoring-data-export_docker_etl:latest", 56 | dag=dag, 57 | ) 58 | -------------------------------------------------------------------------------- /dags/extensions.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from airflow import DAG 4 | 5 | from operators.gcp_container_operator import GKEPodOperator 6 | 7 | docs = """ 8 | ### extensions 9 | 10 | Loads the table moz-fx-data-shared-prod.external_derived.chrome_extensions_v1 11 | 12 | Note - if it fails, please alert the DAG owner, but do not re-run. 
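One way to verify a run without re-running it is a hypothetical spot check of the destination table; the `submission_date` column name here is an assumption for illustration only:

```python
import datetime

from google.cloud import bigquery

client = bigquery.Client(project="moz-fx-data-shared-prod")
query = (
    "SELECT COUNT(*) AS n "
    "FROM `moz-fx-data-shared-prod.external_derived.chrome_extensions_v1` "
    "WHERE submission_date = @run_date"  # assumed partition column
)
job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ScalarQueryParameter("run_date", "DATE", datetime.date(2025, 4, 13))
    ]
)
rows = client.query(query, job_config=job_config).result()
print(next(iter(rows))["n"])
```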
13 | 14 | Owner: kwindau@mozilla.com 15 | """ 16 | 17 | default_args = { 18 | "owner": "kwindau@mozilla.com", 19 | "start_date": datetime.datetime(2025, 4, 13, 0, 0), 20 | "end_date": None, 21 | "email": ["kwindau@mozilla.com"], 22 | "depends_on_past": False, 23 | "retry_delay": datetime.timedelta(seconds=1800), 24 | "email_on_failure": True, 25 | "email_on_retry": True, 26 | "retries": 2, 27 | } 28 | 29 | tags = ["impact/tier_3", "repo/telemetry-airflow"] 30 | SERVER = "moz-fx-data-airflow-prod-88e0" 31 | IMAGE_NAME = "extensions_docker_etl:latest" 32 | 33 | with DAG( 34 | "extensions", 35 | default_args=default_args, 36 | schedule_interval="0 15 * * *", 37 | doc_md=docs, 38 | tags=tags, 39 | ) as dag: 40 | pull_extensions = GKEPodOperator( 41 | task_id="pull_extensions", 42 | arguments=[ 43 | "python", 44 | "extensions/main.py", 45 | "--date", 46 | "{{ ds }}", 47 | ], 48 | image=f"gcr.io/{SERVER}/{IMAGE_NAME}", 49 | gcp_conn_id="google_cloud_airflow_gke", 50 | ) 51 | -------------------------------------------------------------------------------- /dags/firefox_public_data_report.py: -------------------------------------------------------------------------------- 1 | """ 2 | Powers the public https://data.firefox.com/ dashboard. 3 | 4 | Source code is in the [firefox-public-data-report-etl repository] 5 | (https://github.com/mozilla/firefox-public-data-report-etl). 6 | """ 7 | 8 | from datetime import datetime, timedelta 9 | 10 | from airflow import DAG 11 | from airflow.sensors.external_task import ExternalTaskSensor 12 | 13 | from operators.gcp_container_operator import GKEPodOperator 14 | from utils.constants import ALLOWED_STATES, FAILED_STATES 15 | from utils.gcp import bigquery_etl_query 16 | from utils.tags import Tag 17 | 18 | default_args = { 19 | "owner": "bewu@mozilla.com", 20 | "depends_on_past": False, 21 | "start_date": datetime(2020, 4, 6), 22 | "email": [ 23 | "telemetry-alerts@mozilla.com", 24 | "firefox-hardware-report-feedback@mozilla.com", 25 | "akomar@mozilla.com", 26 | "shong@mozilla.com", 27 | "bewu@mozilla.com", 28 | ], 29 | "email_on_failure": True, 30 | "email_on_retry": True, 31 | "retries": 2, 32 | "retry_delay": timedelta(minutes=10), 33 | } 34 | 35 | tags = [Tag.ImpactTier.tier_3] 36 | 37 | dag = DAG( 38 | "firefox_public_data_report", 39 | default_args=default_args, 40 | schedule_interval="0 1 * * MON", 41 | doc_md=__doc__, 42 | tags=tags, 43 | ) 44 | 45 | # hardware_report's execution date will be {now}-7days. 
It will read last week's main pings, 46 | # therefore we need to wait for yesterday's Main Ping deduplication task to finish 47 | wait_for_main_ping = ExternalTaskSensor( 48 | task_id="wait_for_main_ping", 49 | external_dag_id="copy_deduplicate", 50 | external_task_id="copy_deduplicate_main_ping", 51 | execution_delta=timedelta(days=-6), 52 | check_existence=True, 53 | mode="reschedule", 54 | allowed_states=ALLOWED_STATES, 55 | failed_states=FAILED_STATES, 56 | pool="DATA_ENG_EXTERNALTASKSENSOR", 57 | email_on_retry=False, 58 | dag=dag, 59 | ) 60 | 61 | hardware_report_query = bigquery_etl_query( 62 | task_id="hardware_report_query", 63 | destination_table="public_data_report_hardware_aggregates_v1", 64 | project_id="moz-fx-data-shared-prod", 65 | dataset_id="telemetry_derived", 66 | dag=dag, 67 | ) 68 | 69 | hardware_report_export = GKEPodOperator( 70 | task_id="hardware_report_export", 71 | name="hardware_report_export", 72 | image="gcr.io/moz-fx-data-airflow-prod-88e0/firefox-public-data-report-etl:latest", 73 | arguments=[ 74 | "-m", 75 | "public_data_report.cli", 76 | "hardware_report", 77 | "--date_from", 78 | "{{ ds }}", 79 | "--input_bq_table", 80 | "moz-fx-data-shared-prod.telemetry_derived.public_data_report_hardware_aggregates_v1", 81 | "--output_bq_table", 82 | "moz-fx-data-shared-prod.telemetry_derived.public_data_report_hardware_v1", 83 | "--gcs_bucket", 84 | "moz-fx-data-static-websit-8565-analysis-output", 85 | "--gcs_path", 86 | "public-data-report/hardware/", 87 | ], 88 | image_pull_policy="Always", 89 | dag=dag, 90 | ) 91 | 92 | wait_for_clients_last_seen = ExternalTaskSensor( 93 | task_id="wait_for_clients_last_seen", 94 | external_dag_id="bqetl_main_summary", 95 | external_task_id="telemetry_derived__clients_last_seen__v1", 96 | execution_delta=timedelta(days=-6, hours=-1), 97 | check_existence=True, 98 | mode="reschedule", 99 | allowed_states=ALLOWED_STATES, 100 | failed_states=FAILED_STATES, 101 | pool="DATA_ENG_EXTERNALTASKSENSOR", 102 | email_on_retry=False, 103 | dag=dag, 104 | ) 105 | 106 | user_activity = bigquery_etl_query( 107 | task_id="user_activity", 108 | destination_table="public_data_report_user_activity_v1", 109 | project_id="moz-fx-data-shared-prod", 110 | dataset_id="telemetry_derived", 111 | dag=dag, 112 | ) 113 | 114 | user_activity_usage_behavior_export = GKEPodOperator( 115 | task_id="user_activity_export", 116 | name="user_activity_export", 117 | image="gcr.io/moz-fx-data-airflow-prod-88e0/firefox-public-data-report-etl:latest", 118 | arguments=[ 119 | "-m", 120 | "public_data_report.cli", 121 | "user_activity", 122 | "--bq_table", 123 | "moz-fx-data-shared-prod.telemetry_derived.public_data_report_user_activity_v1", 124 | "--gcs_bucket", 125 | "moz-fx-data-static-websit-8565-analysis-output", 126 | "--gcs_path", 127 | "public-data-report/user_activity", 128 | ], 129 | image_pull_policy="Always", 130 | dag=dag, 131 | ) 132 | 133 | annotations_export = GKEPodOperator( 134 | task_id="annotations_export", 135 | name="annotations_export", 136 | image="gcr.io/moz-fx-data-airflow-prod-88e0/firefox-public-data-report-etl:latest", 137 | arguments=[ 138 | "-m", 139 | "public_data_report.cli", 140 | "annotations", 141 | "--date_to", 142 | "{{ ds }}", 143 | "--output_bucket", 144 | "moz-fx-data-static-websit-8565-analysis-output", 145 | "--output_prefix", 146 | "public-data-report/annotations", 147 | ], 148 | image_pull_policy="Always", 149 | dag=dag, 150 | ) 151 | 152 | ensemble_transposer = GKEPodOperator( 153 | task_id="ensemble_transposer", 154 | 
name="ensemble_transposer", 155 | image="gcr.io/moz-fx-data-airflow-prod-88e0/ensemble-transposer:latest", 156 | env_vars={ 157 | "GCS_BUCKET_NAME": "moz-fx-data-static-websit-8565-ensemble", 158 | }, 159 | image_pull_policy="Always", 160 | dag=dag, 161 | ) 162 | 163 | 164 | ( 165 | wait_for_main_ping 166 | >> hardware_report_query 167 | >> hardware_report_export 168 | >> ensemble_transposer 169 | ) 170 | ( 171 | wait_for_clients_last_seen 172 | >> user_activity 173 | >> user_activity_usage_behavior_export 174 | >> ensemble_transposer 175 | ) 176 | annotations_export >> ensemble_transposer 177 | -------------------------------------------------------------------------------- /dags/fxci_metric_export.py: -------------------------------------------------------------------------------- 1 | """ 2 | Exports Firefox-CI worker data from the Google Cloud Monitoring to BigQuery. 3 | 4 | The container is defined in [fxci-etl](https://github.com/mozilla-releng/fxci-etl). 5 | """ 6 | 7 | from datetime import datetime, timedelta 8 | 9 | from airflow import DAG 10 | from airflow.providers.cncf.kubernetes.secret import Secret 11 | 12 | from operators.gcp_container_operator import GKEPodOperator 13 | from utils.tags import Tag 14 | 15 | default_args = { 16 | "owner": "ahalberstadt@mozilla.com", 17 | "depends_on_past": False, 18 | "start_date": datetime(2024, 7, 8), 19 | "email_on_failure": True, 20 | "email_on_retry": False, 21 | "retries": 1, 22 | "retry_delay": timedelta(minutes=30), 23 | } 24 | 25 | tags = [Tag.ImpactTier.tier_3] 26 | 27 | env_vars = { 28 | "FXCI_ETL_BIGQUERY_PROJECT": "moz-fx-data-shared-prod", 29 | "FXCI_ETL_BIGQUERY_DATASET": "fxci_derived", 30 | "FXCI_ETL_STORAGE_PROJECT": "moz-fx-dev-releng", 31 | "FXCI_ETL_STORAGE_BUCKET": "fxci-etl", 32 | } 33 | 34 | secrets = [ 35 | Secret( 36 | deploy_type="env", 37 | deploy_target="FXCI_ETL_STORAGE_CREDENTIALS", 38 | secret="airflow-gke-secrets", 39 | key="fxci_etl_secret__gcp-credentials", 40 | ), 41 | Secret( 42 | deploy_type="env", 43 | deploy_target="FXCI_ETL_MONITORING_CREDENTIALS", 44 | secret="airflow-gke-secrets", 45 | key="fxci_etl_secret__gcp-credentials", 46 | ), 47 | ] 48 | 49 | with DAG( 50 | "fxci_metric_export", 51 | default_args=default_args, 52 | doc_md=__doc__, 53 | schedule_interval="30 0 * * *", 54 | tags=tags, 55 | ) as dag: 56 | fxci_metric_export = GKEPodOperator( 57 | task_id="fxci_metric_export", 58 | arguments=[ 59 | "fxci-etl", 60 | "metric", 61 | "export", 62 | "-vv", 63 | "--date={{ ds }}", 64 | ], 65 | env_vars=env_vars, 66 | secrets=secrets, 67 | image="gcr.io/moz-fx-data-airflow-prod-88e0/fxci-taskcluster-export_docker_etl:latest", 68 | gcp_conn_id="google_cloud_airflow_gke", 69 | dag=dag, 70 | email=[ 71 | "ahalberstadt@mozilla.com", 72 | ], 73 | ) 74 | -------------------------------------------------------------------------------- /dags/fxci_pulse_export.py: -------------------------------------------------------------------------------- 1 | """ 2 | Exports Firefox-CI task and run data from Taskcluster to BigQuery. 3 | 4 | This connects to and drains three separate Taskcluster pulse queues, and 5 | exports each message into BigQuery. 6 | 7 | The container is defined in [fxci-etl](https://github.com/mozilla-releng/fxci-etl). 
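The DAG below drains the queues on the "30 */4 * * *" schedule, i.e. every four hours at minute 30 (UTC). A small illustration of that cadence; croniter ships with Airflow's dependencies and the start date is arbitrary:

```python
from datetime import datetime

from croniter import croniter

it = croniter("30 */4 * * *", datetime(2024, 7, 8))
for _ in range(3):
    print(it.get_next(datetime))
# 2024-07-08 00:30:00
# 2024-07-08 04:30:00
# 2024-07-08 08:30:00
```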
8 | """ 9 | 10 | from datetime import datetime, timedelta 11 | 12 | from airflow import DAG 13 | from airflow.providers.cncf.kubernetes.secret import Secret 14 | 15 | from operators.gcp_container_operator import GKEPodOperator 16 | from utils.tags import Tag 17 | 18 | default_args = { 19 | "owner": "ahalberstadt@mozilla.com", 20 | "depends_on_past": False, 21 | "start_date": datetime(2024, 7, 8), 22 | "email_on_failure": True, 23 | "email_on_retry": False, 24 | "retries": 1, 25 | "retry_delay": timedelta(minutes=30), 26 | } 27 | 28 | tags = [Tag.ImpactTier.tier_3] 29 | 30 | env_vars = { 31 | "FXCI_ETL_BIGQUERY_PROJECT": "moz-fx-data-shared-prod", 32 | "FXCI_ETL_BIGQUERY_DATASET": "fxci_derived", 33 | "FXCI_ETL_STORAGE_PROJECT": "moz-fx-dev-releng", 34 | "FXCI_ETL_STORAGE_BUCKET": "fxci-etl", 35 | "FXCI_ETL_PULSE_USER": "fxci-etl", 36 | } 37 | 38 | secrets = [ 39 | Secret( 40 | deploy_type="env", 41 | deploy_target="FXCI_ETL_STORAGE_CREDENTIALS", 42 | secret="airflow-gke-secrets", 43 | key="fxci_etl_secret__gcp-credentials", 44 | ), 45 | Secret( 46 | deploy_type="env", 47 | deploy_target="FXCI_ETL_PULSE_PASSWORD", 48 | secret="airflow-gke-secrets", 49 | key="fxci_etl_secret__pulse-password", 50 | ), 51 | ] 52 | 53 | with DAG( 54 | "fxci_pulse_export", 55 | default_args=default_args, 56 | doc_md=__doc__, 57 | schedule_interval="30 */4 * * *", 58 | tags=tags, 59 | ) as dag: 60 | fxci_pulse_export = GKEPodOperator( 61 | task_id="fxci_pulse_export", 62 | arguments=[ 63 | "fxci-etl", 64 | "pulse", 65 | "drain", 66 | "-vv", 67 | ], 68 | env_vars=env_vars, 69 | secrets=secrets, 70 | image="gcr.io/moz-fx-data-airflow-prod-88e0/fxci-taskcluster-export_docker_etl:latest", 71 | gcp_conn_id="google_cloud_airflow_gke", 72 | dag=dag, 73 | email=[ 74 | "ahalberstadt@mozilla.com", 75 | ], 76 | ) 77 | -------------------------------------------------------------------------------- /dags/ga4_site_metrics_summary_backfill.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from airflow import DAG 4 | from airflow.sensors.external_task import ExternalTaskMarker 5 | 6 | from utils.gcp import bigquery_dq_check, bigquery_etl_query 7 | 8 | docs = """ 9 | ### ga4_site_metrics_summary_backfill 10 | 11 | Backfills the past three days of data for moz-fx-data-shared-prod.mozilla_org_derived.www_site_metrics_summary_v2 since late data can arrive for a few days 12 | 13 | Built from bigquery-etl repo, [`dags/bqetl_google_analytics_derived_ga4.py`](https://github.com/mozilla/bigquery-etl/blob/generated-sql/dags/bqetl_google_analytics_derived_ga4.py). 14 | 15 | This file is meant to look very similar to generated DAGs in bigquery-etl. 
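The backfill below builds one task per day offset by templating `macros.ds_add` and `macros.ds_format`. A rough illustration of how those resolve for an example logical date:

```python
from airflow.macros import ds_add, ds_format

ds = "2024-01-10"  # example logical date
for offset in (-3, -2, -1):
    day = ds_add(ds, offset)
    print(day, ds_format(day, "%Y-%m-%d", "%Y%m%d"))
# 2024-01-07 20240107
# 2024-01-08 20240108
# 2024-01-09 20240109
```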
16 | 17 | Owner: kwindau@mozilla.com 18 | """ 19 | 20 | default_args = { 21 | "owner": "kwindau@mozilla.com", 22 | "start_date": datetime.datetime(2024, 1, 4, 0, 0), 23 | "end_date": None, 24 | "email": ["kwindau@mozilla.com", "telemetry-alerts@mozilla.com"], 25 | "depends_on_past": False, 26 | "retry_delay": datetime.timedelta(seconds=1800), 27 | "email_on_failure": True, 28 | "email_on_retry": True, 29 | "retries": 2, 30 | } 31 | 32 | tags = ["impact/tier_2", "repo/bigquery-etl"] 33 | 34 | with DAG( 35 | "ga4_site_metrics_summary_backfill", 36 | default_args=default_args, 37 | schedule_interval="0 1 * * *", 38 | doc_md=docs, 39 | tags=tags, 40 | ) as dag: 41 | for day_offset in ["-3", "-2", "-1"]: 42 | task_id = "mozilla_org_derived__www_site_metrics_summary__v2__backfill_" + day_offset 43 | date_str = "macros.ds_add(ds, " + day_offset + ")" 44 | date_str_no_dash = "macros.ds_format(" + date_str + ", '%Y-%m-%d', '%Y%m%d')" 45 | 46 | ga4_www_site_metrics_summary_v2_checks = bigquery_dq_check( 47 | task_id="checks__fail_" + task_id, 48 | source_table="www_site_metrics_summary_v2", 49 | dataset_id="mozilla_org_derived", 50 | project_id="moz-fx-data-shared-prod", 51 | is_dq_check_fail=True, 52 | owner="kwindau@mozilla.com", 53 | email=["kwindau@mozilla.com", "telemetry-alerts@mozilla.com"], 54 | depends_on_past=False, 55 | parameters=["submission_date:DATE:{{ " + date_str + " }}"], 56 | retries=0, 57 | ) 58 | 59 | ga4_www_site_metrics_summary_v2 = bigquery_etl_query( 60 | task_id=task_id, 61 | destination_table="www_site_metrics_summary_v2${{ " 62 | + date_str_no_dash 63 | + " }}", 64 | dataset_id="mozilla_org_derived", 65 | project_id="moz-fx-data-shared-prod", 66 | owner="kwindau@mozilla.com", 67 | email=["kwindau@mozilla.com", "telemetry-alerts@mozilla.com"], 68 | date_partition_parameter=None, 69 | parameters=["submission_date:DATE:{{ " + date_str + " }}"], 70 | depends_on_past=False, 71 | ) 72 | 73 | todays_ga4_www_site_metrics_summary_v2 = ExternalTaskMarker( 74 | task_id="rerun__mozilla_org_derived__www_site_metrics_summary__v2__" + day_offset, 75 | external_dag_id="bqetl_google_analytics_derived_ga4", 76 | external_task_id="wait_for_" + task_id, 77 | execution_date="{{ (execution_date - macros.timedelta(days=-1, seconds=82800)).isoformat() }}", 78 | ) 79 | 80 | ( 81 | ga4_www_site_metrics_summary_v2 82 | >> ga4_www_site_metrics_summary_v2_checks 83 | >> todays_ga4_www_site_metrics_summary_v2 84 | ) 85 | -------------------------------------------------------------------------------- /dags/glam_fenix_release.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from functools import partial, reduce 3 | 4 | from airflow import DAG 5 | from airflow.operators.empty import EmptyOperator 6 | from airflow.sensors.external_task import ExternalTaskSensor 7 | from airflow.utils.task_group import TaskGroup 8 | 9 | from utils.constants import ALLOWED_STATES, FAILED_STATES 10 | from utils.glam_subdags.generate_query import ( 11 | generate_and_run_glean_task, 12 | ) 13 | from utils.tags import Tag 14 | 15 | default_args = { 16 | "owner": "efilho@mozilla.com", 17 | "depends_on_past": False, 18 | "start_date": datetime(2025, 1, 22), 19 | "email": [ 20 | "telemetry-alerts@mozilla.com", 21 | "akomarzewski@mozilla.com", 22 | "efilho@mozilla.com", 23 | ], 24 | "email_on_failure": True, 25 | "email_on_retry": True, 26 | "retries": 2, 27 | "retry_delay": timedelta(minutes=30), 28 | } 29 | 30 | PROJECT = "moz-fx-glam-prod" 31 | 
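# All aggregation tasks created below write to this project; it is passed to
# generate_and_run_glean_task as destination_project_id via functools.partial.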
32 | tags = [Tag.ImpactTier.tier_1] 33 | 34 | with DAG( 35 | "glam_fenix_release", 36 | default_args=default_args, 37 | max_active_runs=1, 38 | schedule_interval="0 10 * * 6", # 10am on Saturday 39 | doc_md=__doc__, 40 | tags=tags, 41 | ) as dag: 42 | wait_for_glam_fenix = ExternalTaskSensor( 43 | task_id="wait_for_daily_fenix_release", 44 | external_dag_id="glam_fenix", 45 | external_task_id="org_mozilla_fenix_glam_release_done", 46 | execution_delta=timedelta(days=-5, hours=-16), 47 | check_existence=True, 48 | mode="reschedule", 49 | allowed_states=ALLOWED_STATES, 50 | failed_states=FAILED_STATES, 51 | pool="DATA_ENG_EXTERNALTASKSENSOR", 52 | email_on_retry=False, 53 | ) 54 | 55 | for product in ["org_mozilla_fenix_glam_release"]: 56 | func = partial( 57 | generate_and_run_glean_task, 58 | product=product, 59 | destination_project_id=PROJECT, 60 | env_vars={"STAGE": "incremental"}, 61 | ) 62 | view, init, query = ( 63 | partial(func, task_type=task_type) 64 | for task_type in ["view", "init", "query"] 65 | ) 66 | 67 | # stage 2 - downstream for export 68 | scalar_bucket_counts = query(task_name=f"{product}__scalar_bucket_counts_v1") 69 | scalar_probe_counts = query(task_name=f"{product}__scalar_probe_counts_v1") 70 | 71 | with TaskGroup( 72 | group_id=f"{product}__histogram_bucket_counts_v1", dag=dag, default_args=default_args 73 | ) as histogram_bucket_counts: 74 | prev_task = None 75 | for sample_range in ([0, 19], [20, 39], [40, 59], [60, 79], [80, 99]): 76 | histogram_bucket_counts_sampled = query( 77 | task_name=f"{product}__histogram_bucket_counts_v1_sampled_{sample_range[0]}_{sample_range[1]}", 78 | min_sample_id=sample_range[0], 79 | max_sample_id=sample_range[1], 80 | replace_table=(sample_range[0] == 0) 81 | ) 82 | if prev_task: 83 | histogram_bucket_counts_sampled.set_upstream(prev_task) 84 | prev_task = histogram_bucket_counts_sampled 85 | 86 | histogram_probe_counts = query( 87 | task_name=f"{product}__histogram_probe_counts_v1" 88 | ) 89 | 90 | probe_counts = view(task_name=f"{product}__view_probe_counts_v1") 91 | extract_probe_counts = query(task_name=f"{product}__extract_probe_counts_v1") 92 | 93 | user_counts = view(task_name=f"{product}__view_user_counts_v1") 94 | extract_user_counts = query(task_name=f"{product}__extract_user_counts_v1") 95 | 96 | sample_counts = view(task_name=f"{product}__view_sample_counts_v1") 97 | 98 | fenix_release_done = EmptyOperator(task_id="fenix_release_done") 99 | 100 | ( 101 | wait_for_glam_fenix 102 | >> scalar_bucket_counts 103 | >> scalar_probe_counts 104 | >> probe_counts 105 | ) 106 | ( 107 | wait_for_glam_fenix 108 | >> histogram_bucket_counts 109 | >> histogram_probe_counts 110 | >> probe_counts 111 | ) 112 | probe_counts >> sample_counts >> extract_probe_counts >> fenix_release_done 113 | ( 114 | wait_for_glam_fenix 115 | >> user_counts 116 | >> extract_user_counts 117 | >> fenix_release_done 118 | ) 119 | wait_for_glam_fenix >> fenix_release_done 120 | -------------------------------------------------------------------------------- /dags/glam_fog_release.py: -------------------------------------------------------------------------------- 1 | import operator 2 | from datetime import datetime, timedelta 3 | from functools import partial, reduce 4 | 5 | from airflow import DAG 6 | from airflow.operators.empty import EmptyOperator 7 | from airflow.sensors.external_task import ExternalTaskMarker, ExternalTaskSensor 8 | from airflow.utils.task_group import TaskGroup 9 | 10 | from operators.gcp_container_operator import GKEPodOperator 
11 | from utils.constants import ALLOWED_STATES, FAILED_STATES 12 | from utils.glam_subdags.generate_query import ( 13 | generate_and_run_glean_task, 14 | ) 15 | from utils.tags import Tag 16 | 17 | default_args = { 18 | "owner": "efilho@mozilla.com", 19 | "depends_on_past": False, 20 | "start_date": datetime(2024, 12, 11), 21 | "email": [ 22 | "telemetry-alerts@mozilla.com", 23 | "akomarzewski@mozilla.com", 24 | "efilho@mozilla.com", 25 | ], 26 | "email_on_failure": True, 27 | "email_on_retry": True, 28 | "retries": 1, 29 | "retry_delay": timedelta(minutes=30), 30 | } 31 | 32 | PROJECT = "moz-fx-glam-prod" 33 | 34 | tags = [Tag.ImpactTier.tier_2] 35 | 36 | with DAG( 37 | "glam_fog_release", 38 | default_args=default_args, 39 | max_active_runs=1, 40 | schedule_interval="0 10 * * 6", # 10am on Saturday 41 | tags=tags, 42 | ) as dag: 43 | wait_for_glam_fog = ExternalTaskSensor( 44 | task_id="wait_for_daily_glam_fog_release", 45 | external_dag_id="glam_fog", 46 | external_task_id="daily_release_done", 47 | execution_delta=timedelta(days=-5, hours=-16), 48 | check_existence=True, 49 | mode="reschedule", 50 | allowed_states=ALLOWED_STATES, 51 | failed_states=FAILED_STATES, 52 | pool="DATA_ENG_EXTERNALTASKSENSOR", 53 | email_on_retry=False, 54 | ) 55 | 56 | fog_release_done = EmptyOperator( 57 | task_id="fog_release_done", 58 | ) 59 | 60 | for product in ["firefox_desktop_glam_release"]: 61 | func = partial( 62 | generate_and_run_glean_task, 63 | product=product, 64 | destination_project_id=PROJECT, 65 | env_vars={"STAGE": "incremental"}, 66 | ) 67 | view, init, query = ( 68 | partial(func, task_type=task_type) 69 | for task_type in ["view", "init", "query"] 70 | ) 71 | 72 | # stage 2 - downstream for export 73 | scalar_bucket_counts = query(task_name=f"{product}__scalar_bucket_counts_v1") 74 | scalar_probe_counts = query(task_name=f"{product}__scalar_probe_counts_v1") 75 | 76 | with TaskGroup( 77 | group_id=f"{product}__histogram_bucket_counts_v1", dag=dag, default_args=default_args 78 | ) as histogram_bucket_counts: 79 | prev_task = None 80 | # Windows + Release data is in [0-9] so we're further splitting that range. 
81 | for sample_range in ( 82 | [0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], 83 | [7, 7], [8, 8], [9, 9], [10, 19], [20, 29], [30, 39], 84 | [40, 49], [50, 59], [60, 69], [70, 79], [80, 89], [90, 99] 85 | ): 86 | histogram_bucket_counts_sampled = query( 87 | task_name=( 88 | f"{product}__histogram_bucket_counts_v1_sampled_" 89 | f"{sample_range[0]}_{sample_range[1]}" 90 | ), 91 | min_sample_id=sample_range[0], 92 | max_sample_id=sample_range[1], 93 | replace_table=(sample_range[0] == 0) 94 | ) 95 | if prev_task: 96 | histogram_bucket_counts_sampled.set_upstream(prev_task) 97 | prev_task = histogram_bucket_counts_sampled 98 | 99 | histogram_probe_counts = query( 100 | task_name=f"{product}__histogram_probe_counts_v1" 101 | ) 102 | 103 | probe_counts = view(task_name=f"{product}__view_probe_counts_v1") 104 | extract_probe_counts = query(task_name=f"{product}__extract_probe_counts_v1") 105 | 106 | user_counts = view(task_name=f"{product}__view_user_counts_v1") 107 | extract_user_counts = query(task_name=f"{product}__extract_user_counts_v1") 108 | 109 | sample_counts = view(task_name=f"{product}__view_sample_counts_v1") 110 | 111 | ( 112 | wait_for_glam_fog 113 | >> scalar_bucket_counts 114 | >> scalar_probe_counts 115 | >> probe_counts 116 | ) 117 | ( 118 | wait_for_glam_fog 119 | >> histogram_bucket_counts 120 | >> histogram_probe_counts 121 | >> probe_counts 122 | ) 123 | probe_counts >> sample_counts >> extract_probe_counts >> fog_release_done 124 | ( 125 | wait_for_glam_fog 126 | >> user_counts 127 | >> extract_user_counts 128 | >> fog_release_done 129 | ) 130 | wait_for_glam_fog >> fog_release_done 131 | -------------------------------------------------------------------------------- /dags/glam_glean_imports.py: -------------------------------------------------------------------------------- 1 | """Desktop ETL for importing glean data into GLAM app.""" 2 | 3 | from datetime import datetime, timedelta 4 | 5 | from airflow import DAG 6 | from airflow.models import Variable 7 | from airflow.providers.cncf.kubernetes.secret import Secret 8 | from airflow.sensors.external_task import ExternalTaskSensor 9 | from airflow.utils.task_group import TaskGroup 10 | 11 | from operators.gcp_container_operator import GKEPodOperator 12 | from utils.constants import ALLOWED_STATES, FAILED_STATES 13 | from utils.tags import Tag 14 | 15 | default_args = { 16 | "owner": "efilho@mozilla.com", 17 | "depends_on_past": False, 18 | "start_date": datetime(2019, 10, 22), 19 | "email": [ 20 | "akommasani@mozilla.com", 21 | "akomarzewski@mozilla.com", 22 | "efilho@mozilla.com", 23 | ], 24 | "email_on_failure": True, 25 | "email_on_retry": True, 26 | "retries": 1, 27 | "retry_delay": timedelta(minutes=30), 28 | } 29 | 30 | tags = [Tag.ImpactTier.tier_2] 31 | 32 | dag = DAG( 33 | "glam_glean_imports", 34 | default_args=default_args, 35 | schedule_interval="0 19 * * *", 36 | doc_md=__doc__, 37 | tags=tags, 38 | ) 39 | 40 | wait_for_glam = ExternalTaskSensor( 41 | task_id="wait_for_glam", 42 | external_dag_id="glam", 43 | external_task_group_id="extracts", 44 | execution_delta=timedelta(hours=3), 45 | check_existence=True, 46 | mode="reschedule", 47 | allowed_states=ALLOWED_STATES, 48 | failed_states=FAILED_STATES, 49 | pool="DATA_ENG_EXTERNALTASKSENSOR", 50 | email_on_retry=False, 51 | dag=dag, 52 | ) 53 | 54 | # Move logic from Glam deployment's GKE Cronjob to this dag for better dependency timing 55 | default_glean_import_image = 
"gcr.io/moz-fx-dataops-images-global/gcp-pipelines/glam/glam-production/glam:2023.07.1-43" 56 | 57 | base_docker_args = ["/venv/bin/python", "manage.py"] 58 | 59 | for env in ["Dev", "Prod"]: 60 | glean_import_image = default_glean_import_image 61 | if env == "Dev": # noqa SIMM114 62 | glean_import_image = "gcr.io/moz-fx-dataops-images-global/gcp-pipelines/glam/glam-production/glam:2023.07.1-43" 63 | elif env == "Prod": 64 | glean_import_image = "gcr.io/moz-fx-dataops-images-global/gcp-pipelines/glam/glam-production/glam:2023.07.1-43" 65 | 66 | # Fetch secrets from Google Secret Manager to be injected into the pod. 67 | database_url_secret = Secret( 68 | deploy_type="env", 69 | deploy_target="DATABASE_URL", 70 | secret="airflow-gke-secrets", 71 | key=f"{env}_glam_secret__database_url", 72 | ) 73 | django_secret = Secret( 74 | deploy_type="env", 75 | deploy_target="DJANGO_SECRET_KEY", 76 | secret="airflow-gke-secrets", 77 | key=f"{env}_glam_secret__django_secret_key", 78 | ) 79 | 80 | env_vars = { 81 | # Tells Django what set of configs to load depending on the environment. Defaults to dev on the app. 82 | "DJANGO_CONFIGURATION": env, 83 | "DJANGO_DEBUG": "False", 84 | "DJANGO_SETTINGS_MODULE": "glam.settings", 85 | "GOOGLE_CLOUD_PROJECT": Variable.get(env + "_glam_project"), 86 | } 87 | 88 | 89 | default_glam_import_image = "gcr.io/moz-fx-dataops-images-global/gcp-pipelines/glam/glam-production/glam:2024.10.0-58" 90 | 91 | base_docker_args = ["/venv/bin/python", "manage.py"] 92 | 93 | for env in ["Dev", "Prod"]: 94 | glam_import_image = default_glam_import_image 95 | if env == "Dev": # noqa 114 96 | glam_import_image = "gcr.io/moz-fx-dataops-images-global/gcp-pipelines/glam/glam-production/glam:2024.10.0-58" 97 | elif env == "Prod": 98 | glam_import_image = "gcr.io/moz-fx-dataops-images-global/gcp-pipelines/glam/glam-production/glam:2024.10.0-58" 99 | 100 | # Fetch secrets from Google Secret Manager to be injected into the pod. 
101 | database_url_secret = Secret( 102 | deploy_type="env", 103 | deploy_target="DATABASE_URL", 104 | secret="airflow-gke-secrets", 105 | key=f"{env}_glam_secret__database_url", 106 | ) 107 | django_secret = Secret( 108 | deploy_type="env", 109 | deploy_target="DJANGO_SECRET_KEY", 110 | secret="airflow-gke-secrets", 111 | key=f"{env}_glam_secret__django_secret_key", 112 | ) 113 | 114 | env_vars = { 115 | "DJANGO_CONFIGURATION": env, 116 | "DJANGO_DEBUG": "False", 117 | "DJANGO_SETTINGS_MODULE": "glam.settings", 118 | "GOOGLE_CLOUD_PROJECT": Variable.get(env + "_glam_project"), 119 | } 120 | 121 | with dag as dag, TaskGroup(group_id=env + "_glam") as glam_env_task_group: 122 | glam_import_probes = GKEPodOperator( 123 | reattach_on_restart=True, 124 | task_id="glam_import_probes", 125 | name="glam_import_probes", 126 | image=glam_import_image, 127 | arguments=[*base_docker_args, "import_probes"], 128 | env_vars=env_vars, 129 | secrets=[database_url_secret, django_secret], 130 | ) 131 | 132 | glam_import_revisions = GKEPodOperator( 133 | reattach_on_restart=True, 134 | task_id="glam_import_revisions", 135 | name="glam_import_revisions", 136 | image=glam_import_image, 137 | arguments=[*base_docker_args, "import_revisions"], 138 | env_vars=env_vars, 139 | secrets=[database_url_secret, django_secret], 140 | ) 141 | 142 | wait_for_glam >> glam_env_task_group 143 | -------------------------------------------------------------------------------- /dags/graphics_telemetry.py: -------------------------------------------------------------------------------- 1 | """ 2 | A job to power graphics dashboard. 3 | 4 | Processes main ping data and exports to GCS to power a graphics dashboard at 5 | https://firefoxgraphics.github.io/telemetry/. 6 | 7 | This was originally a Databricks notebook that was migrated to a scheduled 8 | Dataproc task. Source code lives in the 9 | [FirefoxGraphics/telemetry](https://github.com/FirefoxGraphics/telemetry) 10 | repository. 11 | 12 | This is a overwrite kind of operation and as long as the most recent DAG run succeeded 13 | the job should be considered healthy. 
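A hypothetical way to confirm the latest export landed; the bucket and prefix match the GCS_BUCKET / GCS_PREFIX constants defined below:

```python
from google.cloud import storage

client = storage.Client()
blobs = client.list_blobs(
    "moz-fx-data-static-websit-8565-analysis-output", prefix="gfx/telemetry-data/"
)
# Report the most recently written export object.
latest = max(blobs, key=lambda blob: blob.updated)
print(latest.name, latest.updated)
```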
14 | """ 15 | 16 | import datetime 17 | 18 | from airflow import DAG 19 | from airflow.operators.subdag import SubDagOperator 20 | from airflow.sensors.external_task import ExternalTaskSensor 21 | 22 | from utils.constants import ALLOWED_STATES, FAILED_STATES 23 | from utils.dataproc import get_dataproc_parameters, moz_dataproc_pyspark_runner 24 | from utils.tags import Tag 25 | 26 | default_args = { 27 | "owner": "kik@mozilla.com", 28 | "depends_on_past": False, 29 | "start_date": datetime.datetime(2020, 11, 26), 30 | "email": [ 31 | "telemetry-alerts@mozilla.com", 32 | "kik@mozilla.com", 33 | ], 34 | "email_on_failure": True, 35 | "email_on_retry": True, 36 | "retries": 2, 37 | "retry_delay": datetime.timedelta(minutes=20), 38 | } 39 | 40 | PIP_PACKAGES = [ 41 | "git+https://github.com/mozilla/python_moztelemetry.git@v0.10.7#egg=python-moztelemetry", 42 | "git+https://github.com/FirefoxGraphics/telemetry.git#egg=pkg&subdirectory=analyses/bigquery_shim", 43 | "boto3==1.16.20", 44 | "six==1.15.0", 45 | ] 46 | 47 | GCS_BUCKET = "moz-fx-data-static-websit-8565-analysis-output" 48 | GCS_PREFIX = "gfx/telemetry-data/" 49 | 50 | tags = [Tag.ImpactTier.tier_1] 51 | 52 | with DAG( 53 | "graphics_telemetry", 54 | default_args=default_args, 55 | schedule_interval="0 3 * * *", 56 | doc_md=__doc__, 57 | tags=tags, 58 | ) as dag: 59 | wait_for_main_ping = ExternalTaskSensor( 60 | task_id="wait_for_copy_deduplicate_main_ping", 61 | external_dag_id="copy_deduplicate", 62 | external_task_id="copy_deduplicate_main_ping", 63 | execution_delta=datetime.timedelta(hours=2), 64 | check_existence=True, 65 | mode="reschedule", 66 | allowed_states=ALLOWED_STATES, 67 | failed_states=FAILED_STATES, 68 | pool="DATA_ENG_EXTERNALTASKSENSOR", 69 | email_on_retry=False, 70 | dag=dag, 71 | ) 72 | 73 | params = get_dataproc_parameters("google_cloud_airflow_dataproc") 74 | 75 | graphics_trends = SubDagOperator( 76 | task_id="graphics_trends", 77 | dag=dag, 78 | subdag=moz_dataproc_pyspark_runner( 79 | parent_dag_name=dag.dag_id, 80 | image_version="1.5-debian10", 81 | dag_name="graphics_trends", 82 | default_args=default_args, 83 | cluster_name="graphics-trends-{{ ds }}", 84 | job_name="graphics-trends", 85 | python_driver_code="https://raw.githubusercontent.com/mozilla/python_mozetl/main/mozetl/graphics/graphics_telemetry_trends.py", 86 | init_actions_uris=[ 87 | "gs://dataproc-initialization-actions/python/pip-install.sh" 88 | ], 89 | additional_metadata={"PIP_PACKAGES": " ".join(PIP_PACKAGES)}, 90 | additional_properties={ 91 | "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar", 92 | }, 93 | py_args=[ 94 | "--gcs-bucket", 95 | GCS_BUCKET, 96 | "--gcs-prefix", 97 | GCS_PREFIX, 98 | "--weekly-fraction", 99 | "0.003", 100 | ], 101 | idle_delete_ttl=14400, 102 | num_workers=2, 103 | worker_machine_type="n1-standard-4", 104 | gcp_conn_id=params.conn_id, 105 | service_account=params.client_email, 106 | storage_bucket=params.storage_bucket, 107 | ), 108 | ) 109 | 110 | graphics_dashboard = SubDagOperator( 111 | task_id="graphics_dashboard", 112 | dag=dag, 113 | subdag=moz_dataproc_pyspark_runner( 114 | parent_dag_name=dag.dag_id, 115 | image_version="1.5-debian10", 116 | dag_name="graphics_dashboard", 117 | default_args=default_args, 118 | cluster_name="graphics-dashboard-{{ ds }}", 119 | job_name="graphics-dashboard", 120 | python_driver_code="https://raw.githubusercontent.com/mozilla/python_mozetl/main/mozetl/graphics/graphics_telemetry_dashboard.py", 121 | init_actions_uris=[ 122 | 
"gs://dataproc-initialization-actions/python/pip-install.sh" 123 | ], 124 | additional_metadata={"PIP_PACKAGES": " ".join(PIP_PACKAGES)}, 125 | additional_properties={ 126 | "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar", 127 | }, 128 | py_args=[ 129 | "--output-bucket", 130 | GCS_BUCKET, 131 | "--output-prefix", 132 | GCS_PREFIX, 133 | "--release-fraction", 134 | "0.003", 135 | ], 136 | idle_delete_ttl=14400, 137 | num_workers=2, 138 | worker_machine_type="n1-highmem-4", 139 | gcp_conn_id=params.conn_id, 140 | service_account=params.client_email, 141 | storage_bucket=params.storage_bucket, 142 | ), 143 | ) 144 | 145 | wait_for_main_ping >> graphics_trends 146 | wait_for_main_ping >> graphics_dashboard 147 | -------------------------------------------------------------------------------- /dags/jetstream.py: -------------------------------------------------------------------------------- 1 | """ 2 | Powers the [jetstream](https://experimenter.info/jetstream/jetstream/) 3 | analysis framework for experiments. 4 | 5 | See the [jetstream repository](https://github.com/mozilla/jetstream). 6 | 7 | *Triage notes* 8 | 9 | In case jetstream configuration is modified it is perfectly normal for the task 10 | `jetstream_run_config_changed` to take significantly longer to complete (hours instead of minutes). 11 | In these cases we expect anything below 12 hours, only after that amount of time should 12 | this task be considered potentially faulty and subject to the triage process. 13 | """ # noqa: D205 14 | 15 | from datetime import datetime, timedelta 16 | 17 | from airflow import DAG 18 | from airflow.sensors.external_task import ExternalTaskSensor 19 | 20 | from operators.gcp_container_operator import GKEPodOperator 21 | from utils.constants import ALLOWED_STATES, FAILED_STATES 22 | from utils.tags import Tag 23 | 24 | default_args = { 25 | "owner": "ascholtz@mozilla.com", 26 | "email": [ 27 | "ascholtz@mozilla.com", 28 | "mwilliams@mozilla.com", 29 | ], 30 | "depends_on_past": False, 31 | "start_date": datetime(2020, 3, 12), 32 | "email_on_failure": True, 33 | "email_on_retry": True, 34 | "retries": 2, 35 | "retry_delay": timedelta(minutes=30), 36 | } 37 | 38 | tags = [Tag.ImpactTier.tier_1] 39 | 40 | with DAG( 41 | "jetstream", 42 | default_args=default_args, 43 | schedule_interval="0 4 * * *", 44 | doc_md=__doc__, 45 | tags=tags, 46 | ) as dag: 47 | # Built from repo https://github.com/mozilla/jetstream 48 | jetstream_image = "gcr.io/moz-fx-data-experiments/jetstream:latest" 49 | 50 | jetstream_run = GKEPodOperator( 51 | task_id="jetstream_run", 52 | name="jetstream_run", 53 | image=jetstream_image, 54 | email=default_args["email"], 55 | arguments=[ 56 | "--log_to_bigquery", 57 | "run-argo", 58 | "--date={{ ds }}", 59 | # the Airflow cluster doesn't have Compute Engine API access so pass in IP 60 | # and certificate in order for the pod to connect to the Kubernetes cluster 61 | # running Jetstream 62 | "--cluster-ip={{ var.value.jetstream_cluster_ip }}", 63 | "--cluster-cert={{ var.value.jetstream_cluster_cert }}", 64 | ], 65 | dag=dag, 66 | ) 67 | 68 | jetstream_config_changed = GKEPodOperator( 69 | task_id="jetstream_run_config_changed", 70 | name="jetstream_run_config_changed", 71 | image=jetstream_image, 72 | email=default_args["email"], 73 | arguments=[ 74 | "--log_to_bigquery", 75 | "rerun-config-changed", 76 | "--argo", 77 | # the Airflow cluster doesn't have Compute Engine API access so pass in IP 78 | # and certificate in order for the pod to connect to the 
Kubernetes cluster 79 | # running Jetstream 80 | "--cluster-ip={{ var.value.jetstream_cluster_ip }}", 81 | "--cluster-cert={{ var.value.jetstream_cluster_cert }}", 82 | ], 83 | dag=dag, 84 | ) 85 | 86 | wait_for_clients_daily_export = ExternalTaskSensor( 87 | task_id="wait_for_clients_daily", 88 | external_dag_id="bqetl_main_summary", 89 | external_task_id="telemetry_derived__clients_daily__v6", 90 | execution_delta=timedelta(hours=2), 91 | mode="reschedule", 92 | allowed_states=ALLOWED_STATES, 93 | failed_states=FAILED_STATES, 94 | pool="DATA_ENG_EXTERNALTASKSENSOR", 95 | email_on_retry=False, 96 | dag=dag, 97 | ) 98 | 99 | wait_for_search_clients_daily = ExternalTaskSensor( 100 | task_id="wait_for_search_clients_daily", 101 | external_dag_id="bqetl_search", 102 | external_task_id="search_derived__search_clients_daily__v8", 103 | execution_delta=timedelta(hours=1), 104 | mode="reschedule", 105 | allowed_states=ALLOWED_STATES, 106 | failed_states=FAILED_STATES, 107 | pool="DATA_ENG_EXTERNALTASKSENSOR", 108 | email_on_retry=False, 109 | dag=dag, 110 | ) 111 | 112 | wait_for_bq_events = ExternalTaskSensor( 113 | task_id="wait_for_bq_main_events", 114 | external_dag_id="copy_deduplicate", 115 | external_task_id="bq_main_events", 116 | execution_delta=timedelta(hours=3), 117 | mode="reschedule", 118 | allowed_states=ALLOWED_STATES, 119 | failed_states=FAILED_STATES, 120 | pool="DATA_ENG_EXTERNALTASKSENSOR", 121 | email_on_retry=False, 122 | dag=dag, 123 | ) 124 | 125 | wait_for_copy_deduplicate_events = ExternalTaskSensor( 126 | task_id="wait_for_event_events", 127 | external_dag_id="copy_deduplicate", 128 | external_task_id="event_events", 129 | execution_delta=timedelta(hours=3), 130 | mode="reschedule", 131 | allowed_states=ALLOWED_STATES, 132 | failed_states=FAILED_STATES, 133 | pool="DATA_ENG_EXTERNALTASKSENSOR", 134 | email_on_retry=False, 135 | dag=dag, 136 | ) 137 | 138 | jetstream_run.set_upstream( 139 | [ 140 | wait_for_clients_daily_export, 141 | wait_for_search_clients_daily, 142 | wait_for_bq_events, 143 | wait_for_copy_deduplicate_events, 144 | ] 145 | ) 146 | jetstream_config_changed.set_upstream(jetstream_run) 147 | -------------------------------------------------------------------------------- /dags/kpi_forecasting.py: -------------------------------------------------------------------------------- 1 | """ 2 | See [kpi-forecasting in the docker-etl repository](https://github.com/mozilla/docker-etl/blob/main/jobs/kpi-forecasting). 3 | 4 | This DAG runs the forecast Desktop DAU and Mobile DAU. The output powers KPI dashboards and monthly revenue forecasts. 5 | 6 | This DAG is high priority for week 1 of the month and low priority otherwise. 
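Each forecast is described by a `Config` entry in the `CONFIGS` mapping defined below (config file name, upstream DAG to wait for, and its task ids). A purely hypothetical example of registering another forecast; the file and task names are placeholders:

```python
CONFIGS["dau_example"] = Config(
    "dau_example.yaml",                      # config file under kpi_forecasting/configs
    "bqetl_analytics_aggregations",          # upstream DAG providing the aggregates
    ["example_active_users_aggregates_v1"],  # upstream task(s) to wait for
)
```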
7 | """ 8 | 9 | import os 10 | from collections import namedtuple 11 | from datetime import datetime, timedelta 12 | 13 | from airflow import DAG 14 | from airflow.sensors.external_task import ExternalTaskSensor 15 | 16 | from operators.gcp_container_operator import GKEPodOperator 17 | from utils.constants import ALLOWED_STATES, FAILED_STATES 18 | from utils.tags import Tag 19 | 20 | default_args = { 21 | "owner": "bochocki@mozilla.com", 22 | "email": ["bochocki@mozilla.com", "jsilverman@mozilla.com"], 23 | "depends_on_past": False, 24 | "start_date": datetime(2022, 3, 28), 25 | "email_on_failure": True, 26 | "email_on_retry": True, 27 | "retries": 2, 28 | "retry_delay": timedelta(minutes=30), 29 | } 30 | 31 | TAGS = [Tag.ImpactTier.tier_1] 32 | IMAGE = "gcr.io/moz-fx-data-airflow-prod-88e0/kpi-forecasting_docker_etl:latest" 33 | 34 | Config = namedtuple("Config", ["filename", "wait_dag", "wait_tasks"]) 35 | CONFIGS = { 36 | "dau_desktop": Config( 37 | "dau_desktop.yaml", 38 | "bqetl_analytics_aggregations", 39 | [ 40 | "firefox_desktop_active_users_aggregates_v4", 41 | ], 42 | ), 43 | "dau_mobile": Config( 44 | "dau_mobile.yaml", 45 | "bqetl_analytics_aggregations", 46 | [ 47 | "firefox_ios_active_users_aggregates_v3", 48 | "fenix_active_users_aggregates_v3", 49 | "focus_android_active_users_aggregates_v3", 50 | "focus_ios_active_users_aggregates_v3", 51 | ], 52 | ), 53 | } 54 | 55 | with DAG( 56 | "kpi_forecasting", 57 | default_args=default_args, 58 | schedule_interval="0 5 * * *", 59 | doc_md=__doc__, 60 | tags=TAGS, 61 | ) as dag: 62 | for id, config in CONFIGS.items(): 63 | script_path = os.path.join(".", "kpi_forecasting.py") 64 | config_path = os.path.join("kpi_forecasting", "configs", config.filename) 65 | wait_tasks = config.wait_tasks 66 | 67 | if not isinstance(config.wait_tasks, list): 68 | wait_tasks = [wait_tasks] 69 | 70 | forecast_task = GKEPodOperator( 71 | task_id=f"kpi_forecasting_{id}", 72 | arguments=["python", script_path, "-c", config_path], 73 | image=IMAGE, 74 | dag=dag, 75 | ) 76 | 77 | for wait_task in wait_tasks: 78 | wait_task_sensor = ExternalTaskSensor( 79 | task_id=f"wait_for_{wait_task}", 80 | external_dag_id=config.wait_dag, 81 | external_task_id=wait_task, 82 | execution_delta=timedelta(minutes=45), 83 | check_existence=True, 84 | mode="reschedule", 85 | allowed_states=ALLOWED_STATES, 86 | failed_states=FAILED_STATES, 87 | pool="DATA_ENG_EXTERNALTASKSENSOR", 88 | ) 89 | 90 | wait_task_sensor >> forecast_task 91 | -------------------------------------------------------------------------------- /dags/looker_usage_analysis.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.providers.cncf.kubernetes.secret import Secret 5 | 6 | from operators.gcp_container_operator import GKEPodOperator 7 | from utils.tags import Tag 8 | 9 | DOCS = """\ 10 | # Looker Usage Analysis 11 | 12 | *Triage notes* 13 | 14 | As long as the most recent DAG run is successful this job can be considered healthy. 15 | In such case, past DAG failures can be ignored. 
16 | 17 | This DAG runs every quarter (1st day of February, May, August, November) and analyses the 18 | Looker artifact usage using [Henry](https://github.com/looker-open-source/henry) 19 | """ 20 | 21 | 22 | default_args = { 23 | "owner": "ascholtz@mozilla.com", 24 | "depends_on_past": False, 25 | "start_date": datetime(2025, 5, 30), 26 | "email_on_failure": True, 27 | "email_on_retry": True, 28 | "retries": 2, 29 | "retry_delay": timedelta(minutes=30), 30 | } 31 | 32 | tags = [Tag.ImpactTier.tier_3] 33 | 34 | looker_client_id_prod = Secret( 35 | deploy_type="env", 36 | deploy_target="LOOKER_CLIENT_ID", 37 | secret="airflow-gke-secrets", 38 | key="probe_scraper_secret__looker_api_client_id_prod", 39 | ) 40 | looker_client_secret_prod = Secret( 41 | deploy_type="env", 42 | deploy_target="LOOKER_CLIENT_SECRET", 43 | secret="airflow-gke-secrets", 44 | key="probe_scraper_secret__looker_api_client_secret_prod", 45 | ) 46 | looker_instance_uri = "https://mozilla.cloud.looker.com" 47 | 48 | 49 | with DAG( 50 | "looker_usage_analysis", 51 | doc_md=DOCS, 52 | max_active_runs=1, 53 | default_args=default_args, 54 | schedule_interval="0 0 1 2,5,8,11 *", 55 | tags=tags, 56 | ) as dag: 57 | airflow_gke_prod_kwargs = { 58 | "gcp_conn_id": "google_cloud_airflow_gke", 59 | "project_id": "moz-fx-data-airflow-gke-prod", 60 | "location": "us-west1", 61 | "cluster_name": "workloads-prod-v1", 62 | } 63 | 64 | analyze_explores = GKEPodOperator( 65 | task_id="analyze_explores", 66 | arguments=[ 67 | "python", 68 | "-m", 69 | "looker_utils.main", 70 | "analyze", 71 | "--destination_table", 72 | "moz-fx-data-shared-prod.monitoring_derived.looker_usage_explores_v1", 73 | "--date={{ ds }}", 74 | "explores", 75 | ], 76 | image="gcr.io/moz-fx-data-airflow-prod-88e0/looker-utils_docker_etl:latest", 77 | env_vars={ 78 | "LOOKER_INSTANCE_URI": looker_instance_uri, 79 | }, 80 | secrets=[looker_client_id_prod, looker_client_secret_prod], 81 | **airflow_gke_prod_kwargs, 82 | ) 83 | 84 | analyze_models = GKEPodOperator( 85 | task_id="analyze_models", 86 | arguments=[ 87 | "python", 88 | "-m", 89 | "looker_utils.main", 90 | "analyze", 91 | "--destination_table", 92 | "moz-fx-data-shared-prod.monitoring_derived.looker_usage_models_v1", 93 | "--date={{ ds }}", 94 | "models", 95 | ], 96 | image="gcr.io/moz-fx-data-airflow-prod-88e0/looker-utils_docker_etl:latest", 97 | env_vars={ 98 | "LOOKER_INSTANCE_URI": looker_instance_uri, 99 | }, 100 | secrets=[looker_client_id_prod, looker_client_secret_prod], 101 | **airflow_gke_prod_kwargs, 102 | ) 103 | 104 | analyze_unused_explores = GKEPodOperator( 105 | task_id="analyze_unused_explores", 106 | arguments=[ 107 | "python", 108 | "-m", 109 | "looker_utils.main", 110 | "analyze", 111 | "--destination_table", 112 | "moz-fx-data-shared-prod.monitoring_derived.looker_usage_unused_explores_v1", 113 | "--date={{ ds }}", 114 | "unused-explores", 115 | ], 116 | image="gcr.io/moz-fx-data-airflow-prod-88e0/looker-utils_docker_etl:latest", 117 | env_vars={ 118 | "LOOKER_INSTANCE_URI": looker_instance_uri, 119 | }, 120 | secrets=[looker_client_id_prod, looker_client_secret_prod], 121 | **airflow_gke_prod_kwargs, 122 | ) 123 | -------------------------------------------------------------------------------- /dags/ltv.py: -------------------------------------------------------------------------------- 1 | """ 2 | Client Lifetime Value. 3 | 4 | Kicks off jobs to run on a Dataproc cluster. 
The job code lives in 5 | [jobs/ltv_daily.py](https://github.com/mozilla/telemetry-airflow/blob/main/jobs/ltv_daily.py). 6 | 7 | See [client_ltv docs on DTMO](https://docs.telemetry.mozilla.org/datasets/search/client_ltv/reference.html). 8 | """ 9 | from datetime import datetime, timedelta 10 | 11 | from airflow import DAG 12 | from airflow.operators.subdag import SubDagOperator 13 | from airflow.sensors.external_task import ExternalTaskSensor 14 | 15 | from utils.constants import ALLOWED_STATES, FAILED_STATES 16 | from utils.dataproc import ( 17 | copy_artifacts_dev, 18 | get_dataproc_parameters, 19 | moz_dataproc_pyspark_runner, 20 | ) 21 | from utils.gcp import bigquery_etl_query 22 | from utils.tags import Tag 23 | 24 | default_args = { 25 | "owner": "akomar@mozilla.com", 26 | "depends_on_past": True, 27 | "start_date": datetime(2020, 3, 15), 28 | "email": [ 29 | "telemetry-alerts@mozilla.com", 30 | "akomar@mozilla.com", 31 | ], 32 | "email_on_failure": True, 33 | "email_on_retry": True, 34 | "retries": 3, 35 | "retry_delay": timedelta(minutes=30), 36 | } 37 | 38 | tags = [Tag.ImpactTier.tier_2] 39 | 40 | dag = DAG( 41 | "ltv_daily", 42 | default_args=default_args, 43 | schedule_interval="0 4 * * *", 44 | doc_md=__doc__, 45 | tags=tags, 46 | ) 47 | 48 | params = get_dataproc_parameters("google_cloud_airflow_dataproc") 49 | 50 | subdag_args = default_args.copy() 51 | subdag_args["retries"] = 0 52 | 53 | task_id = "ltv_daily" 54 | project = params.project_id if params.is_dev else "moz-fx-data-shared-prod" 55 | ltv_daily = SubDagOperator( 56 | task_id=task_id, 57 | dag=dag, 58 | subdag=moz_dataproc_pyspark_runner( 59 | parent_dag_name=dag.dag_id, 60 | dag_name=task_id, 61 | job_name="ltv-daily", 62 | cluster_name="ltv-daily-{{ ds_nodash }}", 63 | idle_delete_ttl=600, 64 | num_workers=30, 65 | worker_machine_type="n2-standard-16", 66 | optional_components=["ANACONDA"], 67 | init_actions_uris=[ 68 | "gs://dataproc-initialization-actions/python/pip-install.sh" 69 | ], 70 | additional_properties={ 71 | "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest.jar" 72 | }, 73 | additional_metadata={"PIP_PACKAGES": "lifetimes==0.11.1"}, 74 | python_driver_code=f"gs://{params.artifact_bucket}/jobs/ltv_daily.py", 75 | py_args=[ 76 | "--submission-date", 77 | "{{ ds }}", 78 | "--prediction-days", 79 | "364", 80 | "--project-id", 81 | project, 82 | "--source-qualified-table-id", 83 | f"{project}.search.search_rfm", 84 | "--dataset-id", 85 | "analysis", 86 | "--intermediate-table-id", 87 | "ltv_daily_temporary_search_rfm_day", 88 | "--model-input-table-id", 89 | "ltv_daily_model_perf", 90 | "--model-output-table-id", 91 | "ltv_daily", 92 | "--temporary-gcs-bucket", 93 | params.storage_bucket, 94 | ], 95 | gcp_conn_id=params.conn_id, 96 | service_account=params.client_email, 97 | artifact_bucket=params.artifact_bucket, 98 | storage_bucket=params.storage_bucket, 99 | default_args=subdag_args, 100 | ), 101 | ) 102 | 103 | if params.is_dev: 104 | copy_to_dev = copy_artifacts_dev( 105 | dag, params.project_id, params.artifact_bucket, params.storage_bucket 106 | ) 107 | copy_to_dev >> ltv_daily 108 | else: 109 | wait_for_search_clients_last_seen = ExternalTaskSensor( 110 | task_id="wait_for_search_clients_last_seen", 111 | external_dag_id="bqetl_search", 112 | external_task_id="search_derived__search_clients_last_seen__v1", 113 | execution_delta=timedelta(hours=1), 114 | check_existence=True, 115 | mode="reschedule", 116 | allowed_states=ALLOWED_STATES, 117 | failed_states=FAILED_STATES, 118 | 
pool="DATA_ENG_EXTERNALTASKSENSOR", 119 | email_on_retry=False, 120 | dag=dag, 121 | ) 122 | wait_for_search_clients_last_seen >> ltv_daily 123 | 124 | ltv_revenue_join = bigquery_etl_query( 125 | task_id="ltv_revenue_join", 126 | destination_table="client_ltv_v1", 127 | dataset_id="revenue_derived", 128 | project_id="moz-fx-data-shared-prod", 129 | arguments=( 130 | "--clustering_fields=engine,country", 131 | "--schema_update_option=ALLOW_FIELD_ADDITION", 132 | "--schema_update_option=ALLOW_FIELD_RELAXATION", 133 | "--time_partitioning_type=DAY", 134 | "--time_partitioning_field=submission_date", 135 | ), 136 | dag=dag, 137 | ) 138 | 139 | ltv_daily >> ltv_revenue_join 140 | -------------------------------------------------------------------------------- /dags/mad_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | Malicious Addons Detection. 3 | 4 | This runs once a week to emit a trained model to GCS. 5 | 6 | Source code is in the private [mad-server repository](https://github.com/mozilla/mad-server/). 7 | 8 | *Triage notes* 9 | 10 | The way the app was designed it is decoupled from Airflow and will pull all data since the last 11 | successful data pull. What this means if we have a failed DAG run followed by 12 | a successful DAG run it will cover the data from the previous run. 13 | 14 | So as long as the most recent DAG run is successful the job can be considered healthy 15 | and not action is required for failed DAG runs. 16 | """ 17 | 18 | from datetime import datetime, timedelta 19 | 20 | from airflow import DAG 21 | from airflow.providers.cncf.kubernetes.secret import Secret 22 | 23 | from operators.gcp_container_operator import GKEPodOperator 24 | from utils.tags import Tag 25 | 26 | default_args = { 27 | "owner": "dzeber@mozilla.com", 28 | "depends_on_past": False, 29 | "start_date": datetime(2021, 4, 15), 30 | "email_on_failure": True, 31 | "email_on_retry": True, 32 | "retries": 1, 33 | "retry_delay": timedelta(minutes=30), 34 | } 35 | 36 | tags = [Tag.ImpactTier.tier_3] 37 | 38 | gcs_bucket = "mad-resources-training" 39 | gcs_root_training = "datasets" 40 | cloud_service = "GCS" 41 | customs_training_allow_overwrite = "True" 42 | gcloud_project = "mad-model-training" 43 | gcs_report_bucket = "mad-reports" 44 | amo_cred_issuer_secret = Secret( 45 | deploy_type="env", 46 | deploy_target="AMO_CRED_ISSUER", 47 | secret="airflow-gke-secrets", 48 | key="mad_server_secret__amo_cred_issuer", 49 | ) 50 | amo_cred_secret_secret = Secret( 51 | deploy_type="env", 52 | deploy_target="AMO_CRED_SECRET", 53 | secret="airflow-gke-secrets", 54 | key="mad_server_secret__amo_cred_secret", 55 | ) 56 | 57 | with DAG( 58 | "mad_server", 59 | default_args=default_args, 60 | schedule_interval="@weekly", 61 | doc_md=__doc__, 62 | tags=tags, 63 | ) as dag: 64 | mad_server_pull = GKEPodOperator( 65 | task_id="mad_server_pull", 66 | # Controls the entrypoint of the container, which for mad-server 67 | # defaults to bin/run rather than a shell. 
68 | cmds=[ 69 | "/bin/bash", 70 | ], 71 | arguments=[ 72 | "bin/airflow-pull", 73 | ], 74 | image="us-west1-docker.pkg.dev/moz-fx-data-airflow-prod-88e0/data-science-artifacts/mad-server:latest", 75 | startup_timeout_seconds=500, 76 | gcp_conn_id="google_cloud_airflow_gke", 77 | env_vars={ 78 | "GCS_BUCKET": gcs_bucket, 79 | "GCS_ROOT_TRAINING": gcs_root_training, 80 | "CLOUD_SERVICE": cloud_service, 81 | "CUSTOMS_TRAINING_ALLOW_OVERWRITE": customs_training_allow_overwrite, 82 | }, 83 | email=[ 84 | "dzeber@mozilla.com", 85 | "gleonard@mozilla.com", 86 | ], 87 | secrets=[amo_cred_issuer_secret, amo_cred_secret_secret], 88 | ) 89 | mad_train_model = GKEPodOperator( 90 | task_id="train_model", 91 | cmds=[ 92 | "/bin/bash", 93 | ], 94 | arguments=[ 95 | "bin/train_model", 96 | "--publish", 97 | "--publish-as-latest", 98 | "./working", 99 | ], 100 | image="us-west1-docker.pkg.dev/moz-fx-data-airflow-prod-88e0/data-science-artifacts/mad-server:latest", 101 | startup_timeout_seconds=500, 102 | env_vars={ 103 | "GCS_BUCKET": gcs_bucket, 104 | "GCS_ROOT_TRAINING": gcs_root_training, 105 | "CLOUD_SERVICE": cloud_service, 106 | "CUSTOMS_TRAINING_ALLOW_OVERWRITE": customs_training_allow_overwrite, 107 | "GCLOUD_PROJECT": gcloud_project, 108 | "GCS_REPORT_BUCKET": gcs_report_bucket, 109 | }, 110 | email=[ 111 | "dzeber@mozilla.com", 112 | "gleonard@mozilla.com", 113 | ], 114 | ) 115 | new_data_eval = GKEPodOperator( 116 | task_id="evaluate_new_data", 117 | cmds=[ 118 | "/bin/bash", 119 | ], 120 | arguments=[ 121 | "bin/evaluate_new_data", 122 | "--publish", 123 | "--publish-as-latest", 124 | "./working", 125 | ], 126 | image="us-west1-docker.pkg.dev/moz-fx-data-airflow-prod-88e0/data-science-artifacts/mad-server:latest", 127 | startup_timeout_seconds=500, 128 | gcp_conn_id="google_cloud_airflow_gke", 129 | env_vars={ 130 | "GCS_BUCKET": gcs_bucket, 131 | "GCS_ROOT_TRAINING": gcs_root_training, 132 | "CLOUD_SERVICE": cloud_service, 133 | "CUSTOMS_TRAINING_ALLOW_OVERWRITE": customs_training_allow_overwrite, 134 | "GCLOUD_PROJECT": gcloud_project, 135 | "GCS_REPORT_BUCKET": gcs_report_bucket, 136 | }, 137 | email=[ 138 | "dzeber@mozilla.com", 139 | "gleonard@mozilla.com", 140 | ], 141 | ) 142 | 143 | mad_server_pull >> mad_train_model >> new_data_eval 144 | -------------------------------------------------------------------------------- /dags/microsoft_store.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from airflow import DAG 4 | from airflow.providers.cncf.kubernetes.secret import Secret 5 | 6 | from operators.gcp_container_operator import GKEPodOperator 7 | from utils.tags import Tag 8 | 9 | # Deploy the Microsoft Store keys stored in the k8s secret `airflow-gke-secrets` as environment variables.
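# Each Secret below uses deploy_type="env", so the referenced key of that k8s secret
# is exposed inside the task pod as the environment variable named by deploy_target.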
10 | 11 | microsoft_client_id = Secret( 12 | deploy_type="env", 13 | deploy_target="MICROSOFT_CLIENT_ID", 14 | secret="airflow-gke-secrets", 15 | key="MICROSOFT_CLIENT_ID", 16 | ) 17 | microsoft_client_secret = Secret( 18 | deploy_type="env", 19 | deploy_target="MICROSOFT_CLIENT_SECRET", 20 | secret="airflow-gke-secrets", 21 | key="MICROSOFT_CLIENT_SECRET", 22 | ) 23 | microsoft_tenant_id = Secret( 24 | deploy_type="env", 25 | deploy_target="MICROSOFT_TENANT_ID", 26 | secret="airflow-gke-secrets", 27 | key="MICROSOFT_TENANT_ID", 28 | ) 29 | microsoft_store_app_list = Secret( 30 | deploy_type="env", 31 | deploy_target="MICROSOFT_STORE_APP_LIST", 32 | secret="airflow-gke-secrets", 33 | key="MICROSOFT_STORE_APP_LIST", 34 | ) 35 | 36 | docs = """ 37 | This DAG runs the daily download of aggregated data from the Microsoft Store API. 38 | #### Owner 39 | mhirose@mozilla.com 40 | #### Tags 41 | * impact/tier_2 42 | * repo/bigquery-etl 43 | """ 44 | 45 | default_args = { 46 | "owner": "mhirose@mozilla.com", 47 | "start_date": datetime.datetime(2024, 6, 18, 0, 0), 48 | "end_date": None, 49 | "email": ["telemetry-alerts@mozilla.com", "mhirose@mozilla.com"], 50 | "depends_on_past": False, 51 | "retry_delay": datetime.timedelta(seconds=1800), 52 | "email_on_failure": True, 53 | "email_on_retry": True, 54 | "retries": 2, 55 | } 56 | 57 | tags = [Tag.ImpactTier.tier_2] 58 | 59 | # Have the DAG run later in the day so Microsoft Store data has a chance to populate 60 | with DAG( 61 | "microsoft_store", 62 | default_args=default_args, 63 | schedule_interval="0 15 * * *", 64 | doc_md=docs, 65 | tags=tags, 66 | ) as dag: 67 | table_names = ( 68 | "app_acquisitions", 69 | "app_conversions", 70 | "app_installs", 71 | ) 72 | for table in table_names: 73 | GKEPodOperator( 74 | task_id=f"microsoft_derived__{table}__v1", 75 | secrets=[ 76 | microsoft_client_id, 77 | microsoft_client_secret, 78 | microsoft_tenant_id, 79 | microsoft_store_app_list, 80 | ], 81 | arguments=[ 82 | "python", 83 | f"sql/moz-fx-data-shared-prod/microsoft_derived/{table}_v1/query.py", 84 | "--date={{ macros.ds_add(ds, -3) }}", 85 | ], 86 | image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest", 87 | owner="mhirose@mozilla.com", 88 | email=["mhirose@mozilla.com", "telemetry-alerts@mozilla.com"], 89 | ) 90 | -------------------------------------------------------------------------------- /dags/operational_monitoring.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.sensors.external_task import ExternalTaskSensor 5 | 6 | from operators.gcp_container_operator import GKEPodOperator 7 | from utils.tags import Tag 8 | 9 | docs = """ 10 | ### operational_monitoring 11 | 12 | 13 | This DAG schedules queries for populating datasets used for operational monitoring. 14 | The queries are generated via [`opmon`](https://github.com/mozilla/opmon). 
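Each scheduled run invokes the opmon container with `--log_to_bigquery run --date={{ ds }}` and only starts once the upstream `clients_daily` and `search_clients_daily` tables have landed (see the sensors defined below).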
15 | 16 | #### Owner 17 | 18 | ascholtz@mozilla.com 19 | """ 20 | 21 | default_args = { 22 | "owner": "ascholtz@mozilla.com", 23 | "email": [ 24 | "telemetry-alerts@mozilla.com", 25 | "ascholtz@mozilla.com", 26 | ], 27 | "depends_on_past": False, 28 | "start_date": datetime(2021, 6, 3), 29 | "email_on_failure": True, 30 | "email_on_retry": True, 31 | "retries": 2, 32 | "retry_delay": timedelta(minutes=30), 33 | } 34 | 35 | DAG_NAME = "operational_monitoring" 36 | tags = [Tag.ImpactTier.tier_3] 37 | 38 | with DAG( 39 | DAG_NAME, 40 | default_args=default_args, 41 | schedule_interval="0 4 * * *", 42 | doc_md=docs, 43 | tags=tags, 44 | ) as dag: 45 | # Built from repo https://github.com/mozilla/opmon 46 | opmon_image = "gcr.io/moz-fx-data-experiments/opmon:latest" 47 | 48 | opmon_run = GKEPodOperator( 49 | task_id="opmon_run", 50 | name="opmon_run", 51 | image=opmon_image, 52 | email=["ascholtz@mozilla.com"], 53 | arguments=[ 54 | "--log_to_bigquery", 55 | "run", 56 | "--date={{ ds }}", 57 | ], 58 | dag=dag, 59 | ) 60 | 61 | wait_for_clients_daily_export = ExternalTaskSensor( 62 | task_id="wait_for_clients_daily", 63 | external_dag_id="bqetl_main_summary", 64 | external_task_id="telemetry_derived__clients_daily__v6", 65 | execution_delta=timedelta(hours=2), 66 | mode="reschedule", 67 | allowed_states=["success"], 68 | failed_states=["failed", "upstream_failed", "skipped"], 69 | pool="DATA_ENG_EXTERNALTASKSENSOR", 70 | email_on_retry=False, 71 | dag=dag, 72 | ) 73 | 74 | wait_for_search_clients_daily = ExternalTaskSensor( 75 | task_id="wait_for_search_clients_daily", 76 | external_dag_id="bqetl_search", 77 | external_task_id="search_derived__search_clients_daily__v8", 78 | execution_delta=timedelta(hours=1), 79 | mode="reschedule", 80 | allowed_states=["success"], 81 | failed_states=["failed", "upstream_failed", "skipped"], 82 | pool="DATA_ENG_EXTERNALTASKSENSOR", 83 | email_on_retry=False, 84 | dag=dag, 85 | ) 86 | 87 | opmon_run.set_upstream( 88 | [ 89 | wait_for_clients_daily_export, 90 | wait_for_search_clients_daily, 91 | ] 92 | ) 93 | -------------------------------------------------------------------------------- /dags/operational_monitoring_backfill.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from airflow.decorators import dag, task 4 | from airflow.models.param import Param 5 | 6 | from operators.gcp_container_operator import GKEPodOperator 7 | from utils.tags import Tag 8 | 9 | docs = """ 10 | ### operational_monitoring_backfill 11 | Built from the telemetry-airflow repo, [dags/operational_monitoring_backfill.py](https://github.com/mozilla/telemetry-airflow/blob/main/dags/operational_monitoring_backfill.py) 12 | Triggers backfills for specific operational monitoring projects.
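For example, triggering this DAG with slug `my-opmon-project` (a placeholder) and a date range of 2024-01-01 to 2024-01-07 makes the backfill pod run roughly `backfill --slug my-opmon-project --start-date 2024-01-01 --end_date 2024-01-07` (see `generate_backfill_arguments` below).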
13 | 14 | #### Owner 15 | 16 | ascholtz@mozilla.com 17 | lschiestl@mozilla.com 18 | """ 19 | 20 | tags = [Tag.ImpactTier.tier_3, Tag.Triage.no_triage] 21 | 22 | 23 | @dag( 24 | dag_id="operational_monitoring_backfill", 25 | start_date=datetime.datetime(2021, 1, 1, 0, 0), 26 | schedule_interval=None, 27 | catchup=False, 28 | doc_md=docs, 29 | dagrun_timeout=datetime.timedelta(days=4), 30 | tags=tags, 31 | render_template_as_native_obj=True, 32 | params={ 33 | "slug": Param( 34 | "slug", 35 | title="Slug", 36 | type="string", 37 | description="[Required] Experimenter slug or slug of OpMon project to (re)run the analysis for", 38 | ), 39 | "start_date": Param( 40 | f"{datetime.date.today()}", 41 | title="Start Date", 42 | type="string", 43 | format="date", 44 | description="[Required] First date to be backfilled, inclusive", 45 | ), 46 | "end_date": Param( 47 | f"{datetime.date.today()}", 48 | title="End Date", 49 | type="string", 50 | format="date", 51 | description="[Required] Last date to be backfilled, inclusive", 52 | ), 53 | "args": Param( 54 | None, 55 | title="Additional Arguments", 56 | type=["null", "string"], 57 | description="[Optional] Additional command line arguments", 58 | ), 59 | }, 60 | ) 61 | def operational_monitoring_backfill_dag(): 62 | @task 63 | def generate_backfill_arguments(**context): 64 | cmd = [ 65 | "backfill", 66 | "--slug", 67 | context["params"]["slug"], 68 | "--start-date", 69 | context["params"]["start_date"], 70 | "--end_date", 71 | context["params"]["end_date"], 72 | ] 73 | 74 | if args := context["params"]["args"]: 75 | cmd.append(args) 76 | 77 | return cmd 78 | 79 | # Built from repo https://github.com/mozilla/opmon 80 | opmon_image = "gcr.io/moz-fx-data-experiments/opmon:latest" 81 | 82 | GKEPodOperator( 83 | task_id="opmon_backfill", 84 | name="opmon_backfill", 85 | image=opmon_image, 86 | arguments=generate_backfill_arguments(), 87 | ) 88 | 89 | 90 | dag = operational_monitoring_backfill_dag() 91 | -------------------------------------------------------------------------------- /dags/partybal.py: -------------------------------------------------------------------------------- 1 | """ 2 | DAG to schedule generation of results for partybal. 3 | 4 | Partybal is an experimental service to visualize experiment results that have been 5 | produced by [jetstream](https://github.com/mozilla/jetstream). 6 | See https://github.com/mozilla/partybal 7 | 8 | This DAG depends on experiment results being available for a certain date. 9 | So if the [jetstream DAG](https://workflow.telemetry.mozilla.org/tree?dag_id=jetstream) 10 | does not successfully complete running, then the tasks in this DAG will fail as well. 11 | 12 | The DAG is scheduled to run every three hours to pick up experiment results from manually 13 | triggered analysis runs quickly. 14 | 15 | *Triage notes* 16 | 17 | As long as the most recent DAG run is successful this job can be considered healthy. 18 | In such case, past DAG failures can be ignored. 
19 | """ 20 | 21 | from datetime import datetime, timedelta 22 | 23 | from airflow import DAG 24 | 25 | from operators.gcp_container_operator import GKEPodOperator 26 | from utils.tags import Tag 27 | 28 | default_args = { 29 | "owner": "ascholtz@mozilla.com", 30 | "email": [ 31 | "ascholtz@mozilla.com", 32 | "mwilliams@mozilla.com", 33 | ], 34 | "depends_on_past": False, 35 | "start_date": datetime(2021, 6, 21), 36 | "email_on_failure": True, 37 | "email_on_retry": True, 38 | "retries": 2, 39 | "retry_delay": timedelta(minutes=30), 40 | } 41 | 42 | tags = [Tag.ImpactTier.tier_2] 43 | 44 | with DAG( 45 | "partybal", 46 | default_args=default_args, 47 | schedule_interval="0 */3 * * *", 48 | doc_md=__doc__, 49 | tags=tags, 50 | ) as dag: 51 | # Built from repo https://github.com/mozilla/partybal 52 | partybal_image = "gcr.io/moz-fx-data-experiments/partybal:latest" 53 | 54 | partybal = GKEPodOperator( 55 | task_id="partybal", 56 | name="partybal", 57 | image=partybal_image, 58 | email=[ 59 | "ascholtz@mozilla.com", 60 | "mwilliams@mozilla.com", 61 | ], 62 | dag=dag, 63 | ) 64 | -------------------------------------------------------------------------------- /dags/play_store_export.py: -------------------------------------------------------------------------------- 1 | """ 2 | Runs a Docker image that backfills data from the Google Play store to BigQuery. 3 | 4 | The container is defined in 5 | [docker-etl](https://github.com/mozilla/docker-etl/tree/main/jobs/play-store-export) 6 | """ 7 | 8 | from datetime import datetime, timedelta 9 | 10 | from airflow import DAG 11 | 12 | from operators.gcp_container_operator import GKEPodOperator 13 | from utils.tags import Tag 14 | 15 | default_args = { 16 | "owner": "akomar@mozilla.com", 17 | "depends_on_past": False, 18 | "start_date": datetime(2020, 6, 23), 19 | "email_on_failure": True, 20 | "email_on_retry": True, 21 | "retries": 1, 22 | "retry_delay": timedelta(minutes=30), 23 | } 24 | 25 | project_id = "moz-fx-data-marketing-prod" 26 | 27 | tags = [Tag.ImpactTier.tier_3] 28 | 29 | with DAG( 30 | "play_store_export", 31 | default_args=default_args, 32 | doc_md=__doc__, 33 | schedule_interval="@daily", 34 | tags=tags, 35 | ) as dag: 36 | play_store_export = GKEPodOperator( 37 | task_id="play_store_export", 38 | arguments=[ 39 | "python", 40 | "play_store_export/export.py", 41 | "--date={{ yesterday_ds }}", 42 | "--backfill-day-count=60", 43 | "--project", 44 | project_id, 45 | "--transfer-config={{ var.value.play_store_transfer_config_id }}", 46 | ], 47 | image="gcr.io/moz-fx-data-airflow-prod-88e0/play-store-export:latest", 48 | gcp_conn_id="google_cloud_airflow_gke", 49 | dag=dag, 50 | email=[ 51 | "akomar@mozilla.com", 52 | ], 53 | ) 54 | -------------------------------------------------------------------------------- /dags/publish_bqetl_static.py: -------------------------------------------------------------------------------- 1 | """ 2 | Daily deployment of static bigquery-etl data to various projects. 3 | 4 | See the publish command [here](https://github.com/mozilla/bigquery-etl/blob/main/bigquery_etl/static/__init__.py). 
5 | """ 6 | 7 | from datetime import datetime, timedelta 8 | 9 | from airflow import DAG 10 | 11 | from operators.gcp_container_operator import GKEPodOperator 12 | from utils.tags import Tag 13 | 14 | IMAGE = "gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest" 15 | 16 | default_args = { 17 | "owner": "anicholson@mozilla.com", 18 | "email": [ 19 | "telemetry-alerts@mozilla.com", 20 | "anicholson@mozilla.com", 21 | ], 22 | "depends_on_past": False, 23 | "start_date": datetime(2022, 4, 4), 24 | "email_on_failure": True, 25 | "email_on_retry": True, 26 | "retries": 2, 27 | "retry_delay": timedelta(minutes=30), 28 | } 29 | 30 | tags = [Tag.ImpactTier.tier_2] 31 | 32 | with DAG( 33 | "publish_bqetl_static", 34 | default_args=default_args, 35 | schedule_interval="@daily", 36 | doc_md=__doc__, 37 | tags=tags, 38 | ) as dag: 39 | publish_static_mozdata = GKEPodOperator( 40 | task_id="publish_static_mozdata", 41 | arguments=["script/bqetl", "static", "publish", "--project_id", "mozdata"], 42 | image=IMAGE, 43 | ) 44 | 45 | publish_static_shared_prod = GKEPodOperator( 46 | task_id="publish_static_shared_prod", 47 | arguments=[ 48 | "script/bqetl", 49 | "static", 50 | "publish", 51 | "--project_id", 52 | "moz-fx-data-shared-prod", 53 | ], 54 | image=IMAGE, 55 | ) 56 | -------------------------------------------------------------------------------- /dags/search_alert.py: -------------------------------------------------------------------------------- 1 | """ 2 | Runs a Docker image that produces search alert data. 3 | 4 | The container is defined in 5 | [docker-etl](https://github.com/mozilla/docker-etl/tree/main/jobs/search-alert) 6 | """ 7 | 8 | from datetime import datetime, timedelta 9 | 10 | from airflow import DAG 11 | from airflow.sensors.external_task import ExternalTaskSensor 12 | 13 | from operators.gcp_container_operator import GKEPodOperator 14 | from utils.constants import ALLOWED_STATES, FAILED_STATES 15 | from utils.tags import Tag 16 | 17 | default_args = { 18 | "owner": "akomar@mozilla.com", 19 | "depends_on_past": False, 20 | "start_date": datetime(2022, 1, 20), 21 | "email": [ 22 | "telemetry-alerts@mozilla.com", 23 | "akomar@mozilla.com", 24 | ], 25 | "email_on_failure": True, 26 | "email_on_retry": True, 27 | "retries": 3, 28 | "retry_delay": timedelta(minutes=30), 29 | } 30 | 31 | tags = [Tag.ImpactTier.tier_2] 32 | 33 | with DAG( 34 | "search_alert", 35 | default_args=default_args, 36 | doc_md=__doc__, 37 | schedule_interval="0 4 * * *", 38 | # We don't want to run more than a single instance of this DAG 39 | # since underlying tables are not partitioned 40 | max_active_runs=1, 41 | tags=tags, 42 | ) as dag: 43 | wait_for_search_aggregates = ExternalTaskSensor( 44 | task_id="wait_for_search_aggregates", 45 | external_dag_id="bqetl_search", 46 | external_task_id="search_derived__search_aggregates__v8", 47 | execution_delta=timedelta(hours=1), 48 | check_existence=True, 49 | mode="reschedule", 50 | allowed_states=ALLOWED_STATES, 51 | failed_states=FAILED_STATES, 52 | pool="DATA_ENG_EXTERNALTASKSENSOR", 53 | email_on_retry=False, 54 | dag=dag, 55 | ) 56 | 57 | search_alert = GKEPodOperator( 58 | task_id="search_alert", 59 | arguments=[ 60 | "python", 61 | "search_alert/main.py", 62 | "--submission_date={{ ds }}", 63 | "--project_id=mozdata", 64 | ], 65 | image="gcr.io/moz-fx-data-airflow-prod-88e0/search-alert_docker_etl:latest", 66 | gcp_conn_id="google_cloud_airflow_gke", 67 | ) 68 | 69 | wait_for_search_aggregates >> search_alert 70 | 
-------------------------------------------------------------------------------- /dags/search_forecasting.py: -------------------------------------------------------------------------------- 1 | """ 2 | See [kpi-forecasting in the docker-etl repository](https://github.com/mozilla/docker-etl/blob/main/jobs/kpi-forecasting). 3 | 4 | This DAG runs the search forecasts for the DAU, search count and ad clicks metrics . 5 | 6 | This DAG is high priority for week 1 of the month and low priority otherwise. 7 | """ 8 | 9 | import os 10 | from datetime import datetime, timedelta 11 | 12 | from airflow import DAG 13 | from airflow.sensors.external_task import ExternalTaskSensor 14 | 15 | from operators.gcp_container_operator import GKEPodOperator 16 | from utils.constants import ALLOWED_STATES, FAILED_STATES 17 | from utils.tags import Tag 18 | 19 | default_args = { 20 | "owner": "jsnyder@mozilla.com", 21 | "email": [ 22 | "jsnyder@mozilla.com", 23 | "mbowerman@mozilla.com", 24 | "telemetry-alerts@mozilla.com", 25 | ], 26 | "depends_on_past": False, 27 | "start_date": datetime(2024, 7, 6), 28 | "email_on_failure": True, 29 | "email_on_retry": False, 30 | "retries": 2, 31 | "retry_delay": timedelta(minutes=30), 32 | } 33 | 34 | TAGS = [Tag.ImpactTier.tier_1] 35 | IMAGE = "gcr.io/moz-fx-data-airflow-prod-88e0/kpi-forecasting_docker_etl:latest" 36 | 37 | FORECAST_METRICS_LIST = [ 38 | "search_forecasting_daily_active_users", 39 | "search_forecasting_search_count", 40 | "search_forecasting_ad_clicks", 41 | ] 42 | 43 | # schedule to run after bqetl_search_dashboard completes 44 | with DAG( 45 | "search_forecasting", 46 | default_args=default_args, 47 | schedule_interval="30 5 7 * *", 48 | doc_md=__doc__, 49 | tags=TAGS, 50 | ) as dag: 51 | # all the search forecasting metrics come from the search_revenue_levers_daily 52 | # table which is run in the bqetl_search_dashboard dag 53 | # as the search_derived__search_revenue_levers_daily__v1 task 54 | # see: https://workflow.telemetry.mozilla.org/dags/bqetl_search_dashboard/grid 55 | wait_task_sensor = ExternalTaskSensor( 56 | task_id="wait_for_search_dashboard", 57 | external_dag_id="bqetl_search_dashboard", 58 | external_task_id="search_derived__search_revenue_levers_daily__v1", 59 | check_existence=True, 60 | mode="reschedule", 61 | allowed_states=ALLOWED_STATES, 62 | failed_states=FAILED_STATES, 63 | pool="DATA_ENG_EXTERNALTASKSENSOR", 64 | ) 65 | 66 | for metric in FORECAST_METRICS_LIST: 67 | # pass the search_forecasting configs to the KPI forecasting script 68 | config_filename = f"{metric}.yaml" 69 | script_path = os.path.join(".", "kpi_forecasting.py") 70 | config_path = os.path.join("kpi_forecasting", "configs", config_filename) 71 | 72 | forecast_task = GKEPodOperator( 73 | task_id=f"search_forecasting_{metric}", 74 | arguments=["python", script_path, "-c", config_path], 75 | image=IMAGE, 76 | ) 77 | 78 | wait_task_sensor >> forecast_task 79 | -------------------------------------------------------------------------------- /dags/shredder_backfill.py: -------------------------------------------------------------------------------- 1 | from datetime import date, datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models.param import Param 5 | from airflow.operators.python import BranchPythonOperator 6 | 7 | from operators.gcp_container_operator import GKEPodOperator, OnFinishAction 8 | from utils.tags import Tag 9 | 10 | docs = """ 11 | ### shredder-backfill 12 | 13 | #### Description 14 | 15 | Manually triggered DAG that handles 
deletion requests from a specified time period 16 | for a list of given tables. 17 | 18 | `target_tables` is a list of tables formatted as `dataset.table_name` with one table per line. 19 | The moz-fx-data-shared-prod project is assumed because shredder currently only runs 20 | on tables in this project. 21 | 22 | Use the dry run parameter to run shredder with the --dry-run option and validate the parameters. 23 | Note that the shredder dry run will still dry run queries against every partition of each table 24 | so it may take a long time to finish if a lot of tables are given. 25 | 26 | This DAG is meant to be used to handle older deletion requests for tables that are already being 27 | shredded. Any provided tables that aren't already valid deletion targets will be ignored. 28 | 29 | #### Owner 30 | 31 | bewu@mozilla.com 32 | """ 33 | 34 | params = { 35 | "request_start_date": Param( 36 | default=(date.today() - timedelta(days=7)).isoformat(), 37 | description="First date of deletion requests to process", 38 | type="string", 39 | format="date", 40 | ), 41 | "request_end_date": Param( 42 | default=(date.today()).isoformat(), 43 | description="Last date of data (i.e. partition) to delete from", 44 | type="string", 45 | format="date", 46 | ), 47 | "target_tables": Param( 48 | default=["dataset.table_name"], 49 | description="Tables to delete from (one per line)", 50 | type="array", 51 | minItems=1, 52 | ), 53 | "dry_run": Param(default=True, type="boolean"), 54 | } 55 | 56 | default_args = { 57 | "owner": "bewu@mozilla.com", 58 | "depends_on_past": False, 59 | "start_date": datetime(2024, 3, 1), 60 | "catchup": False, 61 | "email": [ 62 | "telemetry-alerts@mozilla.com", 63 | "bewu@mozilla.com", 64 | ], 65 | "email_on_failure": True, 66 | "email_on_retry": False, 67 | # transient failures are expected and can be handled with the state table 68 | "retries": 44, 69 | "retry_delay": timedelta(minutes=5), 70 | } 71 | 72 | tags = [ 73 | Tag.ImpactTier.tier_3, 74 | Tag.Triage.no_triage, 75 | ] 76 | 77 | NON_DRY_RUN_TASK_ID = "shredder_backfill" 78 | DRY_RUN_TASK_ID = "shredder_backfill_dry_run" 79 | 80 | 81 | def base_backfill_operator(dry_run): 82 | """Create task for backfill, filling out parameters based on dry run.""" 83 | return GKEPodOperator( 84 | task_id=DRY_RUN_TASK_ID if dry_run else NON_DRY_RUN_TASK_ID, 85 | cmds=[ 86 | "script/shredder_delete", 87 | *(["--dry-run"] if dry_run else []), 88 | # use different tables from scheduled task so they can be monitored separately 89 | "--state-table=moz-fx-data-shredder.shredder_state.shredder_state_backfill", 90 | "--task-table=moz-fx-data-shredder.shredder_state.tasks_backfill", 91 | "--end-date={{ params.request_end_date }}", 92 | "--start-date={{ params.request_start_date }}", 93 | "--no-use-dml", 94 | # low parallelism to reduce slot contention with scheduled task 95 | "--parallelism=1", 96 | "--billing-project=moz-fx-data-bq-batch-prod", 97 | "--only", 98 | ], 99 | # target_tables will be rendered as a python list 100 | arguments="{{ params.target_tables }}", 101 | image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest", 102 | on_finish_action=OnFinishAction.DELETE_POD.value, 103 | reattach_on_restart=True, 104 | ) 105 | 106 | 107 | with DAG( 108 | "shredder_backfill", 109 | default_args=default_args, 110 | schedule=None, 111 | doc_md=docs, 112 | tags=tags, 113 | params=params, 114 | # needed to pass the list of tables as a list to the pod operator 115 | render_template_as_native_obj=True, 116 | ) as dag: 117 | # Use separate tasks for dry run
to make logs easier to find 118 | dry_run_branch = BranchPythonOperator( 119 | task_id="dry_run_branch", 120 | python_callable=lambda dry_run: ( 121 | DRY_RUN_TASK_ID if dry_run else NON_DRY_RUN_TASK_ID 122 | ), 123 | op_kwargs={"dry_run": "{{ params.dry_run }}"}, 124 | ) 125 | 126 | backfill_tasks = [ 127 | base_backfill_operator(dry_run_value) for dry_run_value in (True, False) 128 | ] 129 | 130 | dry_run_branch >> backfill_tasks 131 | -------------------------------------------------------------------------------- /dags/socorro_import.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.subdag import SubDagOperator 5 | from airflow.sensors.external_task import ExternalTaskMarker 6 | from airflow.utils.task_group import TaskGroup 7 | 8 | from operators.gcp_container_operator import GKEPodOperator 9 | from utils.dataproc import moz_dataproc_pyspark_runner 10 | from utils.tags import Tag 11 | 12 | """ 13 | This uses dataproc to rewrite the data to parquet in gcs, and 14 | load the parquet data into bigquery. 15 | 16 | The following WTMO connections are needed in order for this job to run: 17 | conn - google_cloud_airflow_dataproc 18 | conn - google_cloud_airflow_gke 19 | """ 20 | 21 | default_args = { 22 | "owner": "srose@mozilla.com", 23 | "depends_on_past": False, 24 | "start_date": datetime(2019, 9, 10), 25 | "email": [ 26 | "srose@mozilla.com", 27 | "telemetry-alerts@mozilla.com", 28 | ], 29 | "email_on_failure": True, 30 | "email_on_retry": True, 31 | "retries": 2, 32 | "retry_delay": timedelta(minutes=30), 33 | } 34 | 35 | tags = [Tag.ImpactTier.tier_2] 36 | 37 | with DAG( 38 | "socorro_import", 39 | default_args=default_args, 40 | schedule_interval="@daily", 41 | tags=tags, 42 | ) as dag: 43 | # Unsalted cluster name so subsequent runs fail if the cluster name exists 44 | cluster_name = "socorro-import-dataproc-cluster" 45 | 46 | # Defined in Airflow's UI -> Admin -> Connections 47 | gcp_conn_id = "google_cloud_airflow_dataproc" 48 | project_id = "airflow-dataproc" 49 | 50 | # We use an application-specific gcs bucket because the data needs to be transformed 51 | # in dataproc before loading 52 | 53 | gcs_data_bucket = "moz-fx-data-prod-socorro-data" 54 | 55 | dataset = "socorro_crash" 56 | dataset_version = "v2" 57 | date_submission_col = "crash_date" 58 | 59 | objects_prefix = "{}/{}/{}={}".format( 60 | dataset, dataset_version, date_submission_col, "{{ ds_nodash }}" 61 | ) 62 | 63 | # Spark job reads gcs json and writes gcs parquet 64 | crash_report_parquet = SubDagOperator( 65 | task_id="crash_report_parquet", 66 | subdag=moz_dataproc_pyspark_runner( 67 | parent_dag_name=dag.dag_id, 68 | dag_name="crash_report_parquet", 69 | default_args=default_args, 70 | cluster_name=cluster_name, 71 | job_name="Socorro_Crash_Reports_to_Parquet", 72 | python_driver_code="gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/socorro_import_crash_data.py", 73 | py_args=[ 74 | "--date", 75 | "{{ ds_nodash }}", 76 | "--source-gcs-path", 77 | "gs://moz-fx-socorro-prod-prod-telemetry/v1/crash_report", 78 | "--dest-gcs-path", 79 | f"gs://{gcs_data_bucket}/{dataset}", 80 | ], 81 | idle_delete_ttl=14400, 82 | num_workers=8, 83 | worker_machine_type="n1-standard-8", 84 | gcp_conn_id=gcp_conn_id, 85 | ), 86 | ) 87 | 88 | bq_gcp_conn_id = "google_cloud_airflow_gke" 89 | 90 | # Not using load_to_bigquery since our source data is on GCS. 
91 | # We do use the parquet2bigquery container to load gcs parquet into bq though. 92 | bq_dataset = "telemetry_derived" 93 | bq_table_name = f"{dataset}_{dataset_version}" 94 | 95 | # This image was manually built from 96 | # https://github.com/mozilla/parquet2bigquery/commit/6bf1f86076de8939ba2c4d008080d6c159a0a093 97 | # using python:3.7.4-slim-buster 98 | docker_image = "gcr.io/moz-fx-data-airflow-prod-88e0/parquet2bigquery:20190722" 99 | 100 | gke_args = [ 101 | "--dataset", 102 | bq_dataset, 103 | "--concurrency", 104 | "10", 105 | "--bucket", 106 | gcs_data_bucket, 107 | "--no-resume", 108 | "--prefix", 109 | objects_prefix, 110 | "--cluster-by", 111 | "crash_date", 112 | ] 113 | 114 | # We remove the current date partition for idempotency. 115 | table_name = "{}:{}.{}${{{{ds_nodash}}}}".format( 116 | "{{ var.value.gcp_shared_prod_project }}", bq_dataset, bq_table_name 117 | ) 118 | 119 | remove_bq_table_partition = GKEPodOperator( 120 | task_id="remove_socorro_crash_bq_table_partition", 121 | gcp_conn_id=bq_gcp_conn_id, 122 | name="remove_socorro_crash_bq_table_partition", 123 | image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest", 124 | arguments=["bq", "rm", "-f", "--table", table_name], 125 | ) 126 | 127 | bq_load = GKEPodOperator( 128 | task_id="bigquery_load", 129 | gcp_conn_id=bq_gcp_conn_id, 130 | name="load-socorro-crash-parquet-to-bq", 131 | image=docker_image, 132 | arguments=gke_args, 133 | env_vars={"GOOGLE_CLOUD_PROJECT": "{{ var.value.gcp_shared_prod_project }}"}, 134 | ) 135 | 136 | with TaskGroup("socorro_external") as socorro_external: 137 | ExternalTaskMarker( 138 | task_id="crash_symbolication__wait_for_socorro_import", 139 | external_dag_id="crash_symbolication", 140 | external_task_id="wait_for_socorro_import", 141 | execution_date="{{ execution_date.replace(hour=5, minute=0).isoformat() }}", 142 | ) 143 | 144 | bq_load >> socorro_external 145 | 146 | crash_report_parquet >> remove_bq_table_partition >> bq_load 147 | -------------------------------------------------------------------------------- /dags/update_orphaning_dashboard_etl.py: -------------------------------------------------------------------------------- 1 | """ 2 | Powers https://telemetry.mozilla.org/update-orphaning/. 3 | 4 | See [jobs/update_orphaning_dashboard_etl.py](https://github.com/mozilla/telemetry-airflow/blob/main/jobs/update_orphaning_dashboard_etl.py). 
5 | """ 6 | 7 | from datetime import datetime, timedelta 8 | 9 | from airflow import DAG 10 | from airflow.operators.subdag import SubDagOperator 11 | 12 | from utils.constants import DS_WEEKLY 13 | from utils.dataproc import moz_dataproc_pyspark_runner 14 | from utils.tags import Tag 15 | 16 | """ 17 | 18 | The following WTMO connections are needed in order for this job to run: 19 | conn - google_cloud_airflow_dataproc 20 | conn - aws_dev_telemetry_public_analysis_2_rw 21 | """ 22 | 23 | default_args = { 24 | "owner": "akomar@mozilla.com", 25 | "depends_on_past": False, 26 | "start_date": datetime(2019, 10, 12), 27 | "email": [ 28 | "telemetry-alerts@mozilla.com", 29 | "ahabibi@mozilla.com", 30 | "rsteuber@mozilla.com", 31 | "akomar@mozilla.com", 32 | ], 33 | "email_on_failure": True, 34 | "email_on_retry": True, 35 | "retries": 2, 36 | "retry_delay": timedelta(minutes=10), 37 | } 38 | 39 | tags = [Tag.ImpactTier.tier_3] 40 | 41 | # run every Monday to maintain compatibility with legacy ATMO schedule 42 | dag = DAG( 43 | "update_orphaning_dashboard_etl", 44 | default_args=default_args, 45 | schedule_interval="0 2 * * MON", 46 | doc_md=__doc__, 47 | tags=tags, 48 | ) 49 | 50 | # Unsalted cluster name so subsequent runs fail if the cluster name exists 51 | cluster_name = "app-update-out-of-date-dataproc-cluster" 52 | 53 | # Defined in Airflow's UI -> Admin -> Connections 54 | gcp_conn_id = "google_cloud_airflow_dataproc" 55 | 56 | SubDagOperator( 57 | task_id="update_orphaning_dashboard_etl", 58 | dag=dag, 59 | subdag=moz_dataproc_pyspark_runner( 60 | parent_dag_name=dag.dag_id, 61 | dag_name="update_orphaning_dashboard_etl", 62 | default_args=default_args, 63 | cluster_name=cluster_name, 64 | job_name="update_orphaning_dashboard_etl", 65 | python_driver_code="gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/update_orphaning_dashboard_etl.py", 66 | init_actions_uris=[ 67 | "gs://dataproc-initialization-actions/python/pip-install.sh" 68 | ], 69 | additional_metadata={ 70 | "PIP_PACKAGES": "google-cloud-bigquery==1.20.0 google-cloud-storage==1.19.1 boto3==1.9.253" 71 | }, 72 | additional_properties={ 73 | "spark:spark.jars.packages": "org.apache.spark:spark-avro_2.11:2.4.3" 74 | }, 75 | py_args=[ 76 | "--run-date", 77 | DS_WEEKLY, 78 | "--gcs-bucket", 79 | "mozdata-analysis", 80 | "--gcs-prefix", 81 | "update-orphaning-airflow", 82 | "--gcs-output-bucket", 83 | "moz-fx-data-static-websit-8565-analysis-output", 84 | "--gcs-output-path", 85 | "app-update/data/out-of-date/", 86 | ], 87 | idle_delete_ttl=14400, 88 | num_workers=20, 89 | worker_machine_type="n1-standard-8", 90 | gcp_conn_id=gcp_conn_id, 91 | ), 92 | ) 93 | -------------------------------------------------------------------------------- /dags/webcompat_kb.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from airflow import DAG 4 | from airflow.providers.cncf.kubernetes.secret import Secret 5 | 6 | from operators.gcp_container_operator import GKEPodOperator 7 | from utils.tags import Tag 8 | 9 | DOCS = """ 10 | ### Bugzilla to BigQuery import 11 | 12 | #### Description 13 | 14 | Runs a Docker image that fetches bugzilla bugs from 15 | Web Compatibility > Knowledge Base component, as well as their core 16 | bugs dependencies and breakage reports and stores them in BQ. 
17 | 18 | The container is defined in 19 | [docker-etl](https://github.com/mozilla/docker-etl/tree/main/jobs/webcompat-kb) 20 | 21 | *Triage notes* 22 | 23 | As long as the most recent DAG run is successful this job doesn't need to be triaged. 24 | 25 | #### Owner 26 | 27 | kberezina@mozilla.com 28 | """ 29 | 30 | default_args = { 31 | "owner": "kberezina@mozilla.com", 32 | "email": ["kberezina@mozilla.com", "webcompat-internal@mozilla.org"], 33 | "depends_on_past": False, 34 | "start_date": datetime(2023, 9, 26), 35 | "email_on_failure": True, 36 | } 37 | 38 | 39 | tags = [ 40 | Tag.ImpactTier.tier_2, 41 | ] 42 | 43 | every_fifteen_minutes = "*/15 * * * *" 44 | 45 | bugzilla_token = Secret( 46 | deploy_type="env", 47 | deploy_target="BUGZILLA_API_KEY", 48 | secret="airflow-gke-secrets", 49 | key="webcompat_kb_secret__bugzilla_api_key", 50 | ) 51 | 52 | with DAG( 53 | "webcompat_kb", 54 | default_args=default_args, 55 | max_active_runs=1, 56 | doc_md=DOCS, 57 | schedule_interval=every_fifteen_minutes, 58 | tags=tags, 59 | catchup=False, 60 | ) as dag: 61 | webcompat_kb_import = GKEPodOperator( 62 | task_id="webcompat_kb", 63 | arguments=[ 64 | "python", 65 | "-m", 66 | "webcompat_kb.main", 67 | "--bq-project", 68 | "moz-fx-dev-dschubert-wckb", 69 | "--bq-kb-dataset", 70 | "webcompat_knowledge_base", 71 | "--bq-web-features-dataset", 72 | "web_features", 73 | "--bq-standards-positions-dataset", 74 | "standards_positions", 75 | ], 76 | image="gcr.io/moz-fx-data-airflow-prod-88e0/webcompat-kb_docker_etl:latest", 77 | dag=dag, 78 | secrets=[ 79 | bugzilla_token, 80 | ], 81 | ) 82 | -------------------------------------------------------------------------------- /dataproc_bootstrap/README.md: -------------------------------------------------------------------------------- 1 | Contents of this directory will be rsync'd to gs://moz-fx-data-prod-airflow-dataproc-artifacts/bootstrap by CI 2 | -------------------------------------------------------------------------------- /dataproc_bootstrap/airflow_gcp.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -exo pipefail 4 | 5 | # Error message 6 | error_msg () 7 | { 8 | echo 1>&2 "Error: $1" 9 | } 10 | 11 | # Parse arguments 12 | while [ $# -gt 0 ]; do 13 | case "$1" in 14 | --job-name) 15 | shift 16 | job_name=$1 17 | ;; 18 | --uri) 19 | shift 20 | uri=$1 21 | ;; 22 | --arguments) 23 | shift 24 | args=$1 25 | ;; 26 | --environment) 27 | shift 28 | environment=$1 29 | ;; 30 | -*) 31 | # do not exit out, just note failure 32 | error_msg "unrecognized option: $1" 33 | ;; 34 | *) 35 | break; 36 | ;; 37 | esac 38 | shift 39 | done 40 | 41 | if [ -z "$job_name" ] || [ -z "$uri" ]; then 42 | error_msg "missing argument(s)" 43 | exit 1 44 | fi 45 | 46 | wd=/mnt/analyses 47 | mkdir -p $wd && cd $wd 48 | mkdir -p output 49 | 50 | urldecode() { 51 | local url_encoded="${1//+/ }" 52 | printf '%b' "${url_encoded//%/\\x}" 53 | } 54 | 55 | # Download file 56 | if [[ $uri =~ ^gs.*$ ]]; then 57 | gsutil cp "$uri" . 58 | elif [[ $uri =~ ^https?.*$ ]]; then 59 | uri=$(urldecode $uri) 60 | wget -N "$uri" 61 | fi 62 | 63 | # Run job 64 | job="${uri##*/}" 65 | 66 | if [[ $uri == *.jar ]]; then 67 | time env $environment spark-submit --master yarn "./$job" $args 68 | elif [[ $uri == *.ipynb ]]; then 69 | echo "We are no longer supporting running ipynb's via GCP dataproc." 
70 | exit 1 71 | elif [[ $uri == *.py ]]; then 72 | time env $environment \ 73 | PYSPARK_DRIVER_PYTHON=/opt/conda/default/bin/python PYSPARK_DRIVER_PYTHON_OPTS="" spark-submit \ 74 | --master yarn "./$job" $args 75 | else 76 | chmod +x "./$job" 77 | time env $environment "./$job" $args 78 | fi 79 | -------------------------------------------------------------------------------- /dataproc_bootstrap/dataproc_init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -exo pipefail 4 | 5 | # Logs will be available on the dataproc nodes at /var/log/dataproc-initialization-script-X.log 6 | # or via the GCP Dataproc UI 7 | 8 | ARTIFACTS_BUCKET=gs://moz-fx-data-prod-airflow-dataproc-artifacts 9 | 10 | ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role) 11 | if [[ "${ROLE}" == 'Master' ]]; then 12 | # You can put any master-specific logic here 13 | echo "Running dataproc_init.sh on master..." 14 | fi 15 | 16 | gsutil cp $ARTIFACTS_BUCKET/jars/* /usr/lib/spark/jars/ 17 | 18 | # Install spark packages 19 | # See https://github.com/mozilla/telemetry-spark-packages-assembly 20 | TSPA_VERSION=v1.0.0 21 | TSPA_GS_PATH=$ARTIFACTS_BUCKET/mozilla/telemetry-spark-packages-assembly/$TSPA_VERSION/telemetry-spark-packages-assembly.jar 22 | TSPA_JAR=/usr/lib/spark/jars/telemetry-spark-packages-assembly.jar 23 | gsutil cp $TSPA_GS_PATH $TSPA_JAR 24 | 25 | # Install python packages 26 | PIP_REQUIREMENTS_FILE=/tmp/requirements.txt 27 | gsutil cp $ARTIFACTS_BUCKET/bootstrap/python-requirements.txt $PIP_REQUIREMENTS_FILE 28 | /opt/conda/default/bin/pip install --upgrade 'pip<20.3.0' 29 | /opt/conda/default/bin/pip install -r $PIP_REQUIREMENTS_FILE 30 | -------------------------------------------------------------------------------- /dataproc_bootstrap/fx_usage_init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -exo pipefail 4 | 5 | # Logs will be available on the dataproc nodes at /var/log/dataproc-initialization-script-X.log 6 | # or via the GCP Dataproc UI 7 | 8 | /opt/conda/default/bin/pip install --upgrade pip 9 | 10 | /opt/conda/default/bin/pip install arrow==0.10.0 11 | /opt/conda/default/bin/pip install boto3==1.9.199 12 | /opt/conda/default/bin/pip install click==6.7 13 | /opt/conda/default/bin/pip install click_datetime==0.2 14 | /opt/conda/default/bin/pip install --ignore-installed flake8==3.7.8 15 | /opt/conda/default/bin/pip install pyspark==2.2.2 16 | /opt/conda/default/bin/pip install pytest==4.6.4 17 | /opt/conda/default/bin/pip install scipy==1.0.0rc1 18 | 19 | /opt/conda/default/bin/pip install py4j --upgrade 20 | /opt/conda/default/bin/pip install numpy==1.16.4 21 | /opt/conda/default/bin/pip install python-dateutil==2.5.0 22 | /opt/conda/default/bin/pip install pytz==2011k 23 | /opt/conda/default/bin/pip install --no-dependencies pandas==0.24 24 | 25 | # This fixes the PythonAccumulatorV2 does not exist error 26 | export PYTHONPATH=/usr/lib/spark/python/lib/pyspark.zip 27 | -------------------------------------------------------------------------------- /dataproc_bootstrap/python-requirements.txt: -------------------------------------------------------------------------------- 1 | arrow==0.10.0 2 | boto 3 | boto3 4 | botocore 5 | click==6.7 6 | click_datetime==0.2 7 | numpy==1.13.3 8 | pandas==0.23.4 9 | pyspark==2.3.2 10 | requests-toolbelt==0.8.0 11 | requests==2.20.1 12 | scipy==1.0.0 13 | typing==3.6.4 14 | six==1.11.0 15 | protobuf==3.6.1 16 | 
py4j==0.10.7 17 | ujson 18 | -------------------------------------------------------------------------------- /jobs/addon_recommender.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ -z "$privateBucket" || -z "$publicBucket" || -z "$date" ]]; then 4 | echo "Missing arguments!" 1>&2 5 | exit 1 6 | fi 7 | 8 | git clone https://github.com/mozilla/telemetry-batch-view.git 9 | cd telemetry-batch-view 10 | sbt assembly 11 | mkdir ml_output 12 | spark-submit --master yarn \ 13 | --deploy-mode client \ 14 | --class com.mozilla.telemetry.ml.AddonRecommender \ 15 | target/scala-2.11/telemetry-batch-view-1.1.jar \ 16 | train \ 17 | --privateBucket $privateBucket \ 18 | --publicBucket $publicBucket \ 19 | --runDate $date 20 | -------------------------------------------------------------------------------- /jobs/bugzilla_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | conda install psycopg2 --yes 4 | git clone https://github.com/maurodoglio/bz2db.git 5 | pip install -r bz2db/requirements.txt 6 | cd bz2db && python bz2db/update_bugs.py 7 | -------------------------------------------------------------------------------- /jobs/moz_dataproc_runner.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import sys 3 | 4 | 5 | """Generic runner for PySpark jobs 6 | 7 | This script runs a `cli.entry_point()` from an arbitrary Python module or CLI application. 8 | Job module name should be provided as a first command line argument. Module argument will be cleared 9 | before executing the `entry_point()`, allowing for the underlying job to be decoupled from this script. 10 | 11 | If running on Dataproc, this requires the job to be installed on the cluster 12 | (e.g. via `pip_install` initialization action). 13 | """ 14 | # Retrieve target module name 15 | module_to_run = sys.argv[1] 16 | # Clear retrieved argument in the list of arguments passed to this script 17 | # This allows the target job to properly interpret its command line arguments 18 | del sys.argv[1] 19 | 20 | # Import the target module and execute its entry point 21 | cli = importlib.import_module(f"{module_to_run}.cli") 22 | cli.entry_point() 23 | -------------------------------------------------------------------------------- /jobs/mozaggregator_runner.py: -------------------------------------------------------------------------------- 1 | from mozaggregator import cli 2 | 3 | cli.entry_point() 4 | -------------------------------------------------------------------------------- /jobs/pip-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -exo pipefail 4 | 5 | readonly PACKAGES=$(/usr/share/google/get_metadata_value attributes/PIP_PACKAGES || true) 6 | 7 | function install_pip() { 8 | if command -v pip >/dev/null; then 9 | echo "pip is already installed." 10 | return 0 11 | fi 12 | 13 | if command -v easy_install >/dev/null; then 14 | echo "Installing pip with easy_install..." 15 | easy_install pip 16 | return 0 17 | fi 18 | 19 | echo "Installing python-pip..." 
20 | apt update 21 | apt install python-pip -y 22 | } 23 | 24 | function main() { 25 | if [[ -z "${PACKAGES}" ]]; then 26 | echo "ERROR: Must specify PIP_PACKAGES metadata key" 27 | exit 1 28 | fi 29 | 30 | install_pip 31 | pip install --upgrade ${PACKAGES} 32 | } 33 | 34 | main 35 | -------------------------------------------------------------------------------- /jobs/telemetry_batch_view.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import requests 4 | from os import chdir 5 | from os import environ 6 | from subprocess import call, PIPE, Popen 7 | from urlparse import urlparse 8 | import zipfile 9 | import boto3 10 | 11 | artifact_file = "artifact.jar" 12 | 13 | 14 | def call_exit_errors(command): 15 | print("+ {}".format(" ".join(command))) 16 | rc = call(command, env=environ.copy()) 17 | if rc > 0: 18 | exit(rc) 19 | 20 | 21 | def retrieve_jar(): 22 | jar_url = environ.get("ARTIFACT_URL") 23 | 24 | if jar_url is None: 25 | exit(1) 26 | 27 | 28 | print("Retrieving JAR: {}".format(jar_url)) 29 | 30 | # Check to see if this is an alias for a full jar path 31 | # If it's an alias, it should be accompanied by a .txt 32 | # file whose contents point to the aliased location. 33 | # 34 | # The associated .txt files have two lines [0]: 35 | # 1. The query string to get to the aliased jar 36 | # 2. The associated build URL for that jar 37 | # 38 | # Historical version only had the query string [1], 39 | # so we need to handle that case separately. 40 | # 41 | # [0] https://github.com/mozilla/telemetry-batch-view/blob/main/.circleci/deploy.sh#L37 42 | # [1] https://github.com/mozilla/telemetry-batch-view/blob/14741db20dd3873b94944b8238dfc48a003c744d/deploy.sh#L50 43 | 44 | txt_url = jar_url.replace(".jar", ".txt") 45 | response = requests.get(txt_url) 46 | 47 | if response.status_code != 404: 48 | uri_query, _, build_url = response.content.partition("\n") 49 | if not build_url: 50 | # Handle historical version 51 | build_url = "Build URL not available" 52 | 53 | parsed_uri = urlparse(jar_url) 54 | bucket, _, _ = parsed_uri.path.lstrip("/").partition("/") 55 | full_url = "{uri.scheme}://{uri.netloc}/{bucket}/{uri_query}".format(uri=parsed_uri, bucket=bucket, uri_query=uri_query) 56 | 57 | print(" Alias: {}".format(full_url)) 58 | print(" Build URL: {}".format(build_url.strip())) 59 | 60 | response = requests.get(jar_url) 61 | with open(artifact_file, 'wb') as f: 62 | f.write(response.content) 63 | 64 | 65 | def submit_job(): 66 | opts = [ 67 | ["--{}".format(key[4:].replace("_", "-")), value] 68 | for key, value in environ.items() 69 | if key.startswith("TBV_") and key != "TBV_CLASS" 70 | ] 71 | 72 | command = [ 73 | "spark-submit", 74 | "--master", "yarn", 75 | "--deploy-mode", "client", 76 | "--class", environ["TBV_CLASS"], 77 | artifact_file, 78 | ] + [v for opt in opts for v in opt if v] 79 | 80 | call_exit_errors(command) 81 | 82 | 83 | if environ.get("DO_RETRIEVE", "True") == "True": 84 | retrieve_jar() 85 | 86 | if environ.get("DO_SUBMIT", "True") == "True": 87 | submit_job() 88 | -------------------------------------------------------------------------------- /jobs/txp_pulse.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # We use jupyter by default, but here we want to use python 4 | unset PYSPARK_DRIVER_PYTHON 5 | 6 | # Clone, install, and run 7 | git clone https://github.com/mozilla/python_etl.git 8 | cd python_etl 9 | pip install . 
10 | python setup.py bdist_egg 11 | spark-submit scheduling/pulse.py 12 | -------------------------------------------------------------------------------- /operators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-airflow/1d80ddd23c83f50787e46fa43e0929d9a13185de/operators/__init__.py -------------------------------------------------------------------------------- /plugins/mozmenu.py: -------------------------------------------------------------------------------- 1 | """ 2 | Plugin that adds a "Mozilla" entry to the top bar with some useful links. 3 | 4 | Based on an example at 5 | https://github.com/airflow-plugins/Getting-Started/blob/master/Tutorial/creating-ui-modification.md 6 | """ 7 | from airflow.plugins_manager import AirflowPlugin 8 | 9 | telemetry_airflow = { 10 | "name": "telemetry-airflow on GitHub", 11 | "category": "Mozilla", 12 | "href": "https://github.com/mozilla/telemetry-airflow", 13 | } 14 | 15 | wtmo_dev = { 16 | "name": "WTMO Developer Guide", 17 | "category": "Mozilla", 18 | "href": "https://mozilla-hub.atlassian.net/wiki/spaces/SRE/pages/27922811/WTMO+Developer+Guide", 19 | } 20 | 21 | airflow_triage_guide = { 22 | "name": "Airflow Triage Guide", 23 | "category": "Mozilla", 24 | "href": "https://mozilla-hub.atlassian.net/wiki/spaces/DATA/pages/175603730/Airflow+Triage+Guide", 25 | } 26 | 27 | gke_cluster = { 28 | "name": "GKE cluster", 29 | "category": "Mozilla", 30 | "href": "https://console.cloud.google.com/kubernetes/workload/overview?project=moz-fx-data-airflow-gke-prod", 31 | } 32 | 33 | 34 | # ruff: noqa: RUF012 35 | class MozMenuPlugin(AirflowPlugin): 36 | name = "Mozilla" 37 | operators = [] 38 | flask_blueprints = [] 39 | hooks = [] 40 | executors = [] 41 | appbuilder_views = [] 42 | appbuilder_menu_items = [ 43 | telemetry_airflow, 44 | wtmo_dev, 45 | airflow_triage_guide, 46 | gke_cluster, 47 | ] 48 | -------------------------------------------------------------------------------- /plugins/timetable.py: -------------------------------------------------------------------------------- 1 | """Plugin for alternative timetables that cannot be trivially defined via cron expressions.""" 2 | 3 | from datetime import timedelta 4 | from typing import Any 5 | 6 | from airflow.plugins_manager import AirflowPlugin 7 | from airflow.timetables.base import DagRunInfo, DataInterval, TimeRestriction, Timetable 8 | from pendulum import UTC, DateTime, Time 9 | 10 | 11 | class MultiWeekTimetable(Timetable): 12 | def __init__(self, *, num_weeks: int, time: Time = Time.min): 13 | self.num_weeks = num_weeks 14 | self.interval_delta = timedelta(days=7 * num_weeks) 15 | # only enforced for automated data intervals 16 | self.time = time 17 | 18 | def infer_manual_data_interval(self, run_after: DateTime) -> DataInterval: 19 | return DataInterval(start=run_after - self.interval_delta, end=run_after) 20 | 21 | def next_dagrun_info( 22 | self, 23 | *, 24 | last_automated_data_interval: DataInterval | None, 25 | restriction: TimeRestriction, 26 | ) -> DagRunInfo | None: 27 | if restriction.earliest is None: # No start_date specified. Don't schedule. 28 | return None 29 | 30 | # Find the first run on the regular schedule. 31 | next_end = ( 32 | DateTime.combine(restriction.earliest, self.time).replace(tzinfo=UTC) 33 | + self.interval_delta 34 | ) 35 | 36 | max_end = next_end 37 | if last_automated_data_interval is not None: 38 | # There was a previous run on the regular schedule. 
39 | # Return the next interval after last_automated_data_interval.end that is 40 | # aligned with restriction.earliest and self.time 41 | max_end = last_automated_data_interval.end + self.interval_delta 42 | elif not restriction.catchup: 43 | # This is the first ever run on the regular schedule, and catchup is not 44 | # enabled. Return the last complete interval before now. 45 | max_end = DateTime.utcnow() 46 | if next_end < max_end: 47 | # Return the last complete interval on or before max_end. Use integer 48 | # division on the number of whole days rather than deal with any corner 49 | # cases related to leap seconds and partial days. 50 | skip_intervals = (max_end - next_end).days // self.interval_delta.days 51 | next_end = next_end + (self.interval_delta * skip_intervals) 52 | 53 | if restriction.latest is not None and next_end > restriction.latest: 54 | return None # Over the DAG's scheduled end; don't schedule. 55 | return DagRunInfo.interval(start=next_end - self.interval_delta, end=next_end) 56 | 57 | def serialize(self) -> dict[str, Any]: 58 | return {"num_weeks": self.num_weeks, "time": self.time.isoformat()} 59 | 60 | @classmethod 61 | def deserialize(cls, value: dict[str, Any]) -> Timetable: 62 | return cls(num_weeks=value["num_weeks"], time=Time.fromisoformat(value["time"])) 63 | 64 | 65 | class MozillaTimetablePlugin(AirflowPlugin): 66 | name = "mozilla_timetable_plugin" 67 | timetables = (MultiWeekTimetable,) 68 | -------------------------------------------------------------------------------- /plugins/version_endpoint.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from pathlib import Path 4 | 5 | from airflow.plugins_manager import AirflowPlugin 6 | from flask import Blueprint, jsonify 7 | 8 | version_endpoint_bp = Blueprint("version_endpoint", __name__) 9 | 10 | # from https://semver.org/#is-there-a-suggested-regular-expression-regex-to-check-a-semver-string 11 | SEM_VER_REGEX = ( 12 | r"(?P<major>0|[1-9]\d*)\.(?P<minor>0|[1-9]\d*)\." 13 | r"(?P<patch>0|[1-9]\d*)(?:-(?P<prerelease>" 14 | r"(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)" 15 | r"(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?" 16 | r"(?:\+(?P<buildmetadata>[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$" 17 | ) 18 | 19 | 20 | def get_project_root() -> Path: 21 | """Reliably give the project root as a Path object.""" 22 | return Path(__file__).parent.parent 23 | 24 | 25 | def parse_airflow_version(dockerfile_content: str) -> str: 26 | version_pattern = rf"^FROM apache\/airflow:((slim-)?{SEM_VER_REGEX})$" 27 | version_regex = re.compile(pattern=version_pattern, flags=re.MULTILINE | re.DOTALL) 28 | return version_regex.search(dockerfile_content).group(1) 29 | 30 | 31 | def get_airflow_version() -> dict[str, str | None]: 32 | """Parse Airflow version from Dockerfile and return it as a dict.""" 33 | project_root = get_project_root() 34 | dockerfile = project_root / "Dockerfile" 35 | if dockerfile.is_file() and dockerfile.exists(): 36 | with open(dockerfile) as file: 37 | content = file.read() 38 | version = parse_airflow_version(dockerfile_content=content) 39 | else: 40 | version = None 41 | return {"version": version} 42 | 43 | 44 | def get_dockerflow_version() -> dict[str, str | None]: 45 | """ 46 | Parse Dockerflow style version.json file and return it as a dict. 47 | 48 | version.json is baked in the Docker image at build time in CI.
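When the file is absent (for example when running locally outside the CI-built image), the build, commit and source keys fall back to None.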
49 | 50 | """ 51 | project_root = get_project_root() 52 | version_file = project_root / "version.json" 53 | if version_file.is_file() and version_file.exists(): 54 | with open(project_root / "version.json") as file: 55 | version = json.load(file) 56 | else: 57 | version = {"build": None, "commit": None, "source": None} 58 | return version 59 | 60 | 61 | @version_endpoint_bp.route("/__version__", methods=["GET"]) 62 | def version_endpoint(): 63 | airflow_version = get_airflow_version() 64 | dockerflow_version = get_dockerflow_version() 65 | return jsonify(dockerflow_version | airflow_version), 200 66 | 67 | 68 | class CustomPlugin(AirflowPlugin): 69 | name = "version_endpoint" 70 | flask_blueprints = (version_endpoint_bp,) 71 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff.lint.isort] 2 | known-third-party = ["airflow"] 3 | 4 | [tool.ruff] 5 | target-version = "py310" 6 | # Exclude questionably linted code (aka legacy) under the `jobs` directory 7 | exclude = ["./jobs"] 8 | 9 | [tool.ruff.lint] 10 | select = [ 11 | "E", # pycodestyle 12 | "W", # pycodestyle 13 | "F", # Pyflakes 14 | "B", # flake8-bugbear 15 | "C4", # flake8-comprehensions 16 | "D", # flake8-docstrings 17 | "I", # isort 18 | "SIM", # flake8-simplify 19 | "TCH", # flake8-type-checking 20 | "TID", # flake8-tidy-imports 21 | "Q", # flake8-quotes 22 | "UP", # pyupgrade 23 | "PT", # flake8-pytest-style 24 | "RUF", # Ruff-specific rules 25 | ] 26 | ignore = [ 27 | "E501", # line too long, handled by black 28 | # Docstring linting 29 | "D100", # Missing docstring in public module 30 | "D101", # Missing docstring in public class 31 | "D102", # Missing docstring in public method 32 | "D103", # Missing docstring in public function 33 | "D104", # Missing docstring in public package 34 | "D105", # Missing docstring in magic method 35 | "D107", # Missing docstring in __init__ 36 | "D202", # No blank lines allowed after function docstring -> clashes with Black 37 | "D203", # 1 blank line required before class docstring 38 | "D212", # Multi-line docstring summary should start at the first line 39 | "D415", # First line should end with a period, question mark, or exclamation point 40 | "D416", #Section name should end with a colon ("{name}") 41 | # flake8-pytest-style: 42 | "PT011", # pytest.raises({exception}) is too broad, set the match parameter or use a more specific exception 43 | # To enable when we migrate to Python 3.10 44 | "B905", # `zip()` without an explicit `strict=` parameter 45 | ] 46 | -------------------------------------------------------------------------------- /requirements-dev.in: -------------------------------------------------------------------------------- 1 | --constraint ./constraints.txt 2 | --constraint ./requirements.txt 3 | 4 | 5 | # Package management 6 | pip-tools==7.4.1 7 | 8 | # Code quality 9 | pytest==8.3.4 10 | pytest-mock==3.14.0 11 | ruff==0.5.5 12 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.11 3 | # by the following command: 4 | # 5 | # pip-compile --no-annotate --strip-extras requirements-dev.in 6 | # 7 | build==1.1.1 8 | click==8.1.8 9 | iniconfig==2.0.0 10 | packaging==24.2 11 | pip-tools==7.4.1 12 | pluggy==1.5.0 13 | pyproject-hooks==1.0.0 14 
| pytest==8.3.4 15 | pytest-mock==3.14.0 16 | ruff==0.5.5 17 | wheel==0.43.0 18 | 19 | # The following packages are considered to be unsafe in a requirements file: 20 | # pip 21 | # setuptools 22 | -------------------------------------------------------------------------------- /requirements-override.txt: -------------------------------------------------------------------------------- 1 | # There's a bug in apache-airflow-providers-google 12.0.0 where Dataproc operators fail to import 2 | # without OpenLineage installed, which was fixed in 14.0.0 (https://github.com/apache/airflow/pull/46561). 3 | apache-airflow-providers-google==14.0.0 4 | -------------------------------------------------------------------------------- /requirements.in: -------------------------------------------------------------------------------- 1 | # Official Airflow constraints file 2 | # Doc: https://airflow.apache.org/docs/apache-airflow/stable/installation/installing-from-pypi.html#constraints-files 3 | # File: https://raw.githubusercontent.com/apache/airflow/constraints-2.10.5/constraints-3.11.txt 4 | --constraint ./constraints.txt 5 | 6 | # Airflow dependencies 7 | apache-airflow[async,google-auth,password,statsd]==2.10.5 8 | apache-airflow-providers-amazon 9 | apache-airflow-providers-celery 10 | apache-airflow-providers-cncf-kubernetes 11 | apache-airflow-providers-google 12 | apache-airflow-providers-http 13 | apache-airflow-providers-postgres 14 | apache-airflow-providers-redis 15 | apache-airflow-providers-slack 16 | airflow-provider-fivetran-async==2.0.2 17 | 18 | # Acryl DataHub integration 19 | acryl-datahub-airflow-plugin==1.0.0.3 20 | gql 21 | 22 | # dbt integration 23 | apache-airflow-providers-dbt-cloud 24 | 25 | # Required for /app/dags/empeam_workday_xmatters_integration.py 26 | apache-airflow-providers-atlassian-jira 27 | -------------------------------------------------------------------------------- /resources/dev_variables.json: -------------------------------------------------------------------------------- 1 | { 2 | "Dev_glam_project": "Dev_glam_project", 3 | "Prod_glam_project": "Prod_glam_project", 4 | "app_store_connect_password": "password", 5 | "app_store_connect_username": "username", 6 | "bugzilla_probe_expiry_bot_api_key": "bugzilla-api-key", 7 | "dataops_looker_github_secret_access_token": "dataops_looker_github_secret_access_token", 8 | "glean_dictionary_netlify_build_webhook_id": "status/200", 9 | "jetstream_cluster_cert": "cert", 10 | "jetstream_cluster_ip": "127.0.0.1", 11 | "lookml_generator_release_str": "v0.0.0", 12 | "slack_secret_token": "slack_secret_token", 13 | "surveygizmo_api_secret": "tapsekret", 14 | "surveygizmo_api_token": "tokentokentoken", 15 | "surveygizmo_daily_attitudes_survey_id": 12345, 16 | "dbt_account_id": "dbt_account_id", 17 | "looker_api_client_id_prod": "looker_api_client_id_prod", 18 | "looker_api_client_secret_prod": "looker_api_client_secret_prod" 19 | } 20 | -------------------------------------------------------------------------------- /resources/dev_webserver_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from flask_appbuilder.security.manager import AUTH_DB 4 | 5 | basedir = os.path.abspath(os.path.dirname(__file__)) 6 | 7 | WTF_CSRF_ENABLED = True 8 | AUTH_TYPE = AUTH_DB 9 | AUTH_ROLE_PUBLIC = "Admin" 10 | -------------------------------------------------------------------------------- /tests/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-airflow/1d80ddd23c83f50787e46fa43e0929d9a13185de/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pathlib 4 | import sys 5 | import warnings 6 | 7 | import pytest 8 | from airflow.models import DagBag 9 | 10 | # get absolute project directory path no matter the environment 11 | PROJECT_DIR = pathlib.Path(__file__).resolve().parent.parent 12 | 13 | 14 | @pytest.fixture(scope="session") 15 | def get_dag_bag(session_mocker) -> DagBag: 16 | from airflow.operators.subdag import SubDagOperator 17 | 18 | # Mock _validate_pool, so we don't need an actual provisioned database 19 | session_mocker.patch.object( 20 | SubDagOperator, 21 | "_validate_pool", 22 | return_value=None, 23 | ) 24 | 25 | # load dev connections and variables 26 | env_load_variables_from_json(PROJECT_DIR / "resources" / "dev_variables.json") 27 | env_load_connections_from_json(PROJECT_DIR / "resources" / "dev_connections.json") 28 | 29 | # Replicate Airflow adding dags, plugins folders in system path at runtime 30 | sys.path.insert(0, str(PROJECT_DIR)) 31 | sys.path.insert(1, str(PROJECT_DIR / "dags")) 32 | sys.path.insert(2, str(PROJECT_DIR / "plugins")) 33 | 34 | # Suppress warnings from loading DAGs 35 | with warnings.catch_warnings(): 36 | warnings.simplefilter("ignore") 37 | dagbag = DagBag(dag_folder=PROJECT_DIR / "dags", include_examples=False) 38 | 39 | return dagbag 40 | 41 | 42 | def env_load_variables_from_json(path: pathlib.Path) -> None: 43 | """ 44 | Load Airflow Variables as environment variables from a JSON file. 45 | 46 | JSON file should be generated by running `airflow variables export <file>.json`. 47 | Variable values must be `str` or `int`. 48 | 49 | See this link for more information on Airflow Variables as environment variables 50 | https://airflow.apache.org/docs/apache-airflow/stable/howto/variable.html 51 | """ 52 | with open(path) as file: 53 | variables: dict[str, str | int] = json.load(file) 54 | 55 | for name, value in variables.items(): 56 | formatted_variable_name = f"AIRFLOW_VAR_{name.upper()}" 57 | os.environ[formatted_variable_name] = str(value) 58 | 59 | 60 | def env_load_connections_from_json(path: pathlib.Path) -> None: 61 | """ 62 | Load Airflow Connections as environment variables from a JSON file. 63 | 64 | JSON file should be generated by running `airflow connections export <file>.json`. 65 | Uses a Connection object to ensure correct Connection parsing.
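Each connection is exported as an `AIRFLOW_CONN_<CONN_ID>` environment variable holding the connection URI.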
66 | 67 | See this link for more information on Airflow Connections as environment variables 68 | https://airflow.apache.org/docs/apache-airflow/stable/howto/connection.html 69 | """ 70 | from airflow.models import Connection 71 | 72 | with open(path) as file: 73 | connections: dict[str, dict] = json.load(file) 74 | 75 | for name, params in connections.items(): 76 | conn_instance = Connection.from_json(value=json.dumps(params), conn_id=name) 77 | formatted_connection_name = f"AIRFLOW_CONN_{name.upper()}" 78 | os.environ[formatted_connection_name] = conn_instance.get_uri() 79 | -------------------------------------------------------------------------------- /tests/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-airflow/1d80ddd23c83f50787e46fa43e0929d9a13185de/tests/dags/__init__.py -------------------------------------------------------------------------------- /tests/dags/test_dag_validity.py: -------------------------------------------------------------------------------- 1 | def test_dag_validity(get_dag_bag): 2 | """ 3 | Test all DAGs can be parsed. 4 | 5 | This test should be equivalent to the integration test using the airflow CLI. 6 | At the moment, there is a discrepancy between this unit test and the integration 7 | test. Once equivalent, this unit test should replace the integration test. 8 | 9 | """ 10 | dagbag = get_dag_bag 11 | 12 | data = [] 13 | for filename, errors in dagbag.import_errors.items(): 14 | data.append({"filepath": filename, "error": errors}) 15 | if data: 16 | print(data) 17 | raise AssertionError 18 | 19 | 20 | def test_dag_tags(get_dag_bag): 21 | """Check tags in all DAGs are valid.""" 22 | 23 | valid_tags = { 24 | "impact/tier_1", 25 | "impact/tier_2", 26 | "impact/tier_3", 27 | "repo/bigquery-etl", 28 | "repo/telemetry-airflow", 29 | "repo/private-bigquery-etl", 30 | "triage/confidential", 31 | "triage/no_triage", 32 | "triage/record_only", 33 | } 34 | dagbag = get_dag_bag 35 | 36 | for dag_name, dag in dagbag.dags.items(): 37 | for tag in dag.tags: 38 | assert tag in valid_tags, f"DAG: {dag_name}: Invalid tag `{tag}`" 39 | 40 | 41 | def test_dag_tags_required(get_dag_bag): 42 | """Check at least one tag per DAG is of the required type.""" 43 | 44 | required_tag_type = "impact" 45 | dagbag = get_dag_bag 46 | 47 | for dag_name, dag in dagbag.dags.items(): 48 | # don't check tags on subdags 49 | if dag.is_subdag: 50 | continue 51 | 52 | assert [ 53 | tag for tag in dag.tags if required_tag_type in tag 54 | ], f"DAG: {dag_name}: Missing required tag type `{required_tag_type}`" 55 | -------------------------------------------------------------------------------- /tests/plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-airflow/1d80ddd23c83f50787e46fa43e0929d9a13185de/tests/plugins/__init__.py -------------------------------------------------------------------------------- /tests/plugins/test_timetable.py: -------------------------------------------------------------------------------- 1 | from unittest import mock 2 | 3 | from airflow.timetables.base import DagRunInfo, DataInterval, TimeRestriction 4 | from pendulum import UTC, DateTime, Time 5 | 6 | from plugins.timetable import MultiWeekTimetable 7 | 8 | 9 | def test_manual_interval(): 10 | tt = MultiWeekTimetable(num_weeks=4) 11 | actual = tt.infer_manual_data_interval(run_after=DateTime(2023, 1, 29)) 12 |
expected = DataInterval(start=DateTime(2023, 1, 1), end=DateTime(2023, 1, 29)) 13 | assert actual == expected 14 | 15 | 16 | def test_first_automated_interval(): 17 | tt = MultiWeekTimetable(num_weeks=4, time=Time(hour=4)) 18 | actual = tt.next_dagrun_info( 19 | last_automated_data_interval=None, 20 | restriction=TimeRestriction( 21 | earliest=DateTime(2023, 1, 1), latest=None, catchup=True 22 | ), 23 | ) 24 | expected = DagRunInfo.interval( 25 | start=DateTime(2023, 1, 1, 4, tzinfo=UTC), 26 | end=DateTime(2023, 1, 29, 4, tzinfo=UTC), 27 | ) 28 | assert actual == expected 29 | 30 | 31 | def test_first_automated_interval_no_catchup(): 32 | tt = MultiWeekTimetable(num_weeks=4) 33 | with mock.patch.object( 34 | DateTime, "utcnow", return_value=DateTime(2023, 2, 28, tzinfo=UTC) 35 | ): 36 | actual = tt.next_dagrun_info( 37 | last_automated_data_interval=None, 38 | restriction=TimeRestriction( 39 | earliest=DateTime(2023, 1, 1), latest=None, catchup=False 40 | ), 41 | ) 42 | expected = DagRunInfo.interval( 43 | start=DateTime(2023, 1, 29, tzinfo=UTC), end=DateTime(2023, 2, 26, tzinfo=UTC) 44 | ) 45 | assert actual == expected 46 | 47 | 48 | def test_next_automated_interval(): 49 | tt = MultiWeekTimetable(num_weeks=4) 50 | actual = tt.next_dagrun_info( 51 | last_automated_data_interval=DataInterval( 52 | start=DateTime(2023, 1, 29, tzinfo=UTC), 53 | end=DateTime(2023, 2, 26, tzinfo=UTC), 54 | ), 55 | restriction=TimeRestriction( 56 | earliest=DateTime(2023, 1, 1), 57 | latest=DateTime(2023, 3, 26, tzinfo=UTC), 58 | catchup=False, 59 | ), 60 | ) 61 | expected = DagRunInfo.interval( 62 | start=DateTime(2023, 2, 26, tzinfo=UTC), end=DateTime(2023, 3, 26, tzinfo=UTC) 63 | ) 64 | assert actual == expected 65 | 66 | 67 | def test_last_automated_interval(): 68 | tt = MultiWeekTimetable(num_weeks=4) 69 | actual = tt.next_dagrun_info( 70 | last_automated_data_interval=DataInterval( 71 | start=DateTime(2023, 1, 29, tzinfo=UTC), 72 | end=DateTime(2023, 2, 26, tzinfo=UTC), 73 | ), 74 | restriction=TimeRestriction( 75 | earliest=DateTime(2023, 1, 1), 76 | latest=DateTime(2023, 2, 26, tzinfo=UTC), 77 | catchup=False, 78 | ), 79 | ) 80 | assert actual is None 81 | -------------------------------------------------------------------------------- /tests/plugins/test_version_endpoint.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | from unittest.mock import mock_open, patch 4 | 5 | import pytest 6 | 7 | from plugins.version_endpoint import ( 8 | get_airflow_version, 9 | get_dockerflow_version, 10 | get_project_root, 11 | parse_airflow_version, 12 | ) 13 | 14 | 15 | def test_get_project_root(): 16 | # CircleCI renames the project directory to `project` 17 | assert get_project_root().name in ("telemetry-airflow", "project") 18 | assert get_project_root().is_dir() 19 | 20 | 21 | @pytest.mark.parametrize( 22 | ("test_input", "expected"), 23 | [ 24 | ( 25 | ( 26 | "# example comment on first line\n" 27 | "FROM apache/airflow:slim-2.8.2-python3.11\n" 28 | "# Rest of Dockerfile" 29 | ), 30 | "slim-2.8.2-python3.11", 31 | ), 32 | ("FROM apache/airflow:2.9.1", "2.9.1"), 33 | ("FROM apache/airflow:slim-2.7.3", "slim-2.7.3"), 34 | ], 35 | ) 36 | def test_parse_airflow_version(test_input, expected): 37 | assert parse_airflow_version(test_input) == expected 38 | 39 | 40 | def test_get_airflow_version_exists(): 41 | mock_project_root = patch( 42 | "plugins.version_endpoint.get_project_root", return_value=Path("/mock/path") 43 | ) 44 | 
mock_parse_airflow_version = patch( 45 | "plugins.version_endpoint.parse_airflow_version", return_value="2.8.2" 46 | ) 47 | mock_open_file = patch("builtins.open", mock_open(read_data="Mock Data!")) 48 | mock_is_file = patch("pathlib.Path.is_file", return_value=True) 49 | mock_exists = patch("pathlib.Path.exists", return_value=True) 50 | 51 | with ( 52 | mock_project_root, 53 | mock_parse_airflow_version, 54 | mock_open_file, 55 | mock_is_file, 56 | mock_exists, 57 | ): 58 | result = get_airflow_version() 59 | assert result == {"version": "2.8.2"} 60 | 61 | 62 | def test_get_airflow_version_not_exists(): 63 | mock_project_root = patch( 64 | "plugins.version_endpoint.get_project_root", return_value=Path("/mock/path") 65 | ) 66 | 67 | with mock_project_root: 68 | result = get_airflow_version() 69 | assert result == {"version": None} 70 | 71 | 72 | def test_get_dockerflow_version_exists(): 73 | mock_project_root = patch( 74 | "plugins.version_endpoint.get_project_root", return_value=Path("/mock/path") 75 | ) 76 | mock_open_file = patch( 77 | "builtins.open", 78 | mock_open( 79 | read_data=json.dumps( 80 | { 81 | "build": "12345", 82 | "commit": "abcdef", 83 | "source": "https://github.com/mozilla/telemetry-airflow", 84 | } 85 | ) 86 | ), 87 | ) 88 | mock_is_file = patch("pathlib.Path.is_file", return_value=True) 89 | mock_exists = patch("pathlib.Path.exists", return_value=True) 90 | 91 | with mock_project_root, mock_open_file, mock_is_file, mock_exists: 92 | result = get_dockerflow_version() 93 | assert result == { 94 | "build": "12345", 95 | "commit": "abcdef", 96 | "source": "https://github.com/mozilla/telemetry-airflow", 97 | } 98 | 99 | 100 | def test_get_dockerflow_version_not_exists(): 101 | mock_project_root = patch( 102 | "plugins.version_endpoint.get_project_root", return_value=Path("/mock/path") 103 | ) 104 | 105 | with mock_project_root: 106 | result = get_dockerflow_version() 107 | assert result == {"build": None, "commit": None, "source": None} 108 | -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-airflow/1d80ddd23c83f50787e46fa43e0929d9a13185de/tests/utils/__init__.py -------------------------------------------------------------------------------- /tests/utils/test_backfill.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from utils.backfill import BackfillParams 4 | 5 | 6 | @pytest.fixture() 7 | def base_params() -> dict: 8 | return { 9 | "clear": False, 10 | "dry_run": True, 11 | "dag_name": "dag_name", 12 | "end_date": "2022-11-10", 13 | "start_date": "2022-10-31", 14 | "task_regex": None, 15 | } 16 | 17 | 18 | @pytest.fixture() 19 | def base_backfill_params(base_params: dict) -> BackfillParams: 20 | return BackfillParams(**base_params) 21 | 22 | 23 | def test_date_validation(base_backfill_params) -> None: 24 | # valid date range 25 | base_backfill_params.validate_date_range() 26 | 27 | # invalid date range 28 | base_backfill_params.start_date, base_backfill_params.end_date = ( 29 | base_backfill_params.end_date, 30 | base_backfill_params.start_date, 31 | ) 32 | with pytest.raises(ValueError): 33 | base_backfill_params.validate_date_range() 34 | 35 | 36 | def test_validate_regex_pattern(base_backfill_params) -> None: 37 | # task_regex is None 38 | base_backfill_params.validate_regex_pattern() 39 | 40 | # valid regex pattern 41 
| base_backfill_params.task_regex = "/ab+c/" 42 | base_backfill_params.validate_regex_pattern() 43 | 44 | # invalid regex pattern 45 | base_backfill_params.task_regex = "[.*" 46 | with pytest.raises(ValueError): 47 | base_backfill_params.validate_regex_pattern() 48 | 49 | 50 | def test_generate_backfill_command(base_backfill_params) -> None: 51 | """ 52 | Assert backfill commands are equivalent between the backfill plugin and backfill DAG. 53 | 54 | Expected results were generated from the plugin implementation 55 | 56 | """ 57 | test_start_date = "2022-01-01" 58 | test_end_date = "2022-01-10" 59 | 60 | test_params: list[BackfillParams] = [ 61 | BackfillParams( 62 | clear=True, 63 | dry_run=True, 64 | task_regex=None, 65 | dag_name="test_value", 66 | start_date=test_start_date, 67 | end_date=test_end_date, 68 | ), 69 | BackfillParams( 70 | clear=False, 71 | dry_run=True, 72 | task_regex=None, 73 | dag_name="test_value", 74 | start_date=test_start_date, 75 | end_date=test_end_date, 76 | ), 77 | BackfillParams( 78 | clear=True, 79 | dry_run=False, 80 | task_regex=None, 81 | dag_name="test_value", 82 | start_date=test_start_date, 83 | end_date=test_end_date, 84 | ), 85 | BackfillParams( 86 | clear=False, 87 | dry_run=False, 88 | task_regex=None, 89 | dag_name="test_value", 90 | start_date=test_start_date, 91 | end_date=test_end_date, 92 | ), 93 | BackfillParams( 94 | clear=False, 95 | dry_run=False, 96 | task_regex="/ab+c/", 97 | dag_name="test_value", 98 | start_date=test_start_date, 99 | end_date=test_end_date, 100 | ), 101 | ] 102 | 103 | expected_results = [ 104 | [ 105 | "timeout", 106 | "60", 107 | "airflow", 108 | "tasks", 109 | "clear", 110 | "-s", 111 | "2022-01-01", 112 | "-e", 113 | "2022-01-10", 114 | "test_value", 115 | ], 116 | [ 117 | "airflow", 118 | "dags", 119 | "backfill", 120 | "--donot-pickle", 121 | "--dry-run", 122 | "-s", 123 | "2022-01-01", 124 | "-e", 125 | "2022-01-10", 126 | "test_value", 127 | ], 128 | [ 129 | "airflow", 130 | "tasks", 131 | "clear", 132 | "-y", 133 | "-s", 134 | "2022-01-01", 135 | "-e", 136 | "2022-01-10", 137 | "test_value", 138 | ], 139 | [ 140 | "airflow", 141 | "dags", 142 | "backfill", 143 | "--donot-pickle", 144 | "-s", 145 | "2022-01-01", 146 | "-e", 147 | "2022-01-10", 148 | "test_value", 149 | ], 150 | [ 151 | "airflow", 152 | "dags", 153 | "backfill", 154 | "--donot-pickle", 155 | "-t", 156 | "/ab+c/", 157 | "-s", 158 | "2022-01-01", 159 | "-e", 160 | "2022-01-10", 161 | "test_value", 162 | ], 163 | ] 164 | 165 | for params, result in zip(test_params, expected_results): 166 | backfill_command = params.generate_backfill_command() 167 | assert backfill_command == result 168 | -------------------------------------------------------------------------------- /tests/utils/test_tags.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from utils.tags import InvalidTagError, Tag 4 | 5 | 6 | @pytest.mark.parametrize( 7 | ("actual", "expected"), 8 | [ 9 | (Tag.ImpactTier.tier_1, "impact/tier_1"), 10 | (Tag.ImpactTier.tier_2, "impact/tier_2"), 11 | (Tag.ImpactTier.tier_3, "impact/tier_3"), 12 | ], 13 | ) 14 | def test_valid_impact_tag(actual, expected): 15 | assert actual == expected 16 | 17 | 18 | @pytest.mark.parametrize( 19 | ("obj", "attr", "expected"), 20 | [ 21 | (Tag.ImpactTier, "tier_1", "impact/tier_1"), 22 | (Tag.ImpactTier, "tier_2", "impact/tier_2"), 23 | (Tag.ImpactTier, "tier_3", "impact/tier_3"), 24 | ], 25 | ) 26 | def test_get_impact_tag(obj, attr, expected): 27 | assert 
getattr(obj, attr) == expected 28 | 29 | 30 | @pytest.mark.parametrize( 31 | "invalid_input", 32 | [ 33 | "tier_4", 34 | "", 35 | "bq-etl", 36 | ], 37 | ) 38 | def test_invalid_impact_tag(invalid_input): 39 | with pytest.raises(InvalidTagError): 40 | getattr(Tag.ImpactTier, invalid_input) 41 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-airflow/1d80ddd23c83f50787e46fa43e0929d9a13185de/utils/__init__.py -------------------------------------------------------------------------------- /utils/backfill.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import dataclasses 4 | import datetime 5 | import re 6 | 7 | 8 | @dataclasses.dataclass 9 | class BackfillParams: 10 | dag_name: str 11 | start_date: str 12 | end_date: str 13 | clear: bool 14 | dry_run: bool 15 | task_regex: str | None 16 | 17 | def validate_date_range(self) -> None: 18 | start_date = datetime.datetime.fromisoformat(self.start_date) 19 | end_date = datetime.datetime.fromisoformat(self.end_date) 20 | if start_date > end_date: 21 | raise ValueError( 22 | f"`start_date`={self.start_date} is greater than `end_date`={self.end_date}" 23 | ) 24 | 25 | def validate_regex_pattern(self) -> None: 26 | if self.task_regex: 27 | try: 28 | re.compile(self.task_regex) 29 | except re.error: 30 | raise ValueError( 31 | f"Invalid regex pattern for `task_regex`={self.task_regex}" 32 | ) from None 33 | 34 | def generate_backfill_command(self) -> list[str]: 35 | """ 36 | Backfill command based on the Airflow plugin implemented by hwoo. 37 | 38 | Original implementation in plugins/backfill/main.py 39 | 40 | """ 41 | # Construct the airflow command 42 | cmd = ["airflow"] 43 | 44 | if self.clear: 45 | cmd.extend(["tasks", "clear"]) 46 | 47 | if self.dry_run: 48 | # For dry runs we simply time out to avoid zombie procs waiting on user input. 49 | # The output is what we're interested in 50 | timeout_list = ["timeout", "60"] 51 | cmd = timeout_list + cmd 52 | else: 53 | cmd.append("-y") 54 | 55 | if self.task_regex: 56 | cmd.extend(["-t", str(self.task_regex)]) 57 | else: 58 | cmd.extend(["dags", "backfill", "--donot-pickle"]) 59 | if self.dry_run: 60 | cmd.append("--dry-run") 61 | 62 | if self.task_regex: 63 | cmd.extend(["-t", str(self.task_regex)]) 64 | 65 | cmd.extend( 66 | ["-s", str(self.start_date), "-e", str(self.end_date), str(self.dag_name)] 67 | ) 68 | 69 | return cmd 70 | -------------------------------------------------------------------------------- /utils/callbacks.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from airflow.models.taskinstance import clear_task_instances 4 | from airflow.utils.context import Context 5 | from airflow.utils.db import provide_session 6 | from sqlalchemy.orm.session import Session 7 | 8 | if TYPE_CHECKING: 9 | from airflow.models.dagrun import DagRun 10 | 11 | 12 | @provide_session 13 | def retry_tasks_callback(context: Context, session: Session | None = None) -> None: 14 | """ 15 | Clear tasks specified by the `retry_tasks` task param. 16 | 17 | Intended to be used as an `on_retry_callback` to also retry other tasks when a task fails.
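For example, a task created with `on_retry_callback=retry_tasks_callback` in a DAG whose params include `{"retry_tasks": ["other_task"]}` (illustrative task id) will also clear `other_task` whenever it goes into retry.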
18 | """ 19 | retry_task_ids: list[str] = context["params"].get("retry_tasks", []) 20 | if isinstance(retry_task_ids, str): 21 | retry_task_ids = [retry_task_ids] 22 | dag_run: DagRun = context["dag_run"] 23 | retry_task_instances = [ 24 | task_instance 25 | for task_instance in dag_run.get_task_instances(session=session) 26 | if task_instance.task_id in retry_task_ids 27 | ] 28 | if retry_task_instances: 29 | clear_task_instances(retry_task_instances, session=session) 30 | -------------------------------------------------------------------------------- /utils/constants.py: -------------------------------------------------------------------------------- 1 | DS_WEEKLY = ( 2 | "{% if dag_run.external_trigger %}" 3 | "{{ ds_nodash }}" 4 | "{% else %}" 5 | '{{ macros.ds_format(macros.ds_add(ds, 6), "%Y-%m-%d", "%Y%m%d") }}' 6 | "{% endif %}" 7 | ) 8 | 9 | FAILED_STATES = ["failed", "upstream_failed", "skipped"] 10 | 11 | ALLOWED_STATES = ["success"] 12 | -------------------------------------------------------------------------------- /utils/glam_subdags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-airflow/1d80ddd23c83f50787e46fa43e0929d9a13185de/utils/glam_subdags/__init__.py -------------------------------------------------------------------------------- /utils/glam_subdags/general.py: -------------------------------------------------------------------------------- 1 | from airflow.models import DAG 2 | 3 | from utils.gcp import bigquery_etl_query 4 | 5 | 6 | def merge_params(min_param, max_param, additional_params): 7 | parameters = ( 8 | f"min_sample_id:INT64:{min_param}", 9 | f"max_sample_id:INT64:{max_param}", 10 | ) 11 | 12 | if additional_params is not None: 13 | parameters += additional_params 14 | 15 | return parameters 16 | 17 | 18 | def repeated_subdag( 19 | parent_dag_name, 20 | child_dag_name, 21 | default_args, 22 | schedule_interval, 23 | billing_project_id, 24 | table_project_id, 25 | dataset_id, 26 | fully_qualified_dataset_id, 27 | additional_params=None, 28 | num_partitions=5, 29 | date_partition_parameter="submission_date", 30 | docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest", 31 | parallel=False, 32 | ): 33 | dag = DAG( 34 | f"{parent_dag_name}.{child_dag_name}", 35 | default_args=default_args, 36 | schedule_interval=schedule_interval, 37 | ) 38 | 39 | # This task runs first and replaces the relevant partition, followed 40 | # by the next tasks that append to the same partition of the same table. 
41 | NUM_SAMPLE_IDS = 100 42 | PARTITION_SIZE = NUM_SAMPLE_IDS // num_partitions 43 | 44 | if NUM_SAMPLE_IDS % num_partitions != 0: 45 | raise ValueError( 46 | f"Number of partitions must be a divisor " 47 | f"of the number of sample ids ({NUM_SAMPLE_IDS})" 48 | ) 49 | 50 | task_0 = bigquery_etl_query( 51 | reattach_on_restart=True, 52 | task_id=f"{child_dag_name}_0", 53 | destination_table=f"{child_dag_name}_v1", 54 | dataset_id=fully_qualified_dataset_id, 55 | sql_file_path=f"sql/{table_project_id}/{dataset_id}/{child_dag_name}_v1/query.sql", 56 | project_id=billing_project_id, 57 | depends_on_past=True, 58 | parameters=merge_params(0, PARTITION_SIZE - 1, additional_params), 59 | date_partition_parameter=date_partition_parameter, 60 | arguments=("--replace",), 61 | dag=dag, 62 | docker_image=docker_image, 63 | ) 64 | 65 | upstream_task = task_0 66 | 67 | for partition in range(1, num_partitions): 68 | min_param = partition * PARTITION_SIZE 69 | max_param = min_param + PARTITION_SIZE - 1 70 | 71 | task = bigquery_etl_query( 72 | reattach_on_restart=True, 73 | task_id=f"{child_dag_name}_{partition}", 74 | destination_table=f"{child_dag_name}_v1", 75 | dataset_id=fully_qualified_dataset_id, 76 | sql_file_path=f"sql/{table_project_id}/{dataset_id}/{child_dag_name}_v1/query.sql", 77 | project_id=billing_project_id, 78 | depends_on_past=True, 79 | parameters=merge_params(min_param, max_param, additional_params), 80 | date_partition_parameter=date_partition_parameter, 81 | arguments=( 82 | "--append_table", 83 | "--noreplace", 84 | ), 85 | dag=dag, 86 | docker_image=docker_image, 87 | ) 88 | upstream_task >> task 89 | if not parallel: 90 | upstream_task = task 91 | 92 | return dag 93 | -------------------------------------------------------------------------------- /utils/glam_subdags/histograms.py: -------------------------------------------------------------------------------- 1 | from airflow.models import DAG 2 | 3 | from utils.gcp import bigquery_etl_query 4 | 5 | GLAM_HISTOGRAM_AGGREGATES_FINAL_SUBDAG = "clients_histogram_aggregates" 6 | 7 | 8 | def histogram_aggregates_subdag( 9 | parent_dag_name, 10 | child_dag_name, 11 | default_args, 12 | schedule_interval, 13 | dataset_id, 14 | fully_qualified_dataset, 15 | billing_project_id, 16 | table_project_id="moz-fx-data-shared-prod", 17 | is_dev=False, 18 | docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest", 19 | ): 20 | GLAM_HISTOGRAM_AGGREGATES_SUBDAG = f"{parent_dag_name}.{child_dag_name}" 21 | default_args["depends_on_past"] = True 22 | dag = DAG( 23 | GLAM_HISTOGRAM_AGGREGATES_SUBDAG, 24 | default_args=default_args, 25 | schedule_interval=schedule_interval, 26 | ) 27 | 28 | clients_histogram_aggregates_new = bigquery_etl_query( 29 | reattach_on_restart=True, 30 | task_id="clients_histogram_aggregates_new", 31 | destination_table="clients_histogram_aggregates_new_v1", 32 | dataset_id=fully_qualified_dataset, 33 | sql_file_path=f"sql/{table_project_id}/{dataset_id}/clients_histogram_aggregates_new_v1/query.sql", 34 | project_id=billing_project_id, 35 | date_partition_parameter=None, 36 | parameters=("submission_date:DATE:{{ds}}",), 37 | arguments=("--replace",), 38 | dag=dag, 39 | docker_image=docker_image, 40 | ) 41 | 42 | clients_histogram_aggregates_final = bigquery_etl_query( 43 | reattach_on_restart=True, 44 | task_id="clients_histogram_aggregates_v2", 45 | destination_table="clients_histogram_aggregates_v2", 46 | dataset_id=fully_qualified_dataset, 47 | 
sql_file_path=f"sql/{table_project_id}/{dataset_id}/clients_histogram_aggregates_v2/query.sql", 48 | project_id=billing_project_id, 49 | depends_on_past=True, 50 | parameters=("submission_date:DATE:{{ds}}",), 51 | date_partition_parameter=None, 52 | arguments=("--replace",), 53 | dag=dag, 54 | docker_image=docker_image, 55 | ) 56 | 57 | clients_histogram_aggregates_new >> clients_histogram_aggregates_final 58 | return dag 59 | -------------------------------------------------------------------------------- /utils/patched/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-airflow/1d80ddd23c83f50787e46fa43e0929d9a13185de/utils/patched/__init__.py -------------------------------------------------------------------------------- /utils/slack.py: -------------------------------------------------------------------------------- 1 | from airflow.models import Variable 2 | from airflow.providers.slack.operators.slack import SlackAPIPostOperator 3 | 4 | SLACK_CHANNEL = "#airflow-alerts" 5 | 6 | 7 | def if_task_fails_alert_slack(context): 8 | failed_alert = SlackAPIPostOperator( 9 | task_id="slack_failed", 10 | channel=SLACK_CHANNEL, 11 | token=Variable.get("slack_secret_token"), 12 | text=""" 13 | :red_circle: Task Failed. 14 | *Task*: {task} 15 | *Dag*: {dag} 16 | *Date*: {ds} 17 | """.format( 18 | task=context.get("task_instance").task_id, 19 | dag=context.get("task_instance").dag_id, 20 | ds=context.get("ds"), 21 | ), 22 | ) 23 | return failed_alert.execute(context=context) 24 | -------------------------------------------------------------------------------- /utils/tags.py: -------------------------------------------------------------------------------- 1 | """Module with Airflow tag definitions.""" 2 | 3 | from enum import Enum, member 4 | 5 | 6 | class InvalidTagError(AttributeError): 7 | pass 8 | 9 | 10 | class Tag(Enum): 11 | """Enum containing available Airflow tags.""" 12 | 13 | def __getattr__(self, item: str) -> str: 14 | """ 15 | Simplifies accessing enum values. 16 | 17 | Instead of Tag.ImpactTier.value.tier_1.value we can 18 | just use Tag.ImpactTier.tier_1. 19 | Simplify accessing enum values. 20 | 21 | Instead of Tag.ImpactTier.value.tier_1.value we can just use 22 | Tag.ImpactTier.tier_1. 23 | 24 | # source: https://newbedev.com/enum-of-enums-in-python 25 | """ 26 | 27 | if item == "_value_": 28 | raise InvalidTagError 29 | 30 | try: 31 | ret_val = getattr(self.value, item).value 32 | except AttributeError as _err: 33 | raise InvalidTagError() from _err 34 | 35 | return ret_val 36 | 37 | @member 38 | class ImpactTier(Enum): 39 | """Valid options for Impact tier tag.""" 40 | 41 | tier_1: str = "impact/tier_1" 42 | tier_2: str = "impact/tier_2" 43 | tier_3: str = "impact/tier_3" 44 | 45 | @member 46 | class Triage(Enum): 47 | """Tag for conveying information to the engineer on triage.""" 48 | 49 | confidential: str = "triage/confidential" 50 | record_only: str = "triage/record_only" 51 | no_triage: str = "triage/no_triage" 52 | 53 | @member 54 | class Repo(Enum): 55 | """Valid options for Repo tag.""" 56 | 57 | bqetl: str = "repo/bigquery-etl" 58 | airflow: str = "repo/telemetry-airflow" 59 | private_bqetl: str = "repo/private-bigquery-etl" 60 | --------------------------------------------------------------------------------