├── .circleci └── config.yml ├── .dockerignore ├── .github └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── bin ├── add_gcp_creds ├── start_gke └── stop_gke ├── config └── airflow_local_settings.py ├── constraints.txt ├── dags ├── .airflowignore ├── __init__.py ├── adm_export.py ├── app_store_analytics.py ├── backfill.py ├── bhr_collection.py ├── bqetl_artifact_deployment.py ├── bqetl_backfill.py ├── bqetl_backfill_complete.py ├── bqetl_backfill_initiate.py ├── broken_site_report_ml.py ├── catalyst.py ├── clean_gke_pods.py ├── contextual_services_import.py ├── copy_deduplicate.py ├── crash_symbolication.py ├── dap_collector.py ├── dap_collector_ppa_dev.py ├── dap_collector_ppa_prod.py ├── dbt_daily.py ├── eam_slack_channels.py ├── eam_workday_everfi_integration.py ├── eam_workday_netsuite.py ├── experiment_auto_sizing.py ├── experiments_live.py ├── extensions.py ├── firefox_public_data_report.py ├── fxci_metric_export.py ├── fxci_pulse_export.py ├── ga4_site_metrics_summary_backfill.py ├── glam.py ├── glam_fenix.py ├── glam_fenix_release.py ├── glam_fog.py ├── glam_fog_release.py ├── glam_glean_imports.py ├── graphics_telemetry.py ├── jetstream.py ├── kpi_forecasting.py ├── looker.py ├── looker_usage_analysis.py ├── ltv.py ├── mad_server.py ├── merino_jobs.py ├── microsoft_store.py ├── operational_monitoring.py ├── operational_monitoring_backfill.py ├── partybal.py ├── play_store_export.py ├── probe_scraper.py ├── publish_bqetl_static.py ├── search_alert.py ├── search_forecasting.py ├── shredder.py ├── shredder_backfill.py ├── socorro_import.py ├── update_orphaning_dashboard_etl.py └── webcompat_kb.py ├── dataproc_bootstrap ├── README.md ├── airflow_gcp.sh ├── dataproc_init.sh ├── fx_usage_init.sh └── python-requirements.txt ├── docker-compose.yml ├── jobs ├── addon_recommender.sh ├── bugzilla_dataset.sh ├── ltv_daily.py ├── moz_dataproc_runner.py ├── mozaggregator_runner.py ├── pip-install.sh ├── socorro_import_crash_data.py ├── telemetry_batch_view.py ├── txp_pulse.sh └── update_orphaning_dashboard_etl.py ├── operators ├── __init__.py └── gcp_container_operator.py ├── plugins ├── mozmenu.py ├── timetable.py └── version_endpoint.py ├── pyproject.toml ├── requirements-dev.in ├── requirements-dev.txt ├── requirements-override.txt ├── requirements.in ├── requirements.txt ├── resources ├── dev_connections.json ├── dev_variables.json └── dev_webserver_config.py ├── tests ├── __init__.py ├── conftest.py ├── dags │ ├── __init__.py │ └── test_dag_validity.py ├── plugins │ ├── __init__.py │ ├── test_timetable.py │ └── test_version_endpoint.py └── utils │ ├── __init__.py │ ├── test_backfill.py │ └── test_tags.py └── utils ├── __init__.py ├── backfill.py ├── callbacks.py ├── constants.py ├── dataproc.py ├── gcp.py ├── glam_subdags ├── __init__.py ├── general.py ├── generate_query.py └── histograms.py ├── patched ├── __init__.py └── dataproc_hook.py ├── slack.py └── tags.py /.dockerignore: -------------------------------------------------------------------------------- 1 | # Git 2 | .git 3 | .gitignore 4 | 5 | # CI 6 | .circleci/ 7 | 8 | # Docker 9 | docker-compose.yml 10 | 11 | # cache 12 | __pycache__/ 13 | .pytest_cache/ 14 | 15 | # Airflow stuff 16 | logs/ 17 | 18 | # Virtual environment 19 | .env/ 20 | .venv/ 21 | venv/ 22 | 23 | # Airflow dev resources 24 | resources/ 25 | 26 | # IDE 27 | .idea 28 | .vscode/ 29 | -------------------------------------------------------------------------------- 
/.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | 7 | 8 | ## Related Tickets & Documents 9 | * DENG-XXXX 10 | * DSRE-XXXX 11 | 12 | 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.retry 3 | *undo-tree~ 4 | *.un~ 5 | venv/ 6 | .venv/ 7 | .env 8 | 9 | logs 10 | unittests.cfg 11 | airflow-webserver.pid 12 | airflow-worker.pid 13 | .config 14 | .viminfo 15 | .credentials 16 | .bash_history 17 | .mysql_history 18 | 19 | /dags/bigquery-etl-dags 20 | /dags/bigquery-etl-dags/* 21 | 22 | *~ 23 | 24 | .cache 25 | 26 | # IDE 27 | .idea 28 | .vscode/ -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Community Participation Guidelines 2 | 3 | This repository is governed by Mozilla's code of conduct and etiquette guidelines. 4 | For more details, please read the 5 | [Mozilla Community Participation Guidelines](https://www.mozilla.org/about/governance/policies/participation/). 6 | 7 | ## How to Report 8 | For more information on how to report violations of the Community Participation Guidelines, please read our '[How to Report](https://www.mozilla.org/about/governance/policies/participation/reporting/)' page. 9 | 10 | 16 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:slim-2.10.5-python3.11 2 | 3 | ARG PROJECT_DIR="/opt/airflow" 4 | 5 | ENV PYTHONUNBUFFERED=1 6 | ENV PYTHONPATH="$PYTHONPATH:$PROJECT_DIR" 7 | ENV AIRFLOW_HOME=$PROJECT_DIR 8 | 9 | USER root 10 | 11 | RUN apt-get update \ 12 | && apt-get install -y --no-install-recommends build-essential 13 | 14 | # Legacy docker image dependencies to be reviewed 15 | RUN apt-get install -y --no-install-recommends \ 16 | lsb-release gnupg curl && \ 17 | CLOUD_SDK_REPO="cloud-sdk-$(lsb_release -c -s)" && \ 18 | echo "deb https://packages.cloud.google.com/apt $CLOUD_SDK_REPO main" | tee -a /etc/apt/sources.list.d/google-cloud-cli.list && \ 19 | curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - && \ 20 | apt-get update -y && apt-get install google-cloud-cli -y && apt-get install google-cloud-cli-gke-gcloud-auth-plugin && \ 21 | apt-get remove -y lsb-release gnupg 22 | 23 | RUN apt-get autoremove -yqq --purge && \ 24 | apt-get clean && \ 25 | rm -rf /var/lib/apt/lists/* 26 | 27 | USER airflow 28 | 29 | COPY requirements.txt / 30 | RUN pip install --no-cache-dir -r /requirements.txt 31 | COPY requirements-override.txt / 32 | RUN pip install --no-cache-dir -r /requirements-override.txt --upgrade 33 | 34 | WORKDIR $PROJECT_DIR 35 | 36 | COPY . . 
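# Note: dependencies are installed in two passes on purpose -- requirements.txt first,
# then requirements-override.txt applied on top with --upgrade -- the same two-step the
# Makefile's pip-install-local target uses for a local virtualenv, so the image and a
# local environment resolve the same pins.
# Local builds go through docker-compose rather than a bare docker build (a sketch of
# the supported flow; see the Makefile targets below for the full commands):
#   make build   # docker-compose build
#   make up      # seeds .env, starts the stack, imports dev variables/connections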
37 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build clean clean-gke fixes gke help pip-compile pip-install-local stop test up 2 | 3 | 4 | help: 5 | @echo "Welcome to the Telemetry Airflow\n" 6 | @echo "The list of commands for local development:\n" 7 | @echo " build Builds the docker images for the docker-compose setup" 8 | @echo " clean Stops and removes all docker containers" 9 | @echo " fixes Applies Black and Ruff fixes to Python files" 10 | @echo " pip-compile Compile dependencies from 'requirements.in' into 'requirements.txt'" 11 | @echo " pip-install-local Install pip project requirements to your local environment" 12 | @echo " test Runs pytest" 13 | @echo " up Runs the whole stack, served under http://localhost:8080/" 14 | @echo " gke Create a sandbox gke cluster for testing" 15 | @echo " clean-gke Delete the sandbox gke cluster" 16 | @echo " stop Stops the docker containers" 17 | 18 | build: 19 | docker-compose build 20 | 21 | pip-compile: 22 | pip-compile --strip-extras --no-annotate requirements.in 23 | pip-compile --strip-extras --no-annotate requirements-dev.in 24 | 25 | fixes: 26 | ruff check . --fix 27 | ruff format . 28 | 29 | clean: stop 30 | docker-compose down --volumes 31 | docker-compose rm -f 32 | rm -rf logs/* 33 | if [ -f airflow-worker.pid ]; then rm airflow-worker.pid; fi 34 | 35 | pip-install-local: 36 | pip install -r requirements.txt -r requirements-dev.txt 37 | pip install -r requirements-override.txt --upgrade 38 | 39 | stop: 40 | docker-compose down 41 | docker-compose stop 42 | 43 | up: 44 | grep -qF 'AIRFLOW_UID=' .env || echo "AIRFLOW_UID=$$(id -u)" >> .env 45 | grep -qF 'FERNET_KEY=' .env || echo "FERNET_KEY=$$(python3 -c "from cryptography.fernet import Fernet; fernet_key = Fernet.generate_key(); print(fernet_key.decode())")" >> .env 46 | mkdir -p logs 47 | docker-compose up --wait 48 | docker-compose exec airflow-webserver airflow variables import dev_variables.json 49 | docker-compose exec airflow-webserver airflow connections import dev_connections.json 50 | 51 | gke: 52 | bin/start_gke 53 | 54 | clean-gke: 55 | bin/stop_gke 56 | 57 | test: 58 | python -m pytest tests/ 59 | -------------------------------------------------------------------------------- /bin/add_gcp_creds: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eou pipefail 4 | 5 | keyfile_path=${1?"Must specify keyfile path"} 6 | 7 | connection=${2:-"google_cloud_airflow_gke"} 8 | 9 | # Wait for full display until after checks 10 | set -x 11 | 12 | function format_gcp() { 13 | KEYFILE="$1" python3 - </dev/null 2>&1; then 18 | echo "cluster $CLUSTERNAME exists" 19 | else 20 | echo "cluster $CLUSTERNAME doesn't exist. creating..." 21 | gcloud container clusters create $CLUSTERNAME \ 22 | --enable-stackdriver-kubernetes \ 23 | -m n1-standard-4 \ 24 | --release-channel="stable" \ 25 | --enable-master-authorized-networks \ 26 | --master-authorized-networks="$MY_IP/32" \ 27 | --region us-west1 \ 28 | --num-nodes=1 \ 29 | --scopes="cloud-platform" \ 30 | --service-account="data-gke-sandbox-runner@moz-fx-data-gke-sandbox.iam.gserviceaccount.com" \ 31 | --project moz-fx-data-gke-sandbox 32 | 33 | fi 34 | 35 | echo "fetching secret..." 
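# The block below is what "fetching secret..." refers to: it reads the sandbox cluster
# credentials JSON from Secret Manager (secret gke-sandbox-creds in
# moz-fx-data-gke-sandbox) and registers it as the google_cloud_gke_sandbox connection
# inside the locally running Airflow webserver container, so GKEPodOperator tasks in a
# local `make up` stack can reach the sandbox cluster. It assumes the docker-compose
# stack is already running; otherwise the `docker ps --filter name=web -q` lookup comes
# back empty and the script only prints an error.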
36 | JSON_CREDS=$(gcloud secrets versions access latest --secret="gke-sandbox-creds" --project moz-fx-data-gke-sandbox) 37 | 38 | # Upload secret to local wtmo 39 | GCP_CONN_ID="google_cloud_gke_sandbox" 40 | 41 | CONTAINER_ID=$(docker ps --filter name=web -q) 42 | if [ -z "$CONTAINER_ID" ]; then 43 | echo "ERROR: Airflow container is likely not running (or docker). Run 'make up' to start airflow containers" 44 | else 45 | echo "Web container id is $CONTAINER_ID. Adding gcp connection..." 46 | docker exec $CONTAINER_ID airflow connections delete $GCP_CONN_ID 47 | 48 | docker exec $CONTAINER_ID airflow connections add $GCP_CONN_ID \ 49 | --conn-type google_cloud_platform \ 50 | --conn-extra "$JSON_CREDS" 51 | fi 52 | 53 | echo "visit https://go.corp.mozilla.com/wtmodev for more info" 54 | -------------------------------------------------------------------------------- /bin/stop_gke: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This is to be used by the Makefile for a stop gke target. 4 | 5 | set -eo pipefail 6 | 7 | USERNAME=$(gcloud config get-value account | awk -F"@" '{print $1}') 8 | CLUSTERNAME=$USERNAME-gke-sandbox 9 | 10 | if gcloud container clusters describe $CLUSTERNAME --region us-west1 --project moz-fx-data-gke-sandbox >/dev/null 2>&1; then 11 | gcloud container clusters delete $CLUSTERNAME --region us-west1 --quiet --project moz-fx-data-gke-sandbox 12 | else 13 | echo "cluster $CLUSTERNAME does not exist" 14 | fi 15 | -------------------------------------------------------------------------------- /config/airflow_local_settings.py: -------------------------------------------------------------------------------- 1 | STATE_COLORS = { 2 | "queued": "gray", 3 | "running": "lime", 4 | "success": "#0000FF", # Rather than "green". 5 | "restarting": "violet", 6 | "failed": "red", 7 | "up_for_retry": "gold", 8 | "up_for_reschedule": "turquoise", 9 | "upstream_failed": "orange", 10 | "skipped": "pink", # Rather than "hotpink". 11 | "deferred": "mediumpurple", 12 | "removed": "lightgrey", 13 | "scheduled": "tan", 14 | } 15 | -------------------------------------------------------------------------------- /dags/.airflowignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-airflow/1d80ddd23c83f50787e46fa43e0929d9a13185de/dags/.airflowignore -------------------------------------------------------------------------------- /dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-airflow/1d80ddd23c83f50787e46fa43e0929d9a13185de/dags/__init__.py -------------------------------------------------------------------------------- /dags/adm_export.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from airflow import DAG 4 | from airflow.hooks.base import BaseHook 5 | from airflow.providers.cncf.kubernetes.secret import Secret 6 | from airflow.sensors.external_task import ExternalTaskSensor 7 | 8 | from operators.gcp_container_operator import GKEPodOperator 9 | from utils.constants import ALLOWED_STATES, FAILED_STATES 10 | from utils.tags import Tag 11 | 12 | DOCS = """\ 13 | Daily data exports of contextual services data aggregates to adMarketplace. 14 | This is a complementary approach to the near real-time sharing that is implemented 15 | in gcp-ingestion. 
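Each run uploads a single gzipped CSV of aggregates for the run date over SFTP; for example, the run for `submission_date=2022-03-04` uploads `files/Aggregated-Query-Data-03042022.csv.gz` (see the templated `DST_PATH` below).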
16 | 17 | Relies on the [`bq2stfp` container defined in `docker-etl`](https://github.com/mozilla/docker-etl/tree/main/jobs/bq2sftp) 18 | and credentials stored in the `adm_sftp` connection. 19 | 20 | For more context, see https://bugzilla.mozilla.org/show_bug.cgi?id=1729524 21 | """ 22 | 23 | default_args = { 24 | "owner": "wstuckey@mozilla.com", 25 | "start_date": datetime.datetime(2019, 7, 25), 26 | "email": ["telemetry-alerts@mozilla.com", "wstuckey@mozilla.com"], 27 | "email_on_failure": True, 28 | "email_on_retry": True, 29 | "depends_on_past": False, 30 | # If a task fails, retry it once after waiting at least 5 minutes 31 | "retries": 1, 32 | "retry_delay": datetime.timedelta(minutes=5), 33 | } 34 | 35 | dag_name = "adm_export" 36 | tags = [Tag.ImpactTier.tier_3] 37 | 38 | adm_sftp_secret = Secret( 39 | deploy_type="env", 40 | deploy_target="SFTP_PASSWORD", 41 | secret="airflow-gke-secrets", 42 | key="adm_export_secret__sftp_password", 43 | ) 44 | 45 | with DAG( 46 | dag_name, 47 | schedule_interval="0 5 * * *", 48 | doc_md=DOCS, 49 | default_args=default_args, 50 | tags=tags, 51 | ) as dag: 52 | conn = BaseHook.get_connection("adm_sftp") 53 | 54 | adm_daily_aggregates_to_sftp = GKEPodOperator( 55 | task_id="adm_daily_aggregates_to_sftp", 56 | name="adm_daily_aggregates_to_sftp", 57 | # See https://github.com/mozilla/docker-etl/pull/28 58 | image="gcr.io/moz-fx-data-airflow-prod-88e0/bq2sftp_docker_etl:latest", 59 | project_id="moz-fx-data-airflow-gke-prod", 60 | gcp_conn_id="google_cloud_airflow_gke", 61 | cluster_name="workloads-prod-v1", 62 | location="us-west1", 63 | env_vars={ 64 | "SFTP_USERNAME": conn.login, 65 | "SFTP_HOST": conn.host, 66 | "SFTP_PORT": str(conn.port), 67 | "KNOWN_HOSTS": conn.extra_dejson["known_hosts"], 68 | "SRC_TABLE": "moz-fx-data-shared-prod.search_terms_derived.adm_daily_aggregates_v1", 69 | # The run for submission_date=2022-03-04 will be named: 70 | # Aggregated-Query-Data-03042022.csv.gz 71 | "DST_PATH": 'files/Aggregated-Query-Data-{{ macros.ds_format(ds, "%Y-%m-%d", "%m%d%Y") }}.csv.gz', 72 | "SUBMISSION_DATE": "{{ ds }}", 73 | }, 74 | secrets=[adm_sftp_secret], 75 | email=[ 76 | "telemetry-alerts@mozilla.com", 77 | ], 78 | ) 79 | 80 | wait_for_clients_daily_export = ExternalTaskSensor( 81 | task_id="wait_for_adm_daily_aggregates", 82 | external_dag_id="bqetl_search_terms_daily", 83 | external_task_id="search_terms_derived__adm_daily_aggregates__v1", 84 | execution_delta=datetime.timedelta(hours=2), 85 | mode="reschedule", 86 | allowed_states=ALLOWED_STATES, 87 | failed_states=FAILED_STATES, 88 | pool="DATA_ENG_EXTERNALTASKSENSOR", 89 | email_on_retry=False, 90 | ) 91 | 92 | wait_for_clients_daily_export >> adm_daily_aggregates_to_sftp 93 | -------------------------------------------------------------------------------- /dags/app_store_analytics.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | 5 | from operators.gcp_container_operator import GKEPodOperator 6 | from utils.gcp import bigquery_etl_query 7 | from utils.tags import Tag 8 | 9 | default_args = { 10 | "owner": "telemetry-alerts@mozilla.com", 11 | "depends_on_past": False, 12 | "start_date": datetime(2020, 6, 23), 13 | "email_on_failure": True, 14 | "email_on_retry": True, 15 | "retries": 1, 16 | "retry_delay": timedelta(minutes=30), 17 | "email": [ 18 | "telemetry-alerts@mozilla.com", 19 | ], 20 | } 21 | 22 | PROJECT_ID = "moz-fx-data-marketing-prod" 23 | EXPORT_DATASET_ID 
= "apple_app_store_exported" 24 | DERIVED_DATASET_ID = "apple_app_store" 25 | 26 | APPS = [ 27 | ("989804926", "Firefox"), 28 | ("1489407738", "VPN"), 29 | ("1295998056", "WebXRViewer"), 30 | ("1314000270", "Lockwise"), 31 | ("1073435754", "Klar"), 32 | ("1055677337", "Focus"), 33 | ] 34 | 35 | DERIVED_TABLES = [ 36 | "metrics_by_app_referrer", 37 | "metrics_by_app_version", 38 | "metrics_by_campaign", 39 | "metrics_by_platform", 40 | "metrics_by_platform_version", 41 | "metrics_by_region", 42 | "metrics_by_source", 43 | "metrics_by_storefront", 44 | "metrics_by_web_referrer", 45 | "metrics_total", 46 | ] 47 | 48 | tags = [Tag.ImpactTier.tier_1] 49 | 50 | with DAG( 51 | "app_store_analytics", 52 | default_args=default_args, 53 | max_active_runs=1, 54 | schedule_interval="@daily", 55 | tags=tags, 56 | ) as dag: 57 | export_date = "macros.ds_add(ds, -2)" # previous day data is incomplete 58 | tasks = [] 59 | 60 | # App exports are scheduled sequentially to avoid hit api rate limit 61 | for i, (app_id, app_name) in enumerate(APPS): 62 | commands = [ 63 | "yarn", 64 | "--silent", # silent to hide arguments from logs 65 | "export", 66 | "--username={{ var.value.app_store_connect_username }}", 67 | "--password={{ var.value.app_store_connect_password }}", 68 | f"--app-id={app_id}", 69 | f"--app-name={app_name}", 70 | f"--start-date={{{{ {export_date} }}}}", 71 | f"--project={PROJECT_ID}", 72 | f"--dataset={EXPORT_DATASET_ID}", 73 | ] 74 | 75 | # First task will clear the day partition so that the only data in the table partition 76 | # is the data written by the current dag run and does not include unrecognized apps 77 | if i == 0: 78 | commands.append("--overwrite") 79 | 80 | app_store_analytics = GKEPodOperator( 81 | task_id=f"app_store_analytics_{app_name}", 82 | arguments=commands, 83 | image="gcr.io/moz-fx-data-airflow-prod-88e0/app-store-analytics-export:latest", 84 | gcp_conn_id="google_cloud_airflow_gke", 85 | dag=dag, 86 | ) 87 | 88 | if i > 0: 89 | app_store_analytics.set_upstream(tasks[i - 1]) 90 | 91 | tasks.append(app_store_analytics) 92 | 93 | # derived tables combine all metrics per dimension 94 | for derived_table in DERIVED_TABLES: 95 | combined_metrics_query = bigquery_etl_query( 96 | task_id=f"{derived_table}_query", 97 | project_id=PROJECT_ID, 98 | dataset_id=DERIVED_DATASET_ID, 99 | sql_file_path=f"sql/moz-fx-data-marketing-prod/{DERIVED_DATASET_ID}/{derived_table}/query.sql", 100 | # Override default date partition because data has multiple day lag 101 | destination_table=( 102 | f"{derived_table}${{{{ macros.ds_format({export_date}, '%Y-%m-%d', '%Y%m%d') }}}}" 103 | ), 104 | date_partition_parameter=None, 105 | parameters=[f"submission_date:DATE:{{{{ {export_date} }}}}"], 106 | dag=dag, 107 | ) 108 | 109 | combined_metrics_query.set_upstream(tasks[-1]) 110 | -------------------------------------------------------------------------------- /dags/backfill.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from enum import Enum 3 | 4 | from airflow.decorators import dag 5 | from airflow.models import DagModel 6 | from airflow.models.param import Param 7 | from airflow.operators.bash import BashOperator 8 | from airflow.operators.empty import EmptyOperator 9 | from airflow.operators.python import BranchPythonOperator, PythonOperator 10 | from airflow.utils.trigger_rule import TriggerRule 11 | 12 | from utils.backfill import BackfillParams 13 | from utils.tags import Tag 14 | 15 | 16 | class TaskId(Enum): 17 | dry_run = 
"dry_run" 18 | real_deal = "real_deal" 19 | clear_tasks = "clear_tasks" 20 | do_not_clear_tasks = "do_not_clear_tasks" 21 | 22 | 23 | def dry_run_branch_callable(params: dict) -> str: 24 | backfill_params = BackfillParams(**params) 25 | return TaskId.dry_run.value if backfill_params.dry_run else TaskId.real_deal.value 26 | 27 | 28 | def clear_branch_callable(params: dict) -> str: 29 | backfill_params = BackfillParams(**params) 30 | return ( 31 | TaskId.clear_tasks.value 32 | if backfill_params.clear 33 | else TaskId.do_not_clear_tasks.value 34 | ) 35 | 36 | 37 | def param_validation(params: dict) -> bool: 38 | backfill_params = BackfillParams(**params) 39 | backfill_params.validate_date_range() 40 | validate_dag_exists(dag_name=backfill_params.dag_name) 41 | backfill_params.validate_regex_pattern() 42 | return True 43 | 44 | 45 | def validate_dag_exists(dag_name: str) -> None: 46 | dag_instance = DagModel.get_dagmodel(dag_name) 47 | if dag_instance is None: 48 | raise ValueError(f"`dag_name`={dag_name} does not exist") 49 | 50 | 51 | def generate_bash_command(params: dict) -> str: 52 | backfill_params = BackfillParams(**params) 53 | return " ".join(backfill_params.generate_backfill_command()) 54 | 55 | 56 | doc_md = """ 57 | # Backfill DAG 58 | 59 | #### Use with caution 60 | 61 | #### Some tips/notes: 62 | 63 | * Always use dry run first. Especially when using task regex 64 | * Date formats are 2020-03-01 or 2020-03-01T00:00:00 65 | * Dry run for clearing tasks will show you the list of tasks that will be cleared 66 | * Dry run for backfilling will not show the list, but is useful in testing for input errors 67 | 68 | """ 69 | 70 | 71 | @dag( 72 | dag_id="backfill", 73 | schedule_interval=None, 74 | doc_md=doc_md, 75 | catchup=False, 76 | start_date=datetime.datetime(2022, 11, 1), 77 | dagrun_timeout=datetime.timedelta(days=1), 78 | tags=[Tag.ImpactTier.tier_3, Tag.Triage.record_only], 79 | render_template_as_native_obj=True, 80 | params={ 81 | "dag_name": Param("dag_name", type="string"), 82 | "start_date": Param( 83 | (datetime.datetime.today() - datetime.timedelta(days=10)).isoformat(), 84 | type="string", 85 | format="date-time", 86 | ), 87 | "end_date": Param( 88 | datetime.datetime.today().isoformat(), type="string", format="date-time" 89 | ), 90 | "clear": Param(False, type="boolean"), 91 | "dry_run": Param(True, type="boolean"), 92 | "task_regex": Param(None, type=["string", "null"]), 93 | }, 94 | ) 95 | def backfill_dag(): 96 | param_validation_task = PythonOperator( 97 | task_id="param_validation", 98 | python_callable=param_validation, 99 | op_kwargs={"params": "{{ dag_run.conf }}"}, 100 | ) 101 | 102 | dry_run_branch_task = BranchPythonOperator( 103 | task_id="dry_run_parameter", 104 | python_callable=dry_run_branch_callable, 105 | op_kwargs={"params": "{{ dag_run.conf }}"}, 106 | trigger_rule=TriggerRule.ONE_SUCCESS, 107 | ) 108 | 109 | dry_run_task = EmptyOperator(task_id=TaskId.dry_run.value) 110 | real_deal_task = EmptyOperator(task_id=TaskId.real_deal.value) 111 | 112 | clear_branch_task = BranchPythonOperator( 113 | task_id="clear_parameter", 114 | python_callable=clear_branch_callable, 115 | op_kwargs={"params": "{{ dag_run.conf }}"}, 116 | trigger_rule=TriggerRule.ONE_SUCCESS, 117 | ) 118 | 119 | clear_tasks_task = EmptyOperator(task_id=TaskId.clear_tasks.value) 120 | do_not_clear_tasks_task = EmptyOperator(task_id=TaskId.do_not_clear_tasks.value) 121 | 122 | generate_backfill_command_task = PythonOperator( 123 | task_id="generate_backfill_command", 124 | 
python_callable=generate_bash_command, 125 | op_kwargs={"params": "{{ dag_run.conf }}"}, 126 | trigger_rule=TriggerRule.ONE_SUCCESS, 127 | ) 128 | 129 | backfill_task = BashOperator( 130 | task_id="execute_backfill", 131 | bash_command="{{ ti.xcom_pull(task_ids='generate_backfill_command') }}", 132 | ) 133 | 134 | ( 135 | param_validation_task 136 | >> dry_run_branch_task 137 | >> [dry_run_task, real_deal_task] 138 | >> clear_branch_task 139 | >> [clear_tasks_task, do_not_clear_tasks_task] 140 | >> generate_backfill_command_task 141 | >> backfill_task 142 | ) 143 | 144 | 145 | dag = backfill_dag() 146 | -------------------------------------------------------------------------------- /dags/bhr_collection.py: -------------------------------------------------------------------------------- 1 | """ 2 | A processing job on top of BHR (Background Hang Reporter) pings. 3 | 4 | More information about the pings: https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/data/backgroundhangmonitor-ping.html 5 | 6 | BHR is related to the Background Hang Monitor in Firefox Desktop. 7 | See: [bug 1675103](https://bugzilla.mozilla.org/show_bug.cgi?id=1675103) 8 | 9 | The [job source](https://github.com/mozilla/python_mozetl/blob/main/mozetl/bhr_collection) 10 | is maintained in the mozetl repository. 11 | 12 | * Migrated from Databricks and now running as a scheduled Dataproc task. * 13 | 14 | The resulting aggregations are used by the following service: 15 | https://fqueze.github.io/hang-stats/#date=[DATE]&row=0 16 | """ 17 | 18 | import datetime 19 | 20 | from airflow import DAG 21 | from airflow.operators.subdag import SubDagOperator 22 | from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook 23 | from airflow.sensors.external_task import ExternalTaskSensor 24 | 25 | from utils.constants import ALLOWED_STATES, FAILED_STATES 26 | from utils.dataproc import get_dataproc_parameters, moz_dataproc_pyspark_runner 27 | from utils.tags import Tag 28 | 29 | default_args = { 30 | "owner": "bewu@mozilla.com", 31 | "depends_on_past": False, 32 | "start_date": datetime.datetime(2020, 11, 26), 33 | "email": [ 34 | "telemetry-alerts@mozilla.com", 35 | "kik@mozilla.com", 36 | "dothayer@mozilla.com", 37 | "bewu@mozilla.com", 38 | ], 39 | "email_on_failure": True, 40 | "email_on_retry": True, 41 | "retries": 1, 42 | "retry_delay": datetime.timedelta(minutes=30), 43 | } 44 | 45 | tags = [Tag.ImpactTier.tier_1] 46 | 47 | with DAG( 48 | "bhr_collection", 49 | default_args=default_args, 50 | schedule_interval="0 5 * * *", 51 | doc_md=__doc__, 52 | tags=tags, 53 | ) as dag: 54 | wait_for_bhr_ping = ExternalTaskSensor( 55 | task_id="wait_for_copy_deduplicate", 56 | external_dag_id="copy_deduplicate", 57 | external_task_id="copy_deduplicate_all", 58 | execution_delta=datetime.timedelta(hours=4), 59 | check_existence=True, 60 | mode="reschedule", 61 | allowed_states=ALLOWED_STATES, 62 | failed_states=FAILED_STATES, 63 | pool="DATA_ENG_EXTERNALTASKSENSOR", 64 | email_on_retry=False, 65 | dag=dag, 66 | ) 67 | 68 | params = get_dataproc_parameters("google_cloud_airflow_dataproc") 69 | 70 | shared_runner_args = { 71 | "parent_dag_name": dag.dag_id, 72 | "image_version": "1.5-debian10", 73 | "default_args": default_args, 74 | "python_driver_code": "https://raw.githubusercontent.com/mozilla/python_mozetl/main/mozetl/bhr_collection/bhr_collection.py", 75 | "init_actions_uris": [ 76 | "gs://dataproc-initialization-actions/python/pip-install.sh" 77 | ], 78 | "additional_metadata": { 79 | "PIP_PACKAGES": 
"boto3==1.16.20 click==7.1.2 google-cloud-storage==2.7.0" 80 | }, 81 | "additional_properties": { 82 | "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar", 83 | "spark:spark.driver.memory": "30g", 84 | "spark:spark.executor.memory": "20g", 85 | }, 86 | "idle_delete_ttl": 14400, 87 | # supported machine types depends on dataproc image version: 88 | # https://cloud.google.com/dataproc/docs/concepts/compute/supported-machine-types 89 | "master_machine_type": "n2-highmem-8", 90 | "worker_machine_type": "n2-highmem-4", 91 | "gcp_conn_id": params.conn_id, 92 | "service_account": params.client_email, 93 | "storage_bucket": params.storage_bucket, 94 | } 95 | 96 | bhr_collection = SubDagOperator( 97 | task_id="bhr_collection", 98 | dag=dag, 99 | subdag=moz_dataproc_pyspark_runner( 100 | dag_name="bhr_collection", 101 | cluster_name="bhr-collection-main-{{ ds }}", 102 | job_name="bhr-collection-main", 103 | **shared_runner_args, 104 | num_workers=6, 105 | py_args=[ 106 | "--date", 107 | "{{ ds }}", 108 | "--sample-size", 109 | "0.5", 110 | "--use_gcs", 111 | "--thread-filter", 112 | "Gecko", 113 | "--output-tag", 114 | "main", 115 | ], 116 | ), 117 | ) 118 | 119 | bhr_collection_child = SubDagOperator( 120 | task_id="bhr_collection_child", 121 | dag=dag, 122 | subdag=moz_dataproc_pyspark_runner( 123 | dag_name="bhr_collection_child", 124 | cluster_name="bhr-collection-child-{{ ds }}", 125 | job_name="bhr-collection-child", 126 | **shared_runner_args, 127 | num_workers=12, 128 | py_args=[ 129 | "--date", 130 | "{{ ds }}", 131 | "--sample-size", 132 | "0.08", # there are usually 12-15x more hangs in the child process than main 133 | "--use_gcs", 134 | "--thread-filter", 135 | "Gecko_Child", 136 | "--output-tag", 137 | "child", 138 | ], 139 | ), 140 | ) 141 | 142 | wait_for_bhr_ping >> [ 143 | bhr_collection, 144 | bhr_collection_child, 145 | ] 146 | -------------------------------------------------------------------------------- /dags/bqetl_backfill_complete.py: -------------------------------------------------------------------------------- 1 | """DAG for completing registered bigquery-etl backfills.""" 2 | 3 | from datetime import datetime 4 | 5 | from airflow import DAG 6 | from airflow.decorators import task, task_group 7 | from airflow.providers.slack.operators.slack import SlackAPIPostOperator 8 | 9 | from operators.gcp_container_operator import GKEPodOperator 10 | from utils.tags import Tag 11 | 12 | AUTOMATION_SLACK_CHANNEL = "#dataops-alerts" 13 | SLACK_CONNECTION_ID = "overwatch_slack" 14 | DOCKER_IMAGE = "gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest" 15 | 16 | tags = [Tag.ImpactTier.tier_3] 17 | 18 | default_args = { 19 | "email": [ 20 | "ascholtz@mozilla.com", 21 | "bewu@mozilla.com", 22 | "wichan@mozilla.com", 23 | ] 24 | } 25 | 26 | with DAG( 27 | "bqetl_backfill_complete", 28 | doc_md=__doc__, 29 | tags=tags, 30 | schedule_interval="@hourly", 31 | start_date=datetime(2024, 1, 1), 32 | catchup=False, 33 | default_args=default_args, 34 | ) as dag: 35 | detect_backfills = GKEPodOperator( 36 | task_id="detect_backfills", 37 | name="detect_backfills", 38 | cmds=["sh", "-cx"], 39 | arguments=[ 40 | "script/bqetl backfill scheduled --status=Complete --json_path=/airflow/xcom/return.json --ignore-old-entries", 41 | ], 42 | image=DOCKER_IMAGE, 43 | do_xcom_push=True, 44 | ) 45 | 46 | @task_group 47 | def complete_backfill(backfill): 48 | @task 49 | def prepare_slack_complete_message(entry): 50 | watcher_text = " ".join( 51 | f"<@{watcher.split('@')[0]}>" for 
watcher in entry["watchers"] 52 | ) 53 | return ( 54 | f"{watcher_text} :hourglass_flowing_sand: Completing backfill of `{entry['qualified_table_name']}` has started - currently swapping backfill data into production. " 55 | f"A snapshot of the current production data will be kept as a backup for 30 days. " 56 | f"You will receive another notification once the completing step is done." 57 | ) 58 | 59 | notify_initiate = SlackAPIPostOperator( 60 | task_id="slack_notify_initate", 61 | username="Backfill", 62 | slack_conn_id=SLACK_CONNECTION_ID, 63 | text=prepare_slack_complete_message(backfill), 64 | channel=AUTOMATION_SLACK_CHANNEL, 65 | ) 66 | 67 | @task 68 | def prepare_pod_parameters(entry): 69 | return [f"script/bqetl backfill complete { entry['qualified_table_name'] }"] 70 | 71 | process_backfill = GKEPodOperator( 72 | task_id="process_backfill", 73 | name="process_backfill", 74 | cmds=["sh", "-cx"], 75 | arguments=prepare_pod_parameters(backfill), 76 | image=DOCKER_IMAGE, 77 | reattach_on_restart=True, 78 | ) 79 | 80 | @task 81 | def prepare_slack_processing_complete_parameters(entry): 82 | watcher_text = " ".join( 83 | f"<@{watcher.split('@')[0]}>" for watcher in entry["watchers"] 84 | ) 85 | 86 | return f"{watcher_text} :white_check_mark: Backfill is complete for `{entry['qualified_table_name']}`. Production data has been updated." 87 | 88 | notify_processing_complete = SlackAPIPostOperator( 89 | task_id="slack_notify_processing_complete", 90 | username="Backfill", 91 | slack_conn_id=SLACK_CONNECTION_ID, 92 | text=prepare_slack_processing_complete_parameters(backfill), 93 | channel=AUTOMATION_SLACK_CHANNEL, 94 | ) 95 | 96 | notify_initiate >> process_backfill >> notify_processing_complete 97 | 98 | backfill_groups = complete_backfill.expand(backfill=detect_backfills.output) 99 | -------------------------------------------------------------------------------- /dags/bqetl_backfill_initiate.py: -------------------------------------------------------------------------------- 1 | """DAG for initiating registered bigquery-etl backfills.""" 2 | 3 | from datetime import datetime 4 | 5 | from airflow import DAG 6 | from airflow.decorators import task, task_group 7 | from airflow.providers.slack.operators.slack import SlackAPIPostOperator 8 | 9 | from operators.gcp_container_operator import GKEPodOperator 10 | from utils.tags import Tag 11 | 12 | AUTOMATION_SLACK_CHANNEL = "#dataops-alerts" 13 | SLACK_CONNECTION_ID = "overwatch_slack" 14 | DOCKER_IMAGE = "gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest" 15 | 16 | tags = [Tag.ImpactTier.tier_3] 17 | 18 | default_args = { 19 | "email": [ 20 | "ascholtz@mozilla.com", 21 | "bewu@mozilla.com", 22 | "wichan@mozilla.com", 23 | ] 24 | } 25 | 26 | with DAG( 27 | "bqetl_backfill_initiate", 28 | doc_md=__doc__, 29 | tags=tags, 30 | schedule_interval="@hourly", 31 | start_date=datetime(2024, 1, 1), 32 | catchup=False, 33 | default_args=default_args, 34 | ) as dag: 35 | detect_backfills = GKEPodOperator( 36 | task_id="detect_backfills", 37 | name="detect_backfills", 38 | cmds=["sh", "-cx"], 39 | arguments=[ 40 | "script/bqetl backfill scheduled --status=Initiate --json_path=/airflow/xcom/return.json --ignore-old-entries" 41 | ], 42 | image=DOCKER_IMAGE, 43 | do_xcom_push=True, 44 | ) 45 | 46 | @task_group 47 | def initiate_backfill(backfill): 48 | @task 49 | def prepare_slack_initiate_message(entry): 50 | watcher_text = " ".join( 51 | f"<@{watcher.split('@')[0]}>" for watcher in entry["watchers"] 52 | ) 53 | return f"{watcher_text} 
:hourglass_flowing_sand: Initiating backfill scheduled for `{entry['qualified_table_name']}`. You will receive another notification once the backfill is done." 54 | 55 | notify_initiate = SlackAPIPostOperator( 56 | task_id="slack_notify_initate", 57 | username="Backfill", 58 | slack_conn_id=SLACK_CONNECTION_ID, 59 | text=prepare_slack_initiate_message(backfill), 60 | channel=AUTOMATION_SLACK_CHANNEL, 61 | ) 62 | 63 | @task 64 | def prepare_pod_parameters(entry): 65 | return [f"script/bqetl backfill initiate { entry['qualified_table_name'] }"] 66 | 67 | process_backfill = GKEPodOperator( 68 | task_id="process_backfill", 69 | name="process_backfill", 70 | cmds=["sh", "-cx"], 71 | arguments=prepare_pod_parameters(backfill), 72 | image=DOCKER_IMAGE, 73 | reattach_on_restart=True, 74 | ) 75 | 76 | @task 77 | def prepare_slack_processing_complete_parameters(entry): 78 | project, dataset, table = entry["qualified_table_name"].split(".") 79 | backfill_table_id = ( 80 | f"{dataset}__{table}_{entry['entry_date'].replace('-', '_')}" 81 | ) 82 | staging_location = ( 83 | f"{project}.backfills_staging_derived.{backfill_table_id}" 84 | ) 85 | watcher_text = " ".join( 86 | f"<@{watcher.split('@')[0]}>" for watcher in entry["watchers"] 87 | ) 88 | 89 | return ( 90 | f"{watcher_text} :white_check_mark: Backfill processing is done. Staging location: `{staging_location}`. " 91 | "Please validate that your data has changed as you expect and complete your backfill by updating the Backfill entry's status to Complete in the bigquery-etl repository. " 92 | "Note that the staging table will expire in 30 days, so the backfill must be completed within 30 days." 93 | ) 94 | 95 | notify_processing_complete = SlackAPIPostOperator( 96 | task_id="slack_notify_processing_complete", 97 | username="Backfill", 98 | slack_conn_id=SLACK_CONNECTION_ID, 99 | text=prepare_slack_processing_complete_parameters(backfill), 100 | channel=AUTOMATION_SLACK_CHANNEL, 101 | ) 102 | 103 | notify_initiate >> process_backfill >> notify_processing_complete 104 | 105 | backfill_groups = initiate_backfill.expand(backfill=detect_backfills.output) 106 | -------------------------------------------------------------------------------- /dags/broken_site_report_ml.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from airflow import DAG 4 | 5 | from operators.gcp_container_operator import GKEPodOperator 6 | from utils.tags import Tag 7 | 8 | DOCS = """ 9 | ### ML classification of broken site reports 10 | 11 | #### Description 12 | 13 | Runs a Docker image that does the following: 14 | 15 | 1. Translates incoming broken sites reports to English with ML.TRANSLATE. 16 | 2. Classifies translated reports as valid/invalid using [bugbug](https://github.com/mozilla/bugbug). 17 | 3. Stores translation and classification results in BQ. 18 | 19 | The container is defined in 20 | [docker-etl](https://github.com/mozilla/docker-etl/tree/main/jobs/broken-site-report-ml) 21 | 22 | *Triage notes* 23 | 24 | As long as the most recent DAG run is successful this job doesn't need to be triaged. 
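For local debugging, the job's entry point can be run directly from the same container image the DAG uses (a sketch only; GCP credential wiring is omitted and is needed for BigQuery and ML.TRANSLATE access):

```
docker run --rm \
  gcr.io/moz-fx-data-airflow-prod-88e0/broken-site-report-ml_docker_etl:latest \
  python broken_site_report_ml/main.py \
    --bq_project_id moz-fx-dev-dschubert-wckb \
    --bq_dataset_id webcompat_user_reports
```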
25 | 26 | #### Owner 27 | 28 | kberezina@mozilla.com 29 | """ 30 | 31 | default_args = { 32 | "owner": "kberezina@mozilla.com", 33 | "email": ["kberezina@mozilla.com", "webcompat-internal@mozilla.org"], 34 | "depends_on_past": False, 35 | "start_date": datetime(2023, 12, 21), 36 | "email_on_failure": True, 37 | } 38 | 39 | 40 | tags = [ 41 | Tag.ImpactTier.tier_2, 42 | ] 43 | 44 | every_fifteen_minutes = "*/15 * * * *" 45 | 46 | with DAG( 47 | "broken_site_report_ml", 48 | default_args=default_args, 49 | max_active_runs=1, 50 | doc_md=DOCS, 51 | schedule_interval=every_fifteen_minutes, 52 | tags=tags, 53 | catchup=False, 54 | ) as dag: 55 | broken_site_report_ml = GKEPodOperator( 56 | task_id="broken_site_report_ml", 57 | arguments=[ 58 | "python", 59 | "broken_site_report_ml/main.py", 60 | "--bq_project_id", 61 | "moz-fx-dev-dschubert-wckb", 62 | "--bq_dataset_id", 63 | "webcompat_user_reports", 64 | ], 65 | image="gcr.io/moz-fx-data-airflow-prod-88e0/broken-site-report-ml_docker_etl:latest", 66 | dag=dag, 67 | ) 68 | -------------------------------------------------------------------------------- /dags/catalyst.py: -------------------------------------------------------------------------------- 1 | """ 2 | DAG to schedule generation of performance reports for recently completed nimbus experiments. 3 | 4 | See the [catalyst repository](https://github.com/mozilla/catalyst). 5 | 6 | *Triage notes* 7 | 8 | This app will perform some bigquery queries, and generate statistical reports based on that data which are 9 | then published to https://protosaur.dev/perf-reports/index.html. 10 | 11 | Generally, there should be minimal triage necessary for failures unless it's related to infrastructure issues. 12 | Any failures related to the app execution itself will be taken care of directly by the performance team. 
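The DAG runs at 04:00 UTC and only starts once the upstream datasets it reads are in place: each `ExternalTaskSensor` below uses an `execution_delta` equal to the gap between this DAG's schedule and the upstream DAG's, so, for example, `execution_delta=timedelta(hours=3)` makes the `copy_deduplicate` sensors wait on that DAG's 01:00 UTC run for the same day.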
13 | 14 | """ 15 | 16 | from datetime import datetime, timedelta 17 | 18 | from airflow import DAG 19 | from airflow.sensors.external_task import ExternalTaskSensor 20 | 21 | from operators.gcp_container_operator import GKEPodOperator 22 | from utils.constants import ALLOWED_STATES, FAILED_STATES 23 | from utils.tags import Tag 24 | 25 | default_args = { 26 | "owner": "dpalmeiro@mozilla.com", 27 | "email": [ 28 | "dpalmeiro@mozilla.com", 29 | ], 30 | "depends_on_past": False, 31 | "start_date": datetime(2025, 5, 5), 32 | "email_on_failure": True, 33 | "email_on_retry": True, 34 | "retries": 1, 35 | "retry_delay": timedelta(minutes=30), 36 | } 37 | 38 | tags = [Tag.ImpactTier.tier_2] 39 | 40 | with DAG( 41 | "catalyst", 42 | default_args=default_args, 43 | schedule_interval="0 4 * * *", 44 | doc_md=__doc__, 45 | tags=tags, 46 | ) as dag: 47 | # Built from repo https://github.com/mozilla/catalyst 48 | catalyst_image = "gcr.io/moz-fx-data-experiments/catalyst:latest" 49 | 50 | catalyst_run = GKEPodOperator( 51 | task_id="catalyst_run", 52 | name="catalyst_run", 53 | image=catalyst_image, 54 | email=default_args["email"], 55 | dag=dag, 56 | ) 57 | 58 | wait_for_clients_daily_export = ExternalTaskSensor( 59 | task_id="wait_for_clients_daily", 60 | external_dag_id="bqetl_main_summary", 61 | external_task_id="telemetry_derived__clients_daily__v6", 62 | execution_delta=timedelta(hours=2), 63 | mode="reschedule", 64 | allowed_states=ALLOWED_STATES, 65 | failed_states=FAILED_STATES, 66 | pool="DATA_ENG_EXTERNALTASKSENSOR", 67 | email_on_retry=False, 68 | dag=dag, 69 | ) 70 | 71 | wait_for_search_clients_daily = ExternalTaskSensor( 72 | task_id="wait_for_search_clients_daily", 73 | external_dag_id="bqetl_search", 74 | external_task_id="search_derived__search_clients_daily__v8", 75 | execution_delta=timedelta(hours=1), 76 | mode="reschedule", 77 | allowed_states=ALLOWED_STATES, 78 | failed_states=FAILED_STATES, 79 | pool="DATA_ENG_EXTERNALTASKSENSOR", 80 | email_on_retry=False, 81 | dag=dag, 82 | ) 83 | 84 | wait_for_bq_events = ExternalTaskSensor( 85 | task_id="wait_for_bq_main_events", 86 | external_dag_id="copy_deduplicate", 87 | external_task_id="bq_main_events", 88 | execution_delta=timedelta(hours=3), 89 | mode="reschedule", 90 | allowed_states=ALLOWED_STATES, 91 | failed_states=FAILED_STATES, 92 | pool="DATA_ENG_EXTERNALTASKSENSOR", 93 | email_on_retry=False, 94 | dag=dag, 95 | ) 96 | 97 | wait_for_copy_deduplicate_events = ExternalTaskSensor( 98 | task_id="wait_for_event_events", 99 | external_dag_id="copy_deduplicate", 100 | external_task_id="event_events", 101 | execution_delta=timedelta(hours=3), 102 | mode="reschedule", 103 | allowed_states=ALLOWED_STATES, 104 | failed_states=FAILED_STATES, 105 | pool="DATA_ENG_EXTERNALTASKSENSOR", 106 | email_on_retry=False, 107 | dag=dag, 108 | ) 109 | 110 | catalyst_run.set_upstream( 111 | [ 112 | wait_for_clients_daily_export, 113 | wait_for_search_clients_daily, 114 | wait_for_bq_events, 115 | wait_for_copy_deduplicate_events, 116 | ] 117 | ) 118 | -------------------------------------------------------------------------------- /dags/clean_gke_pods.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | 5 | from operators.gcp_container_operator import GKEPodOperator 6 | from utils.tags import Tag 7 | 8 | docs = """ 9 | ### Clean GKE Pods 10 | 11 | Failures can be ignored during Airflow Triage. This job is idempotent. 
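To see the kind of pods the job removes, the completed pods on the shared cluster can be listed by hand (a sketch; it assumes gcloud access to the cluster project and does not apply the job's retention-days filter):

```
gcloud container clusters get-credentials workloads-prod-v1 \
    --region us-west1 --project moz-fx-data-airflow-gke-prod
kubectl get pods --all-namespaces --field-selector=status.phase=Succeeded
```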
12 | 13 | Built from cloudops-infra repo, projects/airflow/pod-clean 14 | 15 | #### Purpose 16 | 17 | This DAG executes a GKEPodOperator to clean out old completed pods 18 | on the shared workloads-prod-v1 gke cluster. We need to do this periodically 19 | because GCP has a 1500 object limit quota. 20 | 21 | #### Owner 22 | 23 | hwoo@mozilla.com 24 | """ 25 | 26 | 27 | default_args = { 28 | "owner": "hwoo@mozilla.com", 29 | "depends_on_past": False, 30 | "start_date": datetime(2019, 12, 26), 31 | "email_on_failure": True, 32 | "email_on_retry": True, 33 | "retries": 2, 34 | "retry_delay": timedelta(minutes=30), 35 | } 36 | 37 | tags = [ 38 | Tag.ImpactTier.tier_3, 39 | Tag.Triage.no_triage, 40 | ] 41 | 42 | dag = DAG( 43 | "clean-gke-pods", 44 | default_args=default_args, 45 | schedule_interval="@daily", 46 | doc_md=docs, 47 | tags=tags, 48 | ) 49 | 50 | # docker_image = 'us-west1-docker.pkg.dev/moz-fx-data-airflow-prod-88e0/data-science-artifacts/gke-pod-clean:1.3' 51 | docker_image = "gcr.io/moz-fx-data-airflow-prod-88e0/gke-pod-clean:1.4" 52 | gke_cluster_name = "workloads-prod-v1" 53 | gke_location = "us-west1" 54 | project_id = "moz-fx-data-airflow-gke-prod" 55 | 56 | docker_args = [ 57 | "--project", 58 | project_id, 59 | "--gke-cluster", 60 | gke_cluster_name, 61 | "--region", 62 | gke_location, 63 | "--retention-days", 64 | "4", 65 | ] 66 | 67 | clean_gke_pods = GKEPodOperator( 68 | task_id="clean-gke-pods", 69 | name="clean-gke-pods", 70 | image=docker_image, 71 | arguments=docker_args, 72 | dag=dag, 73 | ) 74 | -------------------------------------------------------------------------------- /dags/contextual_services_import.py: -------------------------------------------------------------------------------- 1 | """ 2 | Runs a Docker image that imports Quicksuggest suggestions from Remote Settings to BigQuery. 3 | 4 | See the [`quicksuggest2bq`](https://github.com/mozilla/docker-etl/tree/main/jobs/quicksuggest2bq) 5 | docker image defined in `docker-etl`. 
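Suggestions are loaded into `moz-fx-data-shared-prod.search_terms_derived.remotesettings_suggestions_v1` (the project and table passed to the job below). A quick spot check after a run (a sketch; assumes the `bq` CLI and read access to the table):

```
bq query --use_legacy_sql=false \
  'SELECT COUNT(*) FROM `moz-fx-data-shared-prod.search_terms_derived.remotesettings_suggestions_v1`'
```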
6 | """ 7 | 8 | from datetime import datetime, timedelta 9 | 10 | from airflow import DAG 11 | 12 | from operators.gcp_container_operator import GKEPodOperator 13 | from utils.tags import Tag 14 | 15 | default_args = { 16 | "owner": "wstuckey@mozilla.com", 17 | "depends_on_past": False, 18 | "start_date": datetime(2021, 11, 18), 19 | "email_on_failure": True, 20 | "email_on_retry": True, 21 | "retries": 2, 22 | "retry_delay": timedelta(minutes=30), 23 | } 24 | 25 | project_id = "moz-fx-data-shared-prod" 26 | table_id = "search_terms_derived.remotesettings_suggestions_v1" 27 | 28 | tags = [Tag.ImpactTier.tier_1] 29 | 30 | with DAG( 31 | "contextual_services_import", 32 | default_args=default_args, 33 | doc_md=__doc__, 34 | schedule_interval="@daily", 35 | tags=tags, 36 | ) as dag: 37 | quicksuggest2bq = GKEPodOperator( 38 | task_id="quicksuggest2bq", 39 | arguments=[ 40 | "python", 41 | "quicksuggest2bq/main.py", 42 | "--destination-project", 43 | project_id, 44 | "--destination-table-id", 45 | table_id, 46 | ], 47 | image="gcr.io/moz-fx-data-airflow-prod-88e0/quicksuggest2bq_docker_etl:latest", 48 | gcp_conn_id="google_cloud_airflow_gke", 49 | dag=dag, 50 | email=[ 51 | "wstuckey@mozilla.com", 52 | "ctroy@mozilla.com", 53 | ], 54 | ) 55 | -------------------------------------------------------------------------------- /dags/crash_symbolication.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generates "Weekly report of modules with missing symbols in crash reports" and sends it to the Stability list. 3 | 4 | Generates correlations data for top crashers. 5 | 6 | Uses crash report data imported from Socorro. 7 | """ 8 | import datetime 9 | 10 | from airflow import DAG 11 | from airflow.operators.subdag import SubDagOperator 12 | from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook 13 | from airflow.sensors.external_task import ExternalTaskSensor 14 | 15 | from utils.constants import ALLOWED_STATES, FAILED_STATES 16 | from utils.dataproc import get_dataproc_parameters, moz_dataproc_pyspark_runner 17 | from utils.tags import Tag 18 | 19 | default_args = { 20 | "owner": "srose@mozilla.com", 21 | "depends_on_past": False, 22 | "start_date": datetime.datetime(2020, 11, 26), 23 | "email": [ 24 | "mcastelluccio@mozilla.com", 25 | "srose@mozilla.com", 26 | "telemetry-alerts@mozilla.com", 27 | ], 28 | "email_on_failure": True, 29 | "email_on_retry": True, 30 | "retries": 2, 31 | "retry_delay": datetime.timedelta(minutes=30), 32 | } 33 | 34 | PIP_PACKAGES = [ 35 | "boto3==1.16.20", 36 | "scipy==1.5.4", 37 | "google-cloud-storage==2.7.0", 38 | ] 39 | 40 | tags = [Tag.ImpactTier.tier_3] 41 | 42 | with DAG( 43 | "crash_symbolication", 44 | default_args=default_args, 45 | # dag runs daily but tasks only run on certain days 46 | schedule_interval="0 5 * * *", 47 | tags=tags, 48 | doc_md=__doc__, 49 | ) as dag: 50 | # modules_with_missing_symbols sends results as email 51 | ses_aws_conn_id = "aws_data_iam_ses" 52 | ses_access_key, ses_secret_key, _ = AwsBaseHook( 53 | aws_conn_id=ses_aws_conn_id, client_type="s3" 54 | ).get_credentials() 55 | 56 | wait_for_socorro_import = ExternalTaskSensor( 57 | task_id="wait_for_socorro_import", 58 | external_dag_id="socorro_import", 59 | external_task_id="bigquery_load", 60 | check_existence=True, 61 | execution_delta=datetime.timedelta(hours=5), 62 | mode="reschedule", 63 | allowed_states=ALLOWED_STATES, 64 | failed_states=FAILED_STATES, 65 | pool="DATA_ENG_EXTERNALTASKSENSOR", 66 | email_on_retry=False, 67 
| ) 68 | 69 | params = get_dataproc_parameters("google_cloud_airflow_dataproc") 70 | 71 | modules_with_missing_symbols = SubDagOperator( 72 | task_id="modules_with_missing_symbols", 73 | subdag=moz_dataproc_pyspark_runner( 74 | parent_dag_name=dag.dag_id, 75 | image_version="1.5-debian10", 76 | dag_name="modules_with_missing_symbols", 77 | default_args=default_args, 78 | cluster_name="modules-with-missing-symbols-{{ ds }}", 79 | job_name="modules-with-missing-symbols", 80 | python_driver_code="https://raw.githubusercontent.com/mozilla/python_mozetl/main/mozetl/symbolication/modules_with_missing_symbols.py", 81 | init_actions_uris=[ 82 | "gs://dataproc-initialization-actions/python/pip-install.sh" 83 | ], 84 | additional_metadata={"PIP_PACKAGES": " ".join(PIP_PACKAGES)}, 85 | additional_properties={ 86 | "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar", 87 | "spark-env:AWS_ACCESS_KEY_ID": ses_access_key, 88 | "spark-env:AWS_SECRET_ACCESS_KEY": ses_secret_key, 89 | }, 90 | py_args=["--run-on-days", "0", "--date", "{{ ds }}"], # run monday 91 | idle_delete_ttl=14400, 92 | num_workers=2, 93 | worker_machine_type="n1-standard-4", 94 | gcp_conn_id=params.conn_id, 95 | service_account=params.client_email, 96 | storage_bucket=params.storage_bucket, 97 | ), 98 | ) 99 | 100 | top_signatures_correlations = SubDagOperator( 101 | task_id="top_signatures_correlations", 102 | subdag=moz_dataproc_pyspark_runner( 103 | parent_dag_name=dag.dag_id, 104 | image_version="1.5-debian10", 105 | dag_name="top_signatures_correlations", 106 | default_args=default_args, 107 | cluster_name="top-signatures-correlations-{{ ds }}", 108 | job_name="top-signatures-correlations", 109 | python_driver_code="https://raw.githubusercontent.com/mozilla/python_mozetl/main/mozetl/symbolication/top_signatures_correlations.py", 110 | init_actions_uris=[ 111 | "gs://dataproc-initialization-actions/python/pip-install.sh" 112 | ], 113 | additional_metadata={"PIP_PACKAGES": " ".join(PIP_PACKAGES)}, 114 | additional_properties={ 115 | "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar", 116 | }, 117 | py_args=[ 118 | # run monday, wednesday, and friday 119 | "--run-on-days", 120 | "0", 121 | "2", 122 | "4", 123 | "--date", 124 | "{{ ds }}", 125 | ], 126 | idle_delete_ttl=14400, 127 | num_workers=2, 128 | worker_machine_type="n1-standard-8", 129 | gcp_conn_id=params.conn_id, 130 | service_account=params.client_email, 131 | storage_bucket=params.storage_bucket, 132 | ), 133 | ) 134 | 135 | wait_for_socorro_import >> modules_with_missing_symbols 136 | wait_for_socorro_import >> top_signatures_correlations 137 | -------------------------------------------------------------------------------- /dags/dap_collector.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | 5 | from operators.gcp_container_operator import GKEPodOperator 6 | from utils.tags import Tag 7 | 8 | DOCS = """ 9 | ### DAP Collector 10 | 11 | #### Description 12 | 13 | Runs a Docker image that collects data from a DAP (Distributed Aggregation Protocol) leader and stores it in BigQuery. 
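Aggregates are written to `moz-fx-data-shared-prod.dap_collector_derived.aggregates_v1` (the project and table passed to the collector below). On a local `make up` stack, the Airflow variables listed below can be seeded through the webserver container (a sketch; the placeholder values are illustrative -- the real ones are secrets that come from the DAP deployment):

```
docker-compose exec airflow-webserver airflow variables set dap_auth_token '<token>'
docker-compose exec airflow-webserver airflow variables set dap_hpke_private_key '<key>'
docker-compose exec airflow-webserver airflow variables set dap_task_config_url '<url>'
```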
14 | 15 | The container is defined in 16 | [docker-etl](https://github.com/mozilla/docker-etl/tree/main/jobs/dap-collector) 17 | 18 | For more information on Privacy Preserving Measurement in Firefox see 19 | https://bugzilla.mozilla.org/show_bug.cgi?id=1775035 20 | 21 | This DAG requires following variables to be defined in Airflow: 22 | * dap_auth_token 23 | * dap_hpke_private_key 24 | * dap_task_config_url 25 | 26 | This job is under active development, occasional failures are expected. 27 | 28 | #### Owner 29 | 30 | sfriedberger@mozilla.com 31 | """ 32 | 33 | default_args = { 34 | "owner": "sfriedberger@mozilla.com", 35 | "email": ["akomarzewski@mozilla.com", "sfriedberger@mozilla.com"], 36 | "depends_on_past": False, 37 | "start_date": datetime(2023, 3, 8), 38 | "email_on_failure": True, 39 | "email_on_retry": True, 40 | "retries": 1, 41 | "retry_delay": timedelta(hours=2), 42 | } 43 | 44 | project_id = "moz-fx-data-shared-prod" 45 | table_id = "dap_collector_derived.aggregates_v1" 46 | 47 | tags = [ 48 | Tag.ImpactTier.tier_3, 49 | Tag.Triage.no_triage, 50 | ] 51 | 52 | with DAG( 53 | "dap_collector", 54 | default_args=default_args, 55 | doc_md=DOCS, 56 | schedule_interval="@daily", 57 | tags=tags, 58 | ) as dag: 59 | dap_collector = GKEPodOperator( 60 | task_id="dap_collector", 61 | arguments=[ 62 | "python", 63 | "dap_collector/main.py", 64 | "--date={{ ds }}", 65 | "--auth-token={{ var.value.dap_auth_token }}", 66 | "--hpke-private-key={{ var.value.dap_hpke_private_key }}", 67 | "--task-config-url={{ var.value.dap_task_config_url }}", 68 | "--project", 69 | project_id, 70 | "--table-id", 71 | table_id, 72 | ], 73 | image="gcr.io/moz-fx-data-airflow-prod-88e0/dap-collector_docker_etl:latest", 74 | gcp_conn_id="google_cloud_airflow_gke", 75 | ) 76 | -------------------------------------------------------------------------------- /dags/dap_collector_ppa_dev.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from airflow import DAG 4 | 5 | from operators.gcp_container_operator import GKEPodOperator 6 | from utils.tags import Tag 7 | 8 | DOCS = """ 9 | ### PPA Dev DAP Collector 10 | 11 | #### Description 12 | 13 | Runs a Docker image that collects PPA Dev Environment data from a DAP (Distributed Aggregation Protocol) leader and stores it in BigQuery. 14 | 15 | The container is defined in 16 | [docker-etl](https://github.com/mozilla/docker-etl/tree/main/jobs/dap-collector-ppa-dev) 17 | 18 | This DAG requires following variables to be defined in Airflow: 19 | * dap_ppa_dev_auth_token 20 | * dap_ppa_dev_hpke_private_key 21 | * dap_ppa_dev_task_config_url 22 | * dap_ppa_dev_ad_config_url 23 | 24 | This job is under active development, occasional failures are expected. 
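The collection date passed to the job is `{{ data_interval_end.at(0) | ts }}`: the end of the run's data interval, truncated to midnight and rendered as an ISO-8601 timestamp. As a worked example (assuming standard Airflow data-interval semantics), with the `15 0 * * *` schedule the run whose interval starts 2024-04-30 00:15 UTC ends at 2024-05-01 00:15 UTC, so the job is called with `--date=2024-05-01T00:00:00+00:00`.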
25 | 26 | #### Owner 27 | 28 | bbirdsong@mozilla.com 29 | """ 30 | 31 | default_args = { 32 | "owner": "bbirdsong@mozilla.com", 33 | "email": ["ads-eng@mozilla.com", "bbirdsong@mozilla.com"], 34 | "depends_on_past": False, 35 | "start_date": datetime(2024, 4, 30), 36 | "email_on_failure": True, 37 | "email_on_retry": False, 38 | "retries": 0, 39 | } 40 | 41 | project_id = "moz-fx-ads-nonprod" 42 | ad_table_id = "ppa_dev.measurements" 43 | report_table_id = "ppa_dev.reports" 44 | 45 | tags = [ 46 | Tag.ImpactTier.tier_3, 47 | Tag.Triage.no_triage, 48 | ] 49 | 50 | 51 | with DAG( 52 | "dap_collector_ppa_dev", 53 | default_args=default_args, 54 | doc_md=DOCS, 55 | schedule_interval="15 0 * * *", 56 | tags=tags, 57 | catchup=False, 58 | ) as dag: 59 | dap_collector = GKEPodOperator( 60 | task_id="dap_collector_ppa_dev", 61 | arguments=[ 62 | "python", 63 | "dap_collector_ppa_dev/main.py", 64 | "--date={{ data_interval_end.at(0) | ts }}", 65 | "--auth-token={{ var.value.dap_ppa_dev_auth_token }}", 66 | "--hpke-private-key={{ var.value.dap_ppa_dev_hpke_private_key }}", 67 | "--task-config-url={{ var.value.dap_ppa_dev_task_config_url }}", 68 | "--ad-config-url={{ var.value.dap_ppa_dev_ad_config_url }}", 69 | "--project", 70 | project_id, 71 | "--ad-table-id", 72 | ad_table_id, 73 | "--report-table-id", 74 | report_table_id, 75 | ], 76 | image="gcr.io/moz-fx-data-airflow-prod-88e0/dap-collector-ppa-dev_docker_etl:latest", 77 | gcp_conn_id="google_cloud_airflow_gke", 78 | ) 79 | -------------------------------------------------------------------------------- /dags/dap_collector_ppa_prod.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from airflow import DAG 4 | from airflow.providers.cncf.kubernetes.secret import Secret 5 | 6 | from operators.gcp_container_operator import GKEPodOperator 7 | from utils.tags import Tag 8 | 9 | DOCS = """ 10 | ### PPA Prod DAP Collector 11 | 12 | #### Description 13 | 14 | Runs a Docker image that collects PPA Prod Environment data from a DAP (Distributed Aggregation Protocol) leader and stores it in BigQuery. 15 | 16 | The container is defined in 17 | [docker-etl](https://github.com/mozilla/docker-etl/tree/main/jobs/dap-collector-ppa-prod) 18 | 19 | This DAG requires following variables to be defined in Airflow: 20 | * dap_ppa_prod_auth_token 21 | * dap_ppa_prod_hpke_private_key 22 | * dap_ppa_prod_task_config_url 23 | * dap_ppa_prod_ad_config_url 24 | 25 | This job is under active development, occasional failures are expected. 
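Unlike the dev collector, the prod job does not read the auth token or HPKE private key from Airflow variables: they are injected as the `AUTH_TOKEN` and `HPKE_PRIVATE_KEY` environment variables from the `airflow-gke-secrets` Kubernetes secret (keys `DAP_PPA_PROD_AUTH_TOKEN` and `DAP_PPA_PROD_HPKE_PRIVATE_KEY`), so only the task-config and ad-config URLs come from Airflow variables.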
26 | 27 | #### Owner 28 | 29 | bbirdsong@mozilla.com 30 | """ 31 | 32 | default_args = { 33 | "owner": "bbirdsong@mozilla.com", 34 | "email": ["ads-eng@mozilla.com", "bbirdsong@mozilla.com"], 35 | "depends_on_past": False, 36 | "start_date": datetime(2024, 6, 26), 37 | "email_on_failure": True, 38 | "email_on_retry": False, 39 | "retries": 0, 40 | } 41 | 42 | project_id = "moz-fx-ads-prod" 43 | ad_table_id = "ppa.measurements" 44 | report_table_id = "ppa.reports" 45 | 46 | tags = [ 47 | Tag.ImpactTier.tier_3, 48 | Tag.Triage.no_triage, 49 | ] 50 | 51 | hpke_private_key = Secret( 52 | deploy_type="env", 53 | deploy_target="HPKE_PRIVATE_KEY", 54 | secret="airflow-gke-secrets", 55 | key="DAP_PPA_PROD_HPKE_PRIVATE_KEY", 56 | ) 57 | 58 | auth_token = Secret( 59 | deploy_type="env", 60 | deploy_target="AUTH_TOKEN", 61 | secret="airflow-gke-secrets", 62 | key="DAP_PPA_PROD_AUTH_TOKEN", 63 | ) 64 | 65 | with DAG( 66 | "dap_collector_ppa_prod", 67 | default_args=default_args, 68 | doc_md=DOCS, 69 | schedule_interval="15 0 * * *", 70 | tags=tags, 71 | catchup=False, 72 | ) as dag: 73 | dap_collector = GKEPodOperator( 74 | task_id="dap_collector_ppa_prod", 75 | arguments=[ 76 | "python", 77 | "dap_collector_ppa_prod/main.py", 78 | "--date={{ data_interval_end.at(0) | ts }}", 79 | "--task-config-url={{ var.value.dap_ppa_prod_task_config_url }}", 80 | "--ad-config-url={{ var.value.dap_ppa_prod_ad_config_url }}", 81 | "--project", 82 | project_id, 83 | "--ad-table-id", 84 | ad_table_id, 85 | "--report-table-id", 86 | report_table_id, 87 | ], 88 | image="gcr.io/moz-fx-data-airflow-prod-88e0/dap-collector-ppa-prod_docker_etl:latest", 89 | gcp_conn_id="google_cloud_airflow_gke", 90 | secrets=[ 91 | hpke_private_key, 92 | auth_token, 93 | ], 94 | ) 95 | -------------------------------------------------------------------------------- /dags/dbt_daily.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | from airflow.providers.dbt.cloud.operators.dbt import DbtCloudRunJobOperator 6 | from airflow.sensors.external_task import ExternalTaskSensor 7 | 8 | from utils.constants import ALLOWED_STATES, FAILED_STATES 9 | from utils.tags import Tag 10 | 11 | DOCS = """\ 12 | # DBT Daily 13 | 14 | This triggers jobs configured in dbt Cloud to run daily scheduled models that depend 15 | on other Airflow jobs. 16 | 17 | *Triage notes* 18 | 19 | DBT accounts are limited at the moment, so it might not be possible to get more visibility 20 | into failing jobs at the moment. 
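If the trigger task fails early, one hypothetical sanity check is to confirm that the `dbt_cloud` connection and the `dbt_account_id` variable referenced by this DAG exist in the Airflow environment:

```python
from airflow.hooks.base import BaseHook
from airflow.models import Variable

# Both lookups raise if the referenced connection/variable is missing.
conn = BaseHook.get_connection("dbt_cloud")
account_id = Variable.get("dbt_account_id")
print(conn.conn_type, account_id)
```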
21 | """ 22 | 23 | default_args = { 24 | "owner": "ascholtz@mozilla.com", 25 | "depends_on_past": False, 26 | "start_date": datetime(2024, 7, 31), 27 | "email_on_failure": True, 28 | "email_on_retry": True, 29 | "retries": 2, 30 | "retry_delay": timedelta(minutes=30), 31 | "dbt_cloud_conn_id": "dbt_cloud", 32 | "account_id": "{{ var.value.dbt_account_id }}" 33 | } 34 | 35 | tags = [ 36 | Tag.ImpactTier.tier_3, 37 | Tag.Triage.no_triage, 38 | ] 39 | 40 | 41 | with DAG( 42 | "dbt_daily", 43 | doc_md=DOCS, 44 | max_active_runs=1, 45 | default_args=default_args, 46 | schedule_interval="0 4 * * 0", 47 | tags=tags, 48 | ) as dag: 49 | wait_for_copy_deduplicate = ExternalTaskSensor( 50 | task_id="wait_for_copy_deduplicate", 51 | external_dag_id="copy_deduplicate", 52 | external_task_id="copy_deduplicate_all", 53 | execution_delta=timedelta(hours=3), 54 | mode="reschedule", 55 | allowed_states=ALLOWED_STATES, 56 | failed_states=FAILED_STATES, 57 | pool="DATA_ENG_EXTERNALTASKSENSOR", 58 | email_on_retry=False, 59 | dag=dag, 60 | ) 61 | 62 | # runs dbt jobs tagged with "refresh_daily" and "scheduled_in_airflow" 63 | trigger_dbt_daily_cloud_run_job = DbtCloudRunJobOperator( 64 | task_id="trigger_dbt_daily_cloud_run_job", 65 | job_id=684764, 66 | check_interval=10, 67 | timeout=300, 68 | ) 69 | 70 | wait_for_copy_deduplicate >> trigger_dbt_daily_cloud_run_job 71 | -------------------------------------------------------------------------------- /dags/eam_slack_channels.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from airflow import DAG 4 | from airflow.providers.cncf.kubernetes.secret import Secret 5 | 6 | from operators.gcp_container_operator import GKEPodOperator 7 | from utils.tags import Tag 8 | 9 | DOCS = """ 10 | ### Slack Channels integration 11 | Runs a script in docker image that 12 | - will archive unused channels 13 | - delete old archived channels 14 | 15 | [docker-etl](https://github.com/mozilla/docker-etl/tree/main/jobs/eam-integrations) 16 | 17 | This DAG requires the creation of an Airflow Jira connection. 
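A minimal sketch of what that connection could look like; the host, credentials, and connection type below are placeholders, but the DAG code expects the connection id `eam_jira_connection_id`:

```python
from airflow.models import Connection
from airflow.settings import Session

# Registers the Jira connection in the Airflow metadata database.
session = Session()
session.add(
    Connection(
        conn_id="eam_jira_connection_id",
        conn_type="jira",
        host="https://example.atlassian.net",
        login="service-account@example.com",
        password="api-token-placeholder",
    )
)
session.commit()
```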
18 | 19 | #### Owner 20 | jmoscon@mozilla.com 21 | 22 | """ 23 | 24 | 25 | def get_airflow_log_link(context): 26 | import urllib.parse 27 | 28 | dag_run_id = context["dag_run"].run_id 29 | task_id = context["task_instance"].task_id 30 | base_url = "http://workflow.telemetry.mozilla.org/dags/" 31 | base_url += "eam-slack-channels-integration/grid?tab=logs&dag_run_id=" 32 | return base_url + f"{urllib.parse.quote(dag_run_id)}&task_id={task_id}" 33 | 34 | 35 | def create_jira_ticket(context): 36 | import json 37 | import logging 38 | 39 | import requests 40 | from airflow.providers.atlassian.jira.hooks.jira import JiraHook 41 | from requests.auth import HTTPBasicAuth 42 | 43 | logger = logging.getLogger(__name__) 44 | logger.info("Creating Jira ticket ...") 45 | 46 | conn_id = "eam_jira_connection_id" 47 | conn = JiraHook( 48 | jira_conn_id=conn_id, 49 | ).get_connection(conn_id) 50 | log_url = get_airflow_log_link(context) 51 | 52 | jira_domain = "mozilla-hub-sandbox-721.atlassian.net" 53 | url = f"https://{jira_domain}/rest/api/3/issue" 54 | headers = {"Accept": "application/json", "Content-Type": "application/json"} 55 | auth = HTTPBasicAuth(conn.login, conn.password) 56 | summary = "Slack Channels Integration - Airflow Task Issue Exception" 57 | paragraph_text = "Detailed error logging can be found in the link: " 58 | project_key = "ASP" 59 | issue_type_id = "10020" # Issue Type = Bug 60 | assignee_id = "712020:b999000a-67b1-45ff-8b40-42a5ceeee75b" # Julio 61 | payload = json.dumps( 62 | { 63 | "fields": { 64 | "assignee": {"id": assignee_id}, 65 | "project": {"key": project_key}, 66 | "summary": summary, 67 | "description": { 68 | "type": "doc", 69 | "version": 1, 70 | "content": [ 71 | { 72 | "type": "paragraph", 73 | "content": [ 74 | { 75 | "type": "text", 76 | "text": paragraph_text, 77 | }, 78 | { 79 | "type": "text", 80 | "text": "Mozilla-Telemetry log.", 81 | "marks": [ 82 | { 83 | "type": "link", 84 | "attrs": {"href": f"{log_url}"}, 85 | } 86 | ], 87 | }, 88 | ], 89 | } 90 | ], 91 | }, 92 | "issuetype": {"id": issue_type_id}, 93 | } 94 | } 95 | ) 96 | 97 | response = requests.post(url, headers=headers, auth=auth, data=payload) 98 | logger.info(f"response.text={response.text}") 99 | if response.status_code == 201: 100 | logger.info("Issue created successfully.") 101 | return response.json() 102 | else: 103 | logger.info( 104 | f"Failed to create issue. 
Status code:" 105 | f"{response.status_code}, Response: {response.text}" 106 | ) 107 | return None 108 | 109 | 110 | default_args = { 111 | "owner": "jmoscon@mozilla.com", 112 | "emails": ["jmoscon@mozilla.com"], 113 | "start_date": datetime.datetime(2024, 1, 1), 114 | "retries": 3, 115 | # wait 5 min before retry 116 | "retry_delay": datetime.timedelta(minutes=5), 117 | "on_failure_callback": create_jira_ticket, 118 | } 119 | tags = [Tag.ImpactTier.tier_3] 120 | 121 | 122 | SLACK_CHANNEL_TOKEN = Secret( 123 | deploy_type="env", 124 | deploy_target="SLACK_CHANNEL_TOKEN", 125 | secret="airflow-gke-secrets", 126 | key="SLACK_CHANNEL_TOKEN", 127 | ) 128 | 129 | with DAG( 130 | "eam-slack-channels-integration", 131 | default_args=default_args, 132 | doc_md=DOCS, 133 | tags=tags, 134 | # 10 PM standard time (PST, UTC-8) every day 135 | schedule_interval="0 6 * * *", 136 | ) as dag: 137 | slack_channels_dag = GKEPodOperator( 138 | task_id="eam_slack_channels", 139 | arguments=[ 140 | "python", 141 | "scripts/slack_channels_integration.py", 142 | "--level", 143 | "info", 144 | ], 145 | image="gcr.io/moz-fx-data-airflow-prod-88e0/" 146 | + "eam-integrations_docker_etl:latest", 147 | gcp_conn_id="google_cloud_airflow_gke", 148 | secrets=[ 149 | SLACK_CHANNEL_TOKEN, 150 | ], 151 | ) 152 | -------------------------------------------------------------------------------- /dags/experiment_auto_sizing.py: -------------------------------------------------------------------------------- 1 | """ 2 | Powers the [auto-sizing](https://github.com/mozilla/auto-sizing) tool 3 | for computing experiment sizing information for various configurations. 4 | 5 | *Triage notes* 6 | TBD 7 | """ # noqa: D205 8 | 9 | from datetime import datetime, timedelta 10 | 11 | from airflow import DAG 12 | from airflow.sensors.external_task import ExternalTaskSensor 13 | 14 | from operators.gcp_container_operator import GKEPodOperator 15 | from utils.constants import ALLOWED_STATES, FAILED_STATES 16 | from utils.tags import Tag 17 | 18 | default_args = { 19 | "owner": "mwilliams@mozilla.com", 20 | "email": ["mwilliams@mozilla.com", "ascholtz@mozilla.com", "mbowerman@mozilla.com"], 21 | "depends_on_past": False, 22 | "start_date": datetime(2023, 4, 15), 23 | "email_on_failure": True, 24 | "email_on_retry": True, 25 | "retries": 2, 26 | "retry_delay": timedelta(minutes=30), 27 | } 28 | 29 | tags = [Tag.ImpactTier.tier_1] 30 | 31 | with DAG( 32 | "experiment_auto_sizing", 33 | default_args=default_args, 34 | schedule_interval="0 6 * * 0", # 6am every Sunday, after Jetstream 35 | doc_md=__doc__, 36 | tags=tags, 37 | ) as dag: 38 | # Built from repo https://github.com/mozilla/auto-sizing 39 | auto_sizing_image = "gcr.io/moz-fx-data-experiments/auto_sizing:latest" 40 | 41 | auto_sizing_run = GKEPodOperator( 42 | task_id="auto_sizing_run", 43 | name="auto_sizing_run", 44 | image=auto_sizing_image, 45 | email=default_args["email"], 46 | arguments=[ 47 | "--log-to-bigquery", 48 | "run-argo", 49 | "--bucket=mozanalysis", 50 | "--dataset-id=auto_sizing", 51 | # the Airflow cluster doesn't have Compute Engine API access so pass in IP 52 | # and certificate in order for the pod to connect to the Kubernetes cluster 53 | # running Jetstream/auto-sizing 54 | "--cluster-ip={{ var.value.jetstream_cluster_ip }}", 55 | "--cluster-cert={{ var.value.jetstream_cluster_cert }}", 56 | ], 57 | dag=dag, 58 | ) 59 | 60 | wait_for_jetstream = ExternalTaskSensor( 61 | task_id="wait_for_jetstream", 62 | external_dag_id="jetstream", 63 | 
external_task_id="jetstream_run_config_changed", 64 | execution_delta=timedelta(hours=2), 65 | mode="reschedule", 66 | allowed_states=ALLOWED_STATES, 67 | failed_states=FAILED_STATES, 68 | pool="DATA_ENG_EXTERNALTASKSENSOR", 69 | email_on_retry=False, 70 | dag=dag, 71 | ) 72 | 73 | auto_sizing_run.set_upstream(wait_for_jetstream) 74 | -------------------------------------------------------------------------------- /dags/experiments_live.py: -------------------------------------------------------------------------------- 1 | """ 2 | See [experiments-monitoring-data-export in the docker-etl repository](https://github.com/mozilla/docker-etl/tree/main/jobs/experiments-monitoring-data-export). 3 | 4 | This DAG exports views related to experiment monitoring to GCS as JSON 5 | every 5 minutes to power the Experimenter console. 6 | """ 7 | 8 | from datetime import datetime 9 | 10 | from airflow import DAG 11 | 12 | from operators.gcp_container_operator import GKEPodOperator 13 | from utils.tags import Tag 14 | 15 | default_args = { 16 | "owner": "ascholtz@mozilla.com", 17 | "depends_on_past": False, 18 | "start_date": datetime(2021, 1, 8), 19 | "email_on_failure": True, 20 | "email_on_retry": True, 21 | } 22 | 23 | tags = [Tag.ImpactTier.tier_2] 24 | 25 | # We rely on max_active_runs=1 at the DAG level to manage the dependency on past runs. 26 | with DAG( 27 | "experiments_live", 28 | default_args=default_args, 29 | max_active_tasks=4, 30 | max_active_runs=1, 31 | schedule_interval="*/5 * * * *", 32 | doc_md=__doc__, 33 | tags=tags, 34 | ) as dag: 35 | # list of datasets to export data to GCS 36 | experiment_datasets = [ 37 | "moz-fx-data-shared-prod.telemetry_derived.experiment_enrollment_other_events_overall_v1", 38 | "moz-fx-data-shared-prod.telemetry_derived.experiment_enrollment_cumulative_population_estimate_v1", 39 | "moz-fx-data-shared-prod.telemetry_derived.experiment_enrollment_overall_v1", 40 | "moz-fx-data-shared-prod.telemetry_derived.experiment_unenrollment_overall_v1", 41 | "moz-fx-data-shared-prod.telemetry_derived.experiment_cumulative_ad_clicks_v1", 42 | "moz-fx-data-shared-prod.telemetry_derived.experiment_cumulative_search_count_v1", 43 | "moz-fx-data-shared-prod.telemetry_derived.experiment_cumulative_search_with_ads_count_v1", 44 | "moz-fx-data-shared-prod.telemetry.experiment_enrollment_daily_active_population", 45 | ] 46 | 47 | experiment_enrollment_export = GKEPodOperator( 48 | task_id="experiment_enrollment_export", 49 | arguments=[ 50 | "python", 51 | "experiments_monitoring_data_export/export.py", 52 | "--datasets", 53 | *experiment_datasets, 54 | ], 55 | image="gcr.io/moz-fx-data-airflow-prod-88e0/experiments-monitoring-data-export_docker_etl:latest", 56 | dag=dag, 57 | ) 58 | -------------------------------------------------------------------------------- /dags/extensions.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from airflow import DAG 4 | 5 | from operators.gcp_container_operator import GKEPodOperator 6 | 7 | docs = """ 8 | ### extensions 9 | 10 | Loads the table moz-fx-data-shared-prod.external_derived.chrome_extensions_v1 11 | 12 | Note - if it fails, please alert the DAG owner, but do not re-run. 
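One way to verify a run without re-running it is a hypothetical spot check of the destination table; the `submission_date` column name here is an assumption for illustration only:

```python
import datetime

from google.cloud import bigquery

client = bigquery.Client(project="moz-fx-data-shared-prod")
query = (
    "SELECT COUNT(*) AS n "
    "FROM `moz-fx-data-shared-prod.external_derived.chrome_extensions_v1` "
    "WHERE submission_date = @run_date"  # assumed partition column
)
job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ScalarQueryParameter("run_date", "DATE", datetime.date(2025, 4, 13))
    ]
)
rows = client.query(query, job_config=job_config).result()
print(next(iter(rows))["n"])
```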
13 | 14 | Owner: kwindau@mozilla.com 15 | """ 16 | 17 | default_args = { 18 | "owner": "kwindau@mozilla.com", 19 | "start_date": datetime.datetime(2025, 4, 13, 0, 0), 20 | "end_date": None, 21 | "email": ["kwindau@mozilla.com"], 22 | "depends_on_past": False, 23 | "retry_delay": datetime.timedelta(seconds=1800), 24 | "email_on_failure": True, 25 | "email_on_retry": True, 26 | "retries": 2, 27 | } 28 | 29 | tags = ["impact/tier_3", "repo/telemetry-airflow"] 30 | SERVER = "moz-fx-data-airflow-prod-88e0" 31 | IMAGE_NAME = "extensions_docker_etl:latest" 32 | 33 | with DAG( 34 | "extensions", 35 | default_args=default_args, 36 | schedule_interval="0 15 * * *", 37 | doc_md=docs, 38 | tags=tags, 39 | ) as dag: 40 | pull_extensions = GKEPodOperator( 41 | task_id="pull_extensions", 42 | arguments=[ 43 | "python", 44 | "extensions/main.py", 45 | "--date", 46 | "{{ ds }}", 47 | ], 48 | image=f"gcr.io/{SERVER}/{IMAGE_NAME}", 49 | gcp_conn_id="google_cloud_airflow_gke", 50 | ) 51 | -------------------------------------------------------------------------------- /dags/firefox_public_data_report.py: -------------------------------------------------------------------------------- 1 | """ 2 | Powers the public https://data.firefox.com/ dashboard. 3 | 4 | Source code is in the [firefox-public-data-report-etl repository] 5 | (https://github.com/mozilla/firefox-public-data-report-etl). 6 | """ 7 | 8 | from datetime import datetime, timedelta 9 | 10 | from airflow import DAG 11 | from airflow.sensors.external_task import ExternalTaskSensor 12 | 13 | from operators.gcp_container_operator import GKEPodOperator 14 | from utils.constants import ALLOWED_STATES, FAILED_STATES 15 | from utils.gcp import bigquery_etl_query 16 | from utils.tags import Tag 17 | 18 | default_args = { 19 | "owner": "bewu@mozilla.com", 20 | "depends_on_past": False, 21 | "start_date": datetime(2020, 4, 6), 22 | "email": [ 23 | "telemetry-alerts@mozilla.com", 24 | "firefox-hardware-report-feedback@mozilla.com", 25 | "akomar@mozilla.com", 26 | "shong@mozilla.com", 27 | "bewu@mozilla.com", 28 | ], 29 | "email_on_failure": True, 30 | "email_on_retry": True, 31 | "retries": 2, 32 | "retry_delay": timedelta(minutes=10), 33 | } 34 | 35 | tags = [Tag.ImpactTier.tier_3] 36 | 37 | dag = DAG( 38 | "firefox_public_data_report", 39 | default_args=default_args, 40 | schedule_interval="0 1 * * MON", 41 | doc_md=__doc__, 42 | tags=tags, 43 | ) 44 | 45 | # hardware_report's execution date will be {now}-7days. 
It will read last week's main pings, 46 | # therefore we need to wait for yesterday's Main Ping deduplication task to finish 47 | wait_for_main_ping = ExternalTaskSensor( 48 | task_id="wait_for_main_ping", 49 | external_dag_id="copy_deduplicate", 50 | external_task_id="copy_deduplicate_main_ping", 51 | execution_delta=timedelta(days=-6), 52 | check_existence=True, 53 | mode="reschedule", 54 | allowed_states=ALLOWED_STATES, 55 | failed_states=FAILED_STATES, 56 | pool="DATA_ENG_EXTERNALTASKSENSOR", 57 | email_on_retry=False, 58 | dag=dag, 59 | ) 60 | 61 | hardware_report_query = bigquery_etl_query( 62 | task_id="hardware_report_query", 63 | destination_table="public_data_report_hardware_aggregates_v1", 64 | project_id="moz-fx-data-shared-prod", 65 | dataset_id="telemetry_derived", 66 | dag=dag, 67 | ) 68 | 69 | hardware_report_export = GKEPodOperator( 70 | task_id="hardware_report_export", 71 | name="hardware_report_export", 72 | image="gcr.io/moz-fx-data-airflow-prod-88e0/firefox-public-data-report-etl:latest", 73 | arguments=[ 74 | "-m", 75 | "public_data_report.cli", 76 | "hardware_report", 77 | "--date_from", 78 | "{{ ds }}", 79 | "--input_bq_table", 80 | "moz-fx-data-shared-prod.telemetry_derived.public_data_report_hardware_aggregates_v1", 81 | "--output_bq_table", 82 | "moz-fx-data-shared-prod.telemetry_derived.public_data_report_hardware_v1", 83 | "--gcs_bucket", 84 | "moz-fx-data-static-websit-8565-analysis-output", 85 | "--gcs_path", 86 | "public-data-report/hardware/", 87 | ], 88 | image_pull_policy="Always", 89 | dag=dag, 90 | ) 91 | 92 | wait_for_clients_last_seen = ExternalTaskSensor( 93 | task_id="wait_for_clients_last_seen", 94 | external_dag_id="bqetl_main_summary", 95 | external_task_id="telemetry_derived__clients_last_seen__v1", 96 | execution_delta=timedelta(days=-6, hours=-1), 97 | check_existence=True, 98 | mode="reschedule", 99 | allowed_states=ALLOWED_STATES, 100 | failed_states=FAILED_STATES, 101 | pool="DATA_ENG_EXTERNALTASKSENSOR", 102 | email_on_retry=False, 103 | dag=dag, 104 | ) 105 | 106 | user_activity = bigquery_etl_query( 107 | task_id="user_activity", 108 | destination_table="public_data_report_user_activity_v1", 109 | project_id="moz-fx-data-shared-prod", 110 | dataset_id="telemetry_derived", 111 | dag=dag, 112 | ) 113 | 114 | user_activity_usage_behavior_export = GKEPodOperator( 115 | task_id="user_activity_export", 116 | name="user_activity_export", 117 | image="gcr.io/moz-fx-data-airflow-prod-88e0/firefox-public-data-report-etl:latest", 118 | arguments=[ 119 | "-m", 120 | "public_data_report.cli", 121 | "user_activity", 122 | "--bq_table", 123 | "moz-fx-data-shared-prod.telemetry_derived.public_data_report_user_activity_v1", 124 | "--gcs_bucket", 125 | "moz-fx-data-static-websit-8565-analysis-output", 126 | "--gcs_path", 127 | "public-data-report/user_activity", 128 | ], 129 | image_pull_policy="Always", 130 | dag=dag, 131 | ) 132 | 133 | annotations_export = GKEPodOperator( 134 | task_id="annotations_export", 135 | name="annotations_export", 136 | image="gcr.io/moz-fx-data-airflow-prod-88e0/firefox-public-data-report-etl:latest", 137 | arguments=[ 138 | "-m", 139 | "public_data_report.cli", 140 | "annotations", 141 | "--date_to", 142 | "{{ ds }}", 143 | "--output_bucket", 144 | "moz-fx-data-static-websit-8565-analysis-output", 145 | "--output_prefix", 146 | "public-data-report/annotations", 147 | ], 148 | image_pull_policy="Always", 149 | dag=dag, 150 | ) 151 | 152 | ensemble_transposer = GKEPodOperator( 153 | task_id="ensemble_transposer", 154 | 
name="ensemble_transposer", 155 | image="gcr.io/moz-fx-data-airflow-prod-88e0/ensemble-transposer:latest", 156 | env_vars={ 157 | "GCS_BUCKET_NAME": "moz-fx-data-static-websit-8565-ensemble", 158 | }, 159 | image_pull_policy="Always", 160 | dag=dag, 161 | ) 162 | 163 | 164 | ( 165 | wait_for_main_ping 166 | >> hardware_report_query 167 | >> hardware_report_export 168 | >> ensemble_transposer 169 | ) 170 | ( 171 | wait_for_clients_last_seen 172 | >> user_activity 173 | >> user_activity_usage_behavior_export 174 | >> ensemble_transposer 175 | ) 176 | annotations_export >> ensemble_transposer 177 | -------------------------------------------------------------------------------- /dags/fxci_metric_export.py: -------------------------------------------------------------------------------- 1 | """ 2 | Exports Firefox-CI worker data from the Google Cloud Monitoring to BigQuery. 3 | 4 | The container is defined in [fxci-etl](https://github.com/mozilla-releng/fxci-etl). 5 | """ 6 | 7 | from datetime import datetime, timedelta 8 | 9 | from airflow import DAG 10 | from airflow.providers.cncf.kubernetes.secret import Secret 11 | 12 | from operators.gcp_container_operator import GKEPodOperator 13 | from utils.tags import Tag 14 | 15 | default_args = { 16 | "owner": "ahalberstadt@mozilla.com", 17 | "depends_on_past": False, 18 | "start_date": datetime(2024, 7, 8), 19 | "email_on_failure": True, 20 | "email_on_retry": False, 21 | "retries": 1, 22 | "retry_delay": timedelta(minutes=30), 23 | } 24 | 25 | tags = [Tag.ImpactTier.tier_3] 26 | 27 | env_vars = { 28 | "FXCI_ETL_BIGQUERY_PROJECT": "moz-fx-data-shared-prod", 29 | "FXCI_ETL_BIGQUERY_DATASET": "fxci_derived", 30 | "FXCI_ETL_STORAGE_PROJECT": "moz-fx-dev-releng", 31 | "FXCI_ETL_STORAGE_BUCKET": "fxci-etl", 32 | } 33 | 34 | secrets = [ 35 | Secret( 36 | deploy_type="env", 37 | deploy_target="FXCI_ETL_STORAGE_CREDENTIALS", 38 | secret="airflow-gke-secrets", 39 | key="fxci_etl_secret__gcp-credentials", 40 | ), 41 | Secret( 42 | deploy_type="env", 43 | deploy_target="FXCI_ETL_MONITORING_CREDENTIALS", 44 | secret="airflow-gke-secrets", 45 | key="fxci_etl_secret__gcp-credentials", 46 | ), 47 | ] 48 | 49 | with DAG( 50 | "fxci_metric_export", 51 | default_args=default_args, 52 | doc_md=__doc__, 53 | schedule_interval="30 0 * * *", 54 | tags=tags, 55 | ) as dag: 56 | fxci_metric_export = GKEPodOperator( 57 | task_id="fxci_metric_export", 58 | arguments=[ 59 | "fxci-etl", 60 | "metric", 61 | "export", 62 | "-vv", 63 | "--date={{ ds }}", 64 | ], 65 | env_vars=env_vars, 66 | secrets=secrets, 67 | image="gcr.io/moz-fx-data-airflow-prod-88e0/fxci-taskcluster-export_docker_etl:latest", 68 | gcp_conn_id="google_cloud_airflow_gke", 69 | dag=dag, 70 | email=[ 71 | "ahalberstadt@mozilla.com", 72 | ], 73 | ) 74 | -------------------------------------------------------------------------------- /dags/fxci_pulse_export.py: -------------------------------------------------------------------------------- 1 | """ 2 | Exports Firefox-CI task and run data from Taskcluster to BigQuery. 3 | 4 | This connects to and drains three separate Taskcluster pulse queues, and 5 | exports each message into BigQuery. 6 | 7 | The container is defined in [fxci-etl](https://github.com/mozilla-releng/fxci-etl). 
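The DAG below drains the queues on the "30 */4 * * *" schedule, i.e. every four hours at minute 30 (UTC). A small illustration of that cadence; croniter ships with Airflow's dependencies and the start date is arbitrary:

```python
from datetime import datetime

from croniter import croniter

it = croniter("30 */4 * * *", datetime(2024, 7, 8))
for _ in range(3):
    print(it.get_next(datetime))
# 2024-07-08 00:30:00
# 2024-07-08 04:30:00
# 2024-07-08 08:30:00
```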
8 | """ 9 | 10 | from datetime import datetime, timedelta 11 | 12 | from airflow import DAG 13 | from airflow.providers.cncf.kubernetes.secret import Secret 14 | 15 | from operators.gcp_container_operator import GKEPodOperator 16 | from utils.tags import Tag 17 | 18 | default_args = { 19 | "owner": "ahalberstadt@mozilla.com", 20 | "depends_on_past": False, 21 | "start_date": datetime(2024, 7, 8), 22 | "email_on_failure": True, 23 | "email_on_retry": False, 24 | "retries": 1, 25 | "retry_delay": timedelta(minutes=30), 26 | } 27 | 28 | tags = [Tag.ImpactTier.tier_3] 29 | 30 | env_vars = { 31 | "FXCI_ETL_BIGQUERY_PROJECT": "moz-fx-data-shared-prod", 32 | "FXCI_ETL_BIGQUERY_DATASET": "fxci_derived", 33 | "FXCI_ETL_STORAGE_PROJECT": "moz-fx-dev-releng", 34 | "FXCI_ETL_STORAGE_BUCKET": "fxci-etl", 35 | "FXCI_ETL_PULSE_USER": "fxci-etl", 36 | } 37 | 38 | secrets = [ 39 | Secret( 40 | deploy_type="env", 41 | deploy_target="FXCI_ETL_STORAGE_CREDENTIALS", 42 | secret="airflow-gke-secrets", 43 | key="fxci_etl_secret__gcp-credentials", 44 | ), 45 | Secret( 46 | deploy_type="env", 47 | deploy_target="FXCI_ETL_PULSE_PASSWORD", 48 | secret="airflow-gke-secrets", 49 | key="fxci_etl_secret__pulse-password", 50 | ), 51 | ] 52 | 53 | with DAG( 54 | "fxci_pulse_export", 55 | default_args=default_args, 56 | doc_md=__doc__, 57 | schedule_interval="30 */4 * * *", 58 | tags=tags, 59 | ) as dag: 60 | fxci_pulse_export = GKEPodOperator( 61 | task_id="fxci_pulse_export", 62 | arguments=[ 63 | "fxci-etl", 64 | "pulse", 65 | "drain", 66 | "-vv", 67 | ], 68 | env_vars=env_vars, 69 | secrets=secrets, 70 | image="gcr.io/moz-fx-data-airflow-prod-88e0/fxci-taskcluster-export_docker_etl:latest", 71 | gcp_conn_id="google_cloud_airflow_gke", 72 | dag=dag, 73 | email=[ 74 | "ahalberstadt@mozilla.com", 75 | ], 76 | ) 77 | -------------------------------------------------------------------------------- /dags/ga4_site_metrics_summary_backfill.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from airflow import DAG 4 | from airflow.sensors.external_task import ExternalTaskMarker 5 | 6 | from utils.gcp import bigquery_dq_check, bigquery_etl_query 7 | 8 | docs = """ 9 | ### ga4_site_metrics_summary_backfill 10 | 11 | Backfills the past three days of data for moz-fx-data-shared-prod.mozilla_org_derived.www_site_metrics_summary_v2 since late data can arrive for a few days 12 | 13 | Built from bigquery-etl repo, [`dags/bqetl_google_analytics_derived_ga4.py`](https://github.com/mozilla/bigquery-etl/blob/generated-sql/dags/bqetl_google_analytics_derived_ga4.py). 14 | 15 | This file is meant to look very similar to generated DAGs in bigquery-etl. 
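The backfill below builds one task per day offset by templating `macros.ds_add` and `macros.ds_format`. A rough illustration of how those resolve for an example logical date:

```python
from airflow.macros import ds_add, ds_format

ds = "2024-01-10"  # example logical date
for offset in (-3, -2, -1):
    day = ds_add(ds, offset)
    print(day, ds_format(day, "%Y-%m-%d", "%Y%m%d"))
# 2024-01-07 20240107
# 2024-01-08 20240108
# 2024-01-09 20240109
```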
16 | 17 | Owner: kwindau@mozilla.com 18 | """ 19 | 20 | default_args = { 21 | "owner": "kwindau@mozilla.com", 22 | "start_date": datetime.datetime(2024, 1, 4, 0, 0), 23 | "end_date": None, 24 | "email": ["kwindau@mozilla.com", "telemetry-alerts@mozilla.com"], 25 | "depends_on_past": False, 26 | "retry_delay": datetime.timedelta(seconds=1800), 27 | "email_on_failure": True, 28 | "email_on_retry": True, 29 | "retries": 2, 30 | } 31 | 32 | tags = ["impact/tier_2", "repo/bigquery-etl"] 33 | 34 | with DAG( 35 | "ga4_site_metrics_summary_backfill", 36 | default_args=default_args, 37 | schedule_interval="0 1 * * *", 38 | doc_md=docs, 39 | tags=tags, 40 | ) as dag: 41 | for day_offset in ["-3", "-2", "-1"]: 42 | task_id = "mozilla_org_derived__www_site_metrics_summary__v2__backfill_" + day_offset 43 | date_str = "macros.ds_add(ds, " + day_offset + ")" 44 | date_str_no_dash = "macros.ds_format(" + date_str + ", '%Y-%m-%d', '%Y%m%d')" 45 | 46 | ga4_www_site_metrics_summary_v2_checks = bigquery_dq_check( 47 | task_id="checks__fail_" + task_id, 48 | source_table="www_site_metrics_summary_v2", 49 | dataset_id="mozilla_org_derived", 50 | project_id="moz-fx-data-shared-prod", 51 | is_dq_check_fail=True, 52 | owner="kwindau@mozilla.com", 53 | email=["kwindau@mozilla.com", "telemetry-alerts@mozilla.com"], 54 | depends_on_past=False, 55 | parameters=["submission_date:DATE:{{ " + date_str + " }}"], 56 | retries=0, 57 | ) 58 | 59 | ga4_www_site_metrics_summary_v2 = bigquery_etl_query( 60 | task_id=task_id, 61 | destination_table="www_site_metrics_summary_v2${{ " 62 | + date_str_no_dash 63 | + " }}", 64 | dataset_id="mozilla_org_derived", 65 | project_id="moz-fx-data-shared-prod", 66 | owner="kwindau@mozilla.com", 67 | email=["kwindau@mozilla.com", "telemetry-alerts@mozilla.com"], 68 | date_partition_parameter=None, 69 | parameters=["submission_date:DATE:{{ " + date_str + " }}"], 70 | depends_on_past=False, 71 | ) 72 | 73 | todays_ga4_www_site_metrics_summary_v2 = ExternalTaskMarker( 74 | task_id="rerun__mozilla_org_derived__www_site_metrics_summary__v2__" + day_offset, 75 | external_dag_id="bqetl_google_analytics_derived_ga4", 76 | external_task_id="wait_for_" + task_id, 77 | execution_date="{{ (execution_date - macros.timedelta(days=-1, seconds=82800)).isoformat() }}", 78 | ) 79 | 80 | ( 81 | ga4_www_site_metrics_summary_v2 82 | >> ga4_www_site_metrics_summary_v2_checks 83 | >> todays_ga4_www_site_metrics_summary_v2 84 | ) 85 | -------------------------------------------------------------------------------- /dags/glam_fenix_release.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from functools import partial, reduce 3 | 4 | from airflow import DAG 5 | from airflow.operators.empty import EmptyOperator 6 | from airflow.sensors.external_task import ExternalTaskSensor 7 | from airflow.utils.task_group import TaskGroup 8 | 9 | from utils.constants import ALLOWED_STATES, FAILED_STATES 10 | from utils.glam_subdags.generate_query import ( 11 | generate_and_run_glean_task, 12 | ) 13 | from utils.tags import Tag 14 | 15 | default_args = { 16 | "owner": "efilho@mozilla.com", 17 | "depends_on_past": False, 18 | "start_date": datetime(2025, 1, 22), 19 | "email": [ 20 | "telemetry-alerts@mozilla.com", 21 | "akomarzewski@mozilla.com", 22 | "efilho@mozilla.com", 23 | ], 24 | "email_on_failure": True, 25 | "email_on_retry": True, 26 | "retries": 2, 27 | "retry_delay": timedelta(minutes=30), 28 | } 29 | 30 | PROJECT = "moz-fx-glam-prod" 31 | 
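# All aggregation tasks created below write to this project; it is passed to
# generate_and_run_glean_task as destination_project_id via functools.partial.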
32 | tags = [Tag.ImpactTier.tier_1] 33 | 34 | with DAG( 35 | "glam_fenix_release", 36 | default_args=default_args, 37 | max_active_runs=1, 38 | schedule_interval="0 10 * * 6", # 10am on Saturday 39 | doc_md=__doc__, 40 | tags=tags, 41 | ) as dag: 42 | wait_for_glam_fenix = ExternalTaskSensor( 43 | task_id="wait_for_daily_fenix_release", 44 | external_dag_id="glam_fenix", 45 | external_task_id="org_mozilla_fenix_glam_release_done", 46 | execution_delta=timedelta(days=-5, hours=-16), 47 | check_existence=True, 48 | mode="reschedule", 49 | allowed_states=ALLOWED_STATES, 50 | failed_states=FAILED_STATES, 51 | pool="DATA_ENG_EXTERNALTASKSENSOR", 52 | email_on_retry=False, 53 | ) 54 | 55 | for product in ["org_mozilla_fenix_glam_release"]: 56 | func = partial( 57 | generate_and_run_glean_task, 58 | product=product, 59 | destination_project_id=PROJECT, 60 | env_vars={"STAGE": "incremental"}, 61 | ) 62 | view, init, query = ( 63 | partial(func, task_type=task_type) 64 | for task_type in ["view", "init", "query"] 65 | ) 66 | 67 | # stage 2 - downstream for export 68 | scalar_bucket_counts = query(task_name=f"{product}__scalar_bucket_counts_v1") 69 | scalar_probe_counts = query(task_name=f"{product}__scalar_probe_counts_v1") 70 | 71 | with TaskGroup( 72 | group_id=f"{product}__histogram_bucket_counts_v1", dag=dag, default_args=default_args 73 | ) as histogram_bucket_counts: 74 | prev_task = None 75 | for sample_range in ([0, 19], [20, 39], [40, 59], [60, 79], [80, 99]): 76 | histogram_bucket_counts_sampled = query( 77 | task_name=f"{product}__histogram_bucket_counts_v1_sampled_{sample_range[0]}_{sample_range[1]}", 78 | min_sample_id=sample_range[0], 79 | max_sample_id=sample_range[1], 80 | replace_table=(sample_range[0] == 0) 81 | ) 82 | if prev_task: 83 | histogram_bucket_counts_sampled.set_upstream(prev_task) 84 | prev_task = histogram_bucket_counts_sampled 85 | 86 | histogram_probe_counts = query( 87 | task_name=f"{product}__histogram_probe_counts_v1" 88 | ) 89 | 90 | probe_counts = view(task_name=f"{product}__view_probe_counts_v1") 91 | extract_probe_counts = query(task_name=f"{product}__extract_probe_counts_v1") 92 | 93 | user_counts = view(task_name=f"{product}__view_user_counts_v1") 94 | extract_user_counts = query(task_name=f"{product}__extract_user_counts_v1") 95 | 96 | sample_counts = view(task_name=f"{product}__view_sample_counts_v1") 97 | 98 | fenix_release_done = EmptyOperator(task_id="fenix_release_done") 99 | 100 | ( 101 | wait_for_glam_fenix 102 | >> scalar_bucket_counts 103 | >> scalar_probe_counts 104 | >> probe_counts 105 | ) 106 | ( 107 | wait_for_glam_fenix 108 | >> histogram_bucket_counts 109 | >> histogram_probe_counts 110 | >> probe_counts 111 | ) 112 | probe_counts >> sample_counts >> extract_probe_counts >> fenix_release_done 113 | ( 114 | wait_for_glam_fenix 115 | >> user_counts 116 | >> extract_user_counts 117 | >> fenix_release_done 118 | ) 119 | wait_for_glam_fenix >> fenix_release_done 120 | -------------------------------------------------------------------------------- /dags/glam_fog_release.py: -------------------------------------------------------------------------------- 1 | import operator 2 | from datetime import datetime, timedelta 3 | from functools import partial, reduce 4 | 5 | from airflow import DAG 6 | from airflow.operators.empty import EmptyOperator 7 | from airflow.sensors.external_task import ExternalTaskMarker, ExternalTaskSensor 8 | from airflow.utils.task_group import TaskGroup 9 | 10 | from operators.gcp_container_operator import GKEPodOperator 
11 | from utils.constants import ALLOWED_STATES, FAILED_STATES 12 | from utils.glam_subdags.generate_query import ( 13 | generate_and_run_glean_task, 14 | ) 15 | from utils.tags import Tag 16 | 17 | default_args = { 18 | "owner": "efilho@mozilla.com", 19 | "depends_on_past": False, 20 | "start_date": datetime(2024, 12, 11), 21 | "email": [ 22 | "telemetry-alerts@mozilla.com", 23 | "akomarzewski@mozilla.com", 24 | "efilho@mozilla.com", 25 | ], 26 | "email_on_failure": True, 27 | "email_on_retry": True, 28 | "retries": 1, 29 | "retry_delay": timedelta(minutes=30), 30 | } 31 | 32 | PROJECT = "moz-fx-glam-prod" 33 | 34 | tags = [Tag.ImpactTier.tier_2] 35 | 36 | with DAG( 37 | "glam_fog_release", 38 | default_args=default_args, 39 | max_active_runs=1, 40 | schedule_interval="0 10 * * 6", # 10am on Saturday 41 | tags=tags, 42 | ) as dag: 43 | wait_for_glam_fog = ExternalTaskSensor( 44 | task_id="wait_for_daily_glam_fog_release", 45 | external_dag_id="glam_fog", 46 | external_task_id="daily_release_done", 47 | execution_delta=timedelta(days=-5, hours=-16), 48 | check_existence=True, 49 | mode="reschedule", 50 | allowed_states=ALLOWED_STATES, 51 | failed_states=FAILED_STATES, 52 | pool="DATA_ENG_EXTERNALTASKSENSOR", 53 | email_on_retry=False, 54 | ) 55 | 56 | fog_release_done = EmptyOperator( 57 | task_id="fog_release_done", 58 | ) 59 | 60 | for product in ["firefox_desktop_glam_release"]: 61 | func = partial( 62 | generate_and_run_glean_task, 63 | product=product, 64 | destination_project_id=PROJECT, 65 | env_vars={"STAGE": "incremental"}, 66 | ) 67 | view, init, query = ( 68 | partial(func, task_type=task_type) 69 | for task_type in ["view", "init", "query"] 70 | ) 71 | 72 | # stage 2 - downstream for export 73 | scalar_bucket_counts = query(task_name=f"{product}__scalar_bucket_counts_v1") 74 | scalar_probe_counts = query(task_name=f"{product}__scalar_probe_counts_v1") 75 | 76 | with TaskGroup( 77 | group_id=f"{product}__histogram_bucket_counts_v1", dag=dag, default_args=default_args 78 | ) as histogram_bucket_counts: 79 | prev_task = None 80 | # Windows + Release data is in [0-9] so we're further splitting that range. 
81 | for sample_range in ( 82 | [0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], 83 | [7, 7], [8, 8], [9, 9], [10, 19], [20, 29], [30, 39], 84 | [40, 49], [50, 59], [60, 69], [70, 79], [80, 89], [90, 99] 85 | ): 86 | histogram_bucket_counts_sampled = query( 87 | task_name=( 88 | f"{product}__histogram_bucket_counts_v1_sampled_" 89 | f"{sample_range[0]}_{sample_range[1]}" 90 | ), 91 | min_sample_id=sample_range[0], 92 | max_sample_id=sample_range[1], 93 | replace_table=(sample_range[0] == 0) 94 | ) 95 | if prev_task: 96 | histogram_bucket_counts_sampled.set_upstream(prev_task) 97 | prev_task = histogram_bucket_counts_sampled 98 | 99 | histogram_probe_counts = query( 100 | task_name=f"{product}__histogram_probe_counts_v1" 101 | ) 102 | 103 | probe_counts = view(task_name=f"{product}__view_probe_counts_v1") 104 | extract_probe_counts = query(task_name=f"{product}__extract_probe_counts_v1") 105 | 106 | user_counts = view(task_name=f"{product}__view_user_counts_v1") 107 | extract_user_counts = query(task_name=f"{product}__extract_user_counts_v1") 108 | 109 | sample_counts = view(task_name=f"{product}__view_sample_counts_v1") 110 | 111 | ( 112 | wait_for_glam_fog 113 | >> scalar_bucket_counts 114 | >> scalar_probe_counts 115 | >> probe_counts 116 | ) 117 | ( 118 | wait_for_glam_fog 119 | >> histogram_bucket_counts 120 | >> histogram_probe_counts 121 | >> probe_counts 122 | ) 123 | probe_counts >> sample_counts >> extract_probe_counts >> fog_release_done 124 | ( 125 | wait_for_glam_fog 126 | >> user_counts 127 | >> extract_user_counts 128 | >> fog_release_done 129 | ) 130 | wait_for_glam_fog >> fog_release_done 131 | -------------------------------------------------------------------------------- /dags/glam_glean_imports.py: -------------------------------------------------------------------------------- 1 | """Desktop ETL for importing glean data into GLAM app.""" 2 | 3 | from datetime import datetime, timedelta 4 | 5 | from airflow import DAG 6 | from airflow.models import Variable 7 | from airflow.providers.cncf.kubernetes.secret import Secret 8 | from airflow.sensors.external_task import ExternalTaskSensor 9 | from airflow.utils.task_group import TaskGroup 10 | 11 | from operators.gcp_container_operator import GKEPodOperator 12 | from utils.constants import ALLOWED_STATES, FAILED_STATES 13 | from utils.tags import Tag 14 | 15 | default_args = { 16 | "owner": "efilho@mozilla.com", 17 | "depends_on_past": False, 18 | "start_date": datetime(2019, 10, 22), 19 | "email": [ 20 | "akommasani@mozilla.com", 21 | "akomarzewski@mozilla.com", 22 | "efilho@mozilla.com", 23 | ], 24 | "email_on_failure": True, 25 | "email_on_retry": True, 26 | "retries": 1, 27 | "retry_delay": timedelta(minutes=30), 28 | } 29 | 30 | tags = [Tag.ImpactTier.tier_2] 31 | 32 | dag = DAG( 33 | "glam_glean_imports", 34 | default_args=default_args, 35 | schedule_interval="0 19 * * *", 36 | doc_md=__doc__, 37 | tags=tags, 38 | ) 39 | 40 | wait_for_glam = ExternalTaskSensor( 41 | task_id="wait_for_glam", 42 | external_dag_id="glam", 43 | external_task_group_id="extracts", 44 | execution_delta=timedelta(hours=3), 45 | check_existence=True, 46 | mode="reschedule", 47 | allowed_states=ALLOWED_STATES, 48 | failed_states=FAILED_STATES, 49 | pool="DATA_ENG_EXTERNALTASKSENSOR", 50 | email_on_retry=False, 51 | dag=dag, 52 | ) 53 | 54 | # Move logic from Glam deployment's GKE Cronjob to this dag for better dependency timing 55 | default_glean_import_image = 
"gcr.io/moz-fx-dataops-images-global/gcp-pipelines/glam/glam-production/glam:2023.07.1-43" 56 | 57 | base_docker_args = ["/venv/bin/python", "manage.py"] 58 | 59 | for env in ["Dev", "Prod"]: 60 | glean_import_image = default_glean_import_image 61 | if env == "Dev": # noqa SIMM114 62 | glean_import_image = "gcr.io/moz-fx-dataops-images-global/gcp-pipelines/glam/glam-production/glam:2023.07.1-43" 63 | elif env == "Prod": 64 | glean_import_image = "gcr.io/moz-fx-dataops-images-global/gcp-pipelines/glam/glam-production/glam:2023.07.1-43" 65 | 66 | # Fetch secrets from Google Secret Manager to be injected into the pod. 67 | database_url_secret = Secret( 68 | deploy_type="env", 69 | deploy_target="DATABASE_URL", 70 | secret="airflow-gke-secrets", 71 | key=f"{env}_glam_secret__database_url", 72 | ) 73 | django_secret = Secret( 74 | deploy_type="env", 75 | deploy_target="DJANGO_SECRET_KEY", 76 | secret="airflow-gke-secrets", 77 | key=f"{env}_glam_secret__django_secret_key", 78 | ) 79 | 80 | env_vars = { 81 | # Tells Django what set of configs to load depending on the environment. Defaults to dev on the app. 82 | "DJANGO_CONFIGURATION": env, 83 | "DJANGO_DEBUG": "False", 84 | "DJANGO_SETTINGS_MODULE": "glam.settings", 85 | "GOOGLE_CLOUD_PROJECT": Variable.get(env + "_glam_project"), 86 | } 87 | 88 | 89 | default_glam_import_image = "gcr.io/moz-fx-dataops-images-global/gcp-pipelines/glam/glam-production/glam:2024.10.0-58" 90 | 91 | base_docker_args = ["/venv/bin/python", "manage.py"] 92 | 93 | for env in ["Dev", "Prod"]: 94 | glam_import_image = default_glam_import_image 95 | if env == "Dev": # noqa 114 96 | glam_import_image = "gcr.io/moz-fx-dataops-images-global/gcp-pipelines/glam/glam-production/glam:2024.10.0-58" 97 | elif env == "Prod": 98 | glam_import_image = "gcr.io/moz-fx-dataops-images-global/gcp-pipelines/glam/glam-production/glam:2024.10.0-58" 99 | 100 | # Fetch secrets from Google Secret Manager to be injected into the pod. 
101 | database_url_secret = Secret( 102 | deploy_type="env", 103 | deploy_target="DATABASE_URL", 104 | secret="airflow-gke-secrets", 105 | key=f"{env}_glam_secret__database_url", 106 | ) 107 | django_secret = Secret( 108 | deploy_type="env", 109 | deploy_target="DJANGO_SECRET_KEY", 110 | secret="airflow-gke-secrets", 111 | key=f"{env}_glam_secret__django_secret_key", 112 | ) 113 | 114 | env_vars = { 115 | "DJANGO_CONFIGURATION": env, 116 | "DJANGO_DEBUG": "False", 117 | "DJANGO_SETTINGS_MODULE": "glam.settings", 118 | "GOOGLE_CLOUD_PROJECT": Variable.get(env + "_glam_project"), 119 | } 120 | 121 | with dag as dag, TaskGroup(group_id=env + "_glam") as glam_env_task_group: 122 | glam_import_probes = GKEPodOperator( 123 | reattach_on_restart=True, 124 | task_id="glam_import_probes", 125 | name="glam_import_probes", 126 | image=glam_import_image, 127 | arguments=[*base_docker_args, "import_probes"], 128 | env_vars=env_vars, 129 | secrets=[database_url_secret, django_secret], 130 | ) 131 | 132 | glam_import_revisions = GKEPodOperator( 133 | reattach_on_restart=True, 134 | task_id="glam_import_revisions", 135 | name="glam_import_revisions", 136 | image=glam_import_image, 137 | arguments=[*base_docker_args, "import_revisions"], 138 | env_vars=env_vars, 139 | secrets=[database_url_secret, django_secret], 140 | ) 141 | 142 | wait_for_glam >> glam_env_task_group 143 | -------------------------------------------------------------------------------- /dags/graphics_telemetry.py: -------------------------------------------------------------------------------- 1 | """ 2 | A job to power graphics dashboard. 3 | 4 | Processes main ping data and exports to GCS to power a graphics dashboard at 5 | https://firefoxgraphics.github.io/telemetry/. 6 | 7 | This was originally a Databricks notebook that was migrated to a scheduled 8 | Dataproc task. Source code lives in the 9 | [FirefoxGraphics/telemetry](https://github.com/FirefoxGraphics/telemetry) 10 | repository. 11 | 12 | This is a overwrite kind of operation and as long as the most recent DAG run succeeded 13 | the job should be considered healthy. 
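A hypothetical way to confirm the latest export landed; the bucket and prefix match the GCS_BUCKET / GCS_PREFIX constants defined below:

```python
from google.cloud import storage

client = storage.Client()
blobs = client.list_blobs(
    "moz-fx-data-static-websit-8565-analysis-output", prefix="gfx/telemetry-data/"
)
# Report the most recently written export object.
latest = max(blobs, key=lambda blob: blob.updated)
print(latest.name, latest.updated)
```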
14 | """ 15 | 16 | import datetime 17 | 18 | from airflow import DAG 19 | from airflow.operators.subdag import SubDagOperator 20 | from airflow.sensors.external_task import ExternalTaskSensor 21 | 22 | from utils.constants import ALLOWED_STATES, FAILED_STATES 23 | from utils.dataproc import get_dataproc_parameters, moz_dataproc_pyspark_runner 24 | from utils.tags import Tag 25 | 26 | default_args = { 27 | "owner": "kik@mozilla.com", 28 | "depends_on_past": False, 29 | "start_date": datetime.datetime(2020, 11, 26), 30 | "email": [ 31 | "telemetry-alerts@mozilla.com", 32 | "kik@mozilla.com", 33 | ], 34 | "email_on_failure": True, 35 | "email_on_retry": True, 36 | "retries": 2, 37 | "retry_delay": datetime.timedelta(minutes=20), 38 | } 39 | 40 | PIP_PACKAGES = [ 41 | "git+https://github.com/mozilla/python_moztelemetry.git@v0.10.7#egg=python-moztelemetry", 42 | "git+https://github.com/FirefoxGraphics/telemetry.git#egg=pkg&subdirectory=analyses/bigquery_shim", 43 | "boto3==1.16.20", 44 | "six==1.15.0", 45 | ] 46 | 47 | GCS_BUCKET = "moz-fx-data-static-websit-8565-analysis-output" 48 | GCS_PREFIX = "gfx/telemetry-data/" 49 | 50 | tags = [Tag.ImpactTier.tier_1] 51 | 52 | with DAG( 53 | "graphics_telemetry", 54 | default_args=default_args, 55 | schedule_interval="0 3 * * *", 56 | doc_md=__doc__, 57 | tags=tags, 58 | ) as dag: 59 | wait_for_main_ping = ExternalTaskSensor( 60 | task_id="wait_for_copy_deduplicate_main_ping", 61 | external_dag_id="copy_deduplicate", 62 | external_task_id="copy_deduplicate_main_ping", 63 | execution_delta=datetime.timedelta(hours=2), 64 | check_existence=True, 65 | mode="reschedule", 66 | allowed_states=ALLOWED_STATES, 67 | failed_states=FAILED_STATES, 68 | pool="DATA_ENG_EXTERNALTASKSENSOR", 69 | email_on_retry=False, 70 | dag=dag, 71 | ) 72 | 73 | params = get_dataproc_parameters("google_cloud_airflow_dataproc") 74 | 75 | graphics_trends = SubDagOperator( 76 | task_id="graphics_trends", 77 | dag=dag, 78 | subdag=moz_dataproc_pyspark_runner( 79 | parent_dag_name=dag.dag_id, 80 | image_version="1.5-debian10", 81 | dag_name="graphics_trends", 82 | default_args=default_args, 83 | cluster_name="graphics-trends-{{ ds }}", 84 | job_name="graphics-trends", 85 | python_driver_code="https://raw.githubusercontent.com/mozilla/python_mozetl/main/mozetl/graphics/graphics_telemetry_trends.py", 86 | init_actions_uris=[ 87 | "gs://dataproc-initialization-actions/python/pip-install.sh" 88 | ], 89 | additional_metadata={"PIP_PACKAGES": " ".join(PIP_PACKAGES)}, 90 | additional_properties={ 91 | "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar", 92 | }, 93 | py_args=[ 94 | "--gcs-bucket", 95 | GCS_BUCKET, 96 | "--gcs-prefix", 97 | GCS_PREFIX, 98 | "--weekly-fraction", 99 | "0.003", 100 | ], 101 | idle_delete_ttl=14400, 102 | num_workers=2, 103 | worker_machine_type="n1-standard-4", 104 | gcp_conn_id=params.conn_id, 105 | service_account=params.client_email, 106 | storage_bucket=params.storage_bucket, 107 | ), 108 | ) 109 | 110 | graphics_dashboard = SubDagOperator( 111 | task_id="graphics_dashboard", 112 | dag=dag, 113 | subdag=moz_dataproc_pyspark_runner( 114 | parent_dag_name=dag.dag_id, 115 | image_version="1.5-debian10", 116 | dag_name="graphics_dashboard", 117 | default_args=default_args, 118 | cluster_name="graphics-dashboard-{{ ds }}", 119 | job_name="graphics-dashboard", 120 | python_driver_code="https://raw.githubusercontent.com/mozilla/python_mozetl/main/mozetl/graphics/graphics_telemetry_dashboard.py", 121 | init_actions_uris=[ 122 | 
"gs://dataproc-initialization-actions/python/pip-install.sh" 123 | ], 124 | additional_metadata={"PIP_PACKAGES": " ".join(PIP_PACKAGES)}, 125 | additional_properties={ 126 | "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar", 127 | }, 128 | py_args=[ 129 | "--output-bucket", 130 | GCS_BUCKET, 131 | "--output-prefix", 132 | GCS_PREFIX, 133 | "--release-fraction", 134 | "0.003", 135 | ], 136 | idle_delete_ttl=14400, 137 | num_workers=2, 138 | worker_machine_type="n1-highmem-4", 139 | gcp_conn_id=params.conn_id, 140 | service_account=params.client_email, 141 | storage_bucket=params.storage_bucket, 142 | ), 143 | ) 144 | 145 | wait_for_main_ping >> graphics_trends 146 | wait_for_main_ping >> graphics_dashboard 147 | -------------------------------------------------------------------------------- /dags/jetstream.py: -------------------------------------------------------------------------------- 1 | """ 2 | Powers the [jetstream](https://experimenter.info/jetstream/jetstream/) 3 | analysis framework for experiments. 4 | 5 | See the [jetstream repository](https://github.com/mozilla/jetstream). 6 | 7 | *Triage notes* 8 | 9 | In case jetstream configuration is modified it is perfectly normal for the task 10 | `jetstream_run_config_changed` to take significantly longer to complete (hours instead of minutes). 11 | In these cases we expect anything below 12 hours, only after that amount of time should 12 | this task be considered potentially faulty and subject to the triage process. 13 | """ # noqa: D205 14 | 15 | from datetime import datetime, timedelta 16 | 17 | from airflow import DAG 18 | from airflow.sensors.external_task import ExternalTaskSensor 19 | 20 | from operators.gcp_container_operator import GKEPodOperator 21 | from utils.constants import ALLOWED_STATES, FAILED_STATES 22 | from utils.tags import Tag 23 | 24 | default_args = { 25 | "owner": "ascholtz@mozilla.com", 26 | "email": [ 27 | "ascholtz@mozilla.com", 28 | "mwilliams@mozilla.com", 29 | ], 30 | "depends_on_past": False, 31 | "start_date": datetime(2020, 3, 12), 32 | "email_on_failure": True, 33 | "email_on_retry": True, 34 | "retries": 2, 35 | "retry_delay": timedelta(minutes=30), 36 | } 37 | 38 | tags = [Tag.ImpactTier.tier_1] 39 | 40 | with DAG( 41 | "jetstream", 42 | default_args=default_args, 43 | schedule_interval="0 4 * * *", 44 | doc_md=__doc__, 45 | tags=tags, 46 | ) as dag: 47 | # Built from repo https://github.com/mozilla/jetstream 48 | jetstream_image = "gcr.io/moz-fx-data-experiments/jetstream:latest" 49 | 50 | jetstream_run = GKEPodOperator( 51 | task_id="jetstream_run", 52 | name="jetstream_run", 53 | image=jetstream_image, 54 | email=default_args["email"], 55 | arguments=[ 56 | "--log_to_bigquery", 57 | "run-argo", 58 | "--date={{ ds }}", 59 | # the Airflow cluster doesn't have Compute Engine API access so pass in IP 60 | # and certificate in order for the pod to connect to the Kubernetes cluster 61 | # running Jetstream 62 | "--cluster-ip={{ var.value.jetstream_cluster_ip }}", 63 | "--cluster-cert={{ var.value.jetstream_cluster_cert }}", 64 | ], 65 | dag=dag, 66 | ) 67 | 68 | jetstream_config_changed = GKEPodOperator( 69 | task_id="jetstream_run_config_changed", 70 | name="jetstream_run_config_changed", 71 | image=jetstream_image, 72 | email=default_args["email"], 73 | arguments=[ 74 | "--log_to_bigquery", 75 | "rerun-config-changed", 76 | "--argo", 77 | # the Airflow cluster doesn't have Compute Engine API access so pass in IP 78 | # and certificate in order for the pod to connect to the 
Kubernetes cluster 79 | # running Jetstream 80 | "--cluster-ip={{ var.value.jetstream_cluster_ip }}", 81 | "--cluster-cert={{ var.value.jetstream_cluster_cert }}", 82 | ], 83 | dag=dag, 84 | ) 85 | 86 | wait_for_clients_daily_export = ExternalTaskSensor( 87 | task_id="wait_for_clients_daily", 88 | external_dag_id="bqetl_main_summary", 89 | external_task_id="telemetry_derived__clients_daily__v6", 90 | execution_delta=timedelta(hours=2), 91 | mode="reschedule", 92 | allowed_states=ALLOWED_STATES, 93 | failed_states=FAILED_STATES, 94 | pool="DATA_ENG_EXTERNALTASKSENSOR", 95 | email_on_retry=False, 96 | dag=dag, 97 | ) 98 | 99 | wait_for_search_clients_daily = ExternalTaskSensor( 100 | task_id="wait_for_search_clients_daily", 101 | external_dag_id="bqetl_search", 102 | external_task_id="search_derived__search_clients_daily__v8", 103 | execution_delta=timedelta(hours=1), 104 | mode="reschedule", 105 | allowed_states=ALLOWED_STATES, 106 | failed_states=FAILED_STATES, 107 | pool="DATA_ENG_EXTERNALTASKSENSOR", 108 | email_on_retry=False, 109 | dag=dag, 110 | ) 111 | 112 | wait_for_bq_events = ExternalTaskSensor( 113 | task_id="wait_for_bq_main_events", 114 | external_dag_id="copy_deduplicate", 115 | external_task_id="bq_main_events", 116 | execution_delta=timedelta(hours=3), 117 | mode="reschedule", 118 | allowed_states=ALLOWED_STATES, 119 | failed_states=FAILED_STATES, 120 | pool="DATA_ENG_EXTERNALTASKSENSOR", 121 | email_on_retry=False, 122 | dag=dag, 123 | ) 124 | 125 | wait_for_copy_deduplicate_events = ExternalTaskSensor( 126 | task_id="wait_for_event_events", 127 | external_dag_id="copy_deduplicate", 128 | external_task_id="event_events", 129 | execution_delta=timedelta(hours=3), 130 | mode="reschedule", 131 | allowed_states=ALLOWED_STATES, 132 | failed_states=FAILED_STATES, 133 | pool="DATA_ENG_EXTERNALTASKSENSOR", 134 | email_on_retry=False, 135 | dag=dag, 136 | ) 137 | 138 | jetstream_run.set_upstream( 139 | [ 140 | wait_for_clients_daily_export, 141 | wait_for_search_clients_daily, 142 | wait_for_bq_events, 143 | wait_for_copy_deduplicate_events, 144 | ] 145 | ) 146 | jetstream_config_changed.set_upstream(jetstream_run) 147 | -------------------------------------------------------------------------------- /dags/kpi_forecasting.py: -------------------------------------------------------------------------------- 1 | """ 2 | See [kpi-forecasting in the docker-etl repository](https://github.com/mozilla/docker-etl/blob/main/jobs/kpi-forecasting). 3 | 4 | This DAG runs the forecast Desktop DAU and Mobile DAU. The output powers KPI dashboards and monthly revenue forecasts. 5 | 6 | This DAG is high priority for week 1 of the month and low priority otherwise. 
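Each forecast is described by a `Config` entry in the `CONFIGS` mapping defined below (config file name, upstream DAG to wait for, and its task ids). A purely hypothetical example of registering another forecast; the file and task names are placeholders:

```python
CONFIGS["dau_example"] = Config(
    "dau_example.yaml",                      # config file under kpi_forecasting/configs
    "bqetl_analytics_aggregations",          # upstream DAG providing the aggregates
    ["example_active_users_aggregates_v1"],  # upstream task(s) to wait for
)
```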
7 | """ 8 | 9 | import os 10 | from collections import namedtuple 11 | from datetime import datetime, timedelta 12 | 13 | from airflow import DAG 14 | from airflow.sensors.external_task import ExternalTaskSensor 15 | 16 | from operators.gcp_container_operator import GKEPodOperator 17 | from utils.constants import ALLOWED_STATES, FAILED_STATES 18 | from utils.tags import Tag 19 | 20 | default_args = { 21 | "owner": "bochocki@mozilla.com", 22 | "email": ["bochocki@mozilla.com", "jsilverman@mozilla.com"], 23 | "depends_on_past": False, 24 | "start_date": datetime(2022, 3, 28), 25 | "email_on_failure": True, 26 | "email_on_retry": True, 27 | "retries": 2, 28 | "retry_delay": timedelta(minutes=30), 29 | } 30 | 31 | TAGS = [Tag.ImpactTier.tier_1] 32 | IMAGE = "gcr.io/moz-fx-data-airflow-prod-88e0/kpi-forecasting_docker_etl:latest" 33 | 34 | Config = namedtuple("Config", ["filename", "wait_dag", "wait_tasks"]) 35 | CONFIGS = { 36 | "dau_desktop": Config( 37 | "dau_desktop.yaml", 38 | "bqetl_analytics_aggregations", 39 | [ 40 | "firefox_desktop_active_users_aggregates_v4", 41 | ], 42 | ), 43 | "dau_mobile": Config( 44 | "dau_mobile.yaml", 45 | "bqetl_analytics_aggregations", 46 | [ 47 | "firefox_ios_active_users_aggregates_v3", 48 | "fenix_active_users_aggregates_v3", 49 | "focus_android_active_users_aggregates_v3", 50 | "focus_ios_active_users_aggregates_v3", 51 | ], 52 | ), 53 | } 54 | 55 | with DAG( 56 | "kpi_forecasting", 57 | default_args=default_args, 58 | schedule_interval="0 5 * * *", 59 | doc_md=__doc__, 60 | tags=TAGS, 61 | ) as dag: 62 | for id, config in CONFIGS.items(): 63 | script_path = os.path.join(".", "kpi_forecasting.py") 64 | config_path = os.path.join("kpi_forecasting", "configs", config.filename) 65 | wait_tasks = config.wait_tasks 66 | 67 | if not isinstance(config.wait_tasks, list): 68 | wait_tasks = [wait_tasks] 69 | 70 | forecast_task = GKEPodOperator( 71 | task_id=f"kpi_forecasting_{id}", 72 | arguments=["python", script_path, "-c", config_path], 73 | image=IMAGE, 74 | dag=dag, 75 | ) 76 | 77 | for wait_task in wait_tasks: 78 | wait_task_sensor = ExternalTaskSensor( 79 | task_id=f"wait_for_{wait_task}", 80 | external_dag_id=config.wait_dag, 81 | external_task_id=wait_task, 82 | execution_delta=timedelta(minutes=45), 83 | check_existence=True, 84 | mode="reschedule", 85 | allowed_states=ALLOWED_STATES, 86 | failed_states=FAILED_STATES, 87 | pool="DATA_ENG_EXTERNALTASKSENSOR", 88 | ) 89 | 90 | wait_task_sensor >> forecast_task 91 | -------------------------------------------------------------------------------- /dags/looker_usage_analysis.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.providers.cncf.kubernetes.secret import Secret 5 | 6 | from operators.gcp_container_operator import GKEPodOperator 7 | from utils.tags import Tag 8 | 9 | DOCS = """\ 10 | # Looker Usage Analysis 11 | 12 | *Triage notes* 13 | 14 | As long as the most recent DAG run is successful this job can be considered healthy. 15 | In such case, past DAG failures can be ignored. 
16 | 17 | This DAG runs every quarter (1st day of February, May, August, November) and analyses the 18 | Looker artifact usage using [Henry](https://github.com/looker-open-source/henry) 19 | """ 20 | 21 | 22 | default_args = { 23 | "owner": "ascholtz@mozilla.com", 24 | "depends_on_past": False, 25 | "start_date": datetime(2025, 5, 30), 26 | "email_on_failure": True, 27 | "email_on_retry": True, 28 | "retries": 2, 29 | "retry_delay": timedelta(minutes=30), 30 | } 31 | 32 | tags = [Tag.ImpactTier.tier_3] 33 | 34 | looker_client_id_prod = Secret( 35 | deploy_type="env", 36 | deploy_target="LOOKER_CLIENT_ID", 37 | secret="airflow-gke-secrets", 38 | key="probe_scraper_secret__looker_api_client_id_prod", 39 | ) 40 | looker_client_secret_prod = Secret( 41 | deploy_type="env", 42 | deploy_target="LOOKER_CLIENT_SECRET", 43 | secret="airflow-gke-secrets", 44 | key="probe_scraper_secret__looker_api_client_secret_prod", 45 | ) 46 | looker_instance_uri = "https://mozilla.cloud.looker.com" 47 | 48 | 49 | with DAG( 50 | "looker_usage_analysis", 51 | doc_md=DOCS, 52 | max_active_runs=1, 53 | default_args=default_args, 54 | schedule_interval="0 0 1 2,5,8,11 *", 55 | tags=tags, 56 | ) as dag: 57 | airflow_gke_prod_kwargs = { 58 | "gcp_conn_id": "google_cloud_airflow_gke", 59 | "project_id": "moz-fx-data-airflow-gke-prod", 60 | "location": "us-west1", 61 | "cluster_name": "workloads-prod-v1", 62 | } 63 | 64 | analyze_explores = GKEPodOperator( 65 | task_id="analyze_explores", 66 | arguments=[ 67 | "python", 68 | "-m", 69 | "looker_utils.main", 70 | "analyze", 71 | "--destination_table", 72 | "moz-fx-data-shared-prod.monitoring_derived.looker_usage_explores_v1", 73 | "--date={{ ds }}", 74 | "explores", 75 | ], 76 | image="gcr.io/moz-fx-data-airflow-prod-88e0/looker-utils_docker_etl:latest", 77 | env_vars={ 78 | "LOOKER_INSTANCE_URI": looker_instance_uri, 79 | }, 80 | secrets=[looker_client_id_prod, looker_client_secret_prod], 81 | **airflow_gke_prod_kwargs, 82 | ) 83 | 84 | analyze_models = GKEPodOperator( 85 | task_id="analyze_models", 86 | arguments=[ 87 | "python", 88 | "-m", 89 | "looker_utils.main", 90 | "analyze", 91 | "--destination_table", 92 | "moz-fx-data-shared-prod.monitoring_derived.looker_usage_models_v1", 93 | "--date={{ ds }}", 94 | "models", 95 | ], 96 | image="gcr.io/moz-fx-data-airflow-prod-88e0/looker-utils_docker_etl:latest", 97 | env_vars={ 98 | "LOOKER_INSTANCE_URI": looker_instance_uri, 99 | }, 100 | secrets=[looker_client_id_prod, looker_client_secret_prod], 101 | **airflow_gke_prod_kwargs, 102 | ) 103 | 104 | analyze_unused_explores = GKEPodOperator( 105 | task_id="analyze_unused_explores", 106 | arguments=[ 107 | "python", 108 | "-m", 109 | "looker_utils.main", 110 | "analyze", 111 | "--destination_table", 112 | "moz-fx-data-shared-prod.monitoring_derived.looker_usage_unused_explores_v1", 113 | "--date={{ ds }}", 114 | "unused-explores", 115 | ], 116 | image="gcr.io/moz-fx-data-airflow-prod-88e0/looker-utils_docker_etl:latest", 117 | env_vars={ 118 | "LOOKER_INSTANCE_URI": looker_instance_uri, 119 | }, 120 | secrets=[looker_client_id_prod, looker_client_secret_prod], 121 | **airflow_gke_prod_kwargs, 122 | ) 123 | -------------------------------------------------------------------------------- /dags/ltv.py: -------------------------------------------------------------------------------- 1 | """ 2 | Client Lifetime Value. 3 | 4 | Kicks off jobs to run on a Dataproc cluster. 
The job code lives in 5 | [jobs/ltv_daily.py](https://github.com/mozilla/telemetry-airflow/blob/main/jobs/ltv_daily.py). 6 | 7 | See [client_ltv docs on DTMO](https://docs.telemetry.mozilla.org/datasets/search/client_ltv/reference.html). 8 | """ 9 | from datetime import datetime, timedelta 10 | 11 | from airflow import DAG 12 | from airflow.operators.subdag import SubDagOperator 13 | from airflow.sensors.external_task import ExternalTaskSensor 14 | 15 | from utils.constants import ALLOWED_STATES, FAILED_STATES 16 | from utils.dataproc import ( 17 | copy_artifacts_dev, 18 | get_dataproc_parameters, 19 | moz_dataproc_pyspark_runner, 20 | ) 21 | from utils.gcp import bigquery_etl_query 22 | from utils.tags import Tag 23 | 24 | default_args = { 25 | "owner": "akomar@mozilla.com", 26 | "depends_on_past": True, 27 | "start_date": datetime(2020, 3, 15), 28 | "email": [ 29 | "telemetry-alerts@mozilla.com", 30 | "akomar@mozilla.com", 31 | ], 32 | "email_on_failure": True, 33 | "email_on_retry": True, 34 | "retries": 3, 35 | "retry_delay": timedelta(minutes=30), 36 | } 37 | 38 | tags = [Tag.ImpactTier.tier_2] 39 | 40 | dag = DAG( 41 | "ltv_daily", 42 | default_args=default_args, 43 | schedule_interval="0 4 * * *", 44 | doc_md=__doc__, 45 | tags=tags, 46 | ) 47 | 48 | params = get_dataproc_parameters("google_cloud_airflow_dataproc") 49 | 50 | subdag_args = default_args.copy() 51 | subdag_args["retries"] = 0 52 | 53 | task_id = "ltv_daily" 54 | project = params.project_id if params.is_dev else "moz-fx-data-shared-prod" 55 | ltv_daily = SubDagOperator( 56 | task_id=task_id, 57 | dag=dag, 58 | subdag=moz_dataproc_pyspark_runner( 59 | parent_dag_name=dag.dag_id, 60 | dag_name=task_id, 61 | job_name="ltv-daily", 62 | cluster_name="ltv-daily-{{ ds_nodash }}", 63 | idle_delete_ttl=600, 64 | num_workers=30, 65 | worker_machine_type="n2-standard-16", 66 | optional_components=["ANACONDA"], 67 | init_actions_uris=[ 68 | "gs://dataproc-initialization-actions/python/pip-install.sh" 69 | ], 70 | additional_properties={ 71 | "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest.jar" 72 | }, 73 | additional_metadata={"PIP_PACKAGES": "lifetimes==0.11.1"}, 74 | python_driver_code=f"gs://{params.artifact_bucket}/jobs/ltv_daily.py", 75 | py_args=[ 76 | "--submission-date", 77 | "{{ ds }}", 78 | "--prediction-days", 79 | "364", 80 | "--project-id", 81 | project, 82 | "--source-qualified-table-id", 83 | f"{project}.search.search_rfm", 84 | "--dataset-id", 85 | "analysis", 86 | "--intermediate-table-id", 87 | "ltv_daily_temporary_search_rfm_day", 88 | "--model-input-table-id", 89 | "ltv_daily_model_perf", 90 | "--model-output-table-id", 91 | "ltv_daily", 92 | "--temporary-gcs-bucket", 93 | params.storage_bucket, 94 | ], 95 | gcp_conn_id=params.conn_id, 96 | service_account=params.client_email, 97 | artifact_bucket=params.artifact_bucket, 98 | storage_bucket=params.storage_bucket, 99 | default_args=subdag_args, 100 | ), 101 | ) 102 | 103 | if params.is_dev: 104 | copy_to_dev = copy_artifacts_dev( 105 | dag, params.project_id, params.artifact_bucket, params.storage_bucket 106 | ) 107 | copy_to_dev >> ltv_daily 108 | else: 109 | wait_for_search_clients_last_seen = ExternalTaskSensor( 110 | task_id="wait_for_search_clients_last_seen", 111 | external_dag_id="bqetl_search", 112 | external_task_id="search_derived__search_clients_last_seen__v1", 113 | execution_delta=timedelta(hours=1), 114 | check_existence=True, 115 | mode="reschedule", 116 | allowed_states=ALLOWED_STATES, 117 | failed_states=FAILED_STATES, 118 | 
pool="DATA_ENG_EXTERNALTASKSENSOR", 119 | email_on_retry=False, 120 | dag=dag, 121 | ) 122 | wait_for_search_clients_last_seen >> ltv_daily 123 | 124 | ltv_revenue_join = bigquery_etl_query( 125 | task_id="ltv_revenue_join", 126 | destination_table="client_ltv_v1", 127 | dataset_id="revenue_derived", 128 | project_id="moz-fx-data-shared-prod", 129 | arguments=( 130 | "--clustering_fields=engine,country", 131 | "--schema_update_option=ALLOW_FIELD_ADDITION", 132 | "--schema_update_option=ALLOW_FIELD_RELAXATION", 133 | "--time_partitioning_type=DAY", 134 | "--time_partitioning_field=submission_date", 135 | ), 136 | dag=dag, 137 | ) 138 | 139 | ltv_daily >> ltv_revenue_join 140 | -------------------------------------------------------------------------------- /dags/mad_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | Malicious Addons Detection. 3 | 4 | This runs once a week to emit a trained model to GCS. 5 | 6 | Source code is in the private [mad-server repository](https://github.com/mozilla/mad-server/). 7 | 8 | *Triage notes* 9 | 10 | The way the app was designed it is decoupled from Airflow and will pull all data since the last 11 | successful data pull. What this means if we have a failed DAG run followed by 12 | a successful DAG run it will cover the data from the previous run. 13 | 14 | So as long as the most recent DAG run is successful the job can be considered healthy 15 | and not action is required for failed DAG runs. 16 | """ 17 | 18 | from datetime import datetime, timedelta 19 | 20 | from airflow import DAG 21 | from airflow.providers.cncf.kubernetes.secret import Secret 22 | 23 | from operators.gcp_container_operator import GKEPodOperator 24 | from utils.tags import Tag 25 | 26 | default_args = { 27 | "owner": "dzeber@mozilla.com", 28 | "depends_on_past": False, 29 | "start_date": datetime(2021, 4, 15), 30 | "email_on_failure": True, 31 | "email_on_retry": True, 32 | "retries": 1, 33 | "retry_delay": timedelta(minutes=30), 34 | } 35 | 36 | tags = [Tag.ImpactTier.tier_3] 37 | 38 | gcs_bucket = "mad-resources-training" 39 | gcs_root_training = "datasets" 40 | cloud_service = "GCS" 41 | customs_training_allow_overwrite = "True" 42 | gcloud_project = "mad-model-training" 43 | gcs_report_bucket = "mad-reports" 44 | amo_cred_issuer_secret = Secret( 45 | deploy_type="env", 46 | deploy_target="AMO_CRED_ISSUER", 47 | secret="airflow-gke-secrets", 48 | key="mad_server_secret__amo_cred_issuer", 49 | ) 50 | amo_cred_secret_secret = Secret( 51 | deploy_type="env", 52 | deploy_target="AMO_CRED_SECRET", 53 | secret="airflow-gke-secrets", 54 | key="mad_server_secret__amo_cred_secret", 55 | ) 56 | 57 | with DAG( 58 | "mad_server", 59 | default_args=default_args, 60 | schedule_interval="@weekly", 61 | doc_md=__doc__, 62 | tags=tags, 63 | ) as dag: 64 | mad_server_pull = GKEPodOperator( 65 | task_id="mad_server_pull", 66 | # Controls the entrypoint of the container, which for mad-server 67 | # defaults to bin/run rather than a shell. 
68 | cmds=[ 69 | "/bin/bash", 70 | ], 71 | arguments=[ 72 | "bin/airflow-pull", 73 | ], 74 | image="us-west1-docker.pkg.dev/moz-fx-data-airflow-prod-88e0/data-science-artifacts/mad-server:latest", 75 | startup_timeout_seconds=500, 76 | gcp_conn_id="google_cloud_airflow_gke", 77 | env_vars={ 78 | "GCS_BUCKET": gcs_bucket, 79 | "GCS_ROOT_TRAINING": gcs_root_training, 80 | "CLOUD_SERVICE": cloud_service, 81 | "CUSTOMS_TRAINING_ALLOW_OVERWRITE": customs_training_allow_overwrite, 82 | }, 83 | email=[ 84 | "dzeber@mozilla.com", 85 | "gleonard@mozilla.com", 86 | ], 87 | secrets=[amo_cred_issuer_secret, amo_cred_secret_secret], 88 | ) 89 | mad_train_model = GKEPodOperator( 90 | task_id="train_model", 91 | cmds=[ 92 | "/bin/bash", 93 | ], 94 | arguments=[ 95 | "bin/train_model", 96 | "--publish", 97 | "--publish-as-latest", 98 | "./working", 99 | ], 100 | image="us-west1-docker.pkg.dev/moz-fx-data-airflow-prod-88e0/data-science-artifacts/mad-server:latest", 101 | startup_timeout_seconds=500, 102 | env_vars={ 103 | "GCS_BUCKET": gcs_bucket, 104 | "GCS_ROOT_TRAINING": gcs_root_training, 105 | "CLOUD_SERVICE": cloud_service, 106 | "CUSTOMS_TRAINING_ALLOW_OVERWRITE": customs_training_allow_overwrite, 107 | "GCLOUD_PROJECT": gcloud_project, 108 | "GCS_REPORT_BUCKET": gcs_report_bucket, 109 | }, 110 | email=[ 111 | "dzeber@mozilla.com", 112 | "gleonard@mozilla.com", 113 | ], 114 | ) 115 | new_data_eval = GKEPodOperator( 116 | task_id="evaluate_new_data", 117 | cmds=[ 118 | "/bin/bash", 119 | ], 120 | arguments=[ 121 | "bin/evaluate_new_data", 122 | "--publish", 123 | "--publish-as-latest", 124 | "./working", 125 | ], 126 | image="us-west1-docker.pkg.dev/moz-fx-data-airflow-prod-88e0/data-science-artifacts/mad-server:latest", 127 | startup_timeout_seconds=500, 128 | gcp_conn_id="google_cloud_airflow_gke", 129 | env_vars={ 130 | "GCS_BUCKET": gcs_bucket, 131 | "GCS_ROOT_TRAINING": gcs_root_training, 132 | "CLOUD_SERVICE": cloud_service, 133 | "CUSTOMS_TRAINING_ALLOW_OVERWRITE": customs_training_allow_overwrite, 134 | "GCLOUD_PROJECT": gcloud_project, 135 | "GCS_REPORT_BUCKET": gcs_report_bucket, 136 | }, 137 | email=[ 138 | "dzeber@mozilla.com", 139 | "gleonard@mozilla.com", 140 | ], 141 | ) 142 | 143 | mad_server_pull >> mad_train_model >> new_data_eval 144 | -------------------------------------------------------------------------------- /dags/microsoft_store.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from airflow import DAG 4 | from airflow.providers.cncf.kubernetes.secret import Secret 5 | 6 | from operators.gcp_container_operator import GKEPodOperator 7 | from utils.tags import Tag 8 | 9 | # Deploy the Microsoft Store keys stored in the k8s secret `airflow-gke-secrets` as environment variables.
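# Each Secret below uses deploy_type="env", so the referenced key of that k8s secret
# is exposed inside the task pod as the environment variable named by deploy_target.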
10 | 11 | microsoft_client_id = Secret( 12 | deploy_type="env", 13 | deploy_target="MICROSOFT_CLIENT_ID", 14 | secret="airflow-gke-secrets", 15 | key="MICROSOFT_CLIENT_ID", 16 | ) 17 | microsoft_client_secret = Secret( 18 | deploy_type="env", 19 | deploy_target="MICROSOFT_CLIENT_SECRET", 20 | secret="airflow-gke-secrets", 21 | key="MICROSOFT_CLIENT_SECRET", 22 | ) 23 | microsoft_tenant_id = Secret( 24 | deploy_type="env", 25 | deploy_target="MICROSOFT_TENANT_ID", 26 | secret="airflow-gke-secrets", 27 | key="MICROSOFT_TENANT_ID", 28 | ) 29 | microsoft_store_app_list = Secret( 30 | deploy_type="env", 31 | deploy_target="MICROSOFT_STORE_APP_LIST", 32 | secret="airflow-gke-secrets", 33 | key="MICROSOFT_STORE_APP_LIST", 34 | ) 35 | 36 | docs = """ 37 | This DAG runs the daily download of aggregated data from the Microsoft Store API. 38 | #### Owner 39 | mhirose@mozilla.com 40 | #### Tags 41 | * impact/tier_2 42 | * repo/bigquery-etl 43 | """ 44 | 45 | default_args = { 46 | "owner": "mhirose@mozilla.com", 47 | "start_date": datetime.datetime(2024, 6, 18, 0, 0), 48 | "end_date": None, 49 | "email": ["telemetry-alerts@mozilla.com", "mhirose@mozilla.com"], 50 | "depends_on_past": False, 51 | "retry_delay": datetime.timedelta(seconds=1800), 52 | "email_on_failure": True, 53 | "email_on_retry": True, 54 | "retries": 2, 55 | } 56 | 57 | tags = [Tag.ImpactTier.tier_2] 58 | 59 | # Have the DAG run later in the day so Microsoft Store data has a chance to populate 60 | with DAG( 61 | "microsoft_store", 62 | default_args=default_args, 63 | schedule_interval="0 15 * * *", 64 | doc_md=docs, 65 | tags=tags, 66 | ) as dag: 67 | table_names = ( 68 | "app_acquisitions", 69 | "app_conversions", 70 | "app_installs", 71 | ) 72 | for table in table_names: 73 | GKEPodOperator( 74 | task_id=f"microsoft_derived__{table}__v1", 75 | secrets=[ 76 | microsoft_client_id, 77 | microsoft_client_secret, 78 | microsoft_tenant_id, 79 | microsoft_store_app_list, 80 | ], 81 | arguments=[ 82 | "python", 83 | f"sql/moz-fx-data-shared-prod/microsoft_derived/{table}_v1/query.py", 84 | "--date={{ macros.ds_add(ds, -3) }}", 85 | ], 86 | image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest", 87 | owner="mhirose@mozilla.com", 88 | email=["mhirose@mozilla.com", "telemetry-alerts@mozilla.com"], 89 | ) 90 | -------------------------------------------------------------------------------- /dags/operational_monitoring.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.sensors.external_task import ExternalTaskSensor 5 | 6 | from operators.gcp_container_operator import GKEPodOperator 7 | from utils.tags import Tag 8 | 9 | docs = """ 10 | ### operational_monitoring 11 | 12 | 13 | This DAG schedules queries for populating datasets used for operational monitoring. 14 | The queries are generated via [`opmon`](https://github.com/mozilla/opmon). 
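Each scheduled run invokes the opmon container with `--log_to_bigquery run --date={{ ds }}` and only starts once the upstream `clients_daily` and `search_clients_daily` tables have landed (see the sensors defined below).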
15 | 16 | #### Owner 17 | 18 | ascholtz@mozilla.com 19 | """ 20 | 21 | default_args = { 22 | "owner": "ascholtz@mozilla.com", 23 | "email": [ 24 | "telemetry-alerts@mozilla.com", 25 | "ascholtz@mozilla.com", 26 | ], 27 | "depends_on_past": False, 28 | "start_date": datetime(2021, 6, 3), 29 | "email_on_failure": True, 30 | "email_on_retry": True, 31 | "retries": 2, 32 | "retry_delay": timedelta(minutes=30), 33 | } 34 | 35 | DAG_NAME = "operational_monitoring" 36 | tags = [Tag.ImpactTier.tier_3] 37 | 38 | with DAG( 39 | DAG_NAME, 40 | default_args=default_args, 41 | schedule_interval="0 4 * * *", 42 | doc_md=docs, 43 | tags=tags, 44 | ) as dag: 45 | # Built from repo https://github.com/mozilla/opmon 46 | opmon_image = "gcr.io/moz-fx-data-experiments/opmon:latest" 47 | 48 | opmon_run = GKEPodOperator( 49 | task_id="opmon_run", 50 | name="opmon_run", 51 | image=opmon_image, 52 | email=["ascholtz@mozilla.com"], 53 | arguments=[ 54 | "--log_to_bigquery", 55 | "run", 56 | "--date={{ ds }}", 57 | ], 58 | dag=dag, 59 | ) 60 | 61 | wait_for_clients_daily_export = ExternalTaskSensor( 62 | task_id="wait_for_clients_daily", 63 | external_dag_id="bqetl_main_summary", 64 | external_task_id="telemetry_derived__clients_daily__v6", 65 | execution_delta=timedelta(hours=2), 66 | mode="reschedule", 67 | allowed_states=["success"], 68 | failed_states=["failed", "upstream_failed", "skipped"], 69 | pool="DATA_ENG_EXTERNALTASKSENSOR", 70 | email_on_retry=False, 71 | dag=dag, 72 | ) 73 | 74 | wait_for_search_clients_daily = ExternalTaskSensor( 75 | task_id="wait_for_search_clients_daily", 76 | external_dag_id="bqetl_search", 77 | external_task_id="search_derived__search_clients_daily__v8", 78 | execution_delta=timedelta(hours=1), 79 | mode="reschedule", 80 | allowed_states=["success"], 81 | failed_states=["failed", "upstream_failed", "skipped"], 82 | pool="DATA_ENG_EXTERNALTASKSENSOR", 83 | email_on_retry=False, 84 | dag=dag, 85 | ) 86 | 87 | opmon_run.set_upstream( 88 | [ 89 | wait_for_clients_daily_export, 90 | wait_for_search_clients_daily, 91 | ] 92 | ) 93 | -------------------------------------------------------------------------------- /dags/operational_monitoring_backfill.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from airflow.decorators import dag, task 4 | from airflow.models.param import Param 5 | 6 | from operators.gcp_container_operator import GKEPodOperator 7 | from utils.tags import Tag 8 | 9 | docs = """ 10 | ### operational_monitoring_backfill 11 | Built from the telemetry-airflow repo, [dags/operational_monitoring_backfill.py](https://github.com/mozilla/telemetry-airflow/blob/main/dags/operational_monitoring_backfill.py) 12 | Triggers backfills for specific operational monitoring projects.
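For example, triggering this DAG with slug `my-opmon-project` (a placeholder) and a date range of 2024-01-01 to 2024-01-07 makes the backfill pod run roughly `backfill --slug my-opmon-project --start-date 2024-01-01 --end_date 2024-01-07` (see `generate_backfill_arguments` below).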
13 | 14 | #### Owner 15 | 16 | ascholtz@mozilla.com 17 | lschiestl@mozilla.com 18 | """ 19 | 20 | tags = [Tag.ImpactTier.tier_3, Tag.Triage.no_triage] 21 | 22 | 23 | @dag( 24 | dag_id="operational_monitoring_backfill", 25 | start_date=datetime.datetime(2021, 1, 1, 0, 0), 26 | schedule_interval=None, 27 | catchup=False, 28 | doc_md=docs, 29 | dagrun_timeout=datetime.timedelta(days=4), 30 | tags=tags, 31 | render_template_as_native_obj=True, 32 | params={ 33 | "slug": Param( 34 | "slug", 35 | title="Slug", 36 | type="string", 37 | description="[Required] Experimenter slug or slug of OpMon project to (re)run the analysis for", 38 | ), 39 | "start_date": Param( 40 | f"{datetime.date.today()}", 41 | title="Start Date", 42 | type="string", 43 | format="date", 44 | description="[Required] First date to be backfilled, inclusive", 45 | ), 46 | "end_date": Param( 47 | f"{datetime.date.today()}", 48 | title="End Date", 49 | type="string", 50 | format="date", 51 | description="[Required] Last date to be backfilled, inclusive", 52 | ), 53 | "args": Param( 54 | None, 55 | title="Additional Arguments", 56 | type=["null", "string"], 57 | description="[Optional] Additional command line arguments", 58 | ), 59 | }, 60 | ) 61 | def operational_monitoring_backfill_dag(): 62 | @task 63 | def generate_backfill_arguments(**context): 64 | cmd = [ 65 | "backfill", 66 | "--slug", 67 | context["params"]["slug"], 68 | "--start-date", 69 | context["params"]["start_date"], 70 | "--end_date", 71 | context["params"]["end_date"], 72 | ] 73 | 74 | if args := context["params"]["args"]: 75 | cmd.append(args) 76 | 77 | return cmd 78 | 79 | # Built from repo https://github.com/mozilla/opmon 80 | opmon_image = "gcr.io/moz-fx-data-experiments/opmon:latest" 81 | 82 | GKEPodOperator( 83 | task_id="opmon_backfill", 84 | name="opmon_backfill", 85 | image=opmon_image, 86 | arguments=generate_backfill_arguments(), 87 | ) 88 | 89 | 90 | dag = operational_monitoring_backfill_dag() 91 | -------------------------------------------------------------------------------- /dags/partybal.py: -------------------------------------------------------------------------------- 1 | """ 2 | DAG to schedule generation of results for partybal. 3 | 4 | Partybal is an experimental service to visualize experiment results that have been 5 | produced by [jetstream](https://github.com/mozilla/jetstream). 6 | See https://github.com/mozilla/partybal 7 | 8 | This DAG depends on experiment results being available for a certain date. 9 | So if the [jetstream DAG](https://workflow.telemetry.mozilla.org/tree?dag_id=jetstream) 10 | does not successfully complete running, then the tasks in this DAG will fail as well. 11 | 12 | The DAG is scheduled to run every three hours to pick up experiment results from manually 13 | triggered analysis runs quickly. 14 | 15 | *Triage notes* 16 | 17 | As long as the most recent DAG run is successful this job can be considered healthy. 18 | In such case, past DAG failures can be ignored. 
19 | """ 20 | 21 | from datetime import datetime, timedelta 22 | 23 | from airflow import DAG 24 | 25 | from operators.gcp_container_operator import GKEPodOperator 26 | from utils.tags import Tag 27 | 28 | default_args = { 29 | "owner": "ascholtz@mozilla.com", 30 | "email": [ 31 | "ascholtz@mozilla.com", 32 | "mwilliams@mozilla.com", 33 | ], 34 | "depends_on_past": False, 35 | "start_date": datetime(2021, 6, 21), 36 | "email_on_failure": True, 37 | "email_on_retry": True, 38 | "retries": 2, 39 | "retry_delay": timedelta(minutes=30), 40 | } 41 | 42 | tags = [Tag.ImpactTier.tier_2] 43 | 44 | with DAG( 45 | "partybal", 46 | default_args=default_args, 47 | schedule_interval="0 */3 * * *", 48 | doc_md=__doc__, 49 | tags=tags, 50 | ) as dag: 51 | # Built from repo https://github.com/mozilla/partybal 52 | partybal_image = "gcr.io/moz-fx-data-experiments/partybal:latest" 53 | 54 | partybal = GKEPodOperator( 55 | task_id="partybal", 56 | name="partybal", 57 | image=partybal_image, 58 | email=[ 59 | "ascholtz@mozilla.com", 60 | "mwilliams@mozilla.com", 61 | ], 62 | dag=dag, 63 | ) 64 | -------------------------------------------------------------------------------- /dags/play_store_export.py: -------------------------------------------------------------------------------- 1 | """ 2 | Runs a Docker image that backfills data from the Google Play store to BigQuery. 3 | 4 | The container is defined in 5 | [docker-etl](https://github.com/mozilla/docker-etl/tree/main/jobs/play-store-export) 6 | """ 7 | 8 | from datetime import datetime, timedelta 9 | 10 | from airflow import DAG 11 | 12 | from operators.gcp_container_operator import GKEPodOperator 13 | from utils.tags import Tag 14 | 15 | default_args = { 16 | "owner": "akomar@mozilla.com", 17 | "depends_on_past": False, 18 | "start_date": datetime(2020, 6, 23), 19 | "email_on_failure": True, 20 | "email_on_retry": True, 21 | "retries": 1, 22 | "retry_delay": timedelta(minutes=30), 23 | } 24 | 25 | project_id = "moz-fx-data-marketing-prod" 26 | 27 | tags = [Tag.ImpactTier.tier_3] 28 | 29 | with DAG( 30 | "play_store_export", 31 | default_args=default_args, 32 | doc_md=__doc__, 33 | schedule_interval="@daily", 34 | tags=tags, 35 | ) as dag: 36 | play_store_export = GKEPodOperator( 37 | task_id="play_store_export", 38 | arguments=[ 39 | "python", 40 | "play_store_export/export.py", 41 | "--date={{ yesterday_ds }}", 42 | "--backfill-day-count=60", 43 | "--project", 44 | project_id, 45 | "--transfer-config={{ var.value.play_store_transfer_config_id }}", 46 | ], 47 | image="gcr.io/moz-fx-data-airflow-prod-88e0/play-store-export:latest", 48 | gcp_conn_id="google_cloud_airflow_gke", 49 | dag=dag, 50 | email=[ 51 | "akomar@mozilla.com", 52 | ], 53 | ) 54 | -------------------------------------------------------------------------------- /dags/publish_bqetl_static.py: -------------------------------------------------------------------------------- 1 | """ 2 | Daily deployment of static bigquery-etl data to various projects. 3 | 4 | See the publish command [here](https://github.com/mozilla/bigquery-etl/blob/main/bigquery_etl/static/__init__.py). 
5 | """ 6 | 7 | from datetime import datetime, timedelta 8 | 9 | from airflow import DAG 10 | 11 | from operators.gcp_container_operator import GKEPodOperator 12 | from utils.tags import Tag 13 | 14 | IMAGE = "gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest" 15 | 16 | default_args = { 17 | "owner": "anicholson@mozilla.com", 18 | "email": [ 19 | "telemetry-alerts@mozilla.com", 20 | "anicholson@mozilla.com", 21 | ], 22 | "depends_on_past": False, 23 | "start_date": datetime(2022, 4, 4), 24 | "email_on_failure": True, 25 | "email_on_retry": True, 26 | "retries": 2, 27 | "retry_delay": timedelta(minutes=30), 28 | } 29 | 30 | tags = [Tag.ImpactTier.tier_2] 31 | 32 | with DAG( 33 | "publish_bqetl_static", 34 | default_args=default_args, 35 | schedule_interval="@daily", 36 | doc_md=__doc__, 37 | tags=tags, 38 | ) as dag: 39 | publish_static_mozdata = GKEPodOperator( 40 | task_id="publish_static_mozdata", 41 | arguments=["script/bqetl", "static", "publish", "--project_id", "mozdata"], 42 | image=IMAGE, 43 | ) 44 | 45 | publish_static_shared_prod = GKEPodOperator( 46 | task_id="publish_static_shared_prod", 47 | arguments=[ 48 | "script/bqetl", 49 | "static", 50 | "publish", 51 | "--project_id", 52 | "moz-fx-data-shared-prod", 53 | ], 54 | image=IMAGE, 55 | ) 56 | -------------------------------------------------------------------------------- /dags/search_alert.py: -------------------------------------------------------------------------------- 1 | """ 2 | Runs a Docker image that produces search alert data. 3 | 4 | The container is defined in 5 | [docker-etl](https://github.com/mozilla/docker-etl/tree/main/jobs/search-alert) 6 | """ 7 | 8 | from datetime import datetime, timedelta 9 | 10 | from airflow import DAG 11 | from airflow.sensors.external_task import ExternalTaskSensor 12 | 13 | from operators.gcp_container_operator import GKEPodOperator 14 | from utils.constants import ALLOWED_STATES, FAILED_STATES 15 | from utils.tags import Tag 16 | 17 | default_args = { 18 | "owner": "akomar@mozilla.com", 19 | "depends_on_past": False, 20 | "start_date": datetime(2022, 1, 20), 21 | "email": [ 22 | "telemetry-alerts@mozilla.com", 23 | "akomar@mozilla.com", 24 | ], 25 | "email_on_failure": True, 26 | "email_on_retry": True, 27 | "retries": 3, 28 | "retry_delay": timedelta(minutes=30), 29 | } 30 | 31 | tags = [Tag.ImpactTier.tier_2] 32 | 33 | with DAG( 34 | "search_alert", 35 | default_args=default_args, 36 | doc_md=__doc__, 37 | schedule_interval="0 4 * * *", 38 | # We don't want to run more than a single instance of this DAG 39 | # since underlying tables are not partitioned 40 | max_active_runs=1, 41 | tags=tags, 42 | ) as dag: 43 | wait_for_search_aggregates = ExternalTaskSensor( 44 | task_id="wait_for_search_aggregates", 45 | external_dag_id="bqetl_search", 46 | external_task_id="search_derived__search_aggregates__v8", 47 | execution_delta=timedelta(hours=1), 48 | check_existence=True, 49 | mode="reschedule", 50 | allowed_states=ALLOWED_STATES, 51 | failed_states=FAILED_STATES, 52 | pool="DATA_ENG_EXTERNALTASKSENSOR", 53 | email_on_retry=False, 54 | dag=dag, 55 | ) 56 | 57 | search_alert = GKEPodOperator( 58 | task_id="search_alert", 59 | arguments=[ 60 | "python", 61 | "search_alert/main.py", 62 | "--submission_date={{ ds }}", 63 | "--project_id=mozdata", 64 | ], 65 | image="gcr.io/moz-fx-data-airflow-prod-88e0/search-alert_docker_etl:latest", 66 | gcp_conn_id="google_cloud_airflow_gke", 67 | ) 68 | 69 | wait_for_search_aggregates >> search_alert 70 | 
-------------------------------------------------------------------------------- /dags/search_forecasting.py: -------------------------------------------------------------------------------- 1 | """ 2 | See [kpi-forecasting in the docker-etl repository](https://github.com/mozilla/docker-etl/blob/main/jobs/kpi-forecasting). 3 | 4 | This DAG runs the search forecasts for the DAU, search count and ad clicks metrics . 5 | 6 | This DAG is high priority for week 1 of the month and low priority otherwise. 7 | """ 8 | 9 | import os 10 | from datetime import datetime, timedelta 11 | 12 | from airflow import DAG 13 | from airflow.sensors.external_task import ExternalTaskSensor 14 | 15 | from operators.gcp_container_operator import GKEPodOperator 16 | from utils.constants import ALLOWED_STATES, FAILED_STATES 17 | from utils.tags import Tag 18 | 19 | default_args = { 20 | "owner": "jsnyder@mozilla.com", 21 | "email": [ 22 | "jsnyder@mozilla.com", 23 | "mbowerman@mozilla.com", 24 | "telemetry-alerts@mozilla.com", 25 | ], 26 | "depends_on_past": False, 27 | "start_date": datetime(2024, 7, 6), 28 | "email_on_failure": True, 29 | "email_on_retry": False, 30 | "retries": 2, 31 | "retry_delay": timedelta(minutes=30), 32 | } 33 | 34 | TAGS = [Tag.ImpactTier.tier_1] 35 | IMAGE = "gcr.io/moz-fx-data-airflow-prod-88e0/kpi-forecasting_docker_etl:latest" 36 | 37 | FORECAST_METRICS_LIST = [ 38 | "search_forecasting_daily_active_users", 39 | "search_forecasting_search_count", 40 | "search_forecasting_ad_clicks", 41 | ] 42 | 43 | # schedule to run after bqetl_search_dashboard completes 44 | with DAG( 45 | "search_forecasting", 46 | default_args=default_args, 47 | schedule_interval="30 5 7 * *", 48 | doc_md=__doc__, 49 | tags=TAGS, 50 | ) as dag: 51 | # all the search forecasting metrics come from the search_revenue_levers_daily 52 | # table which is run in the bqetl_search_dashboard dag 53 | # as the search_derived__search_revenue_levers_daily__v1 task 54 | # see: https://workflow.telemetry.mozilla.org/dags/bqetl_search_dashboard/grid 55 | wait_task_sensor = ExternalTaskSensor( 56 | task_id="wait_for_search_dashboard", 57 | external_dag_id="bqetl_search_dashboard", 58 | external_task_id="search_derived__search_revenue_levers_daily__v1", 59 | check_existence=True, 60 | mode="reschedule", 61 | allowed_states=ALLOWED_STATES, 62 | failed_states=FAILED_STATES, 63 | pool="DATA_ENG_EXTERNALTASKSENSOR", 64 | ) 65 | 66 | for metric in FORECAST_METRICS_LIST: 67 | # pass the search_forecasting configs to the KPI forecasting script 68 | config_filename = f"{metric}.yaml" 69 | script_path = os.path.join(".", "kpi_forecasting.py") 70 | config_path = os.path.join("kpi_forecasting", "configs", config_filename) 71 | 72 | forecast_task = GKEPodOperator( 73 | task_id=f"search_forecasting_{metric}", 74 | arguments=["python", script_path, "-c", config_path], 75 | image=IMAGE, 76 | ) 77 | 78 | wait_task_sensor >> forecast_task 79 | -------------------------------------------------------------------------------- /dags/shredder_backfill.py: -------------------------------------------------------------------------------- 1 | from datetime import date, datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models.param import Param 5 | from airflow.operators.python import BranchPythonOperator 6 | 7 | from operators.gcp_container_operator import GKEPodOperator, OnFinishAction 8 | from utils.tags import Tag 9 | 10 | docs = """ 11 | ### shredder-backfill 12 | 13 | #### Description 14 | 15 | Manually triggered DAG that handles 
deletion requests from a specified time period 16 | for a list of given tables. 17 | 18 | `target_tables` is a list of tables formatted as `dataset.table_name` with one table per line. 19 | The moz-fx-data-shared-prod project is assumed because shredder currently only runs 20 | on tables in this project. 21 | 22 | Use the dry run parameter to run shredder with the --dry-run option and validate the parameters. 23 | Note that the shredder dry run will still dry run queries against every partition of each table 24 | so it may take a long time to finish if a lot of tables are given. 25 | 26 | This DAG is meant to be used to handle older deletion requests for tables that are already being 27 | shredded. Any provided tables that aren't already valid deletion targets will be ignored. 28 | 29 | #### Owner 30 | 31 | bewu@mozilla.com 32 | """ 33 | 34 | params = { 35 | "request_start_date": Param( 36 | default=(date.today() - timedelta(days=7)).isoformat(), 37 | description="First date of deletion requests to process", 38 | type="string", 39 | format="date", 40 | ), 41 | "request_end_date": Param( 42 | default=(date.today()).isoformat(), 43 | description="Last date of data (i.e. partition) to delete from", 44 | type="string", 45 | format="date", 46 | ), 47 | "target_tables": Param( 48 | default=["dataset.table_name"], 49 | description="Tables to delete from (one per line)", 50 | type="array", 51 | minItems=1, 52 | ), 53 | "dry_run": Param(default=True, type="boolean"), 54 | } 55 | 56 | default_args = { 57 | "owner": "bewu@mozilla.com", 58 | "depends_on_past": False, 59 | "start_date": datetime(2024, 3, 1), 60 | "catchup": False, 61 | "email": [ 62 | "telemetry-alerts@mozilla.com", 63 | "bewu@mozilla.com", 64 | ], 65 | "email_on_failure": True, 66 | "email_on_retry": False, 67 | # transient failures are expected and can be handled with the state table 68 | "retries": 44, 69 | "retry_delay": timedelta(minutes=5), 70 | } 71 | 72 | tags = [ 73 | Tag.ImpactTier.tier_3, 74 | Tag.Triage.no_triage, 75 | ] 76 | 77 | NON_DRY_RUN_TASK_ID = "shredder_backfill" 78 | DRY_RUN_TASK_ID = "shredder_backfill_dry_run" 79 | 80 | 81 | def base_backfill_operator(dry_run): 82 | """Create task for backfill, filling out parameters based on dry run.""" 83 | return GKEPodOperator( 84 | task_id=DRY_RUN_TASK_ID if dry_run else NON_DRY_RUN_TASK_ID, 85 | cmds=[ 86 | "script/shredder_delete", 87 | *(["--dry-run"] if dry_run else []), 88 | # use different tables from scheduled task so they can be monitored separately 89 | "--state-table=moz-fx-data-shredder.shredder_state.shredder_state_backfill", 90 | "--task-table=moz-fx-data-shredder.shredder_state.tasks_backfill", 91 | "--end-date={{ params.request_end_date }}", 92 | "--start-date={{ params.request_start_date }}", 93 | "--no-use-dml", 94 | # low parallelism to reduce slot contention with scheduled task 95 | "--parallelism=1", 96 | "--billing-project=moz-fx-data-bq-batch-prod", 97 | "--only", 98 | ], 99 | # target_tables will be rendered as a python list 100 | arguments="{{ params.target_tables }}", 101 | image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest", 102 | on_finish_action=OnFinishAction.DELETE_POD.value, 103 | reattach_on_restart=True, 104 | ) 105 | 106 | 107 | with DAG( 108 | "shredder_backfill", 109 | default_args=default_args, 110 | schedule=None, 111 | doc_md=docs, 112 | tags=tags, 113 | params=params, 114 | # needed to pass the list of tables as a list to the pod operator 115 | render_template_as_native_obj=True, 116 | ) as dag: 117 | # Use separate tasks for dry run
to make logs easier to find 118 | dry_run_branch = BranchPythonOperator( 119 | task_id="dry_run_branch", 120 | python_callable=lambda dry_run: ( 121 | DRY_RUN_TASK_ID if dry_run else NON_DRY_RUN_TASK_ID 122 | ), 123 | op_kwargs={"dry_run": "{{ params.dry_run }}"}, 124 | ) 125 | 126 | backfill_tasks = [ 127 | base_backfill_operator(dry_run_value) for dry_run_value in (True, False) 128 | ] 129 | 130 | dry_run_branch >> backfill_tasks 131 | -------------------------------------------------------------------------------- /dags/socorro_import.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.subdag import SubDagOperator 5 | from airflow.sensors.external_task import ExternalTaskMarker 6 | from airflow.utils.task_group import TaskGroup 7 | 8 | from operators.gcp_container_operator import GKEPodOperator 9 | from utils.dataproc import moz_dataproc_pyspark_runner 10 | from utils.tags import Tag 11 | 12 | """ 13 | This uses dataproc to rewrite the data to parquet in gcs, and 14 | load the parquet data into bigquery. 15 | 16 | The following WTMO connections are needed in order for this job to run: 17 | conn - google_cloud_airflow_dataproc 18 | conn - google_cloud_airflow_gke 19 | """ 20 | 21 | default_args = { 22 | "owner": "srose@mozilla.com", 23 | "depends_on_past": False, 24 | "start_date": datetime(2019, 9, 10), 25 | "email": [ 26 | "srose@mozilla.com", 27 | "telemetry-alerts@mozilla.com", 28 | ], 29 | "email_on_failure": True, 30 | "email_on_retry": True, 31 | "retries": 2, 32 | "retry_delay": timedelta(minutes=30), 33 | } 34 | 35 | tags = [Tag.ImpactTier.tier_2] 36 | 37 | with DAG( 38 | "socorro_import", 39 | default_args=default_args, 40 | schedule_interval="@daily", 41 | tags=tags, 42 | ) as dag: 43 | # Unsalted cluster name so subsequent runs fail if the cluster name exists 44 | cluster_name = "socorro-import-dataproc-cluster" 45 | 46 | # Defined in Airflow's UI -> Admin -> Connections 47 | gcp_conn_id = "google_cloud_airflow_dataproc" 48 | project_id = "airflow-dataproc" 49 | 50 | # We use an application-specific gcs bucket because the data needs to be transformed 51 | # in dataproc before loading 52 | 53 | gcs_data_bucket = "moz-fx-data-prod-socorro-data" 54 | 55 | dataset = "socorro_crash" 56 | dataset_version = "v2" 57 | date_submission_col = "crash_date" 58 | 59 | objects_prefix = "{}/{}/{}={}".format( 60 | dataset, dataset_version, date_submission_col, "{{ ds_nodash }}" 61 | ) 62 | 63 | # Spark job reads gcs json and writes gcs parquet 64 | crash_report_parquet = SubDagOperator( 65 | task_id="crash_report_parquet", 66 | subdag=moz_dataproc_pyspark_runner( 67 | parent_dag_name=dag.dag_id, 68 | dag_name="crash_report_parquet", 69 | default_args=default_args, 70 | cluster_name=cluster_name, 71 | job_name="Socorro_Crash_Reports_to_Parquet", 72 | python_driver_code="gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/socorro_import_crash_data.py", 73 | py_args=[ 74 | "--date", 75 | "{{ ds_nodash }}", 76 | "--source-gcs-path", 77 | "gs://moz-fx-socorro-prod-prod-telemetry/v1/crash_report", 78 | "--dest-gcs-path", 79 | f"gs://{gcs_data_bucket}/{dataset}", 80 | ], 81 | idle_delete_ttl=14400, 82 | num_workers=8, 83 | worker_machine_type="n1-standard-8", 84 | gcp_conn_id=gcp_conn_id, 85 | ), 86 | ) 87 | 88 | bq_gcp_conn_id = "google_cloud_airflow_gke" 89 | 90 | # Not using load_to_bigquery since our source data is on GCS. 
91 | # We do use the parquet2bigquery container to load gcs parquet into bq though. 92 | bq_dataset = "telemetry_derived" 93 | bq_table_name = f"{dataset}_{dataset_version}" 94 | 95 | # This image was manually built from 96 | # https://github.com/mozilla/parquet2bigquery/commit/6bf1f86076de8939ba2c4d008080d6c159a0a093 97 | # using python:3.7.4-slim-buster 98 | docker_image = "gcr.io/moz-fx-data-airflow-prod-88e0/parquet2bigquery:20190722" 99 | 100 | gke_args = [ 101 | "--dataset", 102 | bq_dataset, 103 | "--concurrency", 104 | "10", 105 | "--bucket", 106 | gcs_data_bucket, 107 | "--no-resume", 108 | "--prefix", 109 | objects_prefix, 110 | "--cluster-by", 111 | "crash_date", 112 | ] 113 | 114 | # We remove the current date partition for idempotency. 115 | table_name = "{}:{}.{}${{{{ds_nodash}}}}".format( 116 | "{{ var.value.gcp_shared_prod_project }}", bq_dataset, bq_table_name 117 | ) 118 | 119 | remove_bq_table_partition = GKEPodOperator( 120 | task_id="remove_socorro_crash_bq_table_partition", 121 | gcp_conn_id=bq_gcp_conn_id, 122 | name="remove_socorro_crash_bq_table_partition", 123 | image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest", 124 | arguments=["bq", "rm", "-f", "--table", table_name], 125 | ) 126 | 127 | bq_load = GKEPodOperator( 128 | task_id="bigquery_load", 129 | gcp_conn_id=bq_gcp_conn_id, 130 | name="load-socorro-crash-parquet-to-bq", 131 | image=docker_image, 132 | arguments=gke_args, 133 | env_vars={"GOOGLE_CLOUD_PROJECT": "{{ var.value.gcp_shared_prod_project }}"}, 134 | ) 135 | 136 | with TaskGroup("socorro_external") as socorro_external: 137 | ExternalTaskMarker( 138 | task_id="crash_symbolication__wait_for_socorro_import", 139 | external_dag_id="crash_symbolication", 140 | external_task_id="wait_for_socorro_import", 141 | execution_date="{{ execution_date.replace(hour=5, minute=0).isoformat() }}", 142 | ) 143 | 144 | bq_load >> socorro_external 145 | 146 | crash_report_parquet >> remove_bq_table_partition >> bq_load 147 | -------------------------------------------------------------------------------- /dags/update_orphaning_dashboard_etl.py: -------------------------------------------------------------------------------- 1 | """ 2 | Powers https://telemetry.mozilla.org/update-orphaning/. 3 | 4 | See [jobs/update_orphaning_dashboard_etl.py](https://github.com/mozilla/telemetry-airflow/blob/main/jobs/update_orphaning_dashboard_etl.py). 
5 | """ 6 | 7 | from datetime import datetime, timedelta 8 | 9 | from airflow import DAG 10 | from airflow.operators.subdag import SubDagOperator 11 | 12 | from utils.constants import DS_WEEKLY 13 | from utils.dataproc import moz_dataproc_pyspark_runner 14 | from utils.tags import Tag 15 | 16 | """ 17 | 18 | The following WTMO connections are needed in order for this job to run: 19 | conn - google_cloud_airflow_dataproc 20 | conn - aws_dev_telemetry_public_analysis_2_rw 21 | """ 22 | 23 | default_args = { 24 | "owner": "akomar@mozilla.com", 25 | "depends_on_past": False, 26 | "start_date": datetime(2019, 10, 12), 27 | "email": [ 28 | "telemetry-alerts@mozilla.com", 29 | "ahabibi@mozilla.com", 30 | "rsteuber@mozilla.com", 31 | "akomar@mozilla.com", 32 | ], 33 | "email_on_failure": True, 34 | "email_on_retry": True, 35 | "retries": 2, 36 | "retry_delay": timedelta(minutes=10), 37 | } 38 | 39 | tags = [Tag.ImpactTier.tier_3] 40 | 41 | # run every Monday to maintain compatibility with legacy ATMO schedule 42 | dag = DAG( 43 | "update_orphaning_dashboard_etl", 44 | default_args=default_args, 45 | schedule_interval="0 2 * * MON", 46 | doc_md=__doc__, 47 | tags=tags, 48 | ) 49 | 50 | # Unsalted cluster name so subsequent runs fail if the cluster name exists 51 | cluster_name = "app-update-out-of-date-dataproc-cluster" 52 | 53 | # Defined in Airflow's UI -> Admin -> Connections 54 | gcp_conn_id = "google_cloud_airflow_dataproc" 55 | 56 | SubDagOperator( 57 | task_id="update_orphaning_dashboard_etl", 58 | dag=dag, 59 | subdag=moz_dataproc_pyspark_runner( 60 | parent_dag_name=dag.dag_id, 61 | dag_name="update_orphaning_dashboard_etl", 62 | default_args=default_args, 63 | cluster_name=cluster_name, 64 | job_name="update_orphaning_dashboard_etl", 65 | python_driver_code="gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/update_orphaning_dashboard_etl.py", 66 | init_actions_uris=[ 67 | "gs://dataproc-initialization-actions/python/pip-install.sh" 68 | ], 69 | additional_metadata={ 70 | "PIP_PACKAGES": "google-cloud-bigquery==1.20.0 google-cloud-storage==1.19.1 boto3==1.9.253" 71 | }, 72 | additional_properties={ 73 | "spark:spark.jars.packages": "org.apache.spark:spark-avro_2.11:2.4.3" 74 | }, 75 | py_args=[ 76 | "--run-date", 77 | DS_WEEKLY, 78 | "--gcs-bucket", 79 | "mozdata-analysis", 80 | "--gcs-prefix", 81 | "update-orphaning-airflow", 82 | "--gcs-output-bucket", 83 | "moz-fx-data-static-websit-8565-analysis-output", 84 | "--gcs-output-path", 85 | "app-update/data/out-of-date/", 86 | ], 87 | idle_delete_ttl=14400, 88 | num_workers=20, 89 | worker_machine_type="n1-standard-8", 90 | gcp_conn_id=gcp_conn_id, 91 | ), 92 | ) 93 | -------------------------------------------------------------------------------- /dags/webcompat_kb.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from airflow import DAG 4 | from airflow.providers.cncf.kubernetes.secret import Secret 5 | 6 | from operators.gcp_container_operator import GKEPodOperator 7 | from utils.tags import Tag 8 | 9 | DOCS = """ 10 | ### Bugzilla to BigQuery import 11 | 12 | #### Description 13 | 14 | Runs a Docker image that fetches bugzilla bugs from 15 | Web Compatibility > Knowledge Base component, as well as their core 16 | bugs dependencies and breakage reports and stores them in BQ. 
17 | 18 | The container is defined in 19 | [docker-etl](https://github.com/mozilla/docker-etl/tree/main/jobs/webcompat-kb) 20 | 21 | *Triage notes* 22 | 23 | As long as the most recent DAG run is successful this job doesn't need to be triaged. 24 | 25 | #### Owner 26 | 27 | kberezina@mozilla.com 28 | """ 29 | 30 | default_args = { 31 | "owner": "kberezina@mozilla.com", 32 | "email": ["kberezina@mozilla.com", "webcompat-internal@mozilla.org"], 33 | "depends_on_past": False, 34 | "start_date": datetime(2023, 9, 26), 35 | "email_on_failure": True, 36 | } 37 | 38 | 39 | tags = [ 40 | Tag.ImpactTier.tier_2, 41 | ] 42 | 43 | every_fifteen_minutes = "*/15 * * * *" 44 | 45 | bugzilla_token = Secret( 46 | deploy_type="env", 47 | deploy_target="BUGZILLA_API_KEY", 48 | secret="airflow-gke-secrets", 49 | key="webcompat_kb_secret__bugzilla_api_key", 50 | ) 51 | 52 | with DAG( 53 | "webcompat_kb", 54 | default_args=default_args, 55 | max_active_runs=1, 56 | doc_md=DOCS, 57 | schedule_interval=every_fifteen_minutes, 58 | tags=tags, 59 | catchup=False, 60 | ) as dag: 61 | webcompat_kb_import = GKEPodOperator( 62 | task_id="webcompat_kb", 63 | arguments=[ 64 | "python", 65 | "-m", 66 | "webcompat_kb.main", 67 | "--bq-project", 68 | "moz-fx-dev-dschubert-wckb", 69 | "--bq-kb-dataset", 70 | "webcompat_knowledge_base", 71 | "--bq-web-features-dataset", 72 | "web_features", 73 | "--bq-standards-positions-dataset", 74 | "standards_positions", 75 | ], 76 | image="gcr.io/moz-fx-data-airflow-prod-88e0/webcompat-kb_docker_etl:latest", 77 | dag=dag, 78 | secrets=[ 79 | bugzilla_token, 80 | ], 81 | ) 82 | -------------------------------------------------------------------------------- /dataproc_bootstrap/README.md: -------------------------------------------------------------------------------- 1 | Contents of this directory will be rsync'd to gs://moz-fx-data-prod-airflow-dataproc-artifacts/bootstrap by CI 2 | -------------------------------------------------------------------------------- /dataproc_bootstrap/airflow_gcp.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -exo pipefail 4 | 5 | # Error message 6 | error_msg () 7 | { 8 | echo 1>&2 "Error: $1" 9 | } 10 | 11 | # Parse arguments 12 | while [ $# -gt 0 ]; do 13 | case "$1" in 14 | --job-name) 15 | shift 16 | job_name=$1 17 | ;; 18 | --uri) 19 | shift 20 | uri=$1 21 | ;; 22 | --arguments) 23 | shift 24 | args=$1 25 | ;; 26 | --environment) 27 | shift 28 | environment=$1 29 | ;; 30 | -*) 31 | # do not exit out, just note failure 32 | error_msg "unrecognized option: $1" 33 | ;; 34 | *) 35 | break; 36 | ;; 37 | esac 38 | shift 39 | done 40 | 41 | if [ -z "$job_name" ] || [ -z "$uri" ]; then 42 | error_msg "missing argument(s)" 43 | exit 1 44 | fi 45 | 46 | wd=/mnt/analyses 47 | mkdir -p $wd && cd $wd 48 | mkdir -p output 49 | 50 | urldecode() { 51 | local url_encoded="${1//+/ }" 52 | printf '%b' "${url_encoded//%/\\x}" 53 | } 54 | 55 | # Download file 56 | if [[ $uri =~ ^gs.*$ ]]; then 57 | gsutil cp "$uri" . 58 | elif [[ $uri =~ ^https?.*$ ]]; then 59 | uri=$(urldecode $uri) 60 | wget -N "$uri" 61 | fi 62 | 63 | # Run job 64 | job="${uri##*/}" 65 | 66 | if [[ $uri == *.jar ]]; then 67 | time env $environment spark-submit --master yarn "./$job" $args 68 | elif [[ $uri == *.ipynb ]]; then 69 | echo "We are no longer supporting running ipynb's via GCP dataproc." 
70 | exit 1 71 | elif [[ $uri == *.py ]]; then 72 | time env $environment \ 73 | PYSPARK_DRIVER_PYTHON=/opt/conda/default/bin/python PYSPARK_DRIVER_PYTHON_OPTS="" spark-submit \ 74 | --master yarn "./$job" $args 75 | else 76 | chmod +x "./$job" 77 | time env $environment "./$job" $args 78 | fi 79 | -------------------------------------------------------------------------------- /dataproc_bootstrap/dataproc_init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -exo pipefail 4 | 5 | # Logs will be available on the dataproc nodes at /var/log/dataproc-initialization-script-X.log 6 | # or via the GCP Dataproc UI 7 | 8 | ARTIFACTS_BUCKET=gs://moz-fx-data-prod-airflow-dataproc-artifacts 9 | 10 | ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role) 11 | if [[ "${ROLE}" == 'Master' ]]; then 12 | # You can put any master-specific logic here 13 | echo "Running dataproc_init.sh on master..." 14 | fi 15 | 16 | gsutil cp $ARTIFACTS_BUCKET/jars/* /usr/lib/spark/jars/ 17 | 18 | # Install spark packages 19 | # See https://github.com/mozilla/telemetry-spark-packages-assembly 20 | TSPA_VERSION=v1.0.0 21 | TSPA_GS_PATH=$ARTIFACTS_BUCKET/mozilla/telemetry-spark-packages-assembly/$TSPA_VERSION/telemetry-spark-packages-assembly.jar 22 | TSPA_JAR=/usr/lib/spark/jars/telemetry-spark-packages-assembly.jar 23 | gsutil cp $TSPA_GS_PATH $TSPA_JAR 24 | 25 | # Install python packages 26 | PIP_REQUIREMENTS_FILE=/tmp/requirements.txt 27 | gsutil cp $ARTIFACTS_BUCKET/bootstrap/python-requirements.txt $PIP_REQUIREMENTS_FILE 28 | /opt/conda/default/bin/pip install --upgrade 'pip<20.3.0' 29 | /opt/conda/default/bin/pip install -r $PIP_REQUIREMENTS_FILE 30 | -------------------------------------------------------------------------------- /dataproc_bootstrap/fx_usage_init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -exo pipefail 4 | 5 | # Logs will be available on the dataproc nodes at /var/log/dataproc-initialization-script-X.log 6 | # or via the GCP Dataproc UI 7 | 8 | /opt/conda/default/bin/pip install --upgrade pip 9 | 10 | /opt/conda/default/bin/pip install arrow==0.10.0 11 | /opt/conda/default/bin/pip install boto3==1.9.199 12 | /opt/conda/default/bin/pip install click==6.7 13 | /opt/conda/default/bin/pip install click_datetime==0.2 14 | /opt/conda/default/bin/pip install --ignore-installed flake8==3.7.8 15 | /opt/conda/default/bin/pip install pyspark==2.2.2 16 | /opt/conda/default/bin/pip install pytest==4.6.4 17 | /opt/conda/default/bin/pip install scipy==1.0.0rc1 18 | 19 | /opt/conda/default/bin/pip install py4j --upgrade 20 | /opt/conda/default/bin/pip install numpy==1.16.4 21 | /opt/conda/default/bin/pip install python-dateutil==2.5.0 22 | /opt/conda/default/bin/pip install pytz==2011k 23 | /opt/conda/default/bin/pip install --no-dependencies pandas==0.24 24 | 25 | # This fixes the PythonAccumulatorV2 does not exist error 26 | export PYTHONPATH=/usr/lib/spark/python/lib/pyspark.zip 27 | -------------------------------------------------------------------------------- /dataproc_bootstrap/python-requirements.txt: -------------------------------------------------------------------------------- 1 | arrow==0.10.0 2 | boto 3 | boto3 4 | botocore 5 | click==6.7 6 | click_datetime==0.2 7 | numpy==1.13.3 8 | pandas==0.23.4 9 | pyspark==2.3.2 10 | requests-toolbelt==0.8.0 11 | requests==2.20.1 12 | scipy==1.0.0 13 | typing==3.6.4 14 | six==1.11.0 15 | protobuf==3.6.1 16 | 
py4j==0.10.7 17 | ujson 18 | -------------------------------------------------------------------------------- /jobs/addon_recommender.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ -z "$privateBucket" || -z "$publicBucket" || -z "$date" ]]; then 4 | echo "Missing arguments!" 1>&2 5 | exit 1 6 | fi 7 | 8 | git clone https://github.com/mozilla/telemetry-batch-view.git 9 | cd telemetry-batch-view 10 | sbt assembly 11 | mkdir ml_output 12 | spark-submit --master yarn \ 13 | --deploy-mode client \ 14 | --class com.mozilla.telemetry.ml.AddonRecommender \ 15 | target/scala-2.11/telemetry-batch-view-1.1.jar \ 16 | train \ 17 | --privateBucket $privateBucket \ 18 | --publicBucket $publicBucket \ 19 | --runDate $date 20 | -------------------------------------------------------------------------------- /jobs/bugzilla_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | conda install psycopg2 --yes 4 | git clone https://github.com/maurodoglio/bz2db.git 5 | pip install -r bz2db/requirements.txt 6 | cd bz2db && python bz2db/update_bugs.py 7 | -------------------------------------------------------------------------------- /jobs/moz_dataproc_runner.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import sys 3 | 4 | 5 | """Generic runner for PySpark jobs 6 | 7 | This script runs a `cli.entry_point()` from an arbitrary Python module or CLI application. 8 | Job module name should be provided as a first command line argument. Module argument will be cleared 9 | before executing the `entry_point()`, allowing for the underlying job to be decoupled from this script. 10 | 11 | If running on Dataproc, this requires the job to be installed on the cluster 12 | (e.g. via `pip_install` initialization action). 13 | """ 14 | # Retrieve target module name 15 | module_to_run = sys.argv[1] 16 | # Clear retrieved argument in the list of arguments passed to this script 17 | # This allows the target job to properly interpret its command line arguments 18 | del sys.argv[1] 19 | 20 | # Import the target module and execute its entry point 21 | cli = importlib.import_module(f"{module_to_run}.cli") 22 | cli.entry_point() 23 | -------------------------------------------------------------------------------- /jobs/mozaggregator_runner.py: -------------------------------------------------------------------------------- 1 | from mozaggregator import cli 2 | 3 | cli.entry_point() 4 | -------------------------------------------------------------------------------- /jobs/pip-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -exo pipefail 4 | 5 | readonly PACKAGES=$(/usr/share/google/get_metadata_value attributes/PIP_PACKAGES || true) 6 | 7 | function install_pip() { 8 | if command -v pip >/dev/null; then 9 | echo "pip is already installed." 10 | return 0 11 | fi 12 | 13 | if command -v easy_install >/dev/null; then 14 | echo "Installing pip with easy_install..." 15 | easy_install pip 16 | return 0 17 | fi 18 | 19 | echo "Installing python-pip..." 
20 | apt update 21 | apt install python-pip -y 22 | } 23 | 24 | function main() { 25 | if [[ -z "${PACKAGES}" ]]; then 26 | echo "ERROR: Must specify PIP_PACKAGES metadata key" 27 | exit 1 28 | fi 29 | 30 | install_pip 31 | pip install --upgrade ${PACKAGES} 32 | } 33 | 34 | main 35 | -------------------------------------------------------------------------------- /jobs/telemetry_batch_view.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import requests 4 | from os import chdir 5 | from os import environ 6 | from subprocess import call, PIPE, Popen 7 | from urlparse import urlparse 8 | import zipfile 9 | import boto3 10 | 11 | artifact_file = "artifact.jar" 12 | 13 | 14 | def call_exit_errors(command): 15 | print("+ {}".format(" ".join(command))) 16 | rc = call(command, env=environ.copy()) 17 | if rc > 0: 18 | exit(rc) 19 | 20 | 21 | def retrieve_jar(): 22 | jar_url = environ.get("ARTIFACT_URL") 23 | 24 | if jar_url is None: 25 | exit(1) 26 | 27 | 28 | print("Retrieving JAR: {}".format(jar_url)) 29 | 30 | # Check to see if this is an alias for a full jar path 31 | # If it's an alias, it should be accompanied by a .txt 32 | # file whose contents point to the aliased location. 33 | # 34 | # The associated .txt files have two lines [0]: 35 | # 1. The query string to get to the aliased jar 36 | # 2. The associated build URL for that jar 37 | # 38 | # Historical version only had the query string [1], 39 | # so we need to handle that case separately. 40 | # 41 | # [0] https://github.com/mozilla/telemetry-batch-view/blob/main/.circleci/deploy.sh#L37 42 | # [1] https://github.com/mozilla/telemetry-batch-view/blob/14741db20dd3873b94944b8238dfc48a003c744d/deploy.sh#L50 43 | 44 | txt_url = jar_url.replace(".jar", ".txt") 45 | response = requests.get(txt_url) 46 | 47 | if response.status_code != 404: 48 | uri_query, _, build_url = response.content.partition("\n") 49 | if not build_url: 50 | # Handle historical version 51 | build_url = "Build URL not available" 52 | 53 | parsed_uri = urlparse(jar_url) 54 | bucket, _, _ = parsed_uri.path.lstrip("/").partition("/") 55 | full_url = "{uri.scheme}://{uri.netloc}/{bucket}/{uri_query}".format(uri=parsed_uri, bucket=bucket, uri_query=uri_query) 56 | 57 | print(" Alias: {}".format(full_url)) 58 | print(" Build URL: {}".format(build_url.strip())) 59 | 60 | response = requests.get(jar_url) 61 | with open(artifact_file, 'wb') as f: 62 | f.write(response.content) 63 | 64 | 65 | def submit_job(): 66 | opts = [ 67 | ["--{}".format(key[4:].replace("_", "-")), value] 68 | for key, value in environ.items() 69 | if key.startswith("TBV_") and key != "TBV_CLASS" 70 | ] 71 | 72 | command = [ 73 | "spark-submit", 74 | "--master", "yarn", 75 | "--deploy-mode", "client", 76 | "--class", environ["TBV_CLASS"], 77 | artifact_file, 78 | ] + [v for opt in opts for v in opt if v] 79 | 80 | call_exit_errors(command) 81 | 82 | 83 | if environ.get("DO_RETRIEVE", "True") == "True": 84 | retrieve_jar() 85 | 86 | if environ.get("DO_SUBMIT", "True") == "True": 87 | submit_job() 88 | -------------------------------------------------------------------------------- /jobs/txp_pulse.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # We use jupyter by default, but here we want to use python 4 | unset PYSPARK_DRIVER_PYTHON 5 | 6 | # Clone, install, and run 7 | git clone https://github.com/mozilla/python_etl.git 8 | cd python_etl 9 | pip install . 
10 | python setup.py bdist_egg 11 | spark-submit scheduling/pulse.py 12 | -------------------------------------------------------------------------------- /operators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-airflow/1d80ddd23c83f50787e46fa43e0929d9a13185de/operators/__init__.py -------------------------------------------------------------------------------- /plugins/mozmenu.py: -------------------------------------------------------------------------------- 1 | """ 2 | Plugin that adds a "Mozilla" entry to the top bar with some useful links. 3 | 4 | Based on an example at 5 | https://github.com/airflow-plugins/Getting-Started/blob/master/Tutorial/creating-ui-modification.md 6 | """ 7 | from airflow.plugins_manager import AirflowPlugin 8 | 9 | telemetry_airflow = { 10 | "name": "telemetry-airflow on GitHub", 11 | "category": "Mozilla", 12 | "href": "https://github.com/mozilla/telemetry-airflow", 13 | } 14 | 15 | wtmo_dev = { 16 | "name": "WTMO Developer Guide", 17 | "category": "Mozilla", 18 | "href": "https://mozilla-hub.atlassian.net/wiki/spaces/SRE/pages/27922811/WTMO+Developer+Guide", 19 | } 20 | 21 | airflow_triage_guide = { 22 | "name": "Airflow Triage Guide", 23 | "category": "Mozilla", 24 | "href": "https://mozilla-hub.atlassian.net/wiki/spaces/DATA/pages/175603730/Airflow+Triage+Guide", 25 | } 26 | 27 | gke_cluster = { 28 | "name": "GKE cluster", 29 | "category": "Mozilla", 30 | "href": "https://console.cloud.google.com/kubernetes/workload/overview?project=moz-fx-data-airflow-gke-prod", 31 | } 32 | 33 | 34 | # ruff: noqa: RUF012 35 | class MozMenuPlugin(AirflowPlugin): 36 | name = "Mozilla" 37 | operators = [] 38 | flask_blueprints = [] 39 | hooks = [] 40 | executors = [] 41 | appbuilder_views = [] 42 | appbuilder_menu_items = [ 43 | telemetry_airflow, 44 | wtmo_dev, 45 | airflow_triage_guide, 46 | gke_cluster, 47 | ] 48 | -------------------------------------------------------------------------------- /plugins/timetable.py: -------------------------------------------------------------------------------- 1 | """Plugin for alternative timetables that cannot be trivially defined via cron expressions.""" 2 | 3 | from datetime import timedelta 4 | from typing import Any 5 | 6 | from airflow.plugins_manager import AirflowPlugin 7 | from airflow.timetables.base import DagRunInfo, DataInterval, TimeRestriction, Timetable 8 | from pendulum import UTC, DateTime, Time 9 | 10 | 11 | class MultiWeekTimetable(Timetable): 12 | def __init__(self, *, num_weeks: int, time: Time = Time.min): 13 | self.num_weeks = num_weeks 14 | self.interval_delta = timedelta(days=7 * num_weeks) 15 | # only enforced for automated data intervals 16 | self.time = time 17 | 18 | def infer_manual_data_interval(self, run_after: DateTime) -> DataInterval: 19 | return DataInterval(start=run_after - self.interval_delta, end=run_after) 20 | 21 | def next_dagrun_info( 22 | self, 23 | *, 24 | last_automated_data_interval: DataInterval | None, 25 | restriction: TimeRestriction, 26 | ) -> DagRunInfo | None: 27 | if restriction.earliest is None: # No start_date specified. Don't schedule. 28 | return None 29 | 30 | # Find the first run on the regular schedule. 31 | next_end = ( 32 | DateTime.combine(restriction.earliest, self.time).replace(tzinfo=UTC) 33 | + self.interval_delta 34 | ) 35 | 36 | max_end = next_end 37 | if last_automated_data_interval is not None: 38 | # There was a previous run on the regular schedule. 
39 | # Return the next interval after last_automated_data_interval.end that is 40 | # aligned with restriction.earliest and self.time 41 | max_end = last_automated_data_interval.end + self.interval_delta 42 | elif not restriction.catchup: 43 | # This is the first ever run on the regular schedule, and catchup is not 44 | # enabled. Return the last complete interval before now. 45 | max_end = DateTime.utcnow() 46 | if next_end < max_end: 47 | # Return the last complete interval on or before max_end. Use integer 48 | # division on the number of whole days rather than deal with any corner 49 | # cases related to leap seconds and partial days. 50 | skip_intervals = (max_end - next_end).days // self.interval_delta.days 51 | next_end = next_end + (self.interval_delta * skip_intervals) 52 | 53 | if restriction.latest is not None and next_end > restriction.latest: 54 | return None # Over the DAG's scheduled end; don't schedule. 55 | return DagRunInfo.interval(start=next_end - self.interval_delta, end=next_end) 56 | 57 | def serialize(self) -> dict[str, Any]: 58 | return {"num_weeks": self.num_weeks, "time": self.time.isoformat()} 59 | 60 | @classmethod 61 | def deserialize(cls, value: dict[str, Any]) -> Timetable: 62 | return cls(num_weeks=value["num_weeks"], time=Time.fromisoformat(value["time"])) 63 | 64 | 65 | class MozillaTimetablePlugin(AirflowPlugin): 66 | name = "mozilla_timetable_plugin" 67 | timetables = (MultiWeekTimetable,) 68 | -------------------------------------------------------------------------------- /plugins/version_endpoint.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from pathlib import Path 4 | 5 | from airflow.plugins_manager import AirflowPlugin 6 | from flask import Blueprint, jsonify 7 | 8 | version_endpoint_bp = Blueprint("version_endpoint", __name__) 9 | 10 | # from https://semver.org/#is-there-a-suggested-regular-expression-regex-to-check-a-semver-string 11 | SEM_VER_REGEX = ( 12 | r"(?P<major>0|[1-9]\d*)\.(?P<minor>0|[1-9]\d*)\." 13 | r"(?P<patch>0|[1-9]\d*)(?:-(?P<prerelease>" 14 | r"(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)" 15 | r"(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?" 16 | r"(?:\+(?P<buildmetadata>[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$" 17 | ) 18 | 19 | 20 | def get_project_root() -> Path: 21 | """Reliably give the project root as a Path object.""" 22 | return Path(__file__).parent.parent 23 | 24 | 25 | def parse_airflow_version(dockerfile_content: str) -> str: 26 | version_pattern = rf"^FROM apache\/airflow:((slim-)?{SEM_VER_REGEX})$" 27 | version_regex = re.compile(pattern=version_pattern, flags=re.MULTILINE | re.DOTALL) 28 | return version_regex.search(dockerfile_content).group(1) 29 | 30 | 31 | def get_airflow_version() -> dict[str, str | None]: 32 | """Parse Airflow version from Dockerfile and return it as a dict.""" 33 | project_root = get_project_root() 34 | dockerfile = project_root / "Dockerfile" 35 | if dockerfile.is_file() and dockerfile.exists(): 36 | with open(dockerfile) as file: 37 | content = file.read() 38 | version = parse_airflow_version(dockerfile_content=content) 39 | else: 40 | version = None 41 | return {"version": version} 42 | 43 | 44 | def get_dockerflow_version() -> dict[str, str | None]: 45 | """ 46 | Parse Dockerflow style version.json file and return it as a dict. 47 | 48 | version.json is baked in the Docker image at build time in CI.
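When the file is absent (for example when running locally outside the CI-built image), the build, commit and source keys fall back to None.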
49 | 50 | """ 51 | project_root = get_project_root() 52 | version_file = project_root / "version.json" 53 | if version_file.is_file() and version_file.exists(): 54 | with open(project_root / "version.json") as file: 55 | version = json.load(file) 56 | else: 57 | version = {"build": None, "commit": None, "source": None} 58 | return version 59 | 60 | 61 | @version_endpoint_bp.route("/__version__", methods=["GET"]) 62 | def version_endpoint(): 63 | airflow_version = get_airflow_version() 64 | dockerflow_version = get_dockerflow_version() 65 | return jsonify(dockerflow_version | airflow_version), 200 66 | 67 | 68 | class CustomPlugin(AirflowPlugin): 69 | name = "version_endpoint" 70 | flask_blueprints = (version_endpoint_bp,) 71 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff.lint.isort] 2 | known-third-party = ["airflow"] 3 | 4 | [tool.ruff] 5 | target-version = "py310" 6 | # Exclude questionably linted code (aka legacy) under the `jobs` directory 7 | exclude = ["./jobs"] 8 | 9 | [tool.ruff.lint] 10 | select = [ 11 | "E", # pycodestyle 12 | "W", # pycodestyle 13 | "F", # Pyflakes 14 | "B", # flake8-bugbear 15 | "C4", # flake8-comprehensions 16 | "D", # flake8-docstrings 17 | "I", # isort 18 | "SIM", # flake8-simplify 19 | "TCH", # flake8-type-checking 20 | "TID", # flake8-tidy-imports 21 | "Q", # flake8-quotes 22 | "UP", # pyupgrade 23 | "PT", # flake8-pytest-style 24 | "RUF", # Ruff-specific rules 25 | ] 26 | ignore = [ 27 | "E501", # line too long, handled by black 28 | # Docstring linting 29 | "D100", # Missing docstring in public module 30 | "D101", # Missing docstring in public class 31 | "D102", # Missing docstring in public method 32 | "D103", # Missing docstring in public function 33 | "D104", # Missing docstring in public package 34 | "D105", # Missing docstring in magic method 35 | "D107", # Missing docstring in __init__ 36 | "D202", # No blank lines allowed after function docstring -> clashes with Black 37 | "D203", # 1 blank line required before class docstring 38 | "D212", # Multi-line docstring summary should start at the first line 39 | "D415", # First line should end with a period, question mark, or exclamation point 40 | "D416", #Section name should end with a colon ("{name}") 41 | # flake8-pytest-style: 42 | "PT011", # pytest.raises({exception}) is too broad, set the match parameter or use a more specific exception 43 | # To enable when we migrate to Python 3.10 44 | "B905", # `zip()` without an explicit `strict=` parameter 45 | ] 46 | -------------------------------------------------------------------------------- /requirements-dev.in: -------------------------------------------------------------------------------- 1 | --constraint ./constraints.txt 2 | --constraint ./requirements.txt 3 | 4 | 5 | # Package management 6 | pip-tools==7.4.1 7 | 8 | # Code quality 9 | pytest==8.3.4 10 | pytest-mock==3.14.0 11 | ruff==0.5.5 12 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.11 3 | # by the following command: 4 | # 5 | # pip-compile --no-annotate --strip-extras requirements-dev.in 6 | # 7 | build==1.1.1 8 | click==8.1.8 9 | iniconfig==2.0.0 10 | packaging==24.2 11 | pip-tools==7.4.1 12 | pluggy==1.5.0 13 | pyproject-hooks==1.0.0 14 
| pytest==8.3.4 15 | pytest-mock==3.14.0 16 | ruff==0.5.5 17 | wheel==0.43.0 18 | 19 | # The following packages are considered to be unsafe in a requirements file: 20 | # pip 21 | # setuptools 22 | -------------------------------------------------------------------------------- /requirements-override.txt: -------------------------------------------------------------------------------- 1 | # There's a bug in apache-airflow-providers-google 12.0.0 where Dataproc operators fail to import 2 | # without OpenLineage installed, which was fixed in 14.0.0 (https://github.com/apache/airflow/pull/46561). 3 | apache-airflow-providers-google==14.0.0 4 | -------------------------------------------------------------------------------- /requirements.in: -------------------------------------------------------------------------------- 1 | # Official Airflow constraints file 2 | # Doc: https://airflow.apache.org/docs/apache-airflow/stable/installation/installing-from-pypi.html#constraints-files 3 | # File: https://raw.githubusercontent.com/apache/airflow/constraints-2.10.5/constraints-3.11.txt 4 | --constraint ./constraints.txt 5 | 6 | # Airflow dependencies 7 | apache-airflow[async,google-auth,password,statsd]==2.10.5 8 | apache-airflow-providers-amazon 9 | apache-airflow-providers-celery 10 | apache-airflow-providers-cncf-kubernetes 11 | apache-airflow-providers-google 12 | apache-airflow-providers-http 13 | apache-airflow-providers-postgres 14 | apache-airflow-providers-redis 15 | apache-airflow-providers-slack 16 | airflow-provider-fivetran-async==2.0.2 17 | 18 | # Acryl DataHub integration 19 | acryl-datahub-airflow-plugin==1.0.0.3 20 | gql 21 | 22 | # dbt integration 23 | apache-airflow-providers-dbt-cloud 24 | 25 | # Required for /app/dags/empeam_workday_xmatters_integration.py 26 | apache-airflow-providers-atlassian-jira 27 | -------------------------------------------------------------------------------- /resources/dev_variables.json: -------------------------------------------------------------------------------- 1 | { 2 | "Dev_glam_project": "Dev_glam_project", 3 | "Prod_glam_project": "Prod_glam_project", 4 | "app_store_connect_password": "password", 5 | "app_store_connect_username": "username", 6 | "bugzilla_probe_expiry_bot_api_key": "bugzilla-api-key", 7 | "dataops_looker_github_secret_access_token": "dataops_looker_github_secret_access_token", 8 | "glean_dictionary_netlify_build_webhook_id": "status/200", 9 | "jetstream_cluster_cert": "cert", 10 | "jetstream_cluster_ip": "127.0.0.1", 11 | "lookml_generator_release_str": "v0.0.0", 12 | "slack_secret_token": "slack_secret_token", 13 | "surveygizmo_api_secret": "tapsekret", 14 | "surveygizmo_api_token": "tokentokentoken", 15 | "surveygizmo_daily_attitudes_survey_id": 12345, 16 | "dbt_account_id": "dbt_account_id", 17 | "looker_api_client_id_prod": "looker_api_client_id_prod", 18 | "looker_api_client_secret_prod": "looker_api_client_secret_prod" 19 | } 20 | -------------------------------------------------------------------------------- /resources/dev_webserver_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from flask_appbuilder.security.manager import AUTH_DB 4 | 5 | basedir = os.path.abspath(os.path.dirname(__file__)) 6 | 7 | WTF_CSRF_ENABLED = True 8 | AUTH_TYPE = AUTH_DB 9 | AUTH_ROLE_PUBLIC = "Admin" 10 | -------------------------------------------------------------------------------- /tests/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-airflow/1d80ddd23c83f50787e46fa43e0929d9a13185de/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pathlib 4 | import sys 5 | import warnings 6 | 7 | import pytest 8 | from airflow.models import DagBag 9 | 10 | # get absolute project directory path no matter the environment 11 | PROJECT_DIR = pathlib.Path(__file__).resolve().parent.parent 12 | 13 | 14 | @pytest.fixture(scope="session") 15 | def get_dag_bag(session_mocker) -> DagBag: 16 | from airflow.operators.subdag import SubDagOperator 17 | 18 | # Mock _validate_pool, so we don't need an actual provisioned database 19 | session_mocker.patch.object( 20 | SubDagOperator, 21 | "_validate_pool", 22 | return_value=None, 23 | ) 24 | 25 | # load dev connections and variables 26 | env_load_variables_from_json(PROJECT_DIR / "resources" / "dev_variables.json") 27 | env_load_connections_from_json(PROJECT_DIR / "resources" / "dev_connections.json") 28 | 29 | # Replicate Airflow adding dags, plugins folders in system path at runtime 30 | sys.path.insert(0, str(PROJECT_DIR)) 31 | sys.path.insert(1, str(PROJECT_DIR / "dags")) 32 | sys.path.insert(2, str(PROJECT_DIR / "plugins")) 33 | 34 | # Suppress warnings from loading DAGs 35 | with warnings.catch_warnings(): 36 | warnings.simplefilter("ignore") 37 | dagbag = DagBag(dag_folder=PROJECT_DIR / "dags", include_examples=False) 38 | 39 | return dagbag 40 | 41 | 42 | def env_load_variables_from_json(path: pathlib.Path) -> None: 43 | """ 44 | Load Airflow Variables as environment variables from a JSON file. 45 | 46 | JSON file should be generated by running `airflow variables export <file>.json`. 47 | Variable values must be `str` or `int`. 48 | 49 | See this link for more information on Airflow Variables as environment variables 50 | https://airflow.apache.org/docs/apache-airflow/stable/howto/variable.html 51 | """ 52 | with open(path) as file: 53 | variables: dict[str, str | int] = json.load(file) 54 | 55 | for name, value in variables.items(): 56 | formatted_variable_name = f"AIRFLOW_VAR_{name.upper()}" 57 | os.environ[formatted_variable_name] = str(value) 58 | 59 | 60 | def env_load_connections_from_json(path: pathlib.Path) -> None: 61 | """ 62 | Load Airflow Connections as environment variables from a JSON file. 63 | 64 | JSON file should be generated by running `airflow connections export <file>.json`. 65 | Uses a Connection object to ensure correct Connection parsing.
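Each connection is exported as an `AIRFLOW_CONN_<CONN_ID>` environment variable holding the connection URI.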
66 | 67 | See this link for more information on Airflow Connections as environment variables 68 | https://airflow.apache.org/docs/apache-airflow/stable/howto/connection.html 69 | """ 70 | from airflow.models import Connection 71 | 72 | with open(path) as file: 73 | connections: dict[str, dict] = json.load(file) 74 | 75 | for name, params in connections.items(): 76 | conn_instance = Connection.from_json(value=json.dumps(params), conn_id=name) 77 | formatted_connection_name = f"AIRFLOW_CONN_{name.upper()}" 78 | os.environ[formatted_connection_name] = conn_instance.get_uri() 79 | -------------------------------------------------------------------------------- /tests/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-airflow/1d80ddd23c83f50787e46fa43e0929d9a13185de/tests/dags/__init__.py -------------------------------------------------------------------------------- /tests/dags/test_dag_validity.py: -------------------------------------------------------------------------------- 1 | def test_dag_validity(get_dag_bag): 2 | """ 3 | Test all DAGs can be parsed. 4 | 5 | This test should be equivalent to the integration test using the airflow CLI. 6 | At the moment, there is a discrepancy between this unit test and the integration 7 | test. Once equivalent, this unit test should replace the integration test. 8 | 9 | """ 10 | dagbag = get_dag_bag 11 | 12 | data = [] 13 | for filename, errors in dagbag.import_errors.items(): 14 | data.append({"filepath": filename, "error": errors}) 15 | if data: 16 | print(data) 17 | raise AssertionError 18 | 19 | 20 | def test_dag_tags(get_dag_bag): 21 | """Check tags in all DAGs are valid.""" 22 | 23 | valid_tags = { 24 | "impact/tier_1", 25 | "impact/tier_2", 26 | "impact/tier_3", 27 | "repo/bigquery-etl", 28 | "repo/telemetry-airflow", 29 | "repo/private-bigquery-etl", 30 | "triage/confidential", 31 | "triage/no_triage", 32 | "triage/record_only", 33 | } 34 | dagbag = get_dag_bag 35 | 36 | for dag_name, dag in dagbag.dags.items(): 37 | for tag in dag.tags: 38 | assert tag in valid_tags, f"DAG: {dag_name}: Invalid tag `{tag}`" 39 | 40 | 41 | def test_dag_tags_required(get_dag_bag): 42 | """Check at least one tag per DAG is of the required type.""" 43 | 44 | required_tag_type = "impact" 45 | dagbag = get_dag_bag 46 | 47 | for dag_name, dag in dagbag.dags.items(): 48 | # don't check tags on subdags 49 | if dag.is_subdag: 50 | continue 51 | 52 | assert [ 53 | tag for tag in dag.tags if required_tag_type in tag 54 | ], f"DAG: {dag_name}: Missing required tag type `{required_tag_type}`" 55 | -------------------------------------------------------------------------------- /tests/plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-airflow/1d80ddd23c83f50787e46fa43e0929d9a13185de/tests/plugins/__init__.py -------------------------------------------------------------------------------- /tests/plugins/test_timetable.py: -------------------------------------------------------------------------------- 1 | from unittest import mock 2 | 3 | from airflow.timetables.base import DagRunInfo, DataInterval, TimeRestriction 4 | from pendulum import UTC, DateTime, Time 5 | 6 | from plugins.timetable import MultiWeekTimetable 7 | 8 | 9 | def test_manual_interval(): 10 | tt = MultiWeekTimetable(num_weeks=4) 11 | actual = tt.infer_manual_data_interval(run_after=DateTime(2023, 1, 29)) 12 |
expected = DataInterval(start=DateTime(2023, 1, 1), end=DateTime(2023, 1, 29)) 13 | assert actual == expected 14 | 15 | 16 | def test_first_automated_interval(): 17 | tt = MultiWeekTimetable(num_weeks=4, time=Time(hour=4)) 18 | actual = tt.next_dagrun_info( 19 | last_automated_data_interval=None, 20 | restriction=TimeRestriction( 21 | earliest=DateTime(2023, 1, 1), latest=None, catchup=True 22 | ), 23 | ) 24 | expected = DagRunInfo.interval( 25 | start=DateTime(2023, 1, 1, 4, tzinfo=UTC), 26 | end=DateTime(2023, 1, 29, 4, tzinfo=UTC), 27 | ) 28 | assert actual == expected 29 | 30 | 31 | def test_first_automated_interval_no_catchup(): 32 | tt = MultiWeekTimetable(num_weeks=4) 33 | with mock.patch.object( 34 | DateTime, "utcnow", return_value=DateTime(2023, 2, 28, tzinfo=UTC) 35 | ): 36 | actual = tt.next_dagrun_info( 37 | last_automated_data_interval=None, 38 | restriction=TimeRestriction( 39 | earliest=DateTime(2023, 1, 1), latest=None, catchup=False 40 | ), 41 | ) 42 | expected = DagRunInfo.interval( 43 | start=DateTime(2023, 1, 29, tzinfo=UTC), end=DateTime(2023, 2, 26, tzinfo=UTC) 44 | ) 45 | assert actual == expected 46 | 47 | 48 | def test_next_automated_interval(): 49 | tt = MultiWeekTimetable(num_weeks=4) 50 | actual = tt.next_dagrun_info( 51 | last_automated_data_interval=DataInterval( 52 | start=DateTime(2023, 1, 29, tzinfo=UTC), 53 | end=DateTime(2023, 2, 26, tzinfo=UTC), 54 | ), 55 | restriction=TimeRestriction( 56 | earliest=DateTime(2023, 1, 1), 57 | latest=DateTime(2023, 3, 26, tzinfo=UTC), 58 | catchup=False, 59 | ), 60 | ) 61 | expected = DagRunInfo.interval( 62 | start=DateTime(2023, 2, 26, tzinfo=UTC), end=DateTime(2023, 3, 26, tzinfo=UTC) 63 | ) 64 | assert actual == expected 65 | 66 | 67 | def test_last_automated_interval(): 68 | tt = MultiWeekTimetable(num_weeks=4) 69 | actual = tt.next_dagrun_info( 70 | last_automated_data_interval=DataInterval( 71 | start=DateTime(2023, 1, 29, tzinfo=UTC), 72 | end=DateTime(2023, 2, 26, tzinfo=UTC), 73 | ), 74 | restriction=TimeRestriction( 75 | earliest=DateTime(2023, 1, 1), 76 | latest=DateTime(2023, 2, 26, tzinfo=UTC), 77 | catchup=False, 78 | ), 79 | ) 80 | assert actual is None 81 | -------------------------------------------------------------------------------- /tests/plugins/test_version_endpoint.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | from unittest.mock import mock_open, patch 4 | 5 | import pytest 6 | 7 | from plugins.version_endpoint import ( 8 | get_airflow_version, 9 | get_dockerflow_version, 10 | get_project_root, 11 | parse_airflow_version, 12 | ) 13 | 14 | 15 | def test_get_project_root(): 16 | # CircleCI renames the project directory to `project` 17 | assert get_project_root().name in ("telemetry-airflow", "project") 18 | assert get_project_root().is_dir() 19 | 20 | 21 | @pytest.mark.parametrize( 22 | ("test_input", "expected"), 23 | [ 24 | ( 25 | ( 26 | "# example comment on first line\n" 27 | "FROM apache/airflow:slim-2.8.2-python3.11\n" 28 | "# Rest of Dockerfile" 29 | ), 30 | "slim-2.8.2-python3.11", 31 | ), 32 | ("FROM apache/airflow:2.9.1", "2.9.1"), 33 | ("FROM apache/airflow:slim-2.7.3", "slim-2.7.3"), 34 | ], 35 | ) 36 | def test_parse_airflow_version(test_input, expected): 37 | assert parse_airflow_version(test_input) == expected 38 | 39 | 40 | def test_get_airflow_version_exists(): 41 | mock_project_root = patch( 42 | "plugins.version_endpoint.get_project_root", return_value=Path("/mock/path") 43 | ) 44 | 
mock_parse_airflow_version = patch( 45 | "plugins.version_endpoint.parse_airflow_version", return_value="2.8.2" 46 | ) 47 | mock_open_file = patch("builtins.open", mock_open(read_data="Mock Data!")) 48 | mock_is_file = patch("pathlib.Path.is_file", return_value=True) 49 | mock_exists = patch("pathlib.Path.exists", return_value=True) 50 | 51 | with ( 52 | mock_project_root, 53 | mock_parse_airflow_version, 54 | mock_open_file, 55 | mock_is_file, 56 | mock_exists, 57 | ): 58 | result = get_airflow_version() 59 | assert result == {"version": "2.8.2"} 60 | 61 | 62 | def test_get_airflow_version_not_exists(): 63 | mock_project_root = patch( 64 | "plugins.version_endpoint.get_project_root", return_value=Path("/mock/path") 65 | ) 66 | 67 | with mock_project_root: 68 | result = get_airflow_version() 69 | assert result == {"version": None} 70 | 71 | 72 | def test_get_dockerflow_version_exists(): 73 | mock_project_root = patch( 74 | "plugins.version_endpoint.get_project_root", return_value=Path("/mock/path") 75 | ) 76 | mock_open_file = patch( 77 | "builtins.open", 78 | mock_open( 79 | read_data=json.dumps( 80 | { 81 | "build": "12345", 82 | "commit": "abcdef", 83 | "source": "https://github.com/mozilla/telemetry-airflow", 84 | } 85 | ) 86 | ), 87 | ) 88 | mock_is_file = patch("pathlib.Path.is_file", return_value=True) 89 | mock_exists = patch("pathlib.Path.exists", return_value=True) 90 | 91 | with mock_project_root, mock_open_file, mock_is_file, mock_exists: 92 | result = get_dockerflow_version() 93 | assert result == { 94 | "build": "12345", 95 | "commit": "abcdef", 96 | "source": "https://github.com/mozilla/telemetry-airflow", 97 | } 98 | 99 | 100 | def test_get_dockerflow_version_not_exists(): 101 | mock_project_root = patch( 102 | "plugins.version_endpoint.get_project_root", return_value=Path("/mock/path") 103 | ) 104 | 105 | with mock_project_root: 106 | result = get_dockerflow_version() 107 | assert result == {"build": None, "commit": None, "source": None} 108 | -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-airflow/1d80ddd23c83f50787e46fa43e0929d9a13185de/tests/utils/__init__.py -------------------------------------------------------------------------------- /tests/utils/test_backfill.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from utils.backfill import BackfillParams 4 | 5 | 6 | @pytest.fixture() 7 | def base_params() -> dict: 8 | return { 9 | "clear": False, 10 | "dry_run": True, 11 | "dag_name": "dag_name", 12 | "end_date": "2022-11-10", 13 | "start_date": "2022-10-31", 14 | "task_regex": None, 15 | } 16 | 17 | 18 | @pytest.fixture() 19 | def base_backfill_params(base_params: dict) -> BackfillParams: 20 | return BackfillParams(**base_params) 21 | 22 | 23 | def test_date_validation(base_backfill_params) -> None: 24 | # valid date range 25 | base_backfill_params.validate_date_range() 26 | 27 | # invalid date range 28 | base_backfill_params.start_date, base_backfill_params.end_date = ( 29 | base_backfill_params.end_date, 30 | base_backfill_params.start_date, 31 | ) 32 | with pytest.raises(ValueError): 33 | base_backfill_params.validate_date_range() 34 | 35 | 36 | def test_validate_regex_pattern(base_backfill_params) -> None: 37 | # task_regex is None 38 | base_backfill_params.validate_regex_pattern() 39 | 40 | # valid regex pattern 41 
| base_backfill_params.task_regex = "/ab+c/" 42 | base_backfill_params.validate_regex_pattern() 43 | 44 | # invalid regex pattern 45 | base_backfill_params.task_regex = "[.*" 46 | with pytest.raises(ValueError): 47 | base_backfill_params.validate_regex_pattern() 48 | 49 | 50 | def test_generate_backfill_command(base_backfill_params) -> None: 51 | """ 52 | Assert backfill commands are equivalent between the backfill plugin and backfill DAG. 53 | 54 | Expected results were generated from the plugin implementation 55 | 56 | """ 57 | test_start_date = "2022-01-01" 58 | test_end_date = "2022-01-10" 59 | 60 | test_params: list[BackfillParams] = [ 61 | BackfillParams( 62 | clear=True, 63 | dry_run=True, 64 | task_regex=None, 65 | dag_name="test_value", 66 | start_date=test_start_date, 67 | end_date=test_end_date, 68 | ), 69 | BackfillParams( 70 | clear=False, 71 | dry_run=True, 72 | task_regex=None, 73 | dag_name="test_value", 74 | start_date=test_start_date, 75 | end_date=test_end_date, 76 | ), 77 | BackfillParams( 78 | clear=True, 79 | dry_run=False, 80 | task_regex=None, 81 | dag_name="test_value", 82 | start_date=test_start_date, 83 | end_date=test_end_date, 84 | ), 85 | BackfillParams( 86 | clear=False, 87 | dry_run=False, 88 | task_regex=None, 89 | dag_name="test_value", 90 | start_date=test_start_date, 91 | end_date=test_end_date, 92 | ), 93 | BackfillParams( 94 | clear=False, 95 | dry_run=False, 96 | task_regex="/ab+c/", 97 | dag_name="test_value", 98 | start_date=test_start_date, 99 | end_date=test_end_date, 100 | ), 101 | ] 102 | 103 | expected_results = [ 104 | [ 105 | "timeout", 106 | "60", 107 | "airflow", 108 | "tasks", 109 | "clear", 110 | "-s", 111 | "2022-01-01", 112 | "-e", 113 | "2022-01-10", 114 | "test_value", 115 | ], 116 | [ 117 | "airflow", 118 | "dags", 119 | "backfill", 120 | "--donot-pickle", 121 | "--dry-run", 122 | "-s", 123 | "2022-01-01", 124 | "-e", 125 | "2022-01-10", 126 | "test_value", 127 | ], 128 | [ 129 | "airflow", 130 | "tasks", 131 | "clear", 132 | "-y", 133 | "-s", 134 | "2022-01-01", 135 | "-e", 136 | "2022-01-10", 137 | "test_value", 138 | ], 139 | [ 140 | "airflow", 141 | "dags", 142 | "backfill", 143 | "--donot-pickle", 144 | "-s", 145 | "2022-01-01", 146 | "-e", 147 | "2022-01-10", 148 | "test_value", 149 | ], 150 | [ 151 | "airflow", 152 | "dags", 153 | "backfill", 154 | "--donot-pickle", 155 | "-t", 156 | "/ab+c/", 157 | "-s", 158 | "2022-01-01", 159 | "-e", 160 | "2022-01-10", 161 | "test_value", 162 | ], 163 | ] 164 | 165 | for params, result in zip(test_params, expected_results): 166 | backfill_command = params.generate_backfill_command() 167 | assert backfill_command == result 168 | -------------------------------------------------------------------------------- /tests/utils/test_tags.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from utils.tags import InvalidTagError, Tag 4 | 5 | 6 | @pytest.mark.parametrize( 7 | ("actual", "expected"), 8 | [ 9 | (Tag.ImpactTier.tier_1, "impact/tier_1"), 10 | (Tag.ImpactTier.tier_2, "impact/tier_2"), 11 | (Tag.ImpactTier.tier_3, "impact/tier_3"), 12 | ], 13 | ) 14 | def test_valid_impact_tag(actual, expected): 15 | assert actual == expected 16 | 17 | 18 | @pytest.mark.parametrize( 19 | ("obj", "attr", "expected"), 20 | [ 21 | (Tag.ImpactTier, "tier_1", "impact/tier_1"), 22 | (Tag.ImpactTier, "tier_2", "impact/tier_2"), 23 | (Tag.ImpactTier, "tier_3", "impact/tier_3"), 24 | ], 25 | ) 26 | def test_get_impact_tag(obj, attr, expected): 27 | assert 
getattr(obj, attr) == expected 28 | 29 | 30 | @pytest.mark.parametrize( 31 | "invalid_input", 32 | [ 33 | "tier_4", 34 | "", 35 | "bq-etl", 36 | ], 37 | ) 38 | def test_invalid_impact_tag(invalid_input): 39 | with pytest.raises(InvalidTagError): 40 | getattr(Tag.ImpactTier, invalid_input) 41 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-airflow/1d80ddd23c83f50787e46fa43e0929d9a13185de/utils/__init__.py -------------------------------------------------------------------------------- /utils/backfill.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import dataclasses 4 | import datetime 5 | import re 6 | 7 | 8 | @dataclasses.dataclass 9 | class BackfillParams: 10 | dag_name: str 11 | start_date: str 12 | end_date: str 13 | clear: bool 14 | dry_run: bool 15 | task_regex: str | None 16 | 17 | def validate_date_range(self) -> None: 18 | start_date = datetime.datetime.fromisoformat(self.start_date) 19 | end_date = datetime.datetime.fromisoformat(self.end_date) 20 | if start_date > end_date: 21 | raise ValueError( 22 | f"`start_date`={self.start_date} is greater than `end_date`={self.end_date}" 23 | ) 24 | 25 | def validate_regex_pattern(self) -> None: 26 | if self.task_regex: 27 | try: 28 | re.compile(self.task_regex) 29 | except re.error: 30 | raise ValueError( 31 | f"Invalid regex pattern for `task_regex`={self.task_regex}" 32 | ) from None 33 | 34 | def generate_backfill_command(self) -> list[str]: 35 | """ 36 | Backfill command based on the Airflow plugin implemented by hwoo. 37 | 38 | Original implementation in plugins/backfill/main.py 39 | 40 | """ 41 | # Construct the airflow command 42 | cmd = ["airflow"] 43 | 44 | if self.clear: 45 | cmd.extend(["tasks", "clear"]) 46 | 47 | if self.dry_run: 48 | # For dry runs we simply time out to avoid zombie procs waiting on user input. 49 | # The output is what we're interested in 50 | timeout_list = ["timeout", "60"] 51 | cmd = timeout_list + cmd 52 | else: 53 | cmd.append("-y") 54 | 55 | if self.task_regex: 56 | cmd.extend(["-t", str(self.task_regex)]) 57 | else: 58 | cmd.extend(["dags", "backfill", "--donot-pickle"]) 59 | if self.dry_run: 60 | cmd.append("--dry-run") 61 | 62 | if self.task_regex: 63 | cmd.extend(["-t", str(self.task_regex)]) 64 | 65 | cmd.extend( 66 | ["-s", str(self.start_date), "-e", str(self.end_date), str(self.dag_name)] 67 | ) 68 | 69 | return cmd 70 | -------------------------------------------------------------------------------- /utils/callbacks.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from airflow.models.taskinstance import clear_task_instances 4 | from airflow.utils.context import Context 5 | from airflow.utils.db import provide_session 6 | from sqlalchemy.orm.session import Session 7 | 8 | if TYPE_CHECKING: 9 | from airflow.models.dagrun import DagRun 10 | 11 | 12 | @provide_session 13 | def retry_tasks_callback(context: Context, session: Session | None = None) -> None: 14 | """ 15 | Clear tasks specified by the `retry_tasks` task param. 16 | 17 | Intended to be used as an `on_retry_callback` to also retry other tasks when a task fails.
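For example, a task created with `on_retry_callback=retry_tasks_callback` in a DAG whose params include `{"retry_tasks": ["other_task"]}` (illustrative task id) will also clear `other_task` whenever it goes into retry.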
18 | """ 19 | retry_task_ids: list[str] = context["params"].get("retry_tasks", []) 20 | if isinstance(retry_task_ids, str): 21 | retry_task_ids = [retry_task_ids] 22 | dag_run: DagRun = context["dag_run"] 23 | retry_task_instances = [ 24 | task_instance 25 | for task_instance in dag_run.get_task_instances(session=session) 26 | if task_instance.task_id in retry_task_ids 27 | ] 28 | if retry_task_instances: 29 | clear_task_instances(retry_task_instances, session=session) 30 | -------------------------------------------------------------------------------- /utils/constants.py: -------------------------------------------------------------------------------- 1 | DS_WEEKLY = ( 2 | "{% if dag_run.external_trigger %}" 3 | "{{ ds_nodash }}" 4 | "{% else %}" 5 | '{{ macros.ds_format(macros.ds_add(ds, 6), "%Y-%m-%d", "%Y%m%d") }}' 6 | "{% endif %}" 7 | ) 8 | 9 | FAILED_STATES = ["failed", "upstream_failed", "skipped"] 10 | 11 | ALLOWED_STATES = ["success"] 12 | -------------------------------------------------------------------------------- /utils/glam_subdags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-airflow/1d80ddd23c83f50787e46fa43e0929d9a13185de/utils/glam_subdags/__init__.py -------------------------------------------------------------------------------- /utils/glam_subdags/general.py: -------------------------------------------------------------------------------- 1 | from airflow.models import DAG 2 | 3 | from utils.gcp import bigquery_etl_query 4 | 5 | 6 | def merge_params(min_param, max_param, additional_params): 7 | parameters = ( 8 | f"min_sample_id:INT64:{min_param}", 9 | f"max_sample_id:INT64:{max_param}", 10 | ) 11 | 12 | if additional_params is not None: 13 | parameters += additional_params 14 | 15 | return parameters 16 | 17 | 18 | def repeated_subdag( 19 | parent_dag_name, 20 | child_dag_name, 21 | default_args, 22 | schedule_interval, 23 | billing_project_id, 24 | table_project_id, 25 | dataset_id, 26 | fully_qualified_dataset_id, 27 | additional_params=None, 28 | num_partitions=5, 29 | date_partition_parameter="submission_date", 30 | docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest", 31 | parallel=False, 32 | ): 33 | dag = DAG( 34 | f"{parent_dag_name}.{child_dag_name}", 35 | default_args=default_args, 36 | schedule_interval=schedule_interval, 37 | ) 38 | 39 | # This task runs first and replaces the relevant partition, followed 40 | # by the next tasks that append to the same partition of the same table. 
41 | NUM_SAMPLE_IDS = 100 42 | PARTITION_SIZE = NUM_SAMPLE_IDS // num_partitions 43 | 44 | if NUM_SAMPLE_IDS % num_partitions != 0: 45 | raise ValueError( 46 | f"Number of partitions must be a divisor " 47 | f"of the number of sample ids ({NUM_SAMPLE_IDS})" 48 | ) 49 | 50 | task_0 = bigquery_etl_query( 51 | reattach_on_restart=True, 52 | task_id=f"{child_dag_name}_0", 53 | destination_table=f"{child_dag_name}_v1", 54 | dataset_id=fully_qualified_dataset_id, 55 | sql_file_path=f"sql/{table_project_id}/{dataset_id}/{child_dag_name}_v1/query.sql", 56 | project_id=billing_project_id, 57 | depends_on_past=True, 58 | parameters=merge_params(0, PARTITION_SIZE - 1, additional_params), 59 | date_partition_parameter=date_partition_parameter, 60 | arguments=("--replace",), 61 | dag=dag, 62 | docker_image=docker_image, 63 | ) 64 | 65 | upstream_task = task_0 66 | 67 | for partition in range(1, num_partitions): 68 | min_param = partition * PARTITION_SIZE 69 | max_param = min_param + PARTITION_SIZE - 1 70 | 71 | task = bigquery_etl_query( 72 | reattach_on_restart=True, 73 | task_id=f"{child_dag_name}_{partition}", 74 | destination_table=f"{child_dag_name}_v1", 75 | dataset_id=fully_qualified_dataset_id, 76 | sql_file_path=f"sql/{table_project_id}/{dataset_id}/{child_dag_name}_v1/query.sql", 77 | project_id=billing_project_id, 78 | depends_on_past=True, 79 | parameters=merge_params(min_param, max_param, additional_params), 80 | date_partition_parameter=date_partition_parameter, 81 | arguments=( 82 | "--append_table", 83 | "--noreplace", 84 | ), 85 | dag=dag, 86 | docker_image=docker_image, 87 | ) 88 | upstream_task >> task 89 | if not parallel: 90 | upstream_task = task 91 | 92 | return dag 93 | -------------------------------------------------------------------------------- /utils/glam_subdags/histograms.py: -------------------------------------------------------------------------------- 1 | from airflow.models import DAG 2 | 3 | from utils.gcp import bigquery_etl_query 4 | 5 | GLAM_HISTOGRAM_AGGREGATES_FINAL_SUBDAG = "clients_histogram_aggregates" 6 | 7 | 8 | def histogram_aggregates_subdag( 9 | parent_dag_name, 10 | child_dag_name, 11 | default_args, 12 | schedule_interval, 13 | dataset_id, 14 | fully_qualified_dataset, 15 | billing_project_id, 16 | table_project_id="moz-fx-data-shared-prod", 17 | is_dev=False, 18 | docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest", 19 | ): 20 | GLAM_HISTOGRAM_AGGREGATES_SUBDAG = f"{parent_dag_name}.{child_dag_name}" 21 | default_args["depends_on_past"] = True 22 | dag = DAG( 23 | GLAM_HISTOGRAM_AGGREGATES_SUBDAG, 24 | default_args=default_args, 25 | schedule_interval=schedule_interval, 26 | ) 27 | 28 | clients_histogram_aggregates_new = bigquery_etl_query( 29 | reattach_on_restart=True, 30 | task_id="clients_histogram_aggregates_new", 31 | destination_table="clients_histogram_aggregates_new_v1", 32 | dataset_id=fully_qualified_dataset, 33 | sql_file_path=f"sql/{table_project_id}/{dataset_id}/clients_histogram_aggregates_new_v1/query.sql", 34 | project_id=billing_project_id, 35 | date_partition_parameter=None, 36 | parameters=("submission_date:DATE:{{ds}}",), 37 | arguments=("--replace",), 38 | dag=dag, 39 | docker_image=docker_image, 40 | ) 41 | 42 | clients_histogram_aggregates_final = bigquery_etl_query( 43 | reattach_on_restart=True, 44 | task_id="clients_histogram_aggregates_v2", 45 | destination_table="clients_histogram_aggregates_v2", 46 | dataset_id=fully_qualified_dataset, 47 | 
sql_file_path=f"sql/{table_project_id}/{dataset_id}/clients_histogram_aggregates_v2/query.sql", 48 | project_id=billing_project_id, 49 | depends_on_past=True, 50 | parameters=("submission_date:DATE:{{ds}}",), 51 | date_partition_parameter=None, 52 | arguments=("--replace",), 53 | dag=dag, 54 | docker_image=docker_image, 55 | ) 56 | 57 | clients_histogram_aggregates_new >> clients_histogram_aggregates_final 58 | return dag 59 | -------------------------------------------------------------------------------- /utils/patched/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/telemetry-airflow/1d80ddd23c83f50787e46fa43e0929d9a13185de/utils/patched/__init__.py -------------------------------------------------------------------------------- /utils/slack.py: -------------------------------------------------------------------------------- 1 | from airflow.models import Variable 2 | from airflow.providers.slack.operators.slack import SlackAPIPostOperator 3 | 4 | SLACK_CHANNEL = "#airflow-alerts" 5 | 6 | 7 | def if_task_fails_alert_slack(context): 8 | failed_alert = SlackAPIPostOperator( 9 | task_id="slack_failed", 10 | channel=SLACK_CHANNEL, 11 | token=Variable.get("slack_secret_token"), 12 | text=""" 13 | :red_circle: Task Failed. 14 | *Task*: {task} 15 | *Dag*: {dag} 16 | *Date*: {ds} 17 | """.format( 18 | task=context.get("task_instance").task_id, 19 | dag=context.get("task_instance").dag_id, 20 | ds=context.get("ds"), 21 | ), 22 | ) 23 | return failed_alert.execute(context=context) 24 | -------------------------------------------------------------------------------- /utils/tags.py: -------------------------------------------------------------------------------- 1 | """Module with Airflow tag definitions.""" 2 | 3 | from enum import Enum, member 4 | 5 | 6 | class InvalidTagError(AttributeError): 7 | pass 8 | 9 | 10 | class Tag(Enum): 11 | """Enum containing available Airflow tags.""" 12 | 13 | def __getattr__(self, item: str) -> str: 14 | """ 15 | Simplifies accessing enum values. 16 | 17 | Instead of Tag.ImpactTier.value.tier_1.value we can 18 | just use Tag.ImpactTier.tier_1. 19 | Simplify accessing enum values. 20 | 21 | Instead of Tag.ImpactTier.value.tier_1.value we can just use 22 | Tag.ImpactTier.tier_1. 23 | 24 | # source: https://newbedev.com/enum-of-enums-in-python 25 | """ 26 | 27 | if item == "_value_": 28 | raise InvalidTagError 29 | 30 | try: 31 | ret_val = getattr(self.value, item).value 32 | except AttributeError as _err: 33 | raise InvalidTagError() from _err 34 | 35 | return ret_val 36 | 37 | @member 38 | class ImpactTier(Enum): 39 | """Valid options for Impact tier tag.""" 40 | 41 | tier_1: str = "impact/tier_1" 42 | tier_2: str = "impact/tier_2" 43 | tier_3: str = "impact/tier_3" 44 | 45 | @member 46 | class Triage(Enum): 47 | """Tag for conveying information to the engineer on triage.""" 48 | 49 | confidential: str = "triage/confidential" 50 | record_only: str = "triage/record_only" 51 | no_triage: str = "triage/no_triage" 52 | 53 | @member 54 | class Repo(Enum): 55 | """Valid options for Repo tag.""" 56 | 57 | bqetl: str = "repo/bigquery-etl" 58 | airflow: str = "repo/telemetry-airflow" 59 | private_bqetl: str = "repo/private-bigquery-etl" 60 | --------------------------------------------------------------------------------