├── .astro-registry.yaml ├── .astro └── config.yaml ├── .astrocloud └── config.yaml ├── .dockerignore ├── .gitignore ├── Dockerfile ├── README.md ├── dags ├── bigquery_examples │ └── simple_bigquery.py ├── firebolt_examples │ └── simple_firebolt.py ├── great_expectations │ ├── great_expectations_bigquery.py │ ├── great_expectations_mlflow.py │ ├── great_expectations_pandas_df.py │ ├── great_expectations_redshift.py │ ├── great_expectations_snowflake.py │ └── great_expectations_snowflake_write_audit_publish.py ├── redshift_examples │ ├── simple_redshift_1.py │ ├── simple_redshift_2.py │ └── simple_redshift_3.py ├── snowflake_examples │ ├── complex_snowflake_transform.py │ ├── simple_snowflake.py │ ├── snowflake_dynamic_write_audit_publish.py │ ├── snowflake_write_audit_publish.py │ └── taxi_snowflake.py └── sql_examples │ ├── sql_check.py │ └── sql_check_redshift_etl.py ├── include ├── data │ ├── yellow_tripdata_sample_2019-01.csv │ └── yellow_tripdata_sample_2019-02.csv ├── forestfire_checks │ └── checks.py ├── gcs_xcom_backend.py ├── great_expectations │ ├── .gitignore │ ├── checkpoints │ │ ├── mlflow │ │ │ ├── feature_chk.yml │ │ │ └── preprocess_chk.yml │ │ └── taxi │ │ │ ├── fail │ │ │ └── chk.yml │ │ │ └── pass │ │ │ └── chk.yml │ ├── configs │ │ ├── bigquery_configs.py │ │ ├── mlflow_checkpoint_config.py │ │ ├── redshift_configs.py │ │ ├── s3_configs.py │ │ └── snowflake_configs.py │ ├── expectations │ │ ├── .ge_store_backend_id │ │ ├── mlflow │ │ │ ├── census_adult_income_features.json │ │ │ └── census_adult_income_preprocess.json │ │ ├── taxi │ │ │ ├── demo.json │ │ │ └── demo_fail.json │ │ └── test_suite.json │ ├── great_expectations.yml │ ├── notebooks │ │ ├── pandas │ │ │ └── validation_playground.ipynb │ │ ├── spark │ │ │ └── validation_playground.ipynb │ │ └── sql │ │ │ └── validation_playground.ipynb │ └── plugins │ │ └── custom_data_docs │ │ └── styles │ │ └── data_docs_custom_styles.css ├── grid_configs.py ├── libs │ └── schema_reg │ │ ├── __init__.py │ │ └── base_schema_transforms.py ├── metrics.py ├── sample_data │ ├── cost_data │ │ └── cost_data.csv │ ├── forestfire_data │ │ ├── forestfires.csv │ │ ├── forestfires_corrupt.csv │ │ └── forestfires_invalid.csv │ └── yellow_trip_data │ │ ├── yellow_tripdata_sample_2019-01.csv │ │ └── yellow_tripdata_sample_2019-02.csv ├── sql │ ├── bigquery_examples │ │ ├── load_bigquery_forestfire_data.sql │ │ └── row_quality_bigquery_forestfire_check.sql │ ├── dbt_examples │ │ └── copy_store_failures.sql │ ├── firebolt_examples │ │ ├── create_table.sql │ │ ├── drop_table.sql │ │ ├── load_forestfire_data.sql │ │ └── quality_check_template.sql │ ├── great_expectations_examples │ │ ├── copy_yellow_tripdata_snowflake_staging.sql │ │ ├── create_snowflake_yellow_tripdata_stage.sql │ │ ├── create_yellow_tripdata_redshift_table.sql │ │ ├── create_yellow_tripdata_snowflake_table.sql │ │ ├── delete_yellow_tripdata_table.sql │ │ └── table_schemas │ │ │ └── tripdata_schema.json │ ├── redshift_examples │ │ ├── create_redshift_forestfire_table.sql │ │ ├── drop_redshift_forestfire_table.sql │ │ ├── row_quality_redshift_forestfire_check.sql │ │ └── validate_redshift_forestfire_load.sql │ ├── snowflake_examples │ │ ├── copy_forestfire_snowflake_audit.sql │ │ ├── create_cost_table.sql │ │ ├── create_forestfire_cost_table.sql │ │ ├── create_forestfire_table.sql │ │ ├── create_snowflake_yellow_tripdata_stage.sql │ │ ├── create_snowflake_yellow_tripdata_table.sql │ │ ├── delete_forestfire_table.sql │ │ ├── delete_snowflake_table.sql │ │ ├── load_cost_data.sql 
│ │ ├── load_forestfire_cost_data.sql │ │ ├── load_snowflake_forestfire_data.sql │ │ ├── load_yellow_tripdata.sql │ │ ├── row_quality_snowflake_forestfire_check.sql │ │ ├── row_quality_yellow_tripdata_check.sql │ │ ├── row_quality_yellow_tripdata_template.sql │ │ ├── table_schemas │ │ │ └── forestfire_schema.json │ │ └── transform_forestfire_cost_table.sql │ └── sql_examples │ │ ├── create_redshift_yellow_tripdata_table.sql │ │ ├── drop_redshift_yellow_tripdata_table.sql │ │ └── row_quality_yellow_tripdata_check.sql └── validation │ └── forestfire_validation.json ├── packages.txt ├── plugins ├── firebolt_operator_test.py └── snowflake_check_operators.py └── requirements.txt /.astro-registry.yaml: -------------------------------------------------------------------------------- 1 | # These categories will be applied to all DAGs in the repo. 2 | categories: 3 | - ETL/ELT 4 | - Data Quality 5 | - Big Data and Analytics 6 | - Databases 7 | # List of DAGs that should be published to the Astronomer Registry. 8 | dags: 9 | - path: dags/bigquery_examples/simple_bigquery.py 10 | - path: dags/dbt_examples/copy_store_failures_bigquery.py 11 | - path: dags/dbt_examples/copy_store_failures_redshift.py 12 | - path: dags/dbt_examples/copy_store_failures_snowflake.py 13 | - path: dags/firebolt_examples/simple_firebolt.py 14 | - path: dags/great_expectations/v2/simple_great_expectations_bigquery_el_v2.py 15 | - path: dags/great_expectations/v2/simple_great_expectations_example_v2.py 16 | - path: dags/redshift_examples/simple_redshift_1.py 17 | - path: dags/redshift_examples/simple_redshift_2.py 18 | - path: dags/redshift_examples/simple_redshift_3.py 19 | - path: dags/snowflake_examples/complex_snowflake_transform.py 20 | - path: dags/snowflake_examples/simple_snowflake.py 21 | - path: dags/snowflake_examples/snowflake_dynamic_write_audit_publish.py 22 | - path: dags/snowflake_examples/snowflake_write_audit_publish.py 23 | - path: dags/snowflake_examples/taxi_snowflake.py 24 | - path: dags/sql_examples/sql_check_redshift_etl.py 25 | - path: dags/sql_examples/sql_check.py -------------------------------------------------------------------------------- /.astro/config.yaml: -------------------------------------------------------------------------------- 1 | project: 2 | name: airflow-data-quality-demo 3 | -------------------------------------------------------------------------------- /.astrocloud/config.yaml: -------------------------------------------------------------------------------- 1 | project: 2 | name: airflow-data-quality-demo 3 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .astro 2 | .astrocloud 3 | .github 4 | .git 5 | .gitignore 6 | venv/ 7 | tests/ 8 | .env 9 | airflow_settings.yaml 10 | logs/ 11 | venv/ -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .git 2 | .env 3 | airflow_settings.yaml 4 | *.pyc 5 | */__pycache__/* 6 | .vim/ 7 | Pip* 8 | *DS_Store 9 | include/gcloud_key/* 10 | venv/* 11 | dag_graph_generator.py 12 | task_dependency_tree.json 13 | include/openlineage* 14 | include/openlineage/* 15 | *.code-workspace -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM 
quay.io/astronomer/astro-runtime:7.3.0 2 | ENV AIRFLOW__CORE__ENABLE_XCOM_PICKLING=True 3 | 4 | USER root 5 | # Required for some ML/DS dependencies 6 | RUN apt-get update -y 7 | RUN apt-get install libgomp1 -y 8 | RUN apt-get install -y git 9 | USER astro 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Quality Demo 2 | This repo contains DAGs to demonstrate a variety of data quality and integrity checks. 3 | All DAGs can be found under the dags/ folder, which is partitioned by backend data store 4 | or provider. Specific data stores need connections and may require accounts with cloud providers. Further details are provided in the data store specific sections below. 5 | 6 | ### Requirements 7 | The Astronomer CLI and Docker installed locally are needed to run all DAGs in this repo. Additional requirements per project are listed below. 8 | Provider packages are listed in the `requirements.txt` file. 9 | 10 | #### Redshift DAGs: 11 | - An AWS account 12 | - An S3 bucket 13 | - An active Redshift cluster 14 | 15 | #### BigQuery DAGs: 16 | - A GCP account 17 | - A service role with create, modify, and delete privileges on BigQuery 18 | - An active GCP project with BigQuery 19 | 20 | #### Snowflake DAGs: 21 | - A Snowflake account 22 | 23 | #### Firebolt DAGs: 24 | - A Firebolt account 25 | 26 | #### Great Expectations DAGs: 27 | - An account with service roles and tables as specified in one of the data stores above 28 | 29 | #### SQL DAGs: 30 | - A running SQL database 31 | 32 | ### Getting Started 33 | The easiest way to run these example DAGs is to use the Astronomer CLI to get an Airflow instance up and running locally: 34 | 1. [Install the Astronomer CLI](https://www.astronomer.io/docs/cloud/stable/develop/cli-quickstart). 35 | 2. Clone this repo locally and navigate into it. 36 | 3. Start Airflow locally by running `astro dev start`. 37 | 4. Create all necessary connections and variables - see below for specific DAG cases. 38 | 5. Navigate to localhost:8080 in your browser and you should see the tutorial DAGs there. 39 | 40 | #### Redshift DAGs: 41 | In addition to the Getting Started steps, connections to AWS and Postgres (for Redshift) are needed to upload files to S3 and load to Redshift. 42 | Under `Admin -> Connections` in the Airflow UI, add a new connection named `aws_default`. The `Conn Type` is `Amazon Web Services`. In the `Login` field, enter your AWS Access Key associated with your account. In the `Password` field, enter the corresponding AWS Secret Access Key. Press `Save` at the bottom. 43 | Add another connection named `redshift_default`. The `Conn Type` is `Postgres`. The host is your Redshift host name, something like `cluster-name.XXXXXXXXXXXX.region.redshift.amazonaws.com`. The `Schema` is your Redshift schema name. `Login` is the Redshift username. `Password` is the corresponding password to access the cluster. `Port` should be 5439 (the Redshift default). Make sure your IP address is whitelisted in Redshift, and that Redshift is accepting connections outside of your VPC! 44 | 45 | Variables needed are specified in each DAG and can be set under `Admin -> Variables` in the UI. 46 | 47 | #### BigQuery DAGs: 48 | In addition to the Getting Started steps, connections to GCP and BigQuery are needed to create BigQuery Datasets, tables, and insert and delete data there. 
49 | Under `Admin -> Connections` in the Airflow UI, add a new connection with Conn ID as `google_cloud_default`. The connection type is `Google Cloud`. A GCP key associated with a service account that has access to BigQuery is needed; for more information on generating a key, [follow the instructions in this guide](https://cloud.google.com/iam/docs/creating-managing-service-account-keys). The key can be added either as a path in the Keyfile Path field, or the JSON can be copied and pasted directly into the Keyfile JSON field. In the case of the Keyfile Path, a relative path is allowed; if using Astronomer, the recommended location is under the `include/` directory, as Docker will mount all files and directories under it. Make sure the file name is included in the path. Finally, add the project ID to the Project ID field. No scopes should be needed. 50 | 51 | Variables needed are specified in each DAG and can be set under `Admin -> Variables` in the UI. 52 | 53 | #### Snowflake DAGs: 54 | In addition to the Getting Started steps, a connection to Snowflake is needed to run DAGs. Under `Admin -> Connections` in the Airflow UI, add a new connection with Conn ID as `snowflake_default`. The connection type is `Snowflake`. The host field should be the full URL that you use to log into Snowflake, for example `https://[account].[region].snowflakecomputing.com`. Fill out the `Login`, `Password`, `Schema`, `Account`, `Database`, `Region`, `Role`, and `Warehouse` fields with your information. 55 | 56 | #### Firebolt DAGs: 57 | In addition to the Getting Started steps, a connection to Firebolt is needed to run DAGs. Under `Admin -> Connections` in the Airflow UI, add a new connection with Conn ID as `firebolt_default`. The connection type is `Firebolt`. The host field should be `api.app.firebolt.com`. Fill in the `Login` and `Password` fields with your account login and password. In the `Advanced Connection Properties` field, enter at least an engine name in a dictionary, for example: `{"engine_name": "firebolt_test_general_purpose"}`. 58 | 59 | #### Great Expectations DAGs: 60 | 61 | For `airflow-provider-great-expectations<=0.1.5`: 62 | In addition to the Getting Started steps, Great Expectations requires its own connections (separate from the Airflow Connections used by other tasks in the DAG) when using outside data sources. These connections can be made in the file located at `include/great_expectations/uncommitted/config_variables.yml`. Note: you will have to create this file yourself; it is not included in the repository. Example connections in YAML are of the form: 63 | 64 | ``` 65 | my_bigquery_db: 66 | bigquery://[gcp-id]/[dataset] 67 | my_snowflake_db: 68 | snowflake://[username]:[password]@[account].[region]/[database]/[schema]?warehouse=[warehouse]&role=[role] 69 | my_redshift_db: 70 | postgresql+psycopg2://[username]:[password]@[database_uri]:5439/[default_db] 71 | ``` 72 | 73 | See the Great Expectations docs for more information on [BigQuery](https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/database/bigquery/), [Redshift](https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/database/redshift/), or [Snowflake](https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/database/snowflake/). More connections can be added via the Great Expectations CLI tool. 74 | 75 | Files related to the Great Expectations DAGs can be found under `include/great_expectations/`, and the referenced SQL queries under `include/sql/great_expectations_examples/`.
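Several of the example DAGs, including the Great Expectations Redshift and Snowflake examples, also read an `aws_configs` Airflow Variable. As a sketch (the bucket, key prefix, and table values below are placeholders, not values defined by this repo), its value is a JSON object of the form:

```
{
    "s3_bucket": "my-example-bucket",
    "s3_key_prefix": "data-quality-demo",
    "redshift_table": "my_example_table"
}
```

The `gcp_project_id` Variable used by the BigQuery examples is a plain string containing your GCP project ID.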
76 | 77 | Variables needed are specified in each DAG and can be set under `Admin -> Variables` in the UI. 78 | 79 | #### SQL DAGs: 80 | In addition to the Getting Started steps, a SQL database (sqlite, Postgres, MySQL, etc...) needs to be up and running. This database may be local or cloud-hosted. An Airflow Connection to the database is needed. 81 | -------------------------------------------------------------------------------- /dags/bigquery_examples/simple_bigquery.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### Simple Extract/Load Pipeline with Data Quality Checks Using BigQuery 3 | 4 | Before running the DAG, set the following in an Airflow or Environment Variable: 5 | - key: gcp_project_id 6 | value: [gcp_project_id] 7 | Fully replacing [gcp_project_id] with the actual ID. 8 | 9 | Ensure you have a connection to GCP, using a role with access to BigQuery 10 | and the ability to create, modify, and delete datasets and tables. 11 | 12 | What makes this a simple data quality case is: 13 | 1. Absolute ground truth: the local CSV file is considered perfect and immutable. 14 | 2. No transformations or business logic. 15 | 3. Exact values of data to quality check are known. 16 | """ 17 | 18 | import json 19 | 20 | from airflow import DAG 21 | from airflow.models.baseoperator import chain 22 | from airflow.operators.dummy_operator import DummyOperator 23 | from airflow.providers.google.cloud.operators.bigquery import ( 24 | BigQueryCheckOperator, BigQueryCreateEmptyDatasetOperator, 25 | BigQueryCreateEmptyTableOperator, BigQueryDeleteDatasetOperator, 26 | BigQueryInsertJobOperator, BigQueryValueCheckOperator) 27 | from airflow.providers.google.cloud.sensors.bigquery import \ 28 | BigQueryTableExistenceSensor 29 | from airflow.utils.dates import datetime 30 | from airflow.utils.task_group import TaskGroup 31 | 32 | DATASET = "simple_bigquery_example_dag" 33 | TABLE = "forestfires" 34 | 35 | with DAG( 36 | "simple_bigquery", 37 | start_date=datetime(2021, 1, 1), 38 | description="Example DAG showcasing loading and data quality checking with BigQuery.", 39 | doc_md=__doc__, 40 | schedule_interval=None, 41 | template_searchpath="/usr/local/airflow/include/sql/bigquery_examples/", 42 | catchup=False, 43 | ) as dag: 44 | 45 | """ 46 | #### BigQuery dataset creation 47 | Create the dataset to store the sample data tables. 48 | """ 49 | create_dataset = BigQueryCreateEmptyDatasetOperator( 50 | task_id="create_dataset", dataset_id=DATASET 51 | ) 52 | 53 | """ 54 | #### BigQuery table creation 55 | Create the table to store sample forest fire data. 
56 | """ 57 | create_table = BigQueryCreateEmptyTableOperator( 58 | task_id="create_table", 59 | dataset_id=DATASET, 60 | table_id=TABLE, 61 | schema_fields=[ 62 | {"name": "id", "type": "INTEGER", "mode": "REQUIRED"}, 63 | {"name": "y", "type": "INTEGER", "mode": "NULLABLE"}, 64 | {"name": "month", "type": "STRING", "mode": "NULLABLE"}, 65 | {"name": "day", "type": "STRING", "mode": "NULLABLE"}, 66 | {"name": "ffmc", "type": "FLOAT", "mode": "NULLABLE"}, 67 | {"name": "dmc", "type": "FLOAT", "mode": "NULLABLE"}, 68 | {"name": "dc", "type": "FLOAT", "mode": "NULLABLE"}, 69 | {"name": "isi", "type": "FLOAT", "mode": "NULLABLE"}, 70 | {"name": "temp", "type": "FLOAT", "mode": "NULLABLE"}, 71 | {"name": "rh", "type": "FLOAT", "mode": "NULLABLE"}, 72 | {"name": "wind", "type": "FLOAT", "mode": "NULLABLE"}, 73 | {"name": "rain", "type": "FLOAT", "mode": "NULLABLE"}, 74 | {"name": "area", "type": "FLOAT", "mode": "NULLABLE"}, 75 | ], 76 | ) 77 | 78 | """ 79 | #### BigQuery table check 80 | Ensure that the table was created in BigQuery before inserting data. 81 | """ 82 | check_table_exists = BigQueryTableExistenceSensor( 83 | task_id="check_for_table", 84 | project_id="{{ var.value.gcp_project_id }}", 85 | dataset_id=DATASET, 86 | table_id=TABLE, 87 | ) 88 | 89 | """ 90 | #### Insert data 91 | Insert data into the BigQuery table using an existing SQL query (stored in 92 | a file under dags/sql). 93 | """ 94 | load_data = BigQueryInsertJobOperator( 95 | task_id="insert_query", 96 | configuration={ 97 | "query": { 98 | "query": "{% include 'load_bigquery_forestfire_data.sql' %}", 99 | "useLegacySql": False, 100 | } 101 | }, 102 | ) 103 | 104 | """ 105 | #### Row-level data quality check 106 | Run data quality checks on a few rows, ensuring that the data in BigQuery 107 | matches the ground truth in the correspoding JSON file. 108 | """ 109 | with open("include/validation/forestfire_validation.json") as ffv: 110 | with TaskGroup(group_id="row_quality_checks") as quality_check_group: 111 | ffv_json = json.load(ffv) 112 | for id, values in ffv_json.items(): 113 | values["id"] = id 114 | values["dataset"] = DATASET 115 | values["table"] = TABLE 116 | BigQueryCheckOperator( 117 | task_id=f"check_row_data_{id}", 118 | sql="row_quality_bigquery_forestfire_check.sql", 119 | use_legacy_sql=False, 120 | params=values, 121 | ) 122 | 123 | """ 124 | #### Table-level data quality check 125 | Run a row count check to ensure all data was uploaded to BigQuery properly. 126 | """ 127 | check_bq_row_count = BigQueryValueCheckOperator( 128 | task_id="check_row_count", 129 | sql=f"SELECT COUNT(*) FROM {DATASET}.{TABLE}", 130 | pass_value=9, 131 | use_legacy_sql=False, 132 | ) 133 | 134 | """ 135 | #### Delete test dataset and table 136 | Clean up the dataset and table created for the example. 
137 | """ 138 | delete_dataset = BigQueryDeleteDatasetOperator( 139 | task_id="delete_dataset", dataset_id=DATASET, delete_contents=True 140 | ) 141 | 142 | begin = DummyOperator(task_id="begin") 143 | end = DummyOperator(task_id="end") 144 | 145 | chain( 146 | begin, 147 | create_dataset, 148 | create_table, 149 | check_table_exists, 150 | load_data, 151 | [quality_check_group, check_bq_row_count], 152 | delete_dataset, 153 | end, 154 | ) 155 | -------------------------------------------------------------------------------- /dags/firebolt_examples/simple_firebolt.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | """Example Firebolt Data Quality DAG 19 | 20 | This DAG starts the specified Firebolt engine, creates a sample table, loads sample data into the table, 21 | runs the quality checks in the CHECKS dictionary, then deletes the table and stops the engine. 22 | 23 | Checks work by running a MIN() function over the specific aggregate check, where the aggregate 24 | check is contained in a CASE statement. The CASE statement checks the result of the condition; 25 | if true, the CASE statement returns 1, else 0. Then MIN() will return 0 if any row returns a 26 | false result, and 1 otherwise. 27 | 28 | Note: the Firebolt operator currently does not support templated SQL queries.
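For illustration only, a sketch of what the rendered check is expected to look like (the actual
quality_check_template.sql under include/sql/firebolt_examples/ may differ):

    SELECT MIN(CASE WHEN <check_statement> THEN 1 ELSE 0 END) AS <col>_check
    FROM <table>;

SQLCheckOperator fails the task if the single value returned is 0 (falsy).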
29 | """ 30 | 31 | from datetime import datetime 32 | 33 | from airflow import DAG 34 | from airflow.models.baseoperator import chain 35 | from airflow.operators.sql import SQLCheckOperator 36 | from airflow.utils.task_group import TaskGroup 37 | from firebolt_provider.operators.firebolt import (FireboltOperator, 38 | FireboltStartEngineOperator, 39 | FireboltStopEngineOperator) 40 | 41 | FIREBOLT_CONN_ID = "firebolt_default" 42 | FIREBOLT_SAMPLE_TABLE = "forest_fire" 43 | FIREBOLT_DATABASE = "firebolt_test" 44 | FIREBOLT_ENGINE = "firebolt_test_general_purpose" 45 | 46 | CHECKS = {"id": "'column' IS NOT NULL", "ffmc": "MAX(ffmc) < 100"} 47 | 48 | with DAG( 49 | "simple_firebolt", 50 | schedule_interval=None, 51 | start_date=datetime(2021, 1, 1), 52 | doc_md=__doc__, 53 | default_args={ 54 | "conn_id": FIREBOLT_CONN_ID, 55 | "firebolt_conn_id": FIREBOLT_CONN_ID, 56 | "database": FIREBOLT_DATABASE, 57 | "engine_name": FIREBOLT_ENGINE, 58 | }, 59 | template_searchpath="/usr/local/airflow/include/sql/firebolt_examples/", 60 | catchup=False, 61 | ) as dag: 62 | 63 | start_engine = FireboltStartEngineOperator(task_id="start_engine") 64 | 65 | create_table = FireboltOperator( 66 | task_id="create_table", 67 | sql="create_table.sql", 68 | params={"table": FIREBOLT_SAMPLE_TABLE}, 69 | ) 70 | 71 | load_data = FireboltOperator( 72 | task_id="load_data", 73 | sql="load_forestfire_data.sql", 74 | params={"table": FIREBOLT_SAMPLE_TABLE}, 75 | ) 76 | 77 | with TaskGroup(group_id="aggregate_quality_checks") as check_group: 78 | for name, statement in CHECKS.items(): 79 | check = SQLCheckOperator( 80 | task_id=f"check_{name}", 81 | sql="quality_check_template.sql", 82 | params={ 83 | "col": name, 84 | "check_statement": statement, 85 | "table": FIREBOLT_SAMPLE_TABLE, 86 | }, 87 | ) 88 | 89 | drop_table = FireboltOperator( 90 | task_id="drop_table", 91 | sql="drop_table.sql", 92 | params={"table": FIREBOLT_SAMPLE_TABLE}, 93 | ) 94 | 95 | stop_engine = FireboltStopEngineOperator(task_id="stop_engine") 96 | 97 | chain( 98 | start_engine, 99 | create_table, 100 | load_data, 101 | check_group, 102 | drop_table, 103 | stop_engine, 104 | ) 105 | -------------------------------------------------------------------------------- /dags/great_expectations/great_expectations_bigquery.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### Simple EL Pipeline with Data Quality Checks Using BigQuery and Great Expectations 3 | 4 | Before running the DAG, set the following in an Airflow or Environment Variable: 5 | - key: gcp_project_id 6 | value: [gcp_project_id] 7 | Fully replacing [gcp_project_id] with the actual ID. 8 | 9 | Ensure you have a connection to GCP, using a role with access to BigQuery 10 | and the ability to create, modify, and delete datasets and tables. 11 | 12 | What makes this a simple data quality case is: 13 | 1. Absolute ground truth: the local CSV file is considered perfect and immutable. 14 | 2. No transformations or business logic. 15 | 3. Exact values of data to quality check are known. 
16 | """ 17 | 18 | import os 19 | from datetime import datetime 20 | from pathlib import Path 21 | 22 | from airflow import DAG 23 | from airflow.models.baseoperator import chain 24 | from airflow.providers.google.cloud.operators.bigquery import ( 25 | BigQueryCreateEmptyDatasetOperator, BigQueryCreateEmptyTableOperator, 26 | BigQueryDeleteDatasetOperator) 27 | from airflow.providers.google.cloud.transfers.gcs_to_bigquery import \ 28 | GCSToBigQueryOperator 29 | from airflow.providers.google.cloud.transfers.local_to_gcs import \ 30 | LocalFilesystemToGCSOperator 31 | from great_expectations_provider.operators.great_expectations import \ 32 | GreatExpectationsOperator 33 | 34 | base_path = Path(__file__).parents[2] 35 | data_file = os.path.join( 36 | base_path, 37 | "include", 38 | "sample_data/yellow_trip_data/yellow_tripdata_sample_2019-01.csv", 39 | ) 40 | ge_root_dir = os.path.join(base_path, "include", "great_expectations") 41 | 42 | # In a production DAG, the global variables below should be stored as Airflow 43 | # or Environment variables. 44 | bq_dataset = "great_expectations_bigquery_example" 45 | bq_table = "taxi" 46 | gcp_bucket = "great-expectations-demo" 47 | gcp_data_dest = "data/yellow_tripdata_sample_2019-01.csv" 48 | 49 | with DAG( 50 | "great_expectations.bigquery", 51 | description="Example DAG showcasing loading and data quality checking with BigQuery and Great Expectations.", 52 | doc_md=__doc__, 53 | schedule_interval=None, 54 | start_date=datetime(2021, 1, 1), 55 | catchup=False, 56 | ) as dag: 57 | 58 | """ 59 | #### BigQuery dataset creation 60 | Create the dataset to store the sample data tables. 61 | """ 62 | create_dataset = BigQueryCreateEmptyDatasetOperator( 63 | task_id="create_dataset", dataset_id=bq_dataset 64 | ) 65 | 66 | """ 67 | #### Upload taxi data to GCS 68 | Upload the test data to GCS so it can be transferred to BigQuery. 
69 | """ 70 | upload_taxi_data = LocalFilesystemToGCSOperator( 71 | task_id="upload_taxi_data", 72 | src=data_file, 73 | dst=gcp_data_dest, 74 | bucket=gcp_bucket, 75 | ) 76 | 77 | """ 78 | #### Create Temp Table for GE in BigQuery 79 | """ 80 | create_temp_table = BigQueryCreateEmptyTableOperator( 81 | task_id="create_temp_table", 82 | dataset_id=bq_dataset, 83 | table_id=f"{bq_table}_temp", 84 | schema_fields=[ 85 | {"name": "vendor_id", "type": "INTEGER", "mode": "REQUIRED"}, 86 | {"name": "pickup_datetime", "type": "DATETIME", "mode": "NULLABLE"}, 87 | {"name": "dropoff_datetime", "type": "DATETIME", "mode": "NULLABLE"}, 88 | {"name": "passenger_count", "type": "INTEGER", "mode": "NULLABLE"}, 89 | {"name": "trip_distance", "type": "FLOAT", "mode": "NULLABLE"}, 90 | {"name": "rate_code_id", "type": "INTEGER", "mode": "NULLABLE"}, 91 | {"name": "store_and_fwd_flag", "type": "STRING", "mode": "NULLABLE"}, 92 | {"name": "pickup_location_id", "type": "INTEGER", "mode": "NULLABLE"}, 93 | {"name": "dropoff_location_id", "type": "INTEGER", "mode": "NULLABLE"}, 94 | {"name": "payment_type", "type": "INTEGER", "mode": "NULLABLE"}, 95 | {"name": "fare_amount", "type": "FLOAT", "mode": "NULLABLE"}, 96 | {"name": "extra", "type": "FLOAT", "mode": "NULLABLE"}, 97 | {"name": "mta_tax", "type": "FLOAT", "mode": "NULLABLE"}, 98 | {"name": "tip_amount", "type": "FLOAT", "mode": "NULLABLE"}, 99 | {"name": "tolls_amount", "type": "FLOAT", "mode": "NULLABLE"}, 100 | {"name": "improvement_surcharge", "type": "FLOAT", "mode": "NULLABLE"}, 101 | {"name": "total_amount", "type": "FLOAT", "mode": "NULLABLE"}, 102 | {"name": "congestion_surcharge", "type": "FLOAT", "mode": "NULLABLE"}, 103 | ], 104 | ) 105 | 106 | """ 107 | #### Transfer data from GCS to BigQuery 108 | Moves the data uploaded to GCS in the previous step to BigQuery, where 109 | Great Expectations can run a test suite against it. 
110 | """ 111 | transfer_taxi_data = GCSToBigQueryOperator( 112 | task_id="taxi_data_gcs_to_bigquery", 113 | bucket=gcp_bucket, 114 | source_objects=[gcp_data_dest], 115 | skip_leading_rows=1, 116 | destination_project_dataset_table="{}.{}".format(bq_dataset, bq_table), 117 | schema_fields=[ 118 | {"name": "vendor_id", "type": "INTEGER", "mode": "REQUIRED"}, 119 | {"name": "pickup_datetime", "type": "DATETIME", "mode": "NULLABLE"}, 120 | {"name": "dropoff_datetime", "type": "DATETIME", "mode": "NULLABLE"}, 121 | {"name": "passenger_count", "type": "INTEGER", "mode": "NULLABLE"}, 122 | {"name": "trip_distance", "type": "FLOAT", "mode": "NULLABLE"}, 123 | {"name": "rate_code_id", "type": "INTEGER", "mode": "NULLABLE"}, 124 | {"name": "store_and_fwd_flag", "type": "STRING", "mode": "NULLABLE"}, 125 | {"name": "pickup_location_id", "type": "INTEGER", "mode": "NULLABLE"}, 126 | {"name": "dropoff_location_id", "type": "INTEGER", "mode": "NULLABLE"}, 127 | {"name": "payment_type", "type": "INTEGER", "mode": "NULLABLE"}, 128 | {"name": "fare_amount", "type": "FLOAT", "mode": "NULLABLE"}, 129 | {"name": "extra", "type": "FLOAT", "mode": "NULLABLE"}, 130 | {"name": "mta_tax", "type": "FLOAT", "mode": "NULLABLE"}, 131 | {"name": "tip_amount", "type": "FLOAT", "mode": "NULLABLE"}, 132 | {"name": "tolls_amount", "type": "FLOAT", "mode": "NULLABLE"}, 133 | {"name": "improvement_surcharge", "type": "FLOAT", "mode": "NULLABLE"}, 134 | {"name": "total_amount", "type": "FLOAT", "mode": "NULLABLE"}, 135 | {"name": "congestion_surcharge", "type": "FLOAT", "mode": "NULLABLE"}, 136 | ], 137 | source_format="CSV", 138 | create_disposition="CREATE_IF_NEEDED", 139 | write_disposition="WRITE_TRUNCATE", 140 | allow_jagged_rows=True, 141 | ) 142 | 143 | """ 144 | #### Great Expectations suite 145 | Run the Great Expectations suite on the table. 146 | """ 147 | ge_bigquery_validation = GreatExpectationsOperator( 148 | task_id="ge_bigquery_validation", 149 | data_context_root_dir=ge_root_dir, 150 | conn_id="bigquery_default", 151 | expectation_suite_name="taxi.demo", 152 | data_asset_name=bq_table, 153 | fail_task_on_validation_failure=False, 154 | ) 155 | 156 | """ 157 | #### Delete test dataset and table 158 | Clean up the dataset and table created for the example. 159 | """ 160 | delete_dataset = BigQueryDeleteDatasetOperator( 161 | task_id="delete_dataset", 162 | project_id="{{ var.value.gcp_project_id }}", 163 | dataset_id=bq_dataset, 164 | delete_contents=True, 165 | ) 166 | 167 | chain( 168 | create_dataset, 169 | create_temp_table, 170 | upload_taxi_data, 171 | transfer_taxi_data, 172 | ge_bigquery_validation, 173 | delete_dataset, 174 | ) 175 | -------------------------------------------------------------------------------- /dags/great_expectations/great_expectations_pandas_df.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### Simple EL Pipeline with Data Quality Checks Using Pandas and Great Expectations 3 | 4 | A simple example of performing data quality checks on a Pandas dataframe using Great Expectations. 5 | 6 | What makes this a simple data quality case is: 7 | 1. Absolute ground truth: the local CSV file is considered perfect and immutable. 8 | 2. No transformations or business logic. 9 | 3. Exact values of data to quality check are known. 
10 | """ 11 | 12 | import os 13 | from datetime import datetime 14 | from pathlib import Path 15 | 16 | import pandas as pd 17 | from airflow import DAG 18 | from great_expectations_provider.operators.great_expectations import \ 19 | GreatExpectationsOperator 20 | 21 | base_path = Path(__file__).parents[2] 22 | data_file = os.path.join( 23 | base_path, 24 | "include", 25 | "sample_data/yellow_trip_data/yellow_tripdata_sample_2019-01.csv", 26 | ) 27 | ge_root_dir = os.path.join(base_path, "include", "great_expectations") 28 | 29 | 30 | with DAG( 31 | "great_expectations.pandas_df", 32 | start_date=datetime(2021, 1, 1), 33 | description="Example DAG showcasing loading and data quality checking with Pandas and Great Expectations.", 34 | doc_md=__doc__, 35 | schedule_interval=None, 36 | catchup=False, 37 | ) as dag: 38 | 39 | """ 40 | #### Great Expectations suite 41 | Run the Great Expectations suite on the table. 42 | """ 43 | ge_pandas_df_validation = GreatExpectationsOperator( 44 | task_id="ge_pandas_df_validation", 45 | data_context_root_dir=ge_root_dir, 46 | dataframe_to_validate=pd.read_csv(filepath_or_buffer=data_file, header=0), 47 | execution_engine="PandasExecutionEngine", 48 | expectation_suite_name="taxi.demo", 49 | data_asset_name="yellow_tripdata_sample_2019-01", 50 | ) 51 | -------------------------------------------------------------------------------- /dags/great_expectations/great_expectations_redshift.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### Simple EL Pipeline with Data Quality Checks Using Redshift and Great Expectations 3 | 4 | Use Great Expectations to check data quality in Redshift. 5 | 6 | Before running the DAG, set the following in an Airflow or Environment Variable: 7 | - key: aws_configs 8 | - value: { "s3_bucket": [bucket_name], "s3_key_prefix": [key_prefix], "redshift_table": [table_name]} 9 | Fully replacing [bucket_name], [key_prefix], and [table_name]. 10 | 11 | What makes this a simple data quality case is: 12 | 1. Absolute ground truth: the local CSV file is considered perfect and immutable. 13 | 2. No transformations or business logic. 14 | 3. Exact values of data to quality check are known. 
15 | """ 16 | 17 | import os 18 | from datetime import datetime 19 | from pathlib import Path 20 | 21 | from airflow import DAG 22 | from airflow.models.baseoperator import chain 23 | from airflow.providers.amazon.aws.operators.redshift_sql import \ 24 | RedshiftSQLOperator 25 | from airflow.providers.amazon.aws.transfers.local_to_s3 import \ 26 | LocalFilesystemToS3Operator 27 | from airflow.providers.amazon.aws.transfers.s3_to_redshift import \ 28 | S3ToRedshiftOperator 29 | from great_expectations_provider.operators.great_expectations import \ 30 | GreatExpectationsOperator 31 | 32 | table = "yellow_tripdata" 33 | base_path = Path(__file__).parents[2] 34 | data_file = os.path.join( 35 | base_path, 36 | "include", 37 | "sample_data/yellow_trip_data/yellow_tripdata_sample_2019-01.csv", 38 | ) 39 | ge_root_dir = os.path.join(base_path, "include", "great_expectations") 40 | 41 | with DAG( 42 | "great_expectations.redshift", 43 | start_date=datetime(2021, 1, 1), 44 | description="Example DAG showcasing loading and data quality checking with Redshift and Great Expectations.", 45 | doc_md=__doc__, 46 | schedule_interval=None, 47 | template_searchpath=f"{base_path}/include/sql/great_expectations_examples/", 48 | catchup=False, 49 | ) as dag: 50 | 51 | upload_to_s3 = LocalFilesystemToS3Operator( 52 | task_id="upload_to_s3", 53 | filename=data_file, 54 | dest_key="{{ var.json.aws_configs.s3_key_prefix }}/yellow_tripdata_sample_2019-01.csv", 55 | dest_bucket="{{ var.json.aws_configs.s3_bucket }}", 56 | aws_conn_id="aws_default", 57 | replace=True, 58 | ) 59 | 60 | """ 61 | #### Create Redshift Table 62 | For demo purposes, create a Redshift table to store the yellow taxi trip data. 63 | The database is not automatically destroyed at the end of the example; ensure 64 | this is done manually to avoid unnecessary costs. Additionally, setup may 65 | need to be done in Airflow connections to allow access to Redshift. 66 | """ 67 | create_redshift_table = RedshiftSQLOperator( 68 | task_id="create_redshift_table", 69 | sql="{% include 'create_yellow_tripdata_redshift_table.sql' %}", 70 | parameters={"table_name": "yellow_tripdata"}, 71 | redshift_conn_id="redshift_default", 72 | ) 73 | 74 | """ 75 | #### Second load task 76 | Loads the S3 data from the previous load to a Redshift table (specified 77 | in the Airflow Variables backend). 78 | """ 79 | load_to_redshift = S3ToRedshiftOperator( 80 | task_id="load_to_redshift", 81 | s3_bucket="{{ var.json.aws_configs.s3_bucket }}", 82 | s3_key="{{ var.json.aws_configs.s3_key_prefix }}/yellow_tripdata_sample_2019-01.csv", 83 | schema="PUBLIC", 84 | table=table, 85 | copy_options=["csv", "ignoreheader 1"], 86 | ) 87 | 88 | """ 89 | #### Great Expectations suite 90 | Run the Great Expectations suite on the table. 91 | """ 92 | ge_redshift_validation = GreatExpectationsOperator( 93 | task_id="ge_redshift_validation", 94 | data_context_root_dir=ge_root_dir, 95 | conn_id="redshift_default", 96 | expectation_suite_name="taxi.demo", 97 | data_asset_name=table, 98 | fail_task_on_validation_failure=False, 99 | ) 100 | 101 | """ 102 | #### Drop Redshift table 103 | Drops the Redshift table if it exists already. This is to make sure that the 104 | data in the success and failure cases do not interfere with each other during 105 | the data quality check.
106 | """ 107 | drop_redshift_table = RedshiftSQLOperator( 108 | task_id="drop_table", 109 | sql="delete_yellow_tripdata_table.sql", 110 | redshift_conn_id="redshift_default", 111 | parameters={"table_name": table}, 112 | ) 113 | 114 | chain( 115 | upload_to_s3, 116 | create_redshift_table, 117 | load_to_redshift, 118 | ge_redshift_validation, 119 | drop_redshift_table, 120 | ) 121 | -------------------------------------------------------------------------------- /dags/great_expectations/great_expectations_snowflake.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### Simple EL Pipeline with Data Quality Checks Using Snowflake and Great Expectations 3 | 4 | A simple example of performing data quality checks in Snowflake using Great Expectations. 5 | 6 | Ensure a Snowflake Warehouse, Database, Schema, Role, and S3 Key and Secret 7 | exist for the Snowflake connection, named `snowflake_default`. Access to S3 8 | is needed for this example. An 'aws_configs' variable is needed in Variables, 9 | see the Redshift Examples in the README section for more information. 10 | 11 | What makes this a simple data quality case is: 12 | 1. Absolute ground truth: the local CSV file is considered perfect and immutable. 13 | 2. No transformations or business logic. 14 | 3. Exact values of data to quality check are known. 15 | """ 16 | 17 | import os 18 | from datetime import datetime 19 | from pathlib import Path 20 | 21 | import pandas as pd 22 | from airflow import DAG 23 | from airflow.models.baseoperator import chain 24 | from airflow.providers.snowflake.operators.snowflake import SnowflakeOperator 25 | from great_expectations_provider.operators.great_expectations import \ 26 | GreatExpectationsOperator 27 | 28 | # This table variable is a placeholder, in a live environment, it is better 29 | # to pull the table info from a Variable in a template 30 | table = "YELLOW_TRIPDATA" 31 | base_path = Path(__file__).parents[2] 32 | data_file = os.path.join( 33 | base_path, 34 | "include", 35 | "sample_data/yellow_trip_data/yellow_tripdata_sample_2019-01.csv", 36 | ) 37 | ge_root_dir = os.path.join(base_path, "include", "great_expectations") 38 | 39 | SNOWFLAKE_CONN_ID = "snowflake_default" 40 | 41 | with DAG( 42 | "great_expectations.snowflake", 43 | start_date=datetime(2021, 1, 1), 44 | description="Example DAG showcasing loading and data quality checking with Snowflake and Great Expectations.", 45 | doc_md=__doc__, 46 | schedule_interval=None, 47 | template_searchpath="/usr/local/airflow/include/sql/snowflake_examples/", 48 | catchup=False, 49 | ) as dag: 50 | 51 | """ 52 | #### Snowflake table creation 53 | Create the table to store sample forest fire data. 54 | """ 55 | create_table = SnowflakeOperator( 56 | task_id="create_table", 57 | sql="{% include 'create_snowflake_yellow_tripdata_table.sql' %}", 58 | params={"table_name": table}, 59 | ) 60 | 61 | """ 62 | #### Insert data 63 | Insert data into the Snowflake table using an existing SQL query (stored in 64 | the include/sql/snowflake_examples/ directory). 65 | """ 66 | load_data = SnowflakeOperator( 67 | task_id="insert_query", 68 | sql="{% include 'load_yellow_tripdata.sql' %}", 69 | params={"table_name": table}, 70 | ) 71 | 72 | """ 73 | #### Delete table 74 | Clean up the table created for the example. 
75 | """ 76 | delete_table = SnowflakeOperator( 77 | task_id="delete_table", 78 | sql="{% include 'delete_snowflake_table.sql' %}", 79 | params={"table_name": table}, 80 | ) 81 | 82 | """ 83 | #### Great Expectations suite 84 | Run the Great Expectations suite on the table. 85 | """ 86 | ge_snowflake_validation = GreatExpectationsOperator( 87 | task_id="ge_snowflake_validation", 88 | data_context_root_dir=ge_root_dir, 89 | conn_id=SNOWFLAKE_CONN_ID, 90 | expectation_suite_name="taxi.demo", 91 | data_asset_name=table, 92 | fail_task_on_validation_failure=False, 93 | ) 94 | 95 | ge_snowflake_query_validation = GreatExpectationsOperator( 96 | task_id="ge_snowflake_query_validation", 97 | data_context_root_dir=ge_root_dir, 98 | conn_id=SNOWFLAKE_CONN_ID, 99 | query_to_validate="SELECT *", 100 | expectation_suite_name="taxi.demo", 101 | data_asset_name=table, 102 | fail_task_on_validation_failure=False, 103 | ) 104 | 105 | chain( 106 | create_table, 107 | load_data, 108 | [ 109 | ge_snowflake_validation, 110 | ge_snowflake_query_validation, 111 | ], 112 | delete_table, 113 | ) 114 | -------------------------------------------------------------------------------- /dags/great_expectations/great_expectations_snowflake_write_audit_publish.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### Write-Audit-Publish Pattern EL Pipeline with Data Quality Checks Using Snowflake and Great Expectations 3 | 4 | Use the Write-Audit-Publish pattern with Great Expectaitons and Snowflake. 5 | 6 | Ensure a Snowflake Warehouse, Database, Schema, Role, and S3 Key and Secret 7 | exist for the Snowflake connection, named `snowflake_default`. Access to S3 8 | is needed for this example. An 'aws_configs' variable is needed in Variables, 9 | see the Redshift Examples in the README section for more information. 10 | 11 | The write-audit-publish pattern writes data to a staging table, audits the 12 | data quality through quality checks, then publishes correct data to a 13 | production table. In this example incorrect data is discarded, and the DAG 14 | is failed on data quality check failure. 15 | 16 | What makes this a simple data quality case is: 17 | 1. Absolute ground truth: the local CSV file is considered perfect and immutable. 18 | 2. No transformations or business logic. 19 | 3. Exact values of data to quality check are known. 20 | """ 21 | 22 | import json 23 | import os 24 | from datetime import datetime 25 | from pathlib import Path 26 | 27 | from airflow import DAG 28 | from airflow.models.baseoperator import chain 29 | from airflow.providers.snowflake.operators.snowflake import SnowflakeOperator 30 | from great_expectations_provider.operators.great_expectations import \ 31 | GreatExpectationsOperator 32 | 33 | from include.libs.schema_reg.base_schema_transforms import \ 34 | snowflake_load_column_string 35 | 36 | # These variables are a placeholder. In a live environment, it is better 37 | # to pull the info from a Variable. 
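# As a sketch, something like the following could be used instead (the Variable key
# "tripdata_table_name" is hypothetical, not defined by this repo):
#   from airflow.models import Variable
#   table = Variable.get("tripdata_table_name", default_var="YELLOW_TRIPDATA")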
38 | table = "YELLOW_TRIPDATA" 39 | snowflake_conn = "snowflake_default" 40 | base_path = Path(__file__).parents[2] 41 | table_schema_path = ( 42 | f"{base_path}/include/sql/great_expectations_examples/table_schemas/" 43 | ) 44 | ge_root_dir = os.path.join(base_path, "include", "great_expectations") 45 | 46 | with DAG( 47 | "great_expectations.snowflake_write_audit_publish", 48 | start_date=datetime(2022, 1, 1), 49 | description="Example DAG showcasing a write-audit-publish data quality pattern with Snowflake and Great Expectations.", 50 | doc_md=__doc__, 51 | schedule_interval=None, 52 | template_searchpath=f"{base_path}/include/sql/snowflake_examples/", 53 | catchup=False, 54 | ) as dag: 55 | 56 | """ 57 | #### Snowflake table creation 58 | Creates the tables to store sample data 59 | """ 60 | create_snowflake_audit_table = SnowflakeOperator( 61 | task_id="create_snowflake_audit_table", 62 | sql="{% include 'create_snowflake_yellow_tripdata_table.sql' %}", 63 | params={"table_name": f"{table}_AUDIT"}, 64 | ) 65 | 66 | create_snowflake_table = SnowflakeOperator( 67 | task_id="create_snowflake_table", 68 | sql="{% include 'create_snowflake_yellow_tripdata_table.sql' %}", 69 | params={"table_name": table}, 70 | ) 71 | 72 | """ 73 | #### Insert data 74 | Insert data into the Snowflake table using an existing SQL query (stored in 75 | the include/sql/snowflake_examples/ directory). 76 | """ 77 | load_data = SnowflakeOperator( 78 | task_id="load_data", 79 | sql="{% include 'load_yellow_tripdata.sql' %}", 80 | params={"table_name": f"{table}_AUDIT"}, 81 | ) 82 | 83 | """ 84 | #### Delete table 85 | Cleans up the tables created for the example 86 | """ 87 | delete_snowflake_audit_table = SnowflakeOperator( 88 | task_id="delete_snowflake_audit_table", 89 | sql="{% include 'delete_snowflake_table.sql' %}", 90 | params={"table_name": f"{table}_AUDIT"}, 91 | trigger_rule="all_success", 92 | ) 93 | 94 | delete_snowflake_table = SnowflakeOperator( 95 | task_id="delete_snowflake_table", 96 | sql="{% include 'delete_snowflake_table.sql' %}", 97 | params={"table_name": table}, 98 | trigger_rule="all_success", 99 | ) 100 | 101 | """ 102 | #### Great Expectations suite 103 | Runs the Great Expectations suite on the table 104 | """ 105 | ge_snowflake_validation = GreatExpectationsOperator( 106 | task_id="ge_snowflake_validation", 107 | data_context_root_dir=ge_root_dir, 108 | conn_id=snowflake_conn, 109 | expectation_suite_name="taxi.demo", 110 | schema="SCHEMA", # set this to your schema 111 | data_asset_name=f"{table}_AUDIT", 112 | #fail_task_on_validation_failure=False, 113 | ) 114 | 115 | with open( 116 | f"{table_schema_path}/tripdata_schema.json", 117 | "r", 118 | ) as f: 119 | table_schema = json.load(f).get("yellow_tripdata") 120 | table_props = table_schema.get("properties") 121 | table_dimensions = table_schema.get("dimensions") 122 | table_metrics = table_schema.get("metrics") 123 | 124 | col_string = snowflake_load_column_string(table_props) 125 | 126 | """ 127 | #### Snowflake audit to production task 128 | Loads the data from the audit table to the production table 129 | """ 130 | copy_snowflake_audit_to_production_table = SnowflakeOperator( 131 | task_id="copy_snowflake_audit_to_production_table", 132 | sql="{% include 'copy_yellow_tripdata_snowflake_staging.sql' %}", 133 | params={ 134 | "table_name": table, 135 | "audit_table_name": f"{table}_AUDIT", 136 | "table_schema": table_props, 137 | "col_string": col_string, 138 | }, 139 | ) 140 | 141 | chain( 142 | [create_snowflake_table, 
create_snowflake_audit_table], 143 | load_data, 144 | ge_snowflake_validation, 145 | copy_snowflake_audit_to_production_table, 146 | [delete_snowflake_table, delete_snowflake_audit_table], 147 | ) 148 | -------------------------------------------------------------------------------- /dags/redshift_examples/simple_redshift_1.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### Simple EL Pipeline with Data Integrity Check 3 | 4 | A simple DAG showing a minimal EL data pipeline with a data 5 | integrity check. using MD5 hashes. 6 | 7 | A single file is uploaded to S3, then its ETag is verified 8 | against the MD5 hash of the local file. The two should match, which will 9 | allow the DAG to flow along the "happy path". To see the "sad path", change 10 | `CSV_FILE_PATH` to `CSV_CORRUPT_FILE_PATH` in the `validate_etag` task. 11 | 12 | Before running the DAG, set the following in an Airflow or Environment Variable: 13 | - key: aws_configs 14 | - value: { "s3_bucket": [bucket_name], "s3_key_prefix": [key_prefix]} 15 | Fully replacing [bucket_name] and [key_prefix]. 16 | 17 | What makes this a simple data quality case is: 18 | 1. Absolute ground truth: the local CSV file is considered perfect and immutable. 19 | 2. Single-step data pipeline: no business logic to complicate things. 20 | 3. Single metric to validate. 21 | 22 | This demo works well in the case of validating data that is read from S3, such 23 | as other data pipelines that will read from S3, or Athena. It would not be 24 | helpful for data that is read from Redshift, as there is another load step 25 | that should be validated separately. 26 | """ 27 | 28 | import hashlib 29 | 30 | from airflow import DAG, AirflowException 31 | from airflow.decorators import task 32 | from airflow.models import Variable 33 | from airflow.operators.dummy_operator import DummyOperator 34 | from airflow.providers.amazon.aws.hooks.s3 import S3Hook 35 | from airflow.providers.amazon.aws.transfers.local_to_s3 import \ 36 | LocalFilesystemToS3Operator 37 | from airflow.utils.dates import datetime 38 | 39 | # The file(s) to upload shouldn't be hardcoded in a production setting, this is just for demo purposes. 40 | CSV_FILE_NAME = "forestfires.csv" 41 | CSV_FILE_PATH = f"include/sample_data/forestfire_data/{CSV_FILE_NAME}" 42 | CSV_CORRUPT_FILE_NAME = "forestfires_corrupt.csv" 43 | CSV_CORRUPT_FILE_PATH = f"include/sample_data/forestfire_data/{CSV_CORRUPT_FILE_NAME}" 44 | 45 | with DAG( 46 | "simple_redshift_1", 47 | start_date=datetime(2021, 7, 7), 48 | description="A sample Airflow DAG to load data from csv files to S3, then check that all data was uploaded properly.", 49 | doc_md=__doc__, 50 | schedule_interval=None, 51 | catchup=False, 52 | ) as dag: 53 | 54 | upload_file = LocalFilesystemToS3Operator( 55 | task_id="upload_to_s3", 56 | filename=CSV_FILE_PATH, 57 | dest_key="{{ var.json.aws_configs.s3_key_prefix }}/" + CSV_FILE_PATH, 58 | dest_bucket="{{ var.json.aws_configs.s3_bucket }}", 59 | aws_conn_id="aws_default", 60 | replace=True, 61 | ) 62 | 63 | @task 64 | def validate_etag(): 65 | """ 66 | #### Validation task 67 | Check the destination ETag against the local MD5 hash to ensure the file 68 | was uploaded without errors. 
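Note: this comparison assumes a single-part upload; for multipart uploads the S3 ETag
is not a plain MD5 of the file, so the check would need to be adapted.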
69 | """ 70 | s3 = S3Hook() 71 | aws_configs = Variable.get("aws_configs", deserialize_json=True) 72 | obj = s3.get_key( 73 | key=f"{aws_configs.get('s3_key_prefix')}/{CSV_FILE_PATH}", 74 | bucket_name=aws_configs.get("s3_bucket"), 75 | ) 76 | obj_etag = obj.e_tag.strip('"') 77 | # Change `CSV_FILE_PATH` to `CSV_CORRUPT_FILE_PATH` for the "sad path". 78 | file_hash = hashlib.md5(open(CSV_FILE_PATH).read().encode("utf-8")).hexdigest() 79 | if obj_etag != file_hash: 80 | raise AirflowException( 81 | f"Upload Error: Object ETag in S3 did not match hash of local file." 82 | ) 83 | 84 | validate_file = validate_etag() 85 | 86 | begin = DummyOperator(task_id="begin") 87 | end = DummyOperator(task_id="end") 88 | 89 | begin >> upload_file >> validate_file >> end 90 | -------------------------------------------------------------------------------- /dags/redshift_examples/simple_redshift_2.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### Extract and Load Pipeline with Data Integrity Check 3 | 4 | A single file is uploaded to S3, then its ETag is verified 5 | against the MD5 hash of the local file. The two should match, which will 6 | allow the DAG to flow along the "happy path". 7 | 8 | To see the "sad path", change`CSV_FILE_PATH` to `CSV_CORRUPT_FILE_PATH` in the `validate_etag` task. If the 9 | "happy path" is continued, a second data load from S3 to Redshift is triggered, 10 | which is followed by another data integrity check. A similar "happy/sad path" 11 | branch ends the DAG. 12 | 13 | Before running the DAG, set the following in an Airflow or Environment Variable: 14 | - key: aws_configs 15 | - value: { "s3_bucket": [bucket_name], "s3_key_prefix": [key_prefix], "redshift_table": [table_name]} 16 | Fully replacing [bucket_name], [key_prefix], and [table_name]. 17 | 18 | What makes this a simple data quality case is: 19 | 1. Absolute ground truth: the local CSV file is considered perfect and immutable. 20 | 2. No transformations or business logic. 21 | 3. Single metric to validate (whether the uploads were successful). 22 | 23 | This demo solves the issue the Simple EL Pipeline with Data Integrity Check DAG left open: validating an 24 | upload to Redshift. However, it only validates that the data matches the 25 | source file; it does not guarantee that the source file's data is actually 26 | valid with respect to expectations about that data. 27 | """ 28 | 29 | import hashlib 30 | 31 | from airflow import DAG, AirflowException 32 | from airflow.decorators import task 33 | from airflow.models import Variable 34 | from airflow.models.baseoperator import chain 35 | from airflow.operators.dummy_operator import DummyOperator 36 | from airflow.operators.sql import SQLCheckOperator 37 | from airflow.providers.amazon.aws.hooks.s3 import S3Hook 38 | from airflow.providers.amazon.aws.transfers.local_to_s3 import \ 39 | LocalFilesystemToS3Operator 40 | from airflow.providers.amazon.aws.transfers.s3_to_redshift import \ 41 | S3ToRedshiftOperator 42 | from airflow.providers.postgres.operators.postgres import PostgresOperator 43 | from airflow.utils.dates import datetime 44 | 45 | # The file(s) to upload shouldn't be hardcoded in a production setting, this is just for demo purposes. 
46 | CSV_FILE_NAME = "forestfires.csv" 47 | CSV_FILE_PATH = f"include/sample_data/forestfire_data/{CSV_FILE_NAME}" 48 | CSV_CORRUPT_FILE_NAME = "forestfires_corrupt.csv" 49 | CSV_CORRUPT_FILE_PATH = f"include/sample_data/forestfire_data/{CSV_CORRUPT_FILE_NAME}" 50 | 51 | with DAG( 52 | "simple_redshift_2", 53 | start_date=datetime(2021, 7, 7), 54 | description="A sample Airflow DAG to load data from csv files to S3 and then Redshift, with data integrity checks.", 55 | doc_md=__doc__, 56 | schedule_interval=None, 57 | template_searchpath="/usr/local/airflow/include/sql/redshift_examples/", 58 | catchup=False, 59 | ) as dag: 60 | 61 | upload_file = LocalFilesystemToS3Operator( 62 | task_id="upload_to_s3", 63 | filename=CSV_FILE_PATH, 64 | dest_key="{{ var.json.aws_configs.s3_key_prefix }}/" + CSV_FILE_PATH, 65 | dest_bucket="{{ var.json.aws_configs.s3_bucket }}", 66 | aws_conn_id="aws_default", 67 | replace=True, 68 | ) 69 | 70 | @task 71 | def validate_etag(): 72 | """ 73 | #### Validation task 74 | Check the destination ETag against the local MD5 hash to ensure the file 75 | was uploaded without errors. 76 | """ 77 | s3 = S3Hook() 78 | aws_configs = Variable.get("aws_configs", deserialize_json=True) 79 | obj = s3.get_key( 80 | key=f"{aws_configs.get('s3_key_prefix')}/{CSV_FILE_PATH}", 81 | bucket_name=aws_configs.get("s3_bucket"), 82 | ) 83 | obj_etag = obj.e_tag.strip('"') 84 | # Change `CSV_FILE_PATH` to `CSV_CORRUPT_FILE_PATH` for the "sad path". 85 | file_hash = hashlib.md5(open(CSV_FILE_PATH).read().encode("utf-8")).hexdigest() 86 | if obj_etag != file_hash: 87 | raise AirflowException( 88 | f"Upload Error: Object ETag in S3 did not match hash of local file." 89 | ) 90 | 91 | validate_file = validate_etag() 92 | 93 | """ 94 | #### Create Redshift Table 95 | For demo purposes, create a Redshift table to store the forest fire data to. 96 | The database is not automatically destroyed at the end of the example; ensure 97 | this is done manually to avoid unnecessary costs. Additionally, set-up may 98 | need to be done in Airflow connections to allow access to Redshift. 99 | """ 100 | create_redshift_table = PostgresOperator( 101 | task_id="create_table", 102 | sql="create_redshift_forestfire_table.sql", 103 | postgres_conn_id="redshift_default", 104 | ) 105 | 106 | """ 107 | #### Second load task 108 | Loads the S3 data from the previous load to a Redshift table (specified 109 | in the Airflow Variables backend). 110 | """ 111 | load_to_redshift = S3ToRedshiftOperator( 112 | task_id="load_to_redshift", 113 | s3_bucket="{{ var.json.aws_configs.s3_bucket }}", 114 | s3_key="{{ var.json.aws_configs.s3_key_prefix }}" + f"/{CSV_FILE_PATH}", 115 | schema="PUBLIC", 116 | table="{{ var.json.aws_configs.redshift_table }}", 117 | copy_options=["csv"], 118 | ) 119 | 120 | """ 121 | #### Redshift row validation task 122 | Ensure that data was copied to Redshift from S3 correctly. A SQLCheckOperator is 123 | used here to check for any files in the stl_load_errors table. 124 | """ 125 | validate_redshift = SQLCheckOperator( 126 | task_id="validate_redshift", 127 | conn_id="redshift_default", 128 | sql="validate_redshift_forestfire_load.sql", 129 | params={"filename": CSV_FILE_NAME}, 130 | ) 131 | 132 | """ 133 | #### Drop Redshift table 134 | Drops the Redshift table if it exists already. This is to make sure that the 135 | data in the success and failure cases do not interfere with each other during 136 | the data quality check. 
137 | """ 138 | drop_redshift_table = PostgresOperator( 139 | task_id="drop_table", 140 | sql="drop_redshift_forestfire_table.sql", 141 | postgres_conn_id="redshift_default", 142 | ) 143 | 144 | begin = DummyOperator(task_id="begin") 145 | end = DummyOperator(task_id="end") 146 | 147 | chain( 148 | begin, 149 | upload_file, 150 | validate_file, 151 | create_redshift_table, 152 | load_to_redshift, 153 | validate_redshift, 154 | drop_redshift_table, 155 | end, 156 | ) 157 | -------------------------------------------------------------------------------- /dags/redshift_examples/simple_redshift_3.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### Data Integrity Checks with Multiple Files 3 | 4 | 5 | This is the third in a series of DAGs showing an EL pipeline with data integrity 6 | and data quality checking. A single file is uploaded to S3, then its ETag is 7 | verified against the MD5 hash of the local file. The two should match, which 8 | will allow the DAG to continue to the next task. 9 | 10 | A second data load from S3 to Redshift is triggered, which is followed by another data integrity check. 11 | If the check fails, an Airflow Exception is raised. Otherwise, a final data 12 | quality check is performed on the Redshift table per row for a subset of rows, 13 | immitating a row-based data quality spot check where the specific ground truth 14 | is known. 15 | 16 | Before running the DAG, set the following in an Airflow or Environment Variable: 17 | - key: aws_configs 18 | - value: { "s3_bucket": [bucket_name], "s3_key_prefix": [key_prefix], "redshift_table": [table_name]} 19 | Fully replacing [bucket_name], [key_prefix], and [table_name]. 20 | 21 | What makes this a simple data quality case is: 22 | 1. Absolute ground truth: the local CSV file is considered perfect and immutable. 23 | 2. No transformations or business logic. 24 | 3. Exact values of data to quality check are known. 25 | 26 | This demo solves the issue Extract and Load Pipeline with Data Integrity Check left open: quality checking the data 27 | in the uploaded file. This DAG is a good starting point for a data integrity 28 | and data quality check. 29 | """ 30 | 31 | import hashlib 32 | import json 33 | 34 | from airflow import DAG, AirflowException 35 | from airflow.decorators import task 36 | from airflow.models import Variable 37 | from airflow.models.baseoperator import chain 38 | from airflow.operators.dummy_operator import DummyOperator 39 | from airflow.operators.sql import SQLCheckOperator 40 | from airflow.providers.amazon.aws.hooks.s3 import S3Hook 41 | from airflow.providers.amazon.aws.transfers.local_to_s3 import \ 42 | LocalFilesystemToS3Operator 43 | from airflow.providers.amazon.aws.transfers.s3_to_redshift import \ 44 | S3ToRedshiftOperator 45 | from airflow.providers.postgres.operators.postgres import PostgresOperator 46 | from airflow.utils.dates import datetime 47 | from airflow.utils.task_group import TaskGroup 48 | 49 | # The file(s) to upload shouldn't be hardcoded in a production setting, this is just for demo purposes. 50 | CSV_FILE_NAME = "forestfires.csv" 51 | CSV_FILE_PATH = f"include/sample_data/forestfire_data/{CSV_FILE_NAME}" 52 | # CSV_CORRUPT_FILE_NAME = "forestfires_corrupt.csv" 53 | # CSV_CORRUPT_FILE_PATH = f"include/sample_data/forestfire_data/{CSV_CORRUPT_FILE_NAME}" 54 | 55 | # Uncomment the below two constants to see the "sad path" (and comment out the paths above). 
56 | # CSV_FILE_NAME = "forestfires_invalid.csv" 57 | # CSV_FILE_PATH = f"include/sample_data/forestfire_data/{CSV_FILE_NAME}" 58 | 59 | with DAG( 60 | "simple_redshift_3", 61 | start_date=datetime(2021, 7, 7), 62 | description="A sample Airflow DAG to load data from csv files to S3 and then Redshift, with data integrity and quality checks.", 63 | doc_md=__doc__, 64 | schedule_interval=None, 65 | template_searchpath="/usr/local/airflow/include/sql/redshift_examples/", 66 | catchup=False, 67 | ) as dag: 68 | 69 | upload_file = LocalFilesystemToS3Operator( 70 | task_id="upload_to_s3", 71 | filename=CSV_FILE_PATH, 72 | dest_key="{{ var.json.aws_configs.s3_key_prefix }}/" + CSV_FILE_PATH, 73 | dest_bucket="{{ var.json.aws_configs.s3_bucket }}", 74 | aws_conn_id="aws_default", 75 | replace=True, 76 | ) 77 | 78 | @task 79 | def validate_etag(): 80 | """ 81 | #### Validation task 82 | Check the destination ETag against the local MD5 hash to ensure the file 83 | was uploaded without errors. 84 | """ 85 | s3 = S3Hook() 86 | aws_configs = Variable.get("aws_configs", deserialize_json=True) 87 | obj = s3.get_key( 88 | key=f"{aws_configs.get('s3_key_prefix')}/{CSV_FILE_PATH}", 89 | bucket_name=aws_configs.get("s3_bucket"), 90 | ) 91 | obj_etag = obj.e_tag.strip('"') 92 | # Change `CSV_FILE_PATH` to `CSV_CORRUPT_FILE_PATH` for the "sad path". 93 | file_hash = hashlib.md5(open(CSV_FILE_PATH).read().encode("utf-8")).hexdigest() 94 | if obj_etag != file_hash: 95 | raise AirflowException( 96 | f"Upload Error: Object ETag in S3 did not match hash of local file." 97 | ) 98 | 99 | validate_file = validate_etag() 100 | 101 | """ 102 | #### Create Redshift Table 103 | For demo purposes, create a Redshift table to store the forest fire data to. 104 | The database is not automatically destroyed at the end of the example; ensure 105 | this is done manually to avoid unnecessary costs. Additionally, set-up may 106 | need to be done in Airflow connections to allow access to Redshift. 107 | """ 108 | create_redshift_table = PostgresOperator( 109 | task_id="create_table", 110 | sql="create_redshift_forestfire_table.sql", 111 | postgres_conn_id="redshift_default", 112 | ) 113 | 114 | """ 115 | #### Second load task 116 | Loads the S3 data from the previous load to a Redshift table (specified 117 | in the Airflow Variables backend). 118 | """ 119 | load_to_redshift = S3ToRedshiftOperator( 120 | task_id="load_to_redshift", 121 | s3_bucket="{{ var.json.aws_configs.s3_bucket }}", 122 | s3_key="{{ var.json.aws_configs.s3_key_prefix }}" + f"/{CSV_FILE_PATH}", 123 | schema="PUBLIC", 124 | table="{{ var.json.aws_configs.redshift_table }}", 125 | copy_options=["csv"], 126 | ) 127 | 128 | """ 129 | #### Redshift row validation task 130 | Ensure that data was copied to Redshift from S3 correctly. A SQLCheckOperator is 131 | used here to check for any files in the stl_load_errors table. 132 | """ 133 | validate_redshift = SQLCheckOperator( 134 | task_id="validate_redshift", 135 | conn_id="redshift_default", 136 | sql="validate_redshift_forestfire_load.sql", 137 | params={"filename": CSV_FILE_NAME}, 138 | ) 139 | 140 | """ 141 | #### Row-level data quality check 142 | Run a data quality check on a few rows, ensuring that the data in Redshift 143 | matches the ground truth in the correspoding JSON file. 
144 | """ 145 | with open("include/validation/forestfire_validation.json") as ffv: 146 | with TaskGroup(group_id="row_quality_checks") as quality_check_group: 147 | ffv_json = json.load(ffv) 148 | for id, values in ffv_json.items(): 149 | values["id"] = id 150 | SQLCheckOperator( 151 | task_id=f"forestfire_row_quality_check_{id}", 152 | conn_id="redshift_default", 153 | sql="row_quality_redshift_forestfire_check.sql", 154 | params=values, 155 | ) 156 | 157 | """ 158 | #### Drop Redshift table 159 | Drops the Redshift table if it exists already. This is to make sure that the 160 | data in the success and failure cases do not interfere with each other during 161 | the data quality check. 162 | """ 163 | drop_redshift_table = PostgresOperator( 164 | task_id="drop_table", 165 | sql="drop_redshift_forestfire_table.sql", 166 | postgres_conn_id="redshift_default", 167 | ) 168 | 169 | begin = DummyOperator(task_id="begin") 170 | end = DummyOperator(task_id="end") 171 | 172 | chain( 173 | begin, 174 | upload_file, 175 | validate_file, 176 | create_redshift_table, 177 | load_to_redshift, 178 | validate_redshift, 179 | quality_check_group, 180 | drop_redshift_table, 181 | end, 182 | ) 183 | -------------------------------------------------------------------------------- /dags/snowflake_examples/complex_snowflake_transform.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### Snowflake ELT Pipeline with Multiple Datasets and Data Qality Checks 3 | 4 | Run data quality checks, in SQL, on multiple Snowflake tables. 5 | 6 | This DAG uses the forestfires public dataset on ForestFires to run data quality checks on multiple tables in Snowflake. 7 | In the event of a failure, a Slack notification will be fired off. In this example, data quality checks are 8 | run as taskgroups after the data is uploaded. 9 | 10 | Note that this DAG deletes all data it uploaded after the DQ checks run. 11 | 12 | Ensure a Snowflake Warehouse, Database, Schema, and Role exist for the Snowflake 13 | connection provided to the operator under the connection ID `snowflake_default`. 14 | """ 15 | 16 | from airflow import DAG 17 | from airflow.models.baseoperator import chain 18 | from airflow.operators.empty import EmptyOperator 19 | from airflow.providers.common.sql.operators.sql import (SQLColumnCheckOperator, 20 | SQLTableCheckOperator) 21 | from airflow.providers.slack.operators.slack_webhook import \ 22 | SlackWebhookOperator 23 | from airflow.providers.snowflake.operators.snowflake import SnowflakeOperator 24 | from airflow.utils.dates import datetime 25 | from airflow.utils.task_group import TaskGroup 26 | 27 | SNOWFLAKE_FORESTFIRE_TABLE = "forestfires" 28 | SNOWFLAKE_COST_TABLE = "costs" 29 | SNOWFLAKE_FORESTFIRE_COST_TABLE = "forestfire_costs" 30 | 31 | SNOWFLAKE_CONN_ID = "snowflake_default" 32 | 33 | ROW_COUNT_CHECK = "COUNT(*) = 9" 34 | 35 | 36 | def slack_failure_notification(context): 37 | task_id = context.get("task_instance").task_id 38 | dag_id = context.get("task_instance").dag_id 39 | exec_date = context.get("execution_date") 40 | log_url = context.get("task_instance").log_url 41 | slack_msg = f""" 42 | :red_circle: Task Failed. 
43 | *Task*: {task_id} 44 | *Dag*: {dag_id} 45 | *Execution Time*: {exec_date} 46 | *Log Url*: {log_url} 47 | """ 48 | failed_alert = SlackWebhookOperator( 49 | task_id="slack_notification", 50 | http_conn_id="slack_webhook", 51 | message=slack_msg, 52 | channel="data_engineering", 53 | username="failbot", 54 | ) 55 | return failed_alert.execute(context=context) 56 | 57 | 58 | with DAG( 59 | "complex_snowflake_transform", 60 | description="Example DAG showcasing loading, transforming, and data quality checking with multiple datasets in Snowflake.", 61 | doc_md=__doc__, 62 | start_date=datetime(2021, 1, 1), 63 | schedule_interval=None, 64 | template_searchpath="/usr/local/airflow/include/sql/snowflake_examples/", 65 | catchup=False, 66 | ) as dag: 67 | """ 68 | #### Snowflake table creation 69 | Create the tables to store sample data. 70 | """ 71 | create_forestfire_table = SnowflakeOperator( 72 | task_id="create_forestfire_table", 73 | sql="create_forestfire_table.sql", 74 | params={"table_name": SNOWFLAKE_FORESTFIRE_TABLE}, 75 | ) 76 | 77 | create_cost_table = SnowflakeOperator( 78 | task_id="create_cost_table", 79 | sql="create_cost_table.sql", 80 | params={"table_name": SNOWFLAKE_COST_TABLE}, 81 | ) 82 | 83 | create_forestfire_cost_table = SnowflakeOperator( 84 | task_id="create_forestfire_cost_table", 85 | sql="create_forestfire_cost_table.sql", 86 | params={"table_name": SNOWFLAKE_FORESTFIRE_COST_TABLE}, 87 | ) 88 | 89 | """ 90 | #### Insert data 91 | Insert data into the Snowflake tables using existing SQL queries 92 | stored in the include/sql/snowflake_examples/ directory. 93 | """ 94 | load_forestfire_data = SnowflakeOperator( 95 | task_id="load_forestfire_data", 96 | sql="load_snowflake_forestfire_data.sql", 97 | params={"table_name": SNOWFLAKE_FORESTFIRE_TABLE}, 98 | ) 99 | 100 | load_cost_data = SnowflakeOperator( 101 | task_id="load_cost_data", 102 | sql="load_cost_data.sql", 103 | params={"table_name": SNOWFLAKE_COST_TABLE}, 104 | ) 105 | 106 | load_forestfire_cost_data = SnowflakeOperator( 107 | task_id="load_forestfire_cost_data", 108 | sql="load_forestfire_cost_data.sql", 109 | params={"table_name": SNOWFLAKE_FORESTFIRE_COST_TABLE}, 110 | ) 111 | 112 | """ 113 | #### Transform 114 | Transform the forestfire_costs table to perform 115 | sample logic. 116 | """ 117 | transform_forestfire_cost_table = SnowflakeOperator( 118 | task_id="transform_forestfire_cost_table", 119 | sql="transform_forestfire_cost_table.sql", 120 | params={"table_name": SNOWFLAKE_FORESTFIRE_COST_TABLE}, 121 | ) 122 | 123 | """ 124 | #### Quality checks 125 | Perform data quality checks on the various tables. 
126 | """ 127 | with TaskGroup( 128 | group_id="quality_check_group_forestfire", 129 | default_args={ 130 | "conn_id": SNOWFLAKE_CONN_ID, 131 | "on_failure_callback": slack_failure_notification, 132 | }, 133 | ) as quality_check_group_forestfire: 134 | """ 135 | #### Column-level data quality check 136 | Run data quality checks on columns of the forestfire table 137 | """ 138 | forestfire_column_checks = SQLColumnCheckOperator( 139 | task_id="forestfire_column_checks", 140 | table=SNOWFLAKE_FORESTFIRE_TABLE, 141 | column_mapping={ 142 | "ID": {"null_check": {"equal_to": 0}}, 143 | "RH": {"max": {"leq_to": 100}}, 144 | }, 145 | ) 146 | 147 | """ 148 | #### Table-level data quality check 149 | Run data quality checks on the forestfire table 150 | """ 151 | forestfire_table_checks = SQLTableCheckOperator( 152 | task_id="forestfire_table_checks", 153 | table=SNOWFLAKE_FORESTFIRE_TABLE, 154 | checks={"row_count_check": {"check_statement": ROW_COUNT_CHECK}}, 155 | ) 156 | 157 | with TaskGroup( 158 | group_id="quality_check_group_cost", 159 | default_args={ 160 | "conn_id": SNOWFLAKE_CONN_ID, 161 | "on_failure_callback": slack_failure_notification, 162 | }, 163 | ) as quality_check_group_cost: 164 | """ 165 | #### Column-level data quality check 166 | Run data quality checks on columns of the forestfire table 167 | """ 168 | cost_column_checks = SQLColumnCheckOperator( 169 | task_id="cost_column_checks", 170 | table=SNOWFLAKE_COST_TABLE, 171 | column_mapping={ 172 | "ID": {"null_check": {"equal_to": 0}}, 173 | "LAND_DAMAGE_COST": {"min": {"geq_to": 0}}, 174 | "PROPERTY_DAMAGE_COST": {"min": {"geq_to": 0}}, 175 | "LOST_PROFITS_COST": {"min": {"geq_to": 0}}, 176 | }, 177 | ) 178 | 179 | """ 180 | #### Table-level data quality check 181 | Run data quality checks on the forestfire table 182 | """ 183 | cost_table_checks = SQLTableCheckOperator( 184 | task_id="cost_table_checks", 185 | table=SNOWFLAKE_COST_TABLE, 186 | checks={"row_count_check": {"check_statement": ROW_COUNT_CHECK}}, 187 | ) 188 | 189 | with TaskGroup( 190 | group_id="quality_check_group_forestfire_costs", 191 | default_args={ 192 | "conn_id": SNOWFLAKE_CONN_ID, 193 | "on_failure_callback": slack_failure_notification, 194 | }, 195 | ) as quality_check_group_forestfire_costs: 196 | """ 197 | #### Column-level data quality check 198 | Run data quality checks on columns of the forestfire table 199 | """ 200 | forestfire_costs_column_checks = SQLColumnCheckOperator( 201 | task_id="forestfire_costs_column_checks", 202 | table=SNOWFLAKE_FORESTFIRE_COST_TABLE, 203 | column_mapping={"AREA": {"min": {"geq_to": 0}}}, 204 | ) 205 | 206 | """ 207 | #### Table-level data quality check 208 | Run data quality checks on the forestfire table 209 | """ 210 | forestfire_costs_table_checks = SQLTableCheckOperator( 211 | task_id="forestfire_costs_table_checks", 212 | table=SNOWFLAKE_FORESTFIRE_COST_TABLE, 213 | checks={ 214 | "row_count_check": {"check_statement": ROW_COUNT_CHECK}, 215 | "total_cost_check": { 216 | "check_statement": "land_damage_cost + property_damage_cost + lost_profits_cost = total_cost" 217 | }, 218 | }, 219 | ) 220 | 221 | """ 222 | #### Delete tables 223 | Clean up the tables created for the example. 
224 | """ 225 | delete_forestfire_table = SnowflakeOperator( 226 | task_id="delete_forestfire_table", 227 | sql="delete_snowflake_table.sql", 228 | params={"table_name": SNOWFLAKE_FORESTFIRE_TABLE}, 229 | ) 230 | 231 | delete_cost_table = SnowflakeOperator( 232 | task_id="delete_costs_table", 233 | sql="delete_snowflake_table.sql", 234 | params={"table_name": SNOWFLAKE_COST_TABLE}, 235 | ) 236 | 237 | delete_forestfire_cost_table = SnowflakeOperator( 238 | task_id="delete_forestfire_cost_table", 239 | sql="delete_snowflake_table.sql", 240 | params={"table_name": SNOWFLAKE_FORESTFIRE_COST_TABLE}, 241 | ) 242 | 243 | begin = EmptyOperator(task_id="begin") 244 | create_done = EmptyOperator(task_id="create_done") 245 | load_done = EmptyOperator(task_id="load_done") 246 | end = EmptyOperator(task_id="end") 247 | 248 | chain( 249 | begin, 250 | [create_forestfire_table, create_cost_table, create_forestfire_cost_table], 251 | create_done, 252 | [load_forestfire_data, load_cost_data], 253 | load_done, 254 | [quality_check_group_forestfire, quality_check_group_cost], 255 | load_forestfire_cost_data, 256 | quality_check_group_forestfire_costs, 257 | transform_forestfire_cost_table, 258 | [delete_forestfire_table, delete_cost_table, delete_forestfire_cost_table], 259 | end, 260 | ) 261 | -------------------------------------------------------------------------------- /dags/snowflake_examples/simple_snowflake.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### Simple EL Pipeline with Data Quality Checks Using Snowflake 3 | 4 | Runs a data quality check, in SQL, on the forest fires dataset 5 | 6 | Note that this DAG will clean up after itself and delete all data it uploads. 7 | 8 | Ensure a Snowflake Warehouse, Database, Schema, and Role exist for the Snowflake 9 | connection provided to the Operator. The names of these data should replace the 10 | dummy values at the top of the file. 11 | 12 | A Snowflake Connection is also needed, named `snowflake_default`. 13 | 14 | What makes this a simple data quality case is: 15 | 1. Absolute ground truth: the local CSV file is considered perfect and immutable. 16 | 2. No transformations or business logic. 17 | 3. Exact values of data to quality check are known. 18 | """ 19 | 20 | from airflow import DAG 21 | from airflow.models.baseoperator import chain 22 | from airflow.operators.empty import EmptyOperator 23 | from airflow.providers.common.sql.operators.sql import (SQLColumnCheckOperator, 24 | SQLTableCheckOperator) 25 | from airflow.providers.snowflake.operators.snowflake import SnowflakeOperator 26 | from airflow.utils.dates import datetime 27 | from airflow.utils.task_group import TaskGroup 28 | 29 | SNOWFLAKE_FORESTFIRE_TABLE = "forestfires" 30 | SNOWFLAKE_CONN_ID = "snowflake_default" 31 | 32 | 33 | with DAG( 34 | "simple_snowflake", 35 | description="Example DAG showcasing loading and data quality checking with Snowflake.", 36 | doc_md=__doc__, 37 | start_date=datetime(2021, 1, 1), 38 | schedule_interval=None, 39 | template_searchpath="/usr/local/airflow/include/sql/snowflake_examples/", 40 | catchup=False, 41 | ) as dag: 42 | 43 | """ 44 | #### Snowflake table creation 45 | Create the table to store sample forest fire data. 
46 | """ 47 | create_table = SnowflakeOperator( 48 | task_id="create_table", 49 | sql="{% include 'create_forestfire_table.sql' %}", 50 | params={"table_name": SNOWFLAKE_FORESTFIRE_TABLE}, 51 | ) 52 | 53 | """ 54 | #### Insert data 55 | Insert data into the Snowflake table using an existing SQL query (stored in 56 | the include/sql/snowflake_examples/ directory). 57 | """ 58 | load_data = SnowflakeOperator( 59 | task_id="insert_query", 60 | sql="{% include 'load_snowflake_forestfire_data.sql' %}", 61 | params={"table_name": SNOWFLAKE_FORESTFIRE_TABLE}, 62 | ) 63 | 64 | with TaskGroup( 65 | group_id="quality_checks", default_args={"conn_id": SNOWFLAKE_CONN_ID} 66 | ) as quality_check_group: 67 | """ 68 | #### Column-level data quality check 69 | Run data quality checks on columns of the audit table 70 | """ 71 | column_checks = SQLColumnCheckOperator( 72 | task_id="column_checks", 73 | table=SNOWFLAKE_FORESTFIRE_TABLE, 74 | column_mapping={"ID": {"null_check": {"equal_to": 0}}}, 75 | ) 76 | 77 | """ 78 | #### Table-level data quality check 79 | Run data quality checks on the audit table 80 | """ 81 | table_checks = SQLTableCheckOperator( 82 | task_id="table_checks", 83 | table=SNOWFLAKE_FORESTFIRE_TABLE, 84 | checks={"row_count_check": {"check_statement": "COUNT(*) = 9"}}, 85 | ) 86 | 87 | """ 88 | #### Delete table 89 | Clean up the table created for the example. 90 | """ 91 | delete_table = SnowflakeOperator( 92 | task_id="delete_table", 93 | sql="{% include 'delete_snowflake_table.sql' %}", 94 | params={"table_name": SNOWFLAKE_FORESTFIRE_TABLE}, 95 | ) 96 | 97 | begin = EmptyOperator(task_id="begin") 98 | end = EmptyOperator(task_id="end") 99 | 100 | chain(begin, create_table, load_data, quality_check_group, delete_table, end) 101 | -------------------------------------------------------------------------------- /dags/snowflake_examples/snowflake_dynamic_write_audit_publish.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### Data Quality Checks Using Snowflake and Dynamic Task Mapping. 3 | 4 | Map over a set of columns and perform data quality checks. 5 | 6 | This DAG shows how to use Airflow's dynamic task mapping to create tasks based off of a supplied list of columns to perform a data quality check. 7 | All DQ checks in this DAg are performed in SQL and are expressed in a task group. 8 | 9 | Note this DAG will clean up after itself once it's done running. 
10 | """ 11 | 12 | import json 13 | from pathlib import Path 14 | 15 | from airflow import DAG 16 | from airflow.models.baseoperator import chain 17 | from airflow.operators.empty import EmptyOperator 18 | from airflow.providers.common.sql.operators.sql import (SQLColumnCheckOperator, 19 | SQLTableCheckOperator) 20 | from airflow.providers.snowflake.operators.snowflake import SnowflakeOperator 21 | from airflow.utils.dates import datetime 22 | from airflow.utils.task_group import TaskGroup 23 | 24 | from include.forestfire_checks.checks import COL_CHECKS, TABLE_CHECKS 25 | from include.libs.schema_reg.base_schema_transforms import \ 26 | snowflake_load_column_string 27 | 28 | SNOWFLAKE_FORESTFIRE_TABLE = "forestfires" 29 | SNOWFLAKE_AUDIT_TABLE = f"{SNOWFLAKE_FORESTFIRE_TABLE}_AUDIT" 30 | 31 | base_path = Path(__file__).parents[2] 32 | table_schema_path = f"{base_path}/include/sql/snowflake_examples/table_schemas/" 33 | 34 | with DAG( 35 | "snowflake_dynamic_write_audit_publish", 36 | doc_md=__doc__, 37 | start_date=datetime(2021, 1, 1), 38 | schedule_interval=None, 39 | template_searchpath="/usr/local/airflow/include/sql/snowflake_examples/", 40 | default_args={"conn_id": "snowflake_default"}, 41 | catchup=False, 42 | ) as dag: 43 | """ 44 | #### Snowflake audit table creation 45 | Creates the tables to store sample data for testing 46 | """ 47 | create_forestfire_audit_table = SnowflakeOperator( 48 | task_id="create_forestfire_audit_table", 49 | sql="create_forestfire_table.sql", 50 | params={"table_name": SNOWFLAKE_AUDIT_TABLE}, 51 | ) 52 | 53 | """ 54 | #### Snowflake table creation 55 | Create the table to store verified sample data. 56 | """ 57 | create_forestfire_production_table = SnowflakeOperator( 58 | task_id="create_forestfire_production_table", 59 | sql="create_forestfire_table.sql", 60 | params={"table_name": SNOWFLAKE_FORESTFIRE_TABLE}, 61 | ) 62 | 63 | """ 64 | #### Insert data 65 | Insert data into the Snowflake audit table using an existing SQL query (stored in 66 | the include/sql/snowflake_examples/ directory). 
67 | """ 68 | load_data = SnowflakeOperator( 69 | task_id="insert_query", 70 | sql="load_snowflake_forestfire_data.sql", 71 | params={"table_name": SNOWFLAKE_AUDIT_TABLE}, 72 | ) 73 | 74 | with TaskGroup(group_id="quality_checks") as quality_check_group: 75 | """ 76 | #### Column-level data quality check 77 | Run data quality checks on columns of the audit table 78 | """ 79 | column_checks = SQLColumnCheckOperator.partial( 80 | task_id="column_checks", 81 | table=SNOWFLAKE_AUDIT_TABLE, 82 | ).expand(column_mapping=COL_CHECKS) 83 | 84 | """ 85 | #### Table-level data quality check 86 | Run data quality checks on the audit table 87 | """ 88 | table_checks = SQLTableCheckOperator.partial( 89 | task_id="table_checks", 90 | table=SNOWFLAKE_AUDIT_TABLE, 91 | ).expand(checks=TABLE_CHECKS) 92 | 93 | with open( 94 | f"{table_schema_path}/forestfire_schema.json", 95 | "r", 96 | ) as f: 97 | table_schema = json.load(f).get("forestfire") 98 | table_props = table_schema.get("properties") 99 | table_dimensions = table_schema.get("dimensions") 100 | table_metrics = table_schema.get("metrics") 101 | 102 | col_string = snowflake_load_column_string(table_props) 103 | 104 | """ 105 | #### Snowflake audit to production task 106 | Loads the data from the audit table to the production table 107 | """ 108 | copy_snowflake_audit_to_production_table = SnowflakeOperator( 109 | task_id="copy_snowflake_audit_to_production_table", 110 | sql="copy_forestfire_snowflake_audit.sql", 111 | params={ 112 | "table_name": SNOWFLAKE_FORESTFIRE_TABLE, 113 | "audit_table_name": f"{SNOWFLAKE_FORESTFIRE_TABLE}_AUDIT", 114 | "table_schema": table_props, 115 | "col_string": col_string, 116 | }, 117 | trigger_rule="all_success", 118 | ) 119 | 120 | """ 121 | #### Delete audit table 122 | Clean up the table created for the example. 123 | """ 124 | delete_audit_table = SnowflakeOperator( 125 | task_id="delete_audit_table", 126 | sql="delete_forestfire_table.sql", 127 | params={"table_name": f"{SNOWFLAKE_FORESTFIRE_TABLE}_AUDIT"}, 128 | trigger_rule="all_success", 129 | ) 130 | 131 | begin = EmptyOperator(task_id="begin") 132 | end = EmptyOperator(task_id="end") 133 | 134 | chain( 135 | begin, 136 | [create_forestfire_production_table, create_forestfire_audit_table], 137 | load_data, 138 | quality_check_group, 139 | copy_snowflake_audit_to_production_table, 140 | delete_audit_table, 141 | end, 142 | ) 143 | -------------------------------------------------------------------------------- /dags/snowflake_examples/snowflake_write_audit_publish.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### ELT Pipeline with Data Quality Checks Using Snowflake 3 | 4 | Example DAG showcasing loading and data quality checking with Snowflake with a Write, Audit, Publish pattern. 
5 | """ 6 | 7 | import json 8 | from pathlib import Path 9 | 10 | from airflow import DAG 11 | from airflow.models.baseoperator import chain 12 | from airflow.operators.empty import EmptyOperator 13 | from airflow.providers.common.sql.operators.sql import (SQLColumnCheckOperator, 14 | SQLTableCheckOperator) 15 | from airflow.providers.snowflake.operators.snowflake import SnowflakeOperator 16 | from airflow.utils.dates import datetime 17 | from airflow.utils.task_group import TaskGroup 18 | 19 | from include.libs.schema_reg.base_schema_transforms import \ 20 | snowflake_load_column_string 21 | 22 | SNOWFLAKE_FORESTFIRE_TABLE = "forestfires" 23 | SNOWFLAKE_AUDIT_TABLE = f"{SNOWFLAKE_FORESTFIRE_TABLE}_AUDIT" 24 | SNOWFLAKE_CONN_ID = "snowflake_default" 25 | 26 | base_path = Path(__file__).parents[2] 27 | table_schema_path = f"{base_path}/include/sql/snowflake_examples/table_schemas/" 28 | 29 | with DAG( 30 | "snowflake_write_audit_publish", 31 | doc_md=__doc__, 32 | start_date=datetime(2021, 1, 1), 33 | schedule_interval=None, 34 | template_searchpath="/usr/local/airflow/include/sql/snowflake_examples/", 35 | catchup=False, 36 | ) as dag: 37 | 38 | """ 39 | #### Snowflake audit table creation 40 | Creates the tables to store sample data for testing 41 | """ 42 | create_forestfire_audit_table = SnowflakeOperator( 43 | task_id="create_forestfire_audit_table", 44 | sql="create_forestfire_table.sql", 45 | params={"table_name": SNOWFLAKE_AUDIT_TABLE}, 46 | ) 47 | 48 | """ 49 | #### Snowflake table creation 50 | Create the table to store verified sample data. 51 | """ 52 | create_forestfire_production_table = SnowflakeOperator( 53 | task_id="create_forestfire_production_table", 54 | sql="create_forestfire_table.sql", 55 | params={"table_name": SNOWFLAKE_FORESTFIRE_TABLE}, 56 | ) 57 | 58 | """ 59 | #### Insert data 60 | Insert data into the Snowflake audit table using an existing SQL query (stored in 61 | the include/sql/snowflake_examples/ directory). 
62 | """ 63 | load_data = SnowflakeOperator( 64 | task_id="insert_query", 65 | sql="load_snowflake_forestfire_data.sql", 66 | params={"table_name": SNOWFLAKE_AUDIT_TABLE}, 67 | ) 68 | 69 | with TaskGroup( 70 | group_id="quality_checks", default_args={"conn_id": SNOWFLAKE_CONN_ID} 71 | ) as quality_check_group: 72 | """ 73 | #### Column-level data quality check 74 | Run data quality checks on columns of the audit table 75 | """ 76 | column_checks = SQLColumnCheckOperator( 77 | task_id="column_checks", 78 | table=SNOWFLAKE_AUDIT_TABLE, 79 | column_mapping={"ID": {"null_check": {"equal_to": 0}}}, 80 | ) 81 | 82 | """ 83 | #### Table-level data quality check 84 | Run data quality checks on the audit table 85 | """ 86 | table_checks = SQLTableCheckOperator( 87 | task_id="table_checks", 88 | table=SNOWFLAKE_AUDIT_TABLE, 89 | checks={"row_count_check": {"check_statement": "COUNT(*) = 9"}}, 90 | ) 91 | 92 | with open( 93 | f"{table_schema_path}/forestfire_schema.json", 94 | "r", 95 | ) as f: 96 | table_schema = json.load(f).get("forestfire") 97 | table_props = table_schema.get("properties") 98 | table_dimensions = table_schema.get("dimensions") 99 | table_metrics = table_schema.get("metrics") 100 | 101 | col_string = snowflake_load_column_string(table_props) 102 | 103 | """ 104 | #### Snowflake audit to production task 105 | Loads the data from the audit table to the production table 106 | """ 107 | copy_snowflake_audit_to_production_table = SnowflakeOperator( 108 | task_id="copy_snowflake_audit_to_production_table", 109 | sql="copy_forestfire_snowflake_audit.sql", 110 | params={ 111 | "table_name": SNOWFLAKE_FORESTFIRE_TABLE, 112 | "audit_table_name": f"{SNOWFLAKE_FORESTFIRE_TABLE}_AUDIT", 113 | "table_schema": table_props, 114 | "col_string": col_string, 115 | }, 116 | trigger_rule="all_success", 117 | ) 118 | 119 | """ 120 | #### Delete audit table 121 | Clean up the table created for the example. 122 | """ 123 | delete_audit_table = SnowflakeOperator( 124 | task_id="delete_audit_table", 125 | sql="delete_forestfire_table.sql", 126 | params={"table_name": f"{SNOWFLAKE_FORESTFIRE_TABLE}_AUDIT"}, 127 | trigger_rule="all_success", 128 | ) 129 | 130 | begin = EmptyOperator(task_id="begin") 131 | end = EmptyOperator(task_id="end") 132 | 133 | chain( 134 | begin, 135 | [create_forestfire_production_table, create_forestfire_audit_table], 136 | load_data, 137 | quality_check_group, 138 | copy_snowflake_audit_to_production_table, 139 | delete_audit_table, 140 | end, 141 | ) 142 | -------------------------------------------------------------------------------- /dags/sql_examples/sql_check.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### SQL Check Operators Data Quality Example 3 | 4 | "A sample Airflow DAG to perform data quality checks using SQL Operators. 5 | 6 | Before running the DAG, ensure you have an active and reachable SQL database 7 | running, with a connection to that database in an Airflow Connection, and 8 | the data loaded. This DAG **will not** run successfully as-is. For an 9 | out-of-the-box working demo, see the sql_data_quality_redshift_etl DAG. 10 | 11 | Note: The data files for this example do **not** include an `upload_date` 12 | column. This column is needed for the interval check, and is added as a 13 | Task in sql_check_redshift_etl.py. 
14 | """ 15 | 16 | from airflow import DAG 17 | from airflow.models.baseoperator import chain 18 | from airflow.operators.dummy_operator import DummyOperator 19 | from airflow.operators.sql import (SQLCheckOperator, SQLIntervalCheckOperator, 20 | SQLThresholdCheckOperator, 21 | SQLValueCheckOperator) 22 | from airflow.utils.dates import datetime 23 | from airflow.utils.task_group import TaskGroup 24 | 25 | # This table variable is a placeholder, in a live environment, it is better 26 | # to pull the table info from a Variable in a template 27 | TABLE = "yellow_tripdata" 28 | DATES = ["2019-01", "2019-02"] 29 | 30 | # By putting conn_id as a default_arg, the arg is passed to every task, 31 | # reducing boilerplate 32 | with DAG( 33 | "sql_data_quality", 34 | start_date=datetime(2021, 7, 7), 35 | doc_md=__doc__, 36 | schedule_interval=None, 37 | default_args={"conn_id": "postgres_default"}, 38 | template_searchpath="/usr/local/airflow/include/sql/sql_examples/", 39 | catchup=False, 40 | ) as dag: 41 | 42 | begin = DummyOperator(task_id="begin") 43 | end = DummyOperator(task_id="end") 44 | 45 | """ 46 | #### Run Table-Level Quality Check 47 | Ensure that the correct number of rows are present in the table. 48 | """ 49 | value_check = SQLValueCheckOperator( 50 | task_id="check_row_count", 51 | sql=f"SELECT COUNT(*) FROM {TABLE};", 52 | pass_value=20000, 53 | ) 54 | 55 | """ 56 | #### Run Interval Check 57 | Check that the average trip distance today is within a desirable threshold 58 | compared to the average trip distance yesterday. 59 | """ 60 | interval_check = SQLIntervalCheckOperator( 61 | task_id="check_interval_data", 62 | table=TABLE, 63 | days_back=-1, 64 | date_filter_column="upload_date", 65 | metrics_thresholds={"AVG(trip_distance)": 1.5}, 66 | ) 67 | 68 | """ 69 | #### Threshold Check 70 | Similar to the threshold cases in the Row-Level Check above, ensures that 71 | certain row(s) values meet the desired threshold(s). 72 | """ 73 | threshold_check = SQLThresholdCheckOperator( 74 | task_id="check_threshold", 75 | sql=f"SELECT MAX(passenger_count) FROM {TABLE};", 76 | min_threshold=1, 77 | max_threshold=8, 78 | ) 79 | 80 | """ 81 | #### Run Row-Level Quality Checks 82 | For each date of data, run checks on 10 rows to ensure basic data quality 83 | cases (found in the .sql file) pass. 84 | """ 85 | with TaskGroup(group_id="row_quality_checks") as quality_check_group: 86 | # Create 10 tasks, to spot-check 10 random rows 87 | for i in range(0, 10): 88 | """ 89 | #### Run Row-Level Quality Checks 90 | Runs a series of checks on different columns of data for a single, 91 | randomly chosen row. This acts as a spot-check on data. 92 | """ 93 | SQLCheckOperator( 94 | task_id=f"yellow_tripdata_row_quality_check_{i}", 95 | sql="row_quality_yellow_tripdata_check.sql", 96 | ) 97 | 98 | chain( 99 | begin, 100 | [quality_check_group, value_check, interval_check, threshold_check], 101 | end, 102 | ) 103 | -------------------------------------------------------------------------------- /dags/sql_examples/sql_check_redshift_etl.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### SQL Check Operators Data Quality ETL Example 3 | 4 | Use the SQLCheckOperators to perform data quality checks in ETL use cases. 
5 | 6 | Before running the DAG, set the following in an Airflow or Environment Variable: 7 | - key: aws_configs 8 | - value: { "s3_bucket": [bucket_name], "s3_key_prefix": [key_prefix], "redshift_table": [table_name]} 9 | Fully replacing [bucket_name], [key_prefix], and [table_name]. 10 | 11 | See the README for information on how to set up your Redshift connection. 12 | This DAG can be used with other databases as long as the Redshift (and possibly 13 | transfer operators) are changed. 14 | """ 15 | 16 | import pandas as pd 17 | from airflow import DAG 18 | from airflow.decorators import task 19 | from airflow.models.baseoperator import chain 20 | from airflow.operators.dummy_operator import DummyOperator 21 | from airflow.operators.sql import (SQLCheckOperator, SQLIntervalCheckOperator, 22 | SQLThresholdCheckOperator, 23 | SQLValueCheckOperator) 24 | from airflow.providers.amazon.aws.transfers.local_to_s3 import \ 25 | LocalFilesystemToS3Operator 26 | from airflow.providers.amazon.aws.transfers.s3_to_redshift import \ 27 | S3ToRedshiftOperator 28 | from airflow.providers.postgres.operators.postgres import PostgresOperator 29 | from airflow.utils.dates import datetime 30 | from airflow.utils.task_group import TaskGroup 31 | 32 | DATES = ["2019-01", "2019-02"] 33 | TASK_DICT = {} 34 | 35 | with DAG( 36 | "sql_data_quality_redshift_etl", 37 | start_date=datetime(2021, 7, 7), 38 | doc_md=__doc__, 39 | schedule_interval=None, 40 | default_args={"conn_id": "redshift_default"}, 41 | template_searchpath="/usr/local/airflow/include/sql/sql_examples/", 42 | catchup=False, 43 | ) as dag: 44 | 45 | """ 46 | #### Dummy operators 47 | Help label start and end of dag. Converges exist because lists of tasks 48 | cannot set another list as downstream. 49 | """ 50 | begin = DummyOperator(task_id="begin") 51 | end = DummyOperator(task_id="end") 52 | converge_1 = DummyOperator(task_id="converge_1") 53 | converge_2 = DummyOperator(task_id="converge_2") 54 | 55 | """ 56 | #### Create Redshift Table 57 | For demo purposes, create a Redshift table to store the forest fire data to. 58 | The database is not automatically destroyed at the end of the example; ensure 59 | this is done manually to avoid unnecessary costs. Additionally, set-up may 60 | need to be done in Airflow connections to allow access to Redshift. 61 | """ 62 | create_redshift_table = PostgresOperator( 63 | task_id="create_table", 64 | sql="create_redshift_yellow_tripdata_table.sql", 65 | postgres_conn_id="redshift_default", 66 | ) 67 | 68 | with TaskGroup(group_id="row_quality_checks") as quality_check_group: 69 | # Create 10 tasks, to spot-check 10 random rows 70 | for i in range(0, 10): 71 | """ 72 | #### Run Row-Level Quality Checks 73 | Runs a series of checks on different columns of data for a single, 74 | randomly chosen row. This acts as a spot-check on data. Note: When 75 | using the sample data, row level checks may fail. Which column(s) of 76 | the row that failed may be checked in the logs. To further diagnose 77 | the issue, run a modified query directly in Redshift's query editor 78 | to check individual values against calculations and expectations. 79 | """ 80 | SQLCheckOperator( 81 | task_id=f"yellow_tripdata_row_quality_check_{i}", 82 | sql="row_quality_yellow_tripdata_check.sql", 83 | ) 84 | 85 | """ 86 | #### Run Table-Level Quality Check 87 | Ensure that the correct number of rows are present in the table. 
88 | """ 89 | value_check = SQLValueCheckOperator( 90 | task_id="check_row_count", 91 | sql="SELECT COUNT(*) FROM {{ var.json.aws_configs.redshift_table }};", 92 | pass_value=20000, 93 | ) 94 | 95 | """ 96 | #### Run Interval Check 97 | Check that the average trip distance today is within a desirable threshold 98 | compared to the average trip distance yesterday. 99 | """ 100 | interval_check = SQLIntervalCheckOperator( 101 | task_id="check_interval_data", 102 | table="{{ var.json.aws_configs.redshift_table }}", 103 | days_back=-1, 104 | date_filter_column="upload_date", 105 | metrics_thresholds={"AVG(trip_distance)": 1.5}, 106 | ) 107 | 108 | """ 109 | #### Threshold Check 110 | Similar to the threshold cases in the Row-Level Check above, ensures that 111 | certain row(s) values meet the desired threshold(s). 112 | """ 113 | threshold_check = SQLThresholdCheckOperator( 114 | task_id="check_threshold", 115 | sql="SELECT MAX(passenger_count) FROM {{ var.json.aws_configs.redshift_table }};", 116 | min_threshold=1, 117 | max_threshold=8, 118 | ) 119 | 120 | """ 121 | #### Drop Redshift table 122 | Drops the Redshift table if it exists already. This is to make sure that the 123 | data in the success and failure cases do not interfere with each other during 124 | the data quality check. 125 | """ 126 | drop_redshift_table = PostgresOperator( 127 | task_id="drop_table", 128 | sql="drop_redshift_yellow_tripdata_table.sql", 129 | postgres_conn_id="redshift_default", 130 | ) 131 | 132 | @task 133 | def add_upload_date(file_path, upload_date): 134 | """ 135 | #### Transform Task 136 | In general, it is not recommended to perform transform operations in 137 | Airflow Tasks, as Airflow is designed to be an orchestrator, not a 138 | computation engine. However, the transform is done here as it is a 139 | relatively small operation, simply adding an upload_date column to the 140 | dataframe for use in the SQL data quality checks later. Doing the 141 | transform here also makes this example more easily extensible to the 142 | use of other backend datastores. 143 | """ 144 | trip_dict = pd.read_csv( 145 | file_path, 146 | header=0, 147 | parse_dates=["pickup_datetime"], 148 | infer_datetime_format=True, 149 | ) 150 | trip_dict["upload_date"] = upload_date 151 | trip_dict.to_csv(file_path, header=True, index=False) 152 | 153 | @task 154 | def delete_upload_date(file_path): 155 | """ 156 | #### Drop added column 157 | Drops the upload_date column used for this example, as this data is used 158 | by other example DAGs in this repository, so it should not interfere 159 | with those. 160 | """ 161 | trip_dict = pd.read_csv( 162 | file_path, 163 | header=0, 164 | parse_dates=["pickup_datetime"], 165 | infer_datetime_format=True, 166 | ) 167 | trip_dict.drop(columns="upload_date", inplace=True) 168 | trip_dict.to_csv(file_path, header=True, index=False) 169 | 170 | for i, date in enumerate(DATES): 171 | file_path = f"/usr/local/airflow/include/sample_data/yellow_trip_data/yellow_tripdata_sample_{date}.csv" 172 | 173 | TASK_DICT[f"add_upload_date_{date}"] = add_upload_date( 174 | file_path, "{{ macros.ds_add(ds, " + str(-i) + ") }}" 175 | ) 176 | 177 | """ 178 | #### Upload task 179 | Simply loads the file to a specified location in S3. 
180 | """ 181 | TASK_DICT[f"upload_to_s3_{date}"] = LocalFilesystemToS3Operator( 182 | task_id=f"upload_to_s3_{date}", 183 | filename=file_path, 184 | dest_key="{{ var.json.aws_configs.s3_key_prefix }}/" + file_path, 185 | dest_bucket="{{ var.json.aws_configs.s3_bucket }}", 186 | aws_conn_id="aws_default", 187 | replace=True, 188 | ) 189 | 190 | """ 191 | #### Redshift load task 192 | Loads the S3 data from the previous load to a Redshift table (specified 193 | in the Airflow Variables backend). 194 | """ 195 | TASK_DICT[f"load_to_redshift_{date}"] = S3ToRedshiftOperator( 196 | task_id=f"load_to_redshift_{date}", 197 | s3_bucket="{{ var.json.aws_configs.s3_bucket }}", 198 | s3_key="{{ var.json.aws_configs.s3_key_prefix }}/" + file_path, 199 | schema="PUBLIC", 200 | table="{{ var.json.aws_configs.redshift_table }}", 201 | copy_options=[ 202 | "csv", 203 | "ignoreheader 1", 204 | "TIMEFORMAT AS 'YYYY-MM-DD HH24:MI:SS'", 205 | ], 206 | ) 207 | 208 | TASK_DICT[f"delete_upload_date_{date}"] = delete_upload_date(file_path) 209 | 210 | chain( 211 | begin, 212 | [TASK_DICT[f"add_upload_date_{date}"]], 213 | converge_1, 214 | [TASK_DICT[f"upload_to_s3_{date}"]], 215 | create_redshift_table, 216 | [TASK_DICT[f"load_to_redshift_{date}"]], 217 | converge_2, 218 | [quality_check_group, value_check, interval_check, threshold_check], 219 | drop_redshift_table, 220 | [TASK_DICT[f"delete_upload_date_{date}"]], 221 | end, 222 | ) 223 | -------------------------------------------------------------------------------- /include/forestfire_checks/checks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Table Level Checks 3 | """ 4 | TABLE_CHECKS = [ 5 | {"row_count_check": {"check_statement": "COUNT(*) = 9"}}, 6 | {"dmc_less_than_twice_dc_check": {"check_statement": "2 * dmc < dc"}} 7 | # could be cool to check the table schema against known columns, as well 8 | ] 9 | 10 | """ 11 | Column Level Checks 12 | """ 13 | COL_CHECKS = [ 14 | {"id": { 15 | "null_check": {"equal_to": 0}, 16 | "distinct_check": {"equal_to": 9} 17 | }}, 18 | {"ffmc": { 19 | "min": {"geq_to": 50}, 20 | "max": {"less_than": 100} 21 | }}, 22 | ] 23 | -------------------------------------------------------------------------------- /include/gcs_xcom_backend.py: -------------------------------------------------------------------------------- 1 | ## Source https://medium.com/apache-airflow/airflow-2-0-dag-authoring-redesigned-651edc397178 2 | 3 | from typing import Any 4 | from airflow.models.xcom import BaseXCom 5 | from airflow.providers.google.cloud.hooks.gcs import GCSHook 6 | 7 | import pandas as pd 8 | import uuid 9 | 10 | 11 | class GCSXComBackend(BaseXCom): 12 | PREFIX = "xcom_gcs://" 13 | BUCKET_NAME = "xcom_gcs" 14 | 15 | @staticmethod 16 | def serialize_value(value: Any): 17 | if isinstance(value, pd.DataFrame): 18 | hook = GCSHook() 19 | object_name = "data_" + str(uuid.uuid4()) 20 | with hook.provide_file_and_upload( 21 | bucket_name=GCSXComBackend.BUCKET_NAME, 22 | object_name=object_name, 23 | ) as f: 24 | value.to_csv(f.name, index=False) 25 | # Append prefix to persist information that the file 26 | # has to be downloaded from GCS 27 | value = GCSXComBackend.PREFIX + object_name 28 | return BaseXCom.serialize_value(value) 29 | 30 | @staticmethod 31 | def deserialize_value(result) -> Any: 32 | result = BaseXCom.deserialize_value(result) 33 | if isinstance(result, str) and result.startswith(GCSXComBackend.PREFIX): 34 | object_name = result.replace(GCSXComBackend.PREFIX, "") 35 | with 
GCSHook().provide_file( 36 | bucket_name=GCSXComBackend.BUCKET_NAME, 37 | object_name=object_name, 38 | ) as f: 39 | f.flush() 40 | result = pd.read_csv(f.name) 41 | return result 42 | -------------------------------------------------------------------------------- /include/great_expectations/.gitignore: -------------------------------------------------------------------------------- 1 | uncommitted/ -------------------------------------------------------------------------------- /include/great_expectations/checkpoints/mlflow/feature_chk.yml: -------------------------------------------------------------------------------- 1 | name: mlflow.feature_chk 2 | config_version: 1.0 3 | template_name: 4 | module_name: great_expectations.checkpoint 5 | class_name: Checkpoint 6 | run_name_template: '%Y%m%d-%H%M%S-my-run-name-template' 7 | expectation_suite_name: mlflow.census_adult_income_features 8 | batch_request: 9 | action_list: 10 | - name: store_validation_result 11 | action: 12 | class_name: StoreValidationResultAction 13 | - name: store_evaluation_params 14 | action: 15 | class_name: StoreEvaluationParametersAction 16 | - name: update_data_docs 17 | action: 18 | class_name: UpdateDataDocsAction 19 | site_names: [] 20 | evaluation_parameters: {} 21 | runtime_configuration: {} 22 | validations: 23 | - batch_request: 24 | datasource_name: my_mlflow_datasource 25 | data_connector_name: default_inferred_data_connector_name 26 | data_asset_name: mlflow_dataframe 27 | data_connector_query: 28 | profilers: [] 29 | ge_cloud_id: 30 | expectation_suite_ge_cloud_id: 31 | -------------------------------------------------------------------------------- /include/great_expectations/checkpoints/mlflow/preprocess_chk.yml: -------------------------------------------------------------------------------- 1 | name: mlflow.preprocess_chk 2 | config_version: 1.0 3 | template_name: 4 | module_name: great_expectations.checkpoint 5 | class_name: Checkpoint 6 | run_name_template: '%Y%m%d-%H%M%S-my-run-name-template' 7 | expectation_suite_name: mlflow.census_adult_income_preprocess 8 | batch_request: 9 | action_list: 10 | - name: store_validation_result 11 | action: 12 | class_name: StoreValidationResultAction 13 | - name: store_evaluation_params 14 | action: 15 | class_name: StoreEvaluationParametersAction 16 | - name: update_data_docs 17 | action: 18 | class_name: UpdateDataDocsAction 19 | site_names: [] 20 | evaluation_parameters: {} 21 | runtime_configuration: {} 22 | validations: 23 | - batch_request: 24 | datasource_name: my_mlflow_datasource 25 | data_connector_name: default_inferred_data_connector_name 26 | data_asset_name: mlflow_dataframe 27 | data_connector_query: 28 | profilers: [] 29 | ge_cloud_id: 30 | expectation_suite_ge_cloud_id: 31 | -------------------------------------------------------------------------------- /include/great_expectations/checkpoints/taxi/fail/chk.yml: -------------------------------------------------------------------------------- 1 | name: taxi.fail.chk 2 | config_version: 1.0 3 | template_name: 4 | module_name: great_expectations.checkpoint 5 | class_name: Checkpoint 6 | run_name_template: '%Y%m%d-%H%M%S-my-run-name-template' 7 | expectation_suite_name: taxi.demo 8 | batch_request: 9 | action_list: 10 | - name: store_validation_result 11 | action: 12 | class_name: StoreValidationResultAction 13 | - name: store_evaluation_params 14 | action: 15 | class_name: StoreEvaluationParametersAction 16 | - name: update_data_docs 17 | action: 18 | class_name: UpdateDataDocsAction 19 | 
site_names: [] 20 | evaluation_parameters: {} 21 | runtime_configuration: {} 22 | validations: 23 | - batch_request: 24 | datasource_name: my_datasource 25 | data_connector_name: default_inferred_data_connector_name 26 | data_asset_name: yellow_tripdata_sample_2019-02.csv 27 | data_connector_query: 28 | index: -1 29 | profilers: [] 30 | ge_cloud_id: 31 | expectation_suite_ge_cloud_id: 32 | -------------------------------------------------------------------------------- /include/great_expectations/checkpoints/taxi/pass/chk.yml: -------------------------------------------------------------------------------- 1 | name: taxi.pass.chk 2 | config_version: 1.0 3 | template_name: 4 | module_name: great_expectations.checkpoint 5 | class_name: Checkpoint 6 | run_name_template: '%Y%m%d-%H%M%S-my-run-name-template' 7 | expectation_suite_name: taxi.demo 8 | batch_request: 9 | action_list: 10 | - name: store_validation_result 11 | action: 12 | class_name: StoreValidationResultAction 13 | - name: store_evaluation_params 14 | action: 15 | class_name: StoreEvaluationParametersAction 16 | - name: update_data_docs 17 | action: 18 | class_name: UpdateDataDocsAction 19 | site_names: [] 20 | evaluation_parameters: {} 21 | runtime_configuration: {} 22 | validations: 23 | - batch_request: 24 | datasource_name: my_datasource 25 | data_connector_name: default_inferred_data_connector_name 26 | data_asset_name: yellow_tripdata_sample_2019-01.csv 27 | data_connector_query: 28 | index: -1 29 | profilers: [] 30 | ge_cloud_id: 31 | expectation_suite_ge_cloud_id: 32 | -------------------------------------------------------------------------------- /include/great_expectations/configs/bigquery_configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pathlib import Path 4 | from great_expectations.core.batch import BatchRequest 5 | from great_expectations.data_context.types.base import ( 6 | DataContextConfig, 7 | CheckpointConfig 8 | ) 9 | 10 | base_path = Path(__file__).parents[3] 11 | data_dir = os.path.join(base_path, "include", "data") 12 | ge_root_dir = os.path.join(base_path, "include", "great_expectations") 13 | 14 | bigquery_data_context_config = DataContextConfig( 15 | **{ 16 | "config_version": 3.0, 17 | "datasources": { 18 | "my_bigquery_datasource": { 19 | "data_connectors": { 20 | "default_inferred_data_connector_name": { 21 | "default_regex": { 22 | "group_names": ["data_asset_name"], 23 | "pattern": "(.*)", 24 | }, 25 | "base_directory": data_dir, 26 | "class_name": "InferredAssetFilesystemDataConnector", 27 | }, 28 | "default_runtime_data_connector_name": { 29 | "batch_identifiers": ["default_identifier_name"], 30 | "class_name": "RuntimeDataConnector", 31 | }, 32 | }, 33 | "execution_engine": { 34 | "class_name": "PandasExecutionEngine", 35 | }, 36 | "class_name": "Datasource", 37 | } 38 | }, 39 | "config_variables_file_path": os.path.join( 40 | ge_root_dir, "uncommitted", "config_variables.yml" 41 | ), 42 | "stores": { 43 | "expectations_store": { 44 | "class_name": "ExpectationsStore", 45 | "store_backend": { 46 | "class_name": "TupleFilesystemStoreBackend", 47 | "base_directory": os.path.join(ge_root_dir, "expectations"), 48 | }, 49 | }, 50 | "validations_store": { 51 | "class_name": "ValidationsStore", 52 | "store_backend": { 53 | "class_name": "TupleFilesystemStoreBackend", 54 | "base_directory": os.path.join( 55 | ge_root_dir, "uncommitted", "validations" 56 | ), 57 | }, 58 | }, 59 | "evaluation_parameter_store": {"class_name": 
"EvaluationParameterStore"}, 60 | "checkpoint_store": { 61 | "class_name": "CheckpointStore", 62 | "store_backend": { 63 | "class_name": "TupleFilesystemStoreBackend", 64 | "suppress_store_backend_id": True, 65 | "base_directory": os.path.join(ge_root_dir, "checkpoints"), 66 | }, 67 | }, 68 | }, 69 | "expectations_store_name": "expectations_store", 70 | "validations_store_name": "validations_store", 71 | "evaluation_parameter_store_name": "evaluation_parameter_store", 72 | "checkpoint_store_name": "checkpoint_store", 73 | "data_docs_sites": { 74 | "local_site": { 75 | "class_name": "SiteBuilder", 76 | "show_how_to_buttons": True, 77 | "store_backend": { 78 | "class_name": "TupleFilesystemStoreBackend", 79 | "base_directory": os.path.join( 80 | ge_root_dir, "uncommitted", "data_docs", "local_site" 81 | ), 82 | }, 83 | "site_index_builder": {"class_name": "DefaultSiteIndexBuilder"}, 84 | } 85 | }, 86 | "anonymous_usage_statistics": { 87 | "data_context_id": "abcdabcd-1111-2222-3333-abcdabcdabcd", 88 | "enabled": False, 89 | }, 90 | "notebooks": None, 91 | "concurrency": {"enabled": False}, 92 | } 93 | ) 94 | 95 | bigquery_checkpoint_config = CheckpointConfig( 96 | **{ 97 | "name": "taxi.pass.chk", 98 | "config_version": 1.0, 99 | "template_name": None, 100 | "class_name": "Checkpoint", 101 | "run_name_template": "%Y%m%d-%H%M%S-my-run-name-template", 102 | "expectation_suite_name": "taxi.demo", 103 | "batch_request": None, 104 | "action_list": [ 105 | { 106 | "name": "store_validation_result", 107 | "action": {"class_name": "StoreValidationResultAction"}, 108 | }, 109 | { 110 | "name": "store_evaluation_params", 111 | "action": {"class_name": "StoreEvaluationParametersAction"}, 112 | }, 113 | { 114 | "name": "update_data_docs", 115 | "action": {"class_name": "UpdateDataDocsAction", "site_names": []}, 116 | }, 117 | ], 118 | "evaluation_parameters": {}, 119 | "runtime_configuration": {}, 120 | "validations": [ 121 | { 122 | "batch_request": { 123 | "datasource_name": "my_bigquery_datasource", 124 | "data_connector_name": "default_inferred_data_connector_name", 125 | "data_asset_name": "taxi", 126 | "batch_spec_passthrough": { 127 | "bigquery_temp_table": "taxi_temp" 128 | }, 129 | }, 130 | } 131 | ], 132 | "profilers": [], 133 | "ge_cloud_id": None, 134 | "expectation_suite_ge_cloud_id": None, 135 | } 136 | ) 137 | 138 | bigquery_batch_request = BatchRequest( 139 | **{ 140 | "datasource_name": "my_bigquery_datasource", 141 | "data_connector_name": "default_inferred_data_connector_name", 142 | "data_asset_name": "great_expectations_bigquery_example.taxi", 143 | "data_connector_query": {"index": -1}, 144 | } 145 | ) 146 | -------------------------------------------------------------------------------- /include/great_expectations/configs/mlflow_checkpoint_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pathlib import Path 4 | 5 | from great_expectations.data_context.types.base import ( 6 | CheckpointConfig, 7 | ) 8 | 9 | base_path = Path(__file__).parents[3] 10 | data_dir = os.path.join(base_path, "include", "data") 11 | ge_root_dir = os.path.join(base_path, "include", "great_expectations") 12 | 13 | mlflow_preprocess_checkpoint_config = CheckpointConfig( 14 | **{ 15 | "name": "mlflow.preprocess_chk", 16 | "config_version": 1.0, 17 | "template_name": None, 18 | "class_name": "Checkpoint", 19 | "run_name_template": "%Y%m%d-%H%M%S-my-run-name-template", 20 | "expectation_suite_name": "mlflow.census_adult_income_preprocess", 21 | 
"batch_request": None, 22 | "action_list": [ 23 | { 24 | "name": "store_validation_result", 25 | "action": {"class_name": "StoreValidationResultAction"}, 26 | }, 27 | { 28 | "name": "store_evaluation_params", 29 | "action": {"class_name": "StoreEvaluationParametersAction"}, 30 | }, 31 | { 32 | "name": "update_data_docs", 33 | "action": {"class_name": "UpdateDataDocsAction"}, 34 | }, 35 | ], 36 | } 37 | ) 38 | 39 | mlflow_feature_checkpoint_config = CheckpointConfig( 40 | **{ 41 | "name": "mlflow.feature_chk", 42 | "config_version": 1.0, 43 | "template_name": None, 44 | "class_name": "Checkpoint", 45 | "run_name_template": "%Y%m%d-%H%M%S-my-run-name-template", 46 | "expectation_suite_name": "mlflow.census_adult_income_features", 47 | "batch_request": None, 48 | "action_list": [ 49 | { 50 | "name": "store_validation_result", 51 | "action": {"class_name": "StoreValidationResultAction"}, 52 | }, 53 | { 54 | "name": "store_evaluation_params", 55 | "action": {"class_name": "StoreEvaluationParametersAction"}, 56 | }, 57 | { 58 | "name": "update_data_docs", 59 | "action": {"class_name": "UpdateDataDocsAction"}, 60 | }, 61 | ], 62 | } 63 | ) 64 | -------------------------------------------------------------------------------- /include/great_expectations/configs/redshift_configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pathlib import Path 4 | from great_expectations.core.batch import BatchRequest 5 | from great_expectations.data_context.types.base import ( 6 | DataContextConfig, 7 | CheckpointConfig 8 | ) 9 | 10 | base_path = Path(__file__).parents[3] 11 | data_dir = os.path.join(base_path, "include", "data") 12 | ge_root_dir = os.path.join(base_path, "include", "great_expectations") 13 | 14 | redshift_data_context_config = DataContextConfig( 15 | **{ 16 | "config_version": 3.0, 17 | "datasources": { 18 | "my_redshift_datasource": { 19 | "module_name": "great_expectations.datasource", 20 | "data_connectors": { 21 | "default_inferred_data_connector_name": { 22 | "default_regex": { 23 | "group_names": ["data_asset_name"], 24 | "pattern": "(.*)", 25 | }, 26 | "base_directory": data_dir, 27 | "module_name": "great_expectations.datasource.data_connector", 28 | "class_name": "InferredAssetFilesystemDataConnector", 29 | }, 30 | "default_runtime_data_connector_name": { 31 | "batch_identifiers": ["default_identifier_name"], 32 | "module_name": "great_expectations.datasource.data_connector", 33 | "class_name": "RuntimeDataConnector", 34 | }, 35 | }, 36 | "execution_engine": { 37 | "module_name": "great_expectations.execution_engine", 38 | "class_name": "PandasExecutionEngine", 39 | }, 40 | "class_name": "Datasource", 41 | } 42 | }, 43 | "config_variables_file_path": os.path.join( 44 | ge_root_dir, "uncommitted", "config_variables.yml" 45 | ), 46 | "stores": { 47 | "expectations_store": { 48 | "class_name": "ExpectationsStore", 49 | "store_backend": { 50 | "class_name": "TupleFilesystemStoreBackend", 51 | "base_directory": os.path.join(ge_root_dir, "expectations"), 52 | }, 53 | }, 54 | "validations_store": { 55 | "class_name": "ValidationsStore", 56 | "store_backend": { 57 | "class_name": "TupleFilesystemStoreBackend", 58 | "base_directory": os.path.join( 59 | ge_root_dir, "uncommitted", "validations" 60 | ), 61 | }, 62 | }, 63 | "evaluation_parameter_store": {"class_name": "EvaluationParameterStore"}, 64 | "checkpoint_store": { 65 | "class_name": "CheckpointStore", 66 | "store_backend": { 67 | "class_name": "TupleFilesystemStoreBackend", 
68 | "suppress_store_backend_id": True, 69 | "base_directory": os.path.join(ge_root_dir, "checkpoints"), 70 | }, 71 | }, 72 | }, 73 | "expectations_store_name": "expectations_store", 74 | "validations_store_name": "validations_store", 75 | "evaluation_parameter_store_name": "evaluation_parameter_store", 76 | "checkpoint_store_name": "checkpoint_store", 77 | "data_docs_sites": { 78 | "local_site": { 79 | "class_name": "SiteBuilder", 80 | "show_how_to_buttons": True, 81 | "store_backend": { 82 | "class_name": "TupleFilesystemStoreBackend", 83 | "base_directory": os.path.join( 84 | ge_root_dir, "uncommitted", "data_docs", "local_site" 85 | ), 86 | }, 87 | "site_index_builder": {"class_name": "DefaultSiteIndexBuilder"}, 88 | } 89 | }, 90 | "anonymous_usage_statistics": { 91 | "data_context_id": "abcdabcd-1111-2222-3333-abcdabcdabcd", 92 | "enabled": False, 93 | }, 94 | "notebooks": None, 95 | "concurrency": {"enabled": False}, 96 | } 97 | ) 98 | 99 | redshift_checkpoint_config = CheckpointConfig( 100 | **{ 101 | "name": "taxi.pass.chk", 102 | "config_version": 1.0, 103 | "template_name": None, 104 | "class_name": "Checkpoint", 105 | "run_name_template": "%Y%m%d-%H%M%S-my-run-name-template", 106 | "expectation_suite_name": "taxi.demo", 107 | "batch_request": None, 108 | "action_list": [ 109 | { 110 | "name": "store_validation_result", 111 | "action": {"class_name": "StoreValidationResultAction"}, 112 | }, 113 | { 114 | "name": "store_evaluation_params", 115 | "action": {"class_name": "StoreEvaluationParametersAction"}, 116 | }, 117 | { 118 | "name": "update_data_docs", 119 | "action": {"class_name": "UpdateDataDocsAction", "site_names": []}, 120 | }, 121 | ], 122 | "evaluation_parameters": {}, 123 | "runtime_configuration": {}, 124 | "validations": [ 125 | { 126 | "batch_request": { 127 | "datasource_name": "my_redshift_datasource", 128 | "data_connector_name": "default_inferred_data_connector_name", 129 | "data_asset_name": "public.yellow_tripdata", 130 | }, 131 | } 132 | ], 133 | "profilers": [], 134 | "ge_cloud_id": None, 135 | "expectation_suite_ge_cloud_id": None, 136 | } 137 | ) 138 | 139 | redshift_batch_request = BatchRequest( 140 | **{ 141 | "datasource_name": "my_redshift_db", 142 | "data_connector_name": "default_inferred_data_connector_name", 143 | "data_asset_name": "yellow_tripdata_sample_2019-01.csv", 144 | "data_connector_query": {"index": -1}, 145 | } 146 | ) 147 | -------------------------------------------------------------------------------- /include/great_expectations/configs/s3_configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pathlib import Path 4 | from great_expectations.core.batch import BatchRequest 5 | from great_expectations.data_context.types.base import ( 6 | DataContextConfig, 7 | CheckpointConfig 8 | ) 9 | 10 | base_path = Path(__file__).parents[3] 11 | data_dir = os.path.join(base_path, "include", "data") 12 | ge_root_dir = os.path.join(base_path, "include", "great_expectations") 13 | 14 | s3_data_context_config = DataContextConfig( 15 | **{ 16 | "config_version": 3.0, 17 | "datasources": { 18 | "my_s3_config": { 19 | "module_name": "great_expectations.datasource", 20 | "data_connectors": { 21 | "default_inferred_data_connector_name": { 22 | "default_regex": { 23 | "group_names": ["yellow_tripdata", "date"], 24 | "pattern": "(yellow_tripdata_sample)_(\d{4}-\d{2})\.csv", 25 | }, 26 | "base_directory": "benji-dq-test/test/tripdata/", 27 | "module_name": 
"great_expectations.datasource.data_connector", 28 | "class_name": "InferredAssetS3DataConnector", 29 | }, 30 | "default_runtime_data_connector_name": { 31 | "batch_identifiers": ["default_identifier_name"], 32 | "module_name": "great_expectations.datasource.data_connector", 33 | "class_name": "RuntimeDataConnector", 34 | }, 35 | }, 36 | "execution_engine": { 37 | "module_name": "great_expectations.execution_engine", 38 | "class_name": "SqlAlchemyExecutionEngine" 39 | }, 40 | "class_name": "Datasource", 41 | } 42 | }, 43 | "config_variables_file_path": os.path.join( 44 | ge_root_dir, "uncommitted", "config_variables.yml" 45 | ), 46 | "stores": { 47 | "expectations_store": { 48 | "class_name": "ExpectationsStore", 49 | "store_backend": { 50 | "class_name": "TupleFilesystemStoreBackend", 51 | "base_directory": os.path.join(ge_root_dir, "expectations"), 52 | }, 53 | }, 54 | "validations_store": { 55 | "class_name": "ValidationsStore", 56 | "store_backend": { 57 | "class_name": "TupleFilesystemStoreBackend", 58 | "base_directory": os.path.join( 59 | ge_root_dir, "uncommitted", "validations" 60 | ), 61 | }, 62 | }, 63 | "evaluation_parameter_store": {"class_name": "EvaluationParameterStore"}, 64 | "checkpoint_store": { 65 | "class_name": "CheckpointStore", 66 | "store_backend": { 67 | "class_name": "TupleFilesystemStoreBackend", 68 | "suppress_store_backend_id": True, 69 | "base_directory": os.path.join(ge_root_dir, "checkpoints"), 70 | }, 71 | }, 72 | }, 73 | "expectations_store_name": "expectations_store", 74 | "validations_store_name": "validations_store", 75 | "evaluation_parameter_store_name": "evaluation_parameter_store", 76 | "checkpoint_store_name": "checkpoint_store", 77 | "data_docs_sites": { 78 | "local_site": { 79 | "class_name": "SiteBuilder", 80 | "show_how_to_buttons": True, 81 | "store_backend": { 82 | "class_name": "TupleFilesystemStoreBackend", 83 | "base_directory": os.path.join( 84 | ge_root_dir, "uncommitted", "data_docs", "local_site" 85 | ), 86 | }, 87 | "site_index_builder": {"class_name": "DefaultSiteIndexBuilder"}, 88 | } 89 | }, 90 | "anonymous_usage_statistics": { 91 | "data_context_id": "abcdabcd-1111-2222-3333-abcdabcdabcd", 92 | "enabled": False, 93 | }, 94 | "notebooks": None, 95 | "concurrency": {"enabled": False}, 96 | } 97 | ) 98 | 99 | snowflake_checkpoint_config = CheckpointConfig( 100 | **{ 101 | "name": "taxi.pass.chk", 102 | "config_version": 1.0, 103 | "template_name": None, 104 | "module_name": "great_expectations.checkpoint", 105 | "class_name": "Checkpoint", 106 | "run_name_template": "%Y%m%d-%H%M%S-my-run-name-template", 107 | "expectation_suite_name": "taxi.demo", 108 | "batch_request": None, 109 | "action_list": [ 110 | { 111 | "name": "store_validation_result", 112 | "action": {"class_name": "StoreValidationResultAction"}, 113 | }, 114 | { 115 | "name": "store_evaluation_params", 116 | "action": {"class_name": "StoreEvaluationParametersAction"}, 117 | }, 118 | { 119 | "name": "update_data_docs", 120 | "action": {"class_name": "UpdateDataDocsAction", "site_names": []}, 121 | }, 122 | ], 123 | "evaluation_parameters": {}, 124 | "runtime_configuration": {}, 125 | "validations": [ 126 | { 127 | "batch_request": { 128 | "datasource_name": "my_snowflake_db", 129 | "data_connector_name": "default_inferred_data_connector_name", 130 | "data_asset_name": "YELLOW_TRIPDATA", 131 | "data_connector_query": {"index": -1}, 132 | }, 133 | } 134 | ], 135 | "profilers": [], 136 | "ge_cloud_id": None, 137 | "expectation_suite_ge_cloud_id": None, 138 | } 139 | ) 140 | 
141 | snowflake_batch_request = BatchRequest( 142 | **{ 143 | "datasource_name": "my_snowflake_db", 144 | "data_connector_name": "default_inferred_data_connector_name", 145 | "data_asset_name": "yellow_tripdata_sample_2019-01.csv", 146 | "data_connector_query": {"index": -1}, 147 | } 148 | ) 149 | -------------------------------------------------------------------------------- /include/great_expectations/configs/snowflake_configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | from pathlib import Path 5 | from great_expectations.core.batch import BatchRequest 6 | from great_expectations.data_context.types.base import ( 7 | DataContextConfig, 8 | CheckpointConfig 9 | ) 10 | 11 | base_path = Path(__file__).parents[3] 12 | data_dir = os.path.join(base_path, "include", "data") 13 | ge_root_dir = os.path.join(base_path, "include", "great_expectations") 14 | connection_string = "" 15 | 16 | # Note: The user must first configure a `config_variable.yml` file for this to work 17 | # The file is not included with this repo. 18 | with open( 19 | f"{ge_root_dir}/uncommitted/config_variables.yml", 20 | "r", 21 | ) as f: 22 | connection_string = yaml.safe_load(f).get("my_snowflake_db") 23 | 24 | snowflake_data_context_config = DataContextConfig( 25 | **{ 26 | "config_version": 3.0, 27 | "datasources": {}, 28 | "config_variables_file_path": os.path.join( 29 | ge_root_dir, "uncommitted", "config_variables.yml" 30 | ), 31 | "stores": { 32 | "expectations_store": { 33 | "class_name": "ExpectationsStore", 34 | "store_backend": { 35 | "class_name": "TupleFilesystemStoreBackend", 36 | "base_directory": os.path.join(ge_root_dir, "expectations"), 37 | }, 38 | }, 39 | "validations_store": { 40 | "class_name": "ValidationsStore", 41 | "store_backend": { 42 | "class_name": "TupleFilesystemStoreBackend", 43 | "base_directory": os.path.join( 44 | ge_root_dir, "uncommitted", "validations" 45 | ), 46 | }, 47 | }, 48 | "evaluation_parameter_store": {"class_name": "EvaluationParameterStore"}, 49 | "checkpoint_store": { 50 | "class_name": "CheckpointStore", 51 | "store_backend": { 52 | "class_name": "TupleFilesystemStoreBackend", 53 | "suppress_store_backend_id": True, 54 | "base_directory": os.path.join(ge_root_dir, "checkpoints"), 55 | }, 56 | }, 57 | }, 58 | "expectations_store_name": "expectations_store", 59 | "validations_store_name": "validations_store", 60 | "evaluation_parameter_store_name": "evaluation_parameter_store", 61 | "checkpoint_store_name": "checkpoint_store", 62 | "data_docs_sites": { 63 | "local_site": { 64 | "class_name": "SiteBuilder", 65 | "show_how_to_buttons": True, 66 | "store_backend": { 67 | "class_name": "TupleFilesystemStoreBackend", 68 | "base_directory": os.path.join( 69 | ge_root_dir, "uncommitted", "data_docs", "local_site" 70 | ), 71 | }, 72 | "site_index_builder": {"class_name": "DefaultSiteIndexBuilder"}, 73 | } 74 | }, 75 | "anonymous_usage_statistics": { 76 | "data_context_id": "abcdabcd-1111-2222-3333-abcdabcdabcd", 77 | "enabled": False, 78 | }, 79 | "notebooks": None, 80 | "concurrency": {"enabled": False}, 81 | } 82 | ) 83 | 84 | snowflake_checkpoint_config = CheckpointConfig( 85 | **{ 86 | "name": "taxi.pass.chk", 87 | "config_version": 1.0, 88 | "template_name": None, 89 | "module_name": "great_expectations.checkpoint", 90 | "class_name": "Checkpoint", 91 | "run_name_template": "%Y%m%d-%H%M%S-my-run-name-template", 92 | "expectation_suite_name": "taxi.demo", 93 | "batch_request": None, 94 | 
"action_list": [ 95 | { 96 | "name": "store_validation_result", 97 | "action": {"class_name": "StoreValidationResultAction"}, 98 | }, 99 | { 100 | "name": "store_evaluation_params", 101 | "action": {"class_name": "StoreEvaluationParametersAction"}, 102 | }, 103 | { 104 | "name": "update_data_docs", 105 | "action": {"class_name": "UpdateDataDocsAction", "site_names": []}, 106 | }, 107 | ], 108 | "evaluation_parameters": {}, 109 | "runtime_configuration": {}, 110 | "validations": [ 111 | { 112 | "batch_request": { 113 | "datasource_name": "my_snowflake_datasource", 114 | "data_connector_name": "default_inferred_data_connector_name", 115 | "data_asset_name": "yellow_tripdata", 116 | }, 117 | } 118 | ], 119 | "profilers": [], 120 | "ge_cloud_id": None, 121 | "expectation_suite_ge_cloud_id": None, 122 | } 123 | ) 124 | 125 | snowflake_audit_checkpoint_config = CheckpointConfig( 126 | **{ 127 | "name": "taxi.pass.chk", 128 | "config_version": 1.0, 129 | "template_name": None, 130 | "class_name": "Checkpoint", 131 | "run_name_template": "%Y%m%d-%H%M%S-my-run-name-template", 132 | "expectation_suite_name": "taxi.demo", 133 | "batch_request": None, 134 | "action_list": [ 135 | { 136 | "name": "store_validation_result", 137 | "action": {"class_name": "StoreValidationResultAction"}, 138 | }, 139 | { 140 | "name": "store_evaluation_params", 141 | "action": {"class_name": "StoreEvaluationParametersAction"}, 142 | }, 143 | { 144 | "name": "update_data_docs", 145 | "action": {"class_name": "UpdateDataDocsAction", "site_names": []}, 146 | }, 147 | ], 148 | "evaluation_parameters": {}, 149 | "runtime_configuration": {}, 150 | "validations": [ 151 | { 152 | "batch_request": { 153 | "datasource_name": "my_snowflake_datasource", 154 | "data_connector_name": "default_inferred_data_connector_name", 155 | "data_asset_name": "yellow_tripdata_audit", 156 | }, 157 | } 158 | ], 159 | "profilers": [], 160 | "ge_cloud_id": None, 161 | "expectation_suite_ge_cloud_id": None, 162 | } 163 | ) 164 | 165 | snowflake_batch_request = BatchRequest( 166 | **{ 167 | "datasource_name": "my_snowflake_db", 168 | "data_connector_name": "default_inferred_data_connector_name", 169 | "data_asset_name": "yellow_tripdata_sample_2019-01.csv", 170 | "data_connector_query": {"index": -1}, 171 | } 172 | ) 173 | -------------------------------------------------------------------------------- /include/great_expectations/expectations/.ge_store_backend_id: -------------------------------------------------------------------------------- 1 | store_backend_id = abcdabcd-1111-2222-3333-abcdabcdabcd -------------------------------------------------------------------------------- /include/great_expectations/expectations/mlflow/census_adult_income_features.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_asset_type": null, 3 | "expectation_suite_name": "census_adult_income", 4 | "expectations": [ 5 | { 6 | "expectation_type": "expect_table_columns_to_match_ordered_list", 7 | "kwargs": { 8 | "column_list": [ 9 | "capital_gain", 10 | "capital_loss", 11 | "hours_per_week", 12 | "workclass_Federal-gov", 13 | "workclass_Local-gov", 14 | "workclass_Never-worked", 15 | "workclass_Private", 16 | "workclass_Self-emp-inc", 17 | "workclass_Self-emp-not-inc", 18 | "workclass_State-gov", 19 | "workclass_Unknown", 20 | "workclass_Without-pay", 21 | "education_10th", 22 | "education_11th", 23 | "education_12th", 24 | "education_1st-4th", 25 | "education_5th-6th", 26 | "education_7th-8th", 27 | "education_9th", 28 
| "education_Assoc-acdm", 29 | "education_Assoc-voc", 30 | "education_Bachelors", 31 | "education_Doctorate", 32 | "education_HS-grad", 33 | "education_Masters", 34 | "education_Preschool", 35 | "education_Prof-school", 36 | "education_Some-college", 37 | "occupation_Adm-clerical", 38 | "occupation_Armed-Forces", 39 | "occupation_Craft-repair", 40 | "occupation_Exec-managerial", 41 | "occupation_Farming-fishing", 42 | "occupation_Handlers-cleaners", 43 | "occupation_Machine-op-inspct", 44 | "occupation_Other-service", 45 | "occupation_Priv-house-serv", 46 | "occupation_Prof-specialty", 47 | "occupation_Protective-serv", 48 | "occupation_Sales", 49 | "occupation_Tech-support", 50 | "occupation_Transport-moving", 51 | "occupation_Unknown", 52 | "race_Amer-Indian-Eskimo", 53 | "race_Asian-Pac-Islander", 54 | "race_Black", 55 | "race_Other", 56 | "race_White", 57 | "sex_Female", 58 | "sex_Male", 59 | "income_bracket_>50K", 60 | "native_country_Cambodia", 61 | "native_country_Canada", 62 | "native_country_China", 63 | "native_country_Columbia", 64 | "native_country_Cuba", 65 | "native_country_Dominican-Republic", 66 | "native_country_Ecuador", 67 | "native_country_El-Salvador", 68 | "native_country_England", 69 | "native_country_France", 70 | "native_country_Germany", 71 | "native_country_Greece", 72 | "native_country_Guatemala", 73 | "native_country_Haiti", 74 | "native_country_Holand-Netherlands", 75 | "native_country_Honduras", 76 | "native_country_Hong", 77 | "native_country_Hungary", 78 | "native_country_India", 79 | "native_country_Iran", 80 | "native_country_Ireland", 81 | "native_country_Italy", 82 | "native_country_Jamaica", 83 | "native_country_Japan", 84 | "native_country_Laos", 85 | "native_country_Mexico", 86 | "native_country_Nicaragua", 87 | "native_country_Outlying-US(Guam-USVI-etc)", 88 | "native_country_Peru", 89 | "native_country_Philippines", 90 | "native_country_Poland", 91 | "native_country_Portugal", 92 | "native_country_Puerto-Rico", 93 | "native_country_Scotland", 94 | "native_country_South", 95 | "native_country_Taiwan", 96 | "native_country_Thailand", 97 | "native_country_Trinadad&Tobago", 98 | "native_country_United-States", 99 | "native_country_Unknown", 100 | "native_country_Vietnam", 101 | "native_country_Yugoslavia", 102 | "age_bins", 103 | "never_married" 104 | ] 105 | }, 106 | "meta": {} 107 | }, 108 | { 109 | "expectation_type": "expect_column_values_to_be_of_type", 110 | "kwargs": { 111 | "column": "sex_Male", 112 | "type_": "int" 113 | }, 114 | "meta": { 115 | "notes": { 116 | "content": "", 117 | "format": "markdown" 118 | } 119 | } 120 | }, 121 | { 122 | "expectation_type": "expect_column_values_to_be_of_type", 123 | "kwargs": { 124 | "column": "sex_Female", 125 | "type_": "int" 126 | }, 127 | "meta": { 128 | "notes": { 129 | "content": "", 130 | "format": "markdown" 131 | } 132 | } 133 | }, 134 | { 135 | "expectation_type": "expect_column_values_to_be_in_set", 136 | "kwargs": { 137 | "column": "sex_Male", 138 | "value_set": [ 139 | 0, 140 | 1 141 | ] 142 | }, 143 | "meta": { 144 | "notes": { 145 | "content": "", 146 | "format": "markdown" 147 | } 148 | } 149 | }, 150 | { 151 | "expectation_type": "expect_column_values_to_be_in_set", 152 | "kwargs": { 153 | "column": "sex_Female", 154 | "value_set": [ 155 | 0, 156 | 1 157 | ] 158 | }, 159 | "meta": { 160 | "notes": { 161 | "content": "", 162 | "format": "markdown" 163 | } 164 | } 165 | } 166 | ], 167 | "ge_cloud_id": null, 168 | "meta": { 169 | "great_expectations_version": "0.13.49" 170 | } 171 | } 172 | 
-------------------------------------------------------------------------------- /include/great_expectations/expectations/mlflow/census_adult_income_preprocess.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_asset_type": null, 3 | "expectation_suite_name": "census_adult_income_preprocess", 4 | "expectations": [ 5 | { 6 | "expectation_type": "expect_table_columns_to_match_ordered_list", 7 | "kwargs": { 8 | "column_list": [ 9 | "age", 10 | "workclass", 11 | "education", 12 | "marital_status", 13 | "occupation", 14 | "race", 15 | "sex", 16 | "capital_gain", 17 | "capital_loss", 18 | "hours_per_week", 19 | "native_country", 20 | "income_bracket" 21 | ] 22 | }, 23 | "meta": {} 24 | }, 25 | { 26 | "expectation_type": "expect_column_values_to_not_be_null", 27 | "kwargs": { 28 | "column": "age", 29 | "mostly": 1.0 30 | }, 31 | "meta": { 32 | "notes": { 33 | "content": "Ensure age is not null", 34 | "format": "markdown" 35 | } 36 | } 37 | }, 38 | { 39 | "expectation_type": "expect_column_values_to_not_be_null", 40 | "kwargs": { 41 | "column": "workclass", 42 | "mostly": 1.0 43 | }, 44 | "meta": { 45 | "notes": { 46 | "content": "Ensure workclass is not null", 47 | "format": "markdown" 48 | } 49 | } 50 | }, 51 | { 52 | "expectation_type": "expect_column_values_to_be_of_type", 53 | "kwargs": { 54 | "column": "hours_per_week", 55 | "type_": "int" 56 | }, 57 | "meta": { 58 | "notes": { 59 | "content": "", 60 | "format": "markdown" 61 | } 62 | } 63 | }, 64 | { 65 | "expectation_type": "expect_column_values_to_be_in_set", 66 | "kwargs": { 67 | "column": "sex", 68 | "value_set": [ 69 | "Male", 70 | "Female" 71 | ] 72 | }, 73 | "meta": { 74 | "notes": { 75 | "content": "", 76 | "format": "markdown" 77 | } 78 | } 79 | } 80 | ], 81 | "ge_cloud_id": null, 82 | "meta": { 83 | "great_expectations_version": "0.13.49" 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /include/great_expectations/expectations/taxi/demo.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_asset_type": null, 3 | "expectation_suite_name": "taxi.demo", 4 | "expectations": [ 5 | { 6 | "expectation_context": { 7 | "description": null 8 | }, 9 | "expectation_type": "expect_table_row_count_to_be_between", 10 | "ge_cloud_id": null, 11 | "kwargs": { 12 | "max_value": 11000, 13 | "min_value": 1000 14 | }, 15 | "meta": {} 16 | }, 17 | { 18 | "expectation_context": { 19 | "description": null 20 | }, 21 | "expectation_type": "expect_column_values_to_not_be_null", 22 | "ge_cloud_id": null, 23 | "kwargs": { 24 | "column": "vendor_id" 25 | }, 26 | "meta": {} 27 | }, 28 | { 29 | "expectation_context": { 30 | "description": null 31 | }, 32 | "expectation_type": "expect_column_distinct_values_to_be_in_set", 33 | "ge_cloud_id": null, 34 | "kwargs": { 35 | "column": "vendor_id", 36 | "value_set": [ 37 | 1, 38 | 2, 39 | 4 40 | ] 41 | }, 42 | "meta": {} 43 | }, 44 | { 45 | "expectation_context": { 46 | "description": null 47 | }, 48 | "expectation_type": "expect_column_values_to_be_between", 49 | "ge_cloud_id": null, 50 | "kwargs": { 51 | "column": "passenger_count", 52 | "max_value": 6, 53 | "min_value": 1 54 | }, 55 | "meta": {} 56 | } 57 | ], 58 | "ge_cloud_id": null, 59 | "meta": { 60 | "great_expectations_version": "0.13.49" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /include/great_expectations/expectations/taxi/demo_fail.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "data_asset_type": null, 3 | "expectation_suite_name": "taxi.demo", 4 | "expectations": [ 5 | { 6 | "expectation_context": { 7 | "description": null 8 | }, 9 | "expectation_type": "expect_table_row_count_to_be_between", 10 | "ge_cloud_id": null, 11 | "kwargs": { 12 | "max_value": 11000, 13 | "min_value": 9000 14 | }, 15 | "meta": {} 16 | }, 17 | { 18 | "expectation_context": { 19 | "description": null 20 | }, 21 | "expectation_type": "expect_column_values_to_not_be_null", 22 | "ge_cloud_id": null, 23 | "kwargs": { 24 | "column": "vendor_id" 25 | }, 26 | "meta": {} 27 | }, 28 | { 29 | "expectation_context": { 30 | "description": null 31 | }, 32 | "expectation_type": "expect_column_distinct_values_to_be_in_set", 33 | "ge_cloud_id": null, 34 | "kwargs": { 35 | "column": "vendor_id", 36 | "value_set": [ 37 | 1, 38 | 2 39 | ] 40 | }, 41 | "meta": {} 42 | }, 43 | { 44 | "expectation_context": { 45 | "description": null 46 | }, 47 | "expectation_type": "expect_column_values_to_be_between", 48 | "ge_cloud_id": null, 49 | "kwargs": { 50 | "column": "passenger_count", 51 | "max_value": 6, 52 | "min_value": 1 53 | }, 54 | "meta": {} 55 | } 56 | ], 57 | "ge_cloud_id": null, 58 | "meta": { 59 | "citations": [ 60 | { 61 | "batch_request": { 62 | "data_asset_name": "yellow_tripdata_sample_2019-01.csv", 63 | "data_connector_name": "default_inferred_data_connector_name", 64 | "datasource_name": "my_datasource", 65 | "limit": 1000 66 | }, 67 | "citation_date": "2021-12-06T16:11:55.452248Z", 68 | "comment": "Created suite added via CLI" 69 | } 70 | ], 71 | "great_expectations_version": "0.13.45" 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /include/great_expectations/expectations/test_suite.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_asset_type": null, 3 | "expectation_suite_name": "test_suite", 4 | "expectations": [], 5 | "ge_cloud_id": null, 6 | "meta": { 7 | "great_expectations_version": "0.13.49" 8 | } 9 | } -------------------------------------------------------------------------------- /include/great_expectations/great_expectations.yml: -------------------------------------------------------------------------------- 1 | # Welcome to Great Expectations! Always know what to expect from your data. 2 | # 3 | # Here you can define datasources, batch kwargs generators, integrations and 4 | # more. This file is intended to be committed to your repo. For help with 5 | # configuration please: 6 | # - Read our docs: https://docs.greatexpectations.io/en/latest/reference/spare_parts/data_context_reference.html#configuration 7 | # - Join our slack channel: http://greatexpectations.io/slack 8 | 9 | # config_version refers to the syntactic version of this config file, and is used in maintaining backwards compatibility 10 | # It is auto-generated and usually does not need to be changed. 11 | config_version: 3.0 12 | 13 | # Datasources tell Great Expectations where your data lives and how to get it. 14 | # You can use the CLI command `great_expectations datasource new` to help you 15 | # add a new datasource. 
Read more at https://docs.greatexpectations.io/en/latest/reference/core_concepts/datasource.html 16 | 17 | # NOTE: Datasources in this repository are built on-the-fly by the GreatExpectationsOperator 18 | 19 | # The plugins_directory will be added to your python path for custom modules 20 | # used to override and extend Great Expectations. 21 | plugins_directory: plugins/ 22 | 23 | stores: 24 | # Stores are configurable places to store things like Expectations, Validations 25 | # Data Docs, and more. These are for advanced users only - most users can simply 26 | # leave this section alone. 27 | # 28 | # Three stores are required: expectations, validations, and 29 | # evaluation_parameters, and must exist with a valid store entry. Additional 30 | # stores can be configured for uses such as data_docs, etc. 31 | expectations_store: 32 | class_name: ExpectationsStore 33 | store_backend: 34 | class_name: TupleFilesystemStoreBackend 35 | base_directory: expectations/ 36 | 37 | validations_store: 38 | class_name: ValidationsStore 39 | store_backend: 40 | class_name: TupleFilesystemStoreBackend 41 | base_directory: uncommitted/validations/ 42 | 43 | evaluation_parameter_store: 44 | class_name: EvaluationParameterStore 45 | checkpoint_store: 46 | class_name: CheckpointStore 47 | store_backend: 48 | class_name: TupleFilesystemStoreBackend 49 | suppress_store_backend_id: true 50 | base_directory: checkpoints/ 51 | 52 | expectations_store_name: expectations_store 53 | validations_store_name: validations_store 54 | evaluation_parameter_store_name: evaluation_parameter_store 55 | checkpoint_store_name: checkpoint_store 56 | 57 | data_docs_sites: 58 | # Data Docs make it simple to visualize data quality in your project. These 59 | # include Expectations, Validations & Profiles. The are built for all 60 | # Datasources from JSON artifacts in the local repo including validations & 61 | # profiles from the uncommitted directory. 
Read more at https://docs.greatexpectations.io/en/latest/reference/core_concepts/data_docs.html 62 | local_site: 63 | class_name: SiteBuilder 64 | show_how_to_buttons: true 65 | store_backend: 66 | class_name: TupleFilesystemStoreBackend 67 | base_directory: uncommitted/data_docs/local_site/ 68 | site_index_builder: 69 | class_name: DefaultSiteIndexBuilder 70 | 71 | anonymous_usage_statistics: 72 | data_context_id: abcdabcd-1111-2222-3333-abcdabcdabcd 73 | enabled: false 74 | notebooks: 75 | concurrency: 76 | enabled: false 77 | include_rendered_content: 78 | globally: false 79 | expectation_validation_result: false 80 | expectation_suite: false 81 | -------------------------------------------------------------------------------- /include/great_expectations/notebooks/pandas/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data loaded into a Pandas DataFrame against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import json\n", 25 | "import great_expectations as ge\n", 26 | "import great_expectations.jupyter_ux\n", 27 | "from great_expectations.datasource.types import BatchKwargs\n", 28 | "import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## 1. Get a DataContext\n", 36 | "This represents your **project** that you just created using `great_expectations init`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "context = ge.data_context.DataContext()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 2. Choose an Expectation Suite\n", 53 | "\n", 54 | "List expectation suites that you created in your project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "context.list_expectation_suite_names()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "expectation_suite_name = # TODO: set to a name from the list above" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 3. 
Load a batch of data you want to validate\n", 80 | "\n", 81 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# list datasources of the type PandasDatasource in your project\n", 91 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'PandasDatasource']" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "datasource_name = # TODO: set to a datasource name from above" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# If you would like to validate a file on a filesystem:\n", 110 | "batch_kwargs = {'path': \"YOUR_FILE_PATH\", 'datasource': datasource_name}\n", 111 | "\n", 112 | "# If you already loaded the data into a Pandas Data Frame:\n", 113 | "batch_kwargs = {'dataset': \"YOUR_DATAFRAME\", 'datasource': datasource_name}\n", 114 | "\n", 115 | "\n", 116 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 117 | "batch.head()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "## 4. Validate the batch with Validation Operators\n", 125 | "\n", 126 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 127 | "multiple expectation suites and the actions that should be taken after validation.\n", 128 | "\n", 129 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 130 | "\n", 131 | "* validating a group of batches that are logically related\n", 132 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 133 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 134 | "\n", 135 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 145 | "\n", 146 | "\"\"\"\n", 147 | "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", 148 | "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", 149 | "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", 150 | "Note - any provided datetime will be assumed to be a UTC time. 
If no instantiation arguments are given, run_name will\n", 151 | "be None and run_time will default to the current UTC datetime.\n", 152 | "\"\"\"\n", 153 | "\n", 154 | "run_id = {\n", 155 | " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", 156 | " \"run_time\": datetime.datetime.now(datetime.timezone.utc)\n", 157 | "}\n", 158 | "\n", 159 | "results = context.run_validation_operator(\n", 160 | " \"action_list_operator\",\n", 161 | " assets_to_validate=[batch],\n", 162 | " run_id=run_id)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "## 5. View the Validation Results in Data Docs\n", 170 | "\n", 171 | "Let's now build and look at your Data Docs. These will now include an **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", 172 | "\n", 173 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "context.open_data_docs()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "## Congratulations! You ran Validations!\n", 190 | "\n", 191 | "## Next steps:\n", 192 | "\n", 193 | "### 1. Read about the typical workflow with Great Expectations:\n", 194 | "\n", 195 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", 196 | "\n", 197 | "### 2. Explore the documentation & community\n", 198 | "\n", 199 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." 
200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [] 208 | } 209 | ], 210 | "metadata": { 211 | "kernelspec": { 212 | "display_name": "Python 3", 213 | "language": "python", 214 | "name": "python3" 215 | }, 216 | "language_info": { 217 | "codemirror_mode": { 218 | "name": "ipython", 219 | "version": 3 220 | }, 221 | "file_extension": ".py", 222 | "mimetype": "text/x-python", 223 | "name": "python", 224 | "nbconvert_exporter": "python", 225 | "pygments_lexer": "ipython3", 226 | "version": "3.7.0" 227 | }, 228 | "pycharm": { 229 | "stem_cell": { 230 | "cell_type": "raw", 231 | "source": [], 232 | "metadata": { 233 | "collapsed": false 234 | } 235 | } 236 | } 237 | }, 238 | "nbformat": 4, 239 | "nbformat_minor": 4 240 | } 241 | -------------------------------------------------------------------------------- /include/great_expectations/notebooks/spark/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data loaded into a PySpark DataFrame against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import json\n", 25 | "import great_expectations as ge\n", 26 | "import great_expectations.jupyter_ux\n", 27 | "from great_expectations.datasource.types import BatchKwargs\n", 28 | "import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## 1. Get a DataContext\n", 36 | "This represents your **project** that you just created using `great_expectations init`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "context = ge.data_context.DataContext()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 2. Choose an Expectation Suite\n", 53 | "\n", 54 | "List expectation suites that you created in your project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "context.list_expectation_suite_names()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "expectation_suite_name = # TODO: set to a name from the list above" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 3. 
Load a batch of data you want to validate\n", 80 | "\n", 81 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# list datasources of the type SparkDFDatasource in your project\n", 91 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'SparkDFDatasource']" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "datasource_name = # TODO: set to a datasource name from above" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# If you would like to validate a file on a filesystem:\n", 110 | "batch_kwargs = {'path': \"YOUR_FILE_PATH\", 'datasource': datasource_name}\n", 111 | "# To customize how Spark reads the file, you can add options under reader_options key in batch_kwargs (e.g., header='true')\n", 112 | "\n", 113 | "# If you already loaded the data into a PySpark Data Frame:\n", 114 | "batch_kwargs = {'dataset': \"YOUR_DATAFRAME\", 'datasource': datasource_name}\n", 115 | "\n", 116 | "\n", 117 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 118 | "batch.head()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "## 4. Validate the batch with Validation Operators\n", 126 | "\n", 127 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 128 | "multiple expectation suites and the actions that should be taken after validation.\n", 129 | "\n", 130 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 131 | "\n", 132 | "* validating a group of batches that are logically related\n", 133 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 134 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 135 | "\n", 136 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 146 | "\n", 147 | "\"\"\"\n", 148 | "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", 149 | "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", 150 | "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", 151 | "Note - any provided datetime will be assumed to be a UTC time. 
If no instantiation arguments are given, run_name will\n", 152 | "be None and run_time will default to the current UTC datetime.\n", 153 | "\"\"\"\n", 154 | "\n", 155 | "run_id = {\n", 156 | " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", 157 | " \"run_time\": datetime.datetime.now(datetime.timezone.utc)\n", 158 | "}\n", 159 | "\n", 160 | "results = context.run_validation_operator(\n", 161 | " \"action_list_operator\",\n", 162 | " assets_to_validate=[batch],\n", 163 | " run_id=run_id)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "## 5. View the Validation Results in Data Docs\n", 171 | "\n", 172 | "Let's now build and look at your Data Docs. These will now include an **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", 173 | "\n", 174 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "context.open_data_docs()" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "## Congratulations! You ran Validations!\n", 191 | "\n", 192 | "## Next steps:\n", 193 | "\n", 194 | "### 1. Read about the typical workflow with Great Expectations:\n", 195 | "\n", 196 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", 197 | "\n", 198 | "### 2. Explore the documentation & community\n", 199 | "\n", 200 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." 
201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [] 209 | } 210 | ], 211 | "metadata": { 212 | "kernelspec": { 213 | "display_name": "Python 3", 214 | "language": "python", 215 | "name": "python3" 216 | }, 217 | "language_info": { 218 | "codemirror_mode": { 219 | "name": "ipython", 220 | "version": 3 221 | }, 222 | "file_extension": ".py", 223 | "mimetype": "text/x-python", 224 | "name": "python", 225 | "nbconvert_exporter": "python", 226 | "pygments_lexer": "ipython3", 227 | "version": "3.7.0" 228 | }, 229 | "pycharm": { 230 | "stem_cell": { 231 | "cell_type": "raw", 232 | "source": [], 233 | "metadata": { 234 | "collapsed": false 235 | } 236 | } 237 | } 238 | }, 239 | "nbformat": 4, 240 | "nbformat_minor": 4 241 | } 242 | -------------------------------------------------------------------------------- /include/great_expectations/notebooks/sql/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data in a SQL database against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import json\n", 25 | "import great_expectations as ge\n", 26 | "import great_expectations.jupyter_ux\n", 27 | "from great_expectations.datasource.types import BatchKwargs\n", 28 | "import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## 1. Get a DataContext\n", 36 | "This represents your **project** that you just created using `great_expectations init`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "context = ge.data_context.DataContext()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 2. Choose an Expectation Suite\n", 53 | "\n", 54 | "List expectation suites that you created in your project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "context.list_expectation_suite_names()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "expectation_suite_name = # TODO: set to a name from the list above" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 3. 
Load a batch of data you want to validate\n", 80 | "\n", 81 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# list datasources of the type SqlAlchemyDatasource in your project\n", 91 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'SqlAlchemyDatasource']" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "datasource_name = # TODO: set to a datasource name from above" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# If you would like to validate an entire table or view in your database's default schema:\n", 110 | "batch_kwargs = {'table': \"YOUR_TABLE\", 'datasource': datasource_name}\n", 111 | "\n", 112 | "# If you would like to validate an entire table or view from a non-default schema in your database:\n", 113 | "batch_kwargs = {'table': \"YOUR_TABLE\", \"schema\": \"YOUR_SCHEMA\", 'datasource': datasource_name}\n", 114 | "\n", 115 | "# If you would like to validate the result set of a query:\n", 116 | "# batch_kwargs = {'query': 'SELECT YOUR_ROWS FROM YOUR_TABLE', 'datasource': datasource_name}\n", 117 | "\n", 118 | "\n", 119 | "\n", 120 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 121 | "batch.head()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## 4. Validate the batch with Validation Operators\n", 129 | "\n", 130 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 131 | "multiple expectation suites and the actions that should be taken after validation.\n", 132 | "\n", 133 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 134 | "\n", 135 | "* validating a group of batches that are logically related\n", 136 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 137 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 138 | "\n", 139 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 149 | "\n", 150 | "\"\"\"\n", 151 | "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", 152 | "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", 153 | "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", 154 | "Note - any provided datetime will be assumed to be a UTC time. 
If no instantiation arguments are given, run_name will\n", 155 | "be None and run_time will default to the current UTC datetime.\n", 156 | "\"\"\"\n", 157 | "\n", 158 | "run_id = {\n", 159 | " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", 160 | " \"run_time\": datetime.datetime.now(datetime.timezone.utc)\n", 161 | "}\n", 162 | "\n", 163 | "results = context.run_validation_operator(\n", 164 | " \"action_list_operator\",\n", 165 | " assets_to_validate=[batch],\n", 166 | " run_id=run_id)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "## 5. View the Validation Results in Data Docs\n", 174 | "\n", 175 | "Let's now build and look at your Data Docs. These will now include an **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", 176 | "\n", 177 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "context.open_data_docs()" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "## Congratulations! You ran Validations!\n", 194 | "\n", 195 | "## Next steps:\n", 196 | "\n", 197 | "### 1. Read about the typical workflow with Great Expectations:\n", 198 | "\n", 199 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", 200 | "\n", 201 | "### 2. Explore the documentation & community\n", 202 | "\n", 203 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." 
204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [] 212 | } 213 | ], 214 | "metadata": { 215 | "kernelspec": { 216 | "display_name": "Python 3", 217 | "language": "python", 218 | "name": "python3" 219 | }, 220 | "language_info": { 221 | "codemirror_mode": { 222 | "name": "ipython", 223 | "version": 3 224 | }, 225 | "file_extension": ".py", 226 | "mimetype": "text/x-python", 227 | "name": "python", 228 | "nbconvert_exporter": "python", 229 | "pygments_lexer": "ipython3", 230 | "version": "3.7.0" 231 | }, 232 | "pycharm": { 233 | "stem_cell": { 234 | "cell_type": "raw", 235 | "source": [], 236 | "metadata": { 237 | "collapsed": false 238 | } 239 | } 240 | } 241 | }, 242 | "nbformat": 4, 243 | "nbformat_minor": 4 244 | } 245 | -------------------------------------------------------------------------------- /include/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css: -------------------------------------------------------------------------------- 1 | /*index page*/ 2 | .ge-index-page-site-name-title {} 3 | .ge-index-page-table-container {} 4 | .ge-index-page-table {} 5 | .ge-index-page-table-profiling-links-header {} 6 | .ge-index-page-table-expectations-links-header {} 7 | .ge-index-page-table-validations-links-header {} 8 | .ge-index-page-table-profiling-links-list {} 9 | .ge-index-page-table-profiling-links-item {} 10 | .ge-index-page-table-expectation-suite-link {} 11 | .ge-index-page-table-validation-links-list {} 12 | .ge-index-page-table-validation-links-item {} 13 | 14 | /*breadcrumbs*/ 15 | .ge-breadcrumbs {} 16 | .ge-breadcrumbs-item {} 17 | 18 | /*navigation sidebar*/ 19 | .ge-navigation-sidebar-container {} 20 | .ge-navigation-sidebar-content {} 21 | .ge-navigation-sidebar-title {} 22 | .ge-navigation-sidebar-link {} 23 | -------------------------------------------------------------------------------- /include/grid_configs.py: -------------------------------------------------------------------------------- 1 | from numpy.random.mtrand import seed 2 | from sklearn.linear_model import LogisticRegression 3 | import lightgbm as lgb 4 | 5 | 6 | 7 | models = { 8 | 'lgbm': lgb.LGBMClassifier(objective='binary', metric=['auc', 'binary_logloss'], seed=55, boosting_type='gbdt'), 9 | 'log_reg': LogisticRegression(max_iter=500) 10 | } 11 | 12 | params = { 13 | 'lgbm':{ 14 | 'learning_rate': [0.01, .05, .1], 15 | 'n_estimators': [50, 100, 150], 16 | 'num_leaves': [31, 40, 80], 17 | 'max_depth': [16, 24, 31, 40] 18 | }, 19 | 'log_reg':{ 20 | 'penalty': ['l1','l2','elasticnet'], 21 | 'C': [0.001, 0.01, 0.1, 1, 10, 100], 22 | 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'] 23 | } 24 | } -------------------------------------------------------------------------------- /include/libs/schema_reg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/airflow-data-quality-demo/8847b1c9e749966a762ed5b9fa8d2075d4772352/include/libs/schema_reg/__init__.py -------------------------------------------------------------------------------- /include/libs/schema_reg/base_schema_transforms.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses schema definitions in include/libs/table_schemas/snowflake//.json 3 | """ 4 | import json 5 | import logging 6 | import pandas as pd 7 | import numpy as np 8 | 9 | ### RELIES ON TEMPORARY SCHEMA DESIGN FOR 
ORDERING ##### 10 | # "id": { 11 | # "type": "varchar(25)", 12 | # "description": "source_order=1" 13 | # } 14 | def get_schema_source_col(key, prop): 15 | """ 16 | Temporary solution to map source columns to 17 | destination columns. 18 | Added due to some unexpected column renaming in houston 19 | and needing fixing immediately. 20 | """ 21 | if prop.get('default_source', None): 22 | return prop.get('default_source').split('=')[1] 23 | else: 24 | return key 25 | 26 | def get_schema_order(prop: dict,) -> str: 27 | """ 28 | Get 'description' field from schema definition file 29 | """ 30 | 31 | if prop.get('description', None): 32 | return int(prop.get('description').split('=')[1]) 33 | else: 34 | return -1 35 | 36 | def get_schema_type(prop: dict,) -> str: 37 | """ 38 | Get 'type' field from schema definition file 39 | """ 40 | 41 | return prop.get('type', None) 42 | 43 | 44 | def get_schema_raw_columns(table_props: dict) -> list: 45 | """ 46 | This can be used to add headers to a CSV or resolve 47 | columns that should only be included source data cleaning 48 | """ 49 | return [key for key, val in table_props.items() 50 | if val.get('description') is not None] 51 | 52 | 53 | def get_table_def_schema(TABLE_SCHEMA_DIR, transform_db, transform_schema): 54 | """ 55 | Fow now, we'll access local json files. This is meant to evolve 56 | """ 57 | with open(f'{TABLE_SCHEMA_DIR}/{transform_db}/{transform_schema}.json', 'r') as f: 58 | table_schema = json.load(f) 59 | table_def = table_schema.get('definitions') 60 | return table_def 61 | 62 | 63 | def snowflake_load_column_string(table_props: dict) -> str: 64 | """ 65 | Use the json table definition to build string necessary for 66 | selecting fields of interest for loading (i.e. omit passwords) 67 | Scrub it 68 | :param table_props: python dictionary of table properties dictionary 69 | from houston.json schema def 70 | :type table_props: dict 71 | :return col_string: string encoded for select on COPY ($1,$3,$6,et...) 
72 | """ 73 | # This only works if 'description' is not None 74 | try: 75 | vals = [f"${val.get('description','').split('=')[1]}" 76 | for key, val in table_props.items() 77 | if key not in ['insert_timestamp','hash_diff'] 78 | ] 79 | except Exception as e: 80 | logging.error('Bad Table Def Schema %s' % e) 81 | raise 82 | col_string = ','.join(vals) 83 | return col_string 84 | 85 | 86 | def resolve_schemas(df: pd.DataFrame, table_props: dict) -> pd.DataFrame: 87 | """ 88 | Take a dataframe with raw data and remove or rename columns to match the 89 | table schema, and if any schema columns aren't in the dataframe, 90 | set null values for those columns 91 | """ 92 | df.columns = map(str.lower, df.columns) 93 | # get returned columns, then map returned column names to 94 | # new names (old: new) 95 | col_mapping = {get_schema_source_col(k, v): k.lower() 96 | for k,v in table_props.items()} 97 | 98 | df.rename(columns=col_mapping, inplace=True) 99 | current_cols = df.columns.tolist() 100 | 101 | schema_orders = {k.lower(): get_schema_order(v) 102 | for k,v in table_props.items() 103 | if get_schema_order(v) != -1} 104 | 105 | schema_cols = list(schema_orders.keys()) 106 | 107 | # Take the values we have defined in the schema and set the order 108 | schema_inters_cols = list(set(current_cols).intersection(schema_cols)) 109 | schema_inters_cols.sort(key=schema_orders.__getitem__) 110 | 111 | if schema_inters_cols: 112 | df = df.loc[:,schema_inters_cols] 113 | else: 114 | raise ValueError(f"Bad Schema Design in Sorting Columns {df.columns.tolist()}") 115 | 116 | # Check if any columns exist in full schema cols 117 | # that aren't in schema intersection cols 118 | # If so, add them with null values 119 | remaining = list(set(schema_cols) - set(schema_inters_cols)) 120 | if remaining: 121 | for col in remaining: 122 | prop = table_props.get(col) 123 | if prop.get('default', None): 124 | df.loc[:,col] = prop.get('default') 125 | else: 126 | dtype = get_schema_type(prop).lower() 127 | if (('varchar' in dtype) or ('text' in dtype) or 128 | ('string' in dtype)): 129 | df.loc[:,col] = None 130 | elif (('number' in dtype) or ('timestamp' in dtype) or 131 | ('float' in dtype) or ('int' in dtype)): 132 | df.loc[:,col] = np.nan 133 | elif 'bool' in dtype: 134 | df.loc[:,col] = False 135 | else: 136 | df.loc[:,col] = '' 137 | 138 | # make sure we rearrange columns into schema order 139 | schema_cols.sort(key=schema_orders.__getitem__) 140 | return df.loc[:,schema_cols] 141 | -------------------------------------------------------------------------------- /include/metrics.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import matplotlib.pyplot as plt 3 | import mlflow 4 | import numpy as np 5 | from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_curve, roc_auc_score 6 | import pandas as pd 7 | 8 | 9 | 10 | def log_roc_curve(y_test: list, y_pred: list): 11 | fpr, tpr, thresholds = roc_curve(y_test, y_pred) 12 | plt.plot(fpr,tpr) 13 | plt.xlabel('False Positive Rate') 14 | plt.ylabel('True Positive Rate') 15 | plt.title('ROC Curve') 16 | plt.savefig("roc_curve.png") 17 | mlflow.log_artifact("roc_curve.png") 18 | plt.close() 19 | 20 | 21 | def log_confusion_matrix(y_test: list, y_pred: list): 22 | cm = confusion_matrix(y_test, y_pred) 23 | t_n, f_p, f_n, t_p = cm.ravel() 24 | mlflow.log_metrics({'True Positive': t_p, 'True Negative': t_n, 'False Positive': f_p, 'False Negative': f_n}) 25 | 26 | 
ConfusionMatrixDisplay.from_predictions(y_test, y_pred) 27 | plt.savefig("confusion_matrix.png") 28 | mlflow.log_artifact("confusion_matrix.png") 29 | plt.close() 30 | 31 | 32 | def log_classification_report(y_test: list, y_pred: list): 33 | cr = classification_report(y_test, y_pred, output_dict=True) 34 | logging.info(cr) 35 | cr_metrics = pd.json_normalize(cr, sep='_').to_dict(orient='records')[0] 36 | mlflow.log_metrics(cr_metrics) 37 | 38 | 39 | def log_all_eval_metrics(y_test: list, y_pred: list): 40 | 41 | # Classification Report 42 | log_classification_report(y_test, y_pred) 43 | 44 | # Confusion Matrix 45 | log_confusion_matrix(y_test, y_pred) 46 | 47 | # ROC Curve 48 | log_roc_curve(y_test, y_pred) 49 | 50 | # AUC Score 51 | mlflow.log_metric('test_auc_score', roc_auc_score(y_test, y_pred)) 52 | 53 | 54 | def test(clf, test_set): 55 | logging.info('Gathering Validation set results') 56 | y_pred = clf.predict(test_set) 57 | 58 | return np.where(y_pred > 0.5, 1, 0) -------------------------------------------------------------------------------- /include/sample_data/cost_data/cost_data.csv: -------------------------------------------------------------------------------- 1 | 1,150000,32000,10000 2 | 2,200000,50000,50000 3 | 3,90000,120000,300000 4 | 4,230000,14000,7000 5 | 5,98000,27000,48000 6 | 6,72000,800000,0 7 | 7,50000,2500000,0 8 | 8,8000000,33000000,0 9 | 9,6325000,450000,76000 10 | -------------------------------------------------------------------------------- /include/sample_data/forestfire_data/forestfires.csv: -------------------------------------------------------------------------------- 1 | 1,2,aug,fri,91,166.9,752.6,7.1,25.9,41,3.6,0,100 2 | 2,2,feb,mon,84,9.3,34,2.1,13.9,40,5.4,0,57.8 3 | 3,4,mar,sat,69,2.4,15.5,0.7,17.4,24,5.4,0,92.9 4 | 4,4,mar,mon,87.2,23.9,64.7,4.1,11.8,35,1.8,0,1300 5 | 5,5,mar,sat,91.7,35.8,80.8,7.8,15.1,27,5.4,0,4857 6 | 6,5,sep,wed,92.9,133.3,699.6,9.2,26.4,21,4.5,0,9800 7 | 7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0,14 8 | 8,6,mar,fri,91.7,33.3,77.5,9,8.3,97,4,0.2,74.5 9 | 9,9,feb,thu,84.2,6.8,26.6,7.7,6.7,79,3.1,0,8880.7 10 | -------------------------------------------------------------------------------- /include/sample_data/forestfire_data/forestfires_corrupt.csv: -------------------------------------------------------------------------------- 1 | one,2,aug,fri,91,166.9,752.6,7.1,25.9,41,3.6,0,0 2 | two,2,feb,mon,84,9.3,34,2.1,13.9,40,5.4,0,0 3 | three,4,mar,satoorday,69,2.4,15.5,0.7,17.4,24,5.4,0,0 4 | four,4,mar,mon,87.2,23.9,64.7,4.1,11.8,35,1.8,0,0 5 | five,5,mar,sat,91.7,35.8,80.8,7.8,15.1,27,5.4,0,0 6 | six,5,sep,wed,92.9,133.3,abcd,9.2,26.4,21,4.5,0,0 7 | seven,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0,0 8 | eight,6,mar,fri,91.7,33.3,77.5,9,8.3,97,4,0.2,0 9 | nine,9,fb,thu,84.2,6.8,26.6,7.7,6.7,79,3.1,0,0 10 | -------------------------------------------------------------------------------- /include/sample_data/forestfire_data/forestfires_invalid.csv: -------------------------------------------------------------------------------- 1 | 1,2,ag,fri,91,166.9,752.6,7.1,25.9,41,3.6,0,0 2 | 2,2,feb,mon,84,9001,34,2.1,13.9,40,5.4,0,0 3 | 3,4,mar,satoorday,69,2.4,15.5,0.7,17.4,24,5.4,0,0 4 | 4,4,mar,mon,87.2,23.9,64.7,4.1,11.8,35,1.8,0,0 5 | 5,5,mar,sat,91.7,35.8,80.8,7.8,15.1,27,5.4,0,0 6 | 6,5,sep,wed,92.9,133.3,5,9.2,26.4,21,4.5,0,0 7 | 7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0,0 8 | 8,6,mar,fri,91.7,33.3,77.5,9,8.3,97,4,0.2,0 9 | 9,9,fb,thu,84.2,6.8,26.6,7.7,6.7,79,3.1,0,0 10 | 
-------------------------------------------------------------------------------- /include/sql/bigquery_examples/load_bigquery_forestfire_data.sql: -------------------------------------------------------------------------------- 1 | INSERT simple_bigquery_example_dag.forestfires VALUES 2 | (1,2,'aug','fri',91,166.9,752.6,7.1,25.9,41,3.6,0,0), 3 | (2,2,'feb','mon',84,9.3,34,2.1,13.9,40,5.4,0,0), 4 | (3,4,'mar','sat',69,2.4,15.5,0.7,17.4,24,5.4,0,0), 5 | (4,4,'mar','mon',87.2,23.9,64.7,4.1,11.8,35,1.8,0,0), 6 | (5,5,'mar','sat',91.7,35.8,80.8,7.8,15.1,27,5.4,0,0), 7 | (6,5,'sep','wed',92.9,133.3,699.6,9.2,26.4,21,4.5,0,0), 8 | (7,5,'mar','fri',86.2,26.2,94.3,5.1,8.2,51,6.7,0,0), 9 | (8,6,'mar','fri',91.7,33.3,77.5,9,8.3,97,4,0.2,0), 10 | (9,9,'feb','thu',84.2,6.8,26.6,7.7,6.7,79,3.1,0,0); 11 | -------------------------------------------------------------------------------- /include/sql/bigquery_examples/row_quality_bigquery_forestfire_check.sql: -------------------------------------------------------------------------------- 1 | -- Query to check if row items match particular parameters passed in by Operator. 2 | SELECT ID, 3 | CASE y WHEN {{ params.y }} THEN 1 ELSE 0 END AS y_check, 4 | CASE month WHEN '{{ params.month }}' THEN 1 ELSE 0 END AS month_check, 5 | CASE day WHEN '{{ params.day }}' THEN 1 ELSE 0 END AS day_check, 6 | CASE ffmc WHEN {{ params.ffmc }} THEN 1 ELSE 0 END AS ffmc_check, 7 | CASE dmc WHEN {{ params.dmc }} THEN 1 ELSE 0 END AS dmc_check, 8 | CASE dc WHEN {{ params.dc }} THEN 1 ELSE 0 END AS dc_check, 9 | CASE isi WHEN {{ params.isi }} THEN 1 ELSE 0 END AS isi_check, 10 | CASE temp WHEN {{ params.temp }} THEN 1 ELSE 0 END AS temp_check, 11 | CASE rh WHEN {{ params.rh }} THEN 1 ELSE 0 END AS rh_check, 12 | CASE wind WHEN {{ params.wind }} THEN 1 ELSE 0 END AS wind_check, 13 | CASE rain WHEN {{ params.rain }} THEN 1 ELSE 0 END AS rain_check, 14 | CASE area WHEN {{ params.area }} THEN 1 ELSE 0 END AS area_check 15 | FROM {{ params.dataset }}.{{ params.table }} 16 | WHERE ID = {{ params.id }} 17 | -------------------------------------------------------------------------------- /include/sql/dbt_examples/copy_store_failures.sql: -------------------------------------------------------------------------------- 1 | -- Load store_failures dbt data from the default, overwritten table to a permanent table 2 | INSERT INTO {{ params.destination_table }} ({{ params.columns }}) 3 | SELECT {{ params.columns }} 4 | FROM {{ params.source_table }}; 5 | -------------------------------------------------------------------------------- /include/sql/firebolt_examples/create_table.sql: -------------------------------------------------------------------------------- 1 | CREATE FACT TABLE IF NOT EXISTS {{ conn.firebolt_default.schema }}.{{ params.table }} 2 | ( 3 | id INT, 4 | y INT, 5 | month VARCHAR(25), 6 | day VARCHAR(25), 7 | ffmc FLOAT, 8 | dmc FLOAT, 9 | dc FLOAT, 10 | isi FLOAT, 11 | temp FLOAT, 12 | rh FLOAT, 13 | wind FLOAT, 14 | rain FLOAT, 15 | area FLOAT 16 | ) PRIMARY INDEX id; 17 | -------------------------------------------------------------------------------- /include/sql/firebolt_examples/drop_table.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS {{ params.table }}; 2 | -------------------------------------------------------------------------------- /include/sql/firebolt_examples/load_forestfire_data.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO {{ params.table }} 
VALUES 2 | (1,2,'aug','fri',91,166.9,752.6,7.1,25.9,41,3.6,0,0), 3 | (2,2,'feb','mon',84,9.3,34,2.1,13.9,40,5.4,0,0), 4 | (3,4,'mar','sat',69,2.4,15.5,0.7,17.4,24,5.4,0,0), 5 | (4,4,'mar','mon',87.2,23.9,64.7,4.1,11.8,35,1.8,0,0), 6 | (5,5,'mar','sat',91.7,35.8,80.8,7.8,15.1,27,5.4,0,0), 7 | (6,5,'sep','wed',92.9,133.3,699.6,9.2,26.4,21,4.5,0,0), 8 | (7,5,'mar','fri',86.2,26.2,94.3,5.1,8.2,51,6.7,0,0), 9 | (8,6,'mar','fri',91.7,33.3,77.5,9,8.3,97,4,0.2,0), 10 | (9,9,'feb','thu',84.2,6.8,26.6,7.7,6.7,79,3.1,0,0); 11 | -------------------------------------------------------------------------------- /include/sql/firebolt_examples/quality_check_template.sql: -------------------------------------------------------------------------------- 1 | SELECT MIN({{ params.col }}) 2 | FROM( 3 | SELECT 4 | CASE WHEN {{ params.check_statement }} THEN 1 ELSE 0 END AS {{ params.col }} 5 | FROM {{ params.table }} 6 | ) 7 | -------------------------------------------------------------------------------- /include/sql/great_expectations_examples/copy_yellow_tripdata_snowflake_staging.sql: -------------------------------------------------------------------------------- 1 | {% set table_schema = params.table_schema %} 2 | MERGE INTO {{ conn.snowflake_default.schema }}.{{ params.table_name }} as dest 3 | USING ( 4 | SELECT * 5 | FROM 6 | {{ conn.snowflake_default.schema }}.{{ params.audit_table_name }} 7 | ) as stg 8 | ON dest.PICKUP_DATETIME = stg.PICKUP_DATETIME 9 | AND dest.DROPOFF_DATETIME = stg.DROPOFF_DATETIME 10 | WHEN NOT MATCHED THEN 11 | INSERT ( 12 | {%- for name, col_dict in table_schema.items() -%} 13 | {%- if loop.first %} 14 | {{ name }} 15 | {%- else %} 16 | ,{{ name }} 17 | {%- endif %} 18 | {%- endfor %} 19 | ) 20 | VALUES 21 | ( 22 | {% for name, col_dict in table_schema.items() %} 23 | {%- if not loop.first %} 24 | ,{%- endif -%} 25 | {%- if 'default' in col_dict.keys() -%} 26 | COALESCE(stg.{{ name }}, '{{col_dict.get('default', 'missing_value')}}') 27 | {%- else -%} 28 | stg.{{ name }} 29 | {%- endif -%} 30 | {%- endfor %} 31 | ) 32 | -------------------------------------------------------------------------------- /include/sql/great_expectations_examples/create_snowflake_yellow_tripdata_stage.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE STAGE {{ params.stage_name }} url=s3://{{ var.json.aws_configs.s3_bucket }} 2 | credentials=(aws_key_id='{{ conn.aws_default.login }}' aws_secret_key='{{ conn.aws_default.password }}') 3 | file_format=(type = 'CSV', skip_header = 1, time_format = 'YYYY-MM-DD HH24:MI:SS'); 4 | -------------------------------------------------------------------------------- /include/sql/great_expectations_examples/create_yellow_tripdata_redshift_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS {{ params.table_name }} 2 | (vendor_id int, 3 | pickup_datetime timestamp, 4 | dropoff_datetime timestamp, 5 | passenger_count int, 6 | trip_distance float, 7 | rate_code_id int, 8 | store_and_fwd_flag varchar, 9 | pickup_location_id int, 10 | dropoff_location_id int, 11 | payment_type int, 12 | fare_amount float, 13 | extra float, 14 | mta_tax float, 15 | tip_amount float, 16 | tolls_amount float, 17 | improvement_surcharge float, 18 | total_amount float, 19 | congestion_surcharge float); 20 | -------------------------------------------------------------------------------- 
/include/sql/great_expectations_examples/create_yellow_tripdata_snowflake_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS {{ conn.snowflake_default.schema }}.{{ params.table_name }} 2 | (vendor_id int, 3 | pickup_datetime timestamp, 4 | dropoff_datetime timestamp, 5 | passenger_count int, 6 | trip_distance float, 7 | rate_code_id int, 8 | store_and_fwd_flag varchar, 9 | pickup_location_id int, 10 | dropoff_location_id int, 11 | payment_type int, 12 | fare_amount float, 13 | extra float, 14 | mta_tax float, 15 | tip_amount float, 16 | tolls_amount float, 17 | improvement_surcharge float, 18 | total_amount float, 19 | congestion_surcharge float); 20 | -------------------------------------------------------------------------------- /include/sql/great_expectations_examples/delete_yellow_tripdata_table.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS {{ params.table_name }}; 2 | -------------------------------------------------------------------------------- /include/sql/great_expectations_examples/table_schemas/tripdata_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "yellow_tripdata_example", 3 | "yellow_tripdata": { 4 | "properties": { 5 | "VENDOR_ID": {"type": "int", "description": "source_order=1"}, 6 | "PICKUP_DATETIME": {"type": "timestamp", "description": "source_order=2"}, 7 | "DROPOFF_DATETIME": {"type": "timestamp", "description": "source_order=3"}, 8 | "PASSENGER_COUNT": {"type": "int", "description": "source_order=4"}, 9 | "TRIP_DISTANCE": {"type": "float", "description": "source_order=5"}, 10 | "RATE_CODE_ID": {"type": "int", "description": "source_order=6"}, 11 | "STORE_AND_FWD_FLAG": {"type": "varchar(64)", "description": "source_order=7"}, 12 | "PICKUP_LOCATION_ID": {"type": "int", "description": "source_order=8"}, 13 | "DROPOFF_LOCATION_ID": {"type": "int", "description": "source_order=9"}, 14 | "PAYMENT_TYPE": {"type": "int", "description": "source_order=10"}, 15 | "FARE_AMOUNT": {"type": "float", "description": "source_order=11"}, 16 | "EXTRA": {"type": "float", "description": "source_order=12"}, 17 | "MTA_TAX": {"type": "float", "description": "source_order=13"}, 18 | "TIP_AMOUNT": {"type": "float", "description": "source_order=14"}, 19 | "TOLLS_AMOUNT": {"type": "float", "description": "source_order=15"}, 20 | "IMPROVEMENT_SURCHARGE": {"type": "float", "description": "source_order=16"}, 21 | "TOTAL_AMOUNT": {"type": "float", "description": "source_order=17"}, 22 | "CONGESTION_SURCHARGE": {"type": "float", "description": "source_order=18"} 23 | }, 24 | "dimensions": ["vendor_id", 25 | "pickup_datetime", 26 | "dropoff_datetime"], 27 | "metrics":["passenger_count", 28 | "trip_distance", 29 | "rate_code_id", 30 | "store_and_fwd_flag", 31 | "pickup_location_id", 32 | "dropoff_location_id", 33 | "payment_type", 34 | "fare_amount", 35 | "extra", 36 | "mta_tax", 37 | "tip_amount", 38 | "tolls_amount", 39 | "improvement_surcharge", 40 | "total_amount", 41 | "congestion_surcharge"], 42 | "cluster_keys": { 43 | "columns":["VENDOR_ID","PICKUP_DATETIME","DROPOFF_DATETIME"], 44 | "description": null 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /include/sql/redshift_examples/create_redshift_forestfire_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS {{ 
var.json.aws_configs.redshift_table }} 2 | (ID int,Y int,month varchar,day varchar,FFMC float,DMC float,DC float,ISI float,temp float,RH float,wind float,rain float,area float); 3 | -------------------------------------------------------------------------------- /include/sql/redshift_examples/drop_redshift_forestfire_table.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS {{ var.json.aws_configs.redshift_table }}; 2 | -------------------------------------------------------------------------------- /include/sql/redshift_examples/row_quality_redshift_forestfire_check.sql: -------------------------------------------------------------------------------- 1 | -- Query to check if row items match particular parameters passed in by Operator. 2 | SELECT ID, 3 | CASE y WHEN {{ params.y }} THEN 1 ELSE 0 END AS y_check, 4 | CASE month WHEN '{{ params.month }}' THEN 1 ELSE 0 END AS month_check, 5 | CASE day WHEN '{{ params.day }}' THEN 1 ELSE 0 END AS day_check, 6 | CASE ffmc WHEN {{ params.ffmc }} THEN 1 ELSE 0 END AS ffmc_check, 7 | CASE dmc WHEN {{ params.dmc }} THEN 1 ELSE 0 END AS dmc_check, 8 | CASE dc WHEN {{ params.dc }} THEN 1 ELSE 0 END AS dc_check, 9 | CASE isi WHEN {{ params.isi }} THEN 1 ELSE 0 END AS isi_check, 10 | CASE temp WHEN {{ params.temp }} THEN 1 ELSE 0 END AS temp_check, 11 | CASE rh WHEN {{ params.rh }} THEN 1 ELSE 0 END AS rh_check, 12 | CASE wind WHEN {{ params.wind }} THEN 1 ELSE 0 END AS wind_check, 13 | CASE rain WHEN {{ params.rain }} THEN 1 ELSE 0 END AS rain_check, 14 | CASE area WHEN {{ params.area }} THEN 1 ELSE 0 END AS area_check 15 | FROM {{ var.json.aws_configs.redshift_table }} 16 | WHERE ID = {{ params.id }} 17 | -------------------------------------------------------------------------------- /include/sql/redshift_examples/validate_redshift_forestfire_load.sql: -------------------------------------------------------------------------------- 1 | -- SQL query to validate the upload of forestfires.csv 2 | SELECT CASE 0 WHEN COUNT(trim(filename)) THEN 1 ELSE 0 END as filename_check 3 | FROM stl_load_errors 4 | WHERE filename LIKE '%{{ params.filename }}%' 5 | LIMIT 1; 6 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/copy_forestfire_snowflake_audit.sql: -------------------------------------------------------------------------------- 1 | {% set table_schema = params.table_schema %} 2 | MERGE INTO {{ conn.snowflake_default.schema }}.{{ params.table_name }} as dest 3 | USING ( 4 | SELECT * 5 | FROM 6 | {{ conn.snowflake_default.schema }}.{{ params.audit_table_name }} 7 | ) as stg 8 | ON dest.ID = stg.ID 9 | WHEN NOT MATCHED THEN 10 | INSERT ( 11 | {%- for name, col_dict in table_schema.items() -%} 12 | {%- if loop.first %} 13 | {{ name }} 14 | {%- else %} 15 | ,{{ name }} 16 | {%- endif %} 17 | {%- endfor %} 18 | ) 19 | VALUES 20 | ( 21 | {% for name, col_dict in table_schema.items() %} 22 | {%- if not loop.first %} 23 | ,{%- endif -%} 24 | {%- if 'default' in col_dict.keys() -%} 25 | COALESCE(stg.{{ name }}, '{{col_dict.get('default', 'missing_value')}}') 26 | {%- else -%} 27 | stg.{{ name }} 28 | {%- endif -%} 29 | {%- endfor %} 30 | ) 31 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/create_cost_table.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE TRANSIENT TABLE {{ params.table_name }} 2 | (id INT, 3 | 
land_damage_cost INT, 4 | property_damage_cost INT, 5 | lost_profits_cost INT); 6 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/create_forestfire_cost_table.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE TRANSIENT TABLE {{ params.table_name }} 2 | (id INT, 3 | land_damage_cost INT, 4 | property_damage_cost INT, 5 | lost_profits_cost INT, 6 | total_cost INT, 7 | y INT, 8 | month VARCHAR(25), 9 | day VARCHAR(25), 10 | area FLOAT); 11 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/create_forestfire_table.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE TRANSIENT TABLE {{ params.table_name }} 2 | (id INT, 3 | y INT, 4 | month VARCHAR(25), 5 | day VARCHAR(25), 6 | ffmc FLOAT, 7 | dmc FLOAT, 8 | dc FLOAT, 9 | isi FLOAT, 10 | temp FLOAT, 11 | rh FLOAT, 12 | wind FLOAT, 13 | rain FLOAT, 14 | area FLOAT); 15 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/create_snowflake_yellow_tripdata_stage.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE STAGE {{ params.stage_name }} url=s3://{{ var.json.aws_configs.s3_bucket }} 2 | credentials=(aws_key_id='{{ conn.aws_default.login }}' aws_secret_key='{{ conn.aws_default.password }}') 3 | file_format=(type = 'CSV', skip_header = 1, time_format = 'YYYY-MM-DD HH24:MI:SS'); 4 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/create_snowflake_yellow_tripdata_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS {{ conn.snowflake_default.schema }}.{{ params.table_name }} 2 | ( 3 | vendor_id int, 4 | pickup_datetime timestamp, 5 | dropoff_datetime timestamp, 6 | passenger_count int, 7 | trip_distance float, 8 | rate_code_id int, 9 | store_and_fwd_flag varchar, 10 | pickup_location_id int, 11 | dropoff_location_id int, 12 | payment_type int, 13 | fare_amount float, 14 | extra float, 15 | mta_tax float, 16 | tip_amount float, 17 | tolls_amount float, 18 | improvement_surcharge float, 19 | total_amount float, 20 | congestion_surcharge float 21 | ); 22 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/delete_forestfire_table.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS {{ params.table_name }}; 2 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/delete_snowflake_table.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS {{ params.table_name }}; 2 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/load_cost_data.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO {{ params.table_name }} VALUES 2 | (1,150000,32000,10000), 3 | (2,200000,50000,50000), 4 | (3,90000,120000,300000), 5 | (4,230000,14000,7000), 6 | (5,98000,27000,48000), 7 | (6,72000,800000,0), 8 | (7,50000,2500000,0), 9 | (8,8000000,33000000,0), 10 | (9,6325000,450000,76000); 11 | 
-------------------------------------------------------------------------------- /include/sql/snowflake_examples/load_forestfire_cost_data.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO forestfire_costs (id, land_damage_cost, property_damage_cost, lost_profits_cost, total_cost, y, month, day, area) 2 | SELECT 3 | c.id, 4 | c.land_damage_cost, 5 | c.property_damage_cost, 6 | c.lost_profits_cost, 7 | c.land_damage_cost + c.property_damage_cost + c.lost_profits_cost, 8 | ff.y, 9 | ff.month, 10 | ff.day, 11 | ff.area 12 | FROM costs c 13 | LEFT JOIN forestfires ff 14 | ON c.id = ff.id 15 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/load_snowflake_forestfire_data.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO {{ params.table_name }} VALUES 2 | (1,2,'aug','fri',91,166.9,752.6,7.1,25.9,41,3.6,0,100), 3 | (2,2,'feb','mon',84,9.3,34,2.1,13.9,40,5.4,0,57.8), 4 | (3,4,'mar','sat',69,2.4,15.5,0.7,17.4,24,5.4,0,92.9), 5 | (4,4,'mar','mon',87.2,23.9,64.7,4.1,11.8,35,1.8,0,1300), 6 | (5,5,'mar','sat',91.7,35.8,80.8,7.8,15.1,27,5.4,0,4857), 7 | (6,5,'sep','wed',92.9,133.3,699.6,9.2,26.4,21,4.5,0,9800), 8 | (7,5,'mar','fri',86.2,26.2,94.3,5.1,8.2,51,6.7,0,14), 9 | (8,6,'mar','fri',91.7,33.3,77.5,9,8.3,97,4,0.2,74.5), 10 | (9,9,'feb','thu',84.2,6.8,26.6,7.7,6.7,79,3.1,0,8880.7); 11 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/row_quality_snowflake_forestfire_check.sql: -------------------------------------------------------------------------------- 1 | -- Query to check if row items match particular parameters passed in by Operator. 
2 | SELECT ID, 3 | CASE y WHEN {{ params.y }} THEN 1 ELSE 0 END AS y_check, 4 | CASE month WHEN '{{ params.month }}' THEN 1 ELSE 0 END AS month_check, 5 | CASE day WHEN '{{ params.day }}' THEN 1 ELSE 0 END AS day_check, 6 | CASE ffmc WHEN {{ params.ffmc }} THEN 1 ELSE 0 END AS ffmc_check, 7 | CASE dmc WHEN {{ params.dmc }} THEN 1 ELSE 0 END AS dmc_check, 8 | CASE dc WHEN {{ params.dc }} THEN 1 ELSE 0 END AS dc_check, 9 | CASE isi WHEN {{ params.isi }} THEN 1 ELSE 0 END AS isi_check, 10 | CASE temp WHEN {{ params.temp }} THEN 1 ELSE 0 END AS temp_check, 11 | CASE rh WHEN {{ params.rh }} THEN 1 ELSE 0 END AS rh_check, 12 | CASE wind WHEN {{ params.wind }} THEN 1 ELSE 0 END AS wind_check, 13 | CASE rain WHEN {{ params.rain }} THEN 1 ELSE 0 END AS rain_check, 14 | CASE area WHEN {{ params.area }} THEN 1 ELSE 0 END AS area_check 15 | FROM {{ params.table_name }} 16 | WHERE ID = {{ params.id }} 17 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/row_quality_yellow_tripdata_check.sql: -------------------------------------------------------------------------------- 1 | -- Query to check row items 2 | SELECT vendor_id, pickup_datetime, 3 | CASE WHEN dropoff_datetime > pickup_datetime THEN 1 ELSE 0 END AS date_check, 4 | CASE WHEN passenger_count >= 0 THEN 1 ELSE 0 END AS passenger_count_check, 5 | CASE WHEN trip_distance >= 0 AND trip_distance <= 100 THEN 1 ELSE 0 END AS trip_distance_check, 6 | CASE WHEN ROUND((fare_amount + extra + mta_tax + tip_amount + improvement_surcharge + COALESCE(congestion_surcharge, 0)), 1) = ROUND(total_amount, 1) THEN 1 7 | WHEN ROUND(fare_amount + extra + mta_tax + tip_amount + improvement_surcharge, 1) = ROUND(total_amount, 1) THEN 1 ELSE 0 END AS fare_check 8 | FROM {{ params.table }} 9 | WHERE pickup_datetime IN (SELECT pickup_datetime FROM {{ params.table }} ORDER BY RANDOM() LIMIT 1) 10 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/row_quality_yellow_tripdata_template.sql: -------------------------------------------------------------------------------- 1 | -- Template to check various columns in the yellow tripdata data set. 
2 | SELECT MIN({{ params.check_name }}) 3 | FROM( 4 | SELECT 5 | CASE WHEN {{ params.check_statement }} THEN 1 ELSE 0 END AS {{ params.check_name }} 6 | FROM {{ params.table }} 7 | ) 8 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/table_schemas/forestfire_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "forestfire_example", 3 | "forestfire": { 4 | "properties": { 5 | "ID": {"type": "int", "description": "source_order=1"}, 6 | "Y": {"type": "int", "description": "source_order=2"}, 7 | "MONTH": {"type": "varchar(25)", "description": "source_order=3"}, 8 | "DAY": {"type": "varchar(25)", "description": "source_order=4"}, 9 | "FFMC": {"type": "float", "description": "source_order=5"}, 10 | "DMC": {"type": "float", "description": "source_order=6"}, 11 | "DC": {"type": "float", "description": "source_order=7"}, 12 | "ISI": {"type": "float", "description": "source_order=8"}, 13 | "TEMP": {"type": "float", "description": "source_order=9"}, 14 | "RH": {"type": "float", "description": "source_order=10"}, 15 | "WIND": {"type": "float", "description": "source_order=11"}, 16 | "RAIN": {"type": "float", "description": "source_order=12"}, 17 | "AREA": {"type": "float", "description": "source_order=13"} 18 | }, 19 | "dimensions": ["id"], 20 | "metrics":[ 21 | "y", 22 | "month", 23 | "day", 24 | "ffmc", 25 | "dmc", 26 | "dc", 27 | "isi", 28 | "temp", 29 | "rh", 30 | "wind", 31 | "rain", 32 | "area" 33 | ], 34 | "cluster_keys": { 35 | "columns":["ID"], 36 | "description": null 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/transform_forestfire_cost_table.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | id, 3 | month, 4 | day, 5 | total_cost, 6 | area, 7 | total_cost / area as cost_per_area 8 | FROM {{ params.table_name }} 9 | -------------------------------------------------------------------------------- /include/sql/sql_examples/create_redshift_yellow_tripdata_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS {{ var.json.aws_configs.redshift_table }} 2 | (vendor_id int, 3 | pickup_datetime timestamp, 4 | dropoff_datetime timestamp, 5 | passenger_count int, 6 | trip_distance float, 7 | rate_code_id int, 8 | store_and_fwd_flag varchar, 9 | pickup_location_id int, 10 | dropoff_location_id int, 11 | payment_type int, 12 | fare_amount float, 13 | extra float, 14 | mta_tax float, 15 | tip_amount float, 16 | tolls_amount float, 17 | improvement_surcharge float, 18 | total_amount float, 19 | congestion_surcharge float, 20 | upload_date timestamp); 21 | -------------------------------------------------------------------------------- /include/sql/sql_examples/drop_redshift_yellow_tripdata_table.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS {{ var.json.aws_configs.redshift_table }}; 2 | -------------------------------------------------------------------------------- /include/sql/sql_examples/row_quality_yellow_tripdata_check.sql: -------------------------------------------------------------------------------- 1 | -- Query to check row items 2 | SELECT vendor_id, pickup_datetime, 3 | CASE WHEN dropoff_datetime > pickup_datetime THEN 1 ELSE 0 END AS date_check, 4 | CASE WHEN passenger_count >= 0 THEN 1 ELSE 0 
END AS passenger_count_check, 5 | CASE WHEN trip_distance >= 0 AND trip_distance <= 100 THEN 1 ELSE 0 END AS trip_distance_check, 6 | CASE WHEN ROUND((fare_amount + extra + mta_tax + tip_amount + improvement_surcharge + COALESCE(congestion_surcharge, 0)), 1) = ROUND(total_amount, 1) THEN 1 7 | WHEN ROUND(fare_amount + extra + mta_tax + tip_amount + improvement_surcharge, 1) = ROUND(total_amount, 1) THEN 1 ELSE 0 END AS fare_check 8 | FROM {{ var.json.aws_configs.redshift_table }} 9 | WHERE pickup_datetime IN (SELECT pickup_datetime FROM {{ var.json.aws_configs.redshift_table }} ORDER BY RANDOM() LIMIT 1) 10 | -------------------------------------------------------------------------------- /include/validation/forestfire_validation.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": { 3 | "y": "2", 4 | "month": "aug", 5 | "day": "fri", 6 | "ffmc": "91", 7 | "dmc": "166.9", 8 | "dc": "752.6", 9 | "isi": "7.1", 10 | "temp": "25.9", 11 | "rh": "41", 12 | "wind": "3.6", 13 | "rain": "0", 14 | "area": "0" 15 | }, 16 | "2": { 17 | "y": "2", 18 | "month": "feb", 19 | "day": "mon", 20 | "ffmc": "84", 21 | "dmc": "9.3", 22 | "dc": "34", 23 | "isi": "2.1", 24 | "temp": "13.9", 25 | "rh": "40", 26 | "wind": "5.4", 27 | "rain": "0", 28 | "area": "0" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /packages.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/airflow-data-quality-demo/8847b1c9e749966a762ed5b9fa8d2075d4772352/packages.txt -------------------------------------------------------------------------------- /plugins/firebolt_operator_test.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | from typing import Any, Dict, List, Optional, Union 19 | 20 | from airflow.models import BaseOperator, BaseOperatorLink 21 | # from firebolt_provider.hooks.firebolt import FireboltHook 22 | from airflow.operators.sql import BaseSQLOperator 23 | 24 | """ 25 | def get_db_hook(self) -> SqlHook: 26 | 27 | Create and return FireboltHook. 28 | 29 | :return: a FireboltHook instance. 
30 | :rtype: FireboltHook 31 | 32 | return FireboltHook( 33 | firebolt_conn_id=self.firebolt_conn_id, 34 | database=self.database, 35 | engine_name=self.engine_name, 36 | ) 37 | """ 38 | 39 | 40 | class RegistryLink(BaseOperatorLink): 41 | """Link to Registry""" 42 | 43 | name = "Astronomer Registry" 44 | 45 | def get_link(self, operator, dttm): 46 | """Get link to registry page.""" 47 | 48 | registry_link = ( 49 | "https://registry.astronomer.io/providers/{provider}/modules/{operator}" 50 | ) 51 | return registry_link.format(provider="firebolt", operator="fireboltoperator") 52 | 53 | 54 | class FireboltOperator(BaseSQLOperator): 55 | """ 56 | Executes SQL code in a Firebolt database 57 | 58 | :param firebolt_conn_id: Firebolt connection id 59 | :type firebolt_conn_id: str 60 | :param sql: the sql code to be executed. (templated) 61 | :type sql: Can receive a str representing a sql statement, 62 | a list of str (sql statements), or reference to a template file. 63 | Template reference are recognized by str ending in '.sql' 64 | :param autocommit: if True, each command is automatically committed. 65 | Currently firebolt doesn't support autocommit feature. 66 | (default value: False) 67 | :type autocommit: bool 68 | :param parameters: (optional) the parameters to render the SQL query with. 69 | :type parameters: dict or iterable 70 | :param database: name of database (will overwrite database defined 71 | in connection) 72 | :type database: str 73 | :param engine_name: name of engine (will overwrite engine_name defined in 74 | connection) 75 | :type engine_name: str 76 | """ 77 | 78 | template_fields = ('sql',) 79 | template_ext = ('.sql',) 80 | ui_color = '#b4e0ff' 81 | 82 | def __init__( 83 | self, 84 | *, 85 | sql: Union[str, List[str]], 86 | conn_id: str = 'firebolt_default', 87 | parameters: Optional[dict] = None, 88 | database: Optional[str] = None, 89 | engine_name: Optional[str] = None, 90 | autocommit: bool = False, 91 | hook_params: Optional[Dict] = None, 92 | ** kwargs: Any, 93 | ) -> None: 94 | super().__init__(**kwargs) 95 | #self.firebolt_conn_id = conn_id 96 | self.sql = sql 97 | self.database = database 98 | self.engine_name = engine_name 99 | self.parameters = parameters 100 | self.autocommit = autocommit 101 | self.hook_params = hook_params 102 | 103 | def execute(self, context: Dict[Any, Any]) -> None: 104 | """Run query on firebolt""" 105 | self.log.info('Executing: %s', self.sql) 106 | hook = self.get_db_hook() 107 | hook.run(sql=self.sql, autocommit=self.autocommit, 108 | parameters=self.parameters) 109 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow-providers-slack==7.2.0 2 | airflow-provider-great-expectations==0.2.5 3 | airflow-provider-firebolt==0.1.3 4 | apache-airflow-providers-trino==4.3.1 5 | great-expectations==0.15.50 6 | lightgbm==3.2.1 7 | matplotlib==3.5.1 8 | mlflow==1.23.0 9 | openlineage-airflow==0.19.2 10 | pandas==1.3.4 11 | scikit-learn==1.0.1 12 | sqlalchemy-bigquery==1.3.0 13 | --------------------------------------------------------------------------------